In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
import sklearn

In [2]:
# Fetch the project repository; the datasets/ CSV files loaded by this notebook live inside it.
# NOTE(review): the later read paths start with /content/, so this presumably runs with /content as the working directory — confirm.
! git clone https://github.com/benrosenberg/ORIE-4741-project.git

Cloning into 'ORIE-4741-project'...
remote: Enumerating objects: 67, done.
remote: Counting objects: 100% (67/67), done.
remote: Compressing objects: 100% (58/58), done.
remote: Total 67 (delta 19), reused 18 (delta 3), pack-reused 0
Unpacking objects: 100% (67/67), done.


In [16]:
# Load the training and testing user tables from the cloned repository.
users_train, users_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/ORIE-4741-project/datasets/training_users_list.csv",
        "/content/ORIE-4741-project/datasets/testing_users_list.csv",
    )
)

First, fit a simplified model without dummy variables (the categorical columns are dropped).

In [44]:
# Feature engineering: extract the year from `last_online` (adds a column to users_train).
users_train["year_last_online"] = pd.to_datetime(users_train["last_online"]).apply(lambda ts: ts.year)

# Build the numeric-only design matrix in a single chain.
X_train_no_dummies = (
    users_train
    # columns that have no value or are completely null
    .drop(columns=['Unnamed: 0', 'username', 'birth_date', 'user_id', 'join_date', 'last_online', 'access_rank'])
    # column we expect to be overcorrelated with the remaining predictors
    .drop(columns=['stats_episodes'])
    # the regression target must not appear among the predictors
    .drop(columns=['user_days_spent_watching'])
    # categorical columns are left out of this simplified model
    .drop(columns=['gender', 'location'])
    # impute remaining missing values with the mean of their column
    .pipe(lambda features: features.fillna(features.mean()))
)

# Regression target.
y_train_no_dummies = users_train["user_days_spent_watching"]

In [45]:
# Ordinary least squares on the numeric-only predictors. No constant column is
# added to the design matrix, so the fit has no explicit intercept term.
no_dummies_model = sm.OLS(endog=y_train_no_dummies, exog=X_train_no_dummies).fit()
# Fitted coefficients, one per predictor column.
w_no_dummies = no_dummies_model.params
print(w_no_dummies)

user_watching      -0.001327
user_completed      0.230240
user_onhold        -0.023962
user_dropped        0.010255
user_plantowatch   -0.010366
stats_mean_score    0.260649
stats_rewatched     0.000309
join_year          -1.549046
age                -0.124907
year_last_online    1.554855
dtype: float64


In [46]:
# Show the full regression report for the simplified model as plain text.
print(no_dummies_model.summary().as_text())

                                    OLS Regression Results                                   
Dep. Variable:     user_days_spent_watching   R-squared (uncentered):                   0.919
Model:                                  OLS   Adj. R-squared (uncentered):              0.919
Method:                       Least Squares   F-statistic:                          1.729e+04
Date:                      Wed, 27 Oct 2021   Prob (F-statistic):                        0.00
Time:                              23:26:51   Log-Likelihood:                         -68452.
No. Observations:                     15204   AIC:                                  1.369e+05
Df Residuals:                         15194   BIC:                                  1.370e+05
Df Model:                                10                                                  
Covariance Type:                  nonrobust                                                  
                       coef    std err          t      P>|t|

In [47]:
# Feature engineering: extract the year from `last_online` (adds a column to users_test).
users_test["year_last_online"] = pd.to_datetime(users_test.last_online).apply(lambda date: date.year)
# drop columns that have no value or are completely null
X_test_no_dummies = users_test.drop(['Unnamed: 0', 'username', 'birth_date', 'user_id', 'join_date', 'last_online', 'access_rank'], axis=1)
# drop columns we expect to be overcorrelated
X_test_no_dummies = X_test_no_dummies.drop(['stats_episodes'], axis=1)
# drop the target column so it is not used as a predictor
X_test_no_dummies = X_test_no_dummies.drop(['user_days_spent_watching'], axis=1)
# drop categorical
X_test_no_dummies = X_test_no_dummies.drop(['gender', 'location'], axis=1)
# fill na values with their average value in the column.
# Use this frame's own column means: `X_test` is only created in a later cell,
# so referring to it here fails on a fresh kernel (Restart & Run All).
X_test_no_dummies = X_test_no_dummies.fillna(X_test_no_dummies.mean())

# Regression target for the held-out users.
y_test_no_dummies = users_test.user_days_spent_watching

In [56]:
# Predict on the held-out users and report the mean squared error.
y_pred_no_dummies = no_dummies_model.predict(exog=X_test_no_dummies)
print(
    'The MSE of our simplified model is',
    sklearn.metrics.mean_squared_error(y_true=y_test_no_dummies, y_pred=y_pred_no_dummies),
)

The MSE of our simplified model is 448.48880098922444


Next, one-hot encode the categorical variables (`gender`, `location`) as dummy variables so the model can use the information they carry.

In [49]:
# Feature engineering: extract the year from `last_online` (adds a column to users_train).
users_train["year_last_online"] = pd.to_datetime(users_train.last_online).apply(lambda date: date.year)
# drop columns that have no value or are completely null
X_train = users_train.drop(['Unnamed: 0', 'username', 'birth_date', 'user_id', 'join_date', 'last_online', 'access_rank'], axis=1)
# drop columns we expect to be overcorrelated
X_train = X_train.drop(['stats_episodes'], axis=1)
# drop the target column so it is not used as a predictor
X_train = X_train.drop(['user_days_spent_watching'], axis=1)
# create dummies for the categorical columns.
# dtype=float keeps the whole design matrix numeric: recent pandas releases emit
# boolean dummy columns by default, and a mix of bool and float columns turns into
# an object array that statsmodels' OLS refuses to fit.
X_train = pd.get_dummies(X_train, dtype=float)
# fill na values with their average value in the column
X_train = X_train.fillna(X_train.mean())


# Regression target.
y_train = users_train.user_days_spent_watching

In [50]:
# Ordinary least squares on the predictors including the one-hot encoded columns.
model = sm.OLS(endog=y_train, exog=X_train).fit()
# Fitted coefficients, one per design-matrix column.
w = model.params
print(w)

user_watching       -0.002608
user_completed       0.227981
user_onhold         -0.021432
user_dropped         0.011228
user_plantowatch    -0.010295
                      ...    
location_VT        -24.275407
location_WA        -31.763535
location_WI        -32.755475
location_WV        -34.510741
location_WY        -28.993112
Length: 64, dtype: float64


In [51]:
# Show the full regression report for the model with categorical variables as plain text.
print(model.summary().as_text())

                               OLS Regression Results                               
Dep. Variable:     user_days_spent_watching   R-squared:                       0.844
Model:                                  OLS   Adj. R-squared:                  0.843
Method:                       Least Squares   F-statistic:                     1323.
Date:                      Wed, 27 Oct 2021   Prob (F-statistic):               0.00
Time:                              23:26:54   Log-Likelihood:                -68316.
No. Observations:                     15204   AIC:                         1.368e+05
Df Residuals:                         15141   BIC:                         1.372e+05
Df Model:                                62                                         
Covariance Type:                  nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------

In [52]:
# Feature engineering: extract the year from `last_online` (adds a column to users_test).
users_test["year_last_online"] = pd.to_datetime(users_test.last_online).apply(lambda date: date.year)
# drop columns that have no value or are completely null
X_test = users_test.drop(['Unnamed: 0', 'username', 'birth_date', 'user_id', 'join_date', 'last_online', 'access_rank'], axis=1)
# drop columns we expect to be overcorrelated
X_test = X_test.drop(['stats_episodes'], axis=1)
# drop the target column so it is not used as a predictor
X_test = X_test.drop(['user_days_spent_watching'], axis=1)
# create dummies (dtype=float keeps every column numeric regardless of pandas version)
X_test = pd.get_dummies(X_test, dtype=float)
# The test set is encoded independently of the training set, so its dummy columns
# may differ in number or order. Align to the columns the model was fitted on:
# categories missing from the test set become all-zero columns, and categories
# never seen in training are dropped.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0.0)
# fill na values with their average value in the column
X_test = X_test.fillna(X_test.mean())

# Regression target for the held-out users.
y_test = users_test.user_days_spent_watching

In [55]:
# Predict on the held-out users and report the mean squared error.
y_pred = model.predict(exog=X_test)
print(
    'The MSE is of our model with categorical variables is',
    sklearn.metrics.mean_squared_error(y_true=y_test, y_pred=y_pred),
)

The MSE is of our model with categorical variables is 441.20913294612023
