In [76]:
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

%matplotlib inline

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoCV
import statsmodels.api as sm
import statsmodels.formula.api as smf 
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error

In [3]:
df = pd.read_csv('data/df_model.csv')

In [4]:
print("The shape of the DataFrame is:", df.shape)
df.head()

The shape of the DataFrame is: (2514, 13)


Unnamed: 0,Genre,Language,Nudity,Rating,Sex,Title,Violence,Release_Date,Production_Budget,Domestic_Gross,Worldwide_Gross,Release_Year,Release_Month
0,Action,Medium,Light,PG-13,,Skyscraper,Medium,2018-07-13,125000000,31562840,71955649,2018,Jul
1,Animated,,Light,PG,,Hotel Transylvania 3: Summer Vacation,Light,2018-07-13,65000000,57597439,112373051,2018,Jul
2,Thriller,Heavy,Medium,R,Medium,The First Purge,Heavy,2018-07-04,13000000,52929930,76023230,2018,Jul
3,Science Fiction,Heavy,Light,PG-13,,Ant-Man And The Wasp,Medium,2018-07-06,130000000,142156135,293129020,2018,Jul
4,Thriller,Heavy,,R,,Sicario: Day Of The Soldado,Heavy,2018-06-29,35000000,44370869,60095029,2018,Jun


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2514 entries, 0 to 2513
Data columns (total 13 columns):
Genre                2514 non-null object
Language             2514 non-null object
Nudity               2514 non-null object
Rating               2514 non-null object
Sex                  2514 non-null object
Title                2514 non-null object
Violence             2514 non-null object
Release_Date         2514 non-null object
Production_Budget    2514 non-null int64
Domestic_Gross       2514 non-null int64
Worldwide_Gross      2514 non-null int64
Release_Year         2514 non-null int64
Release_Month        2514 non-null object
dtypes: int64(4), object(9)
memory usage: 255.4+ KB


In [109]:
X = df.drop(['Title', 'Domestic_Gross', 'Worldwide_Gross', 
             'Release_Date'],axis=1)
X = pd.get_dummies(X)
y = df['Domestic_Gross']


print(X.shape)
X.head()

(2514, 55)


Unnamed: 0,Production_Budget,Release_Year,Genre_Action,Genre_Animated,Genre_Biography,Genre_Comedy,Genre_Documentary,Genre_Drama,Genre_Family,Genre_Fantasy,...,Release_Month_Dec,Release_Month_Feb,Release_Month_Jan,Release_Month_Jul,Release_Month_Jun,Release_Month_Mar,Release_Month_May,Release_Month_Nov,Release_Month_Oct,Release_Month_Sep
0,125000000,2018,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,65000000,2018,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,13000000,2018,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,130000000,2018,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,35000000,2018,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


### Baseline Accuracy

In [97]:
X, X_val, y, y_val = train_test_split(X, y, test_size=.2, random_state=71) #hold out 20% of the data for final testing

In [98]:
X_bl = X.loc[:, ['Production_Budget', 'Release_Year']]

kf = KFold(n_splits=5, shuffle=True, random_state = 10)
bl_r2 = []
bl_rmse = []

for train_ind, test_ind in kf.split(X, y):

    X_train, y_train = X_bl.iloc[train_ind], y.iloc[train_ind]
    X_test, y_test = X_bl.iloc[test_ind], y.iloc[test_ind] 

#     X_train = np.array(X_train)
#     X_train = X_train.reshape(1, -1)

#     X_test = np.array(X_test)
#     X_test = X_test.reshape(1, -1)
    
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    bl_r2.append(lr.score(X_test, y_test))
    bl_rmse.append(mean_squared_error(y_test, lr.predict(X_test)))

In [99]:
print('Baseline R^2: %.3f +- %.3f' %(np.mean(bl_r2),np.std(bl_r2)))
print('Baseline RMSE: %.3f +- %.3f' %(np.mean(bl_rmse),np.std(bl_rmse)))

Baseline R^2: 0.466 +- 0.037
Baseline RMSE: 2845426019394933.500 +- 426588457439695.375


### Accuracy With All Features

In [101]:
allf_r2 = []
allf_rmse = []

for train_ind, test_ind in kf.split(X, y):

    X_train, y_train = X.iloc[train_ind], y.iloc[train_ind]
    X_test, y_test = X.iloc[test_ind], y.iloc[test_ind] 

#     X_train = np.array(X_train)
#     X_train = X_train.reshape(1, -1)

#     X_test = np.array(X_test)
#     X_test = X_test.reshape(1, -1)
    
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    allf_r2.append(lr.score(X_test, y_test))
    allf_rmse.append(mean_squared_error(y_test, lr.predict(X_test)))

In [102]:
print('All features R^2: %.3f +- %.3f' %(np.mean(allf_r2),np.std(allf_r2)))
print('All features R^2: %.3f +- %.3f' %(np.mean(allf_rmse),np.std(allf_rmse)))

All features R^2: 0.464 +- 0.031
All features R^2: 2860961931781463.000 +- 430467176792388.062


In [96]:
lr.predict(X_test.head())

array([1.52357590e+08, 1.66262001e+07, 4.76423984e+07, 1.33774962e+08,
       3.78131830e+07])

### Check Polynomial Degree

In [45]:
degrees = 3

r2score = [[] for i in range(degrees)]

for i in range(2, degrees+1):
    print("Degree:", i)
    k = 1
    poly = PolynomialFeatures(degree=i, interaction_only=True)
    for train_ind, test_ind in kf.split(X, y):
        print('k:', k)
        k += 1
        
        X_train, y_train = X.iloc[train_ind], y.iloc[train_ind]
        X_test, y_test = X.iloc[test_ind], y.iloc[test_ind] 

        #Feature transforms for train, val, and test so that we can run our poly model on each
        X_train_poly = poly.fit_transform(X_train)
        X_test_poly = poly.transform(X_test)

        lr_poly = LinearRegression()
        lr_poly.fit(X_train_poly, y_train)
        r2score[i-2].append(lr_poly.score(X_test_poly, y_test))
        

Degree: 2
k: 1
k: 2
k: 3
k: 4
k: 5
Degree: 3
k: 1
k: 2
k: 3
k: 4
k: 5


In [46]:
for i in range(degrees):
    print('R^2 of degree %.0f: %.3f +- %.3f' %(i+1, np.mean(r2score[i]),np.std(r2score[i])))

R^2 of degree 1: -242877696264150.844 +- 485755392528264.500
R^2 of degree 2: -9179880926451618.000 +- 18359718084654796.000


### Run LASSO

In [120]:
X = df.drop(['Title', 'Domestic_Gross', 'Worldwide_Gross', 
             'Release_Date'],axis=1)
X = pd.get_dummies(X)

In [121]:
## Scale the data
std = StandardScaler()

temp_df = X.loc[:, ['Production_Budget', 'Release_Year']]
std.fit(temp_df)
X.loc[:, ['Production_Budget', 'Release_Year']] = std.transform(temp_df)
X.head()

Unnamed: 0,Production_Budget,Release_Year,Genre_Action,Genre_Animated,Genre_Biography,Genre_Comedy,Genre_Documentary,Genre_Drama,Genre_Family,Genre_Fantasy,...,Release_Month_Dec,Release_Month_Feb,Release_Month_Jan,Release_Month_Jul,Release_Month_Jun,Release_Month_Mar,Release_Month_May,Release_Month_Nov,Release_Month_Oct,Release_Month_Sep
0,1.73381,1.843074,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0.479371,1.843074,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,-0.60781,1.843074,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,1.838347,1.843074,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,-0.147849,1.843074,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [122]:
#hold out 20% of the data for final testing
X, X_val, y, y_val = train_test_split(X, y, test_size=.2, random_state=71) 

In [128]:
# Run the cross validation, find the best alpha, refit the model on all the data with that alpha

alphavec = 10**np.linspace(1,7,90)

lasso = LassoCV(alphas = alphavec, cv=5)
lasso.fit(X,y)

LassoCV(alphas=array([1.00000e+01, 1.16793e+01, ..., 8.56218e+06, 1.00000e+07]),
    copy_X=True, cv=5, eps=0.001, fit_intercept=True, max_iter=1000,
    n_alphas=100, n_jobs=1, normalize=False, positive=False,
    precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
    verbose=False)

In [129]:
lasso.alpha_

281473.9464453597

In [130]:
lasso.score(X_val, y_val)

0.5839654636913627

In [132]:
# These are the coefficients when it refit using that best alpha
lasso_coefs = list(zip(X.columns,lasso.coef_))
lasso_coefs

[('Production_Budget', 47907994.88597694),
 ('Release_Year', 1745675.0968751658),
 ('Genre_Action', -0.0),
 ('Genre_Animated', 25406625.674501423),
 ('Genre_Biography', 0.0),
 ('Genre_Comedy', 4886388.698352582),
 ('Genre_Documentary', -0.0),
 ('Genre_Drama', -3127891.168543533),
 ('Genre_Family', 0.0),
 ('Genre_Fantasy', -0.0),
 ('Genre_Historical', -0.0),
 ('Genre_Horror', 5270612.227038803),
 ('Genre_Musical', 0.0),
 ('Genre_Mystery', -0.0),
 ('Genre_Religious', 0.0),
 ('Genre_Romance', 0.0),
 ('Genre_Science Fiction', 0.0),
 ('Genre_Soap Opera', -0.0),
 ('Genre_Thriller', -1185450.9212965346),
 ('Genre_Unknown', -0.0),
 ('Genre_Western', -0.0),
 ('Language_Heavy', 77349.16880571534),
 ('Language_Light', 0.0),
 ('Language_Medium', -3670145.4272821504),
 ('Language_None', -0.0),
 ('Nudity_Heavy', 0.0),
 ('Nudity_Light', 0.0),
 ('Nudity_Medium', -5600944.210200765),
 ('Nudity_None', -0.0),
 ('Rating_G', 1387574.5090972316),
 ('Rating_NC-17', -0.0),
 ('Rating_PG', -0.0),
 ('Rating_PG-1