In [127]:
from __future__ import print_function

import numpy as np
import pandas as pd

import patsy
import pickle
import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn as sns
from seaborn import plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.cross_validation import train_test_split
from collections import defaultdict

%matplotlib inline

In [128]:
df = pd.read_pickle('df_all_cat.pkl')

In [129]:
df.head(1)

Unnamed: 0,title,score_c,rating_c,score_u,rating_u,director,runtime,audience_rating,month,month[T.Aug],...,director[T.Yoshitaka Amano],director[T.Yvan Attal],director[T.Zach Helm],director[T.Zack Snyder],director[T.Zhang Zhiliang],audience_rating[T.NC17],audience_rating[T.NR],audience_rating[T.PG],audience_rating[T.PG-13],audience_rating[T.R]
0,Resident Evil: The Final Chapter,35,0.44,51.0,0.59,Paul W.S. Anderson,106.0,R,Jan,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [130]:
df_no_cat = pd.read_pickle('no_cat.pkl')

In [131]:
df_no_cat.head()

Unnamed: 0,title,score_c,rating_c,score_u,rating_u,director,runtime,audience_rating,month
0,Resident Evil: The Final Chapter,35,0.44,51.0,0.59,Paul W.S. Anderson,106.0,R,Jan
1,Passengers,31,0.49,63.0,0.47,Morten Tyldum,116.0,PG-13,Dec
2,Beauty And The Beast (La Belle Et La Bête),32,0.46,53.0,0.66,Christophe Gans,112.0,PG-13,Jan
3,Fantastic Beasts And Where To Find Them,73,0.68,79.0,0.7,David Yates (II),132.0,PG-13,Nov
4,Absolutely Anything,18,0.37,31.0,0.54,Terry Jones,85.0,NR,May


In [132]:
director_counts = defaultdict(int)
for director in df_no_cat.director:
    director_counts[director]+=1

In [133]:
productive_directors = []
for k, v in director_counts.items():
    if v > 5:
        productive_directors.append(k)

In [134]:
productive_directors

['Paul W.S. Anderson',
 'Roland Emmerich',
 'Tim Burton',
 'Bryan Singer',
 'Steven Spielberg',
 'Peter Jackson',
 'Terry Gilliam',
 'Michael Bay',
 'Sam Raimi',
 'Joe Dante',
 'David Cronenberg',
 'Ridley Scott',
 'M. Night Shyamalan',
 'James Cameron',
 'Robert Zemeckis',
 'Robert Stevenson',
 'John Carpenter']

In [135]:
df_prod_dir = df[df.director.isin(productive_directors)]

In [136]:
len(df_prod_dir)

125

In [137]:
df_prod_dir.columns

Index(['title', 'score_c', 'rating_c', 'score_u', 'rating_u', 'director',
       'runtime', 'audience_rating', 'month', 'month[T.Aug]',
       ...
       'director[T.Yoshitaka Amano]', 'director[T.Yvan Attal]',
       'director[T.Zach Helm]', 'director[T.Zack Snyder]',
       'director[T.Zhang Zhiliang]', 'audience_rating[T.NC17]',
       'audience_rating[T.NR]', 'audience_rating[T.PG]',
       'audience_rating[T.PG-13]', 'audience_rating[T.R]'],
      dtype='object', length=1038)

In [138]:
def run_model_sk(X_train, y_train):
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    return lr

## With all variables

### audience ratings

In [139]:
y = df_prod_dir.rating_u
X = df_prod_dir.drop(['title', 'rating_c', 'score_c', 'rating_u', 'score_u', 'director', 'audience_rating', 'month'], 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

sk_a = run_model_sk(X_train, y_train)
print('Training score audience: ', sk_a.score(X_train, y_train))
print('Testing score audience: ', sk_a.score(X_test, y_test))

Training score audience:  0.642496722955
Testing score audience:  0.0436441890304


In [140]:
for alpha in np.arange(0.1, 1.1, 0.1):
    print(alpha, Ridge(normalize=True, alpha=alpha).fit(X_train, y_train).score(X_test, y_test))

0.1 0.175828004412
0.2 0.222855245352
0.3 0.243283391944
0.4 0.251942094227
0.5 0.254661238491
0.6 0.254134593245
0.7 0.251747769548
0.8 0.248270298103
0.9 0.244154428099
1.0 0.239677377836


In [141]:
rid_a = Ridge(normalize=True, alpha=1)
fit_a = rid_a.fit(X_train, y_train)
print(fit_a.score(X_train, y_train))
print(fit_a.score(X_test, y_test))

0.440932481334
0.239677377836


### Critics ratings

In [142]:
y = df_prod_dir.rating_c
X = df_prod_dir.drop(['title', 'rating_c', 'score_c', 'rating_u', 'score_u', 'director', 'audience_rating', 'month'], 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)
sk_d = run_model_sk(X_train, y_train)
print('Training score audience: ', sk_d.score(X_train, y_train))
print('Testing score audience: ', sk_d.score(X_test, y_test))

for alpha in np.arange(0.1, 1.1, 0.1):
    print(alpha, Ridge(normalize=True, alpha=alpha).fit(X_train, y_train).score(X_test, y_test))

Training score audience:  0.671844900753
Testing score audience:  -0.00590450125018
0.1 0.144044138164
0.2 0.205496009166
0.3 0.235162112433
0.4 0.250022173255
0.5 0.257034848974
0.6 0.259540641166
0.7 0.259321901011
0.8 0.257403633413
0.9 0.254408720869
1.0 0.250731697168


In [143]:
rid_c = Ridge(normalize=True, alpha=0.6)
fit_c = rid_c.fit(X_train, y_train)
print(fit_c.score(X_train, y_train))
print(fit_c.score(X_test, y_test))

0.517908236329
0.259540641166


## Only directors

In [144]:
df_dir_ncat = df_no_cat[df_no_cat.director.isin(productive_directors)]

In [145]:
df_dir_ncat.columns

Index(['title', 'score_c', 'rating_c', 'score_u', 'rating_u', 'director',
       'runtime', 'audience_rating', 'month'],
      dtype='object')

In [146]:
director = patsy.dmatrix('director',data=df_dir_ncat,return_type='dataframe')
with_d = df_dir_ncat.join(director).drop('Intercept', axis=1)
with_d.columns

Index(['title', 'score_c', 'rating_c', 'score_u', 'rating_u', 'director',
       'runtime', 'audience_rating', 'month', 'director[T.David Cronenberg]',
       'director[T.James Cameron]', 'director[T.Joe Dante]',
       'director[T.John Carpenter]', 'director[T.M. Night Shyamalan]',
       'director[T.Michael Bay]', 'director[T.Paul W.S. Anderson]',
       'director[T.Peter Jackson]', 'director[T.Ridley Scott]',
       'director[T.Robert Stevenson]', 'director[T.Robert Zemeckis]',
       'director[T.Roland Emmerich]', 'director[T.Sam Raimi]',
       'director[T.Steven Spielberg]', 'director[T.Terry Gilliam]',
       'director[T.Tim Burton]'],
      dtype='object')

### Audience rating

In [147]:
y = with_d.rating_u
X = with_d.drop(['title', 'rating_c', 'score_c', 'rating_u', 'score_u', 'director', 'audience_rating', 'month', 'runtime'], 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

sk_da = run_model_sk(X_train, y_train)
print('Training score audience: ', sk_da.score(X_train, y_train))
print('Testing score audience: ', sk_da.score(X_test, y_test))

Training score audience:  0.48800742439
Testing score audience:  0.342112266032


In [148]:
for alpha in np.arange(0.1, 1.1, 0.1):
    print(alpha, Ridge(normalize=True, alpha=alpha).fit(X_train, y_train).score(X_test, y_test))

0.1 0.349501044704
0.2 0.348566942966
0.3 0.344044557138
0.4 0.337417798256
0.5 0.329586795552
0.6 0.321120511572
0.7 0.31238280607
0.8 0.303607013028
0.9 0.294941939142
1.0 0.286480933419


In [149]:
rid_ad = Ridge(normalize=True, alpha=0.5)
fit_ad = rid_ad.fit(X_train, y_train)
print(fit_ad.score(X_train, y_train))
print(fit_ad.score(X_test, y_test))

0.434477841718
0.329586795552


In [150]:
zipped = zip(sk_da.coef_, with_d.columns)
sorted(zipped, key = lambda x: x[0], reverse=True)[:7]

[(0.12166666666666667, 'month'),
 (0.090714285714285775, 'audience_rating'),
 (0.040000000000000105, 'director[T.John Carpenter]'),
 (0.025000000000000203, 'director[T.M. Night Shyamalan]'),
 (0.007500000000000312, 'score_c'),
 (-0.011666666666666548, 'rating_c'),
 (-0.021666666666666348, 'director[T.James Cameron]')]

### Critics rating

In [151]:
y = with_d.rating_c
X = with_d.drop(['title', 'rating_c', 'score_c', 'rating_u', 'score_u', 'director', 'audience_rating', 'month'], 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)
sk_dc = run_model_sk(X_train, y_train)
print('Training score audience: ', sk_dc.score(X_train, y_train))
print('Testing score audience: ', sk_dc.score(X_test, y_test))

Training score audience:  0.463677003307
Testing score audience:  0.465122845119


In [152]:
for alpha in np.arange(0.1, 1.1, 0.1):
    print(alpha, Ridge(normalize=True, alpha=alpha).fit(X_train, y_train).score(X_test, y_test))

0.1 0.468253219929
0.2 0.45690575415
0.3 0.442287036241
0.4 0.426744406785
0.5 0.411213560377
0.6 0.396118775108
0.7 0.381655364456
0.8 0.367904890838
0.9 0.354889199688
1.0 0.34259809994


In [153]:
rid_cd = Ridge(normalize=True, alpha=0.6)
fit_cd = rid_c.fit(X_train, y_train)
print(fit_cd.score(X_train, y_train))
print(fit_cd.score(X_test, y_test))

0.388330114168
0.396118775108


In [154]:
zipped = zip(sk_dc.coef_, with_d.columns)
sorted(zipped, key = lambda x: x[0], reverse=True)[:7]

[(0.0933043228929894, 'director[T.David Cronenberg]'),
 (0.063566917243398052, 'director[T.M. Night Shyamalan]'),
 (0.035061179376725482, 'rating_c'),
 (0.034898791671741383, 'score_c'),
 (0.026541524716297074, 'month'),
 (0.0014875714604644409, 'title'),
 (-0.0043336574445343307, 'director[T.Peter Jackson]')]