In [25]:
# Predicting Box Office Revenue
import pandas as pd
from sklearn import linear_model
from sklearn import tree
from sklearn import cross_validation
from sklearn.metrics import accuracy_score
#import scipy
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [2]:
# using statsmodels formula api
movies = pd.read_csv('data/Movies.csv')
movies.rename(columns ={'Sci.Fi': 'SciFi', 'Production.Budget': 'ProductionBudget'}, inplace=True)
movies.Rated = movies.Rated.replace(np.nan, 'NA')

trainf = movies[movies.Year < 2010]
testf = movies[movies.Year >= 2010]

form = 'Worldwide ~ C(Rated) + ' + \
       ' + '.join(np.setdiff1d(trainf.columns.values, np.array(['Rated', 'Worldwide', 'Name', 'Year'])))

resultsf = smf.ols(form, data=trainf).fit()
resultsf.summary()

0,1,2,3
Dep. Variable:,Worldwide,R-squared:,0.541
Model:,OLS,Adj. R-squared:,0.49
Method:,Least Squares,F-statistic:,10.48
Date:,"Thu, 20 Aug 2015",Prob (F-statistic):,1.15e-25
Time:,11:16:17,Log-Likelihood:,-228.67
No. Observations:,248,AIC:,509.3
Df Residuals:,222,BIC:,600.7
Df Model:,25,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,15.9977,1.158,13.815,0.000,13.716 18.280
C(Rated)[T.G],0.4102,0.704,0.583,0.561,-0.977 1.798
C(Rated)[T.NA],0.5562,0.732,0.760,0.448,-0.887 1.999
C(Rated)[T.PG],0.7091,0.689,1.029,0.305,-0.649 2.067
C(Rated)[T.PG-13],0.6028,0.711,0.848,0.397,-0.798 2.003
C(Rated)[T.R],0.4128,0.721,0.572,0.568,-1.008 1.834
Action,0.0172,0.133,0.129,0.897,-0.246 0.280
Adventure,-0.2232,0.128,-1.739,0.083,-0.476 0.030
Animation,0.6117,0.209,2.933,0.004,0.201 1.023

0,1,2,3
Omnibus:,5.872,Durbin-Watson:,1.374
Prob(Omnibus):,0.053,Jarque-Bera (JB):,5.566
Skew:,-0.33,Prob(JB):,0.0619
Kurtosis:,3.322,Cond. No.,5080.0


In [3]:
form2 = 'Worldwide ~ ' + ' + '.join(resultsf.pvalues[resultsf.pvalues < 0.05].index[1:])
resultsf2 = smf.ols(form2, data=trainf).fit()
pred = resultsf2.predict(testf)
sse = sum((pred - testf.Worldwide.values)**2)
# mean of training set - actual values
sst = sum((trainf.Worldwide.values.mean() - testf.Worldwide.values)**2) 
r2 = 1- (sse/sst)
(sse, sst, r2)

(25.000599780719131, 60.433977364224546, 0.58631549881211553)

In [4]:
def perf_helper(x):
    if x >= movies.Worldwide.quantile(.75): return 'Excellent'
    elif x >= movies.Worldwide.quantile(.25): return 'Average'
    else: return 'Poor'
movies['Performance'] = movies.Worldwide.map(lambda x: perf_helper(x))

In [6]:
# using sm.OLS(y, X)
df = sm.add_constant(movies, prepend=False) 

# create three dummy variables using get_dummies, then exclude the first dummy column
df.Rated = df.Rated.astype('category')
_dummies = pd.get_dummies(df.Rated, prefix='Rated').iloc[:, 1:] 

# concatenate the dummy variable columns onto the original DataFrame (axis=0 means rows, axis=1 means columns)
df = pd.concat([df, _dummies], axis=1) 
train = df[df.Year < 2010]
test = df[df.Year >= 2010]

col = np.setdiff1d(df.columns.values, np.array(['Rated', 'Worldwide', 'Name', 'Year', 'Performance']))
results = sm.OLS(train.Worldwide, train[col]).fit()
results.summary()

0,1,2,3
Dep. Variable:,Worldwide,R-squared:,0.541
Model:,OLS,Adj. R-squared:,0.49
Method:,Least Squares,F-statistic:,10.48
Date:,"Thu, 20 Aug 2015",Prob (F-statistic):,1.15e-25
Time:,11:16:35,Log-Likelihood:,-228.67
No. Observations:,248,AIC:,509.3
Df Residuals:,222,BIC:,600.7
Df Model:,25,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Action,0.0172,0.133,0.129,0.897,-0.246 0.280
Adventure,-0.2232,0.128,-1.739,0.083,-0.476 0.030
Animation,0.6117,0.209,2.933,0.004,0.201 1.023
Comedy,-0.1431,0.161,-0.888,0.375,-0.461 0.174
Crime,-0.3316,0.147,-2.250,0.025,-0.622 -0.041
Documentary,-0.4481,0.483,-0.928,0.355,-1.400 0.504
Drama,-0.2098,0.188,-1.115,0.266,-0.581 0.161
Family,-0.3207,0.178,-1.801,0.073,-0.672 0.030
Fantasy,0.1635,0.141,1.163,0.246,-0.114 0.441

0,1,2,3
Omnibus:,5.872,Durbin-Watson:,1.374
Prob(Omnibus):,0.053,Jarque-Bera (JB):,5.566
Skew:,-0.33,Prob(JB):,0.0619
Kurtosis:,3.322,Cond. No.,5080.0


In [8]:
# Classification Tree
df.drop(['Worldwide', 'const'], inplace = True, axis =1)
trainP, testP = cross_validation.train_test_split(df, train_size=0.7, test_size=0.3, random_state=15071)
clf = tree.DecisionTreeClassifier(min_samples_split=20)
clf.fit(trainP.loc[:,np.setdiff1d(col, np.array(['const']))], trainP['Performance'])            

In [28]:
pred = clf.predict(trainP.loc[:,np.setdiff1d(col, np.array(['const']))])
accuracy_score(pred, trainP['Performance'])
# same as clf.score(trainP.loc[:,np.setdiff1d(col, np.array(['const']))],trainP['Performance']) <- slower

0.80257510729613735

In [30]:
pred_test = clf.predict(testP.loc[:,np.setdiff1d(col, np.array(['const']))])
accuracy_score(pred_test, testP['Performance'])

0.71287128712871284

In [44]:
len(np.where(pred_test == 'Average')[0])/float(len(testP)) #baseline accuracy of most common test prediction

0.48514851485148514