In [None]:
import math
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import patsy
import seaborn as sns
import statsmodels.api as sm
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20.
# Prefer the model_selection location and fall back only on very old installs.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

%matplotlib inline

In [1]:
# Pickled dataset that the rest of this notebook models.
# NOTE(review): this is an absolute path on one machine; point it at your own copy.
DATA_PATH = '/Users/zacharyheick/ds/metis/metisgh/Project_Luther/roger_final_model_ready.pickle'

# Unpickling can execute arbitrary code, so only load files from a trusted source.
df = pd.read_pickle(DATA_PATH)

NameError: name 'pd' is not defined

Before I began modeling, I split the data up into training and test sets. 

In [None]:
# Target is the star rating; every other column is a candidate predictor.
y = df['Star_Score']
X = df.drop(['Star_Score'], axis=1)

# Hold out 30% of the rows for the final evaluation. The fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=22,
)

The features I start with are `MovieLens_Score` and `Imdb_Score`. I then add features one at a time, favoring those with the strongest positive or negative correlation, until I reach the best adjusted R-squared value.

In [None]:
# Predictors kept for the regression: runtime, the two external scores, and a
# subset of the genre / sub-genre indicator columns.
selected_features = [
    'Runtime',
    'MovieLens_Score',
    'Imdb_Score',
    'Genre_Animation',
    'Genre_Comedy',
    'Genre_Crime',
    'Genre_Documentary',
    'Genre_Drama',
    'Genre_Family',
    'Genre_Music',
    'Genre_Thriller',
    'Sub_Genre_Animation',
    'Sub_Genre_Comedy',
    'Sub_Genre_Family',
    'Sub_Genre_Fantasy',
    'Sub_Genre_Horror',
    'Sub_Genre_No Sub-genre',
    'Sub_Genre_Romance',
]
X_1 = X_train[selected_features]

# statsmodels' OLS does not add an intercept by itself and no constant column is
# supplied here, so this regression is fit without an intercept term.
model = sm.OLS(endog=y_train, exog=X_1)
model_fit = model.fit()

# Last expression of the cell: render the regression summary table.
model_fit.summary()

I can't rely on the adjusted R-squared value alone to evaluate my model. I also need to look at the distribution of the residuals to make sure the model is capturing the trends across the whole training set. Residuals that are roughly normally distributed indicate this.

In [None]:
# Histogram of the training residuals from the fitted OLS model.
fig, ax = plt.subplots()
ax.hist(model_fit.resid)
ax.set_title('Distribution of Star Score Residuals (Train)', fontsize=18)
ax.set_xlabel('Star Score Residuals', fontsize=12)
# Trailing semicolon suppresses the Text(...) repr in the cell output.
ax.set_ylabel('Distribution', fontsize=12);

To see if the model is performing consistently across the whole set of data, I'll use cross validation.

In [None]:
# In-sample error: fit on the full training set, then score on those same rows.
in_lr = linear_model.LinearRegression().fit(X_1, y_train)
y_pred_in = in_lr.predict(X_1)

# Root mean squared error, in star-score units.
in_sample_mse = mean_squared_error(y_train, y_pred_in)
np.sqrt(in_sample_mse)

In [None]:
# Out-of-sample error: 5-fold cross-validation over the training set.
out_lr = linear_model.LinearRegression()

# The scorer reports negated MSE, so flip the sign back to get a positive MSE
# for each fold.
out_scores = -cross_val_score(
    out_lr,
    X_1,
    y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)

# Square root of the average fold MSE.
np.sqrt(np.mean(out_scores))

The root mean squared error on the out-of-sample (cross-validated) data is only slightly higher than on the in-sample data. A gap this small suggests the model is not overfitting and generalizes well to unseen data. I am now ready to test the model on the test set.

In [None]:
# Take the test-set predictors straight from the training design matrix, so the
# selected columns (and their order) cannot drift from what the model was fit on.
X_test_final = X_test[list(X_1.columns)]

# Score the held-out rows with the fitted OLS model and report the RMSE.
y_pred_test = model_fit.predict(X_test_final)
np.sqrt(mean_squared_error(y_test, y_pred_test))

I get a final root mean squared error of 0.73, meaning my model's predictions are typically off by about three-quarters of a star. Looking at the model visualization, this model loses some accuracy as Ebert's rating decreases. Even at the higher ratings, the model does not perform terribly, but it could definitely be improved.

In [None]:
# Actual vs. predicted star scores on the test set.
fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(y_pred_test, y_test, alpha=0.2)

# Red diagonal marks where the predicted score equals the actual score.
ax.plot([0, 4], [0, 4], 'r')

ax.set_title('Roger Ebert Scores vs. Predicted Scores', fontsize=14)
ax.set_xlabel('Predicted Star Score', fontsize=12)
# Trailing semicolon suppresses the Text(...) repr in the cell output.
ax.set_ylabel('Roger Ebert Star Score', fontsize=12);

Now that the model is ready, I take some recent movies and plug their values into the model to get a predicted rating out of four stars.

In [None]:
# One feature vector per film, laid out in the same column order as X_1:
#   row 1: Runtime, MovieLens_Score, Imdb_Score
#   row 2: the eight Genre_* indicators
#   row 3: the seven Sub_Genre_* indicators
Dunkirk = np.array([
    106, 3.98, 8.4,
    0, 0, 0, 0, 1, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0,
])
Emoji_Movie = np.array([
    86, 1.08, 2.2,
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 1, 0, 0, 0, 0, 0,
])
La_La_Land = np.array([
    128, 3.80, 8.2,
    0, 1, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 1,
])
Mad_Max_Fury_Road = np.array([
    120, 3.83, 8.1,
    0, 0, 0, 0, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0,
])

# Print the predicted star score for each film, one per line.
for title, features in (
    ('Dunkirk', Dunkirk),
    ('The Emoji Movie', Emoji_Movie),
    ('La La Land', La_La_Land),
    ('Mad Max Fury Road', Mad_Max_Fury_Road),
):
    print(title + ':', model_fit.predict(features)[0])