In [1]:
# Import necessary packages
import pandas as pd
import numpy as np

from sklearn import metrics
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression

from sklearn import metrics

In [2]:
# Read in the preprocessed dataset
movies = pd.read_csv('movies_final_dataset.csv')

In [3]:
scores = movies.pop('imdb_score')

In [4]:
# Split dataset into test and train dataset
X_train, X_test, y_train, y_test = train_test_split(movies,scores,test_size = 0.4, random_state = 11)

In [5]:
# Creating parameter grids for the models to be tested
paramsDT = [{'max_depth' : [2,4,8,16,32,64,128,256] , 'min_samples_split' :[2,4,8,16,32],'max_features':[4,8,16,32,64,128,256,512]}] 
paramsGBM = [{'learning_rate': [0.1,0.2,0.3], 'n_estimators': [50,100,150], 'min_samples_split' : [2,4,8,16], 'max_depth':[2,3,4,5,6,7], 'alpha':[0.7,0.8,0.9],'verbose':[1]}]
paramsLasso = [{'alpha' :[0.8,0.9,1.0], 'max_iter':[512,1024,2048],'tol':[0.001,0.005,0.0001,0.0005,0.00001]}]
paramsSVR = [{'tol':[0.01,0.05,0.001,0.005,0.0001], 'C':[0.5,1.0,1.5,2.0]}]
paramsRF = [{'n_estimators':[4,8,16], 'min_samples_split':[2,4,8]}]

In [6]:
# Inintialize evaluation metric
from sklearn.metrics import make_scorer, mean_squared_error
mse = make_scorer(mean_squared_error)

In [7]:
# Fit and test accuracy for Gradient Boosting Regressor
alg1 = GridSearchCV(estimator = GradientBoostingRegressor(random_state =5), param_grid =paramsGBM,n_jobs=4,verbose=2,scoring = mse)
alg1.fit(X_train, y_train)
y_pred = alg1.predict(X_test)
metrics.mean_squared_error(y_pred,y_test)

Fitting 3 folds for each of 648 candidates, totalling 1944 fits


[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   15.3s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:  1.5min
[Parallel(n_jobs=4)]: Done 357 tasks      | elapsed:  3.6min
[Parallel(n_jobs=4)]: Done 640 tasks      | elapsed:  6.7min
[Parallel(n_jobs=4)]: Done 1005 tasks      | elapsed: 10.4min
[Parallel(n_jobs=4)]: Done 1450 tasks      | elapsed: 15.0min
[Parallel(n_jobs=4)]: Done 1944 out of 1944 | elapsed: 20.2min finished


      Iter       Train Loss   Remaining Time 
         1           1.1145            0.98s
         2           1.0705            0.90s
         3           1.0333            0.88s
         4           1.0028            0.84s
         5           0.9759            0.83s
         6           0.9540            0.81s
         7           0.9337            0.80s
         8           0.9165            0.77s
         9           0.8997            0.76s
        10           0.8861            0.74s
        20           0.7893            0.58s
        30           0.7331            0.39s
        40           0.6907            0.19s
        50           0.6582            0.00s


0.8445091897907957

In [8]:
# Fit and test accuracy for Decision Tree
alg2 = GridSearchCV(estimator = DecisionTreeRegressor(), param_grid =paramsDT,n_jobs=4,verbose=2,scoring = mse)
alg2.fit(X_train, y_train)
y_pred = alg2.predict(X_test)
metrics.mean_squared_error(y_pred,y_test)

Fitting 3 folds for each of 320 candidates, totalling 960 fits


[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    4.0s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:    7.9s
[Parallel(n_jobs=4)]: Done 357 tasks      | elapsed:   14.9s
[Parallel(n_jobs=4)]: Done 640 tasks      | elapsed:   24.4s
[Parallel(n_jobs=4)]: Done 960 out of 960 | elapsed:   35.0s finished


1.6032875894988068

In [9]:
# Fit and test accuracy for Lasso regression
alg3 = GridSearchCV(estimator = Lasso(), param_grid =paramsLasso,n_jobs=4,verbose=2,scoring = mse)
alg3.fit(X_train, y_train)
y_pred = alg3.predict(X_test)
metrics.mean_squared_error(y_pred,y_test)

Fitting 3 folds for each of 45 candidates, totalling 135 fits


[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    4.7s
[Parallel(n_jobs=4)]: Done 135 out of 135 | elapsed:    8.6s finished


1.3432175873171397

In [10]:
# Fit and test accuracy for Support Vector Regressor
alg4 = GridSearchCV(estimator = SVR(), param_grid =paramsSVR,n_jobs=4,verbose=2,scoring = mse)
alg4.fit(X_train, y_train)
y_pred = alg4.predict(X_test)
metrics.mean_squared_error(y_pred,y_test)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  1.4min
[Parallel(n_jobs=4)]: Done  60 out of  60 | elapsed:  2.3min finished


1.3760574817026985

In [11]:
# Fit and test accuracy for Random forest model
alg5 = GridSearchCV(estimator = RandomForestRegressor(), param_grid =paramsRF,n_jobs=4,verbose=2,scoring = mse)
alg5.fit(X_train, y_train)
y_pred = alg5.predict(X_test)
metrics.mean_squared_error(y_pred,y_test)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=4)]: Done  27 out of  27 | elapsed:    6.0s finished


0.86204355608591876

In [30]:
# The differnece between testing error and training error is least for the Gradient Boosting Method
alg_final = GradientBoostingRegressor(alpha=0.9,max_depth=2, max_features=None,
             min_samples_leaf=1, min_samples_split=2, n_estimators=100,random_state=5,verbose=1)

In [31]:
# Compute train and test mean squared error for train and test dataset 
alg_final.fit(X_train, y_train)
y_pred = alg5.predict(X_test)
test_error = metrics.mean_squared_error(y_pred,y_test)
y_pred = alg5.predict(X_train)
train_error = metrics.mean_squared_error(y_pred,y_train)

      Iter       Train Loss   Remaining Time 
         1           1.1145            1.94s
         2           1.0705            1.77s
         3           1.0333            1.77s
         4           1.0028            1.70s
         5           0.9759            1.71s
         6           0.9540            1.70s
         7           0.9337            1.72s
         8           0.9165            1.70s
         9           0.8997            1.70s
        10           0.8861            1.70s
        20           0.7893            1.61s
        30           0.7331            1.31s
        40           0.6906            1.13s
        50           0.6582            0.94s
        60           0.6334            0.77s
        70           0.6138            0.58s
        80           0.5977            0.38s
        90           0.5816            0.19s
       100           0.5695            0.00s


In [32]:
print("Training error: ",train_error)
print("Test error", test_error)

Training error:  0.182561703822
Test error 0.862043556086
