## XGBOOST REGRESSION MODEL FOR PREDICTING MOVIE REVENUE

In [1]:
import pandas as pd
import numpy as np 
import pandas_profiling as profile
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score, mean_squared_error
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from collections import Counter 
%matplotlib inline

This imports the libraries I need. 

In [2]:
train_data_final = pd.read_csv('./data/train_data_final.csv')
X_train = pd.read_csv('./data/train_data_final_X_train.csv')
X_test = pd.read_csv('./data/train_data_final_X_test.csv')
y_train = pd.read_csv('./data/train_data_final_y_train.csv')
y_test = pd.read_csv('./data/train_data_final_y_test.csv')

This imports the csvs that I created earlier. 

In [3]:
xgboost = xgb.XGBRegressor(random_state=42)

In [4]:
xgboost.fit(X_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, importance_type='gain',
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=42, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)

In [5]:
xgboost.score(X_train,y_train)

0.5295839164377187

In [6]:
xgboost.score(X_test,y_test)

0.40996540286194244

In [7]:
predictions = xgboost.predict(X_test)

In [8]:
mean_squared_error(y_test, predictions) ** 0.5

121836200.64493038

The above fits and scores my model and calculates the RMSE. 

In [9]:
parameters = {
              'learning_rate': [0.05,0.075,0.1,0.2],
              'max_depth': [1,2,3,4,5,6,7,8],
              'n_estimators': [10,20,30,40,50,60,70,80,90,100]
}


grid_search = GridSearchCV(xgboost, parameters, n_jobs=-1, cv=3, verbose=2)


In [10]:
grid_search.fit(X_train,y_train)

Fitting 3 folds for each of 320 candidates, totalling 960 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   19.2s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   29.5s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   49.8s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 960 out of 960 | elapsed:  1.9min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, importance_type='gain',
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=42, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'learning_rate': [0.05, 0.075, 0.1, 0.2], 'max_depth': [1, 2, 3, 4, 5, 6, 7, 8], 'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [11]:
final_predictions = grid_search.predict(X_test)

In [12]:
mean_squared_error(y_test, final_predictions) ** 0.5

121742747.43561216

This is the RMSE after further hyperparameter tuning. 

In [24]:
xgboost.get_booster().get_score(importance_type = 'weight')

{'number_of_crew': 128,
 'number_of_cast': 136,
 'homepage': 58,
 'number_of_keywords': 49,
 'number_of_genres': 34,
 'Steven Spielberg': 26,
 'Hans Zimmer': 22,
 'Samuel L. Jackson': 16,
 'James Newton Howard': 14,
 'John Goodman': 1,
 'Janet Hirshenson': 9,
 'number_of_production_countries': 6,
 'John Turturro': 9,
 'James Horner': 9,
 'number_of_spoken_languages': 15,
 'Sigourney Weaver': 18,
 'number_of_production_companies': 30,
 'Willem Dafoe': 14,
 'Sylvester Stallone': 9,
 'Bruce Willis': 6,
 'John Debney': 1,
 'Mel Gibson': 3,
 'Avy Kaufman': 3,
 'Bill Murray': 3,
 'Kerry Barden': 2,
 'Jerry Goldsmith': 2}