In [1]:
import psycopg2
import pandas as pd

import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from joblib import dump, load

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import LassoLars

from sklearn.model_selection import RandomizedSearchCV

In [37]:
# Query modeling_data2 table from PostgreSQL database

try:
    conn = psycopg2.connect(user="cohort17",
                            password="Cohort17Movies",
                            host="moviesdb.ce8d6g1pa5lm.us-east-1.rds.amazonaws.com",
                            port="5432",database="moviesdb")

    dbquery = "select * from modeling_data2"
    
    movies = pd.read_sql_query(dbquery, conn)

except (Exception, psycopg2.Error) as error :
    print ("Error while fetching data from PostgreSQL", error)

finally:
    if(conn):
        conn.close()

In [38]:
X = movies.drop(['primarytitle','domesticgross'], axis=1)
y = movies['domesticgross']

numeric_features = X[['productionbudget','runtimeminutes','release_year','release_week']].columns
dummy_features = X.drop(numeric_features, axis=1).columns

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=0)

In [4]:
impute_numeric = SimpleImputer(missing_values=np.nan, strategy='median', copy=False, fill_value=None)
impute_dummy = SimpleImputer(missing_values=np.nan, strategy='constant', copy=False, fill_value=0)

scale_numeric = MinMaxScaler(copy=False)

numeric_transformer = Pipeline(steps=[
    ('imputer', impute_numeric),
    ('scaler', scale_numeric)])

dummy_transformer = Pipeline(steps=[
    ('imputer', impute_dummy)])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('dum', dummy_transformer, dummy_features)])

In [5]:
gbm = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=0))])
gbm.fit(X_train, y_train)
gbm.score(X_test, y_test)

0.5794720892443368

In [6]:
dump(gbm, 'gbm_1.joblib')

['gbm_1.joblib']

In [10]:
#Warning: This cell may take a long time to run, depending on resources

gbm = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', GradientBoostingRegressor())])

param_grid = {'regressor__learning_rate': [.05],
              'regressor__n_estimators': [125, 150, 175],
              'regressor__subsample': [.8, .9, 1],
              'regressor__min_samples_split': [8, 10, 12],
              'regressor__min_samples_leaf': [1, 2],
              'regressor__max_depth': [3, 4, 5, 6],
              'regressor__max_features': ['sqrt']}

CV = RandomizedSearchCV(estimator = gbm,
                        param_distributions = param_grid,
                        n_iter = 50,
                        cv = 12,
                        verbose = 2,
                        random_state = 0,
                        n_jobs = -1)
                  
CV.fit(X_train, y_train)  
print(CV.best_params_)
print(CV.best_score_)
CV.score(X_test, y_test)

Fitting 12 folds for each of 50 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   18.3s
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:   42.4s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:  1.2min finished


{'regressor__subsample': 1, 'regressor__n_estimators': 125, 'regressor__min_samples_split': 8, 'regressor__min_samples_leaf': 2, 'regressor__max_features': 'sqrt', 'regressor__max_depth': 5, 'regressor__learning_rate': 0.05}
0.5004134244666113


0.5862917568828443

In [24]:
dump(CV.best_estimator_, 'gbm_randomsearch.joblib')

['gbm_randomsearch.joblib']

In [25]:
gbm_random = load('gbm_randomsearch.joblib')

In [26]:
gbm_random.score(X_test, y_test)

0.5862917568828443

In [28]:
gbm_random['regressor']

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.05, loss='ls', max_depth=5,
                          max_features='sqrt', max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=2, min_samples_split=8,
                          min_weight_fraction_leaf=0.0, n_estimators=125,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [34]:
gbm2 = GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.05, loss='ls', max_depth=5,
                          max_features='sqrt', max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=2, min_samples_split=8,
                          min_weight_fraction_leaf=0.0, n_estimators=125,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

gbm = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', gbm2)])
gbm.fit(X_train, y_train)
gbm.score(X_test, y_test)

0.575956517685184

In [35]:
dump(gbm, 'gbm_2.joblib')

['gbm_2.joblib']

In [32]:
#Warning: This cell may take a long time to run, depending on resources

rf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', RandomForestRegressor())])

param_grid = {'regressor__max_depth': [10, 20, 40, 60, None],
               'regressor__max_features': ['auto', 'sqrt'],
               'regressor__min_samples_leaf': [1, 2, 4, 8, 10, 20, 30],
               'regressor__min_samples_split': [2, 4, 8, 10, 20, 30],
               'regressor__n_estimators': [50, 100, 150, 200]}

CV = RandomizedSearchCV(estimator = rf,
                        param_distributions = param_grid,
                        n_iter = 500,
                        cv = 12,
                        verbose = 2,
                        random_state = 0,
                        n_jobs = -1)
                  
CV.fit(X_train, y_train)  
print(CV.best_params_)
print(CV.best_score_)
CV.score(X_test, y_test)

Fitting 12 folds for each of 500 candidates, totalling 6000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   16.8s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   58.7s
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 1450 tasks      | elapsed:  9.7min
[Parallel(n_jobs=-1)]: Done 1977 tasks      | elapsed: 15.4min
[Parallel(n_jobs=-1)]: Done 2584 tasks      | elapsed: 20.2min
[Parallel(n_jobs=-1)]: Done 3273 tasks      | elapsed: 25.8min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed: 33.3min
[Parallel(n_jobs=-1)]: Done 4893 tasks      | elapsed: 41.2min
[Parallel(n_jobs=-1)]: Done 5824 tasks      | elapsed: 49.2min
[Parallel(n_jobs=-1)]: Done 6000 out of 6000 | elapsed: 50.6min finished


{'regressor__n_estimators': 150, 'regressor__min_samples_split': 2, 'regressor__min_samples_leaf': 1, 'regressor__max_features': 'sqrt', 'regressor__max_depth': 20}
0.49260840335675715


0.5738235993316629

In [34]:
dump(CV, 'rf_randomsearch.joblib')

['rf_randomsearch.joblib']