In [1]:
import pandas as pd
import os 

processed_data_folder = "../data/processed"

data_path = os.path.join(processed_data_folder, "data.csv")
data = pd.read_csv(data_path)
data.head()

Unnamed: 0,production_budget,worldwide_gross,title_year,aspect_ratio,duration,cast_total_facebook_likes,budget,imdb_score,opening_gross,screens
0,425000000,2783918982,2009.0,1.78,178.0,4834,237000000.0,7.9,77025481.0,3452.0
1,300000000,963420425,2007.0,2.35,169.0,48350,300000000.0,7.1,139802190.0,4362.0
2,300000000,879620923,2015.0,2.35,148.0,11700,245000000.0,6.8,70403148.0,3929.0
3,275000000,1084439099,2012.0,2.35,164.0,106759,250000000.0,8.5,160887295.0,4404.0
4,275000000,260002115,2013.0,2.35,150.0,45757,215000000.0,6.5,29210849.0,3904.0


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_validate
import numpy as np



In [3]:
data.columns

Index(['production_budget', 'worldwide_gross', 'title_year', 'aspect_ratio',
       'duration', 'cast_total_facebook_likes', 'budget', 'imdb_score',
       'opening_gross', 'screens'],
      dtype='object')

In [4]:
labels = data.pop("worldwide_gross")
features = data 

print(f"Features shape: {features.shape}")
print(f"Labels shape: {labels.shape}")

Features shape: (2140, 9)
Labels shape: (2140,)


In [5]:
pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(missing_values=np.nan, strategy="mean")),
        ("core_model", GradientBoostingRegressor())
    ]
)

In [6]:
results = cross_validate(
    pipeline, 
    features, 
    labels, 
    return_train_score=True, 
    cv=10
)
results 

{'fit_time': array([0.26470876, 0.23735213, 0.23708606, 0.23821712, 0.23907185,
        0.2381022 , 0.23930383, 0.23814917, 0.2424829 , 0.23773575]),
 'score_time': array([0.00294805, 0.00142097, 0.00134706, 0.00139904, 0.00138402,
        0.001441  , 0.00137901, 0.00125384, 0.00140405, 0.00127125]),
 'test_score': array([0.38624917, 0.52744248, 0.67076446, 0.66248703, 0.59396628,
        0.58028093, 0.52295786, 0.43058285, 0.48723192, 0.35074813]),
 'train_score': array([0.87699806, 0.93454467, 0.92959909, 0.93416561, 0.92750989,
        0.92792894, 0.92902854, 0.92575243, 0.92706867, 0.92753681])}

In [7]:
train_score = np.mean(results["train_score"])
test_score = np.mean(results["test_score"])

print(f"Train Score: {train_score}")
print(f"Test Score: {test_score}")

Train Score: 0.9240132712754834
Test Score: 0.5212711104721646


In [8]:
from sklearn.model_selection import GridSearchCV

param_tunning = {"core_model__n_estimators": range(20, 501, 20)}

estimator = Pipeline(
    [
        ("imputer", SimpleImputer(missing_values=np.nan, strategy="mean")),
        ("core_model", GradientBoostingRegressor())
    ]
)

grid_search = GridSearchCV(
    estimator, 
    param_grid=param_tunning, 
    scoring="r2", 
    cv=5
)

In [9]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.2)

In [10]:
grid_search.fit(train_features, train_labels)

In [11]:
final_result = cross_validate(
    grid_search.best_estimator_, 
    train_features, 
    train_labels, 
    return_train_score=True,
    cv=7
)

In [12]:
train_score = np.mean(final_result["train_score"])
test_score = np.mean(final_result["test_score"])

print(f"Train Score: {train_score}")
print(f"Test Score: {test_score}")

Train Score: 0.9469034979949298
Test Score: 0.7626298519900248


In [13]:
grid_search.best_estimator_.get_params()

{'memory': None,
 'steps': [('imputer', SimpleImputer()),
  ('core_model', GradientBoostingRegressor(n_estimators=120))],
 'verbose': False,
 'imputer': SimpleImputer(),
 'core_model': GradientBoostingRegressor(n_estimators=120),
 'imputer__add_indicator': False,
 'imputer__copy': True,
 'imputer__fill_value': None,
 'imputer__keep_empty_features': False,
 'imputer__missing_values': nan,
 'imputer__strategy': 'mean',
 'imputer__verbose': 'deprecated',
 'core_model__alpha': 0.9,
 'core_model__ccp_alpha': 0.0,
 'core_model__criterion': 'friedman_mse',
 'core_model__init': None,
 'core_model__learning_rate': 0.1,
 'core_model__loss': 'squared_error',
 'core_model__max_depth': 3,
 'core_model__max_features': None,
 'core_model__max_leaf_nodes': None,
 'core_model__min_impurity_decrease': 0.0,
 'core_model__min_samples_leaf': 1,
 'core_model__min_samples_split': 2,
 'core_model__min_weight_fraction_leaf': 0.0,
 'core_model__n_estimators': 120,
 'core_model__n_iter_no_change': None,
 'core_m

In [14]:
estimator = Pipeline(
    [
        ("imputer", SimpleImputer(missing_values=np.nan, strategy="mean")),
        ("core_model", GradientBoostingRegressor(
            n_estimators=220,
            alpha=0.9,
            ccp_alpha=0.0,
            criterion='friedman_mse',
            init=None,
            learning_rate=0.1,
            loss='squared_error',
            max_depth=3,
            max_features=None,
            max_leaf_nodes=None,
            min_impurity_decrease=0.0,
            min_samples_leaf=1,
            min_weight_fraction_leaf=0.0,
            n_iter_no_change = None, 
            random_state = None,
            subsample=1.0,
            tol=0.0001,
            validation_fraction=0.1,
            verbose=0,
            warm_start=False
        )
        )
    ]
)



In [15]:
estimator.fit(train_features, train_labels)

In [16]:
estimator.score(test_features, test_labels)

0.8157505527491966

In [17]:
!mkdir models

In [18]:
from joblib import dump
model_folder = "../models/"
model_name = "model.pkl"
output_model_folder = os.path.join(model_folder, model_name)
dump(estimator, output_model_folder)

['../models/model.pkl']

In [19]:
train_features.columns

Index(['production_budget', 'title_year', 'aspect_ratio', 'duration',
       'cast_total_facebook_likes', 'budget', 'imdb_score', 'opening_gross',
       'screens'],
      dtype='object')