## Data Preparation

In [1]:
import pandas as pd

## a) cargamos los datos

In [2]:
# Relative locations of the three CSV files that make up the dataset.
movies_data_path, finantial_data_path, opening_data_path = (
    '../dataset/movies.csv',
    '../dataset/finantials.csv',
    '../dataset/opening_gross.csv',
)

In [40]:
# NOTE(review): this cell carries execution count 40 and reads movie_data, which is
# only assigned by the loading cell that follows; on a fresh kernel it raises
# NameError unless that cell has been run first.
movie_data.shape

(5043, 8)

In [51]:
# Read the three CSV files into DataFrames (financials, movies, opening grosses).
fin_data, movie_data, opening_data = (
    pd.read_csv(csv_path)
    for csv_path in (finantial_data_path, movies_data_path, opening_data_path)
)

# Show the financial table as the cell output.
fin_data

Unnamed: 0.1,Unnamed: 0,movie_name,domestic_gross,movie_link,production_budget,release_date,worldwide_gross,movie_title
0,0,Avatar,760507625,http://www.the-numbers.com/movie/Avatar#tab=su...,425000000,12/18/2009,2783918982,Avatar
1,1,Star Wars Ep. VII: The Force Awakens,936662225,http://www.the-numbers.com/movie/Star-Wars-Ep-...,306000000,12/18/2015,2058662225,Star Wars: Episode VII - The Force Awakens ...
2,2,Pirates of the Caribbean: At World's End,309420425,http://www.the-numbers.com/movie/Pirates-of-th...,300000000,5/24/2007,963420425,Pirates of the Caribbean: At World's End
3,3,Spectre,200074175,http://www.the-numbers.com/movie/Spectre#tab=s...,300000000,11/6/2015,879620923,Spectre
4,4,The Dark Knight Rises,448139099,http://www.the-numbers.com/movie/Dark-Knight-R...,275000000,7/20/2012,1084439099,The Dark Knight Rises
...,...,...,...,...,...,...,...,...
4336,5157,The Mongol King,900,http://www.the-numbers.com/movie/Mongol-King-T...,7000,1/1/2004,900,The Mongol King
4337,5160,Signed Sealed Delivered,0,http://www.the-numbers.com/movie/Signed-Sealed...,5000,9/29/2015,0,Signed Sealed Delivered
4338,5161,Shanghai Calling,10443,http://www.the-numbers.com/movie/Shanghai-Call...,3967,2/15/2013,10443,Shanghai Calling
4339,5162,A Plague So Pleasant,0,http://www.the-numbers.com/movie/Plague-So-Ple...,1400,9/29/2015,0,A Plague So Pleasant


## b) creamos un solo dataset con todas las columnas y filas utiles 
En nuestro caso las columnas están repartidas en 3 DataFrames. Solo ocuparemos las columnas numéricas, así que eliminamos las categóricas. Como el número de registros varía entre tablas, las unimos con `merge` de tipo *left* sobre `movie_title`: se conservan los registros de la tabla izquierda y las columnas sin coincidencia quedan como nulos.

In [32]:
# Keep only the float64/int64 columns of the movie table plus 'movie_title',
# which is the key used by the merges in the next step.
numeric_part = movie_data.select_dtypes(include=['float64', 'int64'])
# axis=1 joins column-wise (axis=0 would stack rows instead).
movie_data = pd.concat([numeric_part, movie_data['movie_title']], axis=1)

In [46]:
# Keep only the merge key and the two monetary columns of the financial table.
fin_data = fin_data.loc[:, ['movie_title', 'production_budget', 'worldwide_gross']]

# Both merges are left joins on 'movie_title': each keeps every row of its left
# table and leaves the right-hand columns as NaN where a title has no match.
fin_movie_data = fin_data.merge(movie_data, how='left', on='movie_title')
full_movie_data = opening_data.merge(fin_movie_data, how='left', on='movie_title')
full_movie_data.shape

(2304, 12)

In [47]:
# Discard movies whose worldwide gross is 0 or missing. The filtered frame is
# assigned back so that the rows are really removed for the cells that follow.
has_gross = full_movie_data.worldwide_gross.notnull() & (full_movie_data.worldwide_gross != 0)
full_movie_data = full_movie_data[has_gross]
# Shape after filtering, shown as the cell output.
full_movie_data.shape

(2304, 12)

In [48]:
# Drop the columns that are not used as training features.
full_movie_data = full_movie_data.drop(columns=['movie_title', 'gross'])

In [49]:
full_movie_data.columns # inspect the columns of the final DataFrame

Index(['opening_gross', 'screens', 'production_budget', 'worldwide_gross',
       'title_year', 'aspect_ratio', 'duration', 'cast_total_facebook_likes',
       'budget', 'imdb_score'],
      dtype='object')

En este punto ya tenemos un solo DataFrame con todos los datos que serán útiles para entrenar el modelo.

## Modeling

In [50]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_validate
import numpy as np

In [12]:
# Target is the worldwide gross; every other column is an input feature.
y = full_movie_data['worldwide_gross']
X = full_movie_data.drop(columns='worldwide_gross')

In [13]:
# Simple two-step pipeline; every step is an (alias, estimator) pair.
pipeline = Pipeline(steps=[
    # Replaces NaN entries using the 'mean' strategy.
    ('imputer', SimpleImputer(strategy='mean', missing_values=np.nan)),
    # Model that is actually trained.
    ('core_model', GradientBoostingRegressor()),
])

In [14]:
# Cross-validate the pipeline with cv=10 (number of folds), keeping the train scores too.
results = cross_validate(pipeline, X, y, cv=10, return_train_score=True)
# The result holds fit/score times and train/test scores for each of the 10 iterations.
results

{'fit_time': array([0.33126116, 0.32594347, 0.32048607, 0.30208063, 0.32461882,
        0.32264709, 0.31112003, 0.31348014, 0.30392337, 0.30737305]),
 'score_time': array([0.00176954, 0.00277519, 0.00225043, 0.00226712, 0.00200582,
        0.00266671, 0.00191855, 0.00207758, 0.00160885, 0.00166345]),
 'test_score': array([0.67386087, 0.84945318, 0.64398525, 0.77411956, 0.78347799,
        0.85577395, 0.76009985, 0.86835663, 0.65362997, 0.65825827]),
 'train_score': array([0.91673951, 0.91581777, 0.9228721 , 0.91654412, 0.92172829,
        0.91476722, 0.92151444, 0.91734995, 0.92320705, 0.91766026])}

In [15]:
# Average the scores over the 10 cross-validation iterations.
train_score = results['train_score'].mean()
test_score = results['test_score'].mean()

# Sanity checks: an assert stops execution with AssertionError when its
# condition is false, signalling that something went wrong upstream.
assert train_score > 0.7
assert test_score > 0.65

print(f'Train Score: {train_score}')
print(f'Test Score: {test_score}')

Train Score: 0.9188200721676859
Test Score: 0.7521015504747262


## Hyperparameter tuning

In [17]:
from sklearn.model_selection import GridSearchCV

In [18]:
# Of all the GBR parameters only n_estimators is tuned, over 20, 40, ..., 500.
# The key is prefixed with the pipeline step alias 'core_model'.
param_tunning = dict(core_model__n_estimators=range(20, 501, 20))

In [19]:
# New, unfitted instance of the same imputer + GBR pipeline defined earlier.
estimator = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean', missing_values=np.nan)),
    ('core_model', GradientBoostingRegressor()),
])

In [20]:
# Grid search over param_tunning for the pipeline above.
grid_search = GridSearchCV(
    estimator=estimator,       # model or pipeline to tune
    param_grid=param_tunning,  # parameter values to try
    scoring='r2',              # validation metric
    cv=5,                      # number of cross-validation folds
)

In [21]:
# Hold-out split: 35% of the rows are set aside as the test set (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.35,
    random_state=42,
)

In [22]:
# Run the grid search on the training split only.
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('imputer', SimpleImputer()),
                                       ('core_model',
                                        GradientBoostingRegressor())]),
             param_grid={'core_model__n_estimators': range(20, 501, 20)},
             scoring='r2')

In [23]:
# Cross-validate (cv=7) the best estimator found by the grid search,
# using the training split and keeping the train scores as well.
final_result = cross_validate(
    grid_search.best_estimator_,
    X_train,
    y_train,
    cv=7,
    return_train_score=True,
)

In [24]:
# Average the cross-validation scores again, this time for the best model.
# The evaluation metric is R^2.
train_score = final_result['train_score'].mean()
test_score = final_result['test_score'].mean()

# Same sanity thresholds as for the baseline model.
assert train_score > 0.7
assert test_score > 0.65

print(f'Train Score: {train_score}')
print(f'Test Score: {test_score}')

Train Score: 0.9667702920281013
Test Score: 0.7612924726750518


In [25]:
# List the parameters of the best estimator found by the grid search.
grid_search.best_estimator_.get_params()

{'memory': None,
 'steps': [('imputer', SimpleImputer()),
  ('core_model', GradientBoostingRegressor(n_estimators=220))],
 'verbose': False,
 'imputer': SimpleImputer(),
 'core_model': GradientBoostingRegressor(n_estimators=220),
 'imputer__add_indicator': False,
 'imputer__copy': True,
 'imputer__fill_value': None,
 'imputer__missing_values': nan,
 'imputer__strategy': 'mean',
 'imputer__verbose': 0,
 'core_model__alpha': 0.9,
 'core_model__ccp_alpha': 0.0,
 'core_model__criterion': 'friedman_mse',
 'core_model__init': None,
 'core_model__learning_rate': 0.1,
 'core_model__loss': 'squared_error',
 'core_model__max_depth': 3,
 'core_model__max_features': None,
 'core_model__max_leaf_nodes': None,
 'core_model__min_impurity_decrease': 0.0,
 'core_model__min_samples_leaf': 1,
 'core_model__min_samples_split': 2,
 'core_model__min_weight_fraction_leaf': 0.0,
 'core_model__n_estimators': 220,
 'core_model__n_iter_no_change': None,
 'core_model__random_state': None,
 'core_model__subsample'

In [26]:
# Build the final pipeline with the hyperparameters of the model selected by the
# grid search. They are read from the fitted search instead of being typed in by
# hand, so this cell stays in sync if the search is re-run and selects a
# different n_estimators.
# deep=False returns only the regressor's own constructor arguments, which can
# be passed straight back to GradientBoostingRegressor.
best_core_params = grid_search.best_estimator_.named_steps['core_model'].get_params(deep=False)
estimator = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('core_model', GradientBoostingRegressor(**best_core_params)),
])

In [27]:
# Train the final model on the training split.
estimator.fit(X_train,y_train)

Pipeline(steps=[('imputer', SimpleImputer()),
                ('core_model', GradientBoostingRegressor(n_estimators=220))])

In [28]:
# Evaluate the fitted final pipeline on the held-out test split.
estimator.score(X_test, y_test)

0.7297050595008379

## Saving model

In [29]:
from joblib import dump

In [30]:
# Persist the trained final pipeline to disk as a .pkl file.
model_path = '../model/model.pkl'
dump(estimator, model_path)

['../model/model.pkl']

In [31]:
# Feature columns of the training split that the saved model was fitted on.
X_train.columns

Index(['opening_gross', 'screens', 'production_budget', 'title_year',
       'aspect_ratio', 'duration', 'cast_total_facebook_likes', 'budget',
       'imdb_score'],
      dtype='object')