## Data Preparation

In [1]:
import pandas as pd

## a) cargamos los datos

In [2]:
# indicamos las rutas del dataset
movies_data_path = '../dataset/original_data/movies.csv'
finantial_data_path = '../dataset/original_data/finantials.csv'
opening_data_path = '../dataset/original_data/opening_gross.csv'

In [3]:
# cargamos los datasets en formato de dataframe 
fin_data = pd.read_csv(finantial_data_path)
movie_data = pd.read_csv(movies_data_path)
opening_data = pd.read_csv(opening_data_path)

## b) creamos un solo dataset con todas las columnas y filas utiles 
en nuestro caso tenemos las columnas repartidas en 3 DF, solo couparaemos las columnas numericas asi que elimnamos las categoricas, el numero de registros varia asi que haremos merge para que solo se concerven los registros que existen en todas las tablas. 

In [4]:
# extraemos las columnas que vamos a utilizar (las numericas y la del titulo[sera el key del merge]) del DF movie
"""numeric_columns_mask = (movie_data.dtypes == float) | (movie_data.dtypes == int)
numeric_columns = [column for column in numeric_columns_mask.index if numeric_columns_mask[column]]
movie_data = movie_data[numeric_columns+['movie_title']]"""
movie_data = pd.concat([movie_data.select_dtypes(include=['float64', "int64"]) ,movie_data["movie_title"]], 
       axis=1) # este es mi codigo que cree y es mas facil  axis=1 = unir columnas axis=0 = unir filas

In [5]:
fin_data = fin_data[['movie_title','production_budget','worldwide_gross']] 
# hacemos un left para que se concerben los registros que existen solo en las 3 tablas
fin_movie_data = pd.merge(fin_data, movie_data, on= 'movie_title', how='left') 
full_movie_data = pd.merge( opening_data,fin_movie_data, on = 'movie_title', how='left')
full_movie_data.shape

(2304, 12)

In [6]:
# ignoraremos las peliculas que tienen 0 y null en dinero generado 
full_movie_data[(full_movie_data.worldwide_gross != 0) & (full_movie_data.worldwide_gross.notnull())].shape

(2304, 12)

In [7]:
# quitamos las columnas que no son utiles para el entrenamiento
full_movie_data = full_movie_data.drop(['movie_title','gross'],axis=1)

In [8]:
full_movie_data.columns # vemos el DF final

Index(['opening_gross', 'screens', 'production_budget', 'worldwide_gross',
       'title_year', 'aspect_ratio', 'duration', 'cast_total_facebook_likes',
       'budget', 'imdb_score'],
      dtype='object')

en este punto ya tenemos un solo dataframe con toda la data que sera util para entrenar el modelo 

## Modeling

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_validate
import numpy as np

In [10]:
# definimos nuestro input var y output var
X = full_movie_data.drop(['worldwide_gross'], axis = 1)
y = full_movie_data['worldwide_gross']

In [11]:
# contruimos un pipeline simple 
pipeline = Pipeline([ # solo especifiamos el alias del modelo y el modelo 
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')), # completa los valores Null
    ('core_model', GradientBoostingRegressor()) #modelo de entrenamiento
])

In [12]:
results = cross_validate(pipeline ,X,y,return_train_score=True,cv=10) # cv = numero de bloques 
results # ahora tenemos los datos de puntuacion y tiempo para el fit y el test de las 10 iteraciones 

{'fit_time': array([0.7619803 , 0.77409816, 0.67874742, 0.65125394, 0.66451001,
        0.79956365, 0.78478551, 0.73388386, 0.67225266, 0.79540873]),
 'score_time': array([0.00400424, 0.00701118, 0.0040102 , 0.00401521, 0.00499701,
        0.00585794, 0.00686502, 0.00699759, 0.00501227, 0.00597072]),
 'test_score': array([0.67454511, 0.85166096, 0.64392674, 0.78093676, 0.78345327,
        0.86422566, 0.76040778, 0.87655337, 0.68674236, 0.65732363]),
 'train_score': array([0.91673951, 0.91581777, 0.9228721 , 0.91654412, 0.92172829,
        0.91476722, 0.92151444, 0.91734995, 0.92320705, 0.91766026])}

In [13]:
# obtenemos el promedio de las 10 iteraciones del crossvalidation 
train_score = np.mean(results['train_score'])
test_score = np.mean(results['test_score'])
assert train_score > 0.7 # asi se usa assert:
assert test_score > 0.65 #  afirmo que train score es > 0.7 si no deten la ejecucion por que hay un error atras
print(f'Train Score: {train_score}')
print(f'Test Score: {test_score}')

Train Score: 0.918820072167686
Test Score: 0.7579775628432255


## Hyperparameter tunning

In [14]:
from sklearn.model_selection import GridSearchCV

In [15]:
# de todos los parametros del modelo GBR definimos uno que sera ajustado y en que rango se haran pruebas
param_tunning = {'core_model__n_estimators': range(20,501,20)} 

In [16]:
# volvemos a instanciar el pipeline de atras
estimator = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('core_model', GradientBoostingRegressor())
])

In [17]:
grid_search= GridSearchCV(estimator, # el modelo o pipeline
                        param_grid = param_tunning, # el paramatro a ajustar
                        scoring='r2', # la metrica de validacion 
                        cv=5)         # numero de bloques de particion en el crossvalidation 

In [18]:
# hacemos un hold out 
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.35,random_state= 42)

In [19]:
# entrenamos el estimador
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('imputer', SimpleImputer()),
                                       ('core_model',
                                        GradientBoostingRegressor())]),
             param_grid={'core_model__n_estimators': range(20, 501, 20)},
             scoring='r2')

In [20]:
# hacemos la tecnica de cross validation sobre el  mejor modelo (estimator) encontrado en grid_search 
final_result = cross_validate(grid_search.best_estimator_,X_train,y_train,return_train_score=True,cv=7)

In [21]:
# volvemos a hacer el promedio de las metricas del crossvalidation para el mejor modelo 
train_score = np.mean(final_result['train_score'])
test_score = np.mean(final_result['test_score'])
assert train_score > 0.7
assert test_score > 0.65
print(f'Train Score: {train_score}')
print(f'Test Score: {test_score}')
# la metrica de evaluacion es r^2

Train Score: 0.9667702920281013
Test Score: 0.7628977122405335


In [22]:
# obtenmso los parametros para el mejor modelo (estimator)
grid_search.best_estimator_.get_params()

{'memory': None,
 'steps': [('imputer', SimpleImputer()),
  ('core_model', GradientBoostingRegressor(n_estimators=220))],
 'verbose': False,
 'imputer': SimpleImputer(),
 'core_model': GradientBoostingRegressor(n_estimators=220),
 'imputer__add_indicator': False,
 'imputer__copy': True,
 'imputer__fill_value': None,
 'imputer__missing_values': nan,
 'imputer__strategy': 'mean',
 'imputer__verbose': 0,
 'core_model__alpha': 0.9,
 'core_model__ccp_alpha': 0.0,
 'core_model__criterion': 'friedman_mse',
 'core_model__init': None,
 'core_model__learning_rate': 0.1,
 'core_model__loss': 'squared_error',
 'core_model__max_depth': 3,
 'core_model__max_features': None,
 'core_model__max_leaf_nodes': None,
 'core_model__min_impurity_decrease': 0.0,
 'core_model__min_samples_leaf': 1,
 'core_model__min_samples_split': 2,
 'core_model__min_weight_fraction_leaf': 0.0,
 'core_model__n_estimators': 220,
 'core_model__n_iter_no_change': None,
 'core_model__random_state': None,
 'core_model__subsample'

In [23]:
# este paso solo es para ver que sucedia en la funcion de save_simple_metrics_report en utils.py
for key, value in grid_search.best_estimator_.named_steps.items():
    print(f'### {key}:{value.__repr__()}'+'\n')

### imputer:SimpleImputer()

### core_model:GradientBoostingRegressor(n_estimators=220)



In [24]:
# creamos el pipeline final con el modelo y sus parametros mas optimos
estimator = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('core_model', GradientBoostingRegressor(n_estimators=220,
                                             alpha=0.9,
                                             ccp_alpha=0.0,
                                             criterion='friedman_mse',
                                             init=None,
                                             learning_rate=0.1,
                                             loss='squared_error',
                                             max_depth=3,
                                             max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_iter_no_change=None,
                                             random_state=None,
                                             subsample=1.0,
                                             tol=0.0001,
                                             validation_fraction=0.1,
                                             verbose=0,
                                             warm_start=False))
])

In [25]:
# entrenamos el modelo final
estimator.fit(X_train,y_train)

Pipeline(steps=[('imputer', SimpleImputer()),
                ('core_model', GradientBoostingRegressor(n_estimators=220))])

In [26]:
estimator.score(X_test, y_test)

0.7290239326685686

## Saving model

In [27]:
from joblib import dump

In [28]:
# guardamos el modelo final ya entrenado en formato .pkl
dump(estimator, '../model/model.pkl')

['../model/model.pkl']

In [29]:
X_train.columns

Index(['opening_gross', 'screens', 'production_budget', 'title_year',
       'aspect_ratio', 'duration', 'cast_total_facebook_likes', 'budget',
       'imdb_score'],
      dtype='object')

creamos un .txt con los diccionarios para hacer las pruebas usando FastAPI

In [83]:
import json

Xsample = X_test.sample(3, random_state=42)
Xsample = Xsample.to_dict(orient="index")

Ysample = pd.DataFrame(y_test.sample(3, random_state=42))
Ysample.columns = ["worldwide_gross"]
Ysample = Ysample.to_dict(orient="index")


with open('../api/sample_to_test_API.txt', 'w') as convert_file:
    convert_file.write("Tomamos 3 muestras aleatorias con la misma seed para hacer un test del modelo en FastAPI \n \n")
    convert_file.write("# input dict: \n")
    convert_file.write(json.dumps(Xsample) + "\n \n")
    convert_file.write("# output dict: \n")
    convert_file.write(json.dumps(Ysample))

In [88]:
import os 

app_FA_path = os.environ.get('app_FAST','C:\Users\Panda\Desktop\platzi_code\44.5_films-ml-model-deployment\api\main.py')
print(os.environ.get('app_FAST'))

SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape (1548479437.py, line 3)

In [95]:

os.environ['path_FASTAPI'] = r"C:\Users\Panda\Desktop\platzi_code\44.5_films-ml-model-deployment\api\main.py"
os.environ['path_FASTAPI']

'C:\\Users\\Panda\\Desktop\\platzi_code\\44.5_films-ml-model-deployment\\api\\main.py'

In [96]:
path_FASTAPI = os.environ.get('path_FASTAPI', r"C:\Users\Panda\Desktop\platzi_code\44.5_films-ml-model-deployment\api\main.py")
print(path_FASTAPI)

C:\Users\Panda\Desktop\platzi_code\44.5_films-ml-model-deployment\api\main.py
