In [1]:
import warnings
warnings.simplefilter("ignore")

In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Decision Trees

In [3]:
from sklearn.tree import DecisionTreeRegressor

# Regression tree capped at a depth of 2.
model = DecisionTreeRegressor(
    max_depth=2,
)

In [4]:
from sklearn.model_selection import train_test_split

# Read the CSV once, then separate the target column from the features.
dataset = pd.read_csv('../datasets/X_opening.csv')
X = dataset.drop(columns=['worldwide_gross'])
y = dataset['worldwide_gross']

In [5]:
# Hold out a test split; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1
)

In [8]:
# Train the tree on the training split; the call is the cell's last
# expression, so its return value is displayed.
model.fit(X=X_train, y=y_train)

DecisionTreeRegressor(max_depth=2)

## Ensembles

In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate

# Random forest with 200 trees. The seed is fixed so that re-running this
# cell reproduces the same scores; an unseeded forest gives different
# numbers on every run.
forest = RandomForestRegressor(n_estimators=200, random_state=1)

# 5-fold cross-validation scored with R². Train scores are requested as well
# so the gap between training and validation performance can be inspected.
results = cross_validate(
    forest,
    X,
    y,
    cv=5,
    scoring='r2',
    return_train_score=True,
)

In [17]:
# Mean R² over the folds: training score on the first line, validation
# score on the second.
train_scores = results['train_score']
test_scores = results['test_score']
print(np.mean(train_scores), np.mean(test_scores), sep='\n')

0.9659619686482206
0.5099814936104453


¡Mejor resultado que Lasso! Ya no tenemos bias y obtenemos un mejor score R². Sin embargo, hay una diferencia importante entre el score de entrenamiento y el de test (overfitting).

## Gradient Boosted Trees

In [20]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with default hyperparameters. The seed matches the one
# used by the tuned GradientBoostingRegressor in the grid-search section, so
# this baseline is repeatable from run to run.
ensemble = GradientBoostingRegressor(random_state=10)

# Same evaluation protocol as the random forest: 5-fold CV, R² scoring,
# train scores included.
results = cross_validate(
    ensemble,
    X,
    y,
    cv=5,
    scoring='r2',
    return_train_score=True,
)

In [21]:
# Mean R² over the folds: training score on the first line, validation
# score on the second.
train_scores = results['train_score']
test_scores = results['test_score']
print(np.mean(train_scores), np.mean(test_scores), sep='\n')

0.9125979599345971
0.5300002606401091


## Optimización de hiperparámetros

**Grid Search**

In [22]:
from sklearn.model_selection import train_test_split

# Same call as the split made earlier in the notebook (identical arguments
# and seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1
)

In [23]:
from sklearn.model_selection import GridSearchCV

# Candidate ensemble sizes for the grid search: 20, 40, ..., 500.
param_test1 = dict(n_estimators=range(20, 501, 20))

In [24]:
# Expand the range so the individual candidate values are displayed.
[*param_test1['n_estimators']]

[20,
 40,
 60,
 80,
 100,
 120,
 140,
 160,
 180,
 200,
 220,
 240,
 260,
 280,
 300,
 320,
 340,
 360,
 380,
 400,
 420,
 440,
 460,
 480,
 500]

In [25]:
# Base gradient-boosting model for the grid search. n_estimators is not set
# here because it is the parameter the search varies.
estimator = GradientBoostingRegressor(
    random_state=10,
    learning_rate=0.1,
    subsample=0.8,
    max_depth=8,
    max_features='sqrt',
    min_samples_split=500,
    min_samples_leaf=50,
)

In [26]:
# Grid search over param_test1 using 5-fold CV, with candidates scored by R².
gsearch1 = GridSearchCV(
    estimator=estimator,
    param_grid=param_test1,
    cv=5,
    scoring='r2',
)

In [27]:
# Run the search on the training split only; the test split stays unseen.
gsearch1.fit(X=X_train, y=y_train)

GridSearchCV(cv=5,
             estimator=GradientBoostingRegressor(max_depth=8,
                                                 max_features='sqrt',
                                                 min_samples_leaf=50,
                                                 min_samples_split=500,
                                                 random_state=10,
                                                 subsample=0.8),
             param_grid={'n_estimators': range(20, 501, 20)}, scoring='r2')

In [29]:
# One (mean test score, std of test score, params) tuple per grid candidate.
[
    (mean_score, std_score, candidate)
    for mean_score, std_score, candidate in zip(
        gsearch1.cv_results_['mean_test_score'],
        gsearch1.cv_results_['std_test_score'],
        gsearch1.cv_results_['params'],
    )
]

[(0.6561729887295528, 0.06228182533840906, {'n_estimators': 20}),
 (0.723914140672915, 0.06520751792639172, {'n_estimators': 40}),
 (0.737493203222167, 0.06223746101758368, {'n_estimators': 60}),
 (0.7426252345459441, 0.06362534318686655, {'n_estimators': 80}),
 (0.744340449143776, 0.06243723071630083, {'n_estimators': 100}),
 (0.7478712248063208, 0.061180346298888064, {'n_estimators': 120}),
 (0.7516553944628435, 0.06213821946414476, {'n_estimators': 140}),
 (0.753065146080784, 0.06199268315431224, {'n_estimators': 160}),
 (0.7555079911892786, 0.06133057026214709, {'n_estimators': 180}),
 (0.7554100485032091, 0.06170625761578737, {'n_estimators': 200}),
 (0.7555770454950556, 0.06145779173354153, {'n_estimators': 220}),
 (0.755655849434276, 0.061696089417870664, {'n_estimators': 240}),
 (0.7555065586937293, 0.060380280696370005, {'n_estimators': 260}),
 (0.7557298196602917, 0.05938820577212784, {'n_estimators': 280}),
 (0.7563590519623536, 0.05951176413052686, {'n_estimators': 300}),
 

In [32]:
# Cross-validate the best estimator found by the grid search on the training
# split. cv and scoring are spelled out so this call uses the same protocol
# as the other cross_validate calls in the notebook (5 folds, R²) instead of
# relying on library defaults.
final_results = cross_validate(
    gsearch1.best_estimator_,
    X_train,
    y_train,
    cv=5,
    scoring='r2',
    return_train_score=True,
)

In [33]:
# Mean score over the folds: training score on the first line, validation
# score on the second.
train_scores = final_results['train_score']
test_scores = final_results['test_score']
print(np.mean(train_scores), np.mean(test_scores), sep='\n')

0.8500239677331072
0.7582797554708545


In [34]:
# Final model: same hyperparameters as the grid-search base estimator, plus a
# fixed number of boosting stages.
# NOTE(review): n_estimators=240 is hard-coded - confirm it agrees with
# gsearch1.best_params_ before relying on this model.
estimator = GradientBoostingRegressor(
    n_estimators=240,
    random_state=10,
    learning_rate=0.1,
    subsample=0.8,
    max_depth=8,
    max_features='sqrt',
    min_samples_split=500,
    min_samples_leaf=50,
)

In [35]:
# Train the final model on the training split.
estimator.fit(X=X_train, y=y_train)

GradientBoostingRegressor(max_depth=8, max_features='sqrt', min_samples_leaf=50,
                          min_samples_split=500, n_estimators=240,
                          random_state=10, subsample=0.8)

In [36]:
# Score the final model on the held-out test split.
estimator.score(X=X_test, y=y_test)

0.8119141232433791

## Guardando el modelo

In [37]:
import pickle

In [38]:
# Serialize the fitted model to disk. The context manager guarantees the file
# handle is flushed and closed, even if pickling raises.
with open('../model/model.pkl', 'wb') as model_file:
    pickle.dump(estimator, model_file)