# 4.2 - Ajuste

### GridSearching  -  CrossValidation


![grid](images/grid.png)

![cv](images/cv.ppm)

In [None]:
import warnings
warnings.simplefilter('ignore')

import numpy as np

from sklearn.datasets import make_circles, load_boston
from sklearn.model_selection import train_test_split as tts

from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.ensemble import RandomForestClassifier as RFC

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
def grid(modelo, param, cv=5):  # brute-force search
    """Tune ``modelo`` over ``param`` with an exhaustive cross-validated search.

    Fits a ``GridSearchCV`` on the notebook-level ``X_train``/``y_train``,
    prints the test score, the train score, the best parameter combination
    and the best cross-validation score, and returns the winning estimator.

    Args:
        modelo: Estimator to tune.
        param: Parameter grid forwarded to ``GridSearchCV``.
        cv: Cross-validation setting forwarded to ``GridSearchCV``.

    Returns:
        The search's ``best_estimator_``.
    """
    # ``iid`` is intentionally not passed: the argument was removed from
    # GridSearchCV in scikit-learn 0.24 and raises TypeError there.
    # The local is called ``search`` so it does not shadow this function.
    search = GridSearchCV(modelo,
                          param,
                          cv=cv,  # cross-validation
                          return_train_score=True,
                          n_jobs=-1)

    search.fit(X_train, y_train)

    print('Acierto test: {:.2f}'.format(search.score(X_test, y_test)))
    print('Acierto train: {:.2f}'.format(search.score(X_train, y_train)))
    print('Mejores parametros: {}'.format(search.best_params_))
    print('Mejor Acierto cv: {:.2f}'.format(search.best_score_))

    # With the default refit=True the best estimator has already been
    # refitted on the training data, so it is returned as-is instead of
    # being fitted a second time (which would also yield a model different
    # from the one whose scores were just printed).
    return search.best_estimator_

In [None]:
# Load the feature matrix and the target vector with a single loader call.
X, y = load_boston(return_X_y=True)

In [None]:
# Hold out 20% of the rows for testing. The split is seeded (same seed as the
# later diamonds split in this notebook) so the scores below are reproducible
# from run to run.
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=42)

In [None]:
# Baseline: a random forest regressor with default hyperparameters.
rfr = RFR()
rfr.fit(X_train, y_train)

# Score the baseline on both splits and show the pair as the cell output.
train_score, test_score = (
    rfr.score(X_train, y_train),
    rfr.score(X_test, y_test),
)

(train_score, test_score)

In [None]:
# Candidate values tried for each hyperparameter in the grid search.
params = {
    'max_leaf_nodes': [25, 30, 35],
    'n_estimators': [80, 100, 120],
}

In [None]:
# Run the grid search for the baseline forest; the returned estimator is only
# echoed as the cell output, not stored.
grid(rfr, params)

In [None]:
# Run the grid search and keep the estimator it returns.
modelo=grid(rfr, params)

In [None]:
# Show the first 10 predictions for the test split.
modelo.predict(X_test)[:10]

### Random GridSearching

In [None]:
# Replace X and y with a seeded (random_state=1) synthetic make_circles dataset.
X, y = make_circles(noise=0.2, factor=0.5, random_state=1)

In [None]:
# Peek at the first 10 rows of X.
X[:10]

In [None]:
# Peek at the first 10 entries of y.
y[:10]

In [None]:
# Candidate values for each hyperparameter explored by the randomized search.

# Ten evenly spaced forest sizes from 200 to 2000 trees.
n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]

# 'auto' was dropped: for a random forest classifier it was an alias of
# 'sqrt' (so the search evaluated the same setting twice) and recent
# scikit-learn releases reject it. None means "consider every feature".
max_features = ['sqrt', None]

min_samples_split = [2, 5, 10]

bootstrap = [True, False]

In [None]:
# Search space for the randomized search: one list of candidates per
# hyperparameter name.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    min_samples_split=min_samples_split,
    bootstrap=bootstrap,
)

In [None]:
# Random forest classifier with default hyperparameters; it is the estimator
# handed to the randomized search below.
rfc=RFC()

In [None]:
# Randomized hyperparameter search: instead of trying every combination,
# n_iter combinations are drawn from random_grid.

rf_random = RandomizedSearchCV(
    estimator=rfc,
    param_distributions=random_grid,
    scoring='roc_auc',  # metric used to rank the candidates
    n_iter=100,         # number of sampled combinations
    cv=3,               # cross-validation setting
    n_jobs=-1,
    verbose=10,
)

In [None]:
# Run the randomized search on the whole of X and y (no held-out split is
# made for this dataset in the notebook).
rf_random.fit(X, y)

In [None]:
# Show the parameter combination selected by the search.
rf_random.best_params_

In [None]:
# Show the score of the selected combination (scoring='roc_auc' was requested).
rf_random.best_score_

In [None]:
#rf_random.cv_results_

In [None]:
# With the default refit=True the search has already refitted the best
# estimator on the full X, y it was given, so fitting it again here only
# repeated the training. Show the fitted estimator directly.
rf_random.best_estimator_

### HyperOpt (GridSearching bayesiano)

In [None]:
#!pip install hyperopt

import pandas as pd
from pandas.plotting import scatter_matrix

import xgboost as xgb

from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK
from hyperopt.pyll import scope as ho_scope
from hyperopt.pyll.stochastic import sample as ho_sample

from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split as tts

In [None]:
# Read the diamonds training set and discard every row with a missing value.
df = pd.read_csv('../data/diamonds_train.csv').dropna()

df.head()

In [None]:
# Draw a 15x15 scatter-matrix figure of the dataframe; the trailing semicolon
# keeps the notebook from echoing the returned object.
scatter_matrix(df, figsize=(15, 15));

In [None]:
# Separate the target column (price) from the predictor columns.
y = df['price']
X = df.drop(columns='price')

In [None]:
# Numeric codes for the categorical columns.

# clarity and cut receive consecutive integers (starting at 0) in the order
# the categories are listed.
clarity = {
    grade: code
    for code, grade in enumerate(
        ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']
    )
}


cut = {
    grade: code
    for code, grade in enumerate(
        ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
    )
}


# color uses hand-picked, non-consecutive codes.
color = {
    'J': 1,
    'I': 5,
    'H': 15,
    'G': 30,
    'F': 40,
    'E': 60,
    'D': 80,
}

In [None]:
def label(s, dic):
    """Return the code that ``dic`` assigns to the category ``s``.

    Args:
        s: Category value to encode.
        dic: Mapping from category value to code.

    Returns:
        The entry of ``dic`` stored under ``s``.

    Raises:
        KeyError: If ``s`` is not a key of ``dic``.
    """
    encoded = dic[s]
    return encoded

In [None]:
# Replace each categorical column with its numeric code. ``label`` receives
# the cell value as first argument and the mapping through ``args``.
X['clarity'] = X['clarity'].apply(label, args=(clarity,))

X['cut'] = X['cut'].apply(label, args=(cut,))

X['color'] = X['color'].apply(label, args=(color,))

X.head()

In [None]:
# Hold out 20% of the rows for testing; random_state=42 makes the split
# repeatable. This rebinds the X_train/X_test/y_train/y_test names.
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=42)

In [None]:
# Baseline: XGBoost regressor with default hyperparameters.
modelo = xgb.XGBRegressor()
modelo.fit(X_train, y_train)

y_pred = modelo.predict(X_test)

# RMSE computed as the square root of the MSE, like the other cells of this
# notebook. The ``squared=False`` argument of mean_squared_error is
# deprecated and removed in recent scikit-learn releases.
mse(y_test, y_pred) ** 0.5  # rmse

In [None]:
#help(modelo)

In [None]:
# Hyperparameter search space (the priors of the optimisation).
# Each entry maps an XGBRegressor argument name to a hyperopt distribution.
# The first argument of every hp.* call is the hyperopt label; for the
# 'x_'-prefixed entries it differs from the dictionary key.
space = dict(
    n_estimators=hp.quniform('n_estimators', 10, 1000, 25),
    learning_rate=hp.uniform('learning_rate', 0.0001, 1.0),
    max_depth=hp.quniform('x_max_depth', 4, 16, 1),
    min_child_weight=hp.quniform('x_min_child', 1, 10, 1),
    subsample=hp.uniform('x_subsample', 0.7, 1),
    gamma=hp.uniform('x_gamma', 0.1, 0.5),
    reg_lambda=hp.uniform('x_reg_lambda', 0, 1),
)


In [None]:
def objetivo(space):
    """Objective for hyperopt: train an XGBoost regressor and report its RMSE.

    Args:
        space: Mapping holding one sampled value for each of
            ``n_estimators``, ``learning_rate``, ``max_depth``,
            ``min_child_weight``, ``subsample``, ``gamma`` and ``reg_lambda``.

    Returns:
        dict with ``loss`` (RMSE of the predictions on the notebook-level
        ``X_test``/``y_test``) and ``status`` (``STATUS_OK``).
    """
    # NOTE(review): the loss is measured on X_test, so the test split takes
    # part in model selection; consider a separate validation split.
    modelo = xgb.XGBRegressor(
        n_estimators=int(space['n_estimators']),  # must be an integer
        learning_rate=space['learning_rate'],
        max_depth=int(space['max_depth']),        # must be an integer
        min_child_weight=space['min_child_weight'],
        subsample=space['subsample'],
        gamma=space['gamma'],
        reg_lambda=space['reg_lambda'],
        objective='reg:squarederror',
        # The metric is configured on the estimator: passing ``eval_metric``
        # to fit() is deprecated since XGBoost 1.6 and rejected by later
        # releases.
        eval_metric='rmse',
    )

    eval_set = [(X_train, y_train), (X_test, y_test)]

    modelo.fit(X_train, y_train, eval_set=eval_set, verbose=False)

    y_pred = modelo.predict(X_test)

    rmse = mse(y_test, y_pred) ** 0.5

    return {'loss': rmse, 'status': STATUS_OK}

In [None]:
# Trials object handed to fmin so the evaluations of this run are recorded.
intentos = Trials()

# Minimise objetivo over the search space with the TPE algorithm.
# The search-space dictionary defined in this notebook is named ``space``;
# the previous ``espacio`` was never defined and raised NameError.
best = fmin(fn=objetivo,
            space=space,
            algo=tpe.suggest,
            max_evals=25,
            trials=intentos)

best

In [None]:
# fmin returns its result keyed by the hyperopt labels ('x_max_depth',
# 'x_min_child', ...), not by the keys of ``space``, so indexing ``best``
# with 'max_depth' etc. raised KeyError. space_eval maps the result back
# onto the structure of ``space``, whose keys are the estimator arguments.
best_params = space_eval(space, best)

# Rebuild the regressor with the best hyperparameters found.
modelo = xgb.XGBRegressor(
        n_estimators=int(best_params['n_estimators']),
        learning_rate=best_params['learning_rate'],
        max_depth=int(best_params['max_depth']),
        min_child_weight=best_params['min_child_weight'],
        subsample=best_params['subsample'],
        gamma=best_params['gamma'],
        reg_lambda=best_params['reg_lambda'],
        objective='reg:squarederror'
    )

In [None]:
# Train the model on the training split and predict the held-out rows.
modelo.fit(X_train, y_train)
y_pred = modelo.predict(X_test)

# RMSE on the test split (square root of the mean squared error).
mse(y_test, y_pred) ** 0.5

In [None]:
# Reference run: an XGBoost regressor with default hyperparameters, trained
# and scored on the same split.
m = xgb.XGBRegressor()
m.fit(X_train, y_train)
y_pred = m.predict(X_test)

# RMSE on the test split (square root of the mean squared error).
mse(y_test, y_pred) ** 0.5