## Grid Search CV

GridSearchCV es una herramienta de la libreria Sklearn.

Esta herramienta hace una busqueda exhaustiva (o por fuerza bruta) de los mejores parámetros de un modelo siguiendo alguna métrica en particular.

_**Documentacion:** https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html_

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Normalizacion
from sklearn.preprocessing import MinMaxScaler

# Train, Test
from sklearn.model_selection import train_test_split

# Metricas
from sklearn.metrics import jaccard_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [2]:
# Dataset del titanic preprocesado en clase

titanic = pd.read_csv("../Data/titanic_preprocesamiento.csv")

X = titanic.drop(["Fare-Binning", "Age-Binning", "Survived"], axis = 1)
y = titanic["Survived"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_test: {X_test.shape},  y_test: {y_test.shape}")

X_train: (621, 11), y_train: (621,)
X_test: (267, 11),  y_test: (267,)


### Modelo

In [7]:
titanic.head()

Unnamed: 0,Age,Pclass,Fare,Fare-Binning,Age-Binning,female,male,Familia,Soltero,Miss.,Mr.,Mrs.,Other,Survived
0,22.0,3.0,7.25,2.0,1.0,0,1,1,0,0,1,0,0,0.0
1,38.0,1.0,71.2833,2.0,0.0,1,0,0,1,0,0,1,0,1.0
2,26.0,3.0,7.925,2.0,1.0,1,0,0,1,1,0,0,0,1.0
3,35.0,1.0,53.1,2.0,0.0,1,0,1,0,0,0,1,0,1.0
4,35.0,3.0,8.05,2.0,0.0,0,1,1,0,0,1,0,0,0.0


In [3]:
from sklearn.neighbors import KNeighborsClassifier

In [16]:
model = KNeighborsClassifier()
model.fit(X_train, y_train)

### Predicciones

In [17]:
yhat = model.predict(X_test)

yhat

array([0., 1., 1., 0., 1., 0., 0., 0., 1., 1., 1., 1., 0., 0., 0., 0., 1.,
       0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 1., 1.,
       0., 0., 0., 0., 1., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 1.,
       0., 1., 0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 1., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 1., 0., 1., 1.,
       0., 0., 0., 1., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1.,
       1., 1., 0., 0., 1., 0., 1., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1.,
       0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0.,
       0., 1., 1., 0., 0.

In [18]:
print("Jaccard Index:", jaccard_score(y_test, yhat, average = "macro"))
print("Accuracy:"     , accuracy_score(y_test, yhat))
print("Precisión:"    , precision_score(y_test, yhat, average = "macro"))
print("Sensibilidad:" , recall_score(y_test, yhat, average = "macro"))
print("F1-score:"     , f1_score(y_test, yhat, average = "macro"))
print("ROC AUC:"      , roc_auc_score(y_test, yhat))

Jaccard Index: 0.5632746863579852
Accuracy: 0.7415730337078652
Precisión: 0.7421652421652422
Sensibilidad: 0.7085280373831775
F1-score: 0.7146596453186711
ROC AUC: 0.7085280373831775


### GridSearchCV

In [8]:
from sklearn.model_selection import GridSearchCV

In [20]:
model.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [28]:
%%time

# Modelo
model = KNeighborsClassifier()

# Parametros a iterar
params = {
    "n_neighbors" : [3, 4, 5, 6],
    "leaf_size" : [30, 40],
    "metric": ['minkowski', 'euclidean'],
}

# Metricas
scorers = ["f1_macro", "accuracy", "recall_macro"]

#GridSearchCV
grid_solver = GridSearchCV(estimator  = model     , 
                           param_grid = params    , 
                           scoring    = scorers   ,
                           cv         = 5         ,
                           refit      = "accuracy",
                           n_jobs     = -1         )

# Resultados
model_result = grid_solver.fit(X_train, y_train)

Wall time: 137 ms


In [29]:
# model_result.best_estimator es el mejor modelo que obtuvimos al iterar sobre todos los parámetros
# 

model_result.best_estimator_

In [30]:
yhat = model_result.best_estimator_.predict(X_test)

print("Jaccard Index:", jaccard_score(y_test, yhat, average = "macro"))
print("Accuracy:"     , accuracy_score(y_test, yhat))
print("Precisión:"    , precision_score(y_test, yhat, average = "macro"))
print("Sensibilidad:" , recall_score(y_test, yhat, average = "macro"))
print("F1-score:"     , f1_score(y_test, yhat, average = "macro"))
print("ROC AUC:"      , roc_auc_score(y_test, yhat))

Jaccard Index: 0.5413667757179208
Accuracy: 0.7228464419475655
Precisión: 0.7174432497013142
Sensibilidad: 0.6913551401869158
F1-score: 0.696255073176731
ROC AUC: 0.6913551401869158


In [31]:
model_result.cv_results_

{'mean_fit_time': array([0.00226088, 0.00414209, 0.00365434, 0.00340581, 0.00361595,
        0.00281487, 0.00281081, 0.00402098, 0.00311623, 0.00341558,
        0.00311003, 0.00322237, 0.00483985, 0.00342488, 0.00271993,
        0.00241132]),
 'std_fit_time': array([0.00044054, 0.00037868, 0.00048936, 0.00047215, 0.00086638,
        0.00067825, 0.00040515, 0.00131332, 0.00037289, 0.00189382,
        0.00048347, 0.0004058 , 0.00260742, 0.00048728, 0.0005201 ,
        0.00036821]),
 'mean_score_time': array([0.00919638, 0.00907049, 0.0098536 , 0.00907698, 0.00853877,
        0.00823326, 0.00884385, 0.01046228, 0.00986099, 0.00894866,
        0.01107721, 0.01056237, 0.01045952, 0.00887074, 0.00652332,
        0.00743532]),
 'std_score_time': array([0.00050453, 0.00062665, 0.001319  , 0.00031467, 0.00090703,
        0.00147392, 0.00051534, 0.00341204, 0.00349619, 0.0011672 ,
        0.00354388, 0.00307995, 0.00375875, 0.00173841, 0.0012683 ,
        0.00096778]),
 'param_leaf_size': masked

In [32]:
print(model_result.cv_results_["mean_test_recall_macro"].mean())
print(model_result.cv_results_["mean_test_f1_macro"].mean())
print(model_result.cv_results_["mean_test_accuracy"].mean())

print(model_result.best_score_)
print(model_result.best_params_)

0.6456983715482791
0.6482295227007806
0.6931919354838709
0.7004774193548388
{'leaf_size': 40, 'metric': 'minkowski', 'n_neighbors': 3}


In [37]:
%%time

from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()

params = {"n_estimators"           : [100, 200, 300], # Numero de arboles
          "criterion"              : ["gini", "entropy"], # Es la función para medir la calidad de una división/split.
          "max_depth"              : [3, 4, 5], # La profundidad máxima del árbol.
          "max_features"           : [2, 3], # El número de características (atributos) a considerar en cada split
          "max_leaf_nodes"         : [8], # Maximo de nodos hoja del arbol
          "min_impurity_decrease"  : [0.02, 0.3], # Un nodo se dividirá si esta división induce una disminución de la impureza mayor o igual a este valor.
          "min_samples_split"      : [2, 5]} # El número mínimo de muestras requeridas para llegar a nodo hoja.

scorers = {"f1_macro", "roc_auc", "f1_weighted", "accuracy", "balanced_accuracy"}

grid_solver = GridSearchCV(estimator  = model    , 
                           param_grid = params   , 
                           scoring    = scorers  ,
                           cv         = 5        ,
                           refit      = "f1_macro",
                           n_jobs     = -1        )

model_result = grid_solver.fit(X_train, y_train)

print(model_result.cv_results_["mean_test_roc_auc"].mean())
print(model_result.cv_results_["mean_test_f1_macro"].mean())
print(model_result.cv_results_["mean_test_f1_weighted"].mean())
print(model_result.cv_results_["mean_test_accuracy"].mean())
print(model_result.cv_results_["mean_test_balanced_accuracy"].mean())

print("*"*100)

print(model_result.best_score_)
print(model_result.best_params_)

0.6806917958484795
0.570736978860586
0.6283162191837905
0.7014279569892473
0.6263701671397092
****************************************************************************************************
0.7622214088993038
{'criterion': 'entropy', 'max_depth': 4, 'max_features': 3, 'max_leaf_nodes': 8, 'min_impurity_decrease': 0.02, 'min_samples_split': 5, 'n_estimators': 200}
Wall time: 25.2 s


In [38]:
model_result.cv_results_

{'mean_fit_time': array([0.18809729, 0.33030186, 0.55575085, 0.18340549, 0.3499639 ,
        0.52301455, 0.18053932, 0.34262567, 0.55240855, 0.17054987,
        0.34364824, 0.53271646, 0.1895143 , 0.37313919, 0.62016549,
        0.21028099, 0.3774363 , 0.58318548, 0.17162256, 0.32062316,
        0.53160195, 0.18587084, 0.31471548, 0.54540443, 0.17806759,
        0.371069  , 0.51803627, 0.17740655, 0.362673  , 0.50800052,
        0.16594353, 0.37048731, 0.52677832, 0.18145199, 0.36004276,
        0.53408341, 0.17579246, 0.34883404, 0.54785428, 0.17290516,
        0.39056792, 0.55181308, 0.21960711, 0.36034675, 0.59529448,
        0.17810955, 0.36256924, 0.54297132, 0.19755893, 0.33871932,
        0.53120594, 0.18115125, 0.34332881, 0.51577621, 0.18151555,
        0.34249372, 0.5257081 , 0.17166114, 0.35260835, 0.51757464,
        0.18944068, 0.37968855, 0.52071686, 0.16692195, 0.36188178,
        0.52976251, 0.176193  , 0.34410453, 0.50780358, 0.1766736 ,
        0.36243906, 0.51011233,

In [35]:
# Metricas para el GridSearchCV

from sklearn.metrics import SCORERS

sorted(SCORERS.keys())

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'matthews_corrcoef',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_