## Optimización de hiperparámetros

### Recapitulando

In [10]:
import pandas as pd
#import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
from sklearn import tree

from sklearn.tree import DecisionTreeClassifier

from skopt import gp_minimize
from skopt.space import Categorical, Dimension, Integer, Real  # noqa

#from IPython.display import Image  

In [2]:
# Folder holding the CSV inputs, relative to the notebook's working directory.
directorio_data = '../data/'

# Training/validation features and labels.
# NOTE(review): 'traininig' is spelled exactly as in the original paths;
# presumably it matches the files on disk -- confirm before "fixing" it.
x_training_file = f'{directorio_data}x_traininig.csv'
y_training_file = f'{directorio_data}y_traininig.csv'

# Hold-out test features and labels.
x_testing_file = f'{directorio_data}x_testing.csv'
y_testing_file = f'{directorio_data}y_testing.csv'

In [3]:
# Load the combined training+validation set (features, labels) in one pass.
df_x_training_validacion, df_y_training_validacion = map(
    pd.read_csv, (x_training_file, y_training_file)
)

# Load the hold-out test set (features, labels).
X_testing, Y_testing = map(pd.read_csv, (x_testing_file, y_testing_file))

### Separamos nuestro dataset training-validación en training y validación

In [4]:
# Fraction of the training+validation rows reserved for validation.
test_size = 0.3
# Seed shared by the split and by every tree fitted later in the notebook.
random_state = 661

# Stratify on the label frame so both partitions keep the same class balance.
X_training, X_validacion, Y_training, Y_validacion = train_test_split(
    df_x_training_validacion,
    df_y_training_validacion,
    stratify=df_y_training_validacion,
    random_state=random_state,
    test_size=test_size,
)

### Dejamos solo las columnas que queremos

In [5]:
# Feature columns fed to the model: the five review scores plus beer_abv.
columnas = [
    "review_overall", "review_aroma", "review_appearance",
    "review_palate", "review_taste",
    "beer_abv",
]

In [6]:
# Keep only the selected feature columns in every partition.
X_training = X_training.loc[:, columnas]
X_validacion = X_validacion.loc[:, columnas]
X_testing = X_testing.loc[:, columnas]

## Optimización con grid search

In [7]:
# Decision threshold on P(SI): a row is predicted 'SI' when its estimated
# probability exceeds this value.
# NOTE(review): 0.1111... is approximately 500 / (4000 + 500), which looks like
# the break-even probability for the payoff used below -- confirm.
punto_de_corte = 0.11111111

# Candidate tree depths explored by the grid search (1..12).
max_depth_grid_search = list(range(1, 13))

# Loop-invariant: every candidate tree uses the same split criterion.
criterion = 'gini'

for max_depth in max_depth_grid_search:
    # Fit a tree with the candidate depth on the training split.
    clf = DecisionTreeClassifier(
        criterion=criterion,
        max_depth=max_depth,
        random_state=random_state
    )
    _ = clf.fit(X_training, Y_training)

    # Score the validation split. The probability columns are named after
    # clf.classes_ (the order predict_proba uses) instead of a hard-coded
    # ['NO_prob', 'SI_prob'], so 'SI_prob' always holds P(SI).
    Y_validacion_pred_proba = pd.DataFrame(
        clf.predict_proba(X_validacion),
        columns=[f'{clase}_prob' for clase in clf.classes_]
    )
    Y_validacion_pred_proba['prediccion'] = 'NO'
    Y_validacion_pred_proba.loc[Y_validacion_pred_proba['SI_prob'] > punto_de_corte, 'prediccion'] = 'SI'

    # With labels=['SI', 'NO'] the matrix is [[TP, FN], [FP, TN]], so a
    # row-major ravel yields the four counts in that order.
    cm = confusion_matrix(Y_validacion, Y_validacion_pred_proba['prediccion'], labels=['SI', 'NO'])
    true_positives, false_negatives, false_positives, true_negatives = cm.ravel()

    # Business metric: +4000 per true positive, -500 per false positive,
    # normalised by the number of actual positives (TP + FN).
    metrica_negocio = ((4000*true_positives) - (500*false_positives))/(true_positives + false_negatives)
    print(max_depth, metrica_negocio)
    

1 0.0
2 0.0
3 16.736401673640167
4 43.93305439330544
5 334.72803347280336
6 345.1882845188284
7 207.11297071129707
8 309.6234309623431
9 303.34728033472805
10 372.3849372384937
11 364.01673640167365
12 309.6234309623431


## Optimización con grid search, dos parámetros

In [8]:
# Decision threshold on P(SI): a row is predicted 'SI' when its estimated
# probability exceeds this value.
# NOTE(review): 0.1111... is approximately 500 / (4000 + 500), which looks like
# the break-even probability for the payoff used below -- confirm.
punto_de_corte = 0.11111111

# Candidate values for each hyper-parameter (1..12 for both).
max_depth_grid_search = list(range(1, 13))
min_samples_leaf_search = list(range(1, 13))

# Loop-invariant: every candidate tree uses the same split criterion.
criterion = 'gini'

# Exhaustive grid: min_samples_leaf varies in the outer loop, max_depth in the
# inner one, which fixes the order of the printed rows.
for min_samples_leaf in min_samples_leaf_search:
    for max_depth in max_depth_grid_search:
        # Fit a tree with the candidate hyper-parameters on the training split.
        clf = DecisionTreeClassifier(
            criterion=criterion,
            max_depth=max_depth,
            min_samples_leaf=min_samples_leaf,
            random_state=random_state
        )
        _ = clf.fit(X_training, Y_training)

        # Score the validation split. The probability columns are named after
        # clf.classes_ (the order predict_proba uses) instead of a hard-coded
        # ['NO_prob', 'SI_prob'], so 'SI_prob' always holds P(SI).
        Y_validacion_pred_proba = pd.DataFrame(
            clf.predict_proba(X_validacion),
            columns=[f'{clase}_prob' for clase in clf.classes_]
        )
        Y_validacion_pred_proba['prediccion'] = 'NO'
        Y_validacion_pred_proba.loc[Y_validacion_pred_proba['SI_prob'] > punto_de_corte, 'prediccion'] = 'SI'

        # With labels=['SI', 'NO'] the matrix is [[TP, FN], [FP, TN]], so a
        # row-major ravel yields the four counts in that order.
        cm = confusion_matrix(Y_validacion, Y_validacion_pred_proba['prediccion'], labels=['SI', 'NO'])
        true_positives, false_negatives, false_positives, true_negatives = cm.ravel()

        # Business metric: +4000 per true positive, -500 per false positive,
        # normalised by the number of actual positives (TP + FN).
        metrica_negocio = ((4000*true_positives) - (500*false_positives))/(true_positives + false_negatives)
        # Printed as: max_depth, min_samples_leaf, metric.
        print(max_depth, min_samples_leaf, metrica_negocio)
    

1 1 0.0
2 1 0.0
3 1 16.736401673640167
4 1 43.93305439330544
5 1 334.72803347280336
6 1 345.1882845188284
7 1 207.11297071129707
8 1 309.6234309623431
9 1 303.34728033472805
10 1 372.3849372384937
11 1 364.01673640167365
12 1 309.6234309623431
1 2 0.0
2 2 0.0
3 2 16.736401673640167
4 2 43.93305439330544
5 2 311.7154811715481
6 2 343.0962343096234
7 2 177.82426778242677
8 2 288.7029288702929
9 2 269.8744769874477
10 2 223.84937238493723
11 2 269.8744769874477
12 2 198.744769874477
1 3 0.0
2 3 0.0
3 3 16.736401673640167
4 3 43.93305439330544
5 3 311.7154811715481
6 3 364.01673640167365
7 3 232.21757322175733
8 3 269.8744769874477
9 3 278.2426778242678
10 3 152.71966527196653
11 3 207.11297071129707
12 3 274.05857740585776
1 4 0.0
2 4 0.0
3 4 16.736401673640167
4 4 43.93305439330544
5 4 311.7154811715481
6 4 393.30543933054395
7 4 288.7029288702929
8 4 328.45188284518827
9 4 341.0041841004184
10 4 255.23012552301256
11 4 244.76987447698744
12 4 336.8200836820084
1 5 0.0
2 5 0.0
3 5 16.736

### Otras estrategias - Optimización bayesiana

### Probamos con testing

In [40]:
# Same path configuration as the setup cell near the top of the notebook,
# repeated so the test-set section can be run from here.
directorio_data = '../data/'

# Training/validation features and labels.
# NOTE(review): 'traininig' is spelled exactly as in the original paths;
# presumably it matches the files on disk -- confirm before "fixing" it.
x_training_file = f'{directorio_data}x_traininig.csv'
y_training_file = f'{directorio_data}y_traininig.csv'

# Hold-out test features and labels.
x_testing_file = f'{directorio_data}x_testing.csv'
y_testing_file = f'{directorio_data}y_testing.csv'

In [41]:
# Reload the full training+validation set (features, labels) from disk.
df_x_training_validacion, df_y_training_validacion = map(
    pd.read_csv, (x_training_file, y_training_file)
)

# Reload the hold-out test set; this replaces the column-filtered X_testing
# produced earlier in the notebook with the full frame again.
X_testing, Y_testing = map(pd.read_csv, (x_testing_file, y_testing_file))

In [42]:
# Feature columns fed to the final model: the five review scores plus beer_abv
# (same list as the one used for the grid searches).
columnas = [
    "review_overall", "review_aroma", "review_appearance",
    "review_palate", "review_taste",
    "beer_abv",
]

In [43]:
# The final model is trained on training+validation together, so select the
# feature columns from the unsplit frame and from the test set.
X_training_validacion = df_x_training_validacion.loc[:, columnas]
X_testing = X_testing.loc[:, columnas]

In [47]:
# Hyper-parameters of the final model.
# NOTE(review): in the recorded one-parameter grid-search output, max_depth=10
# scored highest (372.38) while max_depth=6 scored 345.19; confirm that 6 is
# the intended choice.
criterion = 'gini'
max_depth = 6

# `random_state` is the seed defined in the train/validation split cell, so
# that cell must have been executed before this one.
clf = DecisionTreeClassifier(
    max_depth=max_depth,
    criterion=criterion,
    random_state=random_state,
)

# Fit on the whole training+validation set; `_ =` suppresses the estimator repr.
_ = clf.fit(X_training_validacion, df_y_training_validacion)

In [48]:
# Decision threshold on P(SI): a row is predicted 'SI' when its estimated
# probability exceeds this value (same value the grid-search cells use).
umbral_de_corte = 0.11111111

# Score the hold-out test set. The probability columns are named after
# clf.classes_ (the order predict_proba uses) instead of a hard-coded
# ['NO_prob', 'SI_prob'], so 'SI_prob' always holds P(SI).
Y_testing_pred_proba = pd.DataFrame(
    clf.predict_proba(X_testing),
    columns=[f'{clase}_prob' for clase in clf.classes_]
)

Y_testing_pred_proba['prediccion'] = 'NO'
Y_testing_pred_proba.loc[Y_testing_pred_proba['SI_prob'] > umbral_de_corte, 'prediccion'] = 'SI'

# With labels=['SI', 'NO'] the matrix is [[TP, FN], [FP, TN]], so a row-major
# ravel yields the four counts in that order.
cm = confusion_matrix(Y_testing, Y_testing_pred_proba['prediccion'], labels=['SI', 'NO'])
true_positives, false_negatives, false_positives, true_negatives = cm.ravel()

# Business metric: +4000 per true positive, -500 per false positive,
# normalised by the number of actual positives (TP + FN).
metrica_negocio = ((4000*true_positives) - (500*false_positives))/(true_positives + false_negatives)
metrica_negocio

365.0

### Optimización con Grid search: max_depth + min_samples_split