https://www.kaggle.com/datasets/thedevastator/1-5-million-beer-reviews-from-beer-advocate?select=beer_reviews.csv

# Adecuando el ambiente

## Instalando paquetes necesarios

In [None]:
!pip install pandas

In [None]:
!pip install seaborn

In [None]:
!pip install scikit-learn

In [None]:
!pip install pydotplus

In [None]:
!pip install mlflow

# Iniciando la aventura

## Importando paquetes necesarios

In [18]:
import pandas as pd
import numpy as np
import pydotplus

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
from sklearn.tree import export_graphviz

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

import mlflow
from mlflow import log_metric, log_param, log_params, log_artifacts

from IPython.display import Image  

import alejandro_model_utils as amu
import alejandro_graphviz_tools as agt
import alejandro_tree_utils as atu

# Preparando nuestro primer modelo

## Cargamos nuestros dataset

In [2]:
file_cervezas_unicas_resumen_valores_con_clase_training = 'dataset/cervezas_unicas_resumen_valores_con_clase_train.csv'
file_cervezas_unicas_resumen_valores_con_clase_validacion = 'dataset/cervezas_unicas_resumen_valores_con_clase_validation.csv'

campos = ['beer_abv', 'review_aroma', 'review_appearance', 'review_taste', 'review_overall', 'review_palate', 'es_popular']

df_training_testing = pd.read_csv(file_cervezas_unicas_resumen_valores_con_clase_training)
df_validacion = pd.read_csv(file_cervezas_unicas_resumen_valores_con_clase_validacion)

df_training_testing = df_training_testing.loc[:, campos]
df_validacion = df_validacion.loc[:, campos]

df_training_testing.head()

Unnamed: 0,beer_abv,review_aroma,review_appearance,review_taste,review_overall,review_palate,es_popular
0,8.2,4.5,4.25,4.5,4.5,4.5,NO
1,,3.375,3.5,3.25,3.625,3.25,NO
2,4.2,3.391304,3.5,3.565217,3.76087,3.434783,NO
3,,3.5,3.5,4.0,3.5,3.5,NO
4,5.0,2.75,3.5,3.0,3.0,3.5,NO


## Separamos en training y testing

In [9]:
random_state = 1
test_size = 0.3

atributos = ['beer_abv', 'review_aroma', 'review_appearance', 'review_taste', 'review_overall', 'review_palate']
clase = ['es_popular']

X_df_training_testing = df_training_testing.loc[:, atributos]
Y_df_training_testing = df_training_testing.loc[:, clase]

### Tambien abrimos en X e Y el dataset de validacion

In [10]:
X_validacion = df_validacion.loc[:, atributos]
Y_validacion = df_validacion.loc[:, clase]

## Generamos nuestro primer modelo

### El algoritmo no acepta valores nulos de variables

In [14]:
X_df_training_testing = amu.trasform_dataframe(X_df_training_testing, [amu.transform_NaN_beerabv])
X_validacion = amu.trasform_dataframe(X_validacion, [amu.transform_NaN_beerabv])

In [None]:
def f(umbral_corte):
    resultado = amu.calculate_metrics(
        y_true=y_test['es_popular'].values, 
        y_pred = amu.evaluar_corte(df_y_test_pred_proba['NO_prob'].values, umbral_corte, 'SI', 'NO'), 
        labels = ['NO', 'SI'])
    print([umbral_corte, resultado['metrica negocio']])
    return -resultado['metrica negocio']

In [38]:
def cv_con_umbral_de_corte(params):
    umbral_de_corte = params[0]
    max_depth = params[1]
    X = X_df_training_testing
    y = Y_df_training_testing
    #criterion='entropy'
    criterion='gini'
    #criterion='log_loss'
    clf = DecisionTreeClassifier(
                criterion = criterion,
                max_depth=max_depth
            )

    # 5-fold cross validation
    cv = KFold(n_splits=5, shuffle=True, random_state=42)

    def all_metrics(clf, X, y):
        y_pred = clf.predict_proba(X)
        df_y_pred_proba = pd.DataFrame(y_pred, columns = ['NO_prob', 'SI_prob'])
        df_y_pred_proba['prediccion'] = np.where(df_y_pred_proba['NO_prob'].values <= umbral_corte, 'SI', 'NO')

        metricas = amu.calculate_metrics(
            y_true = y['es_popular'].values, 
            y_pred = df_y_pred_proba['prediccion'].values, 
            labels=['NO', 'SI']
        )

        return metricas

    scores = cross_validate(clf, X, y, scoring=all_metrics, cv=cv)
    resultados = {
        'true negatives':          round(np.mean(scores['test_true negatives']), 2),
        'true positives':          round(np.mean(scores['test_true positives']),2),
        'false positives':         round(np.mean(scores['test_false positives']),2),
        'false negatives':         round(np.mean(scores['test_false negatives']),2),
        'accuracy':                round(np.mean(scores['test_accuracy']),2),
        'recall':                  round(np.mean(scores['test_recall']),2),
        'precision':               round(np.mean(scores['test_precision']),2),
        'specificity':             round(np.mean(scores['test_specificity']),2),
        'tasa falso positivo':     round(np.mean(scores['test_tasa falso positivo']),2),
        'tasa falso negativo':     round(np.mean(scores['test_tasa falso negativo']),2),
        'f1':                      round(np.mean(scores['test_f1']),2),
        'metrica negocio':         round(np.mean(scores['test_metrica negocio']),2),
        'metrica optima negocio':  round(np.mean(scores['test_metrica optima negocio']),2)
    }
    print([umbral_de_corte, max_depth, resultados['metrica negocio']])
    return resultados['metrica negocio']

resultados = cv_con_umbral_de_corte([0.9, 4])
resultados

[0.9, 4, 78.29]


78.29

In [54]:
import warnings
warnings.filterwarnings('ignore')

from skopt import gp_minimize
from skopt.acquisition import gaussian_lcb
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern
from skopt.space import Categorical, Dimension, Integer, Real  # noqa

space  = [
    Real(0, 1, name='umbral_corte'),
    Integer(3, 6, name = 'max_depth')
]

res = gp_minimize(cv_con_umbral_de_corte,                  # the function to minimize
                  space,     # the bounds on each dimension of x
                  n_initial_points = 10,
                  #acq="LCB",          # the acquisition function (optional)
                  #base_estimator=gp,  # a GP estimator (optional)
                  n_calls=50,         # the number of evaluations of f including at x0
                  n_random_starts=10,  # the number of random initialization points
                  random_state=777)

[0.6464497544046984, 4, 78.29]
[0.636254291682395, 4, 78.29]
[0.5635290438458104, 5, 82.53]
[0.756333230201127, 6, 90.21]
[0.3879867542090215, 4, 78.29]
[0.3875795161700688, 5, 82.54]
[0.38639825838312913, 6, 90.2]
[0.9348256858175489, 4, 78.29]
[0.9296758886594835, 5, 82.53]
[0.11971063444956678, 5, 82.53]
[0.9406374817405783, 3, 63.29]
[0.0, 3, 63.29]
[1.0, 3, 63.29]
[0.0, 3, 63.29]
[1.0, 3, 63.29]
[0.0, 3, 63.29]
[1.0, 3, 63.29]
[0.0, 3, 63.29]
[1.0, 3, 63.29]
[1.9011474657881781e-06, 3, 63.29]
[1.0, 3, 63.29]
[0.0, 3, 63.29]
[1.0, 3, 63.29]
[0.0, 3, 63.29]
[1.0, 3, 63.29]
[0.0, 3, 63.29]
[1.0, 3, 63.29]
[0.0, 3, 63.29]
[1.0, 3, 63.29]
[0.0, 3, 63.29]
[1.0, 3, 63.29]
[0.0, 3, 63.29]
[1.0, 3, 63.29]
[0.0, 3, 63.29]
[1.0, 3, 63.29]
[0.0, 3, 63.29]
[1.0, 3, 63.29]
[0.0, 3, 63.29]
[1.0, 3, 63.29]
[0.00012503817511270735, 3, 63.29]
[0.999992613957533, 3, 63.29]
[6.300284092117146e-05, 3, 63.29]
[0.9996251558840376, 3, 63.29]
[0.0, 3, 63.29]
[0.9997777525708927, 3, 63.29]
[0.0, 3, 63.29]


In [55]:
[res.x, -res.fun]

[[0.9406374817405783, 3], -63.29]

### Utilizamos estos parámetros para crear un nuevo modelo

In [63]:
umbral_de_corte = res.x[0]
max_depth = res.x[1]
#criterion='entropy'
criterion='gini'
#criterion='log_loss'

clf = DecisionTreeClassifier(
            criterion = criterion,
            max_depth=max_depth
        )
_ = clf.fit(X_df_training_testing, Y_df_training_testing)


### Probamos los nuevos parámetros en validación

In [64]:
host = 'http://localhost:5000'

mlflow.set_tracking_uri(host)
tracking_uri = mlflow.get_tracking_uri()

print("Current tracking uri: {}".format(tracking_uri))

Current tracking uri: http://localhost:5000


In [66]:
nombre_experimento = 'AnalisisDeCervezas'
mlflow.set_experiment(nombre_experimento)
experiment = mlflow.get_experiment_by_name(nombre_experimento)

y_validacion_pred = clf.predict_proba(X_validacion)

df_y_validacion_pred_proba = pd.DataFrame(y_validacion_pred, columns = ['NO_prob', 'SI_prob'])
df_y_validacion_pred_proba['prediccion'] = np.where(df_y_validacion_pred_proba['NO_prob'].values <= umbral_corte, 'SI', 'NO')

metricas = amu.calculate_metrics(
    y_true = Y_validacion['es_popular'].values, 
    y_pred = df_y_validacion_pred_proba['prediccion'].values, 
    labels=['NO', 'SI']
)

metricas['max_depth'] = max_depth
metricas['umbral_corte'] = umbral_corte

with mlflow.start_run(experiment_id=experiment.experiment_id, run_name="Sexta ejecución", description='CV + max_depth + umbral_de_corte'):
    mlflow.set_tag("dataset", "validacion")
    
    mlflow.log_param("max_depth", metricas['max_depth'])
    mlflow.log_param("umbral_corte", metricas['umbral_corte'])
    
    mlflow.log_metric("true negatives", metricas['true negatives'])
    mlflow.log_metric("true positives", metricas['true positives'])
    mlflow.log_metric("false positives", metricas['false positives'])
    mlflow.log_metric("false negatives", metricas['false negatives'])
    mlflow.log_metric("accuracy", metricas['accuracy'])
    mlflow.log_metric("recall", metricas['recall'])
    mlflow.log_metric("precision", metricas['precision'])
    mlflow.log_metric("specificity", metricas['specificity'])
    mlflow.log_metric("tasa falso positivo", metricas['tasa falso positivo'])
    mlflow.log_metric("tasa falso negativo", metricas['tasa falso negativo'])
    mlflow.log_metric("f1", metricas['f1'])
    mlflow.log_metric("metrica negocio", metricas['metrica negocio'])
    mlflow.log_metric("metrica optima negocio", metricas['metrica optima negocio'])
    
    mlflow.sklearn.log_model(
        sk_model=clf,
        artifact_path="clasification_tree_model"
    )
        
    mlflow.end_run()
    
metricas

{'true negatives': 16171,
 'true positives': 488,
 'false positives': 2695,
 'false negatives': 462,
 'accuracy': 0.8407,
 'recall': 0.5137,
 'precision': 0.1533,
 'specificity': 0.8572,
 'tasa falso positivo': 0.1428,
 'tasa falso negativo': 0.0245,
 'f1': 0.2361,
 'metrica negocio': 71.30601534113848,
 'metrica optima negocio': 191.76423092450545,
 'max_depth': 3,
 'umbral_corte': 0.9}