In [14]:
import awswrangler as wr

import mlflow

# For this to work, every script in the project must export the following environment variables.
# NOTE(review): the access key and secret are hardcoded here; they look like local
# development defaults — confirm before sharing or committing this notebook.
%env AWS_ACCESS_KEY_ID=minio   
%env AWS_SECRET_ACCESS_KEY=minio123 
%env MLFLOW_S3_ENDPOINT_URL=http://localhost:9000
%env AWS_ENDPOINT_URL_S3=http://localhost:9000
# Alternative endpoint, currently disabled:
#%env MLFLOW_S3_ENDPOINT_URL=http://192.168.0.21:9000
#%env AWS_ENDPOINT_URL_S3=http://192.168.0.21:9000

env: AWS_ACCESS_KEY_ID=minio
env: AWS_SECRET_ACCESS_KEY=minio123
env: MLFLOW_S3_ENDPOINT_URL=http://localhost:9000
env: AWS_ENDPOINT_URL_S3=http://localhost:9000


# Búsqueda de mejor modelo e hiperparámetros

Dado nuestro dataset de Airbnb, el cual ya pasó por el proceso de ETL y se encuentra en nuestro bucket de S3, vamos a realizar una búsqueda de cuál sería el mejor modelo y qué hiperparámetros usar.

La búsqueda de hiperparámetros la haremos usando Optuna y el tracking será realizado mediante MLflow.

OBS: Para la confección de esta notebook, nos basamos en el tutorial de [MLflow](https://mlflow.org/docs/latest/traditional-ml/hyperparameter-tuning-with-child-runs/notebooks/index.html).

In [15]:
# Address of the MLflow tracking server this notebook logs to.
mlflow_server = "http://localhost:5001"
# Alternative server address, currently disabled:
#mlflow_server = "http://192.168.0.21:5001"

# Point the MLflow client at that server for every later mlflow call.
mlflow.set_tracking_uri(mlflow_server)

In [16]:
# Load the train and test splits from the S3 bucket.
# NOTE: the test split is loaded here as well; to avoid data leakage it must only be
# used for the final evaluation, never during the hyperparameter search.
X_train, y_train = (
    wr.s3.read_csv("s3://data/final/train/airbnb_X_train.csv"),
    wr.s3.read_csv("s3://data/final/train/airbnb_y_train.csv"),
)
X_test, y_test = (
    wr.s3.read_csv("s3://data/final/test/airbnb_X_test.csv"),
    wr.s3.read_csv("s3://data/final/test/airbnb_y_test.csv"),
)

# Report the shape of every split as a quick sanity check.
for split_name, split_frame in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"Shape de {split_name}: {split_frame.shape}")

Shape de X_train: (22113, 78)
Shape de y_train: (22113, 1)
Shape de X_test: (9478, 78)
Shape de y_test: (9478, 1)


In [17]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,latitude,longitude,accommodates,bedrooms,beds,price,minimum_nights,maximum_nights,number_of_reviews,review_scores_rating,...,property_type_Shared room in hotel,property_type_Shared room in loft,property_type_Shared room in rental unit,property_type_Shared room in tent,property_type_Shared room in villa,property_type_Tiny home,property_type_Tower,room_type_Hotel room,room_type_Private room,room_type_Shared room
0,-0.316174,1.324221,0.028897,-0.186296,0.065425,-0.057579,-0.177515,-0.116602,-0.400772,0.11203,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.024366,-0.729501,0.028897,-0.186296,0.065425,-0.053007,-0.118336,1.928279,1.251697,-0.523804,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.526138,-1.57119,-0.615911,-0.186296,0.707846,-0.044777,-0.118336,-0.937245,-0.593918,0.657031,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.396405,-1.382108,-0.615911,-1.308487,0.065425,-0.057579,-0.118336,-1.04218,0.178665,0.323975,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.543755,-0.53649,-0.615911,-1.308487,-0.576997,-0.054003,-0.177515,-0.116602,1.165854,0.293697,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Column names, non-null counts and dtypes of the training features.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22113 entries, 0 to 22112
Data columns (total 78 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   latitude                                          22113 non-null  float64
 1   longitude                                         22113 non-null  float64
 2   accommodates                                      22113 non-null  float64
 3   bedrooms                                          22113 non-null  float64
 4   beds                                              22113 non-null  float64
 5   price                                             22113 non-null  float64
 6   minimum_nights                                    22113 non-null  float64
 7   maximum_nights                                    22113 non-null  float64
 8   number_of_reviews                                 22113 non-null  float64
 9   review_scores_rat

## Investigamos la correlación de features con la variable objetivo

Antes de profundizar en el proceso de construcción del modelo, es esencial comprender las relaciones entre nuestras features y la variable objetivo. Por eso vamos a realizar un gráfico que indica el coeficiente de correlación de cada feature en relación con la variable objetivo. Esto nos sirve para:

- Evitar data leakage: Debemos asegurarnos de que ninguna característica se correlacione perfectamente con el objetivo (un coeficiente de correlación de aproximadamente 1.0). Si existe tal correlación, es una señal de que nuestro conjunto de datos podría estar "filtrando" información sobre el objetivo.

- Garantizar relaciones significativas: Idealmente, nuestras características deberían tener algún grado de correlación con el objetivo. Esto vale incluso si estamos trabajando con un problema de clasificación, aunque en ese caso los resultados no son tan informativos como en un caso de regresión.

- Auditoría y trazabilidad: Loggear esta visualización de correlación con nuestra ejecución principal de MLflow garantiza la trazabilidad. Proporciona una instantánea de las características de los datos en el momento del entrenamiento del modelo, lo cual es invaluable para propósitos de auditoría y replicabilidad.

In [19]:
from plots import plot_correlation_with_target, plot_information_gain_with_target

In [20]:
# The target name is taken from the first column of y_train.
target_column = y_train.columns[0]
# Figures produced by the project's `plots` helpers; they are kept in variables so a
# later cell can log them to MLflow as artifacts.
correlation_plot = plot_correlation_with_target(X_train, y_train, target_col=target_column)
information_gain_plot = plot_information_gain_with_target(X_train, y_train, target_col=target_column)

## Arrancamos a experimentar

In [21]:
import datetime
import optuna

from mlflow.models import infer_signature
from mlflow_aux import get_or_create_experiment

from optuna_aux import champion_callback, objective

from sklearn.svm import SVC 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# Optuna is fairly verbose; only let it show error-level logs
optuna.logging.set_verbosity(optuna.logging.ERROR)

Antes de poder realizar experimentos, vamos a crear el experimento en MLflow, pero para evitar desorden, vamos a usar una función que se fije primero si el experimento existe; si es así, devuelve su ID.

Además creamos el nombre del run padre con el que vamos a ir registrando las ejecuciones.

In [9]:
# Create the MLflow experiment (per the notebook narrative, the helper returns the
# existing experiment's ID when it already exists).
experiment_id = get_or_create_experiment("Airbnb Buenos Aires")
print(experiment_id)

# Name of the parent run, suffixed with the current timestamp.
# The format string previously ended with a stray double quote, which leaked into
# every run name (e.g. `best_hyperparam_2025/08/02-20:50:38"`).
run_name_parent = "best_hyperparam_" + datetime.datetime.today().strftime("%Y/%m/%d-%H:%M:%S")

1


Ya con todo seteado, vamos a ejecutar la optimización usando Optuna, el cual realiza una búsqueda Bayesiana, la cual es más eficiente que una búsqueda de grilla tradicional. La desventaja es que es más difícil de paralelizar.

In [10]:
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, classification_report, confusion_matrix,
                             precision_recall_curve, roc_curve, auc, matthews_corrcoef)
import numpy as np

# Parent MLflow run: hyperparameter search, retraining of the winning configuration,
# evaluation on the test split, and logging of metrics, figures and the model.
with mlflow.start_run(experiment_id=experiment_id, run_name=run_name_parent, nested=True):
    # Initialise the Optuna study; the objective value is maximised.
    study = optuna.create_study(direction="maximize")

    # Run the hyperparameter optimisation trials. Per the notebook narrative, each trial
    # runs as a separate run nested under this parent run.
    # `champion_callback` controls which progress messages are shown; see the
    # documentation of `objective` and `champion_callback` in optuna_aux.
    study.optimize(lambda trial: objective(trial, X_train, y_train, experiment_id), n_trials=20, callbacks=[champion_callback])

    # Once the search is done, store the best parameters and score in the parent run.
    best_params = study.best_params
    mlflow.log_params(best_params)
    mlflow.log_metric("best_train_f1", study.best_value)

    mlflow.set_tags(
        tags={
            "project": "Airbnb Buenos Aires",
            "optimizer_engine": "optuna",
            "model_family": "sklearn",
            "feature_set_version": 1,
        }
    )

    # Rebuild the winning model from the best parameters and train it on the full
    # training split. Any classifier name other than the SVC variants or the decision
    # tree falls through to the random forest.
    classifier_name = best_params["classifier"]
    if classifier_name == "SVC_linear":
        model = SVC(C=best_params["svc_c"], kernel='linear', gamma='scale', probability=True)
    elif classifier_name == "SVC_poly":
        model = SVC(C=best_params["svc_c"], kernel='poly',
                    gamma='scale', degree=best_params["svc_poly_degree"], probability=True)
    elif classifier_name == "SVC_rbf":
        model = SVC(C=best_params["svc_c"], kernel='rbf', gamma='scale', probability=True)
    elif classifier_name == "DecisionTreeClassifier":
        model = DecisionTreeClassifier(max_depth=best_params["tree_max_depth"])
    else:
        model = RandomForestClassifier(max_depth=best_params["rf_max_depth"],
                                       n_estimators=best_params["rf_n_estimators"])

    model = model.fit(X_train, y_train.to_numpy().ravel())

    # Hard predictions and positive-class probabilities for the metrics below.
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]  # second column of predict_proba

    y_test_array = y_test.to_numpy().ravel()

    # ========== CLASSIFICATION METRICS ==========

    # Basic metrics
    accuracy = accuracy_score(y_test_array, y_pred)
    precision = precision_score(y_test_array, y_pred)
    recall = recall_score(y_test_array, y_pred)
    f1 = f1_score(y_test_array, y_pred)

    # AUC-ROC
    roc_auc = roc_auc_score(y_test_array, y_pred_proba)

    # Precision-Recall AUC
    precision_vals, recall_vals, _ = precision_recall_curve(y_test_array, y_pred_proba)
    pr_auc = auc(recall_vals, precision_vals)

    # Specificity (true negative rate). Guarded so a test split without negatives
    # yields 0.0 instead of a division by zero.
    tn, fp, fn, tp = confusion_matrix(y_test_array, y_pred).ravel()
    negatives = tn + fp
    specificity = tn / negatives if negatives else 0.0

    # Balanced accuracy
    balanced_accuracy = (recall + specificity) / 2

    # Matthews correlation coefficient. The library implementation is used instead of
    # the hand-written formula, which produced NaN (and a RuntimeWarning) whenever the
    # model predicted a single class, i.e. when a confusion-matrix marginal was zero.
    mcc = matthews_corrcoef(y_test_array, y_pred)

    # Log every metric to MLflow
    mlflow.log_metrics({
        "test_accuracy": accuracy,
        "test_precision": precision,
        "test_recall": recall,
        "test_f1": f1,
        "test_roc_auc": roc_auc,
        "test_pr_auc": pr_auc,
        "test_specificity": specificity,
        "test_balanced_accuracy": balanced_accuracy,
        "test_mcc": mcc,
        "test_true_positives": int(tp),
        "test_true_negatives": int(tn),
        "test_false_positives": int(fp),
        "test_false_negatives": int(fn)
    })

    # Print a summary of the metrics
    print("========== M칄TRICAS DE EVALUACI칍N ==========")
    print(f"Accuracy:           {accuracy:.4f}")
    print(f"Precision:          {precision:.4f}")
    print(f"Recall (Sensitivity): {recall:.4f}")
    print(f"Specificity:        {specificity:.4f}")
    print(f"F1-Score:           {f1:.4f}")
    print(f"Balanced Accuracy:  {balanced_accuracy:.4f}")
    print(f"ROC-AUC:            {roc_auc:.4f}")
    print(f"PR-AUC:             {pr_auc:.4f}")
    print(f"Matthews Corr Coef: {mcc:.4f}")
    print("\n========== MATRIZ DE CONFUSI칍N ==========")
    print(f"True Positives:     {tp}")
    print(f"True Negatives:     {tn}")
    print(f"False Positives:    {fp}")
    print(f"False Negatives:    {fn}")

    # Log the correlation and information-gain figures as run artifacts
    mlflow.log_figure(figure=correlation_plot, artifact_file="correlation_plot.png")
    mlflow.log_figure(figure=information_gain_plot, artifact_file="information_gain_plot.png")

    # Store the model artifact
    artifact_path = "model"

    signature = infer_signature(X_train, model.predict(X_train))

    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path=artifact_path,
        signature=signature,
        serialization_format='cloudpickle',
        registered_model_name="airbnb_model_dev",
        metadata={"model_data_version": 1}
    )

    # Location of the model stored in MLflow; later cells reload and register it from here
    model_uri = mlflow.get_artifact_uri(artifact_path)


游끢 View run Trial: 0 at: http://localhost:5001/#/experiments/1/runs/92cb225f27a4415da3d550bbd1b3d844
游빍 View experiment at: http://localhost:5001/#/experiments/1
Initial trial 0 achieved value: 0.00969804334151603
游끢 View run Trial: 1 at: http://localhost:5001/#/experiments/1/runs/a9beb1e50cd044bb807a561edad5cc2e
游빍 View experiment at: http://localhost:5001/#/experiments/1
Trial 1 achieved value: 0.05171043124636158 with  81.2455% improvement
游끢 View run Trial: 1 at: http://localhost:5001/#/experiments/1/runs/a9beb1e50cd044bb807a561edad5cc2e
游빍 View experiment at: http://localhost:5001/#/experiments/1
Trial 1 achieved value: 0.05171043124636158 with  81.2455% improvement
游끢 View run Trial: 2 at: http://localhost:5001/#/experiments/1/runs/7c8da153ebb240ab8cd6ddd6c3f923ec
游빍 View experiment at: http://localhost:5001/#/experiments/1
游끢 View run Trial: 2 at: http://localhost:5001/#/experiments/1/runs/7c8da153ebb240ab8cd6ddd6c3f923ec
游빍 View experiment at: http://localhost:5001/#/experiment

Successfully registered model 'airbnb_model_dev'.
Successfully registered model 'airbnb_model_dev'.
2025/08/02 22:19:48 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: airbnb_model_dev, version 1
2025/08/02 22:19:48 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: airbnb_model_dev, version 1


游끢 View run best_hyperparam_2025/08/02-20:50:38" at: http://localhost:5001/#/experiments/1/runs/8bd9704dbdd24be68200fe1be54c7335
游빍 View experiment at: http://localhost:5001/#/experiments/1


Created version '1' of model 'airbnb_model_dev'.


## Testeando el modelo

Una vez que el modelo fue entrenado, podemos levantarlo y testearlo de una forma agnóstica a dónde está guardado.

In [None]:
# Reload the trained model from the artifact URI captured when it was logged.
loaded = mlflow.sklearn.load_model(model_uri)

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

In [10]:
import numpy as np

# Smoke-test the reloaded model on one real row of the test split.
# The previous hard-coded vector had 20 values, while the model was fitted on the
# 78 columns of X_train, so predict() would reject it on a fresh run.
# Double brackets keep the row as a one-row DataFrame with its column names.
sample = X_test.iloc[[0]]
loaded.predict(sample)



array([1])

## Registramos el modelo 

Realizamos el registro del modelo en MLflow. En este registro se pone el modelo productivo que luego se usará para servir en formato on-line.

In [None]:
from mlflow import MlflowClient
from mlflow.exceptions import MlflowException

client = MlflowClient()
name = "airbnb_model_prod"
desc = "This classifier detects if an appartment will be occupied or not"

# Create the production registered model. Re-running the notebook must not fail
# because the model already exists, so only that specific error is tolerated.
try:
    client.create_registered_model(name=name, description=desc)
except MlflowException as exc:
    if exc.error_code != "RESOURCE_ALREADY_EXISTS":
        raise

# Store the hyperparameters as tags on the model version
tags = model.get_params()
tags["model"] = type(model).__name__
# Tag with the computed test F1 value; the previous code stored the `f1_score`
# function object itself.
tags["f1-score"] = f1

# Create the model version.
# The run ID is taken as the third-from-last path segment of the artifact URI
# (presumably .../<run_id>/artifacts/model — verify against the artifact store layout).
result = client.create_model_version(
    name=name,
    source=model_uri,
    run_id=model_uri.split("/")[-3],
    tags=tags
)

# Give this version the "champion" alias so the online serving process can load it
client.set_registered_model_alias(name, "champion", result.version)