In [9]:
# Para que funciones, todos nuestros scripts debemos exportar las siguientes variables de entorno
%env AWS_ACCESS_KEY_ID=minio   
%env AWS_SECRET_ACCESS_KEY=minio123 
%env MLFLOW_S3_ENDPOINT_URL=http://localhost:9000
%env AWS_ENDPOINT_URL_S3=http://localhost:9000

env: AWS_ACCESS_KEY_ID=minio
env: AWS_SECRET_ACCESS_KEY=minio123
env: MLFLOW_S3_ENDPOINT_URL=http://localhost:9000
env: AWS_ENDPOINT_URL_S3=http://localhost:9000


In [10]:
!echo $AWS_ACCESS_KEY_ID
!echo $AWS_SECRET_ACCESS_KEY
!echo $MLFLOW_S3_ENDPOINT_URL

$AWS_ACCESS_KEY_ID
$AWS_SECRET_ACCESS_KEY
$MLFLOW_S3_ENDPOINT_URL


In [11]:
import awswrangler as wr

# Cargamos los datos para realizar nuestro estudio.
X_train =  wr.s3.read_csv("s3://data/final/train/X_train_scaled.csv")
y_train =  wr.s3.read_csv("s3://data/final/train/y_train.csv")

X_val =  wr.s3.read_csv("s3://data/final/val/X_val_scaled.csv")
y_val =  wr.s3.read_csv("s3://data/final/val/y_val.csv")

X_test =  wr.s3.read_csv("s3://data/final/test/X_test_scaled.csv")
y_test =  wr.s3.read_csv("s3://data/final/test/y_test.csv")

In [12]:
import pandas as pd
import optuna
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, roc_curve, auc
from sklearn.preprocessing import LabelBinarizer
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import KFold


# Set the MLflow tracking URI
mlflow.set_tracking_uri('http://localhost:5000') 

# Set up the MLflow experiment
mlflow.set_experiment("classification-cross-validation-02-02-25_3")

def objective(trial):
    # Elegimos el clasificador aleatoriamente
    classifier_name = trial.suggest_categorical('classifier', 
        ['RandomForest', 'XGBoost', 'SVC', 'LogisticRegression', 'KNN', 'KNN_simple'])
    
    # Combinamos los conjuntos de entrenamiento y validación
    X_combined = pd.concat([X_train, X_val])
    y_combined = pd.concat([y_train, y_val])
    
    # Definimos el número de folds para cross-validation
    n_splits = 5
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    if classifier_name == 'RandomForest':
        n_estimators = trial.suggest_int('n_estimators', 10, 100)
        max_depth = trial.suggest_int('max_depth', 2, 32)
        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
    
    elif classifier_name == 'XGBoost':
        eta = trial.suggest_float('eta', 0.01, 0.5)
        max_depth = trial.suggest_int('max_depth', 2, 20)
        model = XGBClassifier(eta=eta, max_depth=max_depth, use_label_encoder=False)
    
    elif classifier_name == 'SVC':
        C = trial.suggest_float('C', 0.1, 10.0, log=True)
        model = SVC(C=C)
    
    elif classifier_name == 'LogisticRegression':
        C = trial.suggest_float('C', 0.1, 10.0, log=True)
        penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
        model = LogisticRegression(C=C, penalty=penalty, solver='liblinear')
    
    elif classifier_name == 'KNN_simple':
        n_neighbors = trial.suggest_int('n_neighbors', 1, 30)
        weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
        p = trial.suggest_int('p', 1, 2)
        model = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, p=p)
    
    else:  # KNN
        n_neighbors = trial.suggest_int('n_neighbors', 1, 30)
        weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
        p = trial.suggest_int('p', 1, 2)
        model = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, p=p)
    
    # Lista para almacenar los scores de CV
    cv_scores = []
    cv_f1_scores = []
    
    # Iniciamos una ejecución en MLflow
    with mlflow.start_run(run_name=f"{classifier_name}_cv_run"):
        # Registramos parámetros
        mlflow.log_param("classifier", classifier_name)
        mlflow.log_params(trial.params)
        
        # Realizamos cross-validation
        for fold, (train_idx, val_idx) in enumerate(kf.split(X_combined)):
            X_fold_train = X_combined.iloc[train_idx]
            y_fold_train = y_combined.iloc[train_idx]
            X_fold_val = X_combined.iloc[val_idx]
            y_fold_val = y_combined.iloc[val_idx]
            
            if classifier_name == 'XGBoost':
                model.fit(
                    X_fold_train,
                    y_fold_train,
                    eval_set=[(X_fold_val, y_fold_val)],
                    verbose=False
                )
            else:
                model.fit(X_fold_train, y_fold_train.values.ravel())
            
            # Evaluamos en el fold de validación
            y_pred = model.predict(X_fold_val)
            fold_accuracy = accuracy_score(y_fold_val, y_pred)
            fold_f1 = f1_score(y_fold_val, y_pred, average='weighted')
            
            cv_scores.append(fold_accuracy)
            cv_f1_scores.append(fold_f1)
            # Registramos métricas del fold
            mlflow.log_metric(f"fold_{fold}_accuracy", fold_accuracy)
            mlflow.log_metric(f"fold_{fold}_f1", fold_f1)
        # Calculamos y registramos la media y desviación estándar de CV
        mean_cv_accuracy = np.mean(cv_scores)
        std_cv_accuracy = np.std(cv_scores)
        mean_cv_f1 = np.mean(cv_f1_scores)
        std_cv_f1 = np.std(cv_f1_scores)
        
        mlflow.log_metric("mean_cv_accuracy", mean_cv_accuracy)
        mlflow.log_metric("std_cv_accuracy", std_cv_accuracy)
        mlflow.log_metric("mean_cv_f1", mean_cv_f1)
        mlflow.log_metric("std_cv_f1", std_cv_f1)
        
        # Entrenamos el modelo final con todos los datos combinados
        if classifier_name == 'XGBoost':
            model.fit(
                X_combined,
                y_combined,
                eval_set=[(X_test, y_test)],
                verbose=False
            )
        else:
            model.fit(X_combined, y_combined.values.ravel())
        
        # Evaluamos en el conjunto de prueba
        y_pred = model.predict(X_test)
        test_accuracy = accuracy_score(y_test, y_pred)
        test_f1 = f1_score(y_test, y_pred, average='weighted')
        
        mlflow.log_metric("test_accuracy", test_accuracy)
        mlflow.log_metric("test_f1", test_f1)
        
        # Registramos el modelo final
        mlflow.sklearn.log_model(model, classifier_name)
        
        return mean_cv_f1

# Ejecutamos la optimización con Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Imprimimos el mejor modelo y sus parámetros
print(f"Mejor modelo: {study.best_trial.params}")
print(f"Mejor accuracy CV: {study.best_trial.value}")

2025/02/02 00:14:32 INFO mlflow.tracking.fluent: Experiment with name 'classification-cross-validation-02-02-25_3' does not exist. Creating a new experiment.
[I 2025-02-02 00:14:32,500] A new study created in memory with name: no-name-859ed2a0-79bf-4e76-b2eb-3de2badef31e
2025/02/02 00:14:37 INFO mlflow.tracking._tracking_service.client: 🏃 View run KNN_cv_run at: http://localhost:5000/#/experiments/16/runs/9cd81d8a0d18472a9cec9b68a48b5c32.
2025/02/02 00:14:37 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/16.
[I 2025-02-02 00:14:37,167] Trial 0 finished with value: 0.952208601960205 and parameters: {'classifier': 'KNN', 'n_neighbors': 2, 'weights': 'uniform', 'p': 1}. Best is trial 0 with value: 0.952208601960205.
2025/02/02 00:14:42 INFO mlflow.tracking._tracking_service.client: 🏃 View run KNN_cv_run at: http://localhost:5000/#/experiments/16/runs/aaddf0637404492ebc6adc0f46108655.
2025/02/02 00:14:42 INFO mlflow.tracking._tracki

Mejor modelo: {'classifier': 'LogisticRegression', 'C': 9.490791980882449, 'penalty': 'l1'}
Mejor accuracy CV: 0.9602066816079716
