In [4]:
# Para que funciones, todos nuestros scripts debemos exportar las siguientes variables de entorno
%env AWS_ACCESS_KEY_ID=minio   
%env AWS_SECRET_ACCESS_KEY=minio123 
%env MLFLOW_S3_ENDPOINT_URL=http://localhost:9000
%env AWS_ENDPOINT_URL_S3=http://localhost:9000

env: AWS_ACCESS_KEY_ID=minio
env: AWS_SECRET_ACCESS_KEY=minio123
env: MLFLOW_S3_ENDPOINT_URL=http://localhost:9000
env: AWS_ENDPOINT_URL_S3=http://localhost:9000


In [5]:
!echo $AWS_ACCESS_KEY_ID
!echo $AWS_SECRET_ACCESS_KEY
!echo $MLFLOW_S3_ENDPOINT_URL

$AWS_ACCESS_KEY_ID


$AWS_SECRET_ACCESS_KEY
$MLFLOW_S3_ENDPOINT_URL


In [6]:
import awswrangler as wr

# Cargamos los datos para realizar nuestro estudio.
X_train =  wr.s3.read_csv("s3://data/final/train/X_train_scaled.csv")
y_train =  wr.s3.read_csv("s3://data/final/train/y_train.csv")

X_val =  wr.s3.read_csv("s3://data/final/val/X_val_scaled.csv")
y_val =  wr.s3.read_csv("s3://data/final/val/y_val.csv")

X_test =  wr.s3.read_csv("s3://data/final/test/X_test_scaled.csv")
y_test =  wr.s3.read_csv("s3://data/final/test/y_test.csv")

In [None]:
import pandas as pd
import optuna
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, roc_curve, auc
from sklearn.preprocessing import LabelBinarizer
import matplotlib.pyplot as plt
import numpy as np

# Set the MLflow tracking URI
mlflow.set_tracking_uri('http://localhost:5000') 

# Set up the MLflow experiment
mlflow.set_experiment("classification_optimization_03/02/2025")

# Objective function for Optuna
def objective(trial):
    # Sugerimos el modelo de clasificación
    classifier_name = trial.suggest_categorical('classifier', ['RandomForest', 'XGBoost', 'SVC', 'LogisticRegression', 'KNN', 'KNN_simple'])
    
    # Definimos hiperparámetros para cada clasificador
    if classifier_name == 'RandomForest':
        n_estimators = trial.suggest_int('n_estimators', 10, 200)
        max_depth = trial.suggest_int('max_depth', 2, 32)
        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
        
    elif classifier_name == 'XGBoost':
        eta = trial.suggest_float('eta', 0.01, 0.5)
        max_depth = trial.suggest_int('max_depth', 3, 20)
        model = XGBClassifier(
            eta=eta,
            max_depth=max_depth,
            use_label_encoder=False,
            early_stopping_rounds=10,
            eval_metric='logloss'
        )
    elif classifier_name == 'SVC':
        C = trial.suggest_float('C', 0.1, 10.0)
        model = SVC(C=C, probability=True)  # SVC con probability=True para ROC AUC
    
    elif classifier_name == 'LogisticRegression':
        C = trial.suggest_float('C', 0.1, 10.0)
        penalty = trial.suggest_categorical('penalty', ['l2'])  # Simplificado a 'l2'
        solver = 'lbfgs'  # Solver por defecto
        model = LogisticRegression(C=C, penalty=penalty, solver=solver, max_iter=1000)
    
    elif classifier_name == 'KNN_simple':
        # KNN con muy pocos vecinos (1-3)
        n_neighbors = trial.suggest_int('n_neighbors', 1, 3)
        weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
        p = trial.suggest_int('p', 1, 2)  # p=1 Manhattan, p=2 Euclidiana
        model = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, p=p)
    
    else:  # KNN
        n_neighbors = trial.suggest_int('n_neighbors', 1, 30)
        weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
        p = trial.suggest_int('p', 1, 2)  # p=1 Manhattan, p=2 Euclidiana
        model = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, p=p)
    
    # Iniciamos una ejecución en MLflow
    with mlflow.start_run(run_name=f"{classifier_name}_run"):
        # Registramos el nombre del clasificador
        mlflow.log_param("classifier", classifier_name)
        
        # Registramos los hiperparámetros según el clasificador
        if classifier_name == 'RandomForest':
            mlflow.log_param('n_estimators', n_estimators)
            mlflow.log_param('max_depth', max_depth)
        elif classifier_name == 'XGBoost':
            mlflow.log_param('eta', eta)
            mlflow.log_param('max_depth', max_depth)
        elif classifier_name == 'SVC':
            mlflow.log_param('C', C)
        elif classifier_name == 'LogisticRegression':
            mlflow.log_param('C', C)
            mlflow.log_param('penalty', penalty)
        else:  # KNN
            mlflow.log_param('n_neighbors', n_neighbors)
            mlflow.log_param('weights', weights)
            mlflow.log_param('p', p)     
    
        # Entrenamiento específico según el modelo
        if classifier_name == 'XGBoost':
            # Entrenar XGBoost con conjunto de validación
            model.fit(
                X_train, 
                y_train,
                eval_set=[(X_val, y_val)],
                verbose=False
            )
            
            # Para predicciones usaremos X_test completo
            y_pred = model.predict(X_test)
            y_proba = model.predict_proba(X_test)[:, 1]
            
        else:
            # Entrenamiento para el resto de modelos
            model.fit(X_train, y_train.values.ravel())
            y_pred = model.predict(X_test)
            
            # Verificamos si el modelo tiene predict_proba
            if hasattr(model, "predict_proba"):
                y_proba = model.predict_proba(X_test)[:, 1]
            else:
                # Para modelos sin predict_proba, usamos decision_function
                if hasattr(model, "decision_function"):
                    y_scores = model.decision_function(X_test)
                    y_proba = (y_scores - y_scores.min()) / (y_scores.max() - y_scores.min())
                else:
                    y_proba = None  # No se puede calcular ROC AUC
        
        # Evaluamos las métricas
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='macro')
        
        # Registramos accuracy y F1 score
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("f1_score", f1)
        
        # Calculamos y registramos ROC AUC si es aplicable
        if y_proba is not None:
            try:
                if len(set(y_train.values.ravel())) > 2:
                    lb = LabelBinarizer()
                    y_test_bin = lb.fit_transform(y_test)
                    roc_auc = roc_auc_score(y_test_bin, model.predict_proba(X_test), average='weighted', multi_class='ovr')
                else:
                    roc_auc = roc_auc_score(y_test, y_proba)
                
                mlflow.log_metric("roc_auc", roc_auc)
                
                # Calculamos la curva ROC
                fpr, tpr, thresholds = roc_curve(y_test, y_proba)
                roc_auc_value = auc(fpr, tpr)
                
                # Graficamos la curva ROC
                plt.figure()
                plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc_value)
                plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
                plt.xlim([0.0, 1.0])
                plt.ylim([0.0, 1.05])
                plt.xlabel('False Positive Rate')
                plt.ylabel('True Positive Rate')
                plt.title(f'Receiver Operating Characteristic - {classifier_name}')
                plt.legend(loc="lower right")
                # Guardamos la figura temporalmente
                plt.savefig('roc_curve.png')
                plt.close()
                # Registramos la curva ROC
                mlflow.log_artifact('roc_curve.png')
                
            except ValueError as e:
                print(f"Error al calcular ROC AUC: {e}")
        
        # Calculamos la matriz de confusión
        cm = confusion_matrix(y_test, y_pred)
        
        # Graficamos la matriz de confusión
        plt.figure()
        plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
        plt.title(f'Matriz de Confusión - {classifier_name}')
        plt.colorbar()
        tick_marks = np.arange(len(np.unique(y_test)))
        plt.xticks(tick_marks, np.unique(y_test), rotation=45)
        plt.yticks(tick_marks, np.unique(y_test))
        plt.tight_layout()
        plt.ylabel('Etiqueta verdadera')
        plt.xlabel('Etiqueta predicha')
        # Guardamos la figura temporalmente
        plt.savefig('confusion_matrix.png')
        plt.close()
        # Registramos la matriz de confusión
        mlflow.log_artifact('confusion_matrix.png')
        
        # Registramos el modelo
        mlflow.sklearn.log_model(model, classifier_name)
        
        return accuracy

# Ejecutamos la optimización con Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Imprimimos el mejor modelo y sus parámetros
print(f"Mejor modelo: {study.best_trial.params}")

  from .autonotebook import tqdm as notebook_tqdm
2025/02/03 08:51:44 INFO mlflow.tracking.fluent: Experiment with name 'classification_optimization_03/02/2025' does not exist. Creating a new experiment.
[I 2025-02-03 08:51:44,095] A new study created in memory with name: no-name-d8f36da0-1041-43f8-b75e-2ea4abc07c72
2025/02/03 08:51:54 INFO mlflow.tracking._tracking_service.client: 🏃 View run KNN_simple_run at: http://localhost:5000/#/experiments/20/runs/17b64e815e03443aa9c147dfc8ac1192.
2025/02/03 08:51:54 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/20.
[I 2025-02-03 08:51:54,556] Trial 0 finished with value: 0.9183673469387755 and parameters: {'classifier': 'KNN_simple', 'n_neighbors': 3, 'weights': 'uniform', 'p': 2}. Best is trial 0 with value: 0.9183673469387755.
Parameters: { "use_label_encoder" } are not used.

2025/02/03 08:51:59 INFO mlflow.tracking._tracking_service.client: 🏃 View run XGBoost_run at: http://localhos

Mejor modelo: {'classifier': 'XGBoost', 'eta': 0.3672273375506452, 'max_depth': 3}
