In [7]:
# Para que funciones, todos nuestros scripts debemos exportar las siguientes variables de entorno
%env AWS_ACCESS_KEY_ID=minio   
%env AWS_SECRET_ACCESS_KEY=minio123 
%env MLFLOW_S3_ENDPOINT_URL=http://localhost:9000
%env AWS_ENDPOINT_URL_S3=http://localhost:9000

env: AWS_ACCESS_KEY_ID=minio
env: AWS_SECRET_ACCESS_KEY=minio123
env: MLFLOW_S3_ENDPOINT_URL=http://localhost:9000
env: AWS_ENDPOINT_URL_S3=http://localhost:9000


In [8]:
!echo $AWS_ACCESS_KEY_ID
!echo $AWS_SECRET_ACCESS_KEY
!echo $MLFLOW_S3_ENDPOINT_URL

$AWS_ACCESS_KEY_ID
$AWS_SECRET_ACCESS_KEY
$MLFLOW_S3_ENDPOINT_URL


In [9]:
import awswrangler as wr

# Cargamos los datos para realizar nuestro estudio.
X_train =  wr.s3.read_csv("s3://data/final/train/sca_X_train.csv")
y_train =  wr.s3.read_csv("s3://data/final/train/sca_y_train.csv")

X_test =  wr.s3.read_csv("s3://data/final/test/sca_X_test.csv")
y_test =  wr.s3.read_csv("s3://data/final/test/sca_y_test.csv")

In [10]:
import pandas as pd
import optuna
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.preprocessing import LabelBinarizer

# Set the MLflow tracking URI
mlflow.set_tracking_uri('http://localhost:5000') 

# Set up the MLflow experiment
mlflow.set_experiment("classification_optimization_2")

# Objective function for Optuna
def objective(trial):
    # Suggest classification model
    classifier_name = trial.suggest_categorical('classifier', ['RandomForest', 'XGBoost', 'SVC'])
    
    # Define hyperparameters for each classifier
    if classifier_name == 'RandomForest':
        n_estimators = trial.suggest_int('n_estimators', 10, 200)
        max_depth = trial.suggest_int('max_depth', 2, 32)
        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
        
    elif classifier_name == 'XGBoost':
        eta = trial.suggest_float('eta', 0.01, 0.5)
        max_depth = trial.suggest_int('max_depth', 3, 20)
        model = XGBClassifier(eta=eta, max_depth=max_depth, use_label_encoder=False)
        
    else:
        C = trial.suggest_float('C', 0.1, 10.0)
        model = SVC(C=C, probability=True)  # SVC with probability=True for ROC AUC
    
    # Start an MLflow run
    with mlflow.start_run(run_name=f"{classifier_name}_run"):
        # Log the classifier name
        mlflow.log_param("classifier", classifier_name)
        
        # Log hyperparameters based on the classifier
        if classifier_name == 'RandomForest':
            mlflow.log_param('n_estimators', n_estimators)
            mlflow.log_param('max_depth', max_depth)
        elif classifier_name == 'XGBoost':
            mlflow.log_param('eta', eta)
            mlflow.log_param('max_depth', max_depth)
        else:
            mlflow.log_param('C', C)
        
        # Train the model
        model.fit(X_train, y_train)
        
        # Make predictions
        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)[:, 1] if classifier_name != 'SVC' or len(set(y_train)) > 2 else None
        
        # Evaluate the metrics
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        
        # Log accuracy and F1 score
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("f1_score", f1)
        
        # ROC AUC is applicable if we have predicted probabilities
        if y_proba is not None:
            try:
                if len(set(y_train)) > 2:
                    lb = LabelBinarizer()
                    y_test_bin = lb.fit_transform(y_test)
                    roc_auc = roc_auc_score(y_test_bin, y_proba, average='weighted', multi_class='ovr')
                else:
                    roc_auc = roc_auc_score(y_test, y_proba)
                mlflow.log_metric("roc_auc", roc_auc)
            except ValueError as e:
                print(f"Error calculating ROC AUC: {e}")
        
        # Log the model
        mlflow.sklearn.log_model(model, classifier_name)
        
        return accuracy

# Run the optimization with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Print the best model and parameters
print(f"Best model: {study.best_trial.params}")


2024/09/17 23:27:41 INFO mlflow.tracking.fluent: Experiment with name 'classification_optimization_2' does not exist. Creating a new experiment.
[I 2024-09-17 23:27:41,639] A new study created in memory with name: no-name-b0b090a6-dce9-44d1-81f7-0a4431fe995c
  y = column_or_1d(y, warn=True)
2024/09/17 23:27:49 INFO mlflow.tracking._tracking_service.client: 🏃 View run SVC_run at: http://localhost:5000/#/experiments/3/runs/26a1bcfc452a48f3a6fa17b43103efee.
2024/09/17 23:27:49 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/3.
[I 2024-09-17 23:27:49,589] Trial 0 finished with value: 0.967032967032967 and parameters: {'classifier': 'SVC', 'C': 3.3664476978681055}. Best is trial 0 with value: 0.967032967032967.
  return fit_method(estimator, *args, **kwargs)
2024/09/17 23:27:53 INFO mlflow.tracking._tracking_service.client: 🏃 View run RandomForest_run at: http://localhost:5000/#/experiments/3/runs/ac4cb0dbb89441e4beac9325653ceae9.
202

Best model: {'classifier': 'XGBoost', 'eta': 0.3859624698807333, 'max_depth': 17}
