In [5]:
# Para que funciones, todos nuestros scripts debemos exportar las siguientes variables de entorno
%env AWS_ACCESS_KEY_ID=minio   
%env AWS_SECRET_ACCESS_KEY=minio123 
%env MLFLOW_S3_ENDPOINT_URL=http://localhost:9000
%env AWS_ENDPOINT_URL_S3=http://localhost:9000

env: AWS_ACCESS_KEY_ID=minio
env: AWS_SECRET_ACCESS_KEY=minio123
env: MLFLOW_S3_ENDPOINT_URL=http://localhost:9000
env: AWS_ENDPOINT_URL_S3=http://localhost:9000


In [6]:
!echo $AWS_ACCESS_KEY_ID
!echo $AWS_SECRET_ACCESS_KEY
!echo $MLFLOW_S3_ENDPOINT_URL

$AWS_ACCESS_KEY_ID
$AWS_SECRET_ACCESS_KEY
$MLFLOW_S3_ENDPOINT_URL


In [7]:
import awswrangler as wr

# Cargamos los datos para realizar nuestro estudio.
X_train =  wr.s3.read_csv("s3://data/final/train/X_train_scaled.csv")
y_train =  wr.s3.read_csv("s3://data/final/train/y_train.csv")

X_val =  wr.s3.read_csv("s3://data/final/val/X_val_scaled.csv")
y_val =  wr.s3.read_csv("s3://data/final/val/y_val.csv")

X_test =  wr.s3.read_csv("s3://data/final/test/X_test_scaled.csv")
y_test =  wr.s3.read_csv("s3://data/final/test/y_test.csv")

In [8]:
import pandas as pd
import optuna
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, roc_curve, auc
from sklearn.preprocessing import LabelBinarizer
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import KFold
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler

# Set the MLflow tracking URI
mlflow.set_tracking_uri('http://localhost:5000')

# Set up the MLflow experiment
mlflow.set_experiment("classification_neural_network_experiment_2")

class NeuralNetwork(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(NeuralNetwork, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_size, hidden_size // 2),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_size // 2, 1),  # Single output for binary classification
            nn.Sigmoid()  # Sigmoid for binary classification
        )
        
    def forward(self, x):
        return self.layers(x)

def objective(trial):
    # Hyperparameters to optimize
    hidden_size = trial.suggest_int('hidden_size', 32, 256)
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)
    batch_size = trial.suggest_int('batch_size', 16, 128)
    epochs = trial.suggest_int('epochs', 10, 50)
    
    # Combine training and validation sets
    X_combined = pd.concat([X_train, X_val])
    y_combined = pd.concat([y_train, y_val])
    
    # Convert to PyTorch tensors
    X_tensor = torch.FloatTensor(X_combined.values)
    y_tensor = torch.FloatTensor(y_combined.values).reshape(-1, 1)  # Reshape to match network output
    
    # K-fold cross validation setup
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = []
    cv_f1_scores = []
    
    with mlflow.start_run(run_name="neural_network_cv_run"):
        # Log parameters
        mlflow.log_params({
            'hidden_size': hidden_size,
            'learning_rate': learning_rate,
            'batch_size': batch_size,
            'epochs': epochs
        })
        
        # Perform cross-validation
        for fold, (train_idx, val_idx) in enumerate(kf.split(X_tensor)):
            # Split data for this fold
            X_fold_train = X_tensor[train_idx]
            y_fold_train = y_tensor[train_idx]
            X_fold_val = X_tensor[val_idx]
            y_fold_val = y_tensor[val_idx]
            
            # Create data loaders
            train_dataset = TensorDataset(X_fold_train, y_fold_train)
            train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
            
            # Initialize model and optimizer
            model = NeuralNetwork(X_tensor.shape[1], hidden_size)
            optimizer = optim.Adam(model.parameters(), lr=learning_rate)
            criterion = nn.BCELoss()  # Binary Cross Entropy Loss
            
            # Training loop
            model.train()
            for epoch in range(epochs):
                for batch_X, batch_y in train_loader:
                    optimizer.zero_grad()
                    outputs = model(batch_X)
                    loss = criterion(outputs, batch_y)
                    loss.backward()
                    optimizer.step()
            
            # Evaluation
            model.eval()
            with torch.no_grad():
                val_outputs = model(X_fold_val)
                predicted = (val_outputs >= 0.5).float()  # Threshold at 0.5 for binary prediction
                
                fold_accuracy = accuracy_score(y_fold_val, predicted)
                fold_f1 = f1_score(y_fold_val, predicted, average='weighted')
                
                cv_scores.append(fold_accuracy)
                cv_f1_scores.append(fold_f1)
                
                # Log metrics for this fold
                mlflow.log_metric(f"fold_{fold}_accuracy", fold_accuracy)
                mlflow.log_metric(f"fold_{fold}_f1", fold_f1)
        
        # Calculate and log mean metrics
        mean_cv_accuracy = np.mean(cv_scores)
        std_cv_accuracy = np.std(cv_scores)
        mean_cv_f1 = np.mean(cv_f1_scores)
        std_cv_f1 = np.std(cv_f1_scores)
        
        mlflow.log_metric("mean_cv_accuracy", mean_cv_accuracy)
        mlflow.log_metric("std_cv_accuracy", std_cv_accuracy)
        mlflow.log_metric("mean_cv_f1", mean_cv_f1)
        mlflow.log_metric("std_cv_f1", std_cv_f1)
        
        return mean_cv_f1

# Run the optimization
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Print results
print(f"Best parameters: {study.best_trial.params}")
print(f"Best F1 score: {study.best_trial.value}")

2025/02/02 00:21:57 INFO mlflow.tracking.fluent: Experiment with name 'classification_neural_network_experiment_2' does not exist. Creating a new experiment.
[I 2025-02-02 00:21:57,469] A new study created in memory with name: no-name-3f3c7f94-35d9-4462-992d-a61fedc831e0
2025/02/02 00:22:09 INFO mlflow.tracking._tracking_service.client: 🏃 View run neural_network_cv_run at: http://localhost:5000/#/experiments/18/runs/183bd992ae3f4d02b654b39b365e6dcd.
2025/02/02 00:22:09 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/18.
[I 2025-02-02 00:22:09,588] Trial 0 finished with value: 0.9479326291343082 and parameters: {'hidden_size': 80, 'learning_rate': 0.00046744353116018924, 'batch_size': 48, 'epochs': 21}. Best is trial 0 with value: 0.9479326291343082.
2025/02/02 00:22:20 INFO mlflow.tracking._tracking_service.client: 🏃 View run neural_network_cv_run at: http://localhost:5000/#/experiments/18/runs/3b597bedc05c4588a45d9b87ddabce2d.
2

Best parameters: {'hidden_size': 121, 'learning_rate': 0.0013103557445230478, 'batch_size': 28, 'epochs': 35}
Best F1 score: 0.962015754586327
