In [1]:
# Para que funciones, todos nuestros scripts debemos exportar las siguientes variables de entorno
%env AWS_ACCESS_KEY_ID=minio   
%env AWS_SECRET_ACCESS_KEY=minio123 
%env MLFLOW_S3_ENDPOINT_URL=http://localhost:9000
%env AWS_ENDPOINT_URL_S3=http://localhost:9000

env: AWS_ACCESS_KEY_ID=minio
env: AWS_SECRET_ACCESS_KEY=minio123
env: MLFLOW_S3_ENDPOINT_URL=http://localhost:9000
env: AWS_ENDPOINT_URL_S3=http://localhost:9000


In [2]:
!echo $AWS_ACCESS_KEY_ID
!echo $AWS_SECRET_ACCESS_KEY
!echo $MLFLOW_S3_ENDPOINT_URL

$AWS_ACCESS_KEY_ID
$AWS_SECRET_ACCESS_KEY
$MLFLOW_S3_ENDPOINT_URL


In [3]:
import pandas as pd
import numpy as np
import mlflow
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import awswrangler as wr

# Set the MLflow tracking URI
mlflow.set_tracking_uri('http://localhost:5000')

# Set up the MLflow experiment
mlflow.set_experiment("classification_neural_network_single")

# Load data
X_train = wr.s3.read_csv("s3://data/final/train/X_train_scaled.csv")
y_train = wr.s3.read_csv("s3://data/final/train/y_train.csv")

X_val = wr.s3.read_csv("s3://data/final/val/X_val_scaled.csv")
y_val = wr.s3.read_csv("s3://data/final/val/y_val.csv")

X_test = wr.s3.read_csv("s3://data/final/test/X_test_scaled.csv")
y_test = wr.s3.read_csv("s3://data/final/test/y_test.csv")

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train.values)
y_train_tensor = torch.FloatTensor(y_train.values).reshape(-1, 1)

X_val_tensor = torch.FloatTensor(X_val.values)
y_val_tensor = torch.FloatTensor(y_val.values).reshape(-1, 1)

X_test_tensor = torch.FloatTensor(X_test.values)
y_test_tensor = torch.FloatTensor(y_test.values).reshape(-1, 1)

# Create data loaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Fixed hyperparameters
BATCH_SIZE = 32
LEARNING_RATE = 0.001
HIDDEN_SIZE = 128
EPOCHS = 500

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

class NeuralNetwork(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(NeuralNetwork, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, 1000),
            nn.ReLU(),
            nn.Linear(1000, 500),
            nn.ReLU(),
            nn.Linear(500, 250),
            nn.ReLU(),
            nn.Linear(250, 125),
            nn.ReLU(),
            nn.Linear(125, 62),
            nn.ReLU(),
            nn.Linear(62, 31),
            nn.ReLU(),
            nn.Linear(31, 1),
            nn.Sigmoid()
        )
        
    def forward(self, x):
        return self.layers(x)

def evaluate_model(model, data_loader, device):
    model.eval()
    all_preds = []
    all_targets = []
    
    with torch.no_grad():
        for inputs, targets in data_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            preds = (outputs >= 0.5).float()
            
            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())
    
    all_preds = np.array(all_preds)
    all_targets = np.array(all_targets)
    
    f1 = f1_score(all_targets, all_preds, average='weighted')
    return f1

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize model, optimizer, and loss function
model = NeuralNetwork(X_train.shape[1], HIDDEN_SIZE).to(device)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.BCELoss()

# Training loop with validation
with mlflow.start_run(run_name="neural_network_single"):
    # Log parameters
    mlflow.log_params({
        'hidden_size': HIDDEN_SIZE,
        'learning_rate': LEARNING_RATE,
        'batch_size': BATCH_SIZE,
        'epochs': EPOCHS
    })
    
    best_val_f1 = 0
    best_model = None
    
    for epoch in range(EPOCHS):
        # Training
        model.train()
        for batch_X, batch_y in train_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
        
        # Loss
        print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {loss.item():.4f}")
        # Validation
        model.eval()
        val_f1 = evaluate_model(model, val_loader, device)
        
        """
        # Save best model
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            best_model = model.state_dict().copy()
        """
        
        print(f"Epoch {epoch+1}/{EPOCHS}, Val F1: {val_f1:.4f}")
  
    # Load best model and evaluate on test set
    #model.load_state_dict(best_model)
    model.eval()
    test_f1 = evaluate_model(model, test_loader, device)
    
    # Log final metrics
    mlflow.log_metrics({
        "val_f1": best_val_f1,
        "test_f1": test_f1
    })
    
    # Log the model
    mlflow.pytorch.log_model(model, "model")

print("Training completed!")
print(f"Best Validation F1 Score: {best_val_f1:.4f}")
print(f"Final Test F1 Score: {test_f1:.4f}")

Epoch 1/500, Loss: 0.1918
Epoch 1/500, Val F1: 0.8942
Epoch 2/500, Loss: 0.4913
Epoch 2/500, Val F1: 0.8942
Epoch 3/500, Loss: 0.2730
Epoch 3/500, Val F1: 0.8942
Epoch 4/500, Loss: 0.0793
Epoch 4/500, Val F1: 0.8942
Epoch 5/500, Loss: 0.0327
Epoch 5/500, Val F1: 0.8942
Epoch 6/500, Loss: 0.1667
Epoch 6/500, Val F1: 0.8942
Epoch 7/500, Loss: 0.2563
Epoch 7/500, Val F1: 0.8942
Epoch 8/500, Loss: 0.0534
Epoch 8/500, Val F1: 0.8942
Epoch 9/500, Loss: 0.0167
Epoch 9/500, Val F1: 0.8942
Epoch 10/500, Loss: 0.0860
Epoch 10/500, Val F1: 0.9380
Epoch 11/500, Loss: 0.0041
Epoch 11/500, Val F1: 0.9457
Epoch 12/500, Loss: 0.0687
Epoch 12/500, Val F1: 0.9142
Epoch 13/500, Loss: 0.0462
Epoch 13/500, Val F1: 0.9311
Epoch 14/500, Loss: 0.0281
Epoch 14/500, Val F1: 0.9142
Epoch 15/500, Loss: 0.0931
Epoch 15/500, Val F1: 0.9240
Epoch 16/500, Loss: 0.0495
Epoch 16/500, Val F1: 0.9307
Epoch 17/500, Loss: 0.0347
Epoch 17/500, Val F1: 0.9429
Epoch 18/500, Loss: 0.0541
Epoch 18/500, Val F1: 0.9307
Epoch 19/5

2025/02/02 00:54:11 INFO mlflow.tracking._tracking_service.client: 🏃 View run neural_network_single at: http://localhost:5000/#/experiments/19/runs/26183a59046140459017737ee0cc7cfb.
2025/02/02 00:54:11 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/19.


Training completed!
Best Validation F1 Score: 0.0000
Final Test F1 Score: 0.9130
