<a href="https://colab.research.google.com/github/cam2149/MachineLearningV/blob/main/MLP_PCA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Equipo**

- Nicolás Colmenares

- Carlos Martinez

1. Modelo con Redes Neuronales Tradicionales (MLP)  - 0.5 pts

  -  .

  - .

  - .

**Situación:**
Una ciudad enfrenta un aumento significativo de casos de dengue, con una tasa de incidencia que supera el promedio nacional.
La anticipación de brotes es crucial para implementar medidas preventivas y reducir la propagación de la enfermedad.

**Objetivo:**
- Desarrollar un modelo predictivo utilizando redes neuronales para pronosticar futuros brotes de dengue en cada barrio de la ciudad.
- Utilizar una base de datos histórica de casos de dengue desde 2015 hasta 2022 para entrenar el modelo.
- Pronosticar los brotes con al menos 3 semanas de anticipación.

**Finalidad:**
Permitir a las autoridades de salud pública tomar acciones oportunas, como:

- Preparar a las instituciones prestadoras de salud (IPS).
- Gestionar recursos (carros fumigadores, limpieza de sumideros).
- Capacitar a la comunidad.

*   Modelo con Redes Neuronales Tradicionales (MLP)
*   .
*   .

# 0. Configuraciones de Colab

Mover `kaggle.json` a la ubicación que espera la API de Kaggle (`~/.kaggle/`) después de subirlo

In [None]:
# These lines are shell commands executed from inside the Jupyter notebook. They set up the Kaggle API credentials, which are needed to download datasets from Kaggle.

# Create the ~/.kaggle directory (no error if it already exists).
!mkdir -p ~/.kaggle
# Move the uploaded kaggle.json from the current directory into ~/.kaggle/.
!mv kaggle.json ~/.kaggle/
# Restrict the credentials file to read/write for the owner only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Delete /content/kaggle/output and /content/kaggle/input, including their contents, if they exist.
!rm -rf /content/kaggle/output
!rm -rf /content/kaggle/input

Descargar dataset de la competencia

In [None]:
# Download the files of the `aa-v-2025-i-pronosticos-nn-rnn-cnn` competition with the Kaggle CLI.
# NOTE(review): a later cell moves the zip from the current directory, so the download presumably lands there.
!kaggle competitions download -c aa-v-2025-i-pronosticos-nn-rnn-cnn

In [None]:
# Create the output and input directories (no error if they already exist).
!mkdir -p /content/kaggle/output
!mkdir -p /content/kaggle/input

In [None]:
# Move the competition archive from the current directory into /content/kaggle/input.
!mv aa-v-2025-i-pronosticos-nn-rnn-cnn.zip /content/kaggle/input

In [None]:
# Extract the competition archive into /content/kaggle/input/.
!unzip /content/kaggle/input/aa-v-2025-i-pronosticos-nn-rnn-cnn.zip -d /content/kaggle/input/

In [None]:
# Print the full path of every file found under /content/kaggle/input.
import os

_input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/content/kaggle/input')
    for name in names
)
for _path in _input_file_paths:
    print(_path)


# 1. Imports

In [None]:
# Uninstall the currently installed numpy without prompting for confirmation (-y).
!pip uninstall numpy -y

In [None]:
# Import numpy and print the version that the interpreter actually resolves.
import numpy as np
print(np.__version__)

In [None]:
# Force a reinstall of numpy pinned to version 1.26.4.
!pip install --force-reinstall numpy==1.26.4

In [None]:
# Install the darts package.
# NOTE(review): no `import` of darts is visible in this notebook; `TimeSeries` used further down presumably comes from it - confirm.
!pip install darts

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import optuna
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Notebook-wide settings: dataset locations, batch size, target/group column
# names, input window length and forecast horizon.
config = dict(
    TRAIN_DIR='/content/kaggle/input/df_train.parquet',
    TEST_DIR='/content/kaggle/input/df_test.parquet',
    SUBMISSION_DIR='/content/sample_submission.csv',
    BATCH_SIZE=32,
    TARGET_COLUMN='dengue',
    GROUP_COLUMN='id_bar',
    WINDOW_SIZE=5,
    HORIZON=3,
)

# Load the full training table and the test table from their parquet files.
train_full, test_df = (
    pd.read_parquet(config[path_key]) for path_key in ("TRAIN_DIR", "TEST_DIR")
)

In [None]:
# Helper for the 'fecha' column: last day of the week (Sunday).
def last_day_of_week(year, week):
    """Return the Sunday that closes week ``week`` of ``year``.

    Weeks follow ``strptime``'s ``%W`` convention (Monday is the first day
    of the week).

    Args:
        year: Calendar year. Integer-valued floats such as ``2021.0`` are
            accepted; a row-wise ``DataFrame.apply`` can upcast integer
            columns to float.
        week: ``%W`` week number, with the same tolerance for floats.

    Returns:
        A naive ``datetime`` at midnight of that week's Sunday.

    Raises:
        ValueError: If ``year``/``week`` cannot be converted to ``int`` or
            do not form a valid ``%Y %W`` date.
    """
    # Imported locally: the notebook's import cell does not bring in datetime.
    from datetime import datetime, timedelta

    # Weekday "1" with %w pins the parsed date to the Monday of that week.
    monday = datetime.strptime(f'{int(year)} {int(week)} 1', "%Y %W %w")
    # weekday() is 0 for Monday, so this adds the days left until Sunday (6).
    return monday + timedelta(days=6 - monday.weekday())

# Add a 'fecha' column (week-ending date) to the train and test frames, computed
# row by row from the 'anio' and 'semana' columns.
# NOTE(review): in top-to-bottom order `train_df` is first assigned in a later
# cell - confirm the intended execution order.
def _fecha_from_row(row):
    return last_day_of_week(row['anio'], row['semana'])


for _frame in (train_df, test_df):
    _frame['fecha'] = _frame.apply(_fecha_from_row, axis=1)

In [None]:
# Replace the index of both frames with their 'fecha' column parsed as datetimes.
for _frame in (train_df, test_df):
    _frame.index = pd.to_datetime(_frame['fecha'])

In [None]:
# Display the index of train_df as the cell output.
train_df.index

In [None]:
# Resample train_df with the 'W' rule and sum the numeric columns.
# The result is only displayed as cell output; it is not stored.
train_df.resample('W').sum(numeric_only=True)


In [None]:
# Split by year: rows with anio <= 2020 train the model, rows with anio >= 2021 validate it.
# Both are independent copies of the corresponding slices of train_full.
_anio = train_full['anio']
train_df = train_full.loc[_anio <= 2020].copy()
val_df = train_full.loc[_anio >= 2021].copy()
# test_df is already loaded and serves as the test set (2022, per the original note).

In [None]:
# Helper that wraps one DataFrame column as a TimeSeries.
def df_col_to_timeseries(df, col_name, freq='W-SUN'):
    """Build a ``TimeSeries`` from a single DataFrame column.

    Args:
        df: DataFrame holding the column.
        col_name: Name of the column to convert.
        freq: Frequency string forwarded to ``TimeSeries.from_series``.

    Returns:
        Whatever ``TimeSeries.from_series`` returns for the selected column.
    """
    # NOTE(review): `TimeSeries` is not imported in the visible notebook;
    # presumably `from darts import TimeSeries` - confirm.
    column = df[col_name]
    return TimeSeries.from_series(column, freq=freq)


In [None]:
# Column groups used as model inputs, plus the target column.
lluvia_cols = ['lluvia_mean', 'lluvia_var', 'lluvia_max', 'lluvia_min']
temperatura_cols = ['temperatura_mean', 'temperatura_var', 'temperatura_max', 'temperatura_min']
other_cols = ['ESTRATO', 'area_barrio', 'concentraciones', 'vivienda', 'equipesado', 'sumideros', 'maquina']
target_col = ['dengue']


def _scale_in_place(columns, fit_frame, *other_frames):
    """Fit a StandardScaler on ``fit_frame[columns]`` and overwrite those
    columns with their scaled values in every frame given; return the scaler."""
    scaler = StandardScaler()
    fit_frame[columns] = scaler.fit_transform(fit_frame[columns])
    for frame in other_frames:
        frame[columns] = scaler.transform(frame[columns])
    return scaler


def _pca_95(columns):
    """Fit a PCA keeping 95% of the variance on ``train_df[columns]`` and
    project the train, validation and test frames with it."""
    pca = PCA(n_components=0.95)
    train_proj = pca.fit_transform(train_df[columns])
    val_proj = pca.transform(val_df[columns])
    test_proj = pca.transform(test_df[columns])
    return pca, train_proj, val_proj, test_proj


# Rainfall variables: standardise (statistics from train only), then PCA.
scaler_lluvia = _scale_in_place(lluvia_cols, train_df, val_df, test_df)
pca_lluvia, train_lluvia_pca, val_lluvia_pca, test_lluvia_pca = _pca_95(lluvia_cols)
n_components_lluvia = pca_lluvia.n_components_
print(f"Componentes PCA Lluvia: {n_components_lluvia}")

# Temperature variables: same treatment.
scaler_temperatura = _scale_in_place(temperatura_cols, train_df, val_df, test_df)
pca_temperatura, train_temperatura_pca, val_temperatura_pca, test_temperatura_pca = _pca_95(temperatura_cols)
n_components_temperatura = pca_temperatura.n_components_
print(f"Componentes PCA Temperatura: {n_components_temperatura}")

# Remaining variables: standardised only, no PCA.
scaler_other = _scale_in_place(other_cols, train_df, val_df, test_df)

# Target: standardised on train and validation only (test_df is not touched here).
scaler_target = _scale_in_place(target_col, train_df, val_df)

In [None]:
# Build the validation windows from val_df, one barrio at a time.
# `barrios` and `create_sequences` are defined outside the visible source.
_min_rows = config["WINDOW_SIZE"] + config["HORIZON"]
X_val, y_val = [], []
for barrio in barrios:
    # Row positions of this barrio inside val_df. They are taken from the
    # boolean mask rather than from index labels, so they stay valid even when
    # val_df's index has repeated labels (Index.get_indexer raises in that
    # case). The PCA arrays are aligned with val_df by position.
    barrio_pos = np.flatnonzero((val_df['id_bar'] == barrio).to_numpy())
    if len(barrio_pos) < _min_rows:
        # Too few rows to form even one window plus horizon.
        continue
    barrio_data = val_df.iloc[barrio_pos]
    barrio_features = np.hstack((val_lluvia_pca[barrio_pos],
                                 val_temperatura_pca[barrio_pos],
                                 barrio_data[other_cols].values))
    barrio_target = barrio_data['dengue'].values
    X_barrio, y_barrio = create_sequences(barrio_features, barrio_target,
                                          config["WINDOW_SIZE"], config["HORIZON"])
    X_val.append(X_barrio)
    y_val.append(y_barrio)

if not X_val:
    # np.vstack([]) would fail with a message that hides the real cause.
    raise ValueError(
        "No validation windows were built: no barrio in val_df has at least "
        f"{_min_rows} rows (WINDOW_SIZE + HORIZON)."
    )
X_val = np.vstack(X_val)
y_val = np.vstack(y_val)

In [None]:
class MLPModel(nn.Module):
    """Fully connected network with two hidden layers.

    Each sample is flattened to one vector, then passed through two
    ``Linear -> ReLU -> Dropout`` stages and a final linear layer.

    Args:
        input_dim: Number of values per sample after flattening.
        hidden_dim: Width of both hidden layers.
        output_dim: Number of outputs per sample.
        dropout_rate: Dropout probability applied after each hidden activation.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, dropout_rate=0.2):
        super().__init__()
        # The layer layout (and hence the state-dict keys model.0/3/6) must not
        # change, so previously saved checkpoints still load.
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, x):
        """Flatten every sample of ``x`` and return the network output.

        Args:
            x: Tensor whose first dimension is the batch; the remaining
                dimensions are collapsed into one.

        Returns:
            Tensor of shape ``(batch, output_dim)``.
        """
        # reshape() gives the same result as view() for contiguous tensors and
        # also accepts non-contiguous ones (e.g. after transpose), where view()
        # raises.
        flat = x.reshape(x.size(0), -1)
        return self.model(flat)


In [None]:
def objective(trial):
    """Optuna objective: train one MLP configuration and return its validation RMSE.

    Samples epochs, learning rate, optimizer class, batch size, hidden width
    and dropout from ``trial``, trains an ``MLPModel`` on ``X_train``/``y_train``
    with MSE loss, and scores it on ``X_val``/``y_val``.

    Args:
        trial: The ``optuna`` trial used to sample hyperparameters.

    Returns:
        RMSE over every validation target value, in the units of ``y_val``;
        ``nan`` if the validation set is empty.
    """
    epochs = trial.suggest_categorical('epochs', [100])
    lr = trial.suggest_categorical('lr', [0.01, 0.001])
    # The choices stay optimizer classes (not names) because other cells
    # consume best_params['optimizer'] as a class.
    optimizer_class = trial.suggest_categorical('optimizer', [optim.Adam, optim.AdamW, optim.SGD, optim.RMSprop])
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 48])
    hidden_dim = trial.suggest_categorical('hidden_dim', [32, 64, 128])
    # suggest_float replaces the deprecated suggest_uniform; same name and range.
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)

    train_loader = DataLoader(TensorDataset(torch.tensor(X_train, dtype=torch.float32),
                                            torch.tensor(y_train, dtype=torch.float32)),
                              batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(TensorDataset(torch.tensor(X_val, dtype=torch.float32),
                                          torch.tensor(y_val, dtype=torch.float32)),
                            batch_size=batch_size, shuffle=False)

    # The model flattens each sample, so its input width is the product of
    # the two trailing dimensions of X_train.
    input_dim = X_train.shape[1] * X_train.shape[2]
    output_dim = config["HORIZON"]
    model = MLPModel(input_dim, hidden_dim, output_dim, dropout_rate)
    optimizer = optimizer_class(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    for epoch in range(epochs):
        model.train()
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            y_pred = model(X_batch)
            loss = criterion(y_pred, y_batch)
            loss.backward()
            optimizer.step()

    # Accumulate the squared error over the whole validation set. Averaging
    # per-batch MSEs would over-weight a shorter last batch and make the score
    # depend on batch_size, which is itself being tuned.
    model.eval()
    squared_error_sum = 0.0
    n_values = 0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            y_pred = model(X_batch)
            squared_error_sum += torch.sum((y_pred - y_batch) ** 2).item()
            n_values += y_batch.numel()
    if n_values == 0:
        return float('nan')
    val_rmse = float(np.sqrt(squared_error_sum / n_values))
    return val_rmse

# Create a study that minimises the objective's return value and run 50 trials.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)
# Hyperparameters of the best trial; the cells below read them from this dict.
best_params = study.best_params
print("Mejores parámetros:", best_params)

In [None]:
# Retrain a single model with the best hyperparameters, with early stopping
# on the validation loss.
input_dim = X_train.shape[1] * X_train.shape[2]
output_dim = config["HORIZON"]
model = MLPModel(input_dim, best_params['hidden_dim'], output_dim, best_params['dropout_rate'])

# Resolve the optimizer without eval(): the stored value is used directly when
# it is already a class, and looked up in torch.optim when it is a name.
_optimizer_choice = best_params['optimizer']
if isinstance(_optimizer_choice, str):
    _optimizer_choice = getattr(optim, _optimizer_choice)
optimizer = _optimizer_choice(model.parameters(), lr=best_params['lr'])
criterion = nn.MSELoss()

train_loader = DataLoader(TensorDataset(torch.tensor(X_train, dtype=torch.float32),
                                        torch.tensor(y_train, dtype=torch.float32)),
                          batch_size=best_params['batch_size'], shuffle=True)
val_loader = DataLoader(TensorDataset(torch.tensor(X_val, dtype=torch.float32),
                                      torch.tensor(y_val, dtype=torch.float32)),
                        batch_size=best_params['batch_size'], shuffle=False)

# Per-epoch loss history (mean of the batch losses), plotted in a later cell.
train_losses, val_losses = [], []
best_val_loss = float('inf')
patience = 5  # epochs without validation improvement tolerated before stopping
counter = 0   # consecutive epochs without improvement so far

for epoch in range(best_params['epochs']):
    model.train()
    epoch_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        y_pred = model(X_batch)
        loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    train_losses.append(epoch_loss / len(train_loader))

    model.eval()
    val_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            y_pred = model(X_batch)
            val_loss += criterion(y_pred, y_batch).item()
    val_loss /= len(val_loader)
    val_losses.append(val_loss)

    if val_loss < best_val_loss:
        # New best epoch: checkpoint the weights and reset the patience counter.
        best_val_loss = val_loss
        counter = 0
        torch.save(model.state_dict(), 'mejor_modelo.pt')
    else:
        counter += 1
        if counter >= patience:
            print(f"Early stopping en epoch {epoch}")
            break

In [None]:
# Reload the checkpointed weights and score the first forecast step on the
# validation set, after mapping predictions and targets back through scaler_target.
_best_state = torch.load('mejor_modelo.pt')
model.load_state_dict(_best_state)
model.eval()

_pred_chunks, _true_chunks = [], []
with torch.no_grad():
    for X_batch, y_batch in val_loader:
        y_pred = model(X_batch)
        # Column 0 is the first horizon step.
        _pred_chunks.append(y_pred[:, 0].cpu().numpy())
        _true_chunks.append(y_batch[:, 0].cpu().numpy())

y_pred_val = scaler_target.inverse_transform(np.concatenate(_pred_chunks).reshape(-1, 1))
y_true_val = scaler_target.inverse_transform(np.concatenate(_true_chunks).reshape(-1, 1))

mae = mean_absolute_error(y_true_val, y_pred_val)
mse = mean_squared_error(y_true_val, y_pred_val)
rmse = np.sqrt(mse)
print(f"MAE: {mae:.4f}, MSE: {mse:.4f}, RMSE: {rmse:.4f}")

In [None]:
# Plot the per-epoch training and validation loss curves.
_fig, _ax = plt.subplots(figsize=(10, 6))
for _curve, _label in ((train_losses, 'Train Loss'), (val_losses, 'Validation Loss')):
    _ax.plot(_curve, label=_label)
_ax.set_xlabel('Epoch')
_ax.set_ylabel('Loss')
_ax.set_title('Training and Validation Losses')
_ax.legend()
plt.show()

In [None]:
# Compare actual and predicted values for the first 100 validation samples.
_fig, _ax = plt.subplots(figsize=(10, 6))
_ax.plot(y_true_val[:100], label='Real')
_ax.plot(y_pred_val[:100], label='Predicho')
_ax.set_xlabel('Muestra')
_ax.set_ylabel('Casos de Dengue')
_ax.set_title('Predicciones vs Valores Reales (Primeras 100 muestras)')
_ax.legend()
plt.show()

In [None]:
# Build one input window per test row and predict the first horizon step.
#
# Two fixes over the previous version:
#   * History rows come from train_full, which is never scaled in the visible
#     notebook, while test_df, the PCAs and the model all live in scaled space.
#     History is now passed through the fitted scalers first.
#     NOTE(review): assumes no hidden cell already scales train_full - confirm.
#   * Predictions were produced grouped by barrio but later paired with
#     test_df rows in their own order. They are now placed back at the
#     positional row of test_df they belong to.
_window = config["WINDOW_SIZE"]
_feature_scalers = (
    (scaler_lluvia, lluvia_cols),
    (scaler_temperatura, temperatura_cols),
    (scaler_other, other_cols),
)
# "Zero reported cases" in scaled units; fills unknown target values, as the
# previous fillna(0) did in raw units.
_zero_cases_scaled = scaler_target.transform(pd.DataFrame([[0.0]], columns=target_col))[0, 0]

X_test = []
test_indices = []  # positional row in test_df of every window built
for barrio in test_df['id_bar'].unique():
    history = train_full[train_full['id_bar'] == barrio].copy()
    if len(history):
        for _scaler, _cols in _feature_scalers:
            history[_cols] = _scaler.transform(history[_cols])
        history[target_col] = scaler_target.transform(history[target_col])

    test_pos = np.flatnonzero((test_df['id_bar'] == barrio).to_numpy())
    future = test_df.iloc[test_pos]
    if target_col[0] in future.columns:
        # test_df's features were scaled earlier, but its target (if present) was not.
        future = future.copy()
        future[target_col] = scaler_target.transform(future[target_col])

    # Positional index only: the original labels of both frames are irrelevant here.
    barrio_full = pd.concat([history, future]).reset_index(drop=True)
    barrio_features_full = np.hstack((
        pca_lluvia.transform(barrio_full[lluvia_cols]),
        pca_temperatura.transform(barrio_full[temperatura_cols]),
        barrio_full[other_cols].values,
    ))
    barrio_target_full = barrio_full['dengue'].fillna(_zero_cases_scaled).values

    test_start_idx = len(history)
    for offset, i in enumerate(range(test_start_idx, len(barrio_full))):
        if i - _window >= 0:
            # Window = the _window rows strictly before row i, with the target
            # history appended as the last feature column.
            X_seq = np.hstack((barrio_features_full[i - _window:i],
                               barrio_target_full[i - _window:i].reshape(-1, 1)))
            X_test.append(X_seq)
            test_indices.append(test_pos[offset])
X_test = np.array(X_test)
test_indices = np.array(test_indices, dtype=int)

if len(X_test) == 0:
    raise ValueError("No test windows could be built: no barrio has WINDOW_SIZE rows before its test rows.")

test_loader = DataLoader(TensorDataset(torch.tensor(X_test, dtype=torch.float32)),
                         batch_size=best_params['batch_size'], shuffle=False)

model.eval()
predictions = []
with torch.no_grad():
    for X_batch in test_loader:
        y_pred = model(X_batch[0])
        predictions.extend(y_pred[:, 0].cpu().numpy())  # first horizon step

predictions = scaler_target.inverse_transform(np.array(predictions).reshape(-1, 1)).flatten()

# Put every prediction back at the row of test_df it was built for, so that
# `predictions` has one entry per test row, in test_df order.
_aligned = np.full(len(test_df), np.nan, dtype=predictions.dtype)
_aligned[test_indices] = predictions
_missing = len(test_df) - len(test_indices)
if _missing:
    print(f"Advertencia: {_missing} filas de test sin ventana histórica suficiente; su predicción queda como NaN.")
predictions = _aligned

In [None]:
def create_submission(test_df, predictions, filename='submission.csv'):
    """Write the submission CSV with columns ``id`` and ``dengue``.

    Each ``id`` is ``"<id_bar>_<anio>_<semana>"``, built from the rows of
    ``test_df`` in order; ``predictions`` is paired with those rows by position.

    Args:
        test_df: Frame holding ``id_bar``, ``anio`` and ``semana``, either as
            columns or as index levels.
        predictions: One predicted value per row of ``test_df``.
        filename: Path of the CSV file to write.

    Raises:
        ValueError: If the number of predictions differs from the number of
            rows in ``test_df``.
    """
    id_parts = ['id_bar', 'anio', 'semana']
    if all(part in test_df.columns for part in id_parts):
        # Everything needed is already a column, so the index is discarded.
        # Re-inserting it would raise when the index shares its name with an
        # existing column (e.g. an index built from the 'fecha' column).
        test_df_reset = test_df.reset_index(drop=True)
    else:
        # Some id parts live in the index: bring them back as columns.
        test_df_reset = test_df.reset_index()

    # Plain 1-D array, so values are paired by position and never by index label.
    predictions = np.asarray(predictions).ravel()
    if len(predictions) != len(test_df_reset):
        raise ValueError(
            f"Got {len(predictions)} predictions for {len(test_df_reset)} test rows."
        )

    ids = test_df_reset['id_bar'].astype(str) + '_' + test_df_reset['anio'].astype(str) + '_' + test_df_reset['semana'].astype(str)
    df_submission = pd.DataFrame({'id': ids, 'dengue': predictions})
    df_submission.to_csv(filename, index=False)
    print(f'Submission guardado en {filename}, con {len(df_submission)} predicciones.')

# Write the submission file with the default name ('submission.csv').
create_submission(test_df, predictions)