# 02_baseline_mlp_improved — Robust, log1p target, LayerNorm

In [1]:

# Imports & paths
import os, sys, json, random
from pathlib import Path
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

# Seed the stdlib, NumPy and PyTorch generators with the same value.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# When the kernel runs from a folder named 'notebooks', the project root is its parent.
PROJECT_DIR = Path.cwd()
if PROJECT_DIR.name == 'notebooks':
    PROJECT_DIR = PROJECT_DIR.parent

DATA_PROCESSED = PROJECT_DIR.joinpath('data', 'processed')
MODELS = PROJECT_DIR.joinpath('models')
REPORTS = PROJECT_DIR.joinpath('reports')

# Create the input/output folders up front; existing folders are left alone.
for p in (DATA_PROCESSED, MODELS, REPORTS):
    p.mkdir(parents=True, exist_ok=True)

print('DATA_PROCESSED:', DATA_PROCESSED)


DATA_PROCESSED: c:\Users\byed2\Documents\miacd\Aprendizaje Profundo\Proyecto Final\kan_mlp_sales\data\processed


In [7]:

# Use the "_v2" processed files when the v2 train file exists; otherwise fall
# back to the un-suffixed processed files for all three splits.
_suffix = '_v2' if (DATA_PROCESSED / 'train_processed_v2.csv').exists() else ''
train_p = DATA_PROCESSED / f'train_processed{_suffix}.csv'
valid_p = DATA_PROCESSED / f'valid_processed{_suffix}.csv'
test_p = DATA_PROCESSED / f'test_processed{_suffix}.csv'
assert train_p.exists(), 'No processed train file found.'

train_df = pd.read_csv(train_p)
valid_df = pd.read_csv(valid_p)
test_df = pd.read_csv(test_p)

# Every column except the target is treated as a model feature.
target_col = 'Weekly_Sales'
feature_cols = [col for col in train_df.columns if col != target_col]

X_train = train_df[feature_cols].values.astype('float32')
X_valid = valid_df[feature_cols].values.astype('float32')
X_test = test_df[feature_cols].values.astype('float32')
y_train_raw = train_df[target_col].values.astype('float32')
y_valid_raw = valid_df[target_col].values.astype('float32')
y_test_raw = test_df[target_col].values.astype('float32')
print('Shapes:', X_train.shape, X_valid.shape, X_test.shape)

# Negative sales are floored at 0.001 in every split (validation and test
# included) so that the log1p transform below is always applied to a positive value.
y_train_raw = np.maximum(y_train_raw, 0.001)
y_valid_raw = np.maximum(y_valid_raw, 0.001)
y_test_raw = np.maximum(y_test_raw, 0.001)

# The model is trained on log1p(sales).
y_train = np.log1p(y_train_raw)
y_valid = np.log1p(y_valid_raw)
y_test = np.log1p(y_test_raw)
print(f"Target stats - train: min={y_train.min():.3f}, max={y_train.max():.3f}, mean={y_train.mean():.3f}")


Shapes: (338738, 36) (41369, 36) (41463, 36)
Target stats - train: min=0.001, max=13.449, mean=8.512


In [8]:

# Dataset & DataLoader
class TabularDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing feature rows with their targets.

    Both inputs are NumPy arrays converted with ``torch.from_numpy``; the
    targets are stored as a column of shape ``(n, 1)``.
    """

    def __init__(self, X, y):
        features = torch.from_numpy(X)
        targets = torch.from_numpy(y)
        self.X = features
        self.y = targets.view(-1, 1)

    def __len__(self):
        """Number of rows in the feature tensor."""
        return self.X.size(0)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at ``idx``."""
        return self.X[idx], self.y[idx]

# One dataset per split.
train_ds = TabularDataset(X_train, y_train)
valid_ds = TabularDataset(X_valid, y_valid)
test_ds = TabularDataset(X_test, y_test)

# Only the training loader shuffles; validation/test keep file order.
train_loader = DataLoader(train_ds, batch_size=1024, shuffle=True)
valid_loader = DataLoader(valid_ds, batch_size=2048, shuffle=False)
test_loader = DataLoader(test_ds, batch_size=2048, shuffle=False)


In [9]:

# Model: LayerNorm MLP
# Network input width: one input unit per column of X_train.
in_features = X_train.shape[1]
class MLP_LN(nn.Module):
    """MLP regressor: three Linear -> GELU -> LayerNorm stages and a scalar head.

    Hidden widths are 256, 128 and 64. Dropout(0.1) follows the first two
    stages only. The final layer maps the 64-wide representation to one output.
    """

    def __init__(self, in_dim):
        super().__init__()
        hidden_widths = (256, 128, 64)
        last_stage = len(hidden_widths) - 1
        layers = []
        width_in = in_dim
        for stage, width_out in enumerate(hidden_widths):
            layers.append(nn.Linear(width_in, width_out))
            layers.append(nn.GELU())
            layers.append(nn.LayerNorm(width_out))
            # No dropout directly in front of the output head.
            if stage != last_stage:
                layers.append(nn.Dropout(0.1))
            width_in = width_out
        layers.append(nn.Linear(width_in, 1))
        # Kept as a single Sequential named `net` so state_dict keys stay `net.<index>.*`.
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the layer stack to ``x`` and return one value per row."""
        return self.net(x)

# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
model = MLP_LN(in_features).to(device)

# Smooth L1 loss on the log1p-scaled target, optimised with AdamW.
criterion = nn.SmoothL1Loss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)

# Multiply the learning rate by 0.5 when the monitored value stops decreasing (patience=3).
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=3
)


In [10]:

# ===== Unified evaluate utility (robust to shapes/NaN/inf) =====
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def _to_1d(a):
    arr = np.asarray(a)
    return arr.ravel()

def filter_invalid(preds, targets):
    """Drop every position where either the prediction or the target is NaN or infinite.

    Both inputs are flattened to 1-D first. Returns the filtered
    ``(preds, targets)`` pair, still aligned element by element.
    """
    flat_preds = _to_1d(preds)
    flat_targets = _to_1d(targets)
    pred_bad = np.isnan(flat_preds) | np.isinf(flat_preds)
    target_bad = np.isnan(flat_targets) | np.isinf(flat_targets)
    keep = ~(pred_bad | target_bad)
    return flat_preds[keep], flat_targets[keep]

def evaluate_model(model, loader, device='cpu', inv_transform=lambda x: x):
    """Score ``model`` on every batch of ``loader`` and return metrics plus raw arrays.

    The model is switched to eval mode and is left in eval mode on return.

    Args:
        model: module called as ``model(xb)`` on each input batch.
        loader: iterable yielding ``(xb, yb)`` tensor batches.
        device: device the inputs are moved to before the forward pass.
        inv_transform: callable applied to the concatenated predictions and
            then to the concatenated targets before scoring (identity by default).

    Returns:
        ``(metrics, preds, targets)``. ``metrics`` is a dict with float values
        under ``'mae'``, ``'rmse'`` and ``'r2'``; all three are NaN when the
        loader is empty or when no finite prediction/target pair remains.
        ``preds`` and ``targets`` are the inverse-transformed 1-D arrays after
        NaN/inf pairs were removed by ``filter_invalid``.
    """
    model.eval()
    pred_chunks = []
    target_chunks = []
    with torch.no_grad():
        for xb, yb in loader:
            out = model(xb.to(device))
            pred_chunks.append(out.detach().cpu().numpy().ravel())
            # Targets are only consumed on the host, so they are not copied to
            # `device` first (the original moved them there and straight back).
            target_chunks.append(yb.detach().cpu().numpy().ravel())

    if not pred_chunks:
        return {'mae': float('nan'), 'rmse': float('nan'), 'r2': float('nan')}, np.array([]), np.array([])

    preds_log = np.concatenate(pred_chunks)
    targets_log = np.concatenate(target_chunks)
    preds = inv_transform(preds_log)
    targets = inv_transform(targets_log)

    # Non-finite values can appear after the inverse transform; drop those pairs
    # so the sklearn metrics do not raise.
    preds, targets = filter_invalid(preds, targets)
    if preds.size == 0:
        return {'mae': float('nan'), 'rmse': float('nan'), 'r2': float('nan')}, preds, targets

    mae = float(mean_absolute_error(targets, preds))
    rmse = float(np.sqrt(mean_squared_error(targets, preds)))
    r2 = float(r2_score(targets, preds))
    return {'mae': mae, 'rmse': rmse, 'r2': r2}, preds, targets


In [11]:

# Training loop with early stopping and robust evaluation
# Early-stopping bookkeeping: best validation RMSE seen so far and the number
# of consecutive epochs without improvement.
best_rmse = float('inf')
patience = 8
wait = 0
best_state = None
train_losses = []
valid_rmses = []
EPOCHS = 50
max_grad_norm = 1.0

for epoch in range(1, EPOCHS + 1):
    model.train()
    total = 0.0
    for xb, yb in train_loader:
        xb = xb.to(device)
        yb = yb.to(device)
        optimizer.zero_grad()
        out = model(xb)
        loss = criterion(out, yb)
        loss.backward()
        # Clip the global gradient norm before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        # Weight each batch loss by its size so the epoch figure is a per-sample average.
        total += loss.item() * xb.size(0)
    train_loss = total / len(train_loader.dataset)
    train_losses.append(train_loss)

    # Validation metrics are reported on the original sales scale: expm1 undoes the log1p target.
    metrics_v, preds_v, y_v = evaluate_model(model, valid_loader, device=device, inv_transform=lambda x: np.expm1(x))
    valid_rmses.append(metrics_v['rmse'])
    scheduler.step(metrics_v['rmse'])
    print(f"Epoch {epoch:03d} | train_loss={train_loss:.6f} | valid_RMSE={metrics_v['rmse']:.2f} | MAE={metrics_v['mae']:.2f} | R2={metrics_v['r2']:.4f}")

    if metrics_v['rmse'] < best_rmse - 1e-4:
        best_rmse = metrics_v['rmse']
        wait = 0
        # clone() is required: state_dict() hands out references to the live
        # parameters, and on a CPU run .cpu() returns those same tensors. Without
        # a copy, later optimizer steps would overwrite this "best" snapshot in place.
        best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
        best_preds_v = preds_v.copy()
        best_y_v = y_v.copy()
    else:
        wait += 1
        if wait >= patience:
            print('Early stopping')
            break

# Reload the weights stored in best_state (if any were recorded), then score
# the validation and test splits on the original sales scale.
if best_state is not None:
    model.load_state_dict(best_state)

val_metrics, preds_v, y_v = evaluate_model(
    model, valid_loader, device=device, inv_transform=np.expm1
)
test_metrics, preds_t, y_t = evaluate_model(
    model, test_loader, device=device, inv_transform=np.expm1
)
print('Best valid metrics:', val_metrics)
print('Test metrics:', test_metrics)

# Save artifacts: model weights, metrics JSON, and per-row predictions for
# the validation and test splits.
# json/pandas/torch are already imported at the top of the notebook; the line is
# kept so the `_torch` alias stays defined.
import json, pandas as pd, torch as _torch
_models = MODELS / 'baseline_mlp_improved.pt'
_reports = REPORTS / 'baseline_mlp_improved_metrics.json'
_preds_val = REPORTS / 'baseline_mlp_valid_predictions.csv'
_preds_test = REPORTS / 'baseline_mlp_test_predictions.csv'

_tdf = pd.DataFrame({'y_true': y_v, 'y_pred': preds_v})
_tdf.to_csv(_preds_val, index=False)
_tdf2 = pd.DataFrame({'y_true': y_t, 'y_pred': preds_t})
_tdf2.to_csv(_preds_test, index=False)

_torch.save(model.state_dict(), _models)

# Use a context manager so the metrics file is flushed and closed
# deterministically instead of relying on garbage collection.
with open(_reports, 'w') as _fh:
    json.dump({'valid': val_metrics, 'test': test_metrics}, _fh, indent=2)
print('Saved model, metrics and predictions.')


Epoch 001 | train_loss=0.784165 | valid_RMSE=6395.79 | MAE=3117.76 | R2=0.9161
Epoch 002 | train_loss=0.175461 | valid_RMSE=5360.79 | MAE=2547.34 | R2=0.9410
Epoch 002 | train_loss=0.175461 | valid_RMSE=5360.79 | MAE=2547.34 | R2=0.9410
Epoch 003 | train_loss=0.154901 | valid_RMSE=5314.12 | MAE=2593.02 | R2=0.9421
Epoch 003 | train_loss=0.154901 | valid_RMSE=5314.12 | MAE=2593.02 | R2=0.9421
Epoch 004 | train_loss=0.143442 | valid_RMSE=5796.14 | MAE=2804.46 | R2=0.9311
Epoch 004 | train_loss=0.143442 | valid_RMSE=5796.14 | MAE=2804.46 | R2=0.9311
Epoch 005 | train_loss=0.133161 | valid_RMSE=6338.02 | MAE=3155.57 | R2=0.9176
Epoch 005 | train_loss=0.133161 | valid_RMSE=6338.02 | MAE=3155.57 | R2=0.9176
Epoch 006 | train_loss=0.127340 | valid_RMSE=4848.93 | MAE=2335.17 | R2=0.9518
Epoch 006 | train_loss=0.127340 | valid_RMSE=4848.93 | MAE=2335.17 | R2=0.9518
Epoch 007 | train_loss=0.121523 | valid_RMSE=4403.77 | MAE=2019.68 | R2=0.9602
Epoch 007 | train_loss=0.121523 | valid_RMSE=4403.77