In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from argparse import Namespace
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Add the parent directory of 'ml' to sys.path
# The path is built from the current working directory plus '..', so the
# project imports further down only resolve when the kernel's working
# directory sits directly below the directory that contains the `ml` package.
import sys, os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

# Suppress every warning category for the remainder of the session.
# NOTE(review): this also hides deprecation / numerical warnings raised by
# later cells; consider narrowing the filter once the notebook is stable.
import warnings
warnings.filterwarnings('ignore')

from ml.models.transformer import TimeSeriesTransformer
from ml.utils.data_utils import prepare_dataset 

  from pandas.core import (


In [2]:
# -----------------------
# Config (cluster dataset)
# -----------------------
# The settings are grouped by concern and merged into a single Namespace so
# that downstream cells can keep reading them as `args.<name>`.

# Dataset and preprocessing options handed to `prepare_dataset`.
_data_cfg = {
    'data_path': '../dataset/combined_with_cluster_feature_with_extraData.csv',  # clustered data
    'targets': ['rnti_count', 'rb_down', 'rb_up', 'down', 'up'],
    'num_lags': 10,               # encoder length (L)
    'forecast_steps': 6,          # forecast horizon (H)
    'test_size': 0.2,             # 80/20 split
    'ignore_cols': None,
    'identifier': 'District',
    'nan_constant': 0,
    'x_scaler': 'minmax',
    'y_scaler': 'minmax',
    'outlier_detection': True,
    'use_time_features': False,   # kept off to mirror the cluster runs
}

# Transformer hyperparameters (match the working eval config).
_model_cfg = {
    'd_model': 128,
    'nhead': 4,
    'num_encoder_layers': 2,
    'num_decoder_layers': 2,
    'dim_feedforward': 256,
    'dropout': 0.1,
}

# Optimisation settings.
_train_cfg = {
    'epochs': 30,                     # can be raised (e.g. 50) if needed
    'batch_size': 64,
    'lr': 1e-3,
    'weight_decay': 0.0,
    'grad_clip': 1.0,
    'early_stopping_patience': 8,     # epochs without val improvement before stopping
}

# Runtime: compute device and where the best weights are written.
_runtime_cfg = {
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
    'save_path': 'transformer_multistep_cluster_with_extra_data.pt',
}

args = Namespace(**_data_cfg, **_model_cfg, **_train_cfg, **_runtime_cfg)

## Process data

In [3]:
# -----------------------
# Load data
# -----------------------
def _build_loader(features, targets, shuffle):
    """Wrap a (features, targets) array pair as float32 tensors in a DataLoader.

    Batches use `args.batch_size`; the final, possibly smaller batch is kept.
    """
    dataset = TensorDataset(
        torch.tensor(features, dtype=torch.float32),
        torch.tensor(targets, dtype=torch.float32),
    )
    return DataLoader(dataset, batch_size=args.batch_size, shuffle=shuffle, drop_last=False)


(X_train, y_train,
 X_test, y_test,
 x_scaler, y_scaler,
 id_train, id_test) = prepare_dataset(args)

# Expected layouts -- X_*: [N, L, D], y_*: [N, H, T]
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_test : {X_test.shape},  y_test : {y_test.shape}")

# Window lengths and feature/target widths, read straight off the arrays.
L, input_size = X_train.shape[1], X_train.shape[2]    # L, D
H, output_size = y_train.shape[1], y_train.shape[2]   # H, T
assert L == args.num_lags and H == args.forecast_steps, "Check num_lags/forecast_steps vs data."

# Training batches are reshuffled every epoch.
train_loader = _build_loader(X_train, y_train, shuffle=True)
# The test split doubles as the validation set for the training loop;
# carve a validation split out of the training data if you prefer.
val_loader = _build_loader(X_test, y_test, shuffle=False)

X_train: (21708, 10, 7), y_train: (21708, 6, 5)
X_test : (5427, 10, 7),  y_test : (5427, 6, 5)


## Model Training

In [4]:
# -----------------------
# Build model
# -----------------------
# Input/output widths were read from the data arrays earlier; every other
# architecture setting comes from `args`.
_transformer_kwargs = {
    'input_size': input_size,
    'output_size': output_size,
    'forecast_steps': args.forecast_steps,
    'd_model': args.d_model,
    'nhead': args.nhead,
    'num_encoder_layers': args.num_encoder_layers,
    'num_decoder_layers': args.num_decoder_layers,
    'dim_feedforward': args.dim_feedforward,
    'dropout': args.dropout,
}
model = TimeSeriesTransformer(**_transformer_kwargs).to(args.device)

# Mean-squared-error objective, optimised with Adam.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)

# Optional scheduler: cosine annealing with a period of `args.epochs`
# (clamped to at least 1 so T_max is always valid).
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=max(1, args.epochs))

In [5]:
# -----------------------
# Train / Val loops
# -----------------------
# Runs up to `args.epochs` epochs. Each epoch does one pass over `train_loader`
# (with gradient updates) and one pass over `val_loader` (no gradients), then
# steps the LR scheduler. Whenever the validation loss improves, the model
# weights are written to `args.save_path`; training stops early after
# `args.early_stopping_patience` consecutive epochs without improvement.
best_val = float('inf')
best_epoch = -1
no_improve = 0

# Mixed precision is enabled only on CUDA; when disabled, the GradScaler
# calls below are pass-throughs.
use_amp = (args.device == 'cuda')
scaler_amp = torch.cuda.amp.GradScaler(enabled=use_amp)

for epoch in range(1, args.epochs + 1):
    # ---- Train ----
    model.train()
    running = 0.0
    for xb, yb in train_loader:
        xb, yb = xb.to(args.device), yb.to(args.device)
        optimizer.zero_grad(set_to_none=True)

        with torch.cuda.amp.autocast(enabled=use_amp):
            preds = model(xb)            # [B, H, T], trained on SCALED space
            loss = criterion(preds, yb)

        scaler_amp.scale(loss).backward()
        if args.grad_clip and args.grad_clip > 0:
            # After a scaled backward pass the gradients still carry the loss
            # scale factor. Unscale them first so the clipping threshold is
            # compared against the true gradient norm; scaler_amp.step() sees
            # that they are already unscaled and does not unscale them again.
            scaler_amp.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
        scaler_amp.step(optimizer)
        scaler_amp.update()

        running += loss.item()

    # Mean of the per-batch mean losses (max() guards against an empty loader).
    train_loss = running / max(1, len(train_loader))

    # ---- Validate ----
    model.eval()
    with torch.no_grad():
        vloss = 0.0
        for xb, yb in val_loader:
            xb, yb = xb.to(args.device), yb.to(args.device)
            preds = model(xb)
            vloss += criterion(preds, yb).item()
        val_loss = vloss / max(1, len(val_loader))

    if scheduler is not None:
        scheduler.step()

    # The LR shown is the value after this epoch's scheduler step.
    print(f"[{epoch:03d}/{args.epochs}] Train {train_loss:.6f} | Val {val_loss:.6f} | LR {optimizer.param_groups[0]['lr']:.2e}")

    # ---- Early Stopping + Best Save ----
    # The small tolerance keeps float noise from counting as an improvement.
    if val_loss < best_val - 1e-8:
        best_val = val_loss
        best_epoch = epoch
        no_improve = 0
        torch.save(model.state_dict(), args.save_path)
    else:
        no_improve += 1
        if no_improve >= args.early_stopping_patience:
            print(f"Early stopping at epoch {epoch} (best={best_val:.6f} @ {best_epoch})")
            break

[001/30] Train 0.081584 | Val 0.025754 | LR 9.97e-04
[002/30] Train 0.016484 | Val 0.020275 | LR 9.89e-04
[003/30] Train 0.013411 | Val 0.018425 | LR 9.76e-04
[004/30] Train 0.012303 | Val 0.022164 | LR 9.57e-04
[005/30] Train 0.011587 | Val 0.018709 | LR 9.33e-04
[006/30] Train 0.011058 | Val 0.018258 | LR 9.05e-04
[007/30] Train 0.010647 | Val 0.017922 | LR 8.72e-04
[008/30] Train 0.010320 | Val 0.019124 | LR 8.35e-04
[009/30] Train 0.010254 | Val 0.018054 | LR 7.94e-04
[010/30] Train 0.010068 | Val 0.019695 | LR 7.50e-04
[011/30] Train 0.010074 | Val 0.018987 | LR 7.03e-04
[012/30] Train 0.009937 | Val 0.016832 | LR 6.55e-04
[013/30] Train 0.009636 | Val 0.016893 | LR 6.04e-04
[014/30] Train 0.009620 | Val 0.017520 | LR 5.52e-04
[015/30] Train 0.009534 | Val 0.017957 | LR 5.00e-04
[016/30] Train 0.009532 | Val 0.017898 | LR 4.48e-04
[017/30] Train 0.009417 | Val 0.017607 | LR 3.96e-04
[018/30] Train 0.009333 | Val 0.017102 | LR 3.45e-04
[019/30] Train 0.009213 | Val 0.016982 | LR 2.

## Evaluation is carried out together with the other models in Notebook #6