In [None]:
"""
time_series_attention_project_fixed.py

Fixed full pipeline:
- Synthetic multivariate dataset generation (>=5000 rows)
- Preprocessing: train/val/test split, scalers, sliding windows
- Seq2Seq LSTM with Bahdanau Attention predicting next N steps
- Training loop with checkpointing
- Evaluation: RMSE, MAE, MASE
- Monte Carlo dropout prediction intervals
- Attention visualization
- Baseline SARIMAX example (single-var)

Notes:
- This version removes the `verbose` arg from ReduceLROnPlateau (older PyTorch compatibility).
- Decoder corrected: no dynamic layers inside forward; teacher forcing implemented correctly.
- Ensure the required packages are installed: numpy pandas matplotlib scikit-learn torch statsmodels tqdm
"""

import os
import math
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

import statsmodels.api as sm

# reproducibility
# The same seed is applied to every RNG this script draws from:
#   - `random`    : used by the decoder's per-step teacher-forcing coin flip
#   - `np.random` : used by the synthetic data generator
#   - `torch`     : used for parameter initialisation, dropout and DataLoader shuffling
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Prefer the GPU when one is visible to PyTorch; otherwise fall back to CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", DEVICE)

# ---------------- 1) Synthetic data generation ----------------
def generate_multivariate_series(n_steps=7000, n_vars=4, noise_std=0.2, seasonal_periods=(24, 168)):
    """Generate a synthetic, cross-coupled multivariate time series.

    The first four columns carry structured signal:

    - ``var0``: linear trend plus three sinusoids, later coupled to ``var3``.
    - ``var1``: signed ``log1p`` transform of ``var0`` plus a period-12 cosine,
      later coupled to ``var2``.
    - ``var2``: slow trend, a period-30 sinusoid and rare large spikes.
    - ``var3``: random walk plus a sinusoid at half of ``seasonal_periods[0]``.

    Any columns beyond the fourth contain only Gaussian noise.

    Args:
        n_steps: Number of rows (time steps) to generate.
        n_vars: Number of columns; must be at least 4 because the generator
            writes to columns 0..3.
        noise_std: Standard deviation of the Gaussian noise added to every cell.
        seasonal_periods: Sequence with at least two periods (in steps); only
            the first two entries are used.

    Returns:
        ``pandas.DataFrame`` of shape ``(n_steps, n_vars)`` with columns
        ``var0 .. var{n_vars-1}``.

    Raises:
        ValueError: If ``n_vars < 4`` or fewer than two seasonal periods are given.

    Note:
        Draws from the global ``np.random`` state, so seed it beforehand for
        reproducible output.
    """
    if n_vars < 4:
        raise ValueError(f"n_vars must be >= 4 (columns 0..3 are always populated), got {n_vars}")
    if len(seasonal_periods) < 2:
        raise ValueError("seasonal_periods must contain at least two periods")

    short_period, long_period = seasonal_periods[0], seasonal_periods[1]
    t = np.arange(n_steps).astype(float)
    data = np.zeros((n_steps, n_vars), dtype=float)

    # var0: trend + multiple seasonalities
    data[:, 0] = (
        0.01 * t
        + 2.0 * np.sin(2 * np.pi * t / short_period)
        + 0.5 * np.sin(2 * np.pi * t / long_period)
        + 0.2 * np.sin(2 * np.pi * t / 7.0)
    )

    # var1: nonlinear function of var0 + additional seasonality
    data[:, 1] = 0.5 * np.log1p(np.abs(data[:, 0])) * np.sign(data[:, 0]) + 0.3 * np.cos(2 * np.pi * t / 12.0)

    # var2: slow trend + spikes (roughly 0.2% of the steps receive a large shock)
    data[:, 2] = 0.005 * t + 0.3 * np.sin(2 * np.pi * t / 30.0)
    spikes = (np.random.rand(n_steps) < 0.002).astype(float)
    data[:, 2] += spikes * (np.random.randn(n_steps) * 5.0)

    # var3: random walk + seasonal forcing
    rw = np.cumsum(0.01 * np.random.randn(n_steps))
    data[:, 3] = rw + 0.4 * np.sin(2 * np.pi * t / (short_period * 0.5))

    # cross-coupling and noise (the noise is what fills any extra columns)
    data[:, 0] += 0.2 * data[:, 3]
    data[:, 1] += 0.1 * data[:, 2]
    data += noise_std * np.random.randn(*data.shape)

    return pd.DataFrame(data, columns=[f"var{i}" for i in range(n_vars)])

# Generate
# Build the synthetic dataset once at module level; `df` is the raw (unscaled)
# frame that every later section slices or scales.
n_steps = 7000
df = generate_multivariate_series(n_steps=n_steps, n_vars=4, noise_std=0.3)
print("Generated data shape:", df.shape)

# ---------------- 2) Preprocessing / Sliding windows ----------------
INPUT_LEN = 96   # number of past steps fed to the encoder
PRED_LEN = 10    # number of future steps the decoder predicts
FEATURES = df.columns.tolist()
OUTPUT_SIZE = len(FEATURES)  # every input feature is also a prediction target

# splits
# Chronological split: first 70% train, next 15% validation, remainder test.
train_frac, val_frac = 0.7, 0.15
train_end = int(len(df) * train_frac)
val_end = int(len(df) * (train_frac + val_frac))
print("Splits -> train_end:", train_end, "val_end:", val_end)

# Standardisation statistics come from the training rows only and are then
# applied to the whole series, so validation/test data do not influence them.
scaler = StandardScaler()
scaler.fit(df.iloc[:train_end].values)  # fit only on train
scaled = scaler.transform(df.values)

class SeqDataset(Dataset):
    """Sliding-window dataset over a 2-D array of time-ordered rows.

    Each sample is a pair ``(x, y)`` where ``x`` holds ``input_len`` consecutive
    rows and ``y`` holds the ``pred_len`` rows that immediately follow. Only
    windows lying entirely inside ``[start_idx, end_idx)`` are produced.

    Args:
        data_array: Array-like indexed by time on its first axis.
        input_len: Length of the input window (must be positive).
        pred_len: Length of the target window (must be positive).
        start_idx: Index of the first row a window may start at (inclusive).
        end_idx: Index one past the last row a window may touch (exclusive).

    Raises:
        ValueError: If a window length is not positive, or the requested range
            falls outside ``data_array``. Such ranges would otherwise yield
            empty or truncated windows through Python slice semantics.
    """

    def __init__(self, data_array, input_len, pred_len, start_idx, end_idx):
        if input_len <= 0 or pred_len <= 0:
            raise ValueError("input_len and pred_len must be positive")
        if start_idx < 0 or end_idx > len(data_array):
            raise ValueError(
                f"window range [{start_idx}, {end_idx}) is outside the data (length {len(data_array)})"
            )
        self.data = data_array
        self.input_len = input_len
        self.pred_len = pred_len
        self.start = start_idx
        self.end = end_idx
        # Last admissible start keeps input + target inside [start, end);
        # the range is empty when the span is too short for a single window.
        self.starts = list(range(self.start, self.end - self.input_len - self.pred_len + 1))

    def __len__(self):
        return len(self.starts)

    def __getitem__(self, idx):
        i = self.starts[idx]
        split = i + self.input_len
        x = self.data[i:split]
        y = self.data[split:split + self.pred_len]
        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

# Validation and test datasets start INPUT_LEN rows before their split boundary:
# the first window's inputs come from the preceding split, while its first target
# row is exactly the first row of the new split.
train_ds = SeqDataset(scaled, INPUT_LEN, PRED_LEN, 0, train_end)
val_ds   = SeqDataset(scaled, INPUT_LEN, PRED_LEN, train_end-INPUT_LEN, val_end)
test_ds  = SeqDataset(scaled, INPUT_LEN, PRED_LEN, val_end-INPUT_LEN, len(df))

BATCH = 64
# Only the training loader shuffles and drops the final incomplete batch;
# evaluation loaders keep chronological order and every sample.
train_loader = DataLoader(train_ds, batch_size=BATCH, shuffle=True, drop_last=True)
val_loader   = DataLoader(val_ds, batch_size=BATCH, shuffle=False)
test_loader  = DataLoader(test_ds, batch_size=BATCH, shuffle=False)

print("Dataset sizes (samples):", len(train_ds), len(val_ds), len(test_ds))

# ---------------- 3) Model: Encoder / Attention / Decoder ----------------
class Encoder(nn.Module):
    """Stacked, batch-first LSTM that encodes the input window.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of each LSTM layer.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout rate handed to ``nn.LSTM``.
    """

    def __init__(self, input_size, hidden_size, n_layers=2, dropout=0.2):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            n_layers,
            batch_first=True,
            dropout=dropout,
        )

    def forward(self, x):
        """Return ``(outputs, (h, c))`` for a ``(B, T, input_size)`` batch.

        ``outputs`` is ``(B, T, hidden_size)``; ``h`` and ``c`` are the final
        hidden and cell states as produced by ``nn.LSTM``.
        """
        sequence_out, (h_final, c_final) = self.lstm(x)
        return sequence_out, (h_final, c_final)

class BahdanauAttention(nn.Module):
    """Additive attention over encoder outputs.

    Scores are ``V(tanh(W1(enc_outputs) + W2(dec_hidden)))`` and are normalised
    with a softmax across the encoder time axis.

    Args:
        enc_hidden: Feature size of the encoder outputs.
        dec_hidden: Feature size of the decoder hidden state.
    """

    def __init__(self, enc_hidden, dec_hidden):
        super().__init__()
        self.W1 = nn.Linear(enc_hidden, dec_hidden)
        self.W2 = nn.Linear(dec_hidden, dec_hidden)
        self.V = nn.Linear(dec_hidden, 1)

    def forward(self, enc_outputs, dec_hidden):
        """Compute the context vector and attention weights.

        Args:
            enc_outputs: ``(B, T_enc, H_enc)`` encoder outputs.
            dec_hidden: ``(B, H_dec)`` current decoder hidden state.

        Returns:
            ``(context, weights)`` with shapes ``(B, H_enc)`` and ``(B, T_enc)``.
        """
        enc_steps = enc_outputs.size(1)
        # Tile the decoder state along the encoder time axis: (B, T_enc, H_dec)
        query = dec_hidden.unsqueeze(1).repeat(1, enc_steps, 1)
        energy = torch.tanh(self.W1(enc_outputs) + self.W2(query))
        weights = torch.softmax(self.V(energy), dim=1)  # (B, T_enc, 1)
        context = torch.sum(weights * enc_outputs, dim=1)  # (B, H_enc)
        return context, weights.squeeze(-1)

class Decoder(nn.Module):
    """Autoregressive ``LSTMCell`` decoder with Bahdanau attention.

    At each step the decoder attends over the encoder outputs using its current
    hidden state, feeds ``[previous output, context]`` to the LSTM cell and
    projects the new hidden state to ``output_size`` values.

    Args:
        output_size: Number of values predicted per step.
        hidden_size: Width of the decoder LSTM cell.
        enc_hidden_size: Feature size of the encoder outputs.
        dropout: Dropout rate applied to the hidden state before the output layer.
    """

    def __init__(self, output_size, hidden_size, enc_hidden_size, dropout=0.2):
        super().__init__()
        # Using LSTMCell for autoregressive decoding.
        # Submodules are created in a fixed order so that seeded parameter
        # initialisation stays reproducible.
        self.hidden_size = hidden_size
        self.lstm_cell = nn.LSTMCell(output_size + enc_hidden_size, hidden_size)
        self.attention = BahdanauAttention(enc_hidden_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout)
        # projection from encoder hidden to initial decoder hidden if dimensions differ
        if enc_hidden_size != hidden_size:
            self.enc2dec = nn.Linear(enc_hidden_size, hidden_size)
        else:
            self.enc2dec = None
        # projection from encoder last output to initial "previous output" (so shapes align)
        if enc_hidden_size != output_size:
            self.enc2out = nn.Linear(enc_hidden_size, output_size)
        else:
            self.enc2out = None

    def forward(self, enc_outputs, y_prev=None, pred_len=10, teacher_forcing_rate=0.5):
        """Decode ``pred_len`` steps from the encoder outputs.

        Args:
            enc_outputs: ``(B, T_enc, H_enc)`` encoder outputs.
            y_prev: Optional ``(B, pred_len, output_size)`` scaled targets. When
                given, each step independently feeds the true target to the
                next step with probability ``teacher_forcing_rate``.
            pred_len: Number of steps to generate.
            teacher_forcing_rate: Per-step probability of teacher forcing.

        Returns:
            ``(outputs, attn_mats)`` with shapes ``(B, pred_len, output_size)``
            and ``(B, pred_len, T_enc)``.

        Raises:
            ValueError: If teacher forcing can trigger but ``y_prev`` holds fewer
                than ``pred_len`` steps.
        """
        B = enc_outputs.size(0)
        device = enc_outputs.device

        if y_prev is not None:
            # Fail up front rather than with an index error at a random step.
            if teacher_forcing_rate > 0 and y_prev.size(1) < pred_len:
                raise ValueError(
                    f"y_prev has {y_prev.size(1)} steps but pred_len={pred_len} requires at least that many"
                )
            # Move targets once instead of once per decoding step.
            y_prev = y_prev.to(device)

        # initialize decoder hidden state from mean of enc outputs (projected if needed)
        enc_mean = torch.mean(enc_outputs, dim=1)  # (B, H_enc)
        if self.enc2dec is not None:
            hx = torch.tanh(self.enc2dec(enc_mean))
        else:
            hx = torch.tanh(enc_mean)
        cx = torch.zeros(B, self.hidden_size, device=device)

        # initial prev_out: last encoder step mapped to output_size.
        # enc2out is None only when enc_hidden_size == output_size, in which
        # case the last encoder output already has the right width.
        last_enc = enc_outputs[:, -1, :]  # (B, H_enc)
        if self.enc2out is not None:
            prev_out = torch.tanh(self.enc2out(last_enc))
        else:
            prev_out = last_enc

        outputs = []
        attn_mats = []

        for t in range(pred_len):
            # compute context via attention using current hx as decoder hidden
            context, attn_weights = self.attention(enc_outputs, hx)  # (B, H_enc), (B, T_enc)
            lstm_input = torch.cat([prev_out, context], dim=1)  # (B, output_size + H_enc)
            hx, cx = self.lstm_cell(lstm_input, (hx, cx))
            out = self.fc(self.dropout(hx))  # (B, output_size)
            outputs.append(out.unsqueeze(1))
            attn_mats.append(attn_weights.unsqueeze(1))  # (B, 1, T_enc)

            # decide teacher forcing; the RNG is only consulted when targets exist
            use_teacher = (y_prev is not None) and (random.random() < teacher_forcing_rate)
            prev_out = y_prev[:, t, :] if use_teacher else out

        outputs = torch.cat(outputs, dim=1)        # (B, pred_len, output_size)
        attn_mats = torch.cat(attn_mats, dim=1)    # (B, pred_len, T_enc)
        return outputs, attn_mats

class Seq2SeqModel(nn.Module):
    """Thin wrapper that chains an encoder and an attention decoder.

    Args:
        encoder: Module returning ``(encoder_outputs, state)`` for an input batch.
        decoder: Module accepting the encoder outputs plus ``y_prev``,
            ``pred_len`` and ``teacher_forcing_rate`` keyword arguments.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, x, y=None, pred_len=PRED_LEN, teacher_forcing_rate=0.5):
        """Encode ``x`` and decode ``pred_len`` steps.

        ``y`` is forwarded to the decoder as ``y_prev`` (teacher-forcing targets).
        Returns the decoder's ``(outputs, attention)`` pair; the encoder's final
        state is discarded.
        """
        memory, _final_state = self.encoder(x)
        forecast, attention = self.decoder(
            memory,
            y_prev=y,
            pred_len=pred_len,
            teacher_forcing_rate=teacher_forcing_rate,
        )
        return forecast, attention

# instantiate model
# Encoder and decoder share HIDDEN_SIZE, so the decoder needs no projection for
# its initial hidden state; it still projects the last encoder output down to
# OUTPUT_SIZE for its first "previous output".
HIDDEN_SIZE = 128
enc = Encoder(input_size=OUTPUT_SIZE, hidden_size=HIDDEN_SIZE, n_layers=2, dropout=0.2)
dec = Decoder(output_size=OUTPUT_SIZE, hidden_size=HIDDEN_SIZE, enc_hidden_size=HIDDEN_SIZE, dropout=0.2)
model = Seq2SeqModel(enc, dec).to(DEVICE)
# Count of trainable parameters only.
print("Model params:", sum(p.numel() for p in model.parameters() if p.requires_grad))

# ---------------- 4) Loss, optimizer, scheduler ----------------
# The loss is computed on scaled values (model outputs vs. scaled targets).
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# Removed `verbose` keyword for compatibility with older PyTorch
# factor=0.5 halves the learning rate; patience=3 is the number of
# non-improving scheduler steps tolerated before a reduction.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=3)

# ---------------- 5) Metrics: MASE, evaluate ----------------
def mase(y_true, y_pred, training_series):
    """Mean Absolute Scaled Error, averaged over features.

    Args:
        y_true: Array ``(n_samples, pred_len, n_features)`` in original scale.
        y_pred: Array of the same shape as ``y_true``.
        training_series: Array ``(n_train, n_features)`` in original scale.

    Returns:
        Scalar float: per-feature forecast MAE divided by the per-feature MAE of
        a one-step naive forecast on the training data, then averaged.
    """
    # Scale: mean absolute one-step change of each training feature.
    naive_step_errors = np.abs(np.diff(training_series, axis=0))  # (n_train-1, n_features)
    scale = naive_step_errors.mean(axis=0)  # (n_features,)
    # A constant training feature would give a zero scale; substitute a tiny value.
    scale = np.where(scale == 0, 1e-8, scale)

    forecast_mae = np.abs(y_true - y_pred).mean(axis=(0, 1))  # (n_features,)
    scaled_errors = forecast_mae / scale
    return float(scaled_errors.mean())

def evaluate_model(model, loader, scaler, training_series_scaled):
    """Run the model over ``loader`` without teacher forcing and score it.

    Predictions and targets are mapped back to the original scale with
    ``scaler.inverse_transform`` before RMSE, MAE and MASE are computed.

    Args:
        model: Callable as ``model(x, y=None, pred_len=..., teacher_forcing_rate=...)``
            returning ``(predictions, attention)``.
        loader: Iterable of ``(x, y)`` batches in scaled units.
        scaler: Fitted scaler exposing ``inverse_transform``.
        training_series_scaled: Scaled training rows used as the MASE reference.

    Returns:
        Dict with float metrics ``"rmse"``, ``"mae"``, ``"mase"`` and the arrays
        ``"preds"``, ``"trues"`` (original scale) and ``"attns"``.

    Note:
        The model is switched to eval mode and left that way.
    """
    model.eval()
    pred_batches, true_batches, attn_batches = [], [], []
    with torch.no_grad():
        for inputs, targets in loader:
            inputs = inputs.to(DEVICE)
            targets = targets.to(DEVICE)
            batch_pred, batch_attn = model(inputs, y=None, pred_len=PRED_LEN, teacher_forcing_rate=0.0)
            pred_batches.append(batch_pred.cpu().numpy())
            true_batches.append(targets.cpu().numpy())
            attn_batches.append(batch_attn.cpu().numpy())

    preds = np.concatenate(pred_batches, axis=0)  # (N, pred_len, F) scaled
    trues = np.concatenate(true_batches, axis=0)
    atts = np.concatenate(attn_batches, axis=0)   # (N, pred_len, enc_T)

    n_samples, horizon, n_feat = preds.shape

    def _to_original_scale(arr):
        # The scaler works on 2-D input, so flatten the sample/time axes first.
        flat = arr.reshape(-1, n_feat)
        return scaler.inverse_transform(flat).reshape(n_samples, horizon, n_feat)

    preds_inv = _to_original_scale(preds)
    trues_inv = _to_original_scale(trues)

    flat_true = trues_inv.reshape(-1, n_feat)
    flat_pred = preds_inv.reshape(-1, n_feat)
    rmse = math.sqrt(mean_squared_error(flat_true, flat_pred))
    mae = mean_absolute_error(flat_true, flat_pred)
    mase_val = mase(trues_inv, preds_inv, scaler.inverse_transform(training_series_scaled))

    results = {
        "rmse": float(rmse),
        "mae": float(mae),
        "mase": float(mase_val),
        "preds": preds_inv,
        "trues": trues_inv,
        "attns": atts,
    }
    return results

# ---------------- 6) Training loop ----------------
EPOCHS = 25
best_val_metric = float("inf")  # lowest validation MAE seen so far
save_dir = "outputs_ts_attention_fixed"
os.makedirs(save_dir, exist_ok=True)

for ep in range(1, EPOCHS+1):
    # evaluate_model() leaves the model in eval mode, so re-enable training mode every epoch.
    model.train()
    losses = []
    for xb, yb in train_loader:
        xb = xb.to(DEVICE)
        yb = yb.to(DEVICE)
        optimizer.zero_grad()
        # Targets are passed in so the decoder can apply teacher forcing (per-step probability 0.5).
        outb, _ = model(xb, y=yb, pred_len=PRED_LEN, teacher_forcing_rate=0.5)
        loss = criterion(outb, yb)
        loss.backward()
        # Clip the global gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        losses.append(loss.item())
    # Training loss is MSE in scaled units; guard against an empty loader.
    mean_train_loss = float(np.mean(losses)) if losses else 0.0

    # validation metrics
    # These are computed in original (inverse-transformed) units, so they are not
    # directly comparable with the scaled training loss printed next to them.
    val_metrics = evaluate_model(model, val_loader, scaler, scaled[:train_end])
    val_mae = val_metrics["mae"]
    scheduler.step(val_mae)  # step scheduler with monitored metric

    print(f"Epoch {ep}/{EPOCHS}  train_loss={mean_train_loss:.6f}  val_mae={val_mae:.6f}  val_rmse={val_metrics['rmse']:.6f}  val_mase={val_metrics['mase']:.4f}")

    # save best by val_mae
    # NOTE(review): if val_mae never compares below the current best (e.g. it is NaN),
    # no checkpoint file is written by this loop.
    if val_mae < best_val_metric:
        best_val_metric = val_mae
        torch.save(model.state_dict(), os.path.join(save_dir, "best_model.pth"))
        print("Saved best model (val_mae improved).")

# ---------------- 7) Load best and evaluate on test ----------------
# Restore the checkpoint with the lowest validation MAE before scoring the test
# split; map_location places the tensors on the current DEVICE.
model.load_state_dict(torch.load(os.path.join(save_dir, "best_model.pth"), map_location=DEVICE))
test_metrics = evaluate_model(model, test_loader, scaler, scaled[:train_end])
print("TEST metrics:", test_metrics["rmse"], test_metrics["mae"], test_metrics["mase"])

# save metrics & arrays
# Metrics go to a one-column CSV indexed by metric name; predictions and targets
# (original scale) are stored as .npy arrays.
pd.DataFrame([test_metrics["rmse"], test_metrics["mae"], test_metrics["mase"]], index=["rmse","mae","mase"]).to_csv(os.path.join(save_dir, "test_metrics.csv"))
np.save(os.path.join(save_dir, "preds.npy"), test_metrics["preds"])
np.save(os.path.join(save_dir, "trues.npy"), test_metrics["trues"])

# ---------------- 8) Monte Carlo dropout prediction intervals ----------------
def mc_dropout_predict(model, x_tensor, scaler, mc_runs=50):
    """Estimate prediction intervals with Monte Carlo dropout.

    The model is run ``mc_runs`` times in training mode (so dropout stays
    active) with gradients disabled. The mean and the 2.5th / 97.5th percentiles
    are taken across runs, separately for every sample, step and feature.

    Args:
        model: Model callable as ``model(x, y=None, pred_len=..., teacher_forcing_rate=...)``.
        x_tensor: ``(B, input_len, features)`` input in scaled units.
        scaler: Fitted scaler exposing ``inverse_transform``.
        mc_runs: Number of stochastic forward passes (at least 1).

    Returns:
        ``(mean, lower, upper)`` in original scale. Each has shape
        ``(pred_len, features)`` when ``B == 1`` and ``(B, pred_len, features)``
        otherwise.

    Raises:
        ValueError: If ``mc_runs`` is less than 1.
    """
    if mc_runs < 1:
        raise ValueError("mc_runs must be at least 1")

    was_training = model.training
    x_tensor = x_tensor.to(DEVICE)
    draws = []
    # enable dropout during inference
    model.train()
    try:
        with torch.no_grad():
            for _ in range(mc_runs):
                out, _ = model(x_tensor, y=None, pred_len=PRED_LEN, teacher_forcing_rate=0.0)
                draws.append(out.cpu().numpy())
    finally:
        # Put the model back in whichever mode the caller had it in.
        model.train(was_training)

    # Stack on a new leading axis so MC runs never mix with the batch axis:
    # (mc_runs, B, pred_len, F)
    draws = np.stack(draws, axis=0)
    mean = draws.mean(axis=0)
    lower = np.percentile(draws, 2.5, axis=0)
    upper = np.percentile(draws, 97.5, axis=0)

    def _inverse(arr):
        # The scaler expects 2-D input: flatten, invert, restore the shape.
        return scaler.inverse_transform(arr.reshape(-1, arr.shape[-1])).reshape(arr.shape)

    # inverse scale
    mean_inv, lower_inv, upper_inv = _inverse(mean), _inverse(lower), _inverse(upper)
    if mean_inv.shape[0] == 1:
        # Single-sample call: drop the batch axis to return (pred_len, F).
        mean_inv, lower_inv, upper_inv = mean_inv[0], lower_inv[0], upper_inv[0]
    return mean_inv, lower_inv, upper_inv

# Example: MC on first test sample
# test_ds[0] yields a single (input_len, F) window, so a batch axis is added
# before it is handed to mc_dropout_predict.
x0, y0 = test_ds[0]
mean_pred, lower_pred, upper_pred = mc_dropout_predict(model, x0.unsqueeze(0), scaler, mc_runs=40)

# Plot var0 example
plt.figure(figsize=(9,3))
t = np.arange(PRED_LEN)
# y0 is in scaled units; convert it back so it is comparable with the
# inverse-transformed predictions.
plt.plot(t, scaler.inverse_transform(y0.numpy())[:,0], label="true")
plt.plot(t, mean_pred[:,0], label="pred_mean")
# Band between the 2.5th and 97.5th percentiles of the MC runs.
plt.fill_between(t, lower_pred[:,0], upper_pred[:,0], alpha=0.3, label="95% PI")
plt.legend()
plt.title("MC Dropout prediction intervals (sample 0, var0)")
plt.tight_layout()
plt.savefig(os.path.join(save_dir, "mc_dropout_example_var0.png"), dpi=150)
plt.close()

# ---------------- 9) Attention visualizations ----------------
# Eval mode disables dropout so the plotted attention weights are deterministic.
model.eval()
with torch.no_grad():
    # Only the first test batch is used for the plots below.
    xb, yb = next(iter(test_loader))
    outb, attb = model(xb.to(DEVICE), y=None, pred_len=PRED_LEN, teacher_forcing_rate=0.0)
    att_np = attb.cpu().numpy()  # (B, pred_len, enc_T)

# visualize sample 0 attention matrix
sample_idx = 0
att_sample = att_np[sample_idx]  # (pred_len, enc_T)
plt.figure(figsize=(12,4))
# Rows are decoder steps, columns are encoder time steps.
plt.imshow(att_sample, aspect='auto', cmap='viridis')
plt.colorbar()
plt.xlabel("Encoder time step (input history)")
plt.ylabel("Decoder step (prediction horizon)")
plt.title("Attention matrix (sample 0)")
plt.tight_layout()
plt.savefig(os.path.join(save_dir, "attention_matrix_sample0.png"), dpi=150)
plt.close()

# aggregated attention
# Average over the decoder steps to get one weight per encoder time step.
avg_att = att_sample.mean(axis=0)
plt.figure(figsize=(10,3))
plt.plot(avg_att)
plt.title("Average attention weight across prediction horizon (sample 0)")
plt.xlabel("Encoder time step (older -> recent)")
plt.tight_layout()
plt.savefig(os.path.join(save_dir, "attention_avg_sample0.png"), dpi=150)
plt.close()

# ---------------- 10) Baseline SARIMAX example (single var) ----------------
# Univariate baseline on var0 in original (unscaled) units, fitted on the
# training portion only.
y = df["var0"].values
train_y = y[:train_end]
# Best-effort example: any failure while fitting, forecasting or writing the
# summary is reported and the script carries on.
try:
    sarima = sm.tsa.SARIMAX(train_y, order=(2,1,2), seasonal_order=(1,0,1,24),
                             enforce_stationarity=False, enforce_invertibility=False)
    sarima_res = sarima.fit(disp=False)
    # Forecast the PRED_LEN steps that follow the end of the training data.
    fc = sarima_res.get_forecast(steps=PRED_LEN)
    fc_mean = fc.predicted_mean
    print("SARIMAX example forecast (first values):", fc_mean[:5])
    with open(os.path.join(save_dir, "sarima_summary.txt"), "w") as f:
        f.write(str(sarima_res.summary()))
except Exception as e:
    print("SARIMAX failed (example). Tweak orders and try again.", e)

print("All outputs saved to", save_dir)


Device: cpu
Generated data shape: (7000, 4)
Splits -> train_end: 4900 val_end: 5950
Dataset sizes (samples): 4795 1041 1041
Model params: 369033
