# In [3]:
#!/usr/bin/env python3
"""
ts_forecast_attention_fixed.py

Fixed, self-contained multi-step forecasting script implementing:
- Synthetic multivariate dataset (>=6000 timesteps)
- Preprocessing (MinMaxScaler)
- Seq2Seq LSTM encoder-decoder with additive attention
- Robust decoder hidden-state projection (fix for mat1/mat2 errors)
- SARIMA baseline evaluation (per-feature)
- Training loop, early stopping, evaluation metrics (RMSE/MAE/MASE)
- Attention visualization and Monte Carlo dropout intervals
- Auto-generates a `report.md` and saves figures into `report/figures`

Usage:
    python ts_forecast_attention_fixed.py --run

Dependencies (requirements.txt):
    numpy
    torch
    matplotlib
    pandas
    statsmodels

This script is intentionally single-file for easy submission. It saves:
    - best model: best_seq2seq_attn.pth
    - report: report/report.md and figures in report/figures/

Author: Generated by ChatGPT (fixed patch)
"""

import os
import math
import random
import argparse
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Optional baseline
try:
    from statsmodels.tsa.statespace.sarimax import SARIMAX
    _HAS_STATS = True
except Exception:
    _HAS_STATS = False

# -----------------------------
# Repro and device
# -----------------------------
# Single seed shared by Python's `random`, NumPy and PyTorch so that data
# generation, weight initialisation and teacher-forcing draws are repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# -----------------------------
# Synthetic data generator
# -----------------------------
def generate_multivariate_series(n_steps=6000, n_series=3, noise_std=0.2):
    """Create a synthetic multivariate time series.

    Every column is the sum of four parts: a two-harmonic seasonal signal, a
    linear trend, an AR(2) component driven by the lagged seasonal signal, and
    Gaussian noise drawn from NumPy's global random state.

    Args:
        n_steps: Number of timesteps (rows).
        n_series: Number of series (columns).
        noise_std: Standard deviation of the additive Gaussian noise.

    Returns:
        Array of shape ``(n_steps, n_series)``.
    """
    time_idx = np.arange(n_steps)
    columns = []
    for k in range(n_series):
        # Frequency, phase and trend slope all grow with the series index.
        omega = 2 * np.pi * (0.01 + 0.01 * k)
        offset = k * 0.5
        drift = 0.0005 * time_idx * (1 + 0.2 * k)
        cycle = np.sin(omega * time_idx + offset) + 0.5 * np.sin(omega * 3 * time_idx + offset/2)
        jitter = np.random.normal(scale=noise_std, size=n_steps)

        # AR(2) recursion; the first two entries stay at zero.
        autoreg = np.zeros(n_steps)
        lag1 = lag2 = 0.0
        for i in range(2, n_steps):
            current = 0.6 * lag1 - 0.2 * lag2 + 0.05 * cycle[i-1]
            autoreg[i] = current
            lag2, lag1 = lag1, current

        columns.append(cycle + drift + autoreg + jitter)
    return np.stack(columns, axis=1)

# -----------------------------
# Preprocessing
# -----------------------------
class MinMaxScaler:
    """Per-column min-max scaler.

    ``fit`` stores the column minima, maxima and ranges; a zero range (constant
    column) is replaced by 1.0 so that ``transform`` never divides by zero.
    """

    def fit(self, data: np.ndarray):
        """Record column-wise statistics of ``data`` (reduced along axis 0)."""
        lowest = data.min(axis=0)
        highest = data.max(axis=0)
        span = highest - lowest
        self.min = lowest
        self.max = highest
        self.range = np.where(span == 0, 1.0, span)

    def transform(self, data: np.ndarray):
        """Shift by the fitted minimum and divide by the fitted range."""
        shifted = data - self.min
        return shifted / self.range

    def inverse_transform(self, data: np.ndarray):
        """Undo ``transform``: multiply by the range and add the minimum back."""
        rescaled = data * self.range
        return rescaled + self.min

def create_windows(data: np.ndarray, in_len: int, out_len: int, step=1):
    """Slice a 2-D series into (input, target) sliding windows.

    Args:
        data: Array of shape ``(T, F)``.
        in_len: Number of timesteps in each input window.
        out_len: Number of timesteps in each target window, taken immediately
            after the input window.
        step: Stride between consecutive window start indices.

    Returns:
        Tuple ``(X, Y)`` with shapes ``(N, in_len, F)`` and ``(N, out_len, F)``.
        When the series is too short for even one window, ``N`` is 0 and
        correctly shaped empty arrays are returned instead of letting
        ``np.stack`` fail on an empty list.
    """
    T, F = data.shape
    starts = range(0, T - in_len - out_len + 1, step)
    if len(starts) == 0:
        empty_x = np.empty((0, in_len, F), dtype=data.dtype)
        empty_y = np.empty((0, out_len, F), dtype=data.dtype)
        return empty_x, empty_y
    Xs = [data[start : start + in_len] for start in starts]
    Ys = [data[start + in_len : start + in_len + out_len] for start in starts]
    return np.stack(Xs), np.stack(Ys)

class SeqDataset(Dataset):
    """Dataset of paired input/target windows stored as float32 copies."""

    def __init__(self, X, Y):
        # astype always copies, so the dataset never aliases the caller's arrays.
        self.X, self.Y = (arr.astype(np.float32) for arr in (X, Y))

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        inputs = self.X[idx]
        targets = self.Y[idx]
        return inputs, targets

# -----------------------------
# Model components
# -----------------------------
class EncoderLSTM(nn.Module):
    """Batch-first LSTM encoder.

    Args:
        input_dim: Number of input features per timestep.
        hidden_dim: LSTM hidden size.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout between stacked layers. Only forwarded to ``nn.LSTM``
            when ``num_layers > 1``, because a single-layer LSTM has no
            inter-layer connection to drop and PyTorch warns about it.
    """

    def __init__(self, input_dim, hidden_dim, num_layers=1, dropout=0.1):
        super().__init__()
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=num_layers, batch_first=True, dropout=lstm_dropout)

    def forward(self, x):
        """Return ``(outputs, (h, c))`` exactly as produced by ``nn.LSTM``."""
        out, (h, c) = self.lstm(x)
        return out, (h, c)

class AdditiveAttention(nn.Module):
    """Additive attention: score = v^T tanh(W_enc * enc + W_dec * dec).

    Args:
        enc_dim: Feature size of the encoder outputs.
        dec_dim: Feature size of the decoder hidden state.
        attn_dim: Size of the shared projection space.
    """

    def __init__(self, enc_dim, dec_dim, attn_dim):
        super().__init__()
        self.W_enc = nn.Linear(enc_dim, attn_dim, bias=False)
        self.W_dec = nn.Linear(dec_dim, attn_dim, bias=False)
        self.v = nn.Linear(attn_dim, 1, bias=False)

    def forward(self, enc_outputs, dec_hidden):
        """Return ``(context, attn_weights)`` for one decoder state.

        The weights are a softmax over dim 1 of the scores; the context is the
        weighted sum of ``enc_outputs`` under those weights.
        """
        keys = self.W_enc(enc_outputs)                 # (B, S, A)
        query = self.W_dec(dec_hidden).unsqueeze(1)    # (B, 1, A), broadcast over S
        energy = torch.tanh(keys + query)
        logits = self.v(energy).squeeze(-1)            # (B, S)
        attn_weights = logits.softmax(dim=1)
        weighted = torch.bmm(attn_weights.unsqueeze(1), enc_outputs)
        return weighted.squeeze(1), attn_weights

class DecoderLSTMWithAttn(nn.Module):
    """LSTM decoder that attends over the encoder outputs at every step.

    Each step computes an attention context from the top-layer hidden state,
    concatenates it with the previous output, projects the pair to ``dec_dim``
    and runs one LSTM step; a linear head maps the result to ``out_dim``.

    Predictions (size ``out_dim``) are fed back as the next step's input (size
    ``input_dim``), so autoregressive decoding expects the two to be equal.

    Args:
        input_dim: Feature size of the previous-step value fed to the decoder.
        enc_dim: Feature size of the encoder outputs.
        dec_dim: Decoder LSTM hidden size.
        out_dim: Number of features predicted per step.
        attn_dim: Size of the attention projection space.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout on the LSTM output; also used between stacked LSTM
            layers when ``num_layers > 1``.
    """

    # Horizon used when neither ``targets`` nor ``out_len`` is supplied.
    DEFAULT_OUT_LEN = 10

    def __init__(self, input_dim, enc_dim, dec_dim, out_dim, attn_dim, num_layers=1, dropout=0.1):
        super().__init__()
        self.input_dim = input_dim
        self.out_dim = out_dim
        # input_dim is feature size of previous step, enc_dim is encoder hidden size
        self.input_proj = nn.Linear(input_dim + enc_dim, dec_dim)
        # A single-layer LSTM has no inter-layer dropout; passing a non-zero
        # value there only triggers a PyTorch warning.
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(dec_dim, dec_dim, num_layers=num_layers, batch_first=True, dropout=lstm_dropout)
        self.attn = AdditiveAttention(enc_dim, dec_dim, attn_dim)
        self.out = nn.Linear(dec_dim, out_dim)
        self.dropout = nn.Dropout(dropout)

    def forward_step(self, prev_y, enc_outputs, hidden):
        """Run one decoding step.

        Returns:
            ``(y_pred, hidden, attn_weights)`` where ``hidden`` is the updated
            LSTM state tuple.
        """
        dec_hidden = hidden[0][-1]  # hidden state of the top LSTM layer
        context, attn_weights = self.attn(enc_outputs, dec_hidden)
        combined = torch.cat([prev_y, context], dim=1)
        dec_input = torch.tanh(self.input_proj(combined)).unsqueeze(1)
        out, hidden = self.lstm(dec_input, hidden)
        out = out.squeeze(1)
        out = self.dropout(out)
        y_pred = self.out(out)
        return y_pred, hidden, attn_weights

    def forward(self, enc_outputs, dec_init_hidden, targets=None, teacher_forcing_ratio=0.5, out_len=None):
        """Decode a full horizon.

        Args:
            enc_outputs: Encoder outputs, batch first.
            dec_init_hidden: Initial ``(h, c)`` state for the decoder LSTM.
            targets: Optional ground-truth horizon used for teacher forcing and,
                when ``out_len`` is not given, for the number of steps.
            teacher_forcing_ratio: Per-step probability of feeding the target
                instead of the model's own prediction.
            out_len: Optional number of steps to decode. Defaults to the target
                length, or ``DEFAULT_OUT_LEN`` when there are no targets.

        Returns:
            ``(preds, attns)``: predictions and attention weights concatenated
            along dim 1 (one entry per decoding step).
        """
        B = enc_outputs.size(0)
        device = enc_outputs.device
        if out_len is None:
            out_len = targets.size(1) if targets is not None else self.DEFAULT_OUT_LEN
        preds = []
        attns = []
        # The first input must have the decoder's own feature width. Deriving it
        # from enc_outputs (whose last dim is enc_dim) breaks input_proj whenever
        # enc_dim != input_dim, which is the normal inference case.
        prev_y = enc_outputs.new_zeros(B, self.input_dim)
        hidden = dec_init_hidden
        for t in range(out_len):
            y_pred, hidden, attn_w = self.forward_step(prev_y, enc_outputs, hidden)
            preds.append(y_pred.unsqueeze(1))
            attns.append(attn_w.unsqueeze(1))
            # Teacher forcing is only possible while a target step exists.
            use_target = (
                targets is not None
                and t < targets.size(1)
                and random.random() < teacher_forcing_ratio
            )
            if use_target:
                prev_y = targets[:, t, :].to(device)
            else:
                prev_y = y_pred.detach()
        preds = torch.cat(preds, dim=1)
        attns = torch.cat(attns, dim=1)
        return preds, attns

class Seq2SeqAttnModel(nn.Module):
    """Encoder-decoder forecaster with attention.

    The encoder's final ``(h, c)`` state is mapped to the decoder's hidden size
    through two tanh-activated linear layers, so ``enc_dim`` and ``dec_dim`` may
    differ.

    Args:
        input_dim: Number of input features.
        enc_dim: Encoder hidden size.
        dec_dim: Decoder hidden size.
        attn_dim: Attention projection size.
        out_dim: Number of predicted features; falls back to ``input_dim`` when
            falsy.
        dropout: Dropout rate forwarded to encoder and decoder.
        num_layers: Number of LSTM layers in both encoder and decoder.
    """

    def __init__(self, input_dim, enc_dim=64, dec_dim=64, attn_dim=32, out_dim=None, dropout=0.1, num_layers=1):
        super().__init__()
        feature_dim = out_dim if out_dim else input_dim
        self.encoder = EncoderLSTM(input_dim, enc_dim, num_layers=num_layers, dropout=dropout)
        self.decoder = DecoderLSTMWithAttn(
            feature_dim, enc_dim, dec_dim, feature_dim, attn_dim,
            num_layers=num_layers, dropout=dropout,
        )
        # robust projection layers: project encoder-hidden dim -> decoder hidden dim
        self.enc2dec_h = nn.Linear(enc_dim, dec_dim)
        self.enc2dec_c = nn.Linear(enc_dim, dec_dim)

    @staticmethod
    def _bridge(state, projection):
        """Project a (num_layers, B, enc_dim) state to (num_layers, B, dec_dim)."""
        batch_major = state.transpose(0, 1).contiguous()       # (B, num_layers, enc_dim)
        projected = torch.tanh(projection(batch_major))        # (B, num_layers, dec_dim)
        return projected.transpose(0, 1).contiguous()          # back to layer-major

    def forward(self, src, tgt=None, teacher_forcing_ratio=0.5):
        """Encode ``src`` and decode a forecast; returns ``(preds, attns)``."""
        enc_out, (enc_h, enc_c) = self.encoder(src)
        dec_hidden = (
            self._bridge(enc_h, self.enc2dec_h),
            self._bridge(enc_c, self.enc2dec_c),
        )
        return self.decoder(enc_out, dec_hidden, targets=tgt, teacher_forcing_ratio=teacher_forcing_ratio)

# -----------------------------
# Training, evaluation, metrics
# -----------------------------

def train_epoch(model, dataloader, optimizer, criterion, device, clip=1.0, teacher_forcing_ratio=0.6):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        model: Module called as ``model(X, tgt=Y, teacher_forcing_ratio=...)``
            and returning ``(preds, attns)``.
        dataloader: Yields ``(X, Y)`` batches; its ``dataset`` must support
            ``len``.
        optimizer: Optimizer over the model parameters.
        criterion: Loss called as ``criterion(preds, Y)``.
        device: Device the batches are moved to.
        clip: Maximum gradient norm applied before each optimizer step.
        teacher_forcing_ratio: Teacher-forcing probability passed to the model
            (previously fixed at 0.6, which remains the default).

    Returns:
        Sample-weighted mean loss over the epoch.
    """
    model.train()
    total_loss = 0.0
    for Xb, Yb in dataloader:
        Xb = Xb.to(device)
        Yb = Yb.to(device)
        optimizer.zero_grad()
        preds, _ = model(Xb, tgt=Yb, teacher_forcing_ratio=teacher_forcing_ratio)
        loss = criterion(preds, Yb)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        # Weight by batch size so a smaller last batch does not skew the mean.
        total_loss += loss.item() * Xb.size(0)
    return total_loss / len(dataloader.dataset)


def evaluate(model, dataloader, criterion, device):
    """Score ``model`` on ``dataloader`` without teacher forcing or gradients.

    Returns:
        ``(mean_loss, preds, trues, attns)`` where the loss is the
        sample-weighted mean and the three arrays are the per-batch NumPy
        results concatenated along axis 0.
    """
    model.eval()
    loss_sum = 0.0
    pred_chunks, true_chunks, attn_chunks = [], [], []
    with torch.no_grad():
        for batch_x, batch_y in dataloader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            outputs, weights = model(batch_x, tgt=None, teacher_forcing_ratio=0.0)
            batch_loss = criterion(outputs, batch_y)
            loss_sum += batch_loss.item() * batch_x.size(0)
            pred_chunks.append(outputs.cpu().numpy())
            true_chunks.append(batch_y.cpu().numpy())
            attn_chunks.append(weights.cpu().numpy())
    preds_arr, trues_arr, attns_arr = (
        np.concatenate(chunks, axis=0)
        for chunks in (pred_chunks, true_chunks, attn_chunks)
    )
    mean_loss = loss_sum / len(dataloader.dataset)
    return mean_loss, preds_arr, trues_arr, attns_arr

# metrics

def rmse(y_pred, y_true):
    """Root mean squared error over all elements."""
    residual = y_pred - y_true
    return np.sqrt(np.mean(residual ** 2))

def mae(y_pred, y_true):
    """Mean absolute error over all elements."""
    residual = y_pred - y_true
    return np.mean(np.abs(residual))

def mase(y_pred, y_true, train_series):
    """Mean absolute scaled error.

    The model MAE is divided by the MAE of a one-step naive forecast on
    ``train_series`` (mean absolute first difference along axis 0); 1e-8 is
    added to the denominator to avoid division by zero.
    """
    naive_mae = np.mean(np.abs(np.diff(train_series, axis=0)))
    model_mae = np.mean(np.abs(y_pred - y_true))
    return model_mae / (naive_mae + 1e-8)

# MC dropout

def mc_dropout_predict(model, X, n_samples=50):
    """Draw Monte Carlo dropout forecasts.

    The model is switched to training mode so dropout stays active, run
    ``n_samples`` times without gradients, and then returned to whatever mode
    it was in before the call (also when a forward pass raises).

    Args:
        model: Module called as ``model(X, tgt=None, teacher_forcing_ratio=0.0)``
            and returning ``(preds, attns)``.
        X: Input tensor; moved to the device of the model's parameters (or left
            where it is when the model has no parameters).
        n_samples: Number of stochastic forward passes; must be at least 1.

    Returns:
        NumPy array with the stacked predictions; axis 0 indexes the samples.

    Raises:
        ValueError: If ``n_samples`` is smaller than 1.
    """
    if n_samples < 1:
        raise ValueError("n_samples must be at least 1")

    # Follow the model's own device instead of relying on a module-level global.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else X.device
    X = X.to(run_device)

    was_training = model.training
    model.train()
    draws = []
    try:
        with torch.no_grad():
            for _ in range(n_samples):
                sample, _attn = model(X, tgt=None, teacher_forcing_ratio=0.0)
                draws.append(sample.cpu().numpy())
    finally:
        # Restore the caller's mode rather than forcing eval().
        model.train(was_training)
    return np.stack(draws, axis=0)

# -----------------------------
# SARIMA baseline helper
# -----------------------------

def sarima_forecast_series(train_series, steps=10, order=(1,1,1), seasonal_order=(0,0,0,0)):
    """Fit a SARIMAX model on ``train_series`` and forecast ``steps`` ahead.

    Stationarity and invertibility are not enforced during fitting.

    Returns:
        NumPy array with the predictions for the ``steps`` indices that follow
        the end of ``train_series``.

    Raises:
        RuntimeError: If statsmodels could not be imported.
    """
    if not _HAS_STATS:
        raise RuntimeError("statsmodels not available. Install statsmodels to run SARIMA baseline.")
    n_obs = len(train_series)
    fitted = SARIMAX(
        train_series,
        order=order,
        seasonal_order=seasonal_order,
        enforce_stationarity=False,
        enforce_invertibility=False,
    ).fit(disp=False)
    # Out-of-sample range: first index after the data through steps-1 later.
    forecast = fitted.predict(start=n_obs, end=n_obs + steps - 1)
    return np.asarray(forecast)

# -----------------------------
# Runner
# -----------------------------

def _sarima_baseline_rows(data, origin, out_len):
    """Score a per-feature SARIMA(1,1,1) forecast issued at time index ``origin``."""
    rows = []
    for feat in range(data.shape[1]):
        try:
            # Fit on everything observed before the forecast origin so the
            # forecast covers the same timestamps as the window it is scored on.
            sar_pred = sarima_forecast_series(data[:origin, feat], steps=out_len, order=(1, 1, 1))
            y_true_sample = data[origin:origin + out_len, feat]
            rows.append({
                'feature': feat,
                'sarima_rmse': float(rmse(sar_pred, y_true_sample)),
                'sarima_mae': float(mae(sar_pred, y_true_sample)),
            })
        except Exception as e:
            # Best-effort baseline: record the failure instead of aborting the run.
            rows.append({'feature': feat, 'error': str(e)})
    return rows


def _write_report(path, n_steps, n_series, in_len, out_len, rmse_val, mae_val, mase_val, with_sarima):
    """Write the markdown summary report to ``path`` (UTF-8)."""
    report_lines = [
        '# Advanced Time Series Forecasting — Report',
        '\n',
        '## Summary',
        '- Model: Seq2Seq LSTM with Additive Attention',
        f'- Data: synthetic multivariate, {n_steps} timesteps, {n_series} features',
        f'- Input window: {in_len}, Forecast horizon: {out_len}',
        f'- Test RMSE: {rmse_val:.6f}, MAE: {mae_val:.6f}, MASE: {mase_val:.6f}',
        '\n',
        '## Files',
        '- Figures: report/figures/',
        '- Model weights: best_seq2seq_attn.pth',
        '\n',
        '## Methodology',
        '- See code for model architecture and training details.',
        '\n',
        '## Attention analysis',
        'See report/figures/attention_heatmap_sample.png and report/figures/avg_attention_importance.png',
        '\n',
        '## Prediction intervals',
        'See report/figures/mc_dropout_pi_feat0.png',
        '\n',
    ]
    if with_sarima:
        report_lines += [
            '## SARIMA baseline metrics',
            'See report/figures/sarima_metrics.csv',
            '\n',
        ]
    with open(path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(report_lines))


def run_pipeline(run_cfg):
    """Run the full experiment: data, training, evaluation, figures and report.

    Args:
        run_cfg: Dict with keys ``n_steps``, ``n_series``, ``in_len``,
            ``out_len``, ``batch_size``, ``epochs``, ``lr``, ``enc_dim``,
            ``dec_dim``, ``attn_dim``, ``dropout`` and ``num_layers``. An
            optional ``patience`` key enables early stopping after that many
            consecutive epochs without a validation improvement.

    Side effects:
        Writes ``best_seq2seq_attn.pth`` in the working directory and CSV/PNG
        artefacts plus ``report.md`` under ``report/``.
    """
    n_steps = run_cfg['n_steps']
    n_series = run_cfg['n_series']
    in_len = run_cfg['in_len']
    out_len = run_cfg['out_len']
    patience = run_cfg.get('patience')

    data = generate_multivariate_series(n_steps=n_steps, n_series=n_series, noise_std=0.25)
    print("Data shape:", data.shape)

    # Chronological 70 / 15 / 15 split; the scaler only ever sees training rows.
    train_end = int(n_steps * 0.7)
    val_end = int(n_steps * 0.85)
    train_data = data[:train_end]

    scaler = MinMaxScaler()
    scaler.fit(train_data)
    data_scaled = scaler.transform(data)

    # A window belongs to the split in which its last target step falls.
    X_all, Y_all = create_windows(data_scaled, in_len=in_len, out_len=out_len, step=1)
    starts = np.arange(0, n_steps - in_len - out_len + 1)
    window_end = starts + in_len + out_len
    train_mask = window_end <= train_end
    val_mask = (window_end > train_end) & (window_end <= val_end)
    test_mask = window_end > val_end

    X_train, Y_train = X_all[train_mask], Y_all[train_mask]
    X_val, Y_val = X_all[val_mask], Y_all[val_mask]
    X_test, Y_test = X_all[test_mask], Y_all[test_mask]
    print("Windows: train", X_train.shape, "val", X_val.shape, "test", X_test.shape)

    train_ds = SeqDataset(X_train, Y_train)
    val_ds = SeqDataset(X_val, Y_val)
    test_ds = SeqDataset(X_test, Y_test)

    train_loader = DataLoader(train_ds, batch_size=run_cfg['batch_size'], shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=run_cfg['batch_size'], shuffle=False)
    test_loader = DataLoader(test_ds, batch_size=run_cfg['batch_size'], shuffle=False)

    model = Seq2SeqAttnModel(
        input_dim=n_series,
        enc_dim=run_cfg['enc_dim'],
        dec_dim=run_cfg['dec_dim'],
        attn_dim=run_cfg['attn_dim'],
        out_dim=n_series,
        dropout=run_cfg['dropout'],
        num_layers=run_cfg['num_layers'],
    )
    model.to(device)

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=run_cfg['lr'])

    best_val = float('inf')
    best_epoch = -1
    stale_epochs = 0
    os.makedirs('report/figures', exist_ok=True)
    history = []

    for epoch in range(1, run_cfg['epochs']+1):
        train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
        val_loss, _, _, _ = evaluate(model, val_loader, criterion, device)
        history.append({'epoch': epoch, 'train_loss': train_loss, 'val_loss': val_loss})
        print(f"Epoch {epoch} Train {train_loss:.6f} Val {val_loss:.6f}")
        if val_loss < best_val:
            best_val = val_loss
            best_epoch = epoch
            stale_epochs = 0
            torch.save(model.state_dict(), 'best_seq2seq_attn.pth')
        else:
            stale_epochs += 1
            if patience is not None and stale_epochs >= patience:
                print(f"Early stopping at epoch {epoch}: no improvement for {patience} epochs")
                break
    pd.DataFrame(history).to_csv('report/figures/training_history.csv', index=False)
    print('Best val', best_val, 'at epoch', best_epoch)

    # load best
    model.load_state_dict(torch.load('best_seq2seq_attn.pth', map_location=device))
    model.eval()

    test_loss, preds, trues, attns = evaluate(model, test_loader, criterion, device)
    print('Test MSE loss:', test_loss)

    # Metrics are reported in the original (unscaled) units.
    preds_inv = scaler.inverse_transform(preds.reshape(-1, n_series)).reshape(preds.shape)
    trues_inv = scaler.inverse_transform(trues.reshape(-1, n_series)).reshape(trues.shape)

    rmse_val = rmse(preds_inv, trues_inv)
    mae_val = mae(preds_inv, trues_inv)
    mase_val = mase(preds_inv, trues_inv, train_data)
    print(f"Test RMSE: {rmse_val:.6f}, MAE: {mae_val:.6f}, MASE: {mase_val:.6f}")

    # Save metrics
    metrics = {'model': 'seq2seq_attn', 'rmse': float(rmse_val), 'mae': float(mae_val), 'mase': float(mase_val)}
    pd.DataFrame([metrics]).to_csv('report/figures/model_metrics.csv', index=False)

    # attention aggregate and plot: average over samples, then over decoder steps
    avg_attn_over_decoder = np.mean(attns, axis=0)
    avg_attn_encoderwise = np.mean(avg_attn_over_decoder, axis=0)
    plt.figure(figsize=(10,3))
    plt.plot(np.arange(-in_len,0), avg_attn_encoderwise)
    plt.xlabel('Encoder step (lag)')
    plt.title('Average encoder-step attention importance')
    plt.savefig('report/figures/avg_attention_importance.png')
    plt.close()

    # sample prediction plot (first test sample)
    sample_idx = 0
    x0 = X_test[sample_idx]
    y_true = Y_test[sample_idx]
    # X_test is float64 (only SeqDataset casts its own copy), while the model
    # weights are float32; build the tensor explicitly as float32.
    x_tensor = torch.tensor(x0, dtype=torch.float32).unsqueeze(0)
    with torch.no_grad():
        pred_scaled, attn_sample = model(x_tensor.to(device), tgt=None, teacher_forcing_ratio=0.0)
        pred_scaled = pred_scaled.cpu().numpy()[0]
        attn_sample = attn_sample.cpu().numpy()[0]
    pred_inv = scaler.inverse_transform(pred_scaled)
    y_true_inv = scaler.inverse_transform(y_true)
    past_inv = scaler.inverse_transform(x0)

    t_past = np.arange(-in_len,0)
    t_fut = np.arange(0,out_len)
    for feat in range(n_series):
        plt.figure(figsize=(8,2))
        plt.plot(t_past, past_inv[:, feat], label='past')
        plt.plot(t_fut, y_true_inv[:, feat], 'o-', label='true')
        plt.plot(t_fut, pred_inv[:, feat], 'x--', label='pred')
        plt.title(f'Feature {feat} predictions')
        plt.legend()
        plt.savefig(f'report/figures/sample_pred_feat{feat}.png')
        plt.close()

    # attention heatmap
    plt.figure(figsize=(8,3))
    plt.imshow(attn_sample.T, aspect='auto', origin='lower')
    plt.xlabel('Decoder step')
    plt.ylabel('Encoder step')
    plt.title('Attention heatmap (sample)')
    plt.colorbar()
    plt.savefig('report/figures/attention_heatmap_sample.png')
    plt.close()

    # MC dropout intervals (5th-95th percentile band around the median)
    samples = mc_dropout_predict(model, x_tensor, n_samples=100)
    samples = samples[:, 0]
    lower = np.percentile(samples, 5, axis=0)
    upper = np.percentile(samples, 95, axis=0)
    median = np.median(samples, axis=0)
    lower_inv = scaler.inverse_transform(lower)
    upper_inv = scaler.inverse_transform(upper)
    med_inv = scaler.inverse_transform(median)
    plt.figure(figsize=(8,3))
    feat = 0
    plt.plot(np.arange(out_len), y_true_inv[:, feat], 'o-', label='true')
    plt.plot(np.arange(out_len), med_inv[:, feat], 'x--', label='median')
    plt.fill_between(np.arange(out_len), lower_inv[:, feat], upper_inv[:, feat], alpha=0.3, label='90% PI')
    plt.legend()
    plt.title('MC Dropout Prediction Intervals (sample, feat 0)')
    plt.savefig('report/figures/mc_dropout_pi_feat0.png')
    plt.close()

    # SARIMA baseline (per-feature) - perform if statsmodels installed.
    # Scored on the first test window, so the forecast origin is the first
    # target timestamp of that window rather than the end of the training split.
    if _HAS_STATS:
        forecast_origin = int(starts[test_mask][sample_idx]) + in_len
        sarima_metrics = _sarima_baseline_rows(data, forecast_origin, out_len)
        pd.DataFrame(sarima_metrics).to_csv('report/figures/sarima_metrics.csv', index=False)

    # generate report.md
    _write_report(
        'report/report.md', n_steps, n_series, in_len, out_len,
        rmse_val, mae_val, mase_val, with_sarima=_HAS_STATS,
    )

    print('Report and figures saved under report/.')

# -----------------------------
# Argument parsing and entry
# -----------------------------

def parse_args(argv=None):
    """Parse CLI args.

    In notebook environments, argv may contain extra kernel arguments. We use
    parse_known_args() to ignore unknown args so the script can be imported or
    executed in Jupyter without raising SystemExit.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, as before.

    Returns:
        Namespace with a boolean ``run`` attribute.
    """
    p = argparse.ArgumentParser()
    p.add_argument('--run', action='store_true')
    # parse known args and ignore the rest (prevents ipykernel argv errors)
    args, _ = p.parse_known_args(argv)
    return args

if __name__ == '__main__':
    args = parse_args()
    # Default experiment configuration handed to run_pipeline().
    cfg = dict(
        n_steps=6000,
        n_series=3,
        in_len=30,
        out_len=10,
        batch_size=64,
        epochs=12,
        lr=1e-3,
        enc_dim=64,
        dec_dim=64,
        attn_dim=32,
        dropout=0.2,
        num_layers=1,
    )
    if not args.run:
        print("Run with --run to execute training/evaluation and generate report.")
    else:
        run_pipeline(cfg)


# Captured notebook output:
# Device: cpu
# Run with --run to execute training/evaluation and generate report.
