In [1]:
import json
import os
import numpy as np

def load_existing_results(file_path="forecasting_results.json"):
    """
    Load existing results from a JSON file.
    Returns an empty dictionary if the file doesn't exist.
    """
    if os.path.exists(file_path):
        with open(file_path, "r") as f:
            return json.load(f)
    return {}


def save_results_to_json(data, file_path="forecasting_results.json"):
    """
    Save the results dictionary to a JSON file, handling NumPy data types.
    """

    # Handle NumPy data types (recursive conversion)
    def convert_numpy(obj):
        if isinstance(obj, dict):
            return {k: convert_numpy(v) for k, v in obj.items()}
        elif isinstance(obj, list):
            return [convert_numpy(i) for i in obj]
        elif isinstance(obj, (np.integer, np.int64, np.int32)):
            return int(obj)
        elif isinstance(obj, (np.floating, np.float64, np.float32)):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()  # Convert arrays to lists
        else:
            return obj

    # Convert data and save to JSON
    data = convert_numpy(data)
    with open(file_path, "w") as f:
        json.dump(data, f, indent=4)
    print(f"✅ Results saved to {file_path}")



def store_results(dataset_name, horizons, horizon_value, experiment_type, backbone, mae_result, file_path="forecasting_results.json"):
    """
    Store MAE results for a given experiment type (stl_mae, mtl_mae, global_mae) per horizon.

    Args:
    - dataset_name (str): Name of the dataset (e.g., 'Solar', 'Air Quality').
    - horizons (list): List of horizon values (e.g., [1, 2, 4, 8, 16]).
    - horizon_value (int): The horizon corresponding to the mae_result provided.
    - experiment_type (str): One of ['stl_mae', 'mtl_mae', 'global_mae'].
    - backbone (str): Model backbone name (e.g., 'Deep_LSTM', 'simple_transformer').
    - mae_result (list): MAE values for the current horizon (list of floats).
    - file_path (str): JSON file to store the results.

    Returns:
    - None
    """
    # Load existing results
    results_dict = load_existing_results(file_path)

    # Create dataset entry if it doesn't exist
    dataset_key = f"{dataset_name}_{backbone}"
    if dataset_key not in results_dict:
        results_dict[dataset_key] = {
            "horizons": horizons,
            "mtl": [[] for _ in horizons],
            "global": [[] for _ in horizons],
            "independent": [[] for _ in horizons]
        }

    # Find index for the given horizon
    try:
        horizon_index = horizons.index(horizon_value)
    except ValueError:
        raise ValueError(f"⚠️ Horizon value {horizon_value} not found in {horizons}.")

    # Append the mae_result to the correct horizon
    results_dict[dataset_key][experiment_type][horizon_index].extend(mae_result)

    # Save updated results
    save_results_to_json(results_dict, file_path)

# Pre Requisite

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import dateutil
from sklearn.preprocessing import MinMaxScaler
import os

from sklearn.preprocessing import MinMaxScaler

def df_to_X_y(df, features, target, window_size=32, horizon=1):
    """
    Converts a DataFrame into supervised learning format for multi-step time series forecasting.

    Args:
    - df (pd.DataFrame): DataFrame containing time series data.
    - features (list): List of columns to use as features.
    - target (str): The target column to predict.
    - window_size (int): Past window size.
    - horizon (int): Number of future steps.

    Returns:
    - X (np.array): Features (num_samples, window_size, num_features).
    - y (np.array): Targets (num_samples, horizon).
    """
    # Ensure target is in features
    if target not in features:
        features = [target] + features

    data = df[features].to_numpy()  # Features including target's history
    target_data = df[target].to_numpy()  # Target series

    X, y = [], []
    for i in range(len(data) - window_size - horizon + 1):
        # Past window_size feature values (including target)
        X.append(data[i:i + window_size])

        # Multi-step target: a sequence of future steps
        y.append(target_data[i + window_size : i + window_size + horizon])

    return np.array(X), np.array(y)  # y shape: (num_samples, horizon)



# ---------- Data Loader Function (Target included in features) ----------
def load_and_preprocess_site_data(site_path, features, target, window_size=32, horizon=1, min_date=None, max_date=None, batch_size=16, device='cpu'):
    """
    Loads and preprocesses time series data for a given site with specified features and target,
    ensuring the target column's historical values are included in the features.
    
    Args:
    - site_path (str): Path to the CSV file.
    - features (list): List of feature columns to use.
    - target (str): Target column name.
    - window_size (int): Past window size.
    - horizon (int): Forecast horizon.
    - min_date, max_date (str or datetime): Optional date filtering.
    - batch_size (int): Batch size for DataLoader.
    - device (str): 'cpu' or 'cuda'.

    Returns:
    - train_loader, val_loader, test_loader: PyTorch DataLoaders.
    """
    df = pd.read_csv(site_path)

    # Convert date column to datetime if exists
    if 'date' in df.columns:
        df['date'] = pd.to_datetime(df['date'])
        if min_date:
            min_date = dateutil.parser.parse(min_date) if isinstance(min_date, str) else min_date
            df = df[df['date'] >= min_date]
        if max_date:
            max_date = dateutil.parser.parse(max_date) if isinstance(max_date, str) else max_date
            df = df[df['date'] <= max_date]
        df.drop(columns=['date'], inplace=True)
    # Ensure target is included in the feature set
    if target not in features:
        features = [target] + features

    # Check for missing columns
    all_columns = features
    if not all(col in df.columns for col in all_columns):
        missing = [col for col in all_columns if col not in df.columns]
        raise ValueError(f"Missing columns in dataset: {missing}")

    # Split data: 80% Train/Val, 20% Test
    train_size = int(0.8 * len(df))
    train_df = df.iloc[:train_size]
    test_df = df.iloc[train_size:]

    # 16% validation from train set
    val_size = int(0.2 * len(train_df))
    train_df, val_df = train_df.iloc[:-val_size], train_df.iloc[-val_size:]

    print(f"Train size: {len(train_df)} | Validation size: {len(val_df)} | Test size: {len(test_df)}")

    # # Standardization (using training stats)
    train_mean, train_std = train_df[all_columns].mean(), train_df[all_columns].std()
    train_df[all_columns] = (train_df[all_columns] - train_mean) / (train_std + 1e-8)
    val_df[all_columns] = (val_df[all_columns] - train_mean) / (train_std + 1e-8)
    test_df[all_columns] = (test_df[all_columns] - train_mean) / (train_std + 1e-8)
    
    # ✅ **Min-Max Scaling** (fitted on train only)
    # scaler = MinMaxScaler()
    # train_df[all_columns] = scaler.fit_transform(train_df[all_columns])
    # val_df[all_columns] = scaler.transform(val_df[all_columns])
    # test_df[all_columns] = scaler.transform(test_df[all_columns])

    # Generate sequences
    X_train, y_train = df_to_X_y(train_df, features, target, window_size, horizon)
    X_val, y_val = df_to_X_y(val_df, features, target, window_size, horizon)
    X_test, y_test = df_to_X_y(test_df, features, target, window_size, horizon)

    # Convert to PyTorch tensors
    train_data = TensorDataset(torch.tensor(X_train, dtype=torch.float32).to(device), torch.tensor(y_train, dtype=torch.float32).to(device))
    val_data = TensorDataset(torch.tensor(X_val, dtype=torch.float32).to(device), torch.tensor(y_val, dtype=torch.float32).to(device))
    test_data = TensorDataset(torch.tensor(X_test, dtype=torch.float32).to(device), torch.tensor(y_test, dtype=torch.float32).to(device))

    # Create DataLoaders
    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size, drop_last=True)
    val_loader = DataLoader(val_data, shuffle=False, batch_size=batch_size, drop_last=True)
    test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size, drop_last=True)

    return train_loader, val_loader, test_loader #,scaler

# Base Architecture

In [3]:
# ------------------ SIMPLE MULTI-TASK TRANSFORMER MODEL ------------------

class SimpleMultiTaskTransformer(nn.Module):
    """
    Multi-task Transformer model with a shared encoder and task-specific decoders.
    """
    def __init__(self, input_dim, d_model, nhead, num_layers, dim_feedforward, output_dim, num_tasks, dropout=0.1):
        super(SimpleMultiTaskTransformer, self).__init__()
        self.shared_encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True
        )
        self.shared_encoder = nn.TransformerEncoder(self.shared_encoder_layer, num_layers=num_layers)

        # Linear projection to match d_model
        self.input_projection = nn.Linear(input_dim, d_model)

        # Task-specific decoders
        self.task_heads = nn.ModuleList([
            nn.Sequential(
                nn.Linear(d_model, dim_feedforward),
                nn.ReLU(),
                nn.Linear(dim_feedforward, output_dim)
            ) for _ in range(num_tasks)
        ])

    def forward(self, inputs):
        outputs = []
        for i, x in enumerate(inputs):
            x = self.input_projection(x)
            x = self.shared_encoder(x)
            x = x[:, -1, :]  # Use the last time step output
            outputs.append(self.task_heads[i](x))
        return outputs

## Better Transformer with Postional Encoding, Convolution


In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class PositionalEncoding(nn.Module):
    """
    Implements positional encoding as used in Transformer architectures.
    """
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)


class EnhancedMultiTaskTransformer(nn.Module):
    """
    Enhanced Multi-task Transformer model with Conv1D embedding, positional encoding,
    causal masking to prevent information leakage, shared Transformer encoder,
    and task-specific decoders.
    """
    def __init__(self, input_dim, d_model, nhead, num_layers, dim_feedforward, output_dim, num_tasks,
                 seq_len=32, conv1d_emb=True, conv1d_kernel_size=3, dropout=0.1, device="cuda"):
        super(EnhancedMultiTaskTransformer, self).__init__()

        self.conv1d_emb = conv1d_emb
        self.seq_len = seq_len
        self.embed_size = d_model
        self.device = device

        # Input Embedding: Conv1D or Linear
        if conv1d_emb:
            if conv1d_kernel_size % 2 == 0:
                raise Exception("conv1d_kernel_size must be an odd number to preserve dimensions.")
            self.conv1d_padding = conv1d_kernel_size - 1
            self.input_embedding = nn.Conv1d(input_dim, d_model, kernel_size=conv1d_kernel_size)
        else:
            self.input_embedding = nn.Linear(input_dim, d_model)

        # Positional Encoding
        self.position_encoder = PositionalEncoding(d_model=d_model, dropout=dropout, max_len=seq_len)

        # Shared Transformer Encoder
        self.shared_encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True
        )
        self.shared_encoder = nn.TransformerEncoder(self.shared_encoder_layer, num_layers=num_layers)

        # Task-specific decoders
        self.task_heads = nn.ModuleList([
            nn.Sequential(
                nn.Linear(seq_len * d_model, dim_feedforward),
                nn.ReLU(),
                nn.Dropout(dropout),
                nn.Linear(dim_feedforward, output_dim)
            ) for _ in range(num_tasks)
        ])

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)

    def _generate_square_subsequent_mask(self):
        """
        Generates a causal mask to prevent attention to future positions.
        """
        return torch.triu(
            torch.full((self.seq_len, self.seq_len), float('-inf'), dtype=torch.float32, device=self.device),
            diagonal=1,
        )

    def forward(self, inputs):
        outputs = []
        src_mask = self._generate_square_subsequent_mask()
        for i, x in enumerate(inputs):
            if self.conv1d_emb:
                x = F.pad(x, (0, 0, self.conv1d_padding, 0), "constant", -1)
                x = self.input_embedding(x.transpose(1, 2)).transpose(1, 2)
            else:
                x = self.input_embedding(x)

            x = self.position_encoder(x)
            x = self.shared_encoder(x, mask=src_mask)
            x = x.reshape(x.size(0), -1)  # Flatten all sequence outputs

            outputs.append(self.task_heads[i](x))
        return outputs

## Training code

In [48]:
# ------------------ TRAINING & EVALUATION FOR SIMPLE MTL TRANSFORMER ------------------

def train_simple_transformer(site_loaders, input_dim, d_model, nhead, num_layers, dim_feedforward, output_dim, num_tasks, num_epochs=5, device='cpu'):
    """
    Trains the SimpleMultiTaskTransformer model across multiple site datasets.
    """
    # model = SimpleMultiTaskTransformer(input_dim, d_model, nhead, num_layers, dim_feedforward, output_dim, num_tasks).to(device)
    model = EnhancedMultiTaskTransformer(input_dim, d_model, nhead, num_layers, dim_feedforward, output_dim, num_tasks).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.MSELoss()
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)

    # Unpack loaders for each tasks
    train_loaders = [loader_tuple[0] for loader_tuple in site_loaders]
    val_loaders = [loader_tuple[1] for loader_tuple in site_loaders]
    
    for epoch in range(num_epochs):
        model.train()
        train_losses = []
        # Iterate over batches from all tasks simultaneously
        for batches in zip(*train_loaders):
            # Each batch in batches is a tuple (X, y) for a given task
            Xs = [batch[0].to(device) for batch in batches]
            ys = [batch[1].to(device) for batch in batches]
            
            optimizer.zero_grad()
            # Pass the list of task batches to the model
            preds_list = model(Xs)  # expects a list of tensors, one per task
            
            # Compute losses for each task and sum them
            losses = [
                criterion(pred, y.view(y.size(0), -1))
                for pred, y in zip(preds_list, ys)
            ]
            total_loss = sum(losses)
            total_loss.backward()
            optimizer.step()
            train_losses.append(total_loss.item())
        
        # Validation phase (similarly, iterate over all task validation loaders)
        model.eval()
        val_losses = []
        with torch.no_grad():
            for batches in zip(*val_loaders):
                Xs = [batch[0].to(device) for batch in batches]
                ys = [batch[1].to(device) for batch in batches]
                preds_list = model(Xs)
                losses = [
                    criterion(pred, y.view(y.size(0), -1)).item()
                    for pred, y in zip(preds_list, ys)
                ]
                # Average loss over tasks for this batch
                val_losses.append(sum(losses) / num_tasks)
        
        scheduler.step()
        print(f"Epoch {epoch + 1}/{num_epochs} | Train Loss: {np.mean(train_losses):.4f} | Validation Loss: {np.mean(val_losses):.4f}")
    
    print("Training complete.")
    return model


def evaluate_simple_transformer(model, site_loaders, horizon_val, ds ='NK' ,device='cpu', horizon=16):
    """
    Evaluates the SimpleMultiTaskTransformer model on the test set for all sites and computes MAE.
    """
    model.eval()
    # Prepare test loaders from site_loaders
    test_loaders = [loader_tuple[2] for loader_tuple in site_loaders]
    task_preds, task_targets = [[] for _ in range(len(test_loaders))], [[] for _ in range(len(test_loaders))]
    
    with torch.no_grad():
        for batches in zip(*test_loaders):
            Xs = [batch[0].to(device) for batch in batches]
            ys = [batch[1].to(device) for batch in batches]
            preds_list = model(Xs)
            for i, (pred, y) in enumerate(zip(preds_list, ys)):
                task_preds[i].append(pred.cpu().numpy())
                task_targets[i].append(y.cpu().numpy())
    
    # Compute MAE for each task
    mae_scores = []
    for preds, targets in zip(task_preds, task_targets):
        preds_concat = np.concatenate(preds)
        targets_concat = np.concatenate(targets)
        mae_scores.append(mean_absolute_error(targets_concat, preds_concat))

    print("Simple Transformer MTL evaluation complete.")
    # Append results to output.txt
    with open("output_test.txt", "a") as f:
        f.write(f"\n==================== Simple TRANSFORMER MTL MODEL RESULTS {ds} ====================\n")
        f.write(f"MAE per task: {mae_scores}\n")
    store_results(
        dataset_name=ds,
        horizons=[1,2,4,8,16],
        experiment_type='mtl',
        mae_result=mae_scores,
        backbone='simple_transformer',
        horizon_value=horizon_val
    )
    print("Results saved to output.txt.")
    return mae_scores


def plot_simple_transformer_predictions(model, site_loaders, device='cpu'):
    """
    Plots predictions vs. ground truth for each task in the SimpleMultiTaskTransformer model.
    """
    model.eval()
    for task_id, (_, _, test_loader) in enumerate(site_loaders):
        all_preds, all_truths = [], []

        with torch.no_grad():
            for X, y in test_loader:
                X, y = X.to(device), y.to(device)
                preds = model([X])[task_id]
                all_preds.append(preds.cpu().numpy())
                all_truths.append(y.cpu().numpy())

        preds_concat = np.concatenate(all_preds, axis=0).flatten()
        truths_concat = np.concatenate(all_truths, axis=0).flatten()

        plt.figure(figsize=(10, 5))
        plt.plot(truths_concat, label="Ground Truth", linewidth=1)
        plt.plot(preds_concat, label="Transformer MTL Prediction", linewidth=1, linestyle='--')
        plt.title(f"Transformer MTL - Task {task_id + 1}: Predictions vs Ground Truth")
        plt.xlabel("Sample Index")
        plt.ylabel("Value")
        plt.legend()
        plt.grid(True)
        plt.show()


# ------------------ TRAINING PIPELINE FOR SIMPLE MTL TRANSFORMER ------------------

def run_simple_transformer_pipeline(datasets, horizons, device='cpu'):
    """
    Runs the full training, evaluation, and plotting pipeline for the SimpleMultiTaskTransformer model.
    """
    for dataset in datasets:
        print(f"\n==================== 🌟 DATASET: {dataset['ds']} ====================")
        for horizon in horizons:
            print(f"\n==================== ⏳ HORIZON: {horizon} ====================")

            site_paths = [
                os.path.join(root, file)
                for root, dirs, files in os.walk(dataset['base_path'])
                if root != dataset['base_path']
                for file in files
                if file.endswith(".csv")
            ]

            total_sites = len(site_paths)
            num_tasks = total_sites
            batch_size, window_size, input_dim, d_model, hidden_dim, output_dim = 32, 32, len(dataset['features']), 128, 512, horizon

            site_loaders = [
                load_and_preprocess_site_data(
                    site_path,
                    dataset['features'],
                    dataset['target'],
                    window_size,
                    horizon=output_dim,
                    batch_size=batch_size,
                    min_date=dataset['min_date'],
                    max_date=dataset['max_date']
                ) for site_path in site_paths
            ]

            # Training Transformer MTL model
            transformer_mtl_model = train_simple_transformer(site_loaders, input_dim, d_model, nhead=2, num_layers=1, dim_feedforward=hidden_dim, output_dim=output_dim, num_tasks=num_tasks, num_epochs=10, device=device)
            # Evaluating Transformer MTL model
            transformer_mtl_mae = evaluate_simple_transformer(transformer_mtl_model, site_loaders, horizon_val=horizon ,device=device, ds = dataset['ds'])
            # Plotting predictions
            # plot_simple_transformer_predictions(transformer_mtl_model, site_loaders, device=device)

            print(f"✅ Completed: {dataset['ds']} | Horizon: {horizon} | Transformer MTL MAE per task: {transformer_mtl_mae}")

    print("\n🏆 All Transformer MTL experiments completed successfully!")


In [49]:
horizons = [1, 2, 4, 8, 16]

# 🌐 Dataset Configurations
datasets = [
    # {
    #     'ds': 'Solar',
    #     'features': ['loc-1', 'loc-2', 'loc-3', 'loc-4'],
    #     'target': 'loc-1',
    #     'base_path': "../processed_ds/solar/",
    #     'min_date': "2006-09-01",
    #     'max_date': "2006-09-08 4:50"
    # },
    {
        'ds': 'Air Quality',
        'features': ['PM2.5', 'OT', 'PM10', 'NO2'],
        'target': 'PM2.5',
        'base_path': '../processed_ds/air_quality_cluster',
        'min_date': "2014-09-01",
        'max_date': "2014-11-12 19:00"
    },
    # {
    #     'ds': 'Crypto',
    #     'features': ['Open', 'High', 'Low', 'OT', 'Volume'],
    #     'target': 'OT',
    #     'base_path': "../processed_ds/crypto-data/",
    #     'min_date': "2018-04-01",
    #     'max_date': "2018-06-15"
    # },
    # {
    #     'ds': 'Sales',
    #     'features': ['OT', 'customers', 'open', 'promo', 'holiday'],
    #     'target': 'OT',
    #     'base_path': "../processed_ds/stores_data/",
    #     'min_date': "2013-01-16",
    #     'max_date': "2015-07-31"
    # }
]

In [50]:
import warnings
warnings.filterwarnings('ignore')

run_simple_transformer_pipeline(datasets=datasets, horizons=horizons, device='cuda')



Train size: 1119 | Validation size: 279 | Test size: 350
Train size: 1119 | Validation size: 279 | Test size: 350
Train size: 1119 | Validation size: 279 | Test size: 350
Train size: 1119 | Validation size: 279 | Test size: 350
Train size: 1119 | Validation size: 279 | Test size: 350
Train size: 1119 | Validation size: 279 | Test size: 350
Train size: 1119 | Validation size: 279 | Test size: 350
Train size: 1119 | Validation size: 279 | Test size: 350
Train size: 1119 | Validation size: 279 | Test size: 350
Train size: 1119 | Validation size: 279 | Test size: 350
Train size: 1119 | Validation size: 279 | Test size: 350
Train size: 1119 | Validation size: 279 | Test size: 350
Epoch 1/10 | Train Loss: 33.0085 | Validation Loss: 0.7688
Epoch 2/10 | Train Loss: 2.1055 | Validation Loss: 0.5624
Epoch 3/10 | Train Loss: 1.4313 | Validation Loss: 0.4557
Epoch 4/10 | Train Loss: 1.1152 | Validation Loss: 0.3609
Epoch 5/10 | Train Loss: 0.9437 | Validation Loss: 0.2989
Epoch 6/10 | Train Loss