In [None]:
# ========== Import basic libraries ==========
import numpy as np
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')

# ========== Import deep learning libraries ==========
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# ========== Import evaluation metrics ==========
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import TimeSeriesSplit

# ========== Import hyperparameter tuning library ==========
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

# ========== Import visualization library ==========
import matplotlib.pyplot as plt

# ========== Device configuration ==========
# Hyperparameter search is always pinned to the CPU.
tuning_device = torch.device('cpu')
print("Hyperparameter tuning device: CPU")

# Final training prefers MPS, then CUDA, and otherwise falls back to the CPU.
if torch.backends.mps.is_available():
    training_device = torch.device('mps')
    _training_label = "MPS (Apple Silicon GPU)"
elif torch.cuda.is_available():
    training_device = torch.device('cuda')
    _training_label = f"CUDA - {torch.cuda.get_device_name()}"
else:
    training_device = torch.device('cpu')
    _training_label = "CPU"
print(f"Training device: {_training_label}")
del _training_label  # helper name only needed for the message above

# Alias kept for code that refers to a generic `device`.
device = training_device

# ========== Global configuration ==========
# Seed NumPy and PyTorch (and CUDA when it is present) with the same value.
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed(42)


Hyperparameter tuning device: CPU
Training device: MPS (Apple Silicon GPU)


In [None]:
# ========== Data Loading ==========
def load_datasets(npz_path="/Users/june/Documents/University of Manchester/Data Science/ERP/Project code/1_Data_Preprocessing/all_window_datasets_unscaled.npz"):
    """Load every array stored in an ``.npz`` archive into a plain dict.

    Parameters
    ----------
    npz_path : str
        Path of the ``.npz`` archive. The default is a machine-specific
        absolute path kept for backward compatibility; pass an explicit path
        when running anywhere else.

    Returns
    -------
    dict
        Maps each key in the archive to its fully loaded array.

    Raises
    ------
    OSError
        Propagated from ``np.load`` when the file cannot be opened.
    """
    # NOTE(review): allow_pickle=True lets np.load unpickle object arrays,
    # which can execute arbitrary code. Only load archives you produced
    # yourself.
    # The context manager closes the underlying file once every array has
    # been read into memory (the original left the handle open).
    with np.load(npz_path, allow_pickle=True) as archive:
        return {key: archive[key] for key in archive.files}

class StockDataset(Dataset):
    """In-memory dataset that pairs feature rows with their targets."""

    def __init__(self, X, y):
        # Convert both inputs to float tensors once, up front.
        features = torch.FloatTensor(X)
        targets = torch.FloatTensor(y)
        self.X = features
        self.y = targets

    def __len__(self):
        """Number of samples, i.e. the length of the feature tensor."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at ``idx``."""
        sample_features = self.X[idx]
        sample_target = self.y[idx]
        return sample_features, sample_target


In [None]:
# ========== Evaluation Metrics ==========
def r2_zero(y_true, y_pred):
    """Compute zero-based R² (the baseline prediction is 0, not the mean).

    Defined as ``1 - sum((y_true - y_pred)^2) / sum(y_true^2)``.

    Parameters
    ----------
    y_true : array-like
        True values, shape (N,). Lists and column vectors are accepted and
        flattened.
    y_pred : array-like
        Predicted values with the same number of elements as ``y_true``.

    Returns
    -------
    float
        The zero-based R², or NaN when ``sum(y_true^2)`` is zero and the
        ratio is therefore undefined.
    """
    # Flatten both inputs so an (N, 1) vs (N,) pair cannot silently broadcast
    # into an (N, N) difference matrix.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()

    rss = np.sum((y_true - y_pred) ** 2)
    tss = np.sum(y_true ** 2)
    if tss == 0:
        # Avoid the silent division by zero (-inf / NaN plus a suppressed warning).
        return np.nan
    return 1 - rss / tss

def calc_directional_metrics(y_true, y_pred, permnos=None):
    """Sign-agreement accuracy between true and predicted values.

    Samples whose true value is exactly zero are ignored. Without ``permnos``
    the accuracies are pooled over all samples; with ``permnos`` they are
    computed per group and then averaged with ``np.nanmean``.

    Returns
    -------
    tuple
        ``(overall_acc, up_acc, down_acc)`` where "up"/"down" restrict the
        accuracy to samples whose true value is positive/negative.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    def signed_hits(true_vals, pred_vals):
        # True signs and hit flags, restricted to non-zero true values.
        true_sign = np.sign(true_vals)
        keep = true_sign != 0
        true_sign = true_sign[keep]
        pred_sign = np.sign(pred_vals)[keep]
        return true_sign, true_sign == pred_sign

    def subset_accuracy(hits, subset, fallback):
        # Mean hit rate inside `subset`, or `fallback` when the subset is empty.
        if np.any(subset):
            return np.mean(hits[subset])
        return fallback

    if permnos is None:
        # Pooled branch: an empty up/down subset is reported as 0.
        true_sign, hits = signed_hits(y_true, y_pred)
        pooled_overall = np.mean(hits)
        pooled_up = subset_accuracy(hits, true_sign > 0, 0)
        pooled_down = subset_accuracy(hits, true_sign < 0, 0)
        return pooled_overall, pooled_up, pooled_down

    # Grouped branch: an empty up/down subset is recorded as NaN so that
    # nanmean skips it when averaging across groups.
    frame = pd.DataFrame({"permno": permnos, "yt": y_true, "yp": y_pred})
    group_overall, group_up, group_down = [], [], []
    for _, group in frame.groupby("permno"):
        true_sign, hits = signed_hits(group["yt"].values, group["yp"].values)
        if len(true_sign) == 0:
            continue
        group_overall.append(np.mean(hits))
        group_up.append(subset_accuracy(hits, true_sign > 0, np.nan))
        group_down.append(subset_accuracy(hits, true_sign < 0, np.nan))

    return np.nanmean(group_overall), np.nanmean(group_up), np.nanmean(group_down)

def _core_regression_metrics(y_true, y_pred, permnos):
    """Return (r2_zero, rmse, mae, mse, dir_acc, up_acc, down_acc) for one sample set."""
    mse = mean_squared_error(y_true, y_pred)
    dir_acc, up_acc, down_acc = calc_directional_metrics(y_true, y_pred, permnos)
    return (
        r2_zero(y_true, y_pred),
        np.sqrt(mse),  # RMSE derived from the single MSE computation
        mean_absolute_error(y_true, y_pred),
        mse,
        dir_acc,
        up_acc,
        down_acc,
    )


def regression_metrics(y_true, y_pred, k, meta=None, permnos=None):
    """Regression evaluation metrics (zero-based R², Adjusted R² removed).

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted values.
    k : int
        Not used by this function; kept in the signature so existing callers
        keep working.
    meta : mapping or DataFrame, optional
        When it contains ``"MKTCAP_PERCENTILE"``, the same metrics are also
        reported for samples with percentile >= 0.75 (``Top25_*`` keys) and
        <= 0.25 (``Bottom25_*`` keys). A group with no samples is omitted.
    permnos : array-like, optional
        Group labels forwarded to ``calc_directional_metrics``.

    Returns
    -------
    dict
        Metric name -> value. Key order is overall metrics, then Top25, then
        Bottom25.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if permnos is not None:
        # Coerce so boolean-mask indexing below also works for list input.
        permnos = np.asarray(permnos)

    overall_keys = (
        "R2_zero",
        "RMSE",
        "MAE",
        "MSE",
        "Directional Accuracy",
        "Up_Directional_Acc",
        "Down_Directional_Acc",
    )
    metrics = dict(zip(overall_keys, _core_regression_metrics(y_true, y_pred, permnos)))

    # Market cap group analysis
    if meta is not None and "MKTCAP_PERCENTILE" in meta:
        # Plain ndarray so masking is positional regardless of the container
        # (Series, ndarray or list) the percentiles arrived in.
        percentile = np.asarray(meta["MKTCAP_PERCENTILE"])
        group_suffixes = ("R2_zero", "RMSE", "MAE", "MSE", "Dir_Acc", "Up_Acc", "Down_Acc")
        groups = (
            ("Top25", percentile >= 0.75),
            ("Bottom25", percentile <= 0.25),
        )
        for prefix, mask in groups:
            if not np.any(mask):
                continue
            group_permnos = permnos[mask] if permnos is not None else None
            values = _core_regression_metrics(y_true[mask], y_pred[mask], group_permnos)
            metrics.update(
                {f"{prefix}_{suffix}": value for suffix, value in zip(group_suffixes, values)}
            )

    return metrics


In [None]:
# ========== Model and Metrics Saving ==========
def save_model(model, name, window, path="models/"):
    """Save ``model.state_dict()`` as ``<name>_w<window>.pth`` inside ``path``.

    The directory is created when it does not exist yet.
    """
    os.makedirs(path, exist_ok=True)
    checkpoint_file = os.path.join(path, f"{name}_w{window}.pth")
    weights = model.state_dict()
    torch.save(weights, checkpoint_file)

def save_metrics(metrics_dict, name, window, path="results.csv"):
    """Upsert one metrics row, keyed by (Model, Window), into a CSV file.

    An existing row for the same model/window pair is dropped before the new
    row is appended; the file is created when it does not exist.
    """
    new_row = pd.DataFrame([metrics_dict])
    new_row.insert(0, "Model", name)
    new_row.insert(1, "Window", window)

    table = new_row
    if os.path.exists(path):
        existing = pd.read_csv(path)
        # Keep every row that differs in model or in window.
        is_other_entry = (existing["Model"] != name) | (existing["Window"] != window)
        table = pd.concat([existing[is_other_entry], new_row], ignore_index=True)

    table.to_csv(path, index=False)

def save_predictions(model_name, window_size, y_true, y_pred, permnos, path="predictions/"):
    """Write per-sample predictions to ``<model_name>_w<window_size>.csv`` in ``path``.

    The CSV has the columns ``PERMNO``, ``y_true`` and ``y_pred``; the
    directory is created when it does not exist.
    """
    os.makedirs(path, exist_ok=True)

    target_file = os.path.join(path, f"{model_name}_w{window_size}.csv")
    columns = {
        "PERMNO": permnos,
        "y_true": y_true,
        "y_pred": y_pred
    }
    pd.DataFrame(columns).to_csv(target_file, index=False)


In [None]:
# ========== Autoformer Model Architecture ==========
class AutoCorrelation(nn.Module):
    """Auto-Correlation mechanism.

    Computes the circular cross-correlation between queries and keys via FFT,
    picks the strongest time delays and returns a softmax-weighted sum of the
    correspondingly rolled values.
    """
    def __init__(self, factor=1, scale=None, attention_dropout=0.1):
        super(AutoCorrelation, self).__init__()
        # Fraction of the sequence length used as the number of delays.
        self.factor = factor
        # Stored only; neither `scale` nor `dropout` is applied in forward().
        self.scale = scale
        self.dropout = nn.Dropout(attention_dropout)

    def time_delay_agg_training(self, values, corr):
        """Aggregate rolled copies of ``values`` weighted by correlation strength.

        ``values`` is indexed as (batch, head, channel, length). ``corr`` is
        averaged over its dims 1 and 2, leaving one score per (batch, delay).
        The delays are chosen once for the whole batch from the batch-mean
        score; the weights are a per-sample softmax over the chosen delays.
        """
        length = values.shape[3]
        # Find top-k delays, ensure not exceeding available length
        top_k = max(1, min(int(self.factor * length), length))
        mean_value = torch.mean(torch.mean(corr, dim=1), dim=1)

        # Ensure top_k does not exceed the last dimension of mean_value
        available_length = mean_value.shape[-1]
        top_k = min(top_k, available_length)

        if top_k <= 0 or available_length <= 0:
            return values

        index = torch.topk(torch.mean(mean_value, dim=0), top_k, dim=-1)[1]
        # Gather the per-sample scores of the selected delays: (batch, top_k).
        weights = mean_value[:, index]
        tmp_corr = torch.softmax(weights, dim=-1)
        # Move the delay indices to Python ints once instead of once per step.
        delays = index.tolist()
        delays_agg = torch.zeros_like(values).float()
        for i, delay in enumerate(delays):
            pattern = torch.roll(values, -delay, -1)
            # Broadcast the per-sample weight over head, channel and length.
            delays_agg = delays_agg + pattern * tmp_corr[:, i].reshape(-1, 1, 1, 1)
        return delays_agg

    def forward(self, queries, keys, values):
        """Apply auto-correlation.

        ``queries`` is unpacked as (B, L, H, E) and ``values`` as (B, S, H, D).
        Keys/values are zero-padded or truncated along dim 1 to length L.
        Returns a contiguous tensor laid out as (B, L, H, D), except that
        ``values`` is returned untouched when L < 2.
        """
        _, L, _, _ = queries.shape
        _, S, _, _ = values.shape

        # Sequence length protection for FFT
        if L < 2:
            return values

        if L > S:
            zeros = torch.zeros_like(queries[:, :(L - S), :]).float()
            values = torch.cat([values, zeros], dim=1)
            keys = torch.cat([keys, zeros], dim=1)
        else:
            values = values[:, :L, :, :]
            keys = keys[:, :L, :, :]

        # Compute auto-correlation
        q_fft = torch.fft.rfft(queries.permute(0, 2, 3, 1).contiguous(), dim=-1)
        k_fft = torch.fft.rfft(keys.permute(0, 2, 3, 1).contiguous(), dim=-1)
        res = q_fft * torch.conj(k_fft)
        # n=L is required: without it irfft assumes an even-length signal and
        # returns only L-1 lags (with wrong values) when L is odd.
        corr = torch.fft.irfft(res, n=L, dim=-1)

        # Time delay aggregation
        V = self.time_delay_agg_training(values.permute(0, 2, 3, 1).contiguous(), corr).permute(0, 3, 1, 2)
        return V.contiguous()

class AutoformerLayer(nn.Module):
    """One encoder layer: auto-correlation block followed by a 1x1-conv feed-forward block."""

    def __init__(self, d_model, n_heads, d_ff=None, dropout=0.1, activation="relu"):
        super().__init__()
        if not d_ff:
            d_ff = 4 * d_model

        # Sub-modules are created in a fixed order so that seeded weight
        # initialisation stays reproducible.
        self.attention = AutoCorrelation(attention_dropout=dropout)
        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        if activation == "relu":
            self.activation = F.relu
        else:
            self.activation = F.gelu

        # A single linear projection supplies queries, keys and values alike.
        per_head_dim = max(1, d_model // n_heads)
        self.projection = nn.Linear(d_model, n_heads * per_head_dim)
        self.n_heads = n_heads
        self.d_model = d_model

    def forward(self, x):
        """Map a (batch, length, d_model) tensor to a tensor of the same shape."""
        batch, length, _ = x.shape
        heads = self.n_heads

        # Split the projection into heads; q, k and v are the same tensor.
        projected = self.projection(x)
        per_head_dim = projected.shape[-1] // heads
        qkv = projected.view(batch, length, heads, per_head_dim)

        attended = self.attention(qkv, qkv, qkv)
        if attended.dim() == 4:
            # Merge the head and per-head dims back into one feature dim.
            attended = attended.view(batch, length, -1)

        # Residual connection + norm around the auto-correlation block.
        residual = self.norm1(x + self.dropout(attended))

        # Position-wise feed-forward; Conv1d wants (batch, channels, length).
        hidden = self.conv1(residual.transpose(-1, 1))
        hidden = self.dropout(self.activation(hidden))
        hidden = self.dropout(self.conv2(hidden).transpose(-1, 1))

        return self.norm2(residual + hidden)

class Autoformer(nn.Module):
    """Autoformer model with sequence input and automatic sequence length selection.

    A flat input of ``input_size`` features per sample is viewed as
    ``seq_len`` steps of ``input_size // seq_len`` features, encoded by a
    stack of ``AutoformerLayer`` modules, and the last step is projected to
    one scalar per sample.
    """

    def __init__(self, input_size, d_model=64, n_heads=4, e_layers=2, d_ff=256, dropout=0.1, seq_len=None):
        super().__init__()

        # Without an explicit seq_len, take the largest divisor of input_size
        # in 8..2, falling back to 1 when none divides it.
        if seq_len is None:
            seq_len = next(
                (candidate for candidate in range(8, 1, -1) if input_size % candidate == 0),
                1,
            )

        self.seq_len = seq_len
        self.feature_dim = input_size // seq_len

        print(f"[Autoformer] Using sequence length seq_len={seq_len}, feature dimension feature_dim={self.feature_dim}")

        self.input_projection = nn.Linear(self.feature_dim, d_model)

        encoder_stack = []
        for _ in range(e_layers):
            encoder_stack.append(AutoformerLayer(d_model, n_heads, d_ff, dropout))
        self.layers = nn.ModuleList(encoder_stack)

        self.output_projection = nn.Linear(d_model, 1)
        # Stored only; forward() does not apply this dropout.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return one prediction per sample (the trailing size-1 dim is squeezed)."""
        n_samples = x.shape[0]

        # Reinterpret the flat features as (batch, seq_len, feature_dim).
        if self.seq_len > 1:
            sequence = x.view(n_samples, self.seq_len, -1)
        else:
            sequence = x.unsqueeze(1)

        hidden = self.input_projection(sequence)
        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden)

        # Only the representation of the last step feeds the output head.
        last_step = hidden[:, -1, :]
        return self.output_projection(last_step).squeeze(-1)


In [None]:
# ========== Lightweight hyperparameter tuning (using TimeSeriesSplit) ==========
def tune_autoformer_with_optuna(X_train, y_train, n_trials=10):
    """Lightweight hyperparameter tuning using Optuna and TimeSeriesSplit - some parameters fixed.

    Only ``e_layers``, ``dropout`` and ``learning_rate`` are searched;
    ``d_model``, ``n_heads``, ``d_ff`` and ``batch_size`` are held fixed. For
    every trial a fresh model is trained for 10 epochs on each fold on
    ``tuning_device`` and scored by validation MSE; the objective is the mean
    over folds.

    Parameters
    ----------
    X_train, y_train : indexable arrays
        Training features (2-D, ``shape[1]`` is the model input size) and targets.
    n_trials : int
        Number of Optuna trials.

    Returns
    -------
    dict
        ``study.best_params`` - contains only the searched parameters, not
        the fixed ones.
    """
    print("[Hyperparameter Tuning] Using CPU device for optimization to save memory")

    # Use 2 folds for small datasets (< 300 samples) and 3 otherwise. The
    # computed value was previously ignored in favour of a hard-coded 3.
    n_splits = min(3, max(2, len(X_train) // 100))
    tscv = TimeSeriesSplit(n_splits=n_splits)

    def objective(trial):
        params = {
            'd_model': 64,
            'n_heads': 4,
            'e_layers': trial.suggest_int('e_layers', 1, 2),
            'd_ff': 256,
            'dropout': trial.suggest_float('dropout', 0.1, 0.3),
            'learning_rate': trial.suggest_float('learning_rate', 1e-5, 5e-3, log=True),
            'batch_size': 32
        }
        # Everything except the optimiser/loader settings goes to the model.
        model_kwargs = {k: v for k, v in params.items()
                        if k not in ['learning_rate', 'batch_size']}

        cv_scores = []
        for train_idx, val_idx in tscv.split(X_train):
            X_tr, X_val = X_train[train_idx], X_train[val_idx]
            y_tr, y_val = y_train[train_idx], y_train[val_idx]

            # Skip folds that are too small to train or validate on.
            if len(X_tr) < 50 or len(X_val) < 10:
                continue

            train_dataset = StockDataset(X_tr, y_tr)
            val_dataset = StockDataset(X_val, y_val)

            pin_memory = False
            num_workers = 0

            effective_batch_train = min(params['batch_size'], max(1, len(X_tr)))
            effective_batch_val = min(params['batch_size'], max(1, len(X_val)))

            # shuffle=False keeps the samples in their original order.
            train_loader = DataLoader(train_dataset, batch_size=effective_batch_train,
                                      shuffle=False, pin_memory=pin_memory, num_workers=num_workers)
            val_loader = DataLoader(val_dataset, batch_size=effective_batch_val,
                                    shuffle=False, pin_memory=pin_memory, num_workers=num_workers)

            model = Autoformer(input_size=X_train.shape[1], **model_kwargs).to(tuning_device)

            optimizer = torch.optim.AdamW(model.parameters(), lr=params['learning_rate'])
            criterion = nn.MSELoss()

            model.train()
            for epoch in range(10):
                for batch_X, batch_y in train_loader:
                    batch_X, batch_y = batch_X.to(tuning_device), batch_y.to(tuning_device)
                    optimizer.zero_grad()
                    outputs = model(batch_X)
                    loss = criterion(outputs, batch_y)
                    loss.backward()
                    optimizer.step()

            model.eval()
            val_preds = []
            val_targets = []
            with torch.no_grad():
                for batch_X, batch_y in val_loader:
                    batch_X = batch_X.to(tuning_device)
                    outputs = model(batch_X)
                    val_preds.extend(outputs.cpu().numpy())
                    val_targets.extend(batch_y.numpy())

            mse = mean_squared_error(val_targets, val_preds)
            cv_scores.append(mse)

        # No usable fold: report the worst possible score for this trial.
        if len(cv_scores) == 0:
            return float('inf')

        return np.mean(cv_scores)

    study = optuna.create_study(direction="minimize", sampler=optuna.samplers.TPESampler(seed=42))
    study.optimize(objective, n_trials=n_trials)

    print(f"[Optuna] Autoformer best_MSE={study.best_value:.6f}, best_params={study.best_params}")
    return study.best_params


In [None]:
# ========== Training and Prediction Functions ==========
def train_autoformer(model, train_loader, val_loader, learning_rate=1e-3, epochs=30):
    """Train ``model`` with AdamW + MSE loss and early stopping on validation loss.

    Batches are moved to the device the model's parameters live on. Training
    stops after 5 consecutive epochs without a new best validation loss, and
    the weights from the best validation epoch are restored before returning
    (previously the last-epoch weights were returned).

    Parameters
    ----------
    model : nn.Module
        Model to train in place; must already be on its target device.
    train_loader, val_loader : sized iterables of ``(X, y)`` batches
        Each must yield at least one batch.
    learning_rate : float
        Initial AdamW learning rate.
    epochs : int
        Maximum number of epochs.

    Returns
    -------
    nn.Module
        The same ``model`` object, in eval mode, holding the best weights seen.

    Raises
    ------
    ValueError
        If either loader is empty.
    """
    if len(train_loader) == 0 or len(val_loader) == 0:
        # Fail early instead of dividing by zero when averaging the losses.
        raise ValueError("train_loader and val_loader must each yield at least one batch")

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-4)
    criterion = nn.MSELoss()
    # NOTE(review): the scheduler patience equals the early-stopping patience,
    # so the learning rate is unlikely to be reduced before training stops -
    # confirm this is intended.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.5)

    # Follow the model's own device rather than a module-level global.
    run_device = next(model.parameters()).device

    best_val_loss = float('inf')
    best_state = None
    patience_counter = 0
    patience = 5

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for batch_X, batch_y in train_loader:
            batch_X, batch_y = batch_X.to(run_device), batch_y.to(run_device)

            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            train_loss += loss.item()

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch_X, batch_y in val_loader:
                batch_X, batch_y = batch_X.to(run_device), batch_y.to(run_device)
                outputs = model(batch_X)
                val_loss += criterion(outputs, batch_y).item()

        train_loss /= len(train_loader)
        val_loss /= len(val_loader)

        scheduler.step(val_loss)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            # Snapshot a copy; state_dict() tensors alias the live parameters.
            best_state = {key: value.detach().clone()
                          for key, value in model.state_dict().items()}
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

        if epoch % 10 == 0:
            print(f"Epoch {epoch+1}: train_loss={train_loss:.6f}, val_loss={val_loss:.6f}")

    # best_state stays None only if no epoch ever improved (e.g. NaN losses).
    if best_state is not None:
        model.load_state_dict(best_state)

    return model

def predict_autoformer(model, test_loader):
    """Run ``model`` in eval mode over ``test_loader`` and collect its outputs.

    Inputs are moved to the device of the model's parameters (left where they
    are for a parameter-free model); the second element of every batch is
    ignored.

    Returns
    -------
    np.ndarray
        All batch outputs concatenated along the first axis, or an empty
        array when the loader yields nothing.
    """
    model.eval()
    # Follow the model's own device rather than a module-level global.
    first_param = next(model.parameters(), None)
    batch_outputs = []

    with torch.no_grad():
        for batch_X, _ in test_loader:
            if first_param is not None:
                batch_X = batch_X.to(first_param.device)
            outputs = model(batch_X)
            # atleast_1d keeps a scalar output from breaking the concatenate.
            batch_outputs.append(np.atleast_1d(outputs.cpu().numpy()))

    if not batch_outputs:
        return np.array([])
    return np.concatenate(batch_outputs)


In [None]:
# ========== Main Training and Evaluation Function ==========
def train_and_evaluate_autoformer(window_size, X_train, y_train, X_test, y_test, 
                                  permnos_train, permnos_test, meta_test, shared_params=None):
    """Tune (optionally), train, evaluate and persist an Autoformer for one window.

    When ``shared_params`` is None the hyperparameters are tuned with Optuna
    on the training data; otherwise ``shared_params`` is used as-is. The last
    20% of the training rows serve as the validation set. The model, its
    metrics and its test predictions are written to disk.

    Returns
    -------
    tuple
        ``(metrics, best_params)``.
    """
    print(f"Training Autoformer on Window = {window_size}")

    # Hyperparameter tuning: only performed when no shared parameters are supplied
    if shared_params is not None:
        print(f"[Shared Parameters] Using optimized params from window=5 for window={window_size}")
        best_params = shared_params
    else:
        print(f"[Hyperparameter Tuning] Running Optuna optimization for window={window_size}")
        best_params = tune_autoformer_with_optuna(X_train, y_train, n_trials=10)

    # Ordered 80/20 split of the training data into train/validation parts
    cut = int(0.8 * len(X_train))
    use_pinned_memory = training_device.type == 'cuda'
    batch_size = best_params.get('batch_size', 64)

    def build_loader(features, targets):
        # Cap the batch size at the dataset size (but never below 1).
        capped_batch = min(batch_size, max(1, len(features)))
        return DataLoader(StockDataset(features, targets), batch_size=capped_batch,
                          shuffle=False, pin_memory=use_pinned_memory, num_workers=0)

    train_loader = build_loader(X_train[:cut], y_train[:cut])
    val_loader = build_loader(X_train[cut:], y_train[cut:])
    test_loader = build_loader(X_test, y_test)

    # Optimiser/loader settings are not constructor arguments of the model.
    non_model_keys = ('learning_rate', 'batch_size')
    model_params = {key: value for key, value in best_params.items() if key not in non_model_keys}

    # Sequence length is selected automatically from the input dimension
    model = Autoformer(input_size=X_train.shape[1], **model_params).to(training_device)
    model = train_autoformer(
        model,
        train_loader,
        val_loader,
        learning_rate=best_params.get('learning_rate', 1e-3),
    )

    y_pred = predict_autoformer(model, test_loader)

    n_features = X_test.shape[1]
    metrics = regression_metrics(y_test, y_pred, n_features, meta=meta_test, permnos=permnos_test)

    # Persist weights, the metrics row and the raw predictions.
    save_model(model, "Autoformer", window_size)
    save_metrics(metrics, "Autoformer", window_size)
    save_predictions("Autoformer", window_size, y_test, y_pred, permnos_test)

    print(f"Completed Autoformer w={window_size}: MSE={metrics['MSE']:.6f}, Dir_Acc={metrics['Directional Accuracy']:.4f}")
    return metrics, best_params


In [None]:
# ========== Main execution function ==========
def main():
    """Run the full pipeline for every window size.

    Hyperparameters are tuned on the window=5 run and then reused for the
    remaining windows.
    """
    datasets = load_datasets()

    # Hyperparameters optimized on window=5, filled in by that run
    shared_hyperparams = None

    for window in [5, 21, 252, 512]:
        X_train = datasets[f"X_train_{window}"]
        y_train = datasets[f"y_train_{window}"]
        X_test = datasets[f"X_test_{window}"]
        y_test = datasets[f"y_test_{window}"]

        # Metadata is stored as a 0-d object array wrapping a dict
        meta_train_dict = datasets[f"meta_train_{window}"].item()
        meta_test_dict = datasets[f"meta_test_{window}"].item()
        meta_train = pd.DataFrame.from_dict(meta_train_dict)
        meta_test = pd.DataFrame.from_dict(meta_test_dict)

        permnos_train = meta_train["PERMNO"].values
        permnos_test = meta_test["PERMNO"].values

        # Only window=5 tunes; every other window receives the shared params
        tunes_here = window == 5
        params_for_run = None if tunes_here else shared_hyperparams

        metrics, returned_params = train_and_evaluate_autoformer(
            window, X_train, y_train, X_test, y_test,
            permnos_train, permnos_test, meta_test, shared_params=params_for_run
        )

        if tunes_here:
            shared_hyperparams = returned_params
            print(f"[Shared Hyperparams] Optimized on window=5: {shared_hyperparams}")

if __name__ == "__main__":
    main()


Training Autoformer on Window = 5
[Hyperparameter Tuning] Running Optuna optimization for window=5
[Hyperparameter Tuning] Using CPU device for optimization to save memory
[Autoformer] Using sequence length seq_len=5, feature dimension feature_dim=1
[Autoformer] Using sequence length seq_len=5, feature dimension feature_dim=1
[Autoformer] Using sequence length seq_len=5, feature dimension feature_dim=1
[Autoformer] Using sequence length seq_len=5, feature dimension feature_dim=1
[Autoformer] Using sequence length seq_len=5, feature dimension feature_dim=1
[Autoformer] Using sequence length seq_len=5, feature dimension feature_dim=1
[Autoformer] Using sequence length seq_len=5, feature dimension feature_dim=1
[Autoformer] Using sequence length seq_len=5, feature dimension feature_dim=1
[Autoformer] Using sequence length seq_len=5, feature dimension feature_dim=1
[Autoformer] Using sequence length seq_len=5, feature dimension feature_dim=1
[Autoformer] Using sequence length seq_len=5, fe