In [None]:
# ========== Basic Libraries ==========
import os
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import gc
import joblib
import random

# ========== Evaluation Metrics ==========
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score, confusion_matrix
)
from sklearn.model_selection import TimeSeriesSplit

# ========== Hyperparameter Tuning ==========
# Optuna is an optional dependency: record whether it imported so later code
# can fall back to fixed default hyperparameters instead of failing.
try:
    import optuna
    OPTUNA_AVAILABLE = True
    print("Optuna is available for hyperparameter tuning")
except ImportError:
    OPTUNA_AVAILABLE = False
    print("Optuna not available, will use default parameters")

# ========== Global Configuration ==========
# NOTE(review): this silences *every* warning process-wide, including NumPy
# RuntimeWarnings (e.g. mean of an empty slice) emitted by the metric helpers.
warnings.filterwarnings('ignore')

def set_random_seeds(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and force deterministic cuDNN.

    Args:
        seed: Integer seed applied to every generator.
    """
    # CPU-side generators: stdlib `random`, NumPy's legacy global RNG, torch.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    # Accelerator generators are only touched when the backend is present.
    cuda_present = torch.cuda.is_available()
    if cuda_present:
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    mps_present = torch.backends.mps.is_available()
    if mps_present:
        torch.mps.manual_seed(seed)

    # Trade cuDNN autotuning for repeatable kernel selection.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Seed all RNGs once, when this cell runs, with the fixed seed 42.
set_random_seeds(42)

# Backend is stored as a plain string. Preference order: Apple MPS first,
# then CUDA, and CPU only when neither accelerator is available.
device = (
    "mps"
    if torch.backends.mps.is_available()
    else "cuda"
    if torch.cuda.is_available()
    else "cpu"
)
print(f"Using device: {device}")

def clear_memory():
    """Run the Python garbage collector and release cached accelerator memory.

    The CUDA cache is emptied whenever CUDA is available; the MPS cache is
    emptied only when the module-level ``device`` string is ``"mps"``.
    """
    gc.collect()

    cuda_present = torch.cuda.is_available()
    if cuda_present:
        torch.cuda.empty_cache()

    using_mps = device == "mps"
    if using_mps:
        torch.mps.empty_cache()

Optuna is available for hyperparameter tuning
Using device: mps


In [None]:
# ========== 2. Evaluation Metrics ==========

def r2_zero(y_true, y_pred):
    """
    Compute zero-based R² (baseline prediction is 0 rather than the mean).

        R²_zero = 1 - sum((y_true - y_pred)^2) / sum(y_true^2)

    Args:
        y_true: true values, array-like of shape (N,)
        y_pred: predicted values, array-like of shape (N,)

    Returns:
        The zero-based R² as a float, or ``nan`` when ``sum(y_true^2)`` is 0
        (the score is undefined because the baseline error is zero).
    """
    # Accept lists/Series as well as ndarrays; list - list would raise otherwise.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    rss = np.sum((y_true - y_pred) ** 2)
    tss = np.sum(y_true ** 2)

    # Avoid a divide-by-zero (±inf / nan plus a RuntimeWarning) on all-zero or
    # empty targets and return an explicit "undefined" marker instead.
    if tss == 0:
        return np.nan
    return 1 - rss / tss

def calc_directional_metrics(y_true, y_pred, permnos=None):
    """
    Directional (sign) accuracy of predictions.

    Samples whose true value is exactly 0 are ignored. Without ``permnos`` the
    accuracies are computed over all samples at once (Up/Down fall back to 0
    when that direction never occurs). With ``permnos`` the three accuracies
    are computed per stock and then averaged with ``np.nanmean`` (a missing
    direction contributes NaN for that stock).

    Returns:
        (overall_acc, up_acc, down_acc)
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    def _nonzero_signs(actual, predicted):
        # Signs of both series, restricted to samples with a non-zero target.
        true_sign = np.sign(actual)
        pred_sign = np.sign(predicted)
        keep = true_sign != 0
        return true_sign[keep], pred_sign[keep]

    def _hit_rate(true_sign, pred_sign, selector, fallback):
        # Accuracy on the selected samples, or `fallback` if none are selected.
        if not np.any(selector):
            return fallback
        return np.mean(true_sign[selector] == pred_sign[selector])

    if permnos is None:
        true_sign, pred_sign = _nonzero_signs(y_true, y_pred)
        overall_acc = np.mean(true_sign == pred_sign)
        up_acc = _hit_rate(true_sign, pred_sign, true_sign > 0, 0)
        down_acc = _hit_rate(true_sign, pred_sign, true_sign < 0, 0)
        return overall_acc, up_acc, down_acc

    frame = pd.DataFrame({"permno": permnos, "yt": y_true, "yp": y_pred})
    per_stock = []  # one (overall, up, down) triple per stock with usable samples
    for _, stock_rows in frame.groupby("permno"):
        true_sign, pred_sign = _nonzero_signs(
            stock_rows["yt"].values, stock_rows["yp"].values
        )
        if len(true_sign) == 0:
            continue
        per_stock.append((
            np.mean(true_sign == pred_sign),
            _hit_rate(true_sign, pred_sign, true_sign > 0, np.nan),
            _hit_rate(true_sign, pred_sign, true_sign < 0, np.nan),
        ))

    overall_acc = np.nanmean([triple[0] for triple in per_stock])
    up_acc = np.nanmean([triple[1] for triple in per_stock])
    down_acc = np.nanmean([triple[2] for triple in per_stock])
    return overall_acc, up_acc, down_acc

def regression_metrics(y_true, y_pred, k, meta=None, permnos=None):
    """
    Compute regression and directional metrics, overall and by market-cap group.

    Args:
        y_true: true values, array-like of shape (N,)
        y_pred: predicted values, array-like of shape (N,)
        k: accepted for interface compatibility; not used by any metric here.
        meta: optional mapping/DataFrame. When it contains the key
            "MKTCAP_PERCENTILE", metrics are additionally reported for samples
            with percentile >= 0.75 ("Top25_*") and <= 0.25 ("Bottom25_*").
        permnos: optional per-sample stock identifiers; when given, directional
            accuracy is averaged per stock (see calc_directional_metrics).

    Returns:
        dict mapping metric name to value. Group keys are only present when
        the corresponding group contains at least one sample.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # Normalise to an ndarray so boolean-mask indexing works for lists too.
    permnos = np.asarray(permnos) if permnos is not None else None

    def _subset_metrics(yt, yp, perms):
        # One pass over a (sub)sample: returns (r2, rmse, mae, mse, dir, up, down).
        mse = mean_squared_error(yt, yp)
        dir_acc, up_acc, down_acc = calc_directional_metrics(yt, yp, perms)
        return (
            r2_zero(yt, yp),
            np.sqrt(mse),
            mean_absolute_error(yt, yp),
            mse,
            dir_acc,
            up_acc,
            down_acc,
        )

    overall_keys = (
        "R2_zero", "RMSE", "MAE", "MSE",
        "Directional Accuracy", "Up_Directional_Acc", "Down_Directional_Acc",
    )
    metrics = dict(zip(overall_keys, _subset_metrics(y_true, y_pred, permnos)))

    if meta is not None and "MKTCAP_PERCENTILE" in meta:
        # Convert once to a positional ndarray so the masks do not depend on
        # the container type or index of `meta`.
        percentile = np.asarray(meta["MKTCAP_PERCENTILE"])
        group_masks = (
            ("Top25", percentile >= 0.75),
            ("Bottom25", percentile <= 0.25),
        )
        group_suffixes = (
            "R2_zero", "RMSE", "MAE", "MSE", "Dir_Acc", "Up_Acc", "Down_Acc",
        )

        for prefix, mask in group_masks:
            if not np.any(mask):
                continue
            perms = permnos[mask] if permnos is not None else None
            values = _subset_metrics(y_true[mask], y_pred[mask], perms)
            metrics.update(
                {f"{prefix}_{suffix}": value
                 for suffix, value in zip(group_suffixes, values)}
            )

    return metrics


In [None]:
# ========== 3. Save functions (consistent with Linear_Models) ==========

def save_model(model, name, window, path="models/"):
    """Write the model's state_dict to ``<path>/<name>_w<window>.pth``.

    The target directory is created if it does not exist.
    """
    os.makedirs(path, exist_ok=True)
    checkpoint_file = os.path.join(path, f"{name}_w{window}.pth")
    weights = model.state_dict()
    torch.save(weights, checkpoint_file)

def save_metrics(metrics_dict, name, window, path="results.csv"):
    """Insert or replace one (Model, Window) row in the metrics CSV.

    Args:
        metrics_dict: mapping of metric name to value for this run.
        name: model name stored in the "Model" column.
        window: window size stored in the "Window" column.
        path: CSV file to create or update.
    """
    record = pd.DataFrame([metrics_dict])
    record.insert(0, "Model", name)
    record.insert(1, "Window", window)

    # No file yet: the new row becomes the whole table.
    if not os.path.exists(path):
        record.to_csv(path, index=False)
        print(f"[Create] New metrics file created with {name} w={window}")
        return

    # Otherwise drop any earlier row for the same model/window, then append.
    existing = pd.read_csv(path)
    same_run = (existing["Model"] == name) & (existing["Window"] == window)
    merged = pd.concat([existing[~same_run], record], ignore_index=True)
    merged.to_csv(path, index=False)
    print(f"[Update] Metrics updated for {name} w={window}")

def save_predictions(model_name, window_size, y_true, y_pred, permnos, path="predictions/"):
    """Write per-sample predictions to ``<path>/<model_name>_w<window_size>.csv``.

    The CSV has the columns PERMNO, y_true and y_pred (no index column). The
    target directory is created if it does not exist.
    """
    os.makedirs(path, exist_ok=True)

    columns = {"PERMNO": permnos, "y_true": y_true, "y_pred": y_pred}
    table = pd.DataFrame(columns)

    filename = f"{model_name}_w{window_size}.csv"
    destination = os.path.join(path, filename)
    table.to_csv(destination, index=False)
    print(f"[Save] {filename}")


In [None]:
# ========== 4. Dataset and Model Definition ==========

class StockDataset(Dataset):
    """In-memory dataset of (features, target) pairs held as float32 tensors.

    Both arrays are converted with ``torch.FloatTensor`` and moved to
    ``device`` once, at construction time.
    """

    def __init__(self, X, y, device):
        features = torch.FloatTensor(X)
        targets = torch.FloatTensor(y)
        self.X = features.to(device)
        self.y = targets.to(device)

    def __len__(self):
        # Number of samples = size of the leading dimension of X.
        return len(self.X)

    def __getitem__(self, idx):
        sample = self.X[idx]
        label = self.y[idx]
        return sample, label

class LSTMModel(nn.Module):
    """Stacked LSTM followed by dropout and a single-output linear head.

    Args:
        input_size: number of features per time step.
        hidden_size: LSTM hidden state width.
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability applied to the last time step's output
            and, when ``num_layers > 1``, between LSTM layers.
    """

    def __init__(self, input_size, hidden_size, num_layers, dropout=0.2):
        super().__init__()

        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            # Inter-layer dropout is only passed for stacked LSTMs.
            dropout=dropout if num_layers > 1 else 0
        )

        self.fc = nn.Linear(hidden_size, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map ``x`` of shape (batch_size, seq_len, input_size) to (batch_size,)."""
        lstm_out, _ = self.lstm(x)
        # Only use the output of the last time step
        last_output = lstm_out[:, -1, :]
        last_output = self.dropout(last_output)
        out = self.fc(last_output)
        # Drop only the trailing singleton dimension. A bare squeeze() would
        # also remove the batch dimension for a batch of one, producing a 0-d
        # tensor that mismatches the target shape and cannot be iterated.
        return out.squeeze(-1)

def prepare_data(X, y, batch_size=32, device=None):
    """Wrap arrays in a StockDataset and return a non-shuffling DataLoader.

    Args:
        X: feature array passed to StockDataset.
        y: target array passed to StockDataset.
        batch_size: number of samples per batch.
        device: device the tensors are moved to. When omitted, the best
            available backend is chosen in the same order as the module-level
            ``device``: MPS, then CUDA, then CPU.

    Returns:
        DataLoader that iterates the samples in their original order
        (``shuffle=False``).
    """
    if device is None:
        # Previously the fallback jumped from MPS straight to CPU, ignoring an
        # available CUDA device.
        if torch.backends.mps.is_available():
            device = torch.device("mps")
        elif torch.cuda.is_available():
            device = torch.device("cuda")
        else:
            device = torch.device("cpu")
    dataset = StockDataset(X, y, device)
    return DataLoader(dataset, batch_size=batch_size, shuffle=False)


In [None]:
# ========== 5. Training Functions ==========

def train_epoch(model, train_loader, criterion, optimizer, device):
    """Run one optimisation pass over ``train_loader``.

    Returns:
        The mean of the per-batch loss values (sum of batch losses divided by
        the number of batches).
    """
    model.train()
    running_loss = 0

    for features, target in train_loader:
        features = features.to(device)
        target = target.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(features), target)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    num_batches = len(train_loader)
    return running_loss / num_batches

def validate(model, val_loader, criterion, device):
    """Evaluate the model on ``val_loader`` without gradient tracking.

    Returns:
        (mean_batch_loss, predictions, targets) where the two arrays are the
        per-batch model outputs / targets concatenated along the first axis.

    Raises:
        ZeroDivisionError: if ``val_loader`` yields no batches.
    """
    model.eval()
    total_loss = 0
    prediction_batches = []
    target_batches = []

    with torch.no_grad():
        for X, y in val_loader:
            X, y = X.to(device), y.to(device)
            output = model(X)
            loss = criterion(output, y)
            total_loss += loss.item()

            # atleast_1d guards against a 0-d output (e.g. a batch of one that
            # the model squeezed to a scalar), which list.extend cannot
            # iterate. Keeping whole batch arrays also avoids a slow
            # element-by-element Python loop.
            prediction_batches.append(np.atleast_1d(output.cpu().numpy()))
            target_batches.append(np.atleast_1d(y.cpu().numpy()))

    mean_loss = total_loss / len(val_loader)
    return (mean_loss,
            np.concatenate(prediction_batches),
            np.concatenate(target_batches))

def create_train_val_split_timeseries(X_train, y_train, permnos_train, n_splits=3, test_size_ratio=0.2):
    """
    Chronological train/validation split based on sklearn's TimeSeriesSplit.

    The splitter is built with ``n_splits`` folds and a validation size of
    ``int(len(X_train) * test_size_ratio)`` samples; only the final fold is
    used, so validation indices always come after the training indices.

    Returns:
        (X_tr, X_val, y_tr, y_val, perm_tr, perm_val); the last two are None
        when ``permnos_train`` is None.
    """
    holdout_size = int(len(X_train) * test_size_ratio)
    splitter = TimeSeriesSplit(n_splits=n_splits, test_size=holdout_size)

    # Exhaust the generator and keep only the last (train, validation) pair.
    *_, (train_idx, val_idx) = splitter.split(X_train)

    perm_tr, perm_val = None, None
    if permnos_train is not None:
        perm_tr = permnos_train[train_idx]
        perm_val = permnos_train[val_idx]

    return (
        X_train[train_idx],
        X_train[val_idx],
        y_train[train_idx],
        y_train[val_idx],
        perm_tr,
        perm_val,
    )

def create_train_val_split(X_train, y_train, permnos_train, val_ratio=0.2):
    """
    Split off the tail of the training data as a validation set.

    The first ``int(len(X_train) * (1 - val_ratio))`` samples stay in the
    training part; everything after that index becomes validation data, so
    the original (chronological) order is preserved.

    Returns:
        (X_tr, X_val, y_tr, y_val, perm_tr, perm_val); the last two are None
        when ``permnos_train`` is None.
    """
    cut = int(len(X_train) * (1 - val_ratio))
    head, tail = slice(None, cut), slice(cut, None)

    perm_tr, perm_val = None, None
    if permnos_train is not None:
        perm_tr, perm_val = permnos_train[head], permnos_train[tail]

    return X_train[head], X_train[tail], y_train[head], y_train[tail], perm_tr, perm_val


In [None]:
# ========== 6. Hyperparameter Optimization Function ==========

def optimize_lstm_hyperparameters(X_train, y_train, permnos_train, window_size, n_trials=15):
    """
    Use Optuna to optimize LSTM hyperparameters (with pruning, some parameters fixed)

    Searched per trial: batch_size (32/64/128), learning_rate (log-uniform in
    [1e-5, 5e-3]) and dropout_rate ([0.0, 0.2]). hidden_size=128 and
    num_layers=2 are fixed. All trials are trained on CPU.

    Args:
        X_train: training features; reshaped here to
            (n_samples, window_size, -1).
        y_train: training targets.
        permnos_train: forwarded to the train/validation split helper; the
            split identifiers it returns are discarded.
        window_size: sequence length used for the reshape.
        n_trials: number of Optuna trials.

    Returns:
        dict with keys 'batch_size', 'learning_rate', 'dropout_rate',
        'hidden_size', 'num_layers' and 'epochs'. When Optuna is not
        installed, a fixed default dict with the same keys is returned.
    """
    if not OPTUNA_AVAILABLE:
        print("[Warning] Optuna not available, using default parameters")
        return {
            'batch_size': 32,
            'learning_rate': 0.0005,
            'dropout_rate': 0.1,
            'hidden_size': 128,
            'num_layers': 2,
            'epochs': 50
        }

    # assumes X_train arrives flattened per sample so the trailing axis can be
    # inferred as features-per-time-step — TODO confirm against the data prep.
    X_train_reshaped = X_train.reshape(X_train.shape[0], window_size, -1)
    input_size = X_train_reshaped.shape[2]

    # One fixed train/validation split is shared by every trial.
    X_tr, X_val, y_tr, y_val, _, _ = create_train_val_split_timeseries(
        X_train_reshaped, y_train, permnos_train, n_splits=4, test_size_ratio=0.2
    )

    # The search deliberately ignores the module-level `device` and uses CPU.
    cpu_device = torch.device("cpu")

    def objective(trial):
        """Train one candidate configuration; return its best validation loss."""
        batch_size = trial.suggest_categorical('batch_size', [32, 64, 128])
        learning_rate = trial.suggest_float('learning_rate', 1e-5, 5e-3, log=True)
        dropout_rate = trial.suggest_float('dropout_rate', 0.0, 0.2)

        # Architecture is held constant across trials.
        hidden_size = 128
        num_layers = 2

        try:
            train_loader = prepare_data(X_tr, y_tr, batch_size=batch_size, device=cpu_device)
            val_loader = prepare_data(X_val, y_val, batch_size=batch_size, device=cpu_device)

            model = LSTMModel(
                input_size=input_size,
                hidden_size=hidden_size,
                num_layers=num_layers,
                dropout=dropout_rate
            ).to(cpu_device)

            criterion = nn.MSELoss()
            optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

            best_val_loss = float('inf')
            # Early stopping needs 10 non-improving epochs out of at most 20.
            patience = 10
            patience_counter = 0
            max_epochs = 20

            for epoch in range(max_epochs):
                model.train()
                # Accumulated for the epoch but not read anywhere below.
                train_loss = 0
                for X_batch, y_batch in train_loader:
                    X_batch, y_batch = X_batch.to(cpu_device), y_batch.to(cpu_device)
                    optimizer.zero_grad()
                    output = model(X_batch)
                    loss = criterion(output, y_batch)
                    loss.backward()
                    optimizer.step()
                    train_loss += loss.item()

                model.eval()
                val_loss = 0
                with torch.no_grad():
                    for X_batch, y_batch in val_loader:
                        X_batch, y_batch = X_batch.to(cpu_device), y_batch.to(cpu_device)
                        output = model(X_batch)
                        loss = criterion(output, y_batch)
                        val_loss += loss.item()

                # Mean of per-batch validation losses.
                val_loss /= len(val_loader)

                # Report the intermediate value so the pruner can compare trials.
                trial.report(val_loss, epoch)

                if trial.should_prune():
                    # Release the trial's objects before signalling the prune.
                    del model, train_loader, val_loader
                    gc.collect()
                    raise optuna.exceptions.TrialPruned()

                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    patience_counter = 0
                else:
                    patience_counter += 1
                    if patience_counter >= patience:
                        break

            del model, train_loader, val_loader
            gc.collect()

            return best_val_loss

        except optuna.exceptions.TrialPruned:
            # Pruning must propagate to Optuna unchanged.
            raise
        except Exception as e:
            # NOTE(review): any other error is only printed and the trial
            # returns +inf, so a systematic failure (e.g. a shape bug) does not
            # stop the study — confirm this best-effort behaviour is intended.
            print(f"Trial failed: {e}")
            return float('inf')

    pruner = optuna.pruners.MedianPruner(
        n_startup_trials=3,
        n_warmup_steps=5,
        interval_steps=2
    )

    print(f"[Optuna] Starting hyperparameter optimization with {n_trials} trials on CPU...")
    print("[Optuna] Using MedianPruner for early trial termination")
    print("[Optuna] Fixed parameters: hidden_size=128, num_layers=2")
    print("[Optuna] Searching: batch_size, learning_rate, dropout_rate")

    # Seeded TPE sampler so the sequence of suggested parameters is repeatable.
    study = optuna.create_study(
        direction='minimize', 
        sampler=optuna.samplers.TPESampler(seed=42),
        pruner=pruner
    )

    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

    # Add the non-searched settings so the result has the same keys as the
    # default dict returned when Optuna is unavailable.
    best_params = study.best_params
    best_params['hidden_size'] = 128
    best_params['num_layers'] = 2
    best_params['epochs'] = 50

    print(f"[Optuna] Best parameters: {best_params}")
    print(f"[Optuna] Best validation loss: {study.best_value:.6f}")
    print(f"[Optuna] Number of pruned trials: {len([t for t in study.trials if t.state == optuna.trial.TrialState.PRUNED])}")
    print(f"[Optuna] Number of completed trials: {len([t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE])}")

    return best_params

# ========== 7. Training and Evaluation Main Function ==========

def train_and_evaluate_lstm(window_size, X_train, y_train, X_test, y_test,
                           permnos_train, permnos_test, meta=None, shared_params=None):
    """Train an LSTM with early stopping, evaluate on the test set, save results.

    Args:
        window_size: sequence length; features are reshaped to
            (n_samples, window_size, -1).
        X_train, y_train: training features and targets.
        X_test, y_test: test features and targets.
        permnos_train: stock identifiers for the training samples (forwarded
            to the split helper).
        permnos_test: stock identifiers for the test samples, used for the
            per-stock directional metrics and the saved predictions.
        meta: optional test-set metadata forwarded to regression_metrics.
        shared_params: optional dict with keys 'batch_size', 'learning_rate',
            'dropout_rate', 'hidden_size', 'num_layers', 'epochs'. Built-in
            defaults are used when it is None.

    Returns:
        dict of metrics as produced by regression_metrics.

    Side effects:
        Saves the model weights, appends/updates the metrics CSV and writes
        the test predictions CSV via the save_* helpers.
    """
    print(f"\nTraining LSTM on Window = {window_size}")

    X_train_reshaped = X_train.reshape(X_train.shape[0], window_size, -1)
    X_test_reshaped = X_test.reshape(X_test.shape[0], window_size, -1)

    input_size = X_train_reshaped.shape[2]
    print(f"[Info] Input shape: {X_train_reshaped.shape}, Input size: {input_size}")

    # Chronological split; the permno halves are not needed during training.
    X_tr, X_val, y_tr, y_val, _, _ = create_train_val_split_timeseries(
        X_train_reshaped, y_train, permnos_train, n_splits=4, test_size_ratio=0.2
    )

    if shared_params is not None:
        print(f"[Info] Using shared parameters for LSTM")
        batch_size = shared_params['batch_size']
        learning_rate = shared_params['learning_rate']
        dropout_rate = shared_params['dropout_rate']
        hidden_size = shared_params['hidden_size']
        num_layers = shared_params['num_layers']
        epochs = shared_params['epochs']
    else:
        print(f"[Info] Using default parameters for LSTM")
        batch_size = 32
        learning_rate = 0.0005
        dropout_rate = 0.1
        hidden_size = 128
        num_layers = 2
        epochs = 50

    train_loader = prepare_data(X_tr, y_tr, batch_size=batch_size, device=device)
    val_loader = prepare_data(X_val, y_val, batch_size=batch_size, device=device)
    test_loader = prepare_data(X_test_reshaped, y_test, batch_size=batch_size, device=device)

    model = LSTMModel(
        input_size=input_size,
        hidden_size=hidden_size,
        num_layers=num_layers,
        dropout=dropout_rate
    ).to(device)

    criterion = nn.MSELoss().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    best_model = None
    best_val_loss = float('inf')
    patience = 15
    patience_counter = 0

    print(f"[Info] Starting training for LSTM with {epochs} epochs (early stopping patience={patience})...")
    print(f"[Info] Train size: {len(X_tr)}, Val size: {len(X_val)}, Test size: {len(X_test)}")

    for epoch in range(epochs):
        train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
        val_loss, predictions, targets = validate(model, val_loader, criterion, device)

        # Diagnostics for collapsed (near-constant / one-sided) predictions.
        pred_std = np.std(predictions)
        positive_ratio = (np.sign(predictions) > 0).mean()

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # state_dict() returns references to the live parameter tensors and
            # dict.copy() is shallow, so the snapshot must clone each tensor;
            # otherwise later optimizer steps overwrite the "best" weights and
            # restoring them after early stopping is a no-op.
            best_model = {
                key: tensor.detach().clone()
                for key, tensor in model.state_dict().items()
            }
            patience_counter = 0
        else:
            patience_counter += 1

        if (epoch + 1) % 10 == 0:
            print(f"[LSTM] Epoch {epoch+1}/{epochs}, "
                  f"Train Loss: {train_loss:.6f}, Val Loss: {val_loss:.6f}, "
                  f"PredStd: {pred_std:.6f}, PosRatio: {positive_ratio:.3f}")

        if patience_counter >= patience:
            print(f"[LSTM] Early stopping at epoch {epoch+1}")
            break

    # Restore the weights from the epoch with the lowest validation loss.
    if best_model is not None:
        model.load_state_dict(best_model)

    _, y_pred, y_true = validate(model, test_loader, criterion, device)

    print("\n=== Directional Sanity Check ===")
    print("Pos ratio (y_test):", (y_test > 0).mean())
    print("Neg ratio (y_test):", (y_test < 0).mean())
    sign_pred = np.sign(y_pred)
    print("Pred +1 ratio:", (sign_pred > 0).mean())
    print("Pred -1 ratio:", (sign_pred < 0).mean())

    # Rows are true sign (+1, -1), columns predicted sign (+1, -1); zeros are
    # excluded because only the labels 1 and -1 are requested.
    conf = confusion_matrix(np.sign(y_test), sign_pred, labels=[1, -1])
    print("      Pred+  Pred-")
    print("+1 |", conf[0])
    print("-1 |", conf[1])

    num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    metrics = regression_metrics(y_true, y_pred, k=window_size, meta=meta, permnos=permnos_test)
    print(f"[Info] Model has {num_params} trainable parameters, using window_size={window_size} for Adjusted R²")

    save_model(model, "LSTM", window_size)
    save_metrics(metrics, "LSTM", window_size)
    save_predictions("LSTM", window_size, y_true, y_pred, permnos_test)

    print(f"[Info] Training completed for LSTM")
    return metrics


In [None]:
# ========== 8. Main Dispatcher Function ==========

def load_datasets(npz_path="/Users/june/Documents/University of Manchester/Data Science/ERP/Project code/1_Data_Preprocessing/all_window_datasets_unscaled.npz"):
    """Load every array stored in an ``.npz`` archive into a plain dict.

    Args:
        npz_path: path to the archive. NOTE(review): the default is an
            absolute path on one machine; pass an explicit path elsewhere.

    Returns:
        dict mapping each archive key to its fully loaded array.
    """
    # SECURITY: allow_pickle=True unpickles object arrays, which can execute
    # arbitrary code. Only load archives produced by a trusted pipeline.
    # The context manager closes the underlying zip file handle, which the
    # lazily-loading NpzFile would otherwise keep open.
    with np.load(npz_path, allow_pickle=True) as archive:
        # Indexing materialises each array, so the dict stays valid after close.
        return {key: archive[key] for key in archive.files}

def loop_all_windows():
    """Train and evaluate the LSTM for each configured window size.

    Hyperparameters are tuned once, on window 5, and the resulting dict is
    passed unchanged to every subsequent window. Memory is cleared after each
    window.
    """
    datasets = load_datasets()

    shared_params = None

    for window in [5, 21, 252, 512]:
        print(f"\n=== Processing Window Size: {window} ===")

        X_train, y_train = datasets[f"X_train_{window}"], datasets[f"y_train_{window}"]
        X_test, y_test = datasets[f"X_test_{window}"], datasets[f"y_test_{window}"]

        # Metadata is stored as a 0-d object array wrapping a dict.
        meta_train = pd.DataFrame.from_dict(datasets[f"meta_train_{window}"].item())
        meta_test = pd.DataFrame.from_dict(datasets[f"meta_test_{window}"].item())

        permnos_train = meta_train["PERMNO"].values
        permnos_test = meta_test["PERMNO"].values

        tune_here = window == 5
        if tune_here:
            print("\nTraining LSTM to get shared parameters...")
            shared_params = optimize_lstm_hyperparameters(
                X_train, y_train, permnos_train, window, n_trials=15
            )
            print(f"Shared parameters from LSTM: {shared_params}")
            print("\nTraining LSTM with optimized parameters...")
        else:
            print(f"\nTraining LSTM with shared parameters...")

        train_and_evaluate_lstm(
            window, X_train, y_train, X_test, y_test,
            permnos_train, permnos_test, meta_test, shared_params
        )

        clear_memory()
        print(f"Window {window} completed and memory cleared.")


In [None]:
# ========== 9. Entry Point ==========
# Run the full multi-window pipeline only when this module is the main
# program; nothing is executed when the definitions are merely imported.
if __name__ == "__main__":
    loop_all_windows()



=== Processing Window Size: 5 ===


[I 2025-08-02 19:27:19,106] A new study created in memory with name: no-name-85eff0be-0980-4a90-849b-511603efbcd7



Training LSTM to get shared parameters...
[Optuna] Starting hyperparameter optimization with 15 trials on CPU...
[Optuna] Using MedianPruner for early trial termination
[Optuna] Fixed parameters: hidden_size=128, num_layers=2
[Optuna] Searching: batch_size, learning_rate, dropout_rate


  0%|          | 0/15 [00:00<?, ?it/s]

[I 2025-08-02 19:30:18,312] Trial 0 finished with value: 0.00016400192423913853 and parameters: {'batch_size': 64, 'learning_rate': 0.00041282053438262235, 'dropout_rate': 0.031203728088487304}. Best is trial 0 with value: 0.00016400192423913853.
[I 2025-08-02 19:32:34,330] Trial 1 finished with value: 0.00016334617800106121 and parameters: {'batch_size': 128, 'learning_rate': 0.00041917115166952007, 'dropout_rate': 0.1416145155592091}. Best is trial 1 with value: 0.00016334617800106121.
[I 2025-08-02 19:35:23,771] Trial 2 finished with value: 0.0001636845359152849 and parameters: {'batch_size': 64, 'learning_rate': 3.7419406111184946e-05, 'dropout_rate': 0.03636499344142013}. Best is trial 1 with value: 0.00016334617800106121.
[I 2025-08-02 19:36:03,759] Trial 3 pruned. 
[I 2025-08-02 19:37:16,473] Trial 4 pruned. 
[I 2025-08-02 19:38:30,570] Trial 5 pruned. 
[I 2025-08-02 19:39:50,720] Trial 6 pruned. 
[I 2025-08-02 19:41:09,023] Trial 7 pruned. 
[I 2025-08-02 19:42:04,099] Trial 8 p