In [10]:
# ========== Basic Libraries ==========
import numpy as np
import pandas as pd
import os
import joblib
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# ========== Model and Preprocessing ==========
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit

# ========== Evaluation Metrics ==========
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score, explained_variance_score
)

# ========== Visualization and Hyperparameter Tuning ==========
import matplotlib.pyplot as plt
import seaborn as sns
!install optuna
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

# ========== Global Configuration ==========
# Fix the NumPy and PyTorch RNG seeds (42) so repeated runs start from the same state.
np.random.seed(42)
torch.manual_seed(42)

# CUDA-only settings; skipped entirely on machines without a CUDA device.
cuda_present = torch.cuda.is_available()
if cuda_present:
    torch.cuda.manual_seed(42)
    torch.backends.cudnn.deterministic = True

# Restore matplotlib's built-in default rc parameters.
plt.rcdefaults()

# Use the MPS backend when PyTorch reports it as available; otherwise fall back to the CPU.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

print(f"Using device: {device}")

usage: install [-bCcpSsUv] [-f flags] [-g group] [-m mode] [-o owner]
               [-M log] [-D dest] [-h hash] [-T tags]
               [-B suffix] [-l linkflags] [-N dbdir]
               file1 file2
       install [-bCcpSsUv] [-f flags] [-g group] [-m mode] [-o owner]
               [-M log] [-D dest] [-h hash] [-T tags]
               [-B suffix] [-l linkflags] [-N dbdir]
               file1 ... fileN directory
       install -dU [-vU] [-g group] [-m mode] [-N dbdir] [-o owner]
               [-M log] [-D dest] [-h hash] [-T tags]
               directory ...
Using device: mps


In [11]:
# ========== 1. Data Loading and Preprocessing ==========
class StockDataset(Dataset):
    """In-memory dataset that pairs a feature tensor with a target tensor.

    Both inputs are converted with ``torch.FloatTensor`` and moved to ``device``
    once, at construction time, so indexing returns tensors that already live
    on that device.

    Args:
        X: Features accepted by ``torch.FloatTensor``; indexed along the first axis.
        y: Targets accepted by ``torch.FloatTensor``; indexed along the first axis.
        device: Device the tensors are moved to.
    """

    def __init__(self, X, y, device):
        features = torch.FloatTensor(X)
        self.X = features.to(device)
        targets = torch.FloatTensor(y)
        self.y = targets.to(device)

    def __len__(self):
        # The sample count is the length of the feature tensor's first axis.
        return len(self.X)

    def __getitem__(self, idx):
        sample = self.X[idx]
        target = self.y[idx]
        return sample, target

def load_datasets(npz_path="/Users/june/Documents/University of Manchester/Data Science/ERP/Project code/1_Data_Preprocessing/all_window_datasets_unscaled.npz"):
    """Load every array stored in an ``.npz`` archive into a plain dict.

    Args:
        npz_path: Path to the ``.npz`` archive. The default is a machine-specific
            absolute path; pass an explicit path when running elsewhere.

    Returns:
        dict mapping each key in the archive to its array. All arrays are read
        eagerly, so the result stays usable after the archive is closed.
    """
    # NOTE(security): allow_pickle=True unpickles object arrays, which can
    # execute arbitrary code. Only load archives from a trusted source.
    # The context manager closes the underlying file handle, which the bare
    # np.load(...) call left open.
    with np.load(npz_path, allow_pickle=True) as archive:
        return {key: archive[key] for key in archive.files}

def prepare_data(X, y, batch_size=128, device=None):
    """Wrap ``X`` and ``y`` in a ``StockDataset`` and return a DataLoader over it.

    Args:
        X: Features passed through to ``StockDataset``.
        y: Targets passed through to ``StockDataset``.
        batch_size: Number of samples per batch.
        device: Device for the dataset tensors. When ``None``, MPS is used if
            available, otherwise the CPU.

    Returns:
        A ``DataLoader`` that iterates in the original row order (``shuffle=False``).
    """
    target_device = device
    if target_device is None:
        backend = "mps" if torch.backends.mps.is_available() else "cpu"
        target_device = torch.device(backend)

    stock_data = StockDataset(X, y, target_device)
    return DataLoader(stock_data, batch_size=batch_size, shuffle=False)

# ========== 2. Evaluation Metrics ==========
def r2_zero(y_true, y_pred):
    """
    Calculate zero-based R² (the baseline prediction is zero, not the mean).

    R² = 1 - sum((y_true - y_pred)^2) / sum(y_true^2)

    y_true: true values (N,), any array-like
    y_pred: predicted values (N,), any array-like

    Returns NaN when sum(y_true^2) is zero, because the ratio is undefined.
    """
    # Coerce so that plain Python lists work as well as arrays.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    rss = np.sum((y_true - y_pred) ** 2)
    tss = np.sum(y_true ** 2)
    if tss == 0:
        # Empty or all-zero targets: avoid a division by zero.
        return np.nan
    return 1 - rss / tss

def _sign_accuracy(y_true, y_pred, missing):
    """Sign-match accuracy for one set of samples.

    Samples whose true value is exactly zero are dropped. Returns ``None`` when
    nothing remains, otherwise ``(overall, up, down)`` where ``up``/``down`` are
    the accuracies on positive/negative true values, or ``missing`` when that
    side has no samples.
    """
    s_true = np.sign(y_true)
    s_pred = np.sign(y_pred)
    keep = s_true != 0
    s_true, s_pred = s_true[keep], s_pred[keep]
    if s_true.size == 0:
        return None

    hits = s_true == s_pred
    up_mask = s_true > 0
    down_mask = s_true < 0
    up_acc = hits[up_mask].mean() if up_mask.any() else missing
    down_acc = hits[down_mask].mean() if down_mask.any() else missing
    return hits.mean(), up_acc, down_acc


def _nanmean_or_nan(values):
    """``np.nanmean`` that returns NaN quietly for empty or all-NaN input."""
    arr = np.asarray(values, dtype=float)
    if arr.size == 0 or np.isnan(arr).all():
        return np.nan
    return np.nanmean(arr)


def calc_directional_metrics(y_true, y_pred, permnos=None):
    """
    Calculate sign prediction and up/down accuracy.

    Samples whose true value is exactly zero are excluded. A prediction of
    exactly zero never matches a non-zero true sign, so it counts as a miss.

    y_true: true values (N,)
    y_pred: predicted values (N,)
    permnos: optional group label per sample (N,). If provided, metrics are
        calculated per group and then averaged across groups, ignoring NaN.

    Returns (overall_acc, up_acc, down_acc).
    Without permnos: up/down accuracy is 0 when there are no up/down samples,
    and overall accuracy is NaN when every true value is zero.
    With permnos: groups with no non-zero true values are skipped; a value is
    NaN when no group could contribute to it.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    if permnos is None:
        result = _sign_accuracy(y_true, y_pred, missing=0)
        if result is None:
            # Nothing to score: same values as before, without the
            # "mean of empty slice" RuntimeWarning.
            return np.nan, 0, 0
        return result

    df = pd.DataFrame({"permno": permnos, "yt": y_true, "yp": y_pred})
    per_group = []
    for _, g in df.groupby("permno"):
        result = _sign_accuracy(g["yt"].values, g["yp"].values, missing=np.nan)
        if result is not None:
            per_group.append(result)

    if not per_group:
        return np.nan, np.nan, np.nan

    overall_accs, up_accs, down_accs = zip(*per_group)
    return (
        _nanmean_or_nan(overall_accs),
        _nanmean_or_nan(up_accs),
        _nanmean_or_nan(down_accs),
    )

def _metric_values(y_true, y_pred, permnos):
    """Return (r2_zero, rmse, mae, mse, dir_acc, up_acc, down_acc) for one sample set."""
    mse = mean_squared_error(y_true, y_pred)
    dir_acc, up_acc, down_acc = calc_directional_metrics(y_true, y_pred, permnos)
    return (
        r2_zero(y_true, y_pred),
        np.sqrt(mse),  # RMSE derived from the MSE already computed
        mean_absolute_error(y_true, y_pred),
        mse,
        dir_acc,
        up_acc,
        down_acc,
    )


def regression_metrics(y_true, y_pred, k, meta=None, permnos=None):
    """
    Calculate regression metrics and directional accuracy.

    y_true, y_pred: true and predicted values; torch tensors are detached and
        converted to NumPy arrays first.
    k: not used by the calculation; kept so existing callers keep working.
    meta: optional mapping/DataFrame. If it contains "MKTCAP_PERCENTILE", the
        same metrics are also calculated for samples with percentile >= 0.75
        ("Top25_*" keys) and <= 0.25 ("Bottom25_*" keys). A group with no
        samples contributes no keys.
    permnos: optional group label per sample, forwarded to
        calc_directional_metrics.

    Returns a dict of metric name -> value. Key order is stable because it
    determines the column order when the dict is written out as a table row.
    """
    if isinstance(y_true, torch.Tensor):
        y_true = y_true.detach().cpu().numpy()
    if isinstance(y_pred, torch.Tensor):
        y_pred = y_pred.detach().cpu().numpy()

    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if permnos is not None:
        # Plain array so that boolean-mask indexing below is positional.
        permnos = np.asarray(permnos)

    r2, rmse, mae, mse, dir_acc, up_acc, down_acc = _metric_values(y_true, y_pred, permnos)
    metrics = {
        "R2_zero": r2,
        "RMSE": rmse,
        "MAE": mae,
        "MSE": mse,
        "Directional Accuracy": dir_acc,
        "Up_Directional_Acc": up_acc,
        "Down_Directional_Acc": down_acc
    }

    if meta is not None and "MKTCAP_PERCENTILE" in meta:
        # Coerce to an array so a pandas Series (any index) or a list both work.
        percentile = np.asarray(meta["MKTCAP_PERCENTILE"])
        groups = (
            ("Top25", percentile >= 0.75),
            ("Bottom25", percentile <= 0.25),
        )
        for prefix, mask in groups:
            if not np.any(mask):
                continue
            group_permnos = permnos[mask] if permnos is not None else None
            g_r2, g_rmse, g_mae, g_mse, g_dir, g_up, g_down = _metric_values(
                y_true[mask], y_pred[mask], group_permnos
            )
            metrics.update({
                f"{prefix}_R2_zero": g_r2,
                f"{prefix}_MSE": g_mse,
                f"{prefix}_RMSE": g_rmse,
                f"{prefix}_MAE": g_mae,
                f"{prefix}_Dir_Acc": g_dir,
                f"{prefix}_Up_Acc": g_up,
                f"{prefix}_Down_Acc": g_down
            })

    return metrics


In [12]:
# 3. Model Definition
class MLP(nn.Module):
    """Fully connected regression network with a single scalar output per sample.

    Each hidden block is Linear -> ReLU -> BatchNorm1d -> Dropout, followed by a
    final Linear layer of width 1.

    Args:
        input_dim: Number of input features.
        hidden_dims: Iterable of hidden-layer widths, in order.
        dropout_rate: Dropout probability applied after every hidden block.
    """

    def __init__(self, input_dim, hidden_dims, dropout_rate=0.1):
        super().__init__()

        layers = []
        prev_dim = input_dim

        # Build hidden layers
        for hidden_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.ReLU(),
                nn.BatchNorm1d(hidden_dim),
                nn.Dropout(dropout_rate)
            ])
            prev_dim = hidden_dim

        # Output layer
        layers.append(nn.Linear(prev_dim, 1))

        # Attribute name and layer order are kept so saved state_dicts still load.
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return predictions with the trailing size-1 output axis removed.

        Only the last axis is squeezed. A bare ``squeeze()`` would also drop the
        batch axis for a batch of one, producing a 0-d tensor that no longer
        matches a ``(1,)`` target and cannot be iterated.
        """
        return self.model(x).squeeze(-1)

# Hidden-layer widths for each predefined architecture (NN1 = one hidden layer ... NN5 = five).
MLP_CONFIGS = dict(
    NN1=[64],
    NN2=[64, 32],
    NN3=[128, 64, 32],
    NN4=[128, 64, 32, 16],
    NN5=[256, 128, 64, 32, 16],
)

# Fallback training hyperparameters.
DEFAULT_PARAMS = dict(
    batch_size=128,
    learning_rate=0.001,
    dropout_rate=0.1,
    epochs=50,
)


In [13]:
# 4. Training functions
def train_epoch(model, train_loader, criterion, optimizer, device):
    """Run one optimisation pass over ``train_loader``.

    Puts the model in training mode, performs one optimizer step per batch and
    returns the unweighted mean of the per-batch loss values.
    """
    model.train()

    running_loss = 0
    for features, targets in train_loader:
        features = features.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(features), targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    num_batches = len(train_loader)
    return running_loss / num_batches

def validate(model, val_loader, criterion, device):
    """Evaluate the model on ``val_loader`` without tracking gradients.

    Args:
        model: Module to evaluate; switched to eval mode.
        val_loader: Sized iterable of ``(X, y)`` batches.
        criterion: Loss function called as ``criterion(output, y)``.
        device: Device each batch is moved to before the forward pass.

    Returns:
        ``(mean_loss, predictions, targets)`` where ``mean_loss`` is the
        unweighted mean of the per-batch losses and the two arrays hold all
        batches concatenated along the first axis, in loader order.

    Raises:
        ZeroDivisionError: if ``val_loader`` has no batches.
    """
    model.eval()
    total_loss = 0
    pred_chunks = []
    target_chunks = []

    with torch.no_grad():
        for X, y in val_loader:
            X, y = X.to(device), y.to(device)
            output = model(X)
            loss = criterion(output, y)
            total_loss += loss.item()

            # A model that squeezes its output yields a 0-d tensor for a batch
            # of one; atleast_1d keeps such a batch collectable instead of
            # failing with "iteration over a 0-d array".
            pred_chunks.append(np.atleast_1d(output.cpu().numpy()))
            target_chunks.append(np.atleast_1d(y.cpu().numpy()))

    mean_loss = total_loss / len(val_loader)
    return (mean_loss,
            np.concatenate(pred_chunks),
            np.concatenate(target_chunks))

def create_train_val_split(X_train, y_train, permnos_train, val_ratio=0.2):
    """
    Split the training data by position into a leading training part and a
    trailing validation part (no shuffling).

    The final ``val_ratio`` share of the rows becomes the validation set, so
    the split is chronological only if the rows are already in time order.

    Returns (X_tr, X_val, y_tr, y_val, perm_tr, perm_val); the last two are
    None when ``permnos_train`` is None.
    """
    cut = int(len(X_train) * (1 - val_ratio))
    head = slice(None, cut)
    tail = slice(cut, None)

    X_tr, X_val = X_train[head], X_train[tail]
    y_tr, y_val = y_train[head], y_train[tail]

    if permnos_train is None:
        return X_tr, X_val, y_tr, y_val, None, None

    return X_tr, X_val, y_tr, y_val, permnos_train[head], permnos_train[tail]


In [14]:
# ========== 5. Hyperparameter Tuning ==========

def tune_model_with_optuna(model_name, X, y, permnos=None, n_trials=10):
    """Search batch size, learning rate and dropout for one architecture with Optuna.

    Each trial trains a fresh MLP for 20 epochs on every fold of a 5-split
    ``TimeSeriesSplit`` using plain MSE loss. The trial's score is the mean,
    over folds, of the lowest validation loss seen in that fold.

    Args:
        model_name: Key into ``MLP_CONFIGS`` selecting the hidden-layer layout.
        X: Feature matrix indexed by row; ``X.shape[1]`` is the input width.
        y: Targets aligned with ``X``.
        permnos: Accepted for call-site compatibility; not used here.
        n_trials: Number of Optuna trials to run.

    Returns:
        dict of the best hyperparameters with ``"epochs"`` set to 50, or
        ``None`` when no trial completed.
    """
    input_dim = X.shape[1]
    tscv = TimeSeriesSplit(n_splits=5)

    def objective(trial):
        params = {
            "batch_size": trial.suggest_categorical("batch_size", [64, 128]),
            "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True),
            "dropout_rate": trial.suggest_float("dropout_rate", 0.05, 0.15),
            "epochs": 20
        }

        cv_scores = []
        for train_idx, val_idx in tscv.split(X):
            X_tr, X_val = X[train_idx], X[val_idx]
            y_tr, y_val = y[train_idx], y[val_idx]

            train_loader = prepare_data(X_tr, y_tr, batch_size=params["batch_size"])
            val_loader = prepare_data(X_val, y_val, batch_size=params["batch_size"])

            # `device` is the module-level device chosen in the setup cell.
            model = MLP(
                input_dim=input_dim,
                hidden_dims=MLP_CONFIGS[model_name],
                dropout_rate=params["dropout_rate"]
            ).to(device)

            criterion = nn.MSELoss()
            optimizer = torch.optim.Adam(model.parameters(), lr=params["learning_rate"])

            best_val_loss = float('inf')

            for epoch in range(params["epochs"]):
                train_epoch(model, train_loader, criterion, optimizer, device)
                val_loss, _, _ = validate(model, val_loader, criterion, device)

                if val_loss < best_val_loss:
                    best_val_loss = val_loss

            cv_scores.append(best_val_loss)

        return np.mean(cv_scores)

    # NOTE(review): the objective never calls trial.report()/should_prune(), so
    # the MedianPruner configured here has no effect on the search.
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(seed=42),
        pruner=optuna.pruners.MedianPruner(n_startup_trials=5)
    )

    study.optimize(objective, n_trials=n_trials, n_jobs=1)

    # study.best_trial raises ValueError (it never returns None) when no trial
    # has completed, so check the trial states explicitly before reading it.
    completed = [
        t for t in study.trials
        if t.state == optuna.trial.TrialState.COMPLETE
    ]
    if not completed:
        print(f"[Skip Model] {model_name} failed to complete any trial. Skipping.")
        return None

    best_params = study.best_params
    best_params["epochs"] = 50
    best_score = study.best_value
    print(f"[Optuna] {model_name} best_MSE={best_score:.6f}, best_params={best_params}")

    return best_params


In [15]:
# ========== 6. Save Functions ==========
def save_model(model, name, window, path="models/"):
    """Write the model's ``state_dict`` to ``<path>/<name>_w<window>.pth``.

    The target directory is created if it does not exist.
    """
    os.makedirs(path, exist_ok=True)
    weights = model.state_dict()
    checkpoint_file = os.path.join(path, f"{name}_w{window}.pth")
    torch.save(weights, checkpoint_file)

def save_metrics(metrics_dict, name, window, path="results.csv"):
    """Insert or replace one (model, window) row in the metrics CSV.

    The row consists of ``Model``, ``Window`` and then the entries of
    ``metrics_dict``. If ``path`` does not exist it is created with just this
    row. Otherwise any existing rows with the same model and window are
    dropped and the new row is appended at the end.
    """
    new_row = pd.DataFrame([metrics_dict])
    new_row.insert(0, "Model", name)
    new_row.insert(1, "Window", window)

    if not os.path.exists(path):
        new_row.to_csv(path, index=False)
        print(f"[Create] New metrics file created with {name} w={window}")
        return

    existing = pd.read_csv(path)
    same_run = (existing["Model"] == name) & (existing["Window"] == window)
    remaining = existing[~same_run]
    combined = pd.concat([remaining, new_row], ignore_index=True)
    combined.to_csv(path, index=False)
    print(f"[Update] Metrics updated for {name} w={window}")

def save_predictions(model_name, window_size, y_true, y_pred, permnos, path="predictions/"):
    """Write per-sample predictions to ``<path>/<model_name>_w<window_size>.csv``.

    The CSV has the columns ``PERMNO``, ``y_true`` and ``y_pred`` and no index
    column. The target directory is created if it does not exist.
    """
    os.makedirs(path, exist_ok=True)

    columns = {
        "PERMNO": permnos,
        "y_true": y_true,
        "y_pred": y_pred
    }
    table = pd.DataFrame(columns)

    filename = f"{model_name}_w{window_size}.csv"
    destination = os.path.join(path, filename)
    table.to_csv(destination, index=False)
    print(f"[Save] {filename}")


In [16]:
# ========== 7. Main training and evaluation function ==========
def train_and_evaluate(model_name, window_size,
                       X_train, y_train, X_test, y_test,
                       permnos_train, permnos_test, meta=None, shared_params=None):
    """Train one architecture on one window, evaluate on the test set and save the results.

    Hyperparameters: "NN1" is tuned with Optuna (falling back to
    ``DEFAULT_PARAMS`` if tuning yields nothing); every other model uses
    ``shared_params`` when given, otherwise ``DEFAULT_PARAMS``.

    The last 20% of the training rows are held out as a validation set for
    early stopping. The weights from the epoch with the lowest validation loss
    are restored before the test set is evaluated.

    Args:
        model_name: Key into ``MLP_CONFIGS``.
        window_size: Window identifier used in log messages and output file names.
        X_train, y_train: Training features and targets.
        X_test, y_test: Test features and targets.
        permnos_train, permnos_test: Per-sample group labels for each split.
        meta: Optional test-set metadata forwarded to ``regression_metrics``.
        shared_params: Optional hyperparameter dict for models other than "NN1".

    Returns:
        ``(metrics, best_params)``: the test metrics dict and the hyperparameters used.
    """
    print(f"\nTraining {model_name} on Window = {window_size}")
    input_dim = X_train.shape[1]

    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
    print(f"[Info] Using device: {device}")

    if model_name == "NN1":
        best_params = tune_model_with_optuna(model_name, X_train, y_train, permnos=permnos_train)
        if best_params is None:
            print(f"[Skip] {model_name} tuning failed, using default parameters")
            best_params = DEFAULT_PARAMS.copy()
    else:
        if shared_params is None:
            print(f"[Warning] No shared parameters provided for {model_name}, using default")
            best_params = DEFAULT_PARAMS.copy()
        else:
            print(f"[Info] Using shared parameters for {model_name}")
            best_params = shared_params.copy()

    # The permno halves of the split are not needed for training.
    X_tr, X_val, y_tr, y_val, _, _ = create_train_val_split(
        X_train, y_train, permnos_train, val_ratio=0.2
    )

    train_loader = prepare_data(X_tr, y_tr, batch_size=best_params["batch_size"], device=device)
    val_loader = prepare_data(X_val, y_val, batch_size=best_params["batch_size"], device=device)
    test_loader = prepare_data(X_test, y_test, batch_size=best_params["batch_size"], device=device)

    model = MLP(
        input_dim=input_dim,
        hidden_dims=MLP_CONFIGS[model_name],
        dropout_rate=best_params["dropout_rate"]
    ).to(device)

    criterion = nn.MSELoss().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=best_params["learning_rate"])

    best_model = None
    best_val_loss = float('inf')
    patience = 20
    patience_counter = 0

    print(f"[Info] Starting training for {model_name} with {best_params['epochs']} epochs (early stopping patience={patience})...")
    print(f"[Info] Train size: {len(X_tr)}, Val size: {len(X_val)}, Test size: {len(X_test)}")

    for epoch in range(best_params["epochs"]):
        train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
        val_loss, predictions, targets = validate(model, val_loader, criterion, device)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # state_dict() returns references to the live parameter tensors,
            # which later optimizer steps overwrite in place. Clone them so
            # this really is a snapshot of the best epoch.
            best_model = {
                key: value.detach().clone()
                for key, value in model.state_dict().items()
            }
            patience_counter = 0
        else:
            patience_counter += 1

        if (epoch + 1) % 10 == 0:
            # Diagnostics on the validation predictions, only needed when logging.
            pred_std = np.std(predictions)
            positive_ratio = (np.sign(predictions) > 0).mean()
            print(f"[{model_name}] Epoch {epoch+1}/{best_params['epochs']}, "
                  f"Train Loss: {train_loss:.6f}, Val Loss: {val_loss:.6f}, "
                  f"PredStd: {pred_std:.6f}, PosRatio: {positive_ratio:.3f}")

        if patience_counter >= patience:
            print(f"[{model_name}] Early stopping at epoch {epoch+1}")
            break

    if best_model is not None:
        model.load_state_dict(best_model)

    # validate() already returns NumPy arrays, so no tensor conversion is needed.
    _, y_pred, y_true = validate(model, test_loader, criterion, device)

    print("\n=== Directional Sanity Check ===")
    print("Pos ratio (y_test):", (y_test > 0).mean())
    print("Neg ratio (y_test):", (y_test < 0).mean())
    sign_pred = np.sign(y_pred)
    print("Pred +1 ratio:", (sign_pred > 0).mean())
    print("Pred -1 ratio:", (sign_pred < 0).mean())

    from sklearn.metrics import confusion_matrix
    conf = confusion_matrix(np.sign(y_test), sign_pred, labels=[1, -1])
    print("      Pred+  Pred-")
    print("+1 |", conf[0])
    print("-1 |", conf[1])

    metrics = regression_metrics(y_true, y_pred, k=X_test.shape[1], meta=meta, permnos=permnos_test)

    save_model(model, model_name, window_size)
    save_metrics(metrics, model_name, window_size)
    save_predictions(model_name, window_size, y_true, y_pred, permnos_test)

    print(f"[Info] Training completed for {model_name}")
    return metrics, best_params


In [17]:
# ========== 8. Main dispatcher function: loop through all models and windows ==========
def loop_all_models(run_test_first=False):
    """Train every model on every window size.

    For each window, NN1 is trained first and the hyperparameters it returns
    are passed as shared parameters to the remaining models.
    ``run_test_first`` is accepted but not used.
    """
    datasets = load_datasets()

    model_list = ["NN1", "NN2", "NN3", "NN4", "NN5"]
    window_sizes = [5, 21, 252, 512]
    remaining_models = model_list[1:]

    for window in window_sizes:
        print(f"\n=== Processing Window Size: {window} ===")

        X_train, y_train, X_test, y_test = (
            datasets[f"{part}_{window}"]
            for part in ("X_train", "y_train", "X_test", "y_test")
        )

        # Metadata is stored as a dict wrapped in a 0-d object array.
        raw_meta_train = datasets[f"meta_train_{window}"].item()
        raw_meta_test = datasets[f"meta_test_{window}"].item()
        meta_train = pd.DataFrame.from_dict(raw_meta_train)
        meta_test = pd.DataFrame.from_dict(raw_meta_test)

        permnos_train = meta_train["PERMNO"].values
        permnos_test = meta_test["PERMNO"].values

        # Positional arguments shared by every train_and_evaluate call for this window.
        common_args = (
            X_train, y_train, X_test, y_test,
            permnos_train, permnos_test,
            meta_test,
        )

        print(f"\nTraining NN1 to get shared parameters...")
        _, shared_params = train_and_evaluate("NN1", window, *common_args)
        print(f"Shared parameters from NN1: {shared_params}")

        for model_name in remaining_models:
            print(f"\nTraining {model_name} with shared parameters...")
            train_and_evaluate(model_name, window, *common_args, shared_params)


In [18]:
# ========== 9. Entry point ==========
# Start the full model/window sweep only when this code runs as the top-level
# program, not when it is imported as a module.
if __name__ == "__main__":
    loop_all_models()



=== Processing Window Size: 5 ===

Training NN1 to get shared parameters...

▶ Training NN1 on Window = 5
[Info] Using device: mps
[Optuna] NN1 best_MSE=0.000304, best_params={'batch_size': 64, 'learning_rate': 3.8396292998041685e-05, 'dropout_rate': 0.08663618432936918, 'epochs': 50}
[Info] Starting training for NN1 with 50 epochs (early stopping patience=20)...
[Info] Train size: 157536, Val size: 39384, Test size: 110850
[NN1] Epoch 10/50, Train Loss: 0.000444, Val Loss: 0.000164, PredStd: 0.000921, PosRatio: 0.661
[NN1] Epoch 20/50, Train Loss: 0.000438, Val Loss: 0.000164, PredStd: 0.000924, PosRatio: 0.624
[NN1] Epoch 30/50, Train Loss: 0.000436, Val Loss: 0.000164, PredStd: 0.001002, PosRatio: 0.610
[NN1] Epoch 40/50, Train Loss: 0.000435, Val Loss: 0.000164, PredStd: 0.000977, PosRatio: 0.642
[NN1] Epoch 50/50, Train Loss: 0.000434, Val Loss: 0.000164, PredStd: 0.000969, PosRatio: 0.630

=== Directional Sanity Check ===
Pos ratio (y_test): 0.5225259359494813
Neg ratio (y_test)