In [1]:
# ========== Basic Libraries ==========
import numpy as np
import pandas as pd
import os
import joblib

# ========== Models and Preprocessing ==========
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import confusion_matrix

# ========== Evaluation Metrics ==========
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score, explained_variance_score
)

# ========== Visualization and Hyperparameter Tuning ==========
import matplotlib.pyplot as plt
import seaborn as sns
! pip install optuna
import optuna
# Only surface Optuna log records at WARNING level or above, so the
# per-trial progress messages do not flood the cell output.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# ========== Global Configuration ==========
# Seed NumPy's legacy global RNG. The estimators built further down are
# additionally given an explicit random_state=42.
np.random.seed(42)
# Reset matplotlib's rcParams to the library defaults.
plt.rcdefaults()




In [None]:
# ========== 1. Data Loading ==========
def load_datasets(npz_path="/Users/june/Documents/University of Manchester/Data Science/ERP/Project code/1_Data_Preprocessing/all_window_datasets.npz"):
    """Load every entry of an ``.npz`` archive into a plain dict.

    Parameters
    ----------
    npz_path : str or path-like
        Location of the archive. NOTE(review): the default is an absolute
        path on one specific machine; pass the path explicitly (or move it to
        a configuration cell) when running anywhere else.

    Returns
    -------
    dict
        Maps each key stored in the archive to its fully loaded array.
    """
    # SECURITY: allow_pickle=True unpickles object arrays, which can execute
    # arbitrary code. Only load archives produced by a trusted pipeline.
    # The context manager closes the underlying file handle once every array
    # has been materialised (indexing an NpzFile reads the array eagerly).
    with np.load(npz_path, allow_pickle=True) as data:
        return {key: data[key] for key in data.files}

# ========== 2. Evaluation Metrics ==========
def r2_zero(y_true, y_pred):
    """
    Compute zero-based R² (the benchmark prediction is 0, not the sample mean).

    R²_zero = 1 - sum((y_true - y_pred)^2) / sum(y_true^2)

    y_true: true values (N,), any array-like
    y_pred: predicted values (N,), any array-like

    Returns NaN when sum(y_true^2) is 0 (empty input or all-zero targets),
    because the ratio is undefined in that case.
    """
    # Coerce to arrays so lists work and pandas inputs are compared by
    # position instead of being aligned on their index labels.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    rss = np.sum((y_true - y_pred) ** 2)
    tss = np.sum(y_true ** 2)
    if tss == 0:
        # Avoid a divide-by-zero warning and a misleading -inf score.
        return np.nan
    return 1 - rss / tss

def _sign_hit_rates(y_true, y_pred, empty_side):
    """Sign-agreement rates for one set of samples.

    Samples whose true value is exactly zero are dropped first. Returns
    ``(overall, up, down)``, or ``None`` when no sample is left.
    ``empty_side`` is the value reported for the up / down rate when there is
    no sample with a positive / negative true value.
    """
    s_true = np.sign(y_true)
    s_pred = np.sign(y_pred)
    keep = s_true != 0
    s_true = s_true[keep]
    s_pred = s_pred[keep]
    if s_true.size == 0:
        return None

    hits = s_true == s_pred
    up_mask = s_true > 0
    down_mask = s_true < 0
    up_rate = np.mean(hits[up_mask]) if np.any(up_mask) else empty_side
    down_rate = np.mean(hits[down_mask]) if np.any(down_mask) else empty_side
    return np.mean(hits), up_rate, down_rate


def _nanmean_or_nan(values):
    """``np.nanmean`` that returns NaN silently for empty / all-NaN input."""
    arr = np.asarray(values, dtype=float)
    if arr.size == 0 or np.isnan(arr).all():
        return np.nan
    return np.nanmean(arr)


def calc_directional_metrics(y_true, y_pred, permnos=None):
    """
    Calculate sign prediction accuracy and up/down accuracy.

    Samples whose true value is exactly 0 are ignored. A prediction of
    exactly 0 never matches a non-zero true sign and therefore counts as a
    miss.

    Parameters
    ----------
    y_true, y_pred : array-like, shape (N,)
    permnos : array-like, shape (N,), optional
        Group label per sample. If provided, the three rates are computed per
        group and then averaged across groups (equal weight per group);
        groups with no non-zero true value are skipped.

    Returns
    -------
    (overall_acc, up_acc, down_acc)
        Without ``permnos``: ``up_acc`` / ``down_acc`` are 0 when there is no
        up / down sample, and ``overall_acc`` is NaN when nothing is left
        after dropping zero targets.
        With ``permnos``: a rate is NaN when no group can contribute to it.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    if permnos is None:
        rates = _sign_hit_rates(y_true, y_pred, empty_side=0)
        if rates is None:
            # Same values as before, but without averaging an empty array.
            return np.nan, 0, 0
        return rates

    df = pd.DataFrame({"permno": permnos, "yt": y_true, "yp": y_pred})
    overall_accs = []
    up_accs = []
    down_accs = []
    for _, g in df.groupby("permno"):
        rates = _sign_hit_rates(g["yt"].values, g["yp"].values, empty_side=np.nan)
        if rates is None:
            continue
        overall_accs.append(rates[0])
        up_accs.append(rates[1])
        down_accs.append(rates[2])

    return (
        _nanmean_or_nan(overall_accs),
        _nanmean_or_nan(up_accs),
        _nanmean_or_nan(down_accs),
    )

# Result keys for the full sample, in the order they are written out.
_OVERALL_METRIC_KEYS = (
    "R2_zero", "RMSE", "MAE", "MSE",
    "Directional Accuracy", "Up_Directional_Acc", "Down_Directional_Acc",
)
# Key suffixes for the market-cap subgroups (prefixed with "Top25_" / "Bottom25_").
_GROUP_METRIC_SUFFIXES = (
    "R2_zero", "RMSE", "MAE", "MSE", "Dir_Acc", "Up_Acc", "Down_Acc",
)


def _metric_values(y_true, y_pred, permnos):
    """Return (r2_zero, rmse, mae, mse, dir_acc, up_acc, down_acc) for one sample set."""
    mse = mean_squared_error(y_true, y_pred)
    dir_acc, up_acc, down_acc = calc_directional_metrics(y_true, y_pred, permnos)
    return (
        r2_zero(y_true, y_pred),
        np.sqrt(mse),
        mean_absolute_error(y_true, y_pred),
        mse,
        dir_acc,
        up_acc,
        down_acc,
    )


def regression_metrics(y_true, y_pred, k, meta=None, permnos=None):
    """
    Compute regression metrics and directional accuracy.

    Parameters
    ----------
    y_true, y_pred : array-like, shape (N,)
    k : int
        Accepted for call compatibility; not used by any metric computed here.
    meta : mapping or DataFrame, optional
        If it contains ``"MKTCAP_PERCENTILE"`` (one value per sample, in the
        same order as ``y_true``), the same metrics are also computed for
        samples with percentile >= 0.75 ("Top25_*") and <= 0.25 ("Bottom25_*").
        A subgroup with no samples is omitted from the result.
    permnos : array-like, shape (N,), optional
        Group labels forwarded to ``calc_directional_metrics``.

    Returns
    -------
    dict
        Metric name -> value. Insertion order is overall metrics, then Top25,
        then Bottom25.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if permnos is not None:
        # Needed so boolean-mask indexing below also works for plain lists.
        permnos = np.asarray(permnos)

    metrics = dict(zip(_OVERALL_METRIC_KEYS, _metric_values(y_true, y_pred, permnos)))

    if meta is not None and "MKTCAP_PERCENTILE" in meta:
        # Positional array: masks must line up with y_true by position,
        # whatever index the meta frame carries.
        percentile = np.asarray(meta["MKTCAP_PERCENTILE"])
        groups = (
            ("Top25", percentile >= 0.75),
            ("Bottom25", percentile <= 0.25),
        )
        for prefix, mask in groups:
            if not np.any(mask):
                continue
            group_permnos = permnos[mask] if permnos is not None else None
            values = _metric_values(y_true[mask], y_pred[mask], group_permnos)
            metrics.update({
                f"{prefix}_{suffix}": value
                for suffix, value in zip(_GROUP_METRIC_SUFFIXES, values)
            })

    return metrics


In [3]:
# ========== 3. Save Model and Metrics ==========
def save_model(model, name, window, path="models/"):
    """Persist a fitted model to ``<path>/<name>_w<window>.joblib``.

    The target directory is created if it does not exist yet.
    """
    os.makedirs(path, exist_ok=True)
    target_file = os.path.join(path, f"{name}_w{window}.joblib")
    joblib.dump(model, target_file)

def save_metrics(metrics_dict, name, window, path="results.csv"):
    """Write one row of metrics for (name, window) to the CSV at ``path``.

    If the file does not exist it is created with this single row. Otherwise
    any existing row with the same Model and Window is dropped and the new
    row is appended at the end.
    """
    new_row = pd.DataFrame([metrics_dict])
    new_row.insert(0, "Model", name)
    new_row.insert(1, "Window", window)

    # First run: nothing to merge with.
    if not os.path.exists(path):
        new_row.to_csv(path, index=False)
        print(f"[Create] New metrics file created with {name} w={window}")
        return

    existing = pd.read_csv(path)
    is_stale = (existing["Model"] == name) & (existing["Window"] == window)
    merged = pd.concat([existing[~is_stale], new_row], ignore_index=True)
    merged.to_csv(path, index=False)
    print(f"[Update] Metrics updated for {name} w={window}")

def save_predictions(model_name, window_size, y_true, y_pred, permnos, path="predictions/"):
    """Write per-sample predictions to ``<path>/<model_name>_w<window_size>.csv``.

    The CSV has the columns PERMNO, y_true and y_pred (no index column).
    The target directory is created if it does not exist yet.
    """
    os.makedirs(path, exist_ok=True)

    filename = f"{model_name}_w{window_size}.csv"
    columns = {"PERMNO": permnos, "y_true": y_true, "y_pred": y_pred}
    pd.DataFrame(columns).to_csv(os.path.join(path, filename), index=False)
    print(f"[Save] {filename}")


In [4]:
# ========== 4. Model Hyperparameter Tuning ==========
TUNED_MODELS = {"RF", "XGB"}


def _make_tuned_regressor(model_name, params):
    """Build the regressor for ``model_name`` with a fixed seed; None if unknown."""
    if model_name == "RF":
        return RandomForestRegressor(**params, random_state=42, n_jobs=-1)
    if model_name == "XGB":
        return XGBRegressor(**params, random_state=42, n_jobs=-1)
    return None


def _suggest_search_params(trial, model_name):
    """Sample one hyperparameter set for ``model_name`` from the Optuna trial."""
    if model_name == "RF":
        return {
            'n_estimators': trial.suggest_int('n_estimators', 100, 300),
            'max_depth': trial.suggest_int('max_depth', 4, 12),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 8),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4),
            'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        }
    if model_name == "XGB":
        return {
            'n_estimators': trial.suggest_int('n_estimators', 100, 300),
            'max_depth': trial.suggest_int('max_depth', 4, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
            'subsample': trial.suggest_float('subsample', 0.7, 1.0),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 6),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 0.1, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 0.1, log=True),
        }
    raise ValueError(f"No search space defined for {model_name!r}")


def tune_model_with_optuna(model_name, X, y, permnos=None, n_trials=30):
    """Use Optuna for hyperparameter tuning - MSE as objective (suitable for portfolio construction)

    Each trial is scored by the mean validation MSE over a 5-fold
    ``TimeSeriesSplit`` of ``X`` / ``y``.
    NOTE(review): TimeSeriesSplit splits by row position, so this assumes the
    rows of X are in chronological order - confirm against the preprocessing.

    Parameters
    ----------
    model_name : str
        "RF" or "XGB"; any other name is skipped.
    X, y : array-like
        Training features and targets (indexed positionally).
    permnos : optional
        Accepted for call compatibility; not used during tuning.
    n_trials : int
        Number of Optuna trials.

    Returns
    -------
    An *unfitted* estimator configured with the best parameters found, or
    ``None`` if the model is not tunable or no trial produced a finite score.
    """
    if model_name not in TUNED_MODELS:
        print(f"[Skip] {model_name} is not a tunable model. Skipped tuning.")
        return None

    # Positional indexing below requires arrays (a DataFrame would be
    # indexed by column label).
    X = np.asarray(X)
    y = np.asarray(y)

    tscv = TimeSeriesSplit(n_splits=5)

    def objective(trial):
        try:
            model = _make_tuned_regressor(model_name, _suggest_search_params(trial, model_name))

            fold_mse = []
            for train_idx, val_idx in tscv.split(X):
                model.fit(X[train_idx], y[train_idx])
                preds = model.predict(X[val_idx])
                fold_mse.append(mean_squared_error(y[val_idx], preds))

            return np.mean(fold_mse)

        except Exception as e:
            # Best effort: one bad configuration must not abort the study.
            # The infinite score is filtered out after optimisation.
            print(f"[Optuna Trial Failed] {e}")
            return float('inf')

    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(seed=42),
        # NOTE: the objective never calls trial.report(), so this pruner
        # never actually prunes anything.
        pruner=optuna.pruners.MedianPruner()
    )

    # Run trials sequentially: parallel trials make the seeded sampler
    # non-reproducible, and every estimator already uses all cores
    # (n_jobs=-1), so running trials in parallel only oversubscribes the CPU.
    study.optimize(objective, n_trials=n_trials, n_jobs=1)

    # A trial that raised was scored as inf; do not accept its parameters
    # as "best" when nothing succeeded.
    usable_trials = [
        t for t in study.trials
        if t.state == optuna.trial.TrialState.COMPLETE
        and t.value is not None
        and np.isfinite(t.value)
    ]
    if not usable_trials:
        print(f"[Skip Model] {model_name} failed to complete any trial. Skipping.")
        return None

    best_params = study.best_params
    best_score = study.best_value
    print(f"[Optuna] {model_name} best_MSE={best_score:.6f}, best_params={best_params}")

    return _make_tuned_regressor(model_name, best_params)


In [5]:
# ========== 5. Main Training and Evaluation Function ==========
def train_and_evaluate(model_name, window_size, X_train, y_train, X_test, y_test,
                      permnos_train, permnos_test, meta_train, meta_test):
    """Tune, fit, evaluate and persist one model for one window size.

    Steps: hyperparameter tuning on the training set (falling back to default
    parameters if tuning returns nothing), fit on the full training set,
    predict the test set, print a sign-based sanity check, compute metrics,
    then save the model, the metrics row and the predictions.

    Parameters
    ----------
    model_name : str
        "RF" or "XGB".
    window_size : int
        Only used for log messages and output file names.
    X_train, y_train, X_test, y_test : array-like
    permnos_train, permnos_test : array-like
        Group label per sample for the train / test set.
    meta_train, meta_test : DataFrame-like
        Only ``meta_test`` is used here (passed on for the market-cap
        subgroup metrics); ``meta_train`` is accepted for symmetry.

    Returns
    -------
    dict
        The metrics computed on the test set.

    Raises
    ------
    ValueError
        If ``model_name`` is neither "RF" nor "XGB".
    """
    print(f"\nTraining {model_name} on Window = {window_size}")

    model = tune_model_with_optuna(model_name, X_train, y_train, permnos_train)

    if model is None:
        print(f"[Skip] {model_name} tuning failed, using default parameters")
        if model_name == "RF":
            model = RandomForestRegressor(random_state=42, n_jobs=-1)
        elif model_name == "XGB":
            model = XGBRegressor(random_state=42, n_jobs=-1)
        else:
            # Previously this fell through with model=None and crashed on
            # `None.fit(...)`; fail with a message that names the problem.
            raise ValueError(f"Unsupported model_name: {model_name!r}")

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    # Array form so the element-wise comparisons below also work for lists.
    y_test = np.asarray(y_test)

    sign_true = np.sign(y_test)
    sign_pred = np.sign(y_pred)

    # Compare the class balance of the targets with that of the predictions;
    # a model predicting almost only one sign shows up here immediately.
    print("\n=== Directional Sanity Check ===")
    print("Pos ratio (y_test):", (y_test > 0).mean())
    print("Neg ratio (y_test):", (y_test < 0).mean())
    print("Pred +1 ratio:", (sign_pred > 0).mean())
    print("Pred -1 ratio:", (sign_pred < 0).mean())

    # Confusion matrix over samples where both signs are non-zero.
    mask = (sign_true != 0) & (sign_pred != 0)
    if mask.sum() > 0:
        cm = confusion_matrix(sign_true[mask], sign_pred[mask], labels=[1, -1])
        print("      Pred+  Pred-")
        print("+1 |", cm[0])
        print("-1 |", cm[1])
    else:
        print("\n[Warning] After filtering zeros, no valid samples remain.")

    metrics = regression_metrics(y_test, y_pred, k=X_test.shape[1],
                                 meta=meta_test, permnos=permnos_test)

    save_model(model, model_name, window_size)
    save_metrics(metrics, model_name, window_size)
    save_predictions(model_name, window_size, y_test, y_pred, permnos_test)

    return metrics


In [6]:
# ========== 6. Main scheduling function: loop through all models and windows ==========
def loop_all_models(model_list=None, window_sizes=None):
    """Train all models on different window sizes

    Parameters
    ----------
    model_list : sequence of str, optional
        Model names to train. Defaults to ``["RF", "XGB"]``.
    window_sizes : sequence of int, optional
        Window sizes to process. Defaults to ``[5, 21, 252, 512]``. For each
        window ``w`` the dataset must contain ``X_train_w``, ``y_train_w``,
        ``X_test_w``, ``y_test_w``, ``meta_train_w`` and ``meta_test_w``.

    Calling it without arguments behaves exactly as before.
    """
    datasets = load_datasets()

    if model_list is None:
        model_list = ["RF", "XGB"]
    if window_sizes is None:
        window_sizes = [5, 21, 252, 512]

    def _meta_frame(key):
        # The meta entries are stored as 0-d object arrays wrapping a dict of
        # columns; .item() unwraps the dict.
        return pd.DataFrame.from_dict(datasets[key].item(), orient="columns")

    for window in window_sizes:
        print(f"\n{'='*50}")
        print(f"Processing Window Size: {window}")
        print(f"{'='*50}")

        X_train = datasets[f"X_train_{window}"]
        y_train = datasets[f"y_train_{window}"]
        X_test = datasets[f"X_test_{window}"]
        y_test = datasets[f"y_test_{window}"]

        meta_train = _meta_frame(f"meta_train_{window}")
        permnos_train = meta_train["PERMNO"].values

        meta_test = _meta_frame(f"meta_test_{window}")
        permnos_test = meta_test["PERMNO"].values

        for model_name in model_list:
            print(f"Training {model_name} on Window = {window}")
            train_and_evaluate(model_name, window, X_train, y_train, X_test, y_test,
                               permnos_train, permnos_test, meta_train, meta_test)


In [7]:
# ========== 7. Main entry point ==========
# Runs the full pipeline (every model on every window size) when this cell is
# executed or the file is run as a script; skipped when imported as a module.
if __name__ == "__main__":
    loop_all_models()


Processing Window Size: 5
Training RF on Window = 5

▶ Training RF on Window = 5
[Optuna] RF best_MSE=0.000298, best_params={'n_estimators': 176, 'max_depth': 4, 'min_samples_split': 7, 'min_samples_leaf': 4, 'max_features': 'sqrt'}

=== Directional Sanity Check ===
Pos ratio (y_test): 0.5225259359494813
Neg ratio (y_test): 0.47667117726657643
Pred +1 ratio: 0.9690121786197564
Pred -1 ratio: 0.03098782138024357
      Pred+  Pred-
+1 | [56276  1646]
-1 | [51055  1784]
[Update] Metrics updated for RF w=5
[Save] RF_w5.csv
Training XGB on Window = 5

▶ Training XGB on Window = 5
[Optuna] XGB best_MSE=0.000299, best_params={'n_estimators': 150, 'max_depth': 4, 'learning_rate': 0.0521063977683361, 'subsample': 0.7510326400110393, 'min_child_weight': 6, 'reg_alpha': 0.00024594068746898375, 'reg_lambda': 0.0005482087312288792}

=== Directional Sanity Check ===
Pos ratio (y_test): 0.5225259359494813
Neg ratio (y_test): 0.47667117726657643
Pred +1 ratio: 0.9539738385205232
Pred -1 ratio: 0.0460