<a href="https://colab.research.google.com/github/anjaliii210/Cross-Market-Meta-Learner/blob/main/meta_fin_regime_pooled_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
import pandas as pd
import json
from google.colab import drive
# Mount Google Drive so the project folder under MyDrive is reachable.
drive.mount("/content/drive")

BASE_PATH = "/content/drive/MyDrive/regime_project"

# Modelling frame stored as parquet in the project folder.
df_model = pd.read_parquet(f"{BASE_PATH}/df_model_with_regimes.parquet")

# JSON artefacts stored next to the parquet file.
with open(f"{BASE_PATH}/feature_cols_clean.json") as f:
    feature_cols_clean = json.load(f)

with open(f"{BASE_PATH}/regime_info.json") as f:
    regime_info = json.load(f)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [25]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

In [26]:
# Adding the next-day bucket for the vol-normalised return.
z_window = 30  # default rolling window (rows per asset) for the z-score


def add_next_day_z_bucket(df, window=None):
    """Add the next-day vol-normalised return, its rolling z-score and a bucket.

    Three columns are added to a date-sorted copy of ``df``:

    * ``next_rnret``        - ``ret_volnorm`` shifted by -1 within each ``asset_id``.
    * ``next_rnret_z``      - ``next_rnret`` standardised with the trailing
      rolling mean/std of ``next_rnret`` for the same asset (the window
      includes the current row).
    * ``next_day_z_bucket`` - -2, -1, 0, 1 or 2 according to the cut points
      z <= -2, (-2, -1], (-1, 1], (1, 2], z > 2; NaN when the z-score is NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``date``, ``asset_id`` and ``ret_volnorm`` columns.
    window : int, optional
        Rolling window length. Defaults to the module-level ``z_window``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` sorted by ``date`` (original index labels kept) with
        the three columns above. The input frame is not modified.
    """
    if window is None:
        window = z_window

    df = df.sort_values("date").copy()

    # The rolling results are re-aligned to the frame by index label, which
    # only works when labels are unique. Compute on a positional index and
    # put the caller's labels back before returning.
    original_index = df.index
    df = df.reset_index(drop=True)

    # 1. next-day vol-normalized return
    df["next_rnret"] = df.groupby("asset_id")["ret_volnorm"].shift(-1)

    # 2. rolling mean & std (per asset); dropping the group level leaves the
    #    positional row label so the series aligns with ``df``.
    rolling = df.groupby("asset_id")["next_rnret"].rolling(window)
    roll_mean = rolling.mean().reset_index(level=0, drop=True)
    roll_std = rolling.std().reset_index(level=0, drop=True)

    # 3. z-score
    df["next_rnret_z"] = (df["next_rnret"] - roll_mean) / roll_std

    # 4. bucketization (NaN z-scores match no condition and stay NaN)
    z = df["next_rnret_z"]
    df["next_day_z_bucket"] = np.select(
        [
            z <= -2,
            (z > -2) & (z <= -1),
            (z > -1) & (z <= 1),
            (z > 1) & (z <= 2),
            z > 2,
        ],
        [-2, -1, 0, 1, 2],
        default=np.nan
    )

    df.index = original_index
    return df

In [27]:
# Attach the next-day z-bucket target columns.
df_model = add_next_day_z_bucket(df_model)

# Rows without a target (rolling warm-up, or no next-day return) are discarded.
target_cols = ["next_rnret", "next_rnret_z", "next_day_z_bucket"]
df_model = df_model.dropna(subset=target_cols).reset_index(drop=True)

# Signed bucket (-2..2) -> 5-class index (0..4).
bucket_map = {-2: 0, -1: 1, 0: 2, 1: 3, 2: 4}

# 5-class index -> 3-class label: 0 = DOWN, 1 = NEUTRAL, 2 = UP.
bucket3_map = {
    0: 0,
    1: 0,
    2: 1,
    3: 2,
    4: 2,
}

five_class = df_model["next_day_z_bucket"].map(bucket_map)
df_model["next_day_z_bucket_cls"] = five_class.astype(int)

three_class = df_model["next_day_z_bucket_cls"].map(bucket3_map)
df_model["next_day_bucket_3cls"] = three_class.astype(int)

In [28]:
# The 5-class helper columns are only needed to derive next_day_bucket_3cls.
# errors="ignore" keeps this cell re-runnable: without it a second execution
# in the same kernel raises KeyError because the columns are already gone.
df_model = df_model.drop(
    columns=["next_day_z_bucket", "next_day_z_bucket_cls"],
    errors="ignore",
)

df_model.head()

Unnamed: 0,date,ticker,price,asset_id,return,vol_rolling,ma_5,ma_20,ret_volnorm,cum,...,pca_score_2,pca_score_3,bench_ret,corr_vs_index_20,regime_kmeans_3,hmm_prob_0,hmm_prob_1,next_rnret,next_rnret_z,next_day_bucket_3cls
0,2015-03-11,EURUSD=X,1.07087,Close_EURUSD=X,-0.011512,0.005134,0.016971,0.04225,-2.242145,0.885117,...,,,-0.001918,-0.137592,0,1.0,0.0,-2.72626,-1.904371,0
1,2015-03-11,^GSPC,2040.23999,Close_^GSPC,-0.001918,0.005646,0.010308,0.026754,-0.339624,0.991274,...,,,-0.001918,1.0,2,1.0,0.0,2.231278,1.680729,2
2,2015-03-11,SI=F,15.345,Close_SI=F,-0.017165,0.007634,0.02028,0.051672,-2.248556,0.975276,...,,,-0.001918,0.428507,0,1.0,0.0,1.20402,1.67705,2
3,2015-03-11,GC=F,1150.699951,Close_GC=F,-0.008103,0.006752,0.009021,0.035739,-1.200045,0.970236,...,,,-0.001918,0.537908,1,1.0,0.0,0.183376,0.478985,1
4,2015-03-11,MSFT,36.038612,Close_MSFT,-0.00119,0.007618,0.008004,0.032266,-0.156231,0.904165,...,,,-0.001918,0.88062,0,1.0,0.0,-3.002706,-2.171436,0


In [29]:
class Model(nn.Module):
    """Small feed-forward classifier: Linear -> ReLU -> Linear.

    Parameters
    ----------
    input_dim : int
        Number of input features per sample.
    num_classes : int, default 5
        Width of the output layer (one logit per class).
    """

    def __init__(self, input_dim, num_classes=5):
        super().__init__()
        hidden_units = 16
        layers = [
            nn.Linear(input_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, num_classes),  # raw logits, no softmax
        ]
        # Kept as a Sequential named `net` so state_dict keys stay net.0.* / net.2.*.
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the unnormalised class scores for ``x``."""
        scores = self.net(x)
        return scores

In [30]:
def filter_non_index_assets(df):
    """Return a copy of ``df`` without rows whose ``asset_id`` contains ``^``.

    The caret is matched as a literal character anywhere in the identifier.
    """
    has_caret = df["asset_id"].str.contains("^", regex=False)
    return df.loc[~has_caret].copy()

# Working frame for the pooled models: df_model minus rows whose asset_id contains "^".
df_assets = filter_non_index_assets(df_model)


In [31]:
def build_pooled_regime_dataset(
    df,
    feature_cols,
    target_col,
    regime_col,
    regime_value
):
    """Pool every row of one regime into feature/label tensors.

    Rows where ``regime_col`` equals ``regime_value`` are selected, and any row
    with a missing feature or target is discarded.

    Returns
    -------
    (torch.Tensor, torch.Tensor)
        ``float32`` features with one column per entry of ``feature_cols`` and
        ``int64`` labels taken from ``target_col``.
    """
    required = feature_cols + [target_col]
    in_regime = df[regime_col] == regime_value
    rows = df[in_regime].dropna(subset=required)

    features = rows[feature_cols].values.astype("float32")
    labels = rows[target_col].values.astype("int64")

    return torch.tensor(features), torch.tensor(labels)

In [32]:
from sklearn.utils.class_weight import compute_class_weight
def train_regime_model(
    X,
    y,
    input_dim,
    num_classes=3,
    lr=1e-3,
    epochs=50
):
    """Train one ``Model`` on a regime's pooled data with balanced class weights.

    Each epoch is a single full-batch Adam step on ``(X, y)`` using a
    class-weighted cross-entropy loss. The loss is printed every 10 epochs.

    Parameters
    ----------
    X : torch.Tensor
        Feature tensor fed to the model as one batch.
    y : torch.Tensor
        Integer class labels, one per row of ``X``.
    input_dim : int
        Number of input features passed to ``Model``.
    num_classes : int, default 3
        Number of output classes.
    lr : float, default 1e-3
        Adam learning rate.
    epochs : int, default 50
        Number of full-batch updates.

    Returns
    -------
    Model
        The trained model.

    Raises
    ------
    ValueError
        If ``y`` is empty.
    """
    labels = y.numpy()
    if labels.size == 0:
        raise ValueError("Cannot train a regime model on an empty dataset.")

    model = Model(input_dim, num_classes=num_classes)

    # Class weights (per regime). compute_class_weight raises when asked for a
    # class that never occurs in y, which can happen in a small regime. Ask
    # only for the classes that are present and scatter them into a
    # full-length vector. An absent class never appears as a target, so its
    # weight (left at 1.0) does not enter the loss.
    present = np.unique(labels)
    present_weights = compute_class_weight(
        class_weight="balanced",
        classes=present,
        y=labels
    )
    class_weights = np.ones(num_classes, dtype="float64")
    in_range = (present >= 0) & (present < num_classes)
    class_weights[present[in_range]] = present_weights[in_range]
    weights = torch.tensor(class_weights, dtype=torch.float32)

    criterion = nn.CrossEntropyLoss(weight=weights)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        optimizer.zero_grad()
        logits = model(X)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()

        if epoch % 10 == 0:
            print(f"Epoch {epoch:03d} | Loss: {loss.item():.6f}")

    return model


In [33]:
def train_all_regime_models(
    df,
    feature_cols,
    target_col,
    regime_col,
    regime_values,
    num_classes=3,
    epochs=50
):
    """Fit one shared classifier per regime value.

    For every entry of ``regime_values`` the matching rows of ``df`` are pooled
    with ``build_pooled_regime_dataset`` and passed to ``train_regime_model``.

    Returns
    -------
    dict
        Maps each regime value to its trained model.
    """
    n_features = len(feature_cols)
    models = {}

    for r in regime_values:
        print(f"\nTraining shared model for regime {r}")

        X_r, y_r = build_pooled_regime_dataset(
            df=df,
            feature_cols=feature_cols,
            target_col=target_col,
            regime_col=regime_col,
            regime_value=r,
        )

        print(f"Samples: {len(X_r)}")

        models[r] = train_regime_model(
            X_r,
            y_r,
            input_dim=n_features,
            num_classes=num_classes,
            epochs=epochs,
        )

    return models

In [34]:
# Selected features: `feature_cols_clean` was loaded from JSON at the top of
# the notebook, so it is used as-is (no re-assignment needed).

# classification target
target_col = "next_day_bucket_3cls"

regime_col = "regime_kmeans_3"
# Enumerate regimes from the frame that is actually trained on. A regime that
# only occurs in the filtered-out index rows would otherwise produce an empty
# training set. Missing regime labels are skipped.
regime_values = sorted(df_assets[regime_col].dropna().unique())


# train shared regime-conditioned classifiers
regime_models = train_all_regime_models(
    df=df_assets,    # index rows already removed
    feature_cols=feature_cols_clean,
    target_col=target_col,
    regime_col=regime_col,
    regime_values=regime_values,
    num_classes=3,
    epochs=50
)


Training shared model for regime 0
Samples: 11011
Epoch 000 | Loss: 1.490182
Epoch 010 | Loss: 1.157855
Epoch 020 | Loss: 1.144562
Epoch 030 | Loss: 1.124188
Epoch 040 | Loss: 1.106534

Training shared model for regime 1
Samples: 1335
Epoch 000 | Loss: 1.164420
Epoch 010 | Loss: 1.096424
Epoch 020 | Loss: 1.087677
Epoch 030 | Loss: 1.076000
Epoch 040 | Loss: 1.069380

Training shared model for regime 2
Samples: 5409
Epoch 000 | Loss: 1.723059
Epoch 010 | Loss: 1.269474
Epoch 020 | Loss: 1.232929
Epoch 030 | Loss: 1.194425
Epoch 040 | Loss: 1.162329


In [38]:
# Persist the asset-only frame next to the other project artefacts.
assets_parquet_path = f"{BASE_PATH}/df_assets_with_regimes.parquet"
df_assets.to_parquet(assets_parquet_path, index=False)

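### Evaluation
Within each regime the rows are sorted by date and the most recent 40% are used as the test set (`support_frac=0.6`). Note that the models above were trained on all rows of each regime, so this test slice overlaps the training data and the accuracies reported below are in-sample.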

In [35]:
import torch
import torch.nn.functional as F
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix

#evaluation dataset
def build_pooled_regime_testset(
    df,
    feature_cols,
    target_col,
    regime_col,
    regime_value,
    support_frac=0.6
):
    """Return the chronologically last part of one regime as test tensors.

    Rows where ``regime_col`` equals ``regime_value`` are selected, rows with a
    missing feature or target are dropped, and the remainder is sorted by
    ``date``. The first ``int(n * support_frac)`` rows are skipped; everything
    after them is returned.

    Returns
    -------
    (torch.Tensor, torch.Tensor)
        ``float32`` features and ``int64`` labels of the held-out rows.
    """
    required = feature_cols + [target_col]
    rows = (
        df[df[regime_col] == regime_value]
        .dropna(subset=required)
        .sort_values("date")
    )

    n_support = int(len(rows) * support_frac)
    held_out = rows.iloc[n_support:]

    features = held_out[feature_cols].values.astype("float32")
    labels = held_out[target_col].values.astype("int64")

    return torch.tensor(features), torch.tensor(labels)


#accuracy and confusion matrix
def evaluate_classifier(model, X_test, y_test, labels=None):
    """Compute accuracy and a fixed-size confusion matrix for ``model``.

    Parameters
    ----------
    model : torch.nn.Module
        Classifier returning one logit per class; it is switched to eval mode.
    X_test : torch.Tensor
        Features to score.
    y_test : torch.Tensor
        True integer labels.
    labels : array-like, optional
        Class labels that index the confusion matrix rows and columns.
        Defaults to ``0 .. n_logits - 1``, so the matrix always has one
        row/column per model output, even for a class that is absent from
        both ``y_test`` and the predictions.

    Returns
    -------
    (float, numpy.ndarray)
        Accuracy and confusion matrix (rows = true class, columns = predicted).
    """
    model.eval()
    with torch.no_grad():
        logits = model(X_test)
        preds = torch.argmax(logits, dim=1)

    if labels is None:
        # Without explicit labels sklearn sizes the matrix from the values it
        # happens to see, so matrices from different regimes could differ in shape.
        labels = np.arange(logits.shape[1])

    y_true = y_test.numpy()
    y_pred = preds.numpy()

    acc = accuracy_score(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    return acc, cm


#evaluation
# NOTE(review): regime_models were fit on every row of df_assets, so the slice
# scored here was also seen during training.
for r, model in regime_models.items():
    X_te, y_te = build_pooled_regime_testset(
        df=df_assets,
        feature_cols=feature_cols_clean,
        target_col=target_col,
        regime_col=regime_col,
        regime_value=r,
    )

    acc, cm = evaluate_classifier(model, X_te, y_te)

    # Per-regime report: header, accuracy, then the confusion matrix.
    print(f"\nRegime {r}")
    print(f"Accuracy: {acc:.4f}")
    print("Confusion matrix:")
    print(cm)


Regime 0
Accuracy: 0.3596
Confusion matrix:
[[ 264  196   87]
 [1542 1185  553]
 [ 261  182  135]]

Regime 1
Accuracy: 0.3390
Confusion matrix:
[[ 21  14   7]
 [169 150 140]
 [ 11  12  10]]

Regime 2
Accuracy: 0.3018
Confusion matrix:
[[ 81  67  84]
 [653 506 527]
 [107  73  66]]


In [36]:
import torch
import os

# Store checkpoints under the shared project root instead of repeating the
# literal path, so changing BASE_PATH moves every artefact together.
SAVE_DIR = f"{BASE_PATH}/saved_models"
os.makedirs(SAVE_DIR, exist_ok=True)

for r, model in regime_models.items():
    path = f"{SAVE_DIR}/regime_{r}.pt"
    # The regime keys come from Series.unique(), so they are presumably NumPy
    # scalars. Store a native Python value so the checkpoint can be read back
    # with torch.load(weights_only=True), which rejects pickled NumPy scalars.
    regime_label = r.item() if hasattr(r, "item") else r
    torch.save({
        "model_state": model.state_dict(),
        "regime": regime_label
    }, path)

print("Models saved to Drive.")

Models saved to Drive.
