<a href="https://colab.research.google.com/github/anjaliii210/Cross-Market-Meta-Learner/blob/main/meta_fin_modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

In [None]:
# Mount Google Drive (Colab-only API) and load the modelling frame from parquet.
from google.colab import drive
drive.mount('/content/drive')
df_model = pd.read_parquet('/content/drive/MyDrive/df_model.parquet')

Mounted at /content/drive


In [None]:
# Inventory of the columns present in the frame as loaded from disk.
print(df_model.columns)

Index(['date', 'ticker', 'price', 'asset_id', 'return', 'vol_rolling', 'ma_5',
       'ma_20', 'ret_volnorm', 'cum', 'cum_max', 'drawdown', 'vol_short',
       'vol_long', 'vol_jump', 'VaR', 'CVaR', 'ret_lag_1', 'ret_lag_2',
       'ret_lag_5', 'ret_var_20', 'acf1', 'target_next_day_return', 'Close',
       'High', 'Low', 'Open', 'Volume', 'roll_sharpe_20', 'downside_dev_20',
       'roll_skew_20', 'roll_kurt_20', 'boll_z', 'ATR_14', 'RSI_14',
       'WilliamsR_14', 'realized_vol_20', 'trend_frac_20'],
      dtype='object', name='type')


In [None]:
import numpy as np

# Every engineered feature is written to `df` (a copy of the loaded frame).
# `df_model` is rebound to `df.copy()` later in the notebook, so any feature
# assigned to `df_model` in this cell would be silently discarded.
df = df_model.copy()

# Ensure date is datetime
df['date'] = pd.to_datetime(df['date'])

# ---- cross-asset features ----
# per-date percentile rank of the risk-normalised return
df['rnret_rank'] = df.groupby('date')['ret_volnorm'].rank(pct=True)

# cross-asset z score (per date, population std)
df['rnret_z_universe'] = df.groupby('date')['ret_volnorm'].transform(
    lambda x: (x - x.mean()) / x.std(ddof=0)
)

# relative performance vs benchmark/index (^GSPC rows joined back on date)
benchmark = df[df['ticker'] == '^GSPC'][['date', 'ret_volnorm']].rename(
    columns={'ret_volnorm': 'benchmark_rnret'}
)

df = df.merge(benchmark, on='date', how='left')
df['excess_rnret_index'] = df['ret_volnorm'] - df['benchmark_rnret']

# dispersion: per-date population std of realized_vol_20 across assets
vol_dispersion = df.groupby('date')['realized_vol_20'].std(ddof=0).rename('vol_dispersion')
df = df.merge(vol_dispersion, on='date')


# cross-asset pairwise spread against a fixed set of anchor tickers
# NOTE(review): an anchor that never appears in `ticker` yields all-NaN columns
# because of the left merge -- confirm every name below (e.g. 'VIX') exists.
anchors = ['^GSPC', 'AAPL', 'GC=F', 'CL=F', 'EURUSD=X', '^TNX', 'VIX']
for a in anchors:
    tmp = df[df['ticker'] == a][['date','ret_volnorm']].rename(
        columns={'ret_volnorm': f'rnret_anchor_{a}'}
    )
    df = df.merge(tmp, on='date', how='left')
    df[f'spread_vs_{a}'] = df['ret_volnorm'] - df[f'rnret_anchor_{a}']

# ---- tail/risk features ----
# ewma vol normalisation (per asset)
df['vol_ewma_20'] = df.groupby('asset_id')['return'].transform(
    lambda x: x.ewm(span=20, adjust=False).std()
)
df['rnret_ewma'] = df['return'] / df['vol_ewma_20']


# downside risk-norm return
df['rnret_downside'] = df['return'] / df['downside_dev_20']

# prob of extreme move
def tail_prob_1pct(series):
    """Rolling (120 obs) share of the window lying strictly below the window's own 1st percentile."""
    # NOTE(review): the threshold is recomputed from each window itself, not taken
    # from a fixed/historical quantile -- confirm this is the intended definition.
    return series.rolling(120).apply(
        lambda w: (w < np.percentile(w, 1)).mean(), raw=False
    )

df['tail_prob_1pct'] = df.groupby('asset_id')['return'].transform(tail_prob_1pct)

# volatility jump indicator: short-horizon vol more than twice the long-horizon vol
df['vol_jump_flag'] = (df['vol_short'] / df['vol_long']) > 2

# volume z score vs asset history (uses the asset's full-sample mean/std)
df['vol_z_asset'] = df.groupby('asset_id')['Volume'].transform(
    lambda x: (x - x.mean()) / x.std(ddof=0)
)

# vol return signed imbalance and its 20-observation rolling mean
df['vol_ret_imb'] = df['Volume'] * np.sign(df['return'])
df['vol_ret_imb_20'] = df.groupby('asset_id')['vol_ret_imb'].transform(
    lambda x: x.rolling(20).mean()
)

# cross-asset rank momentum - change in rank over 5 and 20 rows per asset
df['rank_mom_5'] = df.groupby('asset_id')['rnret_rank'].diff(5)
df['rank_mom_20'] = df.groupby('asset_id')['rnret_rank'].diff(20)

# trend intensity
df['trend_intensity'] = df["trend_frac_20"] * df["roll_sharpe_20"]

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

WINDOW = 60
N_COMP = 3

# Wide date x asset matrix of returns; gaps are forward- then back-filled.
pivot = (
    df.pivot(index='date', columns='asset_id', values='return')
      .sort_index()
      .ffill()
      .bfill()
)

dates = pivot.index.tolist()
assets = pivot.columns.tolist()
scaler = StandardScaler()
rows = []

for end in range(WINDOW, len(pivot)):
    # Trailing window of WINDOW rows that ends just before row `end`.
    window_returns = pivot.iloc[end - WINDOW:end].values

    # Standardize each asset column inside the window.
    standardized = scaler.fit_transform(window_returns)

    # Transposing makes assets the samples, so PCA returns one score row per asset.
    asset_scores = PCA(n_components=N_COMP).fit_transform(standardized.T)

    rows.extend(
        (dates[end], asset, *scores)
        for asset, scores in zip(assets, asset_scores)
    )

score_cols = [f'pca_score_{k+1}' for k in range(N_COMP)]
pca_df = pd.DataFrame(rows, columns=['date', 'asset_id'] + score_cols)

df = df.merge(pca_df, on=['date','asset_id'], how='left')


In [None]:
# correlation features
# Benchmark return series: the ^GSPC rows, one 'bench_ret' value per date.
bench = df[df['ticker']=='^GSPC'][['date','return']].rename(
    columns={'return':'bench_ret'}
)
# Drop a 'bench_ret' left over from a previous run so the cell can be re-executed
# without the merge producing 'bench_ret_x' / 'bench_ret_y'.
df = df.drop(columns=['bench_ret'], errors='ignore').merge(bench, on='date', how='left')

# 20-observation rolling correlation of each asset's return with the benchmark.
# Selecting the two needed columns keeps the grouping column out of the applied
# frame (pandas deprecates apply operating on grouping columns).
df['corr_vs_index_20'] = (
    df.groupby('asset_id')[['return', 'bench_ret']]
      .apply(lambda g: g['return'].rolling(20).corr(g['bench_ret']))
      .reset_index(level=0, drop=True)   # drop the asset_id level to realign on df's index
)


  df['corr_vs_index_20'] = df.groupby('asset_id').apply(


In [None]:
!pip install hmmlearn

Collecting hmmlearn
  Downloading hmmlearn-0.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Downloading hmmlearn-0.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (165 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/166.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m166.0/166.0 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: hmmlearn
Successfully installed hmmlearn-0.3.3


In [None]:
# regime markers
# ---- K-means clusters on three rolling features (missing values set to 0) ----
from sklearn.cluster import KMeans

cluster_data = df[['realized_vol_20','roll_skew_20','roll_kurt_20']].fillna(0)
kmeans = KMeans(n_clusters=3, random_state=42)
df['regime_kmeans_3'] = kmeans.fit_predict(cluster_data)

# ---- HMM regime probabilities ----
from hmmlearn.hmm import GaussianHMM

# random_state fixes the HMM initialisation so the fitted states (and which state
# is labelled 0 vs 1) are reproducible across kernel restarts.
hmm = GaussianHMM(n_components=2, covariance_type='full', n_iter=200, random_state=42)
# NOTE(review): all rows of `df` are passed as one sequence (no `lengths`), so the
# stacked assets are treated as a single time series -- confirm this is intended.
hmm.fit(df[['ret_volnorm']].dropna())

# Posterior state probabilities for every row (missing ret_volnorm treated as 0).
probs = hmm.predict_proba(df[['ret_volnorm']].fillna(0))
df['hmm_prob_0'] = probs[:,0]
df['hmm_prob_1'] = probs[:,1]



In [None]:
# Turn +/-inf into NaN, discard rows that have no return, then snapshot the
# result as the modelling frame.
df = df.replace([np.inf, -np.inf], np.nan)
df = df.dropna(subset=['return'])
df_model = df.copy()

In [None]:
# Peek at rows 200-219 (positional) of the engineered frame.
df_model.iloc[200:220]

Unnamed: 0,date,ticker,price,asset_id,return,vol_rolling,ma_5,ma_20,ret_volnorm,cum,...,vol_ret_imb,vol_ret_imb_20,pca_score_1,pca_score_2,pca_score_3,bench_ret,corr_vs_index_20,regime_kmeans_3,hmm_prob_0,hmm_prob_1
200,2015-08-29,AAPL,25.434504,Close_AAPL,0.0,0.026755,-0.023868,-0.008681,0.0,1.049384,...,0.0,-1861840.0,3.737301,2.808055,0.119118,0.0,0.847102,0,0.998918,0.001082
201,2015-08-30,AAPL,25.434504,Close_AAPL,0.0,0.026755,-0.007009,-0.011519,0.0,1.049384,...,0.0,-12852160.0,3.7571,2.86349,0.328468,0.0,0.84352,0,0.998918,0.001082
202,2015-08-31,AAPL,25.315516,Close_AAPL,-0.004678,0.025305,0.003104,-0.007197,-0.18487,1.044475,...,-224917200.0,-4681460.0,3.650504,2.913739,0.452338,-0.008392,0.90475,2,0.0,1.0
203,2015-09-01,AAPL,24.183994,Close_AAPL,-0.044697,0.022479,0.040382,0.035764,-1.98839,0.99779,...,-307383600.0,-40294140.0,3.727237,2.894037,0.458094,-0.029576,0.919622,2,0.0,1.0
204,2015-09-02,AAPL,25.221216,Close_AAPL,0.042889,0.024247,-0.004095,-0.008082,1.768847,1.040584,...,247555200.0,-18209220.0,3.854172,2.666722,0.479462,0.018293,0.921004,0,0.0,1.0
205,2015-09-03,AAPL,24.778938,Close_AAPL,-0.017536,0.026341,0.00839,0.00709,-0.665723,1.022336,...,-212935600.0,-37441900.0,4.036282,2.664811,0.823896,0.001165,0.902749,0,0.0,1.0
206,2015-09-04,AAPL,24.531977,Close_AAPL,-0.009967,0.026524,0.011183,0.014167,-0.37576,1.012147,...,-199985200.0,-47441160.0,3.915524,2.72264,1.055947,-0.01533,0.901091,0,0.0,1.0
207,2015-09-05,AAPL,24.531977,Close_AAPL,0.0,0.026575,0.004796,0.011106,0.0,1.012147,...,0.0,-47441160.0,3.856529,2.904788,1.001579,0.0,0.901091,0,0.998921,0.001079
208,2015-09-06,AAPL,24.531977,Close_AAPL,0.0,0.026575,0.007632,0.007495,0.0,1.012147,...,0.0,-55618100.0,3.875371,3.091169,0.686277,0.0,0.899569,0,0.999719,0.000281
209,2015-09-07,AAPL,24.531977,Close_AAPL,0.0,0.026409,0.002013,0.004187,0.0,1.012147,...,0.0,-48705960.0,3.923026,3.091932,0.538359,0.0,0.90027,0,0.998921,0.001079


In [None]:
# Column inventory after feature engineering.
print(df_model.columns)

Index(['date', 'ticker', 'price', 'asset_id', 'return', 'vol_rolling', 'ma_5',
       'ma_20', 'ret_volnorm', 'cum', 'cum_max', 'drawdown', 'vol_short',
       'vol_long', 'vol_jump', 'VaR', 'CVaR', 'ret_lag_1', 'ret_lag_2',
       'ret_lag_5', 'ret_var_20', 'acf1', 'target_next_day_return', 'Close',
       'High', 'Low', 'Open', 'Volume', 'roll_sharpe_20', 'downside_dev_20',
       'roll_skew_20', 'roll_kurt_20', 'boll_z', 'ATR_14', 'RSI_14',
       'WilliamsR_14', 'realized_vol_20', 'trend_frac_20', 'rnret_z_universe',
       'benchmark_rnret', 'excess_rnret_index', 'vol_dispersion',
       'rnret_anchor_^GSPC', 'spread_vs_^GSPC', 'rnret_anchor_AAPL',
       'spread_vs_AAPL', 'rnret_anchor_GC=F', 'spread_vs_GC=F',
       'rnret_anchor_CL=F', 'spread_vs_CL=F', 'rnret_anchor_EURUSD=X',
       'spread_vs_EURUSD=X', 'rnret_anchor_^TNX', 'spread_vs_^TNX',
       'rnret_anchor_VIX', 'spread_vs_VIX', 'vol_ewma_20', 'rnret_ewma',
       'rnret_downside', 'tail_prob_1pct', 'vol_jump_flag', 'v

In [None]:
# (rows, columns) of the engineered frame.
df_model.shape

(32517, 72)

## Task Creation for Meta Learner

In [None]:
# Build the asset x regime task frame.
regime_col = "regime_kmeans_3"

# Feature columns: everything except identifiers, the target and the regime
# label, with string (object-dtype) columns removed.
cols_x = [
    col
    for col in df_model.columns
    if col not in ('date', 'asset_id', 'ticker', 'target_next_day_return', 'regime_kmeans_3')
    and df_model[col].dtype != 'object'
]

X = df_model[cols_x]
# NOTE(review): `y` is ret_volnorm (which is also part of cols_x), while the tasks
# built below use 'target_next_day_return' as their target -- confirm which is meant.
y = df_model['ret_volnorm']

task_columns = ['date', 'asset_id', regime_col] + cols_x + ['target_next_day_return']
df_tasks = df_model[task_columns]



In [None]:
# Group the task frame by (asset_id, regime label).
groups = df_tasks.groupby(['asset_id', regime_col])
#each(asset_id,regime) pair is one maml task

In [None]:
# Keep only (asset, regime) groups with at least 120 rows -- enough for both a
# support and a query set -- each sorted chronologically.
task_dfs = [
    (key, grp.sort_values('date'))
    for key, grp in groups
    if len(grp) >= 120   # strict lower bound
]

In [None]:
# One task per (asset, regime): the first 60% of rows form the support set,
# the remaining rows the query set.
tasks = []

for (asset, reg), frame in task_dfs:
    cut = int(0.6 * len(frame))
    head, tail = frame.iloc[:cut], frame.iloc[cut:]

    task = {"asset": asset, "regime": reg}
    task["support_X"] = head[cols_x].values
    task["support_y"] = head['target_next_day_return'].values
    task["query_X"] = tail[cols_x].values
    task["query_y"] = tail['target_next_day_return'].values
    tasks.append(task)

In [None]:
# rolling windows for stable meta learning
window = 64        # rows per block
support_len = 32   # first rows of a block -> support set
query_len = 32     # following rows of a block -> query set

rolling_tasks = []

for (asset, reg), g in task_dfs:
    g = g.sort_values('date').reset_index(drop=True)

    # Blocks start every window//2 rows (50% overlap). The `+ 1` keeps the last
    # start position whose full window still fits; without it a group of exactly
    # `window` rows yields no block and the final aligned window is dropped.
    for start in range(0, len(g) - window + 1, window//2):
        block = g.iloc[start:start+window]

        support = block.iloc[:support_len]
        query   = block.iloc[support_len:support_len+query_len]

        rolling_tasks.append({
            "asset": asset,
            "regime": reg,
            "support_X": support[cols_x].values,
            "support_y": support['target_next_day_return'].values,
            "query_X": query[cols_x].values,
            "query_y": query['target_next_day_return'].values,
        })

In [None]:
# (asset, regime) pairs that have at least 120 rows and therefore qualify as MAML tasks.
valid_pairs = [
    (asset, reg)
    for (asset, reg), grp in groups
    if len(grp) >= 120
]

pd.DataFrame(valid_pairs, columns=['asset_id','regime'])

Unnamed: 0,asset_id,regime
0,Close_AAPL,0
1,Close_AAPL,1
2,Close_AAPL,2
3,Close_CL=F,0
4,Close_CL=F,1
5,Close_CL=F,2
6,Close_EURUSD=X,0
7,Close_EURUSD=X,1
8,Close_EURUSD=X,2
9,Close_GBPUSD=X,0


In [None]:
# Row count of every (asset_id, regime) group.
df_tasks.groupby(['asset_id', regime_col]).size()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
asset_id,regime_kmeans_3,Unnamed: 2_level_1
Close_AAPL,0,2287
Close_AAPL,1,283
Close_AAPL,2,1043
Close_CL=F,0,2685
Close_CL=F,1,142
Close_CL=F,2,786
Close_EURUSD=X,0,2627
Close_EURUSD=X,1,126
Close_EURUSD=X,2,860
Close_GBPUSD=X,0,2539


## Checking gradient similarity between tasks

In [None]:
import torch
import torch.nn as nn

class Model(nn.Module):
    """Two-layer MLP regressor: Linear(input_dim, 16) -> ReLU -> Linear(16, 1)."""

    def __init__(self, input_dim):
        super().__init__()
        layers = [
            nn.Linear(input_dim, 16),
            nn.ReLU(),
            nn.Linear(16, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the network and return its output."""
        return self.net(x)


def get_grad_vector(model):
    """Concatenate the gradients of all parameters of ``model`` into one 1-D tensor.

    Parameters whose ``.grad`` is None contribute zeros of the parameter's size.
    """
    flat_grads = [
        (torch.zeros_like(param) if param.grad is None else param.grad).view(-1)
        for param in model.parameters()
    ]
    return torch.cat(flat_grads)

In [None]:
def compute_task_grad(model, X_s, y_s):
    """Return the flattened gradient of the MSE loss on one task's support set.

    Clears existing gradients, runs a forward pass on ``X_s``, back-propagates
    the MSE against ``y_s`` and returns the detached gradient vector.
    """
    model.zero_grad()
    # squeeze(-1) removes only the trailing output dimension; a bare squeeze()
    # would also collapse a one-sample batch to a 0-d tensor that no longer
    # matches the shape of y_s.
    pred = model(X_s).squeeze(-1)
    loss = torch.nn.functional.mse_loss(pred, y_s)
    loss.backward()
    return get_grad_vector(model).detach()

In [None]:
def grad_cosine_sim(g1, g2):
    """Cosine similarity of two gradient vectors, returned as a Python float."""
    # Add a leading batch axis so the similarity is taken along dim 1.
    batched_a = g1[None]
    batched_b = g2[None]
    similarity = torch.nn.functional.cosine_similarity(batched_a, batched_b, dim=1)
    return similarity.item()

In [None]:
def compute_grad_similarities(model, tasks):
    """
    Cosine similarity between the support-set gradients of every pair of tasks.

    tasks = list of (asset_id, X_s, y_s)
    Returns dict: {(asset_i, asset_j): cosine_sim}

    Gradients are keyed by asset_id, so if the same asset_id occurs more than
    once in ``tasks`` the last occurrence replaces the earlier ones.
    """
    # First compute one flattened gradient per task
    task_grads = {
        asset: compute_task_grad(model, X_s, y_s)
        for asset, X_s, y_s in tasks
    }

    # Then compare every unordered pair exactly once (i < j)
    assets = list(task_grads)
    sim = {}
    for i, a1 in enumerate(assets):
        for a2 in assets[i + 1:]:
            sim[(a1, a2)] = grad_cosine_sim(task_grads[a1], task_grads[a2])

    return sim

In [None]:
# Hand-picked subset of df_model columns used as model inputs.
# The list order is the column order of every support_X built from it.
feature_cols_clean = [
    # Price/return structure
    "return",
    "ret_volnorm",
    "ret_lag_1",
    "ret_lag_2",
    "ret_lag_5",

    # Volatility structure
    "vol_rolling",
    "vol_short",
    "vol_long",
    "realized_vol_20",
    "vol_ewma_20",
    "vol_z_asset",

    # Range-based & momentum indicators
    "ma_5",
    "ma_20",
    "ATR_14",
    "RSI_14",
    "boll_z",

    # Higher-order distribution shape (regime expressive)
    "roll_skew_20",
    "roll_kurt_20",
    "downside_dev_20",

    # Trend/momentum + persistence
    "trend_frac_20",
    "acf1",

    # Risk variables (safe, because they use past window)
    "VaR",
    "CVaR",

    # PCA latent structure (dimension-reduced, cross-asset consistent)
    "pca_score_1",
    "pca_score_2",
    "pca_score_3",
]

In [None]:
def build_task_dicts(
    df,
    feature_cols,
    target_col,
    regime_col,
    min_len=120,
    support_frac=0.6
):
    """Build one support-set dict per (asset_id, regime) group of ``df``.

    Each group is sorted by "date" and stripped of rows with a missing feature
    or target. Groups left with fewer than ``min_len`` rows are skipped. The
    first ``support_frac`` share of the remaining rows becomes the support set.

    Returns a list of dicts with keys "asset", "regime", "support_X" and
    "support_y"; the two arrays are float32.
    """
    collected = []

    for (asset, regime), group in df.groupby(["asset_id", regime_col]):
        # Chronological order, complete rows only
        complete = group.sort_values("date").dropna(subset=feature_cols + [target_col])

        n_rows = len(complete)
        if n_rows < min_len:
            continue

        head = complete.iloc[:int(n_rows * support_frac)]
        support_X = head[feature_cols].values.astype("float32")
        support_y = head[target_col].values.astype("float32")

        # Nothing to learn from an empty support set
        if len(support_X) == 0:
            continue

        collected.append({
            "asset": asset,
            "regime": regime,
            "support_X": support_X,
            "support_y": support_y
        })

    return collected


In [None]:
# Build the support sets from the clean feature list.
# NOTE(review): this call uses the `feature_cols` keyword of the build_task_dicts
# defined just above; a later cell redefines the function with that parameter
# named `feature_cols_clean`, so re-running this cell after that one will fail.
tasks_dicts = build_task_dicts(
    df=df_model,                      # full dataframe
    feature_cols=feature_cols_clean,
    target_col="target_next_day_return",
    regime_col="regime_kmeans_3"
)

In [None]:
# Sanity check: number of tasks, plus keys, support shape and regime of the first task.
print(len(tasks_dicts))
print(tasks_dicts[0].keys())
print(tasks_dicts[0]["support_X"].shape)
print(tasks_dicts[0]["regime"])

18
dict_keys(['asset', 'regime', 'support_X', 'support_y'])
(1340, 26)
0


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

# Model
class Model(nn.Module):
    """Small feed-forward regressor with one 16-unit ReLU hidden layer and a scalar output."""

    def __init__(self, input_dim):
        super().__init__()
        hidden_layer = nn.Linear(input_dim, 16)
        output_layer = nn.Linear(16, 1)
        self.net = nn.Sequential(hidden_layer, nn.ReLU(), output_layer)

    def forward(self, x):
        """Apply the network to ``x``."""
        return self.net(x)


# Gradient utilities
def get_grad_vector(model):
    """Flatten and concatenate every parameter gradient of ``model``.

    A parameter without a gradient (``.grad is None``) is represented by zeros.
    """
    def _flat_grad(param):
        source = torch.zeros_like(param) if param.grad is None else param.grad
        return source.view(-1)

    return torch.cat([_flat_grad(param) for param in model.parameters()])


def compute_task_grad(model, X_s, y_s):
    """Return the detached, flattened gradient of the MSE loss on (X_s, y_s).

    Existing gradients on ``model`` are cleared before the backward pass.
    """
    model.zero_grad()
    # squeeze(-1) drops only the trailing output dimension; a bare squeeze()
    # would also collapse a one-sample batch to a 0-d tensor that no longer
    # matches the shape of y_s.
    pred = model(X_s).squeeze(-1)
    loss = F.mse_loss(pred, y_s)
    loss.backward()
    return get_grad_vector(model).detach()


def cosine_sim(g1, g2):
    """Cosine similarity between two gradient vectors as a Python float."""
    # A leading batch axis lets cosine_similarity reduce along dim 1.
    score = F.cosine_similarity(g1[None], g2[None], dim=1)
    return score.item()


def build_task_dicts(
    df,
    feature_cols_clean,
    target_col,
    regime_col,
    min_len=120,
    support_frac=0.6
):
    """Build one support-set dict per (asset_id, regime) group of ``df``.

    Each group is sorted by "date" and rows with a missing feature or target
    are dropped. Groups with fewer than ``min_len`` remaining rows are skipped,
    as are groups whose support slice is empty. The first ``support_frac``
    share of the rows forms the support set.

    Returns a list of dicts with keys "asset", "regime", "support_X" and
    "support_y"; the two arrays are float32 and support_X's columns follow
    the order of ``feature_cols_clean``.
    """
    tasks_dicts = []

    for (asset, regime), g in df.groupby(["asset_id", regime_col]):
        g = g.sort_values("date")
        g = g.dropna(subset=feature_cols_clean + [target_col])

        if len(g) < min_len:
            continue

        split = int(len(g) * support_frac)
        support = g.iloc[:split]

        # support_X built ONLY from feature_cols_clean
        X = support[feature_cols_clean].values.astype("float32")
        y = support[target_col].values.astype("float32")

        # An empty support set would give a NaN loss downstream, so skip it.
        if X.shape[0] == 0:
            continue

        tasks_dicts.append({
            "asset": asset,
            "regime": regime,
            "support_X": X,
            "support_y": y
        })

    return tasks_dicts


def build_tasks_for_similarity(tasks_dicts):
    """Convert task dicts into (asset, regime, X, y) tuples with float32 tensors.

    ``X`` and ``y`` come from each dict's "support_X" and "support_y" entries.
    """
    def _as_float_tensor(array):
        return torch.tensor(array, dtype=torch.float32)

    return [
        (
            entry["asset"],
            entry["regime"],
            _as_float_tensor(entry["support_X"]),
            _as_float_tensor(entry["support_y"]),
        )
        for entry in tasks_dicts
    ]




# Gradient similarity
def compute_grad_similarities(model, tasks):
    """Pairwise gradient cosine similarity between tasks that share a regime.

    ``tasks`` is an iterable of (asset, regime, X_s, y_s). The result maps
    ((asset_i, regime), (asset_j, regime)) to the cosine similarity of the two
    tasks' support-set gradients; pairs from different regimes are left out.
    """
    task_grads = {
        (asset, regime): compute_task_grad(model, X_s, y_s)
        for asset, regime, X_s, y_s in tasks
    }

    keys = list(task_grads)
    similarities = {}

    for idx, first in enumerate(keys):
        for second in keys[idx + 1:]:
            # compare ONLY within same regime
            if first[1] == second[1]:
                similarities[(first, second)] = cosine_sim(
                    task_grads[first], task_grads[second]
                )

    return similarities


# USAGE
# tasks_dicts        -> list of task dictionaries built by build_task_dicts
# feature_cols_clean -> selected feature list; also the column order of support_X

tasks_dicts = build_task_dicts(
    df=df_model,
    feature_cols_clean=feature_cols_clean,
    target_col="target_next_day_return",
    regime_col="regime_kmeans_3"
)

processed_tasks = build_tasks_for_similarity(tasks_dicts)

# Seed the RNG so the random initial weights -- and therefore the gradient
# similarities printed below -- are reproducible across kernel restarts.
torch.manual_seed(42)
model = Model(input_dim=len(feature_cols_clean))

similarities = compute_grad_similarities(model, processed_tasks)

for pair, sim in similarities.items():
    print(pair, sim)



(('Close_AAPL', np.int32(0)), ('Close_CL=F', np.int32(0))) 0.9999518394470215
(('Close_AAPL', np.int32(0)), ('Close_GC=F', np.int32(0))) 0.9908687472343445
(('Close_AAPL', np.int32(0)), ('Close_MSFT', np.int32(0))) 0.9998627305030823
(('Close_AAPL', np.int32(0)), ('Close_SI=F', np.int32(0))) 0.9997276663780212
(('Close_AAPL', np.int32(0)), ('Close_^GSPC', np.int32(0))) 0.6925675272941589
(('Close_AAPL', np.int32(1)), ('Close_CL=F', np.int32(1))) 0.9979935884475708
(('Close_AAPL', np.int32(1)), ('Close_GC=F', np.int32(1))) 0.994465172290802
(('Close_AAPL', np.int32(1)), ('Close_MSFT', np.int32(1))) 0.9996376633644104
(('Close_AAPL', np.int32(1)), ('Close_SI=F', np.int32(1))) 0.9985083937644958
(('Close_AAPL', np.int32(1)), ('Close_^GSPC', np.int32(1))) -0.16765490174293518
(('Close_AAPL', np.int32(2)), ('Close_CL=F', np.int32(2))) 0.99937903881073
(('Close_AAPL', np.int32(2)), ('Close_GC=F', np.int32(2))) 0.9915462732315063
(('Close_AAPL', np.int32(2)), ('Close_MSFT', np.int32(2))) 0.99

Hence, regimes induce a shared learning geometry across assets, but not for ^GSPC.

In [None]:
# Dtypes and non-null counts of every column in the engineered frame.
df_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32517 entries, 0 to 32516
Data columns (total 72 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   date                    32517 non-null  datetime64[ns]
 1   ticker                  32517 non-null  object        
 2   price                   32517 non-null  float64       
 3   asset_id                32517 non-null  object        
 4   return                  32517 non-null  float64       
 5   vol_rolling             32517 non-null  float64       
 6   ma_5                    32517 non-null  float64       
 7   ma_20                   32517 non-null  float64       
 8   ret_volnorm             32517 non-null  float64       
 9   cum                     32517 non-null  float64       
 10  cum_max                 32517 non-null  float64       
 11  drawdown                32517 non-null  float64       
 12  vol_short               32517 non-null  float6

In [None]:
# Persist the engineered frame (including the regime columns) to Drive as parquet.
import os

# Output folder on the mounted Drive; created if it does not exist yet.
BASE_PATH = "/content/drive/MyDrive/regime_project"
os.makedirs(BASE_PATH, exist_ok=True)

# index=False: the row index is not written to the file.
df_model.to_parquet(
    f"{BASE_PATH}/df_model_with_regimes.parquet",
    index=False
)


In [None]:
# Save the clean feature list (this is also the column order of support_X) as JSON.
import json

feature_cols_clean = list(feature_cols_clean)  # ensure serializable

with open(f"{BASE_PATH}/feature_cols_clean.json", "w") as f:
    json.dump(feature_cols_clean, f, indent=2)

In [None]:
# Save regime metadata: the regime column name and the row count per regime label.
regime_col = "regime_kmeans_3"

regime_info = {
    "regime_col": regime_col,
    "regime_counts": df_model[regime_col].value_counts().to_dict()
}

with open(f"{BASE_PATH}/regime_info.json", "w") as f:
    json.dump(regime_info, f, indent=2)
