<a href="https://colab.research.google.com/github/anjaliii210/Cross-Market-Meta-Learner/blob/main/meta_fin_modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
import pandas as pd

In [29]:
from google.colab import drive
drive.mount('/content/drive')
df_model = pd.read_parquet('/content/drive/MyDrive/df_model.parquet')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [30]:
print(df_model.columns)

Index(['date', 'ticker', 'price', 'asset_id', 'return', 'vol_rolling', 'ma_5',
       'ma_20', 'ret_volnorm', 'cum', 'cum_max', 'drawdown', 'vol_short',
       'vol_long', 'vol_jump', 'VaR', 'CVaR', 'ret_lag_1', 'ret_lag_2',
       'ret_lag_5', 'ret_var_20', 'acf1', 'target_next_day_return', 'Close',
       'High', 'Low', 'Open', 'Volume', 'roll_sharpe_20', 'downside_dev_20',
       'roll_skew_20', 'roll_kurt_20', 'boll_z', 'ATR_14', 'RSI_14',
       'WilliamsR_14', 'realized_vol_20', 'trend_frac_20'],
      dtype='object', name='type')


In [31]:
import numpy as np
df = df_model.copy()

# Ensure date is datetime
df['date'] = pd.to_datetime(df['date'])

#adding cross-asset features
#rank of risk normalised return
df_model['rnret_rank'] = df_model.groupby('date')['ret_volnorm'].rank(pct=True)

#cross-asset z score
df['rnret_z_universe'] = df.groupby('date')['ret_volnorm'].transform(
    lambda x: (x - x.mean()) / x.std(ddof=0)
)

#relative performance vs benchmark/index
benchmark = df[df['ticker'] == '^GSPC'][['date', 'ret_volnorm']].rename(
    columns={'ret_volnorm': 'benchmark_rnret'}
)

df = df.merge(benchmark, on='date', how='left')
df['excess_rnret_index'] = df['ret_volnorm'] - df['benchmark_rnret']

#dispersion
vol_dispersion = df.groupby('date')['realized_vol_20'].std(ddof=0).rename('vol_dispersion')
df = df.merge(vol_dispersion, on='date')


#cross-asset pairwise spread
anchors = ['^GSPC', 'AAPL', 'GC=F', 'CL=F', 'EURUSD=X', '^TNX', 'VIX']
for a in anchors:
    tmp = df[df['ticker'] == a][['date','ret_volnorm']].rename(
        columns={'ret_volnorm': f'rnret_anchor_{a}'}
    )
    df = df.merge(tmp, on='date', how='left')
    df[f'spread_vs_{a}'] = df['ret_volnorm'] - df[f'rnret_anchor_{a}']

#tail/risk features
#ewma vol normalisation
df['vol_ewma_20'] = df.groupby('asset_id')['return'].transform(
    lambda x: x.ewm(span=20, adjust=False).std()
)
df['rnret_ewma'] = df['return'] / df['vol_ewma_20']


#downside risk-norm return
df['rnret_downside'] = df['return'] / df['downside_dev_20']

#prob of extreme move
def tail_prob_1pct(series):
    return series.rolling(120).apply(
        lambda w: (w < np.percentile(w, 1)).mean(), raw=False
    )

df['tail_prob_1pct'] = df.groupby('asset_id')['return'].transform(tail_prob_1pct)

#volatility jump indicator
df['vol_jump_flag'] = (df['vol_short'] / df['vol_long']) > 2

#volume z score vs asset history
df['vol_z_asset'] = df.groupby('asset_id')['Volume'].transform(
    lambda x: (x - x.mean()) / x.std(ddof=0)
)

#vol return signed imbalance
df['vol_ret_imb'] = df['Volume'] * np.sign(df['return'])
df['vol_ret_imb_20'] = df.groupby('asset_id')['vol_ret_imb'].transform(
    lambda x: x.rolling(20).mean()
)

#cross-asset rank momentum-for 5 days and 20 days
df_model['rank_mom_5'] = df_model.groupby('asset_id')['rnret_rank'].diff(5)
df_model['rank_mom_20'] = df_model.groupby('asset_id')['rnret_rank'].diff(20)

#trend intensity
df_model['trend_intensity']=df_model["trend_frac_20"] * df_model["roll_sharpe_20"]

In [32]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

WINDOW = 60
N_COMP = 3

pivot = df.pivot(index='date', columns='asset_id', values='return').sort_index()
pivot = pivot.ffill().bfill()

rows = []
dates = pivot.index.tolist()
assets = pivot.columns.tolist()

scaler = StandardScaler()

for i in range(WINDOW, len(pivot)):
    win = pivot.iloc[i-WINDOW:i]                 # (60 x 9)

    # STANDARDIZE (critical step)
    win_std = scaler.fit_transform(win.values)   # zero mean, unit variance per asset

    pca = PCA(n_components=N_COMP)
    comps = pca.fit_transform(win_std.T)         # per-asset scores

    day = dates[i]
    for j, a in enumerate(assets):
        rows.append((day, a, *comps[j]))

pca_df = pd.DataFrame(rows,
                      columns=['date','asset_id']+[f'pca_score_{k+1}' for k in range(N_COMP)])

df = df.merge(pca_df, on=['date','asset_id'], how='left')


In [33]:
#correlation features
# Create benchmark return series
bench = df[df['ticker']=='^GSPC'][['date','return']].rename(
    columns={'return':'bench_ret'}
)
df = df.merge(bench, on='date', how='left')


df['corr_vs_index_20'] = df.groupby('asset_id').apply(
    lambda x: x['return'].rolling(20).corr(x['bench_ret'])
).reset_index(level=0, drop=True)


  df['corr_vs_index_20'] = df.groupby('asset_id').apply(


In [34]:
!pip install hmmlearn



In [35]:
#regime markers
#clusters
from sklearn.cluster import KMeans

cluster_data = df[['realized_vol_20','roll_skew_20','roll_kurt_20']].fillna(0)
kmeans = KMeans(n_clusters=3, random_state=42)
df['regime_kmeans_3'] = kmeans.fit_predict(cluster_data)

#hmm regime probabilities
from hmmlearn.hmm import GaussianHMM

hmm = GaussianHMM(n_components=2, covariance_type='full', n_iter=200)
hmm.fit(df[['ret_volnorm']].dropna())

probs = hmm.predict_proba(df[['ret_volnorm']].fillna(0))
df['hmm_prob_0'] = probs[:,0]
df['hmm_prob_1'] = probs[:,1]



In [36]:
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(subset=['return'], inplace=True)
df_model = df.copy()

In [47]:
print(df_model[200:220])

          date ticker      price    asset_id    return  vol_rolling      ma_5  \
200 2015-08-29   AAPL  25.434504  Close_AAPL  0.000000     0.026755 -0.023868   
201 2015-08-30   AAPL  25.434504  Close_AAPL  0.000000     0.026755 -0.007009   
202 2015-08-31   AAPL  25.315516  Close_AAPL -0.004678     0.025305  0.003104   
203 2015-09-01   AAPL  24.183994  Close_AAPL -0.044697     0.022479  0.040382   
204 2015-09-02   AAPL  25.221216  Close_AAPL  0.042889     0.024247 -0.004095   
205 2015-09-03   AAPL  24.778938  Close_AAPL -0.017536     0.026341  0.008390   
206 2015-09-04   AAPL  24.531977  Close_AAPL -0.009967     0.026524  0.011183   
207 2015-09-05   AAPL  24.531977  Close_AAPL  0.000000     0.026575  0.004796   
208 2015-09-06   AAPL  24.531977  Close_AAPL  0.000000     0.026575  0.007632   
209 2015-09-07   AAPL  24.531977  Close_AAPL  0.000000     0.026409  0.002013   
210 2015-09-08   AAPL  25.214487  Close_AAPL  0.027821     0.026412 -0.021655   
211 2015-09-09   AAPL  24.72

In [48]:
print(df_model.columns)

Index(['date', 'ticker', 'price', 'asset_id', 'return', 'vol_rolling', 'ma_5',
       'ma_20', 'ret_volnorm', 'cum', 'cum_max', 'drawdown', 'vol_short',
       'vol_long', 'vol_jump', 'VaR', 'CVaR', 'ret_lag_1', 'ret_lag_2',
       'ret_lag_5', 'ret_var_20', 'acf1', 'target_next_day_return', 'Close',
       'High', 'Low', 'Open', 'Volume', 'roll_sharpe_20', 'downside_dev_20',
       'roll_skew_20', 'roll_kurt_20', 'boll_z', 'ATR_14', 'RSI_14',
       'WilliamsR_14', 'realized_vol_20', 'trend_frac_20', 'rnret_z_universe',
       'benchmark_rnret', 'excess_rnret_index', 'vol_dispersion',
       'rnret_anchor_^GSPC', 'spread_vs_^GSPC', 'rnret_anchor_AAPL',
       'spread_vs_AAPL', 'rnret_anchor_GC=F', 'spread_vs_GC=F',
       'rnret_anchor_CL=F', 'spread_vs_CL=F', 'rnret_anchor_EURUSD=X',
       'spread_vs_EURUSD=X', 'rnret_anchor_^TNX', 'spread_vs_^TNX',
       'rnret_anchor_VIX', 'spread_vs_VIX', 'vol_ewma_20', 'rnret_ewma',
       'rnret_downside', 'tail_prob_1pct', 'vol_jump_flag', 'v