In [82]:
!rm -rf /kaggle/working/multimodal-eq-sizing
!git clone -b feature/add_loaders_target_and_spread https://github.com/brianrp09232000/multimodal-eq-sizing.git /kaggle/working/multimodal-eq-sizing
!pip install -r /kaggle/working/multimodal-eq-sizing/requirements.txt

Cloning into '/kaggle/working/multimodal-eq-sizing'...
remote: Enumerating objects: 211, done.[K
remote: Counting objects: 100% (211/211), done.[K
remote: Compressing objects: 100% (164/164), done.[K
remote: Total 211 (delta 106), reused 16 (delta 7), pack-reused 0 (from 0)[K
Receiving objects: 100% (211/211), 51.55 KiB | 2.58 MiB/s, done.
Resolving deltas: 100% (106/106), done.


In [83]:
import sys
import pathlib
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [84]:
# Silence NumPy's floating-point "invalid value" warnings for the rest of the session.
# np.seterr returns the settings that were in effect before the call; that dict is
# what this cell displays as its output.
np.seterr(invalid="ignore")

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'ignore'}

In [85]:
# Make the cloned repository importable (the next cell imports from `src`).
repo_root = pathlib.Path("/kaggle/working/multimodal-eq-sizing")
# Guard the append so re-running this cell does not stack duplicate sys.path entries.
if str(repo_root) not in sys.path:
    sys.path.append(str(repo_root))

In [86]:
from src.data.loaders import get_tickers_history, get_return_data, get_excess_return, get_vix_data, get_spread_z

In [87]:
def get_date_range(df: pd.DataFrame) -> tuple:
    """Return the earliest and latest ``Date`` found across all tickers.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with at least a ``ticker`` and a ``Date`` column.

    Returns
    -------
    tuple
        ``(start, end)``: the smallest per-ticker minimum date and the
        largest per-ticker maximum date.
    """
    # Select "Date" before aggregating so min/max run on that one column only,
    # instead of on every column of the frame (which is wasted work and can
    # fail on columns that do not support min/max).
    per_ticker = df.groupby(["ticker"])["Date"].agg(["min", "max"])
    start = per_ticker["min"].min()
    end = per_ticker["max"].max()
    return start, end

In [88]:
# Load the return data CSV and take its overall date span; `start` and `end`
# are passed to the feature-building helpers in the cells below.
df = get_return_data("/kaggle/input/news-trading/return_data.csv")
start, end = get_date_range(df)

# Add excess return

In [89]:
def add_excess_return(df, start, end):
    """Left-join the output of ``get_excess_return`` onto ``df``.

    ``df``, ``start`` and ``end`` are forwarded unchanged to
    ``get_excess_return``. The join keys are ``ticker`` and ``Date``, and
    every row of ``df`` is kept (left join). Returns the merged frame.
    """
    return df.merge(
        get_excess_return(df, start, end),
        how="left",
        on=["ticker", "Date"],
    )

In [90]:
# Rebind df to the merged frame returned by add_excess_return.
# NOTE(review): re-running this cell on an already-merged df will presumably merge
# the same columns a second time — prefer Restart & Run All.
df = add_excess_return(df, start, end)

# Add market regime VIX z-score

In [92]:
def add_vix_z(df, start, end):
    """Left-join the frame returned by ``get_vix_data(start, end)`` onto ``df`` by day.

    Both ``Date`` columns are rendered as ``YYYY-MM-DD`` strings so that rows
    match on the formatted day only; after the merge, ``Date`` in the result
    is parsed back into a UTC-aware datetime.

    The caller's ``df`` is not modified: the string join key is built on a
    copy, so ``df["Date"]`` keeps its datetime dtype after the call.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a datetime-like ``Date`` column (``.dt`` is used on it).
    start, end
        Forwarded unchanged to ``get_vix_data``.

    Returns
    -------
    pd.DataFrame
        ``df`` with the columns of the VIX frame merged in (left join on ``Date``).
    """
    date_format = "%Y-%m-%d"
    vix_z_df = get_vix_data(start, end)

    # .assign returns new frames, so neither the loader's result nor the
    # caller's df has its Date column overwritten with strings.
    vix_keyed = vix_z_df.assign(Date=vix_z_df["Date"].dt.strftime(date_format))
    df_keyed = df.assign(Date=df["Date"].dt.strftime(date_format))

    merged = df_keyed.merge(vix_keyed, on=["Date"], how="left")
    merged["Date"] = pd.to_datetime(merged["Date"], utc=True)
    return merged

In [94]:
# Rebind df to the merged frame returned by add_vix_z.
df = add_vix_z(df, start, end)

Yay!🥳


# Add spread z-score

In [96]:
def add_spread_z(existing_df: pd.DataFrame, buffer_days=380) -> pd.DataFrame:
    """
    Attach a ``spread_z`` column to an existing OHLCV frame.

    History starting ``buffer_days`` calendar days before the earliest ``Date``
    in ``existing_df`` is fetched with ``get_tickers_history`` and combined with
    the existing rows. ``get_spread_z`` (imported from ``src.data.loaders``; its
    computation is not visible here) is run on the combined range, and only its
    ``spread_z`` values are merged back onto the original rows.

    Parameters
    ----------
    existing_df : pd.DataFrame
        Frame with ``ticker`` and ``Date`` columns; ``Date`` must support
        subtracting a ``timedelta``. The frame is not modified. A ``spread_z``
        column that is already present is discarded and recomputed.
    buffer_days : int, default 380
        Calendar days of extra history to fetch before the first date.

    Returns
    -------
    pd.DataFrame
        The rows of ``existing_df`` (same order) with a non-null ``spread_z``
        clipped to the range [-3, 3].
    """
    # Drop a stale spread_z (e.g. from re-running the cell). Otherwise the merge
    # below would emit spread_z_x / spread_z_y and the clean-up would raise KeyError.
    # drop() returns a new frame, so the caller's frame is left untouched.
    df = existing_df.drop(columns=["spread_z"], errors="ignore")
    start, end = df["Date"].min(), df["Date"].max()

    tickers = sorted(df["ticker"].unique())
    fetch_start = start - timedelta(days=buffer_days)
    fetch_end = end

    # Pull the buffered history for the same tickers.
    hist = get_tickers_history(tickers, fetch_start, fetch_end)
    hist["Date"] = pd.to_datetime(hist["Date"], utc=True)

    # Combine buffer + existing; existing rows are concatenated last and
    # drop_duplicates keeps the last row of each (ticker, Date) pair.
    combined = pd.concat([hist, df], ignore_index=True)
    combined = combined.sort_values(["ticker", "Date"])
    combined = combined.drop_duplicates(subset=["ticker", "Date"], keep="last")

    # Compute spread_z on the full combined range.
    combined = get_spread_z(combined)

    # Merge only the computed column back onto the target-window rows.
    cols_to_merge = ["ticker", "Date", "spread_z"]
    out = df.merge(combined[cols_to_merge], on=["ticker", "Date"], how="left")

    # Final clean-up to guarantee a non-null spread_z in the target window:
    # 1) per-ticker forward-fill, 2) same-day cross-section median, 3) 0.0.
    # The forward-fill runs on a chronologically sorted view so that only
    # earlier dates are carried forward whatever row order the caller supplied;
    # reindex aligns the result back to the original row order.
    ffilled = (
        out.sort_values(["ticker", "Date"])
           .groupby("ticker")["spread_z"]
           .ffill()
           .reindex(out.index)
    )
    same_day_median = out.groupby("Date")["spread_z"].transform("median")
    out["spread_z"] = (
        ffilled.fillna(same_day_median)
               .fillna(0.0)
               .clip(-3, 3)
    )

    return out

In [97]:
# add_spread_z returns a new frame, so rebinding df is what makes
# the spread_z column available to the cells below.
df = add_spread_z(df)

# Final Complete Dataset

In [98]:
# Write the assembled frame without the index column.
# NOTE(review): the path is relative to the current working directory, while the
# next cell reads /kaggle/working/final_dataset.csv — this assumes the notebook's
# working directory is /kaggle/working; confirm.
df.to_csv('final_dataset.csv', index=False)

In [99]:
# Read the saved file back through the same loader used for the input data;
# as the last expression of the cell, the resulting frame is displayed.
get_return_data("/kaggle/working/final_dataset.csv")

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,o2c_return,excess_return,VIX_Close,VIX_z,spread_z
0,2010-01-04 00:00:00+00:00,6.487649,6.520174,6.455732,6.505279,493729600,0.0,0.0,AAPL,0.002718,-0.005826,20.040001,-1.249591,-0.231385
1,2010-01-05 00:00:00+00:00,6.523214,6.553307,6.482178,6.516527,601904800,0.0,0.0,AAPL,-0.001025,-0.004292,19.350000,-1.314181,-0.015467
2,2010-01-06 00:00:00+00:00,6.516527,6.542364,6.406185,6.412873,552160000,0.0,0.0,AAPL,-0.015906,-0.017580,19.160000,-1.323599,0.871327
3,2010-01-07 00:00:00+00:00,6.436583,6.444183,6.354511,6.401018,477131200,0.0,0.0,AAPL,-0.005525,-0.011605,19.059999,-1.324017,1.082039
4,2010-01-08 00:00:00+00:00,6.392506,6.444181,6.354814,6.443573,447610800,0.0,0.0,AAPL,0.007989,0.002018,18.129999,-1.413993,0.266586
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21904,2018-12-21 00:00:00+00:00,219.550003,222.259995,207.369995,208.800003,8828100,0.0,0.0,ADBE,-0.048964,-0.024485,30.110001,2.909232,-1.825644
21905,2018-12-24 00:00:00+00:00,206.899994,212.580002,204.949997,205.160004,2940500,0.0,0.0,ADBE,-0.008410,0.011252,36.070000,3.000000,-2.249653
21906,2018-12-26 00:00:00+00:00,206.529999,222.960007,206.250000,222.949997,5897900,0.0,0.0,ADBE,0.079504,0.036236,30.410000,2.812017,-2.164573
21907,2018-12-27 00:00:00+00:00,219.990005,225.169998,214.160004,225.139999,3931100,0.0,0.0,ADBE,0.023410,0.000736,29.959999,2.675121,-2.151093


In [100]:
# Remove the cloned repository from the working directory once the dataset has been written.
!rm -rf /kaggle/working/multimodal-eq-sizing