In [1]:
!rm -rf multimodal-eq-sizing
!git clone https://github.com/brianrp09232000/multimodal-eq-sizing.git
!pip install -r multimodal-eq-sizing/requirements.txt

Cloning into 'multimodal-eq-sizing'...
remote: Enumerating objects: 866, done.[K
remote: Counting objects: 100% (195/195), done.[K
remote: Compressing objects: 100% (101/101), done.[K
remote: Total 866 (delta 143), reused 95 (delta 94), pack-reused 671 (from 3)[K
Receiving objects: 100% (866/866), 995.38 KiB | 5.21 MiB/s, done.
Resolving deltas: 100% (528/528), done.
Collecting yfinance==0.2.66 (from -r multimodal-eq-sizing/requirements.txt (line 1))
  Downloading yfinance-0.2.66-py2.py3-none-any.whl.metadata (6.0 kB)
Collecting datetime (from -r multimodal-eq-sizing/requirements.txt (line 4))
  Downloading datetime-6.0-py3-none-any.whl.metadata (34 kB)
Collecting dataclasses (from -r multimodal-eq-sizing/requirements.txt (line 5))
  Downloading dataclasses-0.6-py3-none-any.whl.metadata (3.0 kB)
Collecting typing (from -r multimodal-eq-sizing/requirements.txt (line 6))
  Downloading typing-3.7.4.3.tar.gz (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
import sys
import pathlib
import pandas as pd
import numpy as np
import kagglehub
from datetime import datetime, timedelta

In [3]:
np.seterr(invalid="ignore")

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [4]:
# Uses the current directory where the notebook is running
repo_root = pathlib.Path("multimodal-eq-sizing")
sys.path.append(str(repo_root.resolve())) # .resolve() gets the full absolute path locally

In [5]:
from src.models.calibrators import IsotonicCalibrator
from src.models.aggregator import AggregatorParams, apply_aggregator

In [6]:
def make_dummy_predictions_df(n_rows: int = 10_000) -> pd.DataFrame:
    """
    Create a dummy dataset with columns:
        - date
        - ticker
        - target
        - pred
        - news_flag

    Rows are generated by repeating (date, ticker) combinations until
    n_rows is reached.
    """
    n_tickers = 200
    start_date: str = "2013-01-01"

    rng = np.random.default_rng()

    tickers = [f"T{i:03d}" for i in range(n_tickers)]

    # Figure out how many dates we need to get at least n_rows combos
    n_days = int(np.ceil(n_rows / n_tickers))

    dates = pd.date_range(start_date, periods=n_days, freq="D")

    # Create cartesian product of dates × tickers, then trim to n_rows
    date_vals = np.repeat(dates, n_tickers)[:n_rows]
    ticker_vals = np.tile(tickers, n_days)[:n_rows]

    # Create targets, predictions and flags
    target = rng.normal(loc=0.0, scale=0.02, size=n_rows)
    pred = target + rng.normal(loc=0.0, scale=0.02, size=n_rows)
    vol_realized_20d = rng.normal(loc=0.0, scale=0.02, size=n_rows)
    flag = np.random.randint(2, size=n_rows)

    df = pd.DataFrame({
        "date": date_vals,
        "ticker": ticker_vals,
        "target": target,
        "pred": pred,
        "news_flag": flag,
        "vol_realized_20d": vol_realized_20d
    })

    return df

In [7]:
def get_predictions(handle, filename, mock_data=False): 
    if mock_data:
        df = make_dummy_predictions_df()
        local_dataset_dir = f"/kaggle/working/{filename}"
        os.makedirs(local_dataset_dir, exist_ok=True)
        df.to_csv(f"{local_dataset_dir}/{filename}.csv", index=False)
        current_date = datetime.today().strftime("%Y-%m-%d")
        kagglehub.dataset_upload(handle, local_dataset_dir, version_notes= f"Dataset {current_date}")

    dir_path = kagglehub.dataset_download(handle, force_download=True)
    path = os.path.join(dir_path, filename + ".csv")
    return pd.read_csv(path)

In [8]:
leg1_df = get_predictions("brian2000/dummy-leg1", "leg1", True)
leg2_df = get_predictions("brian2000/dummy-leg2", "leg2", True)

Uploading Dataset https://www.kaggle.com/datasets/brian2000/dummy-leg1 ...
Starting upload for file /kaggle/working/leg1/leg1.csv


Uploading: 100%|██████████| 819k/819k [00:02<00:00, 400kB/s] 

Upload successful: /kaggle/working/leg1/leg1.csv (800KB)





Your dataset has been created.
Files are being processed...
See at: https://www.kaggle.com/datasets/brian2000/dummy-leg1
Uploading Dataset https://www.kaggle.com/datasets/brian2000/dummy-leg2 ...
Starting upload for file /kaggle/working/leg2/leg2.csv


Uploading: 100%|██████████| 819k/819k [00:01<00:00, 549kB/s]

Upload successful: /kaggle/working/leg2/leg2.csv (800KB)





Your dataset has been created.
Files are being processed...
See at: https://www.kaggle.com/datasets/brian2000/dummy-leg2


In [9]:
df = pd.merge(leg1_df,
              leg2_df[["date", "ticker", 'pred']],
              on=["date", "ticker"],
              suffixes=('_leg1', '_leg2'))

In [10]:
df_train = df[df["date"] < '2013-02-01'].copy()
df_test  = df[df["date"] >= '2013-02-01'].copy()

In [11]:
leg1_calibrator = IsotonicCalibrator().fit(df_train['pred_leg1'].to_numpy(),
                                           df_train['target'].to_numpy())
leg2_calibrator = IsotonicCalibrator().fit(df_train['pred_leg2'].to_numpy(),
                                           df_train['target'].to_numpy())

In [12]:
df_train["r_px_cal"] = leg1_calibrator.predict(df_train["pred_leg1"])
df_train["r_news_cal"] = leg2_calibrator.predict(df_train["pred_leg2"])
df_train["pred_source"] = "oof"

In [13]:
df_test["r_px_cal"] = leg1_calibrator.predict(df_test["pred_leg1"])
df_test["r_news_cal"] = leg2_calibrator.predict(df_test["pred_leg2"])
df_test["pred_source"] = "test"

In [14]:
calibrated_df = pd.concat([df_train, df_test])
calibrated_df = apply_aggregator(calibrated_df)
agg_calibrator = IsotonicCalibrator().fit(calibrated_df['r_tilde'].to_numpy(),
                                          calibrated_df['target'].to_numpy())
calibrated_df["r_hat"] = agg_calibrator.predict(calibrated_df["r_tilde"])
calibrated_df["z"] = calibrated_df["r_hat"].clip(-0.005, 0.005) / df["vol_realized_20d"]
calibrated_df

Unnamed: 0,date,ticker,target,pred_leg1,news_flag,vol_realized_20d,pred_leg2,r_px_cal,r_news_cal,pred_source,r_tilde,disagreement,r_hat,z
0,2013-01-01,T000,-0.012136,-0.041669,1,0.010233,0.019837,-0.020685,0.000092,oof,-0.000009,0.020778,-0.010307,-0.488604
1,2013-01-01,T001,-0.011064,-0.032377,0,0.035610,0.060542,-0.015156,0.001123,oof,-0.015156,0.000000,-0.013846,-0.140410
2,2013-01-01,T002,0.022297,0.018790,1,0.023563,0.024538,0.008935,0.000092,oof,0.000021,0.008842,0.007634,0.212197
3,2013-01-01,T003,0.036338,0.029654,0,0.023873,0.003472,0.014516,0.000092,oof,0.014516,0.000000,0.015192,0.209439
4,2013-01-01,T004,-0.027452,-0.048634,0,0.007031,0.020133,-0.026518,0.000092,oof,-0.026518,0.000000,-0.026012,-0.711107
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2013-02-19,T195,-0.006457,-0.019360,0,-0.012315,-0.001846,-0.009630,0.000092,test,-0.009630,0.000000,-0.010307,0.405998
9996,2013-02-19,T196,-0.054167,-0.067117,1,0.001298,0.023463,-0.035123,0.000092,test,-0.000005,0.035216,-0.010307,-3.852021
9997,2013-02-19,T197,0.008144,-0.012173,1,0.013122,-0.023966,-0.006395,-0.000458,test,-0.000035,0.005938,-0.010307,-0.381028
9998,2013-02-19,T198,-0.025550,0.005493,0,-0.007238,0.014994,0.003857,0.000092,test,0.003857,0.000000,0.007634,-0.690841


In [15]:
handle = "brian2000/calibrated-predictions"
local_dataset_dir = f"/kaggle/working/datasets"
os.makedirs(local_dataset_dir, exist_ok=True)
calibrated_df.to_csv(f"{local_dataset_dir}/calibrated_predictions.csv", index=False)
current_date = datetime.today().strftime("%Y-%m-%d")
kagglehub.dataset_upload(handle, local_dataset_dir, version_notes= f"Dataset {current_date}")

Uploading Dataset https://www.kaggle.com/datasets/brian2000/calibrated-predictions ...
Starting upload for file /kaggle/working/datasets/calibrated_predictions.csv


Uploading: 100%|██████████| 2.19M/2.19M [00:01<00:00, 1.19MB/s]

Upload successful: /kaggle/working/datasets/calibrated_predictions.csv (2MB)





Your dataset has been created.
Files are being processed...
See at: https://www.kaggle.com/datasets/brian2000/calibrated-predictions
