In [1]:
!rm -rf multimodal-eq-sizing
!git clone https://github.com/brianrp09232000/multimodal-eq-sizing.git
!pip install -r multimodal-eq-sizing/requirements.txt

Cloning into 'multimodal-eq-sizing'...
remote: Enumerating objects: 862, done.[K
remote: Counting objects: 100% (181/181), done.[K
remote: Compressing objects: 100% (87/87), done.[K
remote: Total 862 (delta 133), reused 95 (delta 94), pack-reused 681 (from 3)[K
Receiving objects: 100% (862/862), 986.42 KiB | 8.81 MiB/s, done.
Resolving deltas: 100% (528/528), done.
Collecting yfinance==0.2.66 (from -r multimodal-eq-sizing/requirements.txt (line 1))
  Downloading yfinance-0.2.66-py2.py3-none-any.whl.metadata (6.0 kB)
Collecting datetime (from -r multimodal-eq-sizing/requirements.txt (line 4))
  Downloading datetime-6.0-py3-none-any.whl.metadata (34 kB)
Collecting dataclasses (from -r multimodal-eq-sizing/requirements.txt (line 5))
  Downloading dataclasses-0.6-py3-none-any.whl.metadata (3.0 kB)
Collecting typing (from -r multimodal-eq-sizing/requirements.txt (line 6))
  Downloading typing-3.7.4.3.tar.gz (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0

In [2]:
import os
import sys
import pathlib
import pandas as pd
import numpy as np
import kagglehub
from datetime import datetime, timedelta

In [3]:
np.seterr(invalid="ignore")

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [4]:
# Uses the current directory where the notebook is running
repo_root = pathlib.Path("multimodal-eq-sizing")
sys.path.append(str(repo_root.resolve())) # .resolve() gets the full absolute path locally

In [5]:
from src.models.calibrators import IsotonicCalibrator
from src.models.aggregator import AggregatorParams, apply_aggregator

In [6]:
def make_dummy_predictions_df(n_rows: int = 10_000) -> pd.DataFrame:
    """
    Create a dummy dataset with columns:
        - date
        - ticker
        - target
        - pred
        - news_flag

    Rows are generated by repeating (date, ticker) combinations until
    n_rows is reached.
    """
    n_tickers = 200
    start_date: str = "2013-01-01"

    rng = np.random.default_rng()

    tickers = [f"T{i:03d}" for i in range(n_tickers)]

    # Figure out how many dates we need to get at least n_rows combos
    n_days = int(np.ceil(n_rows / n_tickers))

    dates = pd.date_range(start_date, periods=n_days, freq="D")

    # Create cartesian product of dates × tickers, then trim to n_rows
    date_vals = np.repeat(dates, n_tickers)[:n_rows]
    ticker_vals = np.tile(tickers, n_days)[:n_rows]

    # Create targets, predictions and flags
    target = rng.normal(loc=0.0, scale=0.02, size=n_rows)
    pred = target + rng.normal(loc=0.0, scale=0.02, size=n_rows)
    flag = np.random.randint(2, size=n_rows)

    df = pd.DataFrame({
        "date": date_vals,
        "ticker": ticker_vals,
        "target": target,
        "pred": pred,
        "news_flag": flag
    })

    return df

In [7]:
def get_predictions(handle, filename, mock_data=False): 
    if mock_data:
        df = make_dummy_predictions_df()
        local_dataset_dir = f"/kaggle/working/{filename}"
        os.makedirs(local_dataset_dir, exist_ok=True)
        df.to_csv(f"{local_dataset_dir}/{filename}.csv", index=False)
        current_date = datetime.today().strftime("%Y-%m-%d")
        kagglehub.dataset_upload(handle, local_dataset_dir, version_notes= f"Dataset {current_date}")

    dir_path = kagglehub.dataset_download(handle, force_download=True)
    path = os.path.join(dir_path, filename + ".csv")
    return pd.read_csv(path)

In [8]:
leg1_df = get_predictions("brian2000/dummy-leg1", "leg1", True)
leg2_df = get_predictions("brian2000/dummy-leg2", "leg2", True)

Uploading Dataset https://www.kaggle.com/datasets/brian2000/dummy-leg1 ...
Starting upload for file /kaggle/working/leg1/leg1.csv


Uploading: 100%|██████████| 606k/606k [00:00<00:00, 1.35MB/s]

Upload successful: /kaggle/working/leg1/leg1.csv (592KB)





Your dataset has been created.
Files are being processed...
See at: https://www.kaggle.com/datasets/brian2000/dummy-leg1
Uploading Dataset https://www.kaggle.com/datasets/brian2000/dummy-leg2 ...
Starting upload for file /kaggle/working/leg2/leg2.csv


Uploading: 100%|██████████| 605k/605k [00:00<00:00, 1.48MB/s]

Upload successful: /kaggle/working/leg2/leg2.csv (591KB)





Your dataset has been created.
Files are being processed...
See at: https://www.kaggle.com/datasets/brian2000/dummy-leg2


In [9]:
df = pd.merge(leg1_df,
              leg2_df[["date", "ticker", 'pred']],
              on=["date", "ticker"],
              suffixes=('_leg1', '_leg2'))
df

Unnamed: 0,date,ticker,target,pred_leg1,news_flag,pred_leg2
0,2013-01-01,T000,0.026605,0.044675,1,0.029977
1,2013-01-01,T001,0.002245,0.015173,0,-0.028340
2,2013-01-01,T002,0.003168,0.003060,1,-0.027206
3,2013-01-01,T003,-0.049775,-0.068806,0,0.013270
4,2013-01-01,T004,0.012393,0.029545,1,-0.027736
...,...,...,...,...,...,...
9995,2013-02-19,T195,-0.006433,0.003279,0,-0.044346
9996,2013-02-19,T196,-0.031526,-0.021167,0,0.004953
9997,2013-02-19,T197,0.042583,0.013798,0,-0.072039
9998,2013-02-19,T198,-0.016464,-0.001605,1,0.053259


In [10]:
df_train = df[df["date"] < '2013-02-01'].copy()
df_test  = df[df["date"] >= '2013-02-01'].copy()

In [11]:
leg1_calibrator = IsotonicCalibrator().fit(df_train['pred_leg1'].to_numpy(),
                                           df_train['target'].to_numpy())
leg2_calibrator = IsotonicCalibrator().fit(df_train['pred_leg2'].to_numpy(),
                                           df_train['target'].to_numpy())

In [12]:
df_train["r_px_cal"] = leg1_calibrator.predict(df_train["pred_leg1"])
df_train["r_news_cal"] = leg2_calibrator.predict(df_train["pred_leg2"])
df_train["pred_source"] = "oof"

In [13]:
df_test["r_px_cal"] = leg1_calibrator.predict(df_test["pred_leg1"])
df_test["r_news_cal"] = leg2_calibrator.predict(df_test["pred_leg2"])
df_test["pred_source"] = "test"

In [14]:
calibrated_df = pd.concat([df_train, df_test])
calibrated_df

Unnamed: 0,date,ticker,target,pred_leg1,news_flag,pred_leg2,r_px_cal,r_news_cal,pred_source
0,2013-01-01,T000,0.026605,0.044675,1,0.029977,0.023104,0.000062,oof
1,2013-01-01,T001,0.002245,0.015173,0,-0.028340,0.008916,0.000062,oof
2,2013-01-01,T002,0.003168,0.003060,1,-0.027206,0.000172,0.000062,oof
3,2013-01-01,T003,-0.049775,-0.068806,0,0.013270,-0.034056,0.000062,oof
4,2013-01-01,T004,0.012393,0.029545,1,-0.027736,0.015235,0.000062,oof
...,...,...,...,...,...,...,...,...,...
9995,2013-02-19,T195,-0.006433,0.003279,0,-0.044346,0.000172,0.000062,test
9996,2013-02-19,T196,-0.031526,-0.021167,0,0.004953,-0.011831,0.000062,test
9997,2013-02-19,T197,0.042583,0.013798,0,-0.072039,0.006643,-0.001232,test
9998,2013-02-19,T198,-0.016464,-0.001605,1,0.053259,-0.001153,0.000062,test


In [15]:
calibrated_df = apply_aggregator(calibrated_df)
calibrated_df

Unnamed: 0,date,ticker,target,pred_leg1,news_flag,pred_leg2,r_px_cal,r_news_cal,pred_source,r_tilde,disagreement
0,2013-01-01,T000,0.026605,0.044675,1,0.029977,0.023104,0.000062,oof,0.000008,0.023043
1,2013-01-01,T001,0.002245,0.015173,0,-0.028340,0.008916,0.000062,oof,0.008916,0.000000
2,2013-01-01,T002,0.003168,0.003060,1,-0.027206,0.000172,0.000062,oof,0.000113,0.000110
3,2013-01-01,T003,-0.049775,-0.068806,0,0.013270,-0.034056,0.000062,oof,-0.034056,0.000000
4,2013-01-01,T004,0.012393,0.029545,1,-0.027736,0.015235,0.000062,oof,0.000012,0.015173
...,...,...,...,...,...,...,...,...,...,...,...
9995,2013-02-19,T195,-0.006433,0.003279,0,-0.044346,0.000172,0.000062,test,0.000172,0.000000
9996,2013-02-19,T196,-0.031526,-0.021167,0,0.004953,-0.011831,0.000062,test,-0.011831,0.000000
9997,2013-02-19,T197,0.042583,0.013798,0,-0.072039,0.006643,-0.001232,test,0.006643,0.000000
9998,2013-02-19,T198,-0.016464,-0.001605,1,0.053259,-0.001153,0.000062,test,-0.000107,0.001215


In [16]:
agg_calibrator = IsotonicCalibrator().fit(calibrated_df['r_tilde'].to_numpy(),
                                          calibrated_df['target'].to_numpy())

In [17]:
calibrated_df["r_hat"] = agg_calibrator.predict(calibrated_df["r_tilde"])
calibrated_df

Unnamed: 0,date,ticker,target,pred_leg1,news_flag,pred_leg2,r_px_cal,r_news_cal,pred_source,r_tilde,disagreement,r_hat
0,2013-01-01,T000,0.026605,0.044675,1,0.029977,0.023104,0.000062,oof,0.000008,0.023043,0.009408
1,2013-01-01,T001,0.002245,0.015173,0,-0.028340,0.008916,0.000062,oof,0.008916,0.000000,0.009408
2,2013-01-01,T002,0.003168,0.003060,1,-0.027206,0.000172,0.000062,oof,0.000113,0.000110,0.009408
3,2013-01-01,T003,-0.049775,-0.068806,0,0.013270,-0.034056,0.000062,oof,-0.034056,0.000000,-0.033652
4,2013-01-01,T004,0.012393,0.029545,1,-0.027736,0.015235,0.000062,oof,0.000012,0.015173,0.009408
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2013-02-19,T195,-0.006433,0.003279,0,-0.044346,0.000172,0.000062,test,0.000172,0.000000,0.009408
9996,2013-02-19,T196,-0.031526,-0.021167,0,0.004953,-0.011831,0.000062,test,-0.011831,0.000000,-0.012545
9997,2013-02-19,T197,0.042583,0.013798,0,-0.072039,0.006643,-0.001232,test,0.006643,0.000000,0.009408
9998,2013-02-19,T198,-0.016464,-0.001605,1,0.053259,-0.001153,0.000062,test,-0.000107,0.001215,-0.009352
