In [1]:
!rm -rf multimodal-eq-sizing
!git clone https://github.com/brianrp09232000/multimodal-eq-sizing.git
!pip install -r multimodal-eq-sizing/requirements.txt

Cloning into 'multimodal-eq-sizing'...
remote: Enumerating objects: 908, done.[K
remote: Counting objects: 100% (243/243), done.[K
remote: Compressing objects: 100% (142/142), done.[K
remote: Total 908 (delta 173), reused 108 (delta 101), pack-reused 665 (from 3)[K
Receiving objects: 100% (908/908), 2.12 MiB | 14.20 MiB/s, done.
Resolving deltas: 100% (555/555), done.
Collecting yfinance==0.2.66 (from -r multimodal-eq-sizing/requirements.txt (line 1))
  Downloading yfinance-0.2.66-py2.py3-none-any.whl.metadata (6.0 kB)
Collecting datetime (from -r multimodal-eq-sizing/requirements.txt (line 4))
  Downloading datetime-6.0-py3-none-any.whl.metadata (34 kB)
Collecting dataclasses (from -r multimodal-eq-sizing/requirements.txt (line 5))
  Downloading dataclasses-0.6-py3-none-any.whl.metadata (3.0 kB)
Collecting typing (from -r multimodal-eq-sizing/requirements.txt (line 6))
  Downloading typing-3.7.4.3.tar.gz (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
import sys
import pathlib
import pandas as pd
import numpy as np
import kagglehub
from datetime import datetime, timedelta

In [3]:
np.seterr(invalid="ignore")

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [4]:
# Uses the current directory where the notebook is running
repo_root = pathlib.Path("multimodal-eq-sizing")
sys.path.append(str(repo_root.resolve())) # .resolve() gets the full absolute path locally

In [5]:
from src.models.calibrators import IsotonicCalibrator
from src.models.aggregator import AggregatorParams, apply_aggregator

In [6]:
def make_dummy_predictions_df(n_rows: int = 10_000) -> pd.DataFrame:
    """
    Create a dummy dataset with columns:
        - date
        - ticker
        - target
        - pred
        - news_flag

    Rows are generated by repeating (date, ticker) combinations until
    n_rows is reached.
    """
    n_tickers = 200
    start_date: str = "2013-01-01"

    rng = np.random.default_rng()

    tickers = [f"T{i:03d}" for i in range(n_tickers)]

    # Figure out how many dates we need to get at least n_rows combos
    n_days = int(np.ceil(n_rows / n_tickers))

    dates = pd.date_range(start_date, periods=n_days, freq="D")

    # Create cartesian product of dates × tickers, then trim to n_rows
    date_vals = np.repeat(dates, n_tickers)[:n_rows]
    ticker_vals = np.tile(tickers, n_days)[:n_rows]

    # Create targets, predictions and flags
    target = rng.normal(loc=0.0, scale=0.02, size=n_rows)
    pred = target + rng.normal(loc=0.0, scale=0.02, size=n_rows)
    vol_realized_20d = rng.normal(loc=0.0, scale=0.02, size=n_rows)
    flag = np.random.randint(2, size=n_rows)

    df = pd.DataFrame({
        "date": date_vals,
        "ticker": ticker_vals,
        "target": target,
        "pred": pred,
        "news_flag": flag,
        "vol_realized_20d": vol_realized_20d
    })

    return df

In [7]:
def get_predictions(handle, filename, mock_data=False): 
    if mock_data:
        df = make_dummy_predictions_df()
        local_dataset_dir = f"/kaggle/working/{filename}"
        os.makedirs(local_dataset_dir, exist_ok=True)
        df.to_csv(f"{local_dataset_dir}/{filename}.csv", index=False)
        current_date = datetime.today().strftime("%Y-%m-%d")
        kagglehub.dataset_upload(handle, local_dataset_dir, version_notes= f"Dataset {current_date}")

    dir_path = kagglehub.dataset_download(handle, force_download=True)
    path = os.path.join(dir_path, filename + ".csv")
    return pd.read_csv(path)

In [8]:
leg1_df = get_predictions("rocklaura/multimodal-eq-sizing-norm-final",
                          "price_model_test_predicted")
leg2_df = get_predictions("brian2000/multimodal-eq-sizing-leg2",
                          "leg2_oof_preds")

In [9]:
leg1_df.rename(columns={"y_pred":"pred",
                        "news flag":"news_flag",
                        "next_day_excess_return": "target"}, inplace=True)
leg2_df.rename(columns={"prediction":"pred"}, inplace=True)

In [10]:
df = pd.merge(leg1_df,
              leg2_df[["Date", "ticker", 'pred']],
              on=["Date", "ticker"],
              how='left',
              suffixes=('_leg1', '_leg2'))

In [11]:
df.drop('Unnamed: 0', axis=1, inplace=True)

In [12]:
df_train = df[df["split"].isin(["train", "val"])].copy()
df_test  = df[df["split"] == "test"].copy()

In [13]:
leg1_calibrator = IsotonicCalibrator().fit(df_train['pred_leg1'].to_numpy(),
                                           df_train['target'].to_numpy())
leg2_calibrator = IsotonicCalibrator().fit(df_train['pred_leg2'].to_numpy(),
                                           df_train['target'].to_numpy())

In [14]:
df_train["r_px_cal"] = leg1_calibrator.predict(df_train["pred_leg1"])
df_train["r_news_cal"] = leg2_calibrator.predict(df_train["pred_leg2"])
df_train["pred_source"] = "oof"

In [15]:
df_test["r_px_cal"] = leg1_calibrator.predict(df_test["pred_leg1"])
df_test["r_news_cal"] = leg2_calibrator.predict(df_test["pred_leg2"])
df_test["pred_source"] = "test"

In [16]:
calibrated_df = pd.concat([df_train, df_test])
calibrated_df = apply_aggregator(calibrated_df)
agg_calibrator = IsotonicCalibrator().fit(calibrated_df['r_tilde'].to_numpy(),
                                          calibrated_df['target'].to_numpy())
calibrated_df["r_hat"] = agg_calibrator.predict(calibrated_df["r_tilde"])
calibrated_df["z"] = calibrated_df["r_hat"].clip(-0.005, 0.005) / df["vol_realized_20d"]
calibrated_df

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,sector,...,entities_today,pred_leg1,pred_leg2,r_px_cal,r_news_cal,pred_source,r_tilde,disagreement,r_hat,z
0,2012-01-03 00:00:00+00:00,67.374450,68.372319,67.374450,67.854904,12249842.0,0.0,0.0,GE,Industrials,...,,-0.000086,-0.000000,0.000036,0.000025,oof,0.000036,0.000000,0.000084,0.004796
1,2012-01-04 00:00:00+00:00,67.670155,68.741940,67.559285,68.594109,8921285.0,0.0,0.0,GE,Industrials,...,,-0.000086,-0.000000,0.000036,0.000025,oof,0.000036,0.000000,0.000084,0.004815
2,2012-01-05 00:00:00+00:00,68.224468,68.741882,67.596184,68.557091,9560730.0,0.0,0.0,GE,Industrials,...,,-0.000086,-0.000000,0.000036,0.000025,oof,0.000036,0.000000,0.000084,0.004925
3,2012-01-06 00:00:00+00:00,69.222359,69.444105,68.594074,68.926697,10656248.0,0.0,0.0,GE,Industrials,...,,-0.000086,-0.000000,0.000036,0.000025,oof,0.000036,0.000000,0.000084,0.004935
4,2012-01-09 00:00:00+00:00,69.259304,69.776712,68.594059,69.702797,9804489.0,0.0,0.0,GE,Industrials,...,,-0.000086,-0.000000,0.000036,0.000025,oof,0.000036,0.000000,0.000084,0.005487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
436910,2018-12-21 00:00:00+00:00,17.770000,17.950001,16.850000,16.959999,9401000.0,0.0,0.0,UAA,Consumer Cyclical,...,"['lem', 'lulu', 'nike', 'on']",-0.000063,-0.000173,0.000036,-0.000185,test,-0.000066,0.000221,-0.000202,-0.005395
436911,2018-12-21 00:00:00+00:00,17.770000,17.950001,16.850000,16.959999,9401000.0,0.0,0.0,UAA,Consumer Cyclical,...,"['lem', 'lulu', 'nike', 'on']",-0.000063,-0.000023,0.000036,0.000025,test,0.000031,0.000010,-0.000076,-0.002043
436912,2018-12-24 00:00:00+00:00,16.740000,17.360001,16.570000,16.750000,2688600.0,0.0,0.0,UAA,Consumer Cyclical,...,,-0.000063,0.000000,0.000036,0.000025,test,0.000036,0.000000,0.000084,0.002245
436913,2018-12-26 00:00:00+00:00,16.790001,17.840000,16.520000,17.820000,3966400.0,0.0,0.0,UAA,Consumer Cyclical,...,[],-0.000063,-0.000123,0.000036,0.000025,test,0.000031,0.000010,-0.000076,-0.001920


In [17]:
handle = "brian2000/calibrated-predictions"
local_dataset_dir = f"/kaggle/working/datasets"
os.makedirs(local_dataset_dir, exist_ok=True)
calibrated_df.to_csv(f"{local_dataset_dir}/calibrated_predictions.csv", index=False)
current_date = datetime.today().strftime("%Y-%m-%d")
kagglehub.dataset_upload(handle, local_dataset_dir, version_notes= f"Dataset {current_date}")

Uploading Dataset https://www.kaggle.com/datasets/brian2000/calibrated-predictions ...
Starting upload for file /kaggle/working/datasets/calibrated_predictions.csv


Uploading: 100%|██████████| 3.59G/3.59G [00:36<00:00, 99.5MB/s]

Upload successful: /kaggle/working/datasets/calibrated_predictions.csv (3GB)





Your dataset has been created.
Files are being processed...
See at: https://www.kaggle.com/datasets/brian2000/calibrated-predictions
