In [1]:
!rm -rf /kaggle/working/multimodal-eq-sizing
!git clone -b feature/add_rl_part https://github.com/brianrp09232000/multimodal-eq-sizing.git /kaggle/working/multimodal-eq-sizing
!pip install -r /kaggle/working/multimodal-eq-sizing/requirements.txt

Cloning into '/kaggle/working/multimodal-eq-sizing'...
remote: Enumerating objects: 540, done.[K
remote: Counting objects: 100% (160/160), done.[K
remote: Compressing objects: 100% (122/122), done.[K
remote: Total 540 (delta 91), reused 50 (delta 37), pack-reused 380 (from 3)[K
Receiving objects: 100% (540/540), 233.81 KiB | 9.74 MiB/s, done.
Resolving deltas: 100% (301/301), done.
Collecting yfinance==0.2.66 (from -r /kaggle/working/multimodal-eq-sizing/requirements.txt (line 1))
  Downloading yfinance-0.2.66-py2.py3-none-any.whl.metadata (6.0 kB)
Collecting datetime (from -r /kaggle/working/multimodal-eq-sizing/requirements.txt (line 4))
  Downloading datetime-6.0-py3-none-any.whl.metadata (34 kB)
Collecting dataclasses (from -r /kaggle/working/multimodal-eq-sizing/requirements.txt (line 5))
  Downloading dataclasses-0.6-py3-none-any.whl.metadata (3.0 kB)
Collecting typing (from -r /kaggle/working/multimodal-eq-sizing/requirements.txt (line 6))
  Downloading typing-

In [2]:
import sys
import pathlib
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import tensorflow as tf

2025-12-01 23:14:37.182180: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764630877.431138      20 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764630877.499170      20 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Silence NumPy's "invalid value" floating-point warnings for the rest of the session.
# np.seterr returns the previous settings, which is what this cell displays.
np.seterr(invalid="ignore")

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [4]:
# Location of the cloned project; putting it on sys.path makes the `src.*`
# packages importable from this notebook.
repo_root = pathlib.Path("/kaggle/working/multimodal-eq-sizing")
# Guard the append so re-running this cell does not pile up duplicate entries.
if str(repo_root) not in sys.path:
    sys.path.append(str(repo_root))

In [5]:
from src.backtest.behavior_policy import run_behavior_policy_with_guards
from src.rl.state_builder import ensure_z_column, build_rl_dataset
from src.rl.cql_agent import CQLAgentTF, CQLConfig

In [6]:
# 1) Load final stacked dataset from 03_calibrate_and_stack
# NOTE(review): the path below reads from the "01-prepare-data" input, not
# "03_calibrate_and_stack" — confirm which one is the intended source.
final_path = "/kaggle/input/01-prepare-data/final_dataset.csv"
# The "Date" column is parsed into datetimes while reading.
df = pd.read_csv(final_path, parse_dates=["Date"])

In [7]:
# 2) Make sure the frame carries a z column (a dummy one for now)
z_col = "dummy_z"
df = ensure_z_column(
    df,
    z_col=z_col,
    seed=42,
)

Using randomly generated dummy z column


In [8]:
# 3) Run dumb behavior policy + guards
# The signal column is taken from `z_col` (set where the z column is ensured)
# instead of repeating the literal, so switching to the real alpha column
# later is a single change in one place.
df_policy = run_behavior_policy_with_guards(
    df,
    z_col=z_col,        # currently "dummy_z"; becomes "z" once real alpha exists
    nav=1_000_000.0,
    allow_short=False,
    trading_enabled=True,
)

In [9]:
# 4) Build RL training dataset (s, a, r, s')
# NOTE(review): extra_state_cols is defined here but NOT passed to the builder
# (the call uses None). Pass it only once these columns exist in df_policy;
# doing so changes the state dimension seen by the agent.
extra_state_cols = ["has_news", "disagreement"]
rl_df = build_rl_dataset(
    df_policy,
    z_col=z_col,  # same signal column the behavior policy was run on
    reward_return_col="excess_return",
    action_col="action_weight_raw",
    weight_col="weight_after_guards",
    extra_state_cols=None,  # only if those exist
    lambda_risk=0.1,
)
# Preview the first transitions.
rl_df.head()

Unnamed: 0,Date,ticker,reward,action,done,state_dummy_z,state_VIX_z,state_spread_z,state_weight_after_guards,next_state_dummy_z,next_state_VIX_z,next_state_spread_z,next_state_weight_after_guards
0,2010-01-04 00:00:00+00:00,AAPL,-0.0,0.0,0,0.304717,-1.249591,-0.231379,0.0,-1.039984,-1.314181,-0.015455,0.0
8,2010-01-05 00:00:00+00:00,AAPL,-0.0,0.0,0,-1.039984,-1.314181,-0.015455,0.0,0.750451,-1.323599,0.871336,0.0
16,2010-01-06 00:00:00+00:00,AAPL,-0.0,0.0,0,0.750451,-1.323599,0.871336,0.0,0.940565,-1.324017,1.082035,0.006999
24,2010-01-07 00:00:00+00:00,AAPL,2e-06,0.01,0,0.940565,-1.324017,1.082035,0.006999,-1.951035,-1.413993,0.266584,0.0
32,2010-01-08 00:00:00+00:00,AAPL,-7e-06,0.0,0,-1.951035,-1.413993,0.266584,0.0,-1.30218,-1.465035,0.106547,0.0


In [10]:
#5) Extract state / next_state matrices and targets
state_cols = [c for c in rl_df.columns if c.startswith("state_")]
next_state_cols = [c for c in rl_df.columns if c.startswith("next_state_")]

# s and s' are fed through the same network, so every state feature needs its
# "next_" twin at the same position; fail loudly if the layouts diverge.
expected_next_state_cols = [f"next_{c}" for c in state_cols]
if next_state_cols != expected_next_state_cols:
    raise ValueError(
        "state / next_state columns are misaligned: "
        f"{state_cols} vs {next_state_cols}"
    )

states = rl_df[state_cols].to_numpy().astype(np.float32)          # [N, state_dim]
next_states = rl_df[next_state_cols].to_numpy().astype(np.float32)

rewards = rl_df["reward"].to_numpy().astype(np.float32).reshape(-1, 1)  # [N,1]
dones = rl_df["done"].to_numpy().astype(np.float32).reshape(-1, 1)

# action is in weight units {0.0, 0.01, 0.02}. Map to indices {0,1,2}.
actions_w = rl_df["action"].to_numpy().astype(np.float32)
actions_idx = np.round(actions_w * 100).astype(np.int32).reshape(-1, 1)
# (0.00 -> 0, 0.01 -> 1, 0.02 -> 2)

state_dim = states.shape[1]
n_actions = 3

# Validate the mapping: a NaN or off-grid weight would otherwise turn into an
# out-of-range index without any warning and corrupt training.
if actions_idx.size and (actions_idx.min() < 0 or actions_idx.max() >= n_actions):
    raise ValueError(
        f"action indices must lie in [0, {n_actions - 1}], "
        f"got range [{actions_idx.min()}, {actions_idx.max()}]"
    )

In [11]:
batch_size = 1024

# Input pipeline: slice the aligned transition arrays row by row, shuffle with
# a buffer spanning the whole table (reshuffled on every pass), then batch.
dataset = (
    tf.data.Dataset.from_tensor_slices(
        (states, actions_idx, rewards, next_states, dones)
    )
    .shuffle(buffer_size=len(rl_df), reshuffle_each_iteration=True)
    .batch(batch_size)
)

I0000 00:00:1764630918.336622      20 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


In [12]:
#6) Training loop
# Hyperparameters for the CQL agent; n_actions comes from the action-index mapping.
config = CQLConfig(
    gamma=0.99,
    alpha=1.0,
    lr=1e-3,
    tau=0.005,
    n_actions=n_actions,
    hidden_dim=128,
)

agent = CQLAgentTF(state_dim=state_dim, config=config)

epochs = 100
steps_per_epoch = 200  # max batches per epoch; or len(rl_df) // batch_size

for epoch in range(epochs):
    # Running sums of each metric; turned into per-batch means after the epoch.
    logs_epoch = {"loss": 0.0, "td_loss": 0.0, "cql_loss": 0.0}
    n_batches = 0

    for step, (s_b, a_b, r_b, ns_b, d_b) in enumerate(dataset):
        metrics = agent.train_step(s_b, a_b, r_b, ns_b, d_b)
        for k in logs_epoch:
            logs_epoch[k] += float(metrics[k])
        n_batches += 1

        # Stop once exactly `steps_per_epoch` batches have been processed.
        # (Testing `step >= steps_per_epoch` here ran one batch too many,
        # because `step` is zero-based and the check comes after the update.)
        if n_batches >= steps_per_epoch:
            break

    # NOTE(review): the target network is refreshed once per epoch; with
    # tau=0.005 this looks like a soft update, which is often applied per
    # step — confirm against CQLAgentTF.update_target / train_step.
    agent.update_target()

    # max(..., 1) guards against an empty dataset (zero batches).
    for k in logs_epoch:
        logs_epoch[k] /= max(n_batches, 1)

    print(
        f"Epoch {epoch:03d}: "
        f"loss={logs_epoch['loss']:.6f}, "
        f"td={logs_epoch['td_loss']:.6f}, "
        f"cql={logs_epoch['cql_loss']:.6f}"
    )

Epoch 000: loss=0.866396, td=0.059742, cql=0.806653
Epoch 001: loss=0.634317, td=0.070626, cql=0.563691
Epoch 002: loss=0.525236, td=0.055762, cql=0.469475
Epoch 003: loss=0.476231, td=0.050309, cql=0.425922
Epoch 004: loss=0.452569, td=0.048976, cql=0.403593
Epoch 005: loss=0.441939, td=0.051840, cql=0.390099
Epoch 006: loss=0.434090, td=0.049249, cql=0.384841
Epoch 007: loss=0.431054, td=0.050059, cql=0.380995
Epoch 008: loss=0.428315, td=0.051207, cql=0.377107
Epoch 009: loss=0.426185, td=0.048070, cql=0.378115
Epoch 010: loss=0.424688, td=0.049205, cql=0.375483
Epoch 011: loss=0.421396, td=0.050512, cql=0.370884
Epoch 012: loss=0.420774, td=0.048040, cql=0.372734
Epoch 013: loss=0.420063, td=0.049688, cql=0.370375
Epoch 014: loss=0.419384, td=0.047851, cql=0.371533
Epoch 015: loss=0.414355, td=0.045977, cql=0.368378
Epoch 016: loss=0.413479, td=0.047177, cql=0.366303
Epoch 017: loss=0.412297, td=0.048426, cql=0.363871
Epoch 018: loss=0.411143, td=0.047178, cql=0.363965
Epoch 019: l

In [13]:
# Save the Q-network that defines the policy
agent.q.save("/kaggle/working/cql_q_policy.keras")
# Save the behavior-policy frame (df_policy, not the rl_df transition table).
# The path is relative, so the CSV is written to the current working directory.
df_policy.to_csv('df_policy_dataset.csv', index=False)

In [14]:
# Clean up: delete the cloned repository from /kaggle/working.
!rm -rf /kaggle/working/multimodal-eq-sizing