# RL Agent — Single Window Debug Run

Runs **1 walk-forward window** (train: 2016-2017, test: 2018-Q1) to validate
the full pipeline before committing to the 32-window backtest.

**Algorithm:** MaskablePPO (sb3-contrib) with action masking and normalised observations.

**Runtime:** T4 GPU (Cell 3 sets the XGBoost layers to `device='cuda'`, so select a GPU runtime before running).

In [None]:
# Cell 1: Setup
import warnings
warnings.filterwarnings('ignore', module='jupyter_client')
warnings.filterwarnings('ignore', message='.*Gym has been unmaintained.*')
warnings.filterwarnings('ignore', message='.*Falling back to prediction using DMatrix.*')

from google.colab import drive
drive.mount('/content/drive')

import os, sys
DRIVE_ROOT = '/content/drive/MyDrive/kronos'
REPO_DIR = '/content/tradingagent'

if not os.path.exists(REPO_DIR):
    !git clone https://github.com/Yuxiaoliu12/tradingagent.git {REPO_DIR}

!pip install -q -r {REPO_DIR}/requirements.txt

sys.path.insert(0, REPO_DIR)
print(f'Repo: {REPO_DIR}')
print(f'Drive: {DRIVE_ROOT}')

In [None]:
# Cell 2: Smoke test (synthetic data, no L1+L2 needed)
import numpy as np
import pandas as pd
from screener.portfolio_env import build_action_table, PortfolioEnv
from screener.rl_trader import RLTrader
from screener.config import ScreenerConfig

# Action table
# build_action_table(3, 3) is expected to enumerate exactly 63 actions; the
# mask and inference checks further down rely on that count.
actions = build_action_table(3, 3)
n_actions = len(actions)
assert n_actions == 63, f'Expected 63 actions, got {n_actions}'
print(f'[OK] Action table: {n_actions} actions')

# Synthetic env
# Seeded random fixtures for the smoke tests: one OHLCV frame per symbol, a
# linearly rising benchmark, and one signal dict per business day.
np.random.seed(42)
n_days, n_stocks = 100, 50
dates = pd.bdate_range('2020-01-01', periods=n_days)
ohlcv_dict = {}
for i in range(n_stocks):
    sym = f'sh.{600000+i}'
    # Random-walk close: cumulative sum of N(0, 0.02) log-returns from 10.0.
    close = 10.0 * np.exp(np.cumsum(np.random.randn(n_days) * 0.02))
    open_ = close * (1 + np.random.randn(n_days) * 0.005)
    # Anchor the range on max/min(open, close) so every bar is internally
    # consistent (low <= open, close <= high). Anchoring on close alone lets
    # the independently drawn open fall outside [low, high].
    # The random draws happen in the same order as before (close, open, high,
    # low, volume), so the close series and the signals below are unchanged.
    high = np.maximum(open_, close) * (1 + np.abs(np.random.randn(n_days) * 0.01))
    low = np.minimum(open_, close) * (1 - np.abs(np.random.randn(n_days) * 0.01))
    ohlcv_dict[sym] = pd.DataFrame({
        'open': open_,
        'high': high,
        'low': low,
        'close': close,
        'volume': np.random.randint(1000, 100000, n_days).astype(float),
    }, index=dates)

symbols = list(ohlcv_dict.keys())
bench_df = pd.DataFrame({'close': np.linspace(100, 110, n_days)}, index=dates)

# Column names of the per-symbol feature frame; hoisted out of the loop because
# they are identical for every day.
feature_cols = [
    'macd', 'macd_signal', 'macd_hist', 'rsi_14', 'rsi_5',
    'ma5_slope', 'ma20_slope', 'ma60_slope', 'bb_position',
    'volume_trend', 'mom_5', 'mom_10', 'mom_20', 'atr_14', 'obv_slope',
]
daily_signals = []
for d in dates:
    # upside is non-negative and downside non-positive by construction.
    upside = pd.Series(np.abs(np.random.randn(n_stocks)) * 0.05, index=symbols)
    downside = pd.Series(-np.abs(np.random.randn(n_stocks)) * 0.05, index=symbols)
    combined = upside + downside
    feats = pd.DataFrame(
        np.random.randn(n_stocks, len(feature_cols)),
        index=symbols,
        columns=feature_cols,
    )
    # Sort once; scores and ranking are two views of the same ordering.
    ranked = combined.sort_values(ascending=False)
    daily_signals.append({
        'date': d,
        'l2_scores': ranked,
        'l2_upside': upside,
        'l2_downside': downside,
        'l2_ranking': list(ranked.index),
        'l2_features': feats,
    })

# Default configuration shared by the env checks in sections 1 and 2.
smoke_cfg = ScreenerConfig()

# ── 1. Random steps ──
# Drive a training-mode env with randomly sampled actions and check that every
# observation is NaN-free (and that the first one has the expected 40 entries).
env = PortfolioEnv(smoke_cfg, daily_signals, ohlcv_dict, bench_df, training_mode=True)
obs, _ = env.reset()
assert obs.shape == (40,) and not np.any(np.isnan(obs))
for step in range(n_days - 1):
    obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
    assert not np.any(np.isnan(obs)), f'Step {step}: NaN'
    # An episode ends on EITHER flag; stepping an env after it reported
    # truncation is undefined under the Gymnasium step contract.
    if terminated or truncated:
        break
print(f'[OK] Random agent: {step+1} steps, NAV={env._nav:,.0f} ({env._nav/smoke_cfg.initial_capital-1:+.2%})')

# ── 2. Action mask sanity ──
# The mask must be a boolean vector with one entry per action (63) and must
# always leave at least one legal action.
env2 = PortfolioEnv(smoke_cfg, daily_signals, ohlcv_dict, bench_df, training_mode=False)
obs2, _ = env2.reset()
mask = env2.action_masks()
assert mask.shape == (63,), f'Mask shape: {mask.shape}'
assert mask.dtype == bool
assert mask.any(), 'All actions masked!'
n_masked = (~mask).sum()
print(f'[OK] Action masks: {mask.sum()}/63 legal, {n_masked} masked (day 0)')

# Step a few times and verify masks update
for _ in range(5):
    # Only sample among actions the current mask marks as legal.
    legal_actions = np.where(mask)[0]
    act = np.random.choice(legal_actions)
    obs2, _, term, trunc, _ = env2.step(act)
    # Stop on either end-of-episode flag so a finished env is never stepped again.
    if term or trunc:
        break
    mask = env2.action_masks()
    assert mask.any()
print('[OK] Action masks update correctly across steps')

# ── 3. MaskablePPO train 1000 steps ──
# Deliberately tiny training budget: the goal is only to exercise
# RLTrader.train() end to end on the synthetic data.
smoke_cfg_ppo = ScreenerConfig(
    rl_total_timesteps=1000,
    rl_n_steps=128,
    rl_batch_size=32,
)
train_env = PortfolioEnv(
    smoke_cfg_ppo,
    daily_signals,
    ohlcv_dict,
    bench_df,
    training_mode=True,
)
trader = RLTrader(smoke_cfg_ppo)
model = trader.train(train_env)

# ── 4. Inference with masks ──
# Run the freshly trained model for up to 20 deterministic steps, passing the
# current action mask to predict() on every step.
from sb3_contrib.common.wrappers import ActionMasker
from sb3_contrib.common.maskable.utils import get_action_masks

test_env = PortfolioEnv(smoke_cfg_ppo, daily_signals, ohlcv_dict, bench_df, training_mode=False)
# ActionMasker is given a callback that returns the env's own action_masks().
masked_test_env = ActionMasker(test_env, lambda e: e.action_masks())
obs, _ = masked_test_env.reset()
blocked_total = 0
for _ in range(20):
    masks = get_action_masks(masked_test_env)
    action, _ = model.predict(obs, deterministic=True, action_masks=masks)
    obs, reward, terminated, truncated, info = masked_test_env.step(int(action))
    # 'blocked_trades' may be absent from info; treat that as zero blocked trades.
    blocked_total += len(info.get('blocked_trades', []))
    # Either flag ends the episode; never step a finished env.
    if terminated or truncated:
        break
print(f'[OK] PPO inference: NAV={test_env._nav:,.0f}, blocked_trades={blocked_total}')

# ── 5. Save/load ──
# Round-trip the trained model through a throwaway directory and confirm the
# reloaded copy picks the same deterministic action on a fresh observation.
import tempfile
with tempfile.TemporaryDirectory() as scratch_dir:
    ckpt_path = os.path.join(scratch_dir, 'smoke')
    trader.save(model, ckpt_path)
    loaded = trader.load(ckpt_path)
    fresh_obs, _ = masked_test_env.reset()
    fresh_masks = get_action_masks(masked_test_env)
    # Same observation, same mask, deterministic policy for both models.
    predict_kwargs = dict(deterministic=True, action_masks=fresh_masks)
    a1, _ = model.predict(fresh_obs, **predict_kwargs)
    a2, _ = loaded.predict(fresh_obs, **predict_kwargs)
    assert a1 == a2
print('[OK] Save/load round-trip')
print('\n=== All smoke tests passed ===')

In [None]:
# Cell 3: Config — 1 window only (test 2018-Q1)
from screener.config import ScreenerConfig

# Input pickles and the output root all live under the mounted Drive folder.
ohlcv_path = os.path.join(DRIVE_ROOT, 'data/ohlcv_all_a.pkl')
benchmark_path = os.path.join(DRIVE_ROOT, 'data/benchmark_000905.pkl')
output_root = os.path.join(DRIVE_ROOT, 'output/screener')

cfg = ScreenerConfig(
    ohlcv_pickle_path=ohlcv_path,
    benchmark_pickle_path=benchmark_path,
    drive_root=output_root,
    backtest_end='2018-03-31',   # <-- 1 window only
)

# Point both XGBoost parameter sets at the GPU.
for xgb_params in (cfg.layer1_xgb_params, cfg.layer2_xgb_params):
    xgb_params['device'] = 'cuda'

# Echo the resolved date ranges and run locations for a quick visual check.
print(
    f'Train: {cfg.train_start} -> {cfg.train_end}',
    f'Backtest: {cfg.backtest_start} -> {cfg.backtest_end}',
    f'Run ID: {cfg.run_id}',
    f'Cache dir: {cfg.cache_dir}',
    sep='\n',
)

In [None]:
# Cell 4: Run RL backtest (1 window)
import time
from screener.backtester import WalkForwardBacktester

bt = WalkForwardBacktester(cfg)

# perf_counter() is monotonic, so the elapsed figure cannot be skewed (or go
# negative) if the system clock is adjusted during a long run, unlike time.time().
t0 = time.perf_counter()
rl_results = bt.run_rl(verbose=True)
elapsed = time.perf_counter() - t0

print(f'\nWall time: {elapsed/60:.1f} min')

In [None]:
# Cell 5: Results
import matplotlib.pyplot as plt

# Aggregate metrics: floats are shown with 4 decimals, anything else as-is.
print('=== Metrics ===')
for k, v in rl_results['metrics'].items():
    if isinstance(v, float):
        print(f'  {k:>20}: {v:.4f}')
    else:
        print(f'  {k:>20}: {v}')

# One summary line per walk-forward window (window index shown 1-based).
print('\n=== Window Results ===')
for wr in rl_results['window_results']:
    summary = (
        f"  Window {wr['window']+1}: {wr['test_start']}->{wr['test_end']}  "
        f"return={wr['test_return']*100:+.2f}%  NAV={wr['final_nav']:,.0f}  "
        f"blocked={wr['blocked_trades']}"
    )
    print(summary)

# NAV curve
# Plot only when the backtest produced at least one NAV point.
rl_nav = rl_results['nav_series']
has_nav_points = len(rl_nav) > 0
if has_nav_points:
    fig, ax = plt.subplots(figsize=(12, 4))
    rl_nav.plot(
        ax=ax,
        title='RL Agent NAV (Window 1)',
        linewidth=1.5,
    )
    ax.set_ylabel('NAV')
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

# Save
# Persist the raw results dict next to the other artifacts of this run.
import pickle
rl_path = os.path.join(cfg.run_dir, 'rl_backtest_results.pkl')
results_dir = os.path.dirname(rl_path)
os.makedirs(results_dir, exist_ok=True)  # create the run directory on first use
with open(rl_path, 'wb') as out_file:
    pickle.dump(rl_results, out_file)
print(f'\nResults saved -> {rl_path}')