In [2]:
# =========================
# XGB_Prototype_Modular.ipynb (Cell 1)
# Modularized version using refactored components
# =========================
from __future__ import annotations

import gc, os, sys, math
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import xgboost as xgb
from xgboost import XGBClassifier
from xgboost.callback import EarlyStopping, EvaluationMonitor

import shap
import optuna
from packaging import version

from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score,
)

# ---- project config ----
config_path = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(config_path)
import config as config

# ---- Import modularized components ----
from src.data.tabular_dataset import build_leak_proof_dataset
from src.data.storage import DataStorage
from src.backtest.splits import purged_time_splits, get_last_split
from src.backtest.engine import run_backtest


In [3]:
# ---- paths ----
# Every location is read from the shared project config and kept as a plain string.
DATASET_PATH, OUTPUT_PLOT, OUTPUT_PREDICTIONS = (
    str(location)
    for location in (
        config.PROCESSED_DATA_PATH,
        config.XGB_CONFUSION_MATRIX_PLOT,
        config.PREDICTIONS_CSV,
    )
)
# The SHAP importance table is written next to the other outputs.
OUTPUT_SHAP = str(config.OUTPUTS_DIR / "shap_feature_importance.csv")


In [4]:
# =========================
# MODULAR DATA LOADING (Cell 3, v2 regression)
# Uses DataStorage to load v2 execution-aware dataset
# =========================

print(f"[modular] Loading data from {DATASET_PATH}")

storage = DataStorage(processed_path=DATASET_PATH)
df_raw = storage.load_full_table()
print(f"[modular] Raw loaded: {len(df_raw):,} rows, {df_raw.shape[1]} columns")

# Fail fast when a label column is missing, before any filtering work is done.
target_cols = ["y_long_best", "y_long_drawdown", "y_short_best", "y_short_drawup"]
for c in target_cols:
    if c not in df_raw.columns:
        raise RuntimeError(f"Expected column '{c}' not found in processed dataset.")

# ---- basic cleaning / filters ----
# All row filters are combined into ONE boolean mask and applied once. The
# previous version copied the full frame after every filter, which multiplies
# peak memory on a table of this size.
keep = df_raw["y_long_best"].notna()          # primary label must be present
if "tradable" in df_raw.columns:
    # Require tradable row (as defined in data_preparer v2)
    keep = keep & (df_raw["tradable"] == 1)
if "label_qty" in df_raw.columns:
    # Require a non-zero label size
    keep = keep & (df_raw["label_qty"] > 0)

# Boolean .loc indexing materialises a new frame, so df does not alias df_raw.
df = df_raw.loc[keep].reset_index(drop=True)

# Primary regression target (can change later)
PRIMARY_TARGET = "y_long_best"
AUX_TARGETS = ["y_long_drawdown", "y_short_best", "y_short_drawup"]

# Identify columns that should NOT be used as features (targets, ids, legacy)
exclude_exact = set(
    ["item", "timestamp"] +
    target_cols +
    [
        "target_min_abs", "target_max_abs", "target_min_rel", "target_max_rel",
        "target_q_up_abs", "target_q_dn_abs", "target_q_up_rel", "target_q_dn_rel",
    ]
)

# Exclude any other columns starting with legacy target prefix
exclude_prefixes = ("target_",)

# Keep everything else as a potential feature (including mid_price, px_entry_*,
# exec_spread_rel_labelQ, jump_flag, etc.).
# NOTE(review): "tradable" and "label_qty" are not excluded here, so they are
# kept as features when present — confirm that this is intended.
feature_cols = [
    c for c in df.columns
    if c not in exclude_exact and not c.startswith(exclude_prefixes)
]

# Target vector
y = df[PRIMARY_TARGET].astype("float32").to_numpy()

# The raw table and the mask are no longer needed once df is finalized.
del df_raw, keep
gc.collect()
print("[mem] Dropped df_raw")

print(f"[modular] Dataset ready (v2 regression): {len(df):,} rows")
print(f"[modular] Features: {len(feature_cols)}; primary target: {PRIMARY_TARGET}")


[modular] Loading data from C:\Users\reyno\Documents\GitHub\Project-BLD\data\processed\improved_normalized_labeled.parquet
[modular] Raw loaded: 41,719,080 rows, 131 columns
[mem] Dropped df_raw
[modular] Dataset ready (v2 regression): 41,608,256 rows
[modular] Features: 117; primary target: y_long_best


In [5]:
# =========================
# Split using modularized splits module (Cell 4)
# =========================
print("[modular] Creating train/val split using purged_time_splits...")

# Row positions for the final fold, as returned by the project's splitter.
# NOTE(review): embargo=0 is passed; whether any gap is left between train and
# validation rows depends on get_last_split — confirm for forward-looking labels.
train_idx, val_idx = get_last_split(df, n_splits=5, embargo=0)

# Restrict to the selected feature columns, then slice each side by position.
X = df[feature_cols]
X_tr = X.iloc[train_idx]
X_val = X.iloc[val_idx]
y_tr = y[train_idx]
y_val = y[val_idx]

w_tr = None  # no class weights for regression

# The unsplit feature frame is not needed once both slices exist.
del X

print("[mem] Dropped X")

print(f"[modular] Train: {len(X_tr):,} | Val: {len(X_val):,}")


[modular] Creating train/val split using purged_time_splits...
[modular] Train: 34,673,547 | Val: 6,934,709


In [6]:
# =========================
# Leak check (Cell 5)
# =========================
print("[modular] Running leak check...")

# Gather label-like column names by prefix in one pass over the frame.
# This cell only reports what it finds; it does not assert anything.
sus_target, sus_y = [], []
for c in df.columns:
    if c.startswith("target_"):
        sus_target.append(c)
    if c.startswith("y_"):
        sus_y.append(c)

# At most eight legacy names are shown; an ellipsis marks a longer list.
print(f"[sanity] legacy target_* columns present: {sus_target[:8]}{'...' if len(sus_target) > 8 else ''}")
print(f"[sanity] y_* regression label columns present: {sus_y}")

# For reference: PRIMARY_TARGET is a continuous value now
print(f"[sanity] PRIMARY_TARGET: {PRIMARY_TARGET}, y mean={np.nanmean(y):.4f}, std={np.nanstd(y):.4f}")


[modular] Running leak check...
[sanity] legacy target_* columns present: ['target_min_abs', 'target_max_abs', 'target_min_rel', 'target_max_rel', 'target_q_up_abs', 'target_q_dn_abs', 'target_q_up_rel', 'target_q_dn_rel']
[sanity] y_* regression label columns present: ['y_long_best', 'y_long_drawdown', 'y_short_best', 'y_short_drawup']
[sanity] PRIMARY_TARGET: y_long_best, y mean=307.4051, std=14970.4609


In [11]:
# =========================
# Training utilities (Cell 6)
# =========================

# -- reproducibility --
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# -- run switches --
USE_GPU = True
USE_OPTUNA = False
N_TRIALS = 10
LOAD_EXISTING_MODEL = False
SAVE_MODEL_AFTER_TRAIN = True
OVERWRITE_SAVED_PARAMS = True

# -- artefact locations (the directory is created on first run) --
MODEL_DIR = Path(str(config.OUTPUTS_DIR), "xgb")
PARAMS_PATH = MODEL_DIR.joinpath("xgb_best_params.json")
MODEL_PATH = MODEL_DIR.joinpath("xgb_model.ubj")
MODEL_DIR.mkdir(parents=True, exist_ok=True)

import json

def _save_json(obj, p: Path):
    with open(p, "w", encoding="utf-8") as f:
        json.dump(obj, f, indent=2)

def _load_json(p: Path):
    with open(p, "r", encoding="utf-8") as f:
        return json.load(f)

def _np_view(df):
    return df.to_numpy(dtype=np.float32, copy=False)

def _merge_common(best: dict, use_gpu: bool) -> dict:
    common = {
        "tree_method": "hist",
        "objective": "multi:softprob",
        "num_class": 3,
        "eval_metric": "mlogloss",
        "random_state": RANDOM_SEED,
        "n_jobs": max(1, os.cpu_count()-1),
        "early_stopping_rounds": 100,
        "device": ("cuda" if use_gpu else "cpu"),
    }
    merged = {**common, **best}
    return merged

def _merge_common_reg(best: dict, use_gpu: bool) -> dict:
    common = {
        "tree_method": "hist",
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "random_state": RANDOM_SEED,
        "n_jobs": max(1, os.cpu_count()-1),
        "early_stopping_rounds": 100,
        "device": ("cuda" if use_gpu else "cpu"),
    }
    merged = {**common, **best}
    return merged



In [12]:
# =========================
# Opportunity coverage helper (Cell 7)
# =========================
from numpy.lib.stride_tricks import sliding_window_view

def _forward_extrema(df_item, horizon):
    """Compute forward max/min returns for a single item's time series."""
    x = df_item['mid_price'].to_numpy()
    if len(x) < horizon:
        out = df_item[['timestamp','item','mid_price']].copy()
        out['fwd_up_ret'] = np.nan
        out['fwd_dn_ret'] = np.nan
        return out
    win = sliding_window_view(x, horizon)
    fwd_max = np.concatenate([win.max(1), np.full(horizon-1, np.nan)])
    fwd_min = np.concatenate([win.min(1), np.full(horizon-1, np.nan)])
    out = df_item[['timestamp','item','mid_price']].copy()
    out['fwd_up_ret'] = (fwd_max - x) / x
    out['fwd_dn_ret'] = (x - fwd_min) / x
    return out

def opportunity_coverage(prices, preds, horizon=60, up_tau=0.03, dn_tau=0.03, score_col='score'):
    """
    Compute opportunity coverage metrics.

    Measures how well the model captures good trading opportunities
    within the specified horizon.

    Parameters
    ----------
    prices : DataFrame with ``item``, ``timestamp`` and ``mid_price`` columns.
    preds : DataFrame with ``item`` and ``timestamp`` plus either ``score_col``
        or the pair ``Pp1`` / ``Pm1``. When ``score_col`` is absent, a ``score``
        column is derived as ``Pp1 - Pm1``.
    horizon : forward window length (in rows per item) passed to
        ``_forward_extrema``.
    up_tau, dn_tau : minimum forward up / down return for a row to count as a
        "good" opportunity.
    score_col : name of the signed score column; positive flags a buy,
        negative flags a sell.

    Returns
    -------
    (metrics, joined) : a Series with ``recall@good_up``, ``precision@buy``,
        ``recall@good_dn``, ``precision@sell`` and ``coverage%``, and the
        inner-joined prediction/opportunity frame the metrics were computed on.
    """
    preds = preds.copy()
    if score_col in preds:
        # Honour the caller's column; the score used to be read from a
        # hard-coded 'score' column, which ignored this argument.
        score_key = score_col
    else:
        preds['score'] = preds['Pp1'] - preds['Pm1']
        score_key = 'score'

    prices = prices.sort_values(['item', 'timestamp'])
    # Iterate the groups explicitly so every per-item frame still carries its
    # 'item' column; groupby.apply is deprecating passing grouping columns to
    # the callback.
    per_item = [_forward_extrema(g, horizon) for _, g in prices.groupby('item')]
    if per_item:
        opp = pd.concat(per_item)
    else:
        # No groups: keep the key dtypes so the merge below still works.
        opp = prices.iloc[:0][['timestamp', 'item', 'mid_price']].assign(
            fwd_up_ret=float('nan'), fwd_dn_ret=float('nan')
        )

    joined = preds.merge(opp, on=['item','timestamp'], how='inner')
    joined['good_up']  = joined['fwd_up_ret'] >= up_tau
    joined['good_dn']  = joined['fwd_dn_ret'] >= dn_tau
    joined['flag_buy'] = joined[score_key] > 0
    joined['flag_sell']= joined[score_key] < 0

    # Denominators are floored at 1 so an empty class yields 0 rather than a
    # division by zero.
    metrics = {
        'recall@good_up': (joined['flag_buy'] & joined['good_up']).sum() / max(1, joined['good_up'].sum()),
        'precision@buy':  (joined['flag_buy'] & (joined['fwd_up_ret']>0)).sum() / max(1, joined['flag_buy'].sum()),
        'recall@good_dn': (joined['flag_sell'] & joined['good_dn']).sum() / max(1, joined['good_dn'].sum()),
        'precision@sell': (joined['flag_sell'] & (joined['fwd_dn_ret']>0)).sum() / max(1, joined['flag_sell'].sum()),
        # Share of timestamps on which at least one item is flagged as a buy.
        'coverage%':      100 * joined.groupby('timestamp')['flag_buy'].max().mean()
    }
    return pd.Series(metrics), joined


In [13]:

# =========================
# Training function (Cell 8, v2 regression)
# =========================
from sklearn.metrics import mean_squared_error, r2_score


def train_regressor(
        X_tr, y_tr, X_val, y_val,
        w_tr=None,
        use_gpu=USE_GPU,
        params_path=PARAMS_PATH,
        model_path=MODEL_PATH,
        load_existing_model=LOAD_EXISTING_MODEL,
        save_model_after_train=SAVE_MODEL_AFTER_TRAIN,
        overwrite_saved_params=OVERWRITE_SAVED_PARAMS,
):
    """
    Train XGBoost regressor on PRIMARY_TARGET (e.g. y_long_best).

    Parameters
    ----------
    X_tr, X_val : feature tables; converted to float32 arrays via ``_np_view``.
    y_tr, y_val : target vectors for the training and validation rows.
    w_tr : optional per-row sample weights forwarded to ``fit``.
    use_gpu : selects the default ``device`` ("cuda" or "cpu"). A ``device``
        stored in a saved params file never overrides this argument.
    params_path : JSON file holding saved regressor params (tagged with
        ``"framework": "xgb_reg"``); also the destination when params are saved.
    model_path : file the trained model is loaded from / saved to.
    load_existing_model : when True and ``model_path`` exists, skip training
        and return the stored model.
    save_model_after_train : save the fitted model to ``model_path``.
    overwrite_saved_params : write the params actually used to ``params_path``.

    Returns
    -------
    (reg, params) : the ``XGBRegressor`` and the parameter dict (with the
        ``framework`` tag) that describes it.

    Saving the model or the params is best-effort: a failure is reported as a
    warning and does not abort the run.
    """

    # Fast path: load existing model
    if load_existing_model and Path(model_path).exists():
        print(f"[train] Loading existing regressor → {model_path}")
        reg = xgb.XGBRegressor()
        reg.load_model(str(model_path))
        if Path(params_path).exists():
            params = _load_json(params_path)
        else:
            params = {"framework": "xgb_reg", "loaded_from_model": True}
        return reg, params

    # Load saved params if present
    loaded_params = None
    if Path(params_path).exists():
        loaded_raw = _load_json(params_path)
        if loaded_raw.get("framework") == "xgb_reg":
            # Strip the framework tag. The saved file holds the MERGED params,
            # so it also carries the device of the run that wrote it; drop that
            # too, otherwise it would silently override the caller's use_gpu.
            loaded_params = {
                k: v for k, v in loaded_raw.items()
                if k not in ("framework", "device")
            }
            print(f"[train] Loaded saved reg params from {params_path}")
        else:
            print(f"[train] Found non-regression params at {params_path} (framework={loaded_raw.get('framework')}) – ignoring for regressor.")


    # NumPy views
    X_tr_np = _np_view(X_tr)
    X_val_np = _np_view(X_val)

    # Use loaded params or baseline
    if loaded_params is not None:
        best = loaded_params
    else:
        print("[train] No reg params file found — using baseline reg params.")
        best = {
            "n_estimators": 650,
            "max_depth": 12,
            "learning_rate": 0.06,
            "subsample": 0.85,
            "colsample_bytree": 0.85,
            "min_child_weight": 2.0,
            "reg_lambda": 1.0,
            "reg_alpha": 0.0,
            "gamma": 0.0,
            "max_bin": 256,
            "grow_policy": "depthwise",
        }
    best = _merge_common_reg(best, use_gpu)

    print(best)

    # Train regressor; the validation fold doubles as the early-stopping set.
    reg = xgb.XGBRegressor(**best)
    reg.fit(
        X_tr_np,
        y_tr,
        sample_weight=w_tr,
        eval_set=[(X_val_np, y_val)],
        verbose=False,
    )

    # Basic validation metrics
    y_val_pred = reg.predict(X_val_np)
    mse = mean_squared_error(y_val, y_val_pred)
    rmse = float(np.sqrt(mse))
    r2 = r2_score(y_val, y_val_pred)
    print(f"[train] Val RMSE: {rmse:.5f}, R^2: {r2:.4f}")

    # Params as returned and persisted: the merged dict plus the framework tag.
    best_out = {"framework": "xgb_reg", **best}

    if save_model_after_train:
        try:
            reg.save_model(str(model_path))
            print(f"[train] Saved regressor → {model_path}")
        except Exception as e:
            print(f"[train][warn] Could not save model: {e}")
    if overwrite_saved_params:
        try:
            _save_json(best_out, params_path)
            print(f"[train] Saved reg params → {params_path}")
        except Exception as e:
            print(f"[train][warn] Could not save params: {e}")

    return reg, best_out


# ---- Run training ----
print("[modular] Training regressor on", PRIMARY_TARGET, "...")

# Fit on the training slice; the validation slice is used for early stopping
# and for the metrics printed by train_regressor.
reg, params = train_regressor(
    X_tr=X_tr,
    y_tr=y_tr,
    X_val=X_val,
    y_val=y_val,
    w_tr=w_tr,
    use_gpu=USE_GPU,
)

print("[modular] Regressor params:", params)

# Last expression of the cell: the notebook displays the collector's return value.
gc.collect()


[modular] Training regressor on y_long_best ...
[train] No reg params file found — using baseline reg params.
{'tree_method': 'hist', 'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'random_state': 42, 'n_jobs': 31, 'early_stopping_rounds': 100, 'device': 'cuda', 'n_estimators': 650, 'max_depth': 12, 'learning_rate': 0.06, 'subsample': 0.85, 'colsample_bytree': 0.85, 'min_child_weight': 2.0, 'reg_lambda': 1.0, 'reg_alpha': 0.0, 'gamma': 0.0, 'max_bin': 256, 'grow_policy': 'depthwise'}


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


[train] Val RMSE: 4103.26602, R^2: 0.7306
[train] Saved regressor → C:\Users\reyno\Documents\GitHub\Project-BLD\outputs\xgb\xgb_model.ubj
[train] Saved reg params → C:\Users\reyno\Documents\GitHub\Project-BLD\outputs\xgb\xgb_best_params.json
[modular] Regressor params: {'framework': 'xgb_reg', 'tree_method': 'hist', 'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'random_state': 42, 'n_jobs': 31, 'early_stopping_rounds': 100, 'device': 'cuda', 'n_estimators': 650, 'max_depth': 12, 'learning_rate': 0.06, 'subsample': 0.85, 'colsample_bytree': 0.85, 'min_child_weight': 2.0, 'reg_lambda': 1.0, 'reg_alpha': 0.0, 'gamma': 0.0, 'max_bin': 256, 'grow_policy': 'depthwise'}


814

In [27]:
# =========================
# Regression outputs: preds_test, metrics, backtest CSV
# =========================
from pathlib import Path
from src.analysis.directional_eval import run_regression_outputs

# The validation matrix is always rebuilt from X_val. A previous version picked
# up a global X_val_np when one happened to exist; that name is only ever a
# local inside train_regressor, so a match could only be stale kernel state.
# NOTE(review): with k_softmax=20.0 and a target on the scale printed by the
# sanity cell, the logistic transform inside run_regression_outputs presumably
# saturates (an overflow warning appears in the saved output) — confirm the
# intended units before relying on the derived probabilities.
res = run_regression_outputs(
    df=df,
    val_idx=val_idx,
    reg=reg,
    X_val=_np_view(X_val),
    y_val=y_val,
    primary_target=PRIMARY_TARGET,
    output_predictions=OUTPUT_PREDICTIONS,
    vol_est_col="vol_est",
    k_softmax=20.0,
    up_tau_reg=0.02,
    dn_tau_reg=0.02,
)

# Unpack the pieces used by later cells.
y_val_pred = res["y_val_pred"]
preds_test = res["preds_test"]
reg_metrics = res["metrics"]
pred_df = res["pred_df"]

print("[modular] Regression metrics on val:")
print(f"  RMSE: {reg_metrics['rmse']:.6f}")
print(f"  R^2:  {reg_metrics['r2']:.4f}")
print(f"[modular] Saved regression-based predictions → {res['output_path']}")


  pred_proba_buy = 1.0 / (1.0 + np.exp(-k * y_pred_cont))


[modular] Regression metrics on val:
  RMSE: 4103.266016
  R^2:  0.7306
[modular] Saved regression-based predictions → C:\Users\reyno\Documents\GitHub\Project-BLD\outputs\xgb_predictions.csv


In [28]:
# =========================
# Backtest tuning + best backtest (Cell 15 alt)
# =========================
from pathlib import Path
from src.backtest.engine import tune_backtest_knobs

print("[modular] Tuning backtest knobs with src.backtest.engine...")

# Output locations come from the project config; both directories are created
# up front so the tuner can write into them.
TRADE_LOG_PATH = Path(config.XGB_TRADE_LOG_CSV)
PLOTS_DIR = Path(config.XGB_TRADING_DIR)
PLOTS_DIR.mkdir(parents=True, exist_ok=True)
TRADE_LOG_PATH.parent.mkdir(parents=True, exist_ok=True)

# Run tuning on the same predictions CSV you already generate.
# NOTE(review): the knobs are tuned and reported on the same prediction file,
# so the "best" summary is presumably optimistic — confirm on held-out data.
study, best_summary, best_trades, best_eq = tune_backtest_knobs(
    # -- inputs / outputs --
    pred_path=Path(OUTPUT_PREDICTIONS),   # uses the CSV you created earlier
    trade_log_path=TRADE_LOG_PATH,        # also saves best trades to CSV
    # -- Optuna study --
    study_name="bt_knobs_v23",
    storage_dir="backtests/db",
    total_trials=200,                     # adjust as needed
    # -- capital and throughput --
    initial_capital=1_000_000_000.0,
    max_trades_per_minute=2,
    max_positions_per_item=1,
    cooldown_minutes=0,
    bar_seconds=60,
    # -- cost model --
    spread_bps=10.0,
    fee_bps=100.0,
    slippage_bps=0.0,
    median_window=30,
    impact_cap_bps=200.0,
)

# Make results visible to the diagnostics cell
trades, equity, summary = best_trades, best_eq, best_summary

print("[modular] Best backtest summary:")
for k, v in best_summary.items():
    print(f"  {k}: {v}")

print(f"[modular] Best trades saved to: {TRADE_LOG_PATH}")
print(f"[modular] Optuna best params: {study.best_params}")


[modular] Tuning backtest knobs with src.backtest.engine...


[I 2025-11-21 23:22:41,474] A new study created in RDB with name: bt_knobs_v23


[optuna] completed trials: 200
Best params: {'min_trade_amount': 8545694.214414444, 'min_confidence': 0.9369703773088872, 'exit_profit_threshold': 0.34492252499058945, 'stop_loss_threshold': 0.11887324842035293, 'persist_bars': 26, 'alpha': 2.6951118763957527, 'min_confidence_streak': 0.5944538757487808}
Best value: 1.326528693516282
Summary: {'Final Capital': '1,643,716,285', 'Total Profit': '643,716,285', 'Num Trades': 1321, 'Win Rate': '31.42%', 'Average Return / Trade': '2.17%', 'Average Win': '32.06%', 'Average Loss': '-11.52%', 'Gross Profit': '3,776,546,253', 'Gross Loss': '-3,132,829,968', 'Profit Factor': '1.21', 'Average Duration (min)': '2172.97'}
[modular] Best backtest summary:
  Final Capital: 1,643,716,285
  Total Profit: 643,716,285
  Num Trades: 1321
  Win Rate: 31.42%
  Average Return / Trade: 2.17%
  Average Win: 32.06%
  Average Loss: -11.52%
  Gross Profit: 3,776,546,253
  Gross Loss: -3,132,829,968
  Profit Factor: 1.21
  Average Duration (min): 2172.97
[modular] 

In [31]:
# =========================
# Diagnostic plots (Cell 16) — using run_discrete_backtest_diagnostics
# =========================

from pathlib import Path
from src.analysis.discrete_diagnostics import run_discrete_backtest_diagnostics

print(f"[modular] Saving plots to: {PLOTS_DIR}")

# Build kwargs defensively so the cell doesn't explode if some vars are missing:
# each optional input is looked up by name in the notebook namespace and
# becomes None when it was never defined.
# NOTE(review): clf, proba_val, classes_dec and opp_joined are not assigned by
# any cell shown here, so they are presumably always None — confirm.
diag_kwargs = {
    key: globals().get(key)
    for key in (
        "trades",
        "equity",
        "clf",
        "X_val",
        "y_val",
        "proba_val",
        "classes_dec",
        "opp_joined",
        "preds_test",
    )
}
diag_kwargs["plots_dir"] = PLOTS_DIR  # uses the same directory as before

saved_paths = run_discrete_backtest_diagnostics(**diag_kwargs)

print("[modular] Done. Generated plots:")
for name, path in saved_paths.items():
    print(f"  {name}: {path}")


ModuleNotFoundError: No module named 'src.analysis.discrete_diagnostics'