In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import (
    average_precision_score, roc_auc_score,
    classification_report, confusion_matrix
)
from sklearn.preprocessing import OneHotEncoder

df = pd.read_csv("../../data/outputs/fvg_bulk_master_dataset_SOL-USDC_1m.csv")

df.head()

Unnamed: 0,EntryIdx,ExitIdx,Direction,EntryPrice,ExitPrice,Size,SL,TP,PnL,Commission,...,run_sortino,run_calmar,run_mdd_pct,run_trades,run_win_rate_pct,run_profit_factor,run_sqn,run_expectancy_pct,buy_hold_return_pct,run_index
0,3014,9121,1,211.18,209.85328,8.0,209.85328,287.1232,-10.61376,6.736532,...,-1.907327,-2.581558,-9.9907,77,1.298701,0.778577,-0.286464,-0.132974,-33.514111,0
1,13353,13610,-1,188.52,189.6813,10.0,189.6813,120.672,-11.613,7.564026,...,-1.907327,-2.581558,-9.9907,77,1.298701,0.778577,-0.286464,-0.132974,-33.514111,0
2,13620,13648,1,191.58,190.43052,9.0,190.43052,260.5488,-10.34532,6.876189,...,-1.907327,-2.581558,-9.9907,77,1.298701,0.778577,-0.286464,-0.132974,-33.514111,0
3,19113,19261,1,187.53,186.4744,10.0,186.4744,255.136,-10.556,7.480088,...,-1.907327,-2.581558,-9.9907,77,1.298701,0.778577,-0.286464,-0.132974,-33.514111,0
4,21857,21857,1,203.87,202.61696,9.0,202.61696,277.2224,-11.27736,7.316765,...,-1.907327,-2.581558,-9.9907,77,1.298701,0.778577,-0.286464,-0.132974,-33.514111,0


In [2]:
TARGET_COL = "y_win_net"    # 1 if profitable net of costs, else 0
EARLY_STOP_ROUNDS = 50      # tweak if needed
SEED = 42

In [3]:
# Basic dtype fixes (safe conversions if present)
for col in ["EntryTimestamp", "ExitTimestamp", "SignalTimestamp", "fvg_timestamp", "start_ts", "end_ts"]:
    if col in df.columns:
        # many of your timestamps are epoch ms
        if pd.api.types.is_numeric_dtype(df[col]):
            df[col] = pd.to_datetime(df[col], unit="ms", errors="coerce")
        else:
            df[col] = pd.to_datetime(df[col], errors="coerce")

# Make sure cv_month exists and is sortable (keep as string like "2025-01" if you have it)
if "cv_month" not in df.columns:
    # fallback: derive year-month from EntryTimestamp
    if "EntryTimestamp" in df.columns:
        df["cv_month"] = df["EntryTimestamp"].dt.to_period("M").astype(str)
    else:
        raise ValueError("cv_month or EntryTimestamp required for time-based CV.")

# Target
if TARGET_COL not in df.columns:
    raise ValueError(f"Target column '{TARGET_COL}' not found.")
y = df[TARGET_COL].astype(int)

In [4]:
# ---------- 2) DROP LEAKAGE ----------
leak_cols = [
    # post-entry outcomes / future info
    "ExitIdx","ExitTimestamp","ExitPrice","PnL","PnL_net","y_win_gross","realized_R",
    "holding_bars","Commission",
    # run-level performance over the whole period (definite leakage)
    "run_return_ann_pct","run_vol_ann_pct","run_cagr_pct","run_sharpe","run_sortino",
    "run_calmar","run_mdd_pct","run_trades","run_win_rate_pct","run_profit_factor",
    "run_sqn","run_expectancy_pct","buy_hold_return_pct",
    # IDs / mostly constants / CV helpers
    "TradeId","Tag","run_id","strategy_name","symbol","timeframe",
    "start_ts","end_ts","run_index",
]

drop_these = [c for c in leak_cols if c in df.columns]
X = df.drop(columns=drop_these + [TARGET_COL])

In [5]:
# ---------- 3) FEATURE SELECTION (keep only safe/useful cols if present) ----------
# Keep features known at signal/entry time (includes your FVG params)
whitelist = [
    # market/entry context
    "Direction","EntryIdx","EntryPrice","Size","SL","TP","planned_R",
    "SignalIdx","signal_to_entry_delay_bars",
    # fvg geometry/quality
    "fvg_bar_index","fvg_is_bull","fvg_max_price","fvg_min_price","fvg_midpoint",
    "fvg_gap_size_percent","fvg_displacement_strength",
    "fvg_age_bars","zone_width_abs","zone_width_pct","matched_zone",
    "entry_pos_in_zone","signed_entry_pos",
    # HTF bias
    "htf_bull_count","htf_bear_count","htf_neutral_count",
    "htf_bull_ratio","htf_bear_ratio","htf_all_bullish","htf_all_bearish",
    "htf_any_bullish","htf_any_bearish",
    # time features (coarse)
    "hour_of_day","day_of_week","is_weekend",
    # run parameters (the tunable dials)
    "max_fvg_age","fvg_threshold","profit_target","loss_target",
    "commission_rate","slippage_rate",
    "position_sizing_rule","position_fraction","max_concurrent_trades",
    # optional HTF EMAs if you want them (will one-hot later or just treat numeric)
    "ema_htf_6h","ema_htf_1h","ema_htf_30m","ema_htf_15m","ema_htf_5m",
]

present = [c for c in whitelist if c in X.columns]
X = X[present].copy()

In [6]:
# ---------- 4) LIGHT ENCODING (fixed for new sklearn) ----------
# Booleans we want as categoricals if present
bool_as_cat = []
for c in ["fvg_is_bull", "matched_zone", "is_weekend"]:
    if c in X.columns:
        X[c] = X[c].astype("bool")
        bool_as_cat.append(c)

# Auto-detect other categoricals (object/string)
auto_cat = [c for c in X.columns if X[c].dtype == "object"]

# Make sure this is string (categorical)
if "position_sizing_rule" in X.columns and "position_sizing_rule" not in auto_cat:
    X["position_sizing_rule"] = X["position_sizing_rule"].astype(str)
    auto_cat.append("position_sizing_rule")

cat_cols = list(dict.fromkeys(bool_as_cat + auto_cat))  # unique, keep order
num_cols = [c for c in X.columns if c not in cat_cols]

if cat_cols:
    # sklearn >=1.2 uses 'sparse_output'; try it, fall back for older versions
    try:
        ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    except TypeError:
        ohe = OneHotEncoder(sparse=False, handle_unknown="ignore")  # older sklearn

    ohe_arr = ohe.fit_transform(X[cat_cols])
    ohe_cols = ohe.get_feature_names_out(cat_cols)
    X_enc = pd.DataFrame(ohe_arr, columns=ohe_cols, index=X.index)
    X = pd.concat([X[num_cols], X_enc], axis=1)
else:
    ohe = None

# Clean up bad values and fill NAs
X = X.replace([np.inf, -np.inf], np.nan).fillna(method="ffill").fillna(method="bfill")


  X = X.replace([np.inf, -np.inf], np.nan).fillna(method="ffill").fillna(method="bfill")


In [7]:
# ---------- 5) FORWARD-CHAINING BY MONTH ----------
months = sorted(X.index.to_series().map(df["cv_month"]).unique())
# Build folds: train on all months before m_k, validate on m_k
folds = []
for k in range(1, len(months)):
    val_month = months[k]
    train_months = months[:k]
    trn_idx = df.index[df["cv_month"].isin(train_months)]
    val_idx = df.index[df["cv_month"] == val_month]
    if len(trn_idx) > 0 and len(val_idx) > 50:  # require some minimum
        folds.append((trn_idx, val_idx))

if not folds:
    raise ValueError("Not enough distinct months to create forward-chaining folds.")

In [None]:
# ---------- 6) RUN FOLDS (fixed: use callback-based early stopping) ----------
reports = []
feat_importance_gain = defaultdict(float)

for fold_num, (trn_idx, val_idx) in enumerate(folds, start=1):
    X_tr, y_tr = X.loc[trn_idx], y.loc[trn_idx]
    X_va, y_va = X.loc[val_idx], y.loc[val_idx]

    # class imbalance
    n_pos = int(y_tr.sum())
    n_neg = int((y_tr == 0).sum())
    spw = (n_neg / max(n_pos, 1)) if n_pos > 0 else 1.0

    dtr = xgb.DMatrix(X_tr, label=y_tr, feature_names=X.columns.tolist())
    dva = xgb.DMatrix(X_va, label=y_va, feature_names=X.columns.tolist())

    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "tree_method": "hist",            # "gpu_hist" if you have GPU
        "max_depth": 6,
        "min_child_weight": 4,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "gamma": 1.0,
        "lambda": 2.0,                    # reg_lambda
        "eta": 0.08,                      # learning_rate
        "scale_pos_weight": spw,
        "verbosity": 0,
    }

    watchlist = [(dtr, "train"), (dva, "valid")]
    booster = xgb.train(
        params=params,
        dtrain=dtr,
        num_boost_round=2000,
        evals=watchlist,
        early_stopping_rounds=EARLY_STOP_ROUNDS,
        verbose_eval=False,
    )

    # predictions using best iteration
    try:
        p = booster.predict(dva, iteration_range=(0, booster.best_iteration + 1))
    except TypeError:
        # older xgboost
        p = booster.predict(dva, ntree_limit=booster.best_ntree_limit)

    yhat = (p >= 0.5).astype(int)

    # metrics
    ap = average_precision_score(y_va, p)
    roc = roc_auc_score(y_va, p)
    cm = confusion_matrix(y_va, yhat)
    rep = classification_report(y_va, yhat, digits=3)

    reports.append({
        "fold": fold_num,
        "train_months": months[:fold_num],
        "val_month": months[fold_num],
        "n_train": len(trn_idx),
        "n_val": len(val_idx),
        "auc_pr": ap,
        "roc_auc": roc,
        "cm": cm.tolist(),
        "report": rep
    })

    # feature importance by gain (keys are actual column names)
    fmap = booster.get_score(importance_type="gain")
    for name, gain in fmap.items():
        feat_importance_gain[name] += gain

TypeError: XGBClassifier.fit() got an unexpected keyword argument 'callbacks'

In [None]:
# ---------- 7) PRINT RESULTS ----------
print("\n==== Forward-Chaining Results ====")
for r in reports:
    print(f"\nFold {r['fold']} | train: {r['train_months']} -> val: {r['val_month']}")
    print(f"n_train={r['n_train']:,}  n_val={r['n_val']:,}")
    print(f"AUC-PR={r['auc_pr']:.4f} | ROC-AUC={r['roc_auc']:.4f}")
    print("Confusion matrix [[TN, FP],[FN, TP]]:")
    print(np.array(r["cm"]))
    print("Classification report:")
    print(r["report"])

# Top features by total gain across folds
fi_sorted = sorted(feat_importance_gain.items(), key=lambda x: x[1], reverse=True)
print("\n==== Top 25 Features (total gain across folds) ====")
for name, val in fi_sorted[:25]:
    print(f"{name:30s}  {val:.2f}")

# Optional: quick sanity check on class balance
pos_rate = y.mean()
print(f"\nOverall positive rate (y=1): {pos_rate:.3f}")
# ===== end =====