<a href="https://colab.research.google.com/github/brendonhuynhbp-hub/gt-markets/blob/main/GoogleTrends_Financial_Modeling_Backtest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Setup: Mount Drive + Paths

In [5]:

# =========================================================
# Google Trends + Financial Modeling — Backtest Pipeline
# (Per-asset engineered feature filtering + leak guard + column logging)
# =========================================================

# -------------------------------
# Global switches
# -------------------------------
# DEBUG also trims both datasets to their last 1000 rows and lowers the epoch /
# estimator counts used by the model runners further down in this file.
DEBUG    = False           # True → faster sanity run; False → full training
# FREQ is passed to to_frequency() and becomes the prefix of RUN_ID.
FREQ     = "W"            # "D" (daily) or "W" (weekly)
# NOTE(review): RUN_MODE and PAIR_KEY are not read in the visible part of this
# file — presumably consumed by a driver cell further down; confirm.
RUN_MODE = "all"          # "all" or "single"
PAIR_KEY = "gold"         # when RUN_MODE == "single": "gold"|"btc"|"oil"|"usdcny"

# -------------------------------
# Drive + core paths
# -------------------------------
# Colab-only: mounts Google Drive so the project folder is reachable as a path.
from google.colab import drive
drive.mount('/content/drive')

from pathlib import Path
import os, warnings, random, re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# The project may live in a personal drive or a shared drive; the first
# candidate that exists wins.
CANDIDATE_PROJECT_DIRS = [
    Path("/content/drive/MyDrive/gt-markets"),
    Path("/content/drive/Shareddrives/gt-markets"),
]
PROJECT_DIR = next((p for p in CANDIDATE_PROJECT_DIRS if p.exists()), None)
assert PROJECT_DIR is not None, "Project directory not found in Drive."

DATA_DIR = PROJECT_DIR / "data" / "processed"
KW_DIR   = PROJECT_DIR / "data" / "Keyword Selection"   # exact folder name
OUT_DIR  = PROJECT_DIR / "outputs"
# Only the output folder is created here; the input folders must already exist.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Inputs: raw merged dataset, engineered dataset, and the keyword table
# read by load_keywords_for_pair().
RAW_FILE = DATA_DIR / "merged_financial_trends_data_2025-09-07.csv"
ENG_FILE = DATA_DIR / "merged_financial_trends_engineered_2025-09-07.csv"
KW_CSV   = KW_DIR  / "combined_significant_lagged_correlations.csv"

# Fail fast, before any model code runs, if an input file is missing.
assert RAW_FILE.exists(), f"Missing dataset: {RAW_FILE}"
assert ENG_FILE.exists(), f"Missing engineered dataset: {ENG_FILE}"
assert KW_CSV.exists(),   f"Missing keyword file: {KW_CSV}"

print("PROJECT_DIR:", PROJECT_DIR)
print("DATA_DIR:   ", DATA_DIR)
print("KW_DIR:     ", KW_DIR)
print("OUT_DIR:    ", OUT_DIR)

# -------------------------------
# Reproducibility + warnings
# -------------------------------
# One seed shared by Python's `random`, NumPy, and (further down) TensorFlow,
# scikit-learn and XGBoost models.
SEED = 42
random.seed(SEED); np.random.seed(SEED)
# Silences every warning category for the whole session, including
# deprecation and convergence warnings from the libraries used below.
warnings.filterwarnings("ignore")

# -------------------------------
# ML/DL stack
# -------------------------------
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

import tensorflow as tf
from tensorflow import keras
# Seed TensorFlow/Keras with the same SEED used for random and NumPy.
tf.keras.utils.set_random_seed(SEED)
# Best-effort GPU configuration: enable memory growth on each visible GPU.
# Any failure is only reported, so the run continues either way.
try:
    gpus = tf.config.list_physical_devices('GPU')
    for g in gpus:
        tf.config.experimental.set_memory_growth(g, True)
    print(f"TF GPU devices: {len(gpus)} (mem growth on)" if gpus else "TF on CPU")
except Exception as e:
    print("TF GPU setup note:", e)
print("TF version:", tf.__version__)

# -------------------------------
# Asset registry
# -------------------------------
# One entry per tradable asset:
#   PAIR_ID   - identifier matched against the "Pair" column of the keyword CSV
#   price_col - column holding the asset's price in the datasets
#   label     - human-readable name used in file names, logs and lookups
ASSETS = [
    {"PAIR_ID": "GC=F",      "price_col": "GC=F Close",      "label": "Gold"},
    {"PAIR_ID": "BTC-USD",   "price_col": "BTC-USD Close",   "label": "BTC"},
    {"PAIR_ID": "CL=F",      "price_col": "CL=F Close",      "label": "Oil"},
    {"PAIR_ID": "USDCNY=X",  "price_col": "USDCNY=X Close",  "label": "USDCNY"},
]
# Lookup by lower-cased label ("gold", "btc", "oil", "usdcny").
asset_by_label = {a["label"].lower(): a for a in ASSETS}

# -------------------------------
# Load + clean data (fill GT NaNs with 0)
# -------------------------------
def _load_and_clean(path: Path) -> pd.DataFrame:
    df = pd.read_csv(path, parse_dates=["Date"]).set_index("Date").sort_index()
    trend_cols = [c for c in df.columns if "trend" in c.lower()]
    if trend_cols:
        bad = [c for c in trend_cols if df[c].isna().any()]
        if bad:
            print(f"{path.name}: filling NaNs in trend cols with 0:", bad[:10], "..." if len(bad)>10 else "")
            df[bad] = df[bad].fillna(0.0)
    return df

# Load both datasets at their native frequency; resampling happens later
# in to_frequency().
df_raw0 = _load_and_clean(RAW_FILE)
df_eng0 = _load_and_clean(ENG_FILE)
if DEBUG:
    # Keep a small tail for quick runs
    df_raw0 = df_raw0.tail(1000)
    df_eng0 = df_eng0.tail(1000)
    print("DEBUG mode: using last 1000 rows (RAW/ENG).")

# -------------------------------
# Frequency control
# -------------------------------
def to_frequency(df_in: pd.DataFrame, freq: str = "D") -> pd.DataFrame:
    """
    Return the frame at the requested sampling frequency.

    "D" returns a plain copy. "W" resamples to weekly bins: registered asset
    price columns take the last value of each week, every other column takes
    the weekly mean. Price columns come first in the result, followed by the
    remaining columns in their original order.
    """
    mode = freq.upper()
    if mode == "D":
        return df_in.copy()
    assert mode == "W", "Supported: 'D' or 'W'."

    registered_prices = [a["price_col"] for a in ASSETS]
    price_lookup = set(registered_prices)

    weekly = pd.DataFrame(index=df_in.resample("W").last().index)

    # Prices: last observation of the week.
    for name in registered_prices:
        if name in df_in.columns:
            weekly[name] = df_in[name].resample("W").last()

    # Everything else: weekly average.
    for name in df_in.columns:
        if name in price_lookup:
            continue
        weekly[name] = df_in[name].resample("W").mean()
    return weekly

# Working frames at the configured FREQ; get_base_source() hands these out
# to the model runners.
df_raw = to_frequency(df_raw0, FREQ)
df_eng = to_frequency(df_eng0, FREQ)
# Report the covered date range and row count of each frame.
print("RAW frame:", df_raw.index.min().date(), "→", df_raw.index.max().date(), "| rows:", len(df_raw))
print("ENG frame:", df_eng.index.min().date(), "→", df_eng.index.max().date(), "| rows:", len(df_eng))

# -------------------------------
# Keyword utilities
# -------------------------------
def load_keywords_for_pair(csv_path: Path, pair_id: str) -> list[str]:
    """
    Return the distinct keywords listed for `pair_id` in the keyword CSV.

    The CSV must have "Pair" and "Keyword" columns. "USDCNY=X" and "CNY=X"
    are treated as the same pair. Keywords are stripped and lower-cased;
    missing keywords are skipped; first-seen order is kept.
    """
    table = pd.read_csv(csv_path)
    assert {"Pair","Keyword"}.issubset(table.columns), "Keyword CSV must have: Pair, Keyword."

    counterpart = {"USDCNY=X": "CNY=X", "CNY=X": "USDCNY=X"}
    aliases = {pair_id}
    if pair_id in counterpart:
        aliases.add(counterpart[pair_id])

    matching = table.loc[table["Pair"].isin(aliases), "Keyword"].dropna()
    normalized = matching.astype(str).str.strip().str.lower()
    return normalized.unique().tolist()

def map_keywords_to_trend_cols(all_cols: pd.Index, keywords: list[str]) -> list[str]:
    """
    Return the columns of `all_cols` named "<keyword>_trend".

    Keywords are lower-cased, stripped, and have spaces replaced by
    underscores before the suffix is added; column names are compared
    lower-cased. Columns are returned in their original order and spelling.
    """
    wanted = {
        str(keyword).lower().strip().replace(" ", "_") + "_trend"
        for keyword in keywords
    }
    return [col for col in all_cols if str(col).lower() in wanted]

# -------------------------------
# Targets + safe lag features
# -------------------------------
def make_target(df: pd.DataFrame, price_col: str) -> pd.DataFrame:
    """
    Add the one-step return and the next-step direction label.

    Columns added to a copy of `df`:
      - ret1: percentage change of `price_col` versus the previous row
      - y_up: 1 if the NEXT row's price is strictly higher than this row's,
              else 0 (integer dtype)

    Rows are dropped when the price is missing, when any column is NaN
    (this removes the first row, whose ret1 is undefined), and when the next
    price is unknown. The last row is the usual case: a comparison against
    NaN is False, which would otherwise label it as a "down" move.
    """
    out = df.copy()
    price = out[price_col]
    next_price = price.shift(-1)

    out["ret1"] = price.pct_change()

    # Keep the label undefined (NaN) wherever either side of the comparison
    # is missing, so the dropna() below removes the row instead of keeping
    # a fabricated 0.
    label_known = price.notna() & next_price.notna()
    out["y_up"] = (next_price > price).astype(float).where(label_known)

    cleaned = out.dropna(subset=[price_col]).dropna()
    return cleaned.astype({"y_up": int})

def build_trend_lag_features(df_in: pd.DataFrame, sel_cols: list[str], lag_steps=(7,14,21)) -> pd.DataFrame:
    """
    Build lagged percentage-change features for the selected columns.

    For each column `c` and lag `L` the output holds "<c>__chg<L>": the
    percentage change over L rows, shifted forward by one row so a row never
    sees its own value. Infinite and missing values become 0.0 and the
    result is clipped to [-10, 10]. The output shares `df_in`'s index.
    """
    features = {}
    for col in sel_cols:
        series = df_in[col].astype(float)
        for lag in lag_steps:
            change = (
                series.pct_change(lag)
                .shift(1)
                .replace([np.inf, -np.inf], np.nan)
                .fillna(0.0)
                .clip(lower=-10.0, upper=10.0)
            )
            features[f"{col}__chg{lag}"] = change
    return pd.DataFrame(features, index=df_in.index)

# -------------------------------
# Output structure (versioned)
# -------------------------------
from datetime import datetime
# RUN_ID = "<freq>_<debug|prod>_<timestamp>" — one output folder per run, so
# earlier runs are not overwritten. The timestamp uses the local clock.
RUN_MODE_TAG = "debug" if DEBUG else "prod"
RUN_STAMP    = datetime.now().strftime("%Y%m%d-%H%M%S")
RUN_ID       = f"{FREQ.lower()}_{RUN_MODE_TAG}_{RUN_STAMP}"

RUN_ROOT = OUT_DIR / "runs" / RUN_ID
# Stage name -> sub-folder of the run. The save_* helpers below look up
# their destination here.
STAGES = {
    "data":        RUN_ROOT / "00_data",
    "logs":        RUN_ROOT / "10_logs",
    "preds_val":   RUN_ROOT / "20_preds" / "val",
    "preds_test":  RUN_ROOT / "20_preds" / "test",
    "backtests":   RUN_ROOT / "30_backtests",
    "figs":        RUN_ROOT / "40_figs",
    "leaderboard": RUN_ROOT / "50_leaderboards",
}
for p in STAGES.values(): p.mkdir(parents=True, exist_ok=True)

# Small provenance record: the first row of the (resampled) raw frame plus
# its first and last index values.
(df_raw.head(1)
 .assign(_start=df_raw.index.min(), _end=df_raw.index.max())
 .to_csv(STAGES["data"] / f"dataset_snapshot_{RAW_FILE.name}.head1.csv"))

def _slug(s: str) -> str:
    return re.sub(r"[^a-z0-9]+", "_", s.lower()).strip("_")

def _pred_name(asset_label: str, dataset_tag: str, model: str, *, window: int | None = None, split="test") -> str:
    """
    Build a prediction file name: "<asset>_<dataset>_<model>[_w<window>].<split>.csv".

    The asset label is slugified and the model name lower-cased. The window
    part is added only for a truthy `window` (None and 0 are both skipped).
    """
    parts = [_slug(asset_label), dataset_tag, model.lower()]
    if window:
        parts.append(f"w{window}")
    stem = "_".join(parts)
    return ".".join((stem, split, "csv"))

def _log_name(asset_label: str, dataset_tag: str, run_name: str) -> str:
    """Build a log file name: "<asset>_<dataset>_<run>.<RUN_STAMP>.txt" (asset and run name slugified)."""
    asset_part = _slug(asset_label)
    run_part = _slug(run_name)
    return f"{asset_part}_{dataset_tag}_{run_part}.{RUN_STAMP}.txt"

def _fig_name(asset_label: str, tail: str) -> str:
    """Build a figure file name: "<asset>_<tail>.png" (asset label slugified, tail used as given)."""
    asset_part = _slug(asset_label)
    return f"{asset_part}_{tail}.png"

def save_txt_log(asset_label: str, dataset_tag: str, run_name: str, lines: list[str]):
    """
    Write `lines`, joined by newlines, to the run's log folder.

    The file name comes from _log_name(); the written path is returned.
    """
    target = STAGES["logs"] / _log_name(asset_label, dataset_tag, run_name)
    target.write_text("\n".join(lines))
    return target

def save_preds_df(df_pred: pd.DataFrame, asset_label: str, dataset_tag: str, model: str, *, window: int | None, split: str):
    """
    Save a predictions frame as CSV and return the written path.

    `split == "val"` goes to the validation folder; any other value goes to
    the test folder. The file name comes from _pred_name().
    """
    stage_key = "preds_val" if split == "val" else "preds_test"
    filename = _pred_name(asset_label, dataset_tag, model, window=window, split=split)
    destination = STAGES[stage_key] / filename
    df_pred.to_csv(destination)
    return destination

def save_leaderboard(df_leader: pd.DataFrame, tag: str = "metrics"):
    """Write the leaderboard to "leaderboard_<tag>.csv" (no index column) in the leaderboard folder and return the path."""
    destination = STAGES["leaderboard"] / f"leaderboard_{tag}.csv"
    df_leader.to_csv(destination, index=False)
    return destination

def save_figure(fig, asset_label: str, tail: str):
    """
    Save `fig` as a PNG (150 dpi, tight bounding box) in the figures folder.

    The figure is closed afterwards; the written path is returned.
    """
    destination = STAGES["figs"] / _fig_name(asset_label, tail)
    fig.savefig(destination, dpi=150, bbox_inches="tight")
    plt.close(fig)
    return destination

# Echo the run id and every stage folder, so the output location is visible
# in the notebook.
print("RUN_ID:", RUN_ID)
for name, path in STAGES.items(): print(f"{name:>12}: {path}")

# -------------------------------
# Models & sequence utilities
# -------------------------------
def make_purged_splits(n: int, train=0.70, valid=0.15, embargo: int = 5):
    """
    Return chronological (train, valid, test) slices over `n` samples.

    Train covers the first `train` fraction; validation ends at the
    `train + valid` fraction; test runs to the end. Validation and test each
    start `embargo` samples after the previous segment ends. A start is
    capped at its segment's end, so a segment shorter than the embargo
    becomes empty.
    """
    train_end = int(n * train)
    valid_end = int(n * (train + valid))

    valid_start = min(train_end + embargo, valid_end)
    test_start = min(valid_end + embargo, n)

    return slice(0, train_end), slice(valid_start, valid_end), slice(test_start, n)

def make_mlp(input_dim: int):
    """
    Build and compile a binary-classification MLP.

    Two hidden blocks (Dense 128 then Dense 64, each followed by batch
    normalisation and 30% dropout) feed a single sigmoid unit. Compiled with
    Adam (learning rate 1e-3), binary cross-entropy, and the "auc" and "acc"
    metrics. The Keras seed is reset first.
    """
    tf.keras.utils.set_random_seed(SEED)

    layers = [keras.layers.Input(shape=(input_dim,))]
    for width in (128, 64):
        layers.append(keras.layers.Dense(width, activation="relu"))
        layers.append(keras.layers.BatchNormalization())
        layers.append(keras.layers.Dropout(0.30))
    layers.append(keras.layers.Dense(1, activation="sigmoid"))

    model = keras.Sequential(layers)
    optimizer = keras.optimizers.Adam(1e-3)
    tracked = [keras.metrics.AUC(name="auc"), keras.metrics.BinaryAccuracy(name="acc")]
    model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=tracked)
    return model

def make_lstm(input_shape, units=64, dropout=0.2):
    """
    Build and compile a two-layer LSTM binary classifier.

    The first LSTM (`units` cells) returns the full sequence, the second
    (`units // 2` cells) returns its last state; each is followed by dropout
    and the head is a single sigmoid unit. Compiled with Adam (learning rate
    1e-3), binary cross-entropy, and the "auc" and "acc" metrics. The Keras
    seed is reset first.
    """
    tf.keras.utils.set_random_seed(SEED)

    layers = [
        keras.layers.Input(shape=input_shape),
        keras.layers.LSTM(units, return_sequences=True),
        keras.layers.Dropout(dropout),
        keras.layers.LSTM(units//2),
        keras.layers.Dropout(dropout),
        keras.layers.Dense(1, activation="sigmoid"),
    ]
    model = keras.Sequential(layers)

    optimizer = keras.optimizers.Adam(1e-3)
    tracked = [keras.metrics.AUC(name="auc"), keras.metrics.BinaryAccuracy(name="acc")]
    model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=tracked)
    return model

def build_sequences_from_extended(df_ext: pd.DataFrame, feature_cols: list[str], y_col="y_up", window=30):
    """
    Turn a feature frame into rolling windows for a sequence model.

    For every row position t >= window, the sample is the `window` feature
    rows before t (row t itself is excluded), the label is `y_col` at t, and
    the date is the index value at t.

    Returns (X, y, dates): X stacks the windows, y holds the integer labels,
    dates is a DatetimeIndex. All three are empty when the frame has no more
    than `window` rows.
    """
    values = df_ext[feature_cols].values
    targets = df_ext[y_col].astype(int).values
    stamps = df_ext.index

    positions = range(window, len(df_ext))
    windows = [values[t - window:t, :] for t in positions]
    labels = [targets[t] for t in positions]
    dates = [stamps[t] for t in positions]
    return np.asarray(windows), np.asarray(labels), pd.DatetimeIndex(dates)

def split_scale_sequences(X_seq, y_seq, train=0.70, valid=0.15):
    """
    Split windowed samples chronologically and standardise the features.

    The cut points are int(n*train) and int(n*(train+valid)); no embargo is
    applied here. A StandardScaler is fitted on the training windows only
    (flattened to rows of features) and then applied to all three parts.
    Empty parts are passed through unchanged; if the training part is empty,
    nothing is scaled.

    Returns ((X_tr, y_tr), (X_va, y_va), (X_te, y_te)).
    """
    total = len(X_seq)
    cut_train = int(total*train)
    cut_valid = int(total*(train+valid))

    X_parts = (X_seq[:cut_train], X_seq[cut_train:cut_valid], X_seq[cut_valid:])
    y_parts = (y_seq[:cut_train], y_seq[cut_train:cut_valid], y_seq[cut_valid:])

    if len(X_parts[0]) == 0:
        return tuple(zip(X_parts, y_parts))

    n_samples, n_steps, n_feats = X_parts[0].shape
    scaler = StandardScaler().fit(X_parts[0].reshape(n_samples*n_steps, n_feats))

    def _scale(block):
        # Flatten (samples, steps, features) to 2-D for the scaler, then restore.
        if len(block) == 0:
            return block
        samples, steps, feats = block.shape
        flat = block.reshape(samples*steps, feats)
        return scaler.transform(flat).reshape(samples, steps, feats)

    return tuple((_scale(X_part), y_part) for X_part, y_part in zip(X_parts, y_parts))

# -------------------------------
# Shared feature sanitiser
# -------------------------------
def sanitize_features(df_ext: pd.DataFrame, price_col: str):
    """
    Make features ML/DL-safe without touching the caller's frame:
      - Identify feature columns (everything except price, ret1, y_up)
      - Drop rows missing price or target
      - Replace ±inf→NaN, then fill NaN with 0.0
      - Clip to [-10, 10]

    The work is done on a copy, so `df_ext` is left exactly as passed in.

    Returns: (cleaned frame, feature_cols list).

    NOTE(review): clipping runs before any scaling, so raw-valued features
    larger than 10 in magnitude all collapse to ±10 — confirm this is
    intended for every column that reaches this function.
    """
    exclude = {price_col, "ret1", "y_up"}
    feature_cols = [c for c in df_ext.columns if c not in exclude]

    # Copy first: assigning the cleaned block back must not leak into the
    # frame the caller still holds.
    cleaned = df_ext.dropna(subset=[price_col, "y_up"]).copy()
    if feature_cols:
        cleaned[feature_cols] = (
            cleaned[feature_cols]
            .replace([np.inf, -np.inf], np.nan)
            .fillna(0.0)
            .clip(lower=-10.0, upper=10.0)
        )
    return cleaned, feature_cols

# -------------------------------
# Data source selector
# -------------------------------
def get_base_source(dataset_version: str) -> pd.DataFrame:
    """
    Return the working frame for a dataset version.

    Only the part before the first "_" matters (case-insensitive): "raw"
    selects df_raw, "eng" selects df_eng. Anything else raises ValueError.
    """
    prefix, _, _ = dataset_version.partition("_")
    root = prefix.lower()
    if root not in ("raw", "eng"):
        raise ValueError(f"Unknown dataset_version: {dataset_version}")
    return df_raw if root == "raw" else df_eng

# =========================================================
# NEW: per-asset engineered feature selection + leak guard
# =========================================================
def _asset_aliases(asset: dict) -> set[str]:
    """
    Tokens that can appear in column names for THIS asset:
      - PAIR_ID (e.g., "GC=F", "BTC-USD", "USDCNY=X")
      - root from price_col (e.g., "GC=F")
      - human label (e.g., "gold")
    Handles USDCNY<->CNY aliasing.
    """
    pid = str(asset["PAIR_ID"]).lower()
    root = str(asset["price_col"]).split()[0].lower()
    label = str(asset["label"]).lower()
    aliases = {pid, root, label}
    if pid == "usdcny=x":
        aliases |= {"cny=x"}
    if pid == "cny=x":
        aliases |= {"usdcny=x"}
    return aliases

def _other_assets_tokens(current: dict) -> set[str]:
    """
    Return the alias tokens of every registered asset except `current`.

    An entry of ASSETS counts as "current" when it is the same object OR
    has the same PAIR_ID. Matching by PAIR_ID matters when the caller holds
    a copy of the registry entry: with an identity-only check the asset's
    own tokens would be returned as "other", and every one of its own
    columns would then be filtered out as cross-asset leakage.
    """
    current_id = current.get("PAIR_ID")
    toks = set()
    for a in ASSETS:
        same_asset = a is current or (
            current_id is not None and a.get("PAIR_ID") == current_id
        )
        if same_asset:
            continue
        toks |= _asset_aliases(a)
    return toks

def select_engineered_for_asset(all_cols: list[str], asset: dict) -> list[str]:
    """
    Pick the engineered (non-trend) columns usable for THIS asset.

    A column is rejected when it is:
      - one of the registered price columns (exact match), or
      - a trend column (name contains "trend", case-insensitive), or
      - named after another asset (contains one of its alias tokens).
    Everything else is kept, in the original order: columns for the current
    asset and columns that mention no asset at all.
    """
    registered_prices = {a["price_col"] for a in ASSETS}
    foreign_tokens = _other_assets_tokens(asset)

    def _usable(col) -> bool:
        lowered = str(col).lower()
        if col in registered_prices or "trend" in lowered:
            return False
        return not any(tok in lowered for tok in foreign_tokens)

    return [col for col in all_cols if _usable(col)]

# -------------------------------
# DL runner (MLP + LSTM) with safe selector + leak guard + column log
# -------------------------------
def run_asset_dl(asset: dict, dataset_version: str = "raw", use_keywords: bool = True, use_dl_mode: str = "both") -> None:
    """
    Train the deep-learning models (MLP and/or LSTM) for one asset.

    Steps: build the next-step direction target, attach the per-asset
    engineered columns (plus keyword trend columns and their lag features
    when `use_keywords` is True), sanitise, drop any column named after
    another asset, then train and evaluate on purged chronological splits.
    Validation and test predictions are saved as CSV, and a text log with
    the feature list and test metrics is written when at least one model
    produced results.

    Args:
        asset: one entry of ASSETS (reads "label", "price_col", "PAIR_ID").
        dataset_version: passed to get_base_source(); also part of the
            dataset tag used in file names.
        use_keywords: add trend columns for the asset's keywords and their
            lagged %-change features ("ext" tag) or not ("base" tag).
        use_dl_mode: "mlp" or "lstm" runs that model only; any other value
            runs both.
    """
    label       = asset["label"]
    price_col   = asset["price_col"]
    dataset_tag = f"{dataset_version}_{'ext' if use_keywords else 'base'}"
    label_tag   = f"{label} [{dataset_tag}]"

    base_source = get_base_source(dataset_version)

    # Base target
    base = make_target(base_source[[price_col]], price_col).dropna(subset=[price_col, "y_up"])

    # Engineered (non-trend) features -> filtered per-asset
    non_trend_engineered = select_engineered_for_asset(list(base_source.columns), asset)

    keywords_used = 0
    if use_keywords:
        kws = load_keywords_for_pair(KW_CSV, asset["PAIR_ID"])
        trend_cols = [c for c in map_keywords_to_trend_cols(base_source.columns, kws) if c in base_source.columns]
        lag_df = build_trend_lag_features(base_source, trend_cols, lag_steps=(7,14,21))
        # Left joins keep exactly the rows of `base` (rows with a valid target).
        ext = (
            base
            .join(base_source[non_trend_engineered], how="left")
            .join(base_source[trend_cols],          how="left")
            .join(lag_df,                           how="left")
        )
        if trend_cols: ext[trend_cols] = ext[trend_cols].fillna(0.0)
        lag_cols = [c for c in ext.columns if "__chg" in c]
        if lag_cols:  ext[lag_cols]  = ext[lag_cols].fillna(0.0)
        # Counts matched trend columns, not keywords listed in the CSV.
        keywords_used = len(trend_cols)
    else:
        ext = base.join(base_source[non_trend_engineered], how="left")

    # Final sanitisation + feature list
    ext, feature_cols = sanitize_features(ext, price_col)

    # Leak guard: drop any columns that still contain other-asset tokens
    # (this also covers the trend and lag columns added above).
    other_tokens = _other_assets_tokens(asset)
    leaky = [c for c in feature_cols if any(tok in c.lower() for tok in other_tokens)]
    if leaky:
        print("[WARN] Cross-asset leakage detected, dropping:", leaky[:10], "..." if len(leaky) > 10 else "")
        keep = [c for c in feature_cols if c not in leaky]
        ext = ext[[*keep, "y_up", price_col]].copy()
        feature_cols = keep

    rows_used = len(ext)
    # model name -> (None, metrics dict); the first slot is always None here.
    results = {}
    modes = [use_dl_mode] if use_dl_mode in {"mlp","lstm"} else ["mlp","lstm"]

    for mode in modes:
        if mode == "mlp":
            X = ext[feature_cols].values
            y = ext["y_up"].astype(int).values
            tr, va, te = make_purged_splits(len(ext), train=0.70, valid=0.15, embargo=5)
            X_tr, X_va, X_te = X[tr], X[va], X[te]
            y_tr, y_va, y_te = y[tr], y[va], y[te]

            # Scaler is fitted on the training rows only.
            scaler = StandardScaler().fit(X_tr)
            X_tr, X_va, X_te = scaler.transform(X_tr), scaler.transform(X_va), scaler.transform(X_te)

            MAX_EPOCHS = 3 if DEBUG else 30
            BATCH      = 32 if DEBUG else 64
            # Stop on validation AUC and roll back to the best epoch.
            es = keras.callbacks.EarlyStopping(monitor="val_auc", mode="max", patience=5, restore_best_weights=True)

            mlp = make_mlp(X_tr.shape[1])
            mlp.fit(X_tr, y_tr, validation_data=(X_va, y_va), epochs=MAX_EPOCHS, batch_size=BATCH, callbacks=[es], verbose=0)

            # Hard predictions use a fixed 0.5 probability threshold.
            p_va = mlp.predict(X_va, verbose=0).ravel(); h_va = (p_va >= 0.5).astype(int)
            p_te = mlp.predict(X_te, verbose=0).ravel(); h_te = (p_te >= 0.5).astype(int)

            # Test-set metrics; F1 and AUC are NaN unless both classes
            # appear in the test labels.
            m = {"acc": accuracy_score(y_te, h_te) if len(y_te) else float("nan"),
                 "f1":  f1_score(y_te, h_te) if len(y_te) and len(np.unique(y_te))>1 else float("nan"),
                 "auc": roc_auc_score(y_te, p_te) if len(y_te) and len(np.unique(y_te))>1 else float("nan")}
            results["MLP"] = (None, m)

            df_val  = pd.DataFrame({"date": ext.index[va], "y_true": y_va, "y_pred": h_va, "prob_up": p_va}).set_index("date")
            df_test = pd.DataFrame({"date": ext.index[te], "y_true": y_te, "y_pred": h_te, "prob_up": p_te}).set_index("date")
            save_preds_df(df_val,  label, dataset_tag, "mlp", window=None, split="val")
            save_preds_df(df_test, label, dataset_tag, "mlp", window=None, split="test")

        elif mode == "lstm":
            WINDOW     = 30
            MAX_EPOCHS = 3 if DEBUG else 50
            BATCH      = 32 if DEBUG else 64

            Xseq, yseq, idx = build_sequences_from_extended(ext, feature_cols, "y_up", window=WINDOW)
            nseq = len(Xseq)
            if nseq < 50:
                print(f"[WARN] Not enough samples for {label_tag} (LSTM).")
            else:
                tr, va, te = make_purged_splits(nseq, train=0.70, valid=0.15, embargo=5)
                # split_scale_sequences() cuts at the same 70% / 85% points
                # but without the embargo, so the purged slices are re-based
                # onto each already-split part below.
                (X_tr_raw,y_tr_raw), (X_va_raw,y_va_raw), (X_te_raw,y_te_raw) = split_scale_sequences(Xseq, yseq, train=0.70, valid=0.15)
                i_tr_end = int(nseq*0.70); i_va_end = int(nseq*0.85)
                X_tr, y_tr = X_tr_raw[tr], y_tr_raw[tr]
                X_va, y_va = X_va_raw[va.start - i_tr_end: va.stop - i_tr_end], y_va_raw[va.start - i_tr_end: va.stop - i_tr_end]
                X_te, y_te = X_te_raw[te.start - i_va_end:],                y_te_raw[te.start - i_va_end:]

                if len(X_va)==0 or len(X_te)==0:
                    print(f"[WARN] Not enough samples after purge for {label_tag} (LSTM).")
                else:
                    es = keras.callbacks.EarlyStopping(monitor="val_auc", mode="max", patience=6, restore_best_weights=True)
                    lstm = make_lstm(input_shape=X_tr.shape[1:])
                    lstm.fit(X_tr, y_tr, validation_data=(X_va, y_va), epochs=MAX_EPOCHS, batch_size=BATCH, callbacks=[es], verbose=0)

                    p_va = lstm.predict(X_va, verbose=0).ravel(); h_va = (p_va >= 0.5).astype(int)
                    p_te = lstm.predict(X_te, verbose=0).ravel(); h_te = (p_te >= 0.5).astype(int)

                    m = {"acc": accuracy_score(y_te, h_te) if len(y_te) else float("nan"),
                         "f1":  f1_score(y_te, h_te) if len(y_te) and len(np.unique(y_te))>1 else float("nan"),
                         "auc": roc_auc_score(y_te, p_te) if len(y_te) and len(np.unique(y_te))>1 else float("nan")}
                    results["LSTM"] = (None, m)

                    # Dates come from the sequence index, sliced with the
                    # same purged slices as the samples.
                    val_idx  = idx[va]; test_idx = idx[te]
                    df_val  = pd.DataFrame({"date": val_idx,  "y_true": y_va, "y_pred": h_va, "prob_up": p_va}).set_index("date")
                    df_test = pd.DataFrame({"date": test_idx, "y_true": y_te, "y_pred": h_te, "prob_up": p_te}).set_index("date")
                    save_preds_df(df_val,  label, dataset_tag, "lstm", window=WINDOW, split="val")
                    save_preds_df(df_test, label, dataset_tag, "lstm", window=WINDOW, split="test")

    # -------------------------------
    # LOGGING with columns list
    # -------------------------------
    # The header line and the "<MODEL>: ACC=… F1=… AUC=…" lines are the
    # format build_leaderboard_from_logs() parses back.
    if results:
        lines = [
    f"{label} [{dataset_tag}] — DL {','.join(results.keys())} (use_keywords={use_keywords})",
    f"[Run] FREQ={FREQ} | DEBUG={DEBUG} | RUN_ID={RUN_ID}",
    f"[Data] Rows used: {rows_used} | Features: {len(feature_cols)}",
    f"[Extended] Keywords used: {keywords_used}",
    "[Columns]"
] + [f"    - {c}" for c in feature_cols]
        for name, (_x, met) in results.items():
            lines.append(f"{name}: ACC={met.get('acc', float('nan')):.3f} F1={met.get('f1', float('nan')):.3f} AUC={met.get('auc', float('nan')):.3f}")
        save_txt_log(label, dataset_tag, f"DL_{','.join(results.keys())}", lines)

    print(f"✅ DL finished {label_tag} | rows: {rows_used} | features: {len(feature_cols)}")

# -------------------------------
# ML runner (LR / RF / XGB) with safe selector + leak guard + column log
# -------------------------------
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# XGBoost is optional. Any failure while importing it disables the XGB model
# instead of stopping the notebook; run_asset_ml() checks _HAS_XGB.
try:
    from xgboost import XGBClassifier
    _HAS_XGB = True
except Exception:
    print("[WARN] XGBoost not available; skipping XGB.")
    _HAS_XGB = False

def run_asset_ml(asset: dict, dataset_version: str = "raw", use_keywords: bool = True) -> None:
    """
    Train the classical models (LR, RF and, if available, XGBoost) for one asset.

    Feature assembly mirrors run_asset_dl(): next-step direction target,
    per-asset engineered columns, optional keyword trend columns with their
    lag features, sanitisation, and the cross-asset leak guard. All models
    share one purged chronological split and one scaler fitted on the
    training rows. Each model's validation and test predictions are saved as
    CSV, and one text log with the feature list and test metrics is written.

    Args:
        asset: one entry of ASSETS (reads "label", "price_col", "PAIR_ID").
        dataset_version: passed to get_base_source(); also part of the
            dataset tag used in file names.
        use_keywords: add trend columns for the asset's keywords and their
            lagged %-change features ("ext" tag) or not ("base" tag).
    """
    label       = asset["label"]
    price_col   = asset["price_col"]
    dataset_tag = f"{dataset_version}_{'ext' if use_keywords else 'base'}"
    label_tag   = f"{label} [{dataset_tag}]"

    base_source = get_base_source(dataset_version)
    base = make_target(base_source[[price_col]], price_col).dropna(subset=[price_col, "y_up"])

    # Engineered (non-trend) features -> filtered per-asset
    non_trend_engineered = select_engineered_for_asset(list(base_source.columns), asset)

    keywords_used = 0
    if use_keywords:
        kws = load_keywords_for_pair(KW_CSV, asset["PAIR_ID"])
        trend_cols = [c for c in map_keywords_to_trend_cols(base_source.columns, kws) if c in base_source.columns]
        lag_df = build_trend_lag_features(base_source, trend_cols, lag_steps=(7,14,21))
        # Left joins keep exactly the rows of `base` (rows with a valid target).
        ext = (
            base
            .join(base_source[non_trend_engineered], how="left")
            .join(base_source[trend_cols],          how="left")
            .join(lag_df,                           how="left")
        )
        if trend_cols: ext[trend_cols] = ext[trend_cols].fillna(0.0)
        lag_cols = [c for c in ext.columns if "__chg" in c]
        if lag_cols:  ext[lag_cols]  = ext[lag_cols].fillna(0.0)
        # Counts matched trend columns, not keywords listed in the CSV.
        keywords_used = len(trend_cols)
    else:
        ext = base.join(base_source[non_trend_engineered], how="left")

    # Final sanitisation + feature list
    ext, feature_cols = sanitize_features(ext, price_col)

    # Leak guard: drop any columns that still contain other-asset tokens
    # (this also covers the trend and lag columns added above).
    other_tokens = _other_assets_tokens(asset)
    leaky = [c for c in feature_cols if any(tok in c.lower() for tok in other_tokens)]
    if leaky:
        print("[WARN] Cross-asset leakage detected, dropping:", leaky[:10], "..." if len(leaky) > 10 else "")
        keep = [c for c in feature_cols if c not in leaky]
        ext = ext[[*keep, "y_up", price_col]].copy()
        feature_cols = keep

    rows_used = len(ext)

    # Split + scale (scaler fitted on the training rows only)
    X = ext[feature_cols].values
    y = ext["y_up"].astype(int).values
    tr, va, te = make_purged_splits(len(ext), train=0.70, valid=0.15, embargo=5)
    X_tr, X_va, X_te = X[tr], X[va], X[te]
    y_tr, y_va, y_te = y[tr], y[va], y[te]
    scaler = StandardScaler().fit(X_tr)
    X_tr, X_va, X_te = scaler.transform(X_tr), scaler.transform(X_va), scaler.transform(X_te)

    # model name -> (None, metrics dict); the first slot is always None here.
    results = {}

    # Logistic Regression
    lr = LogisticRegression(max_iter=1000, class_weight="balanced")
    lr.fit(X_tr, y_tr)
    # Hard predictions use a fixed 0.5 probability threshold (all models).
    p_va = lr.predict_proba(X_va)[:,1]; h_va = (p_va >= 0.5).astype(int)
    p_te = lr.predict_proba(X_te)[:,1]; h_te = (p_te >= 0.5).astype(int)
    # Test-set metrics; F1 and AUC are NaN unless both classes appear in
    # the test labels.
    m = {"acc": accuracy_score(y_te,h_te) if len(y_te) else float("nan"),
         "f1":  f1_score(y_te,h_te) if len(y_te) and len(np.unique(y_te))>1 else float("nan"),
         "auc": roc_auc_score(y_te,p_te) if len(y_te) and len(np.unique(y_te))>1 else float("nan")}
    results["LR"] = (None, m)
    save_preds_df(pd.DataFrame({"date": ext.index[va], "y_true": y_va, "y_pred": h_va, "prob_up": p_va}).set_index("date"),
                  label, dataset_tag, "lr", window=None, split="val")
    save_preds_df(pd.DataFrame({"date": ext.index[te], "y_true": y_te, "y_pred": h_te, "prob_up": p_te}).set_index("date"),
                  label, dataset_tag, "lr", window=None, split="test")

    # Random Forest
    rf = RandomForestClassifier(
        n_estimators=100 if not DEBUG else 50,
        max_depth=None, n_jobs=-1, random_state=SEED,
        class_weight="balanced_subsample"
    )
    rf.fit(X_tr, y_tr)
    p_va = rf.predict_proba(X_va)[:,1]; h_va = (p_va >= 0.5).astype(int)
    p_te = rf.predict_proba(X_te)[:,1]; h_te = (p_te >= 0.5).astype(int)
    m = {"acc": accuracy_score(y_te,h_te) if len(y_te) else float("nan"),
         "f1":  f1_score(y_te,h_te) if len(y_te) and len(np.unique(y_te))>1 else float("nan"),
         "auc": roc_auc_score(y_te,p_te) if len(y_te) and len(np.unique(y_te))>1 else float("nan")}  # noqa
    results["RF"] = (None, m)
    save_preds_df(pd.DataFrame({"date": ext.index[va], "y_true": y_va, "y_pred": h_va, "prob_up": p_va}).set_index("date"),
                  label, dataset_tag, "rf", window=None, split="val")
    save_preds_df(pd.DataFrame({"date": ext.index[te], "y_true": y_te, "y_pred": h_te, "prob_up": p_te}).set_index("date"),
                  label, dataset_tag, "rf", window=None, split="test")

    # XGBoost (optional): best effort — a failure is reported and the
    # LR/RF results are still logged.
    try:
        if _HAS_XGB:
            xgb = XGBClassifier(
                n_estimators=400 if not DEBUG else 150,
                max_depth=4, learning_rate=0.05,
                subsample=0.8, colsample_bytree=0.8,
                reg_lambda=1.0, random_state=SEED,
                tree_method="hist", eval_metric="auc"
            )
            xgb.fit(X_tr, y_tr)
            p_va = xgb.predict_proba(X_va)[:,1]; h_va = (p_va >= 0.5).astype(int)
            p_te = xgb.predict_proba(X_te)[:,1]; h_te = (p_te >= 0.5).astype(int)
            m = {"acc": accuracy_score(y_te,h_te) if len(y_te) else float("nan"),
                 "f1":  f1_score(y_te,h_te) if len(y_te) and len(np.unique(y_te))>1 else float("nan"),
                 "auc": roc_auc_score(y_te,p_te) if len(y_te) and len(np.unique(y_te))>1 else float("nan")}
            results["XGB"] = (None, m)
            save_preds_df(pd.DataFrame({"date": ext.index[va], "y_true": y_va, "y_pred": h_va, "prob_up": p_va}).set_index("date"),
                          label, dataset_tag, "xgb", window=None, split="val")
            save_preds_df(pd.DataFrame({"date": ext.index[te], "y_true": y_te, "y_pred": h_te, "prob_up": p_te}).set_index("date"),
                          label, dataset_tag, "xgb", window=None, split="test")
    except Exception as e:
        print("[WARN] XGB failed:", e)

    # -------------------------------
    # LOGGING with columns list
    # -------------------------------
    # The header line and the "<MODEL>: ACC=… F1=… AUC=…" lines are the
    # format build_leaderboard_from_logs() parses back.
    if results:
        lines = [
    f"{label} [{dataset_tag}] — ML {','.join(results.keys())} (use_keywords={use_keywords})",
    f"[Run] FREQ={FREQ} | DEBUG={DEBUG} | RUN_ID={RUN_ID}",
    f"[Data] Rows used: {rows_used} | Features: {len(feature_cols)}",
    f"[Extended] Keywords used: {keywords_used}",
    "[Columns]"
] + [f"    - {c}" for c in feature_cols]
        for name, (_x, met) in results.items():
            lines.append(f"{name}: ACC={met.get('acc', float('nan')):.3f} F1={met.get('f1', float('nan')):.3f} AUC={met.get('auc', float('nan')):.3f}")
        save_txt_log(label, dataset_tag, f"ML_{','.join(results.keys())}", lines)

    print(f"✅ ML finished {label_tag} | rows: {rows_used} | features: {len(feature_cols)}")

# -------------------------------
# Leaderboard + Backtest
# -------------------------------
def build_leaderboard_from_logs(run_root: Path = OUT_DIR / "runs" / RUN_ID):
    """
    Collect the per-model test metrics from a run's text logs into one table.

    Every "*.txt" file under "<run_root>/10_logs" is scanned for a header
    line ("<asset> [<dataset>] — <run>") followed by model lines
    ("<MODEL>: ACC=… F1=… AUC=…"). Metrics written as "nan" (how the loggers
    format a NaN with ":.3f") are read back as NaN, so the model still gets
    a row instead of being dropped.

    The table is sorted by asset, dataset, then AUC and ACC descending,
    saved through save_leaderboard(), and returned. With no parsable logs
    the result is an empty table that still has all columns.

    Args:
        run_root: root folder of the run to scan (defaults to the current run).
    """
    pat_hdr = re.compile(r"^(?P<label>.+?) \[(?P<dataset>.+?)\] — (?P<run>.+)$")
    # A metric is either a plain decimal or "nan" in any letter case.
    metric = r"(?:[\d.]+|(?i:nan))"
    pat_model = re.compile(
        r"^(?P<model>LR|RF|XGB|MLP|LSTM): "
        rf"ACC=(?P<acc>{metric}) F1=(?P<f1>{metric}) AUC=(?P<auc>{metric})"
    )
    columns = ["file", "asset", "dataset", "run_type", "model", "ACC", "F1", "AUC"]

    rows = []
    for fp in sorted((run_root / "10_logs").glob("*.txt")):
        label = dataset = run = None
        # Context manager so the handle is closed even if parsing fails.
        with open(fp, "r") as fh:
            for ln in fh:
                ln = ln.strip()
                m1 = pat_hdr.match(ln)
                if m1:
                    label, dataset, run = m1.group("label"), m1.group("dataset"), m1.group("run")
                    continue
                m2 = pat_model.match(ln)
                if m2 and label and run:
                    rows.append({
                        "file": fp.name,
                        "asset": label,
                        "dataset": dataset,
                        "run_type": run,
                        "model": m2.group("model"),
                        # float() accepts "nan" in any case, so no special-casing.
                        "ACC": float(m2.group("acc")),
                        "F1":  float(m2.group("f1")),
                        "AUC": float(m2.group("auc")),
                    })

    # Explicit columns keep sort_values() working when no rows were parsed.
    df_leader = (pd.DataFrame(rows, columns=columns)
                 .sort_values(["asset","dataset","AUC","ACC"], ascending=[True,True,False,False]))
    path = save_leaderboard(df_leader, tag="metrics")
    print("Leaderboard saved:", path)
    return df_leader

def _infer_rule_from_index(idx: pd.DatetimeIndex) -> str:
    rule = pd.infer_freq(idx)
    if rule: return rule
    if len(idx) > 3:
        gaps = (idx[1:] - idx[:-1]).days
        if np.median(gaps) > 3:
            return "W"
    return "D"

def backtest_pred_file(pred_csv_path: Path, price_csv_path: Path, asset_label: str,
                       up_thr=0.55, down_thr=0.45, fee_bps=0.0005) -> pd.DataFrame:
    """Backtest a threshold long/flat/short rule on one saved prediction file.

    Parameters
    ----------
    pred_csv_path : Path
        CSV with a ``date`` column and a ``prob_up`` column.
    price_csv_path : Path
        CSV with a ``Date`` column and the asset's price column.
    asset_label : str
        Label used to look up the price column name in ``ASSETS``.
    up_thr, down_thr : float
        ``prob_up >= up_thr`` gives position +1, ``prob_up <= down_thr``
        gives -1, anything in between gives 0.
    fee_bps : float
        Cost subtracted from the period return on every row where the
        position differs from the previous row's position. It is applied as
        a plain fraction (0.0005 is 5 basis points), not divided by 10,000.

    Returns
    -------
    pd.DataFrame
        The prediction rows joined with prices plus ``pos``, ``pos_shift``,
        ``turnover``, ``pnl`` and ``cum_pnl``. If the prediction file has no
        rows that (empty) frame is returned as is; if fewer than three rows
        survive the join, an empty frame with a ``date`` index is returned.
    """
    predictions = pd.read_csv(pred_csv_path, parse_dates=["date"])
    predictions = predictions.set_index("date").sort_index()
    if predictions.empty:
        return predictions

    # Resolve which column of the price file holds this asset's price.
    label_to_price_col = {spec["label"]: spec["price_col"] for spec in ASSETS}
    close_col = label_to_price_col[asset_label]

    prices = pd.read_csv(price_csv_path, parse_dates=["Date"])
    prices = prices.set_index("Date").sort_index()
    assert close_col in prices.columns, f"{close_col} not in price file"
    prices = prices[[close_col]].rename(columns={close_col: "Close"})

    # Put prices on the same calendar as the predictions before taking returns.
    rule = _infer_rule_from_index(predictions.index)
    if rule == "D":
        aligned = prices.asfreq("D")
    else:
        aligned = prices.resample(rule).last()
    aligned["ret1"] = aligned["Close"].pct_change()

    merged = predictions.join(aligned, how="inner")
    merged = merged.dropna(subset=["prob_up", "ret1"])
    if len(merged) < 3:
        return pd.DataFrame(index=pd.DatetimeIndex([], name="date"))

    prob = merged["prob_up"]
    # The first matching condition wins, so the long threshold takes priority.
    merged["pos"] = np.select([prob >= up_thr, prob <= down_thr], [1, -1], default=0)
    # The return on a row is earned by the position chosen on the previous row.
    merged["pos_shift"] = merged["pos"].shift(1).fillna(0)
    merged["turnover"] = merged["pos"].ne(merged["pos_shift"]).astype(int)

    gross = merged["pos_shift"] * merged["ret1"]
    cost = merged["turnover"] * fee_bps
    merged["pnl"] = gross - cost
    merged["cum_pnl"] = (1 + merged["pnl"]).cumprod()
    return merged

def plot_and_save_equity(df_bt: pd.DataFrame, asset_label: str, model_tag: str):
    """Plot the ``cum_pnl`` column of a backtest frame and save the figure.

    The figure is titled ``"<asset_label> – <model_tag>"`` and handed to
    ``save_figure`` together with the asset label and a slugged file tag.
    The path returned by ``save_figure`` is printed and returned.
    """
    fig = plt.figure()
    axes = fig.gca()
    df_bt["cum_pnl"].plot(ax=axes)
    axes.set_title(f"{asset_label} – {model_tag}")
    axes.set_xlabel("Date")
    axes.set_ylabel("Equity")

    saved_to = save_figure(fig, asset_label, f"{_slug(model_tag)}_equity")
    print("Figure saved:", saved_to)
    return saved_to

def backtest_best_from_leaderboard(df_leader: pd.DataFrame, *, only_model=("LR","RF","XGB","MLP","LSTM")):
    """Backtest the top-ranked model of every (asset, dataset) pair.

    Rows of ``df_leader`` whose ``model`` is in ``only_model`` are ranked by
    AUC, then ACC (both descending) within each asset/dataset group, and the
    first row of each group is backtested against ``RAW_FILE`` prices.

    For every winner with a test prediction file and a non-empty backtest,
    the backtest frame is written as CSV under ``STAGES["backtests"]`` and an
    equity figure is saved. When at least one backtest succeeded, a summary
    (final equity and number of observations) is written to
    ``STAGES["leaderboard"] / "backtest_summary.csv"`` and printed.

    Returns nothing; results are reported through files and prints.
    """
    candidates = df_leader[df_leader["model"].isin(only_model)]
    ranked = candidates.sort_values(
        ["asset", "dataset", "AUC", "ACC"],
        ascending=[True, True, False, False],
    )
    best = ranked.groupby(["asset", "dataset"]).head(1)

    summaries = []
    for asset, dataset, model in zip(best["asset"], best["dataset"], best["model"]):
        # Only LSTM prediction files are looked up with a window (30) in their name.
        lstm_window = 30 if model == "LSTM" else None
        guess = _pred_name(asset, dataset, model, window=lstm_window, split="test")
        pred_path = STAGES["preds_test"] / guess
        if not pred_path.exists():
            print(f"Skip (no prediction file): {pred_path.name}")
            continue

        bt = backtest_pred_file(pred_path, RAW_FILE, asset,
                                up_thr=0.55, down_thr=0.45, fee_bps=0.0005)
        has_curve = (not bt.empty) and ("cum_pnl" in bt) and (not bt["cum_pnl"].empty)
        if not has_curve:
            print(f"Skip (no overlapping data after alignment): {asset} | {pred_path.name}")
            continue

        model_tag = f"{model} ({dataset})"
        out_csv = STAGES["backtests"] / f"{_slug(asset)}_{_slug(model_tag)}_backtest.csv"
        out_csv.parent.mkdir(parents=True, exist_ok=True)
        bt.to_csv(out_csv)
        plot_and_save_equity(bt, asset, model_tag)

        equity_curve = bt["cum_pnl"]
        summaries.append({
            "asset": asset,
            "dataset": dataset,
            "model": model,
            "final_equity": float(equity_curve.iloc[-1]),
            "obs": int(equity_curve.shape[0]),
        })

    if not summaries:
        print("No backtest summaries produced (nothing overlapped or files missing).")
        return

    df_sum = pd.DataFrame(summaries).sort_values(
        ["asset", "final_equity"], ascending=[True, False]
    )
    path = STAGES["leaderboard"] / "backtest_summary.csv"
    df_sum.to_csv(path, index=False)
    print("Backtest summary saved:", path)
    print(df_sum)

# =========================================================
# RUNS — choose mode via RUN_MODE ("all" or "single")
# =========================================================
# Driver for the whole pipeline. For every combination of dataset version
# ("raw", "eng") and feature set (base, keyword-extended) it trains the ML and
# DL models, then builds the leaderboard from this run's logs and backtests the
# best model per asset × dataset. Any other RUN_MODE raises ValueError, but
# only after the run configuration below has been printed.
print("=== RUN CONFIG ===")
print("FREQ:", FREQ, "| DEBUG:", DEBUG, "| RUN_ID:", RUN_ID)
print("OUTPUT ROOT:", RUN_ROOT)
print("RUN_MODE:", RUN_MODE)

if RUN_MODE == "all":
    # Every asset in asset_by_label is trained for each of the four
    # dataset/feature combinations.
    for dataset_version in ["raw", "eng"]:
        for use_kw in [False, True]:  # base first, then extended
            tag = f"{dataset_version}_{'base' if not use_kw else 'ext'}"
            print(f"\n--- {tag.upper()} — ML + DL ---")
            for _, asset in asset_by_label.items():
                # Classical ML baselines
                run_asset_ml(asset, dataset_version=dataset_version, use_keywords=use_kw)
                # Deep learning (tabular MLP + sequence LSTM)
                run_asset_dl(asset, dataset_version=dataset_version, use_keywords=use_kw, use_dl_mode="both")

    # Build leaderboard from all logs saved in this run
    df_leader = build_leaderboard_from_logs(RUN_ROOT)
    # Backtest the best model per asset × dataset flavour (includes ML + DL)
    backtest_best_from_leaderboard(df_leader, only_model=("LR","RF","XGB","MLP","LSTM"))

elif RUN_MODE == "single":
    # Same four combinations, restricted to the asset stored under PAIR_KEY.
    # NOTE(review): PAIR_KEY must be an existing key of asset_by_label;
    # presumably a missing key raises KeyError here — confirm against where
    # asset_by_label is built.
    asset = asset_by_label[PAIR_KEY]
    print("ASSET:", asset["label"])
    for dataset_version in ["raw", "eng"]:
        for use_kw in [False, True]:  # base first, then extended
            tag = f"{dataset_version}_{'base' if not use_kw else 'ext'}"
            print(f"\n--- {asset['label']} [{tag.upper()}] — ML + DL ---")
            # Classical ML
            run_asset_ml(asset, dataset_version=dataset_version, use_keywords=use_kw)
            # Deep Learning
            run_asset_dl(asset, dataset_version=dataset_version, use_keywords=use_kw, use_dl_mode="both")

    # Leaderboard and backtests are built from this run's logs only.
    df_leader = build_leaderboard_from_logs(RUN_ROOT)
    backtest_best_from_leaderboard(df_leader, only_model=("LR","RF","XGB","MLP","LSTM"))

else:
    raise ValueError("RUN_MODE must be 'all' or 'single'.")

# Summarise where each pipeline stage wrote its outputs. The count covers the
# direct children of each stage folder (non-recursive), so sub-directories are
# counted too even though the printed label says "files".
print("\n=== ARTIFACTS ===")
for name, path in STAGES.items():
    num = len(list(path.glob('*')))
    print(f"{name:>12}: {path}  ({num} files)")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
PROJECT_DIR: /content/drive/MyDrive/gt-markets
DATA_DIR:    /content/drive/MyDrive/gt-markets/data/processed
KW_DIR:      /content/drive/MyDrive/gt-markets/data/Keyword Selection
OUT_DIR:     /content/drive/MyDrive/gt-markets/outputs
TF on CPU
TF version: 2.19.0
merged_financial_trends_data_2025-09-07.csv: filling NaNs in trend cols with 0: ['entrepreneurial_trend', 'cryptocurrency_trend'] 
RAW frame: 2015-09-13 → 2025-09-07 | rows: 522
ENG frame: 2015-09-13 → 2025-09-07 | rows: 522
RUN_ID: w_prod_20250914-013201
        data: /content/drive/MyDrive/gt-markets/outputs/runs/w_prod_20250914-013201/00_data
        logs: /content/drive/MyDrive/gt-markets/outputs/runs/w_prod_20250914-013201/10_logs
   preds_val: /content/drive/MyDrive/gt-markets/outputs/runs/w_prod_20250914-013201/20_preds/val
  preds_test: /content/drive/MyDrive/gt-markets/outputs/runs/w_prod_202