<a href="https://colab.research.google.com/github/brendonhuynhbp-hub/gt-markets/blob/update-2.1/notebooks/GoogleTrends_Financial_Modeling_Backtest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Setup: Mount Drive + Paths

In [11]:
# =========================================================
# GoogleTrends Financial Modeling — End-to-End Backtest
# - Mount Drive, paths, data load/resample
# - Feature engineering + keyword lags
# - Leak guard + NaN safety
# - ML (LR/RF/XGB) + DL (MLP/LSTM/GRU) with toggles
# - Rolling walk-forward (ML & DL) + Platt calibration
# - Backtest, logging, figs, leaderboards
# =========================================================

# -------- Global switches --------
DEBUG          = False       # True → quick run; False → full
FREQ           = "D"        # "D" or "W"
RUN_MODE       = "all"      # "all" or "single"
PAIR_KEY       = "gold"     # used when RUN_MODE == "single"

# Rolling
USE_ROLLING_ML = True
USE_ROLLING_DL = True
MAX_FOLDS      = 5
ROLL_STEP      = "test"     # "test" or integer step
DL_WINDOW      = 30         # sequence length for DL

# Calibration
CALIBRATE_PROBS = True

# Model toggles (ML)
ENABLE_LR   = True
ENABLE_RF   = True
ENABLE_XGB  = True   # set False if xgboost unavailable

# Model toggles (DL)
ENABLE_MLP  = True   # MLP over flattened window
ENABLE_LSTM = True
ENABLE_GRU  = True

# -------- Drive + paths --------
from google.colab import drive
drive.mount('/content/drive')

from pathlib import Path
import os, warnings, random, re, math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

CANDIDATE_PROJECT_DIRS = [
    Path("/content/drive/MyDrive/gt-markets"),
    Path("/content/drive/Shareddrives/gt-markets"),
]
PROJECT_DIR = next((p for p in CANDIDATE_PROJECT_DIRS if p.exists()), None)
assert PROJECT_DIR is not None, "Project directory not found."

DATA_DIR = PROJECT_DIR / "data" / "processed"
KW_DIR   = PROJECT_DIR / "data" / "Keyword Selection"
OUT_DIR  = PROJECT_DIR / "outputs"
OUT_DIR.mkdir(parents=True, exist_ok=True)

RAW_FILE = DATA_DIR / "merged_financial_trends_data_2025-09-07.csv"
ENG_FILE = DATA_DIR / "merged_financial_trends_engineered_2025-09-07.csv"
KW_CSV   = KW_DIR  / "combined_significant_lagged_correlations.csv"
assert RAW_FILE.exists() and ENG_FILE.exists() and KW_CSV.exists(), "Missing data files."

print("PROJECT_DIR:", PROJECT_DIR)
print("DATA_DIR:   ", DATA_DIR)
print("KW_DIR:     ", KW_DIR)
print("OUT_DIR:    ", OUT_DIR)

# -------- Repro + warnings --------
SEED = 42
random.seed(SEED); np.random.seed(SEED)
warnings.filterwarnings("ignore")

# -------- ML/DL stack --------
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

import tensorflow as tf
from tensorflow import keras
tf.keras.utils.set_random_seed(SEED)
try:
    gpus = tf.config.list_physical_devices('GPU')
    for g in gpus:
        tf.config.experimental.set_memory_growth(g, True)
    print(f"TF GPU devices: {len(gpus)} (mem growth on)" if gpus else "TF on CPU")
except Exception as e:
    print("TF GPU setup note:", e)
print("TF version:", tf.__version__)

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
try:
    from xgboost import XGBClassifier
    HAVE_XGB = True
except Exception:
    HAVE_XGB = False

# -------- Assets --------
ASSETS = [
    {"PAIR_ID": "GC=F",     "price_col": "GC=F Close",     "label": "Gold"},
    {"PAIR_ID": "BTC-USD",  "price_col": "BTC-USD Close",  "label": "BTC"},
    {"PAIR_ID": "CL=F",     "price_col": "CL=F Close",     "label": "Oil"},
    {"PAIR_ID": "USDCNY=X", "price_col": "USDCNY=X Close", "label": "USDCNY"},
]
asset_by_label = {a["label"].lower(): a for a in ASSETS}

# -------- Load + trend NaN fill --------
def _load_and_clean(path: Path) -> pd.DataFrame:
    df = pd.read_csv(path, parse_dates=["Date"]).set_index("Date").sort_index()
    trend_cols = [c for c in df.columns if "trend" in c.lower()]
    if trend_cols:
        bad = [c for c in trend_cols if df[c].isna().any()]
        if bad:
            print(f"{path.name}: filling NaNs in trend cols with 0:", bad[:10], "..." if len(bad)>10 else "")
            df[bad] = df[bad].fillna(0.0)
    return df

df_raw0 = _load_and_clean(RAW_FILE)
df_eng0 = _load_and_clean(ENG_FILE)
if DEBUG:
    df_raw0 = df_raw0.tail(1000)
    df_eng0 = df_eng0.tail(1000)
    print("DEBUG mode: using last 1000 rows (RAW/ENG).")

# -------- Frequency control (D/W) --------
def to_frequency(df_in: pd.DataFrame, freq: str = "D") -> pd.DataFrame:
    if freq.upper() == "D": return df_in.copy()
    assert freq.upper() == "W"
    out = pd.DataFrame(index=df_in.resample("W").last().index)
    # prices as last-of-week; non-price as mean-of-week
    price_cols = {a["price_col"] for a in ASSETS}
    for a in ASSETS:
        if a["price_col"] in df_in.columns:
            out[a["price_col"]] = df_in[a["price_col"]].resample("W").last()
    for c in df_in.columns:
        if c not in price_cols:
            out[c] = df_in[c].resample("W").mean()
    return out

df_raw = to_frequency(df_raw0, FREQ)
df_eng = to_frequency(df_eng0, FREQ)
print("RAW frame:", df_raw.index.min().date(), "→", df_raw.index.max().date(), "| rows:", len(df_raw))
print("ENG frame:", df_eng.index.min().date(), "→", df_eng.index.max().date(), "| rows:", len(df_eng))

# -------- Keywords utilities --------
def load_keywords_for_pair(csv_path: Path, pair_id: str) -> list[str]:
    d = pd.read_csv(csv_path)
    assert {"Pair","Keyword"}.issubset(d.columns)
    aliases = {pair_id}
    if pair_id == "USDCNY=X": aliases |= {"CNY=X"}
    if pair_id == "CNY=X":    aliases |= {"USDCNY=X"}
    return (d.loc[d["Pair"].isin(aliases), "Keyword"]
              .dropna().astype(str).str.strip().str.lower().unique().tolist())

def map_keywords_to_trend_cols(all_cols: pd.Index, keywords: list[str]) -> list[str]:
    norm = lambda s: str(s).lower().strip().replace(" ", "_")
    desired = {f"{norm(k)}_trend" for k in keywords}
    return [c for c in all_cols if str(c).lower() in desired]

# -------- Targets + %chg lags --------
def make_target(df: pd.DataFrame, price_col: str) -> pd.DataFrame:
    out = df.copy()
    out["ret1"] = out[price_col].pct_change()
    out["y_up"] = (out[price_col].shift(-1) > out[price_col]).astype(int)
    return out.dropna(subset=[price_col]).dropna()

def build_trend_lag_features(df_in: pd.DataFrame, sel_cols: list[str], lag_steps=(7,14,21)) -> pd.DataFrame:
    out = {}
    for c in sel_cols:
        s = df_in[c].astype(float)
        for L in lag_steps:
            chg = s.pct_change(L).shift(1)
            chg = chg.replace([np.inf, -np.inf], np.nan).fillna(0.0).clip(-10.0, 10.0)
            out[f"{c}__chg{L}"] = chg
    return pd.DataFrame(out, index=df_in.index)

# -------- Versioned outputs --------
from datetime import datetime
RUN_MODE_TAG = "debug" if DEBUG else "prod"
RUN_STAMP    = datetime.now().strftime("%Y%m%d-%H%M%S")
RUN_ID       = f"{FREQ.lower()}_{RUN_MODE_TAG}_{RUN_STAMP}"

RUN_ROOT = OUT_DIR / "runs" / RUN_ID
STAGES = {
    "data":        RUN_ROOT / "00_data",
    "logs":        RUN_ROOT / "10_logs",
    "preds_val":   RUN_ROOT / "20_preds" / "val",
    "preds_test":  RUN_ROOT / "20_preds" / "test",
    "backtests":   RUN_ROOT / "30_backtests",
    "figs":        RUN_ROOT / "40_figs",
    "leaderboard": RUN_ROOT / "50_leaderboards",
}
for p in STAGES.values(): p.mkdir(parents=True, exist_ok=True)

(df_raw.head(1)
 .assign(_start=df_raw.index.min(), _end=df_raw.index.max())
 .to_csv(STAGES["data"] / f"dataset_snapshot_{RAW_FILE.name}.head1.csv"))

def _slug(s: str) -> str:
    return re.sub(r"[^a-z0-9]+", "_", s.lower()).strip("_")

def _pred_name(asset_label: str, dataset_tag: str, model: str, *, window: int | None = None, split="test") -> str:
    bits = [_slug(asset_label), dataset_tag, model.lower()]
    if window: bits.append(f"w{window}")
    return ".".join(["_".join(bits), split, "csv"])

def _log_name(asset_label: str, dataset_tag: str, run_name: str) -> str:
    return f"{_slug(asset_label)}_{dataset_tag}_{_slug(run_name)}.{RUN_STAMP}.txt"

def _fig_name(asset_label: str, tail: str) -> str:
    return f"{_slug(asset_label)}_{tail}.png"

def save_txt_log(asset_label: str, dataset_tag: str, run_name: str, lines: list[str]):
    path = STAGES["logs"] / _log_name(asset_label, dataset_tag, run_name)
    with open(path, "w") as f: f.write("\n".join(lines))
    return path

def save_preds_df(df_pred: pd.DataFrame, asset_label: str, dataset_tag: str, model: str, *, window: int | None, split: str):
    stage = STAGES["preds_val"] if str(split).startswith("val") else STAGES["preds_test"]
    outp = stage / _pred_name(asset_label, dataset_tag, model, window=window, split=split)
    df_pred.to_csv(outp); return outp

def save_leaderboard(df_leader: pd.DataFrame, tag: str = "metrics"):
    path = STAGES["leaderboard"] / f"leaderboard_{tag}.csv"
    df_leader.to_csv(path, index=False); return path

def save_figure(fig, asset_label: str, tail: str):
    path = STAGES["figs"] / _fig_name(asset_label, tail)
    fig.savefig(path, dpi=150, bbox_inches="tight"); plt.close(fig); return path

print("=== RUN CONFIG ===")
print(f"FREQ: {FREQ} | DEBUG: {DEBUG} | RUN_ID: {RUN_ID}")
print("OUTPUT ROOT:", RUN_ROOT)
print("RUN_MODE:", RUN_MODE)
print()

# -------- Folds (purged+embargoed) --------
from collections import namedtuple
Fold = namedtuple("Fold", ["fold_id", "tr", "va", "te"])

def time_series_walk_forward_folds(
    n_samples: int,
    train_frac: float = 0.70,
    valid_frac: float = 0.15,
    test_frac:  float = 0.15,
    embargo: int = 5,
    step: str | int = "test",
    min_train: int = 100,
    max_folds: int | None = None
):
    assert abs(train_frac + valid_frac + test_frac - 1.0) < 1e-6
    folds = []
    tr_w = math.floor(n_samples * train_frac)
    va_w = math.floor(n_samples * valid_frac)
    te_w = n_samples - tr_w - va_w
    step_sz = te_w if step == "test" else max(1, int(step))
    end = tr_w + va_w + te_w
    fid = 0
    while end <= n_samples:
        tr_end = tr_w
        va_end = tr_w + va_w
        te_end = tr_w + va_w + te_w
        tr = slice(0, max(tr_end - embargo, 0))
        va = slice(min(tr_end + embargo, n_samples), max(va_end - embargo, 0))
        te = slice(min(va_end + embargo, n_samples), min(te_end, n_samples))
        if (tr.stop - tr.start) >= max(1, min_train) and \
           (va.stop - va.start) >= 5 and \
           (te.stop - te.start) >= 5 and \
           (tr.stop <= va.start) and (va.stop <= te.start):
            folds.append(Fold(fid, tr, va, te))
        tr_w += step_sz; va_w += step_sz; te_w += step_sz
        end = tr_w + va_w + te_w; fid += 1
        if max_folds is not None and len(folds) >= max_folds: break
    return folds

# -------- Calibration (Platt) --------
def platt_calibrate_on_val(p_val: np.ndarray, y_val: np.ndarray):
    if not CALIBRATE_PROBS or len(y_val) < 20 or len(np.unique(y_val)) < 2:
        return None
    try:
        cal = LogisticRegression(max_iter=250, solver="lbfgs")
        cal.fit(p_val.reshape(-1, 1), y_val.astype(int))
        return cal
    except Exception:
        return None

def apply_calibration(cal, p: np.ndarray) -> np.ndarray:
    if cal is None: return p
    try:
        return cal.predict_proba(p.reshape(-1, 1))[:, 1]
    except Exception:
        return p

# -------- Dataset assembly (per asset) --------
def _prepare_dataset_for_asset(asset: dict, dataset_root: str, use_keywords: bool):
    label      = asset["label"]
    price_col  = asset["price_col"]
    dataset_tag = f"{dataset_root}_{'ext' if use_keywords else 'base'}"
    base_source = df_raw if dataset_root == "raw" else df_eng

    base = make_target(base_source[[price_col]], price_col).dropna(subset=[price_col, "y_up"])

    price_cols_all = {a["price_col"] for a in ASSETS}
    non_trend_engineered = [c for c in base_source.columns
                            if c not in price_cols_all and "trend" not in c.lower()]

    keywords_used = 0
    if use_keywords:
        kws = load_keywords_for_pair(KW_CSV, asset["PAIR_ID"])
        trend_cols = [c for c in map_keywords_to_trend_cols(base_source.columns, kws) if c in base_source.columns]
        lag_df = build_trend_lag_features(base_source, trend_cols, lag_steps=(7,14,21))
        ext = (base
               .join(base_source[non_trend_engineered], how="left")
               .join(base_source[trend_cols],          how="left")
               .join(lag_df,                           how="left"))
        if trend_cols: ext[trend_cols] = ext[trend_cols].fillna(0.0)
        lag_cols = [c for c in ext.columns if "__chg" in c]
        if lag_cols:  ext[lag_cols] = ext[lag_cols].fillna(0.0)
        keywords_used = len(trend_cols)
    else:
        ext = base.join(base_source[non_trend_engineered], how="left")

    keep_cols = [c for c in ext.columns if c not in ("y_up", price_col) and np.issubdtype(ext[c].dtype, np.number)]
    feature_cols = sorted(keep_cols)
    ext = ext[[*feature_cols, "y_up", price_col]].copy()

    # Cross-asset leak guard
    def _asset_aliases(a: dict) -> set[str]:
        pid = str(a["PAIR_ID"]).lower()
        root = str(a["price_col"]).split()[0].lower()
        lab = str(a["label"]).lower()
        S = {pid, root, lab}
        if pid == "usdcny=x": S |= {"cny=x"}
        if pid == "cny=x":    S |= {"usdcny=x"}
        return S
    other_tokens = set().union(*[_asset_aliases(a) for a in ASSETS if a is not asset])
    leaky = [c for c in feature_cols if any(tok in c.lower() for tok in other_tokens)]
    if leaky:
        print("[WARN] Cross-asset leakage detected, dropping:", leaky[:10], "..." if len(leaky) > 10 else "")
        feature_cols = [c for c in feature_cols if c not in leaky]
        ext = ext[[*feature_cols, "y_up", price_col]].copy()

    # NaN safety on features
    if feature_cols:
        ext[feature_cols] = ext[feature_cols].ffill().bfill().fillna(0.0)

    ext = ext.dropna(subset=["y_up"])
    return label, dataset_tag, ext, feature_cols, keywords_used, price_col

# -------- DL factories (fixed ranks) --------
def make_mlp(input_shape_3d):
    # input_shape_3d = (window, features); flatten to (window*features)
    keras.utils.set_random_seed(SEED)
    return keras.Sequential([
        keras.layers.Input(shape=input_shape_3d),
        keras.layers.Flatten(),                                  # <-- ensure 2D to Dense
        keras.layers.Dense(128, activation="relu"),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(0.30),
        keras.layers.Dense(64, activation="relu"),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(0.30),
        keras.layers.Dense(1, activation="sigmoid"),
    ], name="MLP_flat")

def make_lstm(input_shape, units=64, dropout=0.20):
    keras.utils.set_random_seed(SEED)
    return keras.Sequential([
        keras.layers.Input(shape=input_shape),
        keras.layers.LSTM(units, return_sequences=True),
        keras.layers.Dropout(dropout),
        keras.layers.LSTM(units//2, return_sequences=False),     # <-- vector output
        keras.layers.Dropout(dropout),
        keras.layers.Dense(1, activation="sigmoid"),
    ], name="LSTM_seq")

def make_gru(input_shape, units=64, dropout=0.20):
    keras.utils.set_random_seed(SEED)
    return keras.Sequential([
        keras.layers.Input(shape=input_shape),
        keras.layers.GRU(units, return_sequences=True),
        keras.layers.Dropout(dropout),
        keras.layers.GRU(units//2, return_sequences=False),      # <-- vector output
        keras.layers.Dropout(dropout),
        keras.layers.Dense(1, activation="sigmoid"),
    ], name="GRU_seq")

def compile_binary(model, lr=1e-3):
    model.compile(optimizer=keras.optimizers.Adam(lr),
                  loss="binary_crossentropy",
                  metrics=[keras.metrics.AUC(name="auc"), keras.metrics.BinaryAccuracy(name="acc")])
    return model

# -------- Sequence packing --------
def build_sequences_from_extended(df_ext: pd.DataFrame, feature_cols: list[str], y_col="y_up", window=30):
    X = df_ext[feature_cols].values
    y = df_ext[y_col].astype(int).values
    idx = df_ext.index
    xs, ys, dates = [], [], []
    for t in range(window, len(df_ext)):
        xs.append(X[t-window:t, :]); ys.append(y[t]); dates.append(idx[t])
    return np.asarray(xs), np.asarray(ys), pd.DatetimeIndex(dates)

def split_scale_sequences(X_seq, y_seq, train=0.70, valid=0.15):
    n = len(X_seq); i_tr = int(n*train); i_va = int(n*(train+valid))
    X_tr, X_va, X_te = X_seq[:i_tr], X_seq[i_tr:i_va], X_seq[i_va:]
    y_tr, y_va, y_te = y_seq[:i_tr], y_seq[i_tr:i_va], y_seq[i_va:]
    if len(X_tr)==0: return (X_tr,y_tr), (X_va,y_va), (X_te,y_te)
    T,W,F = X_tr.shape
    scaler = StandardScaler().fit(X_tr.reshape(T*W,F))
    def _tf(x):
        if len(x)==0: return x
        TT,WW,FF = x.shape
        return scaler.transform(x.reshape(TT*WW,FF)).reshape(TT,WW,FF)
    return (_tf(X_tr),y_tr),(_tf(X_va),y_va),(_tf(X_te),y_te)

# -------- DL runner (MLP/LSTM/GRU) --------
def run_asset_dl(asset: dict,
                 dataset_root: str = "raw",
                 use_keywords: bool = True,
                 use_rolling: bool = USE_ROLLING_DL,
                 max_folds: int = MAX_FOLDS,
                 roll_step: str | int = ROLL_STEP,
                 window: int = DL_WINDOW):

    label, dataset_tag, ext, feature_cols, keywords_used, price_col = _prepare_dataset_for_asset(
        asset, dataset_root, use_keywords
    )
    rows_used = len(ext)

    modes = []
    if ENABLE_MLP:  modes.append("mlp")
    if ENABLE_LSTM: modes.append("lstm")
    if ENABLE_GRU:  modes.append("gru")
    if not modes:
        print(f"[SKIP] DL disabled for {label} [{dataset_tag}]")
        return

    # sequences
    Xseq, yseq, idx_seq = build_sequences_from_extended(ext, feature_cols, "y_up", window)
    nseq = len(Xseq)
    if nseq < 80:
        print(f"[WARN] {label} [{dataset_tag}]: not enough sequences (>=80 required).")
        return
    (X_tr_raw, y_tr_raw), (X_va_raw, y_va_raw), (X_te_raw, y_te_raw) = split_scale_sequences(Xseq, yseq, 0.70, 0.15)
    i_tr_end = int(nseq*0.70); i_va_end = int(nseq*0.85)

    def slice_scaled(tr: slice, va: slice, te: slice):
        X_tr = X_tr_raw[max(tr.start,0): min(tr.stop, i_tr_end)]
        y_tr = y_tr_raw[max(tr.start,0): min(tr.stop, i_tr_end)]
        va_s = max(va.start - i_tr_end, 0); va_e = max(min(va.stop - i_tr_end, len(X_va_raw)), 0)
        X_va = X_va_raw[va_s:va_e]; y_va = y_va_raw[va_s:va_e]
        te_s = max(te.start - i_va_end, 0); te_e = max(min(te.stop - i_va_end, len(X_te_raw)), 0)
        X_te = X_te_raw[te_s:te_e]; y_te = y_te_raw[te_s:te_e]
        return (X_tr, y_tr), (X_va, y_va), (X_te, y_te)

    folds = time_series_walk_forward_folds(nseq, 0.70, 0.15, 0.15, embargo=5,
                                           step=roll_step, min_train=50, max_folds=max_folds) if use_rolling \
            else [Fold(0, slice(0,int(nseq*0.70)), slice(int(nseq*0.70)+5,int(nseq*0.85)-5), slice(int(nseq*0.85)+5,nseq))]

    rows_summary = []
    for mode in modes:
        per_fold = []
        for fd in folds:
            (X_tr, y_tr), (X_va, y_va), (X_te, y_te) = slice_scaled(fd.tr, fd.va, fd.te)
            if len(X_tr)==0 or len(X_va)==0 or len(X_te)==0: continue

            if mode == "mlp":
                model = compile_binary(make_mlp(X_tr.shape[1:]), lr=1e-3)             # Flatten inside
            elif mode == "lstm":
                model = compile_binary(make_lstm(X_tr.shape[1:], units=64, dropout=0.20), lr=1e-3)
            elif mode == "gru":
                model = compile_binary(make_gru(X_tr.shape[1:], units=64, dropout=0.20), lr=1e-3)
            else:
                continue

            es = keras.callbacks.EarlyStopping(monitor="val_auc", mode="max", patience=6, restore_best_weights=True)
            EPOCHS = 3 if DEBUG else 50
            BATCH  = 32 if DEBUG else 64
            model.fit(X_tr, y_tr, validation_data=(X_va, y_va),
                      epochs=EPOCHS, batch_size=BATCH, callbacks=[es], verbose=0)

            p_va = model.predict(X_va, verbose=0).ravel()
            p_te = model.predict(X_te, verbose=0).ravel()
            cal  = platt_calibrate_on_val(p_va, y_va)
            p_teC = apply_calibration(cal, p_te)
            h_te  = (p_teC >= 0.5).astype(int)

            auc = roc_auc_score(y_te, p_teC) if len(np.unique(y_te))>1 else float("nan")
            acc = accuracy_score(y_te, h_te)
            f1  = f1_score(y_te, h_te) if len(np.unique(y_te))>1 else float("nan")
            per_fold.append({"asset": label, "dataset": dataset_tag, "model": mode.upper(),
                             "fold": fd.fold_id, "auc": auc, "acc": acc, "f1": f1,
                             "va_n": len(y_va), "te_n": len(y_te)})

            val_idx  = idx_seq[fd.va]
            test_idx = idx_seq[fd.te]
            df_val  = pd.DataFrame({"date": val_idx,  "y_true": y_va, "prob_up": p_va}).set_index("date")
            df_test = pd.DataFrame({"date": test_idx, "y_true": y_te, "prob_up": p_teC}).set_index("date")
            save_preds_df(df_val,  label, dataset_tag, mode, window=DL_WINDOW, split=f"val_f{fd.fold_id}")
            save_preds_df(df_test, label, dataset_tag, mode, window=DL_WINDOW, split=f"test_f{fd.fold_id}")

        if per_fold:
            dfm = pd.DataFrame(per_fold)
            agg = dfm.agg({"auc":"mean","acc":"mean","f1":"mean"}).to_dict()
            agg.update({"asset": label, "dataset": dataset_tag, "model": mode.upper(), "folds": len(dfm)})
            rows_summary.append(agg)

    if rows_summary:
        roll_csv = STAGES["leaderboard"] / "rolling_dl_metrics.csv"
        df_roll = pd.DataFrame(rows_summary)
        if roll_csv.exists():
            prev = pd.read_csv(roll_csv)
            df_roll = pd.concat([prev, df_roll], ignore_index=True)
        df_roll.to_csv(roll_csv, index=False)

    lines = [
        f"{label} [{dataset_tag}] — DL modes={','.join(modes)} | rolling={use_rolling} | window={DL_WINDOW}",
        f"[Run] FREQ={FREQ} | DEBUG={DEBUG} | RUN_ID={RUN_ID}",
        f"[Data] Rows: {rows_used} | Sequences: {len(Xseq)} | Folds: {len(folds)}",
        f"[Keywords] used={keywords_used}",
        "[Columns]",
        *[f"    - {c}" for c in feature_cols],
        "[Hyperparams]",
        f"    MLP 128-64, drop=0.30 (flattened), lr=1e-3, epochs={'3' if DEBUG else '50'}, batch={'32' if DEBUG else '64'}",
        f"    LSTM 64→32,  drop=0.20, lr=1e-3, epochs={'3' if DEBUG else '50'}, batch={'32' if DEBUG else '64'}",
        f"    GRU  64→32,  drop=0.20, lr=1e-3, epochs={'3' if DEBUG else '50'}, batch={'32' if DEBUG else '64'}",
        f"    Calibration: {'Platt on validation' if CALIBRATE_PROBS else 'OFF'}",
    ]
    save_txt_log(label, dataset_tag, f"DL_{','.join(modes)}", lines)
    print(f"✅ DL finished {label} [{dataset_tag}] | rows: {rows_used} | features: {len(feature_cols)}")

# -------- ML runner (LR/RF/XGB) --------
def run_asset_ml(asset: dict,
                 dataset_root: str = "raw",
                 use_keywords: bool = True,
                 use_rolling: bool = USE_ROLLING_ML,
                 max_folds: int = MAX_FOLDS,
                 roll_step: str | int = ROLL_STEP):

    label, dataset_tag, ext, feature_cols, keywords_used, price_col = _prepare_dataset_for_asset(
        asset, dataset_root, use_keywords
    )
    rows_used = len(ext)

    X = ext[feature_cols].values
    y = ext["y_up"].astype(int).values
    n = len(X)
    if n < 100:
        print(f"[WARN] {label} [{dataset_tag}]: not enough rows (>=100 required).")
        return

    folds = time_series_walk_forward_folds(n, 0.70, 0.15, 0.15, embargo=5,
                                           step=roll_step, min_train=80, max_folds=max_folds) if use_rolling else \
            [Fold(0, slice(0,int(n*0.70)-5), slice(int(n*0.70)+5, int(n*0.85)-5), slice(int(n*0.85)+5, n))]

    fitters = []

    if ENABLE_LR:
        def fit_lr(Xtr, ytr):
            pipe = Pipeline(steps=[
                ("imp", SimpleImputer(strategy="median")),
                ("scl", StandardScaler(with_mean=True, with_std=True)),
                ("clf", LogisticRegression(max_iter=1000, class_weight="balanced"))
            ]).fit(Xtr, ytr)
            return ("LR", pipe)
        fitters.append(fit_lr)

    if ENABLE_RF:
        def fit_rf(Xtr, ytr):
            pipe = Pipeline(steps=[
                ("imp", SimpleImputer(strategy="median")),
                ("clf", RandomForestClassifier(
                    n_estimators=400 if not DEBUG else 100,
                    max_depth=None, n_jobs=-1, random_state=SEED,
                    class_weight="balanced_subsample"
                ))
            ]).fit(Xtr, ytr)
            return ("RF", pipe)
        fitters.append(fit_rf)

    if ENABLE_XGB and HAVE_XGB:
        def fit_xgb(Xtr, ytr):
            pipe = Pipeline(steps=[
                ("imp", SimpleImputer(strategy="median")),
                ("clf", XGBClassifier(
                    n_estimators=600 if not DEBUG else 200,
                    max_depth=4, learning_rate=0.08,
                    subsample=0.8, colsample_bytree=0.8,
                    min_child_weight=2, reg_lambda=1.0,
                    tree_method="hist", eval_metric="auc",
                    random_state=SEED, n_jobs=-1
                ))
            ]).fit(Xtr, ytr)
            return ("XGB", pipe)
        fitters.append(fit_xgb)

    rows_summary = []
    for fit_fn in fitters:
        per_fold = []
        for fd in folds:
            tr, va, te = fd.tr, fd.va, fd.te
            Xtr, ytr = X[tr], y[tr]
            Xva, yva = X[va], y[va]
            Xte, yte = X[te], y[te]
            if len(Xtr)==0 or len(Xva)==0 or len(Xte)==0: continue

            name, model = fit_fn(Xtr, ytr)

            p_va = model.predict_proba(Xva)[:,1] if hasattr(model, "predict_proba") else model.decision_function(Xva)
            p_te = model.predict_proba(Xte)[:,1] if hasattr(model, "predict_proba") else model.decision_function(Xte)

            cal   = platt_calibrate_on_val(p_va, yva)
            p_teC = apply_calibration(cal, p_te)
            h_te  = (p_teC >= 0.5).astype(int)

            auc = roc_auc_score(yte, p_teC) if len(np.unique(yte))>1 else float("nan")
            acc = accuracy_score(yte, h_te)
            f1  = f1_score(yte, h_te) if len(np.unique(yte))>1 else float("nan")
            per_fold.append({"asset": label, "dataset": dataset_tag, "model": name,
                             "fold": fd.fold_id, "auc": auc, "acc": acc, "f1": f1,
                             "va_n": len(yva), "te_n": len(yte)})

            va_idx = ext.index[va]; te_idx = ext.index[te]
            df_val  = pd.DataFrame({"date": va_idx, "y_true": yva, "prob_up": p_va}).set_index("date")
            df_test = pd.DataFrame({"date": te_idx, "y_true": yte, "prob_up": p_teC}).set_index("date")
            save_preds_df(df_val,  label, dataset_tag, name.lower(), window=None, split=f"val_f{fd.fold_id}")
            save_preds_df(df_test, label, dataset_tag, name.lower(), window=None, split=f"test_f{fd.fold_id}")

        if per_fold:
            dfm = pd.DataFrame(per_fold)
            agg = dfm.agg({"auc":"mean","acc":"mean","f1":"mean"}).to_dict()
            agg.update({"asset": label, "dataset": dataset_tag, "model": dfm["model"].iloc[0], "folds": len(dfm)})
            rows_summary.append(agg)

    if rows_summary:
        roll_csv = STAGES["leaderboard"] / "rolling_ml_metrics.csv"
        df_roll = pd.DataFrame(rows_summary)
        if roll_csv.exists():
            prev = pd.read_csv(roll_csv)
            df_roll = pd.concat([prev, df_roll], ignore_index=True)
        df_roll.to_csv(roll_csv, index=False)

    lines = [
        f"{label} [{dataset_tag}] — ML (rolling={use_rolling})",
        f"[Run] FREQ={FREQ} | DEBUG={DEBUG} | RUN_ID={RUN_ID}",
        f"[Data] Rows: {rows_used} | Folds: {len(folds)} | MaxFolds: {max_folds}",
        f"[Keywords] used={keywords_used}",
        "[Columns]",
        *[f"    - {c}" for c in feature_cols],
        "[Hyperparams]",
        "    LR: StandardScaler + median imputer + lbfgs, class_weight=balanced",
        f"    RF: median imputer + n_estimators={400 if not DEBUG else 100}, class_weight=balanced_subsample",
        ("    XGB: median imputer + n_estimators={}; depth=4; lr=0.08; subsample=0.8; colsample=0.8; min_child_weight=2"
         .format(600 if not DEBUG else 200)) if (ENABLE_XGB and HAVE_XGB) else "    XGB: not available",
        f"    Calibration: {'Platt on validation' if CALIBRATE_PROBS else 'OFF'}",
    ]
    save_txt_log(label, dataset_tag, "ML", lines)
    print(f"✅ ML finished {label} [{dataset_tag}] | rows: {rows_used} | features: {len(feature_cols)}")

# -------- Backtest + figs --------
def equity_curve_from_probs(df_probs: pd.DataFrame, price_series: pd.Series, entry_col="prob_up", thresh=0.5):
    df = df_probs.copy().join(price_series, how="left")
    ret = price_series.pct_change().shift(-1)
    signal = (df[entry_col] >= thresh).astype(int)
    eq = (1 + (signal * ret).fillna(0.0)).cumprod()
    return eq

def summarize_backtests():
    rows = []
    for p in sorted((STAGES["preds_test"]).glob("*.csv")):
        parts = p.name.split(".")
        if len(parts) < 3 or not parts[1].startswith("test"): continue

        stem = parts[0]                   # <asset>_<dataset>_<model>[_wXX]
        tokens = stem.split("_")
        asset_slug = tokens[0]
        model = tokens[-1].split("_w")[0]
        dataset = "_".join(tokens[1:-1]) if len(tokens) > 2 else "unknown"

        d = pd.read_csv(p)
        if "date" not in d.columns: d.rename(columns={d.columns[0]: "date"}, inplace=True)
        d["date"] = pd.to_datetime(d["date"]); d.set_index("date", inplace=True)
        if not {"prob_up","y_true"}.issubset(d.columns): continue

        label = next((a for a in ASSETS if asset_slug == _slug(a["label"])), None)
        if label is None: continue
        base_source = df_raw if dataset.startswith("raw") else df_eng
        price = base_source[label["price_col"]].reindex(d.index).ffill()

        eq = equity_curve_from_probs(d, price, "prob_up", 0.5)
        rows.append({"asset": label["label"], "dataset": dataset, "model": model.upper(),
                     "final_equity": float(eq.iloc[-1]) if len(eq) else np.nan,
                     "obs": int(len(eq))})

        fig, ax = plt.subplots(figsize=(6,3))
        eq.plot(ax=ax); ax.set_title(f"{label['label']} — {dataset} — {model.upper()} equity")
        ax.grid(True, alpha=0.3)
        save_figure(fig, label["label"].lower(), f"{model.lower()}_{dataset}_equity")

    if rows:
        dfb = pd.DataFrame(rows).sort_values(["asset","dataset","final_equity"],
                                             ascending=[True, True, False])
        path = STAGES["leaderboard"] / "backtest_summary.csv"
        dfb.to_csv(path, index=False)
        print("Backtest summary saved:", path)
        with pd.option_context('display.max_rows', None, 'display.width', 120):
            print(dfb.head(50))
    else:
        print("No backtests found under test predictions.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
PROJECT_DIR: /content/drive/MyDrive/gt-markets
DATA_DIR:    /content/drive/MyDrive/gt-markets/data/processed
KW_DIR:      /content/drive/MyDrive/gt-markets/data/Keyword Selection
OUT_DIR:     /content/drive/MyDrive/gt-markets/outputs
TF on CPU
TF version: 2.19.0
merged_financial_trends_data_2025-09-07.csv: filling NaNs in trend cols with 0: ['entrepreneurial_trend', 'cryptocurrency_trend'] 
RAW frame: 2015-09-08 → 2025-09-05 | rows: 2609
ENG frame: 2015-09-08 → 2025-09-05 | rows: 2609
=== RUN CONFIG ===
FREQ: D | DEBUG: False | RUN_ID: d_prod_20250914-055550
OUTPUT ROOT: /content/drive/MyDrive/gt-markets/outputs/runs/d_prod_20250914-055550
RUN_MODE: all



In [None]:
# =========================================================
# Driver: iterate dataset roots × keyword usage × assets
# =========================================================
grid = [
    ("raw", False),  # raw_base
    ("raw", True),   # raw_ext
    ("eng", False),  # eng_base
    ("eng", True),   # eng_ext
]

assets_to_run = [asset_by_label[PAIR_KEY]] if RUN_MODE == "single" else ASSETS

for dataset_root, use_kw in grid:
    tag = f"{dataset_root}_{'ext' if use_kw else 'base'}"
    print(f"--- {tag} — ML + DL ---")
    for asset in assets_to_run:
        run_asset_ml(asset,
                     dataset_root=dataset_root,
                     use_keywords=use_kw,
                     use_rolling=USE_ROLLING_ML,
                     max_folds=MAX_FOLDS,
                     roll_step=ROLL_STEP)
        run_asset_dl(asset,
                     dataset_root=dataset_root,
                     use_keywords=use_kw,
                     use_rolling=USE_ROLLING_DL,
                     max_folds=MAX_FOLDS,
                     roll_step=ROLL_STEP,
                     window=DL_WINDOW)

# Leaderboards + figs
summarize_backtests()

print("\n=== ARTIFACTS ===")
for k, v in STAGES.items():
    try:
        n = len(list(v.glob("*"))) if v.exists() else 0
        print(f"{k:>12}: {v}  ({n} files)")
    except Exception:
        print(f"{k:>12}: {v}")


--- raw_base — ML + DL ---
[WARN] Cross-asset leakage detected, dropping: ['BTC-USD 14-day RSI', 'BTC-USD 20-day simple moving average', 'BTC-USD 30-day Rolling Volatility', 'BTC-USD High', 'BTC-USD Log Returns', 'BTC-USD Low', 'BTC-USD Open', 'CL=F 14-day RSI', 'CL=F 20-day simple moving average', 'CL=F 30-day Rolling Volatility'] ...
✅ ML finished Gold [raw_base] | rows: 2608 | features: 16
[WARN] Cross-asset leakage detected, dropping: ['BTC-USD 14-day RSI', 'BTC-USD 20-day simple moving average', 'BTC-USD 30-day Rolling Volatility', 'BTC-USD High', 'BTC-USD Log Returns', 'BTC-USD Low', 'BTC-USD Open', 'CL=F 14-day RSI', 'CL=F 20-day simple moving average', 'CL=F 30-day Rolling Volatility'] ...
