<a href="https://colab.research.google.com/github/brendonhuynhbp-hub/gt-markets/blob/main/notebooks/GoogleTrends_Financial_Modeling_Backtest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Setup: Mount Drive, Paths, Libraries, Config, Helpers, and Models

In [33]:

# =====================================================
# 1) SETUP  —  one-time per Colab session
# =====================================================

# --- 1.1 Mount Google Drive & set project paths ---
# Colab-only: makes the user's Google Drive visible under /content/drive so the
# project folder below can be read and written like a local directory.
from google.colab import drive
drive.mount('/content/drive')

from pathlib import Path
# Root of the project inside Drive; every other path is derived from it.
PROJECT_DIR = Path("/content/drive/MyDrive/gt-markets")
# Folder holding the merged prices + trends CSV (see MERGED_FILE in section 1.3).
DATA_DIR    = PROJECT_DIR / "data" / "processed"
# Folder where save_run_group_txt() writes one TXT report per run.
OUT_DIR     = PROJECT_DIR / "outputs"
# Folder holding the keyword-selection CSV (see KW_CSV in section 1.3).
KW_DIR      = PROJECT_DIR / "data" / "Keyword Selection"
# Only the output folder is created on demand; the input folders must already exist.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# --- 1.2 Libraries (ML/DL + utils) ---
import pandas as pd
import numpy as np
import re
from datetime import datetime

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
!pip -q install xgboost
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

import tensorflow as tf
from tensorflow import keras

print("✅ Setup: libs loaded.")

# --- 1.3 Config (filenames & safe defaults) ---
MERGED_FILE = DATA_DIR / "merged_financial_trends_data_2025-09-07.csv"  # raw merged (prices + trends)
KW_CSV      = KW_DIR / "combined_significant_lagged_correlations.csv"   # James's keyword selection

# Fail fast, before any model is trained, if an input file is missing.
# Explicit raises (rather than `assert`) so the check still runs under `python -O`
# and the error type says what actually went wrong.
if not MERGED_FILE.exists():
    raise FileNotFoundError(f"Missing merged dataset: {MERGED_FILE}")
if not KW_CSV.exists():
    raise FileNotFoundError(f"Missing keyword file: {KW_CSV}")

# --- 1.4 Asset registry (no magic indices anywhere) ---
# One dict per tradable asset:
#   PAIR_ID   – identifier used to filter the keyword CSV; it must match exactly
#               what's in the Keyword CSV "Pair" column (load_keywords_for_pair
#               compares with ==, no trimming or case folding).
#   price_col – column of the merged dataset used to build returns and the target.
#   label     – human-readable name used in console output and report filenames.
assets = [
    {"PAIR_ID": "GC=F",    "price_col": "GC=F Close",    "label": "Gold"},
    {"PAIR_ID": "BTC-USD", "price_col": "BTC-USD Close", "label": "BTC"},
    {"PAIR_ID": "CL=F",    "price_col": "CL=F Close",    "label": "Oil"},
    {"PAIR_ID": "CNY=X",   "price_col": "CNY=X Close",   "label": "USDCNY"},
]

# Fast lookup by label (keys are the lower-cased labels: "gold", "btc", "oil", "usdcny")
asset_by_label = {a["label"].lower(): a for a in assets}

# Optional aliases so you can call run_asset_by_label("gc=f") etc.
# Keys are lower-case spellings a user might type; values are keys of asset_by_label.
asset_alias = {
    "gold": "gold", "gc=f": "gold", "xau": "gold", "xauusd": "gold",
    "btc": "btc", "bitcoin": "btc", "btc-usd": "btc",
    "oil": "oil", "cl=f": "oil", "wti": "oil",
    "usdcny": "usdcny", "cny=x": "usdcny", "cny": "usdcny",
}

def resolve_label(name: str) -> str:
    """
    Turn a user-supplied asset name into its canonical lower-case label.

    The input is trimmed and lower-cased; a falsy input (None, "") is treated as
    the empty string. Known aliases are translated through `asset_alias`; anything
    else is returned as the cleaned string so the caller can report it as unknown.
    """
    cleaned = "" if not name else name
    cleaned = cleaned.strip().lower()
    if cleaned in asset_alias:
        return asset_alias[cleaned]
    return cleaned

# --- 1.5 Reusable helpers (target, features, evaluation, saving) ---
def make_target(df: pd.DataFrame, price_col: str) -> pd.DataFrame:
    """
    Add a 1-period return and a 'next-period up' classification target.

    Columns added:
      - ret1: percentage change of `price_col` versus the previous row.
      - y_up: 1 if the NEXT row's ret1 is > 0, else 0.

    Rows whose next-period return is unknown (the final row, and any row that
    precedes a missing return) are removed: comparing NaN with 0 yields False,
    which would otherwise silently label them as "down". Afterwards every row
    that still contains a NaN in any column is dropped, so downstream code sees
    a clean panel.

    Parameters
    ----------
    df : frame containing `price_col`; it is not modified.
    price_col : name of the price column to derive returns from.

    Returns
    -------
    A new DataFrame with the extra `ret1` and `y_up` columns and no NaNs.
    """
    df = df.copy()
    df["ret1"] = df[price_col].pct_change()

    next_ret = df["ret1"].shift(-1)
    df["y_up"] = (next_ret > 0).astype(int)

    # Positional mask (not index-aligned) so duplicate index labels cannot confuse it.
    has_label = next_ret.notna().to_numpy()
    df = df.loc[has_label]

    return df.dropna().copy()

def pick_start_index(n_rows: int, floor: int = 60, frac: float = 0.30, cap: int = 500) -> int:
    """
    Pick the first row index to predict in a walk-forward evaluation.

    The candidate is `frac` of the available rows (truncated to an int). It is
    then limited from above by `cap`, so long histories don't spend most of their
    rows on the initial fit, and from below by `floor`, so the first fit always
    has a minimum warm-up. The floor wins if the two limits conflict.
    """
    proportional = int(n_rows * frac)
    capped = proportional if proportional < cap else cap
    return capped if capped > floor else floor

def walk_forward_eval(df_in: pd.DataFrame, feature_cols, model, start_index: int = 500):
    """
    Expanding-window walk-forward evaluation.

    For every row position i in [start_index, len(df_in)):
      train on rows [0:i) → predict row i.
    The scaler is re-fitted on the training rows only at each step and then
    applied to the test row, so no test information leaks into the scaling.

    Parameters
    ----------
    df_in : frame containing `feature_cols` and a binary `y_up` column.
    feature_cols : list of column names used as model inputs.
    model : estimator exposing fit(X, y) and predict_proba(X); the same instance
        is re-fitted at every step.
    start_index : position of the first row to predict; must satisfy
        0 < start_index < len(df_in).

    Returns
    -------
    (pred_df, metrics_dict)
      pred_df is indexed by "date" (the index labels of the predicted rows) with
      columns y_true, y_pred, prob_up; metrics_dict has keys "acc", "f1", "auc".
      "auc" is NaN when roc_auc_score rejects the inputs with a ValueError
      (e.g. only one class present in y_true).

    Raises
    ------
    ValueError : if `start_index` leaves no training rows or no rows to predict.
    """
    n_rows = len(df_in)
    # Guard up front: otherwise the failure surfaces deep inside sklearn as an
    # opaque "0 samples" error after (possibly) a lot of wasted work.
    if not 0 < start_index < n_rows:
        raise ValueError(
            f"start_index={start_index} leaves nothing to train on or evaluate "
            f"for a frame of {n_rows} rows (need 0 < start_index < {n_rows})."
        )

    X_all = df_in[feature_cols].values
    y_all = df_in["y_up"].values
    idxs  = df_in.index

    scaler = StandardScaler(with_mean=True, with_std=True)
    preds, probs, trues, dates = [], [], [], []

    for i in range(start_index, n_rows):
        X_train, y_train = X_all[:i], y_all[:i]
        X_test,  y_test  = X_all[i:i+1], y_all[i]

        # fit_transform re-estimates mean/std from the training window only.
        X_train_s = scaler.fit_transform(X_train)
        X_test_s  = scaler.transform(X_test)

        model.fit(X_train_s, y_train)
        p = model.predict_proba(X_test_s)[0, 1]  # column 1 of predict_proba → P(y_up = 1)

        preds.append(int(p >= 0.5))
        probs.append(p)
        trues.append(int(y_test))
        dates.append(idxs[i])

    out = pd.DataFrame({"date": dates, "y_true": trues, "y_pred": preds, "prob_up": probs}).set_index("date")
    acc = accuracy_score(out["y_true"], out["y_pred"])
    f1  = f1_score(out["y_true"], out["y_pred"])
    try:
        auc = roc_auc_score(out["y_true"], out["prob_up"])
    except ValueError:
        # AUC is undefined for these inputs; report NaN instead of aborting the run.
        auc = np.nan
    return out, {"acc": acc, "f1": f1, "auc": auc}

def save_run_group_txt(pair_label: str, run_label: str, results_dict: dict,
                       keywords: list, features_count: int, rows_used: int, note: str = ""):
    """
    Write one TXT report for a run into OUT_DIR.

    The file name is "<pair>_<run>_<timestamp>.txt", where pair/run are the
    lower-cased labels with spaces replaced by "_" (and "+" by "plus" in the run
    label) and the timestamp is local time formatted as YYYYmmdd-HHMMSS.

    Contents:
      - asset label + run label
      - timestamp, feature count, row count, optional note
      - keyword list (or 'None' when the list is empty)
      - ACC / F1 / AUC for ALL models in `results_dict`

    Parameters
    ----------
    pair_label : asset label shown in the header and used in the file name.
    run_label : run description shown in the header and used in the file name.
    results_dict : {model_name: (predictions, metrics)}; only `metrics`
        (keys "acc", "f1", "auc") is read, the first element is ignored.
    keywords : keywords to list in the report.
    features_count, rows_used : written verbatim to the report.
    note : optional free-text line; omitted when empty.
    """
    ts = datetime.now().strftime("%Y%m%d-%H%M%S")
    safe_pair = pair_label.replace(" ", "_").lower()
    safe_run  = run_label.replace(" ", "_").replace("+", "plus").lower()
    out_path  = OUT_DIR / f"{safe_pair}_{safe_run}_{ts}.txt"

    lines = [
        f"{pair_label} — {run_label}\n",
        f"Timestamp: {ts}\n",
        f"Features used: {features_count}\n",
        f"Rows used: {rows_used}\n",
    ]
    if note:
        lines.append(f"Note: {note}\n")

    lines.append("\nKeywords used:\n")
    if keywords:
        lines.extend(f"- {k}\n" for k in keywords)
    else:
        lines.append("- None\n")

    lines.append("\nResults (all models):\n")
    for model_name, (_out, met) in results_dict.items():
        lines.append(f"{model_name}: ACC={met['acc']:.3f}, F1={met['f1']:.3f}, AUC={met['auc']:.3f}\n")

    # Explicit UTF-8: the header contains an em dash and keywords may be non-ASCII,
    # so the platform default encoding must not decide whether the write succeeds.
    with open(out_path, "w", encoding="utf-8") as f:
        f.writelines(lines)

    print("📝 Saved:", out_path)

def sanitize_features(X: pd.DataFrame) -> pd.DataFrame:
    """
    Defensive clean-up of a feature matrix.

    Steps, in order:
      1. ±inf → NaN
      2. NaN → 0.0
      3. per-column clipping to that column's 0.1% and 99.9% quantiles
         (quantiles are computed after steps 1–2, over all rows passed in).
    Returns a new frame; the input is left untouched.
    """
    finite = X.replace([np.inf, -np.inf], np.nan)
    filled = finite.fillna(0.0)
    lower_bound, upper_bound = (filled.quantile(q) for q in (0.001, 0.999))
    return filled.clip(lower=lower_bound, upper=upper_bound, axis=1)

def build_trend_lag_features(frame: pd.DataFrame, cols: list) -> pd.DataFrame:
    """
    Derive lagged change features from the given trend columns.

    For each column `c` in `cols`, two features are produced (in this order):
      - c + "_chg1_lag1": 1-row % change, shifted down by one row
      - c + "_chg7_lag1": 7-row % change, shifted down by one row
    The one-row shift means the value on row t only uses data up to row t-1.
    The result shares `frame`'s index; leading rows are NaN by construction.
    """
    windows = (1, 7)
    lagged = {}
    for col in cols:
        series = frame[col]
        for window in windows:
            change = series.pct_change(window)
            lagged[col + f"_chg{window}_lag1"] = change.shift(1)  # only info available at t-1
    return pd.DataFrame(lagged, index=frame.index)

def map_keywords_to_trend_cols(df_cols: list, keywords: list) -> list:
    """
    Match keyword strings (from James's CSV) to *_trend columns present in our dataset.

    Both sides are normalized (lower-cased, everything except a-z/0-9 removed) so
    the match is resilient to underscores, spacing and punctuation. A keyword
    matches a column when its normalized form is a substring of the normalized
    column name with the trailing "_trend" suffix removed; the first matching
    column (in `df_cols` order) is used.

    Keywords that normalize to the empty string (blank or punctuation-only) are
    skipped: an empty string is a substring of everything and would otherwise
    "match" the first trend column.

    Parameters
    ----------
    df_cols : column names of the dataset (any iterable of labels).
    keywords : keyword strings to look up.

    Returns
    -------
    Matched column names, de-duplicated, in order of first match.
    """
    suffix = "_trend"

    def _norm(s):
        return re.sub(r"[^a-z0-9]+", "", str(s).lower())

    # Normalize every trend column once instead of once per keyword.
    # Only the trailing suffix is stripped, so "_trend" inside a name is preserved.
    normalized_cols = [
        (c, _norm(str(c)[:-len(suffix)]))
        for c in df_cols
        if str(c).endswith(suffix)
    ]

    matched = {}  # dict used as an insertion-ordered set
    for kw in keywords:
        key = _norm(kw)
        if not key:
            continue
        for col, col_key in normalized_cols:
            if key in col_key:
                matched[col] = None  # first match is fine
                break
    return list(matched)

def load_keywords_for_pair(csv_path: Path, pair_id: str) -> list:
    """
    Pull all keywords from James's file for the given Pair (e.g., 'GC=F').

    The CSV must contain "Pair" and "Keyword" columns. Rows are selected by exact
    equality on "Pair"; keywords are trimmed and lower-cased, missing or blank
    keywords are discarded, and duplicates are removed keeping first-seen order.

    Parameters
    ----------
    csv_path : path of the keyword CSV.
    pair_id : value to look for in the "Pair" column.

    Returns
    -------
    List of cleaned keyword strings (empty when the pair has no keywords).

    Raises
    ------
    ValueError : if the CSV lacks the "Pair" or "Keyword" column.
    """
    dfk = pd.read_csv(csv_path)
    # Explicit raise instead of `assert`: input validation must survive `python -O`.
    if not {"Pair", "Keyword"}.issubset(dfk.columns):
        raise ValueError(f"Unexpected keyword CSV columns: {dfk.columns}")

    raw = dfk.loc[dfk["Pair"] == pair_id, "Keyword"].dropna()
    cleaned = raw.astype(str).str.strip().str.lower()
    # A whitespace-only cell becomes "" after stripping; it carries no keyword.
    cleaned = cleaned[cleaned != ""]
    return cleaned.unique().tolist()

# --- 1.6 Models (define once; re-used across assets) ---
# Name → estimator. run_asset() iterates this dict for both the baseline and the
# extended run and hands each estimator to walk_forward_eval(), which calls
# .fit() on it at every step. The SAME instances are therefore re-fitted many
# times; nothing here is cloned per run.
models = {
    # Linear baseline; max_iter raised to 500.
    "LR":  LogisticRegression(max_iter=500),
    # Bagged trees; fixed seed, n_jobs=-1 passed through to scikit-learn.
    "RF":  RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1),
    # Gradient-boosted trees (histogram tree method), fixed seed.
    "XGB": XGBClassifier(
        n_estimators=500, max_depth=4, learning_rate=0.05,
        subsample=0.9, colsample_bytree=0.9, reg_lambda=1.0,
        tree_method="hist", random_state=42
    )
}

print("✅ Setup complete: config, helpers, models ready.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Setup: libs loaded.
✅ Setup complete: config, helpers, models ready.


In [37]:

# =====================================================
# 2) RUNNER — Baseline → Extended → (DL) for one asset
# =====================================================

def run_asset(asset: dict, keyword_csv: Path, use_dl: bool = True) -> None:
    """
    Run the full pipeline for a single asset:
      1) Baseline  — every numeric column except this asset's price, ret1, y_up
                     and all *_trend columns
      2) Extended  — baseline + lagged change features of the trend columns matched
                     to this asset's keywords from `keyword_csv`
      3) Deep Learning (MLP on extended features) — optional, controlled by `use_dl`
    Saves one TXT per run with metrics for ALL models of that run.

    Parameters
    ----------
    asset : dict with keys "label", "PAIR_ID" and "price_col" (see the asset registry).
    keyword_csv : path of the keyword-selection CSV passed to load_keywords_for_pair().
    use_dl : when False, step 3 is skipped.

    Returns
    -------
    None. Results are printed and written to disk via save_run_group_txt().
    If no trend column matches the asset's keywords, the function returns right
    after the baseline run (extended and DL steps are skipped).
    """
    label     = asset["label"]
    pair_id   = asset["PAIR_ID"]
    price_col = asset["price_col"]

    print(f"\n==============================\n▶️  Running {label} (Pair={pair_id})\n==============================")

    # --- Load & prep dataset ---
    # The merged CSV is re-read on every call; rows are ordered by the "Date" index.
    df = pd.read_csv(MERGED_FILE, parse_dates=["Date"]).set_index("Date").sort_index()
    # Build target for THIS asset (adds ret1 / y_up and drops rows containing NaNs)
    df_mod = make_target(df, price_col)
    print(f"[Data] Range: {df_mod.index.min().date()} → {df_mod.index.max().date()} | Rows: {len(df_mod)}")

    # --- BASELINE (technical-only) ---
    # Exclude raw price, direct return/target, and ALL *_trend columns to keep baseline clean
    all_trend_cols = [c for c in df_mod.columns if c.endswith("_trend")]
    exclude = {price_col, "ret1", "y_up"} | set(all_trend_cols)
    # "Numeric" here means: any column whose dtype is not object.
    numeric = [c for c in df_mod.columns if df_mod[c].dtype != "O"]
    # NOTE(review): everything else in the merged file becomes a baseline feature,
    # presumably including the other assets' price columns — confirm this is intended.
    baseline_cols = [c for c in numeric if c not in exclude]
    print(f"[Baseline] Features: {len(baseline_cols)}")

    start_idx = pick_start_index(len(df_mod))
    results_baseline = {}
    for name, mdl in models.items():
        # out_b: per-date predictions, m_b: {"acc", "f1", "auc"}
        out_b, m_b = walk_forward_eval(df_mod, baseline_cols, mdl, start_index=start_idx)
        results_baseline[name] = (out_b, m_b)
        print(f"  BASE {name}: ACC={m_b['acc']:.3f} F1={m_b['f1']:.3f} AUC={m_b['auc']:.3f}")

    save_run_group_txt(label, "baseline", results_baseline, keywords=[],
                       features_count=len(baseline_cols), rows_used=len(df_mod))

    # --- EXTENDED (baseline + keyword trend features) ---
    # 1) Load pair-specific keywords
    keep_keywords = load_keywords_for_pair(keyword_csv, pair_id)
    print(f"[Extended] Keywords found for {pair_id}: {len(keep_keywords)}")

    # 2) Map keywords → columns actually present in our dataset
    selected_trend_cols = map_keywords_to_trend_cols(df_mod.columns, keep_keywords)
    if not selected_trend_cols:
        # The baseline report has already been written at this point.
        print("[WARN] No trend columns matched for these keywords. Skipping extended run.")
        return

    # 3) Turn trend level into lagged changes (leak-safe), and drop sparse ones
    lag_feats = build_trend_lag_features(df_mod, selected_trend_cols)
    # Keep only features that are NaN on fewer than half of the rows.
    nan_ratio = lag_feats.isna().mean()
    kept = nan_ratio[nan_ratio < 0.50].index.tolist()
    lag_feats = lag_feats[kept]
    print(f"[Extended] New lagged trend features kept: {len(kept)}")

    # 4) Assemble extended dataset and sanitize feature matrix
    WARMUP = 8  # due to pct_change(7) then shift(1)
    df_ext = df_mod.join(lag_feats).iloc[WARMUP:].copy()
    # Remaining gaps in the trend features are treated as "no change".
    df_ext[kept] = df_ext[kept].fillna(0.0)

    # sorted(set(...)) gives a deterministic, alphabetical feature order.
    extended_cols = sorted(set(baseline_cols).union(kept))
    # NOTE(review): sanitize_features() derives its clipping quantiles from ALL rows
    # of df_ext, i.e. including rows that are later used as walk-forward / DL test
    # rows. That is a (mild) look-ahead; consider computing the bounds on the
    # initial training window only.
    df_ext[extended_cols] = sanitize_features(df_ext[extended_cols])
    print(f"[Extended] Total features (baseline + trends): {len(extended_cols)} | Rows: {len(df_ext)}")

    start_idx_ext = pick_start_index(len(df_ext))
    results_extended = {}
    for name, mdl in models.items():
        out_e, m_e = walk_forward_eval(df_ext, extended_cols, mdl, start_index=start_idx_ext)
        results_extended[name] = (out_e, m_e)
        print(f"  EXT  {name}: ACC={m_e['acc']:.3f} F1={m_e['f1']:.3f} AUC={m_e['auc']:.3f}")

    # NOTE(review): the report lists every keyword loaded for the pair, not only the
    # ones that matched a column and survived the sparsity filter.
    save_run_group_txt(label, "baseline + keywords", results_extended, keywords=keep_keywords,
                       features_count=len(extended_cols), rows_used=len(df_ext))

    # --- DEEP LEARNING (optional): simple MLP on extended features ---
    if use_dl:
        print("[DL] Training MLP on extended features…")
        X = df_ext[extended_cols].values
        y = df_ext["y_up"].astype(int).values

        # Chronological 70% / 15% / 15% split (train / validation / test); no shuffling.
        n = len(df_ext)
        i_tr, i_va = int(n * 0.70), int(n * 0.85)

        X_tr, X_va, X_te = X[:i_tr], X[i_tr:i_va], X[i_va:]
        y_tr, y_va, y_te = y[:i_tr], y[i_tr:i_va], y[i_va:]

        # Scaling statistics come from the training slice only.
        scaler = StandardScaler().fit(X_tr)
        X_tr_s, X_va_s, X_te_s = scaler.transform(X_tr), scaler.transform(X_va), scaler.transform(X_te)

        # Fixed seed so repeated runs are comparable.
        tf.keras.utils.set_random_seed(42)
        # Two hidden blocks (Dense → BatchNorm → Dropout) and a sigmoid output for P(up).
        mlp = keras.Sequential([
            keras.layers.Input(shape=(X_tr_s.shape[1],)),
            keras.layers.Dense(128, activation="relu"),
            keras.layers.BatchNormalization(),
            keras.layers.Dropout(0.3),
            keras.layers.Dense(64, activation="relu"),
            keras.layers.BatchNormalization(),
            keras.layers.Dropout(0.3),
            keras.layers.Dense(1, activation="sigmoid"),
        ])
        mlp.compile(optimizer=keras.optimizers.Adam(1e-3),
                    loss="binary_crossentropy",
                    metrics=[keras.metrics.AUC(name="auc"),
                             keras.metrics.BinaryAccuracy(name="acc")])

        # Stop when validation AUC has not improved for 5 epochs and roll back to the best weights.
        es = keras.callbacks.EarlyStopping(monitor="val_auc", mode="max", patience=5, restore_best_weights=True)
        mlp.fit(X_tr_s, y_tr, validation_data=(X_va_s, y_va), epochs=30, batch_size=64, callbacks=[es], verbose=0)

        # Evaluate once on the held-out final 15% of rows.
        p_te = mlp.predict(X_te_s, verbose=0).ravel()
        yhat = (p_te >= 0.5).astype(int)
        dl_metrics = {
            "acc": accuracy_score(y_te, yhat),
            "f1":  f1_score(y_te, yhat),
            # AUC is only computed when both classes occur in the test slice.
            "auc": roc_auc_score(y_te, p_te) if len(set(y_te)) > 1 else np.nan
        }
        print(f"  DL MLP: ACC={dl_metrics['acc']:.3f} F1={dl_metrics['f1']:.3f} AUC={dl_metrics['auc']:.3f}")

        # Save as a TXT run with a single "MLP" entry
        # (None stands in for the per-date predictions, which the writer ignores).
        save_run_group_txt(label, "DL MLP (extended)", {"MLP": (None, dl_metrics)},
                           keywords=keep_keywords, features_count=len(extended_cols), rows_used=len(df_ext))

    print(f"✅ Finished {label}.")

In [38]:
# =====================================================
# 3) CONVENIENCE — run by human-readable label
# =====================================================

def run_asset_by_label(name: str, use_dl: bool = True):
    """
    Run the pipeline for an asset identified by a human-readable name.

    Examples:
      run_asset_by_label("Gold")
      run_asset_by_label("BTC")
      run_asset_by_label("Oil")
      run_asset_by_label("USDCNY")
    Aliases like 'gc=f', 'wti', 'cny=x' also work.

    An unrecognised name prints the list of available labels and does nothing else.
    `use_dl` is forwarded unchanged to run_asset().
    """
    selected = asset_by_label.get(resolve_label(name))
    if selected is not None:
        run_asset(selected, KW_CSV, use_dl=use_dl)
        return
    known_labels = list(asset_by_label.keys())
    print(f"[!] Unknown asset '{name}'. Available: {known_labels}")

# Cell-level confirmation that the helper above is defined and ready to use.
print("✅ You can now call run_asset_by_label('Gold') etc.")

✅ You can now call run_asset_by_label('Gold') etc.


In [None]:

# =====================================================
# 4) RUNS — you control what to run today
# =====================================================

# 4.1 Run Gold end-to-end (Baseline → Extended → DL)
# This is the only call enabled in this cell; the calls below are opt-in toggles.
run_asset_by_label("Gold", use_dl=True)

# 4.2 Run BTC
# run_asset_by_label("BTC", use_dl=True)

# 4.3 Run Oil
# run_asset_by_label("Oil", use_dl=True)

# 4.4 Run USD/CNY
# run_asset_by_label("USDCNY", use_dl=True)

# TIP: Uncomment only what you want to run this session.
# Pass use_dl=False to skip the MLP step.
# All TXT outputs will appear under:
#   /content/drive/MyDrive/gt-markets/outputs/


▶️  Running Gold (Pair=GC=F)
[Data] Range: 2016-03-08 → 2020-05-18 | Rows: 985
[Baseline] Features: 40
  BASE LR: ACC=0.503 F1=0.480 AUC=0.492
