<a href="https://colab.research.google.com/github/brendonhuynhbp-hub/gt-markets/blob/main/notebooks/GoogleTrends_Financial_Modeling_Backtest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Setup: Mount Drive + Paths

In [50]:

# =====================================================
# 1) SETUP  —  run once per Colab session
# =====================================================

# --- 1.1 Mount Google Drive & project paths ---
# Colab-only: makes the user's Drive visible under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

from pathlib import Path
# All inputs and outputs live under one project folder on Drive.
PROJECT_DIR = Path("/content/drive/MyDrive/gt-markets")
DATA_DIR    = PROJECT_DIR / "data" / "processed"          # merged RAW / ENGINEERED CSVs
KW_DIR      = PROJECT_DIR / "data" / "Keyword Selection"  # keyword-selection CSV
OUT_DIR     = PROJECT_DIR / "outputs"                     # per-run TXT reports
# Create the output folder up front so later writes cannot fail on a missing directory.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# --- 1.2 Libraries ---
import pandas as pd
import numpy as np
import re
from datetime import datetime

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
!pip -q install xgboost
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

import tensorflow as tf
from tensorflow import keras

print("✅ Libraries loaded.")


# --- 1.3 Data files (RAW vs ENGINEERED) ---
# The filenames embed a date stamp; update them here when a newer export is used.
RAW_FILE = DATA_DIR / "merged_financial_trends_data_2025-09-07.csv"
ENG_FILE = DATA_DIR / "merged_financial_trends_engineered_2025-09-07.csv"
KW_CSV   = KW_DIR   / "combined_significant_lagged_correlations.csv"

# Fail fast at setup time if any input file is missing, rather than midway through a run.
assert RAW_FILE.exists(), f"Missing RAW dataset: {RAW_FILE}"
assert ENG_FILE.exists(), f"Missing ENGINEERED dataset: {ENG_FILE}"
assert KW_CSV.exists(),  f"Missing keyword file: {KW_CSV}"

# Registry of dataset versions; the keys are the values accepted by get_dataset_path().
DATASETS = {"raw": RAW_FILE, "engineered": ENG_FILE}

def get_dataset_path(dataset_version: str) -> Path:
    """
    Look up the CSV path registered in DATASETS for a dataset version.

    The lookup is case-insensitive and ignores surrounding whitespace; a falsy
    value (None or "") falls back to 'raw'. Raises ValueError for any name
    that is not a key of DATASETS.
    """
    normalized = (dataset_version or "raw").strip().lower()
    if normalized in DATASETS:
        return DATASETS[normalized]
    raise ValueError(f"Unknown dataset_version '{dataset_version}'. Use one of {list(DATASETS.keys())}.")

def tag_label(label: str, dataset_version: str) -> str:
    """Append the dataset version in square brackets to a label, e.g. 'Gold [raw]'."""
    suffix = f"[{dataset_version}]"
    return f"{label} {suffix}"


# --- 1.4 Asset registry (no magic numbers) ---
# Each entry describes one tradable asset:
#   PAIR_ID   - value compared against the "Pair" column of the keyword CSV
#   price_col - dataset column holding the close price used to build the target
#   label     - short human-readable name used in console output and report names
assets = [
    {"PAIR_ID": "GC=F",    "price_col": "GC=F Close",    "label": "Gold"},
    {"PAIR_ID": "BTC-USD", "price_col": "BTC-USD Close", "label": "BTC"},
    {"PAIR_ID": "CL=F",    "price_col": "CL=F Close",    "label": "Oil"},
    {"PAIR_ID": "CNY=X",   "price_col": "CNY=X Close",   "label": "USDCNY"},
]
# Lookup by lower-cased label ("gold", "btc", "oil", "usdcny").
asset_by_label = {a["label"].lower(): a for a in assets}
# Accepted spellings (lower-case) mapped to the canonical keys of asset_by_label.
asset_alias = {
    "gold":"gold","gc=f":"gold","xau":"gold","xauusd":"gold",
    "btc":"btc","bitcoin":"btc","btc-usd":"btc",
    "oil":"oil","cl=f":"oil","wti":"oil",
    "usdcny":"usdcny","cny=x":"usdcny","cny":"usdcny",
}

def resolve_label(name: str) -> str:
    """
    Normalize a user-supplied asset name to its canonical lower-case label.

    Known aliases are translated through asset_alias; anything else is
    returned trimmed and lower-cased so the caller can report it as unknown.
    """
    cleaned = name.strip().lower() if name else ""
    return asset_alias.get(cleaned, cleaned)

# --- 1.5 Utilities: target, features, eval, saving ---
def make_target(df: pd.DataFrame, price_col: str) -> pd.DataFrame:
    """
    Add a 1-day return column and a next-day direction label.

    Columns added:
      ret1 - percentage change of `price_col` versus the previous row
      y_up - 1 if the NEXT row's ret1 is > 0, else 0 (integer dtype)

    Rows containing NaN in ANY column are dropped so downstream code is clean.
    This includes the first row (no previous price) and the last row (no
    next-day return, hence no valid label).

    Args:
        df: price/feature frame; not modified (a copy is returned).
        price_col: name of the price column used to compute returns.

    Returns:
        A new DataFrame with the two extra columns and NaN rows removed.
    """
    out = df.copy()
    out["ret1"] = out[price_col].pct_change()

    next_ret = out["ret1"].shift(-1)
    # `NaN > 0` evaluates to False, which would silently label the final row
    # (whose next-day return is unknown) as "down". Mask unknown labels to NaN
    # so that dropna() removes them instead of keeping a fabricated 0.
    out["y_up"] = (next_ret > 0).astype(float).where(next_ret.notna())

    out = out.dropna().copy()
    out["y_up"] = out["y_up"].astype(int)
    return out

def pick_start_index(n_rows: int, floor: int = 60, frac: float = 0.30, cap: int = 500) -> int:
    """
    Pick the first row index to predict in a walk-forward evaluation.

    The initial training window is roughly `frac` of the data, limited to at
    most `cap` rows and then raised to at least `floor` rows (the floor wins
    if it is larger than the cap).
    """
    proportional = int(n_rows * frac)
    capped = proportional if proportional < cap else cap
    return capped if capped > floor else floor

def walk_forward_eval(df_in: pd.DataFrame, feature_cols, model, start_index: int = 500):
    """
    Expanding-window walk-forward evaluation of a binary classifier.

    For every i in [start_index, len(df_in)):
      train on rows [0:i) -> predict row i.
    The scaler is re-fit on the training rows only at each step and then
    applied to the single test row, so no test information leaks into scaling.

    Args:
        df_in: frame containing `feature_cols` and a binary "y_up" column.
        feature_cols: column names used as model inputs.
        model: estimator exposing fit(X, y) and predict_proba(X); it is
            re-fit in place at every step.
        start_index: position of the first row to predict; must satisfy
            0 < start_index < len(df_in).

    Returns:
        (pred_df, metrics): pred_df is indexed by date with columns
        y_true / y_pred / prob_up; metrics is {"acc", "f1", "auc"} where
        auc is NaN when it cannot be computed.

    Raises:
        ValueError: if start_index leaves no training rows or no rows to predict.
    """
    n_rows = len(df_in)
    # Validate up front: an empty training window or an empty prediction range
    # would otherwise surface later as an opaque scaler/metric error.
    if not 0 < start_index < n_rows:
        raise ValueError(
            f"start_index={start_index} is out of range: need "
            f"0 < start_index < len(df_in)={n_rows}."
        )

    # Only read from df_in below, so no defensive copy is needed.
    X_all = df_in[feature_cols].values
    y_all = df_in["y_up"].values
    idxs  = df_in.index

    scaler = StandardScaler(with_mean=True, with_std=True)
    preds, probs, trues, dates = [], [], [], []

    for i in range(start_index, n_rows):
        X_train, y_train = X_all[:i], y_all[:i]
        X_test,  y_test  = X_all[i:i+1], y_all[i]

        # Fit scaling statistics on the past only, then apply to the test row.
        X_train_s = scaler.fit_transform(X_train)
        X_test_s  = scaler.transform(X_test)

        model.fit(X_train_s, y_train)
        p    = model.predict_proba(X_test_s)[0, 1]  # probability of class 1 ("up")
        yhat = int(p >= 0.5)

        preds.append(yhat)
        probs.append(p)
        trues.append(int(y_test))
        dates.append(idxs[i])

    out = pd.DataFrame({"date": dates, "y_true": trues, "y_pred": preds, "prob_up": probs}).set_index("date")
    acc = accuracy_score(out["y_true"], out["y_pred"])
    f1  = f1_score(out["y_true"], out["y_pred"])
    try:
        auc = roc_auc_score(out["y_true"], out["prob_up"])
    except ValueError:
        # AUC is undefined when y_true contains a single class; report NaN
        # but let any other (unexpected) error propagate.
        auc = np.nan
    return out, {"acc": acc, "f1": f1, "auc": auc}

def save_run_group_txt(pair_label: str, run_label: str, results_dict: dict,
                       keywords: list, features_count: int, rows_used: int, note: str = ""):
    """
    Write ONE TXT report per run into OUT_DIR and print its path.

    The report contains:
      - asset label + run label, and a timestamp
      - number of features and rows used (+ optional note)
      - the keyword list (or 'None' when empty)
      - ACC / F1 / AUC for every model in `results_dict`

    Args:
        pair_label: asset label shown in the header and used in the filename.
        run_label: run description shown in the header and used in the filename.
        results_dict: {model_name: (predictions, metrics_dict)}; only the
            metrics dict is read (keys "acc", "f1", "auc"; missing -> nan).
        keywords: keywords to list; falsy means none were used.
        features_count: number of feature columns used.
        rows_used: number of data rows used.
        note: optional free-text line.
    """
    ts = datetime.now().strftime("%Y%m%d-%H%M%S")
    safe_pair = pair_label.replace(" ", "_").lower()
    safe_run  = run_label.replace(" ", "_").replace("+", "plus").lower()
    out_path  = OUT_DIR / f"{safe_pair}_{safe_run}_{ts}.txt"

    # Build the whole report first so a formatting error cannot leave a
    # half-written file behind.
    lines = [
        f"{pair_label} — {run_label}\n",
        f"Timestamp: {ts}\n",
        f"Features used: {features_count}\n",
        f"Rows used: {rows_used}\n",
    ]
    if note:
        lines.append(f"Note: {note}\n")

    lines.append("\nKeywords used:\n")
    if keywords:
        lines.extend(f"- {k}\n" for k in keywords)
    else:
        lines.append("- None\n")

    lines.append("\nResults (all models):\n")
    for model_name, (_out, met) in results_dict.items():
        acc = met.get("acc", float("nan"))
        f1  = met.get("f1",  float("nan"))
        auc = met.get("auc", float("nan"))
        lines.append(f"{model_name}: ACC={acc:.3f}, F1={f1:.3f}, AUC={auc:.3f}\n")

    # Explicit UTF-8: the header contains a non-ASCII dash, and the platform
    # default encoding is not guaranteed to be able to represent it.
    with open(out_path, "w", encoding="utf-8") as f:
        f.writelines(lines)

    print("📝 Saved:", out_path)

def sanitize_features(X: pd.DataFrame) -> pd.DataFrame:
    """
    Return a cleaned copy of a numeric feature frame.

    Steps, in order:
      1. turn +inf / -inf into NaN
      2. replace every NaN in the frame with 0.0
      3. clip each column to its own 0.1% and 99.9% quantiles
    The input frame is left untouched.
    """
    finite_only = X.replace([np.inf, -np.inf], np.nan)
    filled = finite_only.fillna(0.0)
    # Per-column bounds are computed after cleaning, so the zeros inserted
    # above take part in the quantiles.
    lower_bound = filled.quantile(0.001)
    upper_bound = filled.quantile(0.999)
    return filled.clip(lower=lower_bound, upper=upper_bound, axis=1)

def build_trend_lag_features(frame: pd.DataFrame, cols: list) -> pd.DataFrame:
    """
    Derive lagged percentage-change features from level series.

    For each column in `cols` two features are produced, both shifted down by
    one row so that the value on a given row only uses earlier rows:
      <col>_chg1_lag1 - 1-period % change
      <col>_chg7_lag1 - 7-period % change
    The result shares `frame`'s index and contains only the derived columns.
    """
    horizons = (("_chg1_lag1", 1), ("_chg7_lag1", 7))
    derived = {}
    for col in cols:
        level = frame[col]
        for suffix, periods in horizons:
            derived[col + suffix] = level.pct_change(periods).shift(1)
    return pd.DataFrame(derived, index=frame.index)

def map_keywords_to_trend_cols(df_cols: list, keywords: list) -> list:
    """
    Map keyword strings to the *_trend columns present in the dataset.

    Both sides are normalized (lower-cased, everything except a-z / 0-9
    removed) so that underscores, spaces and symbols do not prevent a match.
    For each keyword an exact match on the normalized column stem is
    preferred; otherwise the first column whose stem contains the keyword is
    used. Keywords that normalize to an empty string are ignored, because an
    empty string is a substring of every column name.

    Args:
        df_cols: all column names of the dataset.
        keywords: keyword strings to look up.

    Returns:
        Matched column names, de-duplicated, in order of first match.
    """
    suffix = "_trend"

    def _norm(s):
        return re.sub(r"[^a-z0-9]+", "", str(s).lower())

    # Normalize every candidate column once (strip only the trailing suffix,
    # not interior occurrences) instead of once per keyword.
    candidates = [
        (c, _norm(str(c)[:-len(suffix)]))
        for c in df_cols
        if str(c).endswith(suffix)
    ]

    matched = []
    for kw in keywords:
        needle = _norm(kw)
        if not needle:
            continue  # "" would match every column
        hit = next((c for c, stem in candidates if stem == needle), None)
        if hit is None:
            hit = next((c for c, stem in candidates if needle in stem), None)
        if hit is not None:
            matched.append(hit)

    # De-duplicate while preserving first-seen order.
    return list(dict.fromkeys(matched))

def load_keywords_for_pair(csv_path: Path, pair_id: str) -> list:
    """
    Read the keyword CSV and return the keywords listed for one Pair ID.

    The CSV must have "Pair" and "Keyword" columns. Rows whose Pair equals
    `pair_id` (e.g. 'GC=F') are selected; missing keywords are skipped and the
    rest are trimmed, lower-cased and de-duplicated in first-seen order.
    """
    table = pd.read_csv(csv_path)
    assert {"Pair","Keyword"}.issubset(table.columns), f"Unexpected keyword CSV columns: {table.columns}"
    is_pair = table["Pair"] == pair_id
    present = table.loc[is_pair, "Keyword"].dropna()
    cleaned = present.astype(str).str.strip().str.lower()
    return cleaned.unique().tolist()


# --- 1.6 Models (re-used across assets) ---
# These are single shared estimator instances: run_asset() passes the same
# objects to walk_forward_eval(), which calls fit() on them at every step.
# random_state is fixed on RF and XGB for reproducibility.
models = {
    "LR":  LogisticRegression(max_iter=500),
    "RF":  RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1),
    "XGB": XGBClassifier(
        n_estimators=500, max_depth=4, learning_rate=0.05,
        subsample=0.9, colsample_bytree=0.9, reg_lambda=1.0,
        tree_method="hist", random_state=42
    )
}

print("✅ Setup complete.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Libraries loaded.
✅ Setup complete.


In [53]:

# =====================================================
# 2) RUNNER — Baseline → Extended → (DL) for one asset
# =====================================================

def run_asset(asset: dict, dataset_version: str = "raw", use_dl: bool = True):
    """
    Run the full pipeline for a single asset on the chosen dataset.

    Stages:
      1) Baseline: walk-forward LR/RF/XGB on every numeric column except the
         asset's price column, ret1, y_up and all *_trend columns.
      2) Extended: baseline columns + lagged trend-change features for the
         *_trend columns matched to this asset's keywords.
      3) Deep Learning (optional): an MLP on the extended features, using a
         chronological 70/15/15 train/validation/test split.
    One TXT report is written per stage via save_run_group_txt().

    Args:
        asset: registry entry with keys "label", "PAIR_ID" and "price_col".
        dataset_version: 'raw' or 'engineered' (resolved by get_dataset_path).
        use_dl: when False, stage 3 is skipped.

    Returns:
        None. Returns early (after the baseline stage) when no *_trend column
        matches the asset's keywords.
    """
    label       = asset["label"]
    pair_id     = asset["PAIR_ID"]
    price_col   = asset["price_col"]

    ds_path     = get_dataset_path(dataset_version)
    label_tag   = tag_label(label, dataset_version)

    print("\n" + "="*40)
    print(f"▶️  Running {label_tag}  |  Pair: {pair_id}")
    print(f"    Dataset: {ds_path.name}")
    print("="*40)

    # --- Load & target ---
    # The CSV is expected to have a "Date" column; rows are ordered chronologically
    # because every later split and walk-forward step relies on positional order.
    df = pd.read_csv(ds_path, parse_dates=["Date"]).set_index("Date").sort_index()
    df_mod = make_target(df, price_col)
    print(f"[Data] {df_mod.index.min().date()} → {df_mod.index.max().date()} | Rows: {len(df_mod)}")

    # --- BASELINE: technical-only (exclude *_trend) ---
    all_trends = [c for c in df_mod.columns if str(c).endswith("_trend")]
    exclude    = {price_col, "ret1", "y_up"} | set(all_trends)
    # Keep every non-object column; this includes the other assets' columns too.
    numeric    = [c for c in df_mod.columns if df_mod[c].dtype != "O"]
    baseline_cols = [c for c in numeric if c not in exclude]
    print(f"[Baseline] Features: {len(baseline_cols)}")

    start_idx = pick_start_index(len(df_mod))
    results_baseline = {}
    # `models` holds shared estimator instances; each is re-fit inside walk_forward_eval.
    for name, mdl in models.items():
        out_b, m_b = walk_forward_eval(df_mod, baseline_cols, mdl, start_index=start_idx)
        results_baseline[name] = (out_b, m_b)
        print(f"  BASE {name}: ACC={m_b['acc']:.3f} F1={m_b['f1']:.3f} AUC={m_b['auc']:.3f}")

    save_run_group_txt(label_tag, "baseline", results_baseline,
                       keywords=[], features_count=len(baseline_cols), rows_used=len(df_mod))

    # --- EXTENDED: baseline + (asset) keywords as lagged changes ---
    keep_keywords = load_keywords_for_pair(KW_CSV, pair_id)
    print(f"[Extended] Keywords for {pair_id}: {len(keep_keywords)}")
    sel_trend_cols = map_keywords_to_trend_cols(df_mod.columns, keep_keywords)

    if not sel_trend_cols:
        print("[WARN] No *_trend columns matched these keywords. Skipping extended & DL.")
        return

    lag_feats = build_trend_lag_features(df_mod, sel_trend_cols)
    # Drop sparse derived cols (>=50% NaN), then warmup-trim (pct_change(7)+shift)
    nan_ratio = lag_feats.isna().mean()
    kept_cols = nan_ratio[nan_ratio < 0.50].index.tolist()
    lag_feats = lag_feats[kept_cols]
    # 7-period change + 1-row lag leaves the first 8 rows of the *_chg7_lag1 features undefined.
    WARMUP = 8
    df_ext = df_mod.join(lag_feats).iloc[WARMUP:].copy()
    df_ext[kept_cols] = df_ext[kept_cols].fillna(0.0)

    extended_cols = sorted(set(baseline_cols).union(kept_cols))
    # NOTE(review): sanitize_features derives its clip quantiles from ALL rows of
    # df_ext, including rows that are later predicted in the walk-forward loop, so
    # the clipping bounds see future data. Also, the baseline stage above runs on
    # un-sanitized, un-trimmed df_mod, so baseline and extended results are not
    # computed on identical rows/values — confirm this is intended.
    df_ext[extended_cols] = sanitize_features(df_ext[extended_cols])
    print(f"[Extended] Features: {len(extended_cols)} | Rows: {len(df_ext)}")

    start_idx_ext = pick_start_index(len(df_ext))
    results_extended = {}
    for name, mdl in models.items():
        out_e, m_e = walk_forward_eval(df_ext, extended_cols, mdl, start_index=start_idx_ext)
        results_extended[name] = (out_e, m_e)
        print(f"  EXT  {name}: ACC={m_e['acc']:.3f} F1={m_e['f1']:.3f} AUC={m_e['auc']:.3f}")

    # NOTE(review): the report lists every keyword loaded for the pair, not only
    # the ones that were matched to a *_trend column and survived the NaN filter.
    save_run_group_txt(label_tag, "baseline + keywords", results_extended,
                       keywords=keep_keywords, features_count=len(extended_cols), rows_used=len(df_ext))

    # --- DL (optional): MLP on extended features ---
    if use_dl:
        print("[DL] MLP on extended features…")
        X = df_ext[extended_cols].values
        y = df_ext["y_up"].astype(int).values

        # Chronological split (no shuffling): first 70% train, next 15% validation,
        # final 15% test.
        n      = len(df_ext)
        i_tr   = int(n * 0.70)
        i_va   = int(n * 0.85)
        X_tr, X_va, X_te = X[:i_tr], X[i_tr:i_va], X[i_va:]
        y_tr, y_va, y_te = y[:i_tr], y[i_tr:i_va], y[i_va:]

        # Scaling statistics come from the training slice only.
        scaler = StandardScaler().fit(X_tr)
        X_tr, X_va, X_te = scaler.transform(X_tr), scaler.transform(X_va), scaler.transform(X_te)

        tf.keras.utils.set_random_seed(42)
        mlp = keras.Sequential([
            keras.layers.Input(shape=(X_tr.shape[1],)),
            keras.layers.Dense(128, activation="relu"),
            keras.layers.BatchNormalization(),
            keras.layers.Dropout(0.30),
            keras.layers.Dense(64, activation="relu"),
            keras.layers.BatchNormalization(),
            keras.layers.Dropout(0.30),
            keras.layers.Dense(1, activation="sigmoid"),
        ])
        mlp.compile(optimizer=keras.optimizers.Adam(1e-3),
                    loss="binary_crossentropy",
                    metrics=[keras.metrics.AUC(name="auc"),
                             keras.metrics.BinaryAccuracy(name="acc")])

        # Stop when validation AUC has not improved for 5 epochs and roll back to
        # the best weights seen.
        es = keras.callbacks.EarlyStopping(monitor="val_auc", mode="max",
                                           patience=5, restore_best_weights=True)
        mlp.fit(X_tr, y_tr, validation_data=(X_va, y_va),
                epochs=30, batch_size=64, callbacks=[es], verbose=0)

        p_te = mlp.predict(X_te, verbose=0).ravel()
        yhat = (p_te >= 0.5).astype(int)
        dl_metrics = {
            "acc": accuracy_score(y_te, yhat),
            "f1":  f1_score(y_te, yhat),
            # AUC is only computed when both classes occur in the test slice.
            "auc": roc_auc_score(y_te, p_te) if len(set(y_te)) > 1 else np.nan
        }
        print(f"  DL MLP: ACC={dl_metrics['acc']:.3f} F1={dl_metrics['f1']:.3f} AUC={dl_metrics['auc']:.3f}")

        # No per-row prediction frame is kept for the MLP, hence None in the tuple.
        save_run_group_txt(label_tag, "DL MLP (extended)",
                           {"MLP": (None, dl_metrics)},
                           keywords=keep_keywords, features_count=len(extended_cols), rows_used=len(df_ext))

    print(f"✅ Finished {label_tag}.")

In [54]:

# =====================================================
# 3) CONVENIENCE  — run by readable label
# =====================================================

def run_asset_by_label(name: str, dataset_version: str = "raw", use_dl: bool = True):
    """
    Run the pipeline for an asset identified by a readable name or alias.

    Examples:
      run_asset_by_label("Gold", dataset_version="raw")
      run_asset_by_label("BTC",  dataset_version="engineered")
    Aliases such as 'gc=f', 'wti' or 'cny=x' are accepted as well. An
    unrecognized name only prints a message listing the available labels.
    """
    asset = asset_by_label.get(resolve_label(name))
    if asset is not None:
        run_asset(asset, dataset_version=dataset_version, use_dl=use_dl)
        return
    print(f"[!] Unknown asset '{name}'. Available: {list(asset_by_label.keys())}")

def run_pair_raw_and_engineered(name: str, use_dl: bool = True):
    """Run one asset on both datasets, RAW first and ENGINEERED second."""
    for version in ("raw", "engineered"):
        run_asset_by_label(name, dataset_version=version, use_dl=use_dl)

def run_all_pairs(dataset_version: str = "raw", use_dl: bool = True):
    """Run every registered asset, in registry order, on one dataset version ('raw' or 'engineered')."""
    for asset_key in asset_by_label:
        run_asset_by_label(asset_key, dataset_version=dataset_version, use_dl=use_dl)

print("✅ Ready: use run_asset_by_label('Gold', dataset_version='raw') etc.")

✅ Ready: use run_asset_by_label('Gold', dataset_version='raw') etc.


In [None]:

# =====================================================
# 4) RUNS  — choose what to execute
# =====================================================

# Examples (uncomment what you need):

# 4.1 Gold only (RAW)
run_asset_by_label("Gold", dataset_version="raw", use_dl=True)

# 4.2 Gold only (ENGINEERED)
# run_asset_by_label("Gold", dataset_version="engineered", use_dl=True)

# 4.3 Gold on BOTH datasets
# run_pair_raw_and_engineered("Gold", use_dl=True)

# 4.4 All pairs on RAW (7 model evaluations per pair: LR/RF/XGB baseline + LR/RF/XGB extended + MLP)
# run_all_pairs(dataset_version="raw", use_dl=True)

# 4.5 All pairs on ENGINEERED (another 7 model evaluations per pair)
# run_all_pairs(dataset_version="engineered", use_dl=True)

# TXT outputs appear in:
#   /content/drive/MyDrive/gt-markets/outputs/


▶️  Running Gold [raw]  |  Pair: GC=F
    Dataset: merged_financial_trends_data_2025-09-07.csv
[Data] 2016-03-08 → 2020-05-18 | Rows: 985
[Baseline] Features: 40
  BASE LR: ACC=0.503 F1=0.480 AUC=0.492
