<a href="https://colab.research.google.com/github/brendonhuynhbp-hub/gt-markets/blob/main/notebooks/GoogleTrends_Financial_Modeling_Backtest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Setup: Mount Drive + Paths

In [19]:
from pathlib import Path

from google.colab import drive

# Make the Drive-hosted project tree visible to this Colab runtime.
drive.mount('/content/drive')

# Project layout: inputs live under data/processed, run artefacts under outputs.
PROJECT_DIR = Path("/content/drive/MyDrive/gt-markets")
DATA_DIR = PROJECT_DIR.joinpath("data", "processed")
OUT_DIR = PROJECT_DIR.joinpath("outputs")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Merged prices + trends table (prepared by James); stop immediately if it is absent.
MERGED_FILE = DATA_DIR.joinpath("merged_financial_trends_data_2025-09-07.csv")
assert MERGED_FILE.exists(), f"File not found: {MERGED_FILE}"
print("Using:", MERGED_FILE)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using: /content/drive/MyDrive/gt-markets/data/processed/merged_financial_trends_data_2025-09-07.csv


Load merged table (prices + trends) and set Date index

In [20]:
import pandas as pd
import numpy as np

df = pd.read_csv(MERGED_FILE)
assert "Date" in df.columns, "Expected a 'Date' column in the CSV."

# Parse dates (unparseable values become NaT and are dropped), then use the
# date as a chronologically sorted index.
df = (
    df.assign(Date=pd.to_datetime(df["Date"], errors="coerce"))
      .dropna(subset=["Date"])
      .set_index("Date")
      .sort_index()
)

print(df.shape)
df.head(3)


(2609, 170)


Unnamed: 0_level_0,BTC-USD Close,CL=F Close,DXY Close,GC=F Close,USDCNY=X Close,BTC-USD Open,CL=F Open,DXY Open,GC=F Open,USDCNY=X Open,...,abandon_trend,exploration_trend,letter_trend,favour_trend,cold_trend,sketch_trend,danger_trend,joy_trend,translate_trend,create_trend
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-09-08,243.606995,45.939999,930.440002,1120.400024,6.3559,239.845993,45.82,930.440002,1119.800049,6.3559,...,81,89,90,86,64,71,76,25,91,86
2015-09-09,238.167999,44.150002,938.400024,1102.199951,6.3572,243.414993,45.790001,938.400024,1123.5,6.3572,...,88,88,87,82,60,76,69,26,91,90
2015-09-10,238.477005,45.919998,913.51001,1109.5,6.3678,238.335999,44.16,913.51001,1107.300049,6.3678,...,84,91,91,83,57,76,76,27,92,84


Pin columns

In [21]:

COL_GOLD = "GC=F Close"   # Gold (target asset)
COL_DXY  = "DXY Close"    # U.S. Dollar Index (macro)

# Google Trends series are recognised purely by their "_trend" suffix.
trend_cols = list(filter(lambda name: name.endswith("_trend"), df.columns))

print("Gold column:", COL_GOLD)
print("DXY column :", COL_DXY, "(present:", COL_DXY in df.columns, ")")
print("Trend columns:", len(trend_cols))
print(trend_cols[:12])  # first dozen as a sample


Gold column: GC=F Close
DXY column : DXY Close (present: True )
Trend columns: 129
['usd_trend', 'gold_price_trend', 'bitcoin_trend', 'oil_price_trend', 'chinese_yuan_trend', 'inflation_trend', 'interest_rates_trend', 'stock_market_trend', 'recession_trend', 'economic_growth_trend', 'currency_exchange_trend', 'commodity_prices_trend']


Build target: next-day direction of gold (no leakage)

In [22]:
# Gold daily return (today vs yesterday)
df["gold_ret1"] = df[COL_GOLD].pct_change()

# Target = will gold go UP tomorrow? shift(-1) aligns tomorrow's return with
# today's row, so the label belongs to the row whose features predict it.
_next_ret = df["gold_ret1"].shift(-1)

# Where tomorrow's return is unknown (e.g. the final row) the label must stay
# missing: `NaN > 0` evaluates to False and would otherwise be recorded as a
# genuine "down" day.
df["y_up"] = (_next_ret > 0).astype(float).where(_next_ret.notna())

# Drop every row with a missing value in any column (including unlabeled rows),
# then restore the integer 0/1 dtype of the target.
data = df.dropna().copy()
data["y_up"] = data["y_up"].astype(int)
print("Rows after target creation:", data.shape[0])


Rows after target creation: 985


Create feature sets

Baseline: every numeric column in the merged file (prices plus the engineered returns, volatilities, RSI, etc.), excluding the gold close, the target helper columns (`gold_ret1`, `y_up`) and all `_trend` columns.

Extended: Baseline + all _trend columns.

In [23]:
# Never usable as predictors: the target asset's close and the target helpers.
exclude_cols = {COL_GOLD, "gold_ret1", "y_up"}

# "Numeric" here means: any column whose dtype is not object.
numeric_cols = [name for name, dtype in data.dtypes.items() if dtype != "O"]

# Google Trends series, identified by suffix.
trend_cols = list(filter(lambda name: name.endswith("_trend"), data.columns))

# Baseline: numeric columns that are neither excluded nor trend series.
baseline_cols = [
    name for name in numeric_cols
    if not (name in exclude_cols or name in trend_cols)
]

# Extended: the baseline followed by every trend series.
extended_cols = [*baseline_cols, *trend_cols]

print("Baseline feature count:", len(baseline_cols))
print("Extended feature count:", len(extended_cols))


Baseline feature count: 40
Extended feature count: 169


Walk-forward evaluation helper (expanding window)

In [24]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
!pip -q install xgboost
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

def walk_forward_eval(df_in, feature_cols, model, start_index=500):
    """
    Expanding-window walk-forward evaluation of a binary classifier.

    For every position i in [start_index, len(df_in)):
      - fit a StandardScaler on rows [0 : i) only (no look-ahead into row i),
      - refit `model` on those scaled rows,
      - take the second `predict_proba` column for row i as P(up) and
        threshold it at 0.5.

    Parameters
    ----------
    df_in : pd.DataFrame
        Must contain `feature_cols` and a 0/1 "y_up" column. Rows are used in
        the order given (assumed chronological; confirm at the call site).
    feature_cols : list of str
        Columns used as model inputs.
    model : estimator with fit(X, y) and predict_proba(X)
        Refitted in place at every step.
    start_index : int
        Number of initial rows used as the first training window.

    Returns
    -------
    (predictions_df, metrics_dict)
        predictions_df is indexed by "date" with columns y_true, y_pred,
        prob_up; metrics_dict has keys "acc", "f1", "auc" (auc is NaN when it
        cannot be computed).

    Raises
    ------
    ValueError
        If start_index leaves no training rows or no rows to evaluate.
    """
    n_rows = len(df_in)
    # Fail early and clearly: a start_index >= n_rows yields an empty
    # evaluation, and a non-positive one produces empty or misaligned slices.
    if not 0 < start_index < n_rows:
        raise ValueError(
            f"start_index={start_index} is invalid for {n_rows} rows; "
            "it must satisfy 0 < start_index < len(df_in)."
        )

    X_all = df_in[feature_cols].values
    y_all = df_in["y_up"].values
    idxs  = df_in.index

    preds, probs, trues, dates = [], [], [], []
    for i in range(start_index, n_rows):
        X_train, y_train = X_all[:i], y_all[:i]
        X_test,  y_test  = X_all[i:i+1], y_all[i]

        # Scaling statistics come from the training slice only.
        scaler = StandardScaler(with_mean=True, with_std=True)
        X_train_s = scaler.fit_transform(X_train)
        X_test_s  = scaler.transform(X_test)

        model.fit(X_train_s, y_train)
        p = model.predict_proba(X_test_s)[0, 1]

        preds.append(int(p >= 0.5))
        probs.append(p)
        trues.append(int(y_test))
        dates.append(idxs[i])

    out = pd.DataFrame(
        {"date": dates, "y_true": trues, "y_pred": preds, "prob_up": probs}
    ).set_index("date")

    acc = accuracy_score(out["y_true"], out["y_pred"])
    f1  = f1_score(out["y_true"], out["y_pred"])
    try:
        auc = roc_auc_score(out["y_true"], out["prob_up"])
    except ValueError:
        # AUC is undefined when y_true holds a single class; any other
        # exception is a real error and is allowed to propagate.
        auc = np.nan
    return out, {"acc": acc, "f1": f1, "auc": auc}


Baseline Features & Models

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
!pip -q install xgboost
from xgboost import XGBClassifier

# Candidate classifiers, keyed by the short label used in result tables.
models = dict(
    LR=LogisticRegression(max_iter=500),
    RF=RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1),
    XGB=XGBClassifier(
        n_estimators=500,
        max_depth=4,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=1.0,
        tree_method="hist",
        random_state=42,
    ),
)


In [26]:
# Report the span of the raw table, then slice the cleaned table from START_DATE on.
min_date = df.index.min()
max_date = df.index.max()
print(f"Data range available: {min_date.date()} → {max_date.date()}")

START_DATE = min_date   # full range, or "2017-01-01" for faster debug
df_mod = data.loc[lambda frame: frame.index >= START_DATE].copy()

print(f"Using {df_mod.shape[0]} rows for modelling ({df_mod.index.min().date()} → {df_mod.index.max().date()})")


Data range available: 2015-09-08 → 2025-09-05
Using 985 rows for modelling (2016-03-08 → 2020-05-18)


In [None]:
# Use only price/technical features, no *_trend columns.
# The gold column comes from the pinned COL_GOLD constant (not a repeated
# literal) so this exclusion list cannot drift from the one used when the
# feature sets were first defined.
exclude_cols = {COL_GOLD, "gold_ret1", "y_up"} | set(trend_cols)
baseline_cols = [c for c in df_mod.columns if df_mod[c].dtype != "O" and c not in exclude_cols]
print(f"[6A] Baseline features: {len(baseline_cols)}")

# Walk-forward evaluation of every model on the baseline features; each entry
# is (predictions_df, metrics_dict).
results_baseline = {}
for name, mdl in models.items():  # models = {"LR":..., "RF":..., "XGB":...}
    out_b, m_b = walk_forward_eval(df_mod, baseline_cols, mdl)
    results_baseline[name] = (out_b, m_b)
    print(f"BASE {name} | ACC: {m_b['acc']:.3f} | F1: {m_b['f1']:.3f} | AUC: {m_b['auc']:.3f}")


[6A] Baseline features: 40


In [None]:
# === Utility: save one RUN (all models together) to a TXT ===
from pathlib import Path
from datetime import datetime

# Reuse the project root pinned in the setup cell instead of repeating the
# absolute Drive path, so the output location is defined in one place only.
OUT_DIR = PROJECT_DIR / "outputs"
OUT_DIR.mkdir(parents=True, exist_ok=True)

def save_run_group_txt(
    pair_label: str,          # e.g., "Gold"
    run_label: str,           # e.g., "baseline" or "baseline + keywords"
    results_dict: dict,       # e.g., results_baseline or results_extended
    keywords: list = None,    # list[str] of keywords used; None/[] for baseline
    features_count: int = None,
    rows_used: int = None,
    extra_note: str = ""
):
    """
    Write one human-readable TXT summary of a run (all models together).

    The file is created in the module-level OUT_DIR and named
    "<pair>_<run>_<timestamp>.txt", where pair/run are lower-cased, spaces
    become "_", "+" in the run label becomes "plus", and characters that are
    not valid in file names (path separators etc.) become "_".

    Parameters
    ----------
    pair_label : str
        Asset label shown in the header and used in the file name.
    run_label : str
        Run description shown in the header and used in the file name.
    results_dict : dict
        Maps model name -> (predictions, metrics); only metrics["acc"],
        ["f1"] and ["auc"] are read, missing keys are written as nan.
    keywords : list of str, optional
        Keywords listed in the file; None or empty is written as "- None".
    features_count, rows_used : int, optional
        Written only when not None.
    extra_note : str
        Written as a "Note:" line when non-empty.

    Returns
    -------
    None. The path of the written file is printed.
    """
    import re  # local import so this utility does not rely on a later cell

    def _strip_path_chars(text: str) -> str:
        # A label such as "USD/CNY" would otherwise point into a sub-directory
        # that does not exist.
        return re.sub(r'[\\/:*?"<>|]+', "_", text)

    ts = datetime.now().strftime("%Y%m%d-%H%M%S")
    safe_pair = _strip_path_chars(pair_label.replace(" ", "_").lower())
    safe_run  = _strip_path_chars(run_label.replace(" ", "_").replace("+", "plus").lower())
    out_path  = OUT_DIR / f"{safe_pair}_{safe_run}_{ts}.txt"

    # Explicit UTF-8: the header contains an em dash, which a non-UTF-8
    # platform default encoding cannot represent.
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(f"{pair_label} — {run_label}\n")
        f.write(f"Timestamp: {ts}\n")
        if features_count is not None:
            f.write(f"Features used: {features_count}\n")
        if rows_used is not None:
            f.write(f"Rows used: {rows_used}\n")
        if extra_note:
            f.write(f"Note: {extra_note}\n")

        f.write("\nKeywords used:\n")
        if keywords:
            f.writelines(f"- {k}\n" for k in keywords)
        else:
            f.write("- None\n")

        f.write("\nResults (all models):\n")
        # results_dict is like {"LR": (out_df, {"acc":..., "f1":..., "auc":...}), ...}
        for model_name, (_out, met) in results_dict.items():
            acc = met.get("acc", float("nan"))
            f1  = met.get("f1", float("nan"))
            auc = met.get("auc", float("nan"))
            f.write(f"{model_name}: ACC={acc:.3f}, F1={f1:.3f}, AUC={auc:.3f}\n")

    print("Saved:", out_path)


In [None]:
# Save one TXT with all baseline model results
save_run_group_txt(
    "Gold",
    "baseline",
    results_baseline,
    keywords=[],                        # baseline uses no keywords
    features_count=len(baseline_cols),
    rows_used=len(df_mod),
)


In [None]:
# 6B-1: Load pre-filtered keywords and keep only GOLD (Pair = "GC=F")
import pandas as pd, numpy as np, re
from pathlib import Path

KW_PATH = Path("/content/drive/MyDrive/gt-markets/data/Keyword Selection/combined_significant_lagged_correlations.csv")
dfk = pd.read_csv(KW_PATH)

# The keyword file is expected to carry capitalised "Pair" and "Keyword" headers.
assert {"Pair","Keyword"} <= set(dfk.columns), f"Unexpected columns: {dfk.columns.tolist()}"

# Keep only the rows for the gold pair.
dfk_gold = dfk.loc[dfk["Pair"].eq("GC=F")].copy()
assert not dfk_gold.empty, "No GOLD rows found (Pair == 'GC=F') in the keyword file."

# Trim + lower-case each keyword and de-duplicate, preserving first-seen order.
keep_keywords = list(dict.fromkeys(
    kw.strip().lower()
    for kw in dfk_gold["Keyword"].dropna().astype(str)
))

print(f"[6B-1] Using {len(keep_keywords)} GOLD keywords from Pair=GC=F")
for k in keep_keywords[:20]:
    print("-", k)
if len(keep_keywords) > 20:
    print("...")


In [None]:
# 6B-2: Map GOLD keywords to *_trend columns present in df_mod
def _norm(s: str) -> str:
    return re.sub(r"[^a-z0-9]+", "", str(s).lower())

trend_cols_all = [c for c in df_mod.columns if c.endswith("_trend")]

# Normalise each column stem once. Only the trailing "_trend" suffix is
# removed, so a "_trend" occurring inside a name is left untouched.
_col_norms = [(c, _norm(c[: -len("_trend")])) for c in trend_cols_all]

kw_to_col, unmatched = {}, []
for kw in keep_keywords:
    n = _norm(kw)
    # An empty normalised keyword is a substring of every stem, so it must
    # never be allowed to match (the `n and ...` guards below).
    exact   = [c for c, stem in _col_norms if n and stem == n]
    partial = [c for c, stem in _col_norms if n and n in stem]
    # Prefer the column whose stem equals the keyword; otherwise fall back to
    # the first column whose stem merely contains it.
    matches = exact or partial
    if matches:
        kw_to_col[kw] = matches[0]
    else:
        unmatched.append(kw)

selected_trend_cols = list(dict.fromkeys(kw_to_col.values()))  # de-dup, preserve order

print(f"[6B-2] Mapped {len(kw_to_col)} GOLD keywords to {len(selected_trend_cols)} trend columns; unmatched={len(unmatched)}")
if unmatched:
    print("Unmatched sample:", unmatched[:10])
assert len(selected_trend_cols) > 0, "No GOLD keywords matched *_trend columns. Check naming in df_mod."


In [None]:
# 6B-3: Create lagged %change features for selected trends (avoid fragmentation)
def build_trend_lag_features(frame: pd.DataFrame, cols):
    """
    Build lagged percentage-change features for the given columns.

    For each column `c` in `cols` two series are produced, both shifted down
    by one row so a row only sees changes completed on the previous row:
      - "<c>_chg1_lag1": 1-row percentage change
      - "<c>_chg7_lag1": 7-row percentage change

    Returns a new DataFrame sharing `frame`'s index; `frame` is not modified.
    """
    lagged = {}
    for col in cols:
        series = frame[col]
        for horizon in (1, 7):
            lagged[f"{col}_chg{horizon}_lag1"] = series.pct_change(horizon).shift(1)
    return pd.DataFrame(lagged, index=frame.index)

sel_trend_feats = build_trend_lag_features(df_mod, selected_trend_cols)

# Share of missing values per feature; keep only features that are less than half NaN.
nan_ratio = sel_trend_feats.isna().mean()
keep_feat_cols = nan_ratio.index[nan_ratio.lt(0.50)].tolist()
sel_trend_feats = sel_trend_feats.loc[:, keep_feat_cols]

print(f"[6B-3] Dropped {len(nan_ratio)-len(keep_feat_cols)} sparse features; kept {len(keep_feat_cols)}")


In [None]:
# 6B-4: Join, keep warmup rows only, fill NaNs in NEW trend features
WARMUP = 8  # due to pct_change(7) then shift(1)

# Attach the new trend features and discard the first WARMUP rows.
df_ext = df_mod.join(sel_trend_feats).iloc[WARMUP:].copy()

# Only the newly created trend features are zero-filled; original columns are untouched.
df_ext[keep_feat_cols] = df_ext[keep_feat_cols].fillna(0.0)

# Extended feature set = baseline ∪ new trend features, in sorted order.
sel_feat_cols = keep_feat_cols
extended_cols = sorted(set(baseline_cols) | set(sel_feat_cols))

print(f"[6B-4] Extended features: {len(extended_cols)} | Rows for modelling: {len(df_ext):,}")


In [None]:
# 6B-5: Sanitize extended matrix for robust training
import numpy as np

# Turn ±inf into NaN, then zero-fill every missing value.
X = df_ext[extended_cols].replace([np.inf, -np.inf], np.nan).fillna(0.0)

# Winsorize each column to its own 0.1% / 99.9% quantiles.
# NOTE(review): the quantiles are taken over all rows of df_ext, i.e. including
# rows that are later used as walk-forward test points — confirm this is acceptable.
lo, hi = X.quantile(0.001), X.quantile(0.999)
X = X.clip(lower=lo, upper=hi, axis=1)

df_ext[extended_cols] = X

# Sanity check: count any inf / NaN left in the written-back feature matrix.
_sanitized = df_ext[extended_cols].to_numpy()
n_inf = np.isinf(_sanitized).sum()
n_nan = np.isnan(_sanitized).sum()
print(f"[6B-5] Sanitize → inf: {n_inf} | nan: {n_nan}")


In [None]:
# 6C: Train extended models with a dynamic start_index (ensures evaluation runs)
def pick_start_index(n_rows, floor=60, frac=0.30, cap=500):
    """
    Choose the size of the first walk-forward training window.

    The window is `frac` of the rows, bounded below by `floor` and above by
    `cap`, and finally limited to `n_rows - 1` so that at least one row is
    always left to evaluate (the floor alone could exceed a short frame).

    Raises
    ------
    ValueError
        If n_rows < 2, because no train/test split exists.
    """
    if n_rows < 2:
        raise ValueError(f"Need at least 2 rows for walk-forward evaluation, got {n_rows}.")
    bounded = max(floor, min(cap, int(n_rows * frac)))
    return min(bounded, n_rows - 1)

# Default: a dynamic start so the evaluation always has rows to score.
start_idx = pick_start_index(len(df_ext))

# Align the first evaluated date with the baseline run. Without this the two
# runs are scored on different test windows and the baseline-vs-extended
# differences in 6D are not comparable.
_base_first_dates = [out.index.min() for out, _ in results_baseline.values() if len(out)]
if _base_first_dates:
    # Position in df_ext of the first row dated on/after the baseline's first test date.
    _aligned = int(df_ext.index.searchsorted(min(_base_first_dates)))
    if 0 < _aligned < len(df_ext):
        start_idx = _aligned
print(f"[6C] Walk-forward start_index = {start_idx}")

# Walk-forward evaluation of every model on the extended features; each entry
# is (predictions_df, metrics_dict).
results_extended = {}
for name, mdl in models.items():  # models & walk_forward_eval defined in 6A
    out_e, m_e = walk_forward_eval(df_ext, extended_cols, mdl, start_index=start_idx)
    results_extended[name] = (out_e, m_e)
    print(f"EXT {name} | ACC: {m_e['acc']:.3f} | F1: {m_e['f1']:.3f} | AUC: {m_e['auc']:.3f}")


In [None]:
# Save one TXT with all extended model results
# Every keyword that was mapped to a trend column in 6B.
keywords_used = list(kw_to_col)
save_run_group_txt(
    "Gold",
    "baseline + keywords",
    results_extended,
    keywords=keywords_used,
    features_count=len(extended_cols),
    rows_used=len(df_ext),
)


In [None]:
# 6D: Baseline vs Extended comparison
# One row per model: baseline metrics, extended metrics, and extended-minus-baseline deltas.
rows = [
    {
        "model": name,
        **{f"baseline_{key}": results_baseline[name][1][key] for key in ("acc", "f1", "auc")},
        **{f"extended_{key}": results_extended[name][1][key] for key in ("acc", "f1", "auc")},
        "acc_diff": results_extended[name][1]["acc"] - results_baseline[name][1]["acc"],
        "auc_diff": results_extended[name][1]["auc"] - results_baseline[name][1]["auc"],
    }
    for name in models
]
cmp_df = pd.DataFrame(rows).set_index("model")
print("\n[6D] Baseline vs GOLD Extended:")
display(cmp_df.style.format("{:.3f}"))
