<a href="https://colab.research.google.com/github/brendonhuynhbp-hub/gt-markets/blob/main/notebooks/GoogleTrends_Financial_Modeling_Backtest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Setup: Mount Drive + Paths

In [1]:
# mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Project paths
from pathlib import Path
PROJECT_DIR = Path("/content/drive/MyDrive/gt-markets")
DATA_DIR    = PROJECT_DIR / "data" / "processed"
OUT_DIR     = PROJECT_DIR / "outputs"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Input file prepared by James
MERGED_FILE = DATA_DIR / "merged_financial_trends_data_2025-09-07.csv"
assert MERGED_FILE.exists(), f"File not found: {MERGED_FILE}"
print("Using:", MERGED_FILE)


Mounted at /content/drive
Using: /content/drive/MyDrive/gt-markets/data/processed/merged_financial_trends_data_2025-09-07.csv


Load merged table (prices + trends) and set Date index

In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv(MERGED_FILE)
# Ensure Date is parsed and becomes the index
assert "Date" in df.columns, "Expected a 'Date' column in the CSV."
df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
df = df.dropna(subset=["Date"]).set_index("Date").sort_index()

print(df.shape)
df.head(3)


(2609, 123)


Unnamed: 0_level_0,BTC-USD Close,CL=F Close,DXY Close,GC=F Close,USDCNY=X Close,BTC-USD Open,CL=F Open,DXY Open,GC=F Open,USDCNY=X Open,...,el_salvador_trend,solana_trend,tesla_trend,bitcoin_mining_trend,blockchain_trend,silk_road_trend,tor_trend,nigeria_trend,legal_tender_trend,nayib_bukele_trend
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-09-08,243.606995,45.939999,930.440002,1120.400024,6.3559,239.845993,45.82,930.440002,1119.800049,6.3559,...,54,80,29,35,39,54,72,58,44,0
2015-09-09,238.167999,44.150002,938.400024,1102.199951,6.3572,243.414993,45.790001,938.400024,1123.5,6.3572,...,52,73,28,36,44,56,73,58,52,0
2015-09-10,238.477005,45.919998,913.51001,1109.5,6.3678,238.335999,44.16,913.51001,1107.300049,6.3678,...,45,69,31,38,47,60,74,54,59,27


Pin columns

In [3]:

COL_GOLD = "GC=F Close"   # Gold (target asset)
COL_DXY  = "DXY Close"    # U.S. Dollar Index (macro)

# Trends: every column ending with "_trend"
trend_cols = [c for c in df.columns if c.endswith("_trend")]

print("Gold column:", COL_GOLD)
print("DXY column :", COL_DXY, "(present:", COL_DXY in df.columns, ")")
print("Trend columns:", len(trend_cols))
print(trend_cols[:12])  # sample


Gold column: GC=F Close
DXY column : DXY Close (present: True )
Trend columns: 83
['usd_trend', 'gold_price_trend', 'bitcoin_trend', 'oil_price_trend', 'chinese_yuan_trend', 'inflation_trend', 'interest_rates_trend', 'stock_market_trend', 'recession_trend', 'economic_growth_trend', 'currency_exchange_trend', 'commodity_prices_trend']


Build target: next-day direction of gold (no leakage)

In [4]:
# Gold daily return (today vs yesterday)
df["gold_ret1"] = df[COL_GOLD].pct_change()

# Target = will gold go UP tomorrow? (use shift(-1) so target belongs to today's row)
df["y_up"] = (df["gold_ret1"].shift(-1) > 0).astype(int)

# Clean initial NaNs
data = df.dropna().copy()
print("Rows after target creation:", data.shape[0])


Rows after target creation: 985


Create feature sets

Baseline: numeric engineered columns (includes returns/vols/RSI/etc. already in your file) but excludes raw target helpers.

Extended: Baseline + all _trend columns.

In [5]:
# Columns to exclude from features
exclude_cols = {COL_GOLD, "gold_ret1", "y_up"}

# Numeric-only columns
numeric_cols = [c for c in data.columns if data[c].dtype != "O"]

# Baseline = all numeric features except excluded ones
baseline_cols = [c for c in numeric_cols if c not in exclude_cols]

# Extended = baseline + trend columns (ensure they exist)
extended_cols = sorted(set(baseline_cols).union(trend_cols))

print("Baseline feature count:", len(baseline_cols))
print("Extended feature count:", len(extended_cols))


Baseline feature count: 122
Extended feature count: 122


Walk-forward evaluation helper (expanding window)

In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
!pip -q install xgboost
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

def walk_forward_eval(df_in, feature_cols, model, start_index=500):
    """
    Expanding-window walk-forward:
      - Train on [0 : i) and predict on [i]
      - Scale on train only to avoid leakage
    Returns: (predictions_df, metrics_dict)
    """
    df_in = df_in.copy()
    X_all = df_in[feature_cols].values
    y_all = df_in["y_up"].values
    idxs  = df_in.index

    scaler = StandardScaler(with_mean=True, with_std=True)

    preds, probs, trues, dates = [], [], [], []
    for i in range(start_index, len(df_in)):
        X_train, y_train = X_all[:i], y_all[:i]
        X_test,  y_test  = X_all[i:i+1], y_all[i]

        X_train_s = scaler.fit_transform(X_train)
        X_test_s  = scaler.transform(X_test)

        model.fit(X_train_s, y_train)
        p = model.predict_proba(X_test_s)[0,1]
        yhat = int(p >= 0.5)

        preds.append(yhat)
        probs.append(p)
        trues.append(int(y_test))
        dates.append(idxs[i])

    out = pd.DataFrame({"date": dates, "y_true": trues, "y_pred": preds, "prob_up": probs}).set_index("date")
    acc = accuracy_score(out["y_true"], out["y_pred"])
    f1  = f1_score(out["y_true"], out["y_pred"])
    try:
        auc = roc_auc_score(out["y_true"], out["prob_up"])
    except Exception:
        auc = np.nan
    return out, {"acc": acc, "f1": f1, "auc": auc}


Baseline Features & Models

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
!pip -q install xgboost
from xgboost import XGBClassifier

# Define models to test
models = {
    "LR":  LogisticRegression(max_iter=500),
    "RF":  RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1),
    "XGB": XGBClassifier(
        n_estimators=500, max_depth=4, learning_rate=0.05,
        subsample=0.9, colsample_bytree=0.9, reg_lambda=1.0,
        tree_method="hist", random_state=42
    )
}


In [14]:
# Define modelling dataset
min_date, max_date = df.index.min(), df.index.max()
print(f"Data range available: {min_date.date()} → {max_date.date()}")

START_DATE = min_date   # full range, or "2017-01-01" for faster debug
df_mod = data.loc[data.index >= START_DATE].copy()

print(f"Using {df_mod.shape[0]} rows for modelling ({df_mod.index.min().date()} → {df_mod.index.max().date()})")


Data range available: 2015-09-08 → 2025-09-05
Using 985 rows for modelling (2016-03-08 → 2020-05-18)


In [15]:
# Use only price/technical features, no *_trend columns
exclude_cols = {"GC=F Close", "gold_ret1", "y_up"} | set(trend_cols)  # adjust if needed
baseline_cols = [c for c in df_mod.columns if df_mod[c].dtype != "O" and c not in exclude_cols]
print(f"[6A] Baseline features: {len(baseline_cols)}")

results_baseline = {}
for name, mdl in models.items():  # models = {"LR":..., "RF":..., "XGB":...}
    out_b, m_b = walk_forward_eval(df_mod, baseline_cols, mdl)
    results_baseline[name] = (out_b, m_b)
    print(f"BASE {name} | ACC: {m_b['acc']:.3f} | F1: {m_b['f1']:.3f} | AUC: {m_b['auc']:.3f}")


[6A] Baseline features: 39
BASE LR | ACC: 0.503 | F1: 0.411 | AUC: 0.496
BASE RF | ACC: 0.501 | F1: 0.442 | AUC: 0.487
BASE XGB | ACC: 0.509 | F1: 0.485 | AUC: 0.503
