<a href="https://colab.research.google.com/github/brendonhuynhbp-hub/gt-markets/blob/main/notebooks/GoogleTrends_Financial_Engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# --- Mount Google Drive (Colab only) so the project folder is reachable
from google.colab import drive
from pathlib import Path

drive.mount('/content/drive')

# --- Project paths: everything hangs off PROJECT_DIR on the mounted drive
PROJECT_DIR = Path("/content/drive/MyDrive/gt-markets")
DATA_DIR = PROJECT_DIR.joinpath("data", "processed")
OUT_DIR = PROJECT_DIR.joinpath("outputs")

# Create both folders up front; exist_ok makes re-running this cell harmless.
DATA_DIR.mkdir(parents=True, exist_ok=True)
OUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"PROJECT_DIR: {PROJECT_DIR}")
print(f"DATA_DIR   : {DATA_DIR}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
PROJECT_DIR: /content/drive/MyDrive/gt-markets
DATA_DIR   : /content/drive/MyDrive/gt-markets/data/processed


In [3]:
# Locate the merged dataset: among CSVs in DATA_DIR whose stem contains
# "merged" (but not "engineered", which marks this notebook's own output),
# pick the one with the latest modification time.
candidates = sorted(DATA_DIR.glob("*.csv"))
merged_candidates = [
    path
    for path in candidates
    if "merged" in path.stem.lower()
    if "engineered" not in path.stem.lower()
]

if len(merged_candidates) == 0:
    raise FileNotFoundError(
        f"No merged CSV found in {DATA_DIR}. "
        "Expected something like 'merged_financial_trends_data_YYYY-MM-DD.csv'."
    )

# "Newest" is judged by file mtime, not by the date embedded in the filename.
RAW_MERGED = max(merged_candidates, key=lambda candidate: candidate.stat().st_mtime)
print(f"Using merged CSV: {RAW_MERGED}")


Using merged CSV: /content/drive/MyDrive/gt-markets/data/processed/merged_financial_trends_data_2025-09-07.csv


In [4]:
import pandas as pd
import numpy as np

# Load the merged CSV and turn it into a date-indexed, chronologically sorted frame.
df = pd.read_csv(RAW_MERGED)
assert "Date" in df.columns, "Expected a 'Date' column."

# Unparseable dates are coerced to NaT and those rows are discarded before indexing.
df = (
    df.assign(Date=pd.to_datetime(df["Date"], errors="coerce"))
    .dropna(subset=["Date"])
    .set_index("Date")
    .sort_index()
)

print(f"Data range: {df.index.min().date()} → {df.index.max().date()} | shape: {df.shape}")


Data range: 2015-09-08 → 2025-09-05 | shape: (2609, 163)


In [5]:
# Per-asset lookup of the source column names used for feature engineering.
# Edit the strings here if the merged CSV names its columns differently.
COLMAP = {
    "gold": dict(
        close="GC=F Close",
        rsi="GC=F 14-day RSI",
        sma20="GC=F 20-day simple moving average",
    ),
    "btc": dict(
        close="BTC-USD Close",
        rsi="BTC-USD 14-day RSI",
        sma20="BTC-USD 20-day simple moving average",
    ),
    "oil": dict(
        close="CL=F Close",
        rsi="CL=F 14-day RSI",
        sma20="CL=F 20-day simple moving average",
    ),
    "usdcny": dict(
        close="USDCNY=X Close",
        rsi="USDCNY=X 14-day RSI",
        sma20="USDCNY=X 20-day simple moving average",
    ),
    "dxy": dict(
        close="DXY Close",
        rsi="DXY 14-day RSI",
        sma20="DXY 20-day simple moving average",
    ),
}


In [6]:
def ensure_series(frame: pd.DataFrame, col: str):
    """Fetch ``col`` from ``frame`` as a numeric Series, or ``None`` if it is absent.

    Values that cannot be parsed as numbers are coerced to NaN rather than
    raising (``errors="coerce"``).
    """
    if col not in frame.columns:
        return None
    return pd.to_numeric(frame[col], errors="coerce")

def compute_rsi(close: pd.Series, period: int = 14) -> pd.Series:
    """Compute a Relative Strength Index from closing prices.

    Average gain and average loss are plain rolling means of the positive and
    negative one-step price changes over ``period`` rows (no exponential or
    Wilder-style smoothing is applied).

    Parameters
    ----------
    close : pd.Series
        Closing prices in chronological order.
    period : int, default 14
        Number of price changes in each averaging window.

    Returns
    -------
    pd.Series
        RSI on a 0-100 scale, aligned with ``close``. The first ``period`` rows
        are NaN because the window is not yet full. Windows with no price
        movement at all return a neutral 50.
    """
    delta = close.diff()
    avg_gain = delta.clip(lower=0).rolling(period).mean()
    avg_loss = (-delta.clip(upper=0)).rolling(period).mean()

    # The small epsilon keeps the ratio finite when there were no losses.
    rs = avg_gain / (avg_loss + 1e-12)
    rsi = 100 - (100 / (1 + rs))

    # With zero gains AND zero losses the formula above yields 0, which would
    # read as "maximally oversold". A flat window carries no directional
    # information, so report the neutral midpoint instead. Rows still in the
    # warm-up period compare as False here and therefore stay NaN.
    flat_window = delta.abs().rolling(period).max() == 0
    return rsi.mask(flat_window, 50.0)

def add_engineered_for_asset(frame: pd.DataFrame, key: str,
                             close_col: str, rsi_col: str = None,
                             sma20_col: str = None, prefix: str = None) -> pd.DataFrame:
    """
    Append engineered feature columns for a single asset to ``frame``.

    ``frame`` is modified in place and also returned. New columns are named
    ``<prefix>_<feature>``, where ``prefix`` falls back to ``key`` when omitted:

      - dist_ma20       close relative to its 20-row SMA, minus 1
      - slope_ma20      SMA20 minus SMA20 five rows earlier
      - ret1            one-row percentage change of close
      - vol20           rolling 20-row std of ret1
      - ret1_vs         previous row's ret1 divided by previous row's vol20
      - rsi             RSI value (taken from ``rsi_col`` or computed)
      - rsi_oversold    1.0 when the previous row's RSI < 30, else 0.0
      - rsi_overbought  1.0 when the previous row's RSI > 70, else 0.0
      - xover_up        1.0 on the row where close moves above SMA20 after
                        being at or below it on the previous row

    If ``sma20_col`` / ``rsi_col`` is not given or not present in ``frame``,
    the indicator is derived from the close series instead. A missing
    ``close_col`` fails the assertion.
    """
    tag = prefix or key

    price = ensure_series(frame, close_col)
    assert price is not None, f"Close column not found: {close_col}"

    # Prefer precomputed indicator columns; rebuild whichever one is unavailable.
    ma20 = None
    if sma20_col:
        ma20 = ensure_series(frame, sma20_col)
    if ma20 is None:
        ma20 = price.rolling(20).mean()

    rsi = None
    if rsi_col:
        rsi = ensure_series(frame, rsi_col)
    if rsi is None:
        rsi = compute_rsi(price, period=14)

    # Derive all intermediate series first, then attach them in a fixed order.
    daily_ret = price.pct_change()
    vol20 = daily_ret.rolling(20).std()
    prev_rsi = rsi.shift(1)  # previous row's RSI, so the state flags do not use the current row
    crossed_up = (price > ma20) & (price.shift(1) <= ma20.shift(1))

    # Mean-reversion / trend strength (epsilon guards against a zero SMA)
    frame[f"{tag}_dist_ma20"] = price / (ma20 + 1e-12) - 1
    frame[f"{tag}_slope_ma20"] = ma20 - ma20.shift(5)

    # Returns and volatility scaling (the scaled return uses the previous row's values)
    frame[f"{tag}_ret1"] = daily_ret
    frame[f"{tag}_vol20"] = vol20
    frame[f"{tag}_ret1_vs"] = daily_ret.shift(1) / (vol20.shift(1) + 1e-12)

    # RSI level and threshold states
    frame[f"{tag}_rsi"] = rsi
    frame[f"{tag}_rsi_oversold"] = (prev_rsi < 30).astype(float)
    frame[f"{tag}_rsi_overbought"] = (prev_rsi > 70).astype(float)

    # Upward crossover event
    frame[f"{tag}_xover_up"] = crossed_up.astype(float)

    return frame


In [7]:
# Build the engineered features once per asset listed in COLMAP; each pass
# adds columns prefixed with the asset key (e.g. "gold_", "btc_").
for key, cols in COLMAP.items():
    df = add_engineered_for_asset(
        df,
        key,
        cols["close"],
        rsi_col=cols.get("rsi"),
        sma20_col=cols.get("sma20"),
        prefix=key,
    )

# Note: the sample lists every column starting with "gold_", which may include
# columns that were already in the merged CSV.
print(
    "Engineered columns (sample):",
    [name for name in df.columns if name.startswith("gold_")][:10],
)


Engineered columns (sample): ['gold_price_trend', 'gold_trend', 'gold_dist_ma20', 'gold_slope_ma20', 'gold_ret1', 'gold_vol20', 'gold_ret1_vs', 'gold_rsi', 'gold_rsi_oversold', 'gold_rsi_overbought']


In [8]:
# Dollar-index regime: 1.0 when the DXY close is above its 200-row rolling
# mean, else 0.0. Rows where the rolling mean is still NaN also get 0.0,
# because a comparison against NaN is False.
dxy_close = ensure_series(df, COLMAP["dxy"]["close"])
dxy_ma200 = dxy_close.rolling(window=200).mean()

df["dxy_ma200"] = dxy_ma200
df["dxy_regime"] = dxy_close.gt(dxy_ma200).astype(float)

# Example interaction: gold's distance from its SMA20, kept only while the
# regime flag is on (the term is zeroed everywhere else).
df["gold_dist_ma20_in_riskoff"] = df["gold_dist_ma20"].mul(df["dxy_regime"])


In [9]:
from datetime import date

today_str = date.today().isoformat()

# dropna(how="any") discards a row if ANY column is missing in it, across raw
# and engineered columns alike, so one sparse source column can remove a large
# share of the history. Report what was lost so the reduction is never silent.
engineered = df.dropna(how="any")

n_dropped = len(df) - len(engineered)
print(f"[INFO] dropna(how='any') removed {n_dropped} of {len(df)} rows")
if n_dropped:
    print("[INFO] Columns with the most missing values:")
    print(df.isna().sum().sort_values(ascending=False).head(5).to_string())

# Refuse to write an empty dataset; downstream steps would otherwise fail far from the cause.
if engineered.empty:
    raise ValueError(
        "No rows left after dropping NaNs; inspect the missing-value report above."
    )

# Output names contain "engineered", which the merged-CSV lookup excludes,
# so re-running the notebook never picks up its own output as input.
ENG_CSV  = DATA_DIR / f"merged_financial_trends_engineered_{today_str}.csv"
ENG_PARQ = DATA_DIR / f"merged_financial_trends_engineered_{today_str}.parquet"

engineered.to_csv(ENG_CSV)
engineered.to_parquet(ENG_PARQ)

print("[OK] Engineered dataset saved")
print("Rows × Cols:", engineered.shape)
print("CSV   :", ENG_CSV)
print("PARQUET:", ENG_PARQ)

# Quick peek
engineered.tail(2)


[OK] Engineered dataset saved
Rows × Cols: (916, 211)
CSV   : /content/drive/MyDrive/gt-markets/data/processed/merged_financial_trends_engineered_2025-09-07.csv
PARQUET: /content/drive/MyDrive/gt-markets/data/processed/merged_financial_trends_engineered_2025-09-07.parquet


Unnamed: 0_level_0,BTC-USD Close,CL=F Close,DXY Close,GC=F Close,USDCNY=X Close,BTC-USD Open,CL=F Open,DXY Open,GC=F Open,USDCNY=X Open,...,dxy_ret1,dxy_vol20,dxy_ret1_vs,dxy_rsi,dxy_rsi_oversold,dxy_rsi_overbought,dxy_xover_up,dxy_ma200,dxy_regime,gold_dist_ma20_in_riskoff
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-05-15,9328.197266,29.43,755.309998,1753.400024,7.0947,9734.291016,27.639999,755.309998,1739.699951,7.0947,...,0.0,0.00966,0.0,100.0,0.0,1.0,0.0,966.943163,0.0,0.0
2020-05-18,9726.575195,31.82,755.309998,1731.800049,7.1012,9675.695312,29.530001,755.309998,1755.699951,7.1012,...,0.0,0.00966,0.0,100.0,0.0,1.0,0.0,965.292712,0.0,0.0
