<a href="https://colab.research.google.com/github/brendonhuynhbp-hub/gt-markets/blob/pre/notebooks/colab_data_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os

from google.colab import drive

# Attach Google Drive so everything written below survives the Colab session.
drive.mount('/content/drive')

# Project folder layout on Drive: raw inputs and processed outputs are kept apart.
PROJECT_DIR = "/content/drive/MyDrive/gt-markets"
DATA_DIR    = f"{PROJECT_DIR}/data"
RAW_DIR     = f"{DATA_DIR}/raw"
PROC_DIR    = f"{DATA_DIR}/processed"

# Create any missing folder; exist_ok makes re-running this cell harmless.
for directory in (PROJECT_DIR, DATA_DIR, RAW_DIR, PROC_DIR):
    os.makedirs(directory, exist_ok=True)

print("Ready:", PROJECT_DIR)


Mounted at /content/drive
Ready: /content/drive/MyDrive/gt-markets


In [3]:
import yfinance as yf
import pandas as pd

# Friendly asset name -> Yahoo Finance ticker used for the download.
ASSETS = {
    "XAUUSD": "GC=F",      # Gold futures proxy
    "USDCNY": "CNY=X",     # USD/CNY
    "BTCUSD": "BTC-USD",   # Bitcoin
    "USOIL":  "CL=F"       # WTI crude
}

START_DATE = "2015-01-01"
# NOTE: yfinance documents `end` as exclusive, so today's bar is not requested.
END_DATE   = pd.Timestamp.today().strftime("%Y-%m-%d")
PRICE_INTERVAL = "1d"

# Cache location: once this file exists the download below is skipped.
# Delete the file to force a refresh.
MARKET_CSV = f"{PROC_DIR}/prices_daily.csv"

try:
    # if already saved in Drive, load it
    prices = pd.read_csv(MARKET_CSV, parse_dates=True, index_col=0)
    print("[ok] loaded existing prices:", prices.shape)
except (FileNotFoundError, pd.errors.EmptyDataError):
    # Only a missing or empty cache file triggers a download; any other error
    # (permissions, malformed CSV, ...) is surfaced instead of being masked.
    df = yf.download(list(ASSETS.values()),
                     start=START_DATE, end=END_DATE,
                     interval=PRICE_INTERVAL,
                     # Explicit so the result does not depend on the installed
                     # yfinance default (presumably the source of the warning
                     # this call used to emit — confirm against your version).
                     auto_adjust=True,
                     progress=False)["Close"]

    # Defensive: flatten the columns if they are still multi-level after
    # selecting "Close".
    if isinstance(df.columns, pd.MultiIndex):
        df = df.droplevel(0, axis=1)

    # Never cache an empty result: it would be loaded as "ok" on every later run.
    if df.empty:
        raise RuntimeError("price download returned no rows; nothing was saved")

    # Swap Yahoo tickers for the friendly names and order rows by date.
    prices = df.rename(columns={v:k for k,v in ASSETS.items()}).sort_index()
    prices.to_csv(MARKET_CSV)
    print("[ok] downloaded & saved prices:", prices.shape)

prices.tail()


  df = yf.download(list(ASSETS.values()),


[ok] downloaded & saved prices: (3893, 4)


Ticker,BTCUSD,USOIL,USDCNY,XAUUSD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2025-08-24,113458.429688,,,
2025-08-25,110124.351562,64.800003,7.1675,3373.800049
2025-08-26,111802.65625,63.25,7.151,3388.600098
2025-08-27,111222.0625,64.150002,7.152,3404.600098
2025-08-28,112544.804688,64.599998,7.153,3431.800049


In [5]:
!pip -q install pytrends
import pandas as pd, time, datetime as dt
from dateutil.relativedelta import relativedelta
from pytrends.request import TrendReq

KEYWORDS   = ["gold price"]          # start with ONE keyword
START_DATE = "2015-01-01"
END_DATE   = dt.date.today().isoformat()

TRENDS_CSV = f"{PROC_DIR}/trends_daily.csv"
MERGED_CSV = f"{PROC_DIR}/merged_daily.csv"


In [6]:
def daterange_chunks(start_date, end_date, months_per_chunk=6):
    """Yield consecutive ``(chunk_start, chunk_end)`` date pairs covering a range.

    Both bounds are inclusive and every day from ``start_date`` to ``end_date``
    falls into exactly one chunk. Each chunk spans ``months_per_chunk`` calendar
    months, except the last one, which is cut off at ``end_date`` (and may be a
    single day).

    Args:
        start_date: First day of the range; anything ``pd.to_datetime`` accepts.
        end_date: Last day of the range (inclusive); same accepted forms.
        months_per_chunk: Length of each chunk in calendar months (>= 1).

    Yields:
        Tuples of ``datetime.date`` objects ``(chunk_start, chunk_end)``.
        Nothing is yielded when ``start_date`` is after ``end_date``.

    Raises:
        ValueError: If ``months_per_chunk`` is smaller than 1 (the cursor would
            never advance).
    """
    if months_per_chunk < 1:
        raise ValueError("months_per_chunk must be >= 1")

    # Drop any time-of-day component so the arithmetic is on whole days.
    start = pd.Timestamp(pd.to_datetime(start_date).date())
    end   = pd.Timestamp(pd.to_datetime(end_date).date())
    one_day = pd.Timedelta(days=1)
    step = pd.DateOffset(months=months_per_chunk)

    cur = start
    # `<=` (not `<`): a chunk that starts exactly on `end` is still a valid
    # one-day chunk and must not be dropped.
    while cur <= end:
        nxt = min(cur + step - one_day, end)
        yield cur.date(), nxt.date()
        cur = nxt + one_day

def fetch_trends_series(keyword, start_date, end_date, geo="", gprop="",
                        months_per_chunk=6, max_retries=5, pause=1.5):
    """Download Google Trends interest for one keyword, chunk by chunk.

    The date range is split with ``daterange_chunks`` and each chunk is
    requested separately through pytrends. A failing request is retried with a
    growing delay; once ``max_retries`` retries are used up the chunk is
    skipped with a warning and the remaining chunks are still fetched.

    NOTE(review): the chunks are concatenated as returned. Google Trends
    presumably scales values relative to each request's own peak, so values
    from different chunks may not share a common scale — confirm, and rescale
    via overlapping windows if cross-chunk comparability matters.

    Args:
        keyword: Search term to query.
        start_date: First day of the range (anything ``pd.to_datetime`` accepts).
        end_date: Last day of the range.
        geo: Passed through to ``build_payload``.
        gprop: Passed through to ``build_payload``.
        months_per_chunk: Length of each request window in months.
        max_retries: Number of retries per chunk before giving up on it.
        pause: Seconds to wait after each successful request.

    Returns:
        A DataFrame re-indexed to daily frequency (days without data, e.g. from
        skipped chunks, are NaN), or an empty DataFrame when no chunk returned
        any rows.
    """
    pytrends = TrendReq(hl="en-US", tz=0)
    frames = []
    chunks = daterange_chunks(start_date, end_date, months_per_chunk=months_per_chunk)
    for i, (chunk_start, chunk_end) in enumerate(chunks, 1):
        timeframe = f"{chunk_start} {chunk_end}"
        tries, delay = 0, 8
        while True:
            try:
                pytrends.build_payload([keyword], timeframe=timeframe, geo=geo, gprop=gprop)
                part = pytrends.interest_over_time().drop(columns=["isPartial"], errors="ignore")
                if len(part):
                    frames.append(part)
                print(f"[ok] {keyword} chunk {i}: {chunk_start} → {chunk_end}, rows={len(part)}")
                time.sleep(pause)  # polite pause
                break
            except Exception as ex:
                # Best-effort by design: any failure is retried, not only
                # rate limiting, so the log reports the actual exception.
                tries += 1
                if tries > max_retries:
                    print(f"[warn] {keyword} failed for {timeframe}: {ex}")
                    break
                print(f"[backoff] {keyword} {timeframe}: {ex!r} → sleep {delay}s … ({tries}/{max_retries})")
                time.sleep(delay)
                # Grow the wait by 1.8x per retry, capped at three minutes.
                delay = min(int(delay * 1.8), 180)

    if not frames:
        return pd.DataFrame()

    combined = pd.concat(frames).sort_index()
    # A date present in more than one frame keeps its last occurrence, then
    # the index is expanded to one row per calendar day.
    combined = combined[~combined.index.duplicated(keep="last")].asfreq("D")
    return combined


In [7]:
# Collect one single-column frame per keyword; keywords that returned no rows
# are left out.
all_svi = []
for kw in KEYWORDS:
    kw_frame = fetch_trends_series(kw, START_DATE, END_DATE)
    if len(kw_frame) == 0:
        continue
    kw_frame.columns = [kw]
    all_svi.append(kw_frame)

if not all_svi:
    trends = pd.DataFrame()
    print("[warn] no trends pulled")
else:
    # Keywords side by side (one column each), ordered by date, then saved.
    trends = pd.concat(all_svi, axis=1).sort_index()
    trends.to_csv(TRENDS_CSV)
    print("[ok] saved trends:", trends.shape, "→", TRENDS_CSV)


[ok] gold price chunk 1: 2015-01-01 → 2015-06-30, rows=181
[ok] gold price chunk 2: 2015-07-01 → 2015-12-31, rows=184
[ok] gold price chunk 3: 2016-01-01 → 2016-06-30, rows=182
[ok] gold price chunk 4: 2016-07-01 → 2016-12-31, rows=184
[ok] gold price chunk 5: 2017-01-01 → 2017-06-30, rows=181
[ok] gold price chunk 6: 2017-07-01 → 2017-12-31, rows=184
[ok] gold price chunk 7: 2018-01-01 → 2018-06-30, rows=181
[ok] gold price chunk 8: 2018-07-01 → 2018-12-31, rows=184
[ok] gold price chunk 9: 2019-01-01 → 2019-06-30, rows=181
[ok] gold price chunk 10: 2019-07-01 → 2019-12-31, rows=184
[ok] gold price chunk 11: 2020-01-01 → 2020-06-30, rows=182
[ok] gold price chunk 12: 2020-07-01 → 2020-12-31, rows=184
[ok] gold price chunk 13: 2021-01-01 → 2021-06-30, rows=181
[ok] gold price chunk 14: 2021-07-01 → 2021-12-31, rows=184
[ok] gold price chunk 15: 2022-01-01 → 2022-06-30, rows=181
[ok] gold price chunk 16: 2022-07-01 → 2022-12-31, rows=184
[ok] gold price chunk 17: 2023-01-01 → 2023-06-30

  df = df.fillna(False)


[ok] gold price chunk 22: 2025-07-01 → 2025-08-29, rows=60
[ok] saved trends: (3894, 1) → /content/drive/MyDrive/gt-markets/data/processed/trends_daily.csv
