# Data Collection and Caching

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import yfinance as yf

#project root finder (folder that contains /data)
cwd = Path.cwd().resolve()
project_root = None
for p in [cwd] + list(cwd.parents):
    if (p / "data").exists():
        project_root = p
        break
if project_root is None:
    raise RuntimeError("Project root not found: missing /data folder")

DATA_DIR = project_root / "data"
RAW_DIR = DATA_DIR / "raw"
RAW_DIR.mkdir(parents=True, exist_ok=True)

print("project_root:", project_root)
print("raw_dir:", RAW_DIR)


project_root: /media/workhorse/Windows/MyWorkPlace/Projects/StockPredictor
raw_dir: /media/workhorse/Windows/MyWorkPlace/Projects/StockPredictor/data/raw


In [2]:
#config
 
TARGET_TICKERS = ["AAPL", "MSFT", "GOOG", "AMZN"]

EXOG_TICKERS = [
    "SPY", "QQQ", "XLK",
    "^VIX", "^TNX",
    "UUP", "CL=F", "GC=F"
]

ALL_TICKERS = TARGET_TICKERS + EXOG_TICKERS

START_DATE = "2013-01-01"
END_DATE = pd.Timestamp.today().strftime("%Y-%m-%d")

LAGS_ENDOG = list(range(1, 61))
LAGS_EXOG = list(range(1, 21))
ROLL_WINDOWS = [5, 10, 20, 60]

print("tickers:", ALL_TICKERS)
print("start:", START_DATE, "end:", END_DATE)

tickers: ['AAPL', 'MSFT', 'GOOG', 'AMZN', 'SPY', 'QQQ', 'XLK', '^VIX', '^TNX', 'UUP', 'CL=F', 'GC=F']
start: 2013-01-01 end: 2026-01-26


In [3]:
#download OHLCV from yfinance and save raw

raw = yf.download(
    tickers=ALL_TICKERS,
    start=START_DATE,
    end=END_DATE,
    interval="1d",
    auto_adjust=False,
    actions=False,
    group_by="ticker",
    threads=True,
    progress=False
)

#convert yfinance output to long format: date, ticker, Open, High, Low, Close, Adj Close, Volume
frames = []
if isinstance(raw.columns, pd.MultiIndex):
    for t in ALL_TICKERS:
        if t not in raw.columns.get_level_values(0):
            continue
        df_t = raw[t].copy()
        df_t = df_t.reset_index().rename(columns={"Date": "date"})
        df_t["ticker"] = t
        frames.append(df_t)
else:
    df_t = raw.reset_index().rename(columns={"Date": "date"})
    df_t["ticker"] = ALL_TICKERS[0]
    frames.append(df_t)

ohlcv = pd.concat(frames, ignore_index=True)
ohlcv["date"] = pd.to_datetime(ohlcv["date"]).dt.tz_localize(None)

#standardize required columns
keep_cols = ["date", "ticker", "Open", "High", "Low", "Close", "Adj Close", "Volume"]
for c in keep_cols:
    if c not in ohlcv.columns:
        ohlcv[c] = np.nan
ohlcv = ohlcv[keep_cols].sort_values(["ticker", "date"]).reset_index(drop=True)

#save raw
ohlcv_path_parquet = RAW_DIR / "yfinance_ohlcv_long.parquet"
ohlcv_path_csv = RAW_DIR / "yfinance_ohlcv_long.csv"
ohlcv.to_parquet(ohlcv_path_parquet, index=False)
ohlcv.to_csv(ohlcv_path_csv, index=False)

print("saved:", ohlcv_path_parquet)
print("saved:", ohlcv_path_csv)
print("ohlcv shape:", ohlcv.shape)
ohlcv.head()


saved: /media/workhorse/Windows/MyWorkPlace/Projects/StockPredictor/data/raw/yfinance_ohlcv_long.parquet
saved: /media/workhorse/Windows/MyWorkPlace/Projects/StockPredictor/data/raw/yfinance_ohlcv_long.csv
ohlcv shape: (39456, 8)


Price,date,ticker,Open,High,Low,Close,Adj Close,Volume
0,2013-01-02,AAPL,19.779285,19.821428,19.343929,19.608213,16.612209,560518000.0
1,2013-01-03,AAPL,19.567142,19.631071,19.321428,19.360714,16.402523,352965200.0
2,2013-01-04,AAPL,19.1775,19.236786,18.779642,18.821428,15.945646,594333600.0
3,2013-01-07,AAPL,18.642857,18.90357,18.4,18.710714,15.851843,484156400.0
4,2013-01-08,AAPL,18.900356,18.996071,18.616072,18.76107,15.894506,458707200.0
