# 01 — Data Collection & Preprocessing
Load daily prices, clean missing values, compute log returns, and create train/val/test date splits.

In [None]:
!pip -q install pandas numpy matplotlib scikit-learn statsmodels tensorflow

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit

In [None]:
# Location of the raw daily price table.
DATA_PATH = "../data/prices.csv"  # or "/content/prices.csv" in Colab

# Read the CSV with the "date" column parsed as datetimes, order the rows
# chronologically, and use the date as the index.
prices = (
    pd.read_csv(DATA_PATH, parse_dates=["date"])
    .sort_values("date")
    .set_index("date")
)

# Quick sanity check: table dimensions, then a preview of the first rows.
print(prices.shape)
prices.head()

In [None]:
# Drop tickers with too many missing values (tune threshold)
MAX_MISSING_FRAC = 0.05  # keep columns with <=5% missing
missing_frac = prices.isna().mean()  # per-column fraction of NaN rows
keep_cols = missing_frac[missing_frac <= MAX_MISSING_FRAC].index
prices = prices[keep_cols]

# Forward fill only: each gap takes the last price observed before it.
# A backfill would copy a *later* price into earlier dates (look-ahead) and
# create artificial flat prices, so the leading rows that still contain a NaN
# after the forward fill are dropped instead.
prices = prices.ffill().dropna(how="any")

print("Remaining NA:", prices.isna().sum().sum())
print("Tickers kept:", prices.shape[1])

In [None]:
# Daily log returns: r_t = log(P_t) - log(P_{t-1}).
# Non-positive prices have no finite logarithm (log(0) is -inf, which dropna()
# does not remove), so they are masked to NaN before taking the log. The
# affected rows are then removed by dropna(), together with the first row,
# which has no previous price to difference against.
rets = np.log(prices.where(prices > 0)).diff().dropna()
rets.head()

In [None]:
# Chronological train/validation/test partition (no shuffling, so each later
# segment lies strictly after the earlier ones).
# Proportions by row count: first 70% train, next 15% validation, last 15% test.
n = len(rets)
train_end, val_end = int(0.70 * n), int(0.85 * n)

# Positional slices; the three pieces are contiguous and non-overlapping.
rets_train, rets_val, rets_test = (
    rets.iloc[:train_end],
    rets.iloc[train_end:val_end],
    rets.iloc[val_end:],
)

print(rets_train.shape, rets_val.shape, rets_test.shape)

In [None]:
# Visual sanity check: plot the training-period returns of the first column.
plt.figure()
first_series = rets_train.iloc[:, 0]
first_series.plot()
plt.title("Example log return series (train)")
plt.show()

In [None]:
# Persist the full (unsplit) return table for later use.
PROCESSED_DIR = "../data/processed"
os.makedirs(PROCESSED_DIR, exist_ok=True)  # no error if the folder already exists
rets.to_parquet(f"{PROCESSED_DIR}/returns.parquet")