In [1]:
# !pip install pandas numpy pyarrow

from pathlib import Path
import numpy as np
import pandas as pd

# Input (silver layer) file and output (gold layer) directory.
SILVER = Path("data/silver/silver_results.parquet")
GOLD_DIR = Path("data/gold")
GOLD_DIR.mkdir(parents=True, exist_ok=True)  # idempotent, safe to re-run

# Boundaries of the chronological train/test split applied to "date" later on.
TRAIN_END = pd.Timestamp("2016-12-31")
TEST_START = pd.Timestamp("2017-01-01")

# Load the silver results in chronological order with a fresh 0..n-1 index.
# The lagged features computed afterwards (shift / rolling / running
# head-to-head tallies) depend on row order, so use a stable sort: rows that
# share a date keep the order they have in the silver file rather than the
# arbitrary tie order of the default (unstable) quicksort.
df = (
    pd.read_parquet(SILVER)
    .sort_values("date", kind="mergesort")
    .reset_index(drop=True)
)

# Recent form: mean of `win` over up to three preceding rows.  Shifting by one
# row first keeps the current row's own result out of its feature value; the
# first row therefore has no history and gets NaN.
df["rolling_form_3"] = (
    df["win"]
    .shift(periods=1)
    .rolling(window=3, min_periods=1)
    .mean()
)

# Head-to-head prior win rate against the current opponent (lagged).
# Walking the rows in the frame's current order, each row first receives
# wins/games accumulated over the EARLIER rows with the same opp_team, and only
# then is its own result folded into the tallies, so a row never contributes to
# its own feature.  The first row for an opponent has no history and gets NaN.
# Only two columns are needed, so iterate them with zip instead of
# DataFrame.iterrows(), which builds a full Series object for every row.
prev_wins, prev_games, h2h_vals = {}, {}, []
for opp, w in zip(df["opp_team"], df["win"]):
    wins = prev_wins.get(opp, 0)
    games = prev_games.get(opp, 0)
    h2h_vals.append(wins / games if games > 0 else np.nan)
    prev_games[opp] = games + 1
    # Only a value equal to 1 counts as a win; anything else adds a game only.
    prev_wins[opp] = wins + (1 if w == 1 else 0)
df["h2h_winrate"] = h2h_vals

# Rest days: whole days between each row's date and the previous row's date.
# The first row has no predecessor, so its value is missing.
df["days_since_prev"] = df["date"].diff().dt.days

# Persist the full frame (engineered feature columns included).  The path is
# built from GOLD_DIR so the file lands in the directory configured at the top
# of the script instead of a second hard-coded copy of that path.
gold_path = GOLD_DIR / "gold_results.parquet"
df.to_parquet(gold_path, index=False)
print(f"[GOLD] Saved {gold_path} (rows={len(df)})")

# Time-based split: rows dated up to and including TRAIN_END go to train, rows
# dated TEST_START or later go to test.  Compare on the calendar day
# (time-of-day stripped) so that a row stamped e.g. 2016-12-31 18:00 is not
# later than TRAIN_END's midnight and earlier than TEST_START at the same
# time, which would silently drop it from both partitions.
match_day = df["date"].dt.normalize()
train = df[match_day <= TRAIN_END].copy()
test = df[match_day >= TEST_START].copy()

# Columns written to the model-ready files: numeric features, categorical
# features, and the target.
# NOTE(review): "score_margin" looks like a post-match outcome; if it is the
# final score difference it determines `win` directly and leaks the target.
# It is kept here so the output schema is unchanged -- confirm before training.
X_cols_num = [
    "home",
    "rolling_form_3",
    "h2h_winrate",
    "days_since_prev",
    "score_margin",
]
X_cols_cat = ["opp_team", "tournament"]
y_col = "win"

# Same column selection for both partitions, written next to the gold table.
model_cols = X_cols_num + X_cols_cat + [y_col]
train[model_cols].to_parquet(GOLD_DIR / "train.parquet", index=False)
test[model_cols].to_parquet(GOLD_DIR / "test.parquet", index=False)
print(f"[GOLD] Split saved: train={len(train)}, test={len(test)}")


[GOLD] Saved data\gold\gold_results.parquet (rows=328)
[GOLD] Split saved: train=255, test=73
