In [2]:
# !pip install kagglehub pandas numpy pyarrow

from pathlib import Path
import pandas as pd
import numpy as np

try:
    import kagglehub
except Exception:
    kagglehub = None

# --- configuration ---------------------------------------------------------
# Kaggle dataset handle passed to kagglehub when downloading.
DATASET_ID = "lylebegbie/international-rugby-union-results-from-18712022"

# Filter settings (not used by the bronze step visible in this cell).
TEAM_NAME = "South Africa"
MIN_YEAR = 1992

# Output folders, created eagerly so later writes never hit a missing
# directory; exist_ok makes re-running the cell harmless.
BRONZE = Path("data/bronze")
BRONZE.mkdir(parents=True, exist_ok=True)
SILVER = Path("data/silver")
SILVER.mkdir(parents=True, exist_ok=True)

def download_bronze() -> Path:
    """Download the Kaggle dataset and store its largest CSV as bronze parquet.

    The dataset identified by ``DATASET_ID`` is fetched through kagglehub,
    the biggest ``*.csv`` (by size on disk) anywhere under the download
    directory is loaded with pandas, and the frame is written to
    ``BRONZE / "bronze_results.parquet"``. A one-line summary is printed.

    Returns:
        Path of the parquet file that was written.

    Raises:
        RuntimeError: the module-level ``kagglehub`` name is ``None``.
        FileNotFoundError: no CSV file exists in the downloaded dataset.
    """
    if kagglehub is None:
        raise RuntimeError("Please install kagglehub: pip install kagglehub")

    dataset_dir = Path(kagglehub.dataset_download(DATASET_ID))

    # Pick the largest CSV; ``default=None`` covers a dataset without any.
    largest_csv = max(
        dataset_dir.rglob("*.csv"),
        key=lambda candidate: candidate.stat().st_size,
        default=None,
    )
    if largest_csv is None:
        raise FileNotFoundError("No CSV found in Kaggle dataset.")

    raw = pd.read_csv(largest_csv)

    # Bronze stays close to the source: only the "date" column is typed,
    # and values that cannot be parsed are coerced rather than raising.
    if "date" in raw.columns:
        raw["date"] = pd.to_datetime(raw["date"], errors="coerce")

    out_path = BRONZE / "bronze_results.parquet"
    raw.to_parquet(out_path, index=False)
    print(f"[BRONZE] Saved {out_path} (rows={len(raw)})")
    return out_path

# Run the bronze step now and keep the written parquet path for later use.
bronze_path = download_bronze()


[BRONZE] Saved data\bronze\bronze_results.parquet (rows=2783)
