In [2]:
# !pip install pandas numpy scikit-learn matplotlib pyarrow

from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, roc_auc_score,
    brier_score_loss, classification_report, RocCurveDisplay
)
from sklearn.calibration import calibration_curve

# -------------------
# Paths & folders
# -------------------
# Input location of the parquet splits (relative to the working directory).
GOLD_DIR = Path("data/gold")

# Output folders for plots and text reports. They are created up front so the
# later save steps never hit a missing directory; exist_ok keeps re-runs harmless.
FIG = Path("figures")
REPORT = Path("reports")
for _out_dir in (FIG, REPORT):
    _out_dir.mkdir(parents=True, exist_ok=True)

# -------------------
# Springboks colour scheme
# -------------------
PRIMARY = "#075020"    # green
SECONDARY = "#92875d"  # gold/khaki

# Install the two team colours as matplotlib's default colour cycle, so lines
# plotted without an explicit colour pick up green first, then gold.
mpl.rc("axes", prop_cycle=mpl.cycler("color", [PRIMARY, SECONDARY]))

# One run-level timestamp (local time, second resolution), computed once so that
# every output file written by this run shares the same suffix.
_TS_FORMAT = "%Y%m%d_%H%M%S"
TS = pd.Timestamp.now().strftime(_TS_FORMAT)

# -------------------
# Load chronologically split data
# -------------------
# Both splits are read as-is from GOLD_DIR; the chronological train/test split
# itself is expected to have been done upstream (not verified here).
train, test = (
    pd.read_parquet(GOLD_DIR / filename)
    for filename in ("train.parquet", "test.parquet")
)

# -------------------
# Features (NO leakage)
# -------------------
# 'score_margin' is intentionally NOT a feature: it is derived from the final
# score of the very match being predicted, so including it would leak the target.
features_cat = ["opp_team", "tournament"]
features_num = ["home", "rolling_form_3", "h2h_winrate", "days_since_prev"]
target = "win"

# Column order handed to the model: categorical columns first, then numeric ones.
_model_columns = [*features_cat, *features_num]

# Copies keep the design matrices independent of the loaded frames; the target
# is cast to int so the classifier sees integer labels.
X_train, y_train = train[_model_columns].copy(), train[target].astype(int).copy()
X_test, y_test = test[_model_columns].copy(), test[target].astype(int).copy()

# -------------------
# Preprocess + model
# -------------------
# Categorical columns are one-hot encoded into a dense array; with
# handle_unknown="ignore", categories unseen during fit do not raise at predict time.
cat_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# Numeric columns go through unchanged (no scaling in this baseline); any column
# not listed in either group is dropped.
pre = ColumnTransformer(
    [
        ("cat", cat_encoder, features_cat),
        ("num", "passthrough", features_num),
    ],
    remainder="drop",
)

# class_weight stays None here; switch to 'balanced' if the positive class is rare.
logreg = LogisticRegression(max_iter=1000, random_state=42, class_weight=None)

# Encoder and classifier are fitted together on the training split only.
model = Pipeline([("pre", pre), ("clf", logreg)]).fit(X_train, y_train)

# -------------------
# Predictions & metrics
# -------------------
# Second predict_proba column = probability of the positive class
# (presumably label 1, given the int-cast 0/1 target — confirm against model.classes_).
p_test = model.predict_proba(X_test)[:, 1]
# Hard labels at the conventional 0.5 cut-off.
yhat = (p_test >= 0.5).astype(int)

# ROC AUC is undefined when the test split holds a single class; report NaN then.
if len(np.unique(y_test)) > 1:
    roc_auc_value = roc_auc_score(y_test, p_test)
else:
    roc_auc_value = np.nan

# Key order is the order in which the metrics are later written and printed.
metrics = dict(
    accuracy=accuracy_score(y_test, yhat),
    precision=precision_score(y_test, yhat, zero_division=0),
    recall=recall_score(y_test, yhat, zero_division=0),
    roc_auc=roc_auc_value,
    brier=brier_score_loss(y_test, p_test),
)

# -------------------
# Plots: Calibration & ROC (Springboks colours + timestamped filenames)
# -------------------
# Calibration curve: test predictions are grouped into 10 quantile bins
# (roughly equal counts per bin); each bin's mean predicted probability is
# plotted against the observed fraction of positives.
frac_pos, mean_pred = calibration_curve(y_test, p_test, n_bins=10, strategy="quantile")

cal_fig, cal_ax = plt.subplots(figsize=(5.6, 5.2))
cal_ax.plot(mean_pred, frac_pos, marker="o", label="LogReg (test)", color=PRIMARY)
# Diagonal reference: where a perfectly calibrated model would sit.
cal_ax.plot([0, 1], [0, 1], linestyle="--", color="black", alpha=0.7, label="Perfectly calibrated")
cal_ax.set_xlabel("Predicted probability (bin mean)")
cal_ax.set_ylabel("Observed frequency")
cal_ax.set_title("Calibration curve (test)")
cal_ax.legend()
cal_fig.tight_layout()
cal_fig.savefig(FIG / f"calibration_curve_{TS}.pdf", bbox_inches="tight")
# Release every open figure so repeated runs do not accumulate them.
plt.close("all")

# ROC curve
# The axes is created here and passed to scikit-learn explicitly. Without `ax=`,
# RocCurveDisplay opens a new default-sized figure of its own, so a preceding
# plt.figure(figsize=...) would only produce an empty stray figure and the saved
# plot would ignore the requested size.
roc_fig, roc_ax = plt.subplots(figsize=(5.6, 5.2))
disp = RocCurveDisplay.from_predictions(y_test, p_test, name="LogReg (test)", ax=roc_ax)

# Force the ROC line colour to Springboks green. `line_` is normalised to a list
# because, depending on the scikit-learn version, it may be a single artist or a
# list of artists; a missing attribute just skips the recolouring instead of
# hiding every possible error behind a blanket `except`.
roc_lines = getattr(disp, "line_", None) or []
if not isinstance(roc_lines, (list, tuple)):
    roc_lines = [roc_lines]
for roc_line in roc_lines:
    roc_line.set_color(PRIMARY)

roc_ax.set_title("ROC curve (test)")
roc_fig.tight_layout()
roc_fig.savefig(FIG / f"roc_curve_{TS}.pdf", bbox_inches="tight")
# Release every open figure so repeated runs do not accumulate them.
plt.close("all")

# -------------------
# Save metrics (timestamped)
# -------------------
def _report_chunks():
    """Yield the text pieces of the metrics report, in the order they are written."""
    yield "Baseline Logistic Regression (test set)\n"
    yield "--------------------------------------\n"
    # One "name: value" line per metric, 4 decimal places.
    for metric_name, metric_value in metrics.items():
        yield f"{metric_name}: {metric_value:.4f}\n"
    yield "\nClassification report (threshold = 0.5):\n"
    yield classification_report(y_test, yhat, digits=3)
    # Record which inputs the model used, so the report is self-describing.
    yield "\nFeatures used:\n"
    yield f"Numerical: {features_num}\n"
    yield f"Categorical (one-hot): {features_cat}\n"


# writelines consumes the generator lazily, so pieces reach the file one by one
# in the order yielded above.
with open(REPORT / f"metrics_{TS}.txt", "w", encoding="utf-8") as report_file:
    report_file.writelines(_report_chunks())

# Console summary: where the artefacts went, followed by the headline metrics.
print(f"Saved: figures/calibration_curve_{TS}.pdf, figures/roc_curve_{TS}.pdf, reports/metrics_{TS}.txt")
print("\n".join(f"{k}: {v:.4f}" for k, v in metrics.items()))


ModuleNotFoundError: No module named 'sklearn'