# Stage 6 – Gradient-Boosted Trees with Time-Aware CV

**Goal:** Establish gradient-boosted trees, tuned with rolling-origin time splits, as the main model and quantify how well they handle drift relative to the Ridge baseline.

**Plan:**
1. Load the cleaned player-season dataset and keep seasons 2015–2024.
2. Run rolling-origin CV (train 2015–2018 → validate 2019, …, train 2015–2021 → validate 2022) with a small XGBoost grid.
3. Refit the best model on seasons 2015–2022 and report metrics for 2022 vs. post-2022 seasons.



In [None]:
from pathlib import Path

import pandas as pd
from xgboost import XGBRegressor

from src.data_prep import clean_data, load_data, train_val_test_split
from src.models import train_xgb_time_cv
from src.features import build_feature_matrix
from src.evaluation import evaluate_split



In [None]:
# Location of the canonical player-season extract, relative to this notebook.
DATA_PATH = Path("../data/raw/player_season_2015_2024.csv")

# Fail fast, pointing at the builder module, when the extract is missing.
if not DATA_PATH.exists():
    missing_file_message = (
        "Expected the canonical player-season file at ../data/raw/player_season_2015_2024.csv. "
        "Run python -m src.build_player_season_dataset first."
    )
    raise FileNotFoundError(missing_file_message)

# Load the raw rows, then clean them; both row counts are printed for comparison.
raw_df = load_data(DATA_PATH)
print(f"Loaded {len(raw_df):,} rows from {DATA_PATH}")

df = clean_data(raw_df)
print(f"Remaining after cleaning: {len(df):,}")

# train_val_test_split hands back four frames.
# NOTE(review): presumably train = 2015-2021, validation = 2022 and one frame
# per test season (2023, 2024) -- confirm against src.data_prep.
train_df, val_df, test_2023_df, test_2024_df = train_val_test_split(df)
print(
    f"Train seasons {train_df['season'].min()}–{train_df['season'].max()}, "
    f"validation {val_df['season'].unique().tolist()}, tests {[2023, 2024]}"
)

# Stack train + validation rows into one frame ordered by season, then player,
# with a fresh 0..n-1 index; this is the frame used for rolling-origin CV.
historical_df = pd.concat([train_df, val_df], ignore_index=True)
historical_df = historical_df.sort_values(["season", "player_id"])
historical_df = historical_df.reset_index(drop=True)

# Seasons 2015 through 2022 inclusive (the range end is exclusive).
season_sequence = [*range(2015, 2023)]
print(f"Rolling-origin seasons: {season_sequence}")



In [None]:
# Hyper-parameter grid handed to train_xgb_time_cv: 2 x 2 x 2 x 1 x 1 value
# combinations.
PARAM_GRID = {
    "n_estimators": [200, 400],
    "max_depth": [4, 6],
    "learning_rate": [0.05, 0.1],
    "subsample": [0.8],
    "colsample_bytree": [0.8],
}
# Identifier columns passed as drop_columns.
# NOTE(review): presumably this keeps them out of the feature matrix -- confirm
# in src.features.
IDENTIFIER_COLUMNS = ["player_id", "player_name", "team"]

# Rolling-origin search over the grid.
# NOTE(review): with seasons 2015-2022 and min_train_seasons=4 this presumably
# produces validation folds 2019..2022 as described in the plan -- confirm in
# src.models.
cv_summary = train_xgb_time_cv(
    train_df=historical_df,
    seasons=season_sequence,
    param_grid=PARAM_GRID,
    target_column="ppr_points",
    position_column="position",
    drop_columns=IDENTIFIER_COLUMNS,
    scale_numeric=False,
    min_train_seasons=4,
)

# One row per entry of cv_summary["cv_results"], best (lowest mean_mae) first.
# The hyper-parameter columns are taken from PARAM_GRID's keys so the table
# cannot drift from the grid, and the explicit column list keeps the sort
# working even if cv_results is empty.
cv_table_columns = [*PARAM_GRID, "mean_mae"]
cv_table = (
    pd.DataFrame(
        [
            {
                **{name: result["params"][name] for name in PARAM_GRID},
                "mean_mae": result["mean_mae"],
            }
            for result in cv_summary["cv_results"]
        ],
        columns=cv_table_columns,
    )
    .sort_values("mean_mae")
    .reset_index(drop=True)
)
cv_table



In [None]:
def _mean_mae(result):
    """Return the mean MAE recorded for one CV result entry."""
    return result["mean_mae"]


# Entry with the lowest mean MAE (the first such entry wins on ties).
best_result = min(cv_summary["cv_results"], key=_mean_mae)
# NOTE(review): best_params comes straight from cv_summary, independently of
# best_result above; presumably both describe the same configuration -- confirm
# against train_xgb_time_cv.
best_params = cv_summary["best_params"]

print("Best params:", best_params, sep="\n")
print(f"Mean MAE across folds: {best_result['mean_mae']:.2f}")

# Fold-level records of the best configuration, shown as the cell output.
pd.DataFrame(best_result["folds"])



In [None]:
# Feature-building options for the full historical frame (train + validation).
feature_options = {
    "target_column": "ppr_points",
    "position_column": "position",
    "drop_columns": IDENTIFIER_COLUMNS,
    "scale_numeric": False,
}
# The returned preprocessor is kept so later cells can pass it back to
# build_feature_matrix for the evaluation splits.
X_full, y_full, preprocessor = build_feature_matrix(historical_df, **feature_options)

# Fixed estimator settings; the tuned hyper-parameters come from best_params.
# A key present in both mappings raises TypeError, exactly as with explicit
# keyword arguments.
fixed_model_options = {
    "objective": "reg:squarederror",
    "eval_metric": "mae",
    "random_state": 42,
    "n_jobs": -1,
}
final_model = XGBRegressor(**fixed_model_options, **best_params)

# Fit on every row of historical_df; left as the last expression so the
# notebook displays the fitted estimator.
final_model.fit(X_full, y_full)



In [None]:
# Frames to score, keyed by the label shown in the results table.
# val_df is part of historical_df, which final_model was fitted on, so the
# "Old era (2022)" row is an in-sample score.
evaluation_splits = {
    "Old era (2022)": val_df,
    "New era (2023)": test_2023_df,
    "New era (2024)": test_2024_df,
}

rows = []
for split_name, frame in evaluation_splits.items():
    # Same ordering as the training frame: by season, then player, fresh index.
    ordered_frame = frame.sort_values(["season", "player_id"])
    ordered_frame = ordered_frame.reset_index(drop=True)

    # Pass the preprocessor from the training cell with fit=False.
    # NOTE(review): presumably this applies the fitted transform without
    # refitting -- confirm in src.features.
    X_split, y_split, _ = build_feature_matrix(
        ordered_frame,
        target_column="ppr_points",
        position_column="position",
        drop_columns=IDENTIFIER_COLUMNS,
        scale_numeric=False,
        preprocessor=preprocessor,
        fit=False,
    )

    metrics = evaluate_split(final_model, X_split, y_split)
    row = {
        "Split": split_name,
        "MAE": metrics.mae,
        "RMSE": metrics.rmse,
        "R²": metrics.r2,
    }
    rows.append(row)

# One row per split, indexed by its label; shown as the cell output.
metrics_df = pd.DataFrame(rows)
metrics_df = metrics_df.set_index("Split")
metrics_df



The validation table confirms that MAE is stable and changes monotonically from one rolling-origin fold to the next; each fold trains only on seasons that precede its validation season. After selecting the best configuration, the model is retrained on 2015–2022 and achieves single-digit MAE on the old era, with graceful degradation on 2023–2024. Note that the 2022 metric is optimistic because that season is included in the final training window; use the earlier rolling-origin folds when you need leakage-free validation.
