In [None]:
# package imports 
from __future__ import annotations

import os
import glob
import warnings
warnings.filterwarnings("ignore")

from typing import Dict, List, Tuple, Optional

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, SplineTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score

from xgboost import XGBClassifier

In [None]:
# Configuration

# Contract table read by the main cell.
DATA_PATH = r"contracts_with_isi_v2_SWEEP_WIDE_WITH_KEYS_PLUS_CPI.csv"

# Season-level stat panels joined onto the contracts as pre-signing features.
BAT_RATES_PATH = r"batting_rates_by_season.csv"
PIT_RATES_PATH = r"pitching_rates_by_season.csv"
DEF_STATS_PATH = r"defensive_stats.csv"
STATCAST_PIT_PATH = r"statcast_pitching_2015_2025.csv"

# WAR folder directory (contains war files from 2010-2025)
# Built with os.path.join so the separator matches the host OS; a literal
# backslash in the path only resolves on Windows.
WAR_DIR = os.path.join("war_rankings_raw", "war")

# Contract start years used for fitting vs. evaluation (consumed by time_split).
TRAIN_YEARS = {2020, 2021, 2022, 2023}
TEST_YEARS  = {2024, 2025}


# Pre window is [year - PRE_YEARS, year - 1]; post window is [year, year + POST_YEARS - 1].
PRE_YEARS = 3
POST_YEARS = 3

# Contracts whose years_int exceeds this value are dropped by the main cell.
MAX_YEARS = 5

# Column used for the top-AAV filter; REMOVE_TOP_PCTL is the train-set quantile
# above which rows are removed in the TOP5_REMOVED variant.
AAV_COL = "guarantee_real_per_year_2025"
REMOVE_TOP_PCTL = 0.95

# Threshold sweep: the label is 1 when war_loss_mean >= threshold.
WAR_LOSS_THRESHOLDS = [0.5, 1.0, 1.5, 2.0, 2.5]

# Output
OUT_CLS_RESULTS = r"model_comparison_classification_BASELINE_threshold_sweep.csv"

# Numeric model inputs that exist before any generated panel features are added.
BASE_NUMERIC = [
    "age_at_signing",
    "years_int",
    "opt_out_flag",
    "year",
    "is_pitcher_flag",

    "war_pre_mean",
    "war_pre_sum",
    "war_pre_n",
]

# Categorical model inputs (one-hot encoded by make_preprocessor).
BASE_CATEGORICAL = [
    "position",
    "qualifying_offer",
]

In [None]:
# Helper functions

# remove repeated items while keeping first-seen order
def unique_list(seq):
    """Return the items of ``seq`` as a list with later duplicates removed."""
    first_seen = {}
    for item in seq:
        # dict keys keep insertion order, so the first occurrence wins
        first_seen.setdefault(item, None)
    return list(first_seen)

# keep only the first column for every repeated column name
def dedupe_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` without columns whose name already appeared further left."""
    is_first_occurrence = ~df.columns.duplicated()
    deduped = df.loc[:, is_first_occurrence]
    return deduped.copy()


PITCHER_PREFIXES = ("P", "SP", "RP", "RHP", "LHP")
def is_pitcher(pos) -> int:
    """Return 1 when the position label looks like a pitcher, else 0.

    The label is stripped and upper-cased first. It counts as a pitcher when it
    starts with any entry of ``PITCHER_PREFIXES`` or contains "RHP"/"LHP"
    anywhere. Missing values return 0.
    """
    if pd.isna(pos):
        return 0
    label = str(pos).strip().upper()
    if label.startswith(PITCHER_PREFIXES):
        return 1
    # catches labels where the handedness tag is not at the start
    return int(any(tag in label for tag in ("RHP", "LHP")))

def time_split(df: pd.DataFrame, year_col: str = "year") -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Split ``df`` into (train, test) copies by membership of ``year_col`` in TRAIN_YEARS / TEST_YEARS.

    Years are coerced to nullable integers first; rows whose year is in neither
    set (or cannot be parsed) appear in neither output.
    """
    years = pd.to_numeric(df[year_col], errors="coerce").astype("Int64")
    in_train = years.isin(TRAIN_YEARS)
    in_test = years.isin(TEST_YEARS)
    return df[in_train].copy(), df[in_test].copy()

def classification_metrics(y_true: np.ndarray, y_prob: np.ndarray, threshold: float = 0.5) -> Dict[str, float]:
    """Compute AUC, accuracy and F1 for binary labels against predicted probabilities.

    Parameters
    ----------
    y_true : array-like of 0/1 labels; non-finite entries are dropped.
    y_prob : array-like of predicted probabilities; non-finite entries are dropped.
    threshold : probabilities at or above this value are predicted as class 1.

    Returns
    -------
    dict with keys "AUC", "Accuracy", "F1", "n_eval" and "pos_rate_eval".
    AUC is NaN when only one class remains; every metric is NaN (and n_eval is
    0) when no finite label/probability pair remains.
    """
    # Keep labels as float until after masking: casting to int first would turn
    # NaN labels into arbitrary integers and make the finite check a no-op.
    labels = np.asarray(y_true, dtype=float)
    probs = np.asarray(y_prob, dtype=float)

    keep = np.isfinite(probs) & np.isfinite(labels)
    labels = labels[keep].astype(int)
    probs = probs[keep]

    # Nothing left to score: report NaNs instead of handing empty arrays to the scorers.
    if labels.size == 0:
        return {
            "AUC": np.nan,
            "Accuracy": np.nan,
            "F1": np.nan,
            "n_eval": 0,
            "pos_rate_eval": np.nan,
        }

    # Data integrity check
    # AUC undefined if only one class present
    if len(np.unique(labels)) < 2:
        auc = np.nan
    else:
        auc = float(roc_auc_score(labels, probs))

    y_pred = (probs >= threshold).astype(int)
    acc = float(accuracy_score(labels, y_pred))
    f1  = float(f1_score(labels, y_pred, zero_division=0))

    return {
        "AUC": auc,
        "Accuracy": acc,
        "F1": f1,
        "n_eval": int(len(labels)),
        "pos_rate_eval": float(labels.mean()),
    }

def _safe_numeric(df: pd.DataFrame, cols: List[str]) -> pd.DataFrame:
    out = df.copy()
    for c in cols:
        if c in out.columns:
            out[c] = pd.to_numeric(out[c], errors="coerce")
    return out

def _weighted_mean(series: pd.Series, weights: pd.Series) -> float:
    s = pd.to_numeric(series, errors="coerce")
    w = pd.to_numeric(weights, errors="coerce").fillna(0.0)
    mask = np.isfinite(s) & np.isfinite(w) & (w > 0)
    if mask.sum() == 0:
        s2 = s[np.isfinite(s)]
        return float(s2.mean()) if len(s2) else np.nan
    return float(np.average(s[mask], weights=w[mask]))

def add_pre_rate_features(
    contracts: pd.DataFrame,
    season_rates: pd.DataFrame,
    *,
    rate_cols: List[str],
    weight_col: str | None,
    prefix: str,
    pre_years: int = 3,
) -> pd.DataFrame:
    """Attach pre-signing averages of season rate stats to each contract.

    Contracts are joined to ``season_rates`` on ``key_fangraphs`` == ``playerId``
    and only seasons in ``[year - pre_years, year - 1]`` are kept.

    Parameters
    ----------
    contracts : frame with ``key_fangraphs`` and ``year`` columns.
    season_rates : frame with ``playerId`` and ``Season`` columns plus the stats.
    rate_cols : stat columns to aggregate; names absent from ``season_rates``
        produce an all-NaN feature.
    weight_col : optional weight column; when present each stat is aggregated
        with ``_weighted_mean``, otherwise with a plain mean.
    prefix : prefix for every generated column name.
    pre_years : number of seasons in the lookback window.

    Returns
    -------
    Copy of ``contracts`` (``key_fangraphs`` / ``year`` coerced to numeric) with
    ``{prefix}_pre_seasons``, ``{prefix}_pre_reliability_sum`` (only when the
    weight column exists), ``{prefix}_pre_{stat}`` for each stat, and
    ``has_{prefix}_pre``.
    """
    dfc = contracts.copy()
    dfc["key_fangraphs"] = pd.to_numeric(dfc["key_fangraphs"], errors="coerce")
    dfc["year"] = pd.to_numeric(dfc["year"], errors="coerce")
    # positional id so aggregates can be joined back regardless of the index
    dfc["_row_id"] = np.arange(len(dfc), dtype=int)

    dfs = season_rates.copy()
    dfs["playerId"] = pd.to_numeric(dfs["playerId"], errors="coerce")
    dfs["Season"] = pd.to_numeric(dfs["Season"], errors="coerce")

    dfs = _safe_numeric(dfs, rate_cols + ([weight_col] if weight_col else []))

    # pandas merge matches NaN keys to NaN keys, so stat rows whose id failed
    # coercion would otherwise attach to every contract lacking key_fangraphs.
    dfs = dfs.dropna(subset=["playerId", "Season"])

    m = dfc[["_row_id", "key_fangraphs", "year"]].merge(
        dfs,
        left_on="key_fangraphs",
        right_on="playerId",
        how="left",
    )

    # keep only seasons inside the lookback window
    m["lb_start"] = m["year"] - pre_years
    m["lb_end"] = m["year"] - 1
    m = m[m["Season"].between(m["lb_start"], m["lb_end"], inclusive="both")].copy()

    out = dfc.copy()

    # coverage: number of distinct seasons found in the window
    cov = m.groupby("_row_id")["Season"].nunique().rename(f"{prefix}_pre_seasons")
    out = out.merge(cov, left_on="_row_id", right_index=True, how="left")
    out[f"{prefix}_pre_seasons"] = out[f"{prefix}_pre_seasons"].fillna(0).astype(int)

    if weight_col and weight_col in m.columns:
        rel_sum = m.groupby("_row_id")[weight_col].sum(min_count=1).rename(f"{prefix}_pre_reliability_sum")
        out = out.merge(rel_sum, left_on="_row_id", right_index=True, how="left")
        out[f"{prefix}_pre_reliability_sum"] = pd.to_numeric(out[f"{prefix}_pre_reliability_sum"], errors="coerce").fillna(0.0)

    # With no season inside any window there is nothing to aggregate; the
    # groupby/apply path cannot build a named Series from an empty frame.
    nothing_matched = m.empty

    for rc in rate_cols:
        feat_name = f"{prefix}_pre_{rc}"
        if nothing_matched or rc not in m.columns:
            out[feat_name] = np.nan
            continue

        if weight_col and weight_col in m.columns:
            agg = m.groupby("_row_id").apply(lambda g: _weighted_mean(g[rc], g[weight_col])).rename(feat_name)
        else:
            agg = m.groupby("_row_id")[rc].mean().rename(feat_name)

        out = out.merge(agg, left_on="_row_id", right_index=True, how="left")

    out[f"has_{prefix}_pre"] = (out[f"{prefix}_pre_seasons"] > 0).astype(int)
    out = out.drop(columns=["_row_id"])
    return out

In [None]:
# apply pre panel features

def add_pre_panel_features(
    contracts: pd.DataFrame,
    panel: pd.DataFrame,
    *,
    contract_key_col: str,
    panel_key_col: str,
    contract_year_col: str,
    panel_year_col: str,
    feature_cols: List[str],
    weight_col: str | None,
    prefix: str,
    pre_years: int = 3,
) -> pd.DataFrame:
    """Generic season panel aggregator (only used for defense and statcast files).

    Joins ``contracts`` to ``panel`` on the given key columns, keeps panel
    seasons in ``[contract_year - pre_years, contract_year - 1]`` and aggregates
    each feature per contract.

    Parameters
    ----------
    contracts, panel : input frames; neither is modified.
    contract_key_col, panel_key_col : join key column in each frame.
    contract_year_col, panel_year_col : year column in each frame.
    feature_cols : panel columns to aggregate; names absent from ``panel`` are skipped.
    weight_col : optional panel column used as the weight of a weighted mean;
        without it (or when it is absent from ``panel``) a plain mean is used.
    prefix : prefix for every generated column name.
    pre_years : number of seasons in the lookback window.

    Returns
    -------
    Copy of ``contracts`` with ``{prefix}_pre_seasons``,
    ``{prefix}_pre_weight_sum`` (only when the weight column exists),
    ``{prefix}_pre_{feature}`` for each kept feature, and ``has_{prefix}_pre``.

    Raises
    ------
    KeyError
        If a key or year column is missing from either frame.
    """
    # Data integrity
    # Required-column check
    missing_contract = [c for c in [contract_key_col, contract_year_col] if c not in contracts.columns]
    missing_panel = [c for c in [panel_key_col, panel_year_col] if c not in panel.columns]
    if missing_contract:
        raise KeyError(f"[add_pre_panel_features] contracts missing: {missing_contract}")
    if missing_panel:
        raise KeyError(f"[add_pre_panel_features] panel missing: {missing_panel}")

    out = contracts.copy()
    # positional id so aggregates can be joined back regardless of the index
    out["_row_id"] = np.arange(len(out), dtype=int)

    # temp cols (underscore-prefixed so they cannot clash when both frames use e.g. "year")
    dfc = out[["_row_id", contract_key_col, contract_year_col]].copy()
    dfc["_contract_key"] = pd.to_numeric(dfc[contract_key_col], errors="coerce")
    dfc["_contract_year"] = pd.to_numeric(dfc[contract_year_col], errors="coerce")

    # returns necessary cols only
    keep_feats = [c for c in feature_cols if c in panel.columns]
    keep_cols = [panel_key_col, panel_year_col] + keep_feats
    if weight_col and weight_col in panel.columns and weight_col not in keep_cols:
        keep_cols.append(weight_col)

    p = panel[keep_cols].copy()
    p["_panel_key"] = pd.to_numeric(p[panel_key_col], errors="coerce")
    p["_panel_year"] = pd.to_numeric(p[panel_year_col], errors="coerce")

    for c in keep_feats:
        p[c] = pd.to_numeric(p[c], errors="coerce")
    if weight_col and weight_col in p.columns:
        p[weight_col] = pd.to_numeric(p[weight_col], errors="coerce")

    # pandas merge matches NaN keys to NaN keys, so panel rows whose id failed
    # coercion would otherwise attach to every contract lacking a key.
    p = p.dropna(subset=["_panel_key", "_panel_year"])

    merge_cols = ["_panel_key", "_panel_year"] + keep_feats + ([weight_col] if (weight_col and weight_col in p.columns) else [])
    m = dfc[["_row_id", "_contract_key", "_contract_year"]].merge(
        p[merge_cols],
        left_on="_contract_key",
        right_on="_panel_key",
        how="left",
    )

    # Apply lookback filter
    m["lb_start"] = m["_contract_year"] - pre_years
    m["lb_end"] = m["_contract_year"] - 1
    m = m[m["_panel_year"].between(m["lb_start"], m["lb_end"], inclusive="both")].copy()

    # coverage: number of distinct seasons found in the window
    cov = m.groupby("_row_id")["_panel_year"].nunique().rename(f"{prefix}_pre_seasons")
    out = out.merge(cov, on="_row_id", how="left")
    out[f"{prefix}_pre_seasons"] = out[f"{prefix}_pre_seasons"].fillna(0).astype(int)

    # Weighted sums
    if weight_col and weight_col in m.columns:
        wsum = m.groupby("_row_id")[weight_col].sum(min_count=1).rename(f"{prefix}_pre_weight_sum")
        out = out.merge(wsum, on="_row_id", how="left")
        out[f"{prefix}_pre_weight_sum"] = pd.to_numeric(out[f"{prefix}_pre_weight_sum"], errors="coerce").fillna(0.0)

    # Aggregate features where necessary
    if weight_col and weight_col in m.columns:
        w = pd.to_numeric(m[weight_col], errors="coerce").fillna(0.0)
        for fc in keep_feats:
            feat_name = f"{prefix}_pre_{fc}"
            x = pd.to_numeric(m[fc], errors="coerce")
            # Only rows that actually carry a value may contribute weight;
            # otherwise a missing stat drags the mean toward zero.
            observed = x.notna()
            num = (x * w).where(observed).groupby(m["_row_id"]).sum(min_count=1)
            den = w.where(observed).groupby(m["_row_id"]).sum(min_count=1)
            # zero total weight yields inf/NaN; report it as missing
            wmean = (num / den).replace([np.inf, -np.inf], np.nan).rename(feat_name)
            out = out.merge(wmean, on="_row_id", how="left")
    else:
        for fc in keep_feats:
            feat_name = f"{prefix}_pre_{fc}"
            mean = m.groupby("_row_id")[fc].mean().rename(feat_name)
            out = out.merge(mean, on="_row_id", how="left")

    out[f"has_{prefix}_pre"] = (out[f"{prefix}_pre_seasons"] > 0).astype(int)
    out = out.drop(columns=["_row_id"])
    return out


In [None]:
# Load and Utilize WAR Statistics

# Data loader & Data Integrity Check
def load_war_files(war_dir: str) -> pd.DataFrame:
    """Load every ``war_*.csv`` in ``war_dir`` into one (PlayerId, Season, TotalWAR) frame.

    Column names are normalised per file: case variants of ``PlayerId`` and
    ``Season`` are renamed, and the WAR column is located among a few common
    spellings. Rows without a parseable player id or season are dropped.

    Raises
    ------
    FileNotFoundError
        If no matching file exists.
    KeyError
        If a file lacks a player-id, season or WAR column; the message names the file.
    """
    paths = sorted(glob.glob(os.path.join(war_dir, "war_*.csv")))
    if not paths:
        raise FileNotFoundError(f"No war_*.csv files found in {war_dir}")

    dfs = []
    for p in paths:
        df = pd.read_csv(p)
        df = dedupe_columns(df)

        # Accept case variants of the id/season headers; only the first match is
        # renamed so the result never contains two columns with the same name.
        renames = {}
        for canonical in ("PlayerId", "Season"):
            if canonical in df.columns:
                continue
            match = next((c for c in df.columns if str(c).lower() == canonical.lower()), None)
            if match is not None:
                renames[match] = canonical
        if renames:
            df = df.rename(columns=renames)

        # Fail with the file name rather than a bare KeyError further down.
        missing = [c for c in ("PlayerId", "Season") if c not in df.columns]
        if missing:
            raise KeyError(f"Could not find {missing} column(s) in {os.path.basename(p)}. Columns: {list(df.columns)}")

        # Ensures no naming conventions change in future years
        # Total WAR column may vary; handles common variants
        war_col = None
        for c in df.columns:
            if str(c).strip().lower() in {"total war", "total_war", "war", "totalwar"}:
                war_col = c
                break
        if war_col is None:
            raise KeyError(f"Could not find Total WAR column in {os.path.basename(p)}. Columns: {list(df.columns)}")

        df = df.rename(columns={war_col: "TotalWAR"}).copy()
        df["PlayerId"] = pd.to_numeric(df["PlayerId"], errors="coerce")
        df["Season"] = pd.to_numeric(df["Season"], errors="coerce")
        df["TotalWAR"] = pd.to_numeric(df["TotalWAR"], errors="coerce")

        dfs.append(df[["PlayerId", "Season", "TotalWAR"]])

    out = pd.concat(dfs, ignore_index=True)
    # ids/seasons must be present before the integer cast below
    out = out.dropna(subset=["PlayerId", "Season"]).copy()
    out["PlayerId"] = out["PlayerId"].astype(int)
    out["Season"] = out["Season"].astype(int)
    return out

def add_war_pre_post(
    contracts: pd.DataFrame,
    war: pd.DataFrame,
    *,
    contract_key_col: str = "key_fangraphs",
    contract_year_col: str = "year",
    pre_years: int = 3,
    post_years: int = 3,
) -> pd.DataFrame:
    """Attach pre- and post-signing WAR aggregates and the resulting WAR loss.

    Parameters
    ----------
    contracts : frame holding the contract key and year columns.
    war : frame with ``PlayerId``, ``Season`` and ``TotalWAR`` columns.
    contract_key_col, contract_year_col : column names in ``contracts``.
    pre_years : seasons in the pre window ``[year - pre_years, year - 1]``.
    post_years : seasons in the post window ``[year, year + post_years - 1]``.

    Returns
    -------
    Copy of ``contracts`` with ``war_pre_n/sum/mean``, ``war_post_n/sum/mean``,
    ``war_loss_mean`` and ``war_loss_sum`` (pre minus post). The two ``_n``
    columns are integer counts, 0 when no season matched; sums, means and
    losses are NaN when the corresponding window has no data.
    """
    dfc = contracts.copy()
    # positional id so aggregates can be joined back regardless of the index
    dfc["_row_id"] = np.arange(len(dfc), dtype=int)
    dfc["_pid"] = pd.to_numeric(dfc[contract_key_col], errors="coerce")
    dfc["_yr"] = pd.to_numeric(dfc[contract_year_col], errors="coerce")

    w = war.copy()
    w["_pid"] = pd.to_numeric(w["PlayerId"], errors="coerce")
    w["_season"] = pd.to_numeric(w["Season"], errors="coerce")
    w["_war"] = pd.to_numeric(w["TotalWAR"], errors="coerce")
    # pandas merge matches NaN keys to NaN keys, so WAR rows without a usable
    # id would otherwise attach to every contract lacking a key.
    w = w.dropna(subset=["_pid", "_season"])

    m = dfc[["_row_id", "_pid", "_yr"]].merge(
        w[["_pid", "_season", "_war"]],
        on="_pid",
        how="left",
    )

    out = dfc.copy()

    # pre_war: [Y-pre, Y-1]
    m_pre = m.copy()
    m_pre["lb_start"] = m_pre["_yr"] - pre_years
    m_pre["lb_end"] = m_pre["_yr"] - 1
    m_pre = m_pre[m_pre["_season"].between(m_pre["lb_start"], m_pre["lb_end"], inclusive="both")].copy()

    pre_n = m_pre.groupby("_row_id")["_season"].nunique().rename("war_pre_n")
    pre_sum = m_pre.groupby("_row_id")["_war"].sum(min_count=1).rename("war_pre_sum")
    pre_mean = m_pre.groupby("_row_id")["_war"].mean().rename("war_pre_mean")

    out = out.merge(pre_n, on="_row_id", how="left")
    out = out.merge(pre_sum, on="_row_id", how="left")
    out = out.merge(pre_mean, on="_row_id", how="left")

    # post_war: [Y, Y + post-1]
    m_post = m.copy()
    m_post["lb_start"] = m_post["_yr"]
    m_post["lb_end"] = m_post["_yr"] + (post_years - 1)
    m_post = m_post[m_post["_season"].between(m_post["lb_start"], m_post["lb_end"], inclusive="both")].copy()

    post_n = m_post.groupby("_row_id")["_season"].nunique().rename("war_post_n")
    post_sum = m_post.groupby("_row_id")["_war"].sum(min_count=1).rename("war_post_sum")
    post_mean = m_post.groupby("_row_id")["_war"].mean().rename("war_post_mean")

    out = out.merge(post_n, on="_row_id", how="left")
    out = out.merge(post_sum, on="_row_id", how="left")
    out = out.merge(post_mean, on="_row_id", how="left")

    # A contract with no matching season has a count of zero, not an unknown
    # count; leaving NaN here would let a later median imputer invent seasons.
    for count_col in ("war_pre_n", "war_post_n"):
        out[count_col] = out[count_col].fillna(0).astype(int)

    # war loss, explanation: + means worse than expected
    # mean loss is more easily comparable across post windows
    out["war_loss_mean"] = pd.to_numeric(out["war_pre_mean"], errors="coerce") - pd.to_numeric(out["war_post_mean"], errors="coerce")
    out["war_loss_sum"]  = pd.to_numeric(out["war_pre_sum"], errors="coerce")  - pd.to_numeric(out["war_post_sum"], errors="coerce")

    # cleanup
    out = out.drop(columns=["_row_id", "_pid", "_yr"], errors="ignore")
    return out

In [None]:
# Data Preprocessing
def make_preprocessor(
    numeric_features: List[str],
    categorical_features: List[str],
    use_splines: bool,
) -> ColumnTransformer:
    """Build the column transformer applied before every classifier.

    Numeric columns are median-imputed and then either spline-expanded and
    scaled without centering (``use_splines=True``) or standard-scaled.
    Categorical columns are imputed with the most frequent value and one-hot
    encoded, ignoring categories not seen during fit. Other columns are dropped.
    """
    if use_splines:
        numeric_tail = [
            ("splines", SplineTransformer(n_knots=6, degree=3, include_bias=False)),
            ("scaler", StandardScaler(with_mean=False)),
        ]
    else:
        numeric_tail = [("scaler", StandardScaler())]

    numeric_pipeline = Pipeline(
        steps=[("imputer", SimpleImputer(strategy="median")), *numeric_tail]
    )
    categorical_pipeline = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore")),
        ]
    )

    column_plan = [
        ("num", numeric_pipeline, numeric_features),
        ("cat", categorical_pipeline, categorical_features),
    ]
    return ColumnTransformer(
        transformers=column_plan,
        remainder="drop",
        sparse_threshold=0.3,
    )

In [None]:
# Model Creation
# Models: L2 logistic, elastic-net logistic, random forest, spline-logit GAM, and an XGBoost parameter sweep

def get_classification_models(random_state: int = 42) -> Dict[str, object]:
    """Return a fresh, unfitted estimator for every model in the comparison.

    Parameters
    ----------
    random_state : seed given to every estimator constructed here that takes
        one and uses randomness: the saga logistic model, the random forest
        and each XGBoost configuration.

    Returns
    -------
    dict mapping model name to estimator. "GAM_Splines_Logit" is a plain
    logistic regression; the spline expansion is added by the preprocessor
    when ``fit_predict_classification`` sees that name.
    """
    models: Dict[str, object] = {
        "Logistic_L2": LogisticRegression(max_iter=50000, solver="lbfgs"),
        # saga shuffles the data, so it needs the seed to be reproducible
        "Logistic_ElasticNet": LogisticRegression(
            max_iter=50000,
            solver="saga",
            penalty="elasticnet",
            l1_ratio=0.35,
            random_state=random_state,
        ),

        "RandomForest_v2": RandomForestClassifier(
            n_estimators=1500,
            max_depth=None,
            min_samples_split=6,
            min_samples_leaf=3,
            max_features="sqrt",
            bootstrap=True,
            class_weight="balanced",
            random_state=random_state,
            n_jobs=-1,
        ),

        "GAM_Splines_Logit": LogisticRegression(max_iter=50000, solver="lbfgs"),
    }

    # XGB sweep config
    xgb_grid = [
        {"n_estimators": 600,  "learning_rate": 0.05,  "max_depth": 3, "subsample": 0.85, "colsample_bytree": 0.85, "reg_lambda": 1.0},
        {"n_estimators": 900,  "learning_rate": 0.04,  "max_depth": 3, "subsample": 0.85, "colsample_bytree": 0.85, "reg_lambda": 1.0},
        {"n_estimators": 1200, "learning_rate": 0.03,  "max_depth": 4, "subsample": 0.85, "colsample_bytree": 0.85, "reg_lambda": 1.0},
        {"n_estimators": 1400, "learning_rate": 0.025, "max_depth": 4, "subsample": 0.80, "colsample_bytree": 0.85, "reg_lambda": 1.5},
        {"n_estimators": 1600, "learning_rate": 0.02,  "max_depth": 5, "subsample": 0.80, "colsample_bytree": 0.80, "reg_lambda": 2.0},
        {"n_estimators": 2500, "learning_rate": 0.005, "max_depth": 4, "subsample": 0.80, "colsample_bytree": 0.85, "reg_lambda": 1.5},
    ]

    # XGB Setup (applied once, rather than for each sweep iteration)
    xgb_defaults = dict(
        random_state=random_state,
        n_jobs=-1,
        eval_metric="logloss",
        min_child_weight=6,
        reg_alpha=0.0,
    )

    # assigns a name for each XGB sweep model
    # the index prefix keeps names unique so no configuration overwrites another
    for i, params in enumerate(xgb_grid, start=1):
        name = f"XGB_{i:02d}_ne{params['n_estimators']}_lr{params['learning_rate']}_md{params['max_depth']}"
        models[name] = XGBClassifier(**xgb_defaults, **params)

    return models


# Model training
def fit_predict_classification(
    model_name: str,
    model,
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
    *,
    numeric_features: List[str],
    categorical_features: List[str],
) -> np.ndarray:
    """Fit preprocessing plus ``model`` on the training data and score the test rows.

    The spline preprocessor is used only when ``model_name`` is
    "GAM_Splines_Logit". Returns the second column of ``predict_proba``,
    i.e. the probability of class 1 for 0/1 labels.
    """
    preprocessor = make_preprocessor(
        numeric_features,
        categorical_features,
        use_splines=(model_name == "GAM_Splines_Logit"),
    )
    pipeline = Pipeline(steps=[("pre", preprocessor), ("model", model)])
    pipeline.fit(X_train, y_train)

    return pipeline.predict_proba(X_test)[:, 1]

In [None]:
# Main: runs the baseline classification models that are set up above
if __name__ == "__main__":

    # data loader
    df = pd.read_csv(DATA_PATH)

    bat_rates = pd.read_csv(BAT_RATES_PATH)
    pit_rates = pd.read_csv(PIT_RATES_PATH)
    def_stats = pd.read_csv(DEF_STATS_PATH)
    sc_pit = pd.read_csv(STATCAST_PIT_PATH)
    war = load_war_files(WAR_DIR)

    # Data integrity
    df = dedupe_columns(df)

    df["term_start_year"] = pd.to_numeric(df.get("term_start_year"), errors="coerce")
    df = df.dropna(subset=["term_start_year"]).copy()
    # "year" is the contract start year used by every lookback window below
    df["year"] = pd.to_numeric(df["term_start_year"], errors="coerce")

    # Restrict contract start years to 2020-2025
    df = df[(df["year"] >= 2020) & (df["year"] <= 2025)].copy()
    # Restrict contract length to at most MAX_YEARS (rows with missing length are dropped too)
    df["years_int"] = pd.to_numeric(df.get("years_int"), errors="coerce")
    before = len(df)
    df = df[df["years_int"].notna() & (df["years_int"] <= MAX_YEARS)].copy()
    print(f"[FILTER] Dropped {before - len(df)} contracts with years_int > {MAX_YEARS} (or missing)")

    # coerce keys
    for c in ["key_fangraphs", "key_mlbam"]:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")

    # Assign pitcher flag
    df["is_pitcher_flag"] = df["position"].map(is_pitcher).astype(int) if "position" in df.columns else 0

    # computes pre_war, post_war, and war_loss
    df = add_war_pre_post(
        df, war,
        contract_key_col="key_fangraphs",
        contract_year_col="year",
        pre_years=PRE_YEARS,
        post_years=POST_YEARS,
    )

    # Require at least one observed post-signing season. This is NOT full
    # POST_YEARS coverage: recent contracts are kept with a shorter post window.
    df["war_post_n"] = pd.to_numeric(df.get("war_post_n"), errors="coerce")
    df = df[df["war_post_n"] >= 1].copy()

    # identifier / volume columns that must not be treated as rate features
    BAT_EXCLUDE = {"playerId", "Season", "Name", "Tm", "PA"}
    PIT_EXCLUDE = {"playerId", "Season", "Name", "Tm", "IP", "TBF"}

    bat_rate_cols = [c for c in bat_rates.columns if c not in BAT_EXCLUDE and c != "bat_rate_reliability" and not c.endswith("_dup")]
    pit_rate_cols = [c for c in pit_rates.columns if c not in PIT_EXCLUDE and c != "pit_rate_reliability" and not c.endswith("_dup")]

    # Data integrity: only weight by reliability when the column exists
    bat_weight_col = "bat_rate_reliability" if "bat_rate_reliability" in bat_rates.columns else None
    pit_weight_col = "pit_rate_reliability" if "pit_rate_reliability" in pit_rates.columns else None

    df = add_pre_rate_features(df, bat_rates, rate_cols=bat_rate_cols, weight_col=bat_weight_col, prefix="bat", pre_years=PRE_YEARS)
    df = add_pre_rate_features(df, pit_rates, rate_cols=pit_rate_cols, weight_col=pit_weight_col, prefix="pit", pre_years=PRE_YEARS)

    def_feature_cols = [c for c in ["defensive_runs_saved", "fielding_percentage", "Errors"] if c in def_stats.columns]

    df = add_pre_panel_features(
        contracts=df,
        panel=def_stats,
        contract_key_col="key_mlbam",
        panel_key_col="MLBAMID",
        contract_year_col="year",
        panel_year_col="year",
        feature_cols=def_feature_cols,
        weight_col="Innings_played" if "Innings_played" in def_stats.columns else None,
        prefix="def",
        pre_years=PRE_YEARS,
    )

    # Add statcast pitching features
    sc_feature_cols = [c for c in ["fastball_avg_speed","whiff_percent","hard_hit_percent","barrel_batted_rate","exit_velocity_avg","swing_percent"] if c in sc_pit.columns]

    df = add_pre_panel_features(
        contracts=df,
        panel=sc_pit,
        contract_key_col="key_mlbam",
        panel_key_col="player_id",
        contract_year_col="year",
        panel_year_col="year",
        feature_cols=sc_feature_cols,
        weight_col="pa" if "pa" in sc_pit.columns else None,
        prefix="scpit",
        pre_years=PRE_YEARS,
    )

    df = dedupe_columns(df)
    df_cls = df.copy()

    # Data Integrity
    # AAV col cleaner: strip "$" and "," before numeric conversion
    if AAV_COL in df_cls.columns:
        df_cls[AAV_COL] = (
            df_cls[AAV_COL].astype(str).str.replace(r"[\$,]", "", regex=True).str.strip()
        )
        df_cls[AAV_COL] = pd.to_numeric(df_cls[AAV_COL], errors="coerce")

    train_full, test_full = time_split(df_cls, year_col="year")

    # The AAV cutoff is derived from training rows only, so the test set does not
    # influence which contracts count as "top".
    cut_value = None
    if AAV_COL in train_full.columns:
        aav_train = train_full[AAV_COL].dropna()
        if len(aav_train):
            cut_value = float(aav_train.quantile(REMOVE_TOP_PCTL))
            print(f"[TOP5%] Train-derived AAV cutoff ({REMOVE_TOP_PCTL:.0%}): {cut_value:,.0f}")
        else:
            # a quantile of nothing is NaN, which would filter out every row
            print(f"[WARN] {AAV_COL} has no usable training values; TOP5_REMOVED variant will be skipped.")
    else:
        print(f"[WARN] {AAV_COL} not present; TOP5_REMOVED variant will be skipped.")

    # Detect newly generated features
    cov_suffixes = ("_pre_seasons", "_pre_reliability_sum", "_pre_weight_sum")
    generated_cov_feats = [
        c for c in df_cls.columns
        if c in {"has_bat_pre","has_pit_pre","has_def_pre","has_scpit_pre"} or c.endswith(cov_suffixes)
    ]

    PREFIXES = ("bat_pre_", "pit_pre_", "def_pre_", "scpit_pre_")
    generated_rate_feats = [c for c in df_cls.columns if c.startswith(PREFIXES) and c not in generated_cov_feats]

    numeric_features = unique_list([c for c in (BASE_NUMERIC + generated_rate_feats + generated_cov_feats) if c in df_cls.columns])
    categorical_features = unique_list([c for c in (BASE_CATEGORICAL) if c in df_cls.columns])

    # Runs model parameter sweeps
    results_rows = []

    def run_one(train_df: pd.DataFrame, test_df: pd.DataFrame, *, variant: str, threshold_value: float, cut_pct: Optional[float], cut_value_used: Optional[float]):
        """Fit every model for one variant/threshold and append a metrics row per model to results_rows."""
        train_loss = pd.to_numeric(train_df["war_loss_mean"], errors="coerce")
        test_loss = pd.to_numeric(test_df["war_loss_mean"], errors="coerce")

        # NaN >= threshold evaluates to False, so contracts with no computable
        # WAR loss would silently be labelled as negatives. Exclude them instead.
        train_ok = train_loss.notna()
        test_ok = test_loss.notna()
        n_unlabeled = int((~train_ok).sum() + (~test_ok).sum())
        if n_unlabeled:
            print(f"[LABEL] {variant} thr={threshold_value}: excluded {n_unlabeled} contracts without war_loss_mean")
        train_df = train_df[train_ok]
        test_df = test_df[test_ok]

        y_train = (train_loss[train_ok] >= threshold_value).astype(int)
        y_test  = (test_loss[test_ok] >= threshold_value).astype(int)

        # Classifiers cannot be fit on a single class, and there is nothing to
        # score without test rows; skip rather than abort the whole sweep.
        if y_train.nunique() < 2 or len(test_df) == 0:
            print(f"[SKIP] {variant} thr={threshold_value}: training labels have one class or test set is empty")
            return

        X_train = train_df[numeric_features + categorical_features]
        X_test  = test_df[numeric_features + categorical_features]

        # a fresh set of unfitted models for every run
        for model_name, model in get_classification_models().items():
            prob = fit_predict_classification(
                model_name, model,
                X_train, y_train,
                X_test,
                numeric_features=numeric_features,
                categorical_features=categorical_features,
            )

            mets = classification_metrics(y_test.values, prob, threshold=0.5)

            results_rows.append({
                "run": "BASELINE_ONLY",
                "variant": variant,
                "war_loss_threshold": threshold_value,
                "post_years": POST_YEARS,
                "cut_pct": cut_pct,
                "cut_value": cut_value_used,
                "model": model_name,
                "n_train": int(len(train_df)),
                "n_test": int(len(test_df)),
                "pos_rate_train": float(y_train.mean()) if len(y_train) else np.nan,
                "pos_rate_test": float(y_test.mean()) if len(y_test) else np.nan,
                "n_features_numeric": len(numeric_features),
                "n_features_categorical": len(categorical_features),
                **mets,
            })

    # setup differently than regression due to repeated errors
    # assumption is errors were caused by varying model types
    for thr in WAR_LOSS_THRESHOLDS:
        # UNFILTERED
        run_one(train_full, test_full, variant="UNFILTERED", threshold_value=thr, cut_pct=None, cut_value_used=None)

        # TOP5_REMOVED
        if (cut_value is not None) and (AAV_COL in train_full.columns):
            # rows with a missing AAV fail the comparison and are removed as well
            train_cut = train_full[train_full[AAV_COL] <= cut_value].copy()
            test_cut  = test_full[test_full[AAV_COL] <= cut_value].copy()
            run_one(train_cut, test_cut, variant="TOP5_REMOVED", threshold_value=thr, cut_pct=REMOVE_TOP_PCTL, cut_value_used=cut_value)

    if not results_rows:
        raise RuntimeError("No model runs completed; every variant/threshold combination was skipped.")

    results = pd.DataFrame(results_rows)
    results = results.sort_values(["variant", "war_loss_threshold", "AUC"], ascending=[True, True, False])
    results.to_csv(OUT_CLS_RESULTS, index=False)

    print("\nSaved classification sweep:", OUT_CLS_RESULTS)

    # Quick Report
    # Returns best results by AUC
    summary = (
        results.sort_values(["variant","war_loss_threshold","AUC"], ascending=[True,True,False])
        .groupby(["variant","war_loss_threshold"], as_index=False)
        .head(1)
    )
    print("\nBest model per variant/threshold (by AUC):")
    print(summary[["variant","war_loss_threshold","model","AUC","Accuracy","F1","n_train","n_test","pos_rate_test"]].round(4).to_string(index=False))
