In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
from functools import reduce
from collections import defaultdict


In [None]:
csv_dir = "./allStatCSVS_2019-2025"
team_files = [f for f in os.listdir(csv_dir) if "_Team_" in f and f.endswith(".csv")]

frames_by_year = defaultdict(list)
seen_stats = set()  # Track (year, stat) pairs

def tidy_frame(path):
    df   = pd.read_csv(path)
    year_match = re.search(r"(19|20)\d{2}", path)
    if not year_match:
        raise ValueError(f"No valid year found in file name: {path}")
    year = int(year_match.group())

    stat = os.path.basename(path).split("_Team_")[0]
    stat_clean = stat.replace("_Per_Set", "PerSet").replace("_", "")

    # Skip duplicates
    key = (year, stat_clean)
    if key in seen_stats:
        print(f"⚠️  Skipping duplicate stat for year {year}: {stat}")
        return None
    seen_stats.add(key)

    # Rename columns to avoid collisions
    df = df.rename(columns={c: f"{stat_clean}_{c}" for c in df.columns if c != "Team"})
    df["Year"] = year
    return df[["Team", "Year", *[c for c in df.columns if c != "Team" and c != "Year"]]]

# Process and group by year
for f in team_files:
    tidy = tidy_frame(os.path.join(csv_dir, f))
    if tidy is not None:
        year = tidy["Year"].iloc[0]
        if 2019 <= year <= 2025:
            frames_by_year[year].append(tidy)

# Merge all dataframes per year
master_frames = {}
for year, frame_list in frames_by_year.items():
    try:
        master_frames[year] = reduce(lambda l, r: pd.merge(l, r, on=["Team", "Year"], how="outer"), frame_list)
        print(f"{year} shape:", master_frames[year].shape)
    except Exception as e:
        print(f"Error merging year {year}: {e}")

⚠️  Skipping duplicate stat for year 2019: Digs_Per_Set
⚠️  Skipping duplicate stat for year 2019: Blocks_Per_Set
⚠️  Skipping duplicate stat for year 2019: Blocks_Per_Set
⚠️  Skipping duplicate stat for year 2019: Team_Attacks_Per_Set
⚠️  Skipping duplicate stat for year 2019: Team_Attacks_Per_Set
⚠️  Skipping duplicate stat for year 2019: Digs_Per_Set
⚠️  Skipping duplicate stat for year 2019: Digs_Per_Set
⚠️  Skipping duplicate stat for year 2019: Team_Attacks_Per_Set
⚠️  Skipping duplicate stat for year 2019: Team_Attacks_Per_Set
⚠️  Skipping duplicate stat for year 2019: Team_Attacks_Per_Set
⚠️  Skipping duplicate stat for year 2019: Blocks_Per_Set
⚠️  Skipping duplicate stat for year 2019: Digs_Per_Set
⚠️  Skipping duplicate stat for year 2019: Digs_Per_Set
⚠️  Skipping duplicate stat for year 2019: Digs_Per_Set
⚠️  Skipping duplicate stat for year 2019: Blocks_Per_Set
⚠️  Skipping duplicate stat for year 2019: Team_Attacks_Per_Set
⚠️  Skipping duplicate stat for year 2019: Block

In [None]:
#drop na rows
master2024 = master2024.dropna(subset=["MatchW-LPctg._Pct."])
master2025 = master2025.dropna(subset=["MatchW-LPctg._Pct."])

master2024.columns

In [None]:
label = "MatchW-LPctg._Pct."

num_cols_24 = master2024.select_dtypes("number").columns
features = [c for c in num_cols_24 if c != label and c != "MatchW-LPctg._Rank" and c != "MatchW-LPctg._L" and c != "MatchW-LPctg._W"]

X_train = master2024[features]
y_train = master2024[label]

num_cols_25 = master2025.select_dtypes("number").columns
X_test = master2025[features]
y_test = master2025[label]

model = xgb.XGBRegressor(
    objective="reg:squarederror",
    n_estimators=400,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)

model.fit(X_train, y_train)

preds = model.predict(X_test)
print("MAE:", mean_absolute_error(y_test, preds))
print("R² :", r2_score(y_test, preds))

results = master2025[["Team", "Year"]].copy()
results["PredictedPct"] = preds
results["ActualPct"] = y_test.values
print(results.head())

In [None]:
from sklearn.model_selection import GridSearchCV, KFold
import matplotlib.pyplot as plt


label = "MatchW-LPctg._Pct."
features = [
    c for c in master2024.select_dtypes("number").columns
    if c != label and c != "MatchW-LPctg._Rank" and c != "MatchW-LPctg._L" and c != "MatchW-LPctg._W"
]

X_train = master2024[features]
y_train = master2024[label]

X_test  = master2025[features]
y_test  = master2025[label]


param_grid = {
    "n_estimators": [200, 400, 600],
    "max_depth":    [3, 4, 5],
    "learning_rate":[0.05, 0.1, 0.2],
    "subsample":    [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0],
}

model = xgb.XGBRegressor(
    objective="reg:squarederror",
    random_state=42,
)

cv = KFold(n_splits=10, shuffle=True, random_state=42)

grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=cv,
    scoring="neg_mean_absolute_error",
    n_jobs=-1,
    verbose=1,
)

grid.fit(X_train, y_train)

print("Best params:", grid.best_params_)
print("CV MAE:",    -grid.best_score_)

best_model = grid.best_estimator_


preds = best_model.predict(X_test)
print("\n2025 MAE:", mean_absolute_error(y_test, preds))
print("2025 R² :",  r2_score(y_test, preds))

results = master2025[["Team", "Year"]].copy()
results["PredictedPct"] = preds
results["ActualPct"]    = y_test.values
print("\nHead of prediction table:\n", results.head())


plt.figure(figsize=(8, 10))
xgb.plot_importance(best_model, max_num_features=20, height=0.6)
plt.title("Top‑20 Feature Importances (gain)")
plt.tight_layout()
plt.show()
