In [113]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from typing import List
import warnings

# Disable the specific warning
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)
warnings.filterwarnings('ignore', category=pd.errors.DtypeWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
# Load Feature File
DATA_ROOT = "/Users/andrewgrowney/Data/kaggle/marchmadness-2024"
df = pd.read_csv("../TeamSeasons.csv")
reg_season_df = pd.read_csv(f"{DATA_ROOT}/MRegularSeasonCompactResults.csv")
reg_season_df = reg_season_df[reg_season_df.Season >= 2003]
tournament_results_df = pd.read_csv(f"{DATA_ROOT}/MNCAATourneyCompactResults.csv")
tournament_results_df = tournament_results_df[tournament_results_df.Season >= 2003]

# Win Probability: Difference of Stats
Model type: Simple Binary Classification

Input: [FGPct_diff, ..., feat_diff_n] (e.g. FGPct_diff = t1.FGPct_mean - t2.FGPct_mean)

Output: 0 -> team 1 wins, 1 -> team 2 wins

In [60]:
def get_diff(df:pd.DataFrame, t1_id:int, t2_id:int, season:int, features:List[str]):
    t1 = df[(df['TeamID'] == t1_id) & (df['Season'] == season)][features].fillna(0)
    t2 = df[(df['TeamID'] == t2_id) & (df['Season'] == season)][features].fillna(0)
    diff = t1.values - t2.values
    return diff[0]

In [61]:
ks_ark = get_diff(df, 1104, 1113, 2023, ['FGM_mean', 'FGA_mean', 'Poss_mean'])
ks_ark

array([2.76470588, 1.73529412, 4.41875   ])

In [110]:
FEATURES = [
    'AdjOE_mean', 'EFG%_mean', 'FGA3_mean', 'TO_mean', 'OR_mean', 'FT%_mean', # Offense
    'AdjDE_mean', 'OppEFG%_mean', 'OppFGA3_mean', 'OppTO_mean', 'OppOR_mean', # Defense
    'AdjNE_mean', 'Poss_mean', 'SOS', 'Q1_WinPct', 'Q2_WinPct' # Other
]

In [39]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [115]:
WIN_PROB_DATA = []
REG_SEASON_WEIGHT, TOURN_WEIGHT = 0.5, 1
X, Y, W = [], [], []
for i, row in reg_season_df.iterrows():
    try:
        season, t1_id, t2_id = row.Season, row.WTeamID, row.LTeamID
        [X.append(get_diff(df, t1_id, t2_id, season, FEATURES)), Y.append(1), W.append(REG_SEASON_WEIGHT)]
        [X.append(get_diff(df, t2_id, t1_id, season, FEATURES)), Y.append(0), W.append(REG_SEASON_WEIGHT)]
    except IndexError:
        print(f"IndexError: {i} [{row}]")
for i, row in tournament_results_df.iterrows():
    season, t1_id, t2_id = row.Season, row.WTeamID, row.LTeamID
    [X.append(get_diff(df, t1_id, t2_id, season, FEATURES)), Y.append(1), W.append(TOURN_WEIGHT)]
    [X.append(get_diff(df, t2_id, t1_id, season, FEATURES)), Y.append(0), W.append(TOURN_WEIGHT)]
X = np.array(X)
CLF_SCALER = StandardScaler()
X = CLF_SCALER.fit_transform(X)
Y = np.array(Y)
W = np.array(W)

IndexError: 181682 [Season     2024
DayNum        0
WTeamID    1101
WScore       64
LTeamID    1329
LScore       59
WLoc          A
NumOT         0
Name: 181682, dtype: object]
IndexError: 181683 [Season     2024
DayNum        0
WTeamID    1103
WScore       81
LTeamID    1355
LScore       75
WLoc          A
NumOT         0
Name: 181683, dtype: object]
IndexError: 181684 [Season     2024
DayNum        0
WTeamID    1104
WScore      105
LTeamID    1287
LScore       73
WLoc          H
NumOT         0
Name: 181684, dtype: object]
IndexError: 181685 [Season     2024
DayNum        0
WTeamID    1112
WScore      122
LTeamID    1288
LScore       59
WLoc          H
NumOT         0
Name: 181685, dtype: object]
IndexError: 181686 [Season     2024
DayNum        0
WTeamID    1114
WScore       71
LTeamID    1402
LScore       66
WLoc          H
NumOT         0
Name: 181686, dtype: object]
IndexError: 181687 [Season     2024
DayNum        0
WTeamID    1116
WScore       93
LTeamID    1108
LScore       59

In [116]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

X_train, X_test, Y_train, Y_test, W_train, W_test = train_test_split(X, Y, W, test_size=0.15, stratify=Y)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, Y_train, sample_weight=W_train)
Y_pred = clf.predict(X_test)

accuracy_score(Y_test, Y_pred)

0.6956096068532966

# Margin of Victory Probability

Model type: Regression

Input: same as above

Output: Float (Positive - team 1 wins by value, Negative - team 2 wins by that value)

In [98]:
MOV_DATA = []
for i, row in tournament_results_df.iterrows():
    season, t1_id, t2_id = row.Season, row.WTeamID, row.LTeamID
    score = row.WScore - row.LScore
    MOV_DATA.append((get_diff(df, t1_id, t2_id, season, FEATURES), score))
    MOV_DATA.append((get_diff(df, t2_id, t1_id, season, FEATURES), -1*score))
MOV_X, MOV_Y = zip(*MOV_DATA)
MOV_X = np.array(MOV_X)
MOV_SCALER = StandardScaler()
MOV_X = MOV_SCALER.fit_transform(MOV_X)
MOV_Y = np.array(MOV_Y)

In [83]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

MOV_X_train, MOV_X_test, MOV_Y_train, MOV_Y_test = train_test_split(MOV_X, MOV_Y, test_size=0.2)
reg = LinearRegression()
reg.fit(MOV_X_train, MOV_Y_train)
MOV_Y_pred = reg.predict(MOV_X_test)

mean_absolute_error(MOV_Y_test, MOV_Y_pred)

8.612803683765094

In [123]:
MOV_Y_pred[0], MOV_Y_test[0]

(-18.74503390638683, -23)

# Ensemble Classification of the clf and reg Models

In [117]:
# Ensemble Classifier using reg and clf from above
base_models = [('clf', clf), ('reg', reg)]
from sklearn.ensemble import StackingClassifier
meta_model = RandomForestClassifier(n_estimators=100, random_state=42)
ens_clf = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)

ens_clf.fit(X_train, Y_train)
Y_pred = ens_clf.predict(X_test)
accuracy_score(Y_test, Y_pred)

0.7106929784304727

# Add to Tournament Data

In [134]:
p = ens_clf.predict_proba(get_diff(df, 1104, 1113, 2023, FEATURES).reshape(1, -1))
twps = [(t, wp) for t, wp in zip([1104, 1113], p[0])]
list(max(twps, key=lambda x: x[1]))


[1113, 0.91]

In [136]:
import json
for season in range(2015, 2024):
    with open(f"../data/web/tourney_v2/{season}.json", "r") as f:
        tourney = json.load(f)
    print(f"Processing {season}...")
    # Add predictions to tourney
    tourney['predictions'] = {}
    tourney_team_ids = list(tourney['teams'].keys())
    for i, t1_id in enumerate(tourney_team_ids):
        for t2_id in tourney_team_ids[i+1:]:
            diff = get_diff(df, int(t1_id), int(t2_id), season, FEATURES)
            diff = CLF_SCALER.transform([diff])
            win_probs = clf.predict_proba(diff)[0]
            # IMPORTANT: 0 -> t2 wins, 1 -> t1 wins
            winner_prob = max([(t, wp) for t, wp in zip([t2_id, t1_id], win_probs)], key=lambda x: x[1])
            matchup_key = "_".join(sorted([t1_id, t2_id]))
            # Value: [win_prob, win_team_id]
            tourney['predictions'][matchup_key] = list(winner_prob)
    print(tourney['predictions'])
    # Save tourney
    with open(f"../data/web/tourney_v2/{season}.json", "w") as f:
        json.dump(tourney, f)

Processing 2015...
{'1107_1112': ['1112', 0.97], '1107_1116': ['1116', 0.7725], '1107_1124': ['1124', 0.87], '1107_1125': ['1125', 0.5216666666666667], '1107_1129': ['1129', 0.915], '1107_1138': ['1138', 0.7234999999999999], '1107_1139': ['1139', 0.7765000000000001], '1107_1140': ['1140', 0.7625], '1107_1153': ['1153', 0.7591666666666665], '1107_1157': ['1157', 0.6106547619047619], '1107_1172': ['1172', 0.7325], '1107_1173': ['1173', 0.7325], '1107_1181': ['1181', 0.97], '1107_1186': ['1186', 0.5866666666666667], '1107_1207': ['1207', 0.78], '1107_1208': ['1208', 0.635], '1107_1209': ['1209', 0.555], '1107_1211': ['1211', 1.0], '1107_1214': ['1107', 0.7454999999999999], '1107_1217': ['1217', 0.5575], '1107_1231': ['1231', 0.6166666666666667], '1107_1234': ['1234', 0.73], '1107_1235': ['1235', 0.96], '1107_1242': ['1242', 0.85], '1107_1246': ['1246', 0.98], '1107_1248': ['1107', 0.5958333333333333], '1107_1257': ['1257', 0.7733333333333334], '1107_1261': ['1261', 0.81], '1107_1264': ['1

In [137]:
# Save the clf and scaler as pickle files
import pickle
with open("../Models/clf.pkl", "wb") as f:
    pickle.dump(clf, f)
with open("../Models/scaler.pkl", "wb") as f:
    pickle.dump(CLF_SCALER, f)