In [113]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from typing import List
import warnings

# Disable the specific warning
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)
warnings.filterwarnings('ignore', category=pd.errors.DtypeWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
# Load Feature File
DATA_ROOT = "/Users/andrewgrowney/Data/kaggle/marchmadness-2024"
df = pd.read_csv("../TeamSeasons.csv")
reg_season_df = pd.read_csv(f"{DATA_ROOT}/MRegularSeasonCompactResults.csv")
reg_season_df = reg_season_df[reg_season_df.Season >= 2003]
tournament_results_df = pd.read_csv(f"{DATA_ROOT}/MNCAATourneyCompactResults.csv")
tournament_results_df = tournament_results_df[tournament_results_df.Season >= 2003]

# Win Probability: Difference of Stats
Model type: Simple Binary Classification

Input: [FGPct_diff, ..., feat_diff_n] (e.g. FGPct_diff = t1.FGPct_mean - t2.FGPct_mean)

Output: 0 -> team 1 wins, 1 -> team 2 wins

In [60]:
def get_diff(df:pd.DataFrame, t1_id:int, t2_id:int, season:int, features:List[str]):
    t1 = df[(df['TeamID'] == t1_id) & (df['Season'] == season)][features].fillna(0)
    t2 = df[(df['TeamID'] == t2_id) & (df['Season'] == season)][features].fillna(0)
    diff = t1.values - t2.values
    return diff[0]

In [61]:
ks_ark = get_diff(df, 1104, 1113, 2023, ['FGM_mean', 'FGA_mean', 'Poss_mean'])
ks_ark

array([2.76470588, 1.73529412, 4.41875   ])

In [110]:
FEATURES = [
    'AdjOE_mean', 'EFG%_mean', 'FGA3_mean', 'TO_mean', 'OR_mean', 'FT%_mean', # Offense
    'AdjDE_mean', 'OppEFG%_mean', 'OppFGA3_mean', 'OppTO_mean', 'OppOR_mean', # Defense
    'AdjNE_mean', 'Poss_mean', 'SOS', 'Q1_WinPct', 'Q2_WinPct' # Other
]

In [39]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [114]:
WIN_PROB_DATA = []
REG_SEASON_WEIGHT = 0.5
TOURN_WEIGHT = 1
X, Y, W = [], [], []
for i, row in reg_season_df.iterrows():
    try:
        season, t1_id, t2_id = row.Season, row.WTeamID, row.LTeamID
        [X.append(get_diff(df, t1_id, t2_id, season, FEATURES)), Y.append(1), W.append(REG_SEASON_WEIGHT)]
        [X.append(get_diff(df, t2_id, t1_id, season, FEATURES)), Y.append(0), W.append(REG_SEASON_WEIGHT)]
    except IndexError:
        print(f"IndexError: {i} [{row}]")
for i, row in tournament_results_df.iterrows():
    season, t1_id, t2_id = row.Season, row.WTeamID, row.LTeamID
    [X.append(get_diff(df, t1_id, t2_id, season, FEATURES)), Y.append(1), W.append(TOURN_WEIGHT)]
    [X.append(get_diff(df, t2_id, t1_id, season, FEATURES)), Y.append(0), W.append(TOURN_WEIGHT)]
X = np.array(X)
CLF_SCALER = StandardScaler()
X = CLF_SCALER.fit_transform(X)
Y = np.array(Y)
W = np.array(W)

IndexError: index 0 is out of bounds for axis 0 with size 0

In [63]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

X_train, X_test, Y_train, Y_test, W_train, W_test = train_test_split(X, Y, W, test_size=0.15, stratify=Y)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, Y_train, sample_weight=W_train)
Y_pred = clf.predict(X_test)

accuracy_score(Y_test, Y_pred)

0.6759493670886076

# Margin of Victory Probability

Model type: Regression

Input: same as above

Output: Float (Positive - team 1 wins by value, Negative - team 2 wins by that value)

In [98]:
MOV_DATA = []
for i, row in tournament_results_df.iterrows():
    season, t1_id, t2_id = row.Season, row.WTeamID, row.LTeamID
    score = row.WScore - row.LScore
    MOV_DATA.append((get_diff(df, t1_id, t2_id, season, FEATURES), score))
    MOV_DATA.append((get_diff(df, t2_id, t1_id, season, FEATURES), -1*score))
MOV_X, MOV_Y = zip(*MOV_DATA)
MOV_X = np.array(MOV_X)
MOV_SCALER = StandardScaler()
MOV_X = MOV_SCALER.fit_transform(MOV_X)
MOV_Y = np.array(MOV_Y)

In [83]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

MOV_X_train, MOV_X_test, MOV_Y_train, MOV_Y_test = train_test_split(MOV_X, MOV_Y, test_size=0.2)
reg = LinearRegression()
reg.fit(MOV_X_train, MOV_Y_train)
MOV_Y_pred = reg.predict(MOV_X_test)

mean_absolute_error(MOV_Y_test, MOV_Y_pred)

8.612803683765094

# Ensemble Classification of the clf and reg Models

In [109]:
# Ensemble Classifier using reg and clf from above
base_models = [('clf', clf), ('reg', reg)]
from sklearn.ensemble import StackingClassifier
meta_model = RandomForestClassifier(n_estimators=100, random_state=42)
ens_clf = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)

ens_clf.fit(X_train, Y_train)
Y_pred = ens_clf.predict(X_test)
accuracy_score(Y_test, Y_pred)

0.6278481012658228