# Introduction
Data for this notebook was obtained from [Kaggle](https://www.kaggle.com/datasets/nishaanamin/march-madness-data).  This model is based solely on Bart Torvik's BADJ EM metric for team strength.  According to the Kaggle web page, BADJ EM estimates by how many points a team would outscore the average Division I basketball team.  The approach is to convert the BADJ EM metric into a probability of winning each game.  This is done by calculating the BADJ EM difference between teams that played each other in past tournaments beginning in 2014 and plotting it against the final score difference.  The sign of the score difference identifies the winner of each game, which enables an easy calculation of how likely a team is to win given the BADJ EM difference.  The fitted model is then used to predict a bracket for 2024.

In [35]:
import random
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import PolynomialFeatures
import statsmodels.api as sm
random.seed(0)

# Barttorvik away/neutral ratings; later cells look rows up by YEAR and TEAM.
# NOTE(review): this is an absolute, machine-specific path -- anyone else
# running the notebook has to point it at their own copy of the data.
df_path = r"C:\Users\adiad\Documents\GitHub\ncaa-tourney-predict\data_2024\Barttorvik Away-Neutral.csv".replace("\\", "/")
df = pd.read_csv(df_path)
df.head()

Unnamed: 0,YEAR,TEAM NO,TEAM ID,TEAM,SEED,ROUND,BADJ EM,BADJ O,BADJ D,BARTHAG,...,BADJT RANK,AVG HGT RANK,EFF HGT RANK,EXP RANK,TALENT RANK,FT% RANK,OP FT% RANK,PPPO RANK,PPPD RANK,ELITE SOS RANK
0,2024,1025,197,Akron,14,0,3.1,105.0,101.9,0.585,...,265,238,199,19,176,186,188,120,47,269
1,2024,1064,48,Alabama,4,0,18.7,125.4,106.7,0.865,...,10,33,8,156,106,12,357,3,351,8
2,2024,1072,33,Arizona,2,0,23.7,119.7,96.0,0.927,...,40,50,37,196,7,157,284,19,98,52
3,2024,1066,43,Auburn,4,0,26.8,120.8,94.0,0.947,...,67,86,76,127,69,127,260,5,18,77
4,2024,1071,36,Baylor,3,0,17.6,117.7,100.1,0.866,...,275,31,22,304,34,137,248,52,189,11


# Fit Past Tournaments

In [36]:
# Historical tournament results. The loop below assumes that rows 2k and 2k+1
# are the two teams of the same game.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
df_path = r"C:\Users\adiad\Documents\GitHub\ncaa-tourney-predict\data_2024\Tournament Matchups.csv"
df_path = df_path.replace("\\", "/")
past_matchup_df = pd.read_csv(df_path)
# Keep the 2014-2023 tournaments. .copy() makes this an independent frame so
# the TEAM clean-up below does not write into a slice of the frame read from
# disk (which raises SettingWithCopyWarning).
past_matchup_df = past_matchup_df[(past_matchup_df.YEAR < 2024) & (past_matchup_df.YEAR > 2013)].copy()
past_matchup_df["TEAM"] = past_matchup_df["TEAM"].str.strip()
# Rows are consumed two at a time; an odd count means the pairing is broken.
assert past_matchup_df.shape[0] % 2 == 0, "expected an even number of matchup rows (two per game)"


def _badjem_for(year, team):
    """Return the BADJ EM rating of `team` in `year` from the ratings frame `df`.

    Raises KeyError naming the team/year when the ratings frame has no such
    row (typically a spelling mismatch between the two CSV files).
    """
    ratings = df.loc[(df.YEAR == year) & (df.TEAM == team), "BADJ EM"].values
    if len(ratings) == 0:
        raise KeyError(f"no BADJ EM rating found for {team!r} in {year}")
    return ratings[0]


# One record per game: [BADJ EM difference, final score difference], both
# computed as (first listed team) - (second listed team).
badjem_data = []
for ind in range(0, past_matchup_df.shape[0], 2):
    year = past_matchup_df.YEAR.iloc[ind]
    team_a = past_matchup_df.TEAM.iloc[ind]
    score_a = past_matchup_df.SCORE.iloc[ind]
    badjem_a = _badjem_for(year, team_a)
    team_b = past_matchup_df.TEAM.iloc[ind + 1]
    score_b = past_matchup_df.SCORE.iloc[ind + 1]
    badjem_b = _badjem_for(year, team_b)
    badjem_data.append([badjem_a - badjem_b, score_a - score_b])
badjem_df = pd.DataFrame(badjem_data, columns=["badjem", "score"])

# Rating gap vs. final margin for every past game, with an OLS trend line.
score_axis_titles = {
    "xaxis_title": "BADJ EM Difference",
    "yaxis_title": "Final Score Difference",
}
fig = px.scatter(badjem_df, x="badjem", y="score", trendline="ols")
fig.update_layout(**score_axis_titles)
fig.show()

In [37]:
# Break the BADJ EM difference axis into fixed-width bins and estimate the win
# probability of each bin as the fraction of its games with a positive score
# difference. Each bin is represented by its midpoint.
interval = 3
badjem_start_ranges = range(-20, 30, interval)
data = []
for range_start in badjem_start_ranges:
    in_bin = (badjem_df.badjem >= range_start) & (badjem_df.badjem < range_start + interval)
    badjem_results = badjem_df.loc[in_bin, "score"].values
    if len(badjem_results) == 0:
        # No games fell in this bin: there is nothing to estimate, and
        # dividing by the game count would raise ZeroDivisionError.
        continue
    odds = np.count_nonzero(badjem_results > 0) / len(badjem_results)
    data.append([range_start + interval / 2, odds])
odds_df = pd.DataFrame(data, columns=["badjem_diff", "win_odds"])

In [38]:
# Quadratic least-squares fit of the binned win probability against the
# BADJ EM difference; degree-2 PolynomialFeatures supplies the design matrix.
polynomial_features = PolynomialFeatures(degree=2)
badjem_column = odds_df.badjem_diff.values.reshape(-1, 1)
xp = polynomial_features.fit_transform(badjem_column)
model = sm.OLS(odds_df.win_odds, xp).fit()
ypred = model.predict(xp)
model.summary()


kurtosistest only valid for n>=20 ... continuing anyway, n=17



0,1,2,3
Dep. Variable:,win_odds,R-squared:,0.973
Model:,OLS,Adj. R-squared:,0.969
Method:,Least Squares,F-statistic:,250.0
Date:,"Tue, 19 Mar 2024",Prob (F-statistic):,1.11e-11
Time:,01:19:17,Log-Likelihood:,26.544
No. Observations:,17,AIC:,-47.09
Df Residuals:,14,BIC:,-44.59
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.5145,0.020,26.324,0.000,0.473,0.556
x1,0.0229,0.001,19.016,0.000,0.020,0.026
x2,-0.0002,7.06e-05,-3.240,0.006,-0.000,-7.73e-05

0,1,2,3
Omnibus:,0.834,Durbin-Watson:,1.836
Prob(Omnibus):,0.659,Jarque-Bera (JB):,0.783
Skew:,-0.31,Prob(JB):,0.676
Kurtosis:,2.151,Cond. No.,507.0


In [39]:
# Binned win rates (markers) overlaid with the fitted quadratic (line).
fig = go.Figure(
    data=[
        go.Scatter(x=odds_df.badjem_diff, y=odds_df.win_odds, mode="markers"),
        go.Scatter(x=odds_df.badjem_diff, y=ypred, mode="lines"),
    ]
)
fig.update_layout(
    showlegend=False,
    xaxis_title="BADJ EM Difference",
    yaxis_title="Win Probability",
)
fig.show()

In [40]:
def get_win_odds(badjem_diff):
    """Return the modelled probability of winning for a BADJ EM difference.

    Args:
        badjem_diff: BADJ EM rating of the team of interest minus the rating
            of its opponent.

    Returns:
        A probability in [0, 1]. Differences below -20 map to 0 and
        differences above 29.5 map to 1; in between, the fitted quadratic
        `model` is evaluated and the result is clipped to [0, 1], because a
        polynomial fit is not bounded like a probability.
    """
    if badjem_diff < -20:
        return 0
    elif badjem_diff > 29.5:
        return 1
    # transform(), not fit_transform(): the transformer was already fitted
    # when the model was built, and predicting should not refit it.
    features = polynomial_features.transform(np.array([[badjem_diff]]))
    fitted = model.predict(features)[0]
    return float(np.clip(fitted, 0.0, 1.0))

In [41]:
# define the region of each team in the current 2024 bracket
# .copy() makes `df` an independent frame rather than a slice of the full
# ratings table, so adding columns to it later does not raise
# SettingWithCopyWarning.
df = df[df.YEAR == 2024].copy()
# 2024 bracket grouped by region; flattened below into a team -> region map.
_teams_by_region = {
    "East": [
        "Connecticut", "Stetson", "Florida Atlantic", "Northwestern",
        "San Diego St.", "UAB", "Auburn", "Yale",
        "BYU", "Duquesne", "Illinois", "Morehead St.",
        "Washington St.", "Drake", "Iowa St.", "South Dakota St.",
    ],
    "West": [
        "North Carolina", "Howard", "Mississippi St.", "Michigan St.",
        "Saint Mary's", "Grand Canyon", "Alabama", "College of Charleston",
        "Clemson", "New Mexico", "Baylor", "Colgate",
        "Dayton", "Nevada", "Arizona", "Long Beach St.",
    ],
    "South": [
        "Houston", "Longwood", "Nebraska", "Texas A&M",
        "Wisconsin", "James Madison", "Duke", "Vermont",
        "Texas Tech", "North Carolina St.", "Kentucky", "Oakland",
        "Florida", "Boise St.", "Marquette", "Western Kentucky",
    ],
    "Midwest": [
        "Purdue", "Grambling St.", "Utah St.", "TCU",
        "Gonzaga", "McNeese St.", "Kansas", "Samford",
        "South Carolina", "Oregon", "Creighton", "Akron",
        "Texas", "Virginia", "Tennessee", "Saint Peter's",
    ],
}
# Keys are team names as spelled in the ratings data; values are region names.
team_regions = {
    team: region_name
    for region_name, region_teams in _teams_by_region.items()
    for team in region_teams
}
# Fail fast, naming the offenders, if any bracket team is spelled differently
# from the 2024 rows of the ratings data.
teams_2024 = set(df.loc[df.YEAR == 2024, "TEAM"])
missing_teams = [team for team in team_regions if team not in teams_2024]
assert not missing_teams, f"team_regions names not found in the 2024 ratings: {missing_teams}"
# assign() returns a new frame, so this never writes into a slice of another
# frame (which raises SettingWithCopyWarning). Teams absent from team_regions
# get a missing REGION value.
df = df.assign(REGION=df.TEAM.map(team_regions))

In [42]:
# generate bracket as a DataFrame
def get_winner(region, ts: int, bs: int):
    """Simulate one game between two seeds of the same region.

    Args:
        region: value matched against the REGION column of `df`.
        ts: seed of the team listed first ("top").
        bs: seed of the team listed second ("bottom").

    Returns:
        Tuple (winner seed, winner team name, winner BADJ EM). The top team
        wins when a uniform random draw is below
        get_win_odds(top BADJ EM - bottom BADJ EM).

    Raises:
        IndexError: if `df` has no row for the given region/seed.
    """
    in_region = df.REGION == region
    top_strength = df.loc[in_region & (df.SEED == ts), "BADJ EM"].values[0]
    bottom_strength = df.loc[in_region & (df.SEED == bs), "BADJ EM"].values[0]
    if random.random() < get_win_odds(top_strength - bottom_strength):
        winner_seed, winner_strength = ts, top_strength
    else:
        winner_seed, winner_strength = bs, bottom_strength
    winner_team = df.loc[in_region & (df.SEED == winner_seed), "TEAM"].values[0]
    return winner_seed, winner_team, winner_strength

# first round simulation
round_name = "First"
tournament_data = []
# (top seed, bottom seed) pairings, in the order their winners are paired up
# two-by-two in the following round.
matchups = list(zip([1, 8, 5, 4, 6, 3, 7, 2], [16, 9, 12, 13, 11, 14, 10, 15]))
for region in ["East", "Midwest", "South", "West"]:
    for ts, bs in matchups:
        winner_seed, winner_team, winner_strength = get_winner(region, ts, bs)
        tournament_data.append([region, round_name, winner_seed, winner_team, winner_strength])
# One row per simulated game; rebuilt from tournament_data after every round.
tournament_df = pd.DataFrame(tournament_data, columns=["Region", "Round", "Winner Seed", "Winner", "Winner Strength"])

# second round, sweet 16 and elite 8 simulations
def _simulate_regional_round(previous_round, round_name):
    """Play one in-region round and append its winners to tournament_data.

    Within each region, the winners recorded for `previous_round` are taken
    in recorded order and paired two-by-two (1st vs 2nd, 3rd vs 4th, ...);
    each pairing is decided with get_winner and stored under `round_name`.
    """
    results_so_far = pd.DataFrame(
        tournament_data,
        columns=["Region", "Round", "Winner Seed", "Winner", "Winner Strength"],
    )
    for region in ["East", "Midwest", "South", "West"]:
        came_through = (results_so_far.Region == region) & (results_so_far.Round == previous_round)
        region_round_seeds = results_so_far.loc[came_through, "Winner Seed"].values
        for top_seed_ind in range(0, len(region_round_seeds), 2):
            ts = region_round_seeds[top_seed_ind]
            bs = region_round_seeds[top_seed_ind + 1]
            winner_seed, winner_team, winner_strength = get_winner(region, ts, bs)
            tournament_data.append([region, round_name, winner_seed, winner_team, winner_strength])


# Each round consumes the winners of the one before it, so order matters.
for previous_round, round_name in [("First", "Second"), ("Second", "Sweet 16"), ("Sweet 16", "Elite 8")]:
    _simulate_regional_round(previous_round, round_name)
tournament_df = pd.DataFrame(tournament_data, columns=["Region", "Round", "Winner Seed", "Winner", "Winner Strength"])

def get_f4_champ_winner(df, last_round, regions, win_odds_fn=None):
    """Simulate one game between two winners of `last_round`.

    Args:
        df: results frame with columns "Region", "Round", "Winner Seed",
            "Winner" and "Winner Strength".
        last_round: value of the "Round" column whose winners play each other.
        regions: two region names; the first `last_round` winner of
            regions[0] plays the first `last_round` winner of regions[1].
            When empty, the first two `last_round` rows play each other.
        win_odds_fn: optional callable mapping a strength difference (a - b)
            to team a's win probability. Defaults to get_win_odds.

    Returns:
        Tuple (seed, team, strength) of the winner. Team a wins when a uniform
        random draw is below win_odds_fn(a strength - b strength).

    Raises:
        IndexError: if the requested rows do not exist in `df`.
    """
    if win_odds_fn is None:
        win_odds_fn = get_win_odds
    round_mask = df.Round == last_round

    def _team_stats(mask, position):
        # (seed, team, strength) of the position-th row selected by mask.
        rows = df.loc[mask]
        return (
            rows["Winner Seed"].values[position],
            rows["Winner"].values[position],
            rows["Winner Strength"].values[position],
        )

    if regions:
        a_stats = _team_stats(round_mask & (df.Region == regions[0]), 0)
        b_stats = _team_stats(round_mask & (df.Region == regions[1]), 0)
    else:
        # Without regions both sides come from the same set of rows, so the
        # opponent must be the SECOND row; taking row 0 for both would make a
        # team play itself.
        a_stats = _team_stats(round_mask, 0)
        b_stats = _team_stats(round_mask, 1)

    a_strength, b_strength = a_stats[2], b_stats[2]
    return a_stats if random.random() < win_odds_fn(a_strength - b_strength) else b_stats

# final four simulation: East plays West, then South plays Midwest
round_name = "Final 4"
region = "N/A"
for semifinal_regions in (["East", "West"], ["South", "Midwest"]):
    winner_seed, winner_team, winner_strength = get_f4_champ_winner(tournament_df, "Elite 8", semifinal_regions)
    tournament_data.append([region, round_name, winner_seed, winner_team, winner_strength])
tournament_df = pd.DataFrame(tournament_data, columns=["Region", "Round", "Winner Seed", "Winner", "Winner Strength"])

# championship simulation: played between the recorded "Final 4" winners
round_name, region = "Championship", "N/A"
champion_stats = get_f4_champ_winner(tournament_df, "Final 4", [])
winner_seed, winner_team, winner_strength = champion_stats
tournament_data.append([region, round_name, winner_seed, winner_team, winner_strength])
result_columns = ["Region", "Round", "Winner Seed", "Winner", "Winner Strength"]
tournament_df = pd.DataFrame(tournament_data, columns=result_columns)
tournament_df.head()

Unnamed: 0,Region,Round,Winner Seed,Winner,Winner Strength
0,East,First,1,Connecticut,31.2
1,East,First,9,Northwestern,16.0
2,East,First,5,San Diego St.,16.5
3,East,First,4,Auburn,26.8
4,East,First,6,BYU,17.4
