## March Madness XGBoost Model

March Madness is one of the most popular sporting spectacles in the world, drawing millions of viewers and generating some of the greatest stories in sports. As the tournament approaches, so does a huge uptick in betting, bracket-making, and general attempts to predict the outcome of every single game. To make those predictions, I built an XGBoost model that predicts each team's score, trained on historical data from 2003 to 2023. Given the predicted scores, the model picks the team that advances from each game and, round by round, the team that eventually wins the whole tournament.

***
## Table of Contents:
* [Creating the Training Dataset](#creating)
* [Transforming Data into Metrics](#transforming)
* [Preparing and Running the Model](#modeling)
* [Predicting Every Round](#predicting)
* [The Final Bracket](#bracket)

<br>

***

In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

<a id="creating"></a>

### Creating the Training Dataset

In [2]:
# Load the raw CSV inputs. The paths are handed straight to pandas, which opens
# and closes each file itself, so no open file handles linger in the kernel.
current_seeds_df = pd.read_csv("./Data/2024_tourney_seeds.csv")

m_past_seeds = pd.read_csv("./Data/MNCAATourneySeeds.csv")

m_reg_results_df = pd.read_csv("./Data/MRegularSeasonDetailedResults.csv")

m_tourney_results_df = pd.read_csv("./Data/MNCAATourneyCompactResults.csv")

In [3]:
# Build the training frame: every tournament game from 2003 onward contributes
# two rows, one per team, with that team's own score as the target.
m_trainset_full = pd.DataFrame()
past_yearlist = list(range(2003, 2024))
m_tourney_results_df = m_tourney_results_df.where(m_tourney_results_df["Season"] >= 2003).dropna()

m_trainset_seasoncol = []
m_trainset_team0col = []
m_trainset_team1col = []
m_trainset_scorecol = []

# Convert each column to a list once up front rather than inside the loop,
# then walk the games in lockstep.
game_rows = zip(
    m_tourney_results_df["Season"].tolist(),
    m_tourney_results_df["WTeamID"].tolist(),
    m_tourney_results_df["LTeamID"].tolist(),
    m_tourney_results_df["WScore"].tolist(),
    m_tourney_results_df["LScore"].tolist(),
)

for season, w_team, l_team, w_score, l_score in game_rows:
    # First row: the winner as Team0; second row: the loser as Team0.
    m_trainset_seasoncol += [season, season]
    m_trainset_team0col += [w_team, l_team]
    m_trainset_team1col += [l_team, w_team]
    m_trainset_scorecol += [w_score, l_score]

m_trainset_full.insert(0, "Season", m_trainset_seasoncol)
m_trainset_full.insert(1, "Team0", m_trainset_team0col)
m_trainset_full.insert(2, "Team1", m_trainset_team1col)
m_trainset_full.insert(3, "Score", m_trainset_scorecol)

m_trainset_full

Unnamed: 0,Season,Team0,Team1,Score
0,2003.0,1421.0,1411.0,92.0
1,2003.0,1411.0,1421.0,84.0
2,2003.0,1112.0,1436.0,80.0
3,2003.0,1436.0,1112.0,51.0
4,2003.0,1113.0,1272.0,84.0
...,...,...,...,...
2625,2023.0,1274.0,1163.0,59.0
2626,2023.0,1361.0,1194.0,72.0
2627,2023.0,1194.0,1361.0,71.0
2628,2023.0,1163.0,1361.0,76.0


In [4]:
# Build the 2024 first-round prediction frame: one row per team, ordered so
# that each consecutive pair of rows is one first-round game.
m_predset_full = pd.DataFrame()
m_current_seeds_df = current_seeds_df.where(current_seeds_df["Tournament"] == "M").dropna()

# Materialise the two columns once instead of calling .tolist() per iteration.
m_current_seed_list = m_current_seeds_df["Seed"].tolist()
m_current_team_list = m_current_seeds_df["TeamID"].tolist()

# Seed -> TeamID lookup, built from the men's rows only. setdefault keeps the
# first TeamID listed for a seed.
m_current_seeds_teamdict = {}
for seed, team in zip(m_current_seed_list, m_current_team_list):
    m_current_seeds_teamdict.setdefault(seed, team)

m_predset_seasoncol = [2024] * len(m_current_seed_list)
m_predset_seedcol = []
m_predset_team0col = []
m_predset_team1col = []

# Split the seeds by region letter, preserving file order.
m_current_seeds_w = []
m_current_seeds_x = []
m_current_seeds_y = []
m_current_seeds_z = []

for seed in m_current_seed_list:
    if "W" in seed:
        m_current_seeds_w.append(seed)
    elif "X" in seed:
        m_current_seeds_x.append(seed)
    elif "Y" in seed:
        m_current_seeds_y.append(seed)
    elif "Z" in seed:
        m_current_seeds_z.append(seed)

# First eight seeds of each region, and the remaining seeds reversed, so that
# position i of "top" plays position i of "bottom".
# NOTE(review): assumes each region lists 16 seeds in ascending order - confirm
# against the seeds file.
m_current_seeds_w_top = m_current_seeds_w[:8]
m_current_seeds_w_bottom = m_current_seeds_w[8:][::-1]
m_current_seeds_x_top = m_current_seeds_x[:8]
m_current_seeds_x_bottom = m_current_seeds_x[8:][::-1]
m_current_seeds_y_top = m_current_seeds_y[:8]
m_current_seeds_y_bottom = m_current_seeds_y[8:][::-1]
m_current_seeds_z_top = m_current_seeds_z[:8]
m_current_seeds_z_bottom = m_current_seeds_z[8:][::-1]

# Rebuild each region list as interleaved matchups: top[0], bottom[0], top[1], ...
m_current_seeds_w = []
m_current_seeds_x = []
m_current_seeds_y = []
m_current_seeds_z = []

for i in range(len(m_current_seeds_w_top)):
    m_current_seeds_w += [m_current_seeds_w_top[i], m_current_seeds_w_bottom[i]]
    m_current_seeds_x += [m_current_seeds_x_top[i], m_current_seeds_x_bottom[i]]
    m_current_seeds_y += [m_current_seeds_y_top[i], m_current_seeds_y_bottom[i]]
    m_current_seeds_z += [m_current_seeds_z_top[i], m_current_seeds_z_bottom[i]]

m_current_seeds_bracket = m_current_seeds_w + m_current_seeds_x + m_current_seeds_y + m_current_seeds_z

# Team0 is the row's own team.
m_current_seeds_bracket_team0 = [m_current_seeds_teamdict[seed] for seed in m_current_seeds_bracket]

# Team1 is the opponent: the same list with the two members of each pair swapped.
m_current_seeds_bracket_team1 = list(m_current_seeds_bracket_team0)
for i in range(1, len(m_current_seeds_bracket_team1), 2):
    m_current_seeds_bracket_team1[i - 1], m_current_seeds_bracket_team1[i] = (
        m_current_seeds_bracket_team1[i],
        m_current_seeds_bracket_team1[i - 1],
    )

m_predset_full.insert(0, "Season", m_predset_seasoncol)
m_predset_full.insert(1, "Seed", m_current_seeds_bracket)
m_predset_full.insert(2, "Team0", m_current_seeds_bracket_team0)
m_predset_full.insert(3, "Team1", m_current_seeds_bracket_team1)

In [5]:
# Per-season lists of Team0 / Team1 IDs: historical seasons come from the
# training frame, 2024 comes from the prediction frame.
m_team0_dict = {}
m_team1_dict = {}

for year in range(2003, 2025):
    if year != 2024:
        in_season = m_trainset_full["Season"] == year
        m_team0_dict[year] = m_trainset_full["Team0"].where(in_season).dropna().tolist()
        m_team1_dict[year] = m_trainset_full["Team1"].where(in_season).dropna().tolist()
        continue

    m_team0_dict[year] = m_predset_full["Team0"].tolist()
    m_team1_dict[year] = m_predset_full["Team1"].tolist()

<a id="transforming"></a>

### Transforming Data into Metrics

In [6]:
# Metric Dictionary
# Each table maps metric name -> season -> list of per-row values. The lists
# start empty and are filled in by the metric-creation cells.
yearlist = list(range(2003, 2025))
metriclist = ["AVG_PPG", "FG_ATT", "FG_PCT", "3PT_ATT", "3PT_PCT", "FT_ATT", "FT_PCT", "OFF_RB", "DEF_RB", "AVG_AST", "AVG_TO", "AVG_STL", "AVG_BLK", "EFG_PCT", "TS_PCT"]

m_team0_metric_dict = {metric: {season: [] for season in yearlist} for metric in metriclist}
m_team1_metric_dict = {metric: {season: [] for season in yearlist} for metric in metriclist}
m_team0_opp_metric_dict = {metric: {season: [] for season in yearlist} for metric in metriclist}
m_team1_opp_metric_dict = {metric: {season: [] for season in yearlist} for metric in metriclist}

In [7]:
# Metric Creation for Team 0

def _m_team_metrics(year_results, team):
    """Compute one team's own regular-season metrics for a single season.

    Parameters
    ----------
    year_results : DataFrame
        Regular-season detailed results restricted to one season.
    team : TeamID to summarise.

    Returns
    -------
    dict
        Maps every name in ``metriclist`` to the rounded value for ``team``.
    """
    # Build the win/loss row masks once and reuse them for every statistic.
    won = year_results["WTeamID"] == team
    lost = year_results["LTeamID"] == team
    num_games_played = len(year_results.where(won | lost).dropna())

    def season_total(stat):
        # The team's own numbers sit in the W-columns of games it won and in
        # the L-columns of games it lost.
        return year_results["W" + stat].where(won).dropna().sum() + year_results["L" + stat].where(lost).dropna().sum()

    totalpoints = season_total("Score")
    totalfgatt = season_total("FGA")
    totalfgmade = season_total("FGM")
    total3ptatt = season_total("FGA3")
    total3ptmade = season_total("FGM3")
    totalftatt = season_total("FTA")
    totalftmade = season_total("FTM")
    total2ptmade = totalfgmade - total3ptmade

    return {
        "AVG_PPG": round(totalpoints / num_games_played, 2),
        "FG_ATT": round(totalfgatt / num_games_played, 2),
        "FG_PCT": round(((totalfgmade / totalfgatt)*100), 2),
        "3PT_ATT": round(total3ptatt / num_games_played, 2),
        "3PT_PCT": round(((total3ptmade / total3ptatt)*100), 2),
        "FT_ATT": round(totalftatt / num_games_played, 2),
        "FT_PCT": round(((totalftmade / totalftatt)*100), 2),
        "OFF_RB": round(season_total("OR") / num_games_played, 2),
        "DEF_RB": round(season_total("DR") / num_games_played, 2),
        "AVG_AST": round(season_total("Ast") / num_games_played, 2),
        "AVG_TO": round(season_total("TO") / num_games_played, 2),
        "AVG_STL": round(season_total("Stl") / num_games_played, 2),
        "AVG_BLK": round(season_total("Blk") / num_games_played, 2),
        # Effective FG%: made threes count 1.5x.
        "EFG_PCT": round(((total2ptmade + (1.5*total3ptmade)) / totalfgatt)*100, 2),
        # True shooting %: points per scoring attempt, free throws weighted 0.44.
        "TS_PCT": round((totalpoints / (2*(totalfgatt + (0.44*totalftatt))))*100, 2),
    }


for year in yearlist:
    year_results = m_reg_results_df.where(m_reg_results_df["Season"] == year).dropna()

    # A team appears once per tournament game it played, so cache its metrics
    # and reuse them for each repeat appearance.
    metrics_by_team = {}
    season_rows = {key: [] for key in metriclist}
    for team in m_team0_dict[year]:
        if team not in metrics_by_team:
            metrics_by_team[team] = _m_team_metrics(year_results, team)
        for key in metriclist:
            season_rows[key].append(metrics_by_team[team][key])

    # Assign rather than append so that re-running this cell does not duplicate rows.
    for key in metriclist:
        m_team0_metric_dict[key][year] = season_rows[key]

In [8]:
# Metric Creation for Team 1
# Rows come in pairs (the two teams of one game), so Team 1's metrics are
# Team 0's metrics with the two members of every pair swapped.

for key in m_team0_metric_dict.keys():
    for year in yearlist:
        swapped = list(m_team0_metric_dict[key][year])
        for i in range(1, len(swapped), 2):
            swapped[i - 1], swapped[i] = swapped[i], swapped[i - 1]
        # Assign a fresh list so that re-running the cell cannot interleave new
        # values into an already-populated list.
        m_team1_metric_dict[key][year] = swapped

In [9]:
# Metric Creation for Team 0 (Opponent/Defensive Metric)

def _m_team_opp_metrics(year_results, team):
    """Compute what one team's opponents produced against it in a single season.

    Parameters
    ----------
    year_results : DataFrame
        Regular-season detailed results restricted to one season.
    team : TeamID whose opponents are summarised.

    Returns
    -------
    dict
        Maps every name in ``metriclist`` to the rounded opponent value.
    """
    # Build the win/loss row masks once and reuse them for every statistic.
    won = year_results["WTeamID"] == team
    lost = year_results["LTeamID"] == team
    num_games_played = len(year_results.where(won | lost).dropna())

    def season_total(stat):
        # The opponent's numbers sit in the L-columns of games the team won
        # and in the W-columns of games the team lost.
        return year_results["L" + stat].where(won).dropna().sum() + year_results["W" + stat].where(lost).dropna().sum()

    totalpoints = season_total("Score")
    totalfgatt = season_total("FGA")
    totalfgmade = season_total("FGM")
    total3ptatt = season_total("FGA3")
    total3ptmade = season_total("FGM3")
    totalftatt = season_total("FTA")
    totalftmade = season_total("FTM")
    total2ptmade = totalfgmade - total3ptmade

    return {
        "AVG_PPG": round(totalpoints / num_games_played, 2),
        "FG_ATT": round(totalfgatt / num_games_played, 2),
        "FG_PCT": round(((totalfgmade / totalfgatt)*100), 2),
        "3PT_ATT": round(total3ptatt / num_games_played, 2),
        "3PT_PCT": round(((total3ptmade / total3ptatt)*100), 2),
        "FT_ATT": round(totalftatt / num_games_played, 2),
        "FT_PCT": round(((totalftmade / totalftatt)*100), 2),
        "OFF_RB": round(season_total("OR") / num_games_played, 2),
        "DEF_RB": round(season_total("DR") / num_games_played, 2),
        "AVG_AST": round(season_total("Ast") / num_games_played, 2),
        "AVG_TO": round(season_total("TO") / num_games_played, 2),
        "AVG_STL": round(season_total("Stl") / num_games_played, 2),
        "AVG_BLK": round(season_total("Blk") / num_games_played, 2),
        # Effective FG%: made threes count 1.5x.
        "EFG_PCT": round(((total2ptmade + (1.5*total3ptmade)) / totalfgatt)*100, 2),
        # True shooting %: points per scoring attempt, free throws weighted 0.44.
        "TS_PCT": round((totalpoints / (2*(totalfgatt + (0.44*totalftatt))))*100, 2),
    }


for year in yearlist:
    year_results = m_reg_results_df.where(m_reg_results_df["Season"] == year).dropna()

    # A team appears once per tournament game it played, so cache its metrics
    # and reuse them for each repeat appearance.
    opp_metrics_by_team = {}
    season_rows = {key: [] for key in metriclist}
    for team in m_team0_dict[year]:
        if team not in opp_metrics_by_team:
            opp_metrics_by_team[team] = _m_team_opp_metrics(year_results, team)
        for key in metriclist:
            season_rows[key].append(opp_metrics_by_team[team][key])

    # Assign rather than append so that re-running this cell does not duplicate rows.
    for key in metriclist:
        m_team0_opp_metric_dict[key][year] = season_rows[key]

In [10]:
# Metric Creation for Team 1 (Opponent/Defensive Metric)
# Rows come in pairs (the two teams of one game), so Team 1's opponent metrics
# are Team 0's opponent metrics with the two members of every pair swapped.

for key in m_team0_opp_metric_dict.keys():
    for year in yearlist:
        swapped = list(m_team0_opp_metric_dict[key][year])
        for i in range(1, len(swapped), 2):
            swapped[i - 1], swapped[i] = swapped[i], swapped[i - 1]
        # Assign a fresh list so that re-running the cell cannot interleave new
        # values into an already-populated list.
        m_team1_opp_metric_dict[key][year] = swapped

In [11]:
# Combining all metric lists and creating columns for full dataset
# Seasons are concatenated in order; 2020 and 2024 are left out of the
# training columns (2024 is the season being predicted).

train_seasons = [season for season in yearlist if season not in (2020, 2024)]

m_trainset_team0_cols = {}
m_trainset_team1_cols = {}
m_trainset_team0_opp_cols = {}
m_trainset_team1_opp_cols = {}

for key in metriclist:
    m_trainset_team0_cols[key] = [value for season in train_seasons for value in m_team0_metric_dict[key][season]]
    m_trainset_team1_cols[key] = [value for season in train_seasons for value in m_team1_metric_dict[key][season]]
    m_trainset_team0_opp_cols[key] = [value for season in train_seasons for value in m_team0_opp_metric_dict[key][season]]
    m_trainset_team1_opp_cols[key] = [value for season in train_seasons for value in m_team1_opp_metric_dict[key][season]]

In [12]:
# Inserting all metric columns into trainset
# Column groups are appended in this order: Team 0, Team 1, Team 0 opponent
# metrics, Team 1 opponent metrics.

trainset_column_groups = (
    ("T0", m_trainset_team0_cols),
    ("T1", m_trainset_team1_cols),
    ("T0 Opp.", m_trainset_team0_opp_cols),
    ("T1 Opp.", m_trainset_team1_opp_cols),
)

for prefix, columns in trainset_column_groups:
    for key in metriclist:
        m_trainset_full.insert(len(m_trainset_full.columns), f"{prefix} {key}", columns[key])

In [13]:
# Inserting all metric columns into predset
# Same column order as the trainset: Team 0, Team 1, Team 0 opponent metrics,
# Team 1 opponent metrics, all taken from the 2024 season.

predset_column_groups = (
    ("T0", m_team0_metric_dict),
    ("T1", m_team1_metric_dict),
    ("T0 Opp.", m_team0_opp_metric_dict),
    ("T1 Opp.", m_team1_opp_metric_dict),
)

for prefix, table in predset_column_groups:
    for key in metriclist:
        m_predset_full.insert(len(m_predset_full.columns), f"{prefix} {key}", table[key][2024])

<a id="modeling"></a>

### Preparing and Running the Model

In [14]:
# Preparing trainset for modeling
# Identifier columns are dropped; "Score" is the regression target and every
# remaining column is a feature.

m_trainset_formodel = m_trainset_full.drop(["Season", "Team0", "Team1"], axis = 1)
y = m_trainset_formodel[["Score"]]
X = m_trainset_formodel.drop(columns = "Score")
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

dtrain_reg = xgb.DMatrix(X_train, label = y_train)
dtest_reg = xgb.DMatrix(X_test, label = y_test)

In [15]:
# Running the model and model evaluation
# Trains a squared-error regressor for up to `n` rounds, stopping early once
# the validation RMSE has not improved for 20 rounds.

params = {"objective": "reg:squarederror", "tree_method": "auto"}
n = 1000
evals = [(dtrain_reg , "train"), (dtest_reg, "validation")]

model = xgb.train(
    params = params,
    dtrain = dtrain_reg,
    num_boost_round = n,
    evals = evals,
    verbose_eval = 10,
    early_stopping_rounds = 20
)

# NOTE(review): depending on the installed XGBoost version, Booster.predict may
# score with every trained round rather than the early-stopping best round -
# confirm, and pass iteration_range if the best round is what is wanted.
preds = model.predict(dtest_reg)

# Take the square root explicitly: the `squared=False` keyword of
# mean_squared_error is deprecated and absent from newer scikit-learn releases.
rmse = mean_squared_error(y_test, preds) ** 0.5
print(f"RMSE of the base model: {rmse:.3f}")

[0]	train-rmse:49.69340	validation-rmse:49.46804


[10]	train-rmse:7.28689	validation-rmse:10.90321
[20]	train-rmse:5.47924	validation-rmse:10.91836
[30]	train-rmse:4.41005	validation-rmse:10.90409
[34]	train-rmse:4.03965	validation-rmse:10.92013
RMSE of the base model: 10.915


In [64]:
# Using the model on prediction data
# Rows alternate between the two teams of each game, so even positions hold
# one side's predicted score and odd positions the other's.

m_predset_formodel = m_predset_full.drop(["Season", "Seed", "Team0", "Team1"], axis = 1)
m_predset_matrix = xgb.DMatrix(m_predset_formodel)
prediction = model.predict(m_predset_matrix)

high_seed_prediction = prediction[::2]
low_seed_prediction = prediction[1::2]
round1_results_df = pd.DataFrame({
    "High Seed Score": high_seed_prediction,
    "Low Seed Score": low_seed_prediction,
})

round1_results_df.head()

Unnamed: 0,High Seed Score,Low Seed Score
0,80.455826,63.574142
1,77.951469,61.477436
2,69.94207,63.980904
3,69.194618,66.653526
4,75.883797,69.346764


<a id="predicting"></a>

### Predicting Every Round

In [65]:
# Determining winners/losers and creating the bracket for the subsequent round
# Each consecutive pair of predictions is one game; the seed with the strictly
# higher predicted score advances (a tie advances the second seed of the pair).

m_current_seeds_round2 = [
    m_current_seeds_bracket[game] if prediction[game] > prediction[game + 1] else m_current_seeds_bracket[game + 1]
    for game in range(0, len(prediction), 2)
]

# Eight winners per region: the first four stay in order, the last four are
# reversed so that position k of "top" meets position k of "bottom".
m_current_seeds_bracket_round2_w_top = m_current_seeds_round2[:4]
m_current_seeds_bracket_round2_w_bottom = m_current_seeds_round2[4:8][::-1]
m_current_seeds_bracket_round2_x_top = m_current_seeds_round2[8:12]
m_current_seeds_bracket_round2_x_bottom = m_current_seeds_round2[12:16][::-1]
m_current_seeds_bracket_round2_y_top = m_current_seeds_round2[16:20]
m_current_seeds_bracket_round2_y_bottom = m_current_seeds_round2[20:24][::-1]
m_current_seeds_bracket_round2_z_top = m_current_seeds_round2[24:28]
m_current_seeds_bracket_round2_z_bottom = m_current_seeds_round2[28:32][::-1]

m_current_seeds_bracket_round2_top = (
    m_current_seeds_bracket_round2_w_top
    + m_current_seeds_bracket_round2_x_top
    + m_current_seeds_bracket_round2_y_top
    + m_current_seeds_bracket_round2_z_top
)
m_current_seeds_bracket_round2_bottom = (
    m_current_seeds_bracket_round2_w_bottom
    + m_current_seeds_bracket_round2_x_bottom
    + m_current_seeds_bracket_round2_y_bottom
    + m_current_seeds_bracket_round2_z_bottom
)

# Interleave the two halves into 16 matchups (32 rows).
m_current_seeds_bracket_round2 = [
    seed
    for slot in range(0, 16)
    for seed in (m_current_seeds_bracket_round2_top[slot], m_current_seeds_bracket_round2_bottom[slot])
]

In [66]:
# Creating predset for next round

m_predset_round2 = pd.DataFrame()
m_predset_round2.insert(0, "Seed", m_current_seeds_bracket_round2)

# Team0 is the row's own team; Team1 is its opponent, i.e. the same list with
# the two members of every pair swapped.
m_current_seeds_bracket_round2_team0 = [m_current_seeds_teamdict[seed] for seed in m_current_seeds_bracket_round2]

m_current_seeds_bracket_round2_team1 = list(m_current_seeds_bracket_round2_team0)
for odd in range(1, len(m_current_seeds_bracket_round2_team1), 2):
    m_current_seeds_bracket_round2_team1[odd - 1], m_current_seeds_bracket_round2_team1[odd] = (
        m_current_seeds_bracket_round2_team1[odd],
        m_current_seeds_bracket_round2_team1[odd - 1],
    )

# Per-metric lookup tables (team -> value) built from the 2024 metrics of the
# full first-round field.
m_predset_metrics_round2 = {}
m_predset_opp_metrics_round2 = {}

for metric in metriclist:
    own_values = m_team0_metric_dict[metric][2024]
    opp_values = m_team0_opp_metric_dict[metric][2024]
    m_predset_metrics_perteam0 = {team: own_values[pos] for pos, team in enumerate(m_current_seeds_bracket_team0)}
    m_predset_opp_metrics_perteam0 = {team: opp_values[pos] for pos, team in enumerate(m_current_seeds_bracket_team0)}
    m_predset_metrics_round2[metric] = m_predset_metrics_perteam0
    m_predset_opp_metrics_round2[metric] = m_predset_opp_metrics_perteam0

# Metric columns for the round 2 teams, in bracket order
m_predset_metrics_round2_list_team0 = {
    metric: [m_predset_metrics_round2[metric][team] for team in m_current_seeds_bracket_round2_team0]
    for metric in metriclist
}
m_predset_opp_metrics_round2_list_team0 = {
    metric: [m_predset_opp_metrics_round2[metric][team] for team in m_current_seeds_bracket_round2_team0]
    for metric in metriclist
}

# Opponent-side columns: the Team0 columns with each pair swapped
m_predset_metrics_round2_list_team1 = {}
m_predset_opp_metrics_round2_list_team1 = {}
for metric in metriclist:
    own_swapped = list(m_predset_metrics_round2_list_team0[metric])
    opp_swapped = list(m_predset_opp_metrics_round2_list_team0[metric])
    for odd in range(1, len(own_swapped), 2):
        own_swapped[odd - 1], own_swapped[odd] = own_swapped[odd], own_swapped[odd - 1]
        opp_swapped[odd - 1], opp_swapped[odd] = opp_swapped[odd], opp_swapped[odd - 1]
    m_predset_metrics_round2_list_team1[metric] = own_swapped
    m_predset_opp_metrics_round2_list_team1[metric] = opp_swapped

# Inserting column lists into new dataframe
m_predset_round2.insert(len(m_predset_round2.columns), "Team0", m_current_seeds_bracket_round2_team0)
m_predset_round2.insert(len(m_predset_round2.columns), "Team1", m_current_seeds_bracket_round2_team1)

round2_column_groups = (
    ("T0", m_predset_metrics_round2_list_team0),
    ("T1", m_predset_metrics_round2_list_team1),
    ("T0 Opp.", m_predset_opp_metrics_round2_list_team0),
    ("T1 Opp.", m_predset_opp_metrics_round2_list_team1),
)
for prefix, columns in round2_column_groups:
    for metric in metriclist:
        m_predset_round2.insert(len(m_predset_round2.columns), f"{prefix} {metric}", columns[metric])

In [67]:
# Using the model on round 2 prediction data
# Even positions hold the first team of each game, odd positions the second.

m_predset_round2_formodel = m_predset_round2.drop(["Seed", "Team0", "Team1"], axis = 1)
m_predset_round2_matrix = xgb.DMatrix(m_predset_round2_formodel)
round2_prediction = model.predict(m_predset_round2_matrix)

high_seed_round2_prediction = round2_prediction[::2]
low_seed_round2_prediction = round2_prediction[1::2]
round2_results_df = pd.DataFrame({
    "High Seed Score": high_seed_round2_prediction,
    "Low Seed Score": low_seed_round2_prediction,
})

round2_results_df.head()

Unnamed: 0,High Seed Score,Low Seed Score
0,72.419861,71.057907
1,70.286034,69.851624
2,72.587044,77.755966
3,76.148811,68.954147
4,73.7043,71.948288


In [68]:
# Rinsing and repeating for every round, changing small aspects to accommodate different bracket sizes
# ROUND 3

# Determining winners/losers and creating the bracket for the subsequent round
# Each consecutive pair of round 2 predictions is one game; the seed with the
# strictly higher predicted score advances (a tie advances the second seed).

m_current_seeds_round3 = [
    m_current_seeds_bracket_round2[game] if round2_prediction[game] > round2_prediction[game + 1] else m_current_seeds_bracket_round2[game + 1]
    for game in range(0, len(round2_prediction), 2)
]

# Four winners per region: the first two stay in order, the last two are
# reversed so that position k of "top" meets position k of "bottom".
m_current_seeds_bracket_round3_w_top = m_current_seeds_round3[:2]
m_current_seeds_bracket_round3_w_bottom = m_current_seeds_round3[2:4][::-1]
m_current_seeds_bracket_round3_x_top = m_current_seeds_round3[4:6]
m_current_seeds_bracket_round3_x_bottom = m_current_seeds_round3[6:8][::-1]
m_current_seeds_bracket_round3_y_top = m_current_seeds_round3[8:10]
m_current_seeds_bracket_round3_y_bottom = m_current_seeds_round3[10:12][::-1]
m_current_seeds_bracket_round3_z_top = m_current_seeds_round3[12:14]
m_current_seeds_bracket_round3_z_bottom = m_current_seeds_round3[14:16][::-1]

m_current_seeds_bracket_round3_top = (
    m_current_seeds_bracket_round3_w_top
    + m_current_seeds_bracket_round3_x_top
    + m_current_seeds_bracket_round3_y_top
    + m_current_seeds_bracket_round3_z_top
)
m_current_seeds_bracket_round3_bottom = (
    m_current_seeds_bracket_round3_w_bottom
    + m_current_seeds_bracket_round3_x_bottom
    + m_current_seeds_bracket_round3_y_bottom
    + m_current_seeds_bracket_round3_z_bottom
)

# Interleave the two halves into 8 matchups (16 rows).
m_current_seeds_bracket_round3 = [
    seed
    for slot in range(0, 8)
    for seed in (m_current_seeds_bracket_round3_top[slot], m_current_seeds_bracket_round3_bottom[slot])
]

# Creating predset for next round

m_predset_round3 = pd.DataFrame()
m_predset_round3.insert(0, "Seed", m_current_seeds_bracket_round3)

# Team0 is the row's own team; Team1 is its opponent, i.e. the same list with
# the two members of every pair swapped.
m_current_seeds_bracket_round3_team0 = [m_current_seeds_teamdict[seed] for seed in m_current_seeds_bracket_round3]

m_current_seeds_bracket_round3_team1 = list(m_current_seeds_bracket_round3_team0)
for odd in range(1, len(m_current_seeds_bracket_round3_team1), 2):
    m_current_seeds_bracket_round3_team1[odd - 1], m_current_seeds_bracket_round3_team1[odd] = (
        m_current_seeds_bracket_round3_team1[odd],
        m_current_seeds_bracket_round3_team1[odd - 1],
    )

# Per-metric lookup tables (team -> value) built from the 2024 metrics of the
# full first-round field.
m_predset_metrics_round3 = {}
m_predset_opp_metrics_round3 = {}

for metric in metriclist:
    own_values = m_team0_metric_dict[metric][2024]
    opp_values = m_team0_opp_metric_dict[metric][2024]
    m_predset_metrics_perteam0 = {team: own_values[pos] for pos, team in enumerate(m_current_seeds_bracket_team0)}
    m_predset_opp_metrics_perteam0 = {team: opp_values[pos] for pos, team in enumerate(m_current_seeds_bracket_team0)}
    m_predset_metrics_round3[metric] = m_predset_metrics_perteam0
    m_predset_opp_metrics_round3[metric] = m_predset_opp_metrics_perteam0

# Metric columns for the round 3 teams, in bracket order
m_predset_metrics_round3_list_team0 = {
    metric: [m_predset_metrics_round3[metric][team] for team in m_current_seeds_bracket_round3_team0]
    for metric in metriclist
}
m_predset_opp_metrics_round3_list_team0 = {
    metric: [m_predset_opp_metrics_round3[metric][team] for team in m_current_seeds_bracket_round3_team0]
    for metric in metriclist
}

# Opponent-side columns: the Team0 columns with each pair swapped
m_predset_metrics_round3_list_team1 = {}
m_predset_opp_metrics_round3_list_team1 = {}
for metric in metriclist:
    own_swapped = list(m_predset_metrics_round3_list_team0[metric])
    opp_swapped = list(m_predset_opp_metrics_round3_list_team0[metric])
    for odd in range(1, len(own_swapped), 2):
        own_swapped[odd - 1], own_swapped[odd] = own_swapped[odd], own_swapped[odd - 1]
        opp_swapped[odd - 1], opp_swapped[odd] = opp_swapped[odd], opp_swapped[odd - 1]
    m_predset_metrics_round3_list_team1[metric] = own_swapped
    m_predset_opp_metrics_round3_list_team1[metric] = opp_swapped

# Inserting column lists into new dataframe
m_predset_round3.insert(len(m_predset_round3.columns), "Team0", m_current_seeds_bracket_round3_team0)
m_predset_round3.insert(len(m_predset_round3.columns), "Team1", m_current_seeds_bracket_round3_team1)

round3_column_groups = (
    ("T0", m_predset_metrics_round3_list_team0),
    ("T1", m_predset_metrics_round3_list_team1),
    ("T0 Opp.", m_predset_opp_metrics_round3_list_team0),
    ("T1 Opp.", m_predset_opp_metrics_round3_list_team1),
)
for prefix, columns in round3_column_groups:
    for metric in metriclist:
        m_predset_round3.insert(len(m_predset_round3.columns), f"{prefix} {metric}", columns[metric])

In [69]:
# Scoring the round 3 prediction set with the trained model

# Only the feature columns go to the model; the identifier columns are removed first
m_predset_round3_formodel = m_predset_round3.drop(columns=["Seed", "Team0", "Team1"])
m_predset_round3_matrix = xgb.DMatrix(m_predset_round3_formodel)
round3_prediction = model.predict(m_predset_round3_matrix)

# Predictions alternate between the two sides of each game:
# even positions are the first listed team, odd positions the second
high_seed_round3_prediction = round3_prediction[0::2]
low_seed_round3_prediction = round3_prediction[1::2]
round3_results_df = pd.DataFrame(
    {
        "High Seed Score": high_seed_round3_prediction,
        "Low Seed Score": low_seed_round3_prediction,
    }
)

round3_results_df.head()

Unnamed: 0,High Seed Score,Low Seed Score
0,79.00032,66.275948
1,70.44841,71.85276
2,69.980293,75.070435
3,89.232109,77.269547
4,69.690865,76.239738


In [70]:
# Same procedure as the earlier rounds, adjusted for the smaller bracket
# ROUND 4


# Picking the round 3 winners and arranging them into the round 4 bracket

# Each game occupies two consecutive predictions. The first seed advances only when its
# predicted score is strictly higher; otherwise (including a tie) the second seed advances.
m_current_seeds_round4 = [
    m_current_seeds_bracket_round3[first]
    if round3_prediction[first] > round3_prediction[first + 1]
    else m_current_seeds_bracket_round3[first + 1]
    for first in range(0, len(round3_prediction), 2)
]

# The eight winners, in order, fill the top/bottom slot of regions W, X, Y and Z
(
    m_current_seeds_bracket_round4_w_top,
    m_current_seeds_bracket_round4_w_bottom,
    m_current_seeds_bracket_round4_x_top,
    m_current_seeds_bracket_round4_x_bottom,
    m_current_seeds_bracket_round4_y_top,
    m_current_seeds_bracket_round4_y_bottom,
    m_current_seeds_bracket_round4_z_top,
    m_current_seeds_bracket_round4_z_bottom,
) = (m_current_seeds_round4[slot] for slot in range(8))

m_current_seeds_bracket_round4_top = [
    m_current_seeds_bracket_round4_w_top,
    m_current_seeds_bracket_round4_x_top,
    m_current_seeds_bracket_round4_y_top,
    m_current_seeds_bracket_round4_z_top,
]
m_current_seeds_bracket_round4_bottom = [
    m_current_seeds_bracket_round4_w_bottom,
    m_current_seeds_bracket_round4_x_bottom,
    m_current_seeds_bracket_round4_y_bottom,
    m_current_seeds_bracket_round4_z_bottom,
]

# Interleave top and bottom so that each game again occupies two consecutive entries
m_current_seeds_bracket_round4 = [
    seed
    for matchup in zip(m_current_seeds_bracket_round4_top, m_current_seeds_bracket_round4_bottom)
    for seed in matchup
]

# Building the round 4 prediction set

m_predset_round4 = pd.DataFrame()
m_predset_round4.insert(0, "Seed", m_current_seeds_bracket_round4)

# Team0 column: the team stored for each bracket seed in m_current_seeds_teamdict
m_current_seeds_bracket_round4_team0 = [
    m_current_seeds_teamdict[bracket_seed] for bracket_seed in m_current_seeds_bracket_round4
]

# Team1 column: Team0 with every adjacent pair reversed ([a, b, c, d] -> [b, a, d, c]),
# which puts each row's opponent next to it
m_current_seeds_bracket_round4_team1 = [
    team_id
    for pair_start in range(0, len(m_current_seeds_bracket_round4_team0), 2)
    for team_id in reversed(m_current_seeds_bracket_round4_team0[pair_start:pair_start + 2])
]

# Metric lookup tables for the round 4 teams: metric -> {team: value}.
# Values are read by position from the 2024 entry of the metric dictionaries.
m_predset_metrics_round4 = {}
m_predset_opp_metrics_round4 = {}
for metric in metriclist:
    m_predset_metrics_perteam0 = {
        team_key: m_team0_metric_dict[metric][2024][position]
        for position, team_key in enumerate(m_current_seeds_bracket_team0)
    }
    m_predset_opp_metrics_perteam0 = {
        team_key: m_team0_opp_metric_dict[metric][2024][position]
        for position, team_key in enumerate(m_current_seeds_bracket_team0)
    }
    m_predset_metrics_round4[metric] = m_predset_metrics_perteam0
    m_predset_opp_metrics_round4[metric] = m_predset_opp_metrics_perteam0

# Metric values in the row order of the Team0 column
m_predset_metrics_round4_list_team0 = {
    metric: [m_predset_metrics_round4[metric][team_id] for team_id in m_current_seeds_bracket_round4_team0]
    for metric in metriclist
}
m_predset_opp_metrics_round4_list_team0 = {
    metric: [m_predset_opp_metrics_round4[metric][team_id] for team_id in m_current_seeds_bracket_round4_team0]
    for metric in metriclist
}

# Metric values in the row order of the Team1 column (same pairwise reversal as above)
m_predset_metrics_round4_list_team1 = {}
m_predset_opp_metrics_round4_list_team1 = {}
for metric in metriclist:
    team0_values = m_predset_metrics_round4_list_team0[metric]
    team0_opp_values = m_predset_opp_metrics_round4_list_team0[metric]
    m_predset_metrics_round4_list_team1[metric] = [
        value
        for pair_start in range(0, len(team0_values), 2)
        for value in reversed(team0_values[pair_start:pair_start + 2])
    ]
    m_predset_opp_metrics_round4_list_team1[metric] = [
        value
        for pair_start in range(0, len(team0_values), 2)
        for value in reversed(team0_opp_values[pair_start:pair_start + 2])
    ]

# Appending every column at the right edge of the dataframe:
# identifiers first, then one group of metric columns per prefix, each in metriclist order
m_predset_round4.insert(len(m_predset_round4.columns), "Team0", m_current_seeds_bracket_round4_team0)
m_predset_round4.insert(len(m_predset_round4.columns), "Team1", m_current_seeds_bracket_round4_team1)

for column_prefix, values_by_metric in (
    ("T0", m_predset_metrics_round4_list_team0),
    ("T1", m_predset_metrics_round4_list_team1),
    ("T0 Opp.", m_predset_opp_metrics_round4_list_team0),
    ("T1 Opp.", m_predset_opp_metrics_round4_list_team1),
):
    for metric in metriclist:
        m_predset_round4.insert(len(m_predset_round4.columns), f"{column_prefix} {metric}", values_by_metric[metric])

In [71]:
# Scoring the round 4 prediction set with the trained model

# Only the feature columns go to the model; the identifier columns are removed first
m_predset_round4_formodel = m_predset_round4.drop(columns=["Seed", "Team0", "Team1"])
m_predset_round4_matrix = xgb.DMatrix(m_predset_round4_formodel)
round4_prediction = model.predict(m_predset_round4_matrix)

# Predictions alternate between the two sides of each game:
# even positions are the first listed team, odd positions the second
high_seed_round4_prediction = round4_prediction[0::2]
low_seed_round4_prediction = round4_prediction[1::2]
round4_results_df = pd.DataFrame(
    {
        "High Seed Score": high_seed_round4_prediction,
        "Low Seed Score": low_seed_round4_prediction,
    }
)

round4_results_df.head()

Unnamed: 0,High Seed Score,Low Seed Score
0,72.349731,75.759163
1,73.81575,81.519569
2,74.860313,74.428734
3,79.21727,73.082397


In [72]:
# Same procedure as the earlier rounds, adjusted for the smaller bracket
# ROUND 5


# Picking the round 4 winners and arranging them into the round 5 bracket

# Each game occupies two consecutive predictions. The first seed advances only when its
# predicted score is strictly higher; otherwise (including a tie) the second seed advances.
m_current_seeds_round5 = [
    m_current_seeds_bracket_round4[first]
    if round4_prediction[first] > round4_prediction[first + 1]
    else m_current_seeds_bracket_round4[first + 1]
    for first in range(0, len(round4_prediction), 2)
]

# The four winners, in order, are the champions of regions W, X, Y and Z
(
    m_current_seeds_bracket_round5_w,
    m_current_seeds_bracket_round5_x,
    m_current_seeds_bracket_round5_y,
    m_current_seeds_bracket_round5_z,
) = (m_current_seeds_round5[slot] for slot in range(4))

# W and Y take the top slots, X and Z the bottom slots, giving the games W-X and Y-Z
m_current_seeds_bracket_round5_top = [
    m_current_seeds_bracket_round5_w,
    m_current_seeds_bracket_round5_y,
]
m_current_seeds_bracket_round5_bottom = [
    m_current_seeds_bracket_round5_x,
    m_current_seeds_bracket_round5_z,
]

# Interleave top and bottom so that each game again occupies two consecutive entries
m_current_seeds_bracket_round5 = [
    seed
    for matchup in zip(m_current_seeds_bracket_round5_top, m_current_seeds_bracket_round5_bottom)
    for seed in matchup
]

# Building the round 5 prediction set

m_predset_round5 = pd.DataFrame()
m_predset_round5.insert(0, "Seed", m_current_seeds_bracket_round5)

# Team0 column: the team stored for each bracket seed in m_current_seeds_teamdict
m_current_seeds_bracket_round5_team0 = [
    m_current_seeds_teamdict[bracket_seed] for bracket_seed in m_current_seeds_bracket_round5
]

# Team1 column: Team0 with every adjacent pair reversed ([a, b, c, d] -> [b, a, d, c]),
# which puts each row's opponent next to it
m_current_seeds_bracket_round5_team1 = [
    team_id
    for pair_start in range(0, len(m_current_seeds_bracket_round5_team0), 2)
    for team_id in reversed(m_current_seeds_bracket_round5_team0[pair_start:pair_start + 2])
]

# Metric lookup tables for the round 5 teams: metric -> {team: value}.
# Values are read by position from the 2024 entry of the metric dictionaries.
m_predset_metrics_round5 = {}
m_predset_opp_metrics_round5 = {}
for metric in metriclist:
    m_predset_metrics_perteam0 = {
        team_key: m_team0_metric_dict[metric][2024][position]
        for position, team_key in enumerate(m_current_seeds_bracket_team0)
    }
    m_predset_opp_metrics_perteam0 = {
        team_key: m_team0_opp_metric_dict[metric][2024][position]
        for position, team_key in enumerate(m_current_seeds_bracket_team0)
    }
    m_predset_metrics_round5[metric] = m_predset_metrics_perteam0
    m_predset_opp_metrics_round5[metric] = m_predset_opp_metrics_perteam0

# Metric values in the row order of the Team0 column
m_predset_metrics_round5_list_team0 = {
    metric: [m_predset_metrics_round5[metric][team_id] for team_id in m_current_seeds_bracket_round5_team0]
    for metric in metriclist
}
m_predset_opp_metrics_round5_list_team0 = {
    metric: [m_predset_opp_metrics_round5[metric][team_id] for team_id in m_current_seeds_bracket_round5_team0]
    for metric in metriclist
}

# Metric values in the row order of the Team1 column (same pairwise reversal as above)
m_predset_metrics_round5_list_team1 = {}
m_predset_opp_metrics_round5_list_team1 = {}
for metric in metriclist:
    team0_values = m_predset_metrics_round5_list_team0[metric]
    team0_opp_values = m_predset_opp_metrics_round5_list_team0[metric]
    m_predset_metrics_round5_list_team1[metric] = [
        value
        for pair_start in range(0, len(team0_values), 2)
        for value in reversed(team0_values[pair_start:pair_start + 2])
    ]
    m_predset_opp_metrics_round5_list_team1[metric] = [
        value
        for pair_start in range(0, len(team0_values), 2)
        for value in reversed(team0_opp_values[pair_start:pair_start + 2])
    ]

# Appending every column at the right edge of the dataframe:
# identifiers first, then one group of metric columns per prefix, each in metriclist order
m_predset_round5.insert(len(m_predset_round5.columns), "Team0", m_current_seeds_bracket_round5_team0)
m_predset_round5.insert(len(m_predset_round5.columns), "Team1", m_current_seeds_bracket_round5_team1)

for column_prefix, values_by_metric in (
    ("T0", m_predset_metrics_round5_list_team0),
    ("T1", m_predset_metrics_round5_list_team1),
    ("T0 Opp.", m_predset_opp_metrics_round5_list_team0),
    ("T1 Opp.", m_predset_opp_metrics_round5_list_team1),
):
    for metric in metriclist:
        m_predset_round5.insert(len(m_predset_round5.columns), f"{column_prefix} {metric}", values_by_metric[metric])

In [73]:
# Scoring the round 5 prediction set with the trained model

# Only the feature columns go to the model; the identifier columns are removed first
m_predset_round5_formodel = m_predset_round5.drop(columns=["Seed", "Team0", "Team1"])
m_predset_round5_matrix = xgb.DMatrix(m_predset_round5_formodel)
round5_prediction = model.predict(m_predset_round5_matrix)

# Predictions alternate between the two sides of each game:
# even positions are the first listed team, odd positions the second
high_seed_round5_prediction = round5_prediction[0::2]
low_seed_round5_prediction = round5_prediction[1::2]
round5_results_df = pd.DataFrame(
    {
        "High Seed Score": high_seed_round5_prediction,
        "Low Seed Score": low_seed_round5_prediction,
    }
)

round5_results_df.head()

Unnamed: 0,High Seed Score,Low Seed Score
0,85.63047,79.908516
1,75.650513,76.35025


In [74]:
# Same procedure as the earlier rounds, adjusted for the smaller bracket
# ROUND 6


# Picking the round 5 winners and arranging them into the round 6 bracket

# Each game occupies two consecutive predictions. The first seed advances only when its
# predicted score is strictly higher; otherwise (including a tie) the second seed advances.
m_current_seeds_round6 = [
    m_current_seeds_bracket_round5[first]
    if round5_prediction[first] > round5_prediction[first + 1]
    else m_current_seeds_bracket_round5[first + 1]
    for first in range(0, len(round5_prediction), 2)
]

# Two finalists remain: the winner of the W/X game and the winner of the Y/Z game
m_current_seeds_bracket_round6_wx, m_current_seeds_bracket_round6_yz = (
    m_current_seeds_round6[slot] for slot in range(2)
)

m_current_seeds_bracket_round6 = [
    m_current_seeds_bracket_round6_wx,
    m_current_seeds_bracket_round6_yz,
]

# Building the round 6 prediction set

m_predset_round6 = pd.DataFrame()
m_predset_round6.insert(0, "Seed", m_current_seeds_bracket_round6)

# Team0 column: the team stored for each bracket seed in m_current_seeds_teamdict
m_current_seeds_bracket_round6_team0 = [
    m_current_seeds_teamdict[bracket_seed] for bracket_seed in m_current_seeds_bracket_round6
]

# Team1 column: Team0 with every adjacent pair reversed ([a, b] -> [b, a]),
# which puts each row's opponent next to it
m_current_seeds_bracket_round6_team1 = [
    team_id
    for pair_start in range(0, len(m_current_seeds_bracket_round6_team0), 2)
    for team_id in reversed(m_current_seeds_bracket_round6_team0[pair_start:pair_start + 2])
]

# Metric lookup tables for the round 6 teams: metric -> {team: value}.
# Values are read by position from the 2024 entry of the metric dictionaries.
m_predset_metrics_round6 = {}
m_predset_opp_metrics_round6 = {}
for metric in metriclist:
    m_predset_metrics_perteam0 = {
        team_key: m_team0_metric_dict[metric][2024][position]
        for position, team_key in enumerate(m_current_seeds_bracket_team0)
    }
    m_predset_opp_metrics_perteam0 = {
        team_key: m_team0_opp_metric_dict[metric][2024][position]
        for position, team_key in enumerate(m_current_seeds_bracket_team0)
    }
    m_predset_metrics_round6[metric] = m_predset_metrics_perteam0
    m_predset_opp_metrics_round6[metric] = m_predset_opp_metrics_perteam0

# Metric values in the row order of the Team0 column
m_predset_metrics_round6_list_team0 = {
    metric: [m_predset_metrics_round6[metric][team_id] for team_id in m_current_seeds_bracket_round6_team0]
    for metric in metriclist
}
m_predset_opp_metrics_round6_list_team0 = {
    metric: [m_predset_opp_metrics_round6[metric][team_id] for team_id in m_current_seeds_bracket_round6_team0]
    for metric in metriclist
}

# Metric values in the row order of the Team1 column (same pairwise reversal as above)
m_predset_metrics_round6_list_team1 = {}
m_predset_opp_metrics_round6_list_team1 = {}
for metric in metriclist:
    team0_values = m_predset_metrics_round6_list_team0[metric]
    team0_opp_values = m_predset_opp_metrics_round6_list_team0[metric]
    m_predset_metrics_round6_list_team1[metric] = [
        value
        for pair_start in range(0, len(team0_values), 2)
        for value in reversed(team0_values[pair_start:pair_start + 2])
    ]
    m_predset_opp_metrics_round6_list_team1[metric] = [
        value
        for pair_start in range(0, len(team0_values), 2)
        for value in reversed(team0_opp_values[pair_start:pair_start + 2])
    ]

# Appending every column at the right edge of the dataframe:
# identifiers first, then one group of metric columns per prefix, each in metriclist order
m_predset_round6.insert(len(m_predset_round6.columns), "Team0", m_current_seeds_bracket_round6_team0)
m_predset_round6.insert(len(m_predset_round6.columns), "Team1", m_current_seeds_bracket_round6_team1)

for column_prefix, values_by_metric in (
    ("T0", m_predset_metrics_round6_list_team0),
    ("T1", m_predset_metrics_round6_list_team1),
    ("T0 Opp.", m_predset_opp_metrics_round6_list_team0),
    ("T1 Opp.", m_predset_opp_metrics_round6_list_team1),
):
    for metric in metriclist:
        m_predset_round6.insert(len(m_predset_round6.columns), f"{column_prefix} {metric}", values_by_metric[metric])

In [75]:
# Scoring the round 6 prediction set with the trained model

# Only the feature columns go to the model; the identifier columns are removed first
m_predset_round6_formodel = m_predset_round6.drop(columns=["Seed", "Team0", "Team1"])
m_predset_round6_matrix = xgb.DMatrix(m_predset_round6_formodel)
round6_prediction = model.predict(m_predset_round6_matrix)

# Predictions alternate between the two sides of each game:
# even positions are the first listed team, odd positions the second
high_seed_round6_prediction = round6_prediction[0::2]
low_seed_round6_prediction = round6_prediction[1::2]
round6_results_df = pd.DataFrame(
    {
        "High Seed Score": high_seed_round6_prediction,
        "Low Seed Score": low_seed_round6_prediction,
    }
)

round6_results_df.head()

Unnamed: 0,High Seed Score,Low Seed Score
0,78.606873,77.986954


In [76]:
# Deciding the champion from the round 6 predictions

# Same rule as every earlier round: the first seed of a game wins only with a strictly
# higher predicted score; otherwise (including a tie) the second seed wins.
m_current_seeds_winner = [
    m_current_seeds_bracket_round6[first]
    if round6_prediction[first] > round6_prediction[first + 1]
    else m_current_seeds_bracket_round6[first + 1]
    for first in range(0, len(round6_prediction), 2)
]

m_current_seeds_winner

['W06']

In [113]:
# Overall Bracket Table

# Every bracket from round 1 through round 6, followed by the champion, as one flat list
m_current_seeds_bracket_ovr = (
    m_current_seeds_bracket
    + m_current_seeds_bracket_round2
    + m_current_seeds_bracket_round3
    + m_current_seeds_bracket_round4
    + m_current_seeds_bracket_round5
    + m_current_seeds_bracket_round6
    + m_current_seeds_winner
)

# Seed

m_current_seeds_bracket_ovr_col_team0 = []
m_current_seeds_bracket_ovr_col_team1 = []
m_current_seeds_bracket_ovr_col_winner = []

# Each round's bracket is paired with the seeds that advanced out of it. Within a bracket,
# even positions are the first team of a game and odd positions the second.
for round_bracket, round_winners in (
    (m_current_seeds_bracket, m_current_seeds_round2),
    (m_current_seeds_bracket_round2, m_current_seeds_round3),
    (m_current_seeds_bracket_round3, m_current_seeds_round4),
    (m_current_seeds_bracket_round4, m_current_seeds_round5),
    (m_current_seeds_bracket_round5, m_current_seeds_round6),
    (m_current_seeds_bracket_round6, m_current_seeds_winner),
):
    m_current_seeds_bracket_ovr_col_team0 += round_bracket[::2]
    m_current_seeds_bracket_ovr_col_team1 += round_bracket[1::2]
    m_current_seeds_bracket_ovr_col_winner += round_winners

# One row per game: both seeds and the seed predicted to win
m_current_seeds_bracket_ovr_df = pd.DataFrame(
    {
        "Team 0": m_current_seeds_bracket_ovr_col_team0,
        "Team 1": m_current_seeds_bracket_ovr_col_team1,
        "Winner": m_current_seeds_bracket_ovr_col_winner,
    }
)

# Team

# Read the team table inside a context manager so the file handle is closed once
# the CSV has been parsed (a bare open() leaves it open for the life of the kernel).
with open("./Data/MTeams.csv") as m_teams_file:
    m_teams_df = pd.read_csv(m_teams_file)

# Translate every seed in the three seed columns through m_current_seeds_teamdict
# (the looked-up values are used as TeamID keys below)
m_current_seeds_bracket_ovr_col_team0_teamid = [
    m_current_seeds_teamdict[seed] for seed in m_current_seeds_bracket_ovr_col_team0
]
m_current_seeds_bracket_ovr_col_team1_teamid = [
    m_current_seeds_teamdict[seed] for seed in m_current_seeds_bracket_ovr_col_team1
]
m_current_seeds_bracket_ovr_col_winner_teamid = [
    m_current_seeds_teamdict[seed] for seed in m_current_seeds_bracket_ovr_col_winner
]

# TeamID -> TeamName lookup, built in a single pass over the table instead of
# re-filtering the whole frame once per team. setdefault keeps the first name
# seen for a TeamID, matching the previous "first matching row" behaviour.
m_teams_dict = {}
for team_id, team_name in zip(m_teams_df["TeamID"], m_teams_df["TeamName"]):
    m_teams_dict.setdefault(team_id, team_name)

m_current_seeds_bracket_ovr_col_team0_teamname = [
    m_teams_dict[team_id] for team_id in m_current_seeds_bracket_ovr_col_team0_teamid
]
m_current_seeds_bracket_ovr_col_team1_teamname = [
    m_teams_dict[team_id] for team_id in m_current_seeds_bracket_ovr_col_team1_teamid
]
m_current_seeds_bracket_ovr_col_winner_teamname = [
    m_teams_dict[team_id] for team_id in m_current_seeds_bracket_ovr_col_winner_teamid
]

# Each name column goes directly after its seed column. The positions 1, 3, 5 account
# for the shift caused by the name columns already inserted to the left.
m_current_seeds_bracket_ovr_df.insert(1, "Team 0 Name", m_current_seeds_bracket_ovr_col_team0_teamname)
m_current_seeds_bracket_ovr_df.insert(3, "Team 1 Name", m_current_seeds_bracket_ovr_col_team1_teamname)
m_current_seeds_bracket_ovr_df.insert(5, "Winner Name", m_current_seeds_bracket_ovr_col_winner_teamname)

# Round Column

# One label per game, in row order: 32 + 16 + 8 + 4 + 2 + 1 = 63 games.
# The labels are generated from per-round game counts rather than from range tests
# joined with `&`: `&` binds tighter than comparisons, so `i >= 32 & i < 48` is
# evaluated as `i >= (32 & i) < 48`, which is true for every i >= 32 and would
# label all later rounds "R2".
m_current_seeds_bracket_ovr_col_round = []
for round_label, game_count in (
    ("R1", 32),
    ("R2", 16),
    ("R3", 8),
    ("R4", 4),
    ("R5", 2),
    ("R6", 1),
):
    m_current_seeds_bracket_ovr_col_round.extend([round_label] * game_count)

# Put the round labels in front of all other columns (position 0) of the overall bracket table
m_current_seeds_bracket_ovr_df.insert(0, "Round", m_current_seeds_bracket_ovr_col_round)

<a id="bracket"></a>

### The Final Bracket

In [112]:
# Lift pandas' row and column truncation for this cell only, so the whole bracket table is rendered
full_table_options = ("display.max_rows", None, "display.max_columns", None)
with pd.option_context(*full_table_options):
    display(m_current_seeds_bracket_ovr_df)

Unnamed: 0,Round,Team 0,Team 0 Name,Team 1,Team 1 Name,Winner,Winner Name
0,R1,W01,Connecticut,W16,Stetson,W01,Connecticut
1,R1,W02,Iowa St,W15,S Dakota St,W02,Iowa St
2,R1,W03,Illinois,W14,Morehead St,W03,Illinois
3,R1,W04,Auburn,W13,Yale,W04,Auburn
4,R1,W05,San Diego St,W12,UAB,W05,San Diego St
5,R1,W06,BYU,W11,Duquesne,W06,BYU
6,R1,W07,Washington St,W10,Drake,W07,Washington St
7,R1,W08,FL Atlantic,W09,Northwestern,W08,FL Atlantic
8,R1,X01,North Carolina,X16,Howard,X01,North Carolina
9,R1,X02,Arizona,X15,Long Beach St,X02,Arizona
