The data for this analysis was downloaded from:
https://www.kaggle.com/c/mens-march-mania-2022/data

In [165]:
import os

import numpy as np
import pandas as pd
import plotly.express as px

# Project root. Set the NCAA_PROJ_ROOT environment variable to run this
# notebook from another checkout; when it is unset (or empty) the original
# author's local path is used, so existing behaviour is unchanged.
proj_root_dir = (
    os.environ.get("NCAA_PROJ_ROOT")
    or r"C:\Users\adiad\Documents\GitHub\ncaa-tourney-predict"
)
# normalise Windows separators and make sure there is exactly one trailing "/"
proj_root_dir = proj_root_dir.replace("\\", "/").rstrip("/") + "/"

# folder holding the Kaggle "Stage 1" CSV files
stage1_dir = proj_root_dir + "data/mens-march-mania-2022/MDataFiles_Stage1/"

# tournament game results and tournament seeds
tourn_df = pd.read_csv(stage1_dir + "MNCAATourneyCompactResults.csv")
seed_df = pd.read_csv(stage1_dir + "MNCAATourneySeeds.csv")

In [166]:
# Keep only the games that satisfy both conditions:
#   * DayNum >= 136  -> drops the play-in games
#   * Season > 2011  -> limits the data to the last 10 years
tourn_df = tourn_df.loc[(tourn_df["DayNum"] >= 136) & (tourn_df["Season"] > 2011)]

tourn_df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
1719,2012,136,1124,68,1355,60,N,0
1720,2012,136,1160,68,1424,64,N,0
1721,2012,136,1211,77,1452,54,N,0
1722,2012,136,1231,79,1308,66,N,0
1723,2012,136,1235,77,1163,64,N,0


In [167]:
# Seed codes look like "W01": characters 1-2 carry the numeric seed, which is
# stored as an integer in a new SeedNum column.
seed_df["SeedNum"] = seed_df["Seed"].str.slice(1, 3).astype(int)
seed_df.head()

Unnamed: 0,Season,Seed,TeamID,SeedNum
0,1985,W01,1207,1
1,1985,W02,1210,2
2,1985,W03,1228,3
3,1985,W04,1260,4
4,1985,W05,1374,5


In [168]:
# NOTE: this cell rebinds tourn_df, so re-running it without first re-running
# the load/filter cells would merge the seed columns a second time.

# join winning seed numbers (the winner's raw "Seed" code is carried along too)
tourn_df = (
    tourn_df
    .merge(seed_df, how="left", left_on=["Season", "WTeamID"], right_on=["Season", "TeamID"])
    .drop(columns=["TeamID"])
    .rename(columns={"SeedNum": "WSeedNum"})
)

# join losing seed numbers (only the numeric seed is brought in)
tourn_df = (
    tourn_df
    .merge(seed_df.drop(columns=["Seed"]), how="left",
           left_on=["Season", "LTeamID"], right_on=["Season", "TeamID"])
    .drop(columns=["TeamID"])
    .rename(columns={"SeedNum": "LSeedNum"})
)

# drop games where seeds played against same seed from another region
tourn_df = tourn_df[tourn_df.WSeedNum != tourn_df.LSeedNum]

# add seed pair column: (lower seed number, higher seed number) as a tuple.
# Computed column-wise instead of with a row-wise apply; .tolist() turns the
# values back into plain Python numbers so the tuples display as "(3, 14)".
seed_cols = tourn_df[["WSeedNum", "LSeedNum"]]
tourn_df["SeedPair"] = list(zip(seed_cols.min(axis=1).tolist(), seed_cols.max(axis=1).tolist()))

# add top-seed-won boolean column (True when the winner had the lower seed number)
tourn_df["TopSeedWon"] = tourn_df.WSeedNum < tourn_df.LSeedNum

tourn_df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,Seed,WSeedNum,LSeedNum,SeedPair,TopSeedWon
0,2012,136,1124,68,1355,60,N,0,Y03,3,14,"(3, 14)",True
1,2012,136,1160,68,1424,64,N,0,Y11,11,6,"(6, 11)",False
2,2012,136,1211,77,1452,54,N,0,W07,7,10,"(7, 10)",True
3,2012,136,1231,79,1308,66,N,0,Y04,4,13,"(4, 13)",True
4,2012,136,1235,77,1163,64,N,0,Y08,8,9,"(8, 9)",True


In [169]:
# For every seed pair: how many games the top seed won ("sum"), how many games
# were played ("count"), and the resulting top-seed win fraction ("avg").
top_seed_win_df = (
    tourn_df
    .groupby("SeedPair")["TopSeedWon"]
    .agg(["sum", "count"])
    .assign(avg=lambda totals: totals["sum"] / totals["count"])
)
top_seed_win_df

Unnamed: 0_level_0,sum,count,avg
SeedPair,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(1, 2)",7,14,0.5
"(1, 3)",8,10,0.8
"(1, 4)",15,19,0.789474
"(1, 5)",9,10,0.9
"(1, 6)",2,2,1.0
"(1, 7)",2,3,0.666667
"(1, 8)",16,20,0.8
"(1, 9)",14,16,0.875
"(1, 10)",1,2,0.5
"(1, 11)",3,4,0.75


In [170]:
# build an adjacency matrix
# adj_arr[top - 1, bottom - 1] is the observed fraction of games in which seed
# `top` beat seed `bottom` (top < bottom). The diagonal is 0.5 (same seed number
# on both sides); entries below the diagonal are never filled and stay 0.
NUM_SEEDS = 16
adj_arr = 0.5 * np.eye(NUM_SEEDS)

# Plain dict keyed by the (top seed, bottom seed) tuple, so an unobserved pair
# is an ordinary dictionary miss instead of an exception to be swallowed.
observed_win_rate = dict(zip(top_seed_win_df.index, top_seed_win_df["avg"]))

for top in range(1, NUM_SEEDS + 1):
    for bottom in range(top + 1, NUM_SEEDS + 1):
        # a seed pair that was never observed is assumed to always go to the top seed
        adj_arr[top - 1, bottom - 1] = observed_win_rate.get((top, bottom), 1)

seed_numbers = list(range(1, NUM_SEEDS + 1))
fig = px.imshow(
    adj_arr,
    x=seed_numbers,
    y=seed_numbers,
    labels={"x": "Bottom seed", "y": "Top seed", "color": "Top-seed win rate"},
    title="Observed top-seed win rate by seed pair",
)
fig.show()

In [171]:
# make a bracket
# Set SIM_SEED to an integer to make the simulated bracket reproducible;
# None keeps the original behaviour of a different random bracket on every run.
SIM_SEED = None
if SIM_SEED is not None:
    np.random.seed(SIM_SEED)


def top_seed_wins(ts, bs):
    """Draw one uniform random number and report whether seed ``ts`` beats seed ``bs``.

    ``ts`` must not be a larger seed number than ``bs``: the win probability is
    read from ``adj_arr[ts - 1, bs - 1]``, which is only filled on and above
    the diagonal.
    """
    return np.random.rand() < adj_arr[ts - 1, bs - 1]


def play_cross_region_game(team_a, team_b):
    """Play one game between two ``(seed, region)`` teams and return the winner.

    The team with the lower seed number is treated as the top seed; when both
    seed numbers are equal, ``team_a`` is treated as the top seed. The result
    line is printed and the winning ``(seed, region)`` tuple is returned.
    """
    if team_a[0] <= team_b[0]:
        (ts, tr), (bs, br) = team_a, team_b
    else:
        (ts, tr), (bs, br) = team_b, team_a

    if top_seed_wins(ts, bs):
        print(f"Seed #{ts} from the {tr} beat #{bs} from the {br}")
        return ts, tr
    print(f"Seed #{ts} from the {tr} lost to #{bs} from the {br}")
    return bs, br


region_ls = ["N", "E", "S", "W"]
final_four_dict = {}  # region -> seed number of that region's winner
for region in region_ls:
    print(f"Region {region} results:")
    # first-round pairings as (top seed, bottom seed), in bracket order
    matchups_ls = list(zip([1, 8, 5, 4, 6, 3, 7, 2], [16, 9, 12, 13, 11, 14, 10, 15]))
    # `round_name` rather than `round`, so the builtin round() is not overwritten
    for round_name in ["First", "Second", "Sweet 16", "Elite 8"]:
        print(f"{round_name} round")
        print("-------------------")
        victors_ls = []
        for ts, bs in matchups_ls:
            if top_seed_wins(ts, bs):
                print(f"Seed #{ts} beat #{bs}")
                victors_ls.append(ts)
            else:
                print(f"Seed #{ts} lost to #{bs}")
                victors_ls.append(bs)

        if len(victors_ls) > 1:
            # neighbouring winners meet in the next round, lower seed number first
            matchups_ls = [
                (min(pair), max(pair))
                for pair in zip(victors_ls[0::2], victors_ls[1::2])
            ]

        print("")

    # after the last round exactly one seed is left in this region
    final_four_dict[region] = victors_ls[0]

print("Final four round")
print("-------------------")
championship_dict = {}  # region -> seed number of each finalist
for r1, r2 in zip(["E", "N"], ["W", "S"]):
    winner_seed, winner_region = play_cross_region_game(
        (final_four_dict[r1], r1), (final_four_dict[r2], r2)
    )
    championship_dict[winner_region] = winner_seed

print()
print("Championship round")
print("-------------------")
# the two finalists, in the order their semi-finals were played
(r1, seed1), (r2, seed2) = championship_dict.items()
champion_seed, champion_region = play_cross_region_game((seed1, r1), (seed2, r2))

Region N results:
First round
-------------------
Seed #1 beat #16
Seed #8 lost to #9
Seed #5 lost to #12
Seed #4 beat #13
Seed #6 beat #11
Seed #3 beat #14
Seed #7 lost to #10
Seed #2 beat #15

Second round
-------------------
Seed #1 beat #9
Seed #4 lost to #12
Seed #3 lost to #6
Seed #2 beat #10

Sweet 16 round
-------------------
Seed #1 beat #12
Seed #2 beat #6

Elite 8 round
-------------------
Seed #1 beat #2

Region E results:
First round
-------------------
Seed #1 beat #16
Seed #8 beat #9
Seed #5 lost to #12
Seed #4 lost to #13
Seed #6 lost to #11
Seed #3 lost to #14
Seed #7 beat #10
Seed #2 beat #15

Second round
-------------------
Seed #1 beat #8
Seed #12 beat #13
Seed #11 beat #14
Seed #2 beat #7

Sweet 16 round
-------------------
Seed #1 beat #12
Seed #2 beat #11

Elite 8 round
-------------------
Seed #1 lost to #2

Region S results:
First round
-------------------
Seed #1 beat #16
Seed #8 lost to #9
Seed #5 beat #12
Seed #4 beat #13
Seed #6 lost to #11
Seed #3 beat #1