In [4]:
import json
import os

import pandas as pd

from mm_analytics.utilities import NpEncoder
# Root folder holding the Kaggle March Madness CSV files read below.
# Set the MM_DATA_ROOT environment variable to point at another location;
# the original machine-specific path stays as the fallback so existing runs
# behave exactly as before.
DATA_ROOT = os.environ.get("MM_DATA_ROOT", "/Users/andrewgrowney/data/kaggle/marchmadness-2024")

In [5]:
def _read_kaggle_csv(filename):
    """Load one CSV that sits directly under DATA_ROOT into a DataFrame."""
    return pd.read_csv(f"{DATA_ROOT}/{filename}")


# Columns: Season, DayZero, RegionW, RegionX, RegionY, RegionZ
seasons_df = _read_kaggle_csv("MSeasons.csv")
# Columns: Season, Seed, TeamID
seeds_df = _read_kaggle_csv("MNCAATourneySeeds.csv")
# Columns: Season, Slot[R1W4], StrongSeed[W04], WeakSeed[W13]
slots_df = _read_kaggle_csv("MNCAATourneySlots.csv")
# Columns: Seed, GameRound[1], GameSlot[R1W4], EarlyDayNum, LateDayNum
seed_round_slots_df = _read_kaggle_csv("MNCAATourneySeedRoundSlots.csv")
# Columns: Season, DayNum, WTeamID, LTeamID, WScore, LScore, WLoc, NumOT
results_df = _read_kaggle_csv("MNCAATourneyCompactResults.csv")

In [6]:
TEAM_DATA_ROOT = "../data/web/ts"
# Keys copied from the file's "stat_rankings" object into the "Ranks" summary.
_RANKED_STATS = ("AdjOE", "AdjDE", "AdjNE", "SOS", "Poss")


def get_team_data(tid, season, data_root=None):
    """Read one team-season JSON file and return a compact summary of it.

    Args:
        tid: team id used in the file name.
        season: season used in the file name.
        data_root: directory containing ``{tid}_{season}.json``. Defaults to
            the module-level ``TEAM_DATA_ROOT`` (resolved at call time).

    Returns:
        dict with keys:
          * "Seed"  - ``tournament.seed`` from the file, or None when the
            "tournament" entry is missing, null, or has no "seed".
          * "Rec"   - ``record.overall`` from the file.
          * "Ranks" - the AdjOE/AdjDE/AdjNE/SOS/Poss entries of ``stat_rankings``.
          * "Sims"  - one ``{"i", "y", "s", "e"}`` dict per ``similar_teams``
            entry, taken from its "id", "year", "avg" and "er" fields.

    Raises:
        FileNotFoundError: the team-season file does not exist.
        KeyError: a required field ("record", "stat_rankings",
            "similar_teams" or one of their members) is absent.
    """
    root = TEAM_DATA_ROOT if data_root is None else data_root
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(f"{root}/{tid}_{season}.json", encoding="utf-8") as f:
        team_data = json.load(f)
    # `or {}` also covers an explicit `"tournament": null` in the file.
    tournament = team_data.get("tournament") or {}
    return {
        "Seed": tournament.get("seed"),
        "Rec": team_data["record"]["overall"],
        "Ranks": {stat: team_data["stat_rankings"][stat] for stat in _RANKED_STATS},
        "Sims": [
            {"i": sim["id"], "y": sim["year"], "s": sim["avg"], "e": sim["er"]}
            for sim in team_data["similar_teams"]
        ],
    }

In [25]:
"""Pseudocode for getting the probabilities
from the target_slot, find the slots that feed into it -> feed_slots

for each feed_slot:
    if there is a record in slot_winners for the slot:
        return that
    else:
        compute the slot's probabilities from its own feed_slots (Steps 1-3 below)

Step 1: Build out tree of slots
ex: { 
    "W11a": {"day": 134, "prob": None, "feed_slots": ["W11a"]},
    "W11b": {"day": 134, "prob": None, "feed_slots": ["W11b"]},
    "W1":   {"day": 136, "prob": None, "feed_slots": ["W1"]},
    "W11": {"day": 136, "prob": None, "feed_slots":  ["W11a", "W11b"]},
    "R1W1": {"day": 1, "prob": None, "feed_slots": ["W1", "W16"], "winner": None },
    "R1W2": {"day": 1, "prob": None, "feed_slots": ["W8", "W9"], "winner": None },
    "R2W1": {"day": 2, "prob": None, "feed_slots": ["R1W1", "R1W2"], "winner": None }, 
}
Step 2: Order the slots by day
{
    "W1":   {"day": 0, "prob": {1242: 1.0}, "feed_slots": ["W1"]},
    "W16":   {"day": 0, "prob": {1113: 1.0}, "feed_slots": ["W16"]},
    "W11a":  {"day": 0, "prob": {1113: 1.0}, "feed_slots": ["W11a"]},
    "W11b":  {"day": 0, "prob": {1113: 1.0}, "feed_slots": ["W11b"]},
    "R1W1": {"day": 1, "prob": None, "feed_slots": ["W1", "W16"], "winner": None },
    "R1W2": {"day": 1, "prob": None, "feed_slots": ["W8", "W9"], "winner": None },
    "R2W1": {"day": 2, "prob": None, "feed_slots": ["R1W1", "R1W2"], "winner": None }, 
}
Step 3: Fill out the prob for each slot
{
    "W1":   {"day": 0, "prob": {1242: 1.0}, "feed_slots": ["W1"]},
    "W16":  {"day": 0, "prob": {1113: 1.0}, "feed_slots": ["W16"]},
    "R1W1": {"day": 1, "prob": {1242: 0.5, 1113: 0.5}, "feed_slots": ["W1", "W16"] },
}
for slot_id, slot_val in slots.items():
    if len(slot_val["feed_slots"]) == 1:
        slot_val["prob"] = { team_seeds[slot_val["feed_slots"][0]]: 1.0 }
    else:
        slot_prob = {}
        feed_slot_probs = [slots[fs]["prob"] for fs in slot_val["feed_slots"]]
        for tid, feed_prob in feed_slot_probs[0].items():
            total_t_prob = 0
            for opp, opp_prob in feed_slot_probs[1].items():
                total_t_prob += feed_prob * opp_prob * get_win_prob(model_id, tid, opp)[0]
            slot_prob[tid] = total_t_prob
        for tid, feed_prob in feed_slot_probs[1].items():
            total_t_prob = 0
            for opp, opp_prob in feed_slot_probs[0].items():
                total_t_prob += feed_prob * opp_prob * get_win_prob(model_id, tid, opp)[0]
            slot_prob[tid] = total_t_prob
        slot_val["prob"] = slot_prob

"""

# def get_win_prob(model_id, team1, team2):
#     """Get the probability of team1 winning against team2
#     """
#     return [0.5, 0.5]

# def get_slot_probabilities(seed_teams):
#     """calculate the chances of each team winning the slot
#     """
#     slots = {}
#     # Initialize seeds as slots
#     for sid, tid in seed_teams.items():
#         slots[sid] = { "day": 136, "feed_slots": [sid] }
#     # Assign Feed Slots
#     for _, row in slots_df.iterrows():
#         slots[row.Slot] = { "feed_slots": [row.StrongSeed, row.WeakSeed] } # Slot[R1W4], StrongSeed[W04], WeakSeed[W13]
    
#     for _, row in seed_round_slots_df.iterrows(): # Seed, GameRound[1], GameSlot[R1W4], EarlyDayNum, LateDayNum
#         slot_id = row.GameSlot
#         slots[slot_id]["day"] = row.EarlyDayNum
#     print({k: v["day"] for k, v in slots.items()})
#     # Order slots by round
#     slots = {k: v for k, v in sorted(slots.items(), key=lambda item: item[1]["day"])}
#     print({k: v["day"] for k, v in slots.items()})
#     # Fill out the prob for each slot
#     for slot_id, slot_val in slots.items():
#         if len(slot_val["feed_slots"]) == 1:
#             print(f"Filled {slot_id} with {seed_teams[slot_val['feed_slots'][0]]}")
#             slots[slot_id]["prob"] = { seed_teams[slot_val["feed_slots"][0]]: 1.0 }
#         else:
#             slot_prob = {}
#             print(f"{slot_id}: {slot_val}")
#             print(slots.keys())
#             feed_slot_probs = [slots[fs]["prob"] for fs in slot_val["feed_slots"]]
#             print(f"feed_slot_probs: {feed_slot_probs}")
#             for tid, feed_prob in feed_slot_probs[0].items():
#                 total_t_prob = 0
#                 for opp, opp_prob in feed_slot_probs[1].items():
#                     total_t_prob += feed_prob * opp_prob * get_win_prob(None, tid, opp)[0]
#                 slot_prob[tid] = total_t_prob
#             for tid, feed_prob in feed_slot_probs[1].items():
#                 total_t_prob = 0
#                 for opp, opp_prob in feed_slot_probs[0].items():
#                     total_t_prob += feed_prob * opp_prob * get_win_prob(None, tid, opp)[0]
#                 slot_prob[tid] = total_t_prob
#             slot_val["prob"] = slot_prob
#     return slots

    
# seed_teams = {}
# for index, row in seeds_df[seeds_df["Season"] == 2023].iterrows():
#     seed = row["Seed"]
#     team_id = row["TeamID"]
#     seed_teams[seed] = team_id
# print(seed_teams)
# slot_probs = get_slot_probabilities(seed_teams)
# slot_probs

{'W01': 1345, 'W02': 1266, 'W03': 1243, 'W04': 1397, 'W05': 1181, 'W06': 1246, 'W07': 1277, 'W08': 1272, 'W09': 1194, 'W10': 1425, 'W11': 1344, 'W12': 1331, 'W13': 1418, 'W14': 1286, 'W15': 1436, 'W16a': 1192, 'W16b': 1411, 'X01': 1104, 'X02': 1112, 'X03': 1124, 'X04': 1438, 'X05': 1361, 'X06': 1166, 'X07': 1281, 'X08': 1268, 'X09': 1452, 'X10': 1429, 'X11': 1301, 'X12': 1158, 'X13': 1202, 'X14': 1364, 'X15': 1343, 'X16a': 1369, 'X16b': 1394, 'Y01': 1222, 'Y02': 1400, 'Y03': 1462, 'Y04': 1231, 'Y05': 1274, 'Y06': 1235, 'Y07': 1401, 'Y08': 1234, 'Y09': 1120, 'Y10': 1336, 'Y11a': 1280, 'Y11b': 1338, 'Y12': 1179, 'Y13': 1245, 'Y14': 1244, 'Y15': 1159, 'Y16': 1297, 'Z01': 1242, 'Z02': 1417, 'Z03': 1211, 'Z04': 1163, 'Z05': 1388, 'Z06': 1395, 'Z07': 1321, 'Z08': 1116, 'Z09': 1228, 'Z10': 1129, 'Z11a': 1113, 'Z11b': 1305, 'Z12': 1433, 'Z13': 1233, 'Z14': 1213, 'Z15': 1421, 'Z16': 1224}
{'W01': 136, 'W02': 136, 'W03': 136, 'W04': 136, 'W05': 136, 'W06': 136, 'W07': 136, 'W08': 136, 'W09': 136

KeyError: 'W11a'

In [7]:
# Load clf and scaler 
from typing import List
import pandas as pd
import pickle
def _load_pickle(path):
    """Deserialize and return the single object stored in the pickle at ``path``."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# NOTE: unpickling can execute arbitrary code - only load artifacts you produced
# yourself or otherwise trust.
clf = _load_pickle("../Models/clf.pkl")
clf_scaler = _load_pickle("../Models/scaler.pkl")

# Table filtered by TeamID and Season in get_diff below.
df = pd.read_csv("../TeamSeasons.csv")

# Feature columns, grouped the same way the model input is ordered:
# offense first, then defense, then everything else.
_OFFENSE_FEATURES = ['AdjOE_mean', 'EFG%_mean', 'FGA3_mean', 'TO_mean', 'OR_mean', 'FT%_mean']
_DEFENSE_FEATURES = ['AdjDE_mean', 'OppEFG%_mean', 'OppFGA3_mean', 'OppTO_mean', 'OppOR_mean']
_OTHER_FEATURES = ['AdjNE_mean', 'Poss_mean', 'SOS', 'Q1_WinPct', 'Q2_WinPct']

# Order matters: get_diff returns values in exactly this column order.
FEATURES = [*_OFFENSE_FEATURES, *_DEFENSE_FEATURES, *_OTHER_FEATURES]

def _feature_row(df, team_id, season, features):
    """Return the first (TeamID, Season) row of ``df`` restricted to ``features``, NaNs as 0."""
    rows = df[(df['TeamID'] == team_id) & (df['Season'] == season)][features]
    if len(rows) == 0:
        # Still an IndexError (what indexing the empty result used to raise),
        # but one that says which team-season is missing.
        raise IndexError(f"No row in team-season table for TeamID={team_id}, Season={season}")
    return rows.fillna(0).values[0]


def get_diff(df:pd.DataFrame, t1_id:int, t2_id:int, season:int, features:List[str]):
    """Return the per-feature difference (team 1 minus team 2) for one season.

    Args:
        df: table with 'TeamID' and 'Season' columns plus every name in ``features``.
        t1_id: TeamID of the first team.
        t2_id: TeamID of the second team.
        season: Season value both rows are selected by.
        features: column names to compare, in output order.

    Returns:
        1-D numpy array with one entry per feature. Missing values are treated
        as 0 before subtracting. If a team has several rows for the season the
        first one is used.

    Raises:
        IndexError: either team has no row for ``season``.
    """
    return _feature_row(df, t1_id, season, features) - _feature_row(df, t2_id, season, features)

def get_model_prob(model_id, team1, team2, season=2023):
    """Return the model's class probabilities for a team1-vs-team2 matchup.

    Args:
        model_id: "clf" selects the loaded classifier; any other value falls
            back to an even 50/50 split.
        team1: TeamID of the first team.
        team2: TeamID of the second team.
        season: season whose rows of the module-level ``df`` supply the
            features. Defaults to 2023, the value that used to be hard-coded,
            so existing calls keep their behavior.

    Returns:
        For "clf": the first row of ``clf.predict_proba`` on the scaled
        feature difference (team1 minus team2). Otherwise ``[0.5, 0.5]``.
        NOTE(review): the notebook reads index 1 as P(team1 wins); that
        ordering depends on how clf was trained - confirm against clf.classes_.
    """
    if model_id != "clf":
        return [0.5, 0.5]
    diff = get_diff(df, team1, team2, season, FEATURES)
    scaled = clf_scaler.transform([diff])
    return clf.predict_proba(scaled)[0]

In [8]:
# Build out json view of the tournament from the dataframes
def _season_win_prob(season, team_id, opp_id):
    """Return the clf model's P(team_id beats opp_id) from ``season``'s features.

    Same computation as ``get_model_prob('clf', team_id, opp_id)[1]``, except the
    feature diff is taken from the season being built rather than the 2023
    season that call uses.
    """
    diff = get_diff(df, team_id, opp_id, season, FEATURES)
    # 0 index -> P(Opp Win), 1 index -> P(Team Win)
    return clf.predict_proba(clf_scaler.transform([diff]))[0][1]


def _advance_probs(team_probs, opp_probs, season):
    """Return {team: P(team wins this slot)} for the teams arriving from one feeder.

    For each team: sum over possible opponents of
    P(team arrives) * P(opponent arrives) * P(team beats opponent), rounded to 4 places.
    """
    advanced = {}
    for tid, team_prob in team_probs.items():
        t_prob = 0
        for opp, opp_prob in opp_probs.items():
            t_prob += team_prob * opp_prob * _season_win_prob(season, tid, opp)
        advanced[tid] = round(t_prob, 4)
    return advanced


def _resolve_feed(seed, seed_teams, slot_winners, slot_probabilities):
    """Return (actual team id, {team: arrival probability}) for one side of a slot.

    ``seed`` is either an earlier slot (a name starting with "R", or any slot
    already decided such as a play-in) or a seed label resolved through ``seed_teams``.
    """
    if seed.startswith("R") or seed in slot_winners:
        # Fed by an earlier game: take its recorded winner and its probabilities.
        return slot_winners[seed], slot_probabilities[seed]
    # Fed directly by the seeding data: that team is there with certainty.
    team_id = seed_teams[seed]
    return team_id, {team_id: 1.0}


for season in range(2013, 2024):
    print(f"Season: {season}")
    season_regions = seasons_df[seasons_df["Season"] == season]
    season_slots = slots_df[slots_df["Season"] == season]
    season_seeds = seeds_df[seeds_df["Season"] == season]
    season_results = results_df[results_df["Season"] == season]
    season_teams = season_seeds["TeamID"].unique()
    season_teams.sort()

    # Build out json view of the tournament from the dataframes
    # Use slots to build out the tournament structure
    # Each slot has a strong seed and a weak seed
    tournament = {
        "regions": {
            "W": season_regions["RegionW"].values[0],
            "X": season_regions["RegionX"].values[0],
            "Y": season_regions["RegionY"].values[0],
            "Z": season_regions["RegionZ"].values[0]
        },
        "slots": {},
        "teams": { int(team_id): get_team_data(int(team_id), season) for team_id in season_teams }
    }
    for index, row in season_slots.iterrows():
        slot = row["Slot"]
        seed_round_slots = seed_round_slots_df[seed_round_slots_df["GameSlot"] == slot]
        slot_entry = tournament["slots"].setdefault(slot, {})
        slot_entry["round_start"] = seed_round_slots["EarlyDayNum"].values[0]
        slot_entry["strong_seed"] = row["StrongSeed"]
        slot_entry["weak_seed"] = row["WeakSeed"]

    # Fill Team Seeds Mapping
    # (iterrows keeps the team ids in the same form the original loop produced,
    # which matters because they end up as JSON object keys in "prob".)
    seed_teams = {}
    for index, row in season_seeds.iterrows():
        seed_teams[row["Seed"]] = row["TeamID"]

    # Walk the slots in day order so every feeder slot is decided before the
    # slot it feeds; seed labels are replaced by team ids as we go.
    slot_winners, slot_probabilities = {}, {}
    slot_keys_sorted_by_day_range = sorted(tournament["slots"].keys(), key=lambda x: tournament["slots"][x]["round_start"])
    for slot in slot_keys_sorted_by_day_range:
        slot_entry = tournament["slots"][slot]
        # Bound before the try so the error message below always reports this slot's feeders.
        strong_seed, weak_seed = slot_entry["strong_seed"], slot_entry["weak_seed"]
        try:
            strong_seed_team_id, strong_seed_probs = _resolve_feed(
                strong_seed, seed_teams, slot_winners, slot_probabilities
            )
            weak_seed_team_id, weak_seed_probs = _resolve_feed(
                weak_seed, seed_teams, slot_winners, slot_probabilities
            )

            slot_entry["strong_seed"] = strong_seed_team_id
            slot_entry["weak_seed"] = weak_seed_team_id
            # Fetch slot winner from season_results
            if season == 2021 and slot == "R1X7":
                # VCU vs Oregon forfeit
                slot_winner, wscore, lscore = 1332, 2, 0
            else:
                matchup = [strong_seed_team_id, weak_seed_team_id]
                game_row = season_results[season_results["WTeamID"].isin(matchup) & season_results["LTeamID"].isin(matchup)]
                slot_winner = game_row["WTeamID"].values[0]
                wscore, lscore = game_row["WScore"].values[0], game_row["LScore"].values[0]
            slot_entry["winner"] = slot_winner
            slot_entry["wscore"] = wscore
            slot_entry["lscore"] = lscore

            # ---- Slot Probabilities ----
            # Strong-side teams first, then weak-side teams (same key order as before).
            slot_probs = _advance_probs(strong_seed_probs, weak_seed_probs, season)
            slot_probs.update(_advance_probs(weak_seed_probs, strong_seed_probs, season))
            slot_entry["prob"] = slot_probs
            slot_probabilities[slot] = slot_probs

            slot_winners[slot] = slot_winner
        except Exception as e:
            print(f"Error {e}: {slot} {strong_seed} {weak_seed}, {slot_winners}")
            raise

    with open(f"../data/web/tourney_v2/{season}.json", "w") as f:
        json.dump(tournament, f, cls=NpEncoder)

Season: 2013
Season: 2014
Season: 2015


KeyboardInterrupt: 