# Summary

**This notebook takes predictions formatted the same way as submissions from previous competitions to simulate n brackets.**

- The predictions should have the columns `ID` and `Pred`, where `ID` has the format `year_team1_team2` and `Pred` is the predicted probability of team1 winning against team2.

- Setting `n_brackets=1` and `sim=False` will give you a single full-chalk bracket for each tournament.

Update: Found an inefficiency in the rng of my code after looking at this [simulation notebook](https://www.kaggle.com/code/goodspellr/seed-benchmark-submission) for the Seed-Benchmark by Good Spellr. Updated version should be much faster. Further performance improvements after suggestion by Ryan Armstrong to [precompute the random-values](https://www.kaggle.com/competitions/march-machine-learning-mania-2024/discussion/482696#2690435).

In [14]:
import numpy as np 
import pandas as pd 
from tqdm import tqdm

# Load and filter data
# Slot table: keep the 2024 season and only the slots whose name contains 'R',
# which filters out the First Four slots.
round_slots = (
    pd.read_csv('./march-machine-learning-mania-2025/MNCAATourneySlots.csv')
    .loc[lambda slots: slots['Season'] == 2024]
    .loc[lambda slots: slots['Slot'].str.contains('R')]
)

# Seed tables: take the 2024 seeding of each tournament and relabel it as season 2025.
seeds_m = (
    pd.read_csv("march-machine-learning-mania-2025/MNCAATourneySeeds.csv")
    .loc[lambda seeds: seeds['Season'] == 2024]
    .assign(Season=2025)
)

seeds_w = (
    pd.read_csv("march-machine-learning-mania-2025/WNCAATourneySeeds.csv")
    .loc[lambda seeds: seeds['Season'] == 2024]
    .assign(Season=2025)
)

# Predictions: split every ID string on '_' so each ID becomes a list of its parts.
preds = (
    pd.read_csv('./submission.csv')
    .assign(ID=lambda frame: frame['ID'].str.split('_'))
)

# Display the men's row holding seed Y15.
seeds_m.loc[seeds_m['Seed'] == 'Y15']

Unnamed: 0,Season,Seed,TeamID
2538,2025,Y15,1389


In [None]:
def prepare_data(seeds, preds):
    '''
    Prepares the lookup tables used by the simulation.

    Parameters:
    - seeds (pd.DataFrame): DataFrame with the columns `Seed` and `TeamID`.
    - preds (pd.DataFrame): DataFrame with the columns `ID` and `Pred`. Each `ID` is either
      the raw string `year_team1_team2` or that string already split on '_' into a sequence
      of parts. `Pred` is the probability of team1 winning against team2.

    Returns:
    - dict: Mapping of seed -> team ID.
    - dict: Mapping of team ID -> seed.
    - dict: Nested mapping where probas[team1][team2] is the probability of team1 beating
      team2. Keys are the team ID parts exactly as they appear in `ID` (strings).
    '''
    seed_dict = seeds.set_index('Seed')['TeamID'].to_dict()
    inverted_seed_dict = {team_id: seed for seed, team_id in seed_dict.items()}
    probas_dict = {}

    for matchup_id, proba in zip(preds['ID'], preds['Pred']):
        # Accept raw ID strings as well as pre-split ones. Indexing a raw string with
        # [1] / [2] would silently pick single characters instead of team IDs.
        parts = matchup_id.split('_') if isinstance(matchup_id, str) else matchup_id
        team1, team2 = parts[1], parts[2]

        # Store both directions so a matchup can be looked up from either team.
        probas_dict.setdefault(team1, {})[team2] = proba
        probas_dict.setdefault(team2, {})[team1] = 1 - proba

    return seed_dict, inverted_seed_dict, probas_dict


def simulate(round_slots, seeds, inverted_seeds, probas, random_values, sim=True):
    '''
    Simulates each round of the tournament for a single bracket.

    Games are played in the row order of `round_slots`. After every game the winner is
    stored under the game's slot name so that later rows can reference that slot as their
    StrongSeed / WeakSeed. This bookkeeping is done on a private copy of `seeds`, so the
    caller's dictionary is never modified and every call starts from the original seeding.

    Parameters:
    - round_slots: DataFrame containing information on who is playing in each round
      (columns `Slot`, `StrongSeed`, `WeakSeed`).
    - seeds (dict): Dictionary mapping seed values to team IDs. Not modified.
    - inverted_seeds (dict): Dictionary mapping team IDs to seed values.
    - probas (dict): Nested dictionary containing matchup probabilities, looked up as
      probas[str(team1)][str(team2)].
    - random_values (array-like): Array with precomputed random-values, one per game.
    - sim (boolean): Simulates match if True. Chooses team with higher probability as winner otherwise.

    Returns:
    - list: List with the seed of the winning team for each match.
    - list: List with corresponding slot names for each match.

    Raises:
    - KeyError: If a StrongSeed / WeakSeed has no team assigned (for example a First Four
      slot whose result is missing from the seeds), or if a matchup is missing in `probas`.
    '''
    # Copy so that slot winners written below do not leak into the caller's dictionary
    # (and from there into the next simulated bracket).
    bracket_state = dict(seeds)
    winners = []
    slots = []

    for slot, strong, weak, random_val in zip(round_slots.Slot, round_slots.StrongSeed, round_slots.WeakSeed, random_values):
        try:
            team1, team2 = bracket_state[strong], bracket_state[weak]
        except KeyError as err:
            raise KeyError(
                f"{err.args[0]!r} (needed for slot {slot!r}) has no team assigned. "
                "If it is a First Four slot, add its result to the seeds; otherwise "
                "check that round_slots is ordered so earlier rounds come first."
            ) from err

        # Get the probability of team_1 winning
        proba = probas[str(team1)][str(team2)]

        if sim:
            # Randomly determine the winner based on the probability
            winner = team1 if random_val < proba else team2
        else:
            # Pick the favourite; an exact tie goes to team1 (the StrongSeed side)
            winner = team2 if (1 - proba) > proba else team1

        # Append the winner and corresponding slot to the lists
        winners.append(winner)
        slots.append(slot)

        # Make the winner available to later rounds under this slot's name
        bracket_state[slot] = winner

    # Convert winners to original seeds using the inverted_seeds dictionary
    return [inverted_seeds[w] for w in winners], slots


def run_simulation(brackets=1, seeds=None, preds=None, round_slots=None, sim=True, random_state=None):
    '''
    Runs a simulation of bracket tournaments.

    Parameters:
    - brackets (int): Number of brackets to simulate.
    - seeds (pd.DataFrame): DataFrame containing seed information.
    - preds (pd.DataFrame): DataFrame containing prediction information for each match-up.
    - round_slots (pd.DataFrame): DataFrame containing information about the tournament rounds.
    - sim (boolean): Simulates matches if True. Chooses team with higher probability as winner otherwise.
    - random_state (optional): Seed (or anything else accepted by `np.random.default_rng`)
      used to draw the random values, making the simulated brackets reproducible. When None
      (default) the values come from NumPy's global random state, as before.

    Returns:
    - pd.DataFrame: DataFrame with the columns `Bracket`, `Slot` and `Team`, one row per
      simulated game.
    '''
    # Get relevant data for the simulation
    seed_dict, inverted_seed_dict, probas_dict = prepare_data(seeds, preds)
    # Lists to store simulation results
    results = []
    bracket = []
    slots = []

    # Precompute random-values: one row per bracket, one column per game
    shape = (brackets, len(round_slots))
    if random_state is None:
        random_values = np.random.random(size=shape)
    else:
        random_values = np.random.default_rng(random_state).random(size=shape)

    # Iterate through the specified number of brackets (bracket numbers are 1-based)
    for b in tqdm(range(1, brackets+1)):
        # Run single simulation
        r, s = simulate(round_slots, seed_dict, inverted_seed_dict, probas_dict, random_values[b-1], sim)

        # Update results
        results.extend(r)
        bracket.extend([b] * len(r))
        slots.extend(s)

    # Create final DataFrame
    result_df = pd.DataFrame({'Bracket': bracket, 'Slot': slots, 'Team': results})

    return result_df

n_brackets = 1

# NOTE: the KeyError comes from the First Four not having their results in the seeds csv;
# those entries have to be changed/added manually to make this work.
result_m = run_simulation(
    brackets=n_brackets, seeds=seeds_m, preds=preds, round_slots=round_slots, sim=True
).assign(Tournament='M')
result_w = run_simulation(
    brackets=n_brackets, seeds=seeds_w, preds=preds, round_slots=round_slots, sim=True
).assign(Tournament='W')

# Stack both tournaments under a fresh 0..n-1 index named RowId.
submission = pd.concat([result_m, result_w], ignore_index=True).rename_axis('RowId')

  0%|          | 0/1 [00:00<?, ?it/s]


KeyError: 'X16'

In [None]:
# Write the simulated brackets to disk (to_csv also writes the index by default).
submission.to_csv('bracket_simulations.csv')
# Show the combined frame as the cell output.
submission

Unnamed: 0_level_0,Bracket,Slot,Team,Tournament
RowId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1,R1W1,W01,M
1,1,R1W2,W02,M
2,1,R1W3,W03,M
3,1,R1W4,W04,M
4,1,R1W5,W05,M
...,...,...,...,...
12599995,100000,R4Y1,Y01,W
12599996,100000,R4Z1,Z02,W
12599997,100000,R5WX,W01,W
12599998,100000,R5YZ,Y01,W
