In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
from datetime import datetime, timedelta
from IPython.display import clear_output
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/march-machine-learning-mania-2024/Conferences.csv
/kaggle/input/march-machine-learning-mania-2024/sample_submission.csv
/kaggle/input/march-machine-learning-mania-2024/WNCAATourneyDetailedResults.csv
/kaggle/input/march-machine-learning-mania-2024/WRegularSeasonCompactResults.csv
/kaggle/input/march-machine-learning-mania-2024/MNCAATourneySeedRoundSlots.csv
/kaggle/input/march-machine-learning-mania-2024/MRegularSeasonDetailedResults.csv
/kaggle/input/march-machine-learning-mania-2024/MNCAATourneyCompactResults.csv
/kaggle/input/march-machine-learning-mania-2024/MGameCities.csv
/kaggle/input/march-machine-learning-mania-2024/WGameCities.csv
/kaggle/input/march-machine-learning-mania-2024/MSeasons.csv
/kaggle/input/march-machine-learning-mania-2024/WNCAATourneySlots.csv
/kaggle/input/march-machine-learning-mania-2024/MSecondaryTourneyTeams.csv
/kaggle/input/march-machine-learning-mania-2024/2024_tourney_seeds.csv
/kaggle/input/march-machine-learning-mania-2024/Cities.csv
/

In [2]:
start_data_season = {
    'M': 2003,
    'W': 2009
}
classifiers = {
    'M': RandomForestClassifier(max_depth=80, max_features='sqrt', min_samples_leaf=4,
                       n_estimators=1600, random_state=0),
    'W': RandomForestClassifier(max_depth=100, max_features=None, min_samples_leaf=4,
                       min_samples_split=5, n_estimators=2000, random_state=0)
}
columns = [
            'Season', 'TeamA', 'TeamB', 'TeamNameA', 'TeamNameB', 
            'ASeedNum', 'ALastTournPct', 'ARegSznPct',
            'APts', 'APtsOpp', 'AFG', 'AFG3', 'AFT', 'AAst', 'ATO', 'AOR', 'ADR', 'AStl', 'ABlk', 'APF',
            'BSeedNum', 'BLastTournPct', 'BRegSznPct',
            'BPts', 'BPtsOpp', 'BFG', 'BFG3', 'BFT', 'BAst', 'BTO', 'BOR', 'BDR', 'BStl', 'BBlk', 'BPF',
            'Winner'
        ]
current_season = 2024

In [3]:
def get_teams_info(gender):
    teams_file = f'/kaggle/input/march-machine-learning-mania-2024/{gender}Teams.csv'
    teams_df = pd.read_csv(teams_file)
    display(teams_df)
    return teams_df

m_teams_info_df = get_teams_info('M')
w_teams_info_df = get_teams_info('W')

Unnamed: 0,TeamID,TeamName,FirstD1Season,LastD1Season
0,1101,Abilene Chr,2014,2024
1,1102,Air Force,1985,2024
2,1103,Akron,1985,2024
3,1104,Alabama,1985,2024
4,1105,Alabama A&M,2000,2024
...,...,...,...,...
373,1474,Queens NC,2023,2024
374,1475,Southern Indiana,2023,2024
375,1476,Stonehill,2023,2024
376,1477,TX A&M Commerce,2023,2024


Unnamed: 0,TeamID,TeamName
0,3101,Abilene Chr
1,3102,Air Force
2,3103,Akron
3,3104,Alabama
4,3105,Alabama A&M
...,...,...
371,3474,Queens NC
372,3475,Southern Indiana
373,3476,Stonehill
374,3477,TX A&M Commerce


In [4]:
def get_seed_number(seed):
    if 'a' in seed or 'b' in seed:
        return 17
    return int(seed[1:])

def generate_numeric_cols(df, team):
    df[f'{team}FG'] = (df[f'{team}FGM'] * 100) / df[f'{team}FGA']
    df[f'{team}FG3'] = (df[f'{team}FGM3'] * 100) / df[f'{team}FGA3']
    df[f'{team}FT'] = (df[f'{team}FTM'] * 100) / df[f'{team}FTA']

def get_files_by_competition(gender):
    # Getting teams df
    teams_file = f'/kaggle/input/march-machine-learning-mania-2024/{gender}TeamConferences.csv'
    teams_df = pd.read_csv(teams_file)
    display(teams_df)
    
    # Getting regular season games df
    reg_szn_file = f'/kaggle/input/march-machine-learning-mania-2024/{gender}RegularSeasonDetailedResults.csv'
    reg_szn_df = pd.read_csv(reg_szn_file)
    generate_numeric_cols(reg_szn_df, 'W')
    generate_numeric_cols(reg_szn_df, 'L')
    display(reg_szn_df)
    
    # Getting tournament season games df
    tourn_file = f'/kaggle/input/march-machine-learning-mania-2024/{gender}NCAATourneyCompactResults.csv'
    tourn_df = pd.read_csv(tourn_file)
    display(tourn_df)
    
    # Getting seeding df
    seed_file = f'/kaggle/input/march-machine-learning-mania-2024/{gender}NCAATourneySeeds.csv'
    seed_df = pd.read_csv(seed_file)
    seed_df['SeedNum'] = seed_df.Seed.apply(lambda x: get_seed_number(x))
    display(seed_df)
    
    # Getting slots df
    slots_file = f'/kaggle/input/march-machine-learning-mania-2024/{gender}NCAATourneySlots.csv'
    slots_df = pd.read_csv(slots_file)
    display(slots_df)
    
    return teams_df, reg_szn_df, tourn_df, seed_df, slots_df

In [5]:
def get_reg_szn_stats(season, team, reg_szn_df):
    w_games = reg_szn_df[reg_szn_df['WTeamID'] == team].rename(columns = {
        'WFG': 'FG', 'WFG3': 'FG3', 'WFT': 'FT', 'WAst': 'Ast', 'WTO': 'TO', 'WOR': 'OR', 'WDR': 'DR', 'WStl': 'Stl', 'WBlk': 'Blk', 'WPF': 'PF',
        'WScore': 'Score',
        
        'LFG': 'OppFG', 'LFG3': 'OppFG3', 'LFT': 'OppFT', 'LAst': 'OppAst', 'LTO': 'OppTO', 'LOR': 'OppOR', 'LDR': 'OppDR', 'LStl': 'OppStl', 'LBlk': 'OppBlk', 'LPF': 'OppPF',
        'LScore': 'OppScore',
        })
    w_games['Won'] = 1
    
    l_games = reg_szn_df[reg_szn_df['LTeamID'] == team].rename(columns = {
        'LFG': 'FG', 'LFG3': 'FG3', 'LFT': 'FT', 'LAst': 'Ast', 'LTO': 'TO', 'LOR': 'OR', 'LDR': 'DR', 'LStl': 'Stl', 'LBlk': 'Blk', 'LPF': 'PF',
        'LScore': 'Score',
        
        'WFG': 'OppFG', 'WFG3': 'OppFG3', 'WFT': 'OppFT', 'WAst': 'OppAst', 'WTO': 'OppTO', 'WOR': 'OppOR', 'WDR': 'OppDR', 'WStl': 'OppStl', 'WBlk': 'OppBlk', 'WPF': 'OppPF',
        'WScore': 'OppScore',
        })
    l_games['Won'] = 0
    
    games = pd.concat([w_games, l_games], axis=0, ignore_index=True)
    
    reg_szn_pct = (len(w_games) * 100) / len(games)
    
    return [   
               reg_szn_pct, 
               games['Score'].mean(), games['OppScore'].mean(), games['FG'].mean(), games['FG3'].mean(), games['FT'].mean(), games['Ast'].mean(), games['TO'].mean(), 
               games['OR'].mean(), games['DR'].mean(), games['Stl'].mean(), games['Blk'].mean(), games['PF'].mean()
           ]

In [6]:
def get_last_tourn_pct(season, team, tourn_df):
    wins = len(tourn_df[(tourn_df['Season'] == season-1) & (tourn_df['WTeamID'] == team)])
    total_games = len(tourn_df[(tourn_df['Season'] == season-1) & ((tourn_df['WTeamID'] == team) | (tourn_df['LTeamID'] == team))])
    
    if not total_games: return 0
    return (wins * 100) / total_games

In [7]:
def set_team_name(team_id, teams_df):
    return teams_df[teams_df['TeamID'] == team_id].reset_index().loc[0, 'TeamName']

def get_game_stats(gender, season, team_a, team_b, reg_szn_df, seed_df, tourn_df):
    a_reg_szn_stats = get_reg_szn_stats(season, team_a, reg_szn_df)
    b_reg_szn_stats = get_reg_szn_stats(season, team_b, reg_szn_df)

    try:
        a_seed = seed_df[(seed_df['Season'] == season) & (seed_df['TeamID'] == team_a)].reset_index().loc[0, 'SeedNum']
    except:
        a_seed = 24
    try:
        b_seed = seed_df[(seed_df['Season'] == season) & (seed_df['TeamID'] == team_b)].reset_index().loc[0, 'SeedNum']
    except:
        b_seed = 24

    a_last_tourn_pct = get_last_tourn_pct(season, team_a, tourn_df)
    b_last_tourn_pct = get_last_tourn_pct(season, team_b, tourn_df)

    stats_a = [a_seed, a_last_tourn_pct] + (a_reg_szn_stats)
    stats_b = [b_seed, b_last_tourn_pct] + (b_reg_szn_stats)
    
    teams_info_df = m_teams_info_df if gender == 'M' else w_teams_info_df
    team_a_name = set_team_name(team_a, teams_info_df)
    team_b_name = set_team_name(team_b, teams_info_df)
    
    return [season, team_a, team_b, team_a_name, team_b_name] + stats_a + stats_b

def build_dataset(gender, start_data_season, reg_szn_df, tourn_df, seed_df, end_season):
    data = []

    for season in range(start_data_season + 1, end_season + 1):
        tourney_games = tourn_df[tourn_df['Season'] == season].reset_index(drop=True)
        for idx, g in tourney_games.iterrows():

            clear_output(wait=True)
            print(f"{season}: {idx}/{len(tourney_games)}")

            team_a = min([g['WTeamID'], g['LTeamID']])
            team_b = max([g['WTeamID'], g['LTeamID']])

            if team_a == g['WTeamID']:
                winner = 'A'
                team_a_score = g['WScore']
                team_b_score = g['LScore']
            else:
                winner = 'B'
                team_a_score = g['LScore']
                team_b_score = g['WScore']

            print(f"{team_a} x {team_b}")

            game_stats = get_game_stats(gender, season, team_a, team_b, reg_szn_df, seed_df, tourn_df)

            data.append(game_stats + [winner])

    data_df = pd.DataFrame(data, columns=columns)
    display(data_df)
    
    return data_df

In [8]:
def train_model(x_train, y_train, gender):
    classifier = classifiers[gender]
    classifier.fit(x_train, y_train)
    return classifier

def predict_seasons(gender, start_data_season, data_df, detailed_results):

    acc_sum = 0
    seasons_count = 0

    for season in range(start_data_season + 2, current_season + 1):
        data_train = data_df[(data_df['Season'] < season)].reset_index(drop=True)
        data_test = data_df[(data_df['Season'] == season)].reset_index(drop=True)

        if not len(data_test):
            continue

        x_train = data_train.drop(['TeamNameA', 'TeamNameB', 'Winner'], axis=1)
        y_train = data_train.loc[:, 'Winner']
            
        classifier = train_model(x_train, y_train, gender)

        x_test = data_test.drop(['TeamNameA', 'TeamNameB', 'Winner'], axis=1)
        y_test = data_test.loc[:, 'Winner']

        predictions = classifier.predict(x_test)
        
        acc = accuracy_score(y_test, predictions)
        acc_sum += acc
        seasons_count += 1
        
        if detailed_results:
            print(f'\nResults for {gender} season {season}:')
            print('Accuracy predictions:', acc)

            cm = confusion_matrix(y_test, predictions)
            print('Confusion matrix:')
            cm_disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classifier.classes_)
            cm_disp.plot()
            plt.show()

    print(f'\n\n{gender} Mean Accuracy: {acc_sum/seasons_count}')
    
    return classifier

In [9]:
def get_current_predictions(gender, classifier, reg_szn_df, tourn_df, seed_df):
    data = []
    teams = seed_df[seed_df['Season'] == current_season].TeamID.unique()
    teams.sort()
    
    for idx, team_a in enumerate(teams):
        for idx_b in range(idx + 1, len(teams)):
            team_b = teams[idx_b]
            
            clear_output(wait=True)
            print(f"{idx}:{idx_b}/{len(teams)}")
            print(f"{team_a} x {team_b}")
            
            game_stats = get_game_stats(gender, current_season, team_a, team_b, reg_szn_df, seed_df, tourn_df)
            data.append(game_stats)
            
    data_df = pd.DataFrame(data, columns=columns[:-1])
    data_to_predict = data_df.drop(['TeamNameA', 'TeamNameB'], axis=1)
    
    probs = classifier.predict_proba(data_to_predict)
    pred = classifier.predict(data_to_predict)

    data_df['AProb'] = probs[:, 0]
    data_df['BProb'] = probs[:, 1]
    data_df['Pred'] = pred
    
    return data_df

In [10]:
def pipeline(gender, detailed_results=False):
    teams_df, reg_szn_df, tourn_df, seed_df, slots_df = get_files_by_competition(gender)
    data_df = build_dataset(gender, start_data_season[gender], reg_szn_df, tourn_df, seed_df, current_season)
    classifier = predict_seasons(gender, start_data_season[gender], data_df, detailed_results)
    current_szn_df = get_current_predictions(gender, classifier, reg_szn_df, tourn_df, seed_df)
    display(current_szn_df)
    return current_szn_df, teams_df, slots_df

In [11]:
m_current_szn_df, m_teams_df, m_slots = pipeline('M')

66:67/68
1458 x 1463


Unnamed: 0,Season,TeamA,TeamB,TeamNameA,TeamNameB,ASeedNum,ALastTournPct,ARegSznPct,APts,APtsOpp,...,BAst,BTO,BOR,BDR,BStl,BBlk,BPF,AProb,BProb,Pred
0,2024,1103,1104,Akron,Alabama,14,0.0,65.198238,71.906021,67.004405,...,12.762663,13.353111,11.494935,25.085384,6.732272,4.494935,17.386397,0.273667,0.726333,B
1,2024,1103,1112,Akron,Arizona,14,0.0,65.198238,71.906021,67.004405,...,15.508499,12.701133,11.202550,25.516997,6.433428,3.696884,16.539660,0.081314,0.918686,B
2,2024,1103,1120,Akron,Auburn,14,0.0,65.198238,71.906021,67.004405,...,13.609078,13.525622,11.456808,23.537335,7.547584,4.401171,18.734993,0.324682,0.675318,B
3,2024,1103,1124,Akron,Baylor,14,0.0,65.198238,71.906021,67.004405,...,14.027356,13.205167,11.989362,23.346505,6.946809,4.033435,17.372340,0.132743,0.867257,B
4,2024,1103,1129,Akron,Boise St,14,0.0,65.198238,71.906021,67.004405,...,13.064759,12.715361,9.915663,24.209337,6.117470,2.572289,17.641566,0.420034,0.579966,B
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2273,2024,1447,1458,Wagner,Wisconsin,17,0.0,52.208202,68.156151,68.526814,...,12.669516,9.732194,9.858974,23.629630,5.564103,3.170940,14.914530,0.230160,0.769840,B
2274,2024,1447,1463,Wagner,Yale,17,0.0,52.208202,68.156151,68.526814,...,13.723842,13.727273,10.018868,24.732419,6.250429,3.509434,17.926244,0.271430,0.728570,B
2275,2024,1450,1458,Washington St,Wisconsin,7,0.0,48.214286,67.712798,67.355655,...,12.669516,9.732194,9.858974,23.629630,5.564103,3.170940,14.914530,0.355824,0.644176,B
2276,2024,1450,1463,Washington St,Yale,7,0.0,48.214286,67.712798,67.355655,...,13.723842,13.727273,10.018868,24.732419,6.250429,3.509434,17.926244,0.542109,0.457891,A


In [12]:
w_current_szn_df, w_teams_df, w_slots = pipeline('W')

66:67/68
3453 x 3465


Unnamed: 0,Season,TeamA,TeamB,TeamNameA,TeamNameB,ASeedNum,ALastTournPct,ARegSznPct,APts,APtsOpp,...,BAst,BTO,BOR,BDR,BStl,BBlk,BPF,AProb,BProb,Pred
0,2024,3104,3112,Alabama,Arizona,8,0.0,52.654867,66.747788,64.314159,...,12.678571,15.196429,10.895089,23.970982,8.506696,3.334821,16.520089,0.631672,0.368328,A
1,2024,3104,3120,Alabama,Auburn,8,0.0,52.654867,66.747788,64.314159,...,12.719457,16.160633,13.269231,21.954751,10.303167,3.728507,17.574661,0.655606,0.344394,A
2,2024,3104,3124,Alabama,Baylor,8,0.0,52.654867,66.747788,64.314159,...,19.549061,14.112735,13.912317,30.087683,8.006263,5.824635,15.139875,0.210071,0.789929,B
3,2024,3104,3151,Alabama,Chattanooga,8,0.0,52.654867,66.747788,64.314159,...,12.405286,13.973568,10.050661,25.092511,6.685022,3.647577,13.724670,0.502204,0.497796,A
4,2024,3104,3160,Alabama,Colorado,8,0.0,52.654867,66.747788,64.314159,...,13.527027,15.702703,12.709459,25.011261,8.432432,3.662162,18.123874,0.363616,0.636384,B
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2273,2024,3439,3453,Virginia Tech,WI Green Bay,4,80.0,55.333333,65.402222,61.328889,...,16.480176,13.343612,10.786344,24.471366,9.057269,2.444934,15.980176,0.695628,0.304372,A
2274,2024,3439,3465,Virginia Tech,Cal Baptist,4,80.0,55.333333,65.402222,61.328889,...,18.609467,14.668639,8.710059,29.260355,8.704142,3.272189,17.544379,0.640275,0.359725,A
2275,2024,3452,3453,West Virginia,WI Green Bay,8,0.0,68.884120,68.611588,58.950644,...,16.480176,13.343612,10.786344,24.471366,9.057269,2.444934,15.980176,0.640044,0.359956,A
2276,2024,3452,3465,West Virginia,Cal Baptist,8,0.0,68.884120,68.611588,58.950644,...,18.609467,14.668639,8.710059,29.260355,8.704142,3.272189,17.544379,0.596618,0.403382,A


In [13]:
def aggregate_probabilities_by_team(df):
    teams = pd.concat([df['TeamA'], df['TeamB']]).unique()
    team_probabilities = pd.DataFrame(index=teams, columns=teams)

    # Fill the win probability DataFrame with NaNs
    team_probabilities[:] = float('NaN')

    # Calculate win probabilities for each team against each other team
    for index, row in df.iterrows():
        team_probabilities.at[row['TeamA'], row['TeamB']] = row['AProb']
        team_probabilities.at[row['TeamB'], row['TeamA']] = row['BProb']

    # Create a new DataFrame to store the results
    result = pd.DataFrame(columns=['Team', 'Opponent', 'WinProb'])

    # Iterate over each team and opponent to fill the result DataFrame
    for team in teams:
        for opponent in teams:
            if team != opponent:
                win_prob = team_probabilities.at[team, opponent]
                result = pd.concat([result, pd.DataFrame([[team, opponent, win_prob]], columns=['Team', 'Opponent', 'WinProb'])], ignore_index=True)
    
    return result

probabilities_m = aggregate_probabilities_by_team(m_current_szn_df)
display(probabilities_m)

probabilities_w = aggregate_probabilities_by_team(w_current_szn_df)
display(probabilities_w)

  result = pd.concat([result, pd.DataFrame([[team, opponent, win_prob]], columns=['Team', 'Opponent', 'WinProb'])], ignore_index=True)


Unnamed: 0,Team,Opponent,WinProb
0,1103,1104,0.273667
1,1103,1112,0.081314
2,1103,1120,0.324682
3,1103,1124,0.132743
4,1103,1129,0.420034
...,...,...,...
4551,1463,1438,0.395481
4552,1463,1443,0.538739
4553,1463,1447,0.728570
4554,1463,1450,0.457891


  result = pd.concat([result, pd.DataFrame([[team, opponent, win_prob]], columns=['Team', 'Opponent', 'WinProb'])], ignore_index=True)


Unnamed: 0,Team,Opponent,WinProb
0,3104,3112,0.631672
1,3104,3120,0.655606
2,3104,3124,0.210071
3,3104,3151,0.502204
4,3104,3160,0.363616
...,...,...,...
4551,3465,3428,0.325700
4552,3465,3435,0.419587
4553,3465,3439,0.359725
4554,3465,3452,0.403382


In [14]:
def get_win_probability_by_teams(probs_df, team, opponent):
    return probs_df.loc[(probs_df['Team'] == team) & (probs_df['Opponent'] == opponent), 'WinProb'].iloc[0]

In [15]:
def build_slots(slots):
    slots = slots[slots['Season'] == current_season]
    slots = slots[slots['Slot'].str.contains('R')] 

    return slots

current_slots_m = build_slots(m_slots)
current_slots_w = build_slots(w_slots)

display(current_slots_m)
display(current_slots_w)

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed
2452,2024,R1W1,W01,W16
2453,2024,R1W2,W02,W15
2454,2024,R1W3,W03,W14
2455,2024,R1W4,W04,W13
2456,2024,R1W5,W05,W12
...,...,...,...,...
2510,2024,R4Y1,R3Y1,R3Y2
2511,2024,R4Z1,R3Z1,R3Z2
2512,2024,R5WX,R4W1,R4X1
2513,2024,R5YZ,R4Y1,R4Z1


Unnamed: 0,Season,Slot,StrongSeed,WeakSeed
1646,2024,R1W1,W01,W16
1647,2024,R1W2,W02,W15
1648,2024,R1W3,W03,W14
1649,2024,R1W4,W04,W13
1650,2024,R1W5,W05,W12
...,...,...,...,...
1704,2024,R4Y1,R3Y1,R3Y2
1705,2024,R4Z1,R3Z1,R3Z2
1706,2024,R5WX,R4W1,R4X1
1707,2024,R5YZ,R4Y1,R4Z1


In [16]:
def build_seeds_files():
    seeds_file = f'/kaggle/input/march-machine-learning-mania-2024/2024_tourney_seeds.csv'
    seeds_df = pd.read_csv(seeds_file)
    
    display(seeds_df)
    
    seeds_m = seeds_df[seeds_df['Tournament'] == 'M']
    seeds_w = seeds_df[seeds_df['Tournament'] == 'W']

    return seeds_m, seeds_w

seeds_m, seeds_w = build_seeds_files()

Unnamed: 0,Tournament,Seed,TeamID
0,M,W01,1163
1,M,W02,1235
2,M,W03,1228
3,M,W04,1120
4,M,W05,1361
...,...,...,...
123,W,Z12,3162
124,W,Z13,3267
125,W,Z14,3238
126,W,Z15,3263


In [17]:
def prepare_data(seeds):
    seed_dict = seeds.set_index('Seed')['TeamID'].to_dict()
    inverted_seed_dict = {value: key for key, value in seed_dict.items()}

    return seed_dict, inverted_seed_dict


def simulate(round_slots, seeds, inverted_seeds, probabilites):
    winners = []
    slots = []

    for slot, strong, weak in zip(round_slots.Slot, round_slots.StrongSeed, round_slots.WeakSeed):
        team_1, team_2 = seeds[strong], seeds[weak]

        team_1_prob = get_win_probability_by_teams(probabilites, team_1, team_2)
        winner = np.random.choice([team_1, team_2], p=[team_1_prob, 1 - team_1_prob])

        # Append the winner and corresponding slot to the lists
        winners.append(winner)
        slots.append(slot)

        seeds[slot] = winner

    return [inverted_seeds[w] for w in winners], slots


def run_simulation(seeds, round_slots, probabilites, brackets):
    # Get relevant data for the simulation
    seed_dict, inverted_seed_dict = prepare_data(seeds)
    # Lists to store simulation results
    results = []
    bracket = []
    slots = []

    # Iterate through the specified number of brackets
    for b in tqdm(range(1, brackets + 1)):
        # Run single simulation
        r, s = simulate(round_slots, seed_dict, inverted_seed_dict, probabilites)
        
        # Update results
        results.extend(r)
        bracket.extend([b] * len(r))
        slots.extend(s)

    # Create final DataFrame
    result_df = pd.DataFrame({'Bracket': bracket, 'Slot': slots, 'Team': results})

    return result_df

In [18]:
n_brackets = 100000

result_m = run_simulation(seeds_m, current_slots_m, probabilities_m, n_brackets)
result_m.insert(0, 'Tournament', 'M')

result_w = run_simulation(seeds_w, current_slots_w, probabilities_w, n_brackets)
result_w.insert(0, 'Tournament', 'W')

100%|██████████| 100000/100000 [2:35:35<00:00, 10.71it/s]
100%|██████████| 100000/100000 [2:36:56<00:00, 10.62it/s]


In [19]:
submission = pd.concat([result_m, result_w])
submission.reset_index(inplace=True, drop=True)
submission.index.names = ['RowId']

display(submission)

submission.to_csv('submission.csv')

Unnamed: 0_level_0,Tournament,Bracket,Slot,Team
RowId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,M,1,R1W1,W01
1,M,1,R1W2,W02
2,M,1,R1W3,W03
3,M,1,R1W4,W04
4,M,1,R1W5,W05
...,...,...,...,...
12599995,W,100000,R4Y1,Y02
12599996,W,100000,R4Z1,Z05
12599997,W,100000,R5WX,X02
12599998,W,100000,R5YZ,Z05


In [20]:
def set_game_id(row):
    return f"{row['Season']}_{row['TeamA']}_{row['TeamB']}"

def set_impossible_games_probs(gender, current_szn_df, teams_df):
    teams_a = current_szn_df['TeamA'].unique().tolist()
    teams_b = current_szn_df['TeamB'].unique().tolist()
    tourney_teams = set(teams_a + teams_b)

    teams = teams_df[teams_df['Season'] == current_season]['TeamID'].unique().tolist()

    data = []

    for idx, team_a in enumerate(teams):
        for idx_b in range(idx + 1, len(teams)):
            team_b = teams[idx_b]

            if team_a in tourney_teams and team_b in tourney_teams:
                continue
                
            teams_info_df = m_teams_info_df if gender == 'M' else w_teams_info_df
            # There is no reason for setting team names or probabilities if these matchups will never happen in the tournament
            team_a_name = None # set_team_name(team_a, teams_info_df)
            team_b_name = None # set_team_name(team_b, teams_info_df)

            game_id = f"{current_season}_{team_a}_{team_b}"
            data.append([game_id, team_a, team_b, team_a_name, team_b_name, 0.5])

    data_df = pd.DataFrame(data, columns=['ID', 'TeamA', 'TeamB', 'TeamNameA', 'TeamNameB', 'Pred'])
    return data_df

# current_szn_df = pd.concat([m_current_szn_df, w_current_szn_df], axis=0).reset_index(drop=True)
# current_szn_df['ID'] = current_szn_df.apply(lambda row: set_game_id(row), axis=1)

# m_impossible_df = set_impossible_games_probs('M', m_current_szn_df, m_teams_df)
# w_impossible_df = set_impossible_games_probs('W', w_current_szn_df, w_teams_df)

# final_df = current_szn_df.loc[:, ['ID', 'TeamA', 'TeamB', 'TeamNameA', 'TeamNameB', 'AProb']].rename({'AProb': 'Pred'}, axis=1)
# display(final_df)
# final_df = pd.concat([final_df, m_impossible_df, w_impossible_df], axis=0).sort_values(by='ID').reset_index(drop=True)

# display(final_df)

In [21]:
def create_submission_file(df):
    df_filtered = df.loc[:, ['ID', 'Pred']]
    df_filtered.to_csv('submission.csv', index=False)
    
# create_submission_file(final_df)

In [22]:
def get_rf_best_parameters(gender):
    teams_df, reg_szn_df, tourn_df, seed_df = get_files_by_competition(gender)

    data_df = build_dataset(gender, start_data_season[gender], reg_szn_df, tourn_df, seed_df, end_season = 2022) 
    x = data_df.drop(['TeamNameA', 'TeamNameB', 'Winner'], axis=1)
    y = data_df.loc[:, 'Winner']

    from sklearn.model_selection import RandomizedSearchCV
    rs_optimizer = RandomForestClassifier()

    # Number of trees in random forest
    n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
    # Number of features to consider at every split
    max_features = ['sqrt', 'log2', None]
    # Maximum number of levels in tree
    max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
    max_depth.append(None)
    # Minimum number of samples required to split a node
    min_samples_split = [2, 5, 10]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4]
    # Method of selecting samples for training each tree
    bootstrap = [True, False]
    # Create the random grid
    random_grid = {'n_estimators': n_estimators,
                    'max_features': max_features,
                    'max_depth': max_depth,
                    'min_samples_split': min_samples_split,
                    'min_samples_leaf': min_samples_leaf,
                    'bootstrap': bootstrap}

    rf_random = RandomizedSearchCV(estimator = rs_optimizer, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=0, n_jobs = -1)

    rf_random.fit(x, y)

    best_random = rf_random.best_estimator_
    best_parameters = rf_random.cv_results_
    #     print(best_parameters)
    print(best_random)
    
# get_rf_best_parameters('M')