In [968]:
import pandas as pd
import numpy as np
import sklearn
import random
from collections import Counter
import matplotlib.pyplot as plt

# Raw inputs: the tournament bracket and the per-team Elo table.
bracket = pd.read_csv("Bracket.csv")
mm_elos = pd.read_csv("MM_Team_Elos.csv")

# One row per team, joined on the shared "Team" column.
combined = pd.merge(bracket, mm_elos, left_on="Team", right_on="Team")

# Column layout of every per-round games frame.
cols_list = [
    "game_no",
    "team1",
    "team2",
    "team1_rating",
    "team2_rating",
    "team1_seed",
    "team2_seed",
    "team1_win_prob",
    "team2_win_prob",
    "team1_pred_score",
    "team2_pred_score",
]
# Column layout of the per-round "teams that advance" frame.
stats_cols = ["Team", "Rating", "Seed"]

# Empty round-1 games frame; rows are appended by getGameBracket.
round1_df = pd.DataFrame(columns=cols_list)

In [969]:
def calculateWinExpectation(rating1, rating2):
    """Return the Elo expected score of the team rated `rating1` against `rating2`.

    Uses the logistic Elo curve with a 400-point scale: equal ratings give
    0.5, and a 400-point advantage gives 10/11.
    """
    rating_gap = (rating2 - rating1) / 400
    return 1 / (1 + 10 ** rating_gap)


def getPredScore(score1, score2, team_no):
    """Return 1 when `score1` beats `score2` for team slot 1 or 2, else 0.

    `score1` is the win probability of the team being scored and `score2`
    that of its opponent. Ties, and any `team_no` other than 1 or 2, yield 0.
    """
    is_favoured = score1 > score2
    if is_favoured and team_no in (1, 2):
        return 1
    return 0


def placeInDf(game_no, team_pair, df_to_insert):
    """Append one game row for the two teams in `team_pair` to `df_to_insert`.

    `team_pair` holds two row-like objects exposing "Team", "Rating" and
    "Seed". The frame is modified in place; nothing is returned.
    """
    first, second = team_pair[0], team_pair[1]

    # Each side's win expectation is needed for both the probability and the
    # 0/1 predicted-score columns, so compute the two values once.
    first_prob = calculateWinExpectation(first["Rating"], second["Rating"])
    second_prob = calculateWinExpectation(second["Rating"], first["Rating"])

    new_row = {
        "game_no": game_no,
        "team1": first["Team"],
        "team2": second["Team"],
        "team1_rating": first["Rating"],
        "team2_rating": second["Rating"],
        "team1_seed": first["Seed"],
        "team2_seed": second["Seed"],
        "team1_win_prob": first_prob,
        "team2_win_prob": second_prob,
        "team1_pred_score": getPredScore(first_prob, second_prob, 1),
        "team2_pred_score": getPredScore(second_prob, first_prob, 2),
    }

    # Append at the next positional label.
    df_to_insert.loc[len(df_to_insert)] = new_row


def getGameBracket(teams_df, df_to_insert):
    """Pair consecutive rows of `teams_df` into games and append them to `df_to_insert`.

    Rows 1-2 form game 1, rows 3-4 form game 2, and so on. Pairing is done by
    row position, not by index label, so the result does not depend on
    `teams_df` carrying a default 0..n-1 index. A trailing unpaired row (odd
    number of teams) is ignored.

    Returns `df_to_insert`, which is also modified in place.
    """
    team_pair = []
    game_count = 0

    for _, row in teams_df.iterrows():
        team_pair.append(row)

        # A game is complete as soon as two teams have been collected.
        if len(team_pair) == 2:
            game_count += 1
            placeInDf(game_count, team_pair, df_to_insert)
            team_pair = []
    return df_to_insert

def getPredictedWinner(df):
    """Add a `pred_winner` column naming the favoured team in each game.

    team1 is picked only when its win probability is strictly greater than
    team2's; otherwise (including ties) team2 is picked. The frame is
    modified in place and also returned.
    """
    team1_favoured = df['team1_win_prob'] > df['team2_win_prob']
    picks = np.where(team1_favoured, df['team1'], df['team2'])
    df['pred_winner'] = picks
    return df


In [970]:
# Build the round-1 games by pairing consecutive rows of `combined`.
# NOTE(review): assumes `combined` is already in bracket order — confirm against Bracket.csv.
round1_df = getGameBracket(combined, round1_df)
# display(round1_df)
# print(games)

In [971]:
# Add `pred_winner`: the team with the higher win probability in each game.
round1_df = getPredictedWinner(round1_df)
# display(round1_df)

In [972]:
def getActualWinner(round_no, df):
    """Return `df` with the recorded results for `round_no` appended as extra columns.

    Results are read from "final_winner.csv" for round 6 and from
    "round<N>_winners.csv" for every other round, then concatenated
    column-wise (aligned on the index). `df` itself is left untouched.
    """
    results_path = "final_winner.csv" if round_no == 6 else f"round{round_no}_winners.csv"
    actual_results = pd.read_csv(results_path)
    return pd.concat([df, actual_results], axis=1)

In [973]:
# Append the recorded round-1 results (read from round1_winners.csv) and show the frame.
round1_df = getActualWinner(1, round1_df)
display(round1_df)

Unnamed: 0,game_no,team1,team2,team1_rating,team2_rating,team1_seed,team2_seed,team1_win_prob,team2_win_prob,team1_pred_score,team2_pred_score,pred_winner,round1_actual
0,1,Alabama,Texas A&M-Corpus Christi,2820.05248,2130.04288,1,16,0.981513,0.018487,1,0,Alabama,Alabama
1,2,Maryland,West Virginia,2574.208,2563.5456,8,9,0.51534,0.48466,1,0,Maryland,Maryland
2,3,San Diego State,College of Charleston,2621.12256,2430.11328,5,12,0.750173,0.249827,1,0,San Diego State,San Diego State
3,4,Virginia,Furman,2553.18784,2383.19872,4,13,0.726818,0.273182,1,0,Virginia,Furman
4,5,Creighton,North Carolina State,2667.42784,2480.98816,6,11,0.745211,0.254789,1,0,Creighton,Creighton
5,6,Baylor,UC-Santa Barbara,2654.32832,2300.64128,3,14,0.884524,0.115476,1,0,Baylor,Baylor
6,7,Missouri,Utah State,2479.46496,2543.13472,7,10,0.409384,0.590616,0,1,Utah State,Missouri
7,8,Arizona,Princeton,2711.296,2297.29024,2,15,0.915538,0.084462,1,0,Arizona,Princeton
8,9,Purdue,Fairleigh Dickinson,2725.91872,1961.8816,1,16,0.987849,0.012151,1,0,Purdue,Fairleigh Dickinson
9,10,Memphis,Florida Atlantic,2587.30752,2515.41248,8,9,0.602013,0.397987,1,0,Memphis,Florida Atlantic


In [990]:
# dict_df = round1_df.groupby('team1').apply(lambda x: x[['team1_seed', 'team1_win_prob']].values.tolist()).to_dict()
# Map each team1 name to a flat [seed, win probability] list built from its round-1 row(s).
# Only teams listed in the `team1` slot appear as keys.
team_seeds_round1winprob =  round1_df.groupby('team1').apply(lambda x: [item for sublist in x[['team1_seed', 'team1_win_prob']].values.tolist() for item in sublist]).to_dict()
print(team_seeds_round1winprob)





{'Alabama': [1.0, 0.9815127654018506], 'Arizona': [2.0, 0.9155378907291901], 'Arkansas': [8.0, 0.5245313712322464], 'Baylor': [3.0, 0.884523931941147], 'Connecticut': [4.0, 0.8435993248488951], 'Creighton': [6.0, 0.7452112699349032], 'Duke': [5.0, 0.8004149556841573], 'Gonzaga': [3.0, 0.9211892164393481], 'Houston': [1.0, 0.9726266123180525], 'Indiana': [4.0, 0.7557188928612939], 'Iowa': [8.0, 0.4811572031404587], 'Iowa State': [6.0, 0.7010438057072189], 'Kansas': [1.0, 0.9723921958961946], 'Kansas State': [3.0, 0.8321694659385912], 'Kentucky': [6.0, 0.6383436246674115], 'Marquette': [2.0, 0.879598943195645], 'Maryland': [8.0, 0.5153396117171185], 'Memphis': [8.0, 0.6020133402428408], 'Miami (FL)': [5.0, 0.6045315887896701], 'Michigan State': [7.0, 0.5341429337202324], 'Missouri': [7.0, 0.4093839682119658], 'Northwestern': [7.0, 0.48466038828288144], 'Purdue': [1.0, 0.9878493940203507], "Saint Mary's (CA)": [5.0, 0.6785365274071116], 'San Diego State': [5.0, 0.7501734935719745], 'Tenne

In [974]:
def getWinningSeeds(current_team, round_no, df):
    """Return the points earned for `current_team` in round `round_no`.

    Only games whose `pred_winner` equals the `round<N>_actual` value count.
    If `current_team` played in such a game, the score is its seed
    multiplied by `round_no` (team1 slot is checked before team2); otherwise
    the score is 0.
    """
    is_hit = df['pred_winner'].eq(df[f'round{round_no}_actual'])
    hits = df[is_hit]

    for slot in ('team1', 'team2'):
        seeds = hits.loc[hits[slot] == current_team, f'{slot}_seed']
        if not seeds.empty:
            return seeds.values[0] * round_no
    return 0

def getRoundScore(round_no, df):
    """Add a `round_score` column holding the points earned in each game.

    Each value of `round<N>_actual` is passed to `getWinningSeeds`, and
    missing results are scored as 0. The frame is modified in place and
    also returned.
    """
    scores = df[f'round{round_no}_actual'].apply(getWinningSeeds, args=(round_no, df))
    # Assign the filled Series explicitly: a chained
    # `df[col].fillna(..., inplace=True)` acts on a temporary object and does
    # not update `df` under pandas Copy-on-Write.
    df['round_score'] = scores.fillna(0)
    return df
# display(games)


In [975]:
# team_seeds_scores = combined.groupby('Team')[['Rating']].agg(list).to_dict('series')
# # print(team_seeds_scores)

# count = 0
# for key, value in team_seeds_scores.items():
#     print(key)
#     val = team_seeds_scores[key]
#     print("row: ", row)
#     val.append(row['Seed'].values[0])
#     count+=1

# print(team_seeds_scores)

# def monte_carlo():
#     num_simulations = 100

#     team_seeds_scores = combined.groupby('Team')[['Seed', 'Rating']].agg(list).to_dict('series')
#     for i in range(num_simulations):
#         results = np.zeros(len(team_seeds_scores))
        

In [976]:
def getRoundStats(round_no, df):
    """Print the total score, share of correct picks and score spread for a round.

    Recomputes `round_score` on `df` (in place) via `getRoundScore` before
    reporting. Nothing is returned.
    """
    df = getRoundScore(round_no, df)
    print(f"round {round_no} score:", df['round_score'].sum())

    # A pick is correct when the predicted winner equals the recorded winner.
    is_hit = df['pred_winner'].eq(df[f'round{round_no}_actual'])
    print("percentage correct:", len(df[is_hit]) / len(df))

    print("std dev:", np.std(df['round_score']))


# Print the round-1 summary; this also adds the `round_score` column to round1_df in place.
getRoundStats(1, round1_df)

round 1 score: 103
percentage correct: 0.75
std dev: 2.7010920823807543


In [977]:
def rebalanceScore(round_no, df):
    """Write post-game ratings into `updated_team1_score` / `updated_team2_score`.

    Games predicted correctly keep both ratings unchanged. For a missed
    prediction each team's rating moves by k * (actual - predicted), where
    predicted is the 0/1 `team*_pred_score` and actual is 0 for the predicted
    winner and 1 for its opponent. k is 32 for rounds 1-3, 16 for rounds 4-5
    and 0 for any other round. `df` is modified in place; nothing is returned.
    """
    actual_col = f'round{round_no}_actual'
    hit_index = df[df['pred_winner'].eq(df[actual_col])].index
    miss_index = df[df['pred_winner'].ne(df[actual_col])].index

    if round_no in (1, 2, 3):
        k_factor = 32
    elif round_no in (4, 5):
        k_factor = 16
    else:
        k_factor = 0

    for index, row in df.iterrows():
        if index in miss_index:
            # The predicted winner lost, so its actual result is 0 and the
            # opponent's is 1.
            if row['pred_winner'] == row['team1']:
                actual = (0, 1)
            elif row['pred_winner'] == row['team2']:
                actual = (1, 0)
            else:
                actual = None

            if actual is not None:
                df.at[index, 'updated_team1_score'] = row['team1_rating'] + k_factor * (actual[0] - row['team1_pred_score'])
                df.at[index, 'updated_team2_score'] = row['team2_rating'] + k_factor * (actual[1] - row['team2_pred_score'])
        if index in hit_index:
            df.at[index, 'updated_team1_score'] = row['team1_rating']
            df.at[index, 'updated_team2_score'] = row['team2_rating']
        

# Add the updated_team1_score / updated_team2_score columns to round1_df in place, then show it.
rebalanceScore(1, round1_df)
display(round1_df)
    

Unnamed: 0,game_no,team1,team2,team1_rating,team2_rating,team1_seed,team2_seed,team1_win_prob,team2_win_prob,team1_pred_score,team2_pred_score,pred_winner,round1_actual,round_score,updated_team1_score,updated_team2_score
0,1,Alabama,Texas A&M-Corpus Christi,2820.05248,2130.04288,1,16,0.981513,0.018487,1,0,Alabama,Alabama,1,2820.05248,2130.04288
1,2,Maryland,West Virginia,2574.208,2563.5456,8,9,0.51534,0.48466,1,0,Maryland,Maryland,8,2574.208,2563.5456
2,3,San Diego State,College of Charleston,2621.12256,2430.11328,5,12,0.750173,0.249827,1,0,San Diego State,San Diego State,5,2621.12256,2430.11328
3,4,Virginia,Furman,2553.18784,2383.19872,4,13,0.726818,0.273182,1,0,Virginia,Furman,0,2521.18784,2415.19872
4,5,Creighton,North Carolina State,2667.42784,2480.98816,6,11,0.745211,0.254789,1,0,Creighton,Creighton,6,2667.42784,2480.98816
5,6,Baylor,UC-Santa Barbara,2654.32832,2300.64128,3,14,0.884524,0.115476,1,0,Baylor,Baylor,3,2654.32832,2300.64128
6,7,Missouri,Utah State,2479.46496,2543.13472,7,10,0.409384,0.590616,0,1,Utah State,Missouri,0,2511.46496,2511.13472
7,8,Arizona,Princeton,2711.296,2297.29024,2,15,0.915538,0.084462,1,0,Arizona,Princeton,0,2679.296,2329.29024
8,9,Purdue,Fairleigh Dickinson,2725.91872,1961.8816,1,16,0.987849,0.012151,1,0,Purdue,Fairleigh Dickinson,0,2693.91872,1993.8816
9,10,Memphis,Florida Atlantic,2587.30752,2515.41248,8,9,0.602013,0.397987,1,0,Memphis,Florida Atlantic,0,2555.30752,2547.41248


In [978]:
# def getTeamStats(prev_round_df, df_to_insert, round_no):
#     prev_round_no = round_no - 1
#     # print("prev round df:", prev_round_no)
#     for index, row in prev_round_df.iterrows():
#         # print(row)
#         if row[f'round{prev_round_no}_actual'] == row['team1']:
#             new_row = {"Team": row['team1'], "Rating": row['updated_team1_score'], "Seed": row['team1_seed']}
#         elif row[f'round{prev_round_no}_actual'] == row['team2']:
#             new_row = {"Team": row['team2'], "Rating": row['updated_team2_score'], "Seed": row['team2_seed']}
#         else:
#             print("error: ", row[f'round{prev_round_no}_actual'])

#         # print(new_row)
#         df_to_insert.loc[len(df_to_insert)] = new_row

def getTeamStats(prev_round_df, df_to_insert, round_no):
    """Append the predicted winner of every previous-round game to `df_to_insert`.

    For each row of `prev_round_df` the team named in `pred_winner` advances,
    carrying its updated rating and its seed as a {"Team", "Rating", "Seed"}
    row. A row whose `pred_winner` matches neither `team1` nor `team2` is
    reported and skipped, so a stale row from an earlier iteration is never
    inserted in its place.

    `round_no` is accepted for interface compatibility and is not used.
    `df_to_insert` is modified in place; nothing is returned.
    """
    for _, row in prev_round_df.iterrows():
        if row['pred_winner'] == row['team1']:
            new_row = {"Team": row['team1'], "Rating": row['updated_team1_score'], "Seed": row['team1_seed']}
        elif row['pred_winner'] == row['team2']:
            new_row = {"Team": row['team2'], "Rating": row['updated_team2_score'], "Seed": row['team2_seed']}
        else:
            print("error: ", row['pred_winner'])
            # Nothing valid to insert for this game.
            continue

        df_to_insert.loc[len(df_to_insert)] = new_row


In [979]:
def getRound(round_df, round_stats_df, prev_round_df, round_no):
    """Run one full tournament round and return its games frame.

    Steps, in order: collect the previous round's predicted winners into
    `round_stats_df`, pair them into games in `round_df`, add predicted
    winners, append the recorded results, compute updated ratings and
    per-game scores, display the frame, and print the round summary.

    `round_df` and `round_stats_df` are filled in place, but the returned
    frame is a new object (the result of `getActualWinner`), so callers must
    use the return value to see the result, rating and score columns.
    """
    # NOTE(review): `prev_round_no` is never read in this function.
    prev_round_no = round_no - 1

    # Teams advancing into this round (the previous round's predicted winners).
    getTeamStats(prev_round_df, round_stats_df, round_no)

    # Pair the advancing teams into games and pick a winner for each.
    getGameBracket(round_stats_df, round_df)
    getPredictedWinner(round_df)
    # Rebinds `round_df` to a new frame that includes the recorded results.
    round_df = getActualWinner(round_no, round_df)

    rebalanceScore(round_no, round_df)
    # Score before displaying so the shown frame includes `round_score`.
    getRoundScore(round_no, round_df)
    display(round_df)

    getRoundStats(round_no, round_df)
    return round_df
    

In [980]:
# def monte_carlo_simulation(num_simulations):
#     simulation_results = []

#     for _ in range(num_simulations):
#         try:
#             # Copy the initial DataFrame for simulation
#             round1_df_sim = round1_df.copy()
#             round2_df_sim = pd.DataFrame(columns=cols_list)
#             round2_stats_df_sim = pd.DataFrame(columns=stats_cols)
#             round3_df_sim = pd.DataFrame(columns=cols_list)
#             round3_stats_df_sim = pd.DataFrame(columns=stats_cols)
#             round4_df_sim = pd.DataFrame(columns=cols_list)
#             round4_stats_df_sim = pd.DataFrame(columns=stats_cols)
#             round5_df_sim = pd.DataFrame(columns=cols_list)
#             round5_stats_df_sim = pd.DataFrame(columns=stats_cols)
#             round6_df_sim = pd.DataFrame(columns=cols_list)
#             round6_stats_df_sim = pd.DataFrame(columns=stats_cols)

#             # Run the rounds
#             round1_df_sim = getGameBracket(combined, round1_df_sim)
#             round1_df_sim = getPredictedWinner(round1_df_sim)
#             round1_df_sim = getActualWinner(1, round1_df_sim)
#             round1_df_sim = getRoundScore(1, round1_df_sim)
#             rebalanceScore(1, round1_df_sim)

#             round2_df_sim = getRound(round2_df_sim, round2_stats_df_sim, round1_df_sim, 2)
#             round3_df_sim = getRound(round3_df_sim, round3_stats_df_sim, round2_df_sim, 3)
#             round4_df_sim = getRound(round4_df_sim, round4_stats_df_sim, round3_df_sim, 4)
#             round5_df_sim = getRound(round5_df_sim, round5_stats_df_sim, round4_df_sim, 5)
#             round6_df_sim = getRound(round6_df_sim, round6_stats_df_sim, round5_df_sim, 6)

#             # Collect results from this simulation
#             final_winner = round6_df_sim['pred_winner'].iloc[0]
#             simulation_results.append(final_winner)
#         except Exception as e:
#             print(f"Error during simulation: {e}")
#             continue  # Skip this simulation if an error occurs

#     return simulation_results

# Call the Monte Carlo simulation function


In [981]:
# num_simulations = 100
# simulation_results = monte_carlo_simulation(num_simulations)

# # Convert results to strings to avoid type issues
# simulation_results = [str(result) for result in simulation_results]

# # Analyze the results
# winner_counts = Counter(simulation_results)
# total_simulations = len(simulation_results)

# print("Simulation Results:")
# for winner, count in winner_counts.items():
#     print(f"Team {winner}: {count / total_simulations:.2%}")

In [982]:
# Round 2: start from empty games/teams frames; getRound fills them from
# round 1's predicted winners and returns the scored games frame.
round2_df = pd.DataFrame(columns=cols_list)
round2_stats_df = pd.DataFrame(columns=stats_cols)
round2_df = getRound(round2_df, round2_stats_df, round1_df, 2)

Unnamed: 0,game_no,team1,team2,team1_rating,team2_rating,team1_seed,team2_seed,team1_win_prob,team2_win_prob,team1_pred_score,team2_pred_score,pred_winner,round2_actual,updated_team1_score,updated_team2_score,round_score
0,1,Alabama,Maryland,2820.05248,2574.208,1,8,0.804584,0.195416,1,0,Alabama,Alabama,2820.05248,2574.208,2
1,2,San Diego State,Virginia,2621.12256,2521.18784,5,4,0.639978,0.360022,1,0,San Diego State,San Diego State,2621.12256,2521.18784,10
2,3,Creighton,Baylor,2667.42784,2654.32832,6,3,0.518843,0.481157,1,0,Creighton,Creighton,2667.42784,2654.32832,12
3,4,Utah State,Arizona,2511.13472,2679.296,10,2,0.275277,0.724723,0,1,Arizona,Princeton,2543.13472,2647.296,0
4,5,Purdue,Memphis,2693.91872,2555.30752,1,8,0.689527,0.310473,1,0,Purdue,Florida Atlantic,2661.91872,2587.30752,0
5,6,Duke,Tennessee,2653.71904,2646.40768,5,4,0.51052,0.48948,1,0,Duke,Tennessee,2621.71904,2678.40768,0
6,7,Kentucky,Kansas State,2620.20864,2572.6848,6,3,0.567969,0.432031,1,0,Kentucky,Kansas State,2588.20864,2604.6848,0
7,8,Michigan State,Marquette,2553.49248,2668.6464,7,2,0.340093,0.659907,0,1,Marquette,Michigan State,2585.49248,2636.6464,0
8,9,Houston,Auburn,2839.85408,2585.47968,1,9,0.812189,0.187811,1,0,Houston,Houston,2839.85408,2585.47968,2
9,10,Miami (FL),Indiana,2546.7904,2614.11584,5,4,0.404306,0.595694,0,1,Indiana,Miami (FL),2578.7904,2582.11584,0


round 2 score: 54
percentage correct: 0.5625
std dev: 3.8547859862773186


In [983]:
# Round 3: same pipeline, fed by round 2's predicted winners.
round3_df = pd.DataFrame(columns=cols_list)
round3_stats_df = pd.DataFrame(columns=stats_cols)
round3_df = getRound(round3_df, round3_stats_df, round2_df, 3)

Unnamed: 0,game_no,team1,team2,team1_rating,team2_rating,team1_seed,team2_seed,team1_win_prob,team2_win_prob,team1_pred_score,team2_pred_score,pred_winner,round3_actual,updated_team1_score,updated_team2_score,round_score
0,1,Alabama,San Diego State,2820.05248,2621.12256,1,5,0.758621,0.241379,1,0,Alabama,San Diego State,2788.05248,2653.12256,0
1,2,Creighton,Arizona,2667.42784,2647.296,6,2,0.52894,0.47106,1,0,Creighton,Creighton,2667.42784,2647.296,18
2,3,Purdue,Duke,2661.91872,2621.71904,1,5,0.557595,0.442405,1,0,Purdue,Florida Atlantic,2629.91872,2653.71904,0
3,4,Kentucky,Marquette,2588.20864,2636.6464,6,2,0.430741,0.569259,0,1,Marquette,Kansas State,2620.20864,2604.6464,0
4,5,Houston,Indiana,2839.85408,2582.11584,1,4,0.815125,0.184875,1,0,Houston,Miami (FL),2807.85408,2614.11584,0
5,6,Xavier,Texas,2569.94304,2744.19712,3,2,0.268335,0.731665,0,1,Texas,Texas,2569.94304,2744.19712,6
6,7,Kansas,Connecticut,2698.18368,2718.60736,1,4,0.470642,0.529358,0,1,Connecticut,Connecticut,2698.18368,2718.60736,12
7,8,Gonzaga,UCLA,2739.32288,2688.75264,3,2,0.572267,0.427733,1,0,Gonzaga,Gonzaga,2739.32288,2688.75264,9


round 3 score: 45
percentage correct: 0.5
std dev: 6.4408365139941255


In [984]:
# Round 4: same pipeline, fed by round 3's predicted winners.
round4_df = pd.DataFrame(columns=cols_list)
round4_stats_df = pd.DataFrame(columns=stats_cols)
round4_df = getRound(round4_df, round4_stats_df, round3_df, 4)


Unnamed: 0,game_no,team1,team2,team1_rating,team2_rating,team1_seed,team2_seed,team1_win_prob,team2_win_prob,team1_pred_score,team2_pred_score,pred_winner,round4_actual,updated_team1_score,updated_team2_score,round_score
0,1,Alabama,Creighton,2788.05248,2667.42784,1,6,0.666939,0.333061,1,0,Alabama,San Diego State,2772.05248,2683.42784,0
1,2,Purdue,Marquette,2629.91872,2604.6464,1,2,0.536306,0.463694,1,0,Purdue,Florida Atlantic,2613.91872,2620.6464,0
2,3,Houston,Texas,2807.85408,2744.19712,1,2,0.590598,0.409402,1,0,Houston,Miami (FL),2791.85408,2760.19712,0
3,4,Connecticut,Gonzaga,2718.60736,2739.32288,4,3,0.470223,0.529777,0,1,Gonzaga,Connecticut,2734.60736,2723.32288,0


round 4 score: 0
percentage correct: 0.0
std dev: 0.0


In [985]:
# Round 5: same pipeline, fed by round 4's predicted winners.
round5_df = pd.DataFrame(columns=cols_list)
round5_stats_df = pd.DataFrame(columns=stats_cols)
round5_df = getRound(round5_df, round5_stats_df, round4_df, 5)


Unnamed: 0,game_no,team1,team2,team1_rating,team2_rating,team1_seed,team2_seed,team1_win_prob,team2_win_prob,team1_pred_score,team2_pred_score,pred_winner,round5_actual,updated_team1_score,updated_team2_score,round_score
0,1,Alabama,Purdue,2772.05248,2613.91872,1,1,0.71306,0.28694,1,0,Alabama,San Diego State,2756.05248,2629.91872,0
1,2,Houston,Gonzaga,2791.85408,2723.32288,1,3,0.597365,0.402635,1,0,Houston,Connecticut,2775.85408,2739.32288,0


round 5 score: 0
percentage correct: 0.0
std dev: 0.0


In [986]:
# Round 6 (results come from final_winner.csv): same pipeline, fed by
# round 5's predicted winners.
round6_df = pd.DataFrame(columns=cols_list)
round6_stats_df = pd.DataFrame(columns=stats_cols)
# Keep the returned frame, as the earlier rounds do. getRound returns a new
# object carrying the result, rating and score columns; without the
# assignment round6_df would lack them and the cell would echo the table twice.
round6_df = getRound(round6_df, round6_stats_df, round5_df, 6)

# getTeamStats(round5_df, round6_stats_df, 6)

Unnamed: 0,game_no,team1,team2,team1_rating,team2_rating,team1_seed,team2_seed,team1_win_prob,team2_win_prob,team1_pred_score,team2_pred_score,pred_winner,round6_actual,updated_team1_score,updated_team2_score,round_score
0,1,Alabama,Houston,2756.05248,2775.85408,1,1,0.471534,0.528466,0,1,Houston,Connecticut,2756.05248,2775.85408,0


round 6 score: 0
percentage correct: 0.0
std dev: 0.0


Unnamed: 0,game_no,team1,team2,team1_rating,team2_rating,team1_seed,team2_seed,team1_win_prob,team2_win_prob,team1_pred_score,team2_pred_score,pred_winner,round6_actual,updated_team1_score,updated_team2_score,round_score
0,1,Alabama,Houston,2756.05248,2775.85408,1,1,0.471534,0.528466,0,1,Houston,Connecticut,2756.05248,2775.85408,0
