In [None]:
import pandas as pd
import numpy as np
import re

## Loading data

First, load all of the data frames I'm going to need and check out their shapes.

In [None]:
# Seed table (joined to the games on Season/Team further down); show its last rows.
seeds = pd.read_csv(filepath_or_buffer="march-madness-2017/TourneySeeds.csv")
seeds.tail(n=5)

In [None]:
# Game geography table, without the lat/lng/host columns (they are discarded right away).
geog = pd.read_csv("march-madness-2017/TourneyGeog.csv").drop(columns=["lat", "lng", "host"])
geog.head(5)

In [None]:
# Compact tournament results (presumably one row per game -- confirm against the CSV).
games = pd.read_csv(filepath_or_buffer="march-madness-2017/TourneyCompactResults.csv")
print(games.shape)
games.head(n=5)



In [None]:
# Bracket slot definitions, loaded with a fresh 0..n-1 index.
slots = pd.read_csv("march-madness-2017/TourneySlots.csv").reset_index(drop=True)
print(slots.shape)
slots.head(n=10)

In [None]:
print(games.shape, slots.shape)

# Keep only seasons after 1999 (i.e. 2000 onward).
games = games.loc[games["Season"] > 1999]

games.head(n=5)

## Merge dataframes to get slots, seeds, and games in one frame

In [None]:
# Attach the winner's seed, the loser's seed and the game geography to every game.
# The shape is printed after each step so that lost or duplicated rows are visible.

# winner's seed -> "Wseed"
games = games.merge(
    seeds, left_on=["Season", "Wteam"], right_on=["Season", "Team"], copy=False
).rename(columns={"Seed": "Wseed"})
print(games.shape)

# loser's seed -> "Lseed"
games = games.merge(
    seeds, left_on=["Season", "Lteam"], right_on=["Season", "Team"], copy=False
).rename(columns={"Seed": "Lseed"})
print(games.shape)

# geography row for the same season and pair of teams
games = games.merge(
    geog, left_on=["Season", "Wteam", "Lteam"], right_on=["season", "wteam", "lteam"], copy=False
)
print(games.shape)

# remove the duplicated join keys plus the columns that are not used afterwards
games = games.drop(
    columns=["wteam", "lteam", "Team_x", "Team_y", "Numot", "daynum", "season", "Wloc"]
)

print(games.shape)

In [None]:
# Join each game to its bracket slot definition, then drop the lower-case join key and the scores.
games = games.merge(
    slots, left_on=["Season", "slot"], right_on=["Season", "Slot"], copy=False
).drop(columns=["slot", "Wscore", "Lscore"])
games.head(n=10)

In [None]:
# pre-process games

# Drop early (play-in) games. The explicit copy makes `games` an independent frame,
# so the column assignments below do not write into a filtered slice of the old
# frame (which is what triggers pandas' SettingWithCopyWarning).
games = games[games["Daynum"] > 135].copy()
games = games.rename(columns={"Strongseed": "seed1", "Weakseed": "seed2"})

# tournament day number -> round number
day_to_round = {136:1, 137:1, 138:2,139:2, 143:3, 144:3, 145:4, 146:4, 152:5,154:6}
# round number -> points a correct pick in that round is worth
round_to_points = {1:1, 2:2, 3:4, 4:8, 5:16, 6:32}

# Series.map with a dict yields NaN for any Daynum that is not a key of day_to_round.
games["round"] = games["Daynum"].map(day_to_round)
games["bracket_points"] = games["round"].map(round_to_points)

games.head()

In [None]:
# make sure we have the right nr of games each year

# Columns are selected with a list: indexing a GroupBy with a bare tuple of column
# names (the former ["Daynum", "Season"]) is rejected by current pandas versions.
num_games_by_day = games.groupby(["Season", "Daynum"])[["Daynum", "Season"]].agg(['count'])

# number of games per season (count of non-null Daynum values)
games.groupby("Season").agg("count")["Daynum"]

In [None]:
# do predictions by slots

def make_group(size):
    """Build a group of `size` players.

    Each player is a ``(name, risk_factor)`` tuple. Names are ``"p_0"``,
    ``"p_1"``, ... so that every player in a group can be told apart.

    * ``size == 2``: the first player gets a fixed risk factor of 0.05 and the
      second a random one, ``0.8 / k`` with ``k`` drawn uniformly from 1..5.
    * any other size: player ``p`` gets risk factor ``p / size``, so the values
      are evenly spaced from 0 up to (but not including) 1.

    Parameters
    ----------
    size : int
        Number of players in the group.

    Returns
    -------
    list of (str, float)
        One ``(name, risk_factor)`` tuple per player; empty for ``size == 0``.
    """
    if size == 2:
        # one less risky than other, but cant have one player always playing completely safe
        return [("p_0", .05), ("p_1", .8 / (np.random.randint(5) + 1))]

    return [("p_" + str(p), p / size) for p in range(size)]

# Show an example group for the special two-player case and for a larger size.
for _label, _n_players in (("size 2", 2), ("size 10", 10)):
    print(_label, make_group(_n_players))

In [None]:
# Use the 2003 season as a test case.

games_2003 = games.loc[games["Season"] == 2003]
print(games_2003.shape)
games_2003.head(n=5)

In [None]:
def get_empty_bracket(tourney):
    """Split a tournament frame into one frame per round.

    The rounds are kept separate because picks are made round by round, each
    round building on the picks of the one before it; actual winners play no
    part in the split.

    Parameters
    ----------
    tourney : pandas.DataFrame
        Games of one tournament; must have a "round" column.

    Returns
    -------
    dict
        Keys 1 through 6, each mapped to the rows of `tourney` whose "round"
        equals that key (an empty frame when there are none).
    """
    return {rnd: tourney[tourney["round"] == rnd] for rnd in range(1, 7)}
# Build the per-round frames for 2003 and preview round 1.
empty = get_empty_bracket(games_2003)
empty[1].head(n=5)

In [None]:
def doSub(s):
    """Return the numeric part of a seed string as an int, e.g. "Z01" -> 1.

    Every character that is not one of 0-9 is removed before conversion, so
    an input without any digit raises ValueError from int().
    """
    digits_only = re.sub("[^0-9]", "", s)
    return int(digits_only)
    

def do_picks(bracket, player):
    """Fill in one player's picks for every round of a bracket.

    Rounds are processed in ascending order. Round 1 uses the "seed1"/"seed2"
    values already in its frame. For every later round those two columns are
    matched against the "Slot" column of the previous round and replaced by
    this player's pick for that slot. Each round then gets the numeric seed
    columns "seed1_num"/"seed2_num" and a "pick" column produced by pickGame.

    Parameters
    ----------
    bracket : dict
        Round number -> DataFrame of that round's games (the structure
        get_empty_bracket returns).
    player : tuple
        ``(name, risk_factor)``; only the risk factor, ``player[1]``, is used.

    Returns
    -------
    dict
        The same dict object that was passed in, with every round's frame
        replaced by its picked version.
    """
    for rnd in sorted(bracket):
        # Work on a copy: the per-round frames are boolean-filtered slices of the
        # season frame, and adding columns to such a slice triggers pandas'
        # SettingWithCopyWarning.
        this_round = bracket[rnd].copy()

        if rnd != 1:
            # this player's picks from the previous round, keyed by game slot
            prev_picks = bracket[rnd - 1].loc[:, ["Slot", "pick"]]

            # swap each feeding slot for the team picked to win it
            for seed_col in ("seed1", "seed2"):
                this_round = pd.merge(this_round, prev_picks, left_on=[seed_col], right_on=["Slot"])
                this_round = this_round.drop([seed_col, "Slot_y"], axis=1)
                this_round = this_round.rename(columns={"pick": seed_col, "Slot_x": "Slot"})

        this_round["seed1_num"] = this_round["seed1"].map(doSub)
        this_round["seed2_num"] = this_round["seed2"].map(doSub)

        # pick for given game
        this_round["pick"] = this_round.apply(pickGame, axis=1, args=(player[1],))

        # store the picked round so the next round can build on it
        bracket[rnd] = this_round
    return bracket
            
def pickGame(g, risk_factor):
    """Randomly pick the winner of one game for a given risk factor.

    `g` must provide "seed1", "seed2", "seed1_num" and "seed2_num". An upset is
    drawn with probability `risk_factor`; when the seeds are more than 2 apart
    that probability is first divided by half the seed difference, so a 16 over
    1 is picked far less often than a 10 over 7.

    Returns g["seed2"] when an upset is drawn and seed1 has the lower seed
    number, or when no upset is drawn and seed1 does not have the lower seed
    number (this includes equal seed numbers); returns g["seed1"] otherwise.
    """
    gap = abs(g["seed1_num"] - g["seed2_num"])

    # shrink the upset chance for lopsided matchups
    upset_chance = risk_factor / (gap / 2) if gap > 2 else risk_factor

    wants_upset = np.random.binomial(1, upset_chance)
    seed1_is_favorite = g["seed1_num"] < g["seed2_num"]

    # seed2 wins the pick exactly when "upset drawn" and "seed1 is favorite" agree
    if bool(wants_upset) == bool(seed1_is_favorite):
        return g["seed2"]

    return g["seed1"]
    
    
    

In [None]:
# Example: fill in a 2003 bracket for a player with risk factor 0.5 and show the last rows.
this_bracket = do_picks(get_empty_bracket(games_2003), ("b", .5))
pd.concat(this_bracket.values()).tail(n=10)

In [None]:
def is_upset(g):
    """Tell whether the pick in game row `g` is an upset.

    A pick counts as an upset when the picked side ("seed1" or "seed2") has the
    larger seed number of the two. Returns a plain True/False.
    """
    pick = g["pick"]

    # picked seed1 although seed2 has the smaller seed number
    if pick == g["seed1"] and g["seed1_num"] > g["seed2_num"]:
        return True

    # picked seed2 although seed1 has the smaller seed number
    if pick == g["seed2"] and g["seed1_num"] < g["seed2_num"]:
        return True

    return False
    

In [None]:
# points a correct pick is worth in each round
round_to_points = {1:1, 2:2, 3:4, 4:8, 5:16, 6:32}

# Stack all rounds of the example bracket and score every pick:
#   is_upset       - the pick goes against the seeding
#   bracket_points - what the game is worth
#   correct_pick   - the pick matches the actual winner's seed
#   points_awarded - bracket_points when the pick is correct, otherwise 0
tournament = pd.concat(this_bracket.values()).assign(
    is_upset=lambda df: df.apply(is_upset, axis=1),
    bracket_points=lambda df: df["round"].map(round_to_points),
    correct_pick=lambda df: df["pick"] == df["Wseed"],
    points_awarded=lambda df: df["bracket_points"] * df["correct_pick"],
)

print(tournament.shape)
tournament.tail(n=15)

In [None]:
# Total of the per-game point values, i.e. the points available in this bracket.
# The points this bracket actually earned are in the "points_awarded" column.
tournament.loc[:, "bracket_points"].sum()

In [None]:
def do_preds_for_group(group, games):
    """Simulate and score one bracket per player of a group.

    Parameters
    ----------
    group : iterable of (name, risk_factor) tuples
        The players, e.g. as produced by make_group.
    games : pandas.DataFrame
        Games of a single tournament; needs the columns used by
        get_empty_bracket/do_picks plus "Wseed" and "bracket_points".

    Returns
    -------
    pandas.DataFrame
        One row per player with the columns "player", "upsets" (number of upset
        picks), "points" (sum of points awarded) and "upsets_1" .. "upsets_6"
        (upset picks per round).
    """
    upset_keys = ("upsets_1", "upsets_2", "upsets_3", "upsets_4", "upsets_5", "upsets_6")

    summaries = []
    for player in group:
        picked_rounds = do_picks(get_empty_bracket(games), player)
        scored = pd.concat(picked_rounds.values())

        scored["is_upset"] = scored.apply(is_upset, axis=1)
        scored["correct_pick"] = scored["pick"] == scored["Wseed"]
        scored["points_awarded"] = scored["bracket_points"] * scored["correct_pick"]

        summary = {
            "player": player[0],
            "upsets": scored["is_upset"].sum(),
            "points": scored["points_awarded"].sum(),
        }
        # upset picks broken down by round
        for rnd, key in enumerate(upset_keys, start=1):
            summary[key] = scored.loc[scored["round"] == rnd, "is_upset"].sum()

        summaries.append(summary)

    return pd.DataFrame(summaries)



In [None]:
# Try a group of 100 players on the 2003 tournament and list the ten best scores.
gps_100 = [do_preds_for_group(make_group(100), games_2003)]
gps_100[-1].sort_values("points", ascending=False).head(n=10)

In [None]:
def optimal_preds_for_group(games, size, iterations):
    """Repeat the group simulation and collect each run's winner.

    For each of `iterations` runs a fresh group of `size` players is created
    and scored on `games`; the row with the most points is kept, together with
    the mean number of upsets picked across that run's players.

    Returns
    -------
    (pandas.DataFrame, float)
        The winning rows of all runs stacked into one frame, and the average
        over runs of the per-run mean upset count.
    """
    top_rows = []
    mean_upsets_per_run = []

    for _ in range(iterations):
        standings = do_preds_for_group(make_group(size), games)
        ranked = standings.sort_values("points", ascending=False)
        top_rows.append(ranked.head(1))
        mean_upsets_per_run.append(standings["upsets"].mean())

    overall_mean_upsets = sum(mean_upsets_per_run) / len(mean_upsets_per_run)
    return pd.concat(top_rows), overall_mean_upsets



In [None]:
def optimal_throughout_time(games, size, iterations):
    """Average the winning-bracket statistics over every season in `games`.

    For each distinct "Season" value the simulation of optimal_preds_for_group
    is run with the given group size and iteration count, and the winners'
    numeric columns are averaged. Progress is printed as ``season size``.

    Parameters
    ----------
    games : pandas.DataFrame
        Games of all seasons; must have a "Season" column.
    size : int
        Number of players per simulated group.
    iterations : int
        Number of simulated groups per season.

    Returns
    -------
    (pandas.Series, float)
        The per-season mean winner statistics averaged over seasons, and the
        per-season mean upset count averaged over seasons.
    """
    # for each season find the optimal for given group size
    seasons = games["Season"].unique()

    sums = {}
    upsets = {}
    for season in seasons:
        print(season, size)
        these_games = games[games["Season"] == season]
        optimal, upsets_num = optimal_preds_for_group(these_games, size, iterations)

        # `optimal` also holds the string "player" column. Restrict the mean to
        # numeric columns: DataFrame.mean() raises TypeError on non-numeric data
        # in pandas >= 2.0 instead of silently skipping it.
        sums[season] = optimal.mean(numeric_only=True)
        upsets[season] = upsets_num
    return sum(sums.values())/len(seasons) , sum(upsets.values())/len(upsets)

#quick = optimal_throughout_time(games)

In [None]:
# Result containers keyed by group size; kept in their own cell so the
# simulation cell can be run for a few sizes at a time without losing earlier results.
upset_means, results = {}, {}

In [None]:
# Run the simulation for each group size (1000 simulated groups per season).
# Swap in one of the commented lists to run the other sizes.
group_sizes = [2, 5]
#group_sizes = [10, 25, 50, 100]
#group_sizes = [200, 5000, 1000]
#group_sizes = [5000]

for g in group_sizes:
    mean_winner_stats, mean_upsets = optimal_throughout_time(games, g, 1000)
    results[g] = mean_winner_stats
    upset_means[g] = mean_upsets
    print(results)
print(results)
