The goal is to calculate the optimal bracket picks based on each team's win probability, the public's pick probability, and the number of players in the pool. The underlying idea: the more players there are in the pool, the riskier your picks have to be in order to maximize your probability of winning the pool.

In [3]:
import pandas as pd
import numpy as np
import pickle

In [119]:
# Load the pickled artifacts this notebook starts from. Each file is opened in
# a `with` block so the handle is closed as soon as it has been read.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by a trusted source.
with open('data/game_prediction_model.pickle', 'rb') as pickle_file:
    model = pickle.load(pickle_file)
with open('data/game_prediction_model_covariates.pickle', 'rb') as pickle_file:
    team_cols = pickle.load(pickle_file)
with open('data/starting_teams.pickle', 'rb') as pickle_file:
    starting_teams = pickle.load(pickle_file)

In [130]:
# Read the 2019 predictions, keep the team label plus the six per-round
# columns, and give those columns the short names Team, W1..W6.
df_predictions = (
    pd.read_csv('data/2019_tourney_predictions.csv')
    .loc[:, ['team', 'RoundOf32', 'SweetSixteen', 'EliteEight',
             'FinalFour', 'Championship', 'Champions']]
    .rename(columns={
        'team': 'Team',
        'RoundOf32': 'W1',
        'SweetSixteen': 'W2',
        'EliteEight': 'W3',
        'FinalFour': 'W4',
        'Championship': 'W5',
        'Champions': 'W6',
    })
)
df_predictions.head()

Unnamed: 0,Team,W1,W2,W3,W4,W5,W6
0,Duke,0.78,0.61,0.36,0.23,0.13,0.06
1,North Dakota St.,0.22,0.02,0.0,0.0,0.0,0.0
2,VCU,0.44,0.15,0.03,0.02,0.0,0.0
3,UCF,0.56,0.22,0.07,0.03,0.01,0.0
4,Mississippi St.,0.71,0.42,0.21,0.07,0.02,0.01


In [194]:
def _generate_bracket(df_predictions):
    df_teams = df_predictions.copy(deep=True)
    rounds = len(df_teams.columns)-1
    df_teams[df_teams.columns.tolist()[1:]] = \
        df_teams[df_teams.columns.tolist()[1:]].cumsum()
    df_bracket = pd.DataFrame(columns = df_teams.columns)
    df_bracket['Team'] = df_teams['Team'].copy(deep=True)
    df_bracket.fillna(0)
    n_winners = 1
    for r in range(rounds,0,-1):
        cur_col = 'R'+str(r)
        for w in range(n_winners):
            start_i = df_teams.loc[df_teams[cur_col]>w].index[0]
            try:
                end_i = df_teams.loc[df_teams[cur_col]>w+1].index[0]-1
            except IndexError:
                end_i = len(df_teams.index)-1
            if not df_bracket.loc[start_i:end_i, cur_col].max()==1:
                rand = np.random.rand() + w
                win_i = df_teams.loc[df_teams[cur_col]>=rand].index[0]
                win_c = df_teams.columns.tolist().index(cur_col)
                df_bracket.iloc[win_i, 1:win_c+1] = 1
        n_winners = n_winners*2
    df_bracket = df_bracket.fillna(0)
        
    return df_bracket

In [195]:
def _generate_pool(df_public_picks, pool_size):
    rounds = len(df_public_picks.columns)-1
    df_public_picks[df_public_picks.columns.tolist()[1:]] = \
        df_public_picks[df_public_picks.columns.tolist()[1:]].cumsum()
    pool = []
    for i in range(pool_size-1):
        df_pool = pd.DataFrame(columns = df_public_picks.columns)
        df_pool['Team'] = df_public_picks['Team'].copy(deep=True)
        df_pool.fillna(0)
        n_winners = 1
        for r in range(rounds,0,-1):
            cur_col = 'R'+str(r)
            for w in range(n_winners):
                start_i = df_public_picks.loc[df_public_picks[cur_col]>w].index[0]
                try:
                    end_i = df_public_picks.loc[df_public_picks[cur_col]>w+1].index[0]-1
                except IndexError:
                    end_i = len(df_public_picks.index)-1
                if not df_pool.loc[start_i:end_i, cur_col].max()==1:
                    rand = np.random.rand() + w
                    win_i = df_public_picks.loc[df_public_picks[cur_col]>=rand].index[0]
                    win_c = df_public_picks.columns.tolist().index(cur_col)
                    df_pool.iloc[win_i, 1:win_c+1] = 1
            n_winners = n_winners*2
        df_pool = df_pool.fillna(0)
        pool.append(df_pool)
        
    return pool

In [300]:
def _win_pool_sim(reps, df_public_picks, df_picks, df_predictions, pool_size,
                  rewards, verbose=True):
    """Estimate by simulation how often `df_picks` wins the pool.

    Every repetition draws a fresh set of opponents with `_generate_pool`
    and a fresh tournament outcome with `_generate_bracket`, scores every
    entry, and credits `df_picks` with ``1 / (number of top scorers)`` when
    it holds (or shares) the top score.

    Parameters
    ----------
    reps : int
        Number of simulated tournaments.
    df_public_picks : pandas.DataFrame
        Public pick distribution handed to `_generate_pool`.
    df_picks : pandas.DataFrame
        The bracket being evaluated: a 'Team' column plus one 0/1 column per
        round, laid out like the frames the generators return.
    df_predictions : pandas.DataFrame
        Win probabilities handed to `_generate_bracket`.
    pool_size : int
        Total number of entrants, including `df_picks`.
    rewards : sequence of numbers
        Points awarded per correct pick, one value per round column.
    verbose : bool, optional
        When True (the default) print the evaluated bracket's score, the
        number of top scorers, the top score and the running win total after
        every repetition.

    Returns
    -------
    float
        ``wins / reps``, with ties counted fractionally.
    """
    rewards = np.array(rewards)
    wins = 0.0
    for _ in range(reps):
        # Hand the generator a private copy so that the same, untouched pick
        # distribution is sampled in every repetition even if the generator
        # writes to the frame it receives.
        opponents = _generate_pool(df_public_picks.copy(deep=True), pool_size)
        pool = [df_picks] + opponents
        bracket = _generate_bracket(df_predictions)
        actual = bracket.drop(columns='Team')

        # Score = rewards . (number of correct picks per round).
        scores = []
        for entry in pool:
            correct_picks = entry.drop(columns='Team').multiply(actual).sum().values
            scores.append(np.dot(rewards, correct_picks))

        # Derive the top score and the tie count from the full score list, so
        # an all-zero round is an ordinary pool-wide tie instead of leaving
        # the winner undefined.
        cur_max = max(scores)
        winner_count = sum(1 for points in scores if points == cur_max)

        if verbose:
            print (scores[0])
        if scores[0] == cur_max:
            wins += 1.0/winner_count
        if verbose:
            print ('Number of winners: {0}'.format(str(winner_count)))
            print ('Winning points: {0}'.format(str(cur_max)))
            print ('Wins: {0}'.format(str(wins)))

    return wins/reps

In [325]:
# Toy 8-team, 3-round example used to exercise the simulation end to end.
# Rows are in bracket order: within each consecutive pair of rows the R1
# values sum to 1, within each group of four the R2 values sum to 1, and the
# R3 column sums to 1 overall.

# Fix the seed of NumPy's global generator (the one np.random.rand draws
# from) so the printed results below can be reproduced on a re-run.
np.random.seed(2019)

# Share of the public picking each team to win rounds 1-3.
public_data = [
    ['Duke', 0.5, 0.3, 0.23],
    ['Cornell', 0.5, 0.3, 0.24],
    ['UNC', 0.75, 0.4, 0.23],
    ['Harvard', 0.25, 0.0, 0.0],
    ['Gonzaga', 0.9, 0.7, 0.25],
    ['Buffalo', 0.1, 0.04, 0.01],
    ['Virginia', 0.85, 0.25, 0.04],
    ['Clarkson', 0.15, 0.01, 0.0]
]
df_public_picks = pd.DataFrame(public_data, columns=['Team','R1','R2','R3'])

# The bracket being evaluated: 1 marks a team picked to win that round.
picks_data = [
    ['Duke', 0, 0, 0],
    ['Cornell', 1, 1, 1],
    ['UNC', 1, 0, 0],
    ['Harvard', 0, 0, 0],
    ['Gonzaga', 1, 0, 0],
    ['Buffalo', 0, 0, 0],
    ['Virginia', 1, 1, 0],
    ['Clarkson', 0, 0, 0]
]
df_picks = pd.DataFrame(picks_data, columns=['Team','R1','R2','R3'])

# Modelled probability of each team winning rounds 1-3.
# NOTE: this rebinds df_predictions, replacing the frame read from the CSV
# earlier in the notebook.
predictions_data = [
    ['Duke', 0.3, 0.2, 0.13],
    ['Cornell', 0.7, 0.4, 0.34],
    ['UNC', 0.75, 0.4, 0.23],
    ['Harvard', 0.25, 0.0, 0.0],
    ['Gonzaga', 0.9, 0.7, 0.25],
    ['Buffalo', 0.1, 0.04, 0.01],
    ['Virginia', 0.85, 0.25, 0.04],
    ['Clarkson', 0.15, 0.01, 0.0]
]
df_predictions = pd.DataFrame(predictions_data, columns=['Team','R1','R2','R3'])

# 10 repetitions, a 12-entrant pool, and 1/2/4 points per correct pick in
# rounds 1/2/3.
win_pct = _win_pool_sim(10, df_public_picks, df_picks, df_predictions, 12, [1,2,4])
print (win_pct)

4
Number of winners: 2
Winning points: 9
Wins: 0.0
3
Number of winners: 6
Winning points: 4
Wins: 0.0
4
Number of winners: 1
Winning points: 7
Wins: 0.0
6
Number of winners: 1
Winning points: 6
Wins: 1.0
10
Number of winners: 1
Winning points: 10
Wins: 2.0
2
Number of winners: 6
Winning points: 3
Wins: 2.0
6
Number of winners: 1
Winning points: 6
Wins: 3.0
10
Number of winners: 1
Winning points: 10
Wins: 4.0
5
Number of winners: 1
Winning points: 5
Wins: 5.0
6
Number of winners: 1
Winning points: 6
Wins: 6.0
0.6


In [186]:
# Worked example: score one simulated entry against one simulated outcome.
# `pool` and `bracket` are otherwise only locals inside _win_pool_sim, so they
# are built here explicitly to keep the cell runnable on a fresh kernel.
# The public-pick frame is rebuilt from the raw `public_data` rows so the
# example starts from the untouched pick shares.
pool = _generate_pool(
    pd.DataFrame(public_data, columns=['Team','R1','R2','R3']), 2)
bracket = _generate_bracket(df_predictions)
print (pool[0])
print (bracket)
rewards = np.array([1, 2, 4])
# Element-wise product of picks and outcome is 1 only for correct picks;
# summing per round and weighting by the rewards gives the entry's score.
correct_picks = pool[0].drop(columns='Team').multiply(bracket.drop(columns='Team')).sum().values
np.dot(rewards, correct_picks)

       Team  R1  R2  R3
0      Duke   1   1   0
1   Cornell   0   0   0
2       UNC   0   0   0
3   Harvard   1   0   0
4   Gonzaga   1   1   1
5   Buffalo   0   0   0
6  Virginia   1   0   0
7  Clarkson   0   0   0
       Team  R1  R2  R3
0      Duke   1   1   1
1   Cornell   0   0   0
2       UNC   1   0   0
3   Harvard   0   0   0
4   Gonzaga   1   1   0
5   Buffalo   0   0   0
6  Virginia   1   0   0
7  Clarkson   0   0   0


7