Data is from the public BigQuery dataset, ncaa_basketball.

The tourney_games.csv file is produced by the tourney_games.sql query. TODO: add code to run that query from this notebook.

The tourney_teams.csv file is from the tourney_teams.sql query.

The tourney_teams_2019.csv file (the current season) was extracted from kenpom.com. TODO: add code to scrape this data.

Note: seedings for the 2018 tournament (2017-18 season) were missing, and were filled in manually in the data files.

In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
# Load the current-season team table and tag every row with the season key
# that the simulation cells filter on.
df_predict = pd.read_csv('data/tourney_teams_2019.csv')
df_predict['season'] = 2018

# Divide the efficiency columns by 100.
# NOTE(review): presumably the source reports points per 100 possessions and the
# training data is per possession -- confirm against the SQL queries.
for rating_col in ('adj_off_ptsperposs', 'adj_def_ptsperposs',
                   'adj_net_ptsperposs', 'opp_adj_net_ptsperposs'):
    df_predict[rating_col] = df_predict[rating_col] / 100

In [5]:
# Historical team-season table and tournament game table; rows containing any
# missing value are discarded from both.
# `drop(columns=...)` replaces the positional-axis form `drop(labels, 1)`, which
# pandas 2.0 no longer accepts (axis became keyword-only).
df_teams = (
    pd.read_csv('data/tourney_teams.csv')
    .dropna()
    .drop(columns=['conf_alias', 'tournament_wins'])
)
df_games = pd.read_csv('data/tourney_games.csv').dropna()

In [6]:
# Attach each team's season stats to its games (inner join on the team keys).
df = df_games.merge(df_teams, how='inner', on=['alias', 'market', 'season'])

# Build the opponent view of the team table: the first three columns are renamed
# to the opponent join keys and every remaining column gets a 'gopp_' prefix.
df_teams2 = df_teams.copy(deep=True)
opp_stat_names = ['gopp_' + stat for stat in df_teams.columns[3:]]
df_teams2.columns = ['opp_alias', 'opp_market', 'season'] + opp_stat_names

# Attach the opponent's season stats (inner join on the opponent keys).
df = df.merge(df_teams2, how='inner', on=['opp_alias', 'opp_market', 'season'])

In [7]:
# Keep only the rows where the team's alias sorts after the opponent's alias.
# NOTE(review): presumably every game appears once per participant, so this
# leaves a single row per game -- confirm against tourney_games.sql.
alias_after_opp = df['alias'] > df['opp_alias']
df = df.loc[alias_after_opp]

In [8]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

In [9]:
# Start from every column of the merged frame, then strip out everything that
# is not used as a model input. Removal is done one name at a time so that a
# missing column still raises ValueError.
X_cols = df.columns.values.tolist()

non_feature_cols = (
    # join keys / identifiers and the target
    'alias', 'market', 'season', 'opp_alias', 'opp_market', 'win',
    # win/loss counts for both sides
    'wins', 'losses', 'gopp_wins', 'gopp_losses',
    # per-possession and pace columns without the 'adj_' prefix
    'off_ptsperposs', 'def_ptsperposs', 'net_ptsperposs', 'possperg',
    'gopp_off_ptsperposs', 'gopp_def_ptsperposs', 'gopp_net_ptsperposs', 'gopp_possperg',
    # '_sd' columns for both sides
    'off_sd', 'def_sd', 'gopp_off_sd', 'gopp_def_sd',
)
for dropped in non_feature_cols:
    X_cols.remove(dropped)

In [10]:
# Display the feature columns that remain after the removals.
X_cols

['win_pct',
 'adj_off_ptsperposs',
 'adj_def_ptsperposs',
 'adj_net_ptsperposs',
 'adj_possperg',
 'opp_adj_net_ptsperposs',
 'seed',
 'gopp_win_pct',
 'gopp_adj_off_ptsperposs',
 'gopp_adj_def_ptsperposs',
 'gopp_adj_net_ptsperposs',
 'gopp_adj_possperg',
 'gopp_opp_adj_net_ptsperposs',
 'gopp_seed']

In [11]:
# Exhaustive grid search (5-fold CV) over hidden-layer width and the `alpha`
# regularisation strength of a single-hidden-layer, logistic-activation MLP.
# random_state pins the network's weight initialisation so that the scores in
# the report below are reproducible from one run to the next.
model = MLPClassifier(max_iter=1000, activation='logistic', random_state=0)

param_dist = {
    'hidden_layer_sizes' : [(7,),(5,),(10,)],
    'alpha' : [0.005,0.01,0.05]
}

y = np.ravel(df[['win']])
X = df[X_cols].values
# Note: despite the variable name, GridSearchCV evaluates every combination in
# param_dist rather than sampling from it.
random_search = GridSearchCV(model, param_dist, cv=5, return_train_score=True)
random_search.fit(X, y)

# One row per parameter combination, best mean CV score first.
report_cols = ['mean_test_score','std_test_score']+['param_'+param for param in param_dist]
report = pd.DataFrame(random_search.cv_results_)[report_cols].sort_values(by='mean_test_score', ascending=False)
report



Unnamed: 0,mean_test_score,std_test_score,param_hidden_layer_sizes,param_alpha
4,0.720779,0.04538,"(5,)",0.01
3,0.707792,0.031466,"(7,)",0.01
5,0.704545,0.041392,"(10,)",0.01
1,0.701299,0.050056,"(5,)",0.005
0,0.691558,0.043577,"(7,)",0.005
2,0.688312,0.032911,"(10,)",0.005
8,0.688312,0.040085,"(10,)",0.05
6,0.681818,0.077991,"(7,)",0.05
7,0.652597,0.091784,"(5,)",0.05


In [12]:
# Hold-out check: fit on every season before 2017 and score on the 2017 season.
is_holdout = df['season'] == 2017
df_train = df.loc[df['season'] < 2017]
df_test = df.loc[is_holdout]

X_train, y_train = df_train[X_cols].values, np.ravel(df_train[['win']])
X_test, y_test = df_test[X_cols].values, np.ravel(df_test[['win']])

# NOTE(review): `model` is the base estimator that was handed to the grid
# search, not one rebuilt from the best parameters it found -- confirm this is
# the configuration that should be evaluated here.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_test, y_pred)

array([[17, 14],
       [ 5, 27]])

In [13]:
def _train_model(X, y):
    model = MLPClassifier(hidden_layer_sizes=(7,), max_iter=1000, activation='logistic', alpha=0.005)
    model = model.fit(X,y)
    
    return model

In [14]:
def _matchup_proba(df, model, team_name_col, team_cols, team1, team2, season):
    prob_win = []
    X = df.loc[(df[team_name_col]==team1)&(df['season']==season)][team_cols].values.tolist()
    X_opp = df.loc[(df[team_name_col]==team2)&(df['season']==season)][team_cols].values.tolist()[0]
    X2 = X_opp + X[0]
    X[0].extend(X_opp)

    prob_win.append(model.predict_proba(X)[0,1])
    prob_win.append(1-model.predict_proba([X2])[0,1])
    
    return np.mean(prob_win)

In [15]:
def _sim_tourney(model, starting_teams, df, team_name_col, team_cols, season, n):
    out_dict = {k:[0,0,0,0,0,0] for k in starting_teams}
    for m in range(n):
        remaining_teams = starting_teams.copy()
        for rnd in range(int(np.log2(len(starting_teams)))):
            if rnd>0:
                remaining_teams = next_round.copy()
            next_round = []
            for i in range(int(len(remaining_teams)/2)):
                prob_win = _matchup_proba(df, model, team_name_col, team_cols, remaining_teams[2*i],
                                         remaining_teams[2*i+1], season)
                r = np.random.random()
                if r<prob_win:
                    out_dict[remaining_teams[2*i]][rnd]+=1
                    next_round.append(remaining_teams[2*i])
                else:
                    out_dict[remaining_teams[2*i+1]][rnd]+=1
                    next_round.append(remaining_teams[2*i+1])
                    
    return out_dict

In [16]:
# Refit on the full merged frame (no hold-out) to obtain the model used by the
# bracket simulation below.
df_train = df.copy(deep=True)

X_train = df_train[X_cols].values
y_train = np.ravel(df_train[['win']])

model = _train_model(X_train, y_train)

In [19]:
# Field in bracket order: the simulation pairs entries 2*i and 2*i+1, so each
# consecutive pair below is a first-round game.
starting_teams = [
    'Duke', 'North Dakota St.', 'VCU', 'UCF',
    'Mississippi St.', 'Liberty', 'Virginia Tech', 'Saint Louis',
    'Maryland', 'Belmont', 'LSU', 'Yale',
    'Louisville', 'Minnesota', 'Michigan St.', 'Bradley',
    'Gonzaga', 'Fairleigh Dickinson', 'Syracuse', 'Baylor',
    'Marquette', 'Murray St.', 'Florida St.', 'Vermont',
    'Buffalo', 'Arizona St.', 'Texas Tech', 'Northern Kentucky',
    'Nevada', 'Florida', 'Michigan', 'Montana',
    'Virginia', 'Gardner Webb', 'Mississippi', 'Oklahoma',
    'Wisconsin', 'Oregon', 'Kansas St.', 'UC Irvine',
    'Villanova', "Saint Mary's", 'Purdue', 'Old Dominion',
    'Cincinnati', 'Iowa', 'Tennessee', 'Colgate',
    'North Carolina', 'Iona', 'Utah St.', 'Washington',
    'Auburn', 'New Mexico St.', 'Kansas', 'Northeastern',
    'Iowa St.', 'Ohio St.', 'Houston', 'Georgia St.',
    'Wofford', 'Seton Hall', 'Kentucky', 'Abilene Christian',
]

# Per-team features are the model inputs without the opponent ('gopp') columns.
team_cols = [c for c in X_cols if 'gopp' not in c]

season = 2018
# Number of simulated tournaments; 100 is used here, a previous setting was 10000.
n_iter = 100
results = _sim_tourney(model, starting_teams, df_predict, 'team', team_cols, season, n_iter)

In [25]:
# save model to be used for optimal picks calculations
# Each file is written inside a `with` block so its handle is flushed and
# closed as soon as the dump finishes, instead of being left open.
with open('data/game_prediction_model.pickle', 'wb') as fh:
    pickle.dump(model, fh)
with open('data/game_prediction_model_covariates.pickle', 'wb') as fh:
    pickle.dump(team_cols, fh)
with open('data/starting_teams.pickle', 'wb') as fh:
    pickle.dump(starting_teams, fh)

In [20]:
# Spot-check one matchup: the model's estimated probability that 'Iowa St.' beats 'Houston'.
_matchup_proba(df_predict, model, 'team', team_cols, 'Iowa St.', 'Houston', season)

0.36654558939703497

In [26]:
# One row per team; dividing the per-round win counts by the number of
# simulations turns them into the share of simulations in which the team won
# that round's game.
round_labels = ['RoundOf32', 'SweetSixteen', 'EliteEight',
                'FinalFour', 'Championship', 'Champions']
df_results = pd.DataFrame.from_dict(results, orient='index', columns=round_labels).divide(n_iter)

# Most likely champions first.
df_results.sort_values(by=['Champions'], ascending=False)

Unnamed: 0,RoundOf32,SweetSixteen,EliteEight,FinalFour,Championship,Champions
Virginia,0.83,0.72,0.48,0.31,0.22,0.14
Gonzaga,0.82,0.59,0.42,0.27,0.16,0.10
Kentucky,0.87,0.69,0.46,0.31,0.14,0.10
Tennessee,0.82,0.51,0.33,0.20,0.17,0.09
Michigan,0.79,0.54,0.40,0.22,0.14,0.09
Houston,0.77,0.53,0.28,0.16,0.08,0.06
Duke,0.78,0.61,0.36,0.23,0.13,0.06
Kansas St.,0.87,0.50,0.22,0.09,0.06,0.04
Virginia Tech,0.81,0.46,0.28,0.14,0.08,0.04
Wisconsin,0.68,0.37,0.15,0.09,0.06,0.03


In [27]:
# Record each team's 1-based position in the starting bracket order.
df_results['BracketPlacement'] = np.nan
for slot, team in enumerate(starting_teams, start=1):
    df_results.at[team, 'BracketPlacement'] = slot
# The column was created as float (NaN placeholder); store it as integers.
df_results['BracketPlacement'] = df_results['BracketPlacement'].astype(int)

In [28]:
# Join the simulation results (indexed by team name) with the current-season
# team table, then index the combined frame by team name again.
df_results = (
    df_results
    .merge(df_predict, how='left', left_index=True, right_on='team')
    .set_index('team')
)

In [30]:
# Write the combined simulation results and team table to disk.
df_results.to_csv('data/2019_tourney_predictions.csv')