In [143]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.feature_selection import SelectKBest, RFE
import re
from sklearn.neighbors import KNeighborsClassifier
%matplotlib inline

# What data do we have?

In [3]:
#Seasons: one row per season with its day-zero date and the region assigned to each bracket letter (W, X, Y, Z)
seasons = pd.read_csv('Seasons.csv')
print('shape = {}'.format(seasons.shape))
seasons.tail()

shape = (33, 6)


Unnamed: 0,Season,Dayzero,Regionw,Regionx,Regiony,Regionz
28,2013,11/5/2012,East,South,Midwest,West
29,2014,11/4/2013,East,South,Midwest,West
30,2015,11/3/2014,East,South,Midwest,West
31,2016,11/2/2015,East,Midwest,South,West
32,2017,10/31/2016,East,West,Midwest,South


In [4]:
#Teams: lookup table from team Id to team name
teams = pd.read_csv('Teams.csv')
print('shape = {}'.format(teams.shape))
teams.head()

shape = (364, 2)


Unnamed: 0,Team_Id,Team_Name
0,1101,Abilene Chr
1,1102,Air Force
2,1103,Akron
3,1104,Alabama
4,1105,Alabama A&M


In [5]:
#Regular-season games in compact form (teams, scores, location, overtimes), going back to 1985
season_comp = pd.read_csv('RegularSeasonCompactResults.csv')
print('shape = {}'.format(season_comp.shape))
season_comp.head()

shape = (150684, 8)


Unnamed: 0,Season,Daynum,Wteam,Wscore,Lteam,Lscore,Wloc,Numot
0,1985,20,1228,81,1328,64,N,0
1,1985,25,1106,77,1354,70,H,0
2,1985,25,1112,63,1223,56,H,0
3,1985,25,1165,70,1432,54,H,0
4,1985,25,1192,86,1447,74,H,0


In [6]:
#Regular-season games with full box-score stats; richer than season_comp but only going back to 2003
season_detail = pd.read_csv('RegularSeasonDetailedResults.csv')
print('shape = {}'.format(season_detail.shape))
season_detail.head()

shape = (76636, 34)


Unnamed: 0,Season,Daynum,Wteam,Wscore,Lteam,Lscore,Wloc,Numot,Wfgm,Wfga,...,Lfga3,Lftm,Lfta,Lor,Ldr,Last,Lto,Lstl,Lblk,Lpf
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14


In [7]:
#Tournament games in compact form, going back to 1985
tourney_comp = pd.read_csv('TourneyCompactResults.csv')
print('shape = {}'.format(tourney_comp.shape))
tourney_comp.head()

shape = (2050, 8)


Unnamed: 0,Season,Daynum,Wteam,Wscore,Lteam,Lscore,Wloc,Numot
0,1985,136,1116,63,1234,54,N,0
1,1985,136,1120,59,1345,58,N,0
2,1985,136,1207,68,1250,43,N,0
3,1985,136,1229,58,1425,55,N,0
4,1985,136,1242,49,1325,38,N,0


In [8]:
#Tournament games with full box-score stats, going back to 2003
tourney_detail = pd.read_csv('TourneyDetailedResults.csv')
print('shape = {}'.format(tourney_detail.shape))
tourney_detail.head()

shape = (914, 34)


Unnamed: 0,Season,Daynum,Wteam,Wscore,Lteam,Lscore,Wloc,Numot,Wfgm,Wfga,...,Lfga3,Lftm,Lfta,Lor,Ldr,Last,Lto,Lstl,Lblk,Lpf
0,2003,134,1421,92,1411,84,N,1,32,69,...,31,14,31,17,28,16,15,5,0,22
1,2003,136,1112,80,1436,51,N,0,31,66,...,16,7,7,8,26,12,17,10,3,15
2,2003,136,1113,84,1272,71,N,0,31,59,...,28,14,21,20,22,11,12,2,5,18
3,2003,136,1141,79,1166,73,N,0,29,53,...,17,12,17,14,17,20,21,6,6,21
4,2003,136,1143,76,1301,74,N,1,27,64,...,21,15,20,10,26,16,14,5,8,19


In [9]:
#Seed (region letter + seed number, e.g. W01) of every team in each tournament
seeds = pd.read_csv('TourneySeeds.csv')
print('shape = {}'.format(seeds.shape))
seeds.head()

shape = (2150, 3)


Unnamed: 0,Season,Seed,Team
0,1985,W01,1207
1,1985,W02,1210
2,1985,W03,1228
3,1985,W04,1260
4,1985,W05,1374


In [10]:
#Bracket layout per season: which strong/weak seed pair feeds each slot
slots = pd.read_csv('TourneySlots.csv')
print('shape = {}'.format(slots.shape))
slots.head()

shape = (2117, 4)


Unnamed: 0,Season,Slot,Strongseed,Weakseed
0,1985,R1W1,W01,W16
1,1985,R1W2,W02,W15
2,1985,R1W3,W03,W14
3,1985,R1W4,W04,W13
4,1985,R1W5,W05,W12


In [11]:
sample = pd.read_csv('SampleSubmission.csv')

In [12]:
sample.head(20)

Unnamed: 0,Id,Pred
0,2017_1112_1116,0.5
1,2017_1112_1124,0.5
2,2017_1112_1137,0.5
3,2017_1112_1139,0.5
4,2017_1112_1153,0.5
5,2017_1112_1166,0.5
6,2017_1112_1173,0.5
7,2017_1112_1181,0.5
8,2017_1112_1190,0.5
9,2017_1112_1195,0.5


In [13]:
sample.shape

(2278, 2)

In [19]:
len(teams)

364

In [20]:
season_detail.head(10)

Unnamed: 0,Season,Daynum,Wteam,Wscore,Lteam,Lscore,Wloc,Numot,Wfgm,Wfga,...,Lfga3,Lftm,Lfta,Lor,Ldr,Last,Lto,Lstl,Lblk,Lpf
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14
5,2003,11,1458,81,1186,55,H,0,26,57,...,11,12,17,6,22,8,19,4,3,25
6,2003,12,1161,80,1236,62,H,0,23,55,...,15,20,28,9,21,11,30,10,4,28
7,2003,12,1186,75,1457,61,N,0,28,62,...,17,17,23,8,25,10,15,14,8,18
8,2003,12,1194,71,1156,66,N,0,28,58,...,18,12,27,13,26,13,25,8,2,18
9,2003,12,1458,84,1296,56,H,0,32,67,...,14,7,12,9,23,10,18,1,3,18


# Preprocessing

In [163]:
#Make some functions to be used in apply methods

def id_maker(x):
    """Build the Kaggle game Id for one game row.

    The Id has the form yyyy_lower-team-Id_higher-team-Id, where the two team
    Ids are taken from the row's 'Wteam' and 'Lteam' fields and the year from
    its 'Season' field.
    """
    first_id = min(x['Wteam'], x['Lteam'])
    second_id = max(x['Wteam'], x['Lteam'])
    return '{}_{}_{}'.format(x['Season'], first_id, second_id)

def wl_grabber(x):
    """Return 1 when the team with the lower Id won the game in row `x`, else 0."""
    winner = x['Wteam']
    loser = x['Lteam']
    return int(winner == min(winner, loser))

def col_agg(x,cols,df,n_games=3):
    """Average each team's stats over the games it played just before game `x`.

    Teams are ordered by Id: the lower-Id team's averages are prefixed with 'L'
    and the higher-Id team's with 'U'. (In the returned Series 'L' means
    "lower Id", not "loser" as it does in the raw result columns.)

    Parameters
    ----------
    x : pd.Series
        One row of `df`, as passed by ``df.apply(..., axis=1)``. Must provide
        'Wteam', 'Lteam' and 'Season'; ``x.name`` must be the row's index label
        in `df`.
    cols : list of str
        Un-prefixed stat names, e.g. ['score','fgm','fga',...]. `df` must
        contain both 'W'+col and 'L'+col for each of them.
    df : pd.DataFrame
        Game results to aggregate over (here season_detail or season_comp).
        Games are ordered by index label, which is taken as chronological order.
    n_games : int, default 3
        Number of immediately preceding games (same team, same season) to average.

    Returns
    -------
    pd.Series
        'L'+col values followed by 'U'+col values. A team's values are all NaN
        when it has fewer than `n_games` earlier games in that season.

    Raises
    ------
    ValueError
        If ``x.name`` is not among the index labels of a team's season games.
    """
    def prior_game_avgs(team_id):
        """Mean of `cols` over the `n_games` games `team_id` played right before game x."""
        in_season = df.Season == x['Season']
        #Collect the team's own stats whether it won (W-columns) or lost (L-columns)
        won_games = df.loc[(df.Wteam == team_id) & in_season, ['W'+col for col in cols]]\
            .rename(columns={'W'+col: col for col in cols})
        lost_games = df.loc[(df.Lteam == team_id) & in_season, ['L'+col for col in cols]]\
            .rename(columns={'L'+col: col for col in cols})
        games = pd.concat([won_games, lost_games]).sort_index()

        #Position of the current game within this team's season
        position = list(games.index).index(x.name)

        #Not enough history: return NaNs explicitly. A plain [position-n:position]
        #slice would get a negative start here and wrap around to the end of the season.
        if position < n_games:
            return games.iloc[0:0].mean()
        return games.iloc[position-n_games:position].mean()

    #Assign upper and lower teams based on team ID
    lower_team = min(x['Wteam'],x['Lteam'])
    higher_team = max(x['Wteam'],x['Lteam'])

    low_avgs = prior_game_avgs(lower_team).rename({col:'L'+col for col in cols})
    up_avgs = prior_game_avgs(higher_team).rename({col:'U'+col for col in cols})

    #Return the aggregated values in lower, upper order
    return pd.concat([low_avgs,up_avgs])

In [22]:
#Build the game Id and the target (did the lower-Id team win?) for every regular-season game
game = season_detail.apply(id_maker,axis=1)
won = season_detail.apply(wl_grabber,axis=1)
season_labels = pd.concat([game,won],axis=1,keys=['game','won'])

#Set columns to perform agg on
agg_cols = ['score','fgm','fga','fgm3','fga3','ftm','fta','or','dr','ast','to','stl','blk','pf']

#Use season_detail to get stats (only use games from 2003+)
season_avgs = season_detail.apply(col_agg,axis=1,args=(agg_cols,season_detail))
feature_mat = pd.concat([season_labels,season_avgs],axis=1).set_index('game')
feature_mat.head()

Unnamed: 0_level_0,won,Lscore,Lfgm,Lfga,Lfgm3,Lfga3,Lftm,Lfta,Lor,Ldr,...,Ufga3,Uftm,Ufta,Uor,Udr,Uast,Uto,Ustl,Ublk,Upf
game,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2003_1104_1328,1,,,,,,,,,,...,,,,,,,,,,
2003_1272_1393,1,,,,,,,,,,...,,,,,,,,,,
2003_1266_1437,1,,,,,,,,,,...,,,,,,,,,,
2003_1296_1457,1,,,,,,,,,,...,,,,,,,,,,
2003_1208_1400,0,,,,,,,,,,...,,,,,,,,,,


In [24]:
#Games early in a season have no complete 3-game history within that season, so their
#aggregate stats are NA; discard those rows
feature_mat = feature_mat.dropna()

In [25]:
#Split the frame into the target ('won') and the remaining feature columns
y = feature_mat['won']
X = feature_mat.drop(columns='won')

In [26]:
#train-test split (seeded so the split, and every score derived from it, is reproducible on re-run)
X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=.75,random_state=42)



In [117]:
#Scaling: learn mean/variance on the training split only, then apply the same transform to both splits
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Model Selection/Tuning

In [164]:
#Seed the forest so the grid-search scores and the selected parameters are reproducible
rf = RandomForestClassifier(random_state=42)
params  = {
    'n_estimators' : np.arange(15,50,5),
    'max_depth' : [None,5,10,15]
}

#Use 'neg_log_loss' for scoring as that's how the Kaggle competition uses log loss for evaluation
gs = GridSearchCV(rf,param_grid=params,verbose=1,scoring='neg_log_loss')
gs.fit(X_train,y_train)

#Multiply score by -1 to convert to log loss from neg_log_loss
print(gs.best_score_ * -1)
print(gs.best_params_)

Fitting 3 folds for each of 28 candidates, totalling 84 fits


[Parallel(n_jobs=1)]: Done  84 out of  84 | elapsed:  2.9min finished


0.655687506448


In [165]:
print(gs.best_params_)

{'max_depth': 10, 'n_estimators': 45}


In [90]:
#This is similar to the col_agg fct, but slightly different to account for the fact that the tourney games
#are in a different df than the season games that the stats are pulled from. It also pulls the team seeds from
#seeds df.
def tourney_col_agg(x,cols,df,seeds_df=None,n_games=3):
    """Build the features for one tournament game from each team's last season games.

    Teams are ordered by Id: the lower-Id team's values are prefixed with 'L'
    and the higher-Id team's with 'U'.

    Parameters
    ----------
    x : pd.Series
        One tournament game row providing 'Wteam', 'Lteam' and 'Season'.
    cols : list of str
        Un-prefixed stat names; `df` must contain 'W'+col and 'L'+col for each.
    df : pd.DataFrame
        Season game results the stats are pulled from (a different frame than
        the one `x` comes from). Games are ordered by index label.
    seeds_df : pd.DataFrame, optional
        Table with 'Season', 'Team' and 'Seed' columns. Defaults to the
        notebook-level `seeds` frame.
    n_games : int, default 3
        Number of final games of the season to average for each team.

    Returns
    -------
    pd.Series
        'L'+col values, 'U'+col values, then 'Lseed' and 'Useed' (the numeric
        part of each team's seed string).

    Raises
    ------
    IndexError
        If either team has no seed entry for the game's season.
    """
    if seeds_df is None:
        seeds_df = seeds
    season = x['Season']

    def last_game_avgs(team_id):
        """Mean of `cols` over the last `n_games` games `team_id` played in `season`."""
        in_season = df.Season == season
        won_games = df.loc[(df.Wteam == team_id) & in_season, ['W'+col for col in cols]]\
            .rename(columns={'W'+col: col for col in cols})
        lost_games = df.loc[(df.Lteam == team_id) & in_season, ['L'+col for col in cols]]\
            .rename(columns={'L'+col: col for col in cols})
        games = pd.concat([won_games, lost_games]).sort_index()
        #n_games == 0 must give an empty window; a plain [-0:] slice would select everything
        window = games.iloc[-n_games:] if n_games > 0 else games.iloc[0:0]
        return window.mean()

    def seed_number(team_id):
        """Numeric part of the team's seed string, e.g. 'W01' -> 1."""
        seed_str = seeds_df.loc[(seeds_df.Team == team_id) & (seeds_df.Season == season),'Seed'].values[0]
        #Raw string: '\d' in a normal literal is an invalid escape sequence
        return int(re.findall(r'\d+', seed_str)[0])

    #Assign upper and lower teams based on team ID
    lower_team = min(x['Wteam'],x['Lteam'])
    higher_team = max(x['Wteam'],x['Lteam'])

    low_avgs = last_game_avgs(lower_team).rename({col:'L'+col for col in cols})
    up_avgs = last_game_avgs(higher_team).rename({col:'U'+col for col in cols})
    seed_vals = pd.Series({'Lseed': seed_number(lower_team), 'Useed': seed_number(higher_team)})

    #Return the aggregated values in lower, upper order, followed by the seeds
    return pd.concat([low_avgs,up_avgs,seed_vals])

In [96]:
#Build the tournament feature matrix with the functions defined above
tourney_game = tourney_detail.apply(id_maker,axis=1)
tourney_won = tourney_detail.apply(wl_grabber,axis=1)
tourney_labels = pd.concat([tourney_game,tourney_won],axis=1,keys=['game','won'])
#Stats come from the regular-season games (season_detail), not from the tournament games themselves
tourney_avgs = tourney_detail.apply(tourney_col_agg,axis=1,args=(agg_cols,season_detail))
tourney_feature_mat = pd.concat([tourney_labels,tourney_avgs],axis=1).set_index('game')

In [160]:
#Make win probability predictions using the model fitted with season game data and use that as another feature
#Drop 'pred_prob' too (if present) so this cell can be re-run after the column has been added
tourney_stats = tourney_feature_mat.drop(['won','Lseed','Useed','pred_prob'],axis=1,errors='ignore')
#Use fresh scaler objects instead of refitting `sc`, so the scaler fitted on the season training data is not overwritten
#NOTE(review): gs was trained on features scaled by `sc` (fit on X_train); here the stats are standardised
#with the tournament rows' own mean/variance instead - confirm whether sc.transform was intended
temp_sc = StandardScaler().fit_transform(tourney_stats)
preds = gs.predict_proba(temp_sc)[:,1]
tourney_feature_mat['pred_prob'] = preds
tourney_sc = StandardScaler().fit_transform(tourney_feature_mat.drop('won',axis=1))

In [161]:
#Grid search a Random Forest (seeded so the reported log loss and best parameters are reproducible)
params  = {
    'n_estimators' : np.arange(20,100,10),
    'max_depth' : [5,10,15],
    'min_samples_leaf' : [2,5,7,10]
}
tourney_gs = GridSearchCV(RandomForestClassifier(random_state=42),param_grid=params,verbose=1,scoring='neg_log_loss')
tourney_gs.fit(tourney_sc,tourney_feature_mat.won)
#Multiply score by -1 to convert to log loss from neg_log_loss
print(tourney_gs.best_score_*-1)
print(tourney_gs.best_params_)

Fitting 3 folds for each of 96 candidates, totalling 288 fits
0.586745528499
{'max_depth': 15, 'min_samples_leaf': 5, 'n_estimators': 30}


[Parallel(n_jobs=1)]: Done 288 out of 288 | elapsed:   29.5s finished


In [186]:
#Grid search using Logistic Regression
params  = {
    'penalty' : ['l1','l2'],
    'C' : [.01,.1,1,10],
}
#Pin solver='liblinear': it supports both 'l1' and 'l2', whereas the current default solver
#('lbfgs') rejects 'l1', so the l1 candidates would fail to fit
tourney_gs = GridSearchCV(LogisticRegression(solver='liblinear'),param_grid=params,verbose=1,scoring='neg_log_loss')
tourney_gs.fit(tourney_sc,tourney_feature_mat.won)
#Multiply score by -1 to convert to log loss from neg_log_loss
print(tourney_gs.best_score_*-1)
print(tourney_gs.best_params_)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
0.558256505041
{'C': 0.1, 'penalty': 'l1'}


[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed:    0.1s finished


Logistic Regression appears to perform the best, producing a log loss of 0.558.

# 2017 Predictions

In [185]:
#Ids of all teams seeded in the 2017 tournament, in ascending order.
#np.sort returns a sorted copy; sorting the array from .values in place fails when pandas
#hands back a read-only array (copy-on-write).
playoff_teams = np.sort(seeds.loc[seeds.Season==2017,'Team'].values)

#One Kaggle game Id (2017_lower-team-Id_higher-team-Id) for every possible pairing of those teams
lis = ['2017_{}_{}'.format(str(team_1),str(team_2))
       for team_1 in playoff_teams
       for team_2 in playoff_teams
       if team_2 > team_1]

In [190]:
features_2017 = pd.DataFrame({'id':lis})

In [191]:
features_2017.head()

Unnamed: 0,id
0,2017_1112_1116
1,2017_1112_1124
2,2017_1112_1137
3,2017_1112_1139
4,2017_1112_1153


In [193]:
#One last stat aggregator
def col_agg_2017(x,cols,df,seeds_df=None,n_games=3):
    """Build the features for one potential matchup identified only by its game Id.

    Parameters
    ----------
    x : pd.Series
        Row with an 'id' field of the form yyyy_lower-team-Id_higher-team-Id.
    cols : list of str
        Un-prefixed stat names; `df` must contain 'W'+col and 'L'+col for each.
    df : pd.DataFrame
        Season game results the stats are pulled from. Games are ordered by
        index label.
    seeds_df : pd.DataFrame, optional
        Table with 'Season', 'Team' and 'Seed' columns. Defaults to the
        notebook-level `seeds` frame.
    n_games : int, default 3
        Number of final games of the season to average for each team.

    Returns
    -------
    pd.Series
        'L'+col values (first team in the Id), 'U'+col values (second team),
        then 'Lseed' and 'Useed' (the numeric part of each team's seed string).

    Raises
    ------
    IndexError
        If either team has no seed entry for the season encoded in the Id.
    """
    if seeds_df is None:
        seeds_df = seeds

    #The Id carries everything needed: season, then the two team Ids
    season_str, lower_str, higher_str = x['id'].split('_')[:3]
    season = int(season_str)
    lower_team = int(lower_str)
    higher_team = int(higher_str)

    def last_game_avgs(team_id):
        """Mean of `cols` over the last `n_games` games `team_id` played in `season`."""
        in_season = df.Season == season
        won_games = df.loc[(df.Wteam == team_id) & in_season, ['W'+col for col in cols]]\
            .rename(columns={'W'+col: col for col in cols})
        lost_games = df.loc[(df.Lteam == team_id) & in_season, ['L'+col for col in cols]]\
            .rename(columns={'L'+col: col for col in cols})
        games = pd.concat([won_games, lost_games]).sort_index()
        #n_games == 0 must give an empty window; a plain [-0:] slice would select everything
        window = games.iloc[-n_games:] if n_games > 0 else games.iloc[0:0]
        return window.mean()

    def seed_number(team_id):
        """Numeric part of the team's seed string, e.g. 'W01' -> 1."""
        seed_str = seeds_df.loc[(seeds_df.Team == team_id) & (seeds_df.Season == season),'Seed'].values[0]
        #Raw string: '\d' in a normal literal is an invalid escape sequence
        return int(re.findall(r'\d+', seed_str)[0])

    low_avgs = last_game_avgs(lower_team).rename({col:'L'+col for col in cols})
    up_avgs = last_game_avgs(higher_team).rename({col:'U'+col for col in cols})
    seed_vals = pd.Series({'Lseed': seed_number(lower_team), 'Useed': seed_number(higher_team)})

    #Return the aggregated values in lower, upper order, followed by the seeds
    return pd.concat([low_avgs,up_avgs,seed_vals])

In [195]:
features_2017 = pd.concat([features_2017,features_2017.apply(col_agg_2017,axis=1,args=(agg_cols,season_detail))],axis=1)

In [201]:
features_2017.set_index('id',inplace=True)

In [202]:
features_2017.head()

Unnamed: 0_level_0,Lscore,Lfgm,Lfga,Lfgm3,Lfga3,Lftm,Lfta,Lor,Ldr,Last,...,Ufta,Uor,Udr,Uast,Uto,Ustl,Ublk,Upf,Lseed,Useed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017_1112_1116,87.0,29.0,53.333333,8.666667,18.666667,20.333333,29.0,8.0,27.666667,14.0,...,17.333333,9.333333,25.333333,11.0,12.0,8.666667,5.0,17.666667,2.0,8.0
2017_1112_1124,87.0,29.0,53.333333,8.666667,18.666667,20.333333,29.0,8.0,27.666667,14.0,...,21.666667,16.333333,24.666667,14.0,15.333333,4.666667,3.333333,18.333333,2.0,3.0
2017_1112_1137,87.0,29.0,53.333333,8.666667,18.666667,20.333333,29.0,8.0,27.666667,14.0,...,16.0,7.333333,29.666667,17.333333,10.666667,6.666667,6.0,19.333333,2.0,13.0
2017_1112_1139,87.0,29.0,53.333333,8.666667,18.666667,20.333333,29.0,8.0,27.666667,14.0,...,16.666667,7.333333,18.0,11.0,8.666667,8.666667,3.0,17.333333,2.0,4.0
2017_1112_1153,87.0,29.0,53.333333,8.666667,18.666667,20.333333,29.0,8.0,27.666667,14.0,...,27.333333,14.333333,22.0,8.333333,6.666667,6.0,2.0,19.0,2.0,6.0


In [203]:
#Make win probability predictions using the model fitted with season game data and use that as another feature.
#The 2017 rows must be scaled with the statistics of the tournament rows the models were fed with,
#not with statistics refitted on the 2017 rows themselves, otherwise the models see shifted inputs.
tourney_features = tourney_feature_mat.drop('won',axis=1)                        #stats + seeds + pred_prob
tourney_stats = tourney_features.drop(['Lseed','Useed','pred_prob'],axis=1)      #stats only

#Stage 1: season-model win probability, using the same stat scaling as the tournament rows
stage1_scaler = StandardScaler().fit(tourney_stats)
#Select columns by name so the cell can be re-run after 'pred_prob' has been added
temp_sc = stage1_scaler.transform(features_2017[tourney_stats.columns])
preds = gs.predict_proba(temp_sc)[:,1]
features_2017['pred_prob'] = preds

#Stage 2: scale the full feature set exactly as the tournament training matrix was scaled
stage2_scaler = StandardScaler().fit(tourney_features)
sc_2017 = stage2_scaler.transform(features_2017[tourney_features.columns])

In [211]:
#Submission frame for the Kaggle competition: one row per 2017 matchup Id with the
#predicted probability that the lower-Id team wins
win_probs_2017 = tourney_gs.predict_proba(sc_2017)[:,1]
preds_2017 = pd.DataFrame({'id':features_2017.index, 'pred':win_probs_2017})