In [1]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.metrics import log_loss
import os

In [2]:
# Read every CSV in the working directory into a dict keyed by file name.
# To inspect what was loaded: for name in f: print(name, list(f[name].columns))
f = {}
for csv_name in os.listdir():
    if csv_name.endswith('.csv'):
        f[csv_name.split('/')[-1]] = pd.read_csv(csv_name, encoding='latin1')

season_dresults = f['MRegularSeasonDetailedResults.csv']
tourney_dresults = f['MNCAATourneyDetailedResults.csv']
slots = f['MNCAATourneySlots.csv']

# Seed lookup keyed by '<Season>_<TeamID>'. Characters 1-2 of the seed code
# are parsed as the integer seed; the first character and any trailing
# characters are discarded.
seed_table = f['MNCAATourneySeeds.csv']
seeds = {}
for season, seed_code, team_id in seed_table[['Season', 'Seed', 'TeamID']].values:
    seeds['_'.join([str(int(season)), str(team_id)])] = int(seed_code[1:3])

# Copy every 2021 seed onto the matching 2022 key. These copies take
# precedence over any 2022 entry already present in the seeds file.
carried_over = {
    key.replace('2021_', '2022_'): seed
    for key, seed in seeds.items()
    if '2021_' in key
}
seeds = {**seeds, **carried_over}

sub = f['MSampleSubmissionStage1.csv']

In [3]:
# Tag each game with its origin so the two frames can be told apart after
# they are concatenated: 'S' for regular season, 'T' for tournament.
for results_frame, type_tag in ((season_dresults, 'S'), (tourney_dresults, 'T')):
    results_frame['ST'] = type_tag

In [224]:
# Build one frame of regular-season + tournament games and derive the
# per-game features. Everything below is vectorized column arithmetic;
# row-wise DataFrame.apply(axis=1) is avoided because it runs a Python
# lambda once per game for every derived column.
games = pd.concat((season_dresults, tourney_dresults), axis=0, ignore_index=True)
games['WLoc'] = games['WLoc'].map({'A': -1, 'H': 1, 'N': 0})

# Canonical orientation: Team1 is the lower TeamID of the pair, Team2 the
# higher one, regardless of which of them won.
team_pair = games[['WTeamID', 'LTeamID']]
team1 = team_pair.min(axis=1)
team2 = team_pair.max(axis=1)
season_txt = games['Season'].astype(str)
team1_txt = team1.astype(str)
team2_txt = team2.astype(str)

games['ID'] = season_txt + '_' + team1_txt + '_' + team2_txt
games['IDTeams'] = team1_txt + '_' + team2_txt
games['Team1'] = team1
games['Team2'] = team2
games['IDTeam1'] = season_txt + '_' + team1_txt
games['IDTeam2'] = season_txt + '_' + team2_txt

# Teams without an entry in the seed lookup get seed 0.
games['Team1Seed'] = games['IDTeam1'].map(seeds).fillna(0)
games['Team2Seed'] = games['IDTeam2'].map(seeds).fillna(0)

games['ScoreDiff'] = games['WScore'] - games['LScore']

# Target: 1.0 when Team1 (lower TeamID) is the winner, else 0.0.
team1_won = games['WTeamID'] == team1
games['Pred'] = team1_won.astype(float)

# Score margin from Team1's point of view (negative when Team1 lost).
games['ScoreDiffNorm'] = games['ScoreDiff'].where(team1_won, -games['ScoreDiff'])
games['SeedDiff'] = games['Team1Seed'] - games['Team2Seed']

# Shooting / passing ratios for the winner (W*) and the loser (L*).
for side in ('W', 'L'):
    games[side + 'EFFG'] = games[side + 'FGM'] / games[side + 'FGA']
    games[side + 'EFFG3'] = games[side + 'FGM3'] / games[side + 'FGA3']
    games[side + 'DARE'] = games[side + 'FGM3'] / games[side + 'FGM']
    games[side + 'TOQUETOQUE'] = games[side + 'Ast'] / games[side + 'FGM']

# Box-score differences oriented as Team1 minus Team2: winner-minus-loser
# when Team1 won, loser-minus-winner otherwise.
orientation = np.where(team1_won, 1, -1)
for stat in ('FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA',
             'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF'):
    games['D' + stat] = orientation * (games['W' + stat] - games['L' + stat])

# NaN (e.g. a 0/0 ratio) becomes the sentinel -1; +/-inf is not touched by fillna.
games = games.fillna(-1)
games.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,DFGA3,DFTM,DFTA,DOR,DDR,DAst,DTO,DStl,DBlk,DPF
0,2003,10,1104,68,1328,62,0,0,27,58,...,4,-5,-4,4,2,5,5,-2,-1,2
1,2003,10,1272,70,1393,63,0,0,26,62,...,-4,1,-1,-5,3,9,1,-4,-2,2
2,2003,11,1266,73,1437,61,0,0,24,58,...,-8,3,6,-14,4,6,-2,3,-3,2
3,2003,11,1296,56,1457,50,0,0,18,38,...,-13,9,16,-11,-1,2,-7,10,-1,-5
4,2003,11,1400,77,1208,71,0,0,30,61,...,2,6,14,4,-7,0,-4,3,-3,-6


In [225]:
# Keep only games from seasons before 2016.
before_2016 = games['Season'] < 2016
games = games.loc[before_2016]
games.shape

(66719, 68)

In [227]:
# Per-game columns that are summarised per team pair (IDTeams).
_box_stats = ['FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA',
              'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF']
_ratio_stats = ['EFFG', 'EFFG3', 'DARE', 'TOQUETOQUE']
c_score_col = (
    ['ScoreDiffNorm', 'Pred', 'SeedDiff', 'NumOT']
    + ['W' + stat for stat in _box_stats]
    + ['L' + stat for stat in _box_stats]
    + ['W' + stat for stat in _ratio_stats]
    + ['L' + stat for stat in _ratio_stats]
    + ['D' + stat for stat in _box_stats]
)
c_score_agg = ['sum', 'mean', 'median', 'max', 'min', 'std', 'skew']

# One row per team pair, one column per (stat, aggregate). The two-level
# column labels are flattened to '<stat><agg>_c_score'; the group key
# therefore ends up named 'IDTeams_c_score'.
# NOTE(review): 'Pred' and 'ScoreDiffNorm' are aggregated over every game of
# the pair, including the row the aggregate is later merged onto, so the
# target is visible in the features -- confirm this leakage is intended.
gb = games.groupby('IDTeams').agg(dict.fromkeys(c_score_col, c_score_agg)).reset_index()
gb.columns = [stat + agg + '_c_score' for stat, agg in gb.columns]

# Training rows are regular-season games only ('S'), each enriched with the
# aggregates of its team pair.
# NOTE(review): `games` is overwritten here, so re-running this cell would
# rebuild `gb` from regular-season rows only.
games = games.loc[games['ST'] == 'S'].merge(
    gb, how='left', left_on='IDTeams', right_on='IDTeams_c_score'
)

# Feature list: every column that is neither an identifier, the target,
# nor one of the raw per-game stats summarised above.
_not_features = set(
    ['ID', 'DayNum', 'ST', 'Team1', 'Team2', 'IDTeams', 'IDTeam1', 'IDTeam2',
     'WTeamID', 'WScore', 'LTeamID', 'LScore', 'NumOT', 'Pred', 'ScoreDiff',
     'ScoreDiffNorm', 'WLoc', 'IDTeams_c_score']
    + c_score_col
)
col = [name for name in games.columns if name not in _not_features]
games.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,DBlkmin_c_score,DBlkstd_c_score,DBlkskew_c_score,DPFsum_c_score,DPFmean_c_score,DPFmedian_c_score,DPFmax_c_score,DPFmin_c_score,DPFstd_c_score,DPFskew_c_score
0,2003,10,1104,68,1328,62,0,0,27,58,...,-3,2.081666,0.0,2,0.5,2.0,4,-6,4.434712,-1.719868
1,2003,10,1272,70,1393,63,0,0,26,62,...,-2,3.304038,1.559507,25,6.25,6.5,10,2,3.5,-0.3207
2,2003,11,1266,73,1437,61,0,0,24,58,...,-3,1.548438,-0.009451,1,0.052632,0.0,8,-10,5.482028,-0.314478
3,2003,11,1296,56,1457,50,0,0,18,38,...,-1,1.154701,-1.732051,-4,-1.333333,-2.0,3,-5,4.041452,0.722109
4,2003,11,1400,77,1208,71,0,0,30,61,...,-3,,,-6,-6.0,-6.0,-6,-6,,


In [243]:
import xgboost as xgb

# Booster settings for the gradient-boosted trees; each of the 200 rounds
# grows 10 parallel trees.
params = dict(
    objective='reg:logistic',
    eval_metric='auc',
    booster='gbtree',
    eta=0.1,
    subsample=0.3,
    colsample_bytree=0.6,
    num_parallel_tree=10,
    min_child_weight=40,
    gamma=10,
    max_depth=5,
)

# Missing feature values are replaced by the sentinel -1 before training.
train_features = games[col].fillna(-1)
xgb_DM = xgb.DMatrix(train_features, label=games['Pred'])
model = xgb.train(params, xgb_DM, num_boost_round=200)

In [244]:
# In-sample check: the model is scored on the same matrix it was trained on,
# so this figure is not an estimate of out-of-sample performance.
# Probabilities are clipped away from exactly 0 and 1 before scoring.
pred = np.clip(model.predict(xgb_DM), 0.0000001, 0.9999999)
train_loss = log_loss(games['Pred'], pred)
print('Log Loss:', train_loss)

Log Loss: 0.4467199582407012


In [245]:
# Reference outcomes. Rows whose Pred is exactly 0.5 are left out of scoring.
correct_file = pd.read_csv('perfect_submission.csv')
idx = correct_file['Pred'].ne(0.5)
true_pred = correct_file.loc[idx]

In [250]:
# Build the feature frame for the submission rows.
# Work on a copy: the columns added below would otherwise be written into the
# frame cached in `f`, and a second run of this cell would re-map an
# already-mapped WLoc column to NaN.
sub = f['MSampleSubmissionStage1.csv'].copy()

# WLoc is not part of the model features (`col` excludes it). Encode it when
# the file provides it; otherwise fill in the neutral-site code.
# NOTE(review): assumes submission match-ups are neutral-site -- confirm.
wloc_codes = {'A': -1, 'H': 1, 'N': 0}
if 'WLoc' in sub.columns:
    sub['WLoc'] = sub['WLoc'].map(wloc_codes)
else:
    sub['WLoc'] = wloc_codes['N']

# ID has the form '<Season>_<Team1>_<Team2>'. Season becomes an int; the
# team ids stay strings, as they are only used to build lookup keys.
id_parts = sub['ID'].str.split('_', expand=True)
sub['Season'] = id_parts[0].astype(int)
sub['Team1'] = id_parts[1]
sub['Team2'] = id_parts[2]

sub_season_txt = sub['Season'].astype(str)
sub['IDTeams'] = sub['Team1'] + '_' + sub['Team2']
sub['IDTeam1'] = sub_season_txt + '_' + sub['Team1']
sub['IDTeam2'] = sub_season_txt + '_' + sub['Team2']

# Teams without an entry in the seed lookup get seed 0.
sub['Team1Seed'] = sub['IDTeam1'].map(seeds).fillna(0)
sub['Team2Seed'] = sub['IDTeam2'].map(seeds).fillna(0)
sub['SeedDiff'] = sub['Team1Seed'] - sub['Team2Seed']

sub = sub.fillna(-1)

# Attach the per-pair aggregates; pairs absent from `gb` get NaN in every
# aggregate column.
sub = pd.merge(sub, gb, how='left', left_on='IDTeams', right_on='IDTeams_c_score')

In [247]:
# Predict the submission rows with the same feature list and -1 sentinel
# used for training, clipping probabilities away from exactly 0 and 1.
test_features = sub[col].fillna(-1)
sub_pred = np.clip(model.predict(xgb.DMatrix(test_features)), 0.0000001, 0.9999999)
sub['Pred'] = sub_pred

# NOTE(review): `idx` was computed on correct_file, so this selection relies
# on `sub` having the same row order and index as perfect_submission.csv.
scored_pred = sub.loc[idx, 'Pred']
print('Log Loss:', log_loss(true_pred['Pred'], scored_pred))

Log Loss: 2.093083246091721


In [236]:

# Write the submission file: only the ID and Pred columns, no index column.
sub.to_csv('st_submission.csv', columns=['ID', 'Pred'], index=False)

In [249]:
# Persist the full training frame (all columns) without the index column.
games.to_csv(path_or_buf='train.csv', index=False)

In [252]:
# Persist the full submission feature frame (all columns) without the index column.
sub.to_csv(path_or_buf='test.csv', index=False)