In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss

import os, sys

from src.data import make_dataset

from subprocess import check_output
#print(check_output(["ls", "../input/datafiles"]).decode("utf8"))

In [2]:
data_dir = '../input/datafiles/'
df_tour = pd.read_csv(data_dir + 'NCAATourneyCompactResults.csv')
df_advstats = pd.read_csv(data_dir + 'AdvAvg_Stats.csv')
#df_alpha = pd.read_csv(data_dir + 'alpha.csv')
#df_beta = pd.read_csv(data_dir + 'beta.csv')
df_conf = pd.read_csv(data_dir + 'TeamConferences.csv')

In [3]:
# merging the team conferences into the advstats
df_advstats = pd.merge(left=df_advstats, right=df_conf, how='left',
                      left_on=['Season', 'TEAMID'], right_on=['Season', 'TeamID'])

In [4]:
# merging the alpha and betas into the advstats
#df_advstats = pd.merge(left=df_advstats, right=df_alpha, how='left',
#                       left_on=['Season', 'TEAMID'], right_on=['Season', 'team_id'])

#df_advstats = pd.merge(left=df_advstats, right=df_beta, how='left',
#                       left_on=['Season', 'ConfAbbrev'], right_on=['Season', 'conf_id'])

df_advstats.tail()

Unnamed: 0,Season,TEAMID,DayNum,SCORE,O_TEAMID,O_SCORE,FGM,FGA,FGM3,FGA3,...,O_TOV,DEFRTG,OFFRTG,TOR,O_TOR,STLTO,O_STLTO,PIE,TeamID,ConfAbbrev
5829,2019,1462,65.30303,71.878788,1269.757576,70.606061,26.121212,55.848485,7.0,21.030303,...,0.12827,105.617057,107.398112,14.507433,12.038266,0.498428,0.677313,0.520061,1462,big_east
5830,2019,1463,74.357143,80.892857,1226.607143,73.714286,29.821429,60.107143,7.785714,20.821429,...,0.1201,100.648883,111.3956,13.474279,11.826174,0.485324,0.684144,0.580806,1463,ivy
5831,2019,1464,61.8,73.5,1291.3,79.233333,26.833333,63.633333,9.566667,28.0,...,0.124648,112.860682,104.430941,13.526967,12.333482,0.520204,0.561064,0.420928,1464,horizon
5832,2019,1465,65.884615,75.461538,1304.423077,75.192308,26.038462,59.038462,8.807692,25.230769,...,0.116796,107.517094,108.3867,13.816599,11.938762,0.469579,0.551258,0.493732,1465,wac
5833,2019,1466,64.862069,65.62069,1302.068966,74.931034,22.931034,59.655172,7.241379,24.344828,...,0.13519,107.139871,92.913774,15.033538,13.258713,0.494909,0.595332,0.377009,1466,a_sun


In [5]:
#list(df_advstats.columns.values)

In [6]:
df_advstats = df_advstats[['Season', 'TEAMID', 'O_TEAMID', 'PIE',
                           'EFG', 'ORB', 'DRB', 'TS']]

In [7]:
df_advstats.tail()

Unnamed: 0,Season,TEAMID,O_TEAMID,PIE,EFG,ORB,DRB,TS
5829,2019,1462,1269.757576,0.520061,0.532344,0.318135,0.727182,0.562719
5830,2019,1463,1226.607143,0.580806,0.56095,0.258883,0.75903,0.592507
5831,2019,1464,1291.3,0.420928,0.49824,0.325596,0.711715,0.524869
5832,2019,1465,1304.423077,0.493732,0.519118,0.286488,0.718024,0.562946
5833,2019,1466,1302.068966,0.377009,0.445168,0.26876,0.752489,0.484899


Drop unneeded variables and pre 2003 tournaments (since we don't have advanced stats for these)

In [8]:
df_tour.drop(labels=['DayNum', 'WScore', 'LScore', 'WLoc', 'NumOT'], inplace=True, axis=1)
df_tour = df_tour[df_tour['Season'] >= 2003]
df_tour_win = df_tour.copy()
df_tour_lose = df_tour.copy()

df_tour_win['Win'] = 1
df_tour_lose['Win'] = 0

We put in the advanced box scores for the winning and losing team. For symmetry, we do this twice (again for the losses).

In [9]:
df_tour_win = pd.merge(left=df_tour_win, right=df_advstats, how='left', left_on=['Season', 'WTeamID'],
                   right_on=['Season', 'TEAMID'], suffixes=('', '_1'))
df_tour_win = pd.merge(left=df_tour_win, right=df_advstats, how='left', left_on=['Season', 'LTeamID'],
                   right_on=['Season', 'TEAMID'], suffixes=('', '_2'))
df_tour_lose = pd.merge(left=df_tour_lose, right=df_advstats, how='left', left_on=['Season', 'LTeamID'],
                   right_on=['Season', 'TEAMID'], suffixes=('', '_1'))
df_tour_lose = pd.merge(left=df_tour_lose, right=df_advstats, how='left', left_on=['Season', 'WTeamID'],
                   right_on=['Season', 'TEAMID'], suffixes=('', '_2'))

In [10]:
df_tour_win.head()

Unnamed: 0,Season,WTeamID,LTeamID,Win,TEAMID,O_TEAMID,PIE,EFG,ORB,DRB,TS,TEAMID_2,O_TEAMID_2,PIE_2,EFG_2,ORB_2,DRB_2,TS_2
0,2003,1421,1411,1,1421,1253.241379,0.440672,0.4898,0.347184,0.625619,0.541825,1411,1249.4,0.547692,0.503036,0.360133,0.681723,0.539819
1,2003,1112,1436,1,1112,1329.892857,0.654693,0.517632,0.394027,0.68056,0.557334,1436,1234.482759,0.561326,0.494732,0.380016,0.732898,0.528383
2,2003,1113,1272,1,1113,1320.758621,0.58516,0.517334,0.39808,0.68395,0.557354,1272,1296.103448,0.597778,0.498337,0.366803,0.683904,0.534496
3,2003,1141,1166,1,1141,1243.655172,0.569291,0.572835,0.359177,0.662368,0.624037,1166,1285.818182,0.644084,0.567455,0.339233,0.681826,0.596901
4,2003,1143,1301,1,1143,1320.448276,0.55613,0.524098,0.32231,0.692086,0.556456,1301,1279.1,0.541945,0.534189,0.308813,0.681661,0.58211


In [11]:
#list(df_tour_win.columns.values)

In [12]:
#drop the columns
df_tour_win = df_tour_win.drop(columns=['TEAMID', 'O_TEAMID',
 'TEAMID_2', 'O_TEAMID_2',])
df_tour_lose = df_tour_lose.drop(columns=['TEAMID', 'O_TEAMID',
 'TEAMID_2', 'O_TEAMID_2',])

df_tour_win.rename(columns={'WTeamID':'team1', 'LTeamID':'team2'},inplace=True)
df_tour_win = df_tour_win.reindex(sorted(df_tour_win.columns), axis=1)

df_tour_lose.rename(columns={'WTeamID':'team2', 'LTeamID':'team1'},inplace=True)
df_tour_lose = df_tour_lose.reindex(sorted(df_tour_lose.columns), axis=1)

df_tour_win.head()

Unnamed: 0,DRB,DRB_2,EFG,EFG_2,ORB,ORB_2,PIE,PIE_2,Season,TS,TS_2,Win,team1,team2
0,0.625619,0.681723,0.4898,0.503036,0.347184,0.360133,0.440672,0.547692,2003,0.541825,0.539819,1,1421,1411
1,0.68056,0.732898,0.517632,0.494732,0.394027,0.380016,0.654693,0.561326,2003,0.557334,0.528383,1,1112,1436
2,0.68395,0.683904,0.517334,0.498337,0.39808,0.366803,0.58516,0.597778,2003,0.557354,0.534496,1,1113,1272
3,0.662368,0.681826,0.572835,0.567455,0.359177,0.339233,0.569291,0.644084,2003,0.624037,0.596901,1,1141,1166
4,0.692086,0.681661,0.524098,0.534189,0.32231,0.308813,0.55613,0.541945,2003,0.556456,0.58211,1,1143,1301


In [13]:
df_tour = pd.concat([df_tour_win, df_tour_lose])
df_tour = shuffle(df_tour)
df_tour.head()

Unnamed: 0,DRB,DRB_2,EFG,EFG_2,ORB,ORB_2,PIE,PIE_2,Season,TS,TS_2,Win,team1,team2
360,0.707372,0.693692,0.553043,0.496307,0.35572,0.326723,0.630279,0.580208,2008,0.59957,0.538843,1,1462,1345
959,0.729448,0.696724,0.540774,0.559321,0.292358,0.347044,0.549213,0.592677,2017,0.568119,0.587101,0,1277,1242
28,0.63079,0.65805,0.515151,0.50089,0.385242,0.332819,0.604162,0.590305,2003,0.555913,0.56188,1,1393,1264
559,0.722184,0.683937,0.577067,0.533601,0.368996,0.316272,0.682696,0.580145,2011,0.607305,0.566455,1,1242,1228
742,0.707702,0.69025,0.484364,0.500976,0.34954,0.374587,0.553744,0.595543,2014,0.546459,0.533456,0,1344,1314


Now we concatenate the wins and loss dfs, so that each tournament game shows up twice (in two rows) once as a win and once as a loss.

In [14]:
testindex = (df_tour['Season'] == 2019)

X = df_tour.drop(columns=['Season', 'Win', 'team1', 'team2'])
X_train = X[testindex == False]
X_test = X[testindex == True]

y = df_tour.Win.values
y_train = y[testindex == False]
y_test = y[testindex == True]

X_train.shape

(2096, 10)

In [15]:
y_train.shape

(2096,)

In [16]:
logreg = LogisticRegression(penalty='l2', fit_intercept=False, solver='lbfgs',
                            max_iter=10000, warm_start=True)

params = {'C': np.logspace(start = -5, stop = 3, num=15)}
clf = GridSearchCV(logreg, params, scoring='neg_log_loss', refit=True, cv=5)
clf.fit(X_train, y_train)

print('Best log_loss {:.4}, with best C: {}' .format(clf.best_score_, clf.best_params_['C']))

Best log_loss -0.6098, with best C: 71.96856730011514


In [17]:
final_logreg = LogisticRegression(penalty='l2', C=clf.best_params_['C'],
                                  fit_intercept=False, solver='lbfgs',
                                  max_iter=10000, warm_start=True)
final_logreg.fit(X_train, y_train)

LogisticRegression(C=71.96856730011514, class_weight=None, dual=False,
          fit_intercept=False, intercept_scaling=1, max_iter=10000,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=None,
          solver='lbfgs', tol=0.0001, verbose=0, warm_start=True)

Now we run the best C on the data again.

In [18]:
pred_train = final_logreg.predict_proba(X_train)[:,0]
#pred_test = final_logreg.predict_proba(X_test)[:,0]

In [19]:
#probability clipping
#pred_train = np.clip(pred_train, 0.2, 0.8)
#pred_test = np.clip(pred_test, 0.2, 0.8)

In [20]:
train_loss = log_loss(y_train, pred_train)
#test_loss = log_loss(y_test, pred_test)

print('train log_loss:{:0.2f}'.format(train_loss))
#print('train log_loss:{:0.2f}\ttest log_loss:{:0.2f}'.format(train_loss, test_loss))

train log_loss:0.98


In [21]:
train_acc = np.mean(y_train == final_logreg.predict(X_train))
#test_acc = np.mean(y_test == final_logreg.predict(X_test))

print('train accuracy:{:0.2f}'.format(train_acc))
#print('train accuracy:{:0.2f}\ttest accuracy:{:0.2f}'.format(train_acc, test_acc))

train accuracy:0.65


In [22]:
df_tour['Pred'] = 0
df_tour.loc[testindex == False, 'Pred'] = pred_train

In [23]:
df_tour[testindex == True].head()

Unnamed: 0,DRB,DRB_2,EFG,EFG_2,ORB,ORB_2,PIE,PIE_2,Season,TS,TS_2,Win,team1,team2,Pred


Now we save it in the right form for Young's folder.

In [24]:
# # this is for making the 2003-2018 predictions for ensembling

# modelpath = './log_advstat_l2/'
# os.mkdir(modelpath)

# for i in range(2003,2018+1):
#     data = make_dataset.get_train_data_v1(i)
#     data = data[data['tourney'] == 1]
#     savethis = pd.merge(left=data, right=df_tour, left_on=['season','team1','team2'],
#                    right_on=['Season', 'team1', 'team2'])
#     savethis = savethis[['ID','Pred']]

#     yearpath = modelpath + str(i)
#     os.mkdir(yearpath)

#     savethis.to_csv(yearpath+'/pred.csv', index=False)

In [27]:
#this is 2019 predictions

df_submit = pd.read_csv(data_dir + 'SampleSubmissionStage2.csv')
df_submit.head()

Unnamed: 0,ID,Pred
0,2019_1101_1113,0.5
1,2019_1101_1120,0.5
2,2019_1101_1124,0.5
3,2019_1101_1125,0.5
4,2019_1101_1133,0.5


In [46]:
splitID = df_submit["ID"].str.split("_", n = 2, expand = True)
df_submit['Season'] = splitID[0].astype('int64')
df_submit['team1'] = splitID[1].astype('int64')
df_submit['team2'] = splitID[2].astype('int64')

In [43]:
df_submit.head()

Unnamed: 0,ID,Pred,Season,team1,team2
0,2019_1101_1113,0.5,2019,1101,1113
1,2019_1101_1120,0.5,2019,1101,1120
2,2019_1101_1124,0.5,2019,1101,1124
3,2019_1101_1125,0.5,2019,1101,1125
4,2019_1101_1133,0.5,2019,1101,1133


In [44]:
df_advstats.head()

Unnamed: 0,Season,TEAMID,O_TEAMID,PIE,EFG,ORB,DRB,TS
0,2003,1102,1318.928571,0.488599,0.584407,0.168235,0.630384,0.606248
1,2003,1103,1258.407407,0.509717,0.536564,0.305803,0.626998,0.585812
2,2003,1104,1282.642857,0.536514,0.475785,0.371256,0.686897,0.521729
3,2003,1105,1270.153846,0.41505,0.457983,0.335166,0.641434,0.504339
4,2003,1106,1245.25,0.527568,0.481697,0.34948,0.679342,0.509554


In [56]:
X_test = pd.merge(left=df_submit, right=df_advstats, how='left',
                      left_on=['Season', 'team1'], right_on=['Season', 'TEAMID'])
X_test = pd.merge(left=X_test, right=df_advstats, how='left',
               left_on=['Season', 'team2'], right_on=['Season', 'TEAMID'],
               suffixes=('', '_2'))

In [59]:
X_test = X_test.reindex(sorted(X_test.columns), axis=1)
X_test.drop(labels=['Season','Win','team1','team2'], inplace=True, axis=1)
X_test.head()

Unnamed: 0,DRB,DRB_2,EFG,EFG_2,ORB,ORB_2,PIE,PIE_2,TS,TS_2
0,0.725586,0.734541,0.524811,0.512606,0.274224,0.324304,0.574618,0.550505,0.564156,0.55246
1,0.725586,0.67478,0.524811,0.542735,0.274224,0.319014,0.574618,0.577956,0.564156,0.57625
2,0.725586,0.714692,0.524811,0.51083,0.274224,0.378682,0.574618,0.545062,0.564156,0.542789
3,0.725586,0.759502,0.524811,0.582854,0.274224,0.254261,0.574618,0.609242,0.564156,0.609354
4,0.725586,0.732956,0.524811,0.497537,0.274224,0.268269,0.574618,0.516696,0.564156,0.536705


In [62]:
pred_test = final_logreg.predict_proba(X_test)[:,0]
pred_test.shape

(2278,)

In [69]:
df_submit['Pred'] = pred_test
df_submit = df_submit[['ID','Pred']]
df_submit.head()

Unnamed: 0,ID,Pred
0,2019_1101_1113,0.469687
1,2019_1101_1120,0.603725
2,2019_1101_1124,0.530728
3,2019_1101_1125,0.587294
4,2019_1101_1133,0.316776


In [70]:
# modelpath = './log_advstat_l2/'
# yearpath = modelpath + str(2019)
# os.mkdir(yearpath)
# df_submit.to_csv(yearpath+'/pred.csv', index=False)

NameError: name 'savethis' is not defined