In [21]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
print(check_output(["ls", "../NCAA/DataFiles"]).decode("utf8"))

# Any results you write to the current directory are saved as output.

Cities.csv
ConferenceTourneyGames.csv
Conferences.csv
GameCities.csv
NCAATourneyCompactResults.csv
NCAATourneyDetailedResults.csv
NCAATourneySeedRoundSlots.csv
NCAATourneySeeds.csv
NCAATourneySlots.csv
RegularSeasonCompactResults.csv
RegularSeasonDetailedResults.csv
SampleSubmissionStage1.csv
Seasons.csv
SecondaryTourneyCompactResults.csv
SecondaryTourneyTeams.csv
TeamCoaches.csv
TeamConferences.csv
TeamSpellings.csv
Teams.csv



Pipeline: Merge everything a

In [22]:
def seed_to_int(seed):
    #Get just the digits from the seeding. Return as int
    s_int = int(seed[1:3])
    return s_int


# Get features you want for each team

In [23]:
def merge_features(df,feature_list):
    '''
    Input: dataframe which contains Season and TeamID (ex. input df_seeds), list of dataframes with features 
    Output: dataframe with merged features
    '''
    for feature in feature_list:
        if 'Season' in feature.columns:
            df = pd.merge(left = df, right = feature, how = 'left', on = ['Season', 'TeamID'])
        else:
            df = pd.merge(left = df, right = feature, how = 'left', on = ['TeamID'])
    return df

# Combine features with Team1 and Team2

In [24]:
def get_features(df_merge,df_team):
    '''
    Input:  dataframe from merge_features, dataframe which contains team1, team2, and year'''
    df_1 = df_merge.rename(columns={'TeamID':'Team1'})
    df_2 = df_merge.rename(columns={'TeamID':'Team2'})
    df_dummy = pd.merge(left=df_team, right=df_1, how='left', on=['Season', 'Team1'])
    df_concat = pd.merge(left=df_dummy, right=df_2, on=['Season', 'Team2'])
    return df_concat

In [25]:
def change_for_training(df_team):
    df_team = df_team.rename(columns = {'WTeamID': 'Team1','LTeamID':'Team2'})
    return df_team

# Feature engineering

In [26]:
def get_seed_diff(df_final):
    df_final['SeedDiff'] = df_final['Seed_x']-df_final['Seed_y']
    return df_final
def D1diff(df):
    df['D1diff'] = df['FirstD1Season_x']-df['FirstD1Season_y']
    df = df.drop(['FirstD1Season_x','FirstD1Season_y'], axis=1)
    return df

# Training features

In [27]:
def get_predictions(df, features):
    '''
    input: dataframe of training dataset with all features, list of specific features we want from training data
    output: dataframe for training data
    '''
    df_wins = df[features]
    df_wins['Result'] = 1
    df_losses = df[features]
    df_losses['SeedDiff'] = -df['SeedDiff']
    df_losses['Result'] = 0
    df_predictions = pd.concat((df_wins, df_losses))
    return df_predictions

Idea: Get features of training data in a similar fashion as test data, and just join the one datasets with inner join

In [28]:
data_dir = '../NCAA/DataFiles/'
df_seeds = pd.read_csv(data_dir + 'NCAATourneySeeds.csv')
df_tour = pd.read_csv(data_dir + 'NCAATourneyCompactResults.csv')
df_conferences = pd.read_csv(data_dir + "TeamConferences.csv")
df_coaches = pd.read_csv(data_dir + "TeamCoaches.csv")
df_firstd1season = pd.read_csv(data_dir + "Teams.csv")


In [29]:
'Season' in df_conferences.columns

True

In [30]:
df_seeds['Seed'] = df_seeds.Seed.apply(seed_to_int)
feature_list = [df_conferences, df_firstd1season]
df_team = change_for_training(df_tour)

In [31]:
df_merge = merge_features(df_seeds,feature_list)
df_concat = get_features(df_merge,df_team)


df_concat = df_concat.pipe(get_seed_diff).pipe(D1diff)

In [32]:
df_seeds.head()

Unnamed: 0,Season,Seed,TeamID
0,1985,1,1207
1,1985,2,1210
2,1985,3,1228
3,1985,4,1260
4,1985,5,1374


In [33]:
winners = df_concat.rename( columns = { 'WTeamID' : 'TeamID1', 'LTeamID' : 'TeamID2'})
winners['Result'] = 1.0

losers = df_concat.rename( columns = { 'WTeamID' : 'TeamID2', 'LTeamID' : 'TeamID1'})
losers['Result'] = 0.0
losers['D1diff'] = winners['D1diff']*-1
losers['SeedDiff'] = df_concat['SeedDiff']*-1

train = pd.concat( [winners, losers], axis = 0).reset_index(drop = True)



In [34]:
tscv = TimeSeriesSplit(n_splits = 5)

In [35]:
train['DayNum'].describe()

count    4234.000000
mean      139.150685
std         4.209544
min       134.000000
25%       136.000000
50%       137.000000
75%       139.000000
max       154.000000
Name: DayNum, dtype: float64

In [36]:
for train_index, test_index in tscv.split(train[['Season','DayNum']].set_index('Season')):
    print (train.iloc[train_index][['Season','DayNum']])

     Season  DayNum
0      1985     136
1      1985     136
2      1985     136
3      1985     136
4      1985     136
5      1985     136
6      1985     136
7      1985     136
8      1985     136
9      1985     136
10     1985     136
11     1985     136
12     1985     136
13     1985     136
14     1985     136
15     1985     136
16     1985     137
17     1985     137
18     1985     137
19     1985     137
20     1985     137
21     1985     137
22     1985     137
23     1985     137
24     1985     137
25     1985     137
26     1985     137
27     1985     137
28     1985     137
29     1985     137
..      ...     ...
679    1995     143
680    1995     143
681    1995     143
682    1995     144
683    1995     144
684    1995     144
685    1995     144
686    1995     145
687    1995     145
688    1995     146
689    1995     146
690    1995     152
691    1995     152
692    1995     154
693    1996     136
694    1996     136
695    1996     136
696    1996     136


In [37]:
train.head()

Unnamed: 0,Season,DayNum,Team1,WScore,Team2,LScore,WLoc,NumOT,Seed_x,ConfAbbrev_x,TeamName_x,LastD1Season_x,Seed_y,ConfAbbrev_y,TeamName_y,LastD1Season_y,SeedDiff,D1diff,Result
0,1985,136,1116,63,1234,54,N,0,9,swc,Arkansas,2018,8,big_ten,Iowa,2018,1,0,1.0
1,1985,136,1120,59,1345,58,N,0,11,sec,Auburn,2018,6,big_ten,Purdue,2018,5,0,1.0
2,1985,136,1207,68,1250,43,N,0,1,big_east,Georgetown,2018,16,ecc,Lehigh,2018,-15,0,1.0
3,1985,136,1229,58,1425,55,N,0,9,mvc,Illinois St,2018,8,pac_ten,USC,2018,1,0,1.0
4,1985,136,1242,49,1325,38,N,0,3,big_eight,Kansas,2018,14,mac,Ohio,2018,-11,0,1.0


In [38]:
pre_train = train[['Seed_x','Seed_y','SeedDiff','Result','D1diff']]

In [40]:
X_train = pre_train.drop('Result',axis=1).values
y_train = pre_train.Result.values
X_train, y_train = shuffle(X_train, y_train)

In [41]:
logreg = LogisticRegression()
params = {'C': np.logspace(start=-5, stop=3, num=9)}
clf = GridSearchCV(logreg, params, scoring='neg_log_loss', refit=True)
clf.fit(X_train, y_train)
print('Best log_loss: {:.4}, with best C: {}'.format(clf.best_score_, clf.best_params_['C']))

Best log_loss: -0.5543, with best C: 0.001


In [43]:
from sklearn.ensemble import RandomForestClassifier


In [44]:
rf = RandomForestClassifier()
random_forest = dict(n_estimators=[2000],
                    criterion = ['gini','entropy'],
                    max_features = ['sqrt','log2', None],
                    random_state = [1],
                    min_samples_leaf = [1, 2,3],
                    min_samples_split = [2,3])
clf = GridSearchCV(rf, random_forest, scoring = 'neg_log_loss', refit=True)
clf.fit(X_train, y_train)
print('Best log_loss: {:.4}, with best C: {}'.format(clf.best_score_, clf.best_params_))

KeyboardInterrupt: 

## Merge seed for each team
Merge the Seeds with their corresponding TeamIDs in the compact results dataframe.

# Test set

In [282]:
df_sample_sub = pd.read_csv(data_dir + 'SampleSubmissionStage2.csv')
df_sample_sub['Season'] = df_sample_sub['ID'].apply(lambda x : int(x.split('_')[0]) )
df_sample_sub['Team1'] = df_sample_sub['ID'].apply(lambda x : int(x.split('_')[1]) )
df_sample_sub['Team2'] = df_sample_sub['ID'].apply(lambda x : int(x.split('_')[2]) )

In [None]:
df_seeds['Seed'] = df_seeds.Seed.apply(seed_to_int)
df_merge = merge_features(df_seeds,feature_list)

In [283]:
#Merging important features

df_concat_test = get_features(df_merge,df_sample_sub)

#Feature engineering

df_concat_test = df_concat_test.pipe(get_seed_diff).pipe(D1diff)

df_concat_test.head()

Unnamed: 0,ID,Pred,Season,Team1,Team2,ConfAbbrev_x,TeamName_x,LastD1Season_x,ConfAbbrev_y,TeamName_y,LastD1Season_y,SeedDiff,D1diff
0,2014_1107_1110,0.5,2014,1107,1110,aec,Albany NY,2018,patriot,American Univ,2018,1,15
1,2014_1107_1112,0.5,2014,1107,1112,aec,Albany NY,2018,pac_twelve,Arizona,2018,15,15
2,2014_1110_1112,0.5,2014,1110,1112,patriot,American Univ,2018,pac_twelve,Arizona,2018,14,0
3,2014_1107_1113,0.5,2014,1107,1113,aec,Albany NY,2018,pac_twelve,Arizona St,2018,6,15
4,2014_1110_1113,0.5,2014,1110,1113,patriot,American Univ,2018,pac_twelve,Arizona St,2018,5,0


In [284]:
features = ['ID','SeedDiff','D1diff','Seed_x','Seed_y']

#'D1diff',
X_pre_test = df_concat_test[features]

In [285]:
X_test = pd.merge(left = df_sample_sub, right = X_pre_test, how = 'inner', on ='ID')[features[1:]]

X_test.head()

Unnamed: 0,SeedDiff,D1diff
0,1,15
1,15,15
2,6,15
3,10,15
4,6,15


# Predict on Test set

In [286]:
preds = clf.predict_proba(X_test.values)[:,1]

clipped_preds = np.clip(preds, 0.05, 0.95)
df_sample_sub['Pred'] = clipped_preds
df_sample_sub.head()

Unnamed: 0,ID,Pred,Season,Team1,Team2
0,2014_1107_1110,0.425035,2014,1107,1110
1,2014_1107_1112,0.066995,2014,1107,1112
2,2014_1107_1113,0.243264,2014,1107,1113
3,2014_1107_1124,0.141723,2014,1107,1124
4,2014_1107_1140,0.243264,2014,1107,1140


In [287]:
df_sample_sub = df_sample_sub.drop(['Season','Team1','Team2'],axis=1)

In [288]:
df_sample_sub.to_csv('logreg_seed_starter.csv', index=False)

In [289]:
df_sample_sub.head()

Unnamed: 0,ID,Pred
0,2014_1107_1110,0.425035
1,2014_1107_1112,0.066995
2,2014_1107_1113,0.243264
3,2014_1107_1124,0.141723
4,2014_1107_1140,0.243264
