In [118]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.model_selection import PredefinedSplit
from sklearn.preprocessing import StandardScaler

In [119]:
def seed_to_int(seed):
    '''
    Input: Seed
    Output: Get just the digits from the seeding. Return as int
    '''
    s_int = int(seed[1:3])
    return s_int
def merge_features(df,feature_list):
    '''
    Input: dataframe which contains Season and TeamID (ex. input df_seeds), list of dataframes with features 
    Output: dataframe with merged features
    '''
    for feature in feature_list:
        if 'Season' in feature.columns:
            df = pd.merge(left = df, right = feature, how = 'left', on = ['Season', 'TeamID'])
        else:
            df = pd.merge(left = df, right = feature, how = 'left', on = ['TeamID'])
    return df
def get_features(df_merge,df_team):
    '''
    Input:  dataframe from merge_features, dataframe which contains team1, team2, and year
    Output: dataframe of merged features'''
    df_1 = df_merge.rename(columns={'TeamID':'Team1'})
    df_2 = df_merge.rename(columns={'TeamID':'Team2'})
    df_dummy = pd.merge(left=df_team, right=df_1, how='left', on=['Season', 'Team1'])
    df_concat = pd.merge(left=df_dummy, right=df_2, on=['Season', 'Team2'])
    return df_concat
def change_for_training(df_team):
    '''
    Input: dataframe
    Output: renamed dataframe'''
    df_team = df_team.rename(columns = {'WTeamID': 'Team1','LTeamID':'Team2'})
    return df_team
def get_seed_diff(df_final):
    '''
    Input: dataframe with seed of respective teams
    Ouput: Dataframe with a column that contains difference in seeds
    '''
    df_final['SeedDiff'] = df_final['Seed_x']-df_final['Seed_y']
    return df_final
def D1diff(df):
    '''
    Input: Dataframe 
    Ouput: Difference in first d1 season for respective teams'''
    df['D1diff'] = df['FirstD1Season_x']-df['FirstD1Season_y']
    df = df.drop(['FirstD1Season_x','FirstD1Season_y'], axis=1)
    return df
def get_predictions(df, features):
    '''
    Input: dataframe of training dataset with all features, list of specific features we want from training data
    Output: dataframe for training data
    '''
    df_wins = df[features]
    df_wins['Result'] = 1
    df_losses = df[features]
    df_losses['SeedDiff'] = -df['SeedDiff']
    df_losses['Result'] = 0
    df_predictions = pd.concat((df_wins, df_losses))
    return df_predictions

In [120]:
def get_regular_scores(year):
    '''
    Input: year
    Output: wins in a year
    '''
    return pd.concat([regular[regular['Season'] == year][['TeamID2','LScore']],regular[regular['Season'] == year][['TeamID2','LScore']].rename(columns={'TeamID1':'TeamID2','WScore':'LScore'})], ignore_index=True)
    

# Get features 
    - For prediction I'm going to use seed difference between two teams, Difference between when a college became

In [121]:
data_dir = '../NCAA/DataFiles/'
df_regular = pd.read_csv(data_dir + 'RegularSeasonCompactResults.csv')
df_seeds = pd.read_csv(data_dir + 'NCAATourneySeeds.csv')
df_tour = pd.read_csv(data_dir + 'NCAATourneyCompactResults.csv')
df_conferences = pd.read_csv(data_dir + "TeamConferences.csv")
df_coaches = pd.read_csv(data_dir + "TeamCoaches.csv")
df_firstd1season = pd.read_csv(data_dir + "Teams.csv")

In [124]:
df_seeds['Seed'] = df_seeds.Seed.apply(seed_to_int)
feature_list = [df_conferences, df_firstd1season]

df_team = change_for_training(df_regular)

df_team_val = change_for_training(df_tour)

In [125]:
df_merge = merge_features(df_seeds,feature_list)
df_concat = get_features(df_merge,df_team)
df_concat = df_concat.pipe(get_seed_diff).pipe(D1diff)

df_concat_val = get_features(df_merge,df_team_val)
df_concat_val = df_concat_val.pipe(get_seed_diff).pipe(D1diff)

In [126]:
winners = df_regular.rename( columns = { 'WTeamID' : 'TeamID1', 'LTeamID' : 'TeamID2'})
losers = df_regular.rename( columns = { 'WTeamID' : 'TeamID2', 'LTeamID' : 'TeamID1'})

regular = pd.concat( [winners, losers], axis = 0).reset_index(drop = True)

winners_val = df_concat_val.rename( columns = { 'WTeamID' : 'TeamID1', 'LTeamID' : 'TeamID2'})
winners_val['Result'] = 1.0

losers_val= df_concat_val.rename( columns = { 'WTeamID' : 'TeamID2', 'LTeamID' : 'TeamID1'})
losers_val['Result'] = 0.0
losers_val['D1diff'] = winners_val['D1diff']*-1
losers_val['SeedDiff'] = df_concat_val['SeedDiff']*-1

playoff = pd.concat( [winners_val, losers_val], axis = 0).reset_index(drop = True)

In [127]:
playoff.head()

Unnamed: 0,Season,DayNum,Team1,WScore,Team2,LScore,WLoc,NumOT,Seed_x,ConfAbbrev_x,TeamName_x,LastD1Season_x,Seed_y,ConfAbbrev_y,TeamName_y,LastD1Season_y,SeedDiff,D1diff,Result
0,1985,136,1116,63,1234,54,N,0,9,swc,Arkansas,2018,8,big_ten,Iowa,2018,1,0,1.0
1,1985,136,1120,59,1345,58,N,0,11,sec,Auburn,2018,6,big_ten,Purdue,2018,5,0,1.0
2,1985,136,1207,68,1250,43,N,0,1,big_east,Georgetown,2018,16,ecc,Lehigh,2018,-15,0,1.0
3,1985,136,1229,58,1425,55,N,0,9,mvc,Illinois St,2018,8,pac_ten,USC,2018,1,0,1.0
4,1985,136,1242,49,1325,38,N,0,3,big_eight,Kansas,2018,14,mac,Ohio,2018,-11,0,1.0


In [131]:
final = pd.DataFrame(columns = ['score_diff','D1diff','SeedDiff','Result'])
for i in range(1985,2018):
    current = get_regular_scores(i)
    playoff_current = playoff[playoff['Season'] == i]
    year_current = pd.DataFrame(get_regular_scores(i).groupby('TeamID2')['LScore'].mean())
    year_current['TeamID2'] = year_current.index
    playoff_current = pd.merge(playoff_current, year_current, left_on = "Team1", right_on = 'TeamID2')
    playoff_current['Wscore_avg'] = playoff_current['LScore_y']
    playoff_current = playoff_current.drop(['LScore_y','TeamID2'],axis=1)
    playoff_current['LScore_avg'] = pd.merge(playoff_current, year_current, left_on = "Team2", right_on = 'TeamID2')['LScore']
    playoff_current['score_diff'] = playoff_current['Wscore_avg']-playoff_current['LScore_avg']
    final = pd.concat([final, playoff_current[['score_diff','D1diff','SeedDiff','Result','Seed_x','Seed_y']]])

In [135]:
final.head()

Unnamed: 0,D1diff,Result,SeedDiff,Seed_x,Seed_y,score_diff
0,0,1.0,1,9.0,8.0,1.324242
1,0,0.0,-1,9.0,8.0,1.324242
2,0,1.0,5,11.0,6.0,2.4
3,0,1.0,8,11.0,3.0,2.4
4,0,0.0,-5,11.0,6.0,-5.333333


In [154]:
X_train = final.drop('Result',axis=1).values
y_train = final.Result.values
X_train, y_train = shuffle(X_train, y_train)

In [155]:
logreg = LogisticRegression()
params = {'C': np.logspace(start=-5, stop=3, num=9)}
clf = GridSearchCV(logreg, params, scoring='neg_log_loss', refit=True)
clf.fit(X_train, y_train)
print('Best log_loss: {:.4}, with best C: {}'.format(clf.best_score_, clf.best_params_['C']))

Best log_loss: -0.5538, with best C: 0.001


# Test set

In [137]:
df_sample_sub = pd.read_csv('SampleSubmissionStage2.csv')
df_sample_sub['Season'] = df_sample_sub['ID'].apply(lambda x : int(x.split('_')[0]) )
df_sample_sub['Team1'] = df_sample_sub['ID'].apply(lambda x : int(x.split('_')[1]) )
df_sample_sub['Team2'] = df_sample_sub['ID'].apply(lambda x : int(x.split('_')[2]) )

In [140]:
data_dir = '../NCAA/Stage2UpdatedDataFiles/'
df_regular = pd.read_csv(data_dir + 'RegularSeasonCompactResults.csv')
df_seeds = pd.read_csv(data_dir + 'NCAATourneySeeds.csv')


In [141]:
winners = df_regular.rename( columns = { 'WTeamID' : 'TeamID1', 'LTeamID' : 'TeamID2'})
losers = df_regular.rename( columns = { 'WTeamID' : 'TeamID2', 'LTeamID' : 'TeamID1'})

regular = pd.concat( [winners, losers], axis = 0).reset_index(drop = True)

In [142]:
df_seeds['Seed'] = df_seeds.Seed.apply(seed_to_int)
df_merge = merge_features(df_seeds,feature_list)

In [143]:
#Merging important features

df_concat_test = get_features(df_merge,df_sample_sub)

#Feature engineering

df_concat_test = df_concat_test.pipe(get_seed_diff).pipe(D1diff)

df_concat_test.head()

Unnamed: 0,ID,Pred,Season,Team1,Team2,Seed_x,ConfAbbrev_x,TeamName_x,LastD1Season_x,Seed_y,ConfAbbrev_y,TeamName_y,LastD1Season_y,SeedDiff,D1diff
0,2018_1104_1112,0.5,2018,1104,1112,9,sec,Alabama,2018,4,pac_twelve,Arizona,2018,5,0
1,2018_1104_1113,0.5,2018,1104,1113,9,sec,Alabama,2018,11,pac_twelve,Arizona St,2018,-2,0
2,2018_1112_1113,0.5,2018,1112,1113,4,pac_twelve,Arizona,2018,11,pac_twelve,Arizona St,2018,-7,0
3,2018_1104_1116,0.5,2018,1104,1116,9,sec,Alabama,2018,7,sec,Arkansas,2018,2,0
4,2018_1112_1116,0.5,2018,1112,1116,4,pac_twelve,Arizona,2018,7,sec,Arkansas,2018,-3,0


In [148]:
final_test = pd.DataFrame(columns = ['score_diff','D1diff','SeedDiff'])

current = get_regular_scores(2018)
playoff_current = df_concat_test[df_concat_test['Season'] == 2018]
year_current = pd.DataFrame(get_regular_scores(2018).groupby('TeamID2')['LScore'].mean())
year_current['TeamID2'] = year_current.index
playoff_current = pd.merge(playoff_current, year_current, left_on = "Team1", right_on = 'TeamID2')
playoff_current['Wscore_avg'] = playoff_current['LScore']
playoff_current = playoff_current.drop(['LScore','TeamID2'],axis=1)
playoff_current['LScore_avg'] = pd.merge(playoff_current, year_current, left_on = "Team2", right_on = 'TeamID2')['LScore']
playoff_current['score_diff'] = playoff_current['Wscore_avg']-playoff_current['LScore_avg']
final_test = pd.concat([final_test, playoff_current[['score_diff','D1diff','SeedDiff','Seed_x','Seed_y']]])


In [149]:
final_test.head()

Unnamed: 0,D1diff,SeedDiff,Seed_x,Seed_y,score_diff
0,0,5,9.0,4.0,-4.058824
1,0,-2,9.0,11.0,-7.72296
2,0,2,9.0,7.0,-7.72296
3,0,5,9.0,4.0,-5.352941
4,0,-5,9.0,14.0,-5.352941


# Predict on Test Set

In [77]:
len(df_sample_sub)

2278

In [156]:
preds = clf.predict_proba(final_test.values)[:,1]
clipped_preds = np.clip(preds, 0.05, 0.95)
df_sample_sub['Pred'] = clipped_preds
df_sample_sub.head()

Unnamed: 0,ID,Pred,Season,Team1,Team2
0,2018_1104_1112,0.303366,2018,1104,1112
1,2018_1104_1113,0.569454,2018,1104,1113
2,2018_1104_1116,0.409278,2018,1104,1116
3,2018_1104_1120,0.301833,2018,1104,1120
4,2018_1104_1137,0.685219,2018,1104,1137


In [157]:
df_sample_sub = df_sample_sub.drop(['Season','Team1','Team2'],axis=1)

In [158]:
df_sample_sub.to_csv('logreg_final.csv', index=False)