In [131]:
import numpy as np
import pandas as pd
import random

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, ExtraTreesRegressor, VotingClassifier
from sklearn.metrics import accuracy_score, log_loss, make_scorer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

random.seed(1)

root = "/home/austin/Github/kaggle-ncaa-2018/"

data = pd.read_csv(root + "derived_data/Master3.csv")
matchups = pd.read_csv("/home/austin/Github/DataFiles/TourneyWinratesBySeed.csv")
tour_ratios = pd.read_csv(root + "derived_data/ratios/NCAATourneyDetailedResultsRatios.csv")

In [132]:
seasons = range(2010, 2018)

all_columns = data.columns.tolist()
non_stats_columns = ["Season", "TeamID", "Seed", "Elo", "NumTeamID", "DenTeamID"]#, "BPI", "Predictor_Score"]
stats_columns = [c for c in all_columns if c not in non_stats_columns]
# stats_columns_A = ["A"+c for c in all_columns if c not in non_stats_columns]
# stats_columns_B = ["B"+c for c in all_columns if c not in non_stats_columns]

# Put all data in a single DataFrame

In [133]:
tourney_games = tour_ratios.loc[tour_ratios["Season"].isin(seasons)]
tourney_games = tourney_games.reset_index(drop=True)
extra_cols = ["NumTeamSeed", "DenTeamSeed", "NumTeamElo", "DenTeamElo", "HistWinPct", "Upset"]

pred_data = pd.DataFrame(index=range(tourney_games.shape[0]), columns= ["Season", "NumTeamID", "DenTeamID"] + stats_columns + extra_cols + ["NumTeamWon"])
pred_data.loc[:,"NumTeamWon"] = np.ones(tourney_games.shape[0])
pred_data.loc[:,"Upset"] = np.zeros(tourney_games.shape[0])
                                    
index = 0
for i in range(len(seasons)):
    season = seasons[i]
    tourney_games_for_season = tourney_games.loc[tourney_games["Season"] == season].reset_index()
    for j, row in tourney_games_for_season.iterrows():
        pred_data.loc[index, "Season"] = season
        teamA_id = tourney_games_for_season.at[j, "WTeamID"]
        teamB_id = tourney_games_for_season.at[j, "LTeamID"]
        
        teamA_seed = data.loc[(data["TeamID"] == teamA_id) & (data["Season"] == season)].reset_index(drop=True).loc[0, "Seed"]
        teamB_seed = data.loc[(data["TeamID"] == teamB_id) & (data["Season"] == season)].reset_index(drop=True).loc[0, "Seed"]
        
        teamA_elo = data.loc[(data["TeamID"] == teamA_id) & (data["Season"] == season)].reset_index(drop=True).loc[0, "Elo"]
        teamB_elo = data.loc[(data["TeamID"] == teamB_id) & (data["Season"] == season)].reset_index(drop=True).loc[0, "Elo"]
        
        teamA_stats = data.loc[(data["TeamID"] == teamA_id) & (data["Season"] == season), stats_columns].as_matrix()
        teamB_stats = data.loc[(data["TeamID"] == teamB_id) & (data["Season"] == season), stats_columns].as_matrix()
        
        r = random.random()
        
        if r > 0.5:
            pred_data.at[index, stats_columns] = (teamA_stats / teamB_stats).ravel()
            pred_data.at[index, "NumTeamID"] = teamA_id
            pred_data.at[index, "DenTeamID"] = teamB_id
            pred_data.at[index, "NumTeamSeed"] = teamA_seed
            pred_data.at[index, "DenTeamSeed"] = teamB_seed
            pred_data.at[index, "NumTeamElo"] = teamA_elo
            pred_data.at[index, "DenTeamElo"] = teamB_elo
            pred_data.at[index, "HistWinPct"] = matchups[(matchups["WinSeed"] == teamA_seed) & (matchups["LoseSeed"] == teamB_seed)]["1985"]
            
        else:
            pred_data.at[index, stats_columns] = (teamB_stats / teamA_stats).ravel()
            pred_data.at[index, "NumTeamID"] = teamB_id
            pred_data.at[index, "DenTeamID"] = teamA_id
            pred_data.at[index, "NumTeamSeed"] = teamB_seed
            pred_data.at[index, "DenTeamSeed"] = teamA_seed
            pred_data.at[index, "NumTeamElo"] = teamB_elo
            pred_data.at[index, "DenTeamElo"] = teamA_elo
            pred_data.at[index, "HistWinPct"] = matchups[(matchups["WinSeed"] == teamB_seed) & (matchups["LoseSeed"] == teamA_seed)]["1985"]
            pred_data.at[index, "NumTeamWon"] = 0
            
        if teamA_seed >= teamB_seed + 3:
                pred_data.at[index, "Upset"] = 1
        
        index += 1

pred_data = pred_data.dropna(axis=0, how='any')

# Select which stats to use

In [134]:
all_train_columns = stats_columns + extra_cols + ["NumTeamWon"]

# train_columns_to_drop = ["FGA", "Ast", "FGM", "Blk", "STL%", "Stl", "DR", "NumTeamSeed", "NumTeamElo", "OR", "Pace",
#                          "ASSIST_RATIO", "AST%", "PF", "DEF_EFF", "PIE", "FT_PCT"]

train_columns_to_drop = []

train_columns = [c for c in all_train_columns if c not in train_columns_to_drop]
print(train_columns)

['FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF', 'PIE', 'FG_PCT', 'TURNOVER_RATE', 'OFF_REB_PCT', 'FT_RATE', '4FACTOR', 'OFF_EFF', 'DEF_EFF', 'ASSIST_RATIO', 'DEF_REB_PCT', 'FT_PCT', 'WINPCT', 'Pace', 'ORtg', 'FTr', '3PAr', 'TS%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'eFG%', 'TOV%', 'ORB%', 'FT/FGA', 'NumTeamSeed', 'DenTeamSeed', 'NumTeamElo', 'DenTeamElo', 'HistWinPct', 'Upset', 'NumTeamWon']


In [135]:
# train_columns = features_to_keep

# Split data into train and test

In [136]:
from sklearn.utils import shuffle

seasons_test = [2017]
seasons_train = [season for season in seasons if season not in seasons_test]
print(seasons_train)

# train_x = pred_data.loc[pred_data["Season"].isin(seasons_train)][train_columns].as_matrix()
# train_y = pred_data.loc[pred_data["Season"].isin(seasons_train)]["U"].as_matrix()
# test_x = pred_data.loc[pred_data["Season"].isin(seasons_test)][train_columns].as_matrix()
# test_y = pred_data.loc[pred_data["Season"].isin(seasons_test)]["NumTeamWon"].as_matrix()

pred_data_no_upset = pred_data[pred_data["Upset"] == 0]
pred_data_upset = pred_data[pred_data["Upset"] == 1]
num_upset = pred_data_upset.shape[0]
pred_data_no_upset_sample = pred_data_no_upset.sample(num_upset)

data_to_use = pd.concat([pred_data_upset, pred_data_no_upset_sample])
data_to_use = shuffle(data_to_use)

train_x = data_to_use.loc[data_to_use["Season"].isin(seasons_train)][train_columns].as_matrix()
train_y = data_to_use.loc[data_to_use["Season"].isin(seasons_train)]["Upset"].as_matrix()
test_x = pred_data.loc[pred_data["Season"].isin(seasons_test)][train_columns].as_matrix()
test_y = pred_data.loc[(pred_data["Season"].isin(seasons_test)) & (pred_data["TeamA"])]["Upset"].as_matrix()

print(train_x.shape)
print(train_y.shape)
print(test_x.shape)
print(test_y.shape)

# print(pred_data_no_upset)

# from keras.utils import to_categorical
# # train_y = to_categorical(train_y)

[2010, 2011, 2012, 2013, 2014, 2015, 2016]
(196, 45)
(196,)
(67, 45)
(67,)


# Logistic Regression

In [137]:
parameters = {'C': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10],
              'tol': [1e-4, 1e-3, 1e-2, 1e-1]
              }

logReg = GridSearchCV(LogisticRegression(), parameters, cv=10)

logReg.fit(train_x, train_y.ravel())

GridSearchCV(cv=10, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10], 'tol': [0.0001, 0.001, 0.01, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [138]:
# preds_train = np.array(logReg.predict_proba(train_x))
preds_test_lr = np.array(logReg.predict_proba(test_x))[:,1]

# print(preds_test)
# print(test_y)

# threshold = .1

# preds_test_lr[preds_test_lr > 1 - threshold] = 1
# preds_test_lr[preds_test_lr < threshold] = 0

# print(logReg.best_params_)

# print("Accuracy, Train: ", accuracy_score(train_y, preds_train > 0.5))
# print("Log Loss, Train: ", log_loss(train_y, preds_train))
# print("")
print("Accuracy: ", logReg.score(test_x,test_y))
# print("Accuracy, Test:  ", accuracy_score(test_y, preds_test_lr > 0.5))
print("Log Loss, Test:  ", log_loss(test_y, preds_test_lr))
print("")
print(preds_test_lr)
print(pred_data.loc[pred_data["Season"].isin(seasons_test)])

Accuracy:  1.0
Log Loss, Test:   0.2886680312481349

[0.17480871 0.08909212 0.12336426 0.19189308 0.18644835 0.24473435
 0.23420471 0.21146608 0.26835234 0.24843216 0.70381561 0.21859215
 0.19920083 0.21594288 0.2521763  0.18470174 0.23072965 0.19281275
 0.22358907 0.73638217 0.2136455  0.2374623  0.22135954 0.19403274
 0.25059165 0.17839448 0.19426041 0.26395794 0.23893487 0.18535091
 0.25559525 0.7248086  0.20681181 0.22043245 0.76568569 0.74812356
 0.25875917 0.22775919 0.30243576 0.3200402  0.32578664 0.32447721
 0.81261597 0.77708909 0.27676314 0.26948224 0.28601845 0.74896315
 0.3141394  0.24351432 0.81988891 0.30543724 0.33315205 0.32954514
 0.25891054 0.74141973 0.26643117 0.36896722 0.28624594 0.82220086
 0.26639245 0.31456652 0.3600041  0.79708788 0.31742165 0.37254661
 0.3693121 ]
     Season NumTeamID DenTeamID       FGM       FGA      FGM3      FGA3  \
466    2017      1448      1243   1.13349   1.10069   1.14143   1.06462   
467    2017      1309      1291   1.01994  0.98

# Random Forest

In [139]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

parameters = {'max_features': np.arange(1,12,1),
              'min_samples_split': np.arange(2,50,4),
              'min_samples_leaf': np.arange(1,25,4)
              }

# RF = RandomizedSearchCV(RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0), parameters,
#                     cv=10, n_iter=10)
RF = RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=0, max_features=1, min_samples_leaf=30)

# clf = ExtraTreesRegressor(n_estimators=1000, max_features=1, n_jobs=-1)
# clf = LogisticRegression(n_jobs=-1)
RF.fit(train_x, train_y.ravel())
# print("OOB Score: ", clf.oob_score_)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=1, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=30, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [140]:
# print(RF.best_params_)

preds_train = RF.predict(train_x)
# preds_test_rf = RF.predict_proba(test_x)[:,1]
preds_test_rf = RF.predict(test_x)

# threshold = .1
# preds_test_rf[preds_test_rf > 1 - threshold] = 1
# preds_test_rf[preds_test_rf < threshold] = 0

# print("Accuracy, Train: ", accuracy_score(train_y, preds_train > 0.5))
# print("Log Loss, Train: ", log_loss(train_y, preds_train))
# print("")
print("Accuracy, Test:  ", accuracy_score(test_y, preds_test_rf > 0.5))
print("Log Loss, Test:  ", log_loss(test_y, preds_test_rf))

Accuracy, Test:   0.6865671641791045
Log Loss, Test:   10.825813382750718


In [141]:
variables = train_columns
feature_importance = RF.feature_importances_

feature_importance, variables = (list(t) for t in zip(*sorted(zip(feature_importance, variables))))

num_features = len(feature_importance)
features_to_keep = []
for i in range(len(feature_importance)):
    if feature_importance[i] > 0.0152:
        features_to_keep.append(variables[i])
#     print(variables[i], ":", feature_importance[i])

print(features_to_keep)

['NumTeamSeed', 'WINPCT', 'OFF_EFF', '4FACTOR', 'PIE', 'FTA', 'STL%', 'FT_RATE', 'DEF_EFF', 'PF', 'BLK%', 'Ast', 'OR', 'DenTeamElo', 'Blk', 'TOV%', 'Stl', 'TO', 'FT_PCT', 'FGA', 'NumTeamElo', 'TS%', 'FGM3', 'FTr', 'FG_PCT', 'FGM', 'DR', 'ORtg', 'eFG%', 'TRB%', 'FT/FGA', 'DenTeamSeed', 'Upset', 'AST%', 'DEF_REB_PCT', 'ASSIST_RATIO', 'TURNOVER_RATE', 'HistWinPct', 'Pace', '3PAr', 'ORB%', 'FGA3', 'FTM', 'OFF_REB_PCT']


# KNN

# Voter

In [142]:
voter = VotingClassifier(estimators=[('lr', logReg), ('rf', RF)], voting='soft')
voter.fit(train_x, train_y.ravel())

VotingClassifier(estimators=[('lr', GridSearchCV(cv=10, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.000...imators=1000, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [143]:
preds_test = voter.predict_proba(test_x)[:,1]
threshold = .12
preds_test[preds_test > 1 - threshold] = 1
preds_test[preds_test < threshold] = 0
print("Accuracy, Test:  ", accuracy_score(test_y, preds_test > 0.5))
print("Log Loss, Test:  ", log_loss(test_y, preds_test))

Accuracy, Test:   1.0
Log Loss, Test:   0.46147700738522024


In [144]:
preds = (preds_test_lr + preds_test_rf) / 2
print("Log Loss, Test:  ", log_loss(test_y, preds))

Log Loss, Test:   0.40749111635437696
