In [170]:
import numpy as np
import pandas as pd
import random

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, ExtraTreesRegressor, VotingClassifier
from sklearn.metrics import accuracy_score, log_loss, make_scorer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Fix the stdlib RNG state; a later cell draws from `random` to decide which
# team of each game becomes the ratio numerator.
random.seed(1)

# NOTE(review): absolute, machine-specific locations — adjust before running elsewhere.
root = "/home/austin/Github/kaggle-ncaa-2018/"

# Read the three input tables (same files, same order as the bindings on the left).
data, matchups, tour_ratios = (
    pd.read_csv(csv_path)
    for csv_path in (
        root + "derived_data/Master.csv",
        "/home/austin/Github/DataFiles/TourneyWinratesBySeed.csv",
        root + "derived_data/ratios/NCAATourneyDetailedResultsRatios.csv",
    )
)

In [171]:
# Seasons used throughout the notebook: 2003 through 2017 (range end is exclusive).
seasons = range(2003, 2018)

# Columns of `data` that are NOT turned into per-game stat ratios.
# "BPI" and "Predictor_Score" are intentionally not listed at the moment.
non_stats_columns = ["Season", "TeamID", "Seed", "Elo"]

all_columns = data.columns.tolist()

# Everything else is treated as a stat column; the order of `data`'s columns is kept.
stats_columns = list(filter(lambda col: col not in non_stats_columns, all_columns))

# Put all data in a single DataFrame

In [172]:
# Build one modelling row per tournament game.
# For every game one of the two teams is picked at random as the "numerator" team;
# each stat column stores numerator_stat / denominator_stat, and NumTeamWon is 1.0
# when the numerator team is the game's WTeamID and 0.0 when it is the LTeamID.
tourney_games = tour_ratios.loc[tour_ratios["Season"].isin(seasons)].reset_index(drop=True)
extra_cols = ["NumTeamSeed", "DenTeamSeed", "NumTeamElo", "DenTeamElo", "HistWinPct"]
pred_columns = ["Season"] + stats_columns + extra_cols + ["NumTeamWon"]

# Index the team table by (Season, TeamID) once, so each team is fetched with a single
# lookup instead of re-filtering `data` six times per game. If a (Season, TeamID) pair
# appears more than once, the first row is used.
team_lookup = (
    data.drop_duplicates(subset=["Season", "TeamID"])
    .set_index(["Season", "TeamID"])
    .sort_index()
)


def _hist_win_pct(num_seed, den_seed):
    """Return the scalar "1985" value of `matchups` for one seed pairing.

    The row is selected where WinSeed == num_seed and LoseSeed == den_seed.
    NaN is returned when no such row exists, so that game is removed by the
    `dropna` at the end of this cell rather than storing a Series in a cell.
    NOTE(review): the meaning of the "1985" column is not visible here —
    presumably the historical win rate since 1985; confirm against the CSV.
    """
    hits = matchups.loc[
        (matchups["WinSeed"] == num_seed) & (matchups["LoseSeed"] == den_seed), "1985"
    ]
    return hits.iloc[0] if len(hits) else np.nan


# Private generator seeded with the same value as the notebook-level random.seed(1):
# a fresh Restart & Run All produces the same draws as before, and re-running this
# cell on its own no longer changes which team ends up as the numerator.
pairing_rng = random.Random(1)

game_records = []
for season in seasons:
    season_games = tourney_games.loc[tourney_games["Season"] == season]
    for winner_id, loser_id in zip(season_games["WTeamID"], season_games["LTeamID"]):
        # Raises KeyError when a team has no row in `data` for this season.
        winner = team_lookup.loc[(season, winner_id)]
        loser = team_lookup.loc[(season, loser_id)]

        # Exactly one draw per game, in season order then file order.
        if pairing_rng.random() > 0.5:
            num_team, den_team, num_team_won = winner, loser, 1.0
        else:
            num_team, den_team, num_team_won = loser, winner, 0.0

        stat_ratios = (
            num_team[stats_columns].values.astype(float)
            / den_team[stats_columns].values.astype(float)
        )

        record = {"Season": season}
        record.update(zip(stats_columns, stat_ratios))
        record.update({
            "NumTeamSeed": num_team["Seed"],
            "DenTeamSeed": den_team["Seed"],
            "NumTeamElo": num_team["Elo"],
            "DenTeamElo": den_team["Elo"],
            "HistWinPct": _hist_win_pct(num_team["Seed"], den_team["Seed"]),
            "NumTeamWon": num_team_won,
        })
        game_records.append(record)

# Build the frame in one go (typed columns) instead of filling an all-NaN object
# frame cell by cell, then drop any game with a missing value.
pred_data = pd.DataFrame(game_records, columns=pred_columns)
pred_data = pred_data.dropna(axis=0, how='any')

# Select which stats to use

In [173]:
# Candidate features: every stat ratio followed by the seed / Elo / history extras.
all_train_columns = [*stats_columns, *extra_cols]

# Earlier candidate drop lists, kept for reference:
# train_columns_to_drop = ["FTM", "FGA", "TS%", "eFG%", "4FACTOR", "FG_PCT", "FT/FGA", "3PAr", "STL%", "Ast",
#                          "ASSIST_RATIO", "DR", "FGA3", "DEF_REB_PCT"]
# train_columns_to_drop = ["FGM3", "FTM", "FGA3", "FTA", "4FACTOR", "DEF_REB_PCT", "FT_PCT", "DR", "ASSIST_RATIO",
#                          "Stl", "FG_PCT", "FGA", "PF", "FT_RATE", "OR", "Ast", "TO", "Blk"]

# Nothing is excluded right now, so every candidate column is used for training.
train_columns_to_drop = []

train_columns = list(
    filter(lambda col: col not in train_columns_to_drop, all_train_columns)
)

# Earlier hand-picked alternatives, kept for reference:
# train_columns = ['FGM', 'DEF_EFF', 'WINPCT', 'OFF_EFF', 'DenTeamSeed', 'PIE', 'DenTeamElo', 'NumTeamSeed', 'NumTeamElo', 'HistWinPct']
# train_columns = ["NumTeamElo", "DenTeamElo", "ORB%", "TOV%", "Pace", "ORtg"]
print(train_columns)

['FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF', 'PIE', 'FG_PCT', 'TURNOVER_RATE', 'OFF_REB_PCT', 'FT_RATE', '4FACTOR', 'OFF_EFF', 'DEF_EFF', 'ASSIST_RATIO', 'DEF_REB_PCT', 'FT_PCT', 'WINPCT', 'NumTeamSeed', 'DenTeamSeed', 'NumTeamElo', 'DenTeamElo', 'HistWinPct']


# Split data into train and test

In [174]:
# Hold out the listed season(s) for testing; train on every other season.
seasons_test = [2017]
seasons_train = [season for season in seasons if season not in seasons_test]

# Compute each row mask once and reuse it for the features and the target.
is_train = pred_data["Season"].isin(seasons_train)
is_test = pred_data["Season"].isin(seasons_test)

# `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
# The explicit float cast guarantees numeric arrays even if `pred_data` holds
# object-dtype columns.
train_x = pred_data.loc[is_train, train_columns].values.astype(float)
train_y = pred_data.loc[is_train, "NumTeamWon"].values.astype(float)
test_x = pred_data.loc[is_test, train_columns].values.astype(float)
test_y = pred_data.loc[is_test, "NumTeamWon"].values.astype(float)

# Sanity check: shapes of the four arrays and the number of feature columns.
print(train_x.shape)
print(train_y.shape)
print(test_x.shape)
print(test_y.shape)
print(len(train_columns))

(914, 30)
(914,)
(67, 30)
(67,)
30


# Logistic Regression

In [175]:
# Hyper-parameter grid for the logistic regression: every C value is tried with
# every tolerance.
parameters = dict(
    C=[1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10],
    tol=[1e-4, 1e-3, 1e-2, 1e-1],
)

# 10-fold cross-validated search with the estimator's default scoring.
logReg = GridSearchCV(estimator=LogisticRegression(), param_grid=parameters, cv=10)

# Left as the last expression so the fitted search object is displayed.
logReg.fit(train_x, train_y.ravel())

GridSearchCV(cv=10, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10], 'tol': [0.0001, 0.001, 0.01, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [176]:
# Class-probability predictions from the fitted grid search.
preds_train = logReg.predict_proba(train_x)
test_proba_lr = logReg.predict_proba(test_x)
# Keep only the second probability column (the class listed second in classes_).
preds_test_lr = test_proba_lr[:, 1]

# Best (C, tol) combination found by the search.
print(logReg.best_params_)

# Held-out metrics: the search object's own score, then log loss on the probabilities.
test_accuracy_lr = logReg.score(test_x, test_y)
print("Accuracy, Test:  ", test_accuracy_lr)
test_log_loss_lr = log_loss(test_y, preds_test_lr)
print("Log Loss, Test:  ", test_log_loss_lr)

{'C': 10, 'tol': 0.0001}
Accuracy, Test:   0.7761194029850746
Log Loss, Test:   0.5122243365950685


# Random Forest

In [177]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Search space for a random-forest grid search. It is not consumed in this cell:
# the forest below is built with fixed hyper-parameters instead.
parameters = dict(
    max_features=np.arange(1, 12),
    min_samples_split=np.arange(2, 50, 4),
    min_samples_leaf=np.arange(1, 25, 4),
)

# Fixed-configuration forest; random_state pins the result across runs.
RF = RandomForestClassifier(
    n_estimators=1000,
    n_jobs=-1,
    random_state=0,
    max_features=4,
    min_samples_leaf=40,
)

# Left as the last expression so the fitted estimator is displayed.
RF.fit(train_x, train_y.ravel())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=4, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=40, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [178]:
# Hard class predictions on the training rows (not used by the metrics below).
preds_train = RF.predict(train_x)

# Probability of the class listed second in RF.classes_ for each test game.
test_proba_rf = RF.predict_proba(test_x)
preds_test_rf = test_proba_rf[:, 1]

# Accuracy thresholds the probability at 0.5; log loss uses the raw probability.
test_accuracy_rf = accuracy_score(test_y, preds_test_rf > 0.5)
print("Accuracy, Test:  ", test_accuracy_rf)
test_log_loss_rf = log_loss(test_y, preds_test_rf)
print("Log Loss, Test:  ", test_log_loss_rf)

Accuracy, Test:   0.746268656716418
Log Loss, Test:   0.5385038508794131


In [179]:
# Pair each training column with the forest's importance for it and sort ascending
# by importance (ties fall back to the column name, since (importance, name)
# tuples are compared).
ranked_features = sorted(zip(RF.feature_importances_, train_columns))

feature_importance = [importance for importance, _ in ranked_features]
variables = [column for _, column in ranked_features]

# Least important first, most important last.
for column, importance in zip(variables, feature_importance):
    print(column, ":", importance)

FGM3 : 0.0038384700109756077
FTM : 0.0038607200920063116
FGA3 : 0.0043999904518818665
FTA : 0.004485741497124605
4FACTOR : 0.0046001269954303965
DEF_REB_PCT : 0.0051692381893911656
FT_PCT : 0.005772051804293564
DR : 0.005875924242828527
ASSIST_RATIO : 0.006343233338749744
Stl : 0.006512221982256073
FG_PCT : 0.007898503219778438
FGA : 0.008757772421568452
PF : 0.008763457809594009
FT_RATE : 0.009760742682694994
OR : 0.01090169886295754
Ast : 0.010947606365233944
TO : 0.01126904518864288
Blk : 0.015306405806461733
OFF_REB_PCT : 0.017880172272048817
TURNOVER_RATE : 0.020009118922276085
FGM : 0.02592105182410565
DEF_EFF : 0.0306102841619177
WINPCT : 0.049093207247669746
OFF_EFF : 0.05200256399167118
DenTeamSeed : 0.0719617581985144
PIE : 0.07669910607643693
DenTeamElo : 0.07791245467141034
NumTeamSeed : 0.09444445423518727
NumTeamElo : 0.1292776528470162
HistWinPct : 0.2197252245898757


# MLP

In [180]:
from sklearn.neural_network import MLPRegressor, MLPClassifier
# Baseline multi-layer perceptron with default hyper-parameters.
# random_state is pinned because the network's fit is stochastic; without it the
# printed score changes on every run and the notebook output cannot be reproduced.
mlp = MLPClassifier(random_state=0)

mlp.fit(train_x, train_y)

# Score of the fitted classifier on the held-out test rows.
print(mlp.score(test_x, test_y))

0.6567164179104478


# KNN

In [181]:
# from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
# from sklearn.preprocessing import scale
# knn = KNeighborsClassifier()

# parameters = {'leaf_size': np.arange(1,51,2),
#               'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
#               'n_neighbors': np.arange(1,11,1)
#              }

# # knn = KNeighborsClassifier()
# knn = GridSearchCV(KNeighborsClassifier(), parameters, cv=10)
# knn.fit(scale(train_x), train_y.ravel())

In [182]:
# print(knn.best_params_)

# preds_train = knn.predict(train_x)
# # preds_test_rf = RF.predict_proba(test_x)[:,1]
# preds_test_rf = knn.predict_proba(scale(test_x))[:,1]

# # threshold = .1
# # preds_test_rf[preds_test_rf > 1 - threshold] = 1
# # preds_test_rf[preds_test_rf < threshold] = 0

# # print("Accuracy, Train: ", accuracy_score(train_y, preds_train > 0.5))
# # print("Log Loss, Train: ", log_loss(train_y, preds_train))
# # print("")
# print("Accuracy, Test:  ", knn.score(scale(test_x), test_y))
# print("Log Loss, Test:  ", log_loss(test_y, preds_test_rf))

# Voter

In [183]:
from sklearn.model_selection import cross_val_score
# KNeighborsClassifier was never imported (its only import sits in a commented-out
# cell), so this cell raised NameError on a fresh kernel. MLPClassifier is imported
# here as well so the cell does not depend on the standalone MLP cell having run.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

# Base learners for the soft-voting ensemble.
logReg = LogisticRegression(C=1, tol=0.001)
RF = RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=0, max_features=4, min_samples_leaf=40)
# Seeded so the ensemble's cross-validation scores are repeatable between runs.
mlp = MLPClassifier(random_state=0)
knn = KNeighborsClassifier(n_neighbors=7, algorithm='auto', leaf_size=1)

# Soft voting combines the predicted class probabilities of the four models.
voter = VotingClassifier(estimators=[('lr', logReg), ('rf', RF), ('mlp', mlp), ('knn', knn)], voting='soft')

# 10-fold cross-validation over ALL games (test rows stacked in front of the train rows).
all_x = np.concatenate((test_x, train_x))
all_y = np.concatenate((test_y, train_y))
scores = cross_val_score(voter, all_x, all_y, cv=10)
print(scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


[0.71717172 0.65656566 0.65656566 0.71717172 0.70707071 0.67346939
 0.6185567  0.65979381 0.64948454 0.77319588]
Accuracy: 0.68 (+/- 0.09)


  if diff:


In [184]:
# Fit the ensemble on the training seasons only, then evaluate on the held-out games.
voter.fit(train_x, train_y.ravel())

# Probability of the class listed second in the ensemble's classes_.
test_proba_voter = voter.predict_proba(test_x)
preds_test = test_proba_voter[:, 1]

test_accuracy_voter = voter.score(test_x, test_y)
print("Accuracy, Test:  ", test_accuracy_voter)
test_log_loss_voter = log_loss(test_y, preds_test)
print("Log Loss, Test:  ", test_log_loss_voter)

Accuracy, Test:   0.746268656716418
Log Loss, Test:   0.4958949977095333


  if diff:


In [185]:
# from sklearn.model_selection import cross_val_score
# knn = KNeighborsClassifier(n_neighbors=7, algorithm='auto', leaf_size=1)
# logReg = LogisticRegression(C=10, tol=0.0001)
# RF = RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=0, max_features=4, min_samples_leaf=40)

# voter = VotingClassifier(estimators=[('lr', logReg), ('rf', RF), ('knn', knn)], voting='soft')
# voter.fit(train_x, train_y)
# print("Accuracy: ", voter.score(test_x, test_y))


# scores2 = cross_val_score(voter, np.concatenate((test_x, train_x)),np.concatenate((test_y, train_y)), cv=4)
# print(scores2)
# print("Accuracy: %0.2f (+/- %0.2f)" % (scores2.mean(), scores2.std() * 2))