In [343]:
import numpy as np
import pandas as pd
import random

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, ExtraTreesRegressor, VotingClassifier
from sklearn.metrics import accuracy_score, log_loss, make_scorer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

random.seed(1)

root = "/home/austin/Github/kaggle-ncaa-2018/"

data = pd.read_csv(root + "derived_data/Master3.csv")
matchups = pd.read_csv("/home/austin/Github/DataFiles/TourneyWinratesBySeed.csv")
tour_ratios = pd.read_csv(root + "derived_data/ratios/NCAATourneyDetailedResultsRatios.csv")

In [344]:
seasons = range(2010, 2018)

all_columns = data.columns.tolist()
non_stats_columns = ["Season", "TeamID", "Seed", "Elo", "NumTeamID", "DenTeamID"]#, "BPI", "Predictor_Score"]
stats_columns = [c for c in all_columns if c not in non_stats_columns]
# stats_columns_A = ["A"+c for c in all_columns if c not in non_stats_columns]
# stats_columns_B = ["B"+c for c in all_columns if c not in non_stats_columns]

# Put all data in a single DataFrame

In [345]:
tourney_games = tour_ratios.loc[tour_ratios["Season"].isin(seasons)]
tourney_games = tourney_games.reset_index(drop=True)
extra_cols = ["NumTeamSeed", "DenTeamSeed", "NumTeamElo", "DenTeamElo", "HistWinPct"]

pred_data = pd.DataFrame(index=range(tourney_games.shape[0]), columns= ["Season", "NumTeamID", "DenTeamID"] + stats_columns + extra_cols + ["Upset","NumTeamWon"])
pred_data.loc[:,"NumTeamWon"] = np.ones(tourney_games.shape[0])
pred_data.loc[:,"Upset"] = np.zeros(tourney_games.shape[0])
gap = 4
index = 0
for i in range(len(seasons)):
    season = seasons[i]
    tourney_games_for_season = tourney_games.loc[tourney_games["Season"] == season].reset_index()
    for j, row in tourney_games_for_season.iterrows():
        pred_data.loc[index, "Season"] = season
        teamA_id = tourney_games_for_season.at[j, "WTeamID"]
        teamB_id = tourney_games_for_season.at[j, "LTeamID"]
        
        teamA_seed = data.loc[(data["TeamID"] == teamA_id) & (data["Season"] == season)].reset_index(drop=True).loc[0, "Seed"]
        teamB_seed = data.loc[(data["TeamID"] == teamB_id) & (data["Season"] == season)].reset_index(drop=True).loc[0, "Seed"]
        
        teamA_elo = data.loc[(data["TeamID"] == teamA_id) & (data["Season"] == season)].reset_index(drop=True).loc[0, "Elo"]
        teamB_elo = data.loc[(data["TeamID"] == teamB_id) & (data["Season"] == season)].reset_index(drop=True).loc[0, "Elo"]
        
        teamA_stats = data.loc[(data["TeamID"] == teamA_id) & (data["Season"] == season), stats_columns].as_matrix()
        teamB_stats = data.loc[(data["TeamID"] == teamB_id) & (data["Season"] == season), stats_columns].as_matrix()
        
        r = random.random()
        
        if abs(teamA_seed - teamB_seed) < gap:
            continue
        
        if r > 0.5:
            pred_data.at[index, stats_columns] = (teamA_stats / teamB_stats).ravel()
            pred_data.at[index, "NumTeamID"] = teamA_id
            pred_data.at[index, "DenTeamID"] = teamB_id
            pred_data.at[index, "NumTeamSeed"] = teamA_seed
            pred_data.at[index, "DenTeamSeed"] = teamB_seed
            pred_data.at[index, "NumTeamElo"] = teamA_elo
            pred_data.at[index, "DenTeamElo"] = teamB_elo
            pred_data.at[index, "HistWinPct"] = matchups.loc[(matchups["WinSeed"] == teamA_seed) & (matchups["LoseSeed"] == teamB_seed), "1985"].reset_index(drop=True)[0]
            
        else:
            pred_data.at[index, stats_columns] = (teamB_stats / teamA_stats).ravel()
            pred_data.at[index, "NumTeamID"] = teamB_id
            pred_data.at[index, "DenTeamID"] = teamA_id
            pred_data.at[index, "NumTeamSeed"] = teamB_seed
            pred_data.at[index, "DenTeamSeed"] = teamA_seed
            pred_data.at[index, "NumTeamElo"] = teamB_elo
            pred_data.at[index, "DenTeamElo"] = teamA_elo
            pred_data.at[index, "HistWinPct"] = matchups.loc[(matchups["WinSeed"] == teamB_seed) & (matchups["LoseSeed"] == teamA_seed), "1985"].reset_index(drop=True)[0]
            pred_data.at[index, "NumTeamWon"] = 0
            
        if teamA_seed >= teamB_seed + gap:
            pred_data.at[index, "Upset"] = 1
        
        index += 1

pred_data = pred_data.dropna(axis=0, how='any')

# Select which stats to use

In [397]:
all_train_columns = stats_columns + extra_cols

# train_columns_to_drop = ["FGA", "Ast", "FGM", "Blk", "STL%", "Stl", "DR", "NumTeamSeed", "NumTeamElo", "OR", "Pace",
#                          "ASSIST_RATIO", "AST%", "PF", "DEF_EFF", "PIE", "FT_PCT"]

train_columns_to_drop = ["Upset"]

train_columns = [c for c in all_train_columns if c not in train_columns_to_drop]
print(train_columns)
# print(pred_data)

['FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF', 'PIE', 'FG_PCT', 'TURNOVER_RATE', 'OFF_REB_PCT', 'FT_RATE', '4FACTOR', 'OFF_EFF', 'DEF_EFF', 'ASSIST_RATIO', 'DEF_REB_PCT', 'FT_PCT', 'WINPCT', 'Pace', 'ORtg', 'FTr', '3PAr', 'TS%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'eFG%', 'TOV%', 'ORB%', 'FT/FGA', 'NumTeamSeed', 'DenTeamSeed', 'NumTeamElo', 'DenTeamElo', 'HistWinPct']


In [398]:
# train_columns = features_to_keep
# train_columns = ['TURNOVER_RATE', 'PF', 'WINPCT', 'DenTeamElo', 'NumTeamElo']
# print(train_columns)

# Split data into train and test

In [414]:
from sklearn.utils import shuffle

seasons_test = [2017]
seasons_train = [season for season in seasons if season not in seasons_test]
print(seasons_train)

# train_x = pred_data.loc[pred_data["Season"].isin(seasons_train)][train_columns].as_matrix()
# train_y = pred_data.loc[pred_data["Season"].isin(seasons_train)]["NumTeamWon"].as_matrix()
# test_x = pred_data.loc[pred_data["Season"].isin(seasons_test)][train_columns].as_matrix()
# test_y = pred_data.loc[pred_data["Season"].isin(seasons_test)]["NumTeamWon"].as_matrix()

pred_data_no_upset = pred_data[pred_data["Upset"] == 0]
pred_data_upset = pred_data[pred_data["Upset"] == 1]
num_upset = pred_data_upset.shape[0]
pred_data_no_upset_sample = pred_data_no_upset.sample(num_upset)

data_to_use = pd.concat([pred_data_upset, pred_data_no_upset_sample])
data_to_use = shuffle(data_to_use)

train_x = data_to_use.loc[data_to_use["Season"].isin(seasons_train)][train_columns].as_matrix()
train_y = data_to_use.loc[data_to_use["Season"].isin(seasons_train)]["NumTeamWon"].as_matrix()
test_x = pred_data.loc[pred_data["Season"].isin(seasons_test)][train_columns].as_matrix()
test_y = pred_data.loc[pred_data["Season"].isin(seasons_test)]["NumTeamWon"].as_matrix()

print(train_x.shape)
print(train_y.shape)
print(test_x.shape)
print(test_y.shape)

# print(pred_data_no_upset)

# from keras.utils import to_categorical
# # train_y = to_categorical(train_y)

[2010, 2011, 2012, 2013, 2014, 2015, 2016]
(154, 43)
(154,)
(42, 43)
(42,)


In [415]:
# print(train_x)
# print(train_y)

# Logistic Regression

In [419]:
from sklearn.model_selection import cross_val_score

parameters = {'C': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10],
              'tol': [1e-4, 1e-3, 1e-2, 1e-1]
              }

logReg = GridSearchCV(LogisticRegression(), parameters, cv=3)
# logReg = LogisticRegression(C=10, tol=0.0001)

logReg.fit(train_x, train_y.ravel())

scores= cross_val_score(logReg, np.concatenate((test_x, train_x)),np.concatenate((test_y, train_y)), cv=5)
print(scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
logReg.fit(train_x, train_y.ravel())

[0.75       0.65       0.61538462 0.61538462 0.63157895]
Accuracy: 0.65 (+/- 0.10)


GridSearchCV(cv=3, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10], 'tol': [0.0001, 0.001, 0.01, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [420]:
# preds_train = np.array(logReg.predict_proba(train_x))
preds_test_lr = np.array(logReg.predict_proba(test_x))[:,1]

# print(preds_test)
# print(test_y)

# threshold = .1

# preds_test_lr[preds_test_lr > 1 - threshold] = 1
# preds_test_lr[preds_test_lr < threshold] = 0

print(logReg.best_params_)

# print("Accuracy, Train: ", accuracy_score(train_y, preds_train > 0.5))
# print("Log Loss, Train: ", log_loss(train_y, preds_train))
# print("")
print("Accuracy: ", logReg.score(test_x,test_y))
# print("Accuracy, Test:  ", accuracy_score(test_y, preds_test_lr > 0.5))
print("Log Loss, Test:  ", log_loss(test_y, preds_test_lr))
print("")
# print(preds_test_lr)
# to_print = pred_data.loc[pred_data["Season"].isin(seasons_test)].reset_index(drop=True)

# pd.set_option('display.max_rows', len(to_print))
# pd.set_option('display.max_columns', to_print.shape[1])
# print(to_print[features_to_keep])
# pd.reset_option('display.max_rows')
# pd.reset_option('display.max_columns')

{'C': 1, 'tol': 0.01}
Accuracy:  0.7142857142857143
Log Loss, Test:   0.538190096535906



# Random Forest

In [421]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

parameters = {'max_features': np.arange(1,12,1),
              'min_samples_split': np.arange(2,50,4),
              'min_samples_leaf': np.arange(1,25,4)
              }

# RF = RandomizedSearchCV(RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0), parameters,
#                     cv=10, n_iter=10)
RF = RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=0)
scores2 = cross_val_score(RF, np.concatenate((test_x, train_x)),np.concatenate((test_y, train_y)), cv=5)
print(scores2)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores2.mean(), scores2.std() * 2))
# clf = ExtraTreesRegressor(n_estimators=1000, max_features=1, n_jobs=-1)
# clf = LogisticRegression(n_jobs=-1)
RF.fit(train_x, train_y.ravel())

# print("OOB Score: ", clf.oob_score_)

[0.65       0.5        0.56410256 0.53846154 0.57894737]
Accuracy: 0.57 (+/- 0.10)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [423]:
# print(RF.best_params_)

preds_train = RF.predict(train_x)
# preds_test_rf = RF.predict_proba(test_x)[:,1]
preds_test_rf = RF.predict(test_x)

# threshold = .1
# preds_test_rf[preds_test_rf > 1 - threshold] = 1
# preds_test_rf[preds_test_rf < threshold] = 0

# print("Accuracy, Train: ", accuracy_score(train_y, preds_train > 0.5))
# print("Log Loss, Train: ", log_loss(train_y, preds_train))
# print("")
print("Accuracy, Test:  ", RF.score(test_x, test_y))
print("Log Loss, Test:  ", log_loss(test_y, preds_test_rf))

Accuracy, Test:   0.6428571428571429
Log Loss, Test:   12.33541055013504


In [424]:
variables = train_columns
feature_importance = RF.feature_importances_

feature_importance, variables = (list(t) for t in zip(*sorted(zip(feature_importance, variables))))

num_features = len(feature_importance)
# features_to_keep = []
for i in range(len(feature_importance)):
#     if feature_importance[i] > 0.026:
#         features_to_keep.append(variables[i])
    print(variables[i], ":", feature_importance[i])

print(features_to_keep)

FG_PCT : 0.013731697294545733
TO : 0.014129071307428399
TOV% : 0.014165522015906398
TS% : 0.014629367008433569
OFF_EFF : 0.014858208313150111
eFG% : 0.017268282698777053
Blk : 0.017329545244196342
TURNOVER_RATE : 0.018223381947596082
FTM : 0.018446337846322984
BLK% : 0.018732194218405784
FTA : 0.01894987026737003
FGA : 0.019123899415458116
ORtg : 0.019322564647558372
4FACTOR : 0.01940918488591067
NumTeamSeed : 0.01957880816543807
DenTeamSeed : 0.02065968312146631
DR : 0.020694755503742068
TRB% : 0.020931817610488488
Ast : 0.021275932815187592
FGA3 : 0.021526407765236247
ASSIST_RATIO : 0.021607502998462005
NumTeamElo : 0.022682083407905822
Pace : 0.023170681565445946
FT_RATE : 0.023411968617235414
FT/FGA : 0.023772746011256906
WINPCT : 0.02398216454818644
FT_PCT : 0.024123582117236305
PF : 0.024127325769006204
3PAr : 0.024417963355762507
FGM3 : 0.024607895056980893
DEF_REB_PCT : 0.02483207320685145
OR : 0.02492053056850508
STL% : 0.02515522315094342
FGM : 0.025440475293994062
FTr : 0.02

# KNN

# Voter

In [425]:
voter = VotingClassifier(estimators=[('lr', logReg), ('rf', RF)], voting='soft')
voter.fit(train_x, train_y.ravel())

VotingClassifier(estimators=[('lr', GridSearchCV(cv=3, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001...imators=1000, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [426]:
preds_test = voter.predict_proba(test_x)[:,1]
# print("Accuracy, Test:  ", accuracy_score(test_y, preds_test > 0.5))
# print("Log Loss, Test:  ", log_loss(test_y, preds_test))
print("Accuracy ", voter.score(test_x,test_y))

Accuracy  0.7142857142857143


  if diff:


In [428]:
RF = RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=0)
logReg = LogisticRegression(C=1, tol=0.01)
voter = VotingClassifier(estimators=[('lr', logReg), ('rf', RF)], voting='soft')
scores2 = cross_val_score(voter, np.concatenate((test_x, train_x)),np.concatenate((test_y, train_y)), cv=4)
print(scores2)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores2.mean(), scores2.std() * 2))

  if diff:
  if diff:
  if diff:


[0.7        0.57142857 0.51020408 0.5625    ]
Accuracy: 0.59 (+/- 0.14)


  if diff:
