In [211]:
import numpy as np
import pandas as pd
import random

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, ExtraTreesRegressor, VotingClassifier
from sklearn.metrics import accuracy_score, log_loss, make_scorer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

random.seed(1)

root = "/home/austin/Github/kaggle-ncaa-2018/"

data = pd.read_csv(root + "derived_data/Master.csv")
matchups = pd.read_csv("/home/austin/Github/DataFiles/TourneyWinratesBySeed.csv")
tour_ratios = pd.read_csv(root + "derived_data/ratios/NCAATourneyDetailedResultsRatios.csv")

In [212]:
seasons = range(2003, 2018)

all_columns = data.columns.tolist()
non_stats_columns = ["Season", "TeamID", "Seed", "Elo"]#, "BPI", "Predictor_Score"]
stats_columns = [c for c in all_columns if c not in non_stats_columns]

# Put all data in a single DataFrame

In [213]:
tourney_games = tour_ratios.loc[tour_ratios["Season"].isin(seasons)]
tourney_games = tourney_games.reset_index(drop=True)
extra_cols = ["NumTeamSeed", "DenTeamSeed", "NumTeamElo", "DenTeamElo", "HistWinPct"]

pred_data = pd.DataFrame(index=range(tourney_games.shape[0]), columns= ["Season"] + stats_columns + extra_cols + ["NumTeamWon"])
pred_data.loc[:,"NumTeamWon"] = np.ones(tourney_games.shape[0])
index = 0
for i in range(len(seasons)):
    season = seasons[i]
    tourney_games_for_season = tourney_games.loc[tourney_games["Season"] == season].reset_index()
    for j, row in tourney_games_for_season.iterrows():
        pred_data.loc[index, "Season"] = season
        teamA_id = tourney_games_for_season.at[j, "WTeamID"]
        teamB_id = tourney_games_for_season.at[j, "LTeamID"]
        
        teamA_seed = data.loc[(data["TeamID"] == teamA_id) & (data["Season"] == season)].reset_index(drop=True).loc[0, "Seed"]
        teamB_seed = data.loc[(data["TeamID"] == teamB_id) & (data["Season"] == season)].reset_index(drop=True).loc[0, "Seed"]
        
        teamA_elo = data.loc[(data["TeamID"] == teamA_id) & (data["Season"] == season)].reset_index(drop=True).loc[0, "Elo"]
        teamB_elo = data.loc[(data["TeamID"] == teamB_id) & (data["Season"] == season)].reset_index(drop=True).loc[0, "Elo"]
        
        teamA_stats = data.loc[(data["TeamID"] == teamA_id) & (data["Season"] == season), stats_columns].as_matrix()
        teamB_stats = data.loc[(data["TeamID"] == teamB_id) & (data["Season"] == season), stats_columns].as_matrix()
        
        r = random.random()
        
        if r > 0.5:
            pred_data.at[index, stats_columns] = (teamA_stats / teamB_stats).ravel()
            pred_data.at[index, "NumTeamSeed"] = teamA_seed
            pred_data.at[index, "DenTeamSeed"] = teamB_seed
            pred_data.at[index, "NumTeamElo"] = teamA_elo
            pred_data.at[index, "DenTeamElo"] = teamB_elo
            pred_data.at[index, "HistWinPct"] = matchups[(matchups["WinSeed"] == teamA_seed) & (matchups["LoseSeed"] == teamB_seed)]["1985"]
        else:
            pred_data.at[index, stats_columns] = (teamB_stats / teamA_stats).ravel()
            pred_data.at[index, "NumTeamSeed"] = teamB_seed
            pred_data.at[index, "DenTeamSeed"] = teamA_seed
            pred_data.at[index, "NumTeamElo"] = teamB_elo
            pred_data.at[index, "DenTeamElo"] = teamA_elo
            pred_data.at[index, "HistWinPct"] = matchups[(matchups["WinSeed"] == teamB_seed) & (matchups["LoseSeed"] == teamA_seed)]["1985"]
            pred_data.at[index, "NumTeamWon"] = 0
        
        index += 1

pred_data = pred_data.dropna(axis=0, how='any')

# Select which stats to use

In [240]:
all_train_columns = stats_columns + extra_cols
# train_columns_to_drop = ["FTM", "FGA", "TS%", "eFG%", "4FACTOR", "FG_PCT", "FT/FGA", "3PAr", "STL%", "Ast",
#                          "ASSIST_RATIO", "DR", "FGA3", "DEF_REB_PCT"]

train_columns_to_drop = ["FGM3", "FTM", "FGA3", "FTA", "4FACTOR", "DEF_REB_PCT", "FT_PCT", "DR", "ASSIST_RATIO",
                         "Stl", "FG_PCT", "FGA", "PF", "FT_RATE", "OR", "Ast", "TO", "Blk"]

train_columns = [c for c in all_train_columns if c not in train_columns_to_drop]
print(train_columns)

['FGM', 'PIE', 'TURNOVER_RATE', 'OFF_REB_PCT', 'OFF_EFF', 'DEF_EFF', 'WINPCT', 'NumTeamSeed', 'DenTeamSeed', 'NumTeamElo', 'DenTeamElo', 'HistWinPct']


# Split data into train and test

In [241]:
seasons_test = [2017]
seasons_train = [season for season in seasons if season not in seasons_test]

train_x = pred_data.loc[pred_data["Season"].isin(seasons_train)][train_columns].as_matrix()
train_y = pred_data.loc[pred_data["Season"].isin(seasons_train)]["NumTeamWon"].as_matrix()
test_x = pred_data.loc[pred_data["Season"].isin(seasons_test)][train_columns].as_matrix()
test_y = pred_data.loc[pred_data["Season"].isin(seasons_test)]["NumTeamWon"].as_matrix()

print(train_x.shape)
print(train_y.shape)
print(test_x.shape)
print(test_y.shape)
print(len(train_columns))

# from keras.utils import to_categorical
# # train_y = to_categorical(train_y)

(914, 12)
(914,)
(67, 12)
(67,)
12


# Logistic Regression

In [242]:
parameters = {'C': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10],
              'tol': [1e-4, 1e-3, 1e-2, 1e-1]
              }

logReg = GridSearchCV(LogisticRegression(), parameters, cv=10)

logReg.fit(train_x, train_y.ravel())

GridSearchCV(cv=10, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'tol': [0.0001, 0.001, 0.01, 0.1], 'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [243]:
preds_train = np.array(logReg.predict_proba(train_x))
preds_test_lr = np.array(logReg.predict_proba(test_x))[:,1]

# print(preds_test)
# print(test_y)

# threshold = .1

# preds_test_lr[preds_test_lr > 1 - threshold] = 1
# preds_test_lr[preds_test_lr < threshold] = 0

# print(logReg.best_params_)

# print("Accuracy, Train: ", accuracy_score(train_y, preds_train > 0.5))
# print("Log Loss, Train: ", log_loss(train_y, preds_train))
# print("")
print("Accuracy, Test:  ", accuracy_score(test_y, preds_test_lr > 0.5))
print("Log Loss, Test:  ", log_loss(test_y, preds_test_lr))

Accuracy, Test:   0.746268656716418
Log Loss, Test:   0.5002241648313955


# Random Forest

In [263]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

parameters = {'max_features': np.arange(1,12,1),
              'min_samples_split': np.arange(2,50,4),
              'min_samples_leaf': np.arange(1,25,4)
              }

RF = GridSearchCV(RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0), parameters,
                    cv=10, scoring=make_scorer(log_loss))
# RF = RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=0, max_features=4, min_samples_leaf=40)

# clf = ExtraTreesRegressor(n_estimators=1000, max_features=1, n_jobs=-1)
# clf = LogisticRegression(n_jobs=-1)
RF.fit(train_x, train_y.ravel())
# print("OOB Score: ", clf.oob_score_)

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
           oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'min_samples_split': array([ 2,  6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46]), 'min_samples_leaf': array([ 1,  5,  9, 13, 17, 21]), 'max_features': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(log_loss), verbose=0)

In [266]:
print(RF.best_params_)

preds_train = RF.predict(train_x)
# preds_test_rf = RF.predict_proba(test_x)[:,1]
preds_test_rf = RF.predict(test_x)

# threshold = .1
# preds_test_rf[preds_test_rf > 1 - threshold] = 1
# preds_test_rf[preds_test_rf < threshold] = 0

# print("Accuracy, Train: ", accuracy_score(train_y, preds_train > 0.5))
# print("Log Loss, Train: ", log_loss(train_y, preds_train))
# print("")
print("Accuracy, Test:  ", accuracy_score(test_y, preds_test_rf > 0.5))
print("Log Loss, Test:  ", log_loss(test_y, preds_test_rf))

{'min_samples_leaf': 1, 'min_samples_split': 2, 'max_features': 11}
Accuracy, Test:   0.7164179104477612
Log Loss, Test:   0.5478577232123406


In [246]:
variables = train_columns
feature_importance = RF.feature_importances_

feature_importance, variables = (list(t) for t in zip(*sorted(zip(feature_importance, variables))))

for i in range(len(feature_importance)):
    print(variables[i], ":", feature_importance[i])

OFF_REB_PCT : 0.014324658861801336
TURNOVER_RATE : 0.01660118746435672
FGM : 0.018933310687614825
DEF_EFF : 0.02105622548257219
OFF_EFF : 0.031947183830035475
WINPCT : 0.03345639859970109
DenTeamSeed : 0.07069475619186985
DenTeamElo : 0.07175865647977786
PIE : 0.0737976526888745
NumTeamSeed : 0.09452458727097553
NumTeamElo : 0.17270712944538316
HistWinPct : 0.38019825299703836


# KNN

# Voter

In [250]:
voter = VotingClassifier(estimators=[('lr', logReg), ('rf', RF)], voting='soft')
voter.fit(train_x, train_y.ravel())

In [260]:
preds_test = voter.predict_proba(test_x)[:,1]
threshold = .12
preds_test[preds_test > 1 - threshold] = 1
preds_test[preds_test < threshold] = 0
print("Accuracy, Test:  ", accuracy_score(test_y, preds_test > 0.5))
print("Log Loss, Test:  ", log_loss(test_y, preds_test))

Accuracy, Test:   0.7611940298507462
Log Loss, Test:   0.4994045865095755


In [261]:
preds = (preds_test_lr + preds_test_rf) / 2
print("Log Loss, Test:  ", log_loss(test_y, preds))

Log Loss, Test:   0.5118223012088069
