In [90]:
# Imports
import numpy as np
import pandas as pd
import random
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.metrics import accuracy_score, log_loss

root = "/home/austin/Github/kaggle-ncaa-2018/"

data = pd.read_csv(root + "derived_data/Master.csv")
tour_ratios = pd.read_csv(root + "derived_data/ratios/NCAATourneyDetailedResultsRatios.csv")

In [91]:
seasons = range(2003, 2018)
seasons_test = [2014, 2015, 2016, 2017]
seasons_train = [season for season in seasons if season not in seasons_test]

all_columns = data.columns.tolist()
non_stats_columns = ["Season", "TeamID", "Seed", "Elo"]
stats_columns = [c for c in all_columns if c not in non_stats_columns]

In [92]:
tourney_games = tour_ratios.loc[tour_ratios["Season"].isin(seasons)]
tourney_games = tourney_games.reset_index(drop=True)

pred_data = pd.DataFrame(index=range(tourney_games.shape[0]), columns= ["Season"] + stats_columns + ["NumTeamWon", "NumTeamSeed", "DenTeamSeed", "NumTeamElo", "DenTeamElo"])
pred_data.loc[:,"NumTeamWon"] = np.ones(tourney_games.shape[0])
index = 0
for i in range(len(seasons)):
    season = seasons[i]
    tourney_games_for_season = tourney_games.loc[tourney_games["Season"] == season].reset_index()
    for j, row in tourney_games_for_season.iterrows():
        pred_data.loc[index, "Season"] = season
        teamA_id = tourney_games_for_season.at[j, "WTeamID"]
        teamB_id = tourney_games_for_season.at[j, "LTeamID"]
        
        teamA_seed = data.loc[(data["TeamID"] == teamA_id) & (data["Season"] == season)].reset_index(drop=True).loc[0, "Seed"]
        teamB_seed = data.loc[(data["TeamID"] == teamB_id) & (data["Season"] == season)].reset_index(drop=True).loc[0, "Seed"]
        
        teamA_elo = data.loc[(data["TeamID"] == teamA_id) & (data["Season"] == season)].reset_index(drop=True).loc[0, "Elo"]
        teamB_elo = data.loc[(data["TeamID"] == teamB_id) & (data["Season"] == season)].reset_index(drop=True).loc[0, "Elo"]
        
        teamA_stats = data.loc[(data["TeamID"] == teamA_id) & (data["Season"] == season), stats_columns].as_matrix()
        teamB_stats = data.loc[(data["TeamID"] == teamB_id) & (data["Season"] == season), stats_columns].as_matrix()
        r = random.random()
        
        if r > 0.5:
            pred_data.at[index, stats_columns] = (teamA_stats / teamB_stats).ravel()
            pred_data.at[index, "NumTeamSeed"] = teamA_seed
            pred_data.at[index, "DenTeamSeed"] = teamB_seed
            pred_data.at[index, "NumTeamElo"] = teamA_elo
            pred_data.at[index, "DenTeamElo"] = teamB_elo
        else:
            pred_data.at[index, stats_columns] = (teamB_stats / teamA_stats).ravel()
            pred_data.at[index, "NumTeamSeed"] = teamB_seed
            pred_data.at[index, "DenTeamSeed"] = teamA_seed
            pred_data.at[index, "NumTeamElo"] = teamB_elo
            pred_data.at[index, "DenTeamElo"] = teamA_elo
            pred_data.at[index, "NumTeamWon"] = 0
        
        index += 1

pred_data = pred_data.dropna(axis=0, how='any')

In [93]:
all_train_columns = stats_columns + ["NumTeamSeed", "DenTeamSeed", "NumTeamElo", "DenTeamElo"]
train_columns_to_drop = ["FT_PCT", "FGM3", "FGA3", "FTM", "DEF_REB_PCT", "FTA", "FT_RATE", "4Factor",
                         "DR", "FG_PCT", "Stl", "ASSIST_RATIO", "PF", "TO", "Ast", "FGA", "OR"]

# train_columns_to_drop = []

train_columns = [c for c in all_train_columns if c not in train_columns_to_drop]

In [94]:
train_x = pred_data.loc[pred_data["Season"].isin(seasons_train)][train_columns].as_matrix()
train_y = pred_data.loc[pred_data["Season"].isin(seasons_train)]["NumTeamWon"].as_matrix()
test_x = pred_data.loc[pred_data["Season"].isin(seasons_test)][train_columns].as_matrix()
test_y = pred_data.loc[pred_data["Season"].isin(seasons_test)]["NumTeamWon"].as_matrix()

In [104]:
from sklearn.linear_model import LogisticRegression

clf = ExtraTreesRegressor(n_estimators=30000, oob_score=True, bootstrap=True, max_features=1, min_samples_leaf=1, n_jobs=-1)
# clf = LogisticRegression(n_jobs=-1)
clf.fit(train_x, train_y.ravel())
print("OOB Score: ", clf.oob_score_)

OOB Score:  0.23062597128633844


In [105]:
preds_train = clf.predict(train_x)
preds_test = clf.predict(test_x)

print("Accuracy, Train: ", accuracy_score(train_y, preds_train > 0.5))
print("Log Loss, Train: ", log_loss(train_y, preds_train))
print("")
print("Accuracy, Test:  ", accuracy_score(test_y, preds_test > 0.5))
print("Log Loss, Test:  ", log_loss(test_y, preds_test))

Accuracy, Train:  1.0
Log Loss, Train:  0.16032171330628392

Accuracy, Test:   0.7201492537313433
Log Loss, Test:   0.5596612827500136


In [97]:
variables = train_columns
feature_importance = clf.feature_importances_

feature_importance, variables = (list(t) for t in zip(*sorted(zip(feature_importance, variables))))

for i in range(len(feature_importance)):
    print(variables[i], ":", feature_importance[i])

4FACTOR : 0.06303330283076683
Blk : 0.06756142149662885
TURNOVER_RATE : 0.06823267306264438
OFF_REB_PCT : 0.06858841976068433
FGM : 0.0715889855500587
DEF_EFF : 0.07334705476142507
WINPCT : 0.07485985647734324
OFF_EFF : 0.0752982499367858
PIE : 0.07903358639888672
DenTeamElo : 0.08296937696944447
NumTeamElo : 0.08591606605909984
DenTeamSeed : 0.09040769030080953
NumTeamSeed : 0.0991633163954238
