In [47]:
# Imports
import numpy as np
import pandas as pd
import random
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, log_loss
import math

root = "/home/austin/Github/kaggle-ncaa-2018/"

In [48]:
# Import relevant data files
reg_ratios = pd.read_csv(root + "derived_data/ratios/RegularSeasonDetailedResultsRatios.csv")
reg_ratios_comp = pd.read_csv(root + "derived_data/ratios/RegularSeasonAverageRatiosWithComposites.csv")
tour_ratios = pd.read_csv(root + "derived_data/ratios/NCAATourneyDetailedResultsRatios.csv")
reg_season_avgs = pd.read_csv(root + "derived_data/ratios/RegularSeasonAverageRatiosForTournamentTeams.csv")

# List of all columns in the datasets
all_cols = reg_ratios_comp.columns.tolist()
# all_cols = ["Season", "WTeamID", "WScore", "LTeamID", "LScore", "NumTeamWon", "Loc",
#                  "FGMR", "FGAR", "FGMR3", "FGAR3", "FTMR", "FTAR", "ORR", "DRR", "AstR",
#                  "TOR", "StlR", "BlkR", "PFR"]

# Columns that won't be used in the dataset
cols_to_drop = ["Season", "TeamID"]

# Columns that will be used in the dataset
# stats_columns = [item for item in all_cols if item not in cols_to_drop]
stats_columns = ["PIE", "FT_RATE", "OFF_EFF", "FGMR", "FTMR", "WINPCT", "BlkR", "OFF_REB_PCT", "TURNOVER_RATE",
                 "StlR", "TOR"]


print(stats_columns)

# Seasons to look at
seasons = [2011, 2012, 2013, 2014, 2015, 2016, 2017]
seasons_test = [2017]
seasons_train = [season for season in seasons if season not in seasons_test]

# # Specify what to use as the training data
# train_data = reg_ratios_comp

# # Get all of the training data for the given years
# train_x = train_data.loc[train_data["Season"].isin(seasons)].drop(labels=cols_to_drop, axis=1).as_matrix()
# train_y = train_data.loc[train_data["Season"].isin(seasons)]["NumTeamWon"].as_matrix()

['PIE', 'FT_RATE', 'OFF_EFF', 'FGMR', 'FTMR', 'WINPCT', 'BlkR', 'OFF_REB_PCT', 'TURNOVER_RATE', 'StlR', 'TOR']


In [49]:
tourney_games = tour_ratios.loc[tour_ratios["Season"].isin(seasons)]
tourney_games = tourney_games.reset_index(drop=True)

all_data = pd.DataFrame(index=range(tourney_games.shape[0]), columns= ["Season"] + stats_columns + ["NumTeamWon"])
all_data.loc[:,"NumTeamWon"] = np.ones(tourney_games.shape[0])
index = 0
for i in range(len(seasons)):
    season = seasons[i]
    tourney_games_for_season = tourney_games.loc[tourney_games["Season"] == season].reset_index()
    for j, row in tourney_games_for_season.iterrows():
        all_data.loc[index, "Season"] = season
        teamA_id = tourney_games_for_season.at[j, "WTeamID"]
        teamB_id = tourney_games_for_season.at[j, "LTeamID"]
        teamA_stats = reg_ratios_comp.loc[(reg_ratios_comp["TeamID"] == teamA_id) & (reg_ratios_comp["Season"] == season), stats_columns].as_matrix()
        teamB_stats = reg_ratios_comp.loc[(reg_ratios_comp["TeamID"] == teamB_id) & (reg_ratios_comp["Season"] == season), stats_columns].as_matrix()
        r = random.random()
        if r > 0.5:
            all_data.loc[index, stats_columns] = (teamA_stats / teamB_stats).ravel()
        else:
            all_data.loc[index, stats_columns] = (teamB_stats / teamA_stats).ravel()
            all_data.loc[index, "NumTeamWon"] = 0
            
#         if index == 0:
#             print(all_data.loc[index, stats_columns])
#             print(teamA_stats / teamB_stats)
#             print(teamA_stats)
#             print(teamB_stats)
        
        index += 1

In [50]:
train_x = all_data.loc[all_data["Season"].isin(seasons_train)][stats_columns].as_matrix()
train_y = all_data.loc[all_data["Season"].isin(seasons_train)]["NumTeamWon"].as_matrix()
test_x = all_data.loc[all_data["Season"].isin(seasons_test)][stats_columns].as_matrix()
test_y = all_data.loc[all_data["Season"].isin(seasons_test)]["NumTeamWon"].as_matrix()

print("Train X: ", train_x.shape)
print("Train Y: ", train_y.shape)
print("Test X:  ", test_x.shape)
print("Test Y:  ", test_y.shape)

Train X:  (402, 11)
Train Y:  (402,)
Test X:   (67, 11)
Test Y:   (67,)


In [59]:
clf = RandomForestRegressor(n_estimators=5000, oob_score=False, max_features=1, min_samples_leaf=1, n_jobs=-1)
clf.fit(train_x, train_y.ravel())

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=1, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=5000, n_jobs=-1, oob_score=False,
           random_state=None, verbose=0, warm_start=False)

In [60]:
preds_train = clf.predict(train_x)
preds_test = clf.predict(test_x)

print("Accuracy, Train: ", accuracy_score(train_y, preds_train > 0.5))
print("Log Loss, Train: ", log_loss(train_y, preds_train))
print("")
print("Accuracy, Test:  ", accuracy_score(test_y, preds_test > 0.5))
print("Log Loss, Test:  ", log_loss(test_y, preds_test))

Accuracy, Train:  1.0
Log Loss, Train:  0.17665896758155253

Accuracy, Test:   0.6417910447761194
Log Loss, Test:   0.6068496847627293


In [61]:
variables = stats_columns
feature_importance = clf.feature_importances_

feature_importance, variables = (list(t) for t in zip(*sorted(zip(feature_importance, variables))))

for i in range(len(feature_importance)):
    print(variables[i], ":", feature_importance[i])

TOR : 0.08539325306976453
StlR : 0.08670190081258734
OFF_REB_PCT : 0.0881299616729374
FT_RATE : 0.08887598417463675
WINPCT : 0.09073579933777068
BlkR : 0.09100954057615845
OFF_EFF : 0.09238602302107347
TURNOVER_RATE : 0.09245394566305176
FGMR : 0.09291952047889912
FTMR : 0.0949083202541041
PIE : 0.09648575093901735
