In [62]:
# Imports
import numpy as np
import pandas as pd
import random
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, log_loss
import math

root = "/home/austin/Github/kaggle-ncaa-2018/"

In [63]:
# Import relevant data files
reg_ratios = pd.read_csv(root + "derived_data/ratios/RegularSeasonDetailedResultsRatios.csv")
tour_ratios = pd.read_csv(root + "derived_data/ratios/NCAATourneyDetailedResultsRatios.csv")
reg_season_avgs = pd.read_csv(root + "derived_data/ratios/RegularSeasonAverageRatiosForTournamentTeams.csv")

# List of all columns in the datasets
# all_cols = reg_ratios.columns.tolist()
all_cols = ["Season", "WTeamID", "WScore", "LTeamID", "LScore", "NumTeamWon", "Loc",
                 "FGMR", "FGAR", "FGMR3", "FGAR3", "FTMR", "FTAR", "ORR", "DRR", "AstR",
                 "TOR", "StlR", "BlkR", "PFR"]

# Columns that won't be used in the dataset
cols_to_drop = ["Season", "WTeamID", "WScore", "LTeamID", "LScore", "NumTeamWon", "Loc"]

# Columns that will be used in the dataset
stats_columns = [item for item in all_cols if item not in cols_to_drop]

# Seasons to look at
seasons = [2015, 2016, 2017]

# Specify what to use as the training data
train_data = reg_ratios

# Get all of the training data for the given years
train_x = train_data.loc[reg_ratios["Season"].isin(seasons)].drop(labels=cols_to_drop, axis=1).as_matrix()
train_y = train_data.loc[reg_ratios["Season"].isin(seasons)]["NumTeamWon"].as_matrix()

In [64]:
tourney_games = tour_ratios.loc[tour_ratios["Season"].isin(seasons)]
tourney_games = tourney_games.reset_index(drop=True)

test_x = pd.DataFrame(index=range(tourney_games.shape[0]), columns=stats_columns)
test_y = np.ones(67*len(seasons))
index = 0
for i in range(len(seasons)):
    season = seasons[i]
    tourney_games_for_season = tourney_games.loc[tourney_games["Season"] == season].reset_index()
    for j, row in tourney_games_for_season.iterrows():
        teamA_id = tourney_games_for_season.at[j, "WTeamID"]
        teamB_id = tourney_games_for_season.at[j, "LTeamID"]
        teamA_stats = reg_season_avgs.loc[(reg_season_avgs["TeamID"] == teamA_id) & (reg_season_avgs["Season"] == season), stats_columns].as_matrix()
        teamB_stats = reg_season_avgs.loc[(reg_season_avgs["TeamID"] == teamB_id) & (reg_season_avgs["Season"] == season), stats_columns].as_matrix()
        r = random.random()
        if r > 0.5:
            test_x.loc[index, stats_columns] = (teamA_stats / teamB_stats).ravel()
        else:
            test_x.loc[index, stats_columns] = (teamB_stats / teamA_stats).ravel()
            test_y[index] = 0
            
#         if index == 0:
#             print(test_x.loc[index, stats_columns])
#             print(teamA_stats / teamB_stats)
#             print(teamA_stats)
#             print(teamB_stats)
        index += 1
    
test_x = test_x.as_matrix()
print("Train X: ", train_x.shape)
print("Train Y: ", train_y.shape)
print("Test X:  ", test_x.shape)
print("Test Y:  ", test_y.shape)

Train X:  (16118, 13)
Train Y:  (16118,)
Test X:   (201, 13)
Test Y:   (201,)


In [65]:
clf = RandomForestRegressor(n_estimators=5000, oob_score=False, max_features=1, min_samples_leaf=1, n_jobs=-1)
clf.fit(train_x, train_y.ravel())

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=1, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=5000, n_jobs=-1, oob_score=False,
           random_state=None, verbose=0, warm_start=False)

In [66]:
preds_train = clf.predict(train_x)
preds_test = clf.predict(test_x)

print("Accuracy, Train: ", accuracy_score(train_y, preds_train > 0.5))
print("Log Loss, Train: ", log_loss(train_y, preds_train))
print("")
print("Accuracy, Test:  ", accuracy_score(test_y, preds_test > 0.5))
print("Log Loss, Test:  ", log_loss(test_y, preds_test))

Accuracy, Train:  1.0
Log Loss, Train:  0.06157566729937712

Accuracy, Test:   0.6467661691542289
Log Loss, Test:   0.6637331930981382


In [67]:
variables = stats_columns
feature_importance = clf.feature_importances_
for i in range(len(feature_importance)):
    print(variables[i], ":", feature_importance[i])

FGMR : 0.18870282833247795
FGAR : 0.036585243568917875
FGMR3 : 0.059028297979774796
FGAR3 : 0.034008658267835015
FTMR : 0.09885460306020129
FTAR : 0.07905456058443956
ORR : 0.027424622241236544
DRR : 0.15828999706881924
AstR : 0.1114756374857551
TOR : 0.05457112017885099
StlR : 0.03978102584501234
BlkR : 0.04077396383368072
PFR : 0.07144944155299804
