In [2]:
# Imports
import numpy as np
import pandas as pd
import random
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, log_loss
import math

In [8]:
# Import relevant data files
reg_ratios = pd.read_csv("RegularSeasonDetailedResultsRatios.csv")
tour_ratios = pd.read_csv("NCAATourneyDetailedResultsRatios.csv")
reg_season_avgs = pd.read_csv("RegularSeasonAverageRatios.csv")

# List of all columns in the datasets
# all_cols = reg_ratios.columns.tolist()
all_cols = ["Season", "WTeamID", "WScore", "LTeamID", "LScore", "NumTeamWon", "Loc",
                 "FGMR", "FGAR", "FGMR3", "FGAR3", "FTMR", "FTAR", "ORR", "DRR", "AstR",
                 "TOR", "StlR", "BlkR", "PFR"]

# Columns that won't be used in the dataset
cols_to_drop = ["Season", "WTeamID", "WScore", "LTeamID", "LScore", "NumTeamWon", "Loc"]

# Columns that will be used in the dataset
stats_columns = [item for item in all_cols if item not in cols_to_drop]

# Seasons to look at
seasons = [2017]

# Specify what to use as the training data
train_data = reg_ratios

# Get all of the training data for the given years
train_x = train_data.loc[reg_ratios["Season"].isin(seasons)].drop(labels=cols_to_drop, axis=1).as_matrix()
train_y = train_data.loc[reg_ratios["Season"].isin(seasons)]["NumTeamWon"].as_matrix()

In [9]:
tourney_games = tour_ratios.loc[tour_ratios["Season"].isin(seasons)]
tourney_games = tourney_games.reset_index(drop=True)

test_x = pd.DataFrame(index=range(tourney_games.shape[0]), columns=stats_columns)
test_y = np.ones(67*len(seasons))
for i in range(len(seasons)):
    season = seasons[i]
    tourney_games_for_season = tourney_games.loc[tourney_games["Season"] == season]
    for j, row in tourney_games_for_season.iterrows():
        index = i * 67 + j
        teamA_id = tourney_games_for_season.at[j, "WTeamID"]
        teamB_id = tourney_games_for_season.at[j, "LTeamID"]
        teamA_stats = reg_season_avgs.loc[(reg_season_avgs["TeamID"] == teamA_id) & (reg_season_avgs["Season"] == season), stats_columns].as_matrix()
        teamB_stats = reg_season_avgs.loc[(reg_season_avgs["TeamID"] == teamB_id) & (reg_season_avgs["Season"] == season), stats_columns].as_matrix()
        r = random.random()
        if r > 0.5:
            test_x.loc[index, stats_columns] = teamA_stats / teamB_stats
        else:
            test_x.loc[index, stats_columns] = teamB_stats / teamA_stats
            test_y[j] = 0
    
test_x = test_x.as_matrix()
print("Train X: ", train_x.shape)
print("Train Y: ", train_y.shape)
print("Test X:  ", test_x.shape)
print("Test Y:  ", test_y.shape)

Train X:  (5395, 13)
Train Y:  (5395,)
Test X:   (67, 13)
Test Y:   (67,)


In [10]:
clf = RandomForestRegressor(n_estimators=5000, oob_score=False, max_features=1, min_samples_leaf=50, n_jobs=-1)
clf.fit(train_x, train_y.ravel())

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=1, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=50,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=5000, n_jobs=-1, oob_score=False,
           random_state=None, verbose=0, warm_start=False)

In [11]:
preds_train = clf.predict(train_x)
preds_test = clf.predict(test_x)

print("Accuracy, Train: ", accuracy_score(train_y, preds_train > 0.5))
print("Log Loss, Train: ", log_loss(train_y, preds_train))
print("")
print("Accuracy, Test:  ", accuracy_score(test_y, preds_test > 0.5))
print("Log Loss, Test:  ", log_loss(test_y, preds_test))

Accuracy, Train:  0.9490268767377201
Log Loss, Train:  0.32303054403482695

Accuracy, Test:   0.6865671641791045
Log Loss, Test:   0.5978310327542643


In [12]:
variables = stats_columns
feature_importance = clf.feature_importances_
for i in range(len(feature_importance)):
    print(variables[i], ":", feature_importance[i])

FGMR : 0.20654888069483596
FGAR : 0.020037413336939577
FGMR3 : 0.05914881017111136
FGAR3 : 0.013465290749479218
FTMR : 0.09872771193889555
FTAR : 0.08368906068316485
ORR : 0.006865472212006444
DRR : 0.1858764377111324
AstR : 0.1369655149636822
TOR : 0.039138604276543046
StlR : 0.03183936615180908
BlkR : 0.039642835300210816
PFR : 0.07805460181018832
