In [40]:
# Imports
import numpy as np
import pandas as pd
import random
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, log_loss
import math

In [28]:
# Import relevant data files
reg_ratios = pd.read_csv("RegularSeasonDetailedResultsRatios.csv")
tour_ratios = pd.read_csv("NCAATourneyDetailedResultsRatios.csv")

# List of all columns in the datasets
# all_cols = reg_ratios.columns.tolist()
all_cols = ["Season", "WTeamID", "WScore", "LTeamID", "LScore", "NumTeamWon", "Loc",
                 "FGMR", "FGAR", "FGMR3", "FGAR3", "FTMR", "FTAR", "ORR", "DRR", "AstR",
                 "TOR", "StlR", "BlkR", "PFR"]

# Columns that won't be used in the dataset
cols_to_drop = ["Season", "WTeamID", "WScore", "LTeamID", "LScore", "NumTeamWon", "Loc"]

# Columns that will be used in the dataset
stats_columns = [item for item in all_cols if item not in cols_to_drop]

# Years to look at
seasons = [2017]

# Specify what to use as the training data
train_data = reg_ratios

# Get all of the training data for the given years
train_x = reg_ratios.loc[reg_ratios["Season"].isin(seasons)].drop(labels=cols_to_drop, axis=1).as_matrix()
train_y = reg_ratios.loc[reg_ratios["Season"].isin(seasons)]["NumTeamWon"].as_matrix()

# Get all team IDs from the tournament
reg_season_avg_cols = ["Season", "TeamID"] + stats_columns

# Create a dataframe with teamIDs for all tourneys with season in which that team played
team_ids = pd.DataFrame(index=range(len(seasons)*68), columns=["Season", "TeamID"])
for season in seasons:
    team_ids_for_season = pd.unique(tour_ratios.loc[tour_ratios["Season"] == 2017, ["WTeamID", "LTeamID"]].values.ravel('K'))
    for i in range(len(team_ids_for_season)):
        team_ids.loc[i, "Season"] = season
        team_ids.loc[i, "TeamID"] = team_ids_for_season[i]

# Create a data sets to hold regular season averages for every team in the tournament for every specified year
reg_season_avgs = pd.DataFrame(index=range(team_ids.shape[0]), columns=reg_season_avg_cols)
reg_season_avgs["Season"] = team_ids["Season"]
reg_season_avgs["TeamID"] = team_ids["TeamID"]

for season in seasons:
    team_ids_for_season = reg_season_avgs.loc[reg_season_avgs["Season"] == season, "TeamID"].tolist()
        
    for team_id in team_ids_for_season:
        games_all = reg_ratios.loc[(reg_ratios["WTeamID"] == team_id) | (reg_ratios["LTeamID"] == team_id)]
        games = games_all.loc[games_all["Season"] == season]
        games.reset_index(inplace=True, drop=True)
        games_stats = games.loc[:, stats_columns]
        for i, row in games.iterrows():
            if ((team_id == row["WTeamID"]) & (row["NumTeamWon"] == 0)) | ((team_id == row["LTeamID"]) & (row["NumTeamWon"] == 1)):
                games_stats.iloc[i] = games_stats.iloc[i].apply(np.reciprocal)
    
        reg_season_avgs.loc[(reg_season_avgs["TeamID"] == team_id) & (reg_season_avgs["Season"] == season), stats_columns] = games_stats.mean(axis=0).ravel()


In [37]:
tourney_games = tour_ratios.loc[tour_ratios["Season"].isin(seasons)]
tourney_games = tourney_games.reset_index(drop=True)

test_x = pd.DataFrame(index=range(tourney_games.shape[0]), columns=stats_columns)
test_y = np.ones(67*len(seasons))
for i in range(len(seasons)):
    season = seasons[i]
    tourney_games_for_season = tourney_games.loc[tourney_games["Season"] == season]
    for j, row in tourney_games_for_season.iterrows():
        index = i * 67 + j
        teamA_id = tourney_games_for_season.at[j, "WTeamID"]
        teamB_id = tourney_games_for_season.at[j, "LTeamID"]
        teamA_stats = reg_season_avgs.loc[(reg_season_avgs["TeamID"] == teamA_id) & (reg_season_avgs["Season"] == season), stats_columns].as_matrix()
        teamB_stats = reg_season_avgs.loc[(reg_season_avgs["TeamID"] == teamB_id) & (reg_season_avgs["Season"] == season), stats_columns].as_matrix()
        r = random.random()
        if r > 0.5:
            test_x.loc[index, stats_columns] = teamA_stats / teamB_stats
        else:
            test_x.loc[index, stats_columns] = teamB_stats / teamA_stats
            test_y[i] = 0
    
test_x = test_x.as_matrix()
print("Train X: ", train_x.shape)
print("Train Y: ", train_y.shape)
print("Test X:  ", test_x.shape)
print("Test Y:  ", test_y.shape)

Train X:  (5395, 13)
Train Y:  (5395,)
Test X:   (67, 13)
Test Y:   (67,)


In [49]:
clf = RandomForestRegressor(n_estimators=5000, oob_score=False, max_features=13, min_samples_leaf=1, n_jobs=-1)
clf.fit(train_x, train_y.ravel())

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=13, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=5000, n_jobs=-1, oob_score=False,
           random_state=None, verbose=0, warm_start=False)

In [50]:
preds_train = clf.predict(train_x)
preds_test = clf.predict(test_x)

print("Accuracy, Train: ", accuracy_score(train_y, preds_train > 0.5))
print("Log Loss, Train: ", log_loss(train_y, preds_train))
print("")
print("Accuracy, Test:  ", accuracy_score(test_y, preds_test > 0.5))
print("Log Loss, Test:  ", log_loss(test_y, preds_test))
# print(preds_test)
# print(test_y)
# print("Log loss", log_loss(test_y, preds_test))

Accuracy, Train:  1.0
Log Loss, Train:  0.03891280353281886

Accuracy, Test:   0.5373134328358209
Log Loss, Test:   2.4164428884452827


In [32]:
variables = stats_columns
feature_importance = clf.feature_importances_
for i in range(len(feature_importance)):
    print(variables[i], ":", feature_importance[i])

FGMR : 0.1783877774182809
FGAR : 0.03968620367167515
FGMR3 : 0.06566348229014081
FGAR3 : 0.036065276620378374
FTMR : 0.09261160782621278
FTAR : 0.07877265493266138
ORR : 0.030477507521364657
DRR : 0.15220920420422562
AstR : 0.11361003322119814
TOR : 0.05383051098203282
StlR : 0.04349486108293387
BlkR : 0.0437610144163479
PFR : 0.07142986581254765
