In [33]:
# Imports
import math
import os
import random

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.metrics import accuracy_score, log_loss

# Project root directory. Set the KAGGLE_NCAA_ROOT environment variable to run
# the notebook from another checkout; otherwise the original path is used.
# os.path.join(..., "") guarantees a trailing separator, which the later cells
# rely on because they build file paths with plain `root + "..."` concatenation.
root = os.path.join(
    os.environ.get("KAGGLE_NCAA_ROOT", "/home/austin/Github/kaggle-ncaa-2018/"),
    "",
)

In [34]:
# Load the CSV inputs: derived ratio tables plus the original tourney seeds
ratios_dir = root + "derived_data/ratios/"
reg_ratios = pd.read_csv(ratios_dir + "RegularSeasonDetailedResultsRatios.csv")
reg_ratios_comp = pd.read_csv(ratios_dir + "RegularSeasonAverageRatiosWithComposites.csv")
tour_ratios = pd.read_csv(ratios_dir + "NCAATourneyDetailedResultsRatios.csv")
reg_season_avgs = pd.read_csv(ratios_dir + "RegularSeasonAverageRatiosForTournamentTeams.csv")
seeds = pd.read_csv(root + "original_data/NCAATourneySeeds.csv")

# Every column present in the per-team season table
all_cols = list(reg_ratios_comp.columns)
# Alternative (disabled) hand-written column list:
# all_cols = ["Season", "WTeamID", "WScore", "LTeamID", "LScore", "NumTeamWon", "Loc",
#                  "FGMR", "FGAR", "FGMR3", "FGAR3", "FTMR", "FTAR", "ORR", "DRR", "AstR",
#                  "TOR", "StlR", "BlkR", "PFR"]

# Identifier columns that are excluded from the feature set
cols_to_drop = ["Season", "TeamID"]

# Feature columns: everything in the table except the identifiers above
stats_columns = [name for name in all_cols if name not in cols_to_drop]
# Alternative (disabled) hand-picked feature subset:
# stats_columns = ["PIE", "FT_RATE", "OFF_EFF", "FGMR", "FTMR", "WINPCT", "BlkR", "OFF_REB_PCT", "TURNOVER_RATE",
#                  "StlR", "TOR"]

print(stats_columns)

# Seasons covered, with the last four held out for testing
seasons = range(2003, 2018)
seasons_test = [2014, 2015, 2016, 2017]
seasons_train = [year for year in seasons if year not in seasons_test]

# # Specify what to use as the training data
# train_data = reg_ratios_comp

# # Get all of the training data for the given years
# train_x = train_data.loc[train_data["Season"].isin(seasons)].drop(labels=cols_to_drop, axis=1).as_matrix()
# train_y = train_data.loc[train_data["Season"].isin(seasons)]["NumTeamWon"].as_matrix()

['FGMR', 'FGAR', 'FGMR3', 'FGAR3', 'FTMR', 'FTAR', 'ORR', 'DRR', 'AstR', 'TOR', 'StlR', 'BlkR', 'PFR', 'PIE', 'FG_PCT', 'TURNOVER_RATE', 'OFF_REB_PCT', 'FT_RATE', '4FACTOR', 'OFF_EFF', 'DEF_EFF', 'ASSIST_RATIO', 'DEF_REB_PCT', 'FT_PCT', 'WINPCT']


In [35]:
# Build one row per tournament game. Each row holds the element-wise ratio of
# the two teams' regular-season stats (numerator team / denominator team), both
# teams' seeds, and NumTeamWon (1 if the numerator team won the game, else 0).
# Which team becomes the numerator is decided by a coin flip so that both
# outcome labels appear in the data.

SEED_COLUMNS = ["NumTeamSeed", "DenTeamSeed"]
ORIENTATION_SEED = 2018  # fixed seed so the coin flips are identical on every run

# Strip the seed columns if a previous run of this cell already appended them,
# which makes the cell safe to re-execute without restarting the kernel.
feature_columns = [col for col in stats_columns if col not in SEED_COLUMNS]

tourney_games = tour_ratios.loc[tour_ratios["Season"].isin(seasons)]
tourney_games = tourney_games.reset_index(drop=True)

# (Season, TeamID) -> numeric seed. Characters 1-2 of the seed code hold the
# number; the first matching row wins, as in a positional [0] lookup.
seed_lookup = {}
for season, team_id, seed_code in zip(seeds["Season"], seeds["TeamID"], seeds["Seed"]):
    seed_lookup.setdefault((season, team_id), int(seed_code[1:3]))

# (Season, TeamID) -> float vector of that team's regular-season stats.
# `.values` is used instead of `as_matrix()`, which was removed in pandas 1.0.
stats_source = reg_ratios_comp.drop_duplicates(subset=["Season", "TeamID"])
stats_lookup = dict(zip(
    zip(stats_source["Season"], stats_source["TeamID"]),
    stats_source[feature_columns].values.astype(float),
))

# Private generator: reproducible and independent of the global `random` state.
orientation_rng = random.Random(ORIENTATION_SEED)

records = []
skipped_games = 0
for season, winner_id, loser_id in zip(
    tourney_games["Season"], tourney_games["WTeamID"], tourney_games["LTeamID"]
):
    # Draw for every game (even skipped ones) so the flip sequence depends
    # only on the order of games, not on data availability.
    winner_is_numerator = orientation_rng.random() > 0.5

    winner_key = (season, winner_id)
    loser_key = (season, loser_id)
    if any(key not in seed_lookup or key not in stats_lookup
           for key in (winner_key, loser_key)):
        # A team without a seed or without season stats cannot form a full row.
        skipped_games += 1
        continue

    if winner_is_numerator:
        num_key, den_key = winner_key, loser_key
    else:
        num_key, den_key = loser_key, winner_key

    # A zero denominator yields inf/NaN here; such rows are removed below.
    with np.errstate(divide="ignore", invalid="ignore"):
        ratios = stats_lookup[num_key] / stats_lookup[den_key]

    record = {"Season": int(season)}
    record.update(zip(feature_columns, ratios))
    record["NumTeamWon"] = 1.0 if winner_is_numerator else 0.0
    record["NumTeamSeed"] = seed_lookup[num_key]
    record["DenTeamSeed"] = seed_lookup[den_key]
    records.append(record)

if skipped_games:
    print("Skipped games with missing seed/stats: ", skipped_games)

# Build the frame once from the collected records (numeric dtypes, no
# per-cell .loc writes), then drop rows with NaN or infinite values.
all_data = pd.DataFrame(
    records, columns=["Season"] + feature_columns + ["NumTeamWon"] + SEED_COLUMNS
)
all_data = all_data.replace([np.inf, -np.inf], np.nan).dropna(axis=0, how='any')

# Later cells select model inputs through `stats_columns`, seeds included.
stats_columns = feature_columns + SEED_COLUMNS

In [36]:
# Split the game rows by season into training and held-out test arrays.
# `.values` replaces `as_matrix()`, which was removed in pandas 1.0, and
# `astype(float)` guarantees numeric arrays even if `all_data` has object dtype.
is_train = all_data["Season"].isin(seasons_train)
is_test = all_data["Season"].isin(seasons_test)

train_x = all_data.loc[is_train, stats_columns].values.astype(float)
train_y = all_data.loc[is_train, "NumTeamWon"].values.astype(float)
test_x = all_data.loc[is_test, stats_columns].values.astype(float)
test_y = all_data.loc[is_test, "NumTeamWon"].values.astype(float)

print("Train X: ", train_x.shape)
print("Train Y: ", train_y.shape)
print("Test X:  ", test_x.shape)
print("Test Y:  ", test_y.shape)

Train X:  (713, 27)
Train Y:  (713,)
Test X:   (268, 27)
Test Y:   (268,)


In [48]:
# Fit a random-forest *regressor* on the 0/1 NumTeamWon labels; its continuous
# predictions are thresholded at 0.5 and passed to log_loss in the evaluation cell.
# random_state is fixed so the OOB score and all downstream metrics are
# reproducible across runs. max_features=1 means a single feature is
# considered at each split.
clf = RandomForestRegressor(
    n_estimators=30000,
    oob_score=True,
    max_features=1,
    min_samples_leaf=1,
    n_jobs=-1,
    random_state=2018,
)
clf.fit(train_x, train_y.ravel())
# For a regressor, oob_score_ is the R^2 of the out-of-bag predictions.
print("OOB Score: ", clf.oob_score_)

OOB Score:  0.223938116940277


In [49]:
# Score the fitted model on both splits. Predictions are continuous values;
# accuracy thresholds them at 0.5 and log loss consumes them directly.
preds_train = clf.predict(train_x)
preds_test = clf.predict(test_x)

# Disabled experiment: clamp / flatten test predictions before scoring.
# preds_test_old = np.copy(preds_test)

# threshold = 0.05
# preds_test[preds_test < threshold] = 0
# preds_test[preds_test > 1 - threshold] = 1
# preds_test[(preds_test < 0.7) & (preds_test > 0.5)] = 0.51
# preds_test[(preds_test < 0.5) & (preds_test > 0.3)] = 0.49

# print(preds_test)

split_reports = [
    ("Accuracy, Train: ", "Log Loss, Train: ", train_y, preds_train),
    ("Accuracy, Test:  ", "Log Loss, Test:  ", test_y, preds_test),
]
for position, (accuracy_label, loss_label, truth, predicted) in enumerate(split_reports):
    if position > 0:
        # Blank line between the train and test reports
        print("")
    print(accuracy_label, accuracy_score(truth, predicted > 0.5))
    print(loss_label, log_loss(truth, predicted))

Accuracy, Train:  0.8737727910238429
Log Loss, Train:  0.3945551159208841

Accuracy, Test:   0.6791044776119403
Log Loss, Test:   0.5783440956564604


In [47]:
# Print the forest's feature importances in ascending order, one per line.
# `stats_columns` already contains the two seed columns once the data-building
# cell has run, so they are appended only when missing. Appending them
# unconditionally yields more names than importances, and zip() then hides
# the mismatch by silently truncating.
seed_feature_names = ["NumTeamSeed", "DenTeamSeed"]
variables = stats_columns + [name for name in seed_feature_names if name not in stats_columns]
feature_importance = clf.feature_importances_

# Fail loudly rather than print importances against the wrong names.
if len(variables) != len(feature_importance):
    raise ValueError(
        "Feature name count (%d) does not match importance count (%d)"
        % (len(variables), len(feature_importance))
    )

# Sort by importance (ties broken by name), keeping both lists aligned.
ranked = sorted(zip(feature_importance, variables))
feature_importance = [pair[0] for pair in ranked]
variables = [pair[1] for pair in ranked]

for name, importance in zip(variables, feature_importance):
    print(name, ":", importance)

FG_PCT : 0.02411331695751205
DRR : 0.02444033860863327
TOR : 0.025754988842849876
ORR : 0.0262391099852396
FGMR3 : 0.026565110367239724
FGAR3 : 0.026799119791528575
FGAR : 0.027689351280065624
FTAR : 0.02807102758401814
FTMR : 0.028167758420651508
PFR : 0.028626524095911467
FT_PCT : 0.028634258904189528
ASSIST_RATIO : 0.02870608049808498
BlkR : 0.02878628038962802
AstR : 0.029168831941190153
4FACTOR : 0.02918130960006994
DEF_REB_PCT : 0.029324773397607078
TURNOVER_RATE : 0.030669835010330004
FT_RATE : 0.030891241080576056
StlR : 0.0332116117132303
DEF_EFF : 0.035517395346368254
OFF_REB_PCT : 0.040669000083521784
OFF_EFF : 0.045768268549363014
WINPCT : 0.04880413826512434
FGMR : 0.051938078016468735
PIE : 0.0669112627190101
NumTeamSeed : 0.0846462296476583
DenTeamSeed : 0.0907047589039301
