In [4]:
# Imports
import numpy as np
import pandas as pd
import random
from sklearn.ensemble import RandomForestRegressor

In [None]:
reg_ratios = pd.read_csv("RegularSeasonDetailedResultsRatios.csv")
cols_to_drop = ["Season", "DayNum", "WTeamID", "WScore", "LTeamID", "LScore", "NumTeamWon"]

In [None]:
# Get all rows for 2017 season and drop irrelevant variables
test_data = reg_ratios.loc[reg_ratios['Season'] == 2017]
test_x = test_data.drop(columns=cols_to_drop, axis=1).as_matrix()
test_y = test_data["NumTeamWon"].as_matrix()

# Get rows for all other seasons, drop irrelevant variables
train_data = reg_ratios[reg_ratios.Season != 2017]
train_x = train_data.drop(labels=cols_to_drop, axis=1).as_matrix()
train_y = train_data["NumTeamWon"].as_matrix()

In [None]:
# Random forest
clf = RandomForestRegressor(n_estimators=100)
clf.fit(train_x, train_y.ravel())
# print("Out-of-bag score: {}".format(clf.oob_score_))
# acc = clf.score(test_x, test_y.ravel())
# print("Test accuracy: {}".format(acc))

In [None]:
from sklearn.metrics import accuracy_score, log_loss
preds = clf.predict(test_x)
print(preds.shape)
print(test_y.shape)
print("Accuracy score", accuracy_score(test_y, preds > 0.5))
print("Log loss", log_loss(test_y, preds))

In [8]:
# New model:
# Train on all regular season data, including teamID
# Predict based on team's last X games
games_to_use = 20
import math

reg_ratios = pd.read_csv("RegularSeasonDetailedResultsRatios.csv")
tour_ratios = pd.read_csv("NCAATourneyDetailedResultsRatios.csv")
all_cols = ["Season", "DayNum", "WTeamID", "WScore", "LTeamID", "LScore", "NumTeamWon", "Loc",
                 "FGMR", "FGAR", "FGMR3", "FGAR3", "FTMR", "FTAR", "ORR", "DRR", "AstR",
                 "TOR", "StlR", "BlkR", "PFR"]

# Add day numbers to regular season games
regular_season_data = pd.read_csv("RegularSeasonDetailedResults.csv")
reg_ratios["DayNum"] = regular_season_data["DayNum"]
reg_ratios["Season"] = reg_ratios["Season"].astype(int)
# print(reg_ratios.loc[reg_ratios["Season"] == 2017])
# print(reg_ratios)

# Get train data: all regular season games
# cols_to_drop = ["Season", "DayNum", "WScore", "WTeamID", "LScore", "LTeamID", "NumTeamWon", "Loc"]
cols_to_drop = ["Season", "DayNum", "WScore", "LScore", "NumTeamWon", "Loc"]

train_data = reg_ratios
train_x = reg_ratios.loc[reg_ratios["Season"] == 2017].drop(labels=cols_to_drop, axis=1).as_matrix()
train_y = reg_ratios.loc[reg_ratios["Season"] == 2017]["NumTeamWon"].as_matrix()

# Get all team IDs from the tournament
tour_avg_cols = ["TeamID", "FGMR", "FGAR", "FGMR3", "FGAR3", "FTMR", "FTAR", "ORR", "DRR", "AstR",
                 "TOR", "StlR", "BlkR", "PFR"]
team_ids = pd.unique(tour_ratios.loc[tour_ratios["Season"] == 2017, ["WTeamID", "LTeamID"]].values.ravel('K'))
tour_avgs = pd.DataFrame(index=range(team_ids.shape[0]), columns=tour_avg_cols)
tour_avgs.loc[:,"TeamID"] = team_ids
# print(test_x)

for team_id in team_ids:
    games_all = reg_ratios.loc[(reg_ratios["WTeamID"] == team_id) | (reg_ratios["LTeamID"] == team_id)]
    games = games_all.loc[games_all["Season"] == 2017]
    games.reset_index(inplace=True, drop=True)
    games_stats = games.loc[:, "FGMR":"PFR"]
    for i, row in games.iterrows():
        if ((team_id == row["WTeamID"]) & (row["NumTeamWon"] == 0)) | ((team_id == row["LTeamID"]) & (row["NumTeamWon"] == 1)):
            games_stats.iloc[i] = games_stats.iloc[i].apply(np.reciprocal)
#             if math.isinf(games_stats.at[i,"Loc"]):
#                 games_stats.loc[i,"Loc"] = 0
    
    tour_avgs.loc[tour_avgs["TeamID"] == team_id, "FGMR":"PFR"] = games_stats.mean(axis=0).ravel()
    
tour_avgs = tour_avgs.set_index("TeamID")

In [9]:
import random

test_x_columns = [item for item in all_cols if item not in cols_to_drop]
# print(test_x_columns)

tourney_games_2017 = tour_ratios.loc[tour_ratios["Season"] == 2017]
tourney_games_2017 = tourney_games_2017.reset_index(drop=True)
# print(tourney_games_2017.shape)
test_x = pd.DataFrame(index=range(tourney_games_2017.shape[0]), columns=test_x_columns)
test_y = np.ones(67)
for i, row in tourney_games_2017.iterrows():
    teamA_id = tourney_games_2017.at[i, "WTeamID"]
    teamB_id = tourney_games_2017.at[i, "LTeamID"]
    teamA_stats = tour_avgs.loc[teamA_id, "FGMR":"PFR"].as_matrix()
    teamB_stats = tour_avgs.loc[teamB_id, "FGMR":"PFR"].as_matrix()
    r = random.random()
    if r > 0.5:
        test_x["WTeamID"] = teamA_id
        test_x["LTeamID"] = teamB_id
#         test_x["Loc"] = 0
        test_x.loc[i, "FGMR":"PFR"] = teamA_stats / teamB_stats
    else:
        test_x["WTeamID"] = teamB_id
        test_x["LTeamID"] = teamA_id
#         test_x["Loc"] = 0
        test_x.loc[i, "FGMR":"PFR"] = teamB_stats / teamA_stats
        test_y[i] = 0
    
# print(test_x)
test_x = test_x.as_matrix()
# print(train_x.shape)
# print(train_y.shape)
# print(test_x.shape)
# print(test_y.shape)

In [50]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, log_loss

# num_splits = 10
# ll_best = 1000
# num_features_best = 1
# kf = KFold(n_splits=num_splits)
# for num_features_to_use in range(1, 17):
#     ll_feature_count = 0
#     for train, test in kf.split(train_x):
#         clf2 = RandomForestRegressor(n_estimators=5000, oob_score=True, max_features=num_features_to_use)
#         clf2.fit(train_x[train], train_y[train].ravel())
#         preds = clf2.predict(train_x[test])
#         ll_feature_count = log_loss(train_y[test], preds) / num_splits
    
#     if ll_feature_count < ll_best:
#         ll_best = ll_feature_count
#         num_features_best = num_features_to_use

# print("Best number of features to use: {}", num_features_best)
# print("Log loss achieved: {}", ll_best)

clf2 = RandomForestRegressor(n_estimators=5000, oob_score=True, max_features=1, min_samples_leaf=50, n_jobs=-1)
clf2.fit(train_x, train_y.ravel())
print("oob score: {}", clf2.oob_score_)

oob score: {} 0.5939268001571109


In [51]:

preds = clf2.predict(test_x)
print("Accuracy score", accuracy_score(test_y, preds > 0.5))
print(preds)
print(test_y)
print("Log loss", log_loss(test_y, preds))

Accuracy score 0.6567164179104478
[0.5529672  0.30315836 0.19299759 0.66537479 0.26698175 0.58109421
 0.32790715 0.75108253 0.0873931  0.38975715 0.55257351 0.4925039
 0.55149249 0.35742411 0.16704249 0.14352955 0.29118399 0.61488758
 0.77630509 0.52660116 0.35514744 0.54790598 0.7987679  0.19826557
 0.25854854 0.83159468 0.22918285 0.71339927 0.32459922 0.68709596
 0.10688774 0.47425838 0.78696425 0.79150459 0.74699067 0.80478267
 0.30710608 0.58659479 0.50231626 0.92654705 0.19745819 0.62252687
 0.35327676 0.29785228 0.59675443 0.76506399 0.27246753 0.48939752
 0.79635179 0.2119273  0.44784822 0.38871168 0.15475425 0.2404387
 0.63272652 0.15888957 0.35670231 0.50404244 0.24122572 0.56646692
 0.89721805 0.72741461 0.4919409  0.33731644 0.84516809 0.47057346
 0.16840658]
[1. 1. 1. 1. 0. 0. 0. 1. 0. 1. 1. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 1. 1. 0.
 0. 1. 0. 1. 1. 1. 0. 1. 1. 1. 0. 1. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1.
 1. 0. 1. 0. 0. 1. 1. 1. 1. 0. 0. 0. 1. 1. 0. 1. 1. 1. 1.]
Log loss 0.59

In [14]:
print(num_features_to_use)
print(num_features_best)
print(ll_best)

13
12
0.013204296094581802
