In [4]:
# Imports
import numpy as np
import pandas as pd
import random
from sklearn.ensemble import RandomForestRegressor

In [None]:
reg_ratios = pd.read_csv("RegularSeasonDetailedResultsRatios.csv")
cols_to_drop = ["Season", "DayNum", "WTeamID", "WScore", "LTeamID", "LScore", "NumTeamWon"]

In [None]:
# Get all rows for 2017 season and drop irrelevant variables
test_data = reg_ratios.loc[reg_ratios['Season'] == 2017]
test_x = test_data.drop(columns=cols_to_drop, axis=1).as_matrix()
test_y = test_data["NumTeamWon"].as_matrix()

# Get rows for all other seasons, drop irrelevant variables
train_data = reg_ratios[reg_ratios.Season != 2017]
train_x = train_data.drop(labels=cols_to_drop, axis=1).as_matrix()
train_y = train_data["NumTeamWon"].as_matrix()

In [None]:
# Random forest
clf = RandomForestRegressor(n_estimators=100)
clf.fit(train_x, train_y.ravel())
# print("Out-of-bag score: {}".format(clf.oob_score_))
# acc = clf.score(test_x, test_y.ravel())
# print("Test accuracy: {}".format(acc))

In [None]:
from sklearn.metrics import accuracy_score, log_loss
preds = clf.predict(test_x)
print(preds.shape)
print(test_y.shape)
print("Accuracy score", accuracy_score(test_y, preds > 0.5))
print("Log loss", log_loss(test_y, preds))

In [76]:
# New model:
# Train on all regular season data, including teamID
# Predict based on team's last X games
games_to_use = 20
import math

reg_ratios = pd.read_csv("RegularSeasonDetailedResultsRatios.csv")
tour_ratios = pd.read_csv("NCAATourneyDetailedResultsRatios.csv")
all_cols = ["Season", "DayNum", "WTeamID", "WScore", "LTeamID", "LScore", "NumTeamWon", "Loc",
                 "FGMR", "FGAR", "FGMR3", "FGAR3", "FTMR", "FTAR", "ORR", "DRR", "AstR",
                 "TOR", "StlR", "BlkR", "PFR"]

# Add day numbers to regular season games
regular_season_data = pd.read_csv("RegularSeasonDetailedResults.csv")
reg_ratios["DayNum"] = regular_season_data["DayNum"]
reg_ratios["Season"] = reg_ratios["Season"].astype(int)
# print(reg_ratios.loc[reg_ratios["Season"] == 2017])
# print(reg_ratios)

# Get train data: all regular season games
# cols_to_drop = ["Season", "DayNum", "WScore", "WTeamID", "LScore", "LTeamID", "NumTeamWon", "Loc"]
cols_to_drop = ["Season", "DayNum", "WScore", "LScore", "NumTeamWon", "Loc", "WTeamID", "LTeamID", "ORR"]

train_data = reg_ratios
train_x = reg_ratios.loc[reg_ratios["Season"] == 2015].drop(labels=cols_to_drop, axis=1).as_matrix()
train_y = reg_ratios.loc[reg_ratios["Season"] == 2015]["NumTeamWon"].as_matrix()

# Get all team IDs from the tournament
tour_avg_cols = ["TeamID", "FGMR", "FGAR", "FGMR3", "FGAR3", "FTMR", "FTAR","DRR", "AstR",
                 "TOR", "StlR", "BlkR", "PFR"]
team_ids = pd.unique(tour_ratios.loc[tour_ratios["Season"] == 2015, ["WTeamID", "LTeamID"]].values.ravel('K'))
tour_avgs = pd.DataFrame(index=range(team_ids.shape[0]), columns=tour_avg_cols)
tour_avgs.loc[:,"TeamID"] = team_ids
# print(test_x)

for team_id in team_ids:
    games_all = reg_ratios.loc[(reg_ratios["WTeamID"] == team_id) | (reg_ratios["LTeamID"] == team_id)]
    games = games_all.loc[games_all["Season"] == 2015]
    games.reset_index(inplace=True, drop=True)
    games_stats = games.loc[:, ["FGMR", "FGAR", "FGMR3", "FGAR3", "FTMR", "FTAR","DRR", "AstR",
                                           "TOR", "StlR", "BlkR", "PFR"]]
    for i, row in games.iterrows():
        if ((team_id == row["WTeamID"]) & (row["NumTeamWon"] == 0)) | ((team_id == row["LTeamID"]) & (row["NumTeamWon"] == 1)):
            games_stats.iloc[i] = games_stats.iloc[i].apply(np.reciprocal)
#             if math.isinf(games_stats.at[i,"Loc"]):
#                 games_stats.loc[i,"Loc"] = 0
    
    tour_avgs.loc[tour_avgs["TeamID"] == team_id, ["FGMR", "FGAR", "FGMR3", "FGAR3", "FTMR", "FTAR", "DRR", "AstR",
                                           "TOR", "StlR", "BlkR", "PFR"]] = games_stats.mean(axis=0).ravel()
    
tour_avgs = tour_avgs.set_index("TeamID")

In [78]:
import random

test_x_columns = [item for item in all_cols if item not in cols_to_drop]
# print(test_x_columns)

tourney_games_2017 = tour_ratios.loc[tour_ratios["Season"] == 2015]
tourney_games_2017 = tourney_games_2017.reset_index(drop=True)
# print(tourney_games_2017.shape)
test_x = pd.DataFrame(index=range(tourney_games_2017.shape[0]), columns=test_x_columns)
test_y = np.ones(67)
for i, row in tourney_games_2017.iterrows():
    teamA_id = tourney_games_2017.at[i, "WTeamID"]
    teamB_id = tourney_games_2017.at[i, "LTeamID"]
    teamA_stats = tour_avgs.loc[teamA_id, ["FGMR", "FGAR", "FGMR3", "FGAR3", "FTMR", "FTAR", "DRR", "AstR",
                                           "TOR", "StlR", "BlkR", "PFR"]].as_matrix()
    teamB_stats = tour_avgs.loc[teamB_id, ["FGMR", "FGAR", "FGMR3", "FGAR3", "FTMR", "FTAR", "DRR", "AstR",
                                           "TOR", "StlR", "BlkR", "PFR"]].as_matrix()
    r = random.random()
    if r > 0.5:
#         test_x["WTeamID"] = teamA_id
#         test_x["LTeamID"] = teamB_id
#         test_x["Loc"] = 0
        test_x.loc[i, ["FGMR", "FGAR", "FGMR3", "FGAR3", "FTMR", "FTAR", "DRR", "AstR",
                                           "TOR", "StlR", "BlkR", "PFR"]] = teamA_stats / teamB_stats
    else:
#         test_x["WTeamID"] = teamB_id
#         test_x["LTeamID"] = teamA_id
#         test_x["Loc"] = 0
        test_x.loc[i, ["FGMR", "FGAR", "FGMR3", "FGAR3", "FTMR", "FTAR", "DRR", "AstR",
                                           "TOR", "StlR", "BlkR", "PFR"]] = teamB_stats / teamA_stats
        test_y[i] = 0
    
# print(test_x)
test_x = test_x.as_matrix()
# print(train_x.shape)
# print(train_y.shape)
# print(test_x.shape)
# print(test_y.shape)

In [87]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, log_loss

# num_splits = 10
# ll_best = 1000
# num_features_best = 1
# kf = KFold(n_splits=num_splits)
# for num_features_to_use in range(1, 17):
#     ll_feature_count = 0
#     for train, test in kf.split(train_x):
#         clf2 = RandomForestRegressor(n_estimators=5000, oob_score=True, max_features=num_features_to_use)
#         clf2.fit(train_x[train], train_y[train].ravel())
#         preds = clf2.predict(train_x[test])
#         ll_feature_count = log_loss(train_y[test], preds) / num_splits
    
#     if ll_feature_count < ll_best:
#         ll_best = ll_feature_count
#         num_features_best = num_features_to_use

# print("Best number of features to use: {}", num_features_best)
# print("Log loss achieved: {}", ll_best)

clf2 = RandomForestRegressor(n_estimators=5000, oob_score=True, max_features=1, min_samples_leaf=50, n_jobs=-1)
clf2.fit(train_x, train_y.ravel())
print("oob score: {}", clf2.oob_score_)

oob score: {} 0.6453949771307421


In [88]:

preds = clf2.predict(test_x)
print("Accuracy score", accuracy_score(test_y, preds > 0.5))
print(preds)
print(test_y)
print("Log loss", log_loss(test_y, preds))

Accuracy score 0.6567164179104478
[0.47863992 0.47879407 0.41359583 0.35734047 0.11151679 0.67420572
 0.37970829 0.69398893 0.23691362 0.46557591 0.97399429 0.74747724
 0.45531481 0.80893795 0.13674792 0.83549993 0.13623675 0.23046119
 0.91457332 0.35474287 0.59600911 0.91891554 0.85321546 0.59153471
 0.14799383 0.2170854  0.52099991 0.38951794 0.73481686 0.3446836
 0.43025277 0.81357031 0.0988598  0.55123117 0.24131607 0.9113362
 0.24764183 0.92735045 0.89694036 0.45069965 0.1774557  0.53671202
 0.87588841 0.50815789 0.37358878 0.81916046 0.25653359 0.85766557
 0.45654112 0.62090609 0.73852116 0.90009723 0.1147239  0.05985707
 0.38963418 0.22175865 0.50873776 0.0672134  0.81816814 0.42214715
 0.88815987 0.34443636 0.50465374 0.54312394 0.23009355 0.25914111
 0.27536869]
[1. 1. 0. 1. 0. 0. 1. 1. 0. 1. 1. 0. 0. 1. 0. 0. 1. 0. 1. 0. 1. 1. 1. 1.
 1. 0. 1. 0. 1. 0. 1. 1. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 1. 0. 1. 1. 0.
 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 1. 1.]
Log loss 0.59

In [89]:
variables = test_x_columns
feature_importance = clf2.feature_importances_
for i in range(len(feature_importance)):
    print(variables[i], ":", feature_importance[i])

FGMR : 0.19460037000804545
FGAR : 0.021075793072354257
FGMR3 : 0.04536180274748659
FGAR3 : 0.01592329594439309
FTMR : 0.1181382775296927
FTAR : 0.0944975126589396
DRR : 0.17348269414637926
AstR : 0.1354413605094929
TOR : 0.04546660630848851
StlR : 0.034265551128565955
BlkR : 0.0381803263480402
PFR : 0.08356640959812202


In [68]:
print(test_x_columns)

['WTeamID', 'LTeamID', 'FGMR', 'FGAR', 'FGMR3', 'FGAR3', 'FTMR', 'FTAR', 'ORR', 'DRR', 'AstR', 'TOR', 'StlR', 'BlkR', 'PFR']
