In [6]:
# Imports
import numpy as np
import pandas as pd
import random
from sklearn.ensemble import RandomForestRegressor

random.seed(1)

In [7]:
# New model:
# Train on all regular season data, including teamID
# Predict based on team's last X games
games_to_use = 20
import math

reg_ratios = pd.read_csv("RegularSeasonDetailedResultsRatios.csv")
tour_ratios = pd.read_csv("NCAATourneyDetailedResultsRatios.csv")
all_cols = ["Season", "DayNum", "WTeamID", "WScore", "LTeamID", "LScore", "NumTeamWon", "Loc",
                 "FGMR", "FGAR", "FGMR3", "FGAR3", "FTMR", "FTAR", "ORR", "DRR", "AstR",
                 "TOR", "StlR", "BlkR", "PFR"]

# Add day numbers to regular season games
regular_season_data = pd.read_csv("RegularSeasonDetailedResults.csv")
reg_ratios["DayNum"] = regular_season_data["DayNum"]
reg_ratios["Season"] = reg_ratios["Season"].astype(int)
# print(reg_ratios.loc[reg_ratios["Season"] == 2017])
# print(reg_ratios)

# Get train data: all regular season games
# cols_to_drop = ["Season", "DayNum", "WScore", "WTeamID", "LScore", "LTeamID", "NumTeamWon", "Loc"]
cols_to_drop = ["Season", "DayNum", "WScore", "LScore", "NumTeamWon", "Loc", "WTeamID", "LTeamID"]
stats_cols = [item for item in all_cols if item not in cols_to_drop]

train_data = reg_ratios
train_x = reg_ratios.loc[reg_ratios["Season"] == 2017].drop(labels=cols_to_drop, axis=1).as_matrix()
train_y = reg_ratios.loc[reg_ratios["Season"] == 2017]["NumTeamWon"].as_matrix()

# Get all team IDs from the tournament
tour_avg_cols = ["TeamID"] + stats_cols
team_ids = pd.unique(tour_ratios.loc[tour_ratios["Season"] == 2017, ["WTeamID", "LTeamID"]].values.ravel('K'))
tour_avgs = pd.DataFrame(index=range(team_ids.shape[0]), columns=tour_avg_cols)
tour_avgs.loc[:,"TeamID"] = team_ids
# print(test_x)

for team_id in team_ids:
    games_all = reg_ratios.loc[(reg_ratios["WTeamID"] == team_id) | (reg_ratios["LTeamID"] == team_id)]
    games = games_all.loc[games_all["Season"] == 2017]
    games.reset_index(inplace=True, drop=True)
    games_stats = games.loc[:, ["FGMR", "FGAR", "FGMR3", "FGAR3", "FTMR", "FTAR", "ORR", "DRR", "AstR",
                                           "TOR", "StlR", "BlkR", "PFR"]]
    for i, row in games.iterrows():
        if ((team_id == row["WTeamID"]) & (row["NumTeamWon"] == 0)) | ((team_id == row["LTeamID"]) & (row["NumTeamWon"] == 1)):
            games_stats.iloc[i] = games_stats.iloc[i].apply(np.reciprocal)
#             if math.isinf(games_stats.at[i,"Loc"]):
#                 games_stats.loc[i,"Loc"] = 0
    
    tour_avgs.loc[tour_avgs["TeamID"] == team_id, ["FGMR", "FGAR", "FGMR3", "FGAR3", "FTMR", "FTAR", "ORR", "DRR", "AstR",
                                           "TOR", "StlR", "BlkR", "PFR"]] = games_stats.mean(axis=0).ravel()
    
tour_avgs = tour_avgs.set_index("TeamID")

In [8]:
import random

test_x_columns = [item for item in all_cols if item not in cols_to_drop]
# print(test_x_columns)

tourney_games_2017 = tour_ratios.loc[tour_ratios["Season"] == 2017]
tourney_games_2017 = tourney_games_2017.reset_index(drop=True)
# print(tourney_games_2017.shape)
test_x = pd.DataFrame(index=range(tourney_games_2017.shape[0]), columns=test_x_columns)
test_y = np.ones(67)
for i, row in tourney_games_2017.iterrows():
    teamA_id = tourney_games_2017.at[i, "WTeamID"]
    teamB_id = tourney_games_2017.at[i, "LTeamID"]
    teamA_stats = tour_avgs.loc[teamA_id, ["FGMR", "FGAR", "FGMR3", "FGAR3", "FTMR", "FTAR", "ORR", "DRR", "AstR",
                                           "TOR", "StlR", "BlkR", "PFR"]].as_matrix()
    teamB_stats = tour_avgs.loc[teamB_id, ["FGMR", "FGAR", "FGMR3", "FGAR3", "FTMR", "FTAR", "ORR", "DRR", "AstR",
                                           "TOR", "StlR", "BlkR", "PFR"]].as_matrix()
    r = random.random()
    if r > 0.5:
#         test_x["WTeamID"] = teamA_id
#         test_x["LTeamID"] = teamB_id
#         test_x["Loc"] = 0
        test_x.loc[i, ["FGMR", "FGAR", "FGMR3", "FGAR3", "FTMR", "FTAR", "ORR", "DRR", "AstR",
                                           "TOR", "StlR", "BlkR", "PFR"]] = teamA_stats / teamB_stats
    else:
#         test_x["WTeamID"] = teamB_id
#         test_x["LTeamID"] = teamA_id
#         test_x["Loc"] = 0
        test_x.loc[i, ["FGMR", "FGAR", "FGMR3", "FGAR3", "FTMR", "FTAR", "ORR", "DRR", "AstR",
                                           "TOR", "StlR", "BlkR", "PFR"]] = teamB_stats / teamA_stats
        test_y[i] = 0
    
# print(test_x)
test_x = test_x.as_matrix()
# print(train_x.shape)
# print(train_y.shape)
# print(test_x.shape)
# print(test_y.shape)

In [9]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, log_loss

# num_splits = 10
# ll_best = 1000
# num_features_best = 1
# kf = KFold(n_splits=num_splits)
# for num_features_to_use in range(1, 17):
#     ll_feature_count = 0
#     for train, test in kf.split(train_x):
#         clf2 = RandomForestRegressor(n_estimators=5000, oob_score=True, max_features=num_features_to_use)
#         clf2.fit(train_x[train], train_y[train].ravel())
#         preds = clf2.predict(train_x[test])
#         ll_feature_count = log_loss(train_y[test], preds) / num_splits
    
#     if ll_feature_count < ll_best:
#         ll_best = ll_feature_count
#         num_features_best = num_features_to_use

# print("Best number of features to use: {}", num_features_best)
# print("Log loss achieved: {}", ll_best)

clf2 = RandomForestRegressor(n_estimators=5000, oob_score=True, max_features=1, min_samples_leaf=50, n_jobs=-1)
clf2.fit(train_x, train_y.ravel())
print("oob score: {}", clf2.oob_score_)

oob score: {} 0.6224520751701116


In [12]:
print(test_y)
preds = clf2.predict(test_x)
print("Accuracy score", accuracy_score(test_y, preds > 0.5))
print(preds)
print(test_y)
print("Log loss", log_loss(test_y, preds))

[0. 1. 1. 0. 0. 0. 1. 1. 0. 0. 1. 0. 1. 0. 0. 1. 0. 1. 1. 0. 0. 1. 1. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 1. 0. 0. 1. 1. 1. 0.
 1. 1. 0. 1. 1. 1. 1. 1. 0. 0. 1. 0. 0. 1. 1. 1. 0. 0. 1.]
Accuracy score 0.6716417910447762
[0.44583549 0.30040483 0.1773321  0.33842489 0.26227531 0.57740319
 0.62921502 0.7565605  0.07340146 0.60153899 0.55710823 0.48177796
 0.56015652 0.34411795 0.15367603 0.88442564 0.28223433 0.62536213
 0.79748608 0.55143583 0.34645554 0.55357958 0.82000923 0.1859739
 0.24782824 0.15475341 0.22137542 0.25937001 0.66301079 0.32633523
 0.09269351 0.56937315 0.1827745  0.16570045 0.76596859 0.16696443
 0.28616858 0.33966455 0.50890401 0.05639514 0.74251453 0.62852676
 0.63271566 0.72889858 0.60656389 0.7826614  0.25253016 0.55042769
 0.81258514 0.82598866 0.58297693 0.53386102 0.83939226 0.21815672
 0.64460836 0.14080768 0.55217879 0.51908722 0.76079719 0.56876295
 0.0687683  0.74478537 0.56379123 0.33690649 0.1441361  0.47561667
 0.16077684]
[0. 1. 1. 0.