In [45]:
# Imports
import numpy as np
import pandas as pd
import random
from sklearn.ensemble import RandomForestRegressor
import math

In [60]:
# Import relevant data files
reg_ratios = pd.read_csv("RegularSeasonDetailedResultsRatios.csv")
tour_ratios = pd.read_csv("NCAATourneyDetailedResultsRatios.csv")

# List of all columns in the datasets
# all_cols = reg_ratios.columns.tolist()
all_cols = ["Season", "WTeamID", "WScore", "LTeamID", "LScore", "NumTeamWon", "Loc",
                 "FGMR", "FGAR", "FGMR3", "FGAR3", "FTMR", "FTAR", "ORR", "DRR", "AstR",
                 "TOR", "StlR", "BlkR", "PFR"]

# Columns that won't be used in the dataset
cols_to_drop = ["Season", "WTeamID", "WScore", "LTeamID", "LScore", "NumTeamWon", "Loc"]

# Columns that will be used in the dataset
stats_columns = [item for item in all_cols if item not in cols_to_drop]

# Years to look at
seasons = [2017]

# Specify what to use as the training data
train_data = reg_ratios

# Get all of the training data for the given years
train_x = reg_ratios.loc[reg_ratios["Season"].isin(seasons)].drop(labels=cols_to_drop, axis=1).as_matrix()
train_y = reg_ratios.loc[reg_ratios["Season"].isin(seasons)]["NumTeamWon"].as_matrix()

# Get all team IDs from the tournament
reg_season_avg_cols = ["Season", "TeamID"] + stats_columns

# Create a dataframe with teamIDs for all tourneys with season in which that team played
team_ids = pd.DataFrame(index=range(len(seasons)*68), columns=["Season", "TeamID"])
for season in seasons:
    team_ids_for_season = pd.unique(tour_ratios.loc[tour_ratios["Season"] == 2017, ["WTeamID", "LTeamID"]].values.ravel('K'))
    for i in range(len(team_ids_for_season)):
        team_ids.loc[i, "Season"] = season
        team_ids.loc[i, "TeamID"] = team_ids_for_season[i]

# Create a data sets to hold regular season averages for every team in the tournament for every specified year
reg_season_avgs = pd.DataFrame(index=range(team_ids.shape[0]), columns=reg_season_avg_cols)
reg_season_avgs["Season"] = team_ids["Season"]
reg_season_avgs["TeamID"] = team_ids["TeamID"]

for season in seasons:
    team_ids_for_season = reg_season_avgs.loc[reg_season_avgs["Season"] == season, "TeamID"].tolist()
        
    for team_id in team_ids_for_season:
        games_all = reg_ratios.loc[(reg_ratios["WTeamID"] == team_id) | (reg_ratios["LTeamID"] == team_id)]
        games = games_all.loc[games_all["Season"] == season]
        games.reset_index(inplace=True, drop=True)
        games_stats = games.loc[:, stats_columns]
        for i, row in games.iterrows():
            if ((team_id == row["WTeamID"]) & (row["NumTeamWon"] == 0)) | ((team_id == row["LTeamID"]) & (row["NumTeamWon"] == 1)):
                games_stats.iloc[i] = games_stats.iloc[i].apply(np.reciprocal)
    
        reg_season_avgs.loc[(reg_season_avgs["TeamID"] == team_id) & (reg_season_avgs["Season"] == season), stats_columns] = games_stats.mean(axis=0).ravel()

print(reg_season_avgs)
# reg_season_avgs = reg_season_avgs.set_index("TeamID")

   Season TeamID      FGMR      FGAR     FGMR3     FGAR3      FTMR      FTAR  \
0    2017   1243   1.10154  0.981988   1.05094   1.09203   1.36656   1.36491   
1    2017   1291  0.986222  0.979794   1.77279   1.37903   1.16017   1.06083   
2    2017   1413   1.00627  0.988064   1.10613  0.895115   1.47796   1.83711   
3    2017   1425   1.03186  0.957951   1.11822   1.01172   2.05628   1.86904   
4    2017   1112   1.12224  0.971343   1.27608  0.857243   1.91802   1.57692   
5    2017   1139   1.15861   1.05787   1.27133     1.075   1.12676   1.04409   
6    2017   1196   1.16118   1.03256   1.84946   1.22727   1.45438   1.34282   
7    2017   1199   1.26253   1.05627   1.17845  0.958448   1.21959   1.21378   
8    2017   1211   1.39475  0.968298   1.68969   1.09368   1.89001   1.59918   
9    2017   1235   1.14461   1.01789   1.60363   1.20148   1.46309    1.3124   
10   2017   1292   1.31531   1.11538   1.37128   1.09106  0.950387   0.96748   
11   2017   1321   1.13904   1.03789    

In [37]:
tourney_games = tour_ratios.loc[tour_ratios["Season"].isin(years)]
tourney_games = tourney_games.reset_index(drop=True)
# print(tourney_games_2017.shape)
test_x = pd.DataFrame(index=range(tourney_games_2017.shape[0]), columns=stats_columns)
test_y = np.ones(67*len(seasons))
for i in len(seasons):
    season = seasons[i]
    tourney_games_for_season = tourney_games.loc[reg_season_avgs["Season"] == season]
    for j, row in tourney_games_for_season.iterrows():
        index = i * 67 + j
        teamA_id = tourney_games_for_season.at[i, "WTeamID"]
        teamB_id = tourney_games_for_season.at[i, "LTeamID"]
        teamA_stats = reg_season_avgs.loc[(reg_season_avgs["TeamID"] == teamA_id) & (reg_season_avgs["Season"] == season), stats_columns].as_matrix()
        teamB_stats = reg_season_avgs.loc[(reg_season_avgs["TeamID"] == teamB_id) & (reg_season_avgs["Season"] == season), stats_columns].as_matrix()
        r = random.random()
        if r > 0.5:
    #         test_x["WTeamID"] = teamA_id
    #         test_x["LTeamID"] = teamB_id
    #         test_x["Loc"] = 0
            test_x.loc[index, stats_columns] = teamA_stats / teamB_stats
        else:
            test_x.loc[index, stats_columns] = teamB_stats / teamA_stats
            test_y[i] = 0
    
# print(test_x)
test_x = test_x.as_matrix()
# print(train_x.shape)
# print(train_y.shape)
# print(test_x.shape)
# print(test_y.shape)

KeyError: 'the label [1243] is not in the [index]'

In [8]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, log_loss, mean_squared_error

# num_splits = 10
# ll_best = 1000
# num_features_best = 1
# kf = KFold(n_splits=num_splits)
# for num_features_to_use in range(1, 17):
#     ll_feature_count = 0
#     for train, test in kf.split(train_x):
#         clf2 = RandomForestRegressor(n_estimators=5000, oob_score=True, max_features=num_features_to_use)
#         clf2.fit(train_x[train], train_y[train].ravel())
#         preds = clf2.predict(train_x[test])
#         ll_feature_count = log_loss(train_y[test], preds) / num_splits
    
#     if ll_feature_count < ll_best:
#         ll_best = ll_feature_count
#         num_features_best = num_features_to_use

# print("Best number of features to use: {}", num_features_best)
# print("Log loss achieved: {}", ll_best)

clf2 = RandomForestRegressor(n_estimators=5000, oob_score=True, max_features=4, min_samples_leaf=1, n_jobs=-1)
clf2.fit(train_x, train_y.ravel())
print("oob score: {}", clf2.oob_score_)

oob score: {} 0.8458122286358658


In [9]:
preds_train = clf2.predict(train_x)
preds_test = clf2.predict(test_x)

print("Log Loss, Train: ", log_loss(train_y, preds_train))
print("Log Loss, Test: ", log_loss(test_y, preds_test))
print("")
print("Accuracy, Train", accuracy_score(train_y, preds_train > 0.5))
print("Accuracy, Test", accuracy_score(test_y, preds_test > 0.5))
# print(preds_test)
# print(test_y)
# print("Log loss", log_loss(test_y, preds_test))

ValueError: Number of features of the model must match the input. Model n_features is 13 and input n_features is 12 

In [18]:
variables = test_x_columns
feature_importance = clf2.feature_importances_
for i in range(len(feature_importance)):
    print(variables[i], ":", feature_importance[i])

FGMR : 0.2661684289801305
FGAR : 0.017757013767729524
FGMR3 : 0.051357716289353525
FGAR3 : 0.016470414111656125
FTMR : 0.15362692111669005
FTAR : 0.08434325049422604
DRR : 0.1587136196251818
AstR : 0.09986918935151781
TOR : 0.061197488980729804
StlR : 0.02103141677041243
BlkR : 0.013251364547296196
PFR : 0.056213175965077314


In [10]:
print(test_x_columns)

['FGMR', 'FGAR', 'FGMR3', 'FGAR3', 'FTMR', 'FTAR', 'DRR', 'AstR', 'TOR', 'StlR', 'BlkR', 'PFR']
