In [11]:
# Imports
import numpy as np
import pandas as pd
import random
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.metrics import accuracy_score, log_loss, make_scorer

root = "/home/austin/Github/kaggle-ncaa-2018/"

data = pd.read_csv(root + "derived_data/Master.csv")
tour_ratios = pd.read_csv(root + "derived_data/ratios/NCAATourneyDetailedResultsRatios.csv")

In [2]:
seasons = range(2003, 2018)
seasons_test = [2017]
seasons_train = [season for season in seasons if season not in seasons_test]

all_columns = data.columns.tolist()
non_stats_columns = ["Season", "TeamID", "Seed", "Elo"]
stats_columns = [c for c in all_columns if c not in non_stats_columns]

In [3]:
tourney_games = tour_ratios.loc[tour_ratios["Season"].isin(seasons)]
tourney_games = tourney_games.reset_index(drop=True)

pred_data = pd.DataFrame(index=range(tourney_games.shape[0]), columns= ["Season"] + stats_columns + ["NumTeamWon", "NumTeamSeed", "DenTeamSeed", "NumTeamElo", "DenTeamElo"])
pred_data.loc[:,"NumTeamWon"] = np.ones(tourney_games.shape[0])
index = 0
for i in range(len(seasons)):
    season = seasons[i]
    tourney_games_for_season = tourney_games.loc[tourney_games["Season"] == season].reset_index()
    for j, row in tourney_games_for_season.iterrows():
        pred_data.loc[index, "Season"] = season
        teamA_id = tourney_games_for_season.at[j, "WTeamID"]
        teamB_id = tourney_games_for_season.at[j, "LTeamID"]
        
        teamA_seed = data.loc[(data["TeamID"] == teamA_id) & (data["Season"] == season)].reset_index(drop=True).loc[0, "Seed"]
        teamB_seed = data.loc[(data["TeamID"] == teamB_id) & (data["Season"] == season)].reset_index(drop=True).loc[0, "Seed"]
        
        teamA_elo = data.loc[(data["TeamID"] == teamA_id) & (data["Season"] == season)].reset_index(drop=True).loc[0, "Elo"]
        teamB_elo = data.loc[(data["TeamID"] == teamB_id) & (data["Season"] == season)].reset_index(drop=True).loc[0, "Elo"]
        
        teamA_stats = data.loc[(data["TeamID"] == teamA_id) & (data["Season"] == season), stats_columns].as_matrix()
        teamB_stats = data.loc[(data["TeamID"] == teamB_id) & (data["Season"] == season), stats_columns].as_matrix()
        r = random.random()
        
        if r > 0.5:
            pred_data.at[index, stats_columns] = (teamA_stats / teamB_stats).ravel()
            pred_data.at[index, "NumTeamSeed"] = teamA_seed
            pred_data.at[index, "DenTeamSeed"] = teamB_seed
            pred_data.at[index, "NumTeamElo"] = teamA_elo
            pred_data.at[index, "DenTeamElo"] = teamB_elo
        else:
            pred_data.at[index, stats_columns] = (teamB_stats / teamA_stats).ravel()
            pred_data.at[index, "NumTeamSeed"] = teamB_seed
            pred_data.at[index, "DenTeamSeed"] = teamA_seed
            pred_data.at[index, "NumTeamElo"] = teamB_elo
            pred_data.at[index, "DenTeamElo"] = teamA_elo
            pred_data.at[index, "NumTeamWon"] = 0
        
        index += 1

pred_data = pred_data.dropna(axis=0, how='any')

In [4]:
all_train_columns = stats_columns + ["NumTeamSeed", "DenTeamSeed", "NumTeamElo", "DenTeamElo"]
train_columns_to_drop = ["FT_PCT", "FGM3", "FGA3", "FTM", "DEF_REB_PCT", "FTA", "FT_RATE", "4Factor",
                         "DR", "FG_PCT", "Stl", "ASSIST_RATIO", "PF", "TO", "Ast", "FGA", "OR"]

# train_columns_to_drop = []

train_columns = [c for c in all_train_columns if c not in train_columns_to_drop]

In [5]:
train_x = pred_data.loc[pred_data["Season"].isin(seasons_train)][train_columns].as_matrix()
train_y = pred_data.loc[pred_data["Season"].isin(seasons_train)]["NumTeamWon"].as_matrix()
test_x = pred_data.loc[pred_data["Season"].isin(seasons_test)][train_columns].as_matrix()
test_y = pred_data.loc[pred_data["Season"].isin(seasons_test)]["NumTeamWon"].as_matrix()

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

parameters = {'max_features': np.arange(1,12,1),
              'min_samples_split': np.arange(2,50,2),
              'min_samples_leaf': np.arange(1,15,1)
              }

clf = GridSearchCV(ExtraTreesRegressor(n_estimators=1000, n_jobs=-1, random_state=0), parameters,
                   cv=10, scoring=make_scorer(log_loss))
# clf = ExtraTreesRegressor(n_estimators=1000, max_features=1, n_jobs=-1)
# clf = LogisticRegression(n_jobs=-1)
clf.fit(train_x, train_y.ravel())
# print("OOB Score: ", clf.oob_score_)

GridSearchCV(cv=10, error_score='raise',
       estimator=ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features='auto', max_leaf_nodes=None,
          min_impurity_decrease=0.0, min_impurity_split=None,
          min_samples_leaf=1, min_samples_split=2,
          min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
          oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'min_samples_split': array([ 2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34,
       36, 38, 40, 42, 44, 46, 48]), 'min_samples_leaf': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]), 'max_features': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [7]:
preds_train = clf.predict(train_x)
preds_test = clf.predict(test_x)

print(clf.best_params_)

# print("Accuracy, Train: ", accuracy_score(train_y, preds_train > 0.5))
# print("Log Loss, Train: ", log_loss(train_y, preds_train))
# print("")
print("Accuracy, Test:  ", accuracy_score(test_y, preds_test > 0.5))
print("Log Loss, Test:  ", log_loss(test_y, preds_test))

{'min_samples_split': 38, 'min_samples_leaf': 3, 'max_features': 4}
Accuracy, Test:   0.746268656716418
Log Loss, Test:   0.5434445406781031


In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

parameters = {'max_features': np.arange(1,12,1),
              'min_samples_split': np.arange(2,50,2),
              'min_samples_leaf': np.arange(1,15,1)
              }

clf1 = GridSearchCV(RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0), parameters,
                    cv=10, scoring=make_scorer(log_loss))
# clf = ExtraTreesRegressor(n_estimators=1000, max_features=1, n_jobs=-1)
# clf = LogisticRegression(n_jobs=-1)
clf1.fit(train_x, train_y.ravel())
# print("OOB Score: ", clf.oob_score_)

KeyboardInterrupt: 

In [None]:
preds_train1 = clf1.predict(train_x)
preds_test1 = clf1.predict(test_x)

print(clf1.best_params_)

# print("Accuracy, Train: ", accuracy_score(train_y, preds_train > 0.5))
# print("Log Loss, Train: ", log_loss(train_y, preds_train))
# print("")
# print("Accuracy, Test:  ", accuracy_score(test_y, preds_test > 0.5))
print("Log Loss, Test:  ", log_loss(test_y, preds_test1))

In [None]:
variables = train_columns
feature_importance = clf.feature_importances_

feature_importance, variables = (list(t) for t in zip(*sorted(zip(feature_importance, variables))))

for i in range(len(feature_importance)):
    print(variables[i], ":", feature_importance[i])