In [138]:
import numpy as np
import pandas as pd
import random
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss, make_scorer

random.seed(1)

root = "/home/austin/Github/kaggle-ncaa-2018/"

data = pd.read_csv(root + "derived_data/Master.csv")
matchups = pd.read_csv("/home/austin/Github/DataFiles/TourneyWinratesBySeed.csv")
tour_ratios = pd.read_csv(root + "derived_data/ratios/NCAATourneyDetailedResultsRatios.csv")

In [139]:
seasons = range(2003, 2018)
seasons_test = [2016]
seasons_train = [season for season in seasons if season not in seasons_test]

all_columns = data.columns.tolist()
non_stats_columns = ["Season", "TeamID", "Seed", "Elo"]#, "BPI", "Predictor_Score"]
stats_columns = [c for c in all_columns if c not in non_stats_columns]

In [140]:
tourney_games = tour_ratios.loc[tour_ratios["Season"].isin(seasons)]
tourney_games = tourney_games.reset_index(drop=True)
extra_cols = ["NumTeamSeed", "DenTeamSeed", "NumTeamElo", "DenTeamElo", "HistWinPct"]

pred_data = pd.DataFrame(index=range(tourney_games.shape[0]), columns= ["Season"] + stats_columns + extra_cols + ["NumTeamWon"])
pred_data.loc[:,"NumTeamWon"] = np.ones(tourney_games.shape[0])
index = 0
for i in range(len(seasons)):
    season = seasons[i]
    tourney_games_for_season = tourney_games.loc[tourney_games["Season"] == season].reset_index()
    for j, row in tourney_games_for_season.iterrows():
        pred_data.loc[index, "Season"] = season
        teamA_id = tourney_games_for_season.at[j, "WTeamID"]
        teamB_id = tourney_games_for_season.at[j, "LTeamID"]
        
        teamA_seed = data.loc[(data["TeamID"] == teamA_id) & (data["Season"] == season)].reset_index(drop=True).loc[0, "Seed"]
        teamB_seed = data.loc[(data["TeamID"] == teamB_id) & (data["Season"] == season)].reset_index(drop=True).loc[0, "Seed"]
        
        teamA_elo = data.loc[(data["TeamID"] == teamA_id) & (data["Season"] == season)].reset_index(drop=True).loc[0, "Elo"]
        teamB_elo = data.loc[(data["TeamID"] == teamB_id) & (data["Season"] == season)].reset_index(drop=True).loc[0, "Elo"]
        
        teamA_stats = data.loc[(data["TeamID"] == teamA_id) & (data["Season"] == season), stats_columns].as_matrix()
        teamB_stats = data.loc[(data["TeamID"] == teamB_id) & (data["Season"] == season), stats_columns].as_matrix()
        
        r = random.random()
        
        if r > 0.5:
            pred_data.at[index, stats_columns] = (teamA_stats / teamB_stats).ravel()
            pred_data.at[index, "NumTeamSeed"] = teamA_seed
            pred_data.at[index, "DenTeamSeed"] = teamB_seed
            pred_data.at[index, "NumTeamElo"] = teamA_elo
            pred_data.at[index, "DenTeamElo"] = teamB_elo
            pred_data.at[index, "HistWinPct"] = matchups[(matchups["WinSeed"] == teamA_seed) & (matchups["LoseSeed"] == teamB_seed)]["1985"]
        else:
            pred_data.at[index, stats_columns] = (teamB_stats / teamA_stats).ravel()
            pred_data.at[index, "NumTeamSeed"] = teamB_seed
            pred_data.at[index, "DenTeamSeed"] = teamA_seed
            pred_data.at[index, "NumTeamElo"] = teamB_elo
            pred_data.at[index, "DenTeamElo"] = teamA_elo
            pred_data.at[index, "HistWinPct"] = matchups[(matchups["WinSeed"] == teamB_seed) & (matchups["LoseSeed"] == teamA_seed)]["1985"]
            pred_data.at[index, "NumTeamWon"] = 0
        
        index += 1

pred_data = pred_data.dropna(axis=0, how='any')

In [141]:
all_train_columns = stats_columns + extra_cols
# train_columns_to_drop = ["FT_PCT", "FGM3", "FGA3", "FTM", "DEF_REB_PCT", "FTA", "FT_RATE", "4Factor",
#                          "DR", "FG_PCT", "Stl", "ASSIST_RATIO", "PF", "TO", "Ast", "FGA", "OR"]

train_columns_to_drop = []

train_columns = [c for c in all_train_columns if c not in train_columns_to_drop]
print(train_columns)

['FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF', 'PIE', 'FG_PCT', 'TURNOVER_RATE', 'OFF_REB_PCT', 'FT_RATE', '4FACTOR', 'OFF_EFF', 'DEF_EFF', 'ASSIST_RATIO', 'DEF_REB_PCT', 'FT_PCT', 'WINPCT', 'NumTeamSeed', 'DenTeamSeed', 'NumTeamElo', 'DenTeamElo', 'HistWinPct']


In [142]:
train_x = pred_data.loc[pred_data["Season"].isin(seasons_train)][train_columns].as_matrix()
train_y = pred_data.loc[pred_data["Season"].isin(seasons_train)]["NumTeamWon"].as_matrix()
test_x = pred_data.loc[pred_data["Season"].isin(seasons_test)][train_columns].as_matrix()
test_y = pred_data.loc[pred_data["Season"].isin(seasons_test)]["NumTeamWon"].as_matrix()

print(train_x.shape)
print(train_y.shape)
print(test_x.shape)
print(test_y.shape)
print(len(train_columns))

# from keras.utils import to_categorical
# # train_y = to_categorical(train_y)

(914, 30)
(914,)
(67, 30)
(67,)
30


In [143]:
parameters = {'C': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10],
              'tol': [1e-4, 1e-3, 1e-2, 1e-1]
              }

clf = GridSearchCV(LogisticRegression(), parameters,
                   cv=10, scoring=make_scorer(log_loss))

clf.fit(train_x, train_y.ravel())

GridSearchCV(cv=10, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'tol': [0.0001, 0.001, 0.01, 0.1], 'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(log_loss), verbose=0)

In [144]:
preds_train = np.array(clf.predict_proba(train_x))
preds_test = np.array(clf.predict_proba(test_x))

# print(preds_test)
# print(test_y)

# threshold = .1

# preds_test[preds_test > 1 - threshold] = 1
# preds_test[preds_test < threshold] = 0

print(clf.best_params_)

print("Accuracy, Train: ", accuracy_score(train_y, preds_train[:,1] > 0.5))
print("Log Loss, Train: ", log_loss(train_y, preds_train))
print("")
print("Accuracy, Test:  ", accuracy_score(test_y, preds_test[:,1] > 0.5))
print("Log Loss, Test:  ", log_loss(test_y, preds_test))

{'tol': 0.01, 'C': 1e-05}
Accuracy, Train:  0.712253829321663
Log Loss, Train:  0.5460727495890044

Accuracy, Test:   0.7611940298507462
Log Loss, Test:   0.5104207111248198
