In [1]:
import numpy as np
import pandas as pd
import random

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, ExtraTreesRegressor, VotingClassifier
from sklearn.metrics import accuracy_score, log_loss, make_scorer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

random.seed(1)

root = "/home/austin/Github/kaggle-ncaa-2018/"

data = pd.read_csv(root + "derived_data/Master.csv")
matchups = pd.read_csv("/home/austin/Github/DataFiles/TourneyWinratesBySeed.csv")
tour_ratios = pd.read_csv(root + "derived_data/ratios/NCAATourneyDetailedResultsRatios.csv")
comp = pd.read_csv(root + "derived_data/TeamCompositeStats.csv")

In [2]:
seasons = range(2003, 2018)

all_columns = data.columns.tolist()
non_stats_columns = ["Season", "TeamID", "Seed", "Elo"]#, "BPI", "Predictor_Score"]
stats_columns = [c for c in all_columns if c not in non_stats_columns]

# stats_columns_A = [c+"num" for c in stats_columns]
# stats_columns_B = [c+"den" for c in stats_columns]

# Put all data in a single DataFrame

In [3]:
tourney_games = tour_ratios.loc[tour_ratios["Season"].isin(seasons)]
tourney_games = tourney_games.reset_index(drop=True)
extra_cols = ["SeedDiff", "NumTeamElo", "DenTeamElo", "HistWinPct", "NumTeamBPI", "DenTeamBPI"]

pred_data = pd.DataFrame(index=range(tourney_games.shape[0]), columns= ["Season"] + stats_columns + extra_cols + ["NumTeamWon"])
pred_data.loc[:,"NumTeamWon"] = np.ones(tourney_games.shape[0])
index = 0
for i in range(len(seasons)):
    season = seasons[i]
    tourney_games_for_season = tourney_games.loc[tourney_games["Season"] == season].reset_index()
    for j, row in tourney_games_for_season.iterrows():
        pred_data.loc[index, "Season"] = season
        teamA_id = tourney_games_for_season.at[j, "WTeamID"]
        teamB_id = tourney_games_for_season.at[j, "LTeamID"]
        
        teamA_seed = data.loc[(data["TeamID"] == teamA_id) & (data["Season"] == season)].reset_index(drop=True).loc[0, "Seed"]
        teamB_seed = data.loc[(data["TeamID"] == teamB_id) & (data["Season"] == season)].reset_index(drop=True).loc[0, "Seed"]
        
        teamA_elo = data.loc[(data["TeamID"] == teamA_id) & (data["Season"] == season)].reset_index(drop=True).loc[0, "Elo"]
        teamB_elo = data.loc[(data["TeamID"] == teamB_id) & (data["Season"] == season)].reset_index(drop=True).loc[0, "Elo"]
        
        teamA_stats = data.loc[(data["TeamID"] == teamA_id) & (data["Season"] == season), stats_columns].as_matrix()
        teamB_stats = data.loc[(data["TeamID"] == teamB_id) & (data["Season"] == season), stats_columns].as_matrix()
        
        r = random.random()
        
        if r > 0.5:
            pred_data.at[index, stats_columns] = (teamA_stats / teamB_stats).ravel()
            pred_data.at[index, "SeedDiff"] = teamA_seed - teamB_seed
#             pred_data.at[index, "NumTeamSeed"] = teamA_seed
#             pred_data.at[index, "DenTeamSeed"] = teamB_seed
            pred_data.at[index, "NumTeamElo"] = teamA_elo
            pred_data.at[index, "DenTeamElo"] = teamB_elo
            pred_data.at[index, "NumTeamBPI"] = comp.loc[(comp["Season"] == season) & (comp["TeamID"] == teamA_id), "OrdinalRank"]
            pred_data.at[index, "DenTeamBPI"] = comp.loc[(comp["Season"] == season) & (comp["TeamID"] == teamB_id), "OrdinalRank"]
            pred_data.at[index, "HistWinPct"] = matchups[(matchups["WinSeed"] == teamA_seed) & (matchups["LoseSeed"] == teamB_seed)]["1985"]
        else:
            pred_data.at[index, stats_columns] = (teamB_stats / teamA_stats).ravel()
            pred_data.at[index, "SeedDiff"] = teamB_seed - teamA_seed
#             pred_data.at[index, "NumTeamSeed"] = teamB_seed
#             pred_data.at[index, "DenTeamSeed"] = teamA_seed
            pred_data.at[index, "NumTeamElo"] = teamB_elo
            pred_data.at[index, "DenTeamElo"] = teamA_elo
            pred_data.at[index, "NumTeamBPI"] = comp.loc[(comp["Season"] == season) & (comp["TeamID"] == teamB_id), "OrdinalRank"]
            pred_data.at[index, "DenTeamBPI"] = comp.loc[(comp["Season"] == season) & (comp["TeamID"] == teamA_id), "OrdinalRank"]
            pred_data.at[index, "HistWinPct"] = matchups[(matchups["WinSeed"] == teamB_seed) & (matchups["LoseSeed"] == teamA_seed)]["2003"]
            pred_data.at[index, "NumTeamWon"] = 0
        
        index += 1

pred_data = pred_data.dropna(axis=0, how='any')
print(pred_data)

     Season       FGM       FGA      FGM3      FGA3       FTM       FTA  \
0      2003   1.01452  0.973123  0.915248   1.02778   1.08985   1.34091   
1      2003   1.22128   1.17637   1.33357   1.29637   1.36337   1.27866   
2      2003   1.03543  0.948276  0.571429  0.627148   1.17281   1.14458   
3      2003   1.07799   1.09043   1.16728   1.14242  0.720009  0.795724   
4      2003   0.88987  0.908201   1.24211   1.32085   1.17844   1.04865   
5      2003  0.813733  0.824085   1.02091   1.02733   1.22104   1.09327   
6      2003   1.03662   1.15803   1.74603   1.78098   1.17059   1.15659   
7      2003   1.13854  0.978352   1.07228  0.977667   1.19634   1.14439   
8      2003  0.948697   1.00185   1.03718   1.04292   1.01216   1.10454   
9      2003  0.783903  0.833601  0.840278  0.858491  0.904564  0.838398   
10     2003   1.12424   1.06687   1.08952   1.03329   1.16375   1.06637   
11     2003   1.04607  0.977584  0.815086  0.755128   1.14474   1.06821   
12     2003  0.998501   1

# Select which stats to use

# Reformat Training Data

In [12]:
train_columns = ['SeedDiff', 'PIE', 'DenTeamElo', 'NumTeamElo', 'NumTeamBPI', 'DenTeamBPI']

In [13]:
train_x = pred_data.loc[pred_data["Season"].isin(seasons)][train_columns].as_matrix()
train_y = pred_data.loc[pred_data["Season"].isin(seasons)]["NumTeamWon"].as_matrix()

print(train_x.shape)
print(train_y.shape)

(981, 6)
(981,)


# Logistic Regression

In [14]:
# parameters = {
#               'C': np.linspace(1e-5, 1e-4, 25),
#               'tol': np.linspace(0.001, 0.1, 25)
#               }

# logReg = GridSearchCV(LogisticRegression(), parameters, cv=10)

logReg = LogisticRegression(C=3.625e-5, tol=.0546)
logReg.fit(train_x, train_y.ravel())

LogisticRegression(C=3.625e-05, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0546, verbose=0, warm_start=False)

In [15]:
# preds_train = np.array(logReg.predict_proba(train_x))
# preds_test_lr = np.array(logReg.predict_proba(test_x))[:,1]

# # print(preds_test)
# # print(test_y)

# # threshold = .1

# # preds_test_lr[preds_test_lr > 1 - threshold] = 1
# # preds_test_lr[preds_test_lr < threshold] = 0

# # print(logReg.best_params_)

# # print("Accuracy, Train: ", accuracy_score(train_y, preds_train > 0.5))
# # print("Log Loss, Train: ", log_loss(train_y, preds_train))
# # print("")
# print("Accuracy, Test:  ", logReg.score(test_x, test_y))
# print("Log Loss, Test:  ", log_loss(test_y, preds_test_lr))

In [16]:
# logReg = LogisticRegression(C=9.7e-5, tol=0.07)

# scores= cross_val_score(logReg, np.concatenate((test_x, train_x)),np.concatenate((test_y, train_y)), cv=5)
# print(scores)
# print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Random Forest

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

parameters = {'max_features': np.arange(1,12,1),
              'min_samples_split': np.arange(2,50,4),
              'min_samples_leaf': np.arange(1,25,4)
              }

# RF = GridSearchCV(RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0), parameters,
#                     cv=10, scoring=make_scorer(log_loss))
RF = RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=0, max_features=4, min_samples_leaf=40)

# clf = ExtraTreesRegressor(n_estimators=1000, max_features=1, n_jobs=-1)
# clf = LogisticRegression(n_jobs=-1)
RF.fit(train_x, train_y.ravel())
# print("OOB Score: ", clf.oob_score_)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=4, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=40, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [18]:
# # print(RF.best_params_)

# preds_train = RF.predict(train_x)
# preds_test_rf = RF.predict_proba(test_x)[:,1]
# # preds_test_rf = RF.predict_proba(test_x)

# # threshold = .1
# # preds_test_rf[preds_test_rf > 1 - threshold] = 1
# # preds_test_rf[preds_test_rf < threshold] = 0

# # print("Accuracy, Train: ", accuracy_score(train_y, preds_train > 0.5))
# # print("Log Loss, Train: ", log_loss(train_y, preds_train))
# # print("")
# print("Accuracy, Test:  ", accuracy_score(test_y, preds_test_rf > 0.5))
# print("Log Loss, Test:  ", log_loss(test_y, preds_test_rf))

In [19]:
# variables = train_columns
# feature_importance = RF.feature_importances_

# feature_importance, variables = (list(t) for t in zip(*sorted(zip(feature_importance, variables))))

# features_to_keep = []
# for i in range(len(feature_importance)):
#     print(variables[i], ":", feature_importance[i])
#     if feature_importance[i] > 0.03:
#         features_to_keep.append(variables[i])
# print(features_to_keep)

# Voter

In [21]:
from sklearn.model_selection import cross_val_score
logReg1 = LogisticRegression(C=3.625e-5, tol=.0546)
RF1 = RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=0, max_features=4, min_samples_leaf=40)
# mlp = MLPClassifier()
# knn = KNeighborsClassifier(n_neighbors=7, algorithm='auto', leaf_size=1)
voter = VotingClassifier(estimators=[('lr', logReg1), ('rf', RF1)], voting='soft')
# scores= cross_val_score(voter, train_x, train_y, cv=5)
# print(scores)
# print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [22]:
voter.fit(train_x, train_y.ravel())
# preds_test = voter.predict_proba(test_x)[:,1]
# for i in range(len(preds_test)):
#     if preds_test[i] > .9:
#         preds_test[i] = 1
#     elif preds_test[i] < .1:
#         preds_test[i] = 0
# threshold = .19
# preds_test[preds_test > 1 - threshold] = 1
# preds_test[preds_test < threshold] = 0
# print("Accuracy, Test:  ", voter.score(test_x, test_y))
# print("Log Loss, Test:  ", log_loss(test_y, preds_test))

NameError: name 'test_x' is not defined

In [None]:
# from sklearn.model_selection import cross_val_score
# knn = KNeighborsClassifier(n_neighbors=7, algorithm='auto', leaf_size=1)
# logReg = LogisticRegression(C=10, tol=0.0001)
# RF = RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=0, max_features=4, min_samples_leaf=40)

# voter = VotingClassifier(estimators=[('lr', logReg), ('rf', RF), ('knn', knn)], voting='soft')
# voter.fit(train_x, train_y)
# print("Accuracy: ", voter.score(test_x, test_y))


# scores2 = cross_val_score(voter, np.concatenate((test_x, train_x)),np.concatenate((test_y, train_y)), cv=4)
# print(scores2)
# print("Accuracy: %0.2f (+/- %0.2f)" % (scores2.mean(), scores2.std() * 2))

In [None]:
logReg_scores = cross_val_score(logReg, train_x, train_y, cv=5)
RF_scores = cross_val_score(RF, train_x, train_y, cv=5)
voter_scores = cross_val_score(voter, train_x, train_y, cv=5)

print("LogReg Accuracy: %0.2f (+/- %0.2f)" % (logReg_scores.mean(), logReg_scores.std() * 2))
print("RF Accuracy: %0.2f (+/- %0.2f)" % (RF_scores.mean(), RF_scores.std() * 2))
print("Voter Accuracy: %0.2f (+/- %0.2f)" % (voter_scores.mean(), voter_scores.std() * 2))