In [2]:
'''
Imports for this program are here. Need installs of:

xgboost, pandas, numpy, matplotlib, graphviz, sklearn
'''


import xgboost as xgb
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import graphviz
import numpy as np
from sklearn import preprocessing


'''
Need this data from KenPom website that we have already downloaded
in our github repository. Import the data and split it into testing
and training data based on year. The goal of this split is to use
past data (years before 2016) to predict 'future' data (years after 2016).
'''

full_results = pd.read_csv('full_results.csv')
full_stats = pd.read_csv('FullKenPom_pt.csv')

###mask test and train data###
res_test_mask = (full_results['Year'] == 2016) | (full_results['Year'] == 2017) | (full_results['Year'] == 2018)
stats_test_mask = (full_stats['Season'] == 2016) | (full_stats['Season'] == 2017) | (full_stats['Season'] == 2018)
res_train_mask = (full_results['Year'] < 2016)
stats_train_mask = (full_stats['Season'] < 2016)

res_data_test = full_results[res_test_mask]
stats_data_test = full_stats[stats_test_mask]
res_data_train = full_results[res_train_mask]
stats_data_train = full_stats[stats_train_mask]

#reindex all of them
res_data_test = res_data_test.reset_index()
stats_data_test = stats_data_test.reset_index()
res_data_train = res_data_train.reset_index()
stats_data_train = stats_data_train.reset_index()

for i in range(len(res_data_test)):
    if res_data_test['Region Name'][i] == "First Four":
        res_data_test = res_data_test.drop(i)
for i in range(len(res_data_train)):
    if res_data_train['Region Name'][i] == "First Four":
        res_data_train = res_data_train.drop(i)
        
#reindex all of them
res_data_test = res_data_test.reset_index()
stats_data_test = stats_data_test.reset_index()
res_data_train = res_data_train.reset_index()
stats_data_train = stats_data_train.reset_index()


#these names match the kenpom stats csv
stats_vec = ["AdjTempo",
            "AdjOE",
            "AdjDE",
            "AdjEM",
            "seed",
            "ConfTournament",
            "SOSAdjEM",
            "NCSOSAdjEM",
            "O-D_eFG_Pct",
            "D-O_TO_Pct",
            "O-D_OR_Pct",
            "O-D_FT_Rate",
            "LastTenRecord"]

# Move training data into 2 numpy arrays - data and labels (results)
N = len(res_data_train)

training_data = np.zeros((N,13))
training_labels = np.zeros((N,1))

for i in range(len(res_data_train)):
    year = res_data_train['Year'][i]
    teamA = res_data_train['TeamA'][i]
    teamB = res_data_train['TeamB'][i]
    score_diff = res_data_train['ScoreA'][i] - res_data_train['ScoreB'][i]
    for k in range(len(stats_data_train)):
        if ((stats_data_train['Season'][k] == year) and (stats_data_train['TeamName'][k] == teamA)):
            indexA = k
            break
            
    for k in range(len(stats_data_train)):
        if ((stats_data_train['Season'][k] == year) and (stats_data_train['TeamName'][k] == teamB)):
            indexB = k
            break
    for s in range(len(stats_vec)):
        stat = stats_vec[s]
        training_data[i][s] = stats_data_train[stat][indexA] - stats_data_train[stat][indexB]
    
    if (score_diff > 0):
        training_labels[i][0] = 1
    else:
        training_labels[i][0] = 0

# Move testing data into 2 numpy arrays - data and labels (results)
N = len(res_data_test)

testing_data = np.zeros((N,13))
testing_labels = np.zeros((N,1))

for i in range(len(res_data_test)):
    year = res_data_test['Year'][i]
    teamA = res_data_test['TeamA'][i]
    teamB = res_data_test['TeamB'][i]
    score_diff = res_data_test['ScoreA'][i] - res_data_test['ScoreB'][i]
    for k in range(len(stats_data_test)):
        if ((stats_data_test['Season'][k] == year) and (stats_data_test['TeamName'][k] == teamA)):
            indexA = k
            break
            
    for k in range(len(stats_data_test)):
        if ((stats_data_test['Season'][k] == year) and (stats_data_test['TeamName'][k] == teamB)):
            indexB = k
            break
    for s in range(len(stats_vec)):
        stat = stats_vec[s]
        testing_data[i][s] = stats_data_test[stat][indexA] - stats_data_test[stat][indexB]
    
    if (score_diff > 0):
        testing_labels[i][0] = 1
    else:
        testing_labels[i][0] = 0



'''
This next section of code is for normalizing the data (if you want
since xgboost does not need the data to be normalized)
'''


#Time to normalize the data
training_data = preprocessing.normalize(training_data, axis=0, norm='max')
testing_data = preprocessing.normalize(testing_data, axis=0, norm='max')

evallist = [(testing_data, 'eval'), (training_data, 'train')]

param = {'objective': 'multi:softprob'}
param['eval_metric'] = "merror"
param['num_class'] = 2

dtrain = xgb.DMatrix(training_data, label=training_labels,
                     feature_names=stats_vec)
dtest = xgb.DMatrix(testing_data, label=testing_labels,
                    feature_names=stats_vec)


'''
The next three parts (labeled 'Phases') are for tuning the model
for creating the best possible paramters for the final model.
'''

#Phase 1: Tuning Max depth and min_child_weight
param = {'objective': 'multi:softprob'}
param['eval_metric'] = "merror"
param['num_class'] = 2  # 2 classes - win or loss

num_round = 999 #looks like it levels off at around 200

gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(1,8)
    for min_child_weight in range(1,6)
]
min_merror = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    # print("CV with max_depth={}, min_child_weight={}".format(
    #                         max_depth,
    #                         min_child_weight))
    
    # Update Parameters
    param['max_depth'] = max_depth
    param['min_child_weight'] = min_child_weight
    
    #Run CV
    cv_results = xgb.cv(param,
                        dtrain,
                        num_boost_round=num_round, #maybe wrong
                        seed=42,
                        nfold=3,
                        metrics={'merror'},
                        early_stopping_rounds=10)
    
    #Update best MError
    mean_merror = cv_results['test-merror-mean'].min()
    boost_rounds = cv_results['test-merror-mean'].idxmin()
    # boost_rounds = cv_results['test-merror-mean'].argmin()
    # print("\tMerror {} for {} rounds".format(mean_merror, boost_rounds))
    if mean_merror < min_merror:
        min_merror = mean_merror
        best_params = (max_depth, min_child_weight)
    

param['max_depth'] = best_params[0]
param['min_child_weight'] = best_params[1]

#Phase 2: Subsample and Colsample_bytree
#tune subsample,colsample
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(1,11)]
    for colsample in [i/10. for i in range(1,11)]
]
min_merror = float("Inf")
best_params = None
for subsample, colsample in reversed(gridsearch_params):
    # print("CV with subsample={}, colsample={}".format(
    #                         subsample,
    #                         colsample))
    # Update our parameters
    param['subsample'] = subsample
    param['colsample_bytree'] = colsample
    # Run CV
    cv_results = xgb.cv(
        param,
        dtrain,
        num_boost_round=num_round,
        seed=42,
        nfold=3,
        metrics={'merror'},
        early_stopping_rounds=10
    )
    # Update best Merror
    mean_merror = cv_results['test-merror-mean'].min()
    boost_rounds = cv_results['test-merror-mean'].idxmin()
    # boost_rounds = cv_results['test-merror-mean'].argmin()
    # print("\tMerror {} for {} rounds".format(mean_merror, boost_rounds))
    if mean_merror < min_merror:
        min_merror = mean_merror
        best_params = (subsample,colsample)
        
        
param['subsample'] = best_params[0]
param['colsample_bytree'] = best_params[1]

#Phase 3: eta
min_merror = float("Inf")
best_params = None
for eta in [0.5,0.3, 0.03, .003,0.0003]:
    # print("CV with eta={}".format(eta))
    # Update our parameters
    param['eta'] = eta
    # Run CV
    cv_results = xgb.cv(
        param,
        dtrain,
        num_boost_round=num_round,
        seed=42,
        nfold=3,
        metrics={'merror'},
        early_stopping_rounds=10
    )
    # Update best Merror
    mean_merror = cv_results['test-merror-mean'].min()
    boost_rounds = cv_results['test-merror-mean'].idxmin()
    # boost_rounds = cv_results['test-merror-mean'].argmin()
    # print("\tMerror {} for {} rounds".format(mean_merror, boost_rounds))
    if mean_merror < min_merror:
        min_merror = mean_merror
        best_params = eta
        
param['eta'] = best_params


'''
This section is for the final model for predicting results. Here we test the
model (trained against the data between 2006-2015) with the known results of
the 2016-2018 seasons. The plot showing the best parameters is commented out
for your convience, as well as the results of the correct values.
'''

final_gb = xgb.train(param,dtrain,num_boost_round=num_round,
                   early_stopping_rounds=5,evals=[(dtest, "Test")])

# xgb.plot_importance(final_gb)

ypred = final_gb.predict(dtest)
ypred

correct_list = np.zeros_like(testing_labels)
for i in range(len(ypred)):
    if ypred[i][0] < ypred[i][1]:
        metric = 1 # team A predicted to win
    else:
        metric = 0
    correct_list[i][0] = metric
    # print("TeamA Win% {:.4}  SeedDiff {}".format(ypred[i][1]*100,testing_data[i][4]*-1))

print('\n')
print("% Correct: ", (correct_list.sum() / len(correct_list))*100)
print('\n')

# xgb.plot_tree(final_gb)
# fig = plt.gcf()
# fig.set_size_inches(18,25)
# plt.show()

[0]	Test-merror:0.312169
Will train until Test-merror hasn't improved in 5 rounds.
[1]	Test-merror:0.275132
[2]	Test-merror:0.275132
[3]	Test-merror:0.285714
[4]	Test-merror:0.269841
[5]	Test-merror:0.259259
[6]	Test-merror:0.269841
[7]	Test-merror:0.259259
[8]	Test-merror:0.26455
[9]	Test-merror:0.269841
[10]	Test-merror:0.285714
Stopping. Best iteration:
[5]	Test-merror:0.259259



% Correct:  89.41798941798942




In [12]:
# The next many cells of code are for creating the bracket for 2018
# Each round needs to be run seperately, as the teams competing in each
# round is contingent on the last, obviously.
# As a result, there are 6 cells that need to be run to create the bracket

res_data_2018 = res_data_test[res_data_test['Year'] == 2018]
stats_data_2018 = stats_data_test[stats_data_test['Season'] == 2018]

res_data_2018 = res_data_2018.reset_index(drop=True)
stats_data_2018 = stats_data_2018.reset_index(drop=True)

r64_2018_data = np.zeros((32,13))
r64_2018_labels = np.zeros((32,1))
r64_2018_teams = []

for i in range(32):
    year = res_data_2018['Year'][i]
    teamA = res_data_2018['TeamA'][i]
    teamB = res_data_2018['TeamB'][i]
    score_diff = res_data_2018['ScoreA'][i] - res_data_2018['ScoreB'][i]
    for k in range(len(stats_data_2018)):
        if ((stats_data_2018['Season'][k] == year) and (stats_data_2018['TeamName'][k] == teamA)):
            indexA = k
            break
            
    for k in range(len(stats_data_train)):
        if ((stats_data_2018['Season'][k] == year) and (stats_data_2018['TeamName'][k] == teamB)):
            indexB = k
            break
    for s in range(len(stats_vec)):
        stat = stats_vec[s]
        r64_2018_data[i][s] = stats_data_2018[stat][indexA] - stats_data_2018[stat][indexB]
    
    if (score_diff > 0): # team A won
        r64_2018_labels[i][0] = 1
    else:
        r64_2018_labels[i][0] = 0 #team B won
    r64_2018_teams.append([teamA,teamB])

r64_2018_data = preprocessing.normalize(r64_2018_data, axis=0, norm='max')
r64_2018_dmatrix = xgb.DMatrix(r64_2018_data, label=r64_2018_labels,
                    feature_names=stats_vec)

r64_2018_pred = final_gb.predict(r64_2018_dmatrix)

#0 index is teamB win - 1 index is teamA win
print("\n\nRound of 64 Predictions - 2018")
for i in range(len(r64_2018_pred)):
    teamA = r64_2018_teams[i][0]
    teamB = r64_2018_teams[i][1]
    if r64_2018_pred[i][1] > r64_2018_pred[i][0]: #team A win
        print("Winner:", teamA)
        print("Loser: ", teamB, '\n')
    else: #team B win
        print("Winner:", teamB)
        print("Loser: ", teamA, '\n')

# As you can see, I manually read through the results of each round to
# create the matchups for the next round by hand

r32_2018_teams = [["Virginia","Kansas St"],
                  ["Arizona","Kentucky"],
                  ["Tennessee","Miami"],
                  ["Cincinnati","Nevada"],
                  ["Xavier","Missouri"],
                  ["Gonzaga","Ohio State"],
                  ["Michigan","Houston"],
                  ["North Carolina","Texas A&M"],
                  ["Villanova","Virginia Tech"],
                  ["Wichita St","West Virginia"],
                  ["Texas Tech","Florida"],
                  ["Purdue","Arkansas"],
                  ["Kansas","Seton Hall"],
                  ["Auburn","Clemson"],
                  ["Michigan St","TCU"],
                  ["Duke","Oklahoma"]]

r32_2018_data = np.zeros((16,13))
r32_2018_labels = np.zeros((16,1))

for i in range(16):
    year = 2018
    teamA = r32_2018_teams[i][0]
    teamB = r32_2018_teams[i][1]
    for k in range(len(stats_data_2018)):
        if ((stats_data_2018['Season'][k] == year) and (stats_data_2018['TeamName'][k] == teamA)):
            indexA = k
            break
            
    for k in range(len(stats_data_2018)):
        if ((stats_data_2018['Season'][k] == year) and (stats_data_2018['TeamName'][k] == teamB)):
            indexB = k
            break
    for s in range(len(stats_vec)):
        stat = stats_vec[s]
        r32_2018_data[i][s] = stats_data_2018[stat][indexA] - stats_data_2018[stat][indexB]

r32_2018_data = preprocessing.normalize(r32_2018_data, axis=0, norm='max')
r32_2018_dmatrix = xgb.DMatrix(r32_2018_data, label=r32_2018_labels,
                    feature_names=stats_vec)
r32_2018_pred = final_gb.predict(r32_2018_dmatrix)
#0 index is teamB win - 1 index is teamA win
print("\n\nRound of 32 Predictions - 2018")
for i in range(len(r32_2018_pred)):
    teamA = r32_2018_teams[i][0]
    teamB = r32_2018_teams[i][1]
    if r32_2018_pred[i][1] > r32_2018_pred[i][0]: #team A win
        print("Winner:", teamA)
        print("Loser: ", teamB, '\n')
    else: #team B win
        print("Winner:", teamB)
        print("Loser: ", teamA, '\n')

r16_2018_teams = [["Virginia","Kentucky"],
                  ["Cincinnati","Tennessee"],
                  ["Xavier","Gonzaga"],
                  ["North Carolina","Michigan"],
                  ["Villanova","Wichita St"],
                  ["Purdue","Texas Tech"],
                  ["Kansas","Clemson"],
                  ["Duke","Michigan St"]]

r16_2018_data = np.zeros((8,13))
r16_2018_labels = np.zeros((8,1))

for i in range(8):
    year = 2018
    teamA = r16_2018_teams[i][0]
    teamB = r16_2018_teams[i][1]
    for k in range(len(stats_data_2018)):
        if ((stats_data_2018['Season'][k] == year) and (stats_data_2018['TeamName'][k] == teamA)):
            indexA = k
            break
            
    for k in range(len(stats_data_2018)):
        if ((stats_data_2018['Season'][k] == year) and (stats_data_2018['TeamName'][k] == teamB)):
            indexB = k
            break
    for s in range(len(stats_vec)):
        stat = stats_vec[s]
        r16_2018_data[i][s] = stats_data_2018[stat][indexA] - stats_data_2018[stat][indexB]
        
r16_2018_data = preprocessing.normalize(r16_2018_data, axis=0, norm='max')
r16_2018_dmatrix = xgb.DMatrix(r16_2018_data, label=r16_2018_labels,
                    feature_names=stats_vec)
r16_2018_pred = final_gb.predict(r16_2018_dmatrix)
#0 index is teamB win - 1 index is teamA win
print("\n\nSweet 16 Predictions - 2018")
for i in range(len(r16_2018_pred)):
    teamA = r16_2018_teams[i][0]
    teamB = r16_2018_teams[i][1]
    if r16_2018_pred[i][1] > r16_2018_pred[i][0]: #team A win
        print("Winner:", teamA)
        print("Loser: ", teamB, '\n')
    else: #team B win
        print("Winner:", teamB)
        print("Loser: ", teamA, '\n')


r8_2018_teams = [["Virginia","Cincinnati"],
                 ["North Carolina","Xavier"],
                 ["Villanova","Purdue"],
                 ["Kansas","Duke"]]

r8_2018_data = np.zeros((4,13))
r8_2018_labels = np.zeros((4,1))

for i in range(4):
    year = 2018
    teamA = r8_2018_teams[i][0]
    teamB = r8_2018_teams[i][1]
    for k in range(len(stats_data_2018)):
        if ((stats_data_2018['Season'][k] == year) and (stats_data_2018['TeamName'][k] == teamA)):
            indexA = k
            break
            
    for k in range(len(stats_data_2018)):
        if ((stats_data_2018['Season'][k] == year) and (stats_data_2018['TeamName'][k] == teamB)):
            indexB = k
            break
    for s in range(len(stats_vec)):
        stat = stats_vec[s]
        r8_2018_data[i][s] = stats_data_2018[stat][indexA] - stats_data_2018[stat][indexB]
        
r8_2018_data = preprocessing.normalize(r8_2018_data, axis=0, norm='max')
r8_2018_dmatrix = xgb.DMatrix(r8_2018_data, label=r8_2018_labels,
                    feature_names=stats_vec)
r8_2018_pred = final_gb.predict(r8_2018_dmatrix)
#0 index is teamB win - 1 index is teamA win
print("\n\nElite 8 Predictions - 2018")
for i in range(len(r8_2018_pred)):
    teamA = r8_2018_teams[i][0]
    teamB = r8_2018_teams[i][1]
    if r8_2018_pred[i][1] > r8_2018_pred[i][0]: #team A win
        print("Winner:", teamA)
        print("Loser: ", teamB, '\n')
    else: #team B win
        print("Winner:", teamB)
        print("Loser: ", teamA, '\n')


r4_2018_teams = [["Virginia","North Carolina"],
                 ["Villanova","Duke"]]

r4_2018_data = np.zeros((2,13))
r4_2018_labels = np.zeros((2,1))

for i in range(2):
    year = 2018
    teamA = r4_2018_teams[i][0]
    teamB = r4_2018_teams[i][1]
    for k in range(len(stats_data_2018)):
        if ((stats_data_2018['Season'][k] == year) and (stats_data_2018['TeamName'][k] == teamA)):
            indexA = k
            break
            
    for k in range(len(stats_data_2018)):
        if ((stats_data_2018['Season'][k] == year) and (stats_data_2018['TeamName'][k] == teamB)):
            indexB = k
            break
    for s in range(len(stats_vec)):
        stat = stats_vec[s]
        r4_2018_data[i][s] = stats_data_2018[stat][indexA] - stats_data_2018[stat][indexB]
        
r4_2018_data = preprocessing.normalize(r4_2018_data, axis=0, norm='max')
r4_2018_dmatrix = xgb.DMatrix(r4_2018_data, label=r4_2018_labels,
                    feature_names=stats_vec)
r4_2018_pred = final_gb.predict(r4_2018_dmatrix)
#0 index is teamB win - 1 index is teamA win
print("\n\nFinal 4 Predictions - 2018")
for i in range(len(r4_2018_pred)):
    teamA = r4_2018_teams[i][0]
    teamB = r4_2018_teams[i][1]
    if r4_2018_pred[i][1] > r4_2018_pred[i][0]: #team A win
        print("Winner:", teamA)
        print("Loser: ", teamB, '\n')
    else: #team B win
        print("Winner:", teamB)
        print("Loser: ", teamA, '\n')


r2_2018_teams = [["Virginia","Villanova"]]

r2_2018_data = np.zeros((1,13))
r2_2018_labels = np.zeros((1,1))

for i in range(1):
    year = 2018
    teamA = r2_2018_teams[i][0]
    teamB = r2_2018_teams[i][1]
    for k in range(len(stats_data_2018)):
        if ((stats_data_2018['Season'][k] == year) and (stats_data_2018['TeamName'][k] == teamA)):
            indexA = k
            break
            
    for k in range(len(stats_data_2018)):
        if ((stats_data_2018['Season'][k] == year) and (stats_data_2018['TeamName'][k] == teamB)):
            indexB = k
            break
    for s in range(len(stats_vec)):
        stat = stats_vec[s]
        r2_2018_data[i][s] = stats_data_2018[stat][indexA] - stats_data_2018[stat][indexB]
        
r2_2018_data = preprocessing.normalize(r2_2018_data, axis=0, norm='max')
r2_2018_dmatrix = xgb.DMatrix(r2_2018_data, label=r2_2018_labels,
                    feature_names=stats_vec)
r2_2018_pred = final_gb.predict(r2_2018_dmatrix)
#0 index is teamB win - 1 index is teamA win
print("\n\nNational Championship Prediction - 2018")
for i in range(len(r2_2018_pred)):
    teamA = r2_2018_teams[i][0]
    teamB = r2_2018_teams[i][1]
    if r2_2018_pred[i][1] > r2_2018_pred[i][0]: #team A win
        print("Winner:", teamA)
        print("Loser: ", teamB, '\n')
    else: #team B win
        print("Winner:", teamB)
        print("Loser: ", teamA, '\n')




Round of 64 Predictions - 2018
Winner: Virginia
Loser:  UMBC 

Winner: Cincinnati
Loser:  Georgia St 

Winner: Tennessee
Loser:  Wright St 

Winner: Arizona
Loser:  Buffalo 

Winner: Kentucky
Loser:  Davidson 

Winner: Miami
Loser:  Loyola Chicago 

Winner: Nevada
Loser:  Texas 

Winner: Kansas St
Loser:  Creighton 

Winner: Xavier
Loser:  Texas Southern 

Winner: North Carolina
Loser:  Lipscomb 

Winner: Michigan
Loser:  Montana 

Winner: Gonzaga
Loser:  UNC Greensboro 

Winner: Ohio St
Loser:  South Dakota St 

Winner: Houston
Loser:  San Diego St 

Winner: Texas A&M
Loser:  Providence 

Winner: Missouri
Loser:  Florida St 

Winner: Villanova
Loser:  Radford 

Winner: Purdue
Loser:  Cal St Fullerton 

Winner: Texas Tech
Loser:  Stephen F Austin 

Winner: Wichita St
Loser:  Marshall 

Winner: West Virginia
Loser:  Murray St 

Winner: Florida
Loser:  St Bonaventure 

Winner: Arkansas
Loser:  Butler 

Winner: Virginia Tech
Loser:  Alabama 

Winner: Kansas
Loser:  Pennsylvania 

Winner

In [13]:
# Now, I did the same for 2017, just to see how well we would do when a
# historic upset does NOT occur in the first round

res_data_2017 = res_data_test[res_data_test['Year'] == 2017]
stats_data_2017 = stats_data_test[stats_data_test['Season'] == 2017]

res_data_2017 = res_data_2017.reset_index(drop=True)
stats_data_2017 = stats_data_2017.reset_index(drop=True)

r64_2017_data = np.zeros((32,13))
r64_2017_labels = np.zeros((32,1))
r64_2017_teams = []

for i in range(32):
    year = res_data_2017['Year'][i]
    teamA = res_data_2017['TeamA'][i]
    teamB = res_data_2017['TeamB'][i]
    score_diff = res_data_2017['ScoreA'][i] - res_data_2017['ScoreB'][i]
    for k in range(len(stats_data_2017)):
        if ((stats_data_2017['Season'][k] == year) and (stats_data_2017['TeamName'][k] == teamA)):
            indexA = k
            break
            
    for k in range(len(stats_data_train)):
        if ((stats_data_2017['Season'][k] == year) and (stats_data_2017['TeamName'][k] == teamB)):
            indexB = k
            break
    for s in range(len(stats_vec)):
        stat = stats_vec[s]
        r64_2017_data[i][s] = stats_data_2017[stat][indexA] - stats_data_2017[stat][indexB]
    
    if (score_diff > 0): # team A won
        r64_2017_labels[i][0] = 1
    else:
        r64_2017_labels[i][0] = 0 #team B won
    r64_2017_teams.append([teamA,teamB])

r64_2017_data = preprocessing.normalize(r64_2017_data, axis=0, norm='max')
r64_2017_dmatrix = xgb.DMatrix(r64_2017_data, label=r64_2017_labels,
                    feature_names=stats_vec)

r64_2017_pred = final_gb.predict(r64_2017_dmatrix)

#0 index is teamB win - 1 index is teamA win
print("\n\nRound of 64 Predictions - 2017")
for i in range(len(r64_2017_pred)):
    teamA = r64_2017_teams[i][0]
    teamB = r64_2017_teams[i][1]
    if r64_2017_pred[i][1] > r64_2017_pred[i][0]: #team A win
        print("Winner:", teamA)
        print("Loser: ", teamB, '\n')
    else: #team B win
        print("Winner:", teamB)
        print("Loser: ", teamA, '\n')

r32_2017_teams = [["Villanova","Wisconsin"],
                  ["Florida","Virginia"],
                  ["Baylor","SMU"],
                  ["Duke","South Carolina"],
                  ["Gonzaga","Vanderbilt"],
                  ["West Virginia","Notre Dame"],
                  ["Florida St","Xavier"],
                  ["Arizona","St Marys"],
                  ["Kansas","Michigan St"],
                  ["Purdue","Iowa St"],
                  ["Oregon","Creighton"],
                  ["Louisville","Michigan"],
                  ["North Carolina","Arkansas"],
                  ["Butler","Minnesota"],
                  ["UCLA","Cincinnati"],
                  ["Kentucky","Wichita St"]]

r32_2017_data = np.zeros((16,13))
r32_2017_labels = np.zeros((16,1))

for i in range(16):
    year = 2017
    teamA = r32_2017_teams[i][0]
    teamB = r32_2017_teams[i][1]
    for k in range(len(stats_data_2017)):
        if ((stats_data_2017['Season'][k] == year) and (stats_data_2017['TeamName'][k] == teamA)):
            indexA = k
            break
            
    for k in range(len(stats_data_2017)):
        if ((stats_data_2017['Season'][k] == year) and (stats_data_2017['TeamName'][k] == teamB)):
            indexB = k
            break
    for s in range(len(stats_vec)):
        stat = stats_vec[s]
        r32_2017_data[i][s] = stats_data_2017[stat][indexA] - stats_data_2017[stat][indexB]

r32_2017_data = preprocessing.normalize(r32_2017_data, axis=0, norm='max')
r32_2017_dmatrix = xgb.DMatrix(r32_2017_data, label=r32_2017_labels,
                    feature_names=stats_vec)
r32_2017_pred = final_gb.predict(r32_2017_dmatrix)
#0 index is teamB win - 1 index is teamA win
print("\n\nRound of 32 Predictions - 2017")
for i in range(len(r32_2017_pred)):
    teamA = r32_2017_teams[i][0]
    teamB = r32_2017_teams[i][1]
    if r32_2017_pred[i][1] > r32_2017_pred[i][0]: #team A win
        print("Winner:", teamA)
        print("Loser: ", teamB, '\n')
    else: #team B win
        print("Winner:", teamB)
        print("Loser: ", teamA, '\n')

r16_2017_teams = [["Villanova","Florida"],
                  ["Duke","SMU"],
                  ["Gonzaga","West Virginia"],
                  ["Arizona","Florida St"],
                  ["Kansas","Purdue"],
                  ["Louisville","Oregon"],
                  ["North Carolina","Butler"],
                  ["Kentucky","UCLA"]]

r16_2017_data = np.zeros((8,13))
r16_2017_labels = np.zeros((8,1))

for i in range(8):
    year = 2017
    teamA = r16_2017_teams[i][0]
    teamB = r16_2017_teams[i][1]
    for k in range(len(stats_data_2017)):
        if ((stats_data_2017['Season'][k] == year) and (stats_data_2017['TeamName'][k] == teamA)):
            indexA = k
            break
            
    for k in range(len(stats_data_2017)):
        if ((stats_data_2017['Season'][k] == year) and (stats_data_2017['TeamName'][k] == teamB)):
            indexB = k
            break
    for s in range(len(stats_vec)):
        stat = stats_vec[s]
        r16_2017_data[i][s] = stats_data_2017[stat][indexA] - stats_data_2017[stat][indexB]
        
r16_2017_data = preprocessing.normalize(r16_2017_data, axis=0, norm='max')
r16_2017_dmatrix = xgb.DMatrix(r16_2017_data, label=r16_2017_labels,
                    feature_names=stats_vec)
r16_2017_pred = final_gb.predict(r16_2017_dmatrix)
#0 index is teamB win - 1 index is teamA win
print("\n\nSweet 16 Predictions - 2017")
for i in range(len(r16_2017_pred)):
    teamA = r16_2017_teams[i][0]
    teamB = r16_2017_teams[i][1]
    if r16_2017_pred[i][1] > r16_2017_pred[i][0]: #team A win
        print("Winner:", teamA)
        print("Loser: ", teamB, '\n')
    else: #team B win
        print("Winner:", teamB)
        print("Loser: ", teamA, '\n')

r8_2017_teams = [["Villanova","Duke"],
                 ["Gonzaga","Arizona"],
                 ["Kansas","Louisville"],
                 ["North Carolina","Kentucky"]]
r8_2017_data = np.zeros((4,13))
r8_2017_labels = np.zeros((4,1))

for i in range(4):
    year = 2017
    teamA = r8_2017_teams[i][0]
    teamB = r8_2017_teams[i][1]
    for k in range(len(stats_data_2017)):
        if ((stats_data_2017['Season'][k] == year) and (stats_data_2017['TeamName'][k] == teamA)):
            indexA = k
            break
            
    for k in range(len(stats_data_2017)):
        if ((stats_data_2017['Season'][k] == year) and (stats_data_2017['TeamName'][k] == teamB)):
            indexB = k
            break
    for s in range(len(stats_vec)):
        stat = stats_vec[s]
        r8_2017_data[i][s] = stats_data_2017[stat][indexA] - stats_data_2017[stat][indexB]
        
r8_2017_data = preprocessing.normalize(r8_2017_data, axis=0, norm='max')
r8_2017_dmatrix = xgb.DMatrix(r8_2017_data, label=r8_2017_labels,
                    feature_names=stats_vec)
r8_2017_pred = final_gb.predict(r8_2017_dmatrix)
#0 index is teamB win - 1 index is teamA win
print("\n\nElite 8 Predictions - 2017")
for i in range(len(r8_2017_pred)):
    teamA = r8_2017_teams[i][0]
    teamB = r8_2017_teams[i][1]
    if r8_2017_pred[i][1] > r8_2017_pred[i][0]: #team A win
        print("Winner:", teamA)
        print("Loser: ", teamB, '\n')
    else: #team B win
        print("Winner:", teamB)
        print("Loser: ", teamA, '\n')


r4_2017_teams = [["Villanova","Gonzaga"],
                 ["Kansas","North Carolina"]]

r4_2017_data = np.zeros((2,13))
r4_2017_labels = np.zeros((2,1))

for i in range(2):
    year = 2017
    teamA = r4_2017_teams[i][0]
    teamB = r4_2017_teams[i][1]
    for k in range(len(stats_data_2017)):
        if ((stats_data_2017['Season'][k] == year) and (stats_data_2017['TeamName'][k] == teamA)):
            indexA = k
            break
            
    for k in range(len(stats_data_2017)):
        if ((stats_data_2017['Season'][k] == year) and (stats_data_2017['TeamName'][k] == teamB)):
            indexB = k
            break
    for s in range(len(stats_vec)):
        stat = stats_vec[s]
        r4_2017_data[i][s] = stats_data_2017[stat][indexA] - stats_data_2017[stat][indexB]
        
r4_2017_data = preprocessing.normalize(r4_2017_data, axis=0, norm='max')
r4_2017_dmatrix = xgb.DMatrix(r4_2017_data, label=r4_2017_labels,
                    feature_names=stats_vec)
r4_2017_pred = final_gb.predict(r4_2017_dmatrix)
#0 index is teamB win - 1 index is teamA win
print("\n\nFinal 4 Predictions - 2017")
for i in range(len(r4_2017_pred)):
    teamA = r4_2017_teams[i][0]
    teamB = r4_2017_teams[i][1]
    if r4_2017_pred[i][1] > r4_2017_pred[i][0]: #team A win
        print("Winner:", teamA)
        print("Loser: ", teamB, '\n')
    else: #team B win
        print("Winner:", teamB)
        print("Loser: ", teamA, '\n')

r2_2017_teams = [["Villanova","Kansas"]]

r2_2017_data = np.zeros((1,13))
r2_2017_labels = np.zeros((1,1))

for i in range(1):
    year = 2017
    teamA = r2_2017_teams[i][0]
    teamB = r2_2017_teams[i][1]
    for k in range(len(stats_data_2017)):
        if ((stats_data_2017['Season'][k] == year) and (stats_data_2017['TeamName'][k] == teamA)):
            indexA = k
            break
            
    for k in range(len(stats_data_2017)):
        if ((stats_data_2017['Season'][k] == year) and (stats_data_2017['TeamName'][k] == teamB)):
            indexB = k
            break
    for s in range(len(stats_vec)):
        stat = stats_vec[s]
        r2_2017_data[i][s] = stats_data_2017[stat][indexA] - stats_data_2017[stat][indexB]
        
r2_2017_data = preprocessing.normalize(r2_2017_data, axis=0, norm='max')
r2_2017_dmatrix = xgb.DMatrix(r2_2017_data, label=r2_2017_labels,
                    feature_names=stats_vec)
r2_2017_pred = final_gb.predict(r2_2017_dmatrix)
#0 index is teamB win - 1 index is teamA win
print("\n\nNational Championship Prediction - 2017")
for i in range(len(r2_2017_pred)):
    teamA = r2_2017_teams[i][0]
    teamB = r2_2017_teams[i][1]
    if r2_2017_pred[i][1] > r2_2017_pred[i][0]: #team A win
        print("Winner:", teamA)
        print("Loser: ", teamB, '\n')
    else: #team B win
        print("Winner:", teamB)
        print("Loser: ", teamA, '\n')




Round of 64 Predictions - 2017
Winner: Villanova
Loser:  Mount St Marys 

Winner: Duke
Loser:  Troy 

Winner: Baylor
Loser:  New Mexico St 

Winner: Florida
Loser:  East Tennessee St 

Winner: Virginia
Loser:  UNC Wilmington 

Winner: SMU
Loser:  USC 

Winner: South Carolina
Loser:  Marquette 

Winner: Wisconsin
Loser:  Virginia Tech 

Winner: Gonzaga
Loser:  South Dakota St 

Winner: Arizona
Loser:  North Dakota 

Winner: Florida St
Loser:  Florida Gulf Coast 

Winner: West Virginia
Loser:  Bucknell 

Winner: Notre Dame
Loser:  Princeton 

Winner: Xavier
Loser:  Maryland 

Winner: St Marys
Loser:  VCU 

Winner: Vanderbilt
Loser:  Northwestern 

Winner: Kansas
Loser:  UC Davis 

Winner: Louisville
Loser:  Jacksonville St 

Winner: Oregon
Loser:  Iona 

Winner: Purdue
Loser:  Vermont 

Winner: Iowa St
Loser:  Nevada 

Winner: Creighton
Loser:  Rhode Island 

Winner: Michigan
Loser:  Oklahoma St 

Winner: Michigan St
Loser:  Miami 

Winner: North Carolina
Loser:  Texas Southern 

Winne

In [14]:
# I wanted to also see how well the machine predicts closely seeded games
# Obviously it does will with 1v16 and 2v15 matchups, but how well would it do
# when a 1v3, or 2v3, or even 1v1 matchup arises?

even_matchup_mask = testing_data[:,4] >= -2

even_matchup_stats = testing_data[even_matchup_mask]
even_matchup_ypred = ypred[even_matchup_mask]
even_matchup_labels = testing_labels[even_matchup_mask]

correct_list = np.zeros_like(even_matchup_labels)
for i in range(len(even_matchup_ypred)):
    if even_matchup_ypred[i][0] < even_matchup_ypred[i][1]:
        metric = 1 # team A predicted to win
    else:
        metric = 0
    correct_list[i][0] = metric
    #print("TeamA Win% {:.4}  SeedDiff {}".format(even_matchup_ypred[i][1]*100,even_matchup_stats[i][4]*-1))
    
print("% Correct: ", (correct_list.sum() / len(correct_list))*100)

% Correct:  67.56756756756756
