In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np
import datetime as dt
import random
from sportsreference.ncaab.teams import Teams
from sklearn.preprocessing import scale
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [2]:
teams_df = pd.DataFrame()
i = 2010
while i <=2018:
    print("Downloading data for the", i, "season.")
    start = dt.datetime.now()
    teams = Teams(year = i)
    end = dt.datetime.now()
    teams_temp = teams.dataframes
    teams_temp['Season'] = i
    teams_df = pd.concat([teams_df, teams_temp])
    span = end - start
    print(i, "took", span, "seconds to download.")
    i += 1

Downloading data for the 2010 season.
2010 took 0:00:25.708131 seconds to download.
Downloading data for the 2011 season.
2011 took 0:00:28.452803 seconds to download.
Downloading data for the 2012 season.
2012 took 0:00:27.385087 seconds to download.
Downloading data for the 2013 season.
2013 took 0:00:30.318756 seconds to download.
Downloading data for the 2014 season.
2014 took 0:00:27.503782 seconds to download.
Downloading data for the 2015 season.
2015 took 0:00:29.890348 seconds to download.
Downloading data for the 2016 season.
2016 took 0:00:27.798723 seconds to download.
Downloading data for the 2017 season.
2017 took 0:00:27.657564 seconds to download.
Downloading data for the 2018 season.
2018 took 0:00:28.595215 seconds to download.


In [64]:
teams_df.columns

Index(['abbreviation', 'assist_percentage', 'assists', 'away_losses',
       'away_wins', 'block_percentage', 'blocks', 'conference',
       'conference_losses', 'conference_wins', 'defensive_rebounds',
       'effective_field_goal_percentage', 'field_goal_attempts',
       'field_goal_percentage', 'field_goals', 'free_throw_attempt_rate',
       'free_throw_attempts', 'free_throw_percentage', 'free_throws',
       'free_throws_per_field_goal_attempt', 'games_played', 'home_losses',
       'home_wins', 'losses', 'minutes_played', 'name', 'net_rating',
       'offensive_rating', 'offensive_rebound_percentage',
       'offensive_rebounds', 'opp_assist_percentage', 'opp_assists',
       'opp_block_percentage', 'opp_blocks', 'opp_defensive_rebounds',
       'opp_effective_field_goal_percentage', 'opp_field_goal_attempts',
       'opp_field_goal_percentage', 'opp_field_goals',
       'opp_free_throw_attempt_rate', 'opp_free_throw_attempts',
       'opp_free_throw_percentage', 'opp_free_thro

In [4]:
team_names = teams_df[['abbreviation']]

In [5]:
team_names.head()

Unnamed: 0,abbreviation
AIR-FORCE,AIR-FORCE
AKRON,AKRON
ALABAMA-AM,ALABAMA-AM
ALABAMA-BIRMINGHAM,ALABAMA-BIRMINGHAM
ALABAMA-STATE,ALABAMA-STATE


In [6]:
#Only needed this on first run
#team_names.to_csv(r'data/team_names.csv')
#teams_df.to_csv(r'data/team_data.csv')

In [85]:
games = pd.read_csv(r"C:\Users\bdraus\Documents\Python Scripts\Practice\TourneyCompactResults.csv")
seeds = pd.read_csv(r"C:\Users\bdraus\Documents\Python Scripts\Practice\DataFiles\NCAATourneySeeds.csv")

In [86]:
games.columns

Index(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc',
       'NumOT'],
      dtype='object')

In [87]:
seeds.columns

Index(['Season', 'Seed', 'TeamID', 'Region', 'Num'], dtype='object')

In [89]:
games = games[['Season','WTeamID','LTeamID']]
games['Winner'] = games['WTeamID']

games2 = games.merge(seeds, left_on = ['WTeamID','Season'], right_on = ['TeamID','Season'], how = 'left')
games3 = games2.merge(seeds, left_on = ['LTeamID','Season'], right_on = ['TeamID','Season'], how = 'left')
games3 = games3.dropna()
#print(games3.head(15))
games3 = games3[['Season', 'WTeamID', 'LTeamID', 'Winner', 'Num_x','Num_y']]
games3 = games3.rename(columns = {'WTeamID' : 'Team_A', 'LTeamID': 'Team_B', 'Num_x' : 'Seed_A', 'Num_y': 'Seed_B'})

games3.head()

Unnamed: 0,Season,Team_A,Team_B,Winner,Seed_A,Seed_B
0,1985,1116,1234,1116,9,8
1,1985,1120,1345,1120,11,6
2,1985,1207,1250,1207,1,16
3,1985,1229,1425,1229,9,8
4,1985,1242,1325,1242,3,14


In [90]:
i = 0
games3['Team_X'] = 0
games3['Team_Y'] = 0
while i < len(games3):
    games3['Team_X'][i] = random.choice(([games3['Team_A'][i],games3['Team_B'][i]]))
    i += 1
games3['Team_Y'] = np.where(games3['Team_X'] == games3['Team_A'], games3['Team_B'], games3['Team_A'])
games3['Result'] = np.where(games3['Winner'] == games3['Team_X'], 'X', 'Y')
games3 = games3.drop(columns = ['Team_A','Team_B'], axis = 1)

In [91]:
games3.head(25)

Unnamed: 0,Season,Winner,Seed_A,Seed_B,Team_X,Team_Y,Result
0,1985,1116,9,8,1116,1234,X
1,1985,1120,11,6,1120,1345,X
2,1985,1207,1,16,1250,1207,Y
3,1985,1229,9,8,1229,1425,X
4,1985,1242,3,14,1242,1325,X
5,1985,1246,12,4,1449,1246,Y
6,1985,1256,4,12,1256,1338,X
7,1985,1260,4,13,1233,1260,Y
8,1985,1314,2,15,1314,1292,X
9,1985,1323,7,10,1323,1333,X


In [92]:
team_data = teams_df[['Season','abbreviation','assist_percentage','block_percentage','effective_field_goal_percentage','field_goal_percentage','free_throw_attempt_rate','free_throw_percentage','free_throws_per_field_goal_attempt','net_rating','offensive_rating','offensive_rebound_percentage','opp_effective_field_goal_percentage','opp_field_goal_percentage','opp_free_throw_attempt_rate','opp_free_throws_per_field_goal_attempt','opp_offensive_rating','opp_offensive_rebound_percentage','opp_steal_percentage','opp_three_point_attempt_rate','opp_three_point_field_goal_percentage','opp_total_rebound_percentage','opp_true_shooting_percentage','opp_two_point_field_goal_percentage','pace','simple_rating_system','steal_percentage','strength_of_schedule','three_point_attempt_rate','three_point_field_goal_percentage','total_rebound_percentage','true_shooting_percentage','turnover_percentage','two_point_field_goal_percentage','win_percentage']]

In [93]:
games3 = games3.loc[games['Season'] >= 2010]
games3.head()

Unnamed: 0,Season,Winner,Seed_A,Seed_B,Team_X,Team_Y,Result
1584,2010,1115,16,16,1457,1115,Y
1585,2010,1124,3,14,1124,1358,X
1586,2010,1139,4,12,1431,1139,Y
1587,2010,1140,7,10,1196,1140,Y
1588,2010,1242,1,16,1242,1250,X


In [94]:
team_abb = pd.read_csv(r"C:\Users\bdraus\Documents\Python Scripts\Practice\team_names.csv")

In [95]:
team_abb = team_abb[['abbreviation','Team_Id']]

In [96]:
team_data_abb = team_data.merge(team_abb, left_on = 'abbreviation', right_on = 'abbreviation', how = 'left').dropna()
team_data_abb['Team_Id'] = team_data_abb['Team_Id'].astype(np.int64)

In [97]:
team_data_abb.head()

Unnamed: 0,Season,abbreviation,assist_percentage,block_percentage,effective_field_goal_percentage,field_goal_percentage,free_throw_attempt_rate,free_throw_percentage,free_throws_per_field_goal_attempt,net_rating,...,steal_percentage,strength_of_schedule,three_point_attempt_rate,three_point_field_goal_percentage,total_rebound_percentage,true_shooting_percentage,turnover_percentage,two_point_field_goal_percentage,win_percentage,Team_Id
0,2010,AIR-FORCE,61.6,6.2,0.504,0.443,0.367,0.635,0.233,-10.0,...,8.4,3.13,0.394,0.313,46.8,0.529,19.0,0.527,0.323,1102
1,2010,AKRON,53.9,8.5,0.491,0.433,0.363,0.657,0.239,7.6,...,9.0,-1.5,0.343,0.339,51.6,0.521,16.4,0.483,0.686,1103
2,2010,ALABAMA-AM,48.1,12.7,0.416,0.382,0.474,0.635,0.301,-5.2,...,12.5,-13.71,0.237,0.291,46.8,0.463,18.8,0.41,0.407,1105
3,2010,ALABAMA-BIRMINGHAM,51.1,7.3,0.471,0.422,0.457,0.694,0.317,10.0,...,10.0,2.9,0.315,0.311,53.6,0.518,17.0,0.474,0.735,1412
4,2010,ALABAMA-STATE,60.0,11.1,0.462,0.404,0.448,0.641,0.287,-2.1,...,10.8,-12.02,0.356,0.324,51.3,0.499,20.0,0.448,0.516,1106


In [98]:
games_a = games3.merge(team_data_abb, left_on = ['Team_X','Season'], right_on = ['Team_Id','Season'], how = 'left')
games_b = games_a.merge(team_data_abb, left_on = ['Team_Y','Season'], right_on = ['Team_Id','Season'], how = 'left')
games_b = games_b.dropna()
print(games_b.head(15))

    Season  Winner  Seed_A  Seed_B  Team_X  Team_Y Result  \
0     2010    1115      16      16    1457    1115      Y   
1     2010    1124       3      14    1124    1358      X   
2     2010    1139       4      12    1431    1139      Y   
3     2010    1140       7      10    1196    1140      Y   
4     2010    1242       1      16    1242    1250      X   
5     2010    1243       2      15    1243    1317      X   
6     2010    1246       1      16    1190    1246      Y   
7     2010    1293      13       4    1293    1435      X   
8     2010    1307       3      14    1285    1307      Y   
9     2010    1320       9       8    1320    1424      X   
10    2010    1325      14       3    1325    1207      X   
11    2010    1330      11       6    1330    1323      X   
12    2010    1388      10       7    1388    1350      X   
13    2010    1397       6      11    1397    1361      X   
14    2010    1437       2      15    1437    1352      X   

          abbreviation_

In [99]:
games_b['Winner'] = np.where(games_b['Winner'] == games_b['Team_X'], games_b['abbreviation_x'], games_b['abbreviation_y'])
games_b['Team_X'] = games_b['abbreviation_x']
games_b['Team_Y'] = games_b['abbreviation_y']

In [100]:
games_b.dtypes

Season                                        int64
Winner                                       object
Seed_A                                        int64
Seed_B                                        int64
Team_X                                       object
Team_Y                                       object
Result                                       object
abbreviation_x                               object
assist_percentage_x                         float64
block_percentage_x                          float64
effective_field_goal_percentage_x           float64
field_goal_percentage_x                     float64
free_throw_attempt_rate_x                   float64
free_throw_percentage_x                     float64
free_throws_per_field_goal_attempt_x        float64
net_rating_x                                float64
offensive_rating_x                          float64
offensive_rebound_percentage_x              float64
opp_effective_field_goal_percentage_x       float64
opp_field_go

In [101]:
print(games_b.head(15))

    Season               Winner  Seed_A  Seed_B                Team_X  \
0     2010  ARKANSAS-PINE-BLUFF      16      16              WINTHROP   
1     2010               BAYLOR       3      14                BAYLOR   
2     2010               BUTLER       4      12         TEXAS-EL-PASO   
3     2010        BRIGHAM-YOUNG       7      10               FLORIDA   
4     2010               KANSAS       1      16                KANSAS   
5     2010         KANSAS-STATE       2      15          KANSAS-STATE   
6     2010             KENTUCKY       1      16  EAST-TENNESSEE-STATE   
7     2010         MURRAY-STATE      13       4          MURRAY-STATE   
8     2010           NEW-MEXICO       3      14               MONTANA   
9     2010        NORTHERN-IOWA       9       8         NORTHERN-IOWA   
10    2010                 OHIO      14       3                  OHIO   
11    2010         OLD-DOMINION      11       6          OLD-DOMINION   
12    2010       SAINT-MARYS-CA      10       7    

In [102]:
ml_input = games_b.drop(columns = ['Season','Winner','Team_X','Team_Y','abbreviation_x','abbreviation_y','Team_Id_x','Team_Id_y','win_percentage_x','win_percentage_y'], axis = 1)
ml_input.dtypes

Seed_A                                        int64
Seed_B                                        int64
Result                                       object
assist_percentage_x                         float64
block_percentage_x                          float64
effective_field_goal_percentage_x           float64
field_goal_percentage_x                     float64
free_throw_attempt_rate_x                   float64
free_throw_percentage_x                     float64
free_throws_per_field_goal_attempt_x        float64
net_rating_x                                float64
offensive_rating_x                          float64
offensive_rebound_percentage_x              float64
opp_effective_field_goal_percentage_x       float64
opp_field_goal_percentage_x                 float64
opp_free_throw_attempt_rate_x               float64
opp_free_throws_per_field_goal_attempt_x    float64
opp_offensive_rating_x                      float64
opp_offensive_rebound_percentage_x          float64
opp_steal_pe

In [103]:
X_all = ml_input.drop(['Result'],1)
y_all = ml_input['Result']

In [104]:
cols = [['assist_percentage_x','block_percentage_x','effective_field_goal_percentage_x','field_goal_percentage_x','free_throw_attempt_rate_x','free_throw_percentage_x','free_throws_per_field_goal_attempt_x','net_rating_x','offensive_rating_x','offensive_rebound_percentage_x','opp_effective_field_goal_percentage_x','opp_field_goal_percentage_x','opp_free_throw_attempt_rate_x','opp_free_throws_per_field_goal_attempt_x','opp_offensive_rating_x','opp_offensive_rebound_percentage_x','opp_steal_percentage_x','opp_three_point_attempt_rate_x','opp_three_point_field_goal_percentage_x','opp_total_rebound_percentage_x','opp_true_shooting_percentage_x','opp_two_point_field_goal_percentage_x','pace_x','simple_rating_system_x','steal_percentage_x','strength_of_schedule_x','three_point_attempt_rate_x','three_point_field_goal_percentage_x','total_rebound_percentage_x','true_shooting_percentage_x','turnover_percentage_x','two_point_field_goal_percentage_x','assist_percentage_y','block_percentage_y','effective_field_goal_percentage_y','field_goal_percentage_y','free_throw_attempt_rate_y','free_throw_percentage_y','free_throws_per_field_goal_attempt_y','net_rating_y','offensive_rating_y','offensive_rebound_percentage_y','opp_effective_field_goal_percentage_y','opp_field_goal_percentage_y','opp_free_throw_attempt_rate_y','opp_free_throws_per_field_goal_attempt_y','opp_offensive_rating_y','opp_offensive_rebound_percentage_y','opp_steal_percentage_y','opp_three_point_attempt_rate_y','opp_three_point_field_goal_percentage_y','opp_total_rebound_percentage_y','opp_true_shooting_percentage_y','opp_two_point_field_goal_percentage_y','pace_y','simple_rating_system_y','steal_percentage_y','strength_of_schedule_y','three_point_attempt_rate_y','three_point_field_goal_percentage_y','total_rebound_percentage_y','true_shooting_percentage_y','turnover_percentage_y','two_point_field_goal_percentage_y']]
for col in cols:
    X_all[col] = scale(X_all[col])

In [105]:
len(X_all)

533

In [106]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_all, y_all,
                                                   stratify = y_all)

In [107]:
#for measuring training time
from time import time 
# F1 score (also F-score or F-measure) is a measure of a test's accuracy. 
#It considers both the precision p and the recall r of the test to compute 
#the score: p is the number of correct positive results divided by the number of 
#all positive results, and r is the number of correct positive results divided by 
#the number of positive results that should have been returned. The F1 score can be 
#interpreted as a weighted average of the precision and recall, where an F1 score 
#reaches its best value at 1 and worst at 0.
from sklearn.metrics import f1_score

def train_classifier(clf, X_train, y_train):
    ''' Fits a classifier to the training data. '''
    
    # Start the clock, train the classifier, then stop the clock
    start = time()
    clf.fit(X_train, y_train)
    end = time()
    time_taken = end - start
    
    # Print the results
    print('Trained model', clf.__class__.__name__, 'in', time_taken, ' seconds')

    
def predict_labels(clf, features, target):
    ''' Makes predictions using a fit classifier based on F1 score. '''
    
    # Start the clock, make predictions, then stop the clock
    start = time()
    y_pred = clf.predict(features)
    
    end = time()
    # Print and return results
    time_taken = end - start
    print("Made predictions in", time_taken, "seconds.")
    
    return f1_score(target, y_pred, pos_label='X'), sum(target == y_pred) / float(len(y_pred))


def train_predict(clf, X_train, y_train, X_test, y_test):
    ''' Train and predict using a classifer based on F1 score. '''
    
    # Indicate the classifier and the training set size
    #print("Training something using a training set size of %d. . .").format(len(X_train))
    
    # Train the classifier
    train_classifier(clf, X_train, y_train)
    
    # Print the results of prediction for both training and testing
    f1, acc = predict_labels(clf, X_train, y_train)
    print("F1 score and accuracy score for training set:", f1, " , ", acc)
    
    f1, acc = predict_labels(clf, X_test, y_test)
    print("F1 score and accuracy score for test set:", f1, " , ", acc)

In [108]:
clf_A = LogisticRegression(random_state = 64)
clf_B = SVC(random_state = 64, kernel = 'rbf')
clf_C = xgb.XGBClassifier(seed = 64)

train_predict(clf_A, X_train, y_train, X_test, y_test)
print('')
train_predict(clf_B, X_train, y_train, X_test, y_test)
print('')
train_predict(clf_C, X_train, y_train, X_test, y_test)
print('')

Trained model LogisticRegression in 0.020055294036865234  seconds
Made predictions in 0.0015048980712890625 seconds.
F1 score and accuracy score for training set: 0.781491002570694  ,  0.7869674185463659
Made predictions in 0.0005016326904296875 seconds.
F1 score and accuracy score for test set: 0.702290076335878  ,  0.7089552238805971

Trained model SVC in 0.020058155059814453  seconds
Made predictions in 0.016039609909057617 seconds.
F1 score and accuracy score for training set: 0.9693877551020408  ,  0.9699248120300752
Made predictions in 0.007020235061645508 seconds.
F1 score and accuracy score for test set: 0.6833333333333333  ,  0.7164179104477612

Trained model XGBClassifier in 0.28185081481933594  seconds
Made predictions in 0.004006624221801758 seconds.
F1 score and accuracy score for training set: 1.0  ,  1.0
Made predictions in 0.0030078887939453125 seconds.
F1 score and accuracy score for test set: 0.7142857142857143  ,  0.7313432835820896



  if diff:
  if diff:


In [109]:
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import make_scorer


# TODO: Create the parameters list you wish to tune
parameters = { 'learning_rate' : [0.1],
               'n_estimators' : [40],
               'max_depth': [3],
               'min_child_weight': [3],
               'gamma':[0.4],
               'subsample' : [0.8],
               'colsample_bytree' : [0.8],
               'scale_pos_weight' : [1],
               'reg_alpha':[1e-5]
             }  

# TODO: Initialize the classifier
clf = xgb.XGBClassifier(seed=2)

# TODO: Make an f1 scoring function using 'make_scorer' 
f1_scorer = make_scorer(f1_score,pos_label='X')

# TODO: Perform grid search on the classifier using the f1_scorer as the scoring method
grid_obj = GridSearchCV(clf,
                        scoring=f1_scorer,
                        param_grid=parameters,
                        cv=5)

# TODO: Fit the grid search object to the training data and find the optimal parameters
grid_obj = grid_obj.fit(X_train,y_train)

# Get the estimator
clf = grid_obj.best_estimator_
print(clf)

# Report the final F1 score for training and testing after parameter tuning
f1, acc = predict_labels(clf, X_train, y_train)
print("F1 score and accuracy score for training set:", f1, " , ", acc)
    
f1, acc = predict_labels(clf, X_test, y_test)
print("F1 score and accuracy score for test set:", f1, " , ", acc)

  if diff:
  if diff:
  if diff:


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0.4, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=3, missing=None,
       n_estimators=40, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=1e-05,
       reg_lambda=1, scale_pos_weight=1, seed=2, silent=True,
       subsample=0.8)
Made predictions in 0.0030078887939453125 seconds.
F1 score and accuracy score for training set: 0.9432989690721649  ,  0.9448621553884712
Made predictions in 0.0020036697387695312 seconds.
F1 score and accuracy score for test set: 0.7286821705426357  ,  0.7388059701492538


  if diff:
  if diff:
  if diff:
  if diff:


In [227]:
X_all.head()

Unnamed: 0,Seed_A,Seed_B,assist_percentage_x,block_percentage_x,effective_field_goal_percentage_x,field_goal_percentage_x,free_throw_attempt_rate_x,free_throw_percentage_x,free_throws_per_field_goal_attempt_x,net_rating_x,...,pace_y,simple_rating_system_y,steal_percentage_y,strength_of_schedule_y,three_point_attempt_rate_y,three_point_field_goal_percentage_y,total_rebound_percentage_y,true_shooting_percentage_y,turnover_percentage_y,two_point_field_goal_percentage_y
0,16,16,-1.519843,0.283419,-3.840233,-3.451673,-0.419021,-1.913402,-1.057348,-2.00096,...,0.211879,-2.910346,-0.086777,-2.679859,-1.519903,-2.192749,0.918293,-2.453393,3.050735,-2.243125
1,3,14,-0.666759,2.192581,1.118435,1.263919,-0.460053,0.474644,-0.267565,0.557117,...,1.253023,-1.349977,0.560861,-1.811673,1.20064,0.334939,0.446799,0.488379,0.521549,0.53518
2,4,12,0.082291,-0.262056,0.358347,0.690996,0.319558,-1.058422,-0.070119,0.302898,...,-0.861801,0.03615,0.506891,-0.109546,1.262006,-0.721408,0.093178,0.161515,0.633957,0.007654
3,7,10,-0.125778,-1.284821,-0.872271,-0.631132,-0.952438,-0.173961,-0.972729,-0.745754,...,1.383166,0.614787,1.046589,-0.528531,-0.128948,2.108094,0.446799,1.673259,-0.54633,0.253833
4,1,16,0.956182,1.715291,1.190825,1.35206,0.668331,-0.291889,0.522218,1.8441,...,1.057809,-2.265394,-0.680445,-2.702017,-0.80397,1.429013,-0.378316,-0.001916,0.521549,-1.082567


This section inputs all upsets from 2010-2018 and tests how the model performs

In [142]:
game_list = [[2017,'MIDDLE-TENNESSEE','MINNESOTA','MIDDLE-TENNESSEE'],
[2016,'ARKANSAS-LITTLE-ROCK','PURDUE','ARKANSAS-LITTLE-ROCK'],
[2016,'YALE','BAYLOR','YALE'],
[2014,'STEPHEN-F-AUSTIN','VIRGINIA-COMMONWEALTH','STEPHEN-F-AUSTIN'],
[2014,'NORTH-DAKOTA-STATE','OKLAHOMA','NORTH-DAKOTA-STATE'],
[2014,'HARVARD','CINCINNATI','HARVARD'],
[2013,'MISSISSIPPI','WISCONSIN','MISSISSIPPI'],
[2013,'CALIFORNIA','NEVADA-LAS-VEGAS','CALIFORNIA'],
[2013,'OREGON','OKLAHOMA-STATE','OREGON'],
[2012,'SOUTH-FLORIDA','TEMPLE','SOUTH-FLORIDA'],
[2012,'VIRGINIA-COMMONWEALTH','WICHITA-STATE','VIRGINIA-COMMONWEALTH'],
[2011,'RICHMOND','VANDERBILT','RICHMOND'],
[2010,'CORNELL','TEMPLE','CORNELL'],
[2016,'NORTHERN-IOWA','TEXAS','NORTHERN-IOWA'],
[2016,'GONZAGA','SETON-HALL','GONZAGA'],
[2016,'WICHITA-STATE','ARIZONA','WICHITA-STATE'],
[2015,'DAYTON','PROVIDENCE','DAYTON'],
[2015,'UCLA','SOUTHERN-METHODIST','UCLA'],
[2014,'TENNESSEE','MASSACHUSETTS','TENNESSEE'],
[2014,'DAYTON','OHIO-STATE','DAYTON'],
[2013,'MINNESOTA','UCLA','MINNESOTA'],
[2016,'MIDDLE-TENNESSEE','MICHIGAN-STATE','MIDDLE-TENNESSEE'],
[2013,'FLORIDA-GULF-COAST','GEORGETOWN','FLORIDA-GULF-COAST'],
[2012,'LEHIGH','DUKE','LEHIGH'],
[2012,'NORFOLK-STATE','MISSOURI','NORFOLK-STATE'],
[2016,'STEPHEN-F-AUSTIN','WEST-VIRGINIA','STEPHEN-F-AUSTIN'],
[2015,'GEORGIA-STATE','BAYLOR','GEORGIA-STATE'],
[2015,'ALABAMA-BIRMINGHAM','IOWA-STATE','ALABAMA-BIRMINGHAM'],
[2014,'MERCER','DUKE','MERCER'],
[2013,'HARVARD','NEW-MEXICO','HARVARD'],
[2010,'OHIO','GEORGETOWN','OHIO'],
[2018,'MARSHALL','WICHITA-STATE','MARSHALL'],
[2018,'BUFFALO','ARIZONA','BUFFALO'],
[2016,'HAWAII','CALIFORNIA','HAWAII'],
[2013,'LA-SALLE','KANSAS-STATE','LA-SALLE'],
[2012,'OHIO','MICHIGAN','OHIO'],
[2011,'MOREHEAD-STATE','LOUISVILLE','MOREHEAD-STATE'],
[2010,'MURRAY-STATE','VANDERBILT','MURRAY-STATE'],
[2018,'LOYOLA-IL','MIAMI-FL','LOYOLA-IL'],
[2018,'SYRACUSE','TEXAS-CHRISTIAN','SYRACUSE'],
[2017,'RHODE-ISLAND','CREIGHTON','RHODE-ISLAND'],
[2017,'SOUTHERN-CALIFORNIA','SOUTHERN-METHODIST','SOUTHERN-CALIFORNIA'],
[2017,'XAVIER','MARYLAND','XAVIER']]
game_df_teams = pd.DataFrame(game_list)
game_df_teams.columns = ['Season','Away','Home','Actual Winner']
game_df_teams

Unnamed: 0,Season,Away,Home,Actual Winner
0,2017,MIDDLE-TENNESSEE,MINNESOTA,MIDDLE-TENNESSEE
1,2016,ARKANSAS-LITTLE-ROCK,PURDUE,ARKANSAS-LITTLE-ROCK
2,2016,YALE,BAYLOR,YALE
3,2014,STEPHEN-F-AUSTIN,VIRGINIA-COMMONWEALTH,STEPHEN-F-AUSTIN
4,2014,NORTH-DAKOTA-STATE,OKLAHOMA,NORTH-DAKOTA-STATE
5,2014,HARVARD,CINCINNATI,HARVARD
6,2013,MISSISSIPPI,WISCONSIN,MISSISSIPPI
7,2013,CALIFORNIA,NEVADA-LAS-VEGAS,CALIFORNIA
8,2013,OREGON,OKLAHOMA-STATE,OREGON
9,2012,SOUTH-FLORIDA,TEMPLE,SOUTH-FLORIDA


In [176]:
game_df = pd.DataFrame(game_list)
game_df.columns = ['Season','Away','Home','Actual Winner']
game_df = game_df[['Season','Away','Home']]
game_df = game_df.merge(team_data_abb, left_on = ['Season','Home'], right_on = ['Season','abbreviation'], how = 'left')
game_df = game_df.merge(team_data_abb, left_on = ['Season','Away'], right_on = ['Season','abbreviation'], how = 'left')
#game_df = game_df.merge(seeds, left_on = ['Team_Id_x','Season'], right_on = ['TeamID','Season'], how = 'left')
#game_df2 = game_df.merge(seeds, left_on = ['Team_Id_x','Season'], right_on = ['TeamID','Season'], how = 'left')
#print(game_df)

In [181]:
game_df.head()

Unnamed: 0,Season,Away,Home,abbreviation_x,assist_percentage_x,block_percentage_x,effective_field_goal_percentage_x,field_goal_percentage_x,free_throw_attempt_rate_x,free_throw_percentage_x,...,steal_percentage_y,strength_of_schedule_y,three_point_attempt_rate_y,three_point_field_goal_percentage_y,total_rebound_percentage_y,true_shooting_percentage_y,turnover_percentage_y,two_point_field_goal_percentage_y,win_percentage_y,Team_Id_y
0,2017,MIDDLE-TENNESSEE,MINNESOTA,MINNESOTA,57.2,15.9,0.487,0.435,0.396,0.711,...,10.1,-0.48,0.299,0.367,53.9,0.567,13.6,0.54,0.861,1292
1,2016,ARKANSAS-LITTLE-ROCK,PURDUE,PURDUE,64.1,11.7,0.536,0.47,0.354,0.744,...,10.1,-1.76,0.354,0.384,50.1,0.553,13.6,0.49,0.857,1114
2,2016,YALE,BAYLOR,BAYLOR,64.6,11.2,0.518,0.466,0.375,0.725,...,8.4,-1.03,0.309,0.363,57.8,0.555,16.4,0.517,0.767,1463
3,2014,STEPHEN-F-AUSTIN,VIRGINIA-COMMONWEALTH,VIRGINIA-COMMONWEALTH,49.8,11.5,0.479,0.42,0.352,0.674,...,11.1,-6.76,0.36,0.345,53.8,0.561,14.2,0.529,0.914,1372
4,2014,NORTH-DAKOTA-STATE,OKLAHOMA,OKLAHOMA,53.1,8.4,0.517,0.447,0.394,0.749,...,9.7,-0.02,0.275,0.364,53.2,0.593,13.4,0.558,0.788,1295


In [197]:
game_input = game_df2.drop(columns = ['Season','Away','Home','abbreviation_x','abbreviation_y','Team_Id_x',
                                     'Team_Id_y','win_percentage_x','win_percentage_y','Team_Id_y','Seed_x',
                                     'TeamID_x', 'Region_x', 'Seed_y', 'TeamID_y','Region_y'], axis = 1)
game_input = game_input.rename(columns = {'Num_x' : 'Seed_A', 'Num_y': 'Seed_B'})

game_input = game_input[['Seed_A', 'Seed_B', 'assist_percentage_x', 'block_percentage_x', 'effective_field_goal_percentage_x', 'field_goal_percentage_x', 'free_throw_attempt_rate_x', 'free_throw_percentage_x', 'free_throws_per_field_goal_attempt_x', 'net_rating_x', 'offensive_rating_x', 'offensive_rebound_percentage_x', 'opp_effective_field_goal_percentage_x', 'opp_field_goal_percentage_x', 'opp_free_throw_attempt_rate_x', 'opp_free_throws_per_field_goal_attempt_x', 'opp_offensive_rating_x', 'opp_offensive_rebound_percentage_x', 'opp_steal_percentage_x', 'opp_three_point_attempt_rate_x', 'opp_three_point_field_goal_percentage_x', 'opp_total_rebound_percentage_x', 'opp_true_shooting_percentage_x', 'opp_two_point_field_goal_percentage_x', 'pace_x', 'simple_rating_system_x', 'steal_percentage_x', 'strength_of_schedule_x', 'three_point_attempt_rate_x', 'three_point_field_goal_percentage_x', 'total_rebound_percentage_x', 'true_shooting_percentage_x', 'turnover_percentage_x', 'two_point_field_goal_percentage_x', 'assist_percentage_y', 'block_percentage_y', 'effective_field_goal_percentage_y', 'field_goal_percentage_y', 'free_throw_attempt_rate_y', 'free_throw_percentage_y', 'free_throws_per_field_goal_attempt_y', 'net_rating_y', 'offensive_rating_y', 'offensive_rebound_percentage_y', 'opp_effective_field_goal_percentage_y', 'opp_field_goal_percentage_y', 'opp_free_throw_attempt_rate_y', 'opp_free_throws_per_field_goal_attempt_y', 'opp_offensive_rating_y', 'opp_offensive_rebound_percentage_y', 'opp_steal_percentage_y', 'opp_three_point_attempt_rate_y', 'opp_three_point_field_goal_percentage_y', 'opp_total_rebound_percentage_y', 'opp_true_shooting_percentage_y', 'opp_two_point_field_goal_percentage_y', 'pace_y', 'simple_rating_system_y', 'steal_percentage_y', 'strength_of_schedule_y', 'three_point_attempt_rate_y', 'three_point_field_goal_percentage_y', 'total_rebound_percentage_y', 'true_shooting_percentage_y', 
                         'turnover_percentage_y', 'two_point_field_goal_percentage_y']]

In [198]:
game_input.shape

(43, 66)

In [199]:
X_all.shape

(533, 66)

In [200]:
y_pred_C = clf_C.predict(game_input)
y_pred_B = clf_B.predict(game_input)
y_pred_A = clf_A.predict(game_input)
Y_pred = clf.predict(game_input)

  if diff:
  if diff:


In [201]:
print(y_pred_C,y_pred_B,y_pred_A,Y_pred)

['Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y'
 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'X' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y'
 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y'] ['Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y'
 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y'
 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y'] ['X' 'X' 'X' 'X' 'X' 'X' 'X' 'Y' 'X' 'Y' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'Y'
 'Y' 'X' 'Y' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X' 'X'
 'X' 'X' 'X' 'Y' 'X' 'X' 'Y'] ['X' 'X' 'X' 'X' 'X' 'X' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'X' 'Y' 'Y' 'Y' 'Y' 'Y'
 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'X' 'X' 'X' 'Y' 'X' 'X' 'X' 'X' 'X' 'X' 'Y' 'X'
 'X' 'X' 'Y' 'Y' 'Y' 'Y' 'Y']


In [202]:
test = pd.DataFrame(Y_pred)

In [203]:
games_df_out = game_df_teams.merge(test, left_index = True, right_index = True)
games_df_out.columns = ['Season','Away','Home','Actual Winner','Forecast']

In [204]:
games_df_out['Winner'] = np.where(games_df_out['Forecast'] == 'Y', games_df_out['Away'], games_df_out['Home'])
games_df_out['Correct'] = np.where(games_df_out['Actual Winner'] == games_df_out['Winner'], "Correct!", "¯\_(ツ)_/¯")

In [205]:
games_df_out[['Season','Away','Home','Actual Winner','Winner','Correct']]

Unnamed: 0,Season,Away,Home,Actual Winner,Winner,Correct
0,2017,MIDDLE-TENNESSEE,MINNESOTA,MIDDLE-TENNESSEE,MINNESOTA,¯\_(ツ)_/¯
1,2016,ARKANSAS-LITTLE-ROCK,PURDUE,ARKANSAS-LITTLE-ROCK,PURDUE,¯\_(ツ)_/¯
2,2016,YALE,BAYLOR,YALE,BAYLOR,¯\_(ツ)_/¯
3,2014,STEPHEN-F-AUSTIN,VIRGINIA-COMMONWEALTH,STEPHEN-F-AUSTIN,VIRGINIA-COMMONWEALTH,¯\_(ツ)_/¯
4,2014,NORTH-DAKOTA-STATE,OKLAHOMA,NORTH-DAKOTA-STATE,OKLAHOMA,¯\_(ツ)_/¯
5,2014,HARVARD,CINCINNATI,HARVARD,CINCINNATI,¯\_(ツ)_/¯
6,2013,MISSISSIPPI,WISCONSIN,MISSISSIPPI,MISSISSIPPI,Correct!
7,2013,CALIFORNIA,NEVADA-LAS-VEGAS,CALIFORNIA,CALIFORNIA,Correct!
8,2013,OREGON,OKLAHOMA-STATE,OREGON,OREGON,Correct!
9,2012,SOUTH-FLORIDA,TEMPLE,SOUTH-FLORIDA,SOUTH-FLORIDA,Correct!


This section is where you can input current year games and get an output.

In [206]:
teams_19 = Teams(year = 2019)
teams_df_19 = teams.dataframes
teams_df_19 = teams_df_19[['abbreviation','assist_percentage','block_percentage','effective_field_goal_percentage','field_goal_percentage','free_throw_attempt_rate','free_throw_percentage','free_throws_per_field_goal_attempt','net_rating','offensive_rating','offensive_rebound_percentage','opp_effective_field_goal_percentage','opp_field_goal_percentage','opp_free_throw_attempt_rate','opp_free_throws_per_field_goal_attempt','opp_offensive_rating','opp_offensive_rebound_percentage','opp_steal_percentage','opp_three_point_attempt_rate','opp_three_point_field_goal_percentage','opp_total_rebound_percentage','opp_true_shooting_percentage','opp_two_point_field_goal_percentage','pace','simple_rating_system','steal_percentage','strength_of_schedule','three_point_attempt_rate','three_point_field_goal_percentage','total_rebound_percentage','true_shooting_percentage','turnover_percentage','two_point_field_goal_percentage']]

In [207]:
teams_df_19.head()

Unnamed: 0,abbreviation,assist_percentage,block_percentage,effective_field_goal_percentage,field_goal_percentage,free_throw_attempt_rate,free_throw_percentage,free_throws_per_field_goal_attempt,net_rating,offensive_rating,...,pace,simple_rating_system,steal_percentage,strength_of_schedule,three_point_attempt_rate,three_point_field_goal_percentage,total_rebound_percentage,true_shooting_percentage,turnover_percentage,two_point_field_goal_percentage
ABILENE-CHRISTIAN,ABILENE-CHRISTIAN,55.4,11.6,0.521,0.464,0.309,0.701,0.217,3.5,102.2,...,71.6,-9.14,11.3,-6.82,0.35,0.325,49.7,0.549,17.7,0.539
AIR-FORCE,AIR-FORCE,60.7,8.1,0.49,0.419,0.318,0.734,0.233,-5.7,100.8,...,67.7,-4.31,9.5,1.72,0.431,0.331,48.7,0.527,16.3,0.485
AKRON,AKRON,52.7,7.5,0.518,0.435,0.319,0.696,0.222,-5.1,102.6,...,69.1,-6.82,8.4,-1.92,0.467,0.358,49.2,0.547,17.3,0.502
ALABAMA-AM,ALABAMA-AM,50.5,3.9,0.45,0.397,0.314,0.647,0.203,-23.2,88.1,...,68.3,-23.97,5.8,-8.04,0.354,0.303,48.2,0.48,20.9,0.448
ALABAMA-BIRMINGHAM,ALABAMA-BIRMINGHAM,59.3,11.6,0.545,0.488,0.291,0.75,0.218,10.1,109.8,...,69.5,4.9,7.7,-0.65,0.334,0.345,54.8,0.575,16.5,0.559


In [123]:
test = teams_df_19[teams_df_19['abbreviation']=='CENTRAL-FLORIDA']

In [124]:
test

Unnamed: 0,abbreviation,assist_percentage,block_percentage,effective_field_goal_percentage,field_goal_percentage,free_throw_attempt_rate,free_throw_percentage,free_throws_per_field_goal_attempt,net_rating,offensive_rating,...,pace,simple_rating_system,steal_percentage,strength_of_schedule,three_point_attempt_rate,three_point_field_goal_percentage,total_rebound_percentage,true_shooting_percentage,turnover_percentage,two_point_field_goal_percentage
CENTRAL-FLORIDA,CENTRAL-FLORIDA,49.1,9.4,0.478,0.424,0.431,0.641,0.277,1.5,96.2,...,64.9,4.75,8.1,3.75,0.331,0.329,50.5,0.512,18.0,0.471


In [280]:
first_round = [['NORTH-CAROLINA-CENTRAL','NORTH-DAKOTA-STATE',16,16],['NORTH-DAKOTA-STATE','DUKE',16,1],
               ['VIRGINIA-COMMONWEALTH','CENTRAL-FLORIDA',8,9],['MISSISSIPPI-STATE','LIBERTY',5,12],['VIRGINIA-TECH','SAINT-LOUIS',4,13],
               ['BELMONT','TEMPLE',11,11],['MARYLAND','TEMPLE',6,11] ,['LOUISIANA-STATE','YALE',3,14],['LOUISVILLE','MINNESOTA',7,10],
               ['BRADLEY','MICHIGAN-STATE',15,2],['FAIRLEIGH-DICKINSON','PRAIRIE-VIEW',16,16],['PRAIRIE-VIEW','GONZAGA',16,1],
               ['SYRACUSE','BAYLOR',8,9],['MARQUETTE','MURRAY-STATE',5,12],['FLORIDA-STATE','VERMONT',4,13],['ST-JOHNS-NY','ARIZONA-STATE',11,11],
               ['ARIZONA-STATE','BUFFALO',11,6],['TEXAS-TECH','NORTHERN-KENTUCKY',3,14],['FLORIDA','NEVADA',10,7],['MICHIGAN','MONTANA',2,15],
               ['GARDNER-WEBB','VIRGINIA',16,1],['MISSISSIPPI','OKLAHOMA',8,9],['WISCONSIN','OREGON',5,12], 
               ['KANSAS-STATE','CALIFORNIA-IRVINE',4,13],['VILLANOVA','SAINT-MARYS-CA',6,11], ['PURDUE','OLD-DOMINION',3,14],
               ['CINCINNATI','IOWA',7,10],['TENNESSEE','COLGATE',2,15],['NORTH-CAROLINA','IONA',1,16],['UTAH-STATE','WASHINGTON',8,9],
               ['AUBURN','NEW-MEXICO-STATE',5,12],['KANSAS','NORTHEASTERN',4,13],['IOWA-STATE','OHIO-STATE',6,11],['HOUSTON','GEORGIA-STATE',3,14],
               ['WOFFORD','SETON-HALL',7,10],['KENTUCKY','ABILENE-CHRISTIAN',1,16]]

second_round = [['DUKE','VIRGINIA-COMMONWEALTH'],['LIBERTY','SAINT-LOUIS'],['LOUISIANA-STATE','MARYLAND'],
                  ['LOUISVILLE','BRADLEY'],['GONZAGA','SYRACUSE'],['MARQUETTE','FLORIDA-STATE'],['ST-JOHNS-NY','TEXAS-TECH']
                 ,['FLORIDA','MICHIGAN'],['VIRGINIA','MISSISSIPPI'],['WISCONSIN','KANSAS-STATE'],['VILLANOVA','PURDUE'],
                 ['IOWA','TENNESSEE'],['NORTH-CAROLINA','UTAH-STATE'],['AUBURN','KANSAS'],['IOWA-STATE','HOUSTON'],
                 ['SETON-HALL','KENTUCKY']]

sweet_sixteen = [['SAINT-LOUIS','VIRGINIA-COMMONWEALTH'],['MARYLAND','LOUISVILLE'],['SYRACUSE','FLORIDA-STATE'],
                  ['ST-JOHNS-NY','FLORIDA'],['VIRGINIA','WISCONSIN'],['TENNESSEE','PURDUE'],['NORTH-CAROLINA','AUBURN']
                 ,['IOWA-STATE','SETON-HALL']]

elite_eight = [['LOUISVILLE','VIRGINIA-COMMONWEALTH'],['FLORIDA-STATE','ST-JOHNS-NY'],['WISCONSIN','PURDUE'],
                  ['NORTH-CAROLINA','IOWA-STATE']]

final_four = [['ST-JOHNS-NY','VIRGINIA-COMMONWEALTH'],['PURDUE','NORTH-CAROLINA']]

finals = [['NORTH-CAROLINA','VIRGINIA-COMMONWEALTH']]


game_df_cur_team = pd.DataFrame(first_round)
game_df_cur_team.columns = ['Away','Home','Seed_A','Seed_B']

In [281]:
#game_df_cur = pd.DataFrame(first_round)
#game_df_cur.columns = ['Away','Home','Seed_A','Seed_B']
game_df_cur = game_df_cur[['Away','Home','Seed_A','Seed_B']]
game_df_cur = game_df_cur.merge(teams_df_19, left_on = ['Home'], right_on = ['abbreviation'], how = 'left')
game_df_cur = game_df_cur.merge(teams_df_19, left_on = ['Away'], right_on = ['abbreviation'], how = 'left')

In [282]:
game_input_cur

Unnamed: 0,Seed_A,Seed_B,assist_percentage_x,block_percentage_x,effective_field_goal_percentage_x,field_goal_percentage_x,free_throw_attempt_rate_x,free_throw_percentage_x,free_throws_per_field_goal_attempt_x,net_rating_x,...,pace_y,simple_rating_system_y,steal_percentage_y,strength_of_schedule_y,three_point_attempt_rate_y,three_point_field_goal_percentage_y,total_rebound_percentage_y,true_shooting_percentage_y,turnover_percentage_y,two_point_field_goal_percentage_y
0,1.882845,1.244043,-0.703378,-1.641801,0.794029,0.283496,-0.610726,1.202688,-0.227241,-0.516864,...,-0.079324,-2.365444,-1.27719,-2.946868,-1.423388,-1.141305,1.122465,-0.82995,1.584659,-0.028727
1,1.882845,-2.348381,0.777659,0.799571,1.050233,1.31757,0.230162,-0.20736,0.183259,1.581063,...,-0.123257,-1.222553,-1.875289,-1.17022,0.990952,0.619031,-0.006271,0.836121,0.926416,0.591782
2,0.150387,-0.432421,-0.970788,-0.381738,-1.914414,-1.739692,1.973467,-2.115073,1.092224,-1.05163,...,1.019006,-0.67683,-0.230517,-0.507909,0.025216,-0.354772,0.038879,-0.422689,0.78014,0.119013
3,-0.499285,0.286063,0.859938,-0.499869,0.318222,-0.076182,0.086596,2.059777,0.828331,0.251005,...,-0.29899,0.146294,0.292819,0.220632,-0.225875,-1.590753,0.219476,-0.385665,0.853278,0.946359
4,-0.715842,0.525558,-0.004,1.07521,-2.024216,-1.829611,2.62977,-1.810945,1.825261,-1.175037,...,0.360008,0.412599,-0.679091,0.204074,0.276308,0.99357,-0.954409,1.539574,0.048759,1.566868
5,0.800059,0.046568,-0.826798,-0.539246,-1.365405,-1.694732,-1.61569,-0.981504,-1.927886,-1.367005,...,-0.255057,-0.476093,-1.501477,-1.385471,3.038313,0.544123,0.264626,1.835764,0.78014,2.866982
6,-0.282728,0.046568,-0.826798,-0.539246,-1.365405,-1.694732,-1.61569,-0.981504,-1.927886,-1.367005,...,-1.089787,0.355101,-1.950052,0.406079,-0.399707,0.356853,1.212763,0.873145,1.95035,0.562234
7,-0.932399,0.765053,1.477037,-0.854261,0.098618,-0.031222,-0.569707,0.124416,-0.549777,-1.106478,...,0.140342,0.067613,1.414255,0.420981,0.334252,-0.579496,-0.683512,0.539931,-0.39007,1.241839
8,-0.06617,-0.192926,0.22227,0.917702,-1.585009,-1.424974,0.332709,-0.373248,0.212581,-1.229885,...,0.272142,0.407555,0.591869,0.594838,-0.631484,0.356853,-0.548064,-0.311617,-0.39007,-0.678785
9,1.666288,-2.108886,2.834654,3.201566,1.416239,1.497409,0.496785,0.815616,0.828331,1.814166,...,-1.221587,-0.825113,0.517106,-0.848999,-0.805317,-0.279864,0.400074,-1.200189,1.511521,-1.50613


In [283]:
game_input_cur = game_df_cur.drop(columns = ['Away','Home','abbreviation_x','abbreviation_y'], axis = 1)

In [284]:
cols = [['Seed_A','Seed_B','assist_percentage_x','block_percentage_x','effective_field_goal_percentage_x','field_goal_percentage_x','free_throw_attempt_rate_x','free_throw_percentage_x','free_throws_per_field_goal_attempt_x','net_rating_x','offensive_rating_x','offensive_rebound_percentage_x','opp_effective_field_goal_percentage_x','opp_field_goal_percentage_x','opp_free_throw_attempt_rate_x','opp_free_throws_per_field_goal_attempt_x','opp_offensive_rating_x','opp_offensive_rebound_percentage_x','opp_steal_percentage_x','opp_three_point_attempt_rate_x','opp_three_point_field_goal_percentage_x','opp_total_rebound_percentage_x','opp_true_shooting_percentage_x','opp_two_point_field_goal_percentage_x','pace_x','simple_rating_system_x','steal_percentage_x','strength_of_schedule_x','three_point_attempt_rate_x','three_point_field_goal_percentage_x','total_rebound_percentage_x','true_shooting_percentage_x','turnover_percentage_x','two_point_field_goal_percentage_x','assist_percentage_y','block_percentage_y','effective_field_goal_percentage_y','field_goal_percentage_y','free_throw_attempt_rate_y','free_throw_percentage_y','free_throws_per_field_goal_attempt_y','net_rating_y','offensive_rating_y','offensive_rebound_percentage_y','opp_effective_field_goal_percentage_y','opp_field_goal_percentage_y','opp_free_throw_attempt_rate_y','opp_free_throws_per_field_goal_attempt_y','opp_offensive_rating_y','opp_offensive_rebound_percentage_y','opp_steal_percentage_y','opp_three_point_attempt_rate_y','opp_three_point_field_goal_percentage_y','opp_total_rebound_percentage_y','opp_true_shooting_percentage_y','opp_two_point_field_goal_percentage_y','pace_y','simple_rating_system_y','steal_percentage_y','strength_of_schedule_y','three_point_attempt_rate_y','three_point_field_goal_percentage_y','total_rebound_percentage_y','true_shooting_percentage_y','turnover_percentage_y','two_point_field_goal_percentage_y']]
for col in cols:
    game_input_cur[col] = scale(game_input_cur[col])

In [285]:
out_19 = clf.predict(game_input_cur)

  if diff:


In [286]:
temp = pd.DataFrame(out_19)
games_df_out_cur = game_df_cur_team.merge(temp, left_index = True, right_index = True)
games_df_out_cur.columns = ['Away','Home','Seed_A','Seed_B','Forecast']
games_df_out_cur['Winner'] = np.where(games_df_out_cur['Forecast'] == 'Y', games_df_out_cur['Away'], games_df_out_cur['Home'])

In [287]:
games_df_out_cur

Unnamed: 0,Away,Home,Seed_A,Seed_B,Forecast,Winner
0,NORTH-CAROLINA-CENTRAL,NORTH-DAKOTA-STATE,16,16,X,NORTH-DAKOTA-STATE
1,NORTH-DAKOTA-STATE,DUKE,16,1,X,DUKE
2,VIRGINIA-COMMONWEALTH,CENTRAL-FLORIDA,8,9,X,CENTRAL-FLORIDA
3,MISSISSIPPI-STATE,LIBERTY,5,12,Y,MISSISSIPPI-STATE
4,VIRGINIA-TECH,SAINT-LOUIS,4,13,Y,VIRGINIA-TECH
5,BELMONT,TEMPLE,11,11,X,TEMPLE
6,MARYLAND,TEMPLE,6,11,Y,MARYLAND
7,LOUISIANA-STATE,YALE,3,14,Y,LOUISIANA-STATE
8,LOUISVILLE,MINNESOTA,7,10,Y,LOUISVILLE
9,BRADLEY,MICHIGAN-STATE,15,2,X,MICHIGAN-STATE
