In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import os
import pickle
from sklearn.metrics import confusion_matrix
import requests, bs4
from sklearn.metrics import roc_curve, auc

In [2]:
os.chdir('/Users/ahelgeso/Documents/GitHub/bball')

In [3]:
db19 = pd.read_csv('2019stats.csv')

### Check results using 2016-2018 data on 2019 games 

In [4]:
db = pd.read_csv('2017_2018stats.csv')

In [11]:
feature3Cols = ['Location','Rolling3EFG','Rolling3TOV','Rolling3ORB','Rolling3FTR','Rolling3Pace','Opp R3Pace','Opp R3EFG','Opp R3TOV','Opp R3ORB','Opp R3FTR', 'Rolling3DEff', 'Opp R3 Deff', 'Rolling3OEff','Opp R3 Oeff']
feature9Cols = ['Location','Rolling9EFG','Rolling9TOV','Rolling9ORB','Rolling9FTR','Rolling9Pace','Opp R9Pace','Opp R9EFG','Opp R9TOV','Opp R9ORB','Opp R9FTR', 'Rolling9DEff', 'Opp R9 Deff', 'Rolling3OEff','Opp R3 Oeff']

target = ['Result']
X3 = db.dropna()[feature3Cols]
X9 = db.dropna()[feature9Cols]
y = np.array(db.dropna()[target]).flatten()

x9_train, x9_test, y9_train, y9_test = train_test_split(X9, y, test_size=0.3)
x3_train, x3_test, y3_train, y3_test = train_test_split(X3, y, test_size=0.3)


### Training models

In [14]:
clfgtb9 = GradientBoostingClassifier(n_estimators=500, learning_rate=0.1, max_depth=1).fit(x9_train, y9_train)
pred1 = clfgtb9.predict(x9_test)
print('All picks (R9):',clfgtb9.score(x9_test, y9_test))
print('All picks (R9) matrix:\n',confusion_matrix(y9_test, pred1))
print(' ')
false_positive_rate, true_positive_rate, thresholds = roc_curve(y9_test, pred1)
roc_auc = auc(false_positive_rate, true_positive_rate)
print(roc_auc)

clfgtb3 = GradientBoostingClassifier(n_estimators=500, learning_rate=0.1, max_depth=1).fit(x3_train, y3_train)
pred2 = clfgtb3.predict(x3_test)
print('All picks (R3):',clfgtb3.score(x3_test, y3_test))
print('All picks (R3) matrix:\n',confusion_matrix(y3_test, pred2))
false_positive_rate, true_positive_rate, thresholds = roc_curve(y3_test, pred2)
roc_auc = auc(false_positive_rate, true_positive_rate)
print(roc_auc)
print(' ')


All picks (R9): 0.6506849315068494
All picks (R9) matrix:
 [[418 229]
 [230 437]]
 
0.6506155732025796
All picks (R3): 0.6248097412480974
All picks (R3) matrix:
 [[415 263]
 [230 406]]
0.6252295875772249
 


### 2019 test data

In [10]:
dogs2019 = db19[(db19['Line'] > 0)].dropna().reset_index()
Ydogs2019 = db19[(db19['Line'] > 0)].dropna()['Result']

In [11]:
print("Using dog data[rolling 9 averages]: \n")

print("Using 9 game standard predictor: ")
pred = clfgtb9.predict(dogs2019[feature9Cols])
print('Dog picks:',clfgtb9.score(dogs2019[feature9Cols], Ydogs2019))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2019, pred))
print('')

print("Using 3 game standard predictor: ")
pred1 = clfgtb3.predict(dogs2019[feature9Cols])
print('Dog picks:',clfgtb3.score(dogs2019[feature9Cols], Ydogs2019))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2019, pred1))
print('')


Using dog data[rolling 9 averages]: 

Using 9 game standard predictor: 
Dog picks: 0.6123019571295434
Dog picks matrix:
 [[523 193]
 [223 134]]

Using 3 game standard predictor: 
Dog picks: 0.6057781919850885
Dog picks matrix:
 [[504 212]
 [211 146]]



In [158]:
odds = []
for i, n in enumerate(pred1):
    if n == 1:
        odds.append(dogs2019['Odds'][i])
import statistics
statistics.mean(odds)

165

Nothing really great here.


In [159]:
print("Using rolling 3 averages: \n")

print("Using 9 game standard predictor: ")
pred1 = clfgtb9.predict(dogs2019[feature3Cols])
print('Dog picks:',clfgtb9.score(dogs2019[feature3Cols], Ydogs2019))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2019, pred1))
print('')

print("Using 3 game standard predictor: ")
pred2 = clfgtb3.predict(dogs2019[feature3Cols])
print('Dog picks:',clfgtb3.score(dogs2019[feature3Cols], Ydogs2019))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2019, pred2))
print('')


Using rolling 3 averages: 

Using 9 game standard predictor: 
Dog picks: 0.6029822926374651
Dog picks matrix:
 [[491 225]
 [201 156]]

Using 3 game standard predictor: 
Dog picks: 0.6365330848089469
Dog picks matrix:
 [[541 175]
 [215 142]]



In [127]:
odds = []
for i, n in enumerate(pred1):
    if n == 1:
        odds.append(dogs2019['Odds'][i])
        
print(statistics.mean(odds))
print(statistics.median(odds))

odds = []
for i, n in enumerate(pred2):
    if n == 1:
        odds.append(dogs2019['Odds'][i])
        
print(statistics.mean(odds))
print(statistics.median(odds))

196
160
178
155


#### Predictor 2 above goes 142-175 with an average odds of 178

##### Now what I wanna do is build a model off 2018 data while continuously rolling in 2019 data as if the season was ongoing

In [162]:
db = pd.read_csv('2018stats.csv')

In [163]:
#feature3Cols = ['Location','Rolling3EFG','Rolling3TOV','Rolling3ORB','Rolling3FTR','Rolling3Pace','Opp R3Pace','Opp R3EFG','Opp R3TOV','Opp R3ORB','Opp R3FTR']
#feature9Cols = ['Location','Rolling9EFG','Rolling9TOV','Rolling9ORB','Rolling9FTR','Rolling9Pace','Opp R9Pace','Opp R9EFG','Opp R9TOV','Opp R9ORB','Opp R9FTR']

#target = ['Result']
X3 = db.dropna()[feature3Cols]
X9 = db.dropna()[feature9Cols]
y = np.array(db.dropna()[target]).flatten()

x9_train, x9_test, y9_train, y9_test = train_test_split(X9, y, test_size=0.3, random_state=8)
x3_train, x3_test, y3_train, y3_test = train_test_split(X3, y, test_size=0.3, random_state=8)


In [164]:
clfgtb9 = GradientBoostingClassifier(n_estimators=500, learning_rate=0.1, max_depth=1, random_state=1).fit(x9_train, y9_train)
pred = clfgtb9.predict(x9_test)
print('All picks (R9):',clfgtb9.score(x9_test, y9_test))
print('All picks (R9) matrix:\n',confusion_matrix(y9_test, pred))
print(' ')
false_positive_rate, true_positive_rate, thresholds = roc_curve(y9_test, pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print(roc_auc)

clfgtb3 = GradientBoostingClassifier(n_estimators=500, learning_rate=0.1, max_depth=1, random_state=1).fit(x3_train, y3_train)
pred = clfgtb3.predict(x3_test)
print('All picks (R3):',clfgtb3.score(x3_test, y3_test))
print('All picks (R3) matrix:\n',confusion_matrix(y3_test, pred))
false_positive_rate, true_positive_rate, thresholds = roc_curve(y3_test, pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print(roc_auc)
print(' ')

All picks (R9): 0.6484018264840182
All picks (R9) matrix:
 [[218 115]
 [116 208]]
 
0.648314981648315
All picks (R3): 0.624048706240487
All picks (R3) matrix:
 [[208 125]
 [122 202]]
0.6240407073740407
 


In [165]:
dogs2019 = db19[db19['Line'] > 0].dropna().reset_index()
Ydogs2019 = db19[db19['Line'] > 0].dropna()['Result']

In [166]:
print("Using dog data[rolling 9 averages]: \n")

print("Using 9 game standard predictor: ")
pred1 = clfgtb9.predict(dogs2019[feature9Cols])
print('Dog picks:',clfgtb9.score(dogs2019[feature9Cols], Ydogs2019))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2019, pred1))
print('')

print("Using 3 game standard predictor: ")
pred2 = clfgtb3.predict(dogs2019[feature9Cols])
print('Dog picks:',clfgtb3.score(dogs2019[feature9Cols], Ydogs2019))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2019, pred2))
print('')

Using dog data[rolling 9 averages]: 

Using 9 game standard predictor: 
Dog picks: 0.6104380242311277
Dog picks matrix:
 [[507 209]
 [209 148]]

Using 3 game standard predictor: 
Dog picks: 0.6067101584342963
Dog picks matrix:
 [[499 217]
 [205 152]]



Second model is the best one

In [167]:
odds = []
for i, n in enumerate(pred2):
    if n == 1:
        odds.append(dogs2019['Odds'][i])
        
print("Average odds:", statistics.mean(odds))
print("Median odds:", statistics.median(odds))


Average odds: 190
Median odds: 155


In [168]:
print("Using rolling 3 averages: \n")

print("Using 9 game standard predictor: ")
pred1 = clfgtb9.predict(dogs2019[feature3Cols])
print('Dog picks:',clfgtb9.score(dogs2019[feature3Cols], Ydogs2019))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2019, pred1))
print('')

print("Using 3 game standard predictor: ")
pred2 = clfgtb3.predict(dogs2019[feature3Cols])
print('Dog picks:',clfgtb3.score(dogs2019[feature3Cols], Ydogs2019))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2019, pred2))
print('')


Using rolling 3 averages: 

Using 9 game standard predictor: 
Dog picks: 0.6197576887232059
Dog picks matrix:
 [[500 216]
 [192 165]]

Using 3 game standard predictor: 
Dog picks: 0.5983224603914259
Dog picks matrix:
 [[461 255]
 [176 181]]



In [169]:
odds = []
for i, n in enumerate(pred1):
    if n == 1:
        odds.append(dogs2019['Odds'][i])
        
print("Average odds:", statistics.mean(odds))
print("Median odds:", statistics.median(odds))

odds = []
for i, n in enumerate(pred2):
    if n == 1:
        odds.append(dogs2019['Odds'][i])
        
print("Average odds:", statistics.mean(odds))
print("Median odds:", statistics.median(odds))

Average odds: 199
Median odds: 160
Average odds: 206
Median odds: 160.0


Above: model 1 goes 165-216 with average betting odds of 199; this is profitable


In [170]:
db19 = pd.read_csv('2019stats.csv')
db = pd.read_csv('2018stats.csv')

In [171]:
#db19.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis = 1, inplace = True)
db19['Date'] = db19['Date'].apply(str)
db19['Date'] = pd.to_datetime(db19.Date, format = "%Y%m%d")
db19 = db19.sort_values(by = ['Date']).reset_index(drop = True)

In [172]:
db['Date'] = db['Date'].apply(str)
db['Date'] = pd.to_datetime(db.Date, format= "%Y%m%d")
db = db.sort_values(by = ['Date']).reset_index(drop = True)
test = pd.concat([db, db19[:int(len(db19)/2)]], ignore_index = True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.


In [173]:
# Leave in first half of prev season or cut it out?

test = test.drop(test.index[:int(len(db19)/2)]) 

In [174]:
db = test.reset_index(drop = True) ### for simplicity

In [175]:
db = db.drop(['Implied Prob','Implied Proba','Unnamed: 0'], axis = 1)

### Here is a realistic expectation of what a running database may look like. I added half the season's worth of 2019 onto the 2018 db, and dropped the first half of 2018. The model now looks at the second half of 2018 and the first half of 2019.

In [176]:
#feature3Cols = ['Location','Rolling3EFG','Rolling3TOV','Rolling3ORB','Rolling3FTR','Rolling3Pace','Opp R3Pace','Opp R3EFG','Opp R3TOV','Opp R3ORB','Opp R3FTR']
#feature9Cols = ['Location','Rolling9EFG','Rolling9TOV','Rolling9ORB','Rolling9FTR','Rolling9Pace','Opp R9Pace','Opp R9EFG','Opp R9TOV','Opp R9ORB','Opp R9FTR']

target = ['Result']
X3 = db.dropna()[feature3Cols]
X9 = db.dropna()[feature9Cols]
y = np.array(db.dropna()[target]).flatten()

x9_train, x9_test, y9_train, y9_test = train_test_split(X9, y, test_size=0.3, random_state=3)
x3_train, x3_test, y3_train, y3_test = train_test_split(X3, y, test_size=0.3, random_state=3)


In [177]:
clfgtb9 = GradientBoostingClassifier(n_estimators=500, learning_rate=0.1, max_depth=1, random_state=3).fit(x9_train, y9_train)
pred = clfgtb9.predict(x9_test)
print('All picks (R9):',clfgtb9.score(x9_test, y9_test))
print('All picks (R9) matrix:\n',confusion_matrix(y9_test, pred))
false_positive_rate, true_positive_rate, thresholds = roc_curve(y9_test, pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print(roc_auc)
print(' ')

clfgtb3 = GradientBoostingClassifier(n_estimators=500, learning_rate=0.1, max_depth=1, random_state=3).fit(x3_train, y3_train)
pred = clfgtb3.predict(x3_test)
print('All picks (R3):',clfgtb3.score(x3_test, y3_test))
print('All picks (R3) matrix:\n',confusion_matrix(y3_test, pred))
false_positive_rate, true_positive_rate, thresholds = roc_curve(y3_test, pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print(roc_auc)
print(' ')


All picks (R9): 0.6529680365296804
All picks (R9) matrix:
 [[215 104]
 [124 214]]
0.6535586429485634
 
All picks (R3): 0.619482496194825
All picks (R3) matrix:
 [[194 125]
 [125 213]]
0.6191639925061676
 


In [178]:
dogs2019 = db19[int(len(db19)/2):][db19['Line'] > 0].dropna().reset_index()
Ydogs2019 = db19[int(len(db19)/2):][db19['Line'] > 0].dropna()['Result']

  """Entry point for launching an IPython kernel.
  


### This is now predicting all underdog games for last half of season

In [179]:
print("Using dog data[rolling 9 averages]: \n")

print("Using 9 game standard predictor: ")
pred1 = clfgtb9.predict(dogs2019[feature9Cols])
print('Dog picks:',clfgtb9.score(dogs2019[feature9Cols], Ydogs2019))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2019, pred1))
print('')

print("Using 3 game standard predictor: ")
pred2 = clfgtb3.predict(dogs2019[feature9Cols])
print('Dog picks:',clfgtb3.score(dogs2019[feature9Cols], Ydogs2019))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2019, pred2))
print('')

Using dog data[rolling 9 averages]: 

Using 9 game standard predictor: 
Dog picks: 0.6595041322314049
Dog picks matrix:
 [[330  82]
 [124  69]]

Using 3 game standard predictor: 
Dog picks: 0.6247933884297521
Dog picks matrix:
 [[311 101]
 [126  67]]



In [180]:
odds = []
for i, n in enumerate(pred1):
    if n == 1:
        odds.append(dogs2019['Odds'][i])

print("Average odds:", statistics.mean(odds))
print("Median odds:", statistics.median(odds))

odds = []
for i, n in enumerate(pred2):
    if n == 1:
        odds.append(dogs2019['Odds'][i])

print("Average odds:", statistics.mean(odds))
print("Median odds:", statistics.median(odds))

Average odds: 148
Median odds: 160
Average odds: 176
Median odds: 160.0


In [181]:
print("Using rolling 3 averages: \n")

print("Using 9 game standard predictor: ")
pred1 = clfgtb9.predict(dogs2019[feature3Cols])
print('Dog picks:',clfgtb9.score(dogs2019[feature3Cols], Ydogs2019))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2019, pred1))
print('')

print("Using 3 game standard predictor: ")
pred2 = clfgtb3.predict(dogs2019[feature3Cols])
print('Dog picks:',clfgtb3.score(dogs2019[feature3Cols], Ydogs2019))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2019, pred2))
print('')


Using rolling 3 averages: 

Using 9 game standard predictor: 
Dog picks: 0.631404958677686
Dog picks matrix:
 [[308 104]
 [119  74]]

Using 3 game standard predictor: 
Dog picks: 0.6545454545454545
Dog picks matrix:
 [[313  99]
 [110  83]]



In [182]:
odds = []
for i, n in enumerate(pred1):
    if n == 1:
        odds.append(dogs2019['Odds'][i])

print("Average odds:", statistics.mean(odds))
print("Median odds:", statistics.median(odds))

odds = []
for i, n in enumerate(pred2):
    if n == 1:
        odds.append(dogs2019['Odds'][i])

print("Average odds:", statistics.mean(odds))
print("Median odds:", statistics.median(odds))

Average odds: 211
Median odds: 175.0
Average odds: 206
Median odds: 165.0


In [183]:
pkl_filename = "GBC9model.pkl"  
with open(pkl_filename, 'wb') as file:  
    pickle.dump(clfgtb9, file)
    
pkl_filename = "GBC3model.pkl"  
with open(pkl_filename, 'wb') as file:  
    pickle.dump(clfgtb3, file)

This model drops 2018 data when adding first half of 2019.

The 9-game predictor goes 69-82 using 9-game average data, with average odds of 148.

The 3-game predictor goes 83-99 using 3-game average data, with average odds of 206.

In [58]:
model3 = pickle.load(open('GBC3Model.pkl', 'rb'))
model9 = pickle.load(open('GBC9Model.pkl','rb'))



In [59]:
print("Using dog data[rolling 9 averages]: \n")

print("Using 9 game standard predictor: ")
pred1 = model9.predict(dogs2019[feature9Cols])
print('Dog picks:',model9.score(dogs2019[feature9Cols], Ydogs2019))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2019, pred1))
print('')

print("Using 3 game standard predictor: ")
pred2 = model3.predict(dogs2019[feature9Cols])
print('Dog picks:',model3.score(dogs2019[feature9Cols], Ydogs2019))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2019, pred2))
print('')

odds = []
for i, n in enumerate(pred1):
    if n == 1:
        odds.append(dogs2019['Odds'][i])

print("Average odds:", statistics.mean(odds))
print("Median odds:", statistics.median(odds))

odds = []
for i, n in enumerate(pred2):
    if n == 1:
        odds.append(dogs2019['Odds'][i])

print("Average odds:", statistics.mean(odds))
print("Median odds:", statistics.median(odds))


print("\nUsing rolling 3 averages: \n")

print("Using 9 game standard predictor: ")
pred1 = model9.predict(dogs2019[feature3Cols])
print('Dog picks:',model9.score(dogs2019[feature3Cols], Ydogs2019))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2019, pred1))
print('')

print("Using 3 game standard predictor: ")
pred2 = model3.predict(dogs2019[feature3Cols])
print('Dog picks:',model3.score(dogs2019[feature3Cols], Ydogs2019))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2019, pred2))
print('')

odds = []
for i, n in enumerate(pred1):
    if n == 1:
        odds.append(dogs2019['Odds'][i])

print("Average odds:", statistics.mean(odds))
print("Median odds:", statistics.median(odds))

odds = []
for i, n in enumerate(pred2):
    if n == 1:
        odds.append(dogs2019['Odds'][i])

print("Average odds:", statistics.mean(odds))
print("Median odds:", statistics.median(odds))



Using dog data[rolling 9 averages]: 

Using 9 game standard predictor: 
Dog picks: 0.6264462809917355
Dog picks matrix:
 [[309 103]
 [123  70]]

Using 3 game standard predictor: 
Dog picks: 0.6264462809917355
Dog picks matrix:
 [[316  96]
 [130  63]]

Average odds: 191.9364161849711
Median odds: 165.0
Average odds: 192.0251572327044
Median odds: 160.0

Using rolling 3 averages: 

Using 9 game standard predictor: 
Dog picks: 0.6082644628099173
Dog picks matrix:
 [[281 131]
 [106  87]]

Using 3 game standard predictor: 
Dog picks: 0.6099173553719008
Dog picks matrix:
 [[288 124]
 [112  81]]

Average odds: 232.90825688073394
Median odds: 175.0
Average odds: 227.15121951219513
Median odds: 175.0
