In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import os
import pickle
from sklearn.metrics import confusion_matrix
import requests, bs4
from sklearn.metrics import roc_curve, auc

In [2]:
db19 = pd.read_csv('2019stats.csv')

### Check results using 2016-2018 data on 2019 games 

In [3]:
db = pd.read_csv('2017_2018stats.csv')

In [4]:
feature3Cols = ['Location','Rolling3EFG','Rolling3TOV','Rolling3ORB','Rolling3FTR','Rolling3Pace','Opp R3Pace','Opp R3EFG','Opp R3TOV','Opp R3ORB','Opp R3FTR', 'Rolling3DEff', 'Opp R3 Deff', 'Rolling3OEff','Opp R3 Oeff']
feature9Cols = ['Location','Rolling9EFG','Rolling9TOV','Rolling9ORB','Rolling9FTR','Rolling9Pace','Opp R9Pace','Opp R9EFG','Opp R9TOV','Opp R9ORB','Opp R9FTR', 'Rolling9DEff', 'Opp R9 Deff', 'Rolling3OEff','Opp R3 Oeff']

target = ['Result']
X3 = db.dropna()[feature3Cols]
X9 = db.dropna()[feature9Cols]
y = np.array(db.dropna()[target]).flatten()
svc=SVC(probability=True)

x9_train, x9_test, y9_train, y9_test = train_test_split(X9, y, test_size=0.3, random_state=1)
x3_train, x3_test, y3_train, y3_test = train_test_split(X3, y, test_size=0.3, random_state=1)


### Training models

In [9]:
bag9 = BaggingClassifier( n_estimators=20, random_state=1).fit(x9_train, y9_train)
pred = bag9.predict(x9_test)
print('All picks (R9):',bag9.score(x9_test, y9_test))
print('All picks (R9) matrix:\n',confusion_matrix(y9_test, pred))
print(' ')
false_positive_rate, true_positive_rate, thresholds = roc_curve(y9_test, pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print(roc_auc)

bag3 = BaggingClassifier(n_estimators=20, random_state=1).fit(x3_train, y3_train)
pred = bag3.predict(x3_test)
print('All picks (R3):',bag3.score(x3_test, y3_test))
print('All picks (R3) matrix:\n',confusion_matrix(y3_test, pred))
false_positive_rate, true_positive_rate, thresholds = roc_curve(y3_test, pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print(roc_auc)
print(' ')


All picks (R9): 0.6293759512937596
All picks (R9) matrix:
 [[442 208]
 [279 385]]
 
0.6299096385542168
All picks (R3): 0.5905631659056316
All picks (R3) matrix:
 [[430 220]
 [318 346]]
0.5913113994439296
 


### 2019 test data

In [10]:
dogs2019 = db19[(db19['Line'] > 0)].dropna().reset_index()
Ydogs2019 = db19[(db19['Line'] > 0)].dropna()['Result']

In [11]:
print("Using dog data[rolling 9 averages]: \n")

print("Using 9 game standard predictor: ")
pred1 = bag9.predict(dogs2019[feature9Cols])
print('Dog picks:',bag9.score(dogs2019[feature9Cols], Ydogs2019))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2019, pred1))
print('')

print("Using 3 game standard predictor: ")
pred2 = bag3.predict(dogs2019[feature9Cols])
print('Dog picks:',bag3.score(dogs2019[feature9Cols], Ydogs2019))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2019, pred2))
print('')


Using dog data[rolling 9 averages]: 

Using 9 game standard predictor: 
Dog picks: 0.6402609506057781
Dog picks matrix:
 [[568 148]
 [238 119]]

Using 3 game standard predictor: 
Dog picks: 0.6169617893755824
Dog picks matrix:
 [[508 208]
 [203 154]]



In [12]:
odds = []
for i, n in enumerate(pred1):
    if n == 1:
        odds.append(dogs2019['Odds'][i])
import statistics
statistics.mean(odds)

166

Nothing great

In [13]:
print("Using rolling 3 averages: \n")

print("Using 9 game standard predictor: ")
pred1 = bag9.predict(dogs2019[feature3Cols])
print('Dog picks:',bag9.score(dogs2019[feature3Cols], Ydogs2019))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2019, pred1))
print('')

print("Using 3 game standard predictor: ")
pred2 = bag3.predict(dogs2019[feature3Cols])
print('Dog picks:',bag3.score(dogs2019[feature3Cols], Ydogs2019))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2019, pred2))
print('')


Using rolling 3 averages: 

Using 9 game standard predictor: 
Dog picks: 0.5945945945945946
Dog picks matrix:
 [[495 221]
 [214 143]]

Using 3 game standard predictor: 
Dog picks: 0.6132339235787512
Dog picks matrix:
 [[502 214]
 [201 156]]



In [14]:
odds = []
for i, n in enumerate(pred1):
    if n == 1:
        odds.append(dogs2019['Odds'][i])
        
print(statistics.mean(odds))
print(statistics.median(odds))

odds = []
for i, n in enumerate(pred2):
    if n == 1:
        odds.append(dogs2019['Odds'][i])
        
print(statistics.mean(odds))
print(statistics.median(odds))

217
175.0
204
165.0


### Nothing great

##### Now what I wanna do is build a model off 2018 data while continuously rolling in 2019 data as if the season was ongoing

In [15]:
db = pd.read_csv('2018stats.csv')

In [56]:
#feature3Cols = ['Location','Rolling3EFG','Rolling3TOV','Rolling3ORB','Rolling3FTR','Rolling3Pace','Opp R3Pace','Opp R3EFG','Opp R3TOV','Opp R3ORB','Opp R3FTR']
#feature9Cols = ['Location','Rolling9EFG','Rolling9TOV','Rolling9ORB','Rolling9FTR','Rolling9Pace','Opp R9Pace','Opp R9EFG','Opp R9TOV','Opp R9ORB','Opp R9FTR']

#target = ['Result']
X3 = db.dropna()[feature3Cols]
X9 = db.dropna()[feature9Cols]
y = np.array(db.dropna()[target]).flatten()

x9_train, x9_test, y9_train, y9_test = train_test_split(X9, y, test_size=0.3, random_state=8)
x3_train, x3_test, y3_train, y3_test = train_test_split(X3, y, test_size=0.3, random_state=8)


In [16]:
bag9 = BaggingClassifier(n_estimators=20, random_state=1).fit(x9_train, y9_train)
pred = bag9.predict(x9_test)
print('All picks (R9):',bag9.score(x9_test, y9_test))
print('All picks (R9) matrix:\n',confusion_matrix(y9_test, pred))
print(' ')
false_positive_rate, true_positive_rate, thresholds = roc_curve(y9_test, pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print(roc_auc)

bag3 = BaggingClassifier(n_estimators=20, random_state=1).fit(x3_train, y3_train)
pred = bag3.predict(x3_test)
print('All picks (R3):',bag3.score(x3_test, y3_test))
print('All picks (R3) matrix:\n',confusion_matrix(y3_test, pred))
false_positive_rate, true_positive_rate, thresholds = roc_curve(y3_test, pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print(roc_auc)
print(' ')

All picks (R9): 0.6293759512937596
All picks (R9) matrix:
 [[442 208]
 [279 385]]
 
0.6299096385542168
All picks (R3): 0.5905631659056316
All picks (R3) matrix:
 [[430 220]
 [318 346]]
0.5913113994439296
 


In [17]:
dogs2019 = db19[db19['Line'] > 0].dropna().reset_index()
Ydogs2019 = db19[db19['Line'] > 0].dropna()['Result']

In [18]:
print("Using dog data[rolling 9 averages]: \n")

print("Using 9 game standard predictor: ")
pred1 = bag9.predict(dogs2019[feature9Cols])
print('Dog picks:',bag9.score(dogs2019[feature9Cols], Ydogs2019))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2019, pred1))
print('')

print("Using 3 game standard predictor: ")
pred2 = bag3.predict(dogs2019[feature9Cols])
print('Dog picks:',bag3.score(dogs2019[feature9Cols], Ydogs2019))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2019, pred2))
print('')

Using dog data[rolling 9 averages]: 

Using 9 game standard predictor: 
Dog picks: 0.6402609506057781
Dog picks matrix:
 [[568 148]
 [238 119]]

Using 3 game standard predictor: 
Dog picks: 0.6169617893755824
Dog picks matrix:
 [[508 208]
 [203 154]]



First model is the best one

In [19]:
odds = []
for i, n in enumerate(pred1):
    if n == 1:
        odds.append(dogs2019['Odds'][i])
        
print("Average odds 1:", statistics.mean(odds))
print("Median odds 1:", statistics.median(odds))

odds = []
for i, n in enumerate(pred2):
    if n == 1:
        odds.append(dogs2019['Odds'][i])
        
print("Average odds 2:", statistics.mean(odds))
print("Median odds 2:", statistics.median(odds))


Average odds 1: 166
Median odds 1: 155
Average odds 2: 211
Median odds 2: 160.0


In [20]:
print("Using rolling 3 averages: \n")

print("Using 9 game standard predictor: ")
pred1 = bag9.predict(dogs2019[feature3Cols])
print('Dog picks:',bag9.score(dogs2019[feature3Cols], Ydogs2019))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2019, pred1))
print('')

print("Using 3 game standard predictor: ")
pred2 = bag3.predict(dogs2019[feature3Cols])
print('Dog picks:',bag3.score(dogs2019[feature3Cols], Ydogs2019))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2019, pred2))
print('')


Using rolling 3 averages: 

Using 9 game standard predictor: 
Dog picks: 0.5945945945945946
Dog picks matrix:
 [[495 221]
 [214 143]]

Using 3 game standard predictor: 
Dog picks: 0.6132339235787512
Dog picks matrix:
 [[502 214]
 [201 156]]



In [21]:
odds = []
for i, n in enumerate(pred1):
    if n == 1:
        odds.append(dogs2019['Odds'][i])
        
print("Average odds 1:", statistics.mean(odds))
print("Median odds 1:", statistics.median(odds))

odds = []
for i, n in enumerate(pred2):
    if n == 1:
        odds.append(dogs2019['Odds'][i])
        
print("Average odds 2:", statistics.mean(odds))
print("Median odds 2:", statistics.median(odds))

Average odds 1: 217
Median odds 1: 175.0
Average odds 2: 204
Median odds 2: 165.0


Nothing great

In [22]:
db19 = pd.read_csv('2019stats.csv')
db = pd.read_csv('2018stats.csv')

In [23]:
#db19.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis = 1, inplace = True)
db19['Date'] = db19['Date'].apply(str)
db19['Date'] = pd.to_datetime(db19.Date, format = "%Y%m%d")
db19 = db19.sort_values(by = ['Date']).reset_index(drop = True)

In [24]:
db['Date'] = db['Date'].apply(str)
db['Date'] = pd.to_datetime(db.Date, format= "%Y%m%d")
db = db.sort_values(by = ['Date']).reset_index(drop = True)
test = pd.concat([db, db19[:int(len(db19)/2)]], ignore_index = True)

In [25]:
# Leave in first half of prev season or cut it out?

test = test.drop(test.index[:int(len(db19)/2)]) 

In [26]:
db = test.reset_index(drop = True) ### for simplicity

In [27]:
db = db.drop(['Implied Prob','Implied Proba','Unnamed: 0'], axis = 1)

### Here is a realistic expectation of what a running database may look like. I added half the season's worth of 2019 onto the 2018 db, and dropped the first half of 2018. The model now looks at the second half of 2018 and the first half of 2019.

In [69]:
#feature3Cols = ['Location','Rolling3EFG','Rolling3TOV','Rolling3ORB','Rolling3FTR','Rolling3Pace','Opp R3Pace','Opp R3EFG','Opp R3TOV','Opp R3ORB','Opp R3FTR']
#feature9Cols = ['Location','Rolling9EFG','Rolling9TOV','Rolling9ORB','Rolling9FTR','Rolling9Pace','Opp R9Pace','Opp R9EFG','Opp R9TOV','Opp R9ORB','Opp R9FTR']

target = ['Result']
X3 = db.dropna()[feature3Cols]
X9 = db.dropna()[feature9Cols]
y = np.array(db.dropna()[target]).flatten()


x9_train, x9_test, y9_train, y9_test = train_test_split(X9, y, test_size=0.3, random_state=3)
x3_train, x3_test, y3_train, y3_test = train_test_split(X3, y, test_size=0.3, random_state=3)


In [28]:
bag9 = BaggingClassifier(n_estimators=20, random_state=3).fit(x9_train, y9_train)
pred = bag9.predict(x9_test)
print('All picks (R9):',bag9.score(x9_test, y9_test))
print('All picks (R9) matrix:\n',confusion_matrix(y9_test, pred))
false_positive_rate, true_positive_rate, thresholds = roc_curve(y9_test, pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print(roc_auc)
print(' ')

bag3 = BaggingClassifier(n_estimators=20, random_state=3).fit(x3_train, y3_train)
pred = bag3.predict(x3_test)
print('All picks (R3):',bag3.score(x3_test, y3_test))
print('All picks (R3) matrix:\n',confusion_matrix(y3_test, pred))
false_positive_rate, true_positive_rate, thresholds = roc_curve(y3_test, pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print(roc_auc)
print(' ')


All picks (R9): 0.636986301369863
All picks (R9) matrix:
 [[441 209]
 [268 396]]
0.6374235403151065
 
All picks (R3): 0.6111111111111112
All picks (R3) matrix:
 [[424 226]
 [285 379]]
0.6115454124189064
 


In [29]:
dogs2019 = db19[int(len(db19)/2):][db19['Line'] > 0].dropna().reset_index()
Ydogs2019 = db19[int(len(db19)/2):][db19['Line'] > 0].dropna()['Result']

  """Entry point for launching an IPython kernel.
  


### This is now predicting all underdog games for last half of season

In [30]:
print("Using dog data[rolling 9 averages]: \n")

print("Using 9 game standard predictor: ")
pred1 = bag9.predict(dogs2019[feature9Cols])
print('Dog picks:',bag9.score(dogs2019[feature9Cols], Ydogs2019))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2019, pred1))
print('')

print("Using 3 game standard predictor: ")
pred2 = bag3.predict(dogs2019[feature9Cols])
print('Dog picks:',bag3.score(dogs2019[feature9Cols], Ydogs2019))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2019, pred2))
print('')

Using dog data[rolling 9 averages]: 

Using 9 game standard predictor: 
Dog picks: 0.6495867768595042
Dog picks matrix:
 [[332  80]
 [132  61]]

Using 3 game standard predictor: 
Dog picks: 0.6297520661157024
Dog picks matrix:
 [[307 105]
 [119  74]]



In [31]:
odds = []
for i, n in enumerate(pred1):
    if n == 1:
        odds.append(dogs2019['Odds'][i])

print("Average odds:", statistics.mean(odds))
print("Median odds:", statistics.median(odds))

odds = []
for i, n in enumerate(pred2):
    if n == 1:
        odds.append(dogs2019['Odds'][i])

print("Average odds:", statistics.mean(odds))
print("Median odds:", statistics.median(odds))

Average odds: 153
Median odds: 155
Average odds: 187
Median odds: 165


In [34]:
print("Using rolling 3 averages: \n")

print("Using 9 game standard predictor: ")
pred1 = bag9.predict(dogs2019[feature3Cols])
print('Dog picks:',bag9.score(dogs2019[feature3Cols], Ydogs2019))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2019, pred1))
print('')

print("Using 3 game standard predictor: ")
pred2 = bag3.predict(dogs2019[feature3Cols])
print('Dog picks:',bag3.score(dogs2019[feature3Cols], Ydogs2019))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2019, pred2))
print('')


Using rolling 3 averages: 

Using 9 game standard predictor: 
Dog picks: 0.6297520661157024
Dog picks matrix:
 [[306 106]
 [118  75]]

Using 3 game standard predictor: 
Dog picks: 0.6396694214876033
Dog picks matrix:
 [[306 106]
 [112  81]]



In [35]:
odds = []
for i, n in enumerate(pred1):
    if n == 1:
        odds.append(dogs2019['Odds'][i])

print("Average odds:", statistics.mean(odds))
print("Median odds:", statistics.median(odds))

odds = []
for i, n in enumerate(pred2):
    if n == 1:
        odds.append(dogs2019['Odds'][i])

print("Average odds:", statistics.mean(odds))
print("Median odds:", statistics.median(odds))

Average odds: 202
Median odds: 175
Average odds: 216
Median odds: 170


This model drops 2018 data when adding first half of 2019.

The 3-game predictor goes 74-105 using 9-game average data, with average odds of 187.

The 3-game predictor goes 81-106 using 3-game average data, with average odds of 216.

In [None]:
pkl_filename = "GBC9model.pkl"  
with open(pkl_filename, 'wb') as file:  
    pickle.dump(clfgtb9, file)
    
pkl_filename = "GBC3model.pkl"  
with open(pkl_filename, 'wb') as file:  
    pickle.dump(clfgtb3, file)

In [58]:
model3 = pickle.load(open('GBC3Model.pkl', 'rb'))
model9 = pickle.load(open('GBC9Model.pkl','rb'))



In [59]:
print("Using dog data[rolling 9 averages]: \n")

print("Using 9 game standard predictor: ")
pred1 = model9.predict(dogs2019[feature9Cols])
print('Dog picks:',model9.score(dogs2019[feature9Cols], Ydogs2019))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2019, pred1))
print('')

print("Using 3 game standard predictor: ")
pred2 = model3.predict(dogs2019[feature9Cols])
print('Dog picks:',model3.score(dogs2019[feature9Cols], Ydogs2019))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2019, pred2))
print('')

odds = []
for i, n in enumerate(pred1):
    if n == 1:
        odds.append(dogs2019['Odds'][i])

print("Average odds:", statistics.mean(odds))
print("Median odds:", statistics.median(odds))

odds = []
for i, n in enumerate(pred2):
    if n == 1:
        odds.append(dogs2019['Odds'][i])

print("Average odds:", statistics.mean(odds))
print("Median odds:", statistics.median(odds))


print("\nUsing rolling 3 averages: \n")

print("Using 9 game standard predictor: ")
pred1 = model9.predict(dogs2019[feature3Cols])
print('Dog picks:',model9.score(dogs2019[feature3Cols], Ydogs2019))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2019, pred1))
print('')

print("Using 3 game standard predictor: ")
pred2 = model3.predict(dogs2019[feature3Cols])
print('Dog picks:',model3.score(dogs2019[feature3Cols], Ydogs2019))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2019, pred2))
print('')

odds = []
for i, n in enumerate(pred1):
    if n == 1:
        odds.append(dogs2019['Odds'][i])

print("Average odds:", statistics.mean(odds))
print("Median odds:", statistics.median(odds))

odds = []
for i, n in enumerate(pred2):
    if n == 1:
        odds.append(dogs2019['Odds'][i])

print("Average odds:", statistics.mean(odds))
print("Median odds:", statistics.median(odds))



Using dog data[rolling 9 averages]: 

Using 9 game standard predictor: 
Dog picks: 0.6264462809917355
Dog picks matrix:
 [[309 103]
 [123  70]]

Using 3 game standard predictor: 
Dog picks: 0.6264462809917355
Dog picks matrix:
 [[316  96]
 [130  63]]

Average odds: 191.9364161849711
Median odds: 165.0
Average odds: 192.0251572327044
Median odds: 160.0

Using rolling 3 averages: 

Using 9 game standard predictor: 
Dog picks: 0.6082644628099173
Dog picks matrix:
 [[281 131]
 [106  87]]

Using 3 game standard predictor: 
Dog picks: 0.6099173553719008
Dog picks matrix:
 [[288 124]
 [112  81]]

Average odds: 232.90825688073394
Median odds: 175.0
Average odds: 227.15121951219513
Median odds: 175.0
