In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import os
import pickle
from sklearn.metrics import confusion_matrix
import requests, bs4
from sklearn.metrics import roc_curve, auc

In [2]:
db14 = pd.read_csv('WNBA14.csv')
db15 = pd.read_csv('WNBA15.csv')
db16 = pd.read_csv('WNBA16.csv')
db17 = pd.read_csv('WNBA17.csv')
db18 = pd.read_csv('WNBA18.csv')
db = pd.concat([db14, db15, db16, db17]).reset_index(drop = True)

### Check results using 2015-2017 data on 2018 games 

In [3]:
cols =  ['Result', 'Location','Rolling4TSP','Rolling4TOV','Rolling4ORB','Rolling4FTR','Rolling4Poss','Opp R4Poss','Opp R4TSP','Opp R4TOV','Opp R4ORB','Opp R4FTR', 'Rolling4DEff', 'Opp R4DEff', 'Rolling4OEff','Opp R4OEff']
featureCols = ['Location','Rolling4TSP','Rolling4TOV','Rolling4ORB','Rolling4FTR','Rolling4Poss','Opp R4Poss','Opp R4TSP','Opp R4TOV','Opp R4ORB','Opp R4FTR', 'Rolling4DEff', 'Opp R4DEff', 'Rolling4OEff','Opp R4OEff']
target = ['Result']

db = db[cols].dropna()

X = db[featureCols]
y = np.array(db[target]).flatten()

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3)



### Training models

In [4]:
ada = AdaBoostClassifier(n_estimators=100, learning_rate=0.1).fit(x_train, y_train)
pred = ada.predict(x_test)
print('All picks (R9):',ada.score(x_test, y_test))
print('All picks (R9) matrix:\n',confusion_matrix(y_test, pred))
print(' ')
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print(roc_auc)


All picks (R9): 0.6203703703703703
All picks (R9) matrix:
 [[130 100]
 [ 64 138]]
 
0.6241928540680156


### 2018 test data

In [5]:
dogs2018 = db18[(db18['Odds'] > 0)].dropna().reset_index()
Ydogs2018 = db18[(db18['Odds'] > 0)].dropna()['Result']

In [6]:
print("Using dog data\n")

pred1 = ada.predict(dogs2018[featureCols])
print('Dog picks:',ada.score(dogs2018[featureCols], Ydogs2018))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2018, pred1))
print('')


Using dog data

Dog picks: 0.6488095238095238
Dog picks matrix:
 [[84 28]
 [31 25]]



In [7]:
odds = []
for i, n in enumerate(pred1):
    if n == 1:
        odds.append(dogs2018['Odds'][i])
import statistics
statistics.mean(odds)

201.0

### Here is a realistic expectation of what a running database may look like. I added half the season's worth of 2019 onto the 2018 db, and dropped the first half of 2018. The model now looks at the second half of 2018 and the first half of 2019.

In [31]:
#feature3Cols = ['Location','Rolling3EFG','Rolling3TOV','Rolling3ORB','Rolling3FTR','Rolling3Pace','Opp R3Pace','Opp R3EFG','Opp R3TOV','Opp R3ORB','Opp R3FTR']
#feature9Cols = ['Location','Rolling9EFG','Rolling9TOV','Rolling9ORB','Rolling9FTR','Rolling9Pace','Opp R9Pace','Opp R9EFG','Opp R9TOV','Opp R9ORB','Opp R9FTR']

target = ['Result']
X3 = db.dropna()[feature3Cols]
X9 = db.dropna()[feature9Cols]
y = np.array(db.dropna()[target]).flatten()

x9_train, x9_test, y9_train, y9_test = train_test_split(X9, y, test_size=0.3)
x3_train, x3_test, y3_train, y3_test = train_test_split(X3, y, test_size=0.3)


In [57]:
ada9 = AdaBoostClassifier(n_estimators=50, learning_rate=0.1).fit(x9_train, y9_train)
pred = ada9.predict(x9_test)
print('All picks (R9):',ada9.score(x9_test, y9_test))
print('All picks (R9) matrix:\n',confusion_matrix(y9_test, pred))
false_positive_rate, true_positive_rate, thresholds = roc_curve(y9_test, pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print(roc_auc)
print(' ')

ada3 = AdaBoostClassifier(n_estimators=50, learning_rate=0.1).fit(x3_train, y3_train)
pred = ada3.predict(x3_test)
print('All picks (R3):',ada3.score(x3_test, y3_test))
print('All picks (R3) matrix:\n',confusion_matrix(y3_test, pred))
false_positive_rate, true_positive_rate, thresholds = roc_curve(y3_test, pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print(roc_auc)
print(' ')


All picks (R9): 0.6445966514459666
All picks (R9) matrix:
 [[419 239]
 [228 428]]
0.6446085699458818
 
All picks (R3): 0.6210045662100456
All picks (R3) matrix:
 [[412 223]
 [275 404]]
0.6219057669337724
 


In [58]:
dogs2019 = db19[int(len(db19)/2):][db19['Line'] > 0].dropna().reset_index()
Ydogs2019 = db19[int(len(db19)/2):][db19['Line'] > 0].dropna()['Result']

  """Entry point for launching an IPython kernel.
  


### This is now predicting all underdog games for last half of season

In [59]:
print("Using dog data[rolling 9 averages]: \n")

print("Using 9 game standard predictor: ")
pred1 = ada9.predict(dogs2019[feature9Cols])
print('Dog picks:',ada9.score(dogs2019[feature9Cols], Ydogs2019))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2019, pred1))
print('')

print("Using 3 game standard predictor: ")
pred2 = ada3.predict(dogs2019[feature9Cols])
print('Dog picks:',ada3.score(dogs2019[feature9Cols], Ydogs2019))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2019, pred2))
print('')

Using dog data[rolling 9 averages]: 

Using 9 game standard predictor: 
Dog picks: 0.6115702479338843
Dog picks matrix:
 [[299 113]
 [122  71]]

Using 3 game standard predictor: 
Dog picks: 0.6413223140495867
Dog picks matrix:
 [[324  88]
 [129  64]]



In [60]:
odds = []
for i, n in enumerate(pred1):
    if n == 1:
        odds.append(dogs2019['Odds'][i])

print("Average odds:", statistics.mean(odds))
print("Median odds:", statistics.median(odds))

odds = []
for i, n in enumerate(pred2):
    if n == 1:
        odds.append(dogs2019['Odds'][i])

print("Average odds:", statistics.mean(odds))
print("Median odds:", statistics.median(odds))

Average odds: 164
Median odds: 156.5
Average odds: 174
Median odds: 160.0


In [61]:
print("Using rolling 3 averages: \n")

print("Using 9 game standard predictor: ")
pred1 = ada9.predict(dogs2019[feature3Cols])
print('Dog picks:',ada9.score(dogs2019[feature3Cols], Ydogs2019))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2019, pred1))
print('')

print("Using 3 game standard predictor: ")
pred2 = ada3.predict(dogs2019[feature3Cols])
print('Dog picks:',ada3.score(dogs2019[feature3Cols], Ydogs2019))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2019, pred2))
print('')


Using rolling 3 averages: 

Using 9 game standard predictor: 
Dog picks: 0.6396694214876033
Dog picks matrix:
 [[305 107]
 [111  82]]

Using 3 game standard predictor: 
Dog picks: 0.6644628099173554
Dog picks matrix:
 [[323  89]
 [114  79]]



In [62]:
odds = []
for i, n in enumerate(pred1):
    if n == 1:
        odds.append(dogs2019['Odds'][i])

print("Average odds:", statistics.mean(odds))
print("Median odds:", statistics.median(odds))

odds = []
for i, n in enumerate(pred2):
    if n == 1:
        odds.append(dogs2019['Odds'][i])

print("Average odds:", statistics.mean(odds))
print("Median odds:", statistics.median(odds))

Average odds: 191
Median odds: 165
Average odds: 181
Median odds: 160.0


This model drops 2018 data when adding first half of 2019.

The 9-game predictor goes 85-141 using 3-game average data, with average odds of 229.

The 3-game predictor goes 70-116 using 3-game average data, with average odds of 220.

In [38]:
pkl_filename = "ADA9model.pkl"  
with open(pkl_filename, 'wb') as file:  
    pickle.dump(ada9, file)
    
pkl_filename = "ADA3model.pkl"  
with open(pkl_filename, 'wb') as file:  
    pickle.dump(ada3, file)

In [58]:
model3 = pickle.load(open('GBC3Model.pkl', 'rb'))
model9 = pickle.load(open('GBC9Model.pkl','rb'))



In [63]:
print("Using dog data[rolling 9 averages]: \n")

print("Using 9 game standard predictor: ")
pred1 = model9.predict(dogs2019[feature9Cols])
print('Dog picks:',model9.score(dogs2019[feature9Cols], Ydogs2019))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2019, pred1))
print('')

print("Using 3 game standard predictor: ")
pred2 = model3.predict(dogs2019[feature9Cols])
print('Dog picks:',model3.score(dogs2019[feature9Cols], Ydogs2019))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2019, pred2))
print('')

odds = []
for i, n in enumerate(pred1):
    if n == 1:
        odds.append(dogs2019['Odds'][i])

print("Average odds:", statistics.mean(odds))
print("Median odds:", statistics.median(odds))

odds = []
for i, n in enumerate(pred2):
    if n == 1:
        odds.append(dogs2019['Odds'][i])

print("Average odds:", statistics.mean(odds))
print("Median odds:", statistics.median(odds))


print("\nUsing rolling 3 averages: \n")

print("Using 9 game standard predictor: ")
pred1 = model9.predict(dogs2019[feature3Cols])
print('Dog picks:',model9.score(dogs2019[feature3Cols], Ydogs2019))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2019, pred1))
print('')

print("Using 3 game standard predictor: ")
pred2 = model3.predict(dogs2019[feature3Cols])
print('Dog picks:',model3.score(dogs2019[feature3Cols], Ydogs2019))
print('Dog picks matrix:\n',confusion_matrix(Ydogs2019, pred2))
print('')

odds = []
for i, n in enumerate(pred1):
    if n == 1:
        odds.append(dogs2019['Odds'][i])

print("Average odds:", statistics.mean(odds))
print("Median odds:", statistics.median(odds))

odds = []
for i, n in enumerate(pred2):
    if n == 1:
        odds.append(dogs2019['Odds'][i])

print("Average odds:", statistics.mean(odds))
print("Median odds:", statistics.median(odds))



Using dog data[rolling 9 averages]: 

Using 9 game standard predictor: 


NameError: name 'model9' is not defined