In [1]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split, cross_val_score
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from datatools import DataGrapher


%matplotlib inline

In [2]:
# Load the NBA analysis dataset from the local data directory.
nba = pd.read_csv(filepath_or_buffer='./data/nba_analysis_data.csv')


In [3]:


# Columns left out of the feature matrix: the target ('cover') plus the other
# columns listed here.
# NOTE(review): presumably outcome-derived or identifier columns — confirm.
excluded_columns = [
    'cover', 'home_win_margin', 'win_margin_ratio',
    'date', 'dataset', 'line_cv',
    'home_starter5',
    'away_starter2', 'away_starter3',
    'away_starter4', 'away_starter5',
]

# Target is the 'cover' column; features are every column not excluded above.
y = nba['cover']
X = nba.drop(columns=excluded_columns)

In [4]:
# Stratified train/test split on the target, using the default test fraction
# and a fixed seed so the partition is repeatable.
split_parts = train_test_split(X, y, stratify=y, random_state=23)
X_train, X_test, y_train, y_test = split_parts

In [5]:
# Full per-column dtype listing of the training features.
X_train.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6499 entries, 2375 to 1024
Data columns (total 434 columns):
spread                               float64
total                                float64
moneyline                            float64
away_pace                            float64
mov_5_1q                             float64
mov_3_1q                             float64
mov_5_2q                             float64
mov_3_2q                             float64
mov_5_3q                             float64
mov_3_3q                             float64
mov_5_4q                             float64
mov_3_4q                             float64
mov_5_ot1                            float64
mov_3_ot1                            float64
mov_5_ot2                            float64
mov_3_ot2                            float64
mov_5_ot3                            float64
mov_3_ot3                            float64
mov_5_ot4                            float64
mov_3_ot4                         

In [6]:
# Locate every non-finite entry in the training features; the result is a
# (row_indices, column_indices) pair and is empty when the data is clean.
# np.isfinite flags NaN, +inf and -inf. Comparing against
# np.finfo(np.float64).max would only catch +inf, leaving NaN and -inf
# undetected.
np.where(~np.isfinite(X_train.values))


(array([], dtype=int64), array([], dtype=int64))

In [7]:
# Standardise the features for the downstream models. The scaler is fitted on
# the training split only; the test split is transformed with those same
# fitted statistics.
ss = StandardScaler()
X_train_scaled = ss.fit_transform(X=X_train)
X_test_scaled = ss.transform(X=X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  after removing the cwd from sys.path.


In [8]:
from sklearn.decomposition import PCA

In [9]:
# Reduce the standardised features to 330 principal components. The projection
# is fitted on the training split and then applied unchanged to the test split.
pc = PCA(n_components=330)
X_train_pc = pc.fit_transform(X=X_train_scaled)
X_test_pc = pc.transform(X=X_test_scaled)


In [10]:
# Share of variance explained by each principal component, and the running
# total across components.
var_exp = pc.explained_variance_ratio_
cum_var_exp = var_exp.cumsum()

print('Explained variance:            ', var_exp)
print('Cumulative explained variance: ', cum_var_exp)

Explained variance:             [0.03428207 0.02814723 0.01611594 0.01551474 0.01415412 0.01166162
 0.01061315 0.00941504 0.00881336 0.0074524  0.00730858 0.00717563
 0.00656968 0.00631975 0.00613764 0.00553475 0.00524219 0.00496107
 0.00492845 0.00468393 0.00462979 0.00442545 0.00430439 0.00421197
 0.00407963 0.00404056 0.00395464 0.00394198 0.00388035 0.00382944
 0.0038012  0.00376271 0.00372635 0.00369966 0.00365821 0.00365213
 0.00363193 0.00360109 0.00359561 0.00356649 0.00355393 0.00352551
 0.00348932 0.0034564  0.00345162 0.00342357 0.00338426 0.00337446
 0.00336737 0.00333338 0.00331229 0.00330739 0.00330136 0.00329358
 0.00327154 0.00324499 0.00322702 0.00322214 0.00321521 0.00320364
 0.00319419 0.00317852 0.003167   0.00315084 0.00314107 0.00312933
 0.00311656 0.00311274 0.003103   0.00308245 0.00306772 0.00306629
 0.00305851 0.00304339 0.00304085 0.00302522 0.00301146 0.00299624
 0.00298338 0.00298013 0.00297384 0.00296461 0.00295362 0.00295236
 0.00293658 0.00292384 0.00291

In [11]:
# L2-penalised logistic regression on the PCA features, with the
# regularisation strength chosen by 3-fold cross-validation.
# The 'saga' solver shuffles the data, so random_state is fixed to make the
# fitted coefficients and the scores below reproducible between runs.
lr = LogisticRegressionCV(penalty='l2', solver='saga', cv=3, random_state=23)
lr.fit(X_train_pc, y_train)

# Accuracy on the training split (compare with the test-split score).
lr.score(X_train_pc, y_train)



0.5594706877981228

In [12]:
# Accuracy of the logistic regression on the held-out test split.
lr.score(X=X_test_pc, y=y_test)

0.5163820950622982

In [61]:
# Logistic-regression outputs for the test split:
# per-class probabilities first, then the hard class labels.
y_hat_lrp = lr.predict_proba(X_test_pc)
y_hat_lr = lr.predict(X_test_pc)


In [67]:
 # Adds column 1 of the predict_proba output to y_test, element by element,
 # and stores the sum in `score`.
 # NOTE(review): if y_test holds 0/1 labels, the integer part of each value is
 # the true label and the fractional part is the predicted probability —
 # presumably an inspection aid rather than a metric; confirm intent.
 score = y_hat_lrp[:, 1] + y_test

2855    0.462609
4323    0.453814
5774    0.463276
5793    0.464645
1156    0.443722
1950    1.485029
3696    0.475750
5122    1.472575
2352    0.467191
64      1.477873
7708    0.459906
8465    0.484268
708     0.471472
1530    0.510126
4242    0.482037
7154    1.479633
6746    0.479192
2712    0.490890
5787    0.479043
2300    0.472360
5815    1.500985
1557    0.499089
7163    1.474828
6872    0.473589
3498    0.487433
3927    1.473862
6427    0.440502
3489    1.483612
936     1.460307
1248    1.471131
          ...   
5645    0.526778
6523    1.469095
2750    1.490958
1636    1.451554
6258    1.518902
6301    1.455190
6848    1.476479
7107    1.492616
4068    0.445428
8187    1.538218
3123    1.537059
8301    1.476065
6307    0.457267
7725    1.450451
5642    0.460767
7795    0.479532
6071    0.462337
8208    0.465636
1296    0.467842
846     1.471065
5839    1.474664
391     1.475101
5941    1.479365
868     1.495089
8536    0.464794
3775    0.489455
5914    1.483446
2543    0.4746

In [14]:
# Per-class precision / recall / F1 for the logistic-regression predictions.
lr_report = classification_report(y_true=y_test, y_pred=y_hat_lr)
print(lr_report)

              precision    recall  f1-score   support

           0       0.52      0.86      0.65      1129
           1       0.48      0.14      0.22      1038

   micro avg       0.52      0.52      0.52      2167
   macro avg       0.50      0.50      0.43      2167
weighted avg       0.50      0.52      0.44      2167



In [15]:
# Confusion matrix for the logistic regression
# (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_hat_lr)

array([[971, 158],
       [890, 148]])

### Naive Bayes

The model seems to work best with bare-bones information. Giving it more information confused the model.

In [16]:
# Candidate var_smoothing values for the grid search, and the Gaussian Naive
# Bayes estimator they will be applied to.
nb_params = dict(var_smoothing=[0.2, 0.3, 0.4])
nb = GaussianNB()

In [17]:
# Grid-search var_smoothing for the Naive Bayes model, selecting on precision,
# then report the best cross-validated score and the winning parameters.
gs = GridSearchCV(estimator=nb, param_grid=nb_params, scoring='precision')
gs.fit(X_train_pc, y_train)
print(gs.best_score_, gs.best_params_, sep='\n')



0.502518955170822
{'var_smoothing': 0.3}


In [18]:
# Test-split score of the tuned Naive Bayes model. The grid search was built
# with scoring='precision', so this value is precision rather than accuracy.
gs.score(X=X_test_pc, y=y_test)

0.48314606741573035

In [19]:
# Hard class predictions on the test split from the best Naive Bayes estimator
# found by the grid search.
y_hat_nb = gs.predict(X=X_test_pc)

In [20]:
# Per-class precision / recall / F1 for the Naive Bayes predictions.
nb_report = classification_report(y_true=y_test, y_pred=y_hat_nb)
print(nb_report)

              precision    recall  f1-score   support

           0       0.52      0.96      0.68      1129
           1       0.48      0.04      0.08      1038

   micro avg       0.52      0.52      0.52      2167
   macro avg       0.50      0.50      0.38      2167
weighted avg       0.50      0.52      0.39      2167



In [21]:
# Confusion matrix for Naive Bayes
# (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_hat_nb)

array([[1083,   46],
       [ 995,   43]])

In [22]:
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Couldn't import dot_parser, loading of dot files will not be possible.


In [38]:
# Fully connected binary classifier over the PCA features:
# four ReLU hidden layers (250 -> 200 -> 50 -> 10) feeding one sigmoid unit.
n_features = X_train_pc.shape[1]

model = Sequential([
    Dense(250, activation='relu', input_dim=n_features),
    Dense(200, activation='relu'),
    Dense(50, activation='relu'),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid'),  # output layer
])

In [42]:
# Configure the network for binary classification (Adam optimiser,
# cross-entropy loss, accuracy metric) and train for 3 epochs in batches of 64.
# NOTE(review): the test split doubles as validation data here, so the
# per-epoch validation metrics are test-set metrics.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.fit(x=X_train_pc,
          y=y_train,
          batch_size=64,
          epochs=3,
          validation_data=(X_test_pc, y_test))

Train on 6499 samples, validate on 2167 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1a41bb9f98>

In [29]:
# Class predictions from the Keras network on the test-split PCA features.
# This uses `model` (the neural network), not `gs`: `gs` is the Naive Bayes
# grid search and would simply reproduce y_hat_nb.
# The single sigmoid output unit yields one probability per row, shape (n, 1);
# threshold at 0.5 and flatten to a 1-D array of 0/1 labels so the result has
# the same form as the other y_hat_* arrays.
y_hat_nn = (model.predict(X_test_pc) > 0.5).astype(int).ravel()