In [126]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split, cross_val_score
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from datatools import DataGrapher


%matplotlib inline

In [127]:
#Bring in the data

nba = pd.read_csv('./data/nba_analysis_data.csv')


In [128]:
features = ['feature_1', 'feature_2', 'feature_3' ,'feature_4', 'spread']

# X = nba.{drop(columns = ['cover','home_win_margin', 
#                         'date', 'dataset','line_cv',
#                        'home_starter5',
#                        'away_starter2', 'away_starter3',
#                        'away_starter4', 'away_starter5',
#                        'ref_1', 'ref_3', 'crew_referees',
#                       'away_pace', 'away_spread',
#                         'away_line_cv',  'away_cover', ])}

X = nba[features]
y = nba['away_cover']

In [129]:

nba.shape

(10685, 159)

In [130]:
#splitting data for a train-test split
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y ,
                                                    stratify=y,
                                                    random_state = 23)

In [131]:
X_train.info(verbose = True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8013 entries, 4257 to 5115
Data columns (total 5 columns):
feature_1    8013 non-null int64
feature_2    8013 non-null int64
feature_3    8013 non-null int64
feature_4    8013 non-null int64
spread       8013 non-null float64
dtypes: float64(1), int64(4)
memory usage: 375.6 KB


In [132]:
np.where(X_train.values >= np.finfo(np.float64).max)


(array([], dtype=int64), array([], dtype=int64))

In [133]:
#scaling data to use in various other methods
ss = StandardScaler()
X_train_scaled = ss.fit_transform(X_train)
X_test_scaled = ss.transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  after removing the cwd from sys.path.


In [134]:
from sklearn.decomposition import PCA

In [135]:
pc = PCA(n_components=150)
X_train_pc = pc.fit_transform(X_train_scaled)
X_test_pc = pc.transform(X_test_scaled)


ValueError: n_components=150 must be between 0 and min(n_samples, n_features)=5 with svd_solver='full'

In [None]:
var_exp = pc.explained_variance_ratio_
print('Explained variance:            ', var_exp)

cum_var_exp = np.cumsum(var_exp)
print('Cumulative explained variance: ', cum_var_exp)

In [136]:
lr = LogisticRegressionCV(penalty='l2', solver ='saga', cv =3 )
lr.fit(X_train_scaled, y_train)
lr.score(X_train_scaled, y_train)

0.5150380631473855

In [137]:
lr.score(X_test_scaled, y_test)

0.49550898203592814

In [142]:
lr.coef_

array([[-0.04559294,  0.07002695,  0.0620399 ,  0.00260898, -0.0420752 ]])

In [138]:
y_hat_lr = lr.predict(X_test_scaled)
y_hat_lrp = lr.predict_proba(X_test_scaled)


In [139]:
print(classification_report(y_test, y_hat_lr))

              precision    recall  f1-score   support

           0       0.49      0.44      0.46      1324
           1       0.50      0.55      0.53      1348

   micro avg       0.50      0.50      0.50      2672
   macro avg       0.49      0.49      0.49      2672
weighted avg       0.49      0.50      0.49      2672



In [140]:
confusion_matrix( y_test, y_hat_lr)

array([[579, 745],
       [603, 745]])

### Naive Bayes

Model seems to work best with bare bones information.  More information confused the model.

In [None]:
nb = GaussianNB()
nb_params = {
    'var_smoothing' : [.2,.1,.05 ] 
    }

In [141]:
gs = GridSearchCV(nb, param_grid=nb_params, scoring='precision')
gs.fit(X_train_pc, y_train)
print(gs.best_score_)
print(gs.best_params_)



ValueError: Found input variables with inconsistent numbers of samples: [6899, 8013]

In [None]:
gs.score(X_test_pc, y_test)

In [None]:
y_hat_nb = gs.predict(X_test_pc)
y_hat_nbp = gs.predict_proba(X_test_pc)

In [None]:
print(classification_report(y_test, y_hat_nb))

In [None]:
confusion_matrix( y_test, y_hat_nb)

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical

In [None]:
model = Sequential()
model.add(Dense(250, 
                activation = 'relu',
                input_dim = X_train_pc.shape[1]))

model.add(Dense(200, activation = 'relu'))

model.add(Dense(10, activation = 'relu'))
model.add(Dense(1, activation = 'sigmoid')) #output layer 

In [None]:
model.compile(loss = 'binary_crossentropy', optimizer='adam',
              metrics = ['accuracy'])
model.fit(X_train_pc, 
          y_train, epochs =1, 
          batch_size= 64, 
          validation_data=(X_test_pc, y_test))

In [None]:
y_hat_nn = gs.predict(X_test_pc)
y_hat_nnp = gs.predict_proba(X_test_pc)

In [None]:
#modifying train and testing data to allow for changing prob thresholds and printing the 
#roc curves
X_test.loc[:, 'actual_y'] = y_test
X_test.loc[:, 'predicted_label_lr'] = y_hat_lr
X_test.loc[:, 'predicted_proba_lrp'] = y_hat_lrp[:, 1]

In [None]:
from sklearn.metrics import roc_curve, auc

In [None]:
fpr, tpr, _ = roc_curve(y_test, X_test['predicted_proba_lrp'])
roc_auc = auc(fpr, tpr)

In [None]:
# Plot 
plt.figure(figsize = (12,7))
line_width = 3

plt.plot(fpr, tpr, lw = line_width, color="darkorange", label = "ROC Curve %.2f" % roc_auc)
plt.plot([0,1], [0, 1], lw = line_width, linestyle="--", color="navy")

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
plt.legend(loc="lower right")
plt.show()

In [None]:
for prob in range(55, 62, 1):
    proba = prob/100
    
    X_test.loc[:, 'predicted_label']= X_test['predicted_proba_lrp'].map(
    lambda p: 1 if p > proba else 0)
    
#     X_train.loc[:, 'predicted_label']= X_train['predicted_proba'].map(
#     lambda p: 1 if p > proba else 0)
    
    X_test['predicted_label']
    print('Test  ' + str(proba))
    print(confusion_matrix(y_test, X_test['predicted_label']))
#     print(' Train  ' + str(proba))
#     print(confusion_matrix(y_train, X_train['predicted_label']))