In [140]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split, cross_val_score
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from datatools import DataGrapher


%matplotlib inline

In [55]:
# Load the analysis dataset and discard every row that has a missing value.
# Chaining .dropna() keeps the cell idempotent (no in-place mutation).
nba = pd.read_csv('./data/nba_analysis_data.csv').dropna()

In [104]:
 features = ['home_win_pct', 'away_win_pct',
             'eff_ratio1','eff_ratio2',
            'mov_5_fta', 'mov_5_away_fta',  
             'eff_ratio3', 'eff_ratio4', 
            'mov_5_home_score', 'mov_5_away_score',
            'mov_5_away_off_eff', 'mov_5_away_def_eff', 
            'mov_5_away_assists', 'mov_5_home_win_margin',
            'mov_5_win', 'home_ave_win_margin', 
            'mov_5_away_win_margin', 'home_win_pct', 'away_win_pct',
            'high_alt', 'home_ave_win_margin', 'away_ave_win_margin',
            'playoff_game', 'mov_5_3pa', 'mov_5_away_3pa',
            'mov_2_fta', 'mov_2_away_fta', 
           'mov_2_home_score', 'mov_2_away_score',
           'mov_2_tot', 'mov_2_away_total_reb',
           'mov_2_away_off_eff', 'mov_2_away_def_eff', 
            'mov_2_away_assists', 'mov_2_home_win_margin',
           'mov_2_win', 'mov_2_away_win_margin', 'mov_2_3pa', 
            'mov_2_away_3pa','away_rest', 'rest_days'
            ]
# Target vector: the `cover` column.
y = nba['cover']

# Design matrix: every column except the target and two columns dropped
# alongside it (presumably because they reveal the outcome -- TODO confirm).
# NOTE(review): the `features` list is not applied here; X keeps all
# remaining columns of `nba`.
X = nba.drop(['cover', 'home_win_margin', 'spread'], axis='columns')

In [105]:
# Hold out a test set. Stratifying on y keeps the class proportions the same
# in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=23
)

In [106]:
# Standardise the features. The scaler learns its statistics from the
# training split only and the same fitted scaler is then applied to both
# splits, so no test-set information reaches the fit.
ss = StandardScaler().fit(X_train)
X_train_scaled = ss.transform(X_train)
X_test_scaled = ss.transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  after removing the cwd from sys.path.


In [107]:
from sklearn.decomposition import PCA

In [108]:
# Project the scaled features onto the first 18 principal components.
# random_state pins the result when scikit-learn's 'auto' solver policy picks
# the randomized SVD; it has no effect with the exact solver. The seed matches
# the one used for the train/test split.
pc = PCA(n_components=18, random_state=23)
# Fit on the training split only, then reuse the fitted projection for test.
X_train_pc = pc.fit_transform(X_train_scaled)
X_test_pc = pc.transform(X_test_scaled)


In [109]:
# Fraction of total variance carried by each retained component, and the
# running total across components.
var_exp = pc.explained_variance_ratio_
cum_var_exp = var_exp.cumsum()

print('Explained variance:            ', var_exp)
print('Cumulative explained variance: ', cum_var_exp)

Explained variance:             [0.20946555 0.17131883 0.16094599 0.11182761 0.05200998 0.0471987
 0.04208947 0.03354746 0.0329927  0.03011603 0.02735211 0.0191045
 0.01657406 0.0112369  0.01046281 0.00693507 0.00625639 0.00423406]
Cumulative explained variance:  [0.20946555 0.38078438 0.54173037 0.65355797 0.70556795 0.75276665
 0.79485612 0.82840359 0.86139629 0.89151232 0.91886442 0.93796892
 0.95454298 0.96577989 0.97624269 0.98317777 0.98943416 0.99366822]


In [110]:
# L1-penalised logistic regression on the PCA components; the regularisation
# strength is selected by 3-fold cross-validation.
# The saga solver shuffles the data, so random_state is fixed to make the fit
# repeatable (same seed as the train/test split).
lr = LogisticRegressionCV(penalty='l1', solver='saga', cv=3, random_state=23)
lr.fit(X_train_pc, y_train)
# Mean accuracy on the training split (displayed as the cell output).
lr.score(X_train_pc, y_train)

0.5235869723966701

In [111]:
# Mean accuracy of the logistic regression on the held-out split.
lr.score(X=X_test_pc, y=y_test)

0.5238720981165134

In [112]:
# Class probabilities are computed for the TEST split ...
y_hat_lrp = lr.predict_proba(X_test_pc)
# ... while hard labels are computed for the TRAINING split.
# NOTE(review): the two arrays cover different splits; the report and
# confusion matrix that consume y_hat_lr therefore describe training data.
y_hat_lr = lr.predict(X_train_pc)

In [113]:
# Per-class precision / recall / F1 of the logistic regression on the
# training split (y_hat_lr holds training-split predictions).
print(classification_report(y_true=y_train, y_pred=y_hat_lr))

              precision    recall  f1-score   support

           0       0.52      1.00      0.69      3585
           1       0.00      0.00      0.00      3262

   micro avg       0.52      0.52      0.52      6847
   macro avg       0.26      0.50      0.34      6847
weighted avg       0.27      0.52      0.36      6847



  'precision', 'predicted', average, warn_for)


In [114]:
# Confusion matrix on the training split: rows are true classes, columns are
# predicted classes.
confusion_matrix(y_true=y_train, y_pred=y_hat_lr)

array([[3585,    0],
       [3262,    0]])

### Naive Bayes

The model seems to work best with bare-bones information; giving it more information confused it.

In [180]:
# Candidate var_smoothing values for the grid search, and the Gaussian
# naive Bayes estimator they will be tried on.
nb_params = {'var_smoothing': [0.3, 0.4]}
nb = GaussianNB()

In [181]:
# Grid-search var_smoothing for the naive Bayes model, selecting on precision.
# cv is pinned explicitly: the library default differs between scikit-learn
# releases, so leaving it implicit makes best_score_ depend on the installed
# version. 3 folds matches the logistic-regression cell.
gs = GridSearchCV(nb, param_grid=nb_params, scoring='precision', cv=3)
gs.fit(X_train_pc, y_train)
# Best mean cross-validated precision and the parameters that achieved it.
print(gs.best_score_)
print(gs.best_params_)

0.4957957999395462
{'var_smoothing': 0.3}




In [182]:
# Score on the held-out split. Because the search was built with
# scoring='precision', this value is precision, not accuracy.
gs.score(X=X_test_pc, y=y_test)

0.5211726384364821

In [183]:
# Test-split predictions from the refitted best naive Bayes estimator.
y_hat_nb = gs.best_estimator_.predict(X_test_pc)

In [184]:
# Per-class precision / recall / F1 of the naive Bayes model on the test split.
print(classification_report(y_true=y_test, y_pred=y_hat_nb))

              precision    recall  f1-score   support

           0       0.53      0.88      0.66      1196
           1       0.52      0.15      0.23      1087

   micro avg       0.53      0.53      0.53      2283
   macro avg       0.53      0.51      0.45      2283
weighted avg       0.53      0.53      0.46      2283



In [146]:
# Confusion matrix of the naive Bayes model on the test split: rows are true
# classes, columns are predicted classes.
# NOTE(review): this cell's execution count (146) predates the grid-search
# cells (180-184); re-run it so the matrix reflects the current y_hat_nb.
confusion_matrix(y_true=y_test, y_pred=y_hat_nb)

array([[920, 276],
       [804, 283]])