In [1]:
from sklearn.ensemble import  AdaBoostRegressor
from sklearn.model_selection import train_test_split, cross_val_score
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV, LinearRegression
from sklearn.model_selection import GridSearchCV

%matplotlib inline

In [2]:
#Bring in the data

nba = pd.read_csv('./data/nba_analysis_data.csv')
nba.dropna(inplace = True)

In [3]:
nba.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8668 entries, 7 to 10684
Columns: 450 entries, dataset to playoff_game
dtypes: float64(133), int64(310), object(7)
memory usage: 29.8+ MB


In [4]:

X = nba.drop(columns = ['cover','home_win_margin', 'win_margin_ratio',
                        'date', 'dataset','line_cv',
                       'home_starter5',
                       'away_starter2', 'away_starter3',
                       'away_starter4', 'away_starter5',])
y = nba['home_win_margin']

In [5]:
#splitting data for a train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y , 
                                                    random_state = 22)

In [6]:
#scaling data to use in various other methods
ss = StandardScaler()
X_train_scaled = ss.fit_transform(X_train)
X_test_scaled = ss.transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  after removing the cwd from sys.path.


In [7]:
from sklearn.decomposition import PCA


In [8]:
pc = PCA(n_components=250)
X_train_pc = pc.fit_transform(X_train_scaled)
X_test_pc = pc.transform(X_test_scaled)



In [9]:

var_exp = pc.explained_variance_ratio_
print('Explained variance:            ', var_exp)

cum_var_exp = np.cumsum(var_exp)
print('Cumulative explained variance: ', cum_var_exp)

Explained variance:             [0.03397576 0.02803731 0.01615593 0.01537717 0.01437822 0.01163618
 0.01052321 0.0093413  0.00898226 0.00740694 0.00729409 0.00712005
 0.0065715  0.00640375 0.00624423 0.00558314 0.00514161 0.00492726
 0.0047967  0.00465398 0.00453132 0.00444615 0.00432055 0.00421699
 0.00408308 0.00401065 0.00398041 0.00389624 0.00387989 0.00381424
 0.00375717 0.0037244  0.00369661 0.0036676  0.00365007 0.00362515
 0.00359749 0.00358318 0.00356252 0.00351218 0.0035086  0.00348606
 0.00347732 0.00346057 0.0034308  0.00341621 0.00340687 0.00336413
 0.00335932 0.00333651 0.00331044 0.00330513 0.00327745 0.00327149
 0.00326055 0.00325566 0.0032212  0.00321563 0.00321098 0.00319183
 0.0031786  0.00317604 0.00315544 0.00313361 0.00312997 0.00311423
 0.00311305 0.00309336 0.00307648 0.00306541 0.0030611  0.003047
 0.00304526 0.00303313 0.00302666 0.00301874 0.00300789 0.00299942
 0.00299554 0.00298121 0.00296422 0.00295309 0.00294523 0.00293707
 0.00292558 0.00292421 0.0029143

### Linear Regression

In [10]:
ols = LinearRegression()
ols.fit(X_train_pc, y_train)
ols.score(X_train_pc, y_train)

0.21914273242048277

In [11]:
ols.score(X_test_pc, y_test)

0.1655102722748597

### Boosting

In [12]:
ad = AdaBoostRegressor()
ad_params = { 'n_estimators'      : [300],
             'learning_rate'         : [ .01 ],
             
            }

In [13]:
gs = GridSearchCV(ad, param_grid= ad_params)
gs.fit(X_train_pc, y_train)
print(gs.best_params_)
print(gs.best_score_)



{'learning_rate': 0.01, 'n_estimators': 300}
0.10029527888796883


In [14]:
gs.score(X_test_pc,y_test)

0.1182594340047639

In [15]:
y_hat_ad = gs.predict(X_test_pc)

In [16]:
rr = RidgeCV(alphas = [40, 120, 80])


In [17]:
rr.fit(X_train_pc, y_train);

In [18]:
rr.score(X_train_pc, y_train)

0.21912713268816963

In [19]:
rr.score(X_test_pc, y_test)

0.16653212210174384

In [20]:
rr.coef_


array([ 7.56652318e-01, -8.24964943e-01,  9.47080441e-01, -2.97513231e-01,
        1.74778231e-01, -4.06442335e-01,  3.33005536e-01, -1.90079136e-01,
       -6.33419533e-02, -2.80730967e-01, -2.46952371e-01,  5.91970688e-01,
       -2.14386057e-01,  4.67057804e-01, -6.57929889e-01, -3.41440187e-01,
        6.34210793e-03, -4.14117960e-01,  1.98379368e-01, -7.26068308e-02,
        1.72044067e-01,  2.91662272e-01, -2.57331616e-01,  2.13486436e-01,
       -2.27809730e-01,  3.19907112e-01,  4.39712399e-01,  4.10390191e-02,
       -2.14809290e-01,  2.56857299e-01, -1.32293248e-01, -5.11214029e-02,
       -1.10922157e-01, -1.57581633e-01,  9.11721608e-02,  1.08314132e-01,
       -1.52128936e-02,  3.47265822e-01, -2.52910130e-01,  1.64869412e-02,
        5.55651691e-02,  2.64361208e-01,  8.02759870e-02,  2.67998300e-01,
       -1.18720600e-01,  2.17502973e-01, -3.36598564e-02, -6.03024256e-02,
       -7.23847346e-02, -3.52998421e-02,  2.30768697e-01,  3.65400131e-01,
       -5.33805672e-02,  

In [21]:
y_hat_rr = rr.predict(X_test_pc)

In [22]:
rr.alpha_

120

In [23]:
y_hat_rr

array([10.57024481, -0.4486354 , -3.45377681, ...,  0.945034  ,
        1.84048332,  3.86654862])

### Support Vector Machine

In [None]:
svm = SVR()
svm_params = {'kernel' : ['linear'],
              'C'      : [.1, .02, .05]}

In [None]:
gs = GridSearchCV(svm, param_grid= svm_params)
gs.fit(X_train_pc, y_train)
print(gs.best_params_)
print(gs.best_score_)



In [None]:
gs.score(X_test_pc, y_test)

In [None]:
y_hat_svm = gs.predict(X_test_pc)


### Neural Network

Going to fit a neural network with the X features.   I am not concerned about inference.  Accurate prediction is all that matters.  So am going to set up an neural network as one of the final predictors of Home Win Margin.

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical

In [None]:
model = Sequential()
model.add(Dense(250, 
                activation = 'relu',
                input_dim = X_train_pc.shape[1]))
#model.add(Dense(18, activation = 'relu'))

model.add(Dense(1, activation = None)) #output layer 

In [44]:
model.compile(loss = 'mean_squared_error', optimizer='adam' )
model.fit(X_train_pc, 
          y_train, epochs =18, 
          batch_size= 64, 
          validation_data=(X_test_pc, y_test))

Train on 6847 samples, validate on 2283 samples
Epoch 1/18
Epoch 2/18
Epoch 3/18
Epoch 4/18
Epoch 5/18
Epoch 6/18
Epoch 7/18
Epoch 8/18
Epoch 9/18
Epoch 10/18
Epoch 11/18
Epoch 12/18
Epoch 13/18
Epoch 14/18
Epoch 15/18
Epoch 16/18
Epoch 17/18
Epoch 18/18


<keras.callbacks.History at 0x1a4049f320>

In [45]:
y_hat_nn = model.predict(X_test_pc)
predictions = pd.DataFrame(y_hat_nn, columns= ['y_hat_nn'])
predictions

Unnamed: 0,y_hat_nn
0,7.768983
1,4.021528
2,8.243096
3,11.663681
4,5.408768
5,-9.989044
6,-0.168817
7,5.903172
8,1.811391
9,1.966661


In [46]:
predictions['y_hat_rr'] = y_hat_rr
predictions['y_hat_svm'] = y_hat_svm
predictions['y_hat_ad'] = y_hat_ad


In [47]:
predictions.head(10)

Unnamed: 0,y_hat_nn,y_hat_rr,y_hat_svm,y_hat_ad
0,7.768983,8.509132,8.250464,4.735949
1,4.021528,3.897451,4.478983,2.8775
2,8.243096,7.939707,8.107149,3.555861
3,11.663681,11.328191,11.579659,10.01922
4,5.408768,5.281882,6.085489,8.884793
5,-9.989044,-10.463134,-11.088128,-4.960184
6,-0.168817,0.824683,0.35211,2.206829
7,5.903172,5.726796,5.802836,-0.168979
8,1.811391,2.196907,2.59939,6.310578
9,1.966661,1.828906,1.874987,3.826923


In [48]:
y_s = pd.DataFrame(y_test)
y_s.reset_index(inplace = True)
predictions['y_test'] = y_s['home_win_margin']
predictions.head()

Unnamed: 0,y_hat_nn,y_hat_rr,y_hat_svm,y_hat_ad,y_test
0,7.768983,8.509132,8.250464,4.735949,15.0
1,4.021528,3.897451,4.478983,2.8775,21.0
2,8.243096,7.939707,8.107149,3.555861,15.0
3,11.663681,11.328191,11.579659,10.01922,9.0
4,5.408768,5.281882,6.085489,8.884793,7.0


In [50]:
X_test.reset_index(inplace = True)
predictions['spread'] = X_test['spread']
predictions

Unnamed: 0,y_hat_nn,y_hat_rr,y_hat_svm,y_hat_ad,y_test,spread
0,7.768983,8.509132,8.250464,4.735949,15.0,-8.0
1,4.021528,3.897451,4.478983,2.877500,21.0,-4.0
2,8.243096,7.939707,8.107149,3.555861,15.0,-8.0
3,11.663681,11.328191,11.579659,10.019220,9.0,-12.0
4,5.408768,5.281882,6.085489,8.884793,7.0,-5.5
5,-9.989044,-10.463134,-11.088128,-4.960184,-16.0,8.5
6,-0.168817,0.824683,0.352110,2.206829,-11.0,-2.5
7,5.903172,5.726796,5.802836,-0.168979,2.0,-6.5
8,1.811391,2.196907,2.599390,6.310578,25.0,-2.0
9,1.966661,1.828906,1.874987,3.826923,-15.0,-2.5


In [51]:
predictions['y_hat_ave'] = (predictions.y_hat_nn + 
                           
                            predictions.y_hat_rr + 
                            predictions.y_hat_nn)/3

In [52]:
predictions.head(20)

Unnamed: 0,y_hat_nn,y_hat_rr,y_hat_svm,y_hat_ad,y_test,spread,y_hat_ave
0,7.768983,8.509132,8.250464,4.735949,15.0,-8.0,8.015699
1,4.021528,3.897451,4.478983,2.8775,21.0,-4.0,3.980169
2,8.243096,7.939707,8.107149,3.555861,15.0,-8.0,8.141967
3,11.663681,11.328191,11.579659,10.01922,9.0,-12.0,11.551851
4,5.408768,5.281882,6.085489,8.884793,7.0,-5.5,5.366473
5,-9.989044,-10.463134,-11.088128,-4.960184,-16.0,8.5,-10.147074
6,-0.168817,0.824683,0.35211,2.206829,-11.0,-2.5,0.16235
7,5.903172,5.726796,5.802836,-0.168979,2.0,-6.5,5.84438
8,1.811391,2.196907,2.59939,6.310578,25.0,-2.0,1.939896
9,1.966661,1.828906,1.874987,3.826923,-15.0,-2.5,1.920743


In [53]:
predictions['predict_svm'] =  (predictions.y_hat_svm + predictions.spread).map(
    lambda x: 0 if x <0 else 1)

In [54]:
predictions['predict_ad'] =  (predictions.y_hat_ad + predictions.spread).map(
    lambda x: 0 if x <0 else 1)

In [55]:
predictions['predict_rr'] =  (predictions.y_hat_rr + predictions.spread).map(
    lambda x: 0 if x <0 else 1)

In [56]:
predictions['predict_nn'] =  (predictions.y_hat_nn + predictions.spread).map(
    lambda x: 0 if x <0 else 1)

In [57]:
predictions['actual'] = (predictions.y_test + predictions.spread).map(lambda x: 0 if x< 0 else 1)

In [58]:
predictions['predict'] = (predictions['predict_svm']+ 
    predictions['predict_rr'] + predictions['predict_nn'])

In [59]:
predictions.head()

Unnamed: 0,y_hat_nn,y_hat_rr,y_hat_svm,y_hat_ad,y_test,spread,y_hat_ave,predict_svm,predict_ad,predict_rr,predict_nn,actual,predict
0,7.768983,8.509132,8.250464,4.735949,15.0,-8.0,8.015699,1,0,1,0,1,2
1,4.021528,3.897451,4.478983,2.8775,21.0,-4.0,3.980169,1,0,0,1,1,2
2,8.243096,7.939707,8.107149,3.555861,15.0,-8.0,8.141967,1,0,0,1,1,2
3,11.663681,11.328191,11.579659,10.01922,9.0,-12.0,11.551851,0,0,0,0,0,0
4,5.408768,5.281882,6.085489,8.884793,7.0,-5.5,5.366473,1,1,0,0,1,1


In [64]:
print(classification_report(predictions.actual, predictions.predict_rr))

              precision    recall  f1-score   support

           0       0.49      0.70      0.57      1119
           1       0.50      0.29      0.36      1164

   micro avg       0.49      0.49      0.49      2283
   macro avg       0.49      0.49      0.47      2283
weighted avg       0.49      0.49      0.47      2283



In [65]:
confusion_matrix(predictions.actual, predictions.predict_svm)

array([[778, 341],
       [812, 352]])

In [66]:
predictions.predict.value_counts(normalize = True)

0    0.515550
1    0.197547
3    0.168200
2    0.118703
Name: predict, dtype: float64

In [67]:
predictions.groupby(['predict'])['actual'].mean()

predict
0    0.508071
1    0.516630
2    0.509225
3    0.507812
Name: actual, dtype: float64