In [1]:
from sklearn.ensemble import  AdaBoostRegressor
from sklearn.model_selection import train_test_split, cross_val_score
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV, LinearRegression
from sklearn.model_selection import GridSearchCV

%matplotlib inline

In [2]:
#Bring in the data

nba = pd.read_csv('./data/nba_analysis_data.csv')
nba.dropna(inplace = True)

In [3]:
nba.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8664 entries, 0 to 8665
Columns: 445 entries, dataset to playoff_game
dtypes: float64(133), int64(305), object(7)
memory usage: 29.5+ MB


In [4]:

X = nba.drop(columns = ['cover','home_win_margin', 'win_margin_ratio',
                        'date', 'dataset','line_cv',
                       'home_starter5',
                       'away_starter2', 'away_starter3',
                       'away_starter4', 'away_starter5',])
y = nba['home_win_margin']

In [5]:
#splitting data for a train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y , 
                                                    random_state = 22)

In [6]:
#scaling data to use in various other methods
ss = StandardScaler()
X_train_scaled = ss.fit_transform(X_train)
X_test_scaled = ss.transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  after removing the cwd from sys.path.


In [7]:
from sklearn.decomposition import PCA


In [8]:
pc = PCA(n_components=250)
X_train_pc = pc.fit_transform(X_train_scaled)
X_test_pc = pc.transform(X_test_scaled)



In [9]:

var_exp = pc.explained_variance_ratio_
print('Explained variance:            ', var_exp)

cum_var_exp = np.cumsum(var_exp)
print('Cumulative explained variance: ', cum_var_exp)

Explained variance:             [0.03410261 0.02774254 0.01595164 0.01540195 0.0142969  0.01134735
 0.01050508 0.00947517 0.00880181 0.00731863 0.00721296 0.00711256
 0.00646081 0.00634558 0.0061919  0.00554285 0.00515039 0.00489619
 0.00480902 0.00477799 0.00455166 0.00443749 0.00431546 0.00420614
 0.00406792 0.0040241  0.00395645 0.00387917 0.00382256 0.00378385
 0.00373743 0.00372805 0.0036923  0.00366949 0.00364636 0.00363081
 0.00360795 0.0035917  0.00354365 0.00353308 0.00352537 0.00347274
 0.00344412 0.00343559 0.00341598 0.00339789 0.00338009 0.00336576
 0.00335241 0.00335036 0.00333583 0.00329674 0.00328619 0.00327166
 0.00326258 0.00323771 0.00322579 0.0032083  0.00319262 0.00317961
 0.00316749 0.00316065 0.00314218 0.00313312 0.00312014 0.00310647
 0.00309075 0.0030733  0.00305908 0.00305422 0.00304674 0.00304021
 0.00303024 0.003015   0.00300318 0.00299465 0.00298872 0.00298523
 0.00297283 0.00296269 0.00295208 0.00293079 0.00292555 0.00291864
 0.00291449 0.00290367 0.00290

### Linear Regression

In [10]:
ols = LinearRegression()
ols.fit(X_train_pc, y_train)
ols.score(X_train_pc, y_train)

0.22083401862998364

In [11]:
ols.score(X_test_pc, y_test)

0.1650832356030174

### Boosting

In [12]:
ad = AdaBoostRegressor()
ad_params = { 'n_estimators'      : [300],
             'learning_rate'         : [ .01 ],
             
            }

In [13]:
gs = GridSearchCV(ad, param_grid= ad_params)
gs.fit(X_train_pc, y_train)
print(gs.best_params_)
print(gs.best_score_)



{'learning_rate': 0.01, 'n_estimators': 300}
0.10619444850466055


In [14]:
gs.score(X_test_pc,y_test)

0.11521751194940544

In [15]:
y_hat_ad = gs.predict(X_test_pc)

In [16]:
rr = RidgeCV(alphas = [40, 120, 80])


In [17]:
rr.fit(X_train_pc, y_train);

In [18]:
rr.score(X_train_pc, y_train)

0.2208192284952164

In [19]:
rr.score(X_test_pc, y_test)

0.1659186359925655

In [20]:
rr.coef_


array([ 0.80054185, -0.83160846, -0.94493229,  0.17471437,  0.20103435,
       -0.48665579,  0.30666461, -0.18353044, -0.02624847,  0.52716108,
        0.06608857,  0.25880263, -0.25356036, -0.02624351, -0.806171  ,
       -0.2903584 , -0.14836349, -0.44844854,  0.14500266, -0.14127142,
       -0.10048839,  0.41000735, -0.24667772,  0.07342789, -0.36754119,
        0.42295   , -0.22862622,  0.09554894,  0.11902731,  0.01985239,
        0.25684594, -0.06644396, -0.09808749, -0.20282481, -0.13960036,
       -0.09167693, -0.01997864, -0.15917271,  0.08196479,  0.09716334,
        0.14262949,  0.09890301,  0.39640326, -0.02605057, -0.22890647,
        0.23997529,  0.14816507, -0.0012484 ,  0.02514429, -0.13548979,
       -0.29215619,  0.05252665,  0.0911198 ,  0.24354481,  0.23339271,
        0.1170206 , -0.0541185 ,  0.15410152, -0.10281187, -0.1776935 ,
       -0.1278687 , -0.07052066,  0.01344015,  0.07248565, -0.13834624,
        0.12298454,  0.09486159,  0.01528831, -0.11127881, -0.00

In [21]:
y_hat_rr = rr.predict(X_test_pc)

In [22]:
rr.alpha_

120

In [23]:
y_hat_rr

array([ 9.10386629, -7.14411958, -0.9286215 , ...,  4.55931863,
        5.51830609, 11.27236493])

### Support Vector Machine

In [24]:
svm = SVR()
svm_params = {'kernel' : ['linear'],
              'C'      : [.1, .02, .05]}

In [25]:
gs = GridSearchCV(svm, param_grid= svm_params)
gs.fit(X_train_pc, y_train)
print(gs.best_params_)
print(gs.best_score_)



{'C': 0.02, 'kernel': 'linear'}
0.12626724560239155


In [26]:
gs.score(X_test_pc, y_test)

0.15770951112642861

In [27]:
y_hat_svm = gs.predict(X_test_pc)


### Neural Network

Going to fit a neural network with the X features.   I am not concerned about inference.  Accurate prediction is all that matters.  So am going to set up an neural network as one of the final predictors of Home Win Margin.

In [28]:
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Couldn't import dot_parser, loading of dot files will not be possible.


In [29]:
model = Sequential()
model.add(Dense(250, 
                activation = 'relu',
                input_dim = X_train_pc.shape[1]))
#model.add(Dense(18, activation = 'relu'))

model.add(Dense(1, activation = None)) #output layer 

In [30]:
model.compile(loss = 'mean_squared_error', optimizer='adam' )
model.fit(X_train_pc, 
          y_train, epochs =18, 
          batch_size= 64, 
          validation_data=(X_test_pc, y_test))

Train on 6498 samples, validate on 2166 samples
Epoch 1/18
Epoch 2/18
Epoch 3/18
Epoch 4/18
Epoch 5/18
Epoch 6/18
Epoch 7/18
Epoch 8/18
Epoch 9/18
Epoch 10/18
Epoch 11/18
Epoch 12/18
Epoch 13/18
Epoch 14/18
Epoch 15/18
Epoch 16/18
Epoch 17/18
Epoch 18/18


<keras.callbacks.History at 0x1a4be79b00>

In [31]:
y_hat_nn = model.predict(X_test_pc)
predictions = pd.DataFrame(y_hat_nn, columns= ['y_hat_nn'])
predictions

Unnamed: 0,y_hat_nn
0,12.932446
1,-10.875235
2,-1.042680
3,-16.517603
4,0.222306
5,9.225712
6,-0.991613
7,7.323125
8,8.114644
9,8.837726


In [32]:
predictions['y_hat_rr'] = y_hat_rr
predictions['y_hat_svm'] = y_hat_svm
predictions['y_hat_ad'] = y_hat_ad


In [33]:
predictions.head(10)

Unnamed: 0,y_hat_nn,y_hat_rr,y_hat_svm,y_hat_ad
0,12.932446,9.103866,9.055897,5.117321
1,-10.875235,-7.14412,-4.411716,-3.59689
2,-1.04268,-0.928621,-0.971642,-3.195982
3,-16.517603,-6.606336,-5.860773,-3.056594
4,0.222306,1.813518,1.517029,4.195392
5,9.225712,8.124438,8.548652,6.019881
6,-0.991613,9.572154,12.666657,7.602871
7,7.323125,6.78903,7.016878,0.542803
8,8.114644,2.582118,1.971503,6.819267
9,8.837726,9.775057,10.287439,6.124567


In [34]:
y_s = pd.DataFrame(y_test)
y_s.reset_index(inplace = True)
predictions['y_test'] = y_s['home_win_margin']
predictions.head()

Unnamed: 0,y_hat_nn,y_hat_rr,y_hat_svm,y_hat_ad,y_test
0,12.932446,9.103866,9.055897,5.117321,-18.0
1,-10.875235,-7.14412,-4.411716,-3.59689,12.0
2,-1.04268,-0.928621,-0.971642,-3.195982,-12.0
3,-16.517603,-6.606336,-5.860773,-3.056594,1.0
4,0.222306,1.813518,1.517029,4.195392,13.0


In [35]:
X_test.reset_index(inplace = True)
predictions['spread'] = X_test['spread']
predictions

Unnamed: 0,y_hat_nn,y_hat_rr,y_hat_svm,y_hat_ad,y_test,spread
0,12.932446,9.103866,9.055897,5.117321,-18.0,-2.0
1,-10.875235,-7.144120,-4.411716,-3.596890,12.0,7.0
2,-1.042680,-0.928621,-0.971642,-3.195982,-12.0,8.5
3,-16.517603,-6.606336,-5.860773,-3.056594,1.0,7.5
4,0.222306,1.813518,1.517029,4.195392,13.0,2.0
5,9.225712,8.124438,8.548652,6.019881,7.0,-7.5
6,-0.991613,9.572154,12.666657,7.602871,21.0,-10.0
7,7.323125,6.789030,7.016878,0.542803,8.0,-1.5
8,8.114644,2.582118,1.971503,6.819267,-1.0,-3.5
9,8.837726,9.775057,10.287439,6.124567,10.0,-12.5


In [36]:
predictions['y_hat_ave'] = (predictions.y_hat_nn + 
                           
                            predictions.y_hat_rr + 
                            predictions.y_hat_nn)/3

In [37]:
predictions.head(20)

Unnamed: 0,y_hat_nn,y_hat_rr,y_hat_svm,y_hat_ad,y_test,spread,y_hat_ave
0,12.932446,9.103866,9.055897,5.117321,-18.0,-2.0,11.656252
1,-10.875235,-7.14412,-4.411716,-3.59689,12.0,7.0,-9.63153
2,-1.04268,-0.928621,-0.971642,-3.195982,-12.0,8.5,-1.00466
3,-16.517603,-6.606336,-5.860773,-3.056594,1.0,7.5,-13.213847
4,0.222306,1.813518,1.517029,4.195392,13.0,2.0,0.75271
5,9.225712,8.124438,8.548652,6.019881,7.0,-7.5,8.858621
6,-0.991613,9.572154,12.666657,7.602871,21.0,-10.0,2.529642
7,7.323125,6.78903,7.016878,0.542803,8.0,-1.5,7.145094
8,8.114644,2.582118,1.971503,6.819267,-1.0,-3.5,6.270469
9,8.837726,9.775057,10.287439,6.124567,10.0,-12.5,9.15017


In [38]:
predictions['predict_svm'] =  (predictions.y_hat_svm + predictions.spread).map(
    lambda x: 0 if x <0 else 1)

In [39]:
predictions['predict_ad'] =  (predictions.y_hat_ad + predictions.spread).map(
    lambda x: 0 if x <0 else 1)

In [40]:
predictions['predict_rr'] =  (predictions.y_hat_rr + predictions.spread).map(
    lambda x: 0 if x <0 else 1)

In [41]:
predictions['predict_nn'] =  (predictions.y_hat_nn + predictions.spread).map(
    lambda x: 0 if x <0 else 1)

In [42]:
predictions['actual'] = (predictions.y_test + predictions.spread).map(lambda x: 0 if x< 0 else 1)

In [43]:
predictions['predict'] = (predictions['predict_svm']+ 
    predictions['predict_rr'] + predictions['predict_nn'])

In [44]:
predictions.head()

Unnamed: 0,y_hat_nn,y_hat_rr,y_hat_svm,y_hat_ad,y_test,spread,y_hat_ave,predict_svm,predict_ad,predict_rr,predict_nn,actual,predict
0,12.932446,9.103866,9.055897,5.117321,-18.0,-2.0,11.656252,1,1,1,1,0,3
1,-10.875235,-7.14412,-4.411716,-3.59689,12.0,7.0,-9.63153,1,1,0,0,1,1
2,-1.04268,-0.928621,-0.971642,-3.195982,-12.0,8.5,-1.00466,1,1,1,1,0,3
3,-16.517603,-6.606336,-5.860773,-3.056594,1.0,7.5,-13.213847,1,1,1,0,1,2
4,0.222306,1.813518,1.517029,4.195392,13.0,2.0,0.75271,1,1,1,1,1,3


In [45]:
print(classification_report(predictions.actual, predictions.predict_rr))

              precision    recall  f1-score   support

           0       0.51      0.52      0.52      1101
           1       0.50      0.49      0.49      1065

   micro avg       0.51      0.51      0.51      2166
   macro avg       0.51      0.51      0.51      2166
weighted avg       0.51      0.51      0.51      2166



In [46]:
confusion_matrix(predictions.actual, predictions.predict_svm)

array([[559, 542],
       [531, 534]])

In [47]:
predictions.predict.value_counts(normalize = True)

0    0.320868
3    0.265928
2    0.222530
1    0.190674
Name: predict, dtype: float64

In [48]:
predictions.groupby(['predict'])['actual'].mean()

predict
0    0.499281
1    0.450363
2    0.489627
3    0.513889
Name: actual, dtype: float64