In [1]:
from sklearn.ensemble import  AdaBoostRegressor
from sklearn.model_selection import train_test_split, cross_val_score
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV, LinearRegression
from sklearn.model_selection import GridSearchCV

%matplotlib inline

In [2]:
#Bring in the data

nba = pd.read_csv('./data/nba_analysis_data.csv')
nba.dropna(inplace = True)

In [3]:
nba.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9130 entries, 0 to 9129
Data columns (total 27 columns):
cover                  9130 non-null int64
home_win_margin        9130 non-null float64
spread                 9130 non-null float64
away_rest              9130 non-null int64
rest_days              9130 non-null int64
home_win_pct           9130 non-null float64
away_win_pct           9130 non-null float64
home_ave_win_margin    9130 non-null float64
away_ave_win_margin    9130 non-null float64
mov_5_oeff             9130 non-null float64
mov_5_deff             9130 non-null float64
mov_3_oeff             9130 non-null float64
mov_3_deff             9130 non-null float64
mov_5_away_off_eff     9130 non-null float64
mov_5_away_def_eff     9130 non-null float64
mov_3_away_off_eff     9130 non-null float64
mov_3_away_def_eff     9130 non-null float64
eff_ratio1             9130 non-null float64
eff_ratio2             9130 non-null float64
eff_ratio3             9130 non-null float64

In [4]:
# features = ['eff_ratio1', 'mov_5_fta', 'mov_5_away_fta',  
#             'eff_ratio2', 'eff_ratio3','eff_ratio4',
#             'away_rest', 'rest_days',
#             'spread',
#             'mov_5_home_score', 'mov_5_away_score',
#             'mov_5_away_off_eff', 'mov_5_away_def_eff', 
#             'mov_5_away_assists', 'mov_5_home_win_margin',
#             'mov_5_win', 'mov_5_away_win_margin', 'home_win_pct', 'away_win_pct',
#             'high_alt', 'home_ave_win_margin', 'away_ave_win_margin',
#             'playoff_game',  'mov_5_3pa', 
#            'mov_5_away_3pa'
#             ]

X = nba.drop(columns = ['cover', 'home_win_margin'])
y = nba['home_win_margin']

In [5]:
#splitting data for a train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y , 
                                                    random_state = 22)

In [6]:
#scaling data to use in various other methods
ss = StandardScaler()
X_train_scaled = ss.fit_transform(X_train)
X_test_scaled = ss.transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  after removing the cwd from sys.path.


In [7]:
from sklearn.decomposition import PCA


In [8]:
pc = PCA(n_components=18)
X_train_pc = pc.fit_transform(X_train_scaled)
X_test_pc = pc.transform(X_test_scaled)



In [9]:

var_exp = pc.explained_variance_ratio_
print('Explained variance:            ', var_exp)

cum_var_exp = np.cumsum(var_exp)
print('Cumulative explained variance: ', cum_var_exp)

Explained variance:             [0.22181338 0.169172   0.15347149 0.10645017 0.04970854 0.04569586
 0.04115408 0.03803325 0.03161858 0.02862201 0.02746335 0.0185453
 0.01599081 0.01097955 0.01032995 0.00823989 0.00668203 0.00597426]
Cumulative explained variance:  [0.22181338 0.39098538 0.54445687 0.65090705 0.70061558 0.74631144
 0.78746552 0.82549877 0.85711735 0.88573936 0.91320271 0.93174801
 0.94773882 0.95871837 0.96904832 0.97728822 0.98397025 0.98994451]


### Linear Regression

In [13]:
ols = LinearRegression()
ols.fit(X_train_pc, y_train)
ols.score(X_train_pc, y_train)

0.21376964855052838

In [14]:
ols.score(X_test_pc, y_test)

0.2056573375867633

### Boosting

In [10]:
ad = AdaBoostRegressor()
ad_params = { 'n_estimators'      : [300],
             'learning_rate'         : [ .01 ],
             
            }

In [16]:
gs = GridSearchCV(ad, param_grid= ad_params)
gs.fit(X_train_pc, y_train)
print(gs.best_params_)
print(gs.best_score_)



{'learning_rate': 0.01, 'n_estimators': 300}
0.15855691227734922


In [17]:
gs.score(X_test_pc,y_test)

0.1600766613301562

In [18]:
y_hat_ad = gs.predict(X_test_pc)

In [29]:
rr = RidgeCV(alphas = [40, 120, 80])


In [30]:
rr.fit(X_train_pc, y_train);

In [31]:
rr.score(X_train_pc, y_train)

0.21375139001826404

In [32]:
rr.score(X_test_pc, y_test)

0.20564172584878682

In [33]:
rr.coef_


array([-1.95079804, -0.95772355, -0.07107271,  0.01227642,  0.26335072,
        0.37562933,  1.14614004, -2.56047187, -0.30018226, -0.47758479,
       -0.73279944,  0.06253793, -0.18357786,  0.83741482, -0.32583606,
        3.98886686,  0.6624237 ,  0.16631825])

In [34]:
y_hat_rr = rr.predict(X_test_pc)

In [35]:
rr.alpha_

40

In [36]:
y_hat_rr

array([ 8.50913152,  3.8974507 ,  7.93970728, ...,  8.53971065,
        5.91411621, -6.9204938 ])

### Support Vector Machine

In [37]:
svm = SVR()
svm_params = {'kernel' : ['linear'],
              'C'      : [.1, .02, .05]}

In [38]:
gs = GridSearchCV(svm, param_grid= svm_params)
gs.fit(X_train_pc, y_train)
print(gs.best_params_)
print(gs.best_score_)



{'C': 0.1, 'kernel': 'linear'}
0.20690488288602557


In [39]:
gs.score(X_test_pc, y_test)

0.20452498497067303

In [40]:
y_hat_svm = gs.predict(X_test_pc)


### Neural Network

Going to fit a neural network with the X features.   I am not concerned about inference.  Accurate prediction is all that matters.  So am going to set up an neural network as one of the final predictors of Home Win Margin.

In [41]:
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Couldn't import dot_parser, loading of dot files will not be possible.


In [42]:
model = Sequential()
model.add(Dense(18, 
                activation = 'relu',
                input_dim = X_train_pc.shape[1]))
#model.add(Dense(18, activation = 'relu'))

model.add(Dense(1, activation = None)) #output layer 

In [44]:
model.compile(loss = 'mean_squared_error', optimizer='adam' )
model.fit(X_train_pc, 
          y_train, epochs =18, 
          batch_size= 64, 
          validation_data=(X_test_pc, y_test))

Train on 6847 samples, validate on 2283 samples
Epoch 1/18
Epoch 2/18
Epoch 3/18
Epoch 4/18
Epoch 5/18
Epoch 6/18
Epoch 7/18
Epoch 8/18
Epoch 9/18
Epoch 10/18
Epoch 11/18
Epoch 12/18
Epoch 13/18
Epoch 14/18
Epoch 15/18
Epoch 16/18
Epoch 17/18
Epoch 18/18


<keras.callbacks.History at 0x1a4049f320>

In [45]:
y_hat_nn = model.predict(X_test_pc)
predictions = pd.DataFrame(y_hat_nn, columns= ['y_hat_nn'])
predictions

Unnamed: 0,y_hat_nn
0,7.768983
1,4.021528
2,8.243096
3,11.663681
4,5.408768
5,-9.989044
6,-0.168817
7,5.903172
8,1.811391
9,1.966661


In [46]:
predictions['y_hat_rr'] = y_hat_rr
predictions['y_hat_svm'] = y_hat_svm
predictions['y_hat_ad'] = y_hat_ad


In [47]:
predictions.head(10)

Unnamed: 0,y_hat_nn,y_hat_rr,y_hat_svm,y_hat_ad
0,7.768983,8.509132,8.250464,4.735949
1,4.021528,3.897451,4.478983,2.8775
2,8.243096,7.939707,8.107149,3.555861
3,11.663681,11.328191,11.579659,10.01922
4,5.408768,5.281882,6.085489,8.884793
5,-9.989044,-10.463134,-11.088128,-4.960184
6,-0.168817,0.824683,0.35211,2.206829
7,5.903172,5.726796,5.802836,-0.168979
8,1.811391,2.196907,2.59939,6.310578
9,1.966661,1.828906,1.874987,3.826923


In [48]:
y_s = pd.DataFrame(y_test)
y_s.reset_index(inplace = True)
predictions['y_test'] = y_s['home_win_margin']
predictions.head()

Unnamed: 0,y_hat_nn,y_hat_rr,y_hat_svm,y_hat_ad,y_test
0,7.768983,8.509132,8.250464,4.735949,15.0
1,4.021528,3.897451,4.478983,2.8775,21.0
2,8.243096,7.939707,8.107149,3.555861,15.0
3,11.663681,11.328191,11.579659,10.01922,9.0
4,5.408768,5.281882,6.085489,8.884793,7.0


In [50]:
X_test.reset_index(inplace = True)
predictions['spread'] = X_test['spread']
predictions

Unnamed: 0,y_hat_nn,y_hat_rr,y_hat_svm,y_hat_ad,y_test,spread
0,7.768983,8.509132,8.250464,4.735949,15.0,-8.0
1,4.021528,3.897451,4.478983,2.877500,21.0,-4.0
2,8.243096,7.939707,8.107149,3.555861,15.0,-8.0
3,11.663681,11.328191,11.579659,10.019220,9.0,-12.0
4,5.408768,5.281882,6.085489,8.884793,7.0,-5.5
5,-9.989044,-10.463134,-11.088128,-4.960184,-16.0,8.5
6,-0.168817,0.824683,0.352110,2.206829,-11.0,-2.5
7,5.903172,5.726796,5.802836,-0.168979,2.0,-6.5
8,1.811391,2.196907,2.599390,6.310578,25.0,-2.0
9,1.966661,1.828906,1.874987,3.826923,-15.0,-2.5


In [51]:
predictions['y_hat_ave'] = (predictions.y_hat_nn + 
                           
                            predictions.y_hat_rr + 
                            predictions.y_hat_nn)/3

In [52]:
predictions.head(20)

Unnamed: 0,y_hat_nn,y_hat_rr,y_hat_svm,y_hat_ad,y_test,spread,y_hat_ave
0,7.768983,8.509132,8.250464,4.735949,15.0,-8.0,8.015699
1,4.021528,3.897451,4.478983,2.8775,21.0,-4.0,3.980169
2,8.243096,7.939707,8.107149,3.555861,15.0,-8.0,8.141967
3,11.663681,11.328191,11.579659,10.01922,9.0,-12.0,11.551851
4,5.408768,5.281882,6.085489,8.884793,7.0,-5.5,5.366473
5,-9.989044,-10.463134,-11.088128,-4.960184,-16.0,8.5,-10.147074
6,-0.168817,0.824683,0.35211,2.206829,-11.0,-2.5,0.16235
7,5.903172,5.726796,5.802836,-0.168979,2.0,-6.5,5.84438
8,1.811391,2.196907,2.59939,6.310578,25.0,-2.0,1.939896
9,1.966661,1.828906,1.874987,3.826923,-15.0,-2.5,1.920743


In [53]:
predictions['predict_svm'] =  (predictions.y_hat_svm + predictions.spread).map(
    lambda x: 0 if x <0 else 1)

In [54]:
predictions['predict_ad'] =  (predictions.y_hat_ad + predictions.spread).map(
    lambda x: 0 if x <0 else 1)

In [55]:
predictions['predict_rr'] =  (predictions.y_hat_rr + predictions.spread).map(
    lambda x: 0 if x <0 else 1)

In [56]:
predictions['predict_nn'] =  (predictions.y_hat_nn + predictions.spread).map(
    lambda x: 0 if x <0 else 1)

In [57]:
predictions['actual'] = (predictions.y_test + predictions.spread).map(lambda x: 0 if x< 0 else 1)

In [58]:
predictions['predict'] = (predictions['predict_svm']+ 
    predictions['predict_rr'] + predictions['predict_nn'])

In [59]:
predictions.head()

Unnamed: 0,y_hat_nn,y_hat_rr,y_hat_svm,y_hat_ad,y_test,spread,y_hat_ave,predict_svm,predict_ad,predict_rr,predict_nn,actual,predict
0,7.768983,8.509132,8.250464,4.735949,15.0,-8.0,8.015699,1,0,1,0,1,2
1,4.021528,3.897451,4.478983,2.8775,21.0,-4.0,3.980169,1,0,0,1,1,2
2,8.243096,7.939707,8.107149,3.555861,15.0,-8.0,8.141967,1,0,0,1,1,2
3,11.663681,11.328191,11.579659,10.01922,9.0,-12.0,11.551851,0,0,0,0,0,0
4,5.408768,5.281882,6.085489,8.884793,7.0,-5.5,5.366473,1,1,0,0,1,1


In [64]:
print(classification_report(predictions.actual, predictions.predict_rr))

              precision    recall  f1-score   support

           0       0.49      0.70      0.57      1119
           1       0.50      0.29      0.36      1164

   micro avg       0.49      0.49      0.49      2283
   macro avg       0.49      0.49      0.47      2283
weighted avg       0.49      0.49      0.47      2283



In [65]:
confusion_matrix(predictions.actual, predictions.predict_svm)

array([[778, 341],
       [812, 352]])

In [66]:
predictions.predict.value_counts(normalize = True)

0    0.515550
1    0.197547
3    0.168200
2    0.118703
Name: predict, dtype: float64

In [67]:
predictions.groupby(['predict'])['actual'].mean()

predict
0    0.508071
1    0.516630
2    0.509225
3    0.507812
Name: actual, dtype: float64