In [1]:
from sklearn.ensemble import  AdaBoostRegressor
from sklearn.model_selection import train_test_split, cross_val_score
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import RidgeCV, LinearRegression
from sklearn.model_selection import GridSearchCV

%matplotlib inline

  return f(*args, **kwds)


In [2]:
#Bring in the data

nba = pd.read_csv('./data/nba_analysis_data.csv')
nba.dropna(inplace = True)

In [3]:
nba.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9198 entries, 0 to 9198
Columns: 206 entries, dataset to feature_19
dtypes: float64(148), int64(46), object(12)
memory usage: 14.5+ MB


In [4]:
X = nba.drop(columns = ['cover','home_win_margin', 
                        'date', 'dataset','line_cv',
                       'home_starter5', 'win_margin_ratio',
                       'away_starter2', 'away_starter3',
                       'away_starter4', 'away_starter5',
                       'ref_1', 'ref_3', 'crew_referees',
                      'away_pace', 'away_spread',
                        'away_line_cv',  'away_cover','feature_1', 
            'feature_2', 
            'feature_3' ,'feature_4', 'feature_5', 'feature_6',
            'feature_7', 'feature_8', 'teams', 'away_team',
            'feature_9','feature_10', 'feature_11', 'feature_12', 
            'feature_13', 'feature_14', 'feature_15', 'feature_16',
            'feature_17', 'feature_18', 'feature_19',
                       'home_payout', 'away_payout'])

y = nba['home_win_margin']

In [5]:
#create interactions and squared terms
poly = PolynomialFeatures(degree = 2)
X_poly = poly.fit_transform(X)
X_poly = pd.DataFrame(X_poly, columns = poly.get_feature_names(X.columns))

In [10]:
#splitting data for a train-test split
X_train, X_test, y_train, y_test = train_test_split(X_poly, y , 
                                                    random_state = 22)

In [11]:
#scaling data to use in various other methods
ss = StandardScaler()
X_train_scaled = ss.fit_transform(X_train)
X_test_scaled = ss.transform(X_test)

In [12]:
from sklearn.decomposition import PCA


In [13]:
pc = PCA(n_components=350)
X_train_pc = pc.fit_transform(X_train_scaled)
X_test_pc = pc.transform(X_test_scaled)



In [14]:

var_exp = pc.explained_variance_ratio_
print('Explained variance:            ', var_exp)

cum_var_exp = np.cumsum(var_exp)
print('Cumulative explained variance: ', cum_var_exp)

Explained variance:             [8.04271221e-02 7.10273413e-02 3.40358512e-02 2.69063023e-02
 2.49805443e-02 2.40680571e-02 2.37936294e-02 2.24224442e-02
 2.00805967e-02 1.88215169e-02 1.80491944e-02 1.71662250e-02
 1.56674071e-02 1.40547522e-02 1.31017010e-02 1.28287308e-02
 1.19707421e-02 1.18644385e-02 1.16602363e-02 1.14693549e-02
 1.11220205e-02 1.08922656e-02 1.04199922e-02 1.02507168e-02
 1.00732775e-02 9.95806136e-03 9.63022120e-03 9.48660346e-03
 9.34212204e-03 9.26517648e-03 9.07945357e-03 8.91766181e-03
 8.79073134e-03 8.75369101e-03 8.59830355e-03 8.58846955e-03
 8.43544231e-03 8.35425504e-03 8.30675919e-03 8.21190882e-03
 8.03728750e-03 8.00591148e-03 7.98296942e-03 7.74024219e-03
 7.52249788e-03 7.36452683e-03 7.33288196e-03 7.11038816e-03
 6.80516837e-03 6.57972948e-03 6.21572398e-03 6.07018078e-03
 5.94458294e-03 5.85095423e-03 5.63761990e-03 5.60416845e-03
 5.50072848e-03 5.30321353e-03 5.18573227e-03 5.16505929e-03
 5.08750233e-03 4.96256640e-03 4.74858858e-03 4.62567

### Linear Regression

In [15]:
ols = LinearRegression()
ols.fit(X_train_pc, y_train)
ols.score(X_train_pc, y_train)

0.2575701513932085

In [17]:
ols.score(X_test_pc, y_test)

0.1496277463652924

### Boosting

In [18]:
ad = AdaBoostRegressor()
ad_params = { 'n_estimators'      : [300],
             'learning_rate'         : [ .01 ],
             
            }

In [19]:
gs = GridSearchCV(ad, param_grid= ad_params)
gs.fit(X_train_pc, y_train)
print(gs.best_params_)
print(gs.best_score_)



{'learning_rate': 0.01, 'n_estimators': 300}
0.13381613723879782


In [20]:
gs.score(X_test_pc,y_test)

0.12726599544420425

In [21]:
y_hat_ad = gs.predict(X_test_pc)

In [22]:
rr = RidgeCV(alphas = [40, 120, 80])


In [23]:
rr.fit(X_train_pc, y_train);

In [24]:
rr.score(X_train_pc, y_train)

0.25756769773553434

In [25]:
rr.score(X_test_pc, y_test)

0.15012480110498594

In [26]:
y_hat_rr = rr.predict(X_test_pc)

In [22]:
rr.alpha_

120

### Support Vector Machine

In [27]:
svm = SVR()
svm_params = {'kernel' : ['linear'],
              'C'      : [.1, .02, .05]}

In [28]:
gs = GridSearchCV(svm, param_grid= svm_params)
gs.fit(X_train_pc, y_train)
print(gs.best_params_)
print(gs.best_score_)



{'C': 0.02, 'kernel': 'linear'}
0.10309115732891411


In [29]:
gs.score(X_test_pc, y_test)

0.14944885058549562

In [30]:
y_hat_svm = gs.predict(X_test_pc)


### Neural Network

Going to fit a neural network with the X features.   I am not concerned about inference.  Accurate prediction is all that matters.  So am going to set up an neural network as one of the final predictors of Home Win Margin.

In [31]:
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical

  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Couldn't import dot_parser, loading of dot files will not be possible.


In [32]:
model = Sequential()
model.add(Dense(350, 
                activation = 'relu',
                input_dim = X_train_pc.shape[1]))
#model.add(Dense(18, activation = 'relu'))

model.add(Dense(1, activation = None)) #output layer 

In [33]:
model.compile(loss = 'mean_squared_error', optimizer='adam' )
model.fit(X_train_pc, 
          y_train, epochs =18, 
          batch_size= 64, 
          validation_data=(X_test_pc, y_test))

Train on 6898 samples, validate on 2300 samples
Epoch 1/18
Epoch 2/18
Epoch 3/18
Epoch 4/18
Epoch 5/18
Epoch 6/18
Epoch 7/18
Epoch 8/18
Epoch 9/18
Epoch 10/18
Epoch 11/18
Epoch 12/18
Epoch 13/18
Epoch 14/18
Epoch 15/18
Epoch 16/18
Epoch 17/18
Epoch 18/18


<keras.callbacks.History at 0x1afcef9b38>

In [31]:
y_hat_nn = model.predict(X_test_pc)
predictions = pd.DataFrame(y_hat_nn, columns= ['y_hat_nn'])
predictions

Unnamed: 0,y_hat_nn
0,12.932446
1,-10.875235
2,-1.042680
3,-16.517603
4,0.222306
5,9.225712
6,-0.991613
7,7.323125
8,8.114644
9,8.837726


In [32]:
predictions['y_hat_rr'] = y_hat_rr
predictions['y_hat_svm'] = y_hat_svm
predictions['y_hat_ad'] = y_hat_ad


In [33]:
predictions.head(10)

Unnamed: 0,y_hat_nn,y_hat_rr,y_hat_svm,y_hat_ad
0,12.932446,9.103866,9.055897,5.117321
1,-10.875235,-7.14412,-4.411716,-3.59689
2,-1.04268,-0.928621,-0.971642,-3.195982
3,-16.517603,-6.606336,-5.860773,-3.056594
4,0.222306,1.813518,1.517029,4.195392
5,9.225712,8.124438,8.548652,6.019881
6,-0.991613,9.572154,12.666657,7.602871
7,7.323125,6.78903,7.016878,0.542803
8,8.114644,2.582118,1.971503,6.819267
9,8.837726,9.775057,10.287439,6.124567


In [34]:
y_s = pd.DataFrame(y_test)
y_s.reset_index(inplace = True)
predictions['y_test'] = y_s['home_win_margin']
predictions.head()

Unnamed: 0,y_hat_nn,y_hat_rr,y_hat_svm,y_hat_ad,y_test
0,12.932446,9.103866,9.055897,5.117321,-18.0
1,-10.875235,-7.14412,-4.411716,-3.59689,12.0
2,-1.04268,-0.928621,-0.971642,-3.195982,-12.0
3,-16.517603,-6.606336,-5.860773,-3.056594,1.0
4,0.222306,1.813518,1.517029,4.195392,13.0


In [35]:
X_test.reset_index(inplace = True)
predictions['spread'] = X_test['spread']
predictions

Unnamed: 0,y_hat_nn,y_hat_rr,y_hat_svm,y_hat_ad,y_test,spread
0,12.932446,9.103866,9.055897,5.117321,-18.0,-2.0
1,-10.875235,-7.144120,-4.411716,-3.596890,12.0,7.0
2,-1.042680,-0.928621,-0.971642,-3.195982,-12.0,8.5
3,-16.517603,-6.606336,-5.860773,-3.056594,1.0,7.5
4,0.222306,1.813518,1.517029,4.195392,13.0,2.0
5,9.225712,8.124438,8.548652,6.019881,7.0,-7.5
6,-0.991613,9.572154,12.666657,7.602871,21.0,-10.0
7,7.323125,6.789030,7.016878,0.542803,8.0,-1.5
8,8.114644,2.582118,1.971503,6.819267,-1.0,-3.5
9,8.837726,9.775057,10.287439,6.124567,10.0,-12.5


In [36]:
predictions['y_hat_ave'] = (predictions.y_hat_nn + 
                           
                            predictions.y_hat_rr + 
                            predictions.y_hat_nn)/3

In [37]:
predictions.head(20)

Unnamed: 0,y_hat_nn,y_hat_rr,y_hat_svm,y_hat_ad,y_test,spread,y_hat_ave
0,12.932446,9.103866,9.055897,5.117321,-18.0,-2.0,11.656252
1,-10.875235,-7.14412,-4.411716,-3.59689,12.0,7.0,-9.63153
2,-1.04268,-0.928621,-0.971642,-3.195982,-12.0,8.5,-1.00466
3,-16.517603,-6.606336,-5.860773,-3.056594,1.0,7.5,-13.213847
4,0.222306,1.813518,1.517029,4.195392,13.0,2.0,0.75271
5,9.225712,8.124438,8.548652,6.019881,7.0,-7.5,8.858621
6,-0.991613,9.572154,12.666657,7.602871,21.0,-10.0,2.529642
7,7.323125,6.78903,7.016878,0.542803,8.0,-1.5,7.145094
8,8.114644,2.582118,1.971503,6.819267,-1.0,-3.5,6.270469
9,8.837726,9.775057,10.287439,6.124567,10.0,-12.5,9.15017


In [38]:
predictions['predict_svm'] =  (predictions.y_hat_svm + predictions.spread).map(
    lambda x: 0 if x <0 else 1)

In [39]:
predictions['predict_ad'] =  (predictions.y_hat_ad + predictions.spread).map(
    lambda x: 0 if x <0 else 1)

In [40]:
predictions['predict_rr'] =  (predictions.y_hat_rr + predictions.spread).map(
    lambda x: 0 if x <0 else 1)

In [41]:
predictions['predict_nn'] =  (predictions.y_hat_nn + predictions.spread).map(
    lambda x: 0 if x <0 else 1)

In [42]:
predictions['actual'] = (predictions.y_test + predictions.spread).map(lambda x: 0 if x< 0 else 1)

In [43]:
predictions['predict'] = (predictions['predict_svm']+ 
    predictions['predict_rr'] + predictions['predict_nn'])

In [44]:
predictions.head()

Unnamed: 0,y_hat_nn,y_hat_rr,y_hat_svm,y_hat_ad,y_test,spread,y_hat_ave,predict_svm,predict_ad,predict_rr,predict_nn,actual,predict
0,12.932446,9.103866,9.055897,5.117321,-18.0,-2.0,11.656252,1,1,1,1,0,3
1,-10.875235,-7.14412,-4.411716,-3.59689,12.0,7.0,-9.63153,1,1,0,0,1,1
2,-1.04268,-0.928621,-0.971642,-3.195982,-12.0,8.5,-1.00466,1,1,1,1,0,3
3,-16.517603,-6.606336,-5.860773,-3.056594,1.0,7.5,-13.213847,1,1,1,0,1,2
4,0.222306,1.813518,1.517029,4.195392,13.0,2.0,0.75271,1,1,1,1,1,3


In [45]:
print(classification_report(predictions.actual, predictions.predict_rr))

              precision    recall  f1-score   support

           0       0.51      0.52      0.52      1101
           1       0.50      0.49      0.49      1065

   micro avg       0.51      0.51      0.51      2166
   macro avg       0.51      0.51      0.51      2166
weighted avg       0.51      0.51      0.51      2166



In [46]:
confusion_matrix(predictions.actual, predictions.predict_svm)

array([[559, 542],
       [531, 534]])

In [47]:
predictions.predict.value_counts(normalize = True)

0    0.320868
3    0.265928
2    0.222530
1    0.190674
Name: predict, dtype: float64

In [48]:
predictions.groupby(['predict'])['actual'].mean()

predict
0    0.499281
1    0.450363
2    0.489627
3    0.513889
Name: actual, dtype: float64