In [97]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV

%matplotlib inline

In [98]:
#Bring in the data

nba = pd.read_csv('./data/nba_analysis_data.csv')
nba.dropna(inplace = True)

In [99]:
nba.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9131 entries, 0 to 9130
Data columns (total 35 columns):
cover                    9131 non-null int64
line_cv                  9131 non-null float64
eff_ratio                9131 non-null float64
eff_ratio2               9131 non-null float64
away_rest                9131 non-null int64
main_referee             9131 non-null int64
home_rest                9131 non-null int64
ref_2                    9131 non-null int64
ref_3                    9131 non-null int64
spread                   9131 non-null float64
home_win_margin          9131 non-null float64
mov_5_fta                9131 non-null float64
mov_5_away_fta           9131 non-null float64
free_throw_ratio         9131 non-null float64
mov_5_home_score         9131 non-null float64
mov_5_away_score         9131 non-null float64
score_ratio              9131 non-null float64
rebound_ratio            9131 non-null float64
mov_5_tot                9131 non-null float64
mov_5_away_t

In [100]:
features = ['eff_ratio', 'mov_5_fta', 'mov_5_away_fta',  
            'eff_ratio2', 'away_rest', 
            'home_rest', 
            'free_throw_ratio', 'spread',
            'score_ratio', 'mov_5_home_score', 'mov_5_away_score',
            'mov_5_away_off_eff', 'mov_5_away_def_eff', 
            'mov_5_away_assists', 'mov_5_home_win_margin',
            'mov_5_win', 'mov_5_away_win_margin', 'home_win_pct', 'away_win_pct',
            'high_alt', 'home_ave_win_margin', 'away_ave_win_margin',
            'playoff_game']
X = nba[features]
y = nba['home_win_margin']

In [101]:
#splitting data for a train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y , 
                                                    random_state = 4)

In [102]:
#scaling data to use in various other methods
ss = StandardScaler()
X_train_scaled = ss.fit_transform(X_train)
X_test_scaled = ss.transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  after removing the cwd from sys.path.


In [292]:
rf = RandomForestRegressor()
rf_params = {'n_estimators'      : [400,500, 600, 700],
             'max_depth'         : [ 60, 70, 80, 90 ],
             'min_samples_split' : [8,11, 14, 17, 22  ]
            }

In [293]:
gs = GridSearchCV(rf, param_grid= rf_params)

In [None]:
gs.fit(X_train_scaled,y_train)



In [None]:
gs.best_score_

In [None]:
gs.best_params_

In [None]:
gs.score(X_test_scaled, y_test)

In [None]:
y_hat_rf = gs.predict(X_test_scaled)

In [None]:
y_hat_rf

In [None]:
rr = RidgeCV(alphas = [100, 150, 80])


In [None]:
rr.fit(X_train_scaled, y_train);

In [None]:
rr.score(X_train_scaled, y_train)

In [None]:
rr.score(X_test_scaled, y_test)

In [270]:
rr.coef_


array([ 3.77482651e-02,  3.07225977e-01, -5.09244140e-02,  2.23296882e-02,
        4.85549294e-01, -3.96244640e-01, -2.06696422e-01, -5.22718224e+00,
       -9.03454808e-02,  3.81684471e-04,  1.99110037e-01,  4.01191365e-01,
       -3.00354135e-01, -2.52802711e-01,  5.16840772e-01, -9.55835732e-01,
       -3.80399186e-01,  2.35863742e-01,  6.84559268e-01,  1.75960327e-01,
        9.35199379e-01, -1.10179319e+00,  3.61978133e-02])

In [271]:
y_hat_rr = rr.predict(X_test_scaled)

In [272]:
rr.alpha_

100

In [273]:
y_hat_rr

array([ 1.75688512, -1.89108177, 12.70695595, ...,  4.20378938,
       11.13123548,  2.09339121])

In [119]:
svm = SVR(kernel="linear", C = 5)


In [120]:
svm.fit(X_train_scaled,y_train)

SVR(C=5, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='linear', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False)

In [121]:
svm.score(X_train_scaled, y_train)

0.2148030883256683

In [122]:
svm.score(X_test_scaled, y_test)

0.20615428938410918

In [123]:
y_hat_svm = svm.predict(X_test_scaled)
y_hat_svm

array([ 1.37110751, -1.78489493, 12.81406217, ...,  3.47352254,
       11.08402482,  1.95221682])

### Neural Network

Going to fit a neural network with the X features.   I am not concerned about inference.  Accurate prediction is all that matters.  So am going to set up an neural network as one of the final predictors of Home Win Margin.

In [124]:
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical

In [125]:
model = Sequential()
model.add(Dense(23, 
                activation = 'relu',
                input_dim = X_train.shape[1]))
model.add(Dense(7, activation = 'relu'))

model.add(Dense(1, activation = None)) #output layer 

In [126]:
model.compile(loss = 'mean_squared_error', optimizer='adam' )
model.fit(X_train_scaled, 
          y_train, epochs = 40, 
          batch_size= 32, 
          validation_data=(X_test_scaled, y_test))

Train on 6848 samples, validate on 2283 samples
Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 16/70
Epoch 17/70
Epoch 18/70
Epoch 19/70
Epoch 20/70
Epoch 21/70
Epoch 22/70
Epoch 23/70
Epoch 24/70
Epoch 25/70
Epoch 26/70
Epoch 27/70
Epoch 28/70
Epoch 29/70
Epoch 30/70
Epoch 31/70
Epoch 32/70
Epoch 33/70
Epoch 34/70
Epoch 35/70
Epoch 36/70
Epoch 37/70
Epoch 38/70
Epoch 39/70
Epoch 40/70
Epoch 41/70
Epoch 42/70
Epoch 43/70
Epoch 44/70
Epoch 45/70
Epoch 46/70
Epoch 47/70
Epoch 48/70
Epoch 49/70
Epoch 50/70
Epoch 51/70
Epoch 52/70
Epoch 53/70
Epoch 54/70
Epoch 55/70
Epoch 56/70
Epoch 57/70
Epoch 58/70
Epoch 59/70
Epoch 60/70
Epoch 61/70
Epoch 62/70
Epoch 63/70
Epoch 64/70
Epoch 65/70
Epoch 66/70
Epoch 67/70
Epoch 68/70
Epoch 69/70
Epoch 70/70


<keras.callbacks.History at 0x1a3d453a90>

In [127]:
y_hat_nn = model.predict(X_test_scaled)
predictions = pd.DataFrame(y_hat_nn, columns= ['y_hat_nn'])
predictions

Unnamed: 0,y_hat_nn
0,3.944405
1,-4.357571
2,11.830326
3,6.192354
4,1.581554
5,15.581586
6,8.221266
7,2.618827
8,4.615789
9,6.315024


In [276]:
predictions['y_hat_rr'] = y_hat_rr
predictions['y_hat_svm'] = y_hat_svm
predictions['y_hat_rf'] = y_hat_rf


In [277]:
predictions.head(10)

Unnamed: 0,y_hat_nn,y_hat_rr,y_hat_svm,y_hat_rf,y_test,spread,y_hat_ave,predict_svm,predict_rr,predict_nn,actual,predict
0,3.944405,1.756885,1.371108,-0.600012,-1.0,-1.5,3.171716,0,1,1,0,3
1,-4.357571,-1.891082,-1.784895,-1.091522,-8.0,1.0,-3.568923,0,0,0,0,0
2,11.830326,12.706956,12.814062,11.736707,6.0,-13.5,12.059877,0,0,0,0,0
3,6.192354,5.395059,5.354332,5.394062,-5.0,-7.0,5.907983,0,0,0,0,0
4,1.581554,2.674261,2.352306,3.691513,8.0,-3.5,1.953234,0,0,0,1,0
5,15.581586,17.64794,17.578159,19.273758,9.0,-15.5,16.241839,1,1,1,0,4
6,8.221266,8.361175,9.815642,6.323641,14.0,-10.0,8.438949,0,0,0,1,0
7,2.618827,0.846593,0.339522,0.640337,-9.0,-1.0,2.075854,0,0,1,0,1
8,4.615789,3.247947,3.114539,2.83138,5.0,-4.5,4.165951,0,0,1,1,1
9,6.315024,4.362554,4.104572,4.636746,-6.0,-3.5,5.592544,1,1,1,0,4


In [278]:
y_s = pd.DataFrame(y_test)
y_s.reset_index(inplace = True)
predictions['y_test'] = y_s['home_win_margin']
predictions.head()

Unnamed: 0,y_hat_nn,y_hat_rr,y_hat_svm,y_hat_rf,y_test,spread,y_hat_ave,predict_svm,predict_rr,predict_nn,actual,predict
0,3.944405,1.756885,1.371108,-0.600012,-1.0,-1.5,3.171716,0,1,1,0,3
1,-4.357571,-1.891082,-1.784895,-1.091522,-8.0,1.0,-3.568923,0,0,0,0,0
2,11.830326,12.706956,12.814062,11.736707,6.0,-13.5,12.059877,0,0,0,0,0
3,6.192354,5.395059,5.354332,5.394062,-5.0,-7.0,5.907983,0,0,0,0,0
4,1.581554,2.674261,2.352306,3.691513,8.0,-3.5,1.953234,0,0,0,1,0


In [279]:
X_spread = pd.DataFrame(X_test)
X_spread.reset_index(inplace = True)
predictions['spread'] = X_spread['spread']
predictions.head()

Unnamed: 0,y_hat_nn,y_hat_rr,y_hat_svm,y_hat_rf,y_test,spread,y_hat_ave,predict_svm,predict_rr,predict_nn,actual,predict
0,3.944405,1.756885,1.371108,-0.600012,-1.0,-1.5,3.171716,0,1,1,0,3
1,-4.357571,-1.891082,-1.784895,-1.091522,-8.0,1.0,-3.568923,0,0,0,0,0
2,11.830326,12.706956,12.814062,11.736707,6.0,-13.5,12.059877,0,0,0,0,0
3,6.192354,5.395059,5.354332,5.394062,-5.0,-7.0,5.907983,0,0,0,0,0
4,1.581554,2.674261,2.352306,3.691513,8.0,-3.5,1.953234,0,0,0,1,0


In [280]:
y_test

4741    -1.0
6253    -8.0
1166     6.0
6353    -5.0
3631     8.0
1646     9.0
9075    14.0
7345    -9.0
4677     5.0
2703    -6.0
8031     1.0
6450    32.0
387     13.0
6636    -9.0
8119     3.0
758     14.0
9108     8.0
3030     3.0
4452   -17.0
4273    -4.0
4887     3.0
3715    18.0
2323     7.0
7775     3.0
5773    -7.0
2031     5.0
4134    10.0
2719    20.0
3281     5.0
186     -2.0
        ... 
5213     3.0
3550     7.0
5427    -8.0
3549     3.0
8560    11.0
2672    -8.0
2538    13.0
6074    15.0
8406    11.0
7695     8.0
4130     9.0
316     13.0
9060    -2.0
528     18.0
7361    23.0
307      4.0
4994    25.0
7265     3.0
13       7.0
4542    -4.0
2576   -11.0
8666    -3.0
3466    -7.0
2168   -11.0
6915     2.0
4402     7.0
2975    -5.0
3245     5.0
3       -2.0
7630   -28.0
Name: home_win_margin, Length: 2283, dtype: float64

In [281]:
predictions['y_hat_ave'] = (predictions.y_hat_nn + 
                           
                            predictions.y_hat_rr + 
                            predictions.y_hat_nn)/3

In [282]:
predictions.head(20)

Unnamed: 0,y_hat_nn,y_hat_rr,y_hat_svm,y_hat_rf,y_test,spread,y_hat_ave,predict_svm,predict_rr,predict_nn,actual,predict
0,3.944405,1.756885,1.371108,-0.600012,-1.0,-1.5,3.215232,0,1,1,0,3
1,-4.357571,-1.891082,-1.784895,-1.091522,-8.0,1.0,-3.535408,0,0,0,0,0
2,11.830326,12.706956,12.814062,11.736707,6.0,-13.5,12.122536,0,0,0,0,0
3,6.192354,5.395059,5.354332,5.394062,-5.0,-7.0,5.926589,0,0,0,0,0
4,1.581554,2.674261,2.352306,3.691513,8.0,-3.5,1.94579,0,0,0,1,0
5,15.581586,17.64794,17.578159,19.273758,9.0,-15.5,16.27037,1,1,1,0,4
6,8.221266,8.361175,9.815642,6.323641,14.0,-10.0,8.267902,0,0,0,1,0
7,2.618827,0.846593,0.339522,0.640337,-9.0,-1.0,2.028083,0,0,1,0,1
8,4.615789,3.247947,3.114539,2.83138,5.0,-4.5,4.159842,0,0,1,1,1
9,6.315024,4.362554,4.104572,4.636746,-6.0,-3.5,5.6642,1,1,1,0,4


In [283]:
predictions['predict_svm'] =  (predictions.y_hat_svm + X_test.spread).map(
    lambda x: 0 if x <0 else 1)

In [284]:
predictions['predict_rr'] =  (predictions.y_hat_rr + X_test.spread).map(
    lambda x: 0 if x <0 else 1)

In [285]:
predictions['predict_nn'] =  (predictions.y_hat_nn + X_test.spread).map(
    lambda x: 0 if x <0 else 1)

In [286]:
predictions['actual'] = (predictions.y_test + predictions.spread).map(lambda x: 0 if x< 0 else 1)

In [287]:
predictions['predict'] = (predictions['predict_svm']+ 
    predictions['predict_rr'] + predictions['predict_nn']+ 
                          predictions['predict_rr'])

In [288]:
print(classification_report(predictions.actual, predictions.predict))

              precision    recall  f1-score   support

           0       0.50      0.46      0.48      1119
           1       0.49      0.14      0.22      1164
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0

   micro avg       0.30      0.30      0.30      2283
   macro avg       0.20      0.12      0.14      2283
weighted avg       0.50      0.30      0.35      2283



  'recall', 'true', average, warn_for)


In [289]:
confusion_matrix(X_test.actual, X_test.predict)

array([[768, 351],
       [769, 395]])

In [290]:
predictions.predict.value_counts(normalize = True)

0    0.448971
4    0.210688
1    0.146299
3    0.120894
2    0.073149
Name: predict, dtype: float64

In [291]:
predictions.groupby(['predict'])['actual'].mean()

predict
0    0.496585
1    0.491018
2    0.479042
3    0.539855
4    0.544699
Name: actual, dtype: float64