In [160]:
from sklearn.ensemble import  AdaBoostRegressor
from sklearn.model_selection import train_test_split, cross_val_score
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV, LinearRegression
from sklearn.model_selection import GridSearchCV

%matplotlib inline

In [161]:
# Load the prepared game-level dataset and keep only the rows that have a
# value in every column.
nba = pd.read_csv('./data/nba_analysis_data.csv').dropna()

In [162]:
# Summarize the frame left after dropna(): column names, non-null counts and dtypes.
nba.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9130 entries, 0 to 9129
Data columns (total 20 columns):
cover                  9130 non-null int64
home_win_margin        9130 non-null float64
away_rest              9130 non-null int64
rest_days              9130 non-null int64
home_win_pct           9130 non-null float64
away_win_pct           9130 non-null float64
home_ave_win_margin    9130 non-null float64
away_ave_win_margin    9130 non-null float64
mov_5_oeff             9130 non-null float64
mov_5_deff             9130 non-null float64
mov_3_oeff             9130 non-null float64
mov_3_deff             9130 non-null float64
mov_5_away_off_eff     9130 non-null float64
mov_5_away_def_eff     9130 non-null float64
mov_3_away_off_eff     9130 non-null float64
mov_3_away_def_eff     9130 non-null float64
eff_ratio1             9130 non-null float64
eff_ratio2             9130 non-null float64
eff_ratio3             9130 non-null float64
eff_ratio4             9130 non-null float64

In [163]:
# Regression target: the `home_win_margin` column.
y = nba['home_win_margin']

# Predictors: every remaining column once `cover` and the target are removed.
# (This replaced an earlier hand-picked feature list that named columns such
# as 'spread', which are not among the columns loaded from this CSV.)
X = nba.drop(columns=['cover', 'home_win_margin'])

In [164]:
# Hold out a test partition (default split size). The seed is fixed so the
# same rows are selected on every run.
split = train_test_split(X, y, random_state=4)
X_train, X_test, y_train, y_test = split

In [165]:
# Standardize the features: learn the scaling statistics from the training
# rows only, then apply that same transform to both partitions.
ss = StandardScaler().fit(X_train)
X_train_scaled = ss.transform(X_train)
X_test_scaled = ss.transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  after removing the cwd from sys.path.


In [166]:
# AdaBoost regressor and the (single-point) grid handed to GridSearchCV.
# AdaBoost resamples the training set at each boosting round, so without a
# seed the fitted model and its scores change from run to run; the seed
# matches the one already used for the train/test split.
# NOTE(review): the name `gb` is kept because later cells reference it, but
# the estimator is AdaBoost.
gb = AdaBoostRegressor(random_state=4)
gb_params = {
    'n_estimators': [300],
    'learning_rate': [.01],
}

In [167]:
# Cross-validated search over the AdaBoost grid (fit returns the search
# object itself), then report the winning parameters and their CV score.
gs = GridSearchCV(estimator=gb, param_grid=gb_params).fit(X_train_scaled, y_train)
print(gs.best_params_)
print(gs.best_score_)



{'learning_rate': 0.01, 'n_estimators': 300}
0.14110734572554753


In [168]:
# Score the AdaBoost search's best estimator on the held-out test split.
gs.score(X_test_scaled,y_test)

0.15204652601578406

In [169]:
# Capture the AdaBoost test-set predictions now: `gs` is rebound to the SVR
# grid search in a later cell.
y_hat_gb = gs.predict(X_test_scaled)

In [170]:
# Ridge regression whose penalty strength is picked by RidgeCV from the
# candidate list below.
# NOTE(review): the recorded run selected alpha_ = 120, the largest candidate,
# so the best value may lie above this grid. Consider widening it.
ridge_alphas = [100, 120, 80]
rr = RidgeCV(alphas=ridge_alphas)


In [171]:
# Fit the ridge model on the standardized training split; the trailing `;`
# suppresses the estimator repr in the cell output.
rr.fit(X_train_scaled, y_train);

In [172]:
# Training-split score, shown for comparison with the test-split score below.
rr.score(X_train_scaled, y_train)

0.16091367087159314

In [173]:
# Ridge score on the held-out test split.
rr.score(X_test_scaled, y_test)

0.1681001382638284

In [174]:
# Fitted ridge coefficients, one per standardized feature column.
rr.coef_


array([ 0.67304246, -0.51692603,  0.64886354, -0.87175347,  3.16350532,
       -2.366814  ,  0.37462131, -0.14489877, -0.12009076, -0.09906782,
       -0.2787371 ,  0.37526395,  0.11494292, -0.20317751, -0.1192761 ,
        0.23362733,  0.10510003, -0.19058926])

In [175]:
# Ridge predictions for the test split; added to `predictions` later.
y_hat_rr = rr.predict(X_test_scaled)

In [176]:
# Penalty strength RidgeCV selected from the candidate alphas.
rr.alpha_

120

In [177]:
# Eyeball the raw ridge predictions (numpy abbreviates the array in the output).
y_hat_rr

array([ 4.02709802,  9.72727669, 13.0177688 , ...,  2.09473375,
       12.26107869, -7.3373667 ])

### Support Vector Machine

In [178]:
# Support vector regressor; the grid fixes a linear kernel and varies only C.
# NOTE(review): the recorded search picked C = 0.1, the largest candidate, so
# larger values may be worth trying.
svm = SVR()
svm_params = dict(kernel=['linear'], C=[.1, .02, .05])

In [179]:
# Cross-validated search over the SVR grid. This rebinds `gs`, which
# previously held the AdaBoost search.
gs = GridSearchCV(estimator=svm, param_grid=svm_params).fit(X_train_scaled, y_train)
print(gs.best_params_)
print(gs.best_score_)



{'C': 0.1, 'kernel': 'linear'}
0.1560814265126296


In [180]:
# Score the SVR search's best estimator on the held-out test split.
gs.score(X_test_scaled, y_test)

0.1685240052319065

In [181]:
# SVR predictions for the test split (here `gs` is the SVR grid search).
y_hat_svm = gs.predict(X_test_scaled)


### Neural Network

I am going to fit a neural network on the X features. I am not concerned with inference here; accurate prediction is all that matters. So I am going to set up a neural network as one of the final predictors of Home Win Margin.

In [182]:
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical

In [183]:
# Feed-forward regressor: one 30-unit ReLU hidden layer sized to the feature
# count, followed by a single output unit with no activation.
model = Sequential([
    Dense(30, activation='relu', input_dim=X_train.shape[1]),
    Dense(1, activation=None),  # output layer
])

In [184]:
# Train with Adam on mean squared error.
# validation_data is the test split, so the per-epoch validation loss is
# measured on the same rows that are later used to compare the models.
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(x=X_train_scaled,
          y=y_train,
          batch_size=64,
          epochs=25,
          validation_data=(X_test_scaled, y_test))

Train on 6847 samples, validate on 2283 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x1a2ef4a080>

In [185]:
# Start the comparison table: one row per test game, beginning with the
# neural network's predictions. The other models' columns are added later.
y_hat_nn = model.predict(X_test_scaled)
predictions = pd.DataFrame(data=y_hat_nn, columns=['y_hat_nn'])
predictions

Unnamed: 0,y_hat_nn
0,4.786920
1,8.852562
2,12.144455
3,8.042840
4,6.774064
5,12.498083
6,2.460768
7,2.850856
8,0.825173
9,1.233060


In [186]:
# Add the ridge, SVR and AdaBoost test-set predictions as extra columns.
# The arrays are positional, so they line up with the table's rows.
for column_name, model_output in (('y_hat_rr', y_hat_rr),
                                  ('y_hat_svm', y_hat_svm),
                                  ('y_hat_gb', y_hat_gb)):
    predictions[column_name] = model_output


In [187]:
# Compare the four models' predictions side by side for the first ten test rows.
predictions.head(10)

Unnamed: 0,y_hat_nn,y_hat_rr,y_hat_svm,y_hat_gb
0,4.78692,4.027098,4.327796,4.442244
1,8.852562,9.727277,10.020636,9.048913
2,12.144455,13.017769,13.352571,10.211575
3,8.04284,7.300048,7.868117,9.114827
4,6.774064,6.953867,7.19408,6.351442
5,12.498083,12.641242,13.159693,10.824675
6,2.460768,1.659441,1.757982,4.446154
7,2.850856,3.333553,3.233855,2.554591
8,0.825173,1.32295,1.204882,-0.738255
9,1.23306,1.623924,1.888455,3.294737


In [188]:
# y_test still carries the original row labels from the split. Resetting the
# index gives it 0..n-1 labels that line up with `predictions` on assignment.
y_s = y_test.reset_index()
predictions['y_test'] = y_s['home_win_margin']
predictions.head()

Unnamed: 0,y_hat_nn,y_hat_rr,y_hat_svm,y_hat_gb,y_test
0,4.78692,4.027098,4.327796,4.442244,10.0
1,8.852562,9.727277,10.020636,9.048913,-2.0
2,12.144455,13.017769,13.352571,10.211575,6.0
3,8.04284,7.300048,7.868117,9.114827,11.0
4,6.774064,6.953867,7.19408,6.351442,-7.0


In [189]:
# Attach each test game's spread to the comparison table.
# The recorded run of this cell failed with a bare KeyError: 'spread' because
# the loaded data has no such column. Fail early with an explanation instead,
# since every cell below that builds cover predictions needs this column.
if 'spread' not in X_test.columns:
    raise KeyError(
        "'spread' is not a column of X_test; add the spread to the loaded "
        "data before building the cover predictions"
    )

# X_test keeps the original row labels from the split. Resetting the index
# gives it 0..n-1 labels so the values line up with `predictions` on assignment.
X_spread = X_test.reset_index()
predictions['spread'] = X_spread['spread']
predictions.head()

KeyError: 'spread'

In [None]:
# Simple ensemble: the mean of the neural-network, ridge and SVR predictions.
# The original expression added y_hat_nn twice, so it was a double-weighted NN
# average of two models rather than a three-model mean. The three models used
# here are the same ones combined in the `predict` vote further down.
predictions['y_hat_ave'] = (predictions.y_hat_nn +
                            predictions.y_hat_rr +
                            predictions.y_hat_svm) / 3

In [None]:
# Inspect the first twenty rows of the table, now including the averaged prediction.
predictions.head(20)

In [None]:
# Binary pick from the SVR: 0 when predicted margin + spread is negative,
# otherwise 1.
# The spread is read from `predictions` (as the `actual` column does) rather
# than from X_test: `predictions` is indexed 0..n-1 while X_test keeps its
# original row labels, so adding X_test.spread would pair values from
# different games.
predictions['predict_svm'] = (predictions.y_hat_svm + predictions.spread).map(
    lambda x: 0 if x < 0 else 1)

In [None]:
# Binary pick from AdaBoost: 0 when predicted margin + spread is negative,
# otherwise 1.
# The spread is read from `predictions` (as the `actual` column does) rather
# than from X_test: `predictions` is indexed 0..n-1 while X_test keeps its
# original row labels, so adding X_test.spread would pair values from
# different games.
predictions['predict_gb'] = (predictions.y_hat_gb + predictions.spread).map(
    lambda x: 0 if x < 0 else 1)

In [None]:
# Binary pick from ridge: 0 when predicted margin + spread is negative,
# otherwise 1.
# The spread is read from `predictions` (as the `actual` column does) rather
# than from X_test: `predictions` is indexed 0..n-1 while X_test keeps its
# original row labels, so adding X_test.spread would pair values from
# different games.
predictions['predict_rr'] = (predictions.y_hat_rr + predictions.spread).map(
    lambda x: 0 if x < 0 else 1)

In [None]:
# Binary pick from the neural network: 0 when predicted margin + spread is
# negative, otherwise 1.
# The spread is read from `predictions` (as the `actual` column does) rather
# than from X_test: `predictions` is indexed 0..n-1 while X_test keeps its
# original row labels, so adding X_test.spread would pair values from
# different games.
predictions['predict_nn'] = (predictions.y_hat_nn + predictions.spread).map(
    lambda x: 0 if x < 0 else 1)

In [None]:
# Realised outcome, coded the same way as the predict_* columns:
# 0 when actual margin + spread is negative, otherwise 1.
actual_vs_spread = predictions['y_test'] + predictions['spread']
predictions['actual'] = actual_vs_spread.map(
    lambda total: 0 if total < 0 else 1)

In [None]:
# Vote count: how many of the SVR, ridge and neural-network picks equal 1.
svm_vote = predictions['predict_svm']
rr_vote = predictions['predict_rr']
nn_vote = predictions['predict_nn']
predictions['predict'] = svm_vote + rr_vote + nn_vote

In [None]:
# Check the table after adding the per-model picks, the actual outcome and the vote count.
predictions.head()

In [None]:
# Text report of classification metrics for the SVR-only picks against the actual outcome.
print(classification_report(predictions.actual, predictions.predict_svm))

In [None]:
# Confusion matrix for the neural-network picks; rows are the actual outcome,
# columns the predicted one.
confusion_matrix(predictions.actual, predictions.predict_nn)

In [None]:
# Share of test games at each vote count (the sum of the three models' 0/1 picks).
predictions.predict.value_counts(normalize = True)

In [None]:
# Mean of the 0/1 `actual` column within each vote count, i.e. the fraction of
# those games whose actual outcome was 1.
predictions.groupby(['predict'])['actual'].mean()