In [100]:
from sklearn.ensemble import  AdaBoostRegressor
from sklearn.model_selection import train_test_split, cross_val_score
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV, LinearRegression
from sklearn.model_selection import GridSearchCV

%matplotlib inline

In [101]:
# Load the prepared NBA modelling table and keep only fully populated rows.
nba = pd.read_csv('./data/nba_analysis_data.csv').dropna()

In [102]:
# Print the frame's index range, column names, non-null counts and dtypes.
nba.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9130 entries, 0 to 9129
Data columns (total 8 columns):
cover                  9130 non-null int64
home_win_margin        9130 non-null float64
away_rest              9130 non-null int64
rest_days              9130 non-null int64
home_win_pct           9130 non-null float64
away_win_pct           9130 non-null float64
home_ave_win_margin    9130 non-null float64
away_ave_win_margin    9130 non-null float64
dtypes: float64(5), int64(3)
memory usage: 642.0 KB


In [103]:
# features = ['eff_ratio1', 'mov_5_fta', 'mov_5_away_fta',  
#             'eff_ratio2', 'eff_ratio3','eff_ratio4',
#             'away_rest', 'rest_days',
#             'spread',
#             'mov_5_home_score', 'mov_5_away_score',
#             'mov_5_away_off_eff', 'mov_5_away_def_eff', 
#             'mov_5_away_assists', 'mov_5_home_win_margin',
#             'mov_5_win', 'mov_5_away_win_margin', 'home_win_pct', 'away_win_pct',
#             'high_alt', 'home_ave_win_margin', 'away_ave_win_margin',
#             'playoff_game',  'mov_5_3pa', 
#            'mov_5_away_3pa'
#             ]

# Regression target: the home team's win margin.
y = nba['home_win_margin']
# Feature matrix: every remaining column once the two outcome columns
# ('cover' and the target itself) are removed.
X = nba.drop(['cover', 'home_win_margin'], axis = 1)

In [104]:
# Hold out a quarter of the games for testing (0.25 is scikit-learn's default
# when neither size is given); the fixed seed keeps the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.25, random_state = 4)

In [105]:
# Standardise the features. The scaler learns its statistics from the training
# rows only and then applies that same transformation to both partitions.
ss = StandardScaler().fit(X_train)
X_train_scaled = ss.transform(X_train)
X_test_scaled = ss.transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  after removing the cwd from sys.path.


In [106]:
# AdaBoost regressor plus the (single-point) hyper-parameter grid searched below.
# random_state is pinned -- reusing the seed value of the train/test split -- so
# that the boosting procedure, and therefore the grid-search score and the
# y_hat_gb predictions, come out the same on every Restart & Run All.
gb = AdaBoostRegressor(random_state = 4)
gb_params = { 'n_estimators'      : [300],
             'learning_rate'         : [ .01 ],
            }

In [107]:
# Cross-validated search over gb_params on the scaled training data;
# fit() returns the fitted search object, so it can be bound directly.
gs = GridSearchCV(estimator = gb, param_grid = gb_params).fit(X_train_scaled, y_train)
# Report the winning parameter set and its mean cross-validated score.
print(gs.best_params_, gs.best_score_, sep = '\n')



{'learning_rate': 0.01, 'n_estimators': 300}
0.14292303201481896


In [108]:
# Score the refit best AdaBoost estimator on the held-out test partition.
gs.score(X_test_scaled,y_test)

0.15135952362192906

In [109]:
# Keep the AdaBoost test-set predictions; `gs` is rebound to the SVR search
# further down, so they have to be captured here.
y_hat_gb = gs.predict(X_test_scaled)

In [110]:
# Ridge regression that chooses its penalty from the listed alphas via
# built-in cross-validation.
# NOTE(review): the recorded rr.alpha_ is 80, the smallest candidate offered --
# smaller values may be worth adding to the list.
rr = RidgeCV(alphas = [100, 120, 80])


In [111]:
# Fit the ridge model on the scaled training data (trailing ';' hides the repr).
rr.fit(X_train_scaled, y_train);

In [112]:
# Ridge score on the training partition.
rr.score(X_train_scaled, y_train)

0.16022963636629106

In [113]:
# Ridge score on the held-out test partition.
rr.score(X_test_scaled, y_test)

0.1683281603910538

In [114]:
# Fitted ridge coefficients, one per (standardised) feature column of X.
rr.coef_


array([ 0.67033112, -0.52451231,  0.57693672, -0.8364106 ,  3.52882473,
       -2.6293725 ])

In [115]:
# Ridge predictions of home win margin for the test partition.
y_hat_rr = rr.predict(X_test_scaled)

In [116]:
# The penalty strength RidgeCV selected from the candidate list.
rr.alpha_

80

In [117]:
# Eyeball the ridge predictions (numpy abbreviates the array when displayed).
y_hat_rr

array([ 3.6666343 ,  9.79433511, 13.27768316, ...,  1.6372937 ,
       11.97026926, -7.43155964])

### Support Vector Machine

In [118]:
# Support-vector regressor and its search grid: linear kernel only,
# three candidate values for the regularisation parameter C.
svm = SVR()
svm_params = dict(kernel = ['linear'],
                  C = [.1, .02, .05])

In [119]:
# Cross-validated search over svm_params. Note this rebinds `gs`, replacing the
# AdaBoost search object fitted earlier.
gs = GridSearchCV(estimator = svm, param_grid = svm_params).fit(X_train_scaled, y_train)
# Report the winning parameter set and its mean cross-validated score.
print(gs.best_params_, gs.best_score_, sep = '\n')



{'C': 0.1, 'kernel': 'linear'}
0.15801244724189287


In [120]:
# Score the refit best SVR on the held-out test partition.
gs.score(X_test_scaled, y_test)

0.1682961295812111

In [121]:
# SVR predictions of home win margin for the test partition.
y_hat_svm = gs.predict(X_test_scaled)


### Neural Network

I am going to fit a neural network on the X features. I am not concerned about inference; accurate prediction is all that matters. So I am going to set up a neural network as one of the final predictors of Home Win Margin.

In [122]:
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical

In [123]:
# Small feed-forward regressor: one 30-unit ReLU hidden layer sized to the
# feature count, followed by a single linear output unit for the margin.
model = Sequential([
    Dense(30, activation = 'relu', input_dim = X_train.shape[1]),
    # Dense(18, activation = 'relu'),  # optional second hidden layer, disabled
    Dense(1, activation = None),       # output layer
])

In [124]:
# Minimise mean squared error with Adam.
model.compile(optimizer = 'adam', loss = 'mean_squared_error')
# Train for 25 epochs in mini-batches of 64; the test partition is passed as
# validation data so its loss is reported after every epoch.
model.fit(x = X_train_scaled,
          y = y_train,
          batch_size = 64,
          epochs = 25,
          validation_data = (X_test_scaled, y_test))

Train on 6847 samples, validate on 2283 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x1a3601fc18>

In [125]:
# Network predictions for the test partition, wrapped in a one-column frame
# that the other models' predictions are added to afterwards.
y_hat_nn = model.predict(X_test_scaled)
predictions = pd.DataFrame(data = y_hat_nn)
predictions.columns = ['y_hat_nn']
predictions

Unnamed: 0,y_hat_nn
0,4.332137
1,9.423896
2,12.523093
3,7.532235
4,6.391845
5,12.474705
6,2.893021
7,3.818507
8,0.695629
9,2.255543


In [126]:
# Put the ridge, SVR and AdaBoost test-set predictions next to the network's.
predictions = predictions.assign(y_hat_rr = y_hat_rr,
                                 y_hat_svm = y_hat_svm,
                                 y_hat_gb = y_hat_gb)


In [127]:
# Compare the four models' predictions on the first ten test games.
predictions.head(10)

Unnamed: 0,y_hat_nn,y_hat_rr,y_hat_svm,y_hat_gb
0,4.332137,3.666634,3.853732,4.768541
1,9.423896,9.794335,10.112783,8.865649
2,12.523093,13.277683,13.761053,10.107789
3,7.532235,7.137672,7.633777,8.752344
4,6.391845,6.400545,6.718879,6.213571
5,12.474705,12.851076,13.390663,10.988148
6,2.893021,2.945776,3.018509,4.3234
7,3.818507,3.215078,3.303957,2.528547
8,0.695629,1.320945,1.14973,-0.370641
9,2.255543,1.717761,1.970892,3.709503


In [128]:
# y_test still carries the row labels it had in `nba`; move them into an
# ordinary column so the values line up with predictions' 0..n-1 index.
y_s = y_test.to_frame().reset_index()
predictions['y_test'] = y_s.home_win_margin
predictions.head()

Unnamed: 0,y_hat_nn,y_hat_rr,y_hat_svm,y_hat_gb,y_test
0,4.332137,3.666634,3.853732,4.768541,10.0
1,9.423896,9.794335,10.112783,8.865649,-2.0
2,12.523093,13.277683,13.761053,10.107789,6.0
3,7.532235,7.137672,7.633777,8.752344,11.0
4,6.391845,6.400545,6.718879,6.213571,-7.0


In [129]:
# Attach each test game's point spread to `predictions`, row for row, by
# renumbering a frame built from X_test to 0..n-1 before copying the column.
# NOTE(review): the recorded run of this cell ends in KeyError: 'spread' -- the
# nba.info() listing shows no 'spread' column, so the data as currently loaded
# cannot supply one; every later cell that reads a spread depends on this.
# NOTE(review): pd.DataFrame(X_test) may share its underlying data with X_test
# in some pandas versions, in which case the in-place reset_index would also
# alter X_test -- TODO confirm before relying on X_test's index afterwards.
X_spread = pd.DataFrame(X_test)
X_spread.reset_index(inplace = True)
predictions['spread'] = X_spread['spread']
predictions.head()

KeyError: 'spread'

In [None]:
# Simple ensemble: the mean of the neural-network, ridge and SVR predictions --
# the same three models that are combined into the 'predict' vote further down.
# (Previously y_hat_nn was added twice and the third model was left out.)
predictions['y_hat_ave'] = (predictions.y_hat_nn +
                            predictions.y_hat_rr +
                            predictions.y_hat_svm)/3

In [None]:
# Inspect the first twenty rows, now including the averaged prediction.
predictions.head(20)

In [None]:
# SVR pick against the spread: 1 when predicted margin + spread is not
# negative, otherwise 0.
# The spread is taken from predictions.spread instead of X_test.spread. Adding
# two Series pairs rows by index label; `predictions` is indexed 0..n-1 whereas
# X_test comes out of train_test_split, so mixing them can pair a prediction
# with another game's spread. predictions.spread is also the column the
# 'actual' label is built from, so pick and label use the same rows.
predictions['predict_svm'] =  (predictions.y_hat_svm + predictions.spread).map(
    lambda x: 0 if x <0 else 1)

In [None]:
# AdaBoost pick against the spread: 1 when predicted margin + spread is not
# negative, otherwise 0.
# The spread is taken from predictions.spread instead of X_test.spread. Adding
# two Series pairs rows by index label; `predictions` is indexed 0..n-1 whereas
# X_test comes out of train_test_split, so mixing them can pair a prediction
# with another game's spread. predictions.spread is also the column the
# 'actual' label is built from, so pick and label use the same rows.
predictions['predict_gb'] =  (predictions.y_hat_gb + predictions.spread).map(
    lambda x: 0 if x <0 else 1)

In [None]:
# Ridge pick against the spread: 1 when predicted margin + spread is not
# negative, otherwise 0.
# The spread is taken from predictions.spread instead of X_test.spread. Adding
# two Series pairs rows by index label; `predictions` is indexed 0..n-1 whereas
# X_test comes out of train_test_split, so mixing them can pair a prediction
# with another game's spread. predictions.spread is also the column the
# 'actual' label is built from, so pick and label use the same rows.
predictions['predict_rr'] =  (predictions.y_hat_rr + predictions.spread).map(
    lambda x: 0 if x <0 else 1)

In [None]:
# Neural-network pick against the spread: 1 when predicted margin + spread is
# not negative, otherwise 0.
# The spread is taken from predictions.spread instead of X_test.spread. Adding
# two Series pairs rows by index label; `predictions` is indexed 0..n-1 whereas
# X_test comes out of train_test_split, so mixing them can pair a prediction
# with another game's spread. predictions.spread is also the column the
# 'actual' label is built from, so pick and label use the same rows.
predictions['predict_nn'] =  (predictions.y_hat_nn + predictions.spread).map(
    lambda x: 0 if x <0 else 1)

In [None]:
# Ground-truth label: 1 when the real margin plus the spread is not negative,
# 0 when it is negative.
predictions['actual'] = (
    predictions['y_test']
    .add(predictions['spread'])
    .apply(lambda total: int(not total < 0))
)

In [None]:
# Vote count (0-3): how many of the SVR, ridge and neural-network picks are 1.
predictions['predict'] = predictions[
    ['predict_svm', 'predict_rr', 'predict_nn']
].sum(axis = 1)

In [None]:
# Check the first rows with the per-model picks, the label and the vote count.
predictions.head()

In [None]:
# Text report comparing the actual labels with the SVR-based picks.
print(classification_report(predictions.actual, predictions.predict_svm))

In [None]:
# Confusion matrix of the actual labels against the neural-network picks.
confusion_matrix(predictions.actual, predictions.predict_nn)

In [None]:
# Share of test games at each vote count.
predictions['predict'].value_counts(normalize = True)

In [None]:
# Mean of the actual label within each vote-count group.
predictions.groupby(by = ['predict'])['actual'].mean()