In [713]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import timeit

In [714]:
df = pd.read_csv('data/nba_dataset.csv')

In [715]:
#OneHotEncode Positions and drop columns
df = pd.concat([df, pd.get_dummies(df['Pos'])],axis=1)

#Group my percentage ranges.
group = pd.cut(df['%_of_cap'],[0,0.1,0.15,0.2,0.25,df['%_of_cap'].max()], 
               labels=['0-10%','10%-15%','15%-20%','20%-25%','25%<='])
df['group']=group

df.drop(['Pos', '%_of_cap'], axis=1, inplace=True)

In [716]:
feature_names = ['Age', 'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'TRB%', 'AST%', 'STL%',
                 'BLK%', 'TOV%', 'USG%', 'WS', 'BPM', 'C', 'PF', 'PG', 'SF', 'SG']

X = np.array(df.iloc[:,0:20])
y = np.array(df.group)

In [717]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_validate, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline

#Create dictionary of classifiers to try
classifiers = {'DecisionTree': DecisionTreeClassifier(random_state=34),  
               'RandomForest': RandomForestClassifier(random_state=34), 
               'SVC': SVC(random_state=34),   
               'AdaBoost': AdaBoostClassifier(random_state=34),
               'XGBoost': XGBClassifier(random_state=34)}

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 34)


In [744]:
model_eval = pd.DataFrame({})
for key, classifier in classifiers.items():
    pipe = Pipeline(steps=[('minmax', MinMaxScaler()),
                           ('classifier', classifier)])
    kfold = KFold(n_splits=5, random_state=34)
    scores = cross_validate(pipe, X_train, y_train, cv=kfold) #Use cross_val_score to get the mean of each fold and standard deviation
    model_eval = model_eval.append({'classifier':key,    
                                    'mean_fit_time':scores['fit_time'].mean(),
                                    'mean_score_time':scores['score_time'].mean(),                       
                                    'mean_test_score':scores['test_score'].mean(),
                                    'std_test_score':scores['test_score'].std(),
                                    'mean_train_score':scores['train_score'].mean(),
                                    'std_train_score':scores['train_score'].mean()}, ignore_index=True)

In [814]:
model_eval[['classifier', 'mean_fit_time', 'mean_test_score', 
            'std_test_score', 'mean_train_score']]

Unnamed: 0,classifier,mean_fit_time,mean_test_score,std_test_score,mean_train_score
0,DecisionTree,0.095745,0.666398,0.00784,1.0
1,RandomForest,0.174659,0.74554,0.009935,0.983501
2,SVC,1.227508,0.730651,0.011307,0.731388
3,AdaBoost,0.874373,0.74715,0.009784,0.758417
4,XGBoost,5.135772,0.758417,0.011314,0.793427


In [873]:
classifiers_opt = {'RandomForest': RandomForestClassifier(random_state=34),             
                   'AdaBoost': AdaBoostClassifier(random_state=34)}
model_eval2 = pd.DataFrame({})

for key, classifier in classifiers_opt.items():
    model_def = Pipeline(steps=[('minmax', MinMaxScaler()),                          
                                ('classifier', classifier)])
    model_def.fit(X_train, y_train)
    model_eval2 = model_eval2.append({'classifier':key,    
                                    'accuracy_score':model_def.score(X_test, y_test),
                                     'cm':confusion_matrix(y_test,model_def.predict(X_test)),
                                     'class_report': classification_report(y_test,model_def.predict(X_test),output_dict=True)}, ignore_index=True)

In [905]:
model_eval2[['classifier', 'accuracy_score']]

Unnamed: 0,classifier,accuracy_score
0,RandomForest,0.757344
1,AdaBoost,0.755332


In [903]:
#Unpacking Classification Report
metrics = pd.DataFrame({})

classification_report = model_eval2[['classifier',                                          
                                     'class_report']].set_index('classifier').to_dict()

for key_1, val_1 in classification_report.items():
    for key_2, val_2 in val_1.items():
        for key_3, val_3 in val_2.items():
            for key_4, val_4 in val_3.items():
                metrics = metrics.append({'classifier':key_2,                                  
                                          'group':key_3,                                  
                                         'metric':key_4, 
                                         'value':val_4},ignore_index=True)

#Create pivot table of classification table
classification_table = pd.pivot_table(metrics,                        
                                      values='value',                      
                                      index=['classifier', 'metric'],                      
                                     columns='group')

In [904]:
classification_table

Unnamed: 0_level_0,group,0-10%,10%-15%,15%-20%,20%-25%,25%<=,macro avg,micro avg,weighted avg
classifier,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AdaBoost,f1-score,0.89353,0.182278,0.091837,0.10596,0.463576,0.347436,0.755332,0.710217
AdaBoost,precision,0.83365,0.262774,0.36,0.145455,0.426829,0.405742,0.755332,0.692609
AdaBoost,recall,0.962678,0.139535,0.052632,0.083333,0.507246,0.349085,0.755332,0.755332
AdaBoost,support,1822.0,258.0,171.0,96.0,138.0,2485.0,2485.0,2485.0
RandomForest,f1-score,0.890909,0.188586,0.12987,0.165414,0.444444,0.363845,0.757344,0.712802
RandomForest,precision,0.82507,0.262069,0.25,0.297297,0.514286,0.429744,0.757344,0.689398
RandomForest,recall,0.968167,0.147287,0.087719,0.114583,0.391304,0.341812,0.757344,0.757344
RandomForest,support,1822.0,258.0,171.0,96.0,138.0,2485.0,2485.0,2485.0


Hyperparameter Tuning

Will begin with using GridSearch for AdaBoost Classifier

In [887]:
param_grid_ada = {'learning_rate':[.001, .01, .1],                                
                  'n_estimators':[500, 1000, 2000],                                                   
                  'random_state':[34]}

kfold = KFold(n_splits=5, random_state=34)
grid = GridSearchCV(AdaBoostClassifier(), param_grid = param_grid_ada ,cv=kfold)

pipe_ada = Pipeline(steps = [('minmax', MinMaxScaler()),
                            ('grid', grid)])
pipe_ada.fit(X_train, y_train)

#grid = grid.fit(X_train, y_train)


Pipeline(memory=None,
     steps=[('minmax', MinMaxScaler(copy=True, feature_range=(0, 1))), ('grid', GridSearchCV(cv=KFold(n_splits=5, random_state=34, shuffle=False),
       error_score='raise-deprecating',
       estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimator...   pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0))])

In [893]:
ada_predict = pipe_ada.predict(X_test)
print('Model Accuracy: ', accuracy_score(y_test, ada_predict))
print('Training Score: ' , pipe_ada.score(X_train, y_train))
print('Best Parameters: ', pipe_ada.steps[1][1].best_params_)

Model Accuracy:  0.7637826961770624
Training Score:  0.7691482226693495
Best Parameters:  {'learning_rate': 0.1, 'n_estimators': 1000, 'random_state': 34}


In [895]:
print(confusion_matrix(y_test, ada_predict))

[[1762   33    2   11   14]
 [ 192   39    8    4   15]
 [  84   38   11   10   28]
 [  36   15    9   14   22]
 [  18   23    6   19   72]]


In [894]:
print(classification_report(y_test, ada_predict))

              precision    recall  f1-score   support

       0-10%       0.84      0.97      0.90      1822
     10%-15%       0.26      0.15      0.19       258
     15%-20%       0.31      0.06      0.11       171
     20%-25%       0.24      0.15      0.18        96
       25%<=       0.48      0.52      0.50       138

   micro avg       0.76      0.76      0.76      2485
   macro avg       0.43      0.37      0.38      2485
weighted avg       0.70      0.76      0.72      2485



In [897]:
features_ada = pd.DataFrame()
features_ada['feature'] = feature_names
features_ada['weight'] = pipe_ada.steps[1][1].best_estimator_.feature_importances_
features_ada.set_index('feature').sort_values('weight', ascending=False)

Unnamed: 0_level_0,weight
feature,Unnamed: 1_level_1
MP,0.162
G,0.139
Age,0.117
TS%,0.078
TRB%,0.078
BPM,0.067
USG%,0.066
FTr,0.054
TOV%,0.05
PER,0.039


In [898]:
param_grid_rf = {'max_features':['auto', 'sqrt', 'log2'],                                  
                 'n_estimators':[500, 1000, 2000],  
                 'max_depth':[2, 3, 5, 8],
                 'random_state':[34]}

kfold = KFold(n_splits=5, random_state=34)
grid_rf = GridSearchCV(RandomForestClassifier(), param_grid = param_grid_rf ,cv=kfold)

pipe_rf = Pipeline(steps = [('minmax', MinMaxScaler()),
                            ('grid_rf', grid_rf)])
pipe_rf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('minmax', MinMaxScaler(copy=True, feature_range=(0, 1))), ('grid_rf', GridSearchCV(cv=KFold(n_splits=5, random_state=34, shuffle=False),
       error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=...   pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0))])

In [899]:
rf_predict = pipe_rf.predict(X_test)

print('Model Accuracy: ', accuracy_score(y_test, rf_predict))
print('Training Score: ' , pipe_rf.score(X_train, y_train))
print('Best Parameters: ', pipe_rf.steps[1][1].best_params_)

Model Accuracy:  0.7625754527162978
Training Score:  0.794634473507713
Best Parameters:  {'max_depth': 8, 'max_features': 'auto', 'n_estimators': 2000, 'random_state': 34}


In [859]:
print(confusion_matrix(y_test, rf_predict))

[[1803    6    1    1   11]
 [ 226   16    5    0   11]
 [ 126   10    4    1   30]
 [  63    5    0    0   28]
 [  50   10    5    1   72]]


In [858]:
print(classification_report(y_test, rf_predict))

              precision    recall  f1-score   support

       0-10%       0.79      0.99      0.88      1822
     10%-15%       0.34      0.06      0.10       258
     15%-20%       0.27      0.02      0.04       171
     20%-25%       0.00      0.00      0.00        96
       25%<=       0.47      0.52      0.50       138

   micro avg       0.76      0.76      0.76      2485
   macro avg       0.38      0.32      0.31      2485
weighted avg       0.66      0.76      0.69      2485



In [826]:
features_rf = pd.DataFrame()
features_rf['feature'] = feature_names
features_rf['weight'] = grid_rf.best_estimator_.feature_importances_
features_rf.set_index('feature').sort_values('weight', ascending=False)

Unnamed: 0_level_0,weight
feature,Unnamed: 1_level_1
Age,0.179548
WS,0.131209
MP,0.129975
PER,0.099521
USG%,0.081938
BPM,0.077241
G,0.046788
TS%,0.036189
TRB%,0.036059
AST%,0.034827


In [695]:
from keras import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
from keras.models import load_model

In [696]:
n_cols = X.shape[1]

y_train = pd.get_dummies(y_train)

from keras.callbacks import EarlyStopping

early_stopping_monitor = EarlyStopping(patience=2)

In [717]:
model_1 = Sequential()

model_1.add(Dense(21, activation='relu', input_shape=(n_cols,)))

model_1.add(Dense(21, activation='relu'))

model_1.add(Dense(5, activation='softmax'))

model_1.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model_1.fit(X_train, y_train, validation_split=0.3,
        epochs=50, callbacks=[early_stopping_monitor])

Train on 5218 samples, validate on 2237 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50


<keras.callbacks.callbacks.History at 0x1abb5a0978>

In [718]:
model_1.save('model_1.h5')

In [720]:
load_1 = load_model('model_1.h5')

y_prediction = load_1.predict(X_test)

#Converting predictions to label
pred = list()
for i in range(len(y_prediction)):
    pred.append(np.argmax(y_prediction[i]))
    
print('Accuracy for NN: Model_1: %.2f%%' % (accuracy_score(pred,y_test)*100))

Accuracy for NN: Model_1: 5.96%


In [721]:
model_2 = Sequential()
model_2.add(Dense(10, activation='relu', input_shape=(n_cols,)))
model_2.add(Dense(10, activation='relu'))
model_2.add(Dense(5, activation='softmax'))
model_2.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model_2.fit(X_train, y_train, validation_split=0.3,
        epochs=50, callbacks=[early_stopping_monitor])

Train on 5218 samples, validate on 2237 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50


<keras.callbacks.callbacks.History at 0x1ad6211518>

In [722]:
model_2.save('model_2.h5')

In [724]:
load_2 = load_model('model_2.h5')

y_prediction2 = load_2.predict(X_test)

#Converting predictions to label
pred2 = list()
for i in range(len(y_prediction2)):
    pred2.append(np.argmax(y_prediction2[i]))
    
print('Accuracy for NN: Model_1: %.2f%%' % (accuracy_score(pred2,y_test)*100))

Accuracy for NN: Model_1: 7.57%


In [727]:
model_3 = Sequential()
model_3.add(Dense(21, activation='relu', input_shape=(n_cols,)))
model_3.add(Dense(21, activation='relu'))
model_3.add(Dense(21, activation='relu'))
model_3.add(Dense(5, activation='softmax'))
model_3.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model_3.fit(X_train, y_train, validation_split=0.3,
        epochs=50, callbacks=[early_stopping_monitor])

model_3.save('model_3.h5')

load_3 = load_model('model_3.h5')

y_prediction3 = load_3.predict(X_test)

#Converting predictions to label
pred3 = list()
for i in range(len(y_prediction3)):
    pred3.append(np.argmax(y_prediction3[i]))
    
print('Accuracy for NN: Model_1: %.2f%%' % (accuracy_score(pred3,y_test)*100))

Train on 5218 samples, validate on 2237 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Accuracy for NN: Model_1: 4.19%


In [1]:
model_performance

NameError: name 'model_performance' is not defined