# NHL Draft Model

This notebook will build a classification model to predict future success of NHL draft prospects.

In [16]:
#import packages
import pandas as pd
import numpy as np
import math
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#model imports
from sklearn.model_selection import cross_val_score, KFold,train_test_split
from sklearn.metrics import roc_auc_score, r2_score, confusion_matrix, accuracy_score, classification_report, precision_score, recall_score, f1_score, matthews_corrcoef, ConfusionMatrixDisplay
from sklearn import metrics

#logistic import
#import statsmodels.api as sm

#random forest import
#from sklearn.ensemble import RandomForestRegressor
#from sklearn.model_selection import GridSearchCV

#import xgboost as xgb
#from xgboost import XGBRegressor

#SMOTE
#from imblearn.over_sampling import SMOTE

#OLS
#import statsmodels.formula.api as smf
#import statsmodels.tools.eval_measures as smf_metrics

#random forest import
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

#cross validaiton import
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

import xgboost as xgb
from xgboost import XGBClassifier

#SMOTE
from imblearn.over_sampling import SMOTE

In [17]:
#read in data
df = pd.read_excel("C:/Users/BRG4142/Documents/hockey stuff/NHL Draft data/Model ready data/NHL_draft_clean_df.xlsx")

In [18]:
df.head()

Unnamed: 0.1,Unnamed: 0,OVERALL,NHL_TEAM,PLAYER,AGE,PS,YEAR,POS2,ROUND,DRAFT_YEAR,...,PS_PG,HT_CAT,HT_CAT2,GP_STANDARD,G_STANDARD,A_STANDARD,PTS_STANDARD,GPG_STANDARD,APG_STANDARD,PPG_STANDARD
0,0,1,Edmonton Oilers,Taylor Hall,18,74.2,2010,W,1,2010,...,0.097503,6.01,3,-0.393029,2.531579,3.548666,3.365403,2.945029,3.894737,3.798937
1,1,2,Boston Bruins,Tyler Seguin,18,84.4,2010,C,1,2010,...,0.102303,6.01,3,0.328125,3.373684,2.920722,3.365403,3.346032,2.764928,3.262241
2,2,3,Florida Panthers,Erik Gudbranson,18,16.6,2010,D,1,2010,...,0.025897,6.04,5,-2.011481,-0.68642,0.270814,0.012775,-0.520325,1.286031,0.852265
3,3,4,Columbus Blue Jackets,Ryan Johansen,18,52.3,2010,C,1,2010,...,0.066455,6.02,4,1.044803,1.279395,1.931174,1.761375,0.818341,1.49582,1.339437
4,4,5,New York Islanders,Nino Niederreiter,18,42.8,2010,W,1,2010,...,0.05847,6.02,4,0.507168,2.559953,0.311741,1.30637,2.290848,0.156314,1.176923


In [19]:
df.shape

(1187, 35)

In [20]:
#examine target
df['REACH_NHL'].value_counts(normalize=True)

1    0.537489
0    0.462511
Name: REACH_NHL, dtype: float64

In [21]:
df['NHL_REGULAR'].value_counts(normalize=True)

0    0.711036
1    0.288964
Name: NHL_REGULAR, dtype: float64

In [22]:
#drop if age > 18
df= df[(df['AGE'] <= 18) & (df['POS2'] != 'D')] #forwards
#df= df[(df['AGE'] <= 18) & (df['POS2'] == 'D')] #defense

In [23]:
#get categories
df['POS2'] = df['POS2'].astype('category').cat.codes
#df['AGE'] = df['AGE'].astype('category').cat.codes
df['HT_CAT'] = df['HT_CAT'].astype('category').cat.codes
df['LGE_CAT'] = df['LGE2'].astype('category').cat.codes

### Train/Test Split

In [24]:
#train/test split
X = df[[
    'PLAYER',
    'LGE2',
    'PRO_LEAGUE',
    'ROUND',
    'OVERALL',
    'PS',
    'POS2', 
    'HT_CAT', 
    'LGE_CAT', 
    'GP_STANDARD', 
    'G_STANDARD', 
    'A_STANDARD',
    'PTS_STANDARD',
    'GPG_STANDARD', 
    'APG_STANDARD',
    'PPG_STANDARD'
        ]]


#y= df['REACH_NHL'] #reach nhl target
y= df['NHL_REGULAR'] #nhl regular target

In [25]:
X_train_output, X_test_output, y_train, y_test = train_test_split(X, y, test_size=.25)

In [38]:
X_train = X_train_output[['POS2', 
        'HT_CAT', 
        'LGE_CAT',
        #'PRO_LEAGUE',
        #'GP_STANDARD', 
        'G_STANDARD', 
        'A_STANDARD',
        'PTS_STANDARD',
        'GPG_STANDARD', 
        'APG_STANDARD',
        'PPG_STANDARD']]

X_test = X_test_output[['POS2', 
        'HT_CAT', 
        'LGE_CAT', 
        #'PRO_LEAGUE',
        #'GP_STANDARD', 
        'G_STANDARD', 
        'A_STANDARD',
        'PTS_STANDARD',
        'GPG_STANDARD', 
        'APG_STANDARD',
        'PPG_STANDARD']]

### SMOTE

In [39]:
#from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=9)
X_res, y_res = sm.fit_resample(X_train, y_train)

### Random Forest

In [40]:
#specify model
model = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_leaf=1, random_state=99)

#Fit your model on the features (X) and the target (y)
model.fit(X_train, y_train)
#model.fit(X_res, y_res) #smote

RandomForestClassifier(max_depth=10, random_state=99)

In [41]:
y_pred = model.predict(X_test) #returns 0 or 1
pred_proba = model.predict_proba(X_test)[:,1] # returns predicted probabilites 
print("Accuracy", accuracy_score(y_test, y_pred).round(3))
print("AUC", roc_auc_score(y_test, pred_proba).round(3))
print(confusion_matrix(y_test, y_pred))

Accuracy 0.664
AUC 0.695
[[79 17]
 [34 22]]


In [42]:
#print out accuracy measures
print("The model used is Random Forest classifier")
acc= accuracy_score(y_test,y_pred).round(3)
print("The accuracy is  {}".format(acc))
prec= precision_score(y_test,y_pred).round(3)
print("The precision is {}".format(prec))
rec= recall_score(y_test,y_pred).round(3)
print("The recall is {}".format(rec))
f1= f1_score(y_test,y_pred).round(3)
print("The F1-Score is {}".format(f1))

The model used is Random Forest classifier
The accuracy is  0.664
The precision is 0.564
The recall is 0.393
The F1-Score is 0.463


In [32]:
#feature importance
feature_scores = pd.Series(model.feature_importances_, index=X_train.columns).sort_values(ascending=False)
feature_scores

PTS_STANDARD    0.153481
PPG_STANDARD    0.153237
A_STANDARD      0.151437
APG_STANDARD    0.135522
G_STANDARD      0.134590
GPG_STANDARD    0.128211
LGE_CAT         0.084304
HT_CAT          0.059216
dtype: float64

In [33]:
#Grid search
#print('Processing GridSearch. Please hold for the next available set of outputs.\n')
#parameters = {"max_depth": [5,10,15,20,25]
#             ,"min_samples_split" :[2,3,4]
#             ,"n_estimators" : [10, 20, 50, 100]
#             ,"min_samples_leaf": [1,2,3]
#             ,"criterion": ('gini','entropy')}

#rf = RandomForestClassifier(random_state=99)
#gd_model = GridSearchCV(rf, parameters, n_jobs = -1, cv=5)
#gd_model.fit(X_train,y_train)

#print(gd_model.best_params_)
#print(gd_model.best_estimator_)
#print(gd_model.best_score_)

### XGBoost Model

In [34]:
# fit model
#xgb_model = XGBClassifier()

#tune parameters
xgb_model = XGBClassifier(
 learning_rate =0.07,
 n_estimators=1000,
 max_depth=10,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=1)

#xgb_model.fit(X_train, y_train) 
xgb_model.fit(X_res, y_res) 





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8,
              enable_categorical=False, gamma=0, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.07, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=1000, n_jobs=4, nthread=4, num_parallel_tree=1,
              predictor='auto', random_state=1, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, seed=1, subsample=0.8, tree_method='exact',
              validate_parameters=1, verbosity=None)

In [35]:
# make predictions for test data
y_pred_xgb = xgb_model.predict(X_test)
predictions = [round(value) for value in y_pred_xgb]

In [36]:
# evaluate predictions
accuracy_xgb = accuracy_score(y_test, predictions)
pred_proba_xgb = xgb_model.predict_proba(X_test)[:,1] # returns predicted probabilites
#print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("Accuracy", accuracy_score(y_test, predictions).round(3))
print("AUC", roc_auc_score(y_test, pred_proba_xgb).round(3))
print(confusion_matrix(y_test, predictions))

Accuracy 0.612
AUC 0.643
[[65 31]
 [28 28]]


In [37]:
#print out accuracy measures
print("The model used is XGBoost")
acc= accuracy_score(y_test,y_pred_xgb).round(3)
print("The accuracy is  {}".format(acc))
prec= precision_score(y_test,y_pred_xgb).round(3)
print("The precision is {}".format(prec))
rec= recall_score(y_test,y_pred_xgb).round(3)
print("The recall is {}".format(rec))
f1= f1_score(y_test,y_pred_xgb).round(3)
print("The F1-Score is {}".format(f1))

The model used is XGBoost
The accuracy is  0.612
The precision is 0.475
The recall is 0.5
The F1-Score is 0.487


## Output predictions

In [57]:
#append test dataset, target from test dataset, and predicted probabilty of finding together
df_pred = pd.DataFrame(X_test_output)
df_pred['Target'] = y_test
df_pred['Probability'] = pred_proba[:]  

In [58]:
#output to excel
df_pred.to_excel("C:/Users/BRG4142/Documents/hockey stuff/NHL Draft data/Model ready data/NHL_REGULAR_model_predictions.xlsx",
             sheet_name='Sheet1')