# GENERATE PROJECT RESULTS

- Import relevant libraries for executing machine learning models and visualizations
- Load data from previously generated file and parse into numeric arrays for selected features and class labels.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn import preprocessing
import matplotlib.pyplot as plt
from xgboost import plot_tree
from sklearn import metrics
import seaborn as sns
import pandas as pd
import numpy as np

# Lift pandas' display limits so tables are shown without row/column truncation.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Load the previously generated sample file.
data = pd.read_csv('TrainTestSample.csv')

# Display labels for the predictors (used by the tree visualisations further
# down); intended to line up positionally with the columns selected for X.
features = ['Average Minutes', 'NBA Experience', 'Off Days']

# Predictor matrix and class-label vector as plain arrays.
predictor_columns = ['AVGmins', 'Experience', 'OffDays']
X = data[predictor_columns].values
Y = data['Class'].values

- Import relevant packages for machine learning and evaluation metrics. 
- Generate empty lists and dataframes for storage of results following each iteration of training and testing. 
- Set up models with default parameters to establish baseline results and present evaluation metrics in sorted table.
- Generate a visualization of the models' accuracy results relative to one another, shown as mean ± confidence interval over 100 iterations.

In [None]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn import tree
import xgboost as xgb

# Per-model accuracy accumulators: one independent list per classifier,
# appended to once per train/test iteration.
DTACC, RFACC, GBCACC, LRACC, MLPACC, XGBACC = ([] for _ in range(6))

# Per-model score tables. The four columns receive the 4-tuple returned by
# precision_recall_fscore_support; the last column is literally named 'None'.
score_columns = ['Precision', 'Recall', 'F1', 'None']
DTScores, GBCScores, RFScores, LRScores, MLPScores, XGBScores = (
    pd.DataFrame(columns = score_columns) for _ in range(6)
)

def _fit_and_score(model, X_train, X_test, y_train, y_test):
    """Fit `model` on the training split and score it on the test split.

    Returns a 3-tuple:
      * the predictions for X_test,
      * the accuracy as a percentage rounded to two decimals,
      * the tuple returned by precision_recall_fscore_support(average='binary').
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = round(accuracy_score(y_test, predictions) * 100, 2)
    # Computed once and reused by the caller (the original evaluated it twice).
    performance = precision_recall_fscore_support(y_test, predictions, average = 'binary')
    return predictions, accuracy, performance


# Baseline experiment: 100 independent random 80/20 splits; every classifier is
# refit with default parameters on each split and its metrics are accumulated.
for a in range(0, 100):

    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2)

    DT = tree.DecisionTreeClassifier()
    DTPred, DTAccuracy, DTPerf = _fit_and_score(DT, X_train, X_test, y_train, y_test)
    DTACC.append(DTAccuracy)
    DTScores.loc[len(DTScores)] = DTPerf

    RF = RandomForestClassifier()
    RFPred, RFAccuracy, RFPerf = _fit_and_score(RF, X_train, X_test, y_train, y_test)
    RFACC.append(RFAccuracy)
    RFScores.loc[len(RFScores)] = RFPerf

    GBC = GradientBoostingClassifier()
    GBCPred, GBCAccuracy, GBCPerf = _fit_and_score(GBC, X_train, X_test, y_train, y_test)
    GBCACC.append(GBCAccuracy)
    GBCScores.loc[len(GBCScores)] = GBCPerf

    LR = LogisticRegression()
    LRPred, LRAccuracy, LRPerf = _fit_and_score(LR, X_train, X_test, y_train, y_test)
    LRACC.append(LRAccuracy)
    LRScores.loc[len(LRScores)] = LRPerf

    # max_iter raised above the library default for the MLP.
    MLP = MLPClassifier(max_iter = 2000)
    MLPYPred, MLPAccuracy, MLPPerf = _fit_and_score(MLP, X_train, X_test, y_train, y_test)
    MLPACC.append(MLPAccuracy)
    MLPScores.loc[len(MLPScores)] = MLPPerf

    # eval_metric is given to the constructor rather than to fit(): passing it
    # to fit() is deprecated in newer xgboost releases and later removed.
    # NOTE(review): 'rmse' is kept from the original; it is a regression metric
    # and presumably has no effect here because no eval_set is supplied - confirm.
    XGB = xgb.XGBClassifier(use_label_encoder=False, eval_metric='rmse')
    XGBPred, XGBAccuracy, XGBPerf = _fit_and_score(XGB, X_train, X_test, y_train, y_test)
    XGBACC.append(XGBAccuracy)
    XGBScores.loc[len(XGBScores)] = XGBPerf

# Parallel lists: entry i of each list describes the same classifier.
models = ['Decision Tree', 'Random Forest', 'Gradient Boost', 'Logistic Regression', 'Multilayer Perceptron', 'Extreme Gradient Boosting']
shorts = [DTACC, RFACC, GBCACC, LRACC, MLPACC, XGBACC]
scores = [DTScores, RFScores, GBCScores, LRScores, MLPScores, XGBScores]
colNames = ['Model', 'Mean Accuracy', 'Std Accuracy', 'Lower95', 'Upper95', 'Mean Precision', 'Mean Recall', 'Mean F-1 Score']
summary = pd.DataFrame(columns = colNames)

# One summary row per classifier, aggregated over all iterations.
for model_label, accuracies, score_table in zip(models, shorts, scores):

    acc_mean = round(np.mean(accuracies), 2)
    acc_std = round(np.std(accuracies), 2)
    # Bounds are mean +/- 1.96 standard deviations of the per-iteration
    # accuracies, computed from the already-rounded mean and std.
    half_width = 1.96 * acc_std

    summary.loc[len(summary)] = [
        model_label,
        acc_mean,
        acc_std,
        round(acc_mean - half_width, 2),
        round(acc_mean + half_width, 2),
        round(np.mean(score_table['Precision']), 2),
        round(np.mean(score_table['Recall']), 2),
        round(np.mean(score_table['F1']), 2),
    ]

# Best mean accuracy first, with a fresh 0..n-1 index.
summary = summary.sort_values(by = 'Mean Accuracy', ascending = False)
summary = summary.reset_index(drop = True)
# Dot-and-whisker chart: one point per model at its mean accuracy, with
# horizontal whiskers of +/- 1.96 standard deviations.
accuracy_means = summary['Mean Accuracy']
model_labels = summary['Model']
whisker_half_width = summary['Std Accuracy'] * 1.96

plt.scatter(accuracy_means, model_labels, s = 50, linestyle = 'None')
plt.errorbar(accuracy_means, model_labels, xerr = whisker_half_width, linestyle = 'None')

plt.title('Model Performance: Mean ± 95% Confidence Interval', y = 1.1, fontsize = 15, fontweight = 'bold')
plt.xlabel('Accuracy Score', fontsize = 12, fontweight = 'bold')
plt.yticks(fontweight = 'bold')
# Accuracy is stored as a percentage; the axis is limited to 50-100.
plt.xlim(50, 100)

#plt.savefig('ModelPerf.jpg')

In [None]:
# Display the sorted baseline summary table as this cell's output.
summary

# CODE SET UP TO RUN FOCUSED SECOND EXPERIMENT

- Copied from body of code above with focus on decision tree and XGBoost classifiers. 
- The `max_depth` argument is set to the loop variable `a`, left over from an exploratory analysis of how model depth affects the evaluation metrics.
  - It was previously fixed at `2` for the second experiment presented in the 'results' section of the project report.

In [None]:
# Second experiment: reset the accumulators for the two tree-based models.
DTACC = []
XGBACC = []
DTScores = pd.DataFrame(columns = ['Precision', 'Recall', 'F1', 'None'])
XGBScores = pd.DataFrame(columns = ['Precision', 'Recall', 'F1', 'None'])

# Sweep max_depth from 1 to 100. Each depth gets its own fresh random 80/20
# split, so entry i of DTACC / XGBACC is the accuracy at depth i + 1.
for a in range(1, 101):

    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2)

    DT = tree.DecisionTreeClassifier(max_depth = a)
    DT.fit(X_train, y_train)
    DTPred = DT.predict(X_test)
    DTAccuracy = round(accuracy_score(y_test, DTPred) * 100, 2)
    DTACC.append(DTAccuracy)
    # Computed once and reused for the score table (was evaluated twice).
    DTPerf = precision_recall_fscore_support(y_test, DTPred, average = 'binary')
    DTScores.loc[len(DTScores)] = DTPerf

    # eval_metric is given to the constructor rather than to fit(): passing it
    # to fit() is deprecated in newer xgboost releases and later removed.
    XGB = xgb.XGBClassifier(use_label_encoder=False, max_depth = a, eval_metric='rmse')
    XGB.fit(X_train, y_train)
    XGBPred = XGB.predict(X_test)
    XGBAccuracy = round(accuracy_score(y_test, XGBPred) * 100, 2)
    XGBACC.append(XGBAccuracy)
    XGBPerf = precision_recall_fscore_support(y_test, XGBPred, average = 'binary')
    XGBScores.loc[len(XGBScores)] = XGBPerf
    
# Summary table for the second experiment (decision tree and XGBoost only).
# The three lists are parallel and must stay aligned: the original kept the
# six-model lists from the first experiment, which paired
# 'Extreme Gradient Boosting' with the stale random-forest results.
models = ['Decision Tree', 'Extreme Gradient Boosting']
shorts = [DTACC, XGBACC]
scores = [DTScores, XGBScores]
colNames = ['Model', 'Mean Accuracy', 'Std Accuracy', 'Lower95', 'Upper95', 'Mean Precision', 'Mean Recall', 'Mean F-1 Score']
summary = pd.DataFrame(columns = colNames)

for name, accuracies, score_table in zip(models, shorts, scores):

    mean = round(np.mean(accuracies), 2)
    stdv = round(np.std(accuracies), 2)
    # Bounds are mean +/- 1.96 standard deviations of the per-iteration accuracies.
    lowr = round(mean - 1.96 * stdv, 2)
    uppr = round(mean + 1.96 * stdv, 2)
    precision = round(np.mean(score_table['Precision']), 2)
    recall = round(np.mean(score_table['Recall']), 2)
    F1 = round(np.mean(score_table['F1']), 2)
    row = [name, mean, stdv, lowr, uppr, precision, recall, F1]

    summary.loc[len(summary)] = row

# Best mean accuracy first, with a fresh 0..n-1 index.
summary = summary.sort_values('Mean Accuracy', ascending = False).reset_index(drop = True)

summary

In [None]:
# Line chart of test accuracy against tree depth for both models.
depths = list(range(1, 101))

# Plot every `stride`-th point; 1 keeps all of them.
stride = 1

for accuracies, series_label in ((XGBACC, 'XGBoost'), (DTACC, 'Decision Tree')):
    plt.plot(depths[::stride], accuracies[::stride], label = series_label)

plt.legend(loc = 'best')
plt.title('Figure 4: Accuracy by Depth of Model', y = 1.05, fontweight = 'bold', fontsize = 15)
plt.ylabel('Accuracy (%)', fontweight = 'bold')
plt.xlabel('Depth of Tree-Based Model', fontweight = 'bold')
plt.xlim(0, 101)

In [None]:
# Refit both tree-based models at a fixed depth of 4 on whatever train/test
# split is currently in X_train / X_test / y_train / y_test. The accuracy
# lists and score tables are intentionally not appended to here.
DT = tree.DecisionTreeClassifier(max_depth = 4)
DT.fit(X_train, y_train)
DTPred = DT.predict(X_test)
DTAccuracy = round(accuracy_score(y_test, DTPred) * 100, 2)
DTPerf = precision_recall_fscore_support(y_test, DTPred, average = 'binary')

# eval_metric is given to the constructor rather than to fit(): passing it to
# fit() is deprecated in newer xgboost releases and later removed.
XGB = xgb.XGBClassifier(use_label_encoder=False, max_depth = 4, eval_metric='rmse')
XGB.fit(X_train, y_train)
XGBPred = XGB.predict(X_test)
XGBAccuracy = round(accuracy_score(y_test, XGBPred) * 100, 2)
XGBPerf = precision_recall_fscore_support(y_test, XGBPred, average = 'binary')

# THE REST OF THIS NOTEBOOK IS FIGURES FOR USE IN PROJECT REPORT

In [None]:
# Confusion matrix of the XGBoost predictions on the current test split.
# confusion_matrix(y_true, y_pred) puts the true labels on the rows and the
# predictions on the columns, and the heatmap draws rows along the y axis and
# columns along the x axis - so x is "Predicted" and y is "Actual" (the
# original labels were swapped).
cm = confusion_matrix(y_test, XGBPred)
cmp = sns.heatmap(cm, annot = True, fmt = 'g', cmap = 'Blues', cbar = False, annot_kws={"fontsize":12})
cmp.set_xlabel('Predicted Class', fontsize = 15, fontweight = 'bold', labelpad = 12)
cmp.set_ylabel('Actual Class', fontsize = 15, fontweight = 'bold', labelpad = 12)
cmp.set_title('Confusion Matrix for XGBoost Model', fontsize = 20, fontweight = 'bold', y = 1.05)

In [None]:
# Imported here so the cell runs on its own; in the original the import only
# appears in a later cell, so running the notebook top to bottom fails here.
from dtreeviz.trees import dtreeviz

# Render the fitted decision tree against the full data set.
# class_names is ordered by class value: elsewhere in this file Class == 0 is
# treated as the control group and Class == 1 as injured, so 'Control' must
# come first (the original list was reversed).
viz = dtreeviz(DT, X, Y,
                target_name="Class",
                feature_names=features,
                class_names=['Control', 'Injured'])

viz
#viz.save("RegDTree3.svg")

In [None]:
# Draw a tree of the fitted XGBoost model left-to-right, taking feature names
# from 'feature_map.txt', then enlarge the figure and write it to disk.
plot_tree(XGB, fmap = 'feature_map.txt', rankdir = 'LR')

tree_figure = plt.gcf()
tree_figure.set_size_inches(150, 100)
tree_figure.savefig('XGBTree3.png')

In [None]:
# 3-D scatter of the three predictors, one colour per class.
fig = plt.figure(figsize = (12, 12))
ax = fig.add_subplot(111, projection = '3d')

# Rows with Class == 1 are plotted as 'Injured', rows with Class == 0 as 'Control'.
injData = data[data['Class'] == 1]
conData = data[data['Class'] == 0]

# Grouped view of the data; not referenced again in the visible cells.
cat = data.groupby('Class')

# Same drawing order as before: injured points first, then controls.
class_layers = (
    (injData, 'orange', 'Injured'),
    (conData, 'blue', 'Control'),
)
for subset, colour, legend_label in class_layers:
    ax.scatter(subset['Experience'], subset['AVGmins'], subset['OffDays'],
               c = colour, s = 60, label = legend_label)

# Camera position passed to view_init.
ax.view_init(30, 30)

ax.set_title('Injury Class Using Three Predictors', y= 1.05, fontsize = 25, fontweight = 'bold')

axis_label_style = dict(fontsize = 15, labelpad = 20)
ax.set_xlabel('Years of NBA Experience', **axis_label_style)
ax.set_ylabel('Average Minutes Per Game', **axis_label_style)
ax.set_zlabel('Rest Days in Last 21 Days', **axis_label_style)

ax.legend(fontsize = 15, loc=1, bbox_to_anchor=(0.1,0.8))

In [None]:
#import graphviz
#from xgboost import plot_tree

#plt.figure(figsize = (50, 50))
#plot_tree(XGB, fontsize = 20)
#plt.show()

# Draw a tree of the fitted XGBoost model left-to-right and save it as a
# very large image (150 x 100 inches).
plot_tree(XGB, rankdir = 'LR')

fig = plt.gcf()
figure_size_inches = (150, 100)
fig.set_size_inches(*figure_size_inches)
fig.savefig('XGBTree.png')

In [None]:
#!pip install dtreeviz

from dtreeviz.trees import dtreeviz

# Render the fitted decision tree against the full data set and save it as SVG.
# class_names is ordered by class value: elsewhere in this file Class == 0 is
# treated as the control group and Class == 1 as injured, so 'Control' must
# come first (the original list was reversed).
viz = dtreeviz(DT, X, Y,
                target_name="Class",
                feature_names=features,
                class_names=['Control', 'Injured'])

# The bare `viz` expression that used to sit here was not the last statement
# of the cell, so it displayed nothing; only the save is kept.
viz.save("RegDTree.svg")

In [None]:
# Draw a tree of the fitted XGBoost model left-to-right and enlarge the figure
# (nothing is written to disk in this cell).
plot_tree(XGB, rankdir = 'LR')

width_in, height_in = 150, 100
fig = plt.gcf()
fig.set_size_inches(width_in, height_in)
#fig.savefig('XGBTree2.png')

In [None]:
# Keep only the three predictors plus the class label.
plot_columns = ['AVGmins', 'Experience', 'OffDays', 'Class']
plotData = data[plot_columns]

# Per-class mean and standard deviation of each predictor (one row per class).
plotDataMeans = plotData.groupby('Class').mean()
plotDataStdvs = plotData.groupby('Class').std()

#Injured = plotData[plotData['Class'] == 1]
#Injured = Injured.drop('Class', axis = 1)
#Control = plotData[plotData['Class'] == 0]
#Control = Control.drop('Class', axis = 1)

In [None]:
# Inputs for the grouped bar chart drawn in a later cell.

# Predictor names, used as tick labels.
x = plotDataMeans.columns

# First and second row of the per-class tables: bar heights and error-bar sizes.
y1, y2 = plotDataMeans.iloc[0], plotDataMeans.iloc[1]
yerr1, yerr2 = plotDataStdvs.iloc[0], plotDataStdvs.iloc[1]

# Integer x positions, one per predictor.
z = np.arange(len(x))


In [None]:
# Display the per-class predictor means as this cell's output.
plotDataMeans

In [None]:
# Grouped bar chart: for every predictor, two side-by-side bars (one per class
# row) with a black error bar of one standard deviation on each.
bar_width = 0.4
bar_groups = (
    (-0.2, y1, yerr1, 'blue'),
    (0.2, y2, yerr2, 'orange'),
)
for offset, heights, spreads, colour in bar_groups:
    plt.bar(z + offset, heights, bar_width, color = colour)
    plt.errorbar(z + offset, heights, yerr = spreads, fmt = "o", color = 'black')

plt.xticks(z, x, fontsize = 12, fontweight = 'bold')
plt.yticks(fontsize = 10, fontweight = 'bold')
plt.title('Qualitative Between-Group Comparison', y = 1.05, fontsize = 15, fontweight = 'bold')

# The bars carry no labels, so the legend is built by hand from colour swatches.
colors = {'Controls':'blue', 'Injured':'orange'}
labels = list(colors)
handles = [plt.Rectangle((0, 0), 1, 1, color = swatch) for swatch in colors.values()]
plt.legend(handles, labels)