# Notebook for testing and optimising single tree model

In [None]:
import sys
sys.path.insert(0, '../notebooks')

from Build_ML_df import *
#generate dataframe for ML
#df_for_ML is not defined in this notebook; presumably it is provided by the
#star import of Build_ML_df above - verify against that module
ML_df = df_for_ML().merged_structural()

In [None]:
#columns kept for modelling: the two target columns (BINARY_PHENOTYPE, MIC)
#followed by the feature columns
selected_columns = [
    'BINARY_PHENOTYPE', 'MIC', 'Ligand0_Distance', 'Ca_Distance',
    'cardiolipin_Distance', 'Depth', 'lipid_head_dis', 'lipid_tail_dis',
    'dG_stability', 'd_volume', 'd_MW', 'd_hydropathy', 'Pi', 'MAPP',
    'H', 'B', 'E', 'G', 'I', 'T', 'S', 'NaN',
]
df = ML_df[selected_columns].copy()

#foldx stability values are cast to float (a value that cannot be cast raises),
#then every row containing a missing value is removed
df['dG_stability'] = df['dG_stability'].astype(float, errors='raise')
df.dropna(inplace=True)

#intermediate ('I') phenotypes are relabelled as resistant ('R')
df['BINARY_PHENOTYPE'] = ['R' if phenotype == 'I' else phenotype
                          for phenotype in df['BINARY_PHENOTYPE']]

#numpy feature matrix for ML training: every column after the two target columns
#(built before any extra label column is appended to df)
data_array = df[df.columns[2:]].to_numpy()

#0/1 binary phenotype column: 'R' -> 0, anything else -> 1
df['BF'] = [0 if phenotype == 'R' else 1
            for phenotype in df['BINARY_PHENOTYPE']]

## Direct binary classification

#### Strategy for finding best performing model:
1) Grid search for best performing preprocessing and parameters for *ACCURACY*    
2) Plot feature importance chart
3) Grid search for best performing preprocessing and parameters for *average precision*
4) Grid search for best performing preprocessing and parameters for *ROC AUC*
5) Generate precision-recall curve with best parameters for *average precision*   
6) Generate ROC curve with best parameters for *ROC AUC*   
7) Generate confusion matrix with best parameters for either average precision or ROC AUC (these tend to have the same best performing parameters)   


### 1) Grid search for best performing preprocessing and parameters for ACCURACY

In [None]:
#no preprocessing - therefore, do not see why I would need a pipeline

from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import accuracy_score, average_precision_score, roc_auc_score

#base estimator and hyper-parameter grid for the binary (0/1 'BF') grid searches
tree = DecisionTreeClassifier(random_state=0)
#'auto' is left out of max_features: for DecisionTreeClassifier it was an alias
#of 'sqrt' (so it only duplicated that grid point), it was deprecated in
#scikit-learn 1.1 and newer releases reject it
param_grid = {'max_depth': [2, 4, 6, 8, 10, 12, 14, None],
              'min_samples_split': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
              'min_samples_leaf': [0.1, 0.2, 0.3, 0.4, 0.5],
              'max_features': ['sqrt', 'log2', None]}
#hold out a test set; random_state=0 keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(data_array, df['BF'],
                                                    random_state=0)

In [None]:
#Grid search (default scoring of the estimator) with shuffled 5-fold cross-validation
Kfold_shuffle = KFold(n_splits=5, shuffle=True, random_state=0)
grid_kfold_shuffle = GridSearchCV(estimator=tree, param_grid=param_grid,
                                  cv=Kfold_shuffle)
grid_kfold_shuffle.fit(X_train, y_train)

#report the selected estimator and its cross-validation score
print('grid_kfold_shuffle: best estimator: \n', grid_kfold_shuffle.best_estimator_)
print('grid_kfold_shuffle: best cross-validation score: ', grid_kfold_shuffle.best_score_)

#accuracy of the refitted best estimator on the held-out test set
test_predictions = grid_kfold_shuffle.predict(X_test)
print('grid_kfold_shuffle: test set average accuracy: ',
      accuracy_score(y_test, test_predictions), '\n')


### 2) Generate feature importance plots 

In [None]:
from matplotlib import pyplot as plt
import numpy as np

#define trees with best performing parameters, or default parameters
#max_features='sqrt' is used instead of the alias 'auto': both mean sqrt(n_features)
#for a DecisionTreeClassifier, but 'auto' is rejected by newer scikit-learn releases
tree_best_params = DecisionTreeClassifier(max_depth=2, max_features='sqrt', min_samples_leaf=0.1,
                       min_samples_split=0.1, random_state=0)
tree_no_params = DecisionTreeClassifier(random_state=0)
tree_best_params.fit(X_train, y_train)
tree_no_params.fit(X_train, y_train)

#plot charts
def plot_feature_importances(model):
    """Draw a horizontal bar chart of ``model.feature_importances_``.

    ``model`` must be a fitted estimator exposing ``feature_importances_``
    with one value per column of the module-level ``data_array``.
    """
    n_features = data_array.shape[1]
    # data_array was built from df.columns[2:], so the feature names are the
    # n_features columns starting at position 2. Slicing by count (instead of
    # a negative end index) stays correct however many label columns have
    # been appended to df since then.
    feature_names = df.columns[2:2 + n_features]
    positions = np.arange(n_features)
    plt.barh(positions, model.feature_importances_, align='center')
    plt.yticks(positions, feature_names)
    plt.xlabel('feature importance')
    plt.ylabel('feature')
    plt.ylim(-1, n_features)
    plt.title('Feature importance plot')

plot_feature_importances(tree_no_params)

In [None]:
#feature importance chart for the tree fitted with the hard-coded "best" parameters
plot_feature_importances(tree_best_params)

### 3) Grid search for best performing parameters for AVERAGE PRECISION SCORE

In [None]:
#Grid search scored by average precision, with shuffled 5-fold cross-validation
kfold_shuffle = KFold(n_splits=5, shuffle=True, random_state=0)
grid_kfold_shuffle = GridSearchCV(estimator=tree, param_grid=param_grid,
                                  scoring='average_precision', cv=kfold_shuffle)
grid_kfold_shuffle.fit(X_train, y_train)
print('grid_kfold_shuffle: best estimator: \n', grid_kfold_shuffle.best_estimator_)
print('grid_kfold_shuffle: best cross-validation score: ', grid_kfold_shuffle.best_score_)

#average precision is computed from scores rather than hard labels:
#use the second predict_proba column for the held-out test set
test_scores = grid_kfold_shuffle.predict_proba(X_test)[:, 1]
print('grid_kfold_shuffle test set average precision: ',
      average_precision_score(y_test, test_scores), '\n')

### 4) Grid search for best performing preprocessing and parameters for ROC AUC

In [None]:
#Grid search scored by ROC AUC, with shuffled 5-fold cross-validation
kfold_shuffle = KFold(n_splits=5, shuffle=True, random_state=0)
grid_kfold_shuffle = GridSearchCV(estimator=tree, param_grid=param_grid,
                                  scoring='roc_auc', cv=kfold_shuffle)
grid_kfold_shuffle.fit(X_train, y_train)
print('grid_kfold_shuffle: best estimator: \n', grid_kfold_shuffle.best_estimator_)
print('grid_kfold_shuffle: best cross-validation score: ', grid_kfold_shuffle.best_score_)

#ROC AUC is computed from scores rather than hard labels:
#use the second predict_proba column for the held-out test set
test_scores = grid_kfold_shuffle.predict_proba(X_test)[:, 1]
print('grid_kfold_shuffle test set AUC: ',
      roc_auc_score(y_test, test_scores), '\n')


### 5) Generate precision-recall curve with best parameters for average precision   


In [None]:
#precision-recall curve for a tree built with the best parameters for average precision

from sklearn.metrics import precision_recall_curve
from matplotlib import pyplot as plt

X_train, X_test, y_train, y_test = train_test_split(data_array, df['BF'], random_state=0)
tree = DecisionTreeClassifier(random_state=0, max_depth=4,
                              min_samples_leaf=0.1, min_samples_split=0.1)
tree.fit(X_train, y_train)

#score each test sample with the second predict_proba column
test_scores = tree.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, test_scores)

#note the axes: precision is drawn on x, recall on y
plt.plot(precision, recall, label='tree')
#mark the point on the curve whose threshold lies closest to 0.5
close_default = np.argmin(np.abs(thresholds - 0.5))
plt.plot(precision[close_default], recall[close_default], '^', c='k',
         markersize=10, label='threshold 0.5 ', fillstyle='none', mew=2)
plt.xlabel('precision')
plt.ylabel('recall')
plt.title('Precision-recall curve for decision tree')
plt.legend(loc='best')

### 6) Generate ROC curve with best parameters for ROC AUC  

In [None]:
# ROC curve for the tree currently bound to `tree` (fitted in an earlier cell)
from sklearn.metrics import roc_curve

#score each test sample with the second predict_proba column
test_scores = tree.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, test_scores)

plt.plot(fpr, tpr, label='ROC curve tree')
plt.xlabel('FPR')
plt.ylabel('TPR(recall)')

#mark the point on the curve whose threshold lies closest to 0.5
close_default = np.argmin(np.abs(thresholds - 0.5))
plt.plot(fpr[close_default], tpr[close_default], '^', c='k', markersize=10,
         label='threshold 0.5', fillstyle='none', mew=2)
plt.title('ROC curve for decision tree')
plt.legend(loc=4)

### 7) Generate confusion matrix with best parameters for either average precision or ROC AUC (these tend to have the same best performing parameters)

#### Precision/sens/spec/fpr:

precision = PPV = TP/(TP+FP)   
sensitivity = recall = TPR = TP/(TP+FN)   
specificity = TNR = TN/(TN+FP)   
FPR = FP/(FP+TN) = (1-specificity)

#### Errors:

very major error is a susceptible prediction when isolate is resistant:    
Very major error = (no. very major errors/no. actual resistant)*100

major error is a resistant prediction when isolate is susceptible   
major error = (no. major errors/no. actual susceptible)*100


In [None]:
#confusion matrix for a tree with max_depth=4, min_samples_leaf=0.1, min_samples_split=0.1

from sklearn.metrics import confusion_matrix

#build and fit decision tree on a fresh split of the binary target
X_train, X_test, y_train, y_test = train_test_split(data_array, df['BF'], random_state=0)
tree = DecisionTreeClassifier(random_state=0, max_depth=4,
                              min_samples_leaf=0.1, min_samples_split=0.1)
tree.fit(X_train, y_train)
predict = tree.predict(X_test).astype(int)

#generate confusion matrix and name its four cells (class 1 is treated as positive)
confusion = confusion_matrix(y_test, predict)
true_neg, false_pos = confusion[0, 0], confusion[0, 1]
false_neg, true_pos = confusion[1, 0], confusion[1, 1]

#number of test samples whose true label is 0 / 1
n_actual_0 = y_test[y_test == 0].count()
n_actual_1 = y_test[y_test == 1].count()

#calculate precision, sensitivity, specificity, FPR, errors
Precision = true_pos / (true_pos + false_pos)
Sensitivity = true_pos / (true_pos + false_neg)
Specificity = true_neg / (true_neg + false_pos)
FPR = 1 - Specificity
#very major error: true 0 (resistant) predicted as 1, as a percentage of all true 0
very_major_error = (false_pos / n_actual_0) * 100
#major error: true 1 predicted as 0 (resistant), as a percentage of all true 1
major_error = (false_neg / n_actual_1) * 100


print('Precision: ', Precision)
print('Sensitivity: ', Sensitivity)
print('Specificity: ', Specificity)
print('FPR :', FPR)
print('very major error :', very_major_error)
print('major error: ', major_error)
print('\n confusion matrix: \n', confusion)

In [None]:
import seaborn as sns
sns.set_style({'font.family':'sans-serif', 'font.sans-serif':'Helvetica'})

#each heatmap cell is annotated with its name, its count and its share of all samples
group_names = ['True Neg','False Pos','False Neg','True Pos']
flat_counts = confusion.flatten()
group_counts = [format(count, '0.0f') for count in flat_counts]
group_percentages = [format(share, '.2%')
                     for share in flat_counts / np.sum(confusion)]
labels = ['\n'.join(parts)
          for parts in zip(group_names, group_counts, group_percentages)]
labels = np.asarray(labels).reshape(2, 2)

#draw the annotated matrix and save it next to the notebook
plt.figure(figsize=(6.5, 5))
sns.heatmap(confusion, annot=labels, fmt='', cmap='Blues')
plt.savefig('tree_binary_cf.png', bbox_inches='tight')

In [None]:
#refit the same tree, then classify with a 0.95 probability cut-off instead of predict()
X_train, X_test, y_train, y_test = train_test_split(data_array, df['BF'], random_state=0)
tree = DecisionTreeClassifier(random_state=0, max_depth=4,
                              min_samples_leaf=0.1, min_samples_split=0.1)
tree.fit(X_train, y_train)
predicted_proba = tree.predict_proba(X_test)
#label a sample 1 only when the second predict_proba column is at least 0.95
meets_cutoff = predicted_proba[:, 1] >= 0.95
predict = meets_cutoff.astype('int')

#generate confusion matrix
confusion = confusion_matrix(y_test, predict)

In [None]:
#bare expression: shows the current confusion matrix as the cell output
confusion

## Indirect binary classification via multiclass MIC classification with compressed MIC labels

#### Strategy for finding best performing model:
1) Grid search for best performing parameters for *ACCURACY*    
2) Grid search for best performing parameters for *weighted precision*    
3) Grid search for best performing parameters for *weighted recall*    
4) Generate feature importance plot     
5) Generate confusion matrix and classification report     
6) Convert predicted test MIC to binary phenotype and resplit data with same random state for binary y_test          
7) Generate confusion matrix and binary classification report with best parameters for accuracy      

In [None]:
#Compress MIC labels via the following dictionary (compressed label -> raw MIC strings)

Dict = {'>=32':['>32','32.0'], '16':['16.0'], '8':['>8','8.0'], '4':['4.0'], '2':['2.0'], '1':['1.0'],
        '0.5':['0.5'], '<=0.25':['0.25','<=0.25','<=0.06']}

#invert the dictionary once (raw MIC string -> compressed label) so each row
#needs a single lookup instead of a scan over every group
raw_to_compressed = {raw: label for label, raws in Dict.items() for raw in raws}
compressed = df['MIC'].map(raw_to_compressed)

#a raw MIC value with no entry in Dict would leave a row without a label;
#fail with a message naming the offending values
unmapped = compressed.isna()
if unmapped.any():
    raise ValueError('MIC values with no compressed label: '
                     f"{sorted(set(map(str, df.loc[unmapped, 'MIC'])))}")
List = compressed.tolist()

#add compressed labels to df (not data array)
df['MIC_compressed'] = List

### 1) Grid search for best performing preprocessing and parameters for *ACCURACY* 

In [None]:
#base estimator and hyper-parameter grid for the multiclass (compressed MIC) searches
tree = DecisionTreeClassifier(random_state=0)
#'auto' is left out of max_features: for DecisionTreeClassifier it was an alias
#of 'sqrt' (so it only duplicated that grid point), it was deprecated in
#scikit-learn 1.1 and newer releases reject it
param_grid = {'max_depth': [2, 4, 6, 8, 10, 12, 14, None],
              'min_samples_split': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
              'min_samples_leaf': [0.1, 0.2, 0.3, 0.4, 0.5],
              'max_features': ['sqrt', 'log2', None]}
#split with the compressed MIC label as target; random_state=0 keeps it reproducible
X_train, X_test, y_train, y_test = train_test_split(data_array, df['MIC_compressed'],
                                                    random_state=0)


In [None]:
#Grid search (default scoring of the estimator) with plain, unshuffled 5-fold cross validation
Kfold=KFold(n_splits=5)
grid_kfold = GridSearchCV(tree, param_grid, cv=Kfold)
grid_kfold.fit(X_train, y_train)
#the output is labelled 'grid_kfold' because this search does not shuffle the folds
print ('grid_kfold: best estimator: \n', grid_kfold.best_estimator_)
print ('grid_kfold: best cross-validation score: ', grid_kfold.best_score_)
print ('grid_kfold: test set average accuracy: ',
       accuracy_score(y_test, grid_kfold.predict(X_test)), '\n')

### 2) Grid search for best performing preprocessing and parameters for *weighted precision*

In [None]:
from sklearn.metrics import make_scorer, precision_score

#estimator, scorer and grid for the weighted-precision search
tree = DecisionTreeClassifier(random_state=0)
#NOTE(review): this scorer uses precision_score's default zero_division, while the
#test-set score printed after the search passes zero_division=True - confirm intended
scorer = make_scorer(precision_score, average='weighted')
#'auto' is left out of max_features: for DecisionTreeClassifier it was an alias
#of 'sqrt' (so it only duplicated that grid point), it was deprecated in
#scikit-learn 1.1 and newer releases reject it
param_grid = {'max_depth': [2, 4, 6, 8, 10, 12, 14, None],
              'min_samples_split': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
              'min_samples_leaf': [0.1, 0.2, 0.3, 0.4, 0.5],
              'max_features': ['sqrt', 'log2', None]}
X_train, X_test, y_train, y_test = train_test_split(data_array, df['MIC_compressed'],
                                                    random_state=0)


In [None]:
#Grid search scored by `scorer`, with shuffled 5-fold cross validation on all cores
Kfold_shuffle = KFold(n_splits=5, shuffle=True, random_state=0)
grid_kfold_shuffle = GridSearchCV(estimator=tree, param_grid=param_grid,
                                  scoring=scorer, cv=Kfold_shuffle, n_jobs=-1)
grid_kfold_shuffle.fit(X_train, y_train)
print('grid_kfold_shuffle: best estimator: \n', grid_kfold_shuffle.best_estimator_)
print('grid_kfold_shuffle: best cross-validation score: ', grid_kfold_shuffle.best_score_)

#weighted precision of the refitted best estimator on the held-out test set
test_predictions = grid_kfold_shuffle.predict(X_test)
print('grid_kfold_shuffle: test set precision score: ',
      precision_score(y_test, test_predictions, average='weighted', zero_division=True), '\n')


### 3) Grid search for best performing preprocessing and parameters for *weighted recall*

In [None]:
from sklearn.metrics import recall_score

#estimator, scorer and grid for the weighted-recall search
tree = DecisionTreeClassifier(random_state=0)
scorer = make_scorer(recall_score, average='weighted')
#'auto' is left out of max_features: for DecisionTreeClassifier it was an alias
#of 'sqrt' (so it only duplicated that grid point), it was deprecated in
#scikit-learn 1.1 and newer releases reject it
param_grid = {'max_depth': [2, 4, 6, 8, 10, 12, 14, None],
              'min_samples_split': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
              'min_samples_leaf': [0.1, 0.2, 0.3, 0.4, 0.5],
              'max_features': ['sqrt', 'log2', None]}
X_train, X_test, y_train, y_test = train_test_split(data_array, df['MIC_compressed'],
                                                    random_state=0)

In [None]:
#Grid search scored by `scorer`, with shuffled 5-fold cross validation on all cores
Kfold_shuffle = KFold(n_splits=5, shuffle=True, random_state=0)
grid_kfold_shuffle = GridSearchCV(estimator=tree, param_grid=param_grid,
                                  scoring=scorer, cv=Kfold_shuffle, n_jobs=-1)
grid_kfold_shuffle.fit(X_train, y_train)
print('grid_kfold_shuffle: best estimator: \n', grid_kfold_shuffle.best_estimator_)
print('grid_kfold_shuffle: best cross-validation score: ', grid_kfold_shuffle.best_score_)

#weighted recall of the refitted best estimator on the held-out test set
test_predictions = grid_kfold_shuffle.predict(X_test)
print('grid_kfold_shuffle: test set recall score: ',
      recall_score(y_test, test_predictions, average='weighted', zero_division=True), '\n')


### 4) Generate feature importance plots (best params for weighted precision)

In [None]:
#define trees with best performing parameters, or default parameters
#max_features='sqrt' is used instead of the alias 'auto': both mean sqrt(n_features)
#for a DecisionTreeClassifier, but 'auto' is rejected by newer scikit-learn releases
tree_best_params = DecisionTreeClassifier(max_depth=4, max_features='sqrt', min_samples_leaf=0.1,
                       min_samples_split=0.1, random_state=0)
tree_no_params = DecisionTreeClassifier(random_state=0)
tree_best_params.fit(X_train, y_train)
tree_no_params.fit(X_train, y_train)

#plot charts
def plot_feature_importances(model):
    """Draw a horizontal bar chart of ``model.feature_importances_``.

    ``model`` must be a fitted estimator exposing ``feature_importances_``
    with one value per column of the module-level ``data_array``.
    """
    n_features = data_array.shape[1]
    # data_array was built from df.columns[2:], so the feature names are the
    # n_features columns starting at position 2. Slicing by count (instead of
    # a negative end index) stays correct however many label columns have
    # been appended to df since then.
    feature_names = df.columns[2:2 + n_features]
    positions = np.arange(n_features)
    plt.barh(positions, model.feature_importances_, align='center')
    plt.yticks(positions, feature_names)
    plt.xlabel('feature importance')
    plt.ylabel('feature')
    plt.ylim(-1, n_features)
    plt.title('Feature importance plot')

plot_feature_importances(tree_no_params)

In [None]:
#feature importance chart for the tree fitted with the hard-coded "best" parameters
plot_feature_importances(tree_best_params)

### 5) Generate multiclass confusion matrix and classification report 

In [None]:
#reuse the classifier with the best parameters from the previous section and refit it
#(plain assignment: `tree` and `tree_best_params` are the same object, not a copy)
tree = tree_best_params
tree.fit(X_train, y_train)

predict = tree.predict(X_test)
#generate confusion matrix with rows/columns in ascending MIC order; without an
#explicit `labels` argument the string labels are sorted lexicographically
#('0.5', '1', '16', '2', ...), and classes absent from the data are left out
mic_order = ['<=0.25', '0.5', '1', '2', '4', '8', '16', '>=32']
confusion = confusion_matrix(y_test, predict, labels=mic_order)

print ('\n confusion matrix: \n', confusion)

In [None]:
#generate heatmap of confusion matrix for visualisation

import mglearn

#this order of the target names is crucial
#the tick labels are only correct if the rows/columns of `confusion` are in this same order
#NOTE(review): confusion_matrix sorts string labels unless `labels=` is passed - confirm the
#order of `confusion` matches this list
target_names = ['≤0.25', '0.5', '1', '2', '4', '8', '16', '≥32']
scores_image = mglearn.tools.heatmap(confusion, xlabel='Predicted Label',
                                     ylabel='True Label', xticklabels=target_names,
                                     yticklabels=target_names, cmap=plt.cm.gray_r, fmt='%d')
plt.title('confusion matrix heat map')
plt.gca().invert_yaxis()
#NOTE(review): the file name starts with 'LR_' although this notebook fits a decision tree
#(the other figures here are saved as 'tree_*.png') - confirm this name is intended
plt.savefig('LR_multi_expand_cf.png', bbox_inches='tight')

In [None]:
#per-class precision/recall/F1 report for the multiclass MIC predictions
from sklearn.metrics import classification_report

multiclass_report = classification_report(y_test, predict, zero_division=True)
print(multiclass_report)

### 6) Convert predicted test MIC to binary phenotype and resplit data with same random state for binary y_test

In [None]:
def MIC_to_binary(Predict):
    """Convert compressed MIC labels into 0/1 binary phenotypes.

    Parameters
    ----------
    Predict : iterable of str
        Compressed MIC labels, e.g. the output of ``tree.predict``. Both the
        boundary spellings used for the compressed labels ('<=0.25', '>=32')
        and the bare values ('0.25', '32') are accepted.

    Returns
    -------
    numpy.ndarray
        One entry per input label, in input order: 1 for labels up to '2',
        0 for labels from '4' upwards (the same 0/1 encoding as ``df['BF']``,
        where resistant is 0).

    Raises
    ------
    ValueError
        If a label has no binary mapping. Dropping it silently would leave
        the result shorter than, and misaligned with, the true labels.
    """
    RS_dict = {1: ['<=0.25', '0.25', '0.5', '1', '2'],
               0: ['4', '8', '16', '32', '>=32']}
    # flatten to label -> phenotype so each prediction needs a single lookup
    label_to_binary = {label: phenotype
                       for phenotype, labels in RS_dict.items()
                       for label in labels}

    binary_list = []
    # iterate the argument itself, not a module-level variable
    for label in Predict:
        if label not in label_to_binary:
            raise ValueError(f'no binary phenotype defined for MIC label {label!r}')
        binary_list.append(label_to_binary[label])

    binary_array = np.array(binary_list)
    return binary_array

#resplit data against the binary 'BF' target with the same random_state as the
#MIC split, so y_test now holds the binary labels for the same test rows
#(the MIC predictions are converted with MIC_to_binary where they are used below;
#a bare MIC_to_binary(predict) call here would only discard its result)
X_train, X_test, y_train, y_test = train_test_split(data_array, df['BF'],
                                                  random_state=0)

### 7) Generate confusion matrix and classification report with best parameters for accuracy

In [None]:
#generate confusion matrix from the MIC predictions converted to binary,
#and name its four cells (class 1 is treated as positive)
confusion = confusion_matrix(y_test, MIC_to_binary(predict))
true_neg, false_pos = confusion[0, 0], confusion[0, 1]
false_neg, true_pos = confusion[1, 0], confusion[1, 1]

#number of test samples whose true label is 0 / 1
n_actual_0 = y_test[y_test == 0].count()
n_actual_1 = y_test[y_test == 1].count()

#calculate precision, sensitivity, specificity, FPR, and errors
Precision = true_pos / (true_pos + false_pos)
Sensitivity = true_pos / (true_pos + false_neg)
Specificity = true_neg / (true_neg + false_pos)
FPR = 1 - Specificity
#very major error: true 0 (resistant) predicted as 1, as a percentage of all true 0
very_major_error = (false_pos / n_actual_0) * 100
#major error: true 1 predicted as 0 (resistant), as a percentage of all true 1
major_error = (false_neg / n_actual_1) * 100


print('Precision: ', Precision)
print('Sensitivity: ', Sensitivity)
print('Specificity: ', Specificity)
print('FPR :', FPR)
print('very major error :', very_major_error)
print('major error: ', major_error)
print('\n confusion matrix: \n', confusion)

In [None]:
#classification report for the binary labels derived from the MIC predictions
print (classification_report(y_test, MIC_to_binary(predict)))

In [None]:
import seaborn as sns

#each heatmap cell is annotated with its name, its count and its share of all samples
group_names = ['True Neg','False Pos','False Neg','True Pos']
flat_counts = confusion.flatten()
group_counts = [format(count, '0.0f') for count in flat_counts]
group_percentages = [format(share, '.2%')
                     for share in flat_counts / np.sum(confusion)]
labels = ['\n'.join(parts)
          for parts in zip(group_names, group_counts, group_percentages)]
labels = np.asarray(labels).reshape(2, 2)

#draw the annotated matrix and save it next to the notebook
plt.figure(figsize=(6.5, 5))
sns.heatmap(confusion, annot=labels, fmt='', cmap='Blues')
plt.savefig('tree_multi_binary_cf.png', bbox_inches='tight')