In [1]:
## Import Libraries
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import regex as re
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.svm import SVC, NuSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from lightgbm import LGBMClassifier
import math
from sklearn.model_selection import cross_validate
import joblib
from sklearn.metrics import confusion_matrix

In [2]:
def _rounded_ratio(numerator, denominator, ndigits=3):
    """Return numerator / denominator rounded to ndigits, or NaN if the denominator is zero."""
    if denominator == 0:
        return float("nan")
    return round(numerator / denominator, ndigits)


def model_test_comparison(list_models, list_model_names, X_raw, X_scaled, y):
    '''
    Function name: model_test_comparison.

    Input: list of models, list of model names, raw features, scaled features, outcome.
           list_models and list_model_names must have the same length; each model
           must expose a predict(X) method.

    Output: Print model test metrics on screen and return a string with the model testing results.

    Purpose: Calculate and display metrics of the testing of the top model for each classifier.

    Raises: ValueError if the two lists differ in length, or if the confusion matrix
            of a model is not 2x2 (the metrics below are only defined for a binary outcome).

    '''

    if len(list_models) != len(list_model_names):
        raise ValueError("list_models and list_model_names must have the same length")

    ## Models that are evaluated on the scaled features; every other model gets the raw features
    scaled_model_names = ("LOGISTIC REGRESSION", "SUPPORT VECTOR MACHINE", "K NEAREST NEIGHBOR")

    ## One report string per model, joined at the end
    model_reports = []

    ## Loop through however many models were supplied
    for model, model_name in zip(list_models, list_model_names):

        ## Check if model needs scaled data or raw data
        X = X_scaled if model_name in scaled_model_names else X_raw

        ## Predict test outcomes from test features using model
        y_predicted = model.predict(X)

        ## Create confusion matrix with predicted vs actual values for test dataset
        cm = np.asarray(confusion_matrix(y, y_predicted))
        if cm.shape != (2, 2):
            raise ValueError("expected a 2x2 confusion matrix for " + str(model_name) +
                             ", got shape " + str(cm.shape))

        ## Row-major order of the matrix is [0,0], [0,1], [1,0], [1,1]
        TN, FP, FN, TP = (int(count) for count in cm.ravel())

        ## Sensitivity, hit rate, recall, or true positive rate and other metrics.
        ## A zero denominator yields NaN instead of a runtime warning or an exception.
        TPR = _rounded_ratio(TP, TP + FN)
        TNR = _rounded_ratio(TN, TN + FP)
        PPV = _rounded_ratio(TP, TP + FP)
        NPV = _rounded_ratio(TN, TN + FN)
        FPR = _rounded_ratio(FP, FP + TN)
        FNR = _rounded_ratio(FN, TP + FN)
        FDR = _rounded_ratio(FP, TP + FP)
        ACC = _rounded_ratio(TP + TN, TP + FP + FN + TN)

        ## F1 from the raw counts (equivalent to 2*PPV*TPR/(PPV+TPR)) so that the
        ## rounding of precision and recall does not leak into the result
        F1 = _rounded_ratio(2 * TP, 2 * TP + FP + FN)

        ## Score against the outcome passed in as `y`, not a notebook-level global.
        ## NOTE: this is computed from the output of model.predict, not from
        ## probability or decision scores.
        roc_auc_score_val = round(float(roc_auc_score(y, y_predicted)), 3)

        ## Build the report text once; it is both printed and stored
        header = "\n\n\n|||||||||||||||||| " + model_name + " ||||||||||||||||||\n\n"
        metric_lines = [
            "The true discovery rate is (Recall): " + str(TPR),
            "The true negative rate is: " + str(TNR),
            "The positive predictive value is (Precision): " + str(PPV),
            "The negative predictive value is: " + str(NPV),
            "The false positive rate value is: " + str(FPR),
            "The false negative rate value is: " + str(FNR),
            "The false discovery rate value is: " + str(FDR),
            "The overall accuracy is: " + str(ACC),
            "F1 is: " + str(F1),
            "ROC AUC is: " + str(roc_auc_score_val),
        ]
        body = "\n".join(metric_lines)

        ## Print results on screen (header and body printed separately, so the
        ## on-screen layout has one more blank line after the header than the stored text)
        print(header)
        print(body)

        ## Store results for this model
        model_reports.append(header + body)

    ## Return final report
    return "".join(model_reports)

In [3]:
## Read the test split of the 70_30 feature selected data.
name = "../data/feature_selected_test_dataset_70_30.csv"
df = pd.read_csv(name)

## The "outcome" column is the target; every other column is a feature.
y_test = df["outcome"]
X_test = df.drop(columns=["outcome"])

## Standardise the features for the models that require scaled input.
## NOTE(review): the scaler is fitted on the test features themselves; confirm
## whether the scaler fitted during training should be applied here instead.
X_test_scaled = StandardScaler().fit(X_test).transform(X_test)

In [4]:
## Display names, in the same order as the models listed below
list_model_names = [
    "RANDOM FOREST",
    "LOGISTIC REGRESSION",
    "SUPPORT VECTOR MACHINE",
    "K NEAREST NEIGHBOR",
    "NAIVE BAYES",
    "LIGHT GBM",
]

## Load the persisted top model of each classifier.
## NOTE: joblib.load unpickles the file, so only load model files from a trusted source.
random_forest = joblib.load("../top_models/best_random_forest.pkl")
logistic_regression = joblib.load("../top_models/best_logistic_regression.pkl")
support_vector_machine = joblib.load("../top_models/best_support_vector_machine.pkl")
k_nearest_neighbors = joblib.load("../top_models/best_KNN.pkl")
naive_bayes = joblib.load("../top_models/best_naive_bayes.pkl")
light_gbm = joblib.load("../top_models/best_light_gbm.pkl")

## Models, positionally matched to list_model_names
list_models = [
    random_forest,
    logistic_regression,
    support_vector_machine,
    k_nearest_neighbors,
    naive_bayes,
    light_gbm,
]

In [5]:
## Evaluate every loaded model on the test split: metrics are printed below
## and the combined text report is kept in test_results
test_results = model_test_comparison(
    list_models=list_models,
    list_model_names=list_model_names,
    X_raw=X_test,
    X_scaled=X_test_scaled,
    y=y_test,
)




|||||||||||||||||| RANDOM FOREST ||||||||||||||||||


The true discovery rate is (Recall): 0.9
The true negative rate is: 0.556
The positive predictive value is (Precision): 0.771
The negative predictive value is: 0.769
The false positive rate value is: 0.444
The false negative rate value is: 0.1
The false discovery rate value is: 0.229
The overall accuracy is: 0.771
F1 is: 0.831
ROC AUC is: 0.728



|||||||||||||||||| LOGISTIC REGRESSION ||||||||||||||||||


The true discovery rate is (Recall): 0.7
The true negative rate is: 0.833
The positive predictive value is (Precision): 0.875
The negative predictive value is: 0.625
The false positive rate value is: 0.167
The false negative rate value is: 0.3
The false discovery rate value is: 0.125
The overall accuracy is: 0.75
F1 is: 0.778
ROC AUC is: 0.767



|||||||||||||||||| SUPPORT VECTOR MACHINE ||||||||||||||||||


The true discovery rate is (Recall): 0.633
The true negative rate is: 0.889
The positive predictive value is (Precision):

In [6]:
## Write testing results to txt file.
## The with-block closes the file even if the write raises.
with open("../reports/model_testing_results_70_30.txt", "w") as text_file:
    ## n is the number of characters written
    n = text_file.write(test_results)