In [1]:
## Import Libraries
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import regex as re
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.svm import SVC, NuSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from lightgbm import LGBMClassifier
import math
from sklearn.model_selection import cross_validate
import joblib

In [2]:
'''
Function name: cross_validation_comparison

Input: List of models, list of model names, raw feature values, scaled feature values, outcome values.

Output: Print result of cross validation comparison on screen and return the string of the results.

Purpose: Compare cross validation results between the best model for each classifier.

'''

def cross_validation_comparison(list_models, list_model_names, X_raw, X_scaled, y):

    ## Create variable for model report
    final_model_report = ""

    ## Loop through models
    for i in range(6):
        
        ## Define model and model name for this loop
        model_name = list_model_names[i]
        model = list_models[i]
        
        ## Define arrays for storing average values of each metric on this loop
        average_scores_accuracy = np.array([])
        average_std_accuracy = np.array([])
        average_scores_recall = np.array([])
        average_std_recall = np.array([])
        average_scores_precision = np.array([])
        average_std_precision = np.array([])
        average_scores_roc = np.array([])
        average_std_roc = np.array([])
        average_scores_f1 = np.array([])
        average_std_f1 = np.array([])
        
        ## Loop through 20 rounds of 5-fold cross validation for current model being tested
        for j in range(20):
            
            ## Check if model needs scaled data or raw data
            if model_name in ["LOGISTIC REGRESSION", "SUPPORT VECTOR MACHINE", "K NEAREST NEIGHBOR"]:
                X = X_scaled
            else:
                X = X_raw

            ## Run 5-fold cross validation
            cv_results = cross_validate(model, X, y, cv=5, scoring=('accuracy', 'recall', 'precision', 'f1', 'roc_auc'))

            ## Store metrics of cross validation scores in variables
            scores_accuracy = cv_results['test_accuracy']
            scores_recall = cv_results['test_recall']
            scores_precision = cv_results['test_precision']
            scores_roc = cv_results['test_roc_auc']
            scores_f1 = cv_results['test_f1']

            ## Store standard deviation for cross validation scores
            std_accuracy = float(scores_accuracy.std(ddof=1))
            std_recall = float(scores_recall.std(ddof=1))
            std_precision = float(scores_precision.std(ddof=1))
            std_roc = float(scores_roc.std(ddof=1))
            std_f1 = float(scores_f1.std(ddof=1))

            ## Append mean of metric accross cross-validation to the list of average scores for that metric
            average_scores_accuracy = np.append(average_scores_accuracy, scores_accuracy.mean())
            
            ## Append standard deviation of cross validation metric to list of standard deviations
            average_std_accuracy = np.append(average_std_accuracy, std_accuracy)
            
            ## Repeat same process for all other metrics
            average_scores_recall = np.append(average_scores_recall, scores_recall.mean())
            average_std_recall = np.append(average_std_recall, std_recall)

            average_scores_precision = np.append(average_scores_precision, scores_precision.mean())
            average_std_precision = np.append(average_std_precision, std_precision)

            average_scores_roc = np.append(average_scores_roc, scores_roc.mean())
            average_std_roc = np.append(average_std_roc, std_roc)

            average_scores_f1 = np.append(average_scores_f1, scores_f1.mean())
            average_std_f1 = np.append(average_std_f1, std_f1)


        ## Print averages of average score lists on the screen once the loop for the model is done running
        print("\n\n\n |||||||||||||||||| " + model_name + " |||||||||||||||||| \n\n")
        print("Accuracy Score:", round(average_scores_accuracy.mean(), 3), "    STD:", round(average_std_accuracy.mean(), 3), "    SEM:", round((average_std_accuracy.mean()/math.sqrt(10)), 3))
        print("F1 Score:", round(average_scores_f1.mean(),3), "    STD:", round(average_std_f1.mean(), 3), "    SEM:", round((average_std_f1.mean()/math.sqrt(10)), 3))
        print("Recall Score:", round(average_scores_recall.mean(), 3), "    STD:", round(average_std_recall.mean(), 3), "    SEM:", round((average_std_recall.mean()/math.sqrt(10)), 3))
        print("Precision Score:", round(average_scores_precision.mean(), 3), "    STD:", round(average_std_precision.mean(), 3), "    SEM:", round((average_std_precision.mean()/math.sqrt(10)), 3))
        print("Roc AUC Score:", round(average_scores_roc.mean(), 3), "    STD:", round(average_std_roc.mean(), 3), "    SEM:", round((average_std_roc.mean()/math.sqrt(10)), 3))
        
        ## Create string with the scores for this models
        current_model_report = str("\n\n\n |||||||||||||||||| " + model_name + " |||||||||||||||||| \n\n" 
                         + "Accuracy Score: " + str(round(average_scores_accuracy.mean(), 3)) + "     STD: " + str(round(average_std_accuracy.mean(), 3)) + "     SEM: " + str(round((average_std_accuracy.mean()/math.sqrt(10)), 3)) + "\n" 
                         + "F1 Score: " + str(round(average_scores_f1.mean(), 3)) + "     STD: " + str(round(average_std_f1.mean(), 3)) + "     SEM: " + str(round((average_std_f1.mean()/math.sqrt(10)), 3)) + "\n" 
                         + "Recall Score: " + str(round(average_scores_recall.mean(), 3)) + "     STD: " + str(round(average_std_recall.mean(), 3)) + "     SEM: " + str(round((average_std_recall.mean()/math.sqrt(10)), 3)) + "\n" 
                         + "Precision Score: " + str(round(average_scores_precision.mean(), 3)) + "     STD: " + str(round(average_std_precision.mean(), 3)) + "     SEM: " + str(round((average_std_precision.mean()/math.sqrt(10)), 3)) + "\n" 
                         + "ROC AUC Score: " + str(round(average_scores_roc.mean(), 3)) + "     STD: " + str(round(average_std_roc.mean(), 3)) + "     SEM: " + str(round((average_std_roc.mean()/math.sqrt(10)), 3)) + "\n" )
        
        ## Add model scores to final report
        final_model_report = final_model_report + current_model_report
    
    ## Return final report once everything is done
    return final_model_report

In [3]:
## Import dataframe for 70_30 feature selected data.
name = "../data/feature_selected_train_dataset_70_30.csv"
df = pd.read_csv(name)

## Separate into features and outcomes
X_train = df.drop(['outcome', "CSSA Score Week 1"], axis=1, inplace=False)
y_train = df["outcome"]

## Scale X with standard scaler for models that require it.
X_train_scaled = StandardScaler().fit_transform(X_train)

In [4]:
## Load all models
random_forest = joblib.load("../top_models/best_random_forest.pkl")
logistic_regression = joblib.load("../top_models/best_logistic_regression.pkl")
support_vector_machine = joblib.load("../top_models/best_support_vector_machine.pkl")
k_nearest_neighbors = joblib.load("../top_models/best_KNN.pkl")
naive_bayes = joblib.load("../top_models/best_naive_bayes.pkl")
light_gbm  = joblib.load("../top_models/best_light_gbm.pkl")

In [5]:
## Refit all models to data without the CSSA1 Score
random_forest.fit(X_train, y_train)
logistic_regression.fit(X_train_scaled, y_train)
support_vector_machine.fit(X_train_scaled, y_train)
k_nearest_neighbors.fit(X_train_scaled, y_train)
naive_bayes.fit(X_train, y_train)
light_gbm.fit(X_train, y_train)

Pipeline(steps=[('model',
                 LGBMClassifier(learning_rate=0.005, max_bin=20,
                                n_estimators=400, num_leaves=10,
                                random_state=2022))])

In [6]:
## Create list of models and list of model names
list_models = [random_forest, logistic_regression, support_vector_machine, k_nearest_neighbors, naive_bayes, light_gbm]
list_model_names = ["RANDOM FOREST", "LOGISTIC REGRESSION", "SUPPORT VECTOR MACHINE",
                    "K NEAREST NEIGHBOR", "NAIVE BAYES", "LIGHT GBM"]

In [7]:
## Get cross validation results and print them on screen
cv_results = cross_validation_comparison(list_models, list_model_names, X_train, X_train_scaled, y_train)




 |||||||||||||||||| RANDOM FOREST |||||||||||||||||| 


Accuracy Score: 0.677     STD: 0.07     SEM: 0.022
F1 Score: 0.67     STD: 0.08     SEM: 0.025
Recall Score: 0.674     STD: 0.126     SEM: 0.04
Precision Score: 0.678     STD: 0.07     SEM: 0.022
Roc AUC Score: 0.733     STD: 0.06     SEM: 0.019



 |||||||||||||||||| LOGISTIC REGRESSION |||||||||||||||||| 


Accuracy Score: 0.683     STD: 0.064     SEM: 0.02
F1 Score: 0.669     STD: 0.09     SEM: 0.028
Recall Score: 0.663     STD: 0.137     SEM: 0.043
Precision Score: 0.683     STD: 0.059     SEM: 0.019
Roc AUC Score: 0.762     STD: 0.079     SEM: 0.025



 |||||||||||||||||| SUPPORT VECTOR MACHINE |||||||||||||||||| 


Accuracy Score: 0.698     STD: 0.074     SEM: 0.023
F1 Score: 0.664     STD: 0.089     SEM: 0.028
Recall Score: 0.611     STD: 0.115     SEM: 0.036
Precision Score: 0.739     STD: 0.09     SEM: 0.028
Roc AUC Score: 0.761     STD: 0.087     SEM: 0.028



 |||||||||||||||||| K NEAREST NEIGHBOR |||||||||||||||||| 

In [8]:
## Save report to txt file
text_file = open("../reports/cross_validation_results_NO_CSSA1_70_30.txt", "w")
n = text_file.write(cv_results)
text_file.close()