In [1]:
## Import libraries
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import regex as re
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.svm import SVC, NuSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedKFold
from sklearn.naive_bayes import GaussianNB
from lightgbm import LGBMClassifier
import math

import joblib

In [2]:
def random_forest_grid(X, y):
    """Grid-search random forest hyperparameters with repeated k-fold CV.

    Purpose: Every hyperparameter combination is scored by ROC-AUC over a
    5-fold cross-validation that is repeated 5 times (25 fits per candidate).

    Input: X and y (Features and Outcome)

    Output: The fitted GridSearchCV object.
    """
    # A fixed seed keeps the fold assignment identical from run to run.
    cv_scheme = RepeatedKFold(n_splits=5, n_repeats=5, random_state=2022)

    forest = RandomForestClassifier(random_state=2022)
    estimator = Pipeline([("model", forest)])

    # The "model__" prefix routes each entry to the pipeline step named "model".
    grid = {
        "model__min_samples_leaf": np.arange(1, 11),
        "model__criterion": ["gini", "entropy"],
        "model__n_estimators": np.arange(11, 752, 10),
        "model__bootstrap": [True, False],
    }

    tuner = GridSearchCV(
        estimator=estimator,
        param_grid=grid,
        scoring="roc_auc",
        cv=cv_scheme,
        n_jobs=-1,
        verbose=3,
    )
    tuner.fit(X, y)
    return tuner

In [3]:
def logistic_regression_grid(X, y):
    """Grid-search logistic regression hyperparameters with repeated k-fold CV.

    Purpose: Four separate grid searches are run, one per penalty family
    (l2, l1, elasticnet, none), because each family is paired with its own
    list of solvers and extra parameters. Every candidate is scored by
    accuracy over a 5-fold cross-validation repeated 5 times.

    Input: X and y (Features and Outcome)

    Output: The fitted GridSearchCV object with the highest best_score_.
            When several searches tie, the one defined last is returned.
    """
    # A fixed seed keeps the fold assignment identical from run to run.
    cv_scheme = RepeatedKFold(n_splits=5, n_repeats=5, random_state=2022)

    estimator = Pipeline(
        [("model", LogisticRegression(random_state=2022, max_iter=10000))]
    )

    # One grid per penalty; the "model__" prefix targets the pipeline step.
    penalty_grids = [
        {
            "model__penalty": ["l2"],
            "model__C": np.arange(0.01, 1.01, 0.01),
            "model__solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"],
            "model__fit_intercept": [True, False],
        },
        {
            "model__penalty": ["l1"],
            "model__C": np.arange(0.01, 1.01, 0.01),
            "model__solver": ["liblinear", "saga"],
            "model__fit_intercept": [True, False],
        },
        {
            "model__penalty": ["elasticnet"],
            "model__C": np.arange(0.01, 1.01, 0.01),
            "model__l1_ratio": np.arange(0.1, 1, 0.1),
            "model__solver": ["saga"],
            "model__fit_intercept": [True, False],
        },
        {
            # NOTE(review): newer scikit-learn releases appear to spell the
            # no-penalty option as None instead of the string "none" --
            # confirm against the installed version.
            "model__penalty": ["none"],
            "model__solver": ["newton-cg", "lbfgs", "sag", "saga"],
            "model__fit_intercept": [True, False],
        },
    ]

    searches = [
        GridSearchCV(
            estimator=estimator,
            param_grid=grid,
            n_jobs=-1,
            scoring="accuracy",
            cv=cv_scheme,
            verbose=3,
        )
        for grid in penalty_grids
    ]

    for search in searches:
        search.fit(X, y)

    # A search wins only if it strictly beats every search defined after it;
    # otherwise fall through to the next one, ending with the last search.
    for position, contender in enumerate(searches[:-1]):
        later_searches = searches[position + 1:]
        if all(contender.best_score_ > other.best_score_ for other in later_searches):
            return contender

    return searches[-1]

In [4]:
def support_vector_machine_grid(X, y):
    """Grid-search support vector classifier hyperparameters with repeated k-fold CV.

    Purpose: Three separate grid searches are run (polynomial kernel, sigmoid
    kernel, and linear/rbf kernels) because each group tunes a different set
    of parameters. Every candidate is scored by accuracy over a 5-fold
    cross-validation repeated 5 times.

    Input: X and y (Features and Outcome)

    Output: The fitted GridSearchCV object with the highest best_score_.
            When several searches tie, the one defined last is returned.
    """
    # A fixed seed keeps the fold assignment identical from run to run.
    cv_scheme = RepeatedKFold(n_splits=5, n_repeats=5, random_state=2022)

    estimator = Pipeline([("model", SVC(random_state=2022))])

    # One grid per kernel group; the "model__" prefix targets the pipeline step.
    kernel_grids = [
        {
            "model__gamma": ["scale", "auto"],
            "model__coef0": [0, 1, 2],
            "model__C": np.arange(0.01, 1.001, 0.01),
            "model__kernel": ["poly"],
            "model__degree": [2, 3],
        },
        {
            "model__gamma": ["scale", "auto"],
            "model__coef0": [0, 0.5, 1, 1.5, 2],
            "model__C": np.arange(0.01, 1.001, 0.01),
            "model__kernel": ["sigmoid"],
        },
        {
            # This grid uses a finer C step (0.005) than the other two.
            "model__gamma": ["scale", "auto"],
            "model__C": np.arange(0.005, 1.001, 0.005),
            "model__kernel": ["linear", "rbf"],
        },
    ]

    searches = [
        GridSearchCV(
            estimator=estimator,
            param_grid=grid,
            n_jobs=-1,
            scoring="accuracy",
            cv=cv_scheme,
            verbose=3,
        )
        for grid in kernel_grids
    ]

    for search in searches:
        search.fit(X, y)

    # A search wins only if it strictly beats every search defined after it;
    # otherwise fall through to the next one, ending with the last search.
    for position, contender in enumerate(searches[:-1]):
        later_searches = searches[position + 1:]
        if all(contender.best_score_ > other.best_score_ for other in later_searches):
            return contender

    return searches[-1]

In [5]:
def KNN_grid(X, y):
    """Grid-search K-Nearest Neighbors hyperparameters with repeated k-fold CV.

    Purpose: Every hyperparameter combination is scored by accuracy over a
    5-fold cross-validation that is repeated 5 times (25 fits per candidate).

    Input: X and y (Features and Outcome)

    Output: The fitted GridSearchCV object.
    """
    # A fixed seed keeps the fold assignment identical from run to run.
    cv_scheme = RepeatedKFold(n_splits=5, n_repeats=5, random_state=2022)

    estimator = Pipeline([("model", KNeighborsClassifier())])

    # The "model__" prefix routes each entry to the pipeline step named "model".
    grid = {
        # Odd neighbor counts only: 1, 3, ..., 127.
        "model__n_neighbors": np.arange(1, 128, 2),
        "model__weights": ["uniform", "distance"],
        # NOTE(review): leaf_size presumably only tunes the tree-based
        # neighbor lookup rather than the predictions -- confirm whether it
        # needs to be part of the search at all.
        "model__leaf_size": np.arange(1, 33, 3),
        "model__p": [1, 2],
    }

    tuner = GridSearchCV(
        estimator=estimator,
        param_grid=grid,
        scoring="accuracy",
        cv=cv_scheme,
        n_jobs=-1,
        verbose=3,
    )
    tuner.fit(X, y)
    return tuner

In [6]:
def naive_bayes_grid(X, y):
    """Grid-search the Gaussian Naive Bayes smoothing term with repeated k-fold CV.

    Purpose: Every var_smoothing value is scored by accuracy over a 5-fold
    cross-validation that is repeated 5 times (25 fits per candidate).

    Input: X and y (Features and Outcome)

    Output: The fitted GridSearchCV object.
    """
    # A fixed seed keeps the fold assignment identical from run to run.
    cv_scheme = RepeatedKFold(n_splits=5, n_repeats=5, random_state=2022)

    estimator = Pipeline([("model", GaussianNB())])

    # 3000 log-spaced values running from 1e0 down to 1e-11.
    smoothing_values = np.logspace(0, -11, num=3000)
    grid = {"model__var_smoothing": smoothing_values}

    tuner = GridSearchCV(
        estimator=estimator,
        param_grid=grid,
        scoring="accuracy",
        cv=cv_scheme,
        n_jobs=-1,
        verbose=3,
    )
    tuner.fit(X, y)
    return tuner

In [7]:
def light_gbm_grid(X, y):
    """Grid-search LightGBM classifier hyperparameters with repeated k-fold CV.

    Purpose: Every hyperparameter combination is scored by accuracy over a
    5-fold cross-validation that is repeated 5 times (25 fits per candidate).

    Input: X and y (Features and Outcome)

    Output: The fitted GridSearchCV object.
    """
    # A fixed seed keeps the fold assignment identical from run to run.
    cv_scheme = RepeatedKFold(n_splits=5, n_repeats=5, random_state=2022)

    booster = LGBMClassifier(random_state=2022)
    estimator = Pipeline([("model", booster)])

    # The "model__" prefix routes each entry to the pipeline step named "model".
    grid = {
        "model__n_estimators": [20, 100, 400],
        "model__learning_rate": [0.005, 0.01, 0.05, 0.1, 0.3, 0.5],
        "model__num_leaves": [5, 10, 20, 35, 50, 75],
        "model__min_child_samples": [5, 10, 20, 30, 50, 75],
        "model__max_bin": [20, 50, 100, 255, 400],
    }

    tuner = GridSearchCV(
        estimator=estimator,
        param_grid=grid,
        scoring="accuracy",
        cv=cv_scheme,
        n_jobs=-1,
        verbose=3,
    )
    tuner.fit(X, y)
    return tuner

In [8]:
## Import dataframe for 70_30 feature selected data.
## The path is relative, so it is resolved against the notebook's working directory.

name = "../data/feature_selected_train_dataset_70_30.csv"
df = pd.read_csv(name)

In [9]:
## Separate into features and outcomes

X_train = df.drop(columns=["outcome"])
y_train = df["outcome"]

## Scale X with standard scaler for models that require it.
## The fitted scaler is kept in `train_scaler` so that held-out data can later be
## transformed with the statistics learned from the training set; the models
## fitted on X_train_scaled expect inputs on that same scale.
## NOTE(review): scaling the full training set before cross-validation lets the
## validation folds influence the scaling statistics; consider making the scaler
## a step of the pipelines instead.
train_scaler = StandardScaler().fit(X_train)
X_train_scaled = train_scaler.transform(X_train)

In [10]:
## Perform hyperparameter search for random forest (unscaled features are passed in)
search_rf = random_forest_grid(X_train, y_train)

## Display best ROC-AUC score (random_forest_grid scores candidates with "roc_auc")
print("ROC-AUC of the best random forest model was:", search_rf.best_score_)

## Display parameters for best model
print("Hyperparameters of best random forest model were: \n\n", search_rf.best_params_)

## Save best random forest classifier model for future usage
joblib.dump(search_rf.best_estimator_, '../top_models/best_random_forest.pkl')

Fitting 25 folds for each of 3000 candidates, totalling 75000 fits


KeyboardInterrupt: 

In [11]:
## Perform hyperparameter search for logistic regression (standardized features are passed in)
search_logreg = logistic_regression_grid(X_train_scaled, y_train)

## Display best accuracy score (logistic_regression_grid scores candidates with "accuracy")
print("Accuracy of the best logistic regression model was:", search_logreg.best_score_)

## Display parameters for best model
print("Hyperparameters of best logistic regression model were: \n\n", search_logreg.best_params_)

## Save best Logistic Regression classifier model for future usage
## NOTE: the saved pipeline was fitted on standardized features, so new data must be scaled the same way.
joblib.dump(search_logreg.best_estimator_, '../top_models/best_logistic_regression.pkl')

Fitting 25 folds for each of 1000 candidates, totalling 25000 fits
Fitting 25 folds for each of 400 candidates, totalling 10000 fits
Fitting 25 folds for each of 1800 candidates, totalling 45000 fits
Fitting 25 folds for each of 8 candidates, totalling 200 fits
ROC-AUC of the best logistic regression model was: 0.8413888474495735
Hyperparameters of best logistic regression model were: 

 {'model__C': 0.01, 'model__fit_intercept': True, 'model__penalty': 'l2', 'model__solver': 'liblinear'}


In [12]:
## Perform hyperparameter search for support vector machine (standardized features are passed in)
search_svc = support_vector_machine_grid(X_train_scaled, y_train)

## Display best accuracy score
print("Accuracy of the best support vector machine model was:", search_svc.best_score_)

## Display parameters for best model
print("Hyperparameters of best support vector machine model were: \n\n", search_svc.best_params_)

## Save best Support Vector Machine classifier model for future usage
## NOTE: the saved pipeline was fitted on standardized features, so new data must be scaled the same way.
joblib.dump(search_svc.best_estimator_, '../top_models/best_support_vector_machine.pkl')

Fitting 25 folds for each of 1200 candidates, totalling 30000 fits
Fitting 25 folds for each of 1000 candidates, totalling 25000 fits
Fitting 25 folds for each of 800 candidates, totalling 20000 fits
Accuracy of the best support vector machine model was: 0.7780026990553306
Hyperparameters of best support vector machine model were: 

 {'model__C': 0.29000000000000004, 'model__coef0': 0.5, 'model__gamma': 'auto', 'model__kernel': 'sigmoid'}


['../top_models/best_support_vector_machine.pkl']

In [13]:
## Perform hyperparameter search for K-Nearest Neighbors (standardized features are passed in)
search_knn = KNN_grid(X_train_scaled, y_train)

## Display best accuracy score
print("Accuracy of the best K-Nearest Neighbor model was:", search_knn.best_score_)

## Display parameters for best model
print("Hyperparameters of best K-Nearest Neighbor model were: \n\n", search_knn.best_params_)

## Save best KNN classifier model for future usage
## NOTE: the saved pipeline was fitted on standardized features, so new data must be scaled the same way.
joblib.dump(search_knn.best_estimator_, '../top_models/best_KNN.pkl')

Fitting 25 folds for each of 2816 candidates, totalling 70400 fits
ROC-AUC of the best K-Nearest Neighbor model was: 0.840535026642277
Hyperparameters of best K-Nearest Neighbor model were: 

 {'model__leaf_size': 1, 'model__n_neighbors': 111, 'model__p': 2, 'model__weights': 'distance'}


In [15]:
## Perform hyperparameter search for Naive Bayes (unscaled features are passed in)
search_naive_bayes = naive_bayes_grid(X_train, y_train)

## Display best accuracy score
print("Accuracy of the best Naive Bayes model was:", search_naive_bayes.best_score_)

## Display parameters for best model
print("Hyperparameters of best Naive Bayes model were: \n\n", search_naive_bayes.best_params_)

## Save best Naive Bayes classifier model for future usage
joblib.dump(search_naive_bayes.best_estimator_, '../top_models/best_naive_bayes.pkl')

Fitting 25 folds for each of 3000 candidates, totalling 75000 fits
Accuracy of the best Naive Bayes model was: 0.7406207827260458
Hyperparameters of best Naive Bayes model were: 

 {'model__var_smoothing': 4.126088252678016e-06}


['../top_models/best_naive_bayes.pkl']

In [16]:
## Perform hyperparameter search for Light GBM (unscaled features are passed in)
search_light_gbm = light_gbm_grid(X_train, y_train)

## Display best accuracy score
print("Accuracy of the best Light GBM model was:", search_light_gbm.best_score_)

## Display parameters for best model
print("Hyperparameters of best Light GBM model were: \n\n", search_light_gbm.best_params_)

## Save best Light GBM classifier model for future usage
joblib.dump(search_light_gbm.best_estimator_, '../top_models/best_light_gbm.pkl')

Fitting 25 folds for each of 3240 candidates, totalling 81000 fits
Accuracy of the best Light GBM model was: 0.7406477732793522
Hyperparameters of best Light GBM model were: 

 {'model__learning_rate': 0.005, 'model__max_bin': 20, 'model__min_child_samples': 20, 'model__n_estimators': 400, 'model__num_leaves': 10}


['../top_models/best_light_gbm.pkl']