In [None]:
import pandas as pd
import numpy as np
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import ComplementNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.base import clone
from sklearn.metrics import roc_auc_score, recall_score, confusion_matrix

from nn_load_data import LoadNNData
from nn_utils import *
from nn_models import *
import copy

# Cross Validation

In [None]:
def run_cv(X, y, base_clf, cv_num=10, repeat_num=1, seed=0):
    """Run repeated stratified k-fold cross-validation for a scikit-learn style classifier.

    For every repeat the data is split into ``cv_num`` stratified folds. A fresh
    clone of ``base_clf`` is fitted on the training part of each fold and then
    scored on both the training part and the held-out part with
    ``evaluate_cv_results``.

    Parameters
    ----------
    X : array-like
        Feature matrix; must support indexing with an integer index array
        (``X[train_index]``), e.g. a NumPy array.
    y : array-like
        Labels, indexed the same way as ``X``.
    base_clf : estimator
        Unfitted classifier exposing ``fit``, ``predict`` and ``predict_proba``.
        It is cloned for every fold and never fitted itself.
    cv_num : int, default 10
        Number of folds per repeat.
    repeat_num : int, default 1
        Number of times the whole k-fold procedure is repeated.
    seed : int or None, default 0
        Base seed for the fold shuffling; repeat ``r`` uses ``seed + r``.
        ``None`` leaves the shuffling unseeded.

    Returns
    -------
    (train_scores, val_scores) : tuple of pandas.DataFrame
        One row per (repeat, fold) with the columns produced by
        ``evaluate_cv_results``. Both frames are empty when no fold was run.
    """
    # Collect the per-fold frames and concatenate once at the end instead of
    # re-copying a growing DataFrame on every fold.
    train_frames = []
    val_frames = []

    for repeat in range(repeat_num):
        # Without shuffling StratifiedKFold yields exactly the same folds on
        # every repeat, so repeating would add no information. Shuffle with a
        # repeat-specific seed to get different, yet reproducible, splits.
        split_seed = None if seed is None else seed + repeat
        k_folds = StratifiedKFold(n_splits=cv_num, shuffle=True, random_state=split_seed)

        for fold, (train_index, val_index) in enumerate(k_folds.split(X, y), start=1):
            X_train, X_val = X[train_index], X[val_index]
            y_train, y_val = y[train_index], y[val_index]

            clf = clone(base_clf)
            clf.fit(X_train, y_train)

            # Column 1 of predict_proba is the probability of the second class.
            y_train_pred = clf.predict(X_train)
            y_train_prob = clf.predict_proba(X_train)[:, 1]
            train_frames.append(
                evaluate_cv_results(y_train, y_train_pred, y_train_prob, fold, repeat))

            y_val_pred = clf.predict(X_val)
            y_val_prob = clf.predict_proba(X_val)[:, 1]
            val_frames.append(
                evaluate_cv_results(y_val, y_val_pred, y_val_prob, fold, repeat))

    train_scores = pd.concat(train_frames) if train_frames else pd.DataFrame()
    val_scores = pd.concat(val_frames) if val_frames else pd.DataFrame()
    return train_scores, val_scores

def run_NN_cv(X_NN, y_NN, selected_columns, cv_num=10, repeat_num=1, device='cpu', seed=0):
    """Run repeated stratified k-fold cross-validation for the neural network model.

    For every fold, 10% of the training part is held out (stratified) as a
    validation set used only for early stopping. A new ``NeuralNetModule`` is
    trained with Adam and a class-weighted ``BCEWithLogitsLoss``; the model
    snapshot with the best validation AUC is then scored on the training part
    and on the held-out fold with ``evaluate_cv_results``.

    Parameters
    ----------
    X_NN : array-like
        Feature matrix; must support indexing with an integer index array.
    y_NN : array-like
        Labels, indexed the same way as ``X_NN``.
    selected_columns : sized
        Only its length is used, as the network's input size.
    cv_num : int, default 10
        Number of folds per repeat.
    repeat_num : int, default 1
        Number of times the whole k-fold procedure is repeated.
    device : str, default 'cpu'
        Device the model is moved to and that is passed to the training helpers.
    seed : int or None, default 0
        Base seed for the fold shuffling and the early-stopping split; repeat
        ``r`` uses ``seed + r``. ``None`` leaves both unseeded. Weight
        initialisation and batch order are not seeded by this function.

    Returns
    -------
    (train_scores, val_scores) : tuple of pandas.DataFrame
        One row per (repeat, fold). ``val_scores`` holds the scores on the
        held-out fold (not on the early-stopping split). Both frames are empty
        when no fold was run.
    """
    # Hyper-parameters are the same for every fold, so define them once.
    batch_size = 2048
    lr = 0.005
    input_size = len(selected_columns)
    h1_size = 32
    h2_size = 16
    output_size = 1
    epochs = 1000
    patience = 10

    # Collect the per-fold frames and concatenate once at the end.
    train_frames = []
    test_frames = []

    for repeat in range(repeat_num):
        # Without shuffling StratifiedKFold yields exactly the same folds on
        # every repeat; shuffle with a repeat-specific seed instead.
        split_seed = None if seed is None else seed + repeat
        k_folds = StratifiedKFold(n_splits=cv_num, shuffle=True, random_state=split_seed)

        for fold, (train_index, test_index) in enumerate(k_folds.split(X_NN, y_NN), start=1):
            X_train, X_test = X_NN[train_index], X_NN[test_index]
            y_train, y_test = y_NN[train_index], y_NN[test_index]
            # Carve the early-stopping validation set out of the training part.
            X_train, X_val, y_train, y_val = train_test_split(
                X_train, y_train, test_size=0.1, stratify=y_train, random_state=split_seed)

            # load the data
            train_loader = LoadNNData(X_train, y_train, batch_size)
            val_loader = LoadNNData(X_val, y_val, batch_size)
            test_loader = LoadNNData(X_test, y_test, batch_size)

            # instantiate the model
            model = NeuralNetModule(input_size, h1_size, h2_size, output_size)
            model = model.double()
            model.to(device)

            # set training variables
            pos_weight = train_loader.find_pos_weight()
            criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
            optimizer = torch.optim.Adam(model.parameters(), lr=lr)
            sigmoid = nn.Sigmoid()
            bestValAUC = 0.0
            bestValEpoch = 0
            # Fallback snapshot: if the validation AUC never exceeds the initial
            # 0.0, best_model would otherwise be undefined on the first fold or
            # silently still point at the previous fold's model.
            best_model = copy.deepcopy(model)

            # epoch loop
            for ep in range(1, epochs + 1):
                # batch loop
                for inputs, labels in train_loader.loader:
                    train_batch(model, inputs, labels, device, criterion, optimizer)

                train_auc, val_auc = validate_nn(model, device, train_loader, val_loader, sigmoid)

                # early stopping check: keep the best snapshot and stop once the
                # validation AUC has not improved for more than `patience` epochs
                if val_auc > bestValAUC:
                    bestValAUC = val_auc
                    bestValEpoch = ep
                    best_model = copy.deepcopy(model)
                if ep - bestValEpoch > patience:
                    break

            # Score the best snapshot; probabilities above 0.5 count as class 1.
            y_true_nn_train, y_prob_nn_train = get_predictions(best_model, device, train_loader, sigmoid)
            y_pred_nn_train = np.where(y_prob_nn_train > 0.5, 1, 0)
            train_frames.append(
                evaluate_cv_results(y_true_nn_train, y_pred_nn_train, y_prob_nn_train, fold, repeat))

            y_true_nn_test, y_prob_nn_test = get_predictions(best_model, device, test_loader, sigmoid)
            y_pred_nn_test = np.where(y_prob_nn_test > 0.5, 1, 0)
            test_frames.append(
                evaluate_cv_results(y_true_nn_test, y_pred_nn_test, y_prob_nn_test, fold, repeat))

    train_scores = pd.concat(train_frames) if train_frames else pd.DataFrame()
    val_scores = pd.concat(test_frames) if test_frames else pd.DataFrame()
    return train_scores, val_scores

def evaluate_cv_results(y_true, y_pred, y_prob, fold, repeat):
    """Score one cross-validation fold and return the metrics as a one-row DataFrame.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Hard class predictions.
    y_prob : array-like
        Scores/probabilities used for the ROC AUC.
    fold, repeat
        Identifiers stored alongside the metrics.

    Returns
    -------
    pandas.DataFrame
        A single row with columns ``fold``, ``repeat``, ``AUC``,
        ``SENSITIVITY`` and ``SPECIFICITY``.
    """
    roc_auc = roc_auc_score(y_true, y_prob)
    # Sensitivity is the recall of the positive class.
    recall = recall_score(y_true, y_pred)
    # Flattened confusion matrix is (TN, FP, FN, TP); only the first two are
    # needed for specificity = TN / (TN + FP).
    true_neg, false_pos, _false_neg, _true_pos = confusion_matrix(y_true, y_pred).ravel()
    true_neg_rate = true_neg / (true_neg + false_pos)

    row = {
        'fold': fold,
        'repeat': repeat,
        'AUC': roc_auc,
        'SENSITIVITY': recall,
        'SPECIFICITY': true_neg_rate,
    }
    return pd.DataFrame([row])


def summarize_cv_results(cv_results):
    """Summarise per-fold CV scores as a median and a 95% percentile interval.

    Parameters
    ----------
    cv_results : pandas.DataFrame
        Must contain the columns ``AUC``, ``SENSITIVITY`` and ``SPECIFICITY``.

    Returns
    -------
    pandas.DataFrame
        One row per metric with columns ``METRIC``, ``MEDIAN``, ``CI_LOW``
        (2.5th percentile) and ``CI_HIGH`` (97.5th percentile).
    """
    metric_names = ['AUC', 'SENSITIVITY', 'SPECIFICITY']
    # Half of the 5% that lies outside a 95% interval, in percentile points.
    tail = (100 - 95) / 2

    def percentile_per_metric(q):
        # q-th percentile of every metric column, in metric_names order.
        return [np.percentile(cv_results[name], q) for name in metric_names]

    return pd.DataFrame({
        'METRIC': metric_names,
        'MEDIAN': percentile_per_metric(50),
        'CI_LOW': percentile_per_metric(tail),
        'CI_HIGH': percentile_per_metric(100 - tail),
    })

In [None]:
# Load the pickled column names, the selected feature subset and the training data.
# NOTE: pickle.load can execute arbitrary code; only use it on trusted local files.
with open('X_columns.pkl', 'rb') as f:
    X_columns = pickle.load(f)
with open('selected_columns_rf.pkl', 'rb') as f:
    selected_columns = pickle.load(f)

with open('train_X.pkl', 'rb') as f:
    X = pickle.load(f)
with open('train_y.pkl', 'rb') as f:
    y = pickle.load(f)

# Label the feature matrix, keep only the selected features, and convert the
# result back to a plain array so it can be indexed with fold index arrays.
X = pd.DataFrame(X, columns=X_columns)[selected_columns].values

In [None]:
# NB: Complement Naive Bayes, 5 repeats of 10-fold CV; summaries are written to CSV.
nb_clf = ComplementNB(norm=True)
train_cv, val_cv = run_cv(X, y, base_clf=nb_clf, cv_num=10, repeat_num=5)

for scores, out_path in ((train_cv, 'nb_train_cv.csv'), (val_cv, 'nb_val_cv.csv')):
    summarize_cv_results(scores).to_csv(out_path, index=False)

In [None]:
# LR: L1-penalised, class-balanced logistic regression, 5 repeats of 10-fold CV;
# summaries are written to CSV.
lr_params = {
    'max_iter': 50000,
    'solver': 'liblinear',
    'penalty': 'l1',
    'class_weight': 'balanced',
    'C': 0.1,
}
lr_clf = LogisticRegression(**lr_params)
train_cv, val_cv = run_cv(X, y, base_clf=lr_clf, cv_num=10, repeat_num=5)

for scores, out_path in ((train_cv, 'lr_train_cv.csv'), (val_cv, 'lr_val_cv.csv')):
    summarize_cv_results(scores).to_csv(out_path, index=False)

In [None]:
# RF: class-balanced random forest, 5 repeats of 10-fold CV; summaries are
# written to CSV.
rf_params = {
    'n_estimators': 200,
    'max_depth': 20,
    'criterion': 'gini',
    'class_weight': 'balanced',
}
rf_clf = RandomForestClassifier(**rf_params)
train_cv, val_cv = run_cv(X, y, base_clf=rf_clf, cv_num=10, repeat_num=5)

for scores, out_path in ((train_cv, 'rf_train_cv.csv'), (val_cv, 'rf_val_cv.csv')):
    summarize_cv_results(scores).to_csv(out_path, index=False)

In [None]:
# NN: neural network, 5 repeats of 10-fold CV on the CPU; summaries are written to CSV.
train_cv, val_cv = run_NN_cv(X, y, selected_columns,
                             cv_num=10, repeat_num=5, device='cpu')

for scores, out_path in ((train_cv, 'nn_train_cv.csv'), (val_cv, 'nn_val_cv.csv')):
    summarize_cv_results(scores).to_csv(out_path, index=False)