In [None]:
! pip install keras-tuner --upgrade

In [None]:
import os, math, time, random, datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
print(__doc__)

import tensorflow as tf
print(tf.__version__)

# from tensorflow.python.keras.models import Sequential, load_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, Input, LSTM, Conv1D, MaxPool1D, BatchNormalization, Flatten, Dropout
from tensorflow.keras.optimizers import Adam, RMSprop, SGD
from keras_tuner.tuners import RandomSearch, Hyperband
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras import regularizers
import keras_tuner as kt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler

from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve, auc, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_recall_curve, average_precision_score

from sklearn.feature_selection import chi2

from mlxtend.plotting import plot_decision_regions

In [None]:
class SelectiveProgbarLogger(tf.keras.callbacks.ProgbarLogger):
    """Progress-bar logger whose verbosity is switched per epoch.

    At the start of every epoch ``self.verbose`` is set to the ``verbose``
    value given at construction when the epoch index is a multiple of
    ``epoch_interval``, and to 0 otherwise. Everything else is delegated to
    ``ProgbarLogger``.

    Args:
        verbose: verbosity value to apply on the selected epochs.
        epoch_interval: apply ``verbose`` on every ``epoch_interval``-th epoch
            (epoch 0 included); must be non-zero because it is used as a
            modulus.
        *args, **kwargs: forwarded unchanged to ``ProgbarLogger.__init__``.
    """

    def __init__(self, verbose, epoch_interval, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.epoch_interval = epoch_interval
        self.default_verbose = verbose

    def on_epoch_begin(self, epoch, *args, **kwargs):
        # Pick the verbosity for this epoch before the parent hook runs.
        if epoch % self.epoch_interval == 0:
            self.verbose = self.default_verbose
        else:
            self.verbose = 0
        super().on_epoch_begin(epoch, *args, **kwargs)

In [None]:
def calculate_performace(test_num, pred_y, labels):
    """Compute binary-classification metrics from hard predictions.

    The label value 1 is treated as the positive class; every other label
    value is treated as negative.

    Args:
        test_num: number of leading samples of ``labels``/``pred_y`` to score.
        pred_y: indexable sequence of predicted labels.
        labels: indexable sequence of true labels.

    Returns:
        Tuple ``(acc, precision, sensitivity, specificity, MCC)``. A metric
        whose denominator is zero (for example precision when nothing was
        predicted positive, or every metric when ``test_num`` is 0) is
        reported as 0.0 instead of raising ``ZeroDivisionError`` or yielding
        nan/inf.
    """
    tp = fp = tn = fn = 0
    for index in range(test_num):
        is_correct = labels[index] == pred_y[index]
        if labels[index] == 1:
            if is_correct:
                tp += 1
            else:
                fn += 1
        else:
            if is_correct:
                tn += 1
            else:
                fp += 1

    def _ratio(numerator, denominator):
        # Undefined ratios are reported as 0.0 so one degenerate split does
        # not abort a whole evaluation run.
        return float(numerator) / denominator if denominator else 0.0

    acc = _ratio(tp + tn, test_num)
    precision = _ratio(tp, tp + fp)
    sensitivity = _ratio(tp, tp + fn)
    specificity = _ratio(tn, tn + fp)
    mcc_denominator = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    MCC = _ratio(tp * tn - fp * fn, mcc_denominator)
    return acc, precision, sensitivity, specificity, MCC

def plot_confusion_matrix(cm, classes,
                        normalize=False,
                        title='Confusion matrix',
                        cmap=plt.cm.Blues):
    """
    Print and plot a confusion matrix.

    Args:
        cm: 2-D array of counts, indexed as cm[true, predicted]
            (the y axis is labelled 'True label', the x axis 'Predicted label').
        classes: tick labels, one per class.
        normalize: when True every row is divided by its row sum before
            printing and plotting.
        title: figure title.
        cmap: matplotlib colormap used for the image.
    """
    # Normalise before drawing so that the image, the colour threshold and the
    # cell annotations all refer to the same numbers.
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    # NOTE(review): the x limits are applied to the y axis as well. This
    # presumably works around clipped top/bottom rows, but it may also change
    # the direction of the y axis -- confirm the row order is as intended.
    xmin, xmax = plt.xlim()
    plt.xlim((xmin, xmax))
    plt.ylim(xmin, xmax)

    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            cell_text = format(cm[i, j], '.2f') if normalize else cm[i, j]
            plt.text(j, i, cell_text,
                horizontalalignment="center",
                color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

def plot_performance(train_loss,val_loss,train_acc,val_acc):
    """Show two figures: train/validation loss and train/validation accuracy.

    Args:
        train_loss, val_loss: per-epoch loss values for the two curves.
        train_acc, val_acc: per-epoch accuracy values for the two curves.
    """
    # Figure 1: loss curves.
    plt.plot(train_loss, color='blue', label='train')
    plt.plot(val_loss, color='orange', label='validation')
    plt.title('model train vs validation loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper right')
    plt.show()

    # Figure 2: accuracy curves (previously mis-titled as "loss").
    plt.plot(train_acc, color='blue', label='train')
    plt.plot(val_acc, color='orange', label='validation')
    plt.title('model train vs validation accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper right')
    plt.show()
def transfer_label_from_prob(proba):
    """Map each probability to a hard label: 1 when it is >= 0.5, otherwise 0."""
    label = []
    for val in proba:
        if val >= 0.5:
            label.append(1)
        else:
            label.append(0)
    return label
def plot_roc_curve(labels, probality, legend_text, auc_tag=True):
    """Add one ROC curve to the current matplotlib axes.

    Args:
        labels: true binary labels.
        probality: scores passed to ``roc_curve`` as the prediction scores.
        legend_text: legend entry for the curve.
        auc_tag: when True the AUC value is appended to the legend entry.
    """
    fpr, tpr, _ = roc_curve(labels, probality)
    roc_auc = auc(fpr, tpr)
    curve_label = legend_text
    if auc_tag:
        curve_label = legend_text + ' (AUC=%6.3f) ' % roc_auc
    plt.plot(fpr, tpr, label=curve_label)

In [None]:
str1='250629'

# ReduceLROnPlateau patience per architecture; the keys are also the only
# model names evaluate_classifiers knows how to feed.
_LR_PATIENCE = {'DNN': 100, 'GRU': 100, 'LSTM': 100, 'CNN': 150}


def _shape_inputs(model_name, X, fea_dim):
    """Return X in the layout fed to `model_name`, or None for an unknown name."""
    if model_name == 'DNN':
        return X
    if model_name in ('GRU', 'LSTM'):
        return np.array(X).reshape(-1, 1, fea_dim)
    if model_name == 'CNN':
        return np.array(X).reshape(-1, fea_dim, 1)
    return None


def _training_callbacks(model_name, ep_interval):
    """Build the flat list of callbacks used while fitting `model_name`."""
    return [
        SelectiveProgbarLogger(verbose = 0, epoch_interval=ep_interval),
        EarlyStopping(patience=250, monitor='val_accuracy', restore_best_weights=True),
        ReduceLROnPlateau(factor=0.25, patience=_LR_PATIENCE[model_name], min_lr=1e-5),
        ModelCheckpoint(str1+'_'+model_name+'.weights.h5', monitor='val_accuracy', verbose=1, save_best_only=True, save_weights_only=True),
    ]


def evaluate_classifiers(clfList, X_train, y_train, X_test, y_test, fea_dim, training=True, epoc=100, ep_interval=10, cutoff=0.5):
    """Optionally train, then score, each (name, model) pair on the test set.

    Args:
        clfList: iterable of ``(name, model)`` pairs; ``name`` must be one of
            'DNN', 'GRU', 'LSTM' or 'CNN'. Other names are reported and skipped.
        X_train, y_train: training features and labels.
        X_test, y_test: test features and labels.
        fea_dim: number of feature columns, used to reshape the inputs for the
            recurrent models (-1, 1, fea_dim) and the CNN (-1, fea_dim, 1).
        training: when True each model is fitted (batch size 64, 10% of the
            training data held out for validation) before it is scored; when
            False the models are scored as they are.
        epoc: maximum number of epochs.
        ep_interval: epoch interval passed to SelectiveProgbarLogger.
        cutoff: probability threshold (>=) for predicting class 1.

    Returns:
        ``(scoreDataFrame, _proba, history_list)``: a DataFrame with one row of
        metrics per successfully evaluated model, the list of raw model
        outputs in the same order, and a DataFrame with columns 'Model' and
        'history'. Without training, 'Training Time' is NaN and 'history' is
        None.
    """
    results = {'Model': [], 'Accuracy': [], 'ROC AUC': [], 'F1 Score': [],
               'Sens': [], 'Spec': [], 'MCC': [], 'Training Time': []}
    _proba = []
    _histories = []  # fit() return values, or None when training is skipped

    for clf in clfList:
        name, model = clf[0], clf[1]

        train_inputs = _shape_inputs(name, X_train, fea_dim)
        test_inputs = _shape_inputs(name, X_test, fea_dim)
        if test_inputs is None:
            # Never fall through with a stale `proba` from a previous model.
            print("Skipping unsupported model type: "+str(name))
            continue

        history = None
        training_time = float('nan')
        try:
            if training:
                print("Traning "+name)
                callbacks = _training_callbacks(name, ep_interval)
                time_start = time.time()
                history = model.fit(train_inputs, y_train,
                                    shuffle=True, epochs=epoc,
                                    callbacks=callbacks,
                                    batch_size=64,
                                    validation_split=0.1)
                training_time = time.time() - time_start
            print("Evaluating "+name)
            proba = model.predict(test_inputs)
        except Exception as e:
            # Best effort: report the failure and carry on with the next model.
            print("The error is: ",e)
            continue

        r_pred = [1 if val >= cutoff else 0 for val in proba]
        acc, prec, sens, spec, mcc = calculate_performace(len(y_test), r_pred, y_test)
        f1score = f1_score(y_true=y_test, y_pred=r_pred)
        fpr, tpr, threshold = roc_curve(y_test, proba)
        auc_clf = auc(fpr, tpr)

        # Record everything for this model in one place so the parallel lists
        # cannot go out of sync when an earlier model failed.
        results['Model'].append(name)
        results['Accuracy'].append(acc)
        results['ROC AUC'].append(auc_clf)
        results['F1 Score'].append(f1score)
        results['Sens'].append(sens)
        results['Spec'].append(spec)
        results['MCC'].append(mcc)
        results['Training Time'].append(training_time)
        _proba.append(proba)
        _histories.append(history)

    scoreDataFrame = pd.DataFrame(results)
    history_list = pd.DataFrame({'Model': results['Model'], 'history': _histories})
    return scoreDataFrame, _proba, history_list

In [None]:
def plot_histories(history_list):
    """
    Plot and save training/validation accuracy and loss curves for each model.

    Args:
        history_list (pd.DataFrame): one row per model with a 'Model' column
            (model name) and a 'history' column holding either None or an
            object whose ``.history`` dict has the keys 'accuracy',
            'val_accuracy', 'loss' and 'val_loss'.

    For every row with a history, a two-panel figure is shown and written to
    ``<str1> <model name> ACC_VS_LOSS.png``; rows without one are reported.
    """
    # (subplot position, training key, validation key, axis/title text)
    panels = (
        (1, 'accuracy', 'val_accuracy', 'Accuracy'),
        (2, 'loss', 'val_loss', 'Loss'),
    )

    for _, entry in history_list.iterrows():
        model_name = entry['Model']
        history = entry['history']

        if history is None:
            print(f"No training history available for {model_name}")
            continue

        plt.figure(figsize=(12, 5))
        for position, train_key, val_key, caption in panels:
            plt.subplot(1, 2, position)
            plt.plot(history.history[train_key])
            plt.plot(history.history[val_key])
            plt.title(f'{model_name} {caption}')
            plt.ylabel(caption)
            plt.xlabel('Epoch')
            plt.legend(['Train', 'Validation'], loc='upper left')

        plt.tight_layout()
        plt.savefig(str1+' '+model_name+' ACC_VS_LOSS.png', dpi=300, bbox_inches='tight')
        plt.show()

In [None]:
# Split the data into train/test parts and min-max scale both
def getSplitDataSet(X, y, ratio=0.2, random_state=245):
    """Split X/y into train and test parts and min-max scale the features.

    The scaler is fitted on the training rows only and then applied to both
    parts, so no information about the test rows leaks into the scaling.
    As a consequence, scaled test values may fall slightly outside [0, 1].

    Args:
        X: feature matrix.
        y: labels aligned with the rows of X.
        ratio: fraction of the rows placed in the test part.
        random_state: seed for the shuffled split (245 reproduces the
            previously hard-coded split).

    Returns:
        ``(X_train, X_test, y_train, y_test, scaler)`` where ``scaler`` is the
        fitted MinMaxScaler.
    """
    # Split first; the row assignment depends only on the row count and seed.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=ratio, random_state=random_state)

    scaler = MinMaxScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test, scaler

In [None]:
# Load the dataset: a header-less CSV in which every value is parsed as float.
url_dataset ='PSCV_all.csv'
df_main = pd.read_csv(url_dataset, header=None, dtype=float)

# Column 0 is used as the label, all remaining columns as features.
y = df_main.iloc[:,:1].values 
X = df_main.iloc[:,1:].values

# Encode the labels as consecutive integers; `labels` and `y` refer to the same array.
encoder = LabelEncoder()
labels = y = encoder.fit_transform(y.ravel())


# Split the data into training and testing sets
X_train, X_test, y_train, y_test, minmax_scaler = getSplitDataSet(X, y, ratio=0.2)

# Number of feature columns before feature selection.
fea_dim = X_train.shape[1]
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

In [None]:
def get_bad_features(pValue,thr):
    """Return the positions ``i`` for which ``pValue[i]`` is strictly below ``thr``.

    NOTE(review): a small chi-squared p-value usually signals a strong
    association with the label, so flagging p < thr as "bad" may be inverted
    relative to common practice -- confirm the comparison direction is intended.
    """
    return [i for i in range(len(pValue)) if pValue[i] < thr]

In [None]:
# Feature selection using Chi-Squared test
# This is a statistical test to determine if there is a significant association between the features and the
# class label. It is computed on the training split only.
f_score1 = chi2(X_train,y_train)
# chi2 returns (statistics, p-values); keep the p-values.
pValue1 = pd.Series(f_score1[1])
thr = 0.4 # best at thr = 0.4
# Column indices whose p-value is below thr (see get_bad_features).
list_bad_feature1 = get_bad_features(pValue1,thr)
# Persist the dropped column indices to disk.
np.save('list_bad_feature_fvs.npy',list_bad_feature1)
# Remove the flagged columns from both splits.
X_train1 = np.delete(X_train, list_bad_feature1, axis=1)
X_test1 = np.delete(X_test, list_bad_feature1, axis=1)

In [None]:
# Number of feature columns left after feature selection.
fea_dim1 = X_train1.shape[1]
print(X_train1.shape, y_train.shape)
print(X_test1.shape, y_test.shape)

In [None]:
# Reshaping for CNN models: (samples, fea_dim1, 1), i.e. one channel per feature position
XTrainCNN =np.array(X_train1).reshape(-1, fea_dim1, 1)
XTestCNN = np.array(X_test1).reshape(-1, fea_dim1, 1)
# Reshaping for RNN models: (samples, 1, fea_dim1), i.e. a single time step holding all features
XTrainRNN =np.array(X_train1).reshape(-1, 1, fea_dim1)
XTestRNN = np.array(X_test1).reshape(-1, 1, fea_dim1)

In [None]:
input_shape=X_train1.shape[1:]
def build_dnn_model(hp):
    """Build and compile the two-hidden-layer dense classifier for KerasTuner.

    Hyperparameters read from ``hp``: 'l2_reg', 'units', 'dropout_rate',
    'learning_rate', 'optimizer' and 'loss'. Both hidden layers use the same
    'units', 'l2_reg' and 'dropout_rate' values, because only one
    hyperparameter of each name is defined. The names are left unchanged so
    that tuner results stored under these names keep matching.

    Args:
        hp: KerasTuner ``HyperParameters`` object.

    Returns:
        A compiled ``Sequential`` model with a single sigmoid output.
    """
    model = Sequential()
    # Hyperparameters to tune (registered in this order)
    l2_factor = hp.Choice('l2_reg', [1e-2, 1e-3, 1e-4])
    # Tune the number of units in the Dense layers
    hp_units = hp.Int('units', min_value=32, max_value=512, step=32)
    # Tune the dropout rate
    hp_dropout = hp.Float('dropout_rate', min_value=0.1, max_value=0.5, step=0.1)
    # Tune the learning rate for the optimizer
    learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

    # Take the shape from X_train1 directly: the module-level `input_shape`
    # is rebound by the GRU/LSTM/CNN cells, so reading it at call time would
    # make this builder depend on cell execution order.
    model.add(tf.keras.layers.Input(shape=X_train1.shape[1:]))
    model.add(Dense(units=hp_units, activation='relu', kernel_regularizer = regularizers.l2(l2_factor)))
    model.add(Dropout(rate=hp_dropout))

    # Second hidden block, same hyperparameter values as the first.
    model.add(Dense(units=hp_units, activation='relu', kernel_regularizer = regularizers.l2(l2_factor)))
    model.add(Dropout(rate=hp_dropout))

    model.add(Dense(1, activation='sigmoid'))

    # Optimizer selection
    optimizer_choice = hp.Choice('optimizer', ['adam', 'rmsprop', 'sgd'])
    if optimizer_choice == 'adam':
        optimizer = Adam(learning_rate=learning_rate)
    elif optimizer_choice == 'rmsprop':
        optimizer = RMSprop(learning_rate=learning_rate)
    else:
        optimizer = SGD(learning_rate=learning_rate)

    # Compile model with tunable loss
    loss_choice = hp.Choice('loss', ['binary_crossentropy', 'mse'])
    model.compile(optimizer=optimizer, loss=loss_choice, metrics=['accuracy'])

    return model

# Random-search tuner for the dense model; overwrite=False keeps whatever is
# already stored under dlm/dnn_tuning.
tuner = kt.RandomSearch(
    build_dnn_model,
    objective='val_accuracy',
    max_trials=10,  # Number of hyperparameter combinations to try
    executions_per_trial=1,  # How many models to build and fit for each trial
    directory='dlm',
    project_name='dnn_tuning',
    overwrite=False
)
# NOTE(review): no tuner.search(...) call is visible in this notebook, so this
# cell presumably relies on trial results from an earlier run saved in 'dlm' -- confirm.
tuner.reload()
# Build a fresh (untrained) model from the best stored hyperparameters.
best_hps = tuner.get_best_hyperparameters(1)[0]
_dnn = tuner.hypermodel.build(best_hps)

In [None]:
# Build model function for tuner
input_shape=XTrainRNN.shape[1:]
def build_gru_model(hp):
    """Build and compile the GRU classifier for KerasTuner.

    Hyperparameters read from ``hp``: 'l2_reg', 'gru_units', 'gru_dropout',
    'dense_units', 'dense_dropout', 'optimizer', 'learning_rate' and 'loss'.
    The same L2 regularizer is applied to the GRU and the dense layer.

    Args:
        hp: KerasTuner ``HyperParameters`` object.

    Returns:
        A compiled ``Sequential`` model with a single sigmoid output.
    """
    model = Sequential()
    kernel_regularizer = regularizers.l2(hp.Choice('l2_reg', [1e-2, 1e-3, 1e-4]))
    # Take the shape from XTrainRNN directly: the module-level `input_shape`
    # is rebound by other model cells, so reading it at call time would make
    # this builder depend on cell execution order.
    model.add(tf.keras.layers.Input(shape=XTrainRNN.shape[1:]))
    # Tunable GRU units and dropout
    model.add(GRU(units=hp.Int('gru_units', 16, 128, step=16),
                  activation='relu',
                  recurrent_activation='relu',
                  dropout=hp.Float('gru_dropout', 0.1, 0.5, step=0.1),
                  kernel_regularizer = kernel_regularizer,
                  ))

    model.add(Flatten())
    model.add(Dense(units=hp.Int('dense_units', 32, 512, step=32), activation='relu',kernel_regularizer = kernel_regularizer))
    model.add(Dropout(hp.Float('dense_dropout', 0.1, 0.5, step=0.1)))
    model.add(Dense(1, activation='sigmoid'))

    # Optimizer and learning rate
    optimizer_choice = hp.Choice('optimizer', ['adam', 'rmsprop', 'sgd'])
    learning_rate = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])
    if optimizer_choice == 'adam':
        optimizer = Adam(learning_rate=learning_rate)
    elif optimizer_choice == 'rmsprop':
        optimizer = RMSprop(learning_rate=learning_rate)
    else:
        optimizer = SGD(learning_rate=learning_rate)

    # Loss function
    loss_choice = hp.Choice('loss', ['binary_crossentropy', 'mse'])

    model.compile(optimizer=optimizer,
                  loss=loss_choice,
                  metrics=['accuracy'])
    return model
# Random-search tuner for the GRU model; overwrite=False keeps whatever is
# already stored under dlm/gru_tuning.
tuner = kt.RandomSearch(
    build_gru_model,
    objective='val_accuracy',
    max_trials=100,
    executions_per_trial=1,
    directory='dlm',
    project_name='gru_tuning',
    overwrite=False
)
# NOTE(review): no tuner.search(...) call is visible in this notebook, so this
# cell presumably relies on trial results from an earlier run saved in 'dlm' -- confirm.
tuner.reload()
best_hps = tuner.get_best_hyperparameters(1)[0]
_gru = tuner.hypermodel.build(best_hps)
# Re-compile with a fixed optimizer and loss; this replaces the optimizer,
# learning rate and loss selected by the stored hyperparameters.
optimizer = Adam(learning_rate=0.0001)
_gru.compile(optimizer=optimizer, loss='mse', metrics=['accuracy'])

In [None]:
input_shape=XTrainRNN.shape[1:]
def build_lstm_model(hp):
    """Build and compile the stacked two-layer LSTM classifier for KerasTuner.

    Hyperparameters read from ``hp``: 'l2_reg', 'lstm1_units',
    'lstm1_dropout', 'lstm2_units', 'lstm2_dropout', 'dense_units',
    'dense_dropout', 'optimizer', 'learning_rate' and 'loss'. The same L2
    regularizer is applied to both LSTM layers and the dense layer.

    Args:
        hp: KerasTuner ``HyperParameters`` object.

    Returns:
        A compiled ``Sequential`` model with a single sigmoid output.
    """
    model = Sequential()
    kernel_regularizer = regularizers.l2(hp.Choice('l2_reg', [1e-2, 1e-3, 1e-4]))
    # Take the shape from XTrainRNN directly: the module-level `input_shape`
    # is rebound by other model cells, so reading it at call time would make
    # this builder depend on cell execution order.
    model.add(tf.keras.layers.Input(shape=XTrainRNN.shape[1:]))
    # First LSTM layer (returns the full sequence for the second LSTM)
    model.add(LSTM(
        units=hp.Int('lstm1_units', 64, 256, step=64),
        return_sequences=True,
        activation='relu',
        dropout=hp.Float('lstm1_dropout', 0.1, 0.5, step=0.1),
        kernel_regularizer = kernel_regularizer,
    ))

    # Second LSTM layer (returns only the last output)
    model.add(LSTM(
        units=hp.Int('lstm2_units', 32, 128, step=32),
        return_sequences=False,
        activation='relu',
        kernel_regularizer = kernel_regularizer,
        dropout=hp.Float('lstm2_dropout', 0.1, 0.5, step=0.1)
    ))

    # Dense layer
    model.add(Dense(units=hp.Int('dense_units', 32, 128, step=32), activation='relu', kernel_regularizer = kernel_regularizer))
    model.add(Dropout(hp.Float('dense_dropout', 0.1, 0.5, step=0.1)))
    model.add(Dense(1, activation='sigmoid'))

    # Optimizer and learning rate
    optimizer_choice = hp.Choice('optimizer', ['adam', 'rmsprop', 'sgd'])
    learning_rate = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])
    if optimizer_choice == 'adam':
        optimizer = Adam(learning_rate=learning_rate)
    elif optimizer_choice == 'rmsprop':
        optimizer = RMSprop(learning_rate=learning_rate)
    else:
        optimizer = SGD(learning_rate=learning_rate)

    # Loss function
    loss_choice = hp.Choice('loss', ['binary_crossentropy', 'mse'])

    model.compile(optimizer=optimizer, loss=loss_choice, metrics=['accuracy'])
    return model

# Random-search tuner for the LSTM model; overwrite=False keeps whatever is
# already stored under dlm/lstm_tuning.
tuner = kt.RandomSearch(
    build_lstm_model,
    objective='val_accuracy',
    max_trials=100,
    executions_per_trial=1,
    directory='dlm',
    project_name='lstm_tuning',
    overwrite=False
)
# NOTE(review): no tuner.search(...) call is visible in this notebook, so this
# cell presumably relies on trial results from an earlier run saved in 'dlm' -- confirm.
tuner.reload()
# Build a fresh (untrained) model from the best stored hyperparameters.
best_hps = tuner.get_best_hyperparameters(1)[0]
_lstm = tuner.hypermodel.build(best_hps)

In [None]:
input_shape=XTrainCNN.shape[1:]
def build_cnn_model(hp):
    """Build and compile the two-block 1-D CNN classifier for KerasTuner.

    Each convolutional block is Conv1D -> MaxPool1D(2) -> BatchNormalization
    -> Dropout. Hyperparameters read from ``hp``: 'l2_reg', 'filters',
    'kernel_size', 'dropout_rate', 'filters2', 'kernel_size2',
    'dropout_rate2', 'dense_units', 'final_dropout', 'optimizer',
    'learning_rate' and 'loss'. The same L2 regularizer is applied to both
    convolutions and the dense layer.

    Args:
        hp: KerasTuner ``HyperParameters`` object.

    Returns:
        A compiled ``Sequential`` model with a single sigmoid output.
    """
    model = Sequential()
    kernel_regularizer = regularizers.l2(hp.Choice('l2_reg', [1e-2, 1e-3, 1e-4]))
    # Take the shape from XTrainCNN directly: the module-level `input_shape`
    # is rebound by other model cells, so reading it at call time would make
    # this builder depend on cell execution order.
    model.add(tf.keras.layers.Input(shape=XTrainCNN.shape[1:]))
    # First convolutional block
    model.add(Conv1D(filters=hp.Choice('filters', [8, 16, 32, 64]),
                     kernel_size=hp.Choice('kernel_size', [3, 5, 7, 8]),
                     activation='relu', padding='same',
                     kernel_regularizer = kernel_regularizer,
                     ))
    model.add(MaxPool1D(pool_size=2))
    model.add(BatchNormalization())
    model.add(Dropout(hp.Float('dropout_rate', 0.2, 0.5, step=0.1)))

    # Second convolutional block
    model.add(Conv1D(filters=hp.Choice('filters2', [8, 16, 32, 64]),
                     kernel_size=hp.Choice('kernel_size2', [3, 5, 7, 8]),
                     kernel_regularizer = kernel_regularizer,
                     activation='relu', padding='same'))
    model.add(MaxPool1D(pool_size=2))
    model.add(BatchNormalization())
    model.add(Dropout(hp.Float('dropout_rate2', 0.2, 0.5, step=0.1)))

    # Classification head
    model.add(Flatten())
    model.add(Dense(units=hp.Int('dense_units', 32, 512, step=32), activation='relu', kernel_regularizer = kernel_regularizer))
    model.add(Dropout(hp.Float('final_dropout', 0.2, 0.5, step=0.1)))
    model.add(Dense(1, activation='sigmoid'))

    # Optimizer selection
    optimizer_choice = hp.Choice('optimizer', ['adam', 'rmsprop', 'sgd'])
    learning_rate = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])
    if optimizer_choice == 'adam':
        optimizer = Adam(learning_rate=learning_rate)
    elif optimizer_choice == 'rmsprop':
        optimizer = RMSprop(learning_rate=learning_rate)
    else:
        optimizer = SGD(learning_rate=learning_rate)

    # Compile model with tunable loss
    loss_choice = hp.Choice('loss', ['binary_crossentropy', 'mse'])
    model.compile(optimizer=optimizer, loss=loss_choice, metrics=['accuracy'])
    return model

# Random-search tuner for the CNN model; overwrite=False keeps whatever is
# already stored under dlm/cnn_tuning.
tuner = kt.RandomSearch(
    build_cnn_model,
    objective='val_accuracy',
    max_trials=10,
    executions_per_trial=1,
    directory='dlm',
    project_name='cnn_tuning',
    overwrite=False
)
# NOTE(review): no tuner.search(...) call is visible in this notebook, so this
# cell presumably relies on trial results from an earlier run saved in 'dlm' -- confirm.
tuner.reload()
# Build a fresh (untrained) model from the best stored hyperparameters.
best_hps = tuner.get_best_hyperparameters(1)[0]
_cnn = tuner.hypermodel.build(best_hps)

In [None]:
# (name, model) pairs; the name selects the input layout inside evaluate_classifiers.
_Clfs = [
        ('DNN', _dnn),
        ('GRU', _gru),
        ('LSTM', _lstm),
        ('CNN', _cnn)
        ]

# Train every model on the selected features and score it on the held-out test split.
scores, proba, histories = evaluate_classifiers(_Clfs, X_train1, y_train, 
                                     X_test1, y_test, fea_dim1,
                                     training=True, 
                                     epoc=1500, ep_interval=10, cutoff=0.5)
print(scores)

In [None]:
# Plot and save the accuracy/loss curves recorded for each trained model.
plot_histories(histories)

In [None]:
# Persist the evaluation artefacts. `scores` and `histories` are the pandas
# DataFrames returned by evaluate_classifiers, `proba` is the list of model outputs.
np.save('socres_model_names.npy',scores)
np.save('proba_models.npy',proba)
np.save('y_test.npy',y_test)
# NOTE(review): the 'history' column holds the objects returned by fit();
# np.save presumably has to pickle them -- confirm this file can be loaded back.
np.save('models_history.npy',histories)

In [None]:
import matplotlib.pyplot as plt
def drawCLF_AUC(modelList, probaList, y_test, highlight='CNN', save_path='ROC_Curve.png'):
    """Plot the ROC curves of several models on one figure and save it.

    For every model a classification report (0.5 cut-off) and the AUC are
    printed, and its ROC curve is added to the figure together with a dashed
    chance diagonal.

    Args:
        modelList: iterable of model names, aligned with ``probaList``.
        probaList: iterable of predicted scores, one entry per model.
        y_test: true binary labels shared by all models.
        highlight: name of the model whose curve is drawn with a thicker line.
        save_path: file the figure is written to.
    """
    fig, ax = plt.subplots()
    fig.set_figheight(6)
    fig.set_figwidth(8)
    fig.set_dpi(100)

    for proba, name in zip(probaList, modelList):
        print("Model ", name)

        y_pred = [1 if val >= 0.5 else 0 for val in proba]
        print('Classification Report:\n', classification_report(y_test, y_pred))

        fpr, tpr, _ = roc_curve(y_test, proba)
        roc_auc = auc(fpr, tpr)
        print('AUC', roc_auc)

        # One plot call for every model; only the line width differs.
        ax.plot(fpr, tpr,
                label=' {} (AUC = {:0.4f})'.format(name, roc_auc),
                lw=2 if name == highlight else 1, alpha=.8)
        print('---' * 30)

    # Chance-level diagonal for reference.
    ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', alpha=.6)

    ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05])
    ax.legend(loc="lower right")
    plt.xlabel('False Positive Rate (1 - Specificity) -->')
    plt.ylabel('True Positive Rate (Sensitivity) -->')
    plt.grid()

    plt.title('ROC Curve for Different Deep Learning Models')
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.show()

# ROC curves for all evaluated models, in the order stored in `scores`.
drawCLF_AUC(scores['Model'], proba, y_test)

In [None]:
import matplotlib.pyplot as plt
def drawCLF_PRCurve(modelList, probaList, y_test, highlight='CNN', save_path='PR_Curve.png'):
    """Plot the precision-recall curves of several models on one figure and save it.

    For every model a classification report (0.5 cut-off) and the average
    precision (printed and labelled as 'mAP') are reported, and its curve is
    drawn with recall on the x axis and precision on the y axis.

    Args:
        modelList: iterable of model names, aligned with ``probaList``.
        probaList: iterable of predicted scores, one entry per model.
        y_test: true binary labels shared by all models.
        highlight: name of the model whose curve is drawn with a thicker line.
        save_path: file the figure is written to.
    """
    fig, ax = plt.subplots()
    fig.set_figheight(6)
    fig.set_figwidth(8)
    fig.set_dpi(100)

    for proba, name in zip(probaList, modelList):
        y_pred = [1 if val >= 0.5 else 0 for val in proba]
        print('Classification Report:\n', classification_report(y_test, y_pred))

        precision, recall, _ = precision_recall_curve(y_test, proba)
        avg_precision = average_precision_score(y_test, proba)
        print('mAP', avg_precision)

        # One plot call for every model; only the line width differs.
        ax.plot(recall, precision,
                label=' {} (mAP = {:0.4f})'.format(name, avg_precision),
                lw=2 if name == highlight else 1, alpha=.8)
        print('---' * 30)

    ax.legend(loc="lower right")
    # Axis labels follow the plotted data: x is recall, y is precision.
    plt.xlabel('Recall -->')
    plt.ylabel('Precision -->')
    plt.grid()
    plt.title('PR-Curve for Different Deep Learning Models')
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.show()

# Precision-recall curves for all evaluated models, in the order stored in `scores`.
drawCLF_PRCurve(scores['Model'], proba, y_test)

### 10-Fold Cross-validation test

In [None]:
def get_final_model():
    """Return a freshly built CNN using the best stored 'cnn_tuning' hyperparameters.

    A RandomSearch tuner is pointed at ``dlm/cnn_tuning`` (``overwrite=False``),
    reloaded, and the best hyperparameters are used to build the model. The
    model is then re-compiled with Adam (learning rate 0.001) and binary
    cross-entropy, which replaces the optimizer and loss chosen by the
    hyperparameters.
    """
    cnn_tuner = kt.RandomSearch(
        build_cnn_model,
        objective='val_accuracy',
        max_trials=10,
        executions_per_trial=1,
        directory='dlm',
        project_name='cnn_tuning',
        overwrite=False
    )
    cnn_tuner.reload()

    top_hps = cnn_tuner.get_best_hyperparameters(1)[0]
    model = cnn_tuner.hypermodel.build(top_hps)

    # Fixed training configuration for the final model.
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model


def hm5c_deepCnn_K_fold_CV(cv=10):
    """Run stratified k-fold cross-validation of the final CNN and plot the ROC curves.

    The rows of ``df_main`` are shuffled (seed 123), the columns listed in
    ``list_bad_feature1`` are dropped, and for every fold a fresh model from
    ``get_final_model()`` is trained and scored. Features are min-max scaled
    per fold, with the scaler fitted on that fold's training rows only.

    Args:
        cv: number of folds.

    Returns:
        ``(foldScore, meanScore)``: a DataFrame with ACC/Sen/Spec/MCC/AUC per
        fold and a one-row DataFrame with the means. 'Mean AUC' is the AUC of
        the averaged ROC curve. ``foldScore`` is also saved to
        'kfold_socre.npy' and the ROC figure to '<cv>_fold_ROC_Curve.png'.
    """
    # Keep the fold count under its own name; it is needed for titles and the
    # output file name after the splitter object has been created.
    num_cross_val = cv
    all_performance = []
    tprs = []
    aucs = []
    accs = []
    specs = []
    sens = []
    mccs = []
    mean_fpr = np.linspace(0, 1, 100)
    fig, ax = plt.subplots()

    df = df_main.sample(frac=1, random_state=123)
    X = df.iloc[:,1:].values
    y = df.iloc[:,:1].values

    encoder = LabelEncoder()
    yT = encoder.fit_transform(y.ravel())

    # Drop the same feature columns as the hold-out experiment. Min-max
    # scaling is column-wise, so it can be applied after the removal.
    XT = np.delete(X, list_bad_feature1, axis=1)
    print(XT.shape, yT.shape)
    _fea_dim = XT.shape[1]

    splitter = StratifiedKFold(n_splits=num_cross_val)
    for fold, (train_index, test_index) in enumerate(splitter.split(XT, yT)):

        print("Fold ", fold)
        train_label, test_label = yT[train_index], yT[test_index]

        # Fit the scaler on the training rows of this fold only, so the test
        # fold does not influence the scaling.
        scaler = MinMaxScaler()
        scaler.fit(XT[train_index])
        train = scaler.transform(XT[train_index])
        test = scaler.transform(XT[test_index])

        clf = get_final_model()
        XTrainCNN = np.array(train).reshape(-1, _fea_dim, 1)
        XTestCNN = np.array(test).reshape(-1, _fea_dim, 1)
        callbacks = [
                        SelectiveProgbarLogger(verbose = 0, epoch_interval=20),
                        EarlyStopping(patience=250, monitor='val_accuracy', restore_best_weights=True),
                        ReduceLROnPlateau(factor=0.25, patience=100, min_lr=1e-5),
                        ModelCheckpoint('{0}_CNN.weights.h5'.format(fold), monitor='val_accuracy', verbose=1, save_best_only=True, save_weights_only=True)
                    ]
        # NOTE(review): the test fold doubles as validation data, so early
        # stopping / best-weight restoring is driven by the same samples the
        # fold is scored on; the reported metrics are likely optimistic.
        # Consider validation_split on the training fold instead.
        clf.fit(XTrainCNN, train_label,
                    shuffle=True, epochs=1000,
                    callbacks=callbacks,
                    batch_size=32,
                    validation_data=(XTestCNN, test_label))

        # Predict on the reshaped test fold, i.e. the layout used for training.
        _proba = clf.predict(XTestCNN)
        y_pred = [1 if val >= 0.5 else 0 for val in _proba]

        acc, precision, sensitivity, specificity, MCC = calculate_performace(len(test_label),
                                                                             y_pred,
                                                                             np.array(test_label))
        accs.append(acc); sens.append(sensitivity); specs.append(specificity); mccs.append(MCC)

        print('Classification Report:\n', classification_report(test_label, y_pred))
        fpr_keras, tpr_keras, _ = roc_curve(test_label, _proba)
        auc_keras = auc(fpr_keras, tpr_keras)
        print('AUC', auc_keras)

        ax.plot(fpr_keras, tpr_keras,
                label='ROC fold {} (AUC = {:0.4f})'.format(fold, auc_keras),
                lw=1, alpha=.6)
        # Resample every fold's curve on a common FPR grid so they can be averaged.
        interp_tpr = np.interp(mean_fpr, fpr_keras, tpr_keras)
        interp_tpr[0] = 0.0
        tprs.append(interp_tpr)
        aucs.append(auc_keras)

        all_performance.append([acc, precision, sensitivity, specificity, MCC])
        print('---' * 30)

    # Chance-level diagonal for reference.
    ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', alpha=.6)

    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    ax.plot(mean_fpr, mean_tpr, color='b',
            label=r'Mean ROC (AUC = %0.4f)' % (mean_auc),
            lw=2, alpha=.8)

    # Shade one standard deviation of the TPR around the mean curve.
    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                    label=r'$\pm$ %0.4f std. dev.'% (std_auc))

    ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05])
    ax.legend(loc="lower right")
    plt.xlabel('False Positive Rate (1 - Specificity) -->')
    plt.ylabel('True Positive Rate (Sensitivity) -->')
    plt.grid()
    plt.title('ROC-Curve ({0}-fold Cross Validation)'.format(num_cross_val))
    plt.savefig('{0}_fold_ROC_Curve.png'.format(num_cross_val), dpi=300, bbox_inches='tight')
    plt.show()

    # Columns of all_performance: acc, precision, sensitivity, specificity, MCC.
    mean_performance = np.mean(np.array(all_performance), axis=0)
    meanACC, meanSens, meanSpec, meanMCC, meanAUC = mean_performance[0], mean_performance[2], mean_performance[3], mean_performance[4], mean_auc

    meanScore = pd.DataFrame({'Mean ACC': meanACC, 'Mean Sens': meanSens, 'Mean Spec': meanSpec,
                              'Mean MCC': meanMCC, 'Mean AUC': meanAUC}, index=[0])
    print('---' * 50)
    foldScore = pd.DataFrame({'ACC': accs, 'Sen':sens, 'Spec':specs, 'MCC':mccs, 'AUC': aucs})

    np.save("kfold_socre.npy",foldScore)
    return foldScore, meanScore

In [None]:
# Run the 10-fold cross-validation of the final CNN (trains one model per fold).
foldScore_hm5C, meanScore_hm5C = hm5c_deepCnn_K_fold_CV(cv=10)

In [None]:
# Per-fold ACC / Sen / Spec / MCC / AUC.
foldScore_hm5C

In [None]:
# Metrics averaged over the folds.
meanScore_hm5C