In [8]:
import sys
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
from os.path import abspath
import numpy as np
import pandas as pd
from utils.generate_network import generate_network
from utils.prepare_data import prepare_data
from utils.popphy_io import get_config, save_params, load_params
from utils.popphy_io import get_stat, get_stat_dict
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from models.PopPhy import PopPhyCNN
from models.CNN1D import CNN1D
from models.MLPNN import MLPNN
from models.RF import RF
from models.SVM import SVM
from models.LASSO import LASSO
import warnings
from datetime import datetime
import json
from cyjupyter import Cytoscape
import warnings
warnings.filterwarnings("ignore")

# Data Preparation

### Reading Configuration
The configuration file (config.py) contains all relevant parameters that can be set by the user. It is loaded and some parameters are stored in variables. The dataset should be found in _../data/_.

In [30]:
# Load the user-editable configuration once; later cells pass `config` on to
# the data-preparation and model constructors.
config = get_config()

# Every value read here lives in the 'Evaluation' section of the configuration.
_eval_section = 'Evaluation'
dataset = config.get(_eval_section, 'DataSet')
filt_thresh = config.get(_eval_section, 'FilterThresh')
num_runs = int(config.get(_eval_section, 'NumberRuns'))        # cross-validation repetitions
num_test = int(config.get(_eval_section, 'NumberTestSplits'))  # folds per repetition

# Location of the selected dataset, relative to the notebook.
path = "../data/" + dataset

### Create Result Directory
The notebook results will be saved in _../results/notebook_results/_ under a directory with the same name as the dataset. The directories are created if they do not exist.

In [None]:
# Directory that receives every CSV/JSON/H5 artifact written by this notebook.
results_dir = "../results/notebook_results/" + dataset

# Re-running the cell must not report a failure just because the directory is
# already there, so an existing directory is handled as its own (successful) case.
if os.path.isdir(results_dir):
    print("Directory %s already exists" % results_dir)
else:
    try:
        # exist_ok guards against the directory appearing between the check and the call.
        os.makedirs(results_dir, exist_ok=True)
    except OSError as err:
        # Kept as a printed message (not a raise) to preserve the original
        # best-effort behaviour, but the underlying reason is now shown.
        print("Creation of the directory %s failed: %s" % (results_dir, err))
    else:
        print("Successfully created the directory %s" % results_dir)

### Prepare Data
The configuration file is passed to _prepare_data()_ to construct the taxonomic tree from the data. Populated taxonomic trees in matrix format are returned as well as their respective original feature vectors and tree vectors, with samples in the same order. Labels are converted to one-hot encoding for training.

In [None]:
# Build the tree matrices, the raw and tree feature vectors, and the labels
# for the configured dataset.
my_maps, raw_x, tree_x, raw_features, tree_features, labels, label_set, g, feature_df = prepare_data(path, config)

# AUC is the headline metric for two classes, MCC otherwise.
num_class = len(np.unique(labels))
metric = "AUC" if num_class == 2 else "MCC"

# Shuffle the samples. The global RNG is reset to the same seed before every
# shuffle so that arrays with the same number of rows receive the same
# permutation and stay aligned with each other.
seed = np.random.randint(100)
for sample_array in (my_maps, raw_x, tree_x, labels):
    np.random.seed(seed)
    np.random.shuffle(sample_array)

# One-hot encode the integer labels by picking rows of an identity matrix.
n_values = np.max(labels) + 1
labels_oh = np.eye(n_values)[labels]

# Height and width of a single populated tree matrix.
tree_row, tree_col = my_maps.shape[1], my_maps.shape[2]

print("There are %d classes...%s" % (num_class, ", ".join(label_set)))

# Column names for the result tables (one per run/fold pair) and one
# StratifiedKFold seed per run.
cv_list = ["Run_" + str(run_idx) + "_CV_" + str(fold_idx)
           for run_idx in range(num_runs)
           for fold_idx in range(num_test)]
seeds = np.random.randint(1000, size=num_runs)

# PopPhy-CNN Cross Validation

A dataframe for saving scores during the cross-validation is set up. Training is done using stratified cross-validation. Data is log transformed and then a MinMax transformation is learned on the training set and applied to the held out set. This allows features to share similar scales while still remaining positive, which is important for the feature evaluation method. Class weights are learned based on the class proportion in the training set. These weights are used for constructing a weighted loss function. PopPhy-CNN models are trained and AUC, MCC, Precision, Recall, and F1 Score are saved.

In [10]:
# Result table: one row per metric, one column per run/fold pair.
popphy_stat_df = pd.DataFrame(index=["AUC", "MCC", "Precision", "Recall", "F1"], columns=cv_list)

# One score table per class label; every fold adds a column of per-feature scores.
feature_scores = {}
for l in label_set:
    feature_scores[l] = pd.DataFrame(index=tree_features)

# Per-class weight = n_samples / (2 * n_samples_in_class). The value does not
# depend on the fold, so it is computed once instead of inside the loop.
# NOTE(review): the counts come from the full label vector, not from each
# training split -- confirm this is intended.
c_prob = [0] * len(np.unique(labels))
for l in np.unique(labels):
    a = float(len(labels))
    b = 2.0 * float((np.sum(labels==l)))
    c_prob[int(l)] = a/b
c_prob = np.array(c_prob).reshape(-1)

for run, seed in enumerate(seeds):
    skf = StratifiedKFold(n_splits=num_test, shuffle=True, random_state=seed)
    for fold, (train_index, test_index) in enumerate(skf.split(my_maps, labels)):
        cv_name = "Run_" + str(run) + "_CV_" + str(fold)

        train_x, test_x = my_maps[train_index,:,:], my_maps[test_index,:,:]
        train_y, test_y = labels_oh[train_index,:], labels_oh[test_index,:]

        # log(x + 1) transform, applied to both splits.
        train_x = np.log(train_x + 1)
        test_x = np.log(test_x + 1)

        # Sample weight = weight of the sample's class.
        train_weights = c_prob[np.argmax(train_y, 1)]

        # MinMax scaling is fitted on the training split only and then applied to
        # both splits; clipping keeps held-out values inside [0, 1].
        scaler = MinMaxScaler().fit(train_x.reshape(-1, tree_row * tree_col))
        train_x = np.clip(scaler.transform(train_x.reshape(-1, tree_row * tree_col)), 0, 1).reshape(-1, tree_row, tree_col)
        test_x = np.clip(scaler.transform(test_x.reshape(-1, tree_row * tree_col)), 0, 1).reshape(-1, tree_row, tree_col)

        train = [train_x, train_y]
        test = [test_x, test_y]

        popphy_model = PopPhyCNN((tree_row, tree_col), num_class, config)

        # Print the architecture and the table header once, before the first fold.
        if run == 0 and fold == 0:
            print(popphy_model.model.summary())
            print("\n\n Run\tFold\t%s" % (metric))

        popphy_model.train(train, train_weights)
        preds, stats = popphy_model.test(test)

        # Write with a single .loc[row, col] call. The chained form
        # df.loc[row][col] = v assigns into an intermediate Series and is not
        # guaranteed to update the frame.
        if num_class == 2:
            popphy_stat_df.loc["AUC", cv_name] = stats["AUC"]
        for stat_name in ("MCC", "Precision", "Recall", "F1"):
            popphy_stat_df.loc[stat_name, cv_name] = stats[stat_name]

        if metric == "AUC":
            print("# %d\t%d\t%.3f" % (run, fold, stats["AUC"]))
        if metric == "MCC":
            print("# %d\t%d\t%.3f\t" % (run, fold, stats["MCC"]))

        # Store this fold's feature scores, one column per class label.
        scores = popphy_model.get_feature_scores(train, g, label_set, tree_features, config)
        for class_idx, lab in enumerate(label_set):
            feature_scores[lab][cv_name] = scores[:, class_idx]

        popphy_model.destroy()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gaussian_noise (GaussianNois (None, 10, 179, 1)        0         
_________________________________________________________________
conv_0 (Conv2D)              (None, 8, 170, 32)        992       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 4, 85, 32)         0         
_________________________________________________________________
flatten (Flatten)            (None, 10880)             0         
_________________________________________________________________
dropout (Dropout)            (None, 10880)             0         
_________________________________________________________________
fc_0 (Dense)                 (None, 32)                348192    
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0

# Saving Metrics

Evaluation metrics for each partition of each cross-validation run and the mean metrics are saved in the results directory.

In [12]:
# Save the per-fold PopPhy-CNN metrics, then show the table as the cell output.
popphy_eval_csv = results_dir + "/popphy_evaluation.csv"
popphy_stat_df.to_csv(popphy_eval_csv)
popphy_stat_df

Unnamed: 0,Run_0_CV_0,Run_0_CV_1,Run_0_CV_2,Run_0_CV_3,Run_0_CV_4,Run_0_CV_5,Run_0_CV_6,Run_0_CV_7,Run_0_CV_8,Run_0_CV_9
AUC,0.972222,0.777778,0.979167,0.909722,0.833333,0.984848,0.871212,1,0.92562,0.85124
MCC,0.707107,0.338062,0.774597,0.53033,0.516459,0.825758,0.568182,1,0.730297,0.547723
Precision,0.875,0.671429,0.9,0.78125,0.775362,0.913043,0.785573,1,0.866667,0.775
Recall,0.833333,0.666667,0.875,0.75,0.73913,0.913043,0.782609,1,0.863636,0.772727
F1,0.828571,0.664336,0.873016,0.742857,0.73311,0.913043,0.782609,1,0.863354,0.772257


In [13]:
# Average every metric across all run/fold columns, save it, and display it.
popphy_mean_stats = popphy_stat_df.mean(1)
popphy_mean_stats.to_csv(results_dir + "/popphy_mean_evaluation.csv")
popphy_mean_stats

AUC          0.910514
MCC          0.653851
Precision    0.834332
Recall       0.819615
F1           0.817315
dtype: float64

# Network Construction

The _generate_network()_ function returns a network dictionary and a dataframe of feature scores from the tree. The scores are saved as a CSV file. The _network_ object can be used in Cytoscape's Jupyter Notebook API. To visualize the network using the desktop application for Cytoscape, the network is converted to JSON and saved in the result directory. This file can be imported into Cytoscape's desktop application. An XML style file is provided in the _cytoscape_style_ directory which can also be loaded to visualize the tree based on a score gradient.

In [14]:
# Combine the per-fold feature scores with the tree `g` into a network
# dictionary and a score table.
network, tree_scores = generate_network(g, feature_scores, label_set)

# Score table as CSV.
feature_scores_csv = results_dir + '/feature_scores.csv'
tree_scores.to_csv(feature_scores_csv)

# Network as pretty-printed JSON with sorted keys.
network_json = results_dir + '/network.json'
with open(network_json, 'w') as json_file:
    json.dump(
        network,
        json_file,
        sort_keys=True,
        indent=4,
        separators=(',', ': '),
    )

# Final PopPhy-CNN Model

After evaluation, a full model is trained using the entire dataset. This model can be used for future predictions and is saved in the results directory as an H5 file.

In [15]:
# Apply the same preprocessing as in cross-validation: log(x + 1) followed by
# MinMax scaling. The log step was previously missing here, so the saved model
# was trained on differently scaled inputs than the models that were evaluated.
final_x = np.log(my_maps + 1)
final_x = MinMaxScaler().fit_transform(final_x.reshape(-1, tree_row * tree_col)).reshape(-1,tree_row, tree_col)
train = [final_x, labels_oh]

# Per-class weight = n_samples / (2 * n_samples_in_class). `c_prob` is
# initialised here so the cell does not depend on a variable left behind by
# the cross-validation loop.
c_prob = [0] * len(np.unique(labels))
for l in np.unique(labels):
    a = float(len(labels))
    b = 2.0 * float((np.sum(labels==l)))
    c_prob[int(l)] = a/b

c_prob = np.array(c_prob).reshape(-1)

# Sample weight = weight of the sample's class.
train_weights = c_prob[np.argmax(labels_oh, 1)]

# Train on the entire dataset and save the fitted Keras model as an H5 file.
popphy_model = PopPhyCNN((tree_row, tree_col), num_class, config)
popphy_model.train(train, train_weights)
popphy_model.model.save(results_dir + '/PopPhy-CNN.h5')

# Benchmarking

Benchmarking is performed using random forest (RF), support vector machines (SVM), LASSO, MLPNN, and 1D-CNN models. The same partitions are used that were used in the cross validation and the same metrics are evaluated. We evaluate both the original features, as well as all the tree features that have been vectorized. Metrics for each model are saved in their independent files. Mean metrics for each model are also saved in a single file.

### Original Features

In [24]:
# One result table per benchmark model: rows are metrics, columns are run/fold pairs.
cnn1d_stat_df = pd.DataFrame(index=["AUC", "MCC", "Precision", "Recall", "F1"], columns=cv_list)
mlpnn_stat_df = pd.DataFrame(index=["AUC", "MCC", "Precision", "Recall", "F1"], columns=cv_list)
rf_stat_df = pd.DataFrame(index=["AUC", "MCC", "Precision", "Recall", "F1"], columns=cv_list)
svm_stat_df = pd.DataFrame(index=["AUC", "MCC", "Precision", "Recall", "F1"], columns=cv_list)
lasso_stat_df = pd.DataFrame(index=["AUC", "MCC", "Precision", "Recall", "F1"], columns=cv_list)

# Per-class weight = n_samples / (2 * n_samples_in_class). The value does not
# depend on the fold, so it is computed once instead of inside the loop.
# NOTE(review): the counts come from the full label vector, not from each
# training split -- confirm this is intended.
c_prob = [0] * len(np.unique(labels))
for l in np.unique(labels):
    a = float(len(labels))
    b = 2.0 * float((np.sum(labels==l)))
    c_prob[int(l)] = a/b
c_prob = np.array(c_prob).reshape(-1)

for run, seed in enumerate(seeds):
    # Same seeds and same split call as the PopPhy-CNN cross-validation.
    skf = StratifiedKFold(n_splits=num_test, shuffle=True, random_state=seed)
    for fold, (train_index, test_index) in enumerate(skf.split(my_maps, labels)):
        cv_name = "Run_" + str(run) + "_CV_" + str(fold)

        # Original (non-tree) feature vectors.
        train_x, test_x = raw_x[train_index,:], raw_x[test_index,:]
        train_y_oh, test_y_oh = labels_oh[train_index,:], labels_oh[test_index,:]
        train_y, test_y = labels[train_index], labels[test_index]

        # log(x + 1) transform, applied to both splits.
        train_x = np.log(train_x + 1)
        test_x = np.log(test_x + 1)

        # Sample weight = weight of the sample's class.
        train_weights = c_prob[np.argmax(train_y_oh, 1)]

        # MinMax scaling is fitted on the training split only; clipping keeps
        # held-out values inside [0, 1].
        scaler = MinMaxScaler().fit(train_x)
        train_x = np.clip(scaler.transform(train_x), 0, 1)
        test_x = np.clip(scaler.transform(test_x), 0, 1)

        # The neural networks receive one-hot labels, the other models integer labels.
        train_oh = [train_x, train_y_oh]
        test_oh = [test_x, test_y_oh]

        train = [train_x, train_y]
        test = [test_x, test_y]

        cnn1D_model = CNN1D(train_x.shape[1], num_class, config)
        mlpnn_model = MLPNN(train_x.shape[1], num_class, config)
        rf_model = RF(config)
        svm_model = SVM(config, label_set)
        lasso_model = LASSO(config, label_set)

        # Print the architectures and the table header once, before the first fold.
        if run == 0 and fold == 0:
            print("CNN-1D")
            print(cnn1D_model.model.summary())
            print("\n\nMLPNN")
            print(mlpnn_model.model.summary())
            print("\n\n Run\tFold\tRF %s\t\tSVM %s\t\tLASSO %s\tMLPNN %s\tCNN-1D %s" % (metric, metric,
                                                                                   metric, metric, metric))

        cnn1D_model.train(train_oh, train_weights)
        preds, cnn1d_stats = cnn1D_model.test(test_oh)

        mlpnn_model.train(train_oh, train_weights)
        preds, mlpnn_stats = mlpnn_model.test(test_oh)

        rf_model.train(train)
        preds, rf_stats = rf_model.test(test)

        svm_model.train(train)
        preds, svm_stats = svm_model.test(test)

        lasso_model.train(train)
        preds, lasso_stats = lasso_model.test(test)

        # Record every model's metrics with a single .loc[row, col] call. The
        # chained form df.loc[row][col] = v assigns into an intermediate Series
        # and is not guaranteed to update the frame.
        for stat_df, fold_stats in ((cnn1d_stat_df, cnn1d_stats),
                                    (mlpnn_stat_df, mlpnn_stats),
                                    (rf_stat_df, rf_stats),
                                    (svm_stat_df, svm_stats),
                                    (lasso_stat_df, lasso_stats)):
            if num_class == 2:
                stat_df.loc["AUC", cv_name] = fold_stats["AUC"]
            for stat_name in ("MCC", "Precision", "Recall", "F1"):
                stat_df.loc[stat_name, cv_name] = fold_stats[stat_name]

        if metric == "AUC":
            print("# %d\t%d\t%.3f\t\t%.3f\t\t%.3f\t\t%.3f\t\t%.3f" % (run, fold, rf_stats["AUC"], svm_stats["AUC"],
                                                      lasso_stats["AUC"], mlpnn_stats["AUC"], cnn1d_stats["AUC"]))
        if metric == "MCC":
            print("# %d\t%d\t%.3f\t\t%.3f\t\t%.3f\t\t%.3f\t\t%.3f" % (run, fold, rf_stats["MCC"], svm_stats["MCC"],
                                                      lasso_stats["MCC"], mlpnn_stats["MCC"], cnn1d_stats["MCC"]))

        # Release the models before the next fold.
        cnn1D_model.destroy()
        mlpnn_model.destroy()
        del rf_model, svm_model, lasso_model

CNN-1D
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gaussian_noise (GaussianNois (None, 1, 269, 1)         0         
_________________________________________________________________
conv_0 (Conv2D)              (None, 1, 260, 32)        352       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 1, 130, 32)        0         
_________________________________________________________________
conv_1 (Conv2D)              (None, 1, 121, 32)        10272     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 1, 60, 32)         0         
_________________________________________________________________
flatten (Flatten)            (None, 1920)              0         
_________________________________________________________________
dropout (Dropout)            (None, 1920)        

In [25]:
# Save the per-fold metrics of every benchmark model (original features).
# A forward slash is used as the separator, as in the other save cells: the
# previous backslashes were parsed as escape sequences ("\r" is a carriage
# return) and are not a path separator on POSIX systems.
cnn1d_stat_df.to_csv(results_dir + "/cnn1d_raw_evaluation.csv")
mlpnn_stat_df.to_csv(results_dir + "/mlpnn_raw_evaluation.csv")
lasso_stat_df.to_csv(results_dir + "/lasso_raw_evaluation.csv")
svm_stat_df.to_csv(results_dir + "/svm_raw_evaluation.csv")
rf_stat_df.to_csv(results_dir + "/rf_raw_evaluation.csv")

In [26]:
# Summary table: mean of every metric across all run/fold columns, one column per model.
benchmark_df = pd.DataFrame(index=["AUC", "MCC", "Precision", "Recall", "F1"],
                            columns=["RF", "SVM", "LASSO", "MLPNN", "CNN1D"])
benchmark_df["RF"] = rf_stat_df.mean(1)
benchmark_df["SVM"] = svm_stat_df.mean(1)
benchmark_df["LASSO"] = lasso_stat_df.mean(1)
benchmark_df["MLPNN"] = mlpnn_stat_df.mean(1)
benchmark_df["CNN1D"] = cnn1d_stat_df.mean(1)

# Forward slash separator: the previous "\b" was parsed as a backspace
# character, producing a corrupted file name.
benchmark_df.to_csv(results_dir + "/benchmark_raw.csv")
benchmark_df

Unnamed: 0,RF,SVM,LASSO,MLPNN,CNN1D
AUC,0.935721,0.939319,0.901722,0.923875,0.899237
MCC,0.733145,0.756933,0.66178,0.737562,0.66315
Precision,0.871287,0.881488,0.838383,0.870817,0.840175
Recall,0.862055,0.875445,0.8236,0.866733,0.823205
F1,0.861067,0.874997,0.82158,0.866408,0.821093


### Tree Features

In [27]:
# One result table per benchmark model: rows are metrics, columns are run/fold pairs.
cnn1d_tree_stat_df = pd.DataFrame(index=["AUC", "MCC", "Precision", "Recall", "F1"], columns=cv_list)
mlpnn_tree_stat_df = pd.DataFrame(index=["AUC", "MCC", "Precision", "Recall", "F1"], columns=cv_list)
rf_tree_stat_df = pd.DataFrame(index=["AUC", "MCC", "Precision", "Recall", "F1"], columns=cv_list)
svm_tree_stat_df = pd.DataFrame(index=["AUC", "MCC", "Precision", "Recall", "F1"], columns=cv_list)
lasso_tree_stat_df = pd.DataFrame(index=["AUC", "MCC", "Precision", "Recall", "F1"], columns=cv_list)

# Per-class weight = n_samples / (2 * n_samples_in_class). The value does not
# depend on the fold, so it is computed once instead of inside the loop.
# NOTE(review): the counts come from the full label vector, not from each
# training split -- confirm this is intended.
c_prob = [0] * len(np.unique(labels))
for l in np.unique(labels):
    a = float(len(labels))
    b = 2.0 * float((np.sum(labels==l)))
    c_prob[int(l)] = a/b
c_prob = np.array(c_prob).reshape(-1)

for run, seed in enumerate(seeds):
    # Same seeds and same split call as the PopPhy-CNN cross-validation.
    skf = StratifiedKFold(n_splits=num_test, shuffle=True, random_state=seed)
    for fold, (train_index, test_index) in enumerate(skf.split(my_maps, labels)):
        cv_name = "Run_" + str(run) + "_CV_" + str(fold)

        # Vectorized tree features.
        train_x, test_x = tree_x[train_index,:], tree_x[test_index,:]
        train_y_oh, test_y_oh = labels_oh[train_index,:], labels_oh[test_index,:]
        train_y, test_y = labels[train_index], labels[test_index]

        # log(x + 1) transform, applied to both splits.
        train_x = np.log(train_x + 1)
        test_x = np.log(test_x + 1)

        # Sample weight = weight of the sample's class.
        train_weights = c_prob[np.argmax(train_y_oh, 1)]

        # MinMax scaling is fitted on the training split only; clipping keeps
        # held-out values inside [0, 1].
        scaler = MinMaxScaler().fit(train_x)
        train_x = np.clip(scaler.transform(train_x), 0, 1)
        test_x = np.clip(scaler.transform(test_x), 0, 1)

        # The neural networks receive one-hot labels, the other models integer labels.
        train_oh = [train_x, train_y_oh]
        test_oh = [test_x, test_y_oh]

        train = [train_x, train_y]
        test = [test_x, test_y]

        cnn1D_model = CNN1D(train_x.shape[1], num_class, config)
        mlpnn_model = MLPNN(train_x.shape[1], num_class, config)
        rf_model = RF(config)
        svm_model = SVM(config, label_set)
        lasso_model = LASSO(config, label_set)

        # Print the architectures and the table header once, before the first fold.
        if run == 0 and fold == 0:
            print("CNN-1D")
            print(cnn1D_model.model.summary())
            print("\n\nMLPNN")
            print(mlpnn_model.model.summary())
            print("\n\n Run\tFold\tRF %s\t\tSVM %s\t\tLASSO %s\tMLPNN %s\tCNN-1D %s" % (metric, metric,
                                                                                   metric, metric, metric))

        cnn1D_model.train(train_oh, train_weights)
        preds, cnn1d_stats = cnn1D_model.test(test_oh)

        mlpnn_model.train(train_oh, train_weights)
        preds, mlpnn_stats = mlpnn_model.test(test_oh)

        rf_model.train(train)
        preds, rf_stats = rf_model.test(test)

        svm_model.train(train)
        preds, svm_stats = svm_model.test(test)

        lasso_model.train(train)
        preds, lasso_stats = lasso_model.test(test)

        # Record every model's metrics with a single .loc[row, col] call. The
        # chained form df.loc[row][col] = v assigns into an intermediate Series
        # and is not guaranteed to update the frame.
        for stat_df, fold_stats in ((cnn1d_tree_stat_df, cnn1d_stats),
                                    (mlpnn_tree_stat_df, mlpnn_stats),
                                    (rf_tree_stat_df, rf_stats),
                                    (svm_tree_stat_df, svm_stats),
                                    (lasso_tree_stat_df, lasso_stats)):
            if num_class == 2:
                stat_df.loc["AUC", cv_name] = fold_stats["AUC"]
            for stat_name in ("MCC", "Precision", "Recall", "F1"):
                stat_df.loc[stat_name, cv_name] = fold_stats[stat_name]

        if metric == "AUC":
            print("# %d\t%d\t%.3f\t\t%.3f\t\t%.3f\t\t%.3f\t\t%.3f" % (run, fold, rf_stats["AUC"], svm_stats["AUC"],
                                                      lasso_stats["AUC"], mlpnn_stats["AUC"], cnn1d_stats["AUC"]))
        if metric == "MCC":
            print("# %d\t%d\t%.3f\t\t%.3f\t\t%.3f\t\t%.3f\t\t%.3f" % (run, fold, rf_stats["MCC"], svm_stats["MCC"],
                                                      lasso_stats["MCC"], mlpnn_stats["MCC"], cnn1d_stats["MCC"]))

        # Release the models before the next fold.
        cnn1D_model.destroy()
        mlpnn_model.destroy()
        del rf_model, svm_model, lasso_model

CNN-1D
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gaussian_noise (GaussianNois (None, 1, 479, 1)         0         
_________________________________________________________________
conv_0 (Conv2D)              (None, 1, 470, 32)        352       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 1, 235, 32)        0         
_________________________________________________________________
conv_1 (Conv2D)              (None, 1, 226, 32)        10272     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 1, 113, 32)        0         
_________________________________________________________________
flatten (Flatten)            (None, 3616)              0         
_________________________________________________________________
dropout (Dropout)            (None, 3616)        

In [28]:
# Save the per-fold metrics of every benchmark model (tree features).
# A forward slash is used as the separator, as in the other save cells: the
# previous backslashes were parsed as escape sequences ("\r" is a carriage
# return) and are not a path separator on POSIX systems.
cnn1d_tree_stat_df.to_csv(results_dir + "/cnn1d_tree_evaluation.csv")
mlpnn_tree_stat_df.to_csv(results_dir + "/mlpnn_tree_evaluation.csv")
lasso_tree_stat_df.to_csv(results_dir + "/lasso_tree_evaluation.csv")
svm_tree_stat_df.to_csv(results_dir + "/svm_tree_evaluation.csv")
rf_tree_stat_df.to_csv(results_dir + "/rf_tree_evaluation.csv")

In [29]:
# Summary table: mean of every metric across all run/fold columns, one column per model.
benchmark_tree_df = pd.DataFrame(index=["AUC", "MCC", "Precision", "Recall", "F1"],
                            columns=["RF", "SVM", "LASSO", "MLPNN", "CNN1D"])
benchmark_tree_df["RF"] = rf_tree_stat_df.mean(1)
benchmark_tree_df["SVM"] = svm_tree_stat_df.mean(1)
benchmark_tree_df["LASSO"] = lasso_tree_stat_df.mean(1)
benchmark_tree_df["MLPNN"] = mlpnn_tree_stat_df.mean(1)
benchmark_tree_df["CNN1D"] = cnn1d_tree_stat_df.mean(1)

# Forward slash separator: the previous "\b" was parsed as a backspace
# character, producing a corrupted file name.
benchmark_tree_df.to_csv(results_dir + "/benchmark_tree.csv")
benchmark_tree_df

Unnamed: 0,RF,SVM,LASSO,MLPNN,CNN1D
AUC,0.929695,0.932828,0.901217,0.93618,0.865473
MCC,0.759823,0.700783,0.679982,0.734445,0.625129
Precision,0.885077,0.855202,0.852435,0.872013,0.819611
Recall,0.874918,0.845916,0.828343,0.862582,0.806159
F1,0.873893,0.843917,0.824735,0.861618,0.803022
