# Experimentos

In [1]:
# imports
import numpy as np
import pandas as pd

# time
import time
from datetime import timedelta

# Metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

# ETL
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Oversampling Methods
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

# Deep Learning Framework - Keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Conv2D, AveragePooling2D, Flatten
from keras.utils import to_categorical
from keras.backend import clear_session
from keras.optimizers import SGD
from keras.callbacks import EarlyStopping
from keras.utils import plot_model

# K-fold: stratified, shuffled 10-fold splitter with a fixed seed, used by the
# experiment cells below via kfold.split(x_data, y_data).
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Cross-model summary: each experiment cell writes the mean of every metric
# into the row named after its model_name.
models = pd.DataFrame()

Using TensorFlow backend.


#  Data

In [2]:
## Data
df = pd.read_csv('data/train.csv')

## Drop constant columns (standard deviation exactly zero)
constant_cols = [name for name in df.columns if df[name].std() == 0]
df = df.drop(constant_cols, axis=1)

## Drop columns that are an exact scalar multiple of an earlier column
# NOTE(review): 'ID' and 'TARGET' are still columns of df here, so they take part
# in this scan as well - confirm that is intended.
multiple_cols = []
names = df.columns
n_cols = len(names)
for i in range(n_cols - 1):
    print(i, end="\r")  # in-place progress counter
    base = df[names[i]]
    for j in range(i + 1, n_cols):
        candidate = df[names[j]]
        # Rows where both columns are non-zero; the first such row fixes the ratio to test.
        both_nonzero = df.loc[(candidate * base) != 0, [names[j], names[i]]]
        if both_nonzero.shape[0] == 0:
            continue
        ratio = both_nonzero.iloc[0, 0] / both_nonzero.iloc[0, 1]
        # Exact element-wise comparison: only perfect multiples are flagged.
        if np.array_equal(ratio * base.values, candidate.values):
            multiple_cols.append(names[j])
df = df.drop(multiple_cols, axis=1)

## Remove duplicate rows
# NOTE(review): the 'ID' column is still present at this point; if IDs are unique
# per row this call cannot remove anything - confirm whether duplicates were meant
# to be detected on the feature columns only.
df = df.drop_duplicates()

## Split (X, Y)
x_data = df.drop(['ID', 'TARGET'], axis=1)
y_data = df['TARGET'].copy()

x_data.shape

335

(76020, 287)

# Oversampling: Random Oversampling

## CNN 1 AVG RO

In [3]:
# info
model_name = 'CNN 1 AVG RO Clean'

# Per-fold results: one row per fold ('KF_<n>'), one column per metric.
metrics_kf = pd.DataFrame()
for fold_number, (train, test) in enumerate(kfold.split(x_data, y_data), start=1):

    ## Data: rows selected by the current fold
    x_train = x_data.iloc[train]
    y_train = y_data.iloc[train]

    x_test = x_data.iloc[test]
    y_test = y_data.iloc[test]

    ## Train, validation
    # Seeded split: without random_state the validation rows change on every run,
    # which makes the fold results non-reproducible.
    x_train, x_val, y_train, y_val = train_test_split(
        x_train, y_train, test_size=0.2, stratify=y_train, random_state=42)

    ## Remove duplicated feature rows (Train); keep=False drops every copy
    removeIndex = x_train[x_train.duplicated(keep=False)].index
    x_train = x_train.drop(removeIndex)
    y_train = y_train.drop(removeIndex)

    ## Standard Scaler: fitted on the training rows, applied to test and validation
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)
    x_val = scaler.transform(x_val)

    # Oversampling (training rows only)
    sampling = RandomOverSampler(random_state=42)
    x_train, y_train = sampling.fit_sample(x_train, y_train)

    # one hot
    y_train = to_categorical(y_train)
    y_val = to_categorical(y_val)

    ## New shape 17x17x1
    # Two zero columns are appended to every row so that the feature vector can be
    # reshaped to [samples][width][height][channels] = (n, 17, 17, 1).
    images = []
    for part in (x_train, x_val, x_test):
        zeros = np.zeros((part.shape[0], 2), np.int32)
        padded = np.concatenate((part, zeros), axis=1)
        images.append(padded.reshape(part.shape[0], 17, 17, 1))
    x_train, x_val, x_test = images

    ## status
    kf_i = 'KF_{0}'.format(fold_number)
    print(kf_i)

    ## Train
    startTime = time.time()
    model = Sequential()
    model.add(Conv2D(filters=6, kernel_size=(2,2), strides=1, input_shape=(17, 17, 1), activation='relu'))
    model.add(AveragePooling2D(pool_size=(2,2), strides=2))
    model.add(Flatten())
    model.add(Dense(243, activation='relu'))
    model.add(Dense(243, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='softmax'))

    sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(optimizer=sgd, loss='categorical_crossentropy',
              metrics=['accuracy'])

    # epochs is only an upper bound; EarlyStopping ends training after 20 epochs
    # without improvement of the monitored validation quantity.
    # NOTE(review): the weights evaluated below are those of the last epoch, not of
    # the best one - consider restoring the best weights if the installed Keras
    # version supports it.
    model.fit(x=x_train,
          y=y_train,
          batch_size=128,
          epochs=10000,
          callbacks=[EarlyStopping(patience=20)],
          validation_data=(x_val, y_val))
    tm = timedelta(seconds=(time.time()-startTime))

    ## Predict
    y_pred_proba = model.predict_proba(x_test)
    y_pred = y_pred_proba.argmax(axis=1)

    ## Metrics
    # Train AUC is measured on the oversampled training set.
    auc_train = roc_auc_score(y_train[:,1], model.predict_proba(x_train)[:,1])
    fpr, tpr, thresholds = roc_curve(y_test.values, y_pred_proba[:,1])
    auc_1 = auc(fpr, tpr)
    # Class-0 curve obtained by mirroring the class-1 ROC points.
    auc_0 = auc(1-tpr, 1-fpr)
    precision = precision_score(y_test.values, y_pred, average='macro')
    recall = recall_score(y_test.values, y_pred, average='macro')
    f1 = f1_score(y_test.values, y_pred, average='macro')
    accuracy = accuracy_score(y_test.values, y_pred)

    metrics_kf.loc[kf_i,'AURoc Train'] = auc_train
    metrics_kf.loc[kf_i,'AURoc 0'] = auc_0
    metrics_kf.loc[kf_i,'AURoc 1'] = auc_1
    metrics_kf.loc[kf_i,'Precision'] = precision
    metrics_kf.loc[kf_i,'Recall'] = recall
    metrics_kf.loc[kf_i,'F1 score'] = f1
    metrics_kf.loc[kf_i,'Accuracy'] = accuracy
    metrics_kf.loc[kf_i,'Time'] = tm
    # The architecture diagram is written once, for the first fold only.
    if fold_number == 1:
        plot_model(model, to_file='models/model_' + model_name.replace(' ', '_') + '.png', show_shapes=True)
    # Release the Keras/TensorFlow graph before the next fold builds a new model.
    clear_session()

print()
# Statistics are taken over the per-fold rows only and computed before either
# summary row is written, so the 'Media' (mean) row never counts as an extra
# sample in the 'STD' computation.
fold_rows = [row for row in metrics_kf.index if row not in ('Media', 'STD')]
for m in ['AURoc Train', 'AURoc 0', 'AURoc 1', 'Precision', 'Recall', 'F1 score', 'Accuracy', 'Time']:

    fold_values = metrics_kf.loc[fold_rows, m]
    mean = fold_values.mean()
    std = fold_values.std()

    # Mean goes both into this experiment's table and into the cross-model summary.
    metrics_kf.loc['Media', m] = mean
    models.loc[model_name, m] = mean

    metrics_kf.loc['STD', m] = std

KF_1
Train on 97506 samples, validate on 13684 samples
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Epoch 30/10000
Epoch 31/10000
Epoch 32/10000
Epoch 33/10000
Epoch 34/10000
Epoch 35/10000
KF_2
Train on 97448 samples, validate on 13684 samples
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26

Epoch 43/10000
Epoch 44/10000
Epoch 45/10000
Epoch 46/10000
Epoch 47/10000
Epoch 48/10000
Epoch 49/10000
Epoch 50/10000
Epoch 51/10000
Train on 97570 samples, validate on 13684 samples
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Epoch 30/10000
Epoch 31/10000
Train on 97530 samples, validate on 13684 samples
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epo

Epoch 29/10000
Epoch 30/10000
Epoch 31/10000
Epoch 32/10000
Epoch 33/10000
Epoch 34/10000
Epoch 35/10000
Epoch 36/10000
Epoch 37/10000
Epoch 38/10000
Train on 97598 samples, validate on 13684 samples
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Epoch 30/10000
Epoch 31/10000
Epoch 32/10000
Epoch 33/10000
Epoch 34/10000
Epoch 35/10000
Epoch 36/10000
Epoch 37/10000
Epoch 38/10000
Epoch 39/10000
Epoch 40/10000
Train on 97666 samples, validate on 13684 samples
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epo

In [4]:
# Print the experiment name, then render the metrics table as the cell output.
print(model_name)

metrics_kf

CNN 1 AVG RO Clean


Unnamed: 0,AURoc Train,AURoc 0,AURoc 1,Precision,Recall,F1 score,Accuracy,Time
KF_1,0.945165,0.713945,0.713945,0.546677,0.687466,0.543335,0.815731,00:07:34.890339
KF_2,0.946759,0.73215,0.738794,0.545654,0.708622,0.532244,0.786006,00:09:12.421739
KF_3,0.951992,0.7282,0.7282,0.561715,0.697655,0.575212,0.859774,00:10:18.252338
KF_4,0.94237,0.763664,0.763664,0.560006,0.715827,0.568917,0.842673,00:06:09.296277
KF_5,0.937383,0.73677,0.73677,0.544105,0.701621,0.529847,0.784793,00:05:18.893451
KF_6,0.935501,0.774281,0.777603,0.553276,0.725439,0.549942,0.809129,00:04:58.166026
KF_7,0.929819,0.752897,0.752897,0.565355,0.712058,0.580259,0.859905,00:07:18.484052
KF_8,0.945625,0.719473,0.719473,0.551859,0.702849,0.552683,0.823862,00:07:47.460261
KF_9,0.921903,0.758553,0.758553,0.544304,0.723418,0.520009,0.757269,00:04:21.569784
KF_10,0.937151,0.744499,0.747832,0.559053,0.72017,0.566071,0.836995,00:05:25.882994


## CNN 2 AVG RO

In [5]:
# info
model_name = 'CNN 2 AVG RO Clean'

# Per-fold results: one row per fold ('KF_<n>'), one column per metric.
metrics_kf = pd.DataFrame()
for fold_number, (train, test) in enumerate(kfold.split(x_data, y_data), start=1):

    ## Data: rows selected by the current fold
    x_train = x_data.iloc[train]
    y_train = y_data.iloc[train]

    x_test = x_data.iloc[test]
    y_test = y_data.iloc[test]

    ## Train, validation
    # Seeded split: without random_state the validation rows change on every run,
    # which makes the fold results non-reproducible.
    x_train, x_val, y_train, y_val = train_test_split(
        x_train, y_train, test_size=0.2, stratify=y_train, random_state=42)

    ## Remove duplicated feature rows (Train); keep=False drops every copy
    removeIndex = x_train[x_train.duplicated(keep=False)].index
    x_train = x_train.drop(removeIndex)
    y_train = y_train.drop(removeIndex)

    ## Standard Scaler: fitted on the training rows, applied to test and validation
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)
    x_val = scaler.transform(x_val)

    # Oversampling (training rows only)
    sampling = RandomOverSampler(random_state=42)
    x_train, y_train = sampling.fit_sample(x_train, y_train)

    # one hot
    y_train = to_categorical(y_train)
    y_val = to_categorical(y_val)

    ## New shape 17x17x1
    # Two zero columns are appended to every row so that the feature vector can be
    # reshaped to [samples][width][height][channels] = (n, 17, 17, 1).
    images = []
    for part in (x_train, x_val, x_test):
        zeros = np.zeros((part.shape[0], 2), np.int32)
        padded = np.concatenate((part, zeros), axis=1)
        images.append(padded.reshape(part.shape[0], 17, 17, 1))
    x_train, x_val, x_test = images

    ## status
    kf_i = 'KF_{0}'.format(fold_number)
    print(kf_i)

    ## Train
    startTime = time.time()
    model = Sequential()
    model.add(Conv2D(filters=6, kernel_size=(2,2), strides=1, input_shape=(17, 17, 1), activation='relu'))
    model.add(AveragePooling2D(pool_size=(2,2), strides=2))
    model.add(Conv2D(filters=6, kernel_size=(2,2), strides=1, activation='relu'))
    model.add(AveragePooling2D(pool_size=(2,2), strides=2))
    model.add(Flatten())
    model.add(Dense(96, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='softmax'))

    sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(optimizer=sgd, loss='categorical_crossentropy',
              metrics=['accuracy'])

    # epochs is only an upper bound; EarlyStopping ends training after 20 epochs
    # without improvement of the monitored validation quantity.
    # NOTE(review): the weights evaluated below are those of the last epoch, not of
    # the best one - consider restoring the best weights if the installed Keras
    # version supports it.
    model.fit(x=x_train,
          y=y_train,
          batch_size=128,
          epochs=10000,
          callbacks=[EarlyStopping(patience=20)],
          validation_data=(x_val, y_val))
    tm = timedelta(seconds=(time.time()-startTime))

    ## Predict
    y_pred_proba = model.predict_proba(x_test)
    y_pred = y_pred_proba.argmax(axis=1)

    ## Metrics
    # Train AUC is measured on the oversampled training set.
    auc_train = roc_auc_score(y_train[:,1], model.predict_proba(x_train)[:,1])
    fpr, tpr, thresholds = roc_curve(y_test.values, y_pred_proba[:,1])
    auc_1 = auc(fpr, tpr)
    # Class-0 curve obtained by mirroring the class-1 ROC points.
    auc_0 = auc(1-tpr, 1-fpr)
    precision = precision_score(y_test.values, y_pred, average='macro')
    recall = recall_score(y_test.values, y_pred, average='macro')
    f1 = f1_score(y_test.values, y_pred, average='macro')
    accuracy = accuracy_score(y_test.values, y_pred)

    metrics_kf.loc[kf_i,'AURoc Train'] = auc_train
    metrics_kf.loc[kf_i,'AURoc 0'] = auc_0
    metrics_kf.loc[kf_i,'AURoc 1'] = auc_1
    metrics_kf.loc[kf_i,'Precision'] = precision
    metrics_kf.loc[kf_i,'Recall'] = recall
    metrics_kf.loc[kf_i,'F1 score'] = f1
    metrics_kf.loc[kf_i,'Accuracy'] = accuracy
    metrics_kf.loc[kf_i,'Time'] = tm
    # The architecture diagram is written once, for the first fold only.
    if fold_number == 1:
        plot_model(model, to_file='models/model_' + model_name.replace(' ', '_') + '.png', show_shapes=True)
    # Release the Keras/TensorFlow graph before the next fold builds a new model.
    clear_session()

print()
# Statistics are taken over the per-fold rows only and computed before either
# summary row is written, so the 'Media' (mean) row never counts as an extra
# sample in the 'STD' computation.
fold_rows = [row for row in metrics_kf.index if row not in ('Media', 'STD')]
for m in ['AURoc Train', 'AURoc 0', 'AURoc 1', 'Precision', 'Recall', 'F1 score', 'Accuracy', 'Time']:

    fold_values = metrics_kf.loc[fold_rows, m]
    mean = fold_values.mean()
    std = fold_values.std()

    # Mean goes both into this experiment's table and into the cross-model summary.
    metrics_kf.loc['Media', m] = mean
    models.loc[model_name, m] = mean

    metrics_kf.loc['STD', m] = std

KF_1
Train on 97594 samples, validate on 13684 samples
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Epoch 30/10000
Epoch 31/10000
Epoch 32/10000
Epoch 33/10000
Epoch 34/10000
Epoch 35/10000
Epoch 36/10000
Epoch 37/10000
Epoch 38/10000
Epoch 39/10000
Epoch 40/10000
Epoch 41/10000
Epoch 42/10000
Epoch 43/10000
Epoch 44/10000
Epoch 45/10000
Epoch 46/10000
Epoch 47/10000
Epoch 48/10000
Epoch 49/10000
Train on 97558 samples, validate on 13684 samples
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/1000

Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Epoch 30/10000
Epoch 31/10000
Epoch 32/10000
Epoch 33/10000
Epoch 34/10000
Epoch 35/10000
Epoch 36/10000
Epoch 37/10000
Epoch 38/10000
Epoch 39/10000
Epoch 40/10000
Epoch 41/10000
Epoch 42/10000
Epoch 43/10000
Epoch 44/10000
Epoch 45/10000
Epoch 46/10000
Epoch 47/10000
Epoch 48/10000
Epoch 49/10000
Epoch 50/10000
Epoch 51/10000
Epoch 52/10000
Epoch 53/10000
Epoch 54/10000
Epoch 55/10000
Epoch 56/10000
Epoch 57/10000
Epoch 58/10000
Epoch 59/10000
Epoch 60/10000
Epoch 61/10000
Epoch 62/10000
Epoch 63/10000
Epoch 64/10000
Epoch 65/10000
Train on 97528 samples, validate on 13684 samples
Epoch 1/1000

Epoch 30/10000
Epoch 31/10000
Epoch 32/10000
Epoch 33/10000
Epoch 34/10000
Epoch 35/10000
Epoch 36/10000
Epoch 37/10000
Epoch 38/10000
Epoch 39/10000
Epoch 40/10000
Epoch 41/10000
Epoch 42/10000
Epoch 43/10000
Epoch 44/10000
Train on 97506 samples, validate on 13684 samples
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Epoch 30/10000
Epoch 31/10000
Epoch 32/10000
Epoch 33/10000
Epoch 34/10000
Epoch 35/10000
Epoch 36/10000
Epoch 37/10000
Epoch 38/10000
Epoch 39/10000
Epoch 40/10000
Epoch 41/10000
Epoch 42/10000
Epoch 43/10000
Epoch 44/10000
Epoch 45/10000
Epoch 46/10000
Epoch 47/10000
Epoch 48/10000
Epoch 49/10000

Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Epoch 30/10000
Epoch 31/10000
Epoch 32/10000
Epoch 33/10000
Epoch 34/10000
Epoch 35/10000
Epoch 36/10000
Epoch 37/10000
Epoch 38/10000
Epoch 39/10000
Epoch 40/10000
Epoch 41/10000
Epoch 42/10000
Epoch 43/10000
Epoch 44/10000
Epoch 45/10000
Epoch 46/10000
Epoch 47/10000
Epoch 48/10000
Epoch 49/10000
Epoch 50/10000
Epoch 51/10000
Epoch 52/10000
Epoch 53/10000
Epoch 54/10000
Epoch 55/10000
Epoch 56/10000
Epoch 57/10000
Epoch 58/10000
Epoch 59/10000
Epoch 60/10000
Train on 97452 samples, validate on 13684 samples
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000

In [6]:
# Print the experiment name, then render the metrics table as the cell output.
print(model_name)

metrics_kf

CNN 2 AVG RO Clean


Unnamed: 0,AURoc Train,AURoc 0,AURoc 1,Precision,Recall,F1 score,Accuracy,Time
KF_1,0.894024,0.768739,0.768739,0.540867,0.712895,0.51015,0.742207,00:10:00.033503
KF_2,0.878147,0.810333,0.810333,0.549205,0.751596,0.525287,0.75536,00:06:08.302946
KF_3,0.885617,0.785004,0.785004,0.542212,0.724901,0.508808,0.734675,00:08:03.088433
KF_4,0.904929,0.763248,0.766571,0.543846,0.724042,0.517276,0.751381,00:13:21.047739
KF_5,0.862239,0.780714,0.780714,0.540325,0.71872,0.503092,0.725862,00:06:12.886755
KF_6,0.89833,0.791245,0.791245,0.548672,0.732276,0.532613,0.776375,00:08:58.475920
KF_7,0.891073,0.786114,0.789436,0.542584,0.733345,0.504706,0.723362,00:11:04.952150
KF_8,0.865458,0.777513,0.777513,0.547906,0.706928,0.540292,0.801105,00:05:46.851521
KF_9,0.904274,0.777966,0.777966,0.544358,0.722642,0.520672,0.758848,00:12:13.401345
KF_10,0.899576,0.762595,0.765928,0.5442,0.727643,0.517112,0.750033,00:12:16.868979


## CNN 3 AVG RO

In [7]:
# info
model_name = 'CNN 3 AVG RO Clean'

# Per-fold results: one row per fold ('KF_<n>'), one column per metric.
metrics_kf = pd.DataFrame()
for fold_number, (train, test) in enumerate(kfold.split(x_data, y_data), start=1):

    ## Data: rows selected by the current fold
    x_train = x_data.iloc[train]
    y_train = y_data.iloc[train]

    x_test = x_data.iloc[test]
    y_test = y_data.iloc[test]

    ## Train, validation
    # Seeded split: without random_state the validation rows change on every run,
    # which makes the fold results non-reproducible.
    x_train, x_val, y_train, y_val = train_test_split(
        x_train, y_train, test_size=0.2, stratify=y_train, random_state=42)

    ## Remove duplicated feature rows (Train); keep=False drops every copy
    removeIndex = x_train[x_train.duplicated(keep=False)].index
    x_train = x_train.drop(removeIndex)
    y_train = y_train.drop(removeIndex)

    ## Standard Scaler: fitted on the training rows, applied to test and validation
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)
    x_val = scaler.transform(x_val)

    # Oversampling (training rows only)
    sampling = RandomOverSampler(random_state=42)
    x_train, y_train = sampling.fit_sample(x_train, y_train)

    # one hot
    y_train = to_categorical(y_train)
    y_val = to_categorical(y_val)

    ## New shape 17x17x1
    # Two zero columns are appended to every row so that the feature vector can be
    # reshaped to [samples][width][height][channels] = (n, 17, 17, 1).
    images = []
    for part in (x_train, x_val, x_test):
        zeros = np.zeros((part.shape[0], 2), np.int32)
        padded = np.concatenate((part, zeros), axis=1)
        images.append(padded.reshape(part.shape[0], 17, 17, 1))
    x_train, x_val, x_test = images

    ## status
    kf_i = 'KF_{0}'.format(fold_number)
    print(kf_i)

    ## Train
    startTime = time.time()
    model = Sequential()
    model.add(Conv2D(filters=6, kernel_size=(2,2), strides=1, input_shape=(17, 17, 1), activation='relu'))
    model.add(AveragePooling2D(pool_size=(2,2), strides=2))
    model.add(Conv2D(filters=6, kernel_size=(2,2), strides=1, activation='relu'))
    model.add(AveragePooling2D(pool_size=(2,2), strides=2))
    model.add(Conv2D(filters=6, kernel_size=(2,2), strides=1, activation='relu'))
    model.add(AveragePooling2D(pool_size=(2,2), strides=2))
    model.add(Flatten())
    model.add(Dense(24, activation='relu'))
    model.add(Dense(12, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='softmax'))

    sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(optimizer=sgd, loss='categorical_crossentropy',
              metrics=['accuracy'])

    # epochs is only an upper bound; EarlyStopping ends training after 20 epochs
    # without improvement of the monitored validation quantity.
    # NOTE(review): the weights evaluated below are those of the last epoch, not of
    # the best one - consider restoring the best weights if the installed Keras
    # version supports it.
    model.fit(x=x_train,
          y=y_train,
          batch_size=128,
          epochs=10000,
          callbacks=[EarlyStopping(patience=20)],
          validation_data=(x_val, y_val))
    tm = timedelta(seconds=(time.time()-startTime))

    ## Predict
    y_pred_proba = model.predict_proba(x_test)
    y_pred = y_pred_proba.argmax(axis=1)

    ## Metrics
    # Train AUC is measured on the oversampled training set.
    auc_train = roc_auc_score(y_train[:,1], model.predict_proba(x_train)[:,1])
    fpr, tpr, thresholds = roc_curve(y_test.values, y_pred_proba[:,1])
    auc_1 = auc(fpr, tpr)
    # Class-0 curve obtained by mirroring the class-1 ROC points.
    auc_0 = auc(1-tpr, 1-fpr)
    precision = precision_score(y_test.values, y_pred, average='macro')
    recall = recall_score(y_test.values, y_pred, average='macro')
    f1 = f1_score(y_test.values, y_pred, average='macro')
    accuracy = accuracy_score(y_test.values, y_pred)

    metrics_kf.loc[kf_i,'AURoc Train'] = auc_train
    metrics_kf.loc[kf_i,'AURoc 0'] = auc_0
    metrics_kf.loc[kf_i,'AURoc 1'] = auc_1
    metrics_kf.loc[kf_i,'Precision'] = precision
    metrics_kf.loc[kf_i,'Recall'] = recall
    metrics_kf.loc[kf_i,'F1 score'] = f1
    metrics_kf.loc[kf_i,'Accuracy'] = accuracy
    metrics_kf.loc[kf_i,'Time'] = tm
    # The architecture diagram is written once, for the first fold only.
    if fold_number == 1:
        plot_model(model, to_file='models/model_' + model_name.replace(' ', '_') + '.png', show_shapes=True)
    # Release the Keras/TensorFlow graph before the next fold builds a new model.
    clear_session()

print()
# Statistics are taken over the per-fold rows only and computed before either
# summary row is written, so the 'Media' (mean) row never counts as an extra
# sample in the 'STD' computation.
fold_rows = [row for row in metrics_kf.index if row not in ('Media', 'STD')]
for m in ['AURoc Train', 'AURoc 0', 'AURoc 1', 'Precision', 'Recall', 'F1 score', 'Accuracy', 'Time']:

    fold_values = metrics_kf.loc[fold_rows, m]
    mean = fold_values.mean()
    std = fold_values.std()

    # Mean goes both into this experiment's table and into the cross-model summary.
    metrics_kf.loc['Media', m] = mean
    models.loc[model_name, m] = mean

    metrics_kf.loc['STD', m] = std

KF_1
Train on 97528 samples, validate on 13684 samples
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Epoch 30/10000
Epoch 31/10000
Epoch 32/10000
Epoch 33/10000
Epoch 34/10000
Epoch 35/10000
Epoch 36/10000
Epoch 37/10000
Epoch 38/10000
Epoch 39/10000
Epoch 40/10000
Epoch 41/10000
Epoch 42/10000
Epoch 43/10000
Epoch 44/10000
Epoch 45/10000
Epoch 46/10000
Epoch 47/10000
Epoch 48/10000
Epoch 49/10000
Epoch 50/10000
Epoch 51/10000
Train on 97594 samples, validate on 13684 samples
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/1000

Epoch 29/10000
Epoch 30/10000
Epoch 31/10000
Epoch 32/10000
Epoch 33/10000
Epoch 34/10000
Epoch 35/10000
Epoch 36/10000
Epoch 37/10000
Epoch 38/10000
Epoch 39/10000
Epoch 40/10000
Epoch 41/10000
Epoch 42/10000
Epoch 43/10000
Epoch 44/10000
Epoch 45/10000
Epoch 46/10000
Epoch 47/10000
Epoch 48/10000
Epoch 49/10000
Epoch 50/10000
Epoch 51/10000
Epoch 52/10000
Epoch 53/10000
Epoch 54/10000
Epoch 55/10000
Epoch 56/10000
Epoch 57/10000
Epoch 58/10000
Epoch 59/10000
Epoch 60/10000
Epoch 61/10000
Train on 97568 samples, validate on 13684 samples
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Epoch 30/10000
Epoch 31/10000

Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Epoch 30/10000
Epoch 31/10000
Train on 97536 samples, validate on 13684 samples
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Epoch 30/10000
Epoch 31/10000
Epoch 32/10000
Train on 97488 samples, validate on 13684 samples
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epo

Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Epoch 30/10000
Epoch 31/10000
Epoch 32/10000
Epoch 33/10000
Train on 97416 samples, validate on 13684 samples
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Epoch 30/10000
Epoch 31/10000
Epoch 32/10000
Epoch 33/10000
Epoch 34/10000
Epoch 35/10000
Epoch 36/10000
Epoch 37/10000
Ep

In [8]:
# Print the experiment name, then render the metrics table as the cell output.
print(model_name)

metrics_kf

CNN 3 AVG RO Clean


Unnamed: 0,AURoc Train,AURoc 0,AURoc 1,Precision,Recall,F1 score,Accuracy,Time
KF_1,0.810143,0.795626,0.795626,0.549366,0.717696,0.540909,0.797317,00:10:58.973911
KF_2,0.822635,0.818722,0.818722,0.54601,0.751413,0.510332,0.727476,00:09:12.029418
KF_3,0.827695,0.799054,0.799054,0.534636,0.712787,0.465,0.650224,00:13:04.929618
KF_4,0.845871,0.803059,0.803059,0.544128,0.736889,0.510483,0.733228,00:16:13.645847
KF_5,0.822771,0.794686,0.794686,0.53614,0.720715,0.469206,0.656275,00:06:36.428791
KF_6,0.82673,0.81218,0.81218,0.541147,0.732114,0.497073,0.708761,00:06:39.978570
KF_7,0.824705,0.80052,0.803842,0.542514,0.733139,0.504464,0.722968,00:08:22.414639
KF_8,0.824291,0.791417,0.791417,0.545913,0.723458,0.526313,0.768614,00:05:15.028267
KF_9,0.835357,0.802555,0.802555,0.545607,0.720883,0.526678,0.77082,00:06:50.586967
KF_10,0.831269,0.801464,0.801464,0.558169,0.729281,0.561953,0.826865,00:09:03.765982


# Oversampling: SMOTE

## CNN 1 AVG SM

In [9]:
# info
model_name = 'CNN 1 AVG SM Clean'

# Per-fold results: one row per fold ('KF_<n>'), one column per metric.
metrics_kf = pd.DataFrame()
for fold_number, (train, test) in enumerate(kfold.split(x_data, y_data), start=1):

    ## Data: rows selected by the current fold
    x_train = x_data.iloc[train]
    y_train = y_data.iloc[train]

    x_test = x_data.iloc[test]
    y_test = y_data.iloc[test]

    ## Train, validation
    # Seeded split: without random_state the validation rows change on every run,
    # which makes the fold results non-reproducible.
    x_train, x_val, y_train, y_val = train_test_split(
        x_train, y_train, test_size=0.2, stratify=y_train, random_state=42)

    ## Remove duplicated feature rows (Train); keep=False drops every copy
    removeIndex = x_train[x_train.duplicated(keep=False)].index
    x_train = x_train.drop(removeIndex)
    y_train = y_train.drop(removeIndex)

    ## Standard Scaler: fitted on the training rows, applied to test and validation
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)
    x_val = scaler.transform(x_val)

    # Oversampling (training rows only), borderline-2 SMOTE variant
    sampling = SMOTE(kind='borderline2',k_neighbors=5, random_state=42, n_jobs=4)
    x_train, y_train = sampling.fit_sample(x_train, y_train)

    # one hot
    y_train = to_categorical(y_train)
    y_val = to_categorical(y_val)

    ## New shape 17x17x1
    # Two zero columns are appended to every row so that the feature vector can be
    # reshaped to [samples][width][height][channels] = (n, 17, 17, 1).
    images = []
    for part in (x_train, x_val, x_test):
        zeros = np.zeros((part.shape[0], 2), np.int32)
        padded = np.concatenate((part, zeros), axis=1)
        images.append(padded.reshape(part.shape[0], 17, 17, 1))
    x_train, x_val, x_test = images

    ## status
    kf_i = 'KF_{0}'.format(fold_number)
    print(kf_i)

    ## Train
    startTime = time.time()
    model = Sequential()
    model.add(Conv2D(filters=6, kernel_size=(2,2), strides=1, input_shape=(17, 17, 1), activation='relu'))
    model.add(AveragePooling2D(pool_size=(2,2), strides=2))
    model.add(Flatten())
    model.add(Dense(243, activation='relu'))
    model.add(Dense(243, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='softmax'))

    sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(optimizer=sgd, loss='categorical_crossentropy',
              metrics=['accuracy'])

    # epochs is only an upper bound; EarlyStopping ends training after 20 epochs
    # without improvement of the monitored validation quantity.
    # NOTE(review): the weights evaluated below are those of the last epoch, not of
    # the best one - consider restoring the best weights if the installed Keras
    # version supports it.
    model.fit(x=x_train,
          y=y_train,
          batch_size=128,
          epochs=10000,
          callbacks=[EarlyStopping(patience=20)],
          validation_data=(x_val, y_val))
    tm = timedelta(seconds=(time.time()-startTime))

    ## Predict
    y_pred_proba = model.predict_proba(x_test)
    y_pred = y_pred_proba.argmax(axis=1)

    ## Metrics
    # Train AUC is measured on the oversampled training set.
    auc_train = roc_auc_score(y_train[:,1], model.predict_proba(x_train)[:,1])
    fpr, tpr, thresholds = roc_curve(y_test.values, y_pred_proba[:,1])
    auc_1 = auc(fpr, tpr)
    # Class-0 curve obtained by mirroring the class-1 ROC points.
    auc_0 = auc(1-tpr, 1-fpr)
    precision = precision_score(y_test.values, y_pred, average='macro')
    recall = recall_score(y_test.values, y_pred, average='macro')
    f1 = f1_score(y_test.values, y_pred, average='macro')
    accuracy = accuracy_score(y_test.values, y_pred)

    metrics_kf.loc[kf_i,'AURoc Train'] = auc_train
    metrics_kf.loc[kf_i,'AURoc 0'] = auc_0
    metrics_kf.loc[kf_i,'AURoc 1'] = auc_1
    metrics_kf.loc[kf_i,'Precision'] = precision
    metrics_kf.loc[kf_i,'Recall'] = recall
    metrics_kf.loc[kf_i,'F1 score'] = f1
    metrics_kf.loc[kf_i,'Accuracy'] = accuracy
    metrics_kf.loc[kf_i,'Time'] = tm
    # The architecture diagram is written once, for the first fold only.
    if fold_number == 1:
        plot_model(model, to_file='models/model_' + model_name.replace(' ', '_') + '.png', show_shapes=True)
    # Release the Keras/TensorFlow graph before the next fold builds a new model.
    clear_session()

print()
# Summarise the folds. Both statistics are taken from a snapshot that holds only
# the per-fold rows: writing the 'Media' row first and then calling .std() on the
# live column would count the mean as an extra observation and shrink the STD.
# errors='ignore' keeps this safe when the summary rows do not exist yet.
fold_metrics = metrics_kf.drop(['Media', 'STD'], axis=0, errors='ignore')
for m in ['AURoc Train', 'AURoc 0', 'AURoc 1', 'Precision', 'Recall', 'F1 score', 'Accuracy', 'Time']:

    mean = fold_metrics[m].mean()
    std = fold_metrics[m].std()

    metrics_kf.loc['Media', m] = mean   # 'Media' row = mean across folds
    metrics_kf.loc['STD', m] = std      # spread across folds only
    models.loc[model_name, m] = mean    # the cross-model table keeps the mean only

KF_1
Train on 97539 samples, validate on 13684 samples
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Train on 97562 samples, validate on 13684 samples
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Train on 97610 samples, validate on 13684 samples
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Ep

Epoch 25/10000
Epoch 26/10000
Train on 97455 samples, validate on 13684 samples
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Epoch 30/10000
Epoch 31/10000
Epoch 32/10000
Train on 97545 samples, validate on 13684 samples
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Tra

Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000


In [10]:
# Label the output with the current model name, then render the metrics table;
# the bare last expression is what triggers the notebook's DataFrame display.
print(model_name)

metrics_kf

CNN 1 AVG SM Clean


Unnamed: 0,AURoc Train,AURoc 0,AURoc 1,Precision,Recall,F1 score,Accuracy,Time
KF_1,0.968054,0.770692,0.770692,0.562535,0.688826,0.577704,0.867289,00:04:29.466772
KF_2,0.966646,0.787534,0.790856,0.55883,0.72358,0.564704,0.833092,00:04:26.773022
KF_3,0.972357,0.773615,0.773615,0.571328,0.702328,0.591377,0.877927,00:05:01.789749
KF_4,0.969145,0.777552,0.777552,0.562835,0.708342,0.575935,0.855827,00:04:49.947710
KF_5,0.967514,0.764043,0.764043,0.561702,0.698837,0.575037,0.858984,00:05:05.079617
KF_6,0.973466,0.773432,0.773432,0.573158,0.69731,0.594391,0.883583,00:06:12.760052
KF_7,0.972532,0.783421,0.786743,0.562877,0.710774,0.575675,0.85438,00:05:10.170384
KF_8,0.972313,0.768856,0.772178,0.56387,0.693664,0.579574,0.867403,00:05:02.287215
KF_9,0.970964,0.783008,0.783008,0.573819,0.707541,0.595113,0.880279,00:05:12.152521
KF_10,0.971331,0.755258,0.755258,0.561032,0.719074,0.570654,0.844099,00:04:37.686694


## CNN 2 AVG SM

In [11]:
# info
# Experiment: two Conv2D + AveragePooling2D stages followed by a dense head,
# trained on SMOTE-oversampled data and evaluated with the shared `kfold` splitter.
model_name = 'CNN 2 AVG SM Clean'

# Side length of the square single-channel "image" each feature vector is padded into.
img_side = 17

metrics_kf = pd.DataFrame()
for i, (train, test) in enumerate(kfold.split(x_data, y_data), start=1):

    ## Data (the test fold is only used for the final metrics below)
    x_train = x_data.iloc[train]
    y_train = y_data.iloc[train]

    x_test = x_data.iloc[test]
    y_test = y_data.iloc[test]

    ## Train, validation
    # Seeded so the inner split is repeatable, like the seeded kfold and SMOTE.
    x_train, x_val, y_train, y_val = train_test_split(
        x_train, y_train, test_size=0.2, stratify=y_train, random_state=42)

    ## Remove every training row whose feature vector occurs more than once (Train only)
    removeIndex = x_train[x_train.duplicated(keep=False)].index
    x_train = x_train.drop(removeIndex)
    y_train = y_train.drop(removeIndex)

    ## Standard Scaler (fitted on the training part, then applied to test and validation)
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)
    x_val = scaler.transform(x_val)

    # Oversampling (applied to the training part only)
    sampling = SMOTE(kind='borderline2',k_neighbors=5, random_state=42, n_jobs=4)
    x_train, y_train = sampling.fit_sample(x_train, y_train)

    # one hot
    y_train = to_categorical(y_train)
    y_val = to_categorical(y_val)

    ## New shape 17x17x1
    # Zero-pad the feature axis up to img_side * img_side columns, then reshape
    # to [samples][width][height][channels]. The pad width is derived from the
    # data instead of being hard-coded; a negative value (too many features)
    # makes np.zeros raise instead of silently producing a wrong shape.
    n_pad = img_side * img_side - x_train.shape[1]
    x_train, x_val, x_test = [
        np.concatenate((part, np.zeros((part.shape[0], n_pad), np.int32)), axis=1)
          .reshape(part.shape[0], img_side, img_side, 1)
        for part in (x_train, x_val, x_test)
    ]

    ## status
    kf_i = 'KF_{0}'.format(i)
    print(kf_i)

    ## Train (the timer covers model construction, compilation and fitting)
    startTime = time.time()
    model = Sequential()
    model.add(Conv2D(filters=6, kernel_size=(2,2), strides=1, input_shape=(img_side, img_side, 1), activation='relu'))
    model.add(AveragePooling2D(pool_size=(2,2), strides=2))
    model.add(Conv2D(filters=6, kernel_size=(2,2), strides=1, activation='relu'))
    model.add(AveragePooling2D(pool_size=(2,2), strides=2))
    model.add(Flatten())
    model.add(Dense(96, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='softmax'))

    sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(optimizer=sgd, loss='categorical_crossentropy',
              metrics=['accuracy'])

    # epochs is only an upper bound; EarlyStopping ends the run after 20 epochs
    # without improvement of its monitored quantity.
    model.fit(x=x_train,
          y=y_train,
          batch_size=128,
          epochs=10000,
          callbacks=[EarlyStopping(patience=20)],
          validation_data=(x_val, y_val))
    tm = timedelta(seconds=(time.time()-startTime))

    ## Predict
    y_pred_proba = model.predict_proba(x_test)
    y_pred = y_pred_proba.argmax(axis=1)

    ## Metrics
    # Train AUC is measured on the oversampled training data the model was fitted on.
    auc_train = roc_auc_score(y_train[:,1], model.predict_proba(x_train)[:,1])
    fpr, tpr, thresholds = roc_curve(y_test.values, y_pred_proba[:,1])
    auc_1 = auc(fpr, tpr)
    # Area under the mirrored curve (x = 1 - tpr, y = 1 - fpr), reported as 'AURoc 0'.
    auc_0 = auc(1-tpr, 1-fpr)
    precision = precision_score(y_test.values, y_pred, average='macro')
    recall = recall_score(y_test.values, y_pred, average='macro')
    f1 = f1_score(y_test.values, y_pred, average='macro')
    accuracy = accuracy_score(y_test.values, y_pred)

    metrics_kf.loc[kf_i,'AURoc Train'] = auc_train
    metrics_kf.loc[kf_i,'AURoc 0'] = auc_0
    metrics_kf.loc[kf_i,'AURoc 1'] = auc_1
    metrics_kf.loc[kf_i,'Precision'] = precision
    metrics_kf.loc[kf_i,'Recall'] = recall
    metrics_kf.loc[kf_i,'F1 score'] = f1
    metrics_kf.loc[kf_i,'Accuracy'] = accuracy
    metrics_kf.loc[kf_i,'Time'] = tm
    # The architecture is the same in every fold, so draw the diagram only once.
    if i == 1:
        plot_model(model, to_file='models/model_' + model_name.replace(' ', '_') + '.png', show_shapes=True)
    # Release the Keras/TensorFlow graph before the next fold builds a new model.
    clear_session()

print()
# Summarise the folds. Both statistics are taken from a snapshot that holds only
# the per-fold rows: writing the 'Media' row first and then calling .std() on the
# live column would count the mean as an extra observation and shrink the STD.
fold_metrics = metrics_kf.drop(['Media', 'STD'], axis=0, errors='ignore')
for m in ['AURoc Train', 'AURoc 0', 'AURoc 1', 'Precision', 'Recall', 'F1 score', 'Accuracy', 'Time']:

    mean = fold_metrics[m].mean()
    std = fold_metrics[m].std()

    metrics_kf.loc['Media', m] = mean   # 'Media' row = mean across folds
    metrics_kf.loc['STD', m] = std      # spread across folds only
    models.loc[model_name, m] = mean    # the cross-model table keeps the mean only

KF_1
Train on 97564 samples, validate on 13684 samples
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Epoch 30/10000
Epoch 31/10000
Epoch 32/10000
Train on 97673 samples, validate on 13684 samples
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/1000

Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Train on 97396 samples, validate on 13684 samples
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Epoch 30/10000
KF_6
Train on 97441 samples, validate on 13684 samples
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/1000

Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Epoch 30/10000
Epoch 31/10000
Epoch 32/10000
Epoch 33/10000
Train on 97631 samples, validate on 13684 samples
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Epoch 30/10000
Epoch 31/10000
Train on 97652 samples, validate on 13684 samples
Epoch 1/10000


In [12]:
# Label the output with the current model name, then render the metrics table
# (per-fold rows plus the 'Media' and 'STD' summary rows added by the cell above);
# the bare last expression is what triggers the notebook's DataFrame display.
print(model_name)

metrics_kf

CNN 2 AVG SM Clean


Unnamed: 0,AURoc Train,AURoc 0,AURoc 1,Precision,Recall,F1 score,Accuracy,Time
KF_1,0.952528,0.793324,0.796646,0.555561,0.706883,0.560752,0.834671,00:06:29.244792
KF_2,0.951445,0.81717,0.820492,0.574995,0.747019,0.594149,0.862817,00:05:49.038281
KF_3,0.95935,0.777937,0.777937,0.564806,0.705464,0.580015,0.862536,00:09:23.151749
KF_4,0.950396,0.801073,0.801073,0.565567,0.732764,0.577754,0.847672,00:05:25.903938
KF_5,0.954884,0.783081,0.783081,0.561646,0.709473,0.57342,0.851881,00:06:03.382000
KF_6,0.953754,0.80724,0.810562,0.567352,0.732249,0.581444,0.852802,00:06:48.387696
KF_7,0.958277,0.791241,0.794563,0.567195,0.702723,0.584526,0.869508,00:07:48.502874
KF_8,0.950203,0.764589,0.764589,0.556112,0.70533,0.562354,0.837806,00:06:37.169201
KF_9,0.948863,0.777785,0.781118,0.558072,0.723481,0.563071,0.831075,00:06:15.985186
KF_10,0.946718,0.77674,0.780074,0.5569,0.715125,0.562177,0.833443,00:05:13.196033


## CNN 3 AVG SM

In [14]:
# info
# Experiment: three Conv2D + AveragePooling2D stages followed by a dense head,
# trained on SMOTE-oversampled data and evaluated with the shared `kfold` splitter.
model_name = 'CNN 3 AVG SM Clean'

# Side length of the square single-channel "image" each feature vector is padded into.
img_side = 17

metrics_kf = pd.DataFrame()
for i, (train, test) in enumerate(kfold.split(x_data, y_data), start=1):

    ## Data (the test fold is only used for the final metrics below)
    x_train = x_data.iloc[train]
    y_train = y_data.iloc[train]

    x_test = x_data.iloc[test]
    y_test = y_data.iloc[test]

    ## Train, validation
    # Seeded so the inner split is repeatable, like the seeded kfold and SMOTE.
    x_train, x_val, y_train, y_val = train_test_split(
        x_train, y_train, test_size=0.2, stratify=y_train, random_state=42)

    ## Remove every training row whose feature vector occurs more than once (Train only)
    removeIndex = x_train[x_train.duplicated(keep=False)].index
    x_train = x_train.drop(removeIndex)
    y_train = y_train.drop(removeIndex)

    ## Standard Scaler (fitted on the training part, then applied to test and validation)
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)
    x_val = scaler.transform(x_val)

    # Oversampling (applied to the training part only)
    sampling = SMOTE(kind='borderline2',k_neighbors=5, random_state=42, n_jobs=4)
    x_train, y_train = sampling.fit_sample(x_train, y_train)

    # one hot
    y_train = to_categorical(y_train)
    y_val = to_categorical(y_val)

    ## New shape 17x17x1
    # Zero-pad the feature axis up to img_side * img_side columns, then reshape
    # to [samples][width][height][channels]. The pad width is derived from the
    # data instead of being hard-coded; a negative value (too many features)
    # makes np.zeros raise instead of silently producing a wrong shape.
    n_pad = img_side * img_side - x_train.shape[1]
    x_train, x_val, x_test = [
        np.concatenate((part, np.zeros((part.shape[0], n_pad), np.int32)), axis=1)
          .reshape(part.shape[0], img_side, img_side, 1)
        for part in (x_train, x_val, x_test)
    ]

    ## status
    kf_i = 'KF_{0}'.format(i)
    print(kf_i)

    ## Train (the timer covers model construction, compilation and fitting)
    startTime = time.time()
    model = Sequential()
    model.add(Conv2D(filters=6, kernel_size=(2,2), strides=1, input_shape=(img_side, img_side, 1), activation='relu'))
    model.add(AveragePooling2D(pool_size=(2,2), strides=2))
    model.add(Conv2D(filters=6, kernel_size=(2,2), strides=1, activation='relu'))
    model.add(AveragePooling2D(pool_size=(2,2), strides=2))
    model.add(Conv2D(filters=6, kernel_size=(2,2), strides=1, activation='relu'))
    model.add(AveragePooling2D(pool_size=(2,2), strides=2))
    model.add(Flatten())
    model.add(Dense(24, activation='relu'))
    model.add(Dense(12, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='softmax'))

    sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(optimizer=sgd, loss='categorical_crossentropy',
              metrics=['accuracy'])

    # epochs is only an upper bound; EarlyStopping ends the run after 20 epochs
    # without improvement of its monitored quantity.
    model.fit(x=x_train,
          y=y_train,
          batch_size=128,
          epochs=10000,
          callbacks=[EarlyStopping(patience=20)],
          validation_data=(x_val, y_val))
    tm = timedelta(seconds=(time.time()-startTime))

    ## Predict
    y_pred_proba = model.predict_proba(x_test)
    y_pred = y_pred_proba.argmax(axis=1)

    ## Metrics
    # Train AUC is measured on the oversampled training data the model was fitted on.
    auc_train = roc_auc_score(y_train[:,1], model.predict_proba(x_train)[:,1])
    fpr, tpr, thresholds = roc_curve(y_test.values, y_pred_proba[:,1])
    auc_1 = auc(fpr, tpr)
    # Area under the mirrored curve (x = 1 - tpr, y = 1 - fpr), reported as 'AURoc 0'.
    auc_0 = auc(1-tpr, 1-fpr)
    precision = precision_score(y_test.values, y_pred, average='macro')
    recall = recall_score(y_test.values, y_pred, average='macro')
    f1 = f1_score(y_test.values, y_pred, average='macro')
    accuracy = accuracy_score(y_test.values, y_pred)

    metrics_kf.loc[kf_i,'AURoc Train'] = auc_train
    metrics_kf.loc[kf_i,'AURoc 0'] = auc_0
    metrics_kf.loc[kf_i,'AURoc 1'] = auc_1
    metrics_kf.loc[kf_i,'Precision'] = precision
    metrics_kf.loc[kf_i,'Recall'] = recall
    metrics_kf.loc[kf_i,'F1 score'] = f1
    metrics_kf.loc[kf_i,'Accuracy'] = accuracy
    metrics_kf.loc[kf_i,'Time'] = tm
    # The architecture is the same in every fold, so draw the diagram only once.
    if i == 1:
        plot_model(model, to_file='models/model_' + model_name.replace(' ', '_') + '.png', show_shapes=True)
    # Release the Keras/TensorFlow graph before the next fold builds a new model.
    clear_session()

print()
# Summarise the folds. Both statistics are taken from a snapshot that holds only
# the per-fold rows: writing the 'Media' row first and then calling .std() on the
# live column would count the mean as an extra observation and shrink the STD.
fold_metrics = metrics_kf.drop(['Media', 'STD'], axis=0, errors='ignore')
for m in ['AURoc Train', 'AURoc 0', 'AURoc 1', 'Precision', 'Recall', 'F1 score', 'Accuracy', 'Time']:

    mean = fold_metrics[m].mean()
    std = fold_metrics[m].std()

    metrics_kf.loc['Media', m] = mean   # 'Media' row = mean across folds
    metrics_kf.loc['STD', m] = std      # spread across folds only
    models.loc[model_name, m] = mean    # the cross-model table keeps the mean only

KF_1
Train on 97589 samples, validate on 13684 samples
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Epoch 30/10000
Epoch 31/10000
Epoch 32/10000
Epoch 33/10000
Epoch 34/10000
Epoch 35/10000
Epoch 36/10000
Epoch 37/10000
Epoch 38/10000
Epoch 39/10000
Epoch 40/10000
Epoch 41/10000
Epoch 42/10000
Epoch 43/10000
Epoch 44/10000
Epoch 45/10000
Epoch 46/10000
Epoch 47/10000
Epoch 48/10000
Epoch 49/10000
Epoch 50/10000
Epoch 51/10000
Epoch 52/10000
Epoch 53/10000
Epoch 54/10000
Epoch 55/10000
Epoch 56/10000
Epoch 57/10000
Epoch 58/10000
Epoch 59/10000
KF_2
Train on 97494 samples, validate on 13684 samples
Epoch 1/10000


Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Epoch 30/10000
Epoch 31/10000
Epoch 32/10000
Epoch 33/10000
Epoch 34/10000
Epoch 35/10000
Epoch 36/10000
Epoch 37/10000
Epoch 38/10000
Epoch 39/10000
Epoch 40/10000
Epoch 41/10000
Epoch 42/10000
Epoch 43/10000
Epoch 44/10000
Epoch 45/10000
Epoch 46/10000
Epoch 47/10000
Epoch 48/10000
Epoch 49/10000
Epoch 50/10000
Epoch 51/10000
Epoch 52/10000
Epoch 53/10000
Epoch 54/10000
Epoch 55/10000
Epoch 56/10000
Epoch 57/10000
Epoch 58/10000
Epoch 59/10000
Epoch 60/10000
Train on 97500 samples, validate on 13684 samples
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000

Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Epoch 30/10000
Epoch 31/10000
Epoch 32/10000
Epoch 33/10000
Epoch 34/10000
Epoch 35/10000
Epoch 36/10000
Epoch 37/10000
Epoch 38/10000
Epoch 39/10000
Epoch 40/10000
Epoch 41/10000
Epoch 42/10000
Epoch 43/10000
Epoch 44/10000
Train on 97392 samples, validate on 13684 samples
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Epoch 30/10000
Epoch 31/10000
Epoch 32/10000
Epoch 33/10000
Epoch 34/10000
Epoch 35/10000
Epoch 36/10000
Epoch 37/10000
Epoch 38/10000
Epoch 39/10000
Epoch 40/10000
Epoch 41/10000
Epoch 42/10000
Epoch 43/10000
Epoch 44/10000
Train on 97545 samples, valid

Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Epoch 30/10000
Epoch 31/10000
Epoch 32/10000
KF_9
Train on 97572 samples, validate on 13684 samples
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Train on 97573 samples, validate on 13684 samples
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/1000

In [15]:
# Label the output with the current model name, then render the metrics table
# (per-fold rows plus the 'Media' and 'STD' summary rows added by the cell above);
# the bare last expression is what triggers the notebook's DataFrame display.
print(model_name)

metrics_kf

CNN 3 AVG SM Clean


Unnamed: 0,AURoc Train,AURoc 0,AURoc 1,Precision,Recall,F1 score,Accuracy,Time
KF_1,0.938338,0.780862,0.780862,0.558157,0.693966,0.568822,0.85269,00:13:05.675209
KF_2,0.942117,0.810778,0.810778,0.564332,0.740858,0.573701,0.838748,00:10:25.964082
KF_3,0.939249,0.78244,0.78244,0.569091,0.690546,0.588414,0.879768,00:12:31.809886
KF_4,0.932397,0.798547,0.798547,0.560823,0.721445,0.569642,0.841226,00:10:20.373491
KF_5,0.936947,0.776853,0.776853,0.556183,0.697469,0.564092,0.84412,00:09:09.554630
KF_6,0.941153,0.799379,0.802701,0.560161,0.721805,0.568109,0.838858,00:09:01.163874
KF_7,0.9349,0.805252,0.805252,0.565541,0.715877,0.580135,0.858064,00:06:43.824637
KF_8,0.938347,0.773111,0.773111,0.551869,0.690877,0.55573,0.834517,00:06:43.196698
KF_9,0.935343,0.799631,0.802965,0.553962,0.729624,0.550779,0.809104,00:05:57.994335
KF_10,0.939455,0.789654,0.789654,0.562784,0.718685,0.574392,0.849493,00:09:17.324640


# Metrics

In [16]:
# Cross-model comparison: one row per model_name, each column holding the
# mean of that metric across the folds (filled in by the experiment cells).
models

Unnamed: 0,AURoc Train,AURoc 0,AURoc 1,Precision,Recall,F1 score,Accuracy,Time
CNN 1 AVG RO Clean,0.939367,0.742443,0.743773,0.5532,0.709513,0.551852,0.817614,00:06:50.531726
CNN 2 AVG RO Clean,0.888367,0.780347,0.781345,0.544418,0.725499,0.518001,0.751921,00:09:24.590929
CNN 3 AVG RO Clean,0.827147,0.801928,0.802261,0.544363,0.727837,0.511241,0.736255,00:09:13.778201
CNN 1 AVG SM Clean,0.970432,0.773741,0.774738,0.565199,0.705028,0.580016,0.862287,00:05:00.811373
CNN 2 AVG SM Clean,0.952642,0.789018,0.791013,0.562821,0.718051,0.573966,0.848421,00:06:35.396175
CNN 3 AVG SM Clean,0.937825,0.791651,0.792316,0.56029,0.712115,0.569382,0.844659,00:09:19.688148
