In [1]:
import numpy as np
import pandas as pd
import pickle

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
import keras
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution1D, MaxPooling1D
from keras.callbacks import ModelCheckpoint, EarlyStopping
import nn_models

import sys, os 
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
sys.path.insert(1, os.path.join(sys.path[0], '..')) 
import custom_utils


def init_dirs(config_file):
    out_dir = custom_utils.create_out_dir(config_file)
    out_dir = '../' + out_dir	
    ml_data_dir = out_dir + '/ml_data'

    return out_dir, ml_data_dir


def read_data():
    print('> Loading data (top ' + str(100 * float(top_ratio)) + ' %) ...')
    top_ratio_str = '.top_' + str(top_ratio)

    print('Reading training data...')
    pkl_in = open(ml_data_dir + '/train' + top_ratio_str + '.pkl', 'rb')
    train_dict = pickle.load(pkl_in)
    pkl_in.close()

    print('Reading validation data...')
    pkl_in = open(ml_data_dir + '/validation' + top_ratio_str + '.pkl', 'rb')
    validation_dict = pickle.load(pkl_in)
    pkl_in.close()

    print('Reading test data...')
    pkl_in = open(ml_data_dir + '/test' + top_ratio_str + '.pkl', 'rb')
    test_dict = pickle.load(pkl_in)
    pkl_in.close()

    return train_dict, validation_dict, test_dict


def inspect_input_data(train_dict, validation_dict, test_dict):

    print('\n> Train:')
    print(train_dict['X'].shape)
    print(train_dict[y].shape)
    print(train_dict['seqs'].shape)

    print('\n> Validation:')
    print(validation_dict['X'].shape)
    print(validation_dict[y].shape)
    print(validation_dict['seqs'].shape)

    print('\n> Test:')
    print(test_dict['X'].shape)
    print(test_dict[y].shape)
    print(test_dict['seqs'].shape)



def compile_and_train_model(model, epochs=40):

    if regression:
        model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error'])
    else:
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['cosine'])

    checkpoint_name = 'gwrvis_best_model.hdf5'
    checkpointer = ModelCheckpoint(checkpoint_name, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')
    earlystopper = EarlyStopping(monitor='val_loss', patience=5, verbose=1)

    model.fit(train_dict['seqs'], train_dict[y], batch_size=2048, epochs=epochs, 
          shuffle=True,
          validation_data=(validation_dict['seqs'], validation_dict[y]), 
          callbacks=[checkpointer,earlystopper])

    return model


def test_and_evaluate_model(model):
    test_results = model.evaluate(test_dict['seqs'], test_dict[y]) 
    print(test_results)

    preds = model.predict(test_dict['seqs'])
    decision_thres = 0.5 # for classification
    if regression:
        decision_thres = 0 # 0 is the natural border between tolerant and intolerant gwRVIS values

    preds[preds >= decision_thres] = 1
    preds[preds < decision_thres] = 0

    preds_flat = preds.flatten()
    test_flat = test_dict[y].flatten()

    print(accuracy_score(test_flat, preds_flat))
    print(confusion_matrix(test_flat, preds_flat))

    roc_auc = roc_auc_score(test_flat, preds_flat)
    print('ROC AUC:', roc_auc)

Using TensorFlow backend.


In [2]:
if __name__ == '__main__':

    config_file = '../config.yaml' #sys.argv[1]
    top_ratio = 0.001 #sys.argv[2] 	#default: 0.01 -- look at top 1% of intolerant/tolerant windows

    regression=False
    if regression:
        y = 'gwrvis'   # continuous value
    else:
        y = 'y'  # 1/0 annotation

    config_params = custom_utils.get_config_params(config_file)
    win_len = config_params['win_len']

    # init out dir structure
    out_dir, ml_data_dir = init_dirs(config_file)

    # read train, validation, test data
    train_dict, validation_dict, test_dict = read_data()

    # print input data shapes to check for consistency
    inspect_input_data(train_dict, validation_dict, test_dict)

    #model = nn_models.cnn_1_conv_2_fcc(regression=regression) 
    model = nn_models.cnn_rnn_1_conv_1_lstm(regression=regression)
    print(model.summary())

    model = compile_and_train_model(model, epochs=10)

> Loading data (top 0.1 %) ...
Reading training data...
Reading validation data...
Reading test data...

> Train:
(1240, 5)
(1240, 2)
(1240, 3000, 4)

> Validation:
(64, 5)
(64, 2)
(64, 3000, 4)

> Test:
(310, 5)
(310, 2)
(310, 3000, 4)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 2971, 16)          1936      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 198, 16)           0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 198, 16)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 198, 640)          862720    
_________________________________________________________________
dropout_2 (Dropout)          (None, 198, 640)          0         
_____________________________________

In [None]:
test_dict['gwrvis']

In [3]:
preds = model.predict(test_dict['seqs'])

print(preds)

[[0.38635784 0.61364216]
 [0.23948577 0.7605143 ]
 [0.24778436 0.7522156 ]
 [0.5161654  0.48383468]
 [0.6138244  0.38617554]
 [0.5252398  0.47476017]
 [0.57467455 0.42532548]
 [0.56012976 0.43987024]
 [0.23780088 0.7621991 ]
 [0.21731295 0.78268707]
 [0.4694343  0.53056574]
 [0.19750874 0.80249125]
 [0.35222748 0.64777243]
 [0.6007285  0.3992715 ]
 [0.27218783 0.7278122 ]
 [0.19937176 0.80062824]
 [0.22824971 0.77175033]
 [0.47077364 0.5292264 ]
 [0.4463056  0.55369437]
 [0.47552165 0.5244784 ]
 [0.26611894 0.73388106]
 [0.4261733  0.57382673]
 [0.29003727 0.7099627 ]
 [0.543739   0.45626098]
 [0.2590626  0.7409375 ]
 [0.49579215 0.5042078 ]
 [0.20262912 0.7973709 ]
 [0.50714535 0.49285462]
 [0.27021974 0.72978026]
 [0.19340652 0.8065934 ]
 [0.263356   0.736644  ]
 [0.21823543 0.7817646 ]
 [0.18940392 0.8105961 ]
 [0.44658446 0.5534156 ]
 [0.199038   0.80096203]
 [0.46457148 0.5354285 ]
 [0.28001127 0.71998876]
 [0.39928678 0.6007132 ]
 [0.2638685  0.7361314 ]
 [0.24672599 0.75327396]


In [4]:
decision_thres = 0.5 # for classification
if regression:
    decision_thres = 0 # 0 is the natural border between tolerant and intolerant gwRVIS values

preds[preds >= decision_thres] = 1
preds[preds < decision_thres] = 0
print(preds)

[[0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 

In [20]:
preds_flat = np.argmax(preds, axis=1)
test_flat = np.argmax(test_dict[y], axis=1)

print(preds_flat)
# print(accuracy_score(test_flat, preds_flat))
# print(confusion_matrix(test_flat, preds_flat))

# roc_auc = roc_auc_score(test_flat, preds_flat)
# print('ROC AUC:', roc_auc)

[1 1 1 0 0 0 0 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1
 1 1 1 1 0 0 1 1 1 1 1 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0
 1 1 1 1 0 1 0 0 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1
 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0
 1 0 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1 0 1 1 1 1
 0 0 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1
 1 0 1 1 1 0 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1
 1 0 1 0 1 1 1 1 1 1 0 1 1 1]


In [25]:
print(test_flat[ test_flat == 0 ].shape)
print(test_flat[ test_flat == 1 ].shape)

print(preds_flat[ preds_flat == 0 ].shape)
print(preds_flat[ preds_flat == 1 ].shape)

(160,)
(150,)
(52,)
(258,)


In [26]:
print(accuracy_score(test_flat, preds_flat))
print(confusion_matrix(test_flat, preds_flat))

roc_auc = roc_auc_score(test_flat, preds_flat)
print('ROC AUC:', roc_auc)

0.5483870967741935
[[ 36 124]
 [ 16 134]]
ROC AUC: 0.5591666666666666


In [24]:
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

def plot_roc_curve(test_flat, preds_flat, make_plot=True):

    fpr, tpr, _ = roc_curve(test_flat, preds_flat)
    roc_auc = roc_auc_score(test_flat, preds_flat)

    if make_plot:
        f = plt.figure(figsize=(6, 6))
        _ = plt.plot(fpr, tpr, label='ROC curve (area = %0.3f)' % roc_auc)
        _ = plt.plot([0, 1], [0, 1], '--', linewidth=0.5)  # random predictions curve

        _ = plt.xlim([0.0, 1.0])
        _ = plt.ylim([0.0, 1.0])
        _ = plt.title('\nROC (area = %0.3f)' % roc_auc)
        _ = plt.xlabel('False Positive Rate (1 — Specificity)')
        _ = plt.ylabel('True Positive Rate (Sensitivity)')
        plt.grid(True)
        plt.show()

        f.savefig("ROC_curve.pdf", bbox_inches='tight')

    return fpr, tpr
    
plot_roc_curve(test_flat, preds_flat)

  % get_backend())


(array([0.   , 0.775, 1.   ]), array([0.        , 0.89333333, 1.        ]))