In [59]:
import numpy as np
import json
import warnings
import operator

import h5py
from keras.models import model_from_json
from keras import backend as K
from keras.utils import get_custom_objects

from matplotlib import pyplot as plt

# Suppress every warning category for the rest of the session
# (this also hides deprecation notices from the libraries used below).
warnings.filterwarnings("ignore")

# Presentation / prediction settings. None of these three names is referenced
# in the cells shown here -- presumably used by plotting code elsewhere; verify.
size_title, size_label, n_pred = 18, 14, 2

# Every input file lives in this directory; the trailing slash allows file
# names to be appended directly.
base_path = "data/evaluate_rnn_compatible_loss_current/"

# JSON side files written next to the trained model.
path_data_dict = f"{base_path}data_dict.txt"
path_usage_wt = f"{base_path}usage_prediction.txt"
path_class_wt = f"{base_path}class_weights.txt"
path_test_data = f"{base_path}test_paths_dict.txt"

# HDF5 container holding the model configuration, weights and metadata.
model_path = f"{base_path}trained_model.hdf5"

def read_file(file_path):
    """Return the object decoded from the JSON document stored at ``file_path``.

    Args:
        file_path: Path of a text file whose whole content is one JSON document.

    Returns:
        The decoded Python object (dict, list, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(file_path, 'r') as handle:
        return json.load(handle)

#class_weights1 = read_file(path_class_wt)

def output_act(class_weights):
    """Build the weighted sigmoid activation used by the model's output layer.

    Args:
        class_weights: Mapping whose values are the per-class weights. Only the
            values are used, in the mapping's iteration order, so that order
            has to line up with the output units -- TODO confirm against the
            training code.

    Returns:
        A function ``weighted_sigmoid_activation(x)`` computing
        ``sigmoid(x * w)`` with the Keras backend, where ``w`` is the weight
        list given a trailing axis and then transposed.
    """
    weight_values = [value for value in class_weights.values()]

    # The inner name must stay as is: it is the key under which the activation
    # is registered as a Keras custom object.
    def weighted_sigmoid_activation(x):
        column_weights = K.expand_dims(weight_values, axis=-1)
        row_weights = K.transpose(column_weights)
        return K.sigmoid(x * row_weights)

    return weighted_sigmoid_activation

# JSON side files: usage weights (looked up by tool name further down) and the
# tool-name -> id mapping used to index the class weights.
usage_weights = read_file(path_usage_wt)
data_dict = read_file(path_data_dict)

# The HDF5 handle is deliberately left open: create_model() reads the
# "weight_<n>" datasets from it later.
trained_model = h5py.File(model_path, 'r')

# Each metadata entry is a scalar dataset holding a JSON document.
# `dataset[()]` replaces the deprecated `Dataset.value` accessor (removed in
# h5py 3.x) and returns the same scalar; json.loads accepts both str and bytes.
# Indexing the file directly raises a KeyError naming the missing dataset
# instead of failing on `None`.
model_config = json.loads(trained_model['model_config'][()])
class_weights = json.loads(trained_model['class_weights'][()])

# The custom activation must be registered before the architecture is rebuilt,
# otherwise model_from_json cannot resolve it by name.
get_custom_objects().update({'weighted_sigmoid_activation': output_act(class_weights)})

loaded_model = model_from_json(model_config)
dictionary = json.loads(trained_model['data_dictionary'][()])
compatibile_tools = json.loads(trained_model['compatible_tools'][()])
best_params = json.loads(trained_model['best_parameters'][()])

def create_model(model_path):
    """Load the stored weights into the already-built model and return it.

    Reads the datasets ``weight_0``, ``weight_1``, ... from the module-level
    ``trained_model`` HDF5 handle until the first missing index, then assigns
    them to the module-level ``loaded_model``.

    Args:
        model_path: Kept for interface compatibility; the function does not
            open this path itself but reads from ``trained_model``.

    Returns:
        Tuple ``(loaded_model, dictionary, reverse_dictionary,
        compatibile_tools)`` where ``reverse_dictionary`` maps ``str(id)`` to
        the tool name, i.e. the inverse of ``dictionary``.
    """
    reverse_dictionary = {str(tool_id): tool_name for tool_name, tool_id in dictionary.items()}
    model_weights = list()
    weight_ctr = 0
    while True:
        # Group.get returns None for a missing name; that is the only
        # condition that ends the loop. Genuine read errors now propagate
        # instead of being mistaken for "no more weights".
        weight_dataset = trained_model.get("weight_" + str(weight_ctr))
        if weight_dataset is None:
            break
        # `dataset[()]` reads the whole array; it replaces the deprecated
        # `Dataset.value` accessor, which no longer exists in h5py 3.x.
        model_weights.append(weight_dataset[()])
        weight_ctr += 1
    # set the model weights
    loaded_model.set_weights(model_weights)
    return loaded_model, dictionary, reverse_dictionary, compatibile_tools

# Fill the rebuilt architecture with its stored weights. create_model also hands
# back the tool dictionary, its str(id) -> name inverse and the compatible-tools
# mapping, which rebinds the module-level `dictionary` / `compatibile_tools` names.
model, dictionary, reverse_dictionary, compatibile_tools = create_model(model_path)

{'0': 0.0, '1': 2.27446361785358, '2': 1.6354577339219374, '3': 0.9087478102749869, '4': 0.17325309800988614, '5': 0.07736351284922702, '6': 0.29365654653338213, '7': 0.7919778886591803, '8': 1.1029134762805788, '9': 2.5026567842826952, '10': 4.375811394604323, '11': 2.5026567842826952, '12': 4.375811394604323, '13': 0.22443585123375284, '14': 0.4556750051868605, '15': 2.0963916503839046, '16': 1.1383900431676628, '17': 0.0, '18': 0.6900050515589842, '19': 0.3723730692219485, '20': 2.380752845189337, '21': 3.695163701386445, '22': 0.2552286750465688, '23': 1.3911318987151051, '24': 2.0963916503839046, '25': 0.46389630866834036, '26': 3.302043770252753, '27': 2.1804663026469755, '28': 4.375811394604323, '29': 2.0204909651536216, '30': 0.0, '31': 2.27446361785358, '32': 3.302043770252753, '33': 1.555678650387579, '34': 3.695163701386445, '35': 0.1431735463527236, '36': 3.026556328018614, '37': 0.0, '38': 3.695163701386445, '39': 2.27446361785358, '40': 1.5191582858184918, '41': 1.4518511

In [55]:
def analyze_output_layer(model, test_sample, dimensions, phase, iter_num=10):
    """Evaluate the last layer repeatedly and summarise the runs.

    Args:
        model: Keras model; the input of its first layer and the output of its
            last layer are wired into a backend function.
        test_sample: Input batch fed to the first layer. Each run's output is
            stored in a ``(1, dimensions)`` slot, so a single-sample batch is
            expected.
        dimensions: Size of the last layer's output vector.
        phase: Value passed for ``K.learning_phase()`` on every run
            (presumably 1 keeps dropout active so the runs differ -- confirm).
        iter_num: Number of forward passes.

    Returns:
        Tuple ``(prediction, uncertainty)``: element-wise mean and variance of
        the outputs across the runs, each of shape ``(1, dimensions)``.
    """
    forward_pass = K.function([model.layers[0].input, K.learning_phase()], [model.layers[-1].output])
    runs = np.zeros((iter_num, 1, dimensions))
    for run_idx in range(iter_num):
        runs[run_idx] = forward_pass([test_sample, phase])[0]
    return runs.mean(axis=0), runs.var(axis=0)


def plot_error_bar(prediction, variance):
    """Show the first row of ``prediction`` with ``variance`` as error bars.

    Args:
        prediction: Indexable whose first element is an array of values to plot
            (e.g. the ``(1, dimensions)`` mean from ``analyze_output_layer``).
        variance: Same layout as ``prediction``; its first row is passed as the
            y-error of each point.
    """
    mean_values = prediction[0].tolist()
    error_values = variance[0].tolist()
    positions = range(len(mean_values))

    plt.figure(figsize=(12, 12))
    # The third positional argument of errorbar is the y-error.
    plt.errorbar(positions, mean_values, error_values, fmt='o', marker='s', mfc='red', mec='green')
    plt.grid(True)
    plt.show()


def verify_model(model, tool_sequence, labels, dictionary, reverse_dictionary, compatible_tools, phase, topk=20, max_seq_len=25):
    """Predict the next tools for a tool sequence and print their weights.

    The sequence is written into the first positions of a zero vector of
    length ``max_seq_len`` and passed to ``model.predict``. Of the ``topk``
    highest-scoring output positions, only those that map to a tool name and
    are listed as compatible with the last tool of the sequence are reported.

    Besides its arguments the function reads the module-level names
    ``class_weights``, ``usage_weights``, ``data_dict`` and
    ``inverted_weights``.
    NOTE(review): ``inverted_weights`` is not defined in the cells visible
    here; if it is missing at run time the lookup fails and the inverted
    weights stay empty.

    Args:
        model: Object with a Keras-style ``predict(sample, verbose=0)``.
        tool_sequence: Comma-separated tool ids, e.g. ``"600"`` or ``"12,600"``.
        labels: Unused; kept for interface compatibility.
        dictionary: Mapping tool name -> id, used for the "Tool ids" printout.
        reverse_dictionary: Mapping ``str(id)`` -> tool name.
        compatible_tools: Mapping tool name -> comma-separated names of the
            tools that may follow it.
        phase: Unused; only the commented-out ``analyze_output_layer`` call
            refers to it.
        topk: Number of highest-scoring output positions to consider.
        max_seq_len: Length of the input vector given to the model.

    Returns:
        Tuple ``(cls_wt, usg_wt, inv_wt, pred_tool_ids_sorted)``; each is a
        dict keyed by tool name and ordered by descending value.

    Raises:
        KeyError: If an id of ``tool_sequence`` is not in ``reverse_dictionary``,
            the last tool has no entry in ``compatible_tools``, or a predicted
            tool is missing from ``dictionary``.
        IndexError: If the sequence has more than ``max_seq_len`` ids.
    """
    tl_seq = tool_sequence.split(",")
    last_tool_name = reverse_dictionary[str(tl_seq[-1])]
    last_compatible_tools = compatible_tools[last_tool_name]
    sample = np.zeros(max_seq_len)
    for idx, tool_id in enumerate(tl_seq):
        sample[idx] = int(tool_id)
    sample_reshaped = np.reshape(sample, (1, max_seq_len))

    tool_sequence_names = [reverse_dictionary[str(tool_pos)] for tool_pos in tl_seq]
    print("Tool seq: %s" % ",".join(tool_sequence_names))

    # predict next tools for a test path
    prediction = model.predict(sample_reshaped, verbose=0)
    #prediction, uncertainty = analyze_output_layer(model, sample_reshaped, len(reverse_dictionary) + 1, phase)
    #plot_error_bar(prediction, uncertainty)

    prediction = np.reshape(prediction, (prediction.shape[1],))
    prediction_pos = np.argsort(prediction, axis=-1)

    # get topk prediction; output positions without a tool name (such as
    # index 0, for which reverse_dictionary has no key) are skipped instead of
    # raising a KeyError in the name lookups below
    topk_prediction_pos = [pos for pos in prediction_pos[-topk:] if str(pos) in reverse_dictionary]
    topk_prediction_val = [np.round(prediction[pos], 2) for pos in topk_prediction_pos]

    # read tool names using reverse dictionary
    pred_tool_ids = [reverse_dictionary[str(tool_pos)] for tool_pos in topk_prediction_pos]
    actual_next_tool_ids = set(pred_tool_ids).intersection(last_compatible_tools.split(","))

    print()
    pred_tool_ids_sorted = dict()
    for (tool_pos, tool_pred_val) in zip(topk_prediction_pos, topk_prediction_val):
        tool_name = reverse_dictionary[str(tool_pos)]
        if tool_name in actual_next_tool_ids:
            pred_tool_ids_sorted[tool_name] = tool_pred_val
    pred_tool_ids_sorted = dict(sorted(pred_tool_ids_sorted.items(), key=lambda kv: kv[1], reverse=True))

    cls_wt = dict()
    usg_wt = dict()
    inv_wt = dict()
    for k in pred_tool_ids_sorted:
        # Best effort: a tool missing from one of the weight tables is skipped
        # from that table onwards. Only ordinary errors are swallowed, so an
        # interrupt (Ctrl-C) is no longer silently eaten.
        try:
            cls_wt[k] = np.round(class_weights[str(data_dict[k])], 2)
            usg_wt[k] = np.round(usage_weights[k], 2)
            inv_wt[k] = np.round(inverted_weights[str(data_dict[k])], 2)
        except Exception:
            continue
    print("Predicted tools: \n")
    print(pred_tool_ids_sorted)
    print()
    print("Class weights: \n")
    cls_wt = dict(sorted(cls_wt.items(), key=lambda kv: kv[1], reverse=True))
    print(cls_wt)
    print()
    print("Usage weights: \n")
    usg_wt = dict(sorted(usg_wt.items(), key=lambda kv: kv[1], reverse=True))
    print(usg_wt)
    print()
    # The mean of no values is reported as nan, without asking numpy to
    # average an empty list.
    usage_values = list(usg_wt.values())
    total_usage_wt = np.mean(usage_values) if usage_values else float("nan")
    print("Mean usage wt: %0.4f" % (total_usage_wt))
    print()
    print("Inverted weights: \n")
    inv_wt = dict(sorted(inv_wt.items(), key=lambda kv: kv[1], reverse=True))
    print(inv_wt)
    ids_tools = {key: dictionary[key] for key in pred_tool_ids_sorted}
    print()
    print("Tool ids")
    print(ids_tools)
    print("======================================")
    return cls_wt, usg_wt, inv_wt, pred_tool_ids_sorted


In [56]:
# Display the str(id) -> tool-name mapping returned by create_model.
# NOTE(review): this dumps the whole mapping into the cell output.
reverse_dictionary

{'1': 'vcffilter2',
 '2': 'uniprot',
 '3': 'mothur_metastats',
 '4': 'Extract genomic DNA 1',
 '5': 'tp_cat',
 '6': 'heinz_scoring',
 '7': 'fasta_filter_by_length',
 '8': 'bctools_remove_spurious_events',
 '9': 'fastq_filter',
 '10': 'abims_xcms_group',
 '11': 'maldi_quant_preprocessing',
 '12': 'qiime_pick_open_reference_otus',
 '13': 'bedtools_mergebed',
 '14': 'TextExporter',
 '15': 'gatk2_unified_genotyper',
 '16': 'taxonomy_krona_chart',
 '17': 'chipsequtil_maptoknowngenes',
 '18': 'umi_tools_extract',
 '19': 'circgraph',
 '20': 'EMBOSS: tranalign100',
 '21': 'blast2go',
 '22': 'get_sequences',
 '23': 'tophat2',
 '24': 'mimodd_varreport',
 '25': 'tabular_to_fastq',
 '26': '__FILTER_FROM_FILE__',
 '27': 'cshl_fastx_reverse_complement',
 '28': 'gtf_filter_by_attribute_values_list',
 '29': 'structure_to_gspan',
 '30': 'FeatureLinkerUnlabeled',
 '31': 'hicexplorer_hiccorrectmatrix',
 '32': 'XTandemAdapter',
 '33': 'vcf_filter',
 '34': 'wsdl_hmdb',
 '35': 'tp_replace_in_column',
 '36':

In [58]:
# Number of highest-scoring output positions to inspect.
topk = 20
# Comma-separated tool ids; a single id here.
tool_seq = "600"
# "" fills the `labels` argument and 1 the `phase` argument; the body of
# verify_model reads neither of them.
class_wt, usage_wt, inverse_wt, pred_tools = verify_model(model, tool_seq, "", dictionary, reverse_dictionary, compatibile_tools, 1, topk)

Tool seq: bowtie2

Predicted tools: 

{}

Class weights: 

{}

Usage weights: 

{}

Mean usage wt: nan

Inverted weights: 

{}

Tool ids
{}
