In [2]:
import numpy as np
import json
import warnings
import operator

import h5py
from keras.models import model_from_json
from keras import backend as K

from matplotlib import pyplot as plt

# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Font sizes used by plot_scatter for titles and axis labels.
size_title, size_label = 18, 14
n_pred = 2

# All input files live under one base directory (note the trailing slash).
base_path = "data/remote_sum/"

path_data_dict = f"{base_path}data_dict.txt"
path_inverted_wt = f"{base_path}inverted_weights.txt"
path_usage_wt = f"{base_path}usage_prediction.txt"
path_class_wt = f"{base_path}class_weights.txt"
path_test_data = f"{base_path}test_data.txt"
model_path = f"{base_path}trained_model.hdf5"

def read_file(file_path):
    """Load and return the JSON document stored at ``file_path``.

    The file is decoded as UTF-8 explicitly, so the result does not depend on
    the platform's default text encoding.

    Args:
        file_path: path of a text file containing a single JSON document.

    Returns:
        The decoded Python object (dict, list, str, number, ...).

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as data_file:
        return json.load(data_file)

# Read the four JSON inputs in one pass; the order of the paths matches the
# order of the names on the left-hand side.
class_weights, usage_weights, inverted_weights, data_dict = [
    read_file(json_path)
    for json_path in (path_class_wt, path_usage_wt, path_inverted_wt, path_data_dict)
]

def create_model(model_path):
    """Rebuild the trained Keras model and its lookup tables from an HDF5 file.

    The file is read for the JSON-encoded datasets ``model_config``,
    ``data_dictionary`` and ``compatible_tools`` and for the consecutively
    numbered datasets ``weight_0``, ``weight_1``, ... (reading stops at the
    first missing index).

    Args:
        model_path: path of the HDF5 file to read.

    Returns:
        A 4-tuple ``(loaded_model, dictionary, reverse_dictionary, compatible_tools)``:
        the model returned by ``model_from_json`` with the stored weights set,
        the decoded ``data_dictionary``, its inverse keyed by ``str(value)``,
        and the decoded ``compatible_tools``.

    Raises:
        KeyError: if one of the three JSON datasets is missing from the file.
    """
    # The context manager closes the HDF5 handle even if decoding fails.
    with h5py.File(model_path, 'r') as trained_model:
        # ``dataset[()]`` reads the whole dataset; it replaces the ``.value``
        # attribute, which newer h5py releases no longer provide.
        model_config = json.loads(trained_model['model_config'][()])
        dictionary = json.loads(trained_model['data_dictionary'][()])
        compatible_tools = json.loads(trained_model['compatible_tools'][()])

        # Collect weight_0, weight_1, ... until the first index that is absent,
        # instead of relying on an exception to end the loop.
        model_weights = []
        weight_ctr = 0
        while "weight_" + str(weight_ctr) in trained_model:
            model_weights.append(trained_model["weight_" + str(weight_ctr)][()])
            weight_ctr += 1

    loaded_model = model_from_json(model_config)
    # set the model weights
    loaded_model.set_weights(model_weights)
    reverse_dictionary = {str(value): key for key, value in dictionary.items()}
    return loaded_model, dictionary, reverse_dictionary, compatible_tools

# Load the trained model and its lookup tables from model_path.
# NOTE: "compatibile_tools" is misspelled, but the name is kept because later
# cells pass this exact variable to verify_model.
model, dictionary, reverse_dictionary, compatibile_tools = create_model(model_path)

Using TensorFlow backend.


In [4]:
# Display the mapping built by create_model: str(tool id) -> tool name.
reverse_dictionary

{'1': 'bedtools_coveragebed',
 '2': 'IDMapper',
 '3': 'heatmapper_deepTools',
 '4': 'gatk2_indel_realigner',
 '5': 'transpose',
 '6': 'myrimatch',
 '7': 'mothur_make_contigs',
 '8': 'samtools_rmdup',
 '9': 'qiime_pick_open_reference_otus',
 '10': 'piranha',
 '11': 'cshl_fastx_quality_statistics',
 '12': 'ggplot2_histogram',
 '13': 'FROGS_affiliations_stat',
 '14': 'goslimmer',
 '15': 'MassCalculator',
 '16': 'deeptools_bamCorrelate',
 '17': 'EMBOSS: water107',
 '18': 'vcffilter',
 '19': 'maldi_quant_peak_detection',
 '20': 'hicexplorer_hicplottads',
 '21': 'samtools_mpileup',
 '22': 'deeptools_correct_gc_bias',
 '23': 'rgPicFixMate',
 '24': 'gmx_md',
 '25': 'cardinal_filtering',
 '26': 'Fetch Taxonomic Ranks',
 '27': 'flexbar_split_RR_bcs',
 '28': 'vcfannotate',
 '29': 'idr-embl',
 '30': 'MapAlignerPoseClustering',
 '31': 'cshl_fastx_artifacts_filter',
 '32': 'iReport',
 '33': 'rseqc_infer_experiment',
 '34': 'umi_tools_extract',
 '35': 'ggplot2_heatmap',
 '36': 'sickle',
 '37': 'cuffn

In [47]:
def _sorted_by_value_desc(mapping):
    """Return a new dict holding the items of ``mapping`` ordered by value, largest first."""
    return dict(sorted(mapping.items(), key=lambda kv: kv[1], reverse=True))


def _mean_or_nan(values):
    """Return the mean of ``values``, or NaN (without a NumPy warning) when there are none."""
    values = list(values)
    return np.mean(values) if values else np.nan


def verify_model(model, tool_sequence, labels, dictionary, reverse_dictionary, compatible_tools, topk=10, max_seq_len=25):
    """Predict the next tools for one tool sequence and summarise their weights.

    The sequence is written into a zero-initialised vector of length
    ``max_seq_len``, passed to ``model.predict``, and the ``topk`` highest
    scoring positions are kept. Only predictions listed as compatible with the
    last tool of the sequence are reported. The predictions and their class,
    usage and inverted weights are printed.

    Reads the module-level ``class_weights``, ``usage_weights``,
    ``inverted_weights`` and ``data_dict``.

    Args:
        model: object exposing ``predict(array, verbose=0)``; the result is
            reshaped using ``shape[1]``.
        tool_sequence: comma-separated tool ids, e.g. ``"302,275"``.
        labels: unused; kept so existing callers keep working.
        dictionary: maps tool name -> tool id.
        reverse_dictionary: maps ``str(tool id)`` -> tool name.
        compatible_tools: maps tool name -> comma-separated names of the tools
            that may follow it.
        topk: number of highest-scoring positions to consider.
        max_seq_len: length of the input vector given to the model.

    Returns:
        ``(ave_pred_wt, ave_cls_wt, ave_usg_wt, ave_inv_wt)``: the means of the
        reported prediction scores (in percent) and of the class, usage and
        inverted weights. Each mean is NaN when there is nothing to average.

    Raises:
        KeyError: if an id in ``tool_sequence`` is missing from
            ``reverse_dictionary``, or a reported tool is missing from
            ``dictionary``.
    """
    # Tolerate spaces after the commas ("302, 275"); the ids are used as dict keys.
    tool_ids = [tool_id.strip() for tool_id in tool_sequence.split(",")]
    tool_sequence_names = [reverse_dictionary[tool_id] for tool_id in tool_ids]
    last_tool_name = tool_sequence_names[-1]

    # A tool without a compatibility entry (or with a non-string entry) simply
    # has no compatible successors, so nothing is reported for it.
    raw_compatible = compatible_tools.get(last_tool_name, "")
    compatible_names = set(raw_compatible.split(",")) if isinstance(raw_compatible, str) else set()

    sample = np.zeros(max_seq_len)
    for idx, tool_id in enumerate(tool_ids):
        sample[idx] = int(tool_id)
    sample_reshaped = np.reshape(sample, (1, max_seq_len))
    print(",".join(tool_sequence_names))

    # predict next tools for a test path
    prediction = model.predict(sample_reshaped, verbose=0)
    prediction = np.reshape(prediction, (prediction.shape[1],))

    # get topk prediction (argsort is ascending, so the best scores are last)
    topk_prediction_pos = np.argsort(prediction, axis=-1)[-topk:]

    predicted = dict()
    for tool_pos in topk_prediction_pos:
        # Positions with no dictionary entry (e.g. index 0, presumably padding --
        # TODO confirm) are skipped instead of raising KeyError.
        tool_name = reverse_dictionary.get(str(tool_pos))
        if tool_name is None or tool_name not in compatible_names:
            continue
        predicted[tool_name] = np.round(prediction[tool_pos] * 100, 2)
    pred_tool_ids_sorted = _sorted_by_value_desc(predicted)

    cls_wt = dict()
    usg_wt = dict()
    inv_wt = dict()
    for tool_name in pred_tool_ids_sorted:
        # Look up all three weights before storing any of them, so a tool that
        # lacks one weight is left out of every table and the averages below
        # are computed over the same set of tools.
        try:
            tool_key = str(data_dict[tool_name])
            weights = (class_weights[tool_key], usage_weights[tool_name], inverted_weights[tool_key])
        except KeyError:
            continue
        cls_wt[tool_name], usg_wt[tool_name], inv_wt[tool_name] = (np.round(weight, 2) for weight in weights)

    print("Predicted tools: \n")
    print(pred_tool_ids_sorted)
    print()
    print("Class weights: \n")
    cls_wt = _sorted_by_value_desc(cls_wt)
    print(cls_wt)
    print()
    print("Usage weights: \n")
    usg_wt = _sorted_by_value_desc(usg_wt)
    print(usg_wt)
    print()
    print("Inverted weights: \n")
    inv_wt = _sorted_by_value_desc(inv_wt)
    print(inv_wt)
    ids_tools = {tool_name: dictionary[tool_name] for tool_name in pred_tool_ids_sorted}
    print()
    print(ids_tools)
    print("======================================")

    ave_pred_wt = _mean_or_nan(pred_tool_ids_sorted.values())
    ave_cls_wt = _mean_or_nan(cls_wt.values())
    ave_usg_wt = _mean_or_nan(usg_wt.values())
    ave_inv_wt = _mean_or_nan(inv_wt.values())
    return ave_pred_wt, ave_cls_wt, ave_usg_wt, ave_inv_wt

# Four independent accumulators that get_predictions appends to, one entry per path.
ave_prediction_weights, ave_class_weights, ave_usage_weights, ave_inverted_weights = [], [], [], []

def get_predictions(model, dictionary, reverse_dictionary, compatibile_tools, max_paths=10000):
    """Run verify_model over the test paths and record the averaged weights.

    The test data is read from ``path_test_data`` with ``read_file`` and
    iterated as ``path -> labels`` items. For every processed path the four
    averages returned by ``verify_model`` are appended to the module-level
    lists ``ave_prediction_weights``, ``ave_class_weights``,
    ``ave_usage_weights`` and ``ave_inverted_weights``.

    Args:
        model: model passed through to ``verify_model``.
        dictionary: passed through to ``verify_model``.
        reverse_dictionary: passed through to ``verify_model``.
        compatibile_tools: passed through to ``verify_model`` (the misspelled
            name is kept for existing callers).
        max_paths: maximum number of paths to process; ``None`` processes all
            of them. The previous hard-coded counter stopped after 9999 paths
            although it compared against 10000; the cap is now exact.

    Returns:
        None. Results are appended to the module-level lists.
    """
    t_data = read_file(path_test_data)
    for processed, (ph, cl) in enumerate(t_data.items()):
        # ``processed`` counts the paths already handled, so the cap is exact
        # and ``max_paths=0`` processes nothing.
        if max_paths is not None and processed >= max_paths:
            break
        ave_pred_wt, ave_cls_wt, ave_usg_wt, ave_inv_wt = verify_model(model, ph, cl, dictionary, reverse_dictionary, compatibile_tools)
        ave_prediction_weights.append(ave_pred_wt)
        ave_class_weights.append(ave_cls_wt)
        ave_usage_weights.append(ave_usg_wt)
        ave_inverted_weights.append(ave_inv_wt)
# Single hand-picked path of tool ids to inspect.
tool_seq = "302,275" # the saved output of this cell resolves these ids to trimmomatic,hisat2
# The labels argument is passed as "" here; the call's return value is the cell's displayed result.
verify_model(model, tool_seq, "", dictionary, reverse_dictionary, compatibile_tools)
# Batch evaluation is disabled. While the next line stays commented out, the
# ave_* lists defined above are never appended to and remain empty.
#get_predictions(model, dictionary, reverse_dictionary, compatibile_tools)

trimmomatic,hisat2
Predicted tools: 

{'fastqc': 7.12, 'featurecounts': 6.87, 'multiqc': 4.51, 'htseq_count': 3.56, 'bam_to_sam': 1.54}

Class weights: 

{'fastqc': 6666.35, 'featurecounts': 2646.14, 'htseq_count': 1993.2, 'bam_to_sam': 1523.86, 'multiqc': 1315.05}

Usage weights: 

{'fastqc': 5879.0, 'featurecounts': 1191.25, 'htseq_count': 538.31, 'multiqc': 434.46, 'bam_to_sam': 68.97}

Inverted weights: 

{'featurecounts': 1454.89, 'htseq_count': 1454.89, 'bam_to_sam': 1454.89, 'multiqc': 880.59, 'fastqc': 787.35}

{'fastqc': 430, 'featurecounts': 38, 'multiqc': 597, 'htseq_count': 240, 'bam_to_sam': 334}


(4.72, 2828.92, 1622.398, 1206.5220000000002)

In [None]:
def plot_scatter(x_val, y_val, title, xlabel, ylabel):
    """Show an 8x8 inch plot of ``y_val`` against ``x_val`` drawn with the ``'ro'`` format.

    Axis labels use the module-level ``size_label`` font size and the title
    uses ``size_title``. A grid is enabled and the figure is shown immediately.
    """
    _, axes = plt.subplots(figsize=(8, 8))
    axes.plot(x_val, y_val, 'ro')
    axes.set_xlabel(xlabel, size=size_label)
    axes.set_ylabel(ylabel, size=size_label)
    axes.set_title(title, size=size_title)
    axes.grid(True)
    plt.show()
    
# One scatter plot per weight type, each against the averaged prediction scores.
for y_values, plot_title, y_label in (
    (ave_class_weights, "Prediction vs class weights", "Class weights"),
    (ave_usage_weights, "Prediction vs usage weights", "Usage weights"),
    (ave_inverted_weights, "Prediction vs inverted weights", "Inverted weights"),
):
    plot_scatter(ave_prediction_weights, y_values, plot_title, "Prediction scores", y_label)