# Tool recommendation 
## (Gated recurrent unit (GRU) neural network with weighted cross-entropy loss)

In [18]:
import numpy as np
import json
import warnings
import operator

import h5py
from keras.models import model_from_json
from keras import backend as K

warnings.filterwarnings("ignore")


def read_file(file_path):
    """Parse the JSON document stored at ``file_path`` and return the decoded object."""
    with open(file_path, 'r') as handle:
        return json.load(handle)


def create_model(model_path):
    """Load the stored weights into the already-instantiated model.

    NOTE: this function reads three notebook-level globals, which must be
    defined before it is called:

    * ``trained_model`` - the open HDF5 file holding ``weight_0``,
      ``weight_1``, ... datasets,
    * ``dictionary`` - mapping of tool name to tool index,
    * ``loaded_model`` - the model built from the stored configuration.

    Args:
        model_path: Path of the HDF5 model file. Not used by the body; kept
            so that existing calls keep working.

    Returns:
        Tuple ``(loaded_model, dictionary, reverse_dictionary)`` where
        ``reverse_dictionary`` maps ``str(tool index)`` to tool name.

    Raises:
        KeyError: If one of the expected ``weight_<i>`` datasets is missing.
    """
    reverse_dictionary = {str(tool_index): tool_name for tool_name, tool_index in dictionary.items()}
    # Only the number of weight datasets is taken from the key listing; the
    # datasets themselves are fetched by explicit name so they are applied in
    # numeric order (weight_0, weight_1, ...) whatever order the keys come in.
    n_weights = sum(1 for key in trained_model.keys() if key.startswith("weight_"))
    # `dataset[()]` reads the whole dataset; the older `dataset.value`
    # attribute no longer exists in h5py >= 3.0.
    model_weights = [trained_model["weight_" + str(weight_idx)][()] for weight_idx in range(n_weights)]
    # set the model weights
    loaded_model.set_weights(model_weights)
    return loaded_model, dictionary, reverse_dictionary


def compute_recommendations(model, tool_sequence, labels, dictionary, reverse_dictionary, class_weights, topk=20, max_seq_len=25):
    """Print the tools recommended to follow a sequence of tools.

    The sequence is encoded as a zero-padded vector of tool indices and scored
    by ``model``. The ``topk`` highest-scoring indices are kept and, of those,
    only the tools listed as compatible with the last tool of the sequence are
    printed. Scores are shown as an integer percentage of the highest score.

    NOTE: reads the notebook-level global ``compatible_tools`` (tool name ->
    comma-separated names of the tools that may follow it).

    Args:
        model: Object exposing ``predict(batch, verbose=0)``. It is called
            with an array of shape ``(1, max_seq_len)`` and its result is
            flattened along its second axis.
        tool_sequence: Comma-separated tool indices, e.g. ``"88,1319"``.
            Whitespace around the indices is ignored.
        labels: Not used by the body; kept for call compatibility.
        dictionary: Mapping of tool name to tool index.
        reverse_dictionary: Mapping of ``str(tool index)`` to tool name.
        class_weights: Not used (re-weighting of the scores is disabled);
            kept for call compatibility.
        topk: Number of highest-scoring indices considered before the
            compatibility filter, so at most ``topk`` tools are printed.
        max_seq_len: Length of the input vector expected by ``model``.

    Returns:
        None. The recommendations are written to standard output.

    Raises:
        ValueError: If the sequence holds more than ``max_seq_len`` tools.
        KeyError: If a tool index of the sequence is not in
            ``reverse_dictionary``.
    """
    # Tolerate spaces around the separators ("88, 1319").
    tool_ids = [tool_id.strip() for tool_id in tool_sequence.split(",")]
    if len(tool_ids) > max_seq_len:
        raise ValueError(
            "Tool sequence has %d tools but at most %d are supported" % (len(tool_ids), max_seq_len)
        )
    tool_sequence_names = [reverse_dictionary[tool_id] for tool_id in tool_ids]
    last_tool_name = tool_sequence_names[-1]

    # Tool indices go at the start of the vector; the remainder stays 0.
    sample = np.zeros((1, max_seq_len))
    sample[0, :len(tool_ids)] = [int(tool_id) for tool_id in tool_ids]

    # predict next tools for a test path
    prediction = model.predict(sample, verbose=0)
    prediction = np.reshape(prediction, (prediction.shape[1],))
    # Express every score relative to the best one; skip the division when
    # there is no positive maximum to avoid dividing by zero.
    max_score = float(np.max(prediction))
    if max_score > 0:
        prediction = prediction / max_score

    # argsort is ascending, so the topk best indices are the last ones.
    topk_prediction_pos = np.argsort(prediction, axis=-1)[-topk:]

    c_tools = set()
    if last_tool_name in compatible_tools:
        c_tools = set(compatible_tools[last_tool_name].split(","))

    tool_scores = dict()
    for tool_pos in topk_prediction_pos:
        # Indices without an entry in reverse_dictionary (e.g. 0, the value
        # used for padding) do not name a tool and are skipped.
        tool_name = reverse_dictionary.get(str(tool_pos))
        if tool_name is None or tool_name not in c_tools:
            continue
        tool_scores[tool_name] = int(prediction[tool_pos] * 100)
    ranked_tools = sorted(tool_scores.items(), key=lambda kv: kv[1], reverse=True)

    tool_seq_name = ",".join(tool_sequence_names)
    print("Current tool sequence: ")
    print()
    print(tool_seq_name)
    print()
    print("Recommended tools for the tool sequence '%s' with their scores in decreasing order:" % tool_seq_name)
    print()
    for tool_name, tool_score in ranked_tools:
        print(tool_name + "(" + str(tool_score) + "%)")
    print()
    print("Tool ids:")
    print()
    for tool_name, _ in ranked_tools:
        print(tool_name + "(" + str(dictionary[tool_name]) + ")")

## Unpack trained model for prediction

In [19]:
model_path = "data/tool_recommendation_bidi_model.hdf5"
# The handle is kept open on purpose: create_model() below reads the weight
# datasets from this `trained_model` global.
trained_model = h5py.File(model_path, 'r')
# `dataset[()]` reads the whole dataset. The older `dataset.value` attribute
# no longer exists in h5py >= 3.0; json.loads accepts both str and bytes.
model_config = json.loads(trained_model.get('model_config')[()])
dictionary = json.loads(trained_model.get('data_dictionary')[()])
class_weights = json.loads(trained_model.get('class_weights')[()])
compatible_tools = json.loads(trained_model.get('compatible_tools')[()])
loaded_model = model_from_json(model_config)

# create_model() relies on the globals defined above in this cell.
model, dictionary, reverse_dictionary = create_model(model_path)

## Indices of tools

In [20]:
# Show the complete index -> tool name mapping so that tool ids can be looked up.
print(reverse_dictionary)

{'1': 'CONVERTER_sam_to_unsorted_bam', '2': 'predict_pipeline', '3': 'fraggenescan', '4': 'ip_2d_filter_segmentation_by_features', '5': 'snpEff', '6': 'proteomics_moff', '7': 'scanpy_cluster_reduce_dimension', '8': 'scanpy_regress_variable', '9': 'get_sequences', '10': 'scanpy_plot_embed', '11': 'EMBOSS: revseq82', '12': 'hicexplorer_hicplotviewpoint', '13': 'sixgill_build', '14': 'macs2_filterdup', '15': 'canu', '16': 'modencode_peakcalling_macs2', '17': 'thermo_raw_file_converter', '18': 'mycrobiota-split-multi-otutable', '19': 'fastq_to_fasta_python', '20': 'mothur_make_design', '21': 'maxquant', '22': 'tp_cut_tool', '23': 'deg_annotate', '24': 'uniprot', '25': 'mycrobiota-correct-replicates', '26': 'qiime_pick_rep_set', '27': 'bio3d_rmsd', '28': 'phyml', '29': 'cp_export_to_spreadsheet', '30': 'mass_spectrometry_imaging_mzplots', '31': 'bedtools_multiintersectbed', '32': 'ncbi_blastp_wrapper', '33': 'scatterplot_rpy', '34': 'sickle', '35': 'Extractor', '36': 'genbank_to_gff', '37':

## Recommended tools

In [34]:
# Comma-separated ids of the tools already used, in order. Print the variable
# 'reverse_dictionary' to look up the id of every tool.
tool_seq = "88,1319"
# Maximum number of recommendations.
topk = 20
compute_recommendations(
    model,
    tool_seq,
    "",
    dictionary,
    reverse_dictionary,
    class_weights,
    topk=topk,
)

Current tool sequence: 

rna_star,freebayes

Recommended tools for the tool sequence 'rna_star,freebayes' with their scores in decreasing order:

Filter1(100%)
vcffixup(99%)
vcftools_compare(99%)
lofreq_filter(99%)
SnpEff-cds-report(99%)
vcffilter2(99%)
vcfcheck(99%)
vcftools_merge(99%)
vcfallelicprimitives(99%)
custom_pro_db(99%)
tp_sed_tool(99%)
vcfbedintersect(99%)
tp_tail_tool(99%)
vcfcombine(99%)
snpSift_annotate(99%)
bcftools_consensus(99%)
jbrowse(99%)
vcfvcfintersect(99%)
Remove beginning1(99%)
snpEff(99%)

Tool ids:

Filter1(511)
vcffixup(508)
vcftools_compare(898)
lofreq_filter(840)
SnpEff-cds-report(452)
vcffilter2(1424)
vcfcheck(599)
vcftools_merge(767)
vcfallelicprimitives(976)
custom_pro_db(397)
tp_sed_tool(1151)
vcfbedintersect(1216)
tp_tail_tool(630)
vcfcombine(136)
snpSift_annotate(185)
bcftools_consensus(1304)
jbrowse(101)
vcfvcfintersect(1080)
Remove beginning1(814)
snpEff(5)
