# Tool recommendation with a bidirectional GRU network
## (Bidirectional GRU neural network with weighted cross-entropy loss)

In [41]:
import numpy as np
import json
import warnings
import operator

import tensorflow as tf
from keras import backend as K
import h5py

warnings.filterwarnings("ignore")


def load_model(model_path):
    """Load the trained model and its JSON metadata from one HDF5 file.

    The file is expected to hold the Keras weights plus five JSON-encoded
    datasets: ``model_config``, ``data_dictionary``, ``best_parameters``,
    ``compatible_tools`` and ``class_weights``.

    Args:
        model_path: Path of the HDF5 file to read.

    Returns:
        A tuple ``(dictionary, reverse_dictionary, best_parameters,
        compatible_tools, model, class_weights)`` where ``reverse_dictionary``
        maps ``str(value)`` of every ``dictionary`` entry back to its key.

    Raises:
        KeyError: If one of the expected datasets is missing from the file.
    """
    metadata_names = (
        'model_config',
        'data_dictionary',
        'best_parameters',
        'compatible_tools',
        'class_weights',
    )
    # The context manager releases the HDF5 handle even if parsing fails.
    with h5py.File(model_path, 'r') as trained_model:
        # `dataset[()]` reads the whole dataset; unlike the `.value`
        # attribute it is also available in h5py 3.x.
        metadata = {
            name: json.loads(trained_model[name][()])
            for name in metadata_names
        }

    model = tf.keras.models.model_from_json(metadata['model_config'])
    model.load_weights(model_path)

    dictionary = metadata['data_dictionary']
    best_parameters = metadata['best_parameters']
    compatible_tools = metadata['compatible_tools']
    class_weights = metadata['class_weights']
    reverse_dictionary = dict((str(v), k) for k, v in dictionary.items())
    return dictionary, reverse_dictionary, best_parameters, compatible_tools, model, class_weights


# Location of the trained model file; its metadata is unpacked into
# module-level names used by the cells below.
model_path = "data/tool_recommendation_bidirectional_model.hdf5"
(
    dictionary,
    reverse_dictionary,
    best_parameters,
    compatible_tools,
    new_model,
    class_weights,
) = load_model(model_path)

## Unpack trained model for prediction

In [42]:
# Show the id -> tool name mapping so tool ids can be looked up when building a sequence.
print(reverse_dictionary)

{'1': 'FalseDiscoveryRate', '2': 'gatk2_variant_apply_recalibration', '3': 'bgchem_fragment_merger', '4': 'gatk2_indel_realigner', '5': 'rseqc_infer_experiment', '6': 'blast2go', '7': 'proteomics_search_protein_prophet_1', '8': 'blastxml_to_tabular', '9': 'CONVERTER_interval_to_bgzip_0', '10': 'cshl_fastx_artifacts_filter', '11': 'bedtools_bamtofastq', '12': 'EMBOSS: transeq101', '13': 'ctb_pubchem_download_as_smiles', '14': 'picard_NormalizeFasta', '15': 'vcffilter', '16': 'CONVERTER_gff_to_bed_0', '17': 'deeptools_bamCoverage', '18': 'cufflinks', '19': 'picard_ReorderSam', '20': 'openms_protein_quantifier', '21': 'XTandemAdapter', '22': 'Add_a_column1', '23': 'samtools_mpileup', '24': 'IDPosteriorErrorProbability', '25': 'cshl_fastq_to_fasta', '26': 'peakcalling_macs14', '27': 'secretbt2test', '28': 'cshl_sed_tool', '29': 'FileMerger', '30': 'EMBOSS: water107', '31': 'rseqc_bam2wig', '32': 'picard_CollectInsertSizeMetrics', '33': 'gff_to_sequence', '34': 'sam_bw_filter', '35': 'fastq

In [43]:
def compute_recommendations(tool_sequence, labels, topk=20, max_seq_len=25):
    """Print the tools recommended after a sequence of tools.

    The sequence is encoded as a zero-padded vector of length ``max_seq_len``
    and scored with the module-level ``new_model``. Scores are scaled so the
    best one is 100%, the ``topk`` highest-scoring positions are kept, and
    only those listed in ``compatible_tools`` for the last tool of the
    sequence are reported. The printed lists can therefore hold fewer than
    ``topk`` entries, or none at all.

    Args:
        tool_sequence: Comma-separated tool ids (keys of
            ``reverse_dictionary``), e.g. ``"1,5,12"``. Whitespace around the
            ids and empty items (such as a trailing comma) are ignored.
        labels: Unused; kept so existing calls keep working.
        topk: Maximum number of candidate tools to consider. Values <= 0
            yield no recommendations.
        max_seq_len: Length of the input vector passed to the model.

    Returns:
        None. The sequence, the recommendations and their ids are printed.

    Raises:
        ValueError: If the sequence is empty, is longer than ``max_seq_len``,
            or contains an id that is not an integer.
        KeyError: If a tool id is not present in ``reverse_dictionary``.
    """
    tool_ids = [item.strip() for item in tool_sequence.split(",") if item.strip()]
    if not tool_ids:
        raise ValueError("tool_sequence must contain at least one tool id")
    if len(tool_ids) > max_seq_len:
        # Fail early with a clear message instead of an IndexError while
        # filling the fixed-size input vector.
        raise ValueError(
            "tool_sequence has %d tools, which exceeds max_seq_len=%d"
            % (len(tool_ids), max_seq_len)
        )

    last_tool_name = reverse_dictionary[tool_ids[-1]]
    # Tool ids occupy the first positions; the rest of the vector stays 0.
    sample = np.zeros(max_seq_len)
    for idx, tool_id in enumerate(tool_ids):
        sample[idx] = int(tool_id)
    sample_reshaped = np.reshape(sample, (1, max_seq_len))
    tool_sequence_names = [reverse_dictionary[tool_id] for tool_id in tool_ids]

    # predict next tools for the given path
    prediction = new_model.predict(sample_reshaped, verbose=0)
    prediction = np.reshape(prediction, (prediction.shape[1],))
    # NOTE: scores are not rescaled by `class_weights`.
    max_score = float(np.max(prediction))
    if max_score != 0:
        # Skip the scaling for an all-zero prediction to avoid NaN scores.
        prediction = prediction / max_score
    prediction_pos = np.argsort(prediction, axis=-1)

    # get topk prediction; `[-0:]` would select everything, so guard topk <= 0
    topk_prediction_pos = prediction_pos[-topk:] if topk > 0 else prediction_pos[:0]
    # Position 0 is never reported (presumably the padding index — TODO confirm).
    topk_prediction_pos = [item for item in topk_prediction_pos if item != 0]
    topk_prediction_val = [int(prediction[pos] * 100) for pos in topk_prediction_pos]

    # Keep only candidates declared compatible with the last tool.
    c_tools = list()
    if last_tool_name in compatible_tools:
        c_tools = compatible_tools[last_tool_name].split(",")
    pred_tool_ids_sorted = dict()
    for (tool_pos, tool_pred_val) in zip(topk_prediction_pos, topk_prediction_val):
        # read tool names using reverse dictionary
        tool_name = reverse_dictionary[str(tool_pos)]
        if tool_name in c_tools:
            pred_tool_ids_sorted[tool_name] = tool_pred_val
    pred_tool_ids_sorted = dict(sorted(pred_tool_ids_sorted.items(), key=lambda kv: kv[1], reverse=True))

    tool_seq_name = ",".join(tool_sequence_names)
    print("Current tool sequence: ")
    print()
    print(tool_seq_name)
    print()
    print("Recommended tools for the tool sequence '%s' with their scores in decreasing order:" % tool_seq_name)
    print()
    for i in pred_tool_ids_sorted:
        print(i + "(" + str(pred_tool_ids_sorted[i]) + "%)")
    ids_tools = dict()
    for key in pred_tool_ids_sorted:
        ids_tools[key] = dictionary[key]
    print()
    print("Tool ids:")
    print()
    for i in ids_tools:
        print(i + "(" + str(ids_tools[i]) + ")")

## Indices of tools

In [44]:
# Maximum number of recommendations to show.
topk = 20
# Comma-separated tool ids of the current sequence; print the variable
# 'reverse_dictionary' to look up the id of every tool.
tool_seq = "1"
compute_recommendations(tool_seq, labels="", topk=topk)

Current tool sequence: 

FalseDiscoveryRate

Recommended tools for the tool sequence 'FalseDiscoveryRate' with their scores in decreasing order:


Tool ids:



## Recommended tools

In [45]:
# `summary()` prints the layer table itself and returns None, so wrapping it
# in print() would emit a stray "None" line after the table.
new_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 25)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 25, 212)      65720       input_1[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d (SpatialDropo (None, 25, 212)      0           embedding[0][0]                  
__________________________________________________________________________________________________
bidirectional (Bidirectional)   [(None, 25, 344), (N 398352      spatial_dropout1d[0][0]          
______________________________________________________________________________________________

In [46]:
# Show the `best_parameters` metadata that was read from the model file.
print(best_parameters)

{'embedding_size': 212, 'gru_units': 172, 'spatial_dropout': 0.15000000000000002, 'dropout': 0.45, 'recurrent_dropout': 0.4, 'learning_rate': 0.0036164835628463956, 'batch_size': 32, 'max_len': 25, 'dimensions': 310}
