## Load model and compute recommended tools

In [62]:
import os
import numpy as np
import json
import h5py

def load_model(model_path):
    """Load the statistical tool-recommendation model from an HDF5 file.

    The file is expected to hold four datasets, each storing a JSON-encoded
    string: ``data_dictionary``, ``multilabels_paths``,
    ``compatible_next_tools`` and ``class_weights``.

    Args:
        model_path: Path to the HDF5 model file.

    Returns:
        A tuple ``(paths, dictionary, rev_dict, c_tools, class_weights)``
        where the first, second, fourth and fifth items are the decoded JSON
        objects and ``rev_dict`` maps ``str(value) -> key`` for every entry
        of ``dictionary``.

    Raises:
        KeyError: If one of the expected datasets is missing from the file.
    """
    def _read_json(handle, name):
        # ``dataset[()]`` reads the whole (scalar) dataset; it replaces the
        # ``.value`` attribute that was removed in h5py 3.0.  ``json.loads``
        # accepts both ``str`` and ``bytes`` payloads.
        return json.loads(handle[name][()])

    # The context manager guarantees the file handle is released even if
    # decoding one of the datasets fails.
    with h5py.File(model_path, 'r') as model:
        dictionary = _read_json(model, 'data_dictionary')
        paths = _read_json(model, 'multilabels_paths')
        c_tools = _read_json(model, 'compatible_next_tools')
        class_weights = _read_json(model, 'class_weights')

    # Values are stringified so the reverse lookup works with the string ids
    # obtained by splitting comma-separated id lists.
    rev_dict = {str(v): k for k, v in dictionary.items()}
    return paths, dictionary, rev_dict, c_tools, class_weights

def predict_tools(dict_paths, d_dict, c_tools, class_weights, test_path="bowtie2"):
    """Look up the tools recommended after a given tool sequence.

    Args:
        dict_paths: Mapping whose keys are comma-joined tool ids (a path) and
            whose values are comma-joined ids of the recommended next tools.
        d_dict: Mapping from tool name to its id.
        c_tools: Unused; kept so existing callers keep working.
        class_weights: Unused; kept so existing callers keep working.
        test_path: Comma-separated tool names forming the path to query.

    Returns:
        A tuple ``(predicted_tools, pred_names)``: the recommended tool ids
        (as strings) and the corresponding tool names, in the same order.
        Both lists are empty when the path is not present in ``dict_paths``.

    Raises:
        KeyError: If a name in ``test_path`` is not in ``d_dict``, or a
            predicted id has no matching entry in ``d_dict``.
    """
    # Encode the query the same way the keys of ``dict_paths`` are encoded.
    p_num = ",".join(str(d_dict[t]) for t in test_path.split(","))

    encoded_next = dict_paths.get(p_num)
    predicted_tools = encoded_next.split(",") if encoded_next is not None else []

    # Build the id -> name lookup from the argument instead of relying on a
    # ``rev_dict`` global, so the function works wherever it is called from.
    id_to_name = {str(idx): name for name, idx in d_dict.items()}
    pred_names = [id_to_name[tool] for tool in predicted_tools]
    return predicted_tools, pred_names

In [123]:
# Location of the trained statistical model and the tool (or comma-separated
# tool sequence) for which recommendations are requested.
model_path = "data/tool_recommendation_model_statistical_model.hdf5"
test_path = "umi_tools_extract"

# Unpack everything the model file provides; later cells reuse these names.
(
    dict_paths,
    d_dict,
    rev_dict,
    c_tools,
    class_weights,
) = load_model(model_path)

# Ids and names of the tools recommended after ``test_path``.
pred_ids, pred_names = predict_tools(
    dict_paths,
    d_dict,
    c_tools,
    class_weights,
    test_path=test_path,
)

## Fetch top recommended tools (sorted in descending order based on their usage)

In [124]:
topk = 20  # this specifies how many top recommended tools are computed

# Pair each predicted tool name with its class weight.
c_wt_names = {rev_dict[tool_id]: class_weights[tool_id] for tool_id in pred_ids}

# Order the (name, weight) pairs from the highest weight to the lowest.
sorted_pred_tools = sorted(
    c_wt_names.items(),
    key=lambda name_weight: name_weight[1],
    reverse=True,
)

# Keep only the names, preserving the ranking.
sorted_names = [name for name, _weight in sorted_pred_tools]

## Top recommended tools

In [125]:
# Show at most ``topk`` names from the ranked list.
top_recommended = sorted_names[:topk]
print(top_recommended)

['fastqc', 'bowtie2', 'rna_star', 'bwa', 'je_markdupes', 'je_demultiplex', 'umi_tools_group']


In [93]:
# Display the tool-name -> id dictionary returned by load_model.
d_dict

{'bedtools_windowbed': 1,
 'bg_diamond': 2,
 'ip_histogram_equalization': 3,
 'rseqc_junction_annotation': 4,
 'glob_report': 5,
 'vcfbedintersect': 6,
 'scanpy_run_umap': 7,
 'augustus': 8,
 'nn_classifier': 9,
 'ip_landmark_registration': 10,
 'mothur_get_communitytype': 11,
 'bedtools_slopbed': 12,
 'gd_calc_freq': 13,
 'goseq': 14,
 'extract_aln_ends.py': 15,
 'join1': 16,
 'mothur_get_groups': 17,
 'gatk2_variant_annotator': 18,
 'bio3d_rmsd': 19,
 'sklearn_searchcv': 20,
 'Flash': 21,
 'bedtools_intersectBed': 22,
 'biosigner': 23,
 'filter_by_fasta_ids': 24,
 'regionalgam_ab_index': 25,
 'FROGS_preprocess': 26,
 'ucsc_cell_browser': 27,
 'mothur_tree_shared': 28,
 'annotateMyIDs': 29,
 'varscan_somatic': 30,
 'rseqc_insertion_profile': 31,
 'hisat2': 32,
 'mtbls520_19d_seasons_concentration': 33,
 'tp_sorted_uniq': 34,
 'msconvert_win': 35,
 'flexbar': 36,
 'preMloc': 37,
 'gemini_burden': 38,
 'mtbls520_07_species_diversity': 39,
 'picard_CollectRnaSeqMetrics': 40,
 'graphclust