## Load model and compute recommended tools

In [7]:
import os
import numpy as np
import json
import h5py

def load_model(model_path):
    """Read the statistical tool-recommendation model stored in an HDF5 file.

    Every dataset read here is expected to hold a JSON document; each one is
    decoded with ``json.loads``.

    Args:
        model_path: Path of the HDF5 model file.

    Returns:
        A 6-tuple ``(paths, dictionary, rev_dict, c_tools, class_weights,
        standard_connections)`` holding the decoded ``multilabels_paths``,
        ``data_dictionary``, the inverse of ``data_dictionary`` (keyed by the
        string form of each value), ``compatible_next_tools``,
        ``class_weights`` and ``standard_connections`` datasets.

    Raises:
        KeyError: If one of the expected datasets is missing from the file.
    """
    def _read_json(h5_file, dataset_name):
        # `Dataset.value` was removed in h5py 3.0; indexing with `[()]` reads
        # the same content and works on older releases too.
        return json.loads(h5_file[dataset_name][()])

    # The context manager releases the file handle even when decoding fails.
    with h5py.File(model_path, 'r') as model:
        dictionary = _read_json(model, 'data_dictionary')
        paths = _read_json(model, 'multilabels_paths')
        c_tools = _read_json(model, 'compatible_next_tools')
        class_weights = _read_json(model, 'class_weights')
        standard_connections = _read_json(model, 'standard_connections')

    # Inverse lookup, keyed by the string form of each dictionary value.
    rev_dict = {str(value): key for key, value in dictionary.items()}
    return paths, dictionary, rev_dict, c_tools, class_weights, standard_connections

def predict_tools(dict_paths, d_dict, c_tools, class_weights, test_path="bowtie2"):
    """Look up the tools recorded as following a given tool sequence.

    Args:
        dict_paths: Maps a comma-joined string of tool ids to a comma-joined
            string of next-tool ids.
        d_dict: Maps each tool name to its id.
        c_tools: Unused; kept so existing calls keep working.
        class_weights: Unused; kept so existing calls keep working.
        test_path: Comma-separated tool names forming the query sequence.

    Returns:
        ``(predicted_tools, pred_names)``: the next-tool ids as strings and
        the matching tool names, in stored order. Both lists are empty when
        the sequence is not a key of ``dict_paths`` or its stored value is
        empty.

    Raises:
        KeyError: If a name in ``test_path`` is missing from ``d_dict``, or a
            predicted id has no matching name.
    """
    # Build the id -> name map from the argument rather than relying on a
    # `rev_dict` global, so the function works on a fresh kernel as well.
    id_to_name = {str(tool_id): name for name, tool_id in d_dict.items()}

    path_key = ",".join(str(d_dict[name]) for name in test_path.split(","))

    # Direct lookup replaces the scan over every key of dict_paths.
    joined_ids = dict_paths.get(path_key)
    # "".split(",") would give [''], which is not a real id; treat an empty
    # stored value as "no predictions".
    predicted_tools = joined_ids.split(",") if joined_ids else []

    pred_names = [id_to_name[tool_id] for tool_id in predicted_tools]
    return predicted_tools, pred_names

def sort_by_wt(t_list):
    """Rank tool names by class weight, highest first, and keep the top ones.

    Reads ``d_dict``, ``class_weights`` and ``topk`` from module scope.

    Args:
        t_list: Tool names to rank; repeated names are counted once.

    Returns:
        At most ``topk`` names in descending weight order. Names with equal
        weight keep the order of their first appearance in ``t_list``.
    """
    # Name -> weight; the weight table is keyed by the string form of the id.
    weight_of = {name: class_weights[str(d_dict[name])] for name in t_list}

    # The sort is stable, so ties stay in insertion order.
    ranked = sorted(weight_of, key=weight_of.get, reverse=True)
    return ranked[:topk]

def sort_recommended_tools(pred_names):
    """Order predicted tools: standard connections first, then the rest.

    Reads ``test_path`` and ``standard_connections`` from module scope and
    ranks each group with ``sort_by_wt``.

    Args:
        pred_names: Predicted tool names.

    Returns:
        The ranked standard-connection tools followed by the ranked remaining
        tools. When ``test_path`` has no entry in ``standard_connections``,
        the whole list is ranked as one group.
    """
    if test_path not in standard_connections:
        return sort_by_wt(pred_names)

    s_conn = set(standard_connections[test_path])

    # Partition while keeping the incoming order. The previous
    # list(set(...)) round-trip made the order of equal-weight tools depend
    # on string hash randomisation, so ties could change between kernels.
    unique_names = list(dict.fromkeys(pred_names))
    s_pred = [name for name in unique_names if name in s_conn]
    n_pred = [name for name in unique_names if name not in s_conn]

    # NOTE(review): each group is cut by sort_by_wt separately, so the
    # combined list can be longer than one group's limit. Confirm intended.
    s_pred_sorted = sort_by_wt(s_pred)
    s_pred_sorted.extend(sort_by_wt(n_pred))
    return s_pred_sorted

In [8]:
# Relative path of the serialized statistical model.
model_path = "data/tool_recommendation_model_statistical_model.hdf5"
# These names live at module scope: sort_by_wt reads d_dict and class_weights,
# and sort_recommended_tools reads standard_connections, as globals.
dict_paths, d_dict, rev_dict, c_tools, class_weights, standard_connections = load_model(model_path)

# get recommendations
topk = 10 # maximum number of tools sort_by_wt keeps in each ranked list


## Example tools

In [9]:
# Assembly: 
# (https://training.galaxyproject.org/training-material/topics/assembly/tutorials/debruijn-graph-assembly/tutorial.html)
# spades -> 'bandage_info', 'fasta-stats', 'bandage_image', 'fasta_filter_by_length', 'abricate', 'quast', 'mlst' ... 
# velveth -> velvetg
# (https://training.galaxyproject.org/training-material/topics/assembly/tutorials/unicycler-assembly/tutorial.html)
# unicycler -> 'bandage_info', 'glimmer_build-icm', 'glimmer_knowlegde-based', 'bandage_image', 'transdecoder', 'minimap2', 'antismash', 'fasta_filter_by_length' ...


## Computational chemistry
# ctb_remDuplicates -> ctb_remIons 
# ctb_remDuplicates,ctb_remIons -> 'ctb_chemfp_mol2fps', 'ctb_compound_convert'
# ctb_remDuplicates,ctb_remIons,ctb_chemfp_mol2fps -> 'ctb_chemfp_butina_clustering', 'ctb_simsearch', 'ctb_chemfp_nxn_clustering', 'comp1'
# (https://training.galaxyproject.org/training-material/topics/computational-chemistry/tutorials/cheminformatics/tutorial.html)


## RAD-seq
# (https://training.galaxyproject.org/training-material/topics/ecology/tutorials/ref-based-rad-seq/tutorial.html)
# stacks_procrad -> 'bwa', 'bwa_wrapper', 'Grep1', 'stacks_denovomap', 'fastqc', 'fastq_filter'


## Epigenetics
# (https://training.galaxyproject.org/training-material/topics/epigenetics/tutorials/atac-seq/tutorial.html)
# cutadapt,bowtie2 -> 'samtools_flagstat', 'picard_MarkDuplicates', 'picard_AddOrReplaceReadGroups', 'macs2_callpeak', 'bg_sortmerna', 'multiqc', 'hisat2', 'trim_galore', 'bowtie2' ...
# cutadapt,bowtie2,picard_MarkDuplicates -> 'picard_ReorderSam', 'gatk4_mutect2', 'samtools_rmdup'
# cutadapt,bowtie2,picard_MarkDuplicates,genrich -> 'pygenomeTracks'
#
# (https://training.galaxyproject.org/training-material/topics/epigenetics/tutorials/methylation-seq/tutorial.html)
# bwameth -> 'samtools_rmdup', 'samtools_sort', 'bam_to_sam', 'pileometh' ..
# bwameth,pileometh -> 'deeptools_compute_matrix', 'tp_sed_tool', 'Filter1', 'metilene', 'Remove beginning1', 'wig_to_bigWig'
#
# bowtie2 -> 'samtools_flagstat', 'picard_MarkDuplicates', 'picard_AddOrReplaceReadGroups' ...
# bowtie2,deeptools_multi_bam_summary -> 'deeptools_plot_pca', 'r_correlation_matrix' ...
# (https://training.galaxyproject.org/training-material/topics/epigenetics/tutorials/formation_of_super-structures_on_xi/tutorial.html)

# bowtie2,hicexplorer_hicbuildmatrix -> 'hicexplorer_hicsummatrices', 'hicexplorer_hicplotviewpoint', 'tp_sed_tool', 'hicexplorer_hiccorrectmatrix', 'hicexplorer_hicmergematrixbins', 'hicexplorer_hicpca' ..
# bowtie2,hicexplorer_hicbuildmatrix,hicexplorer_hicmergematrixbins -> 'hicexplorer_hiccorrectmatrix', 'hicexplorer_hicplottads', 'hicexplorer_hicplotmatrix'
# (https://training.galaxyproject.org/training-material/topics/epigenetics/tutorials/hicexplorer/tutorial.html)

# minfi_read450k -> 'minfi_getbeta'


## Genome annotation
# (https://training.galaxyproject.org/training-material/topics/genome-annotation/tutorials/annotation-with-maker/tutorial.html)
# maker -> 'gffread', 'maker_map_ids', 'jcvi_gff_stats'
# (https://training.galaxyproject.org/training-material/topics/genome-annotation/tutorials/annotation-with-prokka/tutorial.html)
# prokka -> 'mlst', 'jbrowse', 'taxonomy_krona_chart' ...


## Imaging
# (https://training.galaxyproject.org/training-material/topics/imaging/tutorials/hela-screen-analysis/tutorial.html)
# ip_filter_standard -> 'ip_histogram_equalization', 'ip_threshold', 'ip_count_objects'
# ip_filter_standard,ip_threshold -> 'ip_binary_to_labelimage', 'ip_2d_split_binaryimage_by_watershed', 'ip_count_objects', 'ip_convertimage'
# ip_filter_standard,ip_threshold,ip_2d_split_binaryimage_by_watershed -> 'ip_2d_filter_segmentation_by_features', 'ip_2d_feature_extraction'


## Mass spectrometry
# mass_spectrometry_imaging_preprocessing -> 'Cut1', 'tp_easyjoin_tool', 'tp_sort_header_tool' ...
# mass_spectrometry_imaging_preprocessing,mass_spectrometry_imaging_combine -> 'maldi_quant_preprocessing', 'mass_spectrometry_imaging_preprocessing' ...
# search_gui -> peptide_shaker
# search_gui,peptide_shaker -> 'Filter1', 'Grep1', 'tp_easyjoin_tool', 'Remove beginning1', 'query_tabular' ...

## Single cell
# (https://training.galaxyproject.org/training-material/topics/transcriptomics/tutorials/scrna-preprocessing-tenx/tutorial.html)
# rna_starsolo -> 'dropletutils', 'multiqc'
# rna_starsolo,dropletutils -> 'Convert characters1', 'scanpy_plot', 'scanpy_cluster_reduce_dimension'


## Variant calling
# (https://training.galaxyproject.org/training-material/topics/variant-analysis/tutorials/microbial-variants/tutorial.html)
# snippy -> 'bedtools_intersectbed', 'freebayes' ...

# (https://training.galaxyproject.org/training-material/topics/variant-analysis/tutorials/somatic-variants/tutorial.html)
# trimmomatic,bwa_mem,samtools_rmdup,bamleftalign -> 'fastqc', 'deeptools_bam_coverage', 'freebayes', 'samtool_filter2' ...

# (https://training.galaxyproject.org/training-material/topics/variant-analysis/tutorials/dip/tutorial.html)
# freebayes -> 'bcftools_norm', 'vcfallelicprimitives', 'custom_pro_db' ...
# freebayes,vcfallelicprimitives -> 'snpEff', 'snpSift_filter', 'vt_normalize', 'snpSift_annotate' 
# freebayes,vcfallelicprimitives,snpEff -> 'Add_a_column1', 'freebayes', 'vcf2tsv', 'samtools_mpileup', 'vcffilter2' ...


## Transcriptomics
# (https://training.galaxyproject.org/training-material/topics/transcriptomics/tutorials/ref-based/tutorial.html)
# cutadapt -> 'fastqc', 'rna_star', 'multiqc', 'fastq_paired_end_interlacer' ...
# cutadapt,rna_star -> 'fastqc', 'featurecounts', 'multiqc', 'deeptools_bam_coverage',
# cutadapt,rna_star,featurecounts -> 'fastqc', 'bowtie2', 'multiqc', 'trim_galore' ...


## Single-cell HiC
# schicexplorer_schicqualitycontrol -> 'schicexplorer_schicnormalize'

## Recommended tools

In [10]:
# Tool sequence to query: comma-separated tool names (a single tool here).
test_path = "mass_spectrometry_imaging_preprocessing"
pred_ids, pred_names = predict_tools(dict_paths, d_dict, c_tools, class_weights, test_path)
# sort_recommended_tools reads test_path from module scope, so it has to be
# assigned before this call.
s_pred_sorted = sort_recommended_tools(pred_names)
print(s_pred_sorted)

['Cut1', 'tp_easyjoin_tool', 'tp_sort_header_tool', 'collection_column_join', 'mass_spectrometry_imaging_preprocessing', 'mass_spectrometry_imaging_mzplots', 'mass_spectrometry_imaging_qc', 'mass_spectrometry_imaging_combine', 'mass_spectrometry_imaging_segmentations', 'mass_spectrometry_imaging_ion_images']
