# Tool recommendation
## (Convolutional neural network with weighted cross-entropy loss)

In [1]:
import numpy as np
import json
import warnings
import operator

import h5py
from keras.models import model_from_json
from keras import backend as K

warnings.filterwarnings("ignore")


def read_file(file_path):
    """
    Parse the JSON document stored at file_path and return the resulting object
    """
    with open(file_path, 'r') as handle:
        return json.load(handle)


def create_model(model_path):
    """
    Restore the trained weights into the already rebuilt Keras model

    Relies on module-level state that must exist before the call:
    `trained_model` (the open h5py file), `loaded_model` (the model rebuilt
    from its JSON config) and `dictionary` (tool name -> tool id).
    `model_path` is not used here; it is kept so existing calls keep working.

    Returns the model with its weights set, the tool dictionary and the
    reverse dictionary (tool id as a string -> tool name).
    """
    reverse_dictionary = {str(tool_id): name for name, tool_id in dictionary.items()}
    # weight arrays are stored under "weight_0", "weight_1", ...; count the
    # matching keys and fetch them by running number so they are appended in
    # numeric order regardless of the order in which the keys are iterated
    weight_count = sum(1 for key in trained_model.keys() if "weight_" in key)
    # Dataset.value was deprecated and later removed from h5py (3.0);
    # indexing with an empty tuple reads the whole dataset on every version
    model_weights = [
        trained_model["weight_" + str(weight_idx)][()]
        for weight_idx in range(weight_count)
    ]
    # set the model weights
    loaded_model.set_weights(model_weights)
    return loaded_model, dictionary, reverse_dictionary

def get_predicted_tools(base_tools, predictions, topk):
    """
    Keep only the predicted tools that also appear in the base tools

    base_tools: iterable of tool names a prediction must be among
    predictions: iterable of predicted tool names
    topk: maximum number of tools to return

    Prints the matching tools and returns at most topk of them, without
    duplicates and in the order in which they occur in predictions.
    Nothing is added from base_tools when fewer than topk predictions match.
    """
    allowed = set(base_tools)
    # dict.fromkeys drops duplicates while keeping first-seen order, so the
    # result does not depend on the arbitrary iteration order of a set
    intersection = [tool for tool in dict.fromkeys(predictions) if tool in allowed]
    print(intersection)
    print()
    return intersection[:topk]

def sort_by_usage(t_list, class_weights, d_dict):
    """
    Order tools from the highest to the lowest class weight

    t_list: tool names to rank
    class_weights: weights keyed by the tool id as a string
    d_dict: mapping of tool name to tool id

    Returns two parallel lists: the ranked tool names and their weights
    """
    usage = {name: class_weights[str(d_dict[name])] for name in t_list}
    ranked = sorted(usage.items(), key=operator.itemgetter(1), reverse=True)
    names = [name for name, _ in ranked]
    weights = [weight for _, weight in ranked]
    return names, weights

def separate_predictions(base_tools, predictions, last_tool_name, weight_values, topk):
    """
    Get predictions from published and normal workflows

    base_tools: mapping of a tool name to the tools that may follow it,
        given either as a list or as a comma-separated string
    predictions: array of scores, one per tool position
    last_tool_name: name of the last tool in the current sequence
    weight_values: array multiplied element-wise with predictions
    topk: number of highest-scoring positions to consider

    Uses the module-level reverse_dictionary, dictionary and class_weights.
    Returns the recommended tool names and their class weights, both
    ordered by decreasing class weight.
    """
    weighted = predictions * weight_values
    # argsort is ascending, so the last topk positions carry the highest scores
    topk_prediction_pos = np.argsort(weighted, axis=-1)[-topk:]
    # get tool ids; positions without an entry in reverse_dictionary (e.g.
    # position 0, which the printed mapping does not list) are skipped
    # instead of raising KeyError
    pred_tool_ids = [
        reverse_dictionary[str(tool_pos)]
        for tool_pos in topk_prediction_pos
        if str(tool_pos) in reverse_dictionary
    ]
    # get published or compatible tools for the last tool in a sequence of tools
    last_base_tools = list()
    if last_tool_name in base_tools:
        last_base_tools = base_tools[last_tool_name]
        if isinstance(last_base_tools, str):
            last_base_tools = last_base_tools.split(",")
    # get predicted tools
    p_tools = get_predicted_tools(last_base_tools, pred_tool_ids, topk)
    sorted_c_t, sorted_c_v = sort_by_usage(p_tools, class_weights, dictionary)
    return sorted_c_t, sorted_c_v

def compute_recommendations(model, tool_sequence, labels, dictionary, reverse_dictionary, class_weights, topk=10, max_seq_len=25):
    """
    Print the tools recommended to follow a sequence of tools

    model: trained model exposing predict()
    tool_sequence: comma-separated tool ids, e.g. "84,1204"
    labels: not used; kept so that existing calls keep working
    dictionary: mapping of tool name to tool id
    reverse_dictionary: mapping of tool id (as a string) to tool name
    class_weights: mapping whose values are multiplied into the scores
    topk: number of highest-scoring predictions considered per workflow type
    max_seq_len: length of the zero-padded input vector given to the model

    Reads the module-level standard_connections and compatible_tools.
    Prints the results and returns nothing.
    Raises KeyError for an unknown tool id and IndexError when the sequence
    holds more than max_seq_len tools.
    """
    # tolerate spaces around the commas, e.g. "84, 1204"
    tl_seq = [tool_id.strip() for tool_id in tool_sequence.split(",")]
    if len(tl_seq) > max_seq_len:
        raise IndexError(
            f"tool sequence has {len(tl_seq)} tools but max_seq_len is {max_seq_len}"
        )
    last_tool_name = reverse_dictionary[tl_seq[-1]]
    # the tool ids fill the start of the vector, the rest stays zero
    sample = np.zeros(max_seq_len)
    for idx, tool_id in enumerate(tl_seq):
        sample[idx] = int(tool_id)
    sample_reshaped = np.reshape(sample, (1, max_seq_len))
    weight_val = list(class_weights.values())
    weight_val = np.reshape(weight_val, (len(weight_val),))
    tool_sequence_names = [reverse_dictionary[tool_id] for tool_id in tl_seq]
    # predict next tools for a test path
    prediction = model.predict(sample_reshaped, verbose=0)
    nw_dimension = prediction.shape[1]
    prediction = np.reshape(prediction, (nw_dimension,))

    # the first half of the output is matched against standard_connections
    # (published workflows), the second half against compatible_tools
    half_len = nw_dimension // 2

    # the class weights returned with the tools are not displayed, so they
    # are discarded rather than deduplicated separately from the tool names
    pub_t, _ = separate_predictions(standard_connections, prediction[:half_len], last_tool_name, weight_val, topk)
    # get recommended tools from normal workflows
    c_t, _ = separate_predictions(compatible_tools, prediction[half_len:], last_tool_name, weight_val, topk)
    # combine predictions coming from different workflows
    # promote recommended tools coming from published workflows
    # to the top and then show other recommendations
    print()
    tool_seq_name = ",".join(tool_sequence_names)
    print("Current tool sequence: ")
    print()
    print(tool_seq_name)
    print()
    print("Overall recommendations: ")
    print()
    # remove duplicates if any, keeping the first occurrence
    recommended = list(dict.fromkeys(pub_t + c_t))
    print(recommended)

    ids_tools = {tool: dictionary[tool] for tool in recommended}
    print()
    print("Recommended tool ids:")
    print()
    for tool, tool_id in ids_tools.items():
        print(f"{tool}({tool_id})")

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Unpack trained model for prediction

In [2]:
model_path = "data/tool_recommendation_model.hdf5"
# the file stays open because create_model() reads the weight datasets from it
trained_model = h5py.File(model_path, 'r')
# Dataset.value was deprecated and later removed from h5py (3.0); indexing
# with an empty tuple reads the whole dataset on every h5py version
model_config = json.loads(trained_model['model_config'][()])
dictionary = json.loads(trained_model['data_dictionary'][()])
class_weights = json.loads(trained_model['class_weights'][()])
standard_connections = json.loads(trained_model['standard_connections'][()])
compatible_tools = json.loads(trained_model['compatible_tools'][()])
# rebuild the architecture from its JSON config, then load the stored weights
loaded_model = model_from_json(model_config)
model, dictionary, reverse_dictionary = create_model(model_path)

Instructions for updating:
Colocations handled automatically by placer.


## Indices of tools

In [3]:
# show the mapping of tool id (as a string) to tool name
print(reverse_dictionary)

{'1': 'bedtools_genomecoveragebed_bedgraph', '2': 'smooth_running_window', '3': 'bamFilter', '4': 'snpSift_annotate', '5': 'deeptools_correctGCBias', '6': 'cshl_word_list_grep', '7': 'infernal_cmsearch', '8': 'fasta_compute_length', '9': 'picard_ARRG', '10': 'cshl_multijoin', '11': 'samtools_mpileup', '12': 'bamCompare_deepTools', '13': 'Grep1', '14': 'scaffold2fasta', '15': 'wig_to_bigWig', '16': 'gemini_load', '17': 'vt_normalize', '18': 'rseqc_bam2wig', '19': 'EMBOSS: water107', '20': 'cshl_sort_header', '21': 'trim_galore', '22': 'samtools_flagstat', '23': 'proteomics_search_protein_prophet_1', '24': 'DatamashTranspose', '25': 'ncbi_tblastn_wrapper', '26': 'fastq_filter', '27': 'FeatureFinderMultiplex', '28': 'rsem_prepare_reference', '29': 'blastxml_to_tabular', '30': 'gtf2bedgraph', '31': 'samtools_sort', '32': 'ctb_filter', '33': 'ctb_online_data_fetch', '34': 'blastxml_to_top_descr', '35': 'deseq2_single', '36': 'freebayes', '37': 'CONVERTER_bed_to_bgzip_0', '38': 'gatk2_varian

## Recommended tools

In [6]:
########### 
####### Tools from training material
# Assembly: 
# (https://training.galaxyproject.org/training-material/topics/assembly/tutorials/debruijn-graph-assembly/tutorial.html)
# (504) Spades -> 'bandage_info', 'fasta-stats', 'bandage_image', 'fasta_filter_by_length', 'abricate', 'quast', 'mlst' ... 
# (1113) Velveth -> velvetg
# (https://training.galaxyproject.org/training-material/topics/assembly/tutorials/unicycler-assembly/tutorial.html)
# (35) Unicycler -> 'bandage_info', 'glimmer_build-icm', 'glimmer_knowlegde-based', 'bandage_image', 'transdecoder', 'minimap2', 'antismash', 'fasta_filter_by_length' ...

## Computational chemistry
# (84) ctb_remDuplicates -> ctb_remIons 
# (84,1204) ctb_remDuplicates,ctb_remIons -> 'ctb_chemfp_mol2fps', 'ctb_compound_convert'
# (84,1204,626) ctb_remDuplicates,ctb_remIons,ctb_chemfp_mol2fps -> 'ctb_chemfp_butina_clustering', 'ctb_simsearch', 'ctb_chemfp_nxn_clustering', 'comp1'
# (https://training.galaxyproject.org/training-material/topics/computational-chemistry/tutorials/cheminformatics/tutorial.html)

## RAD-seq
# (https://training.galaxyproject.org/training-material/topics/ecology/tutorials/ref-based-rad-seq/tutorial.html)
# (583) stacks_procrad -> 'bwa', 'bwa_wrapper', 'Grep1', 'stacks_denovomap', 'fastqc', 'fastq_filter'

## Epigenetics
# (https://training.galaxyproject.org/training-material/topics/epigenetics/tutorials/atac-seq/tutorial.html)
# (1344,775) cutadapt,bowtie2 -> 'samtools_flagstat', 'picard_MarkDuplicates', 'picard_AddOrReplaceReadGroups', 'macs2_callpeak', 'bg_sortmerna', 'multiqc', 'hisat2', 'trim_galore', 'bowtie2' ...
# (1344,775,292) cutadapt,bowtie2,picard_MarkDuplicates -> 'picard_ReorderSam', 'gatk4_mutect2', 'samtools_rmdup'
# (1344,775,292,473) cutadapt,bowtie2,picard_MarkDuplicates,genrich -> 'pygenomeTracks'
#
# (https://training.galaxyproject.org/training-material/topics/epigenetics/tutorials/methylation-seq/tutorial.html)
# (639) bwameth -> 'samtools_rmdup', 'samtools_sort', 'bam_to_sam', 'pileometh' ..
# (639,1451) bwameth,pileometh -> 'deeptools_compute_matrix', 'tp_sed_tool', 'Filter1', 'metilene', 'Remove beginning1', 'wig_to_bigWig'
# ()
# (775) bowtie2 -> 'samtools_flagstat', 'picard_MarkDuplicates', 'picard_AddOrReplaceReadGroups' ...
# (775,1231) bowtie2,deeptools_multi_bam_summary -> 'deeptools_plot_pca', 'r_correlation_matrix' ...
# (https://training.galaxyproject.org/training-material/topics/epigenetics/tutorials/formation_of_super-structures_on_xi/tutorial.html)

# (775,170) bowtie2,hicexplorer_hicbuildmatrix -> 'hicexplorer_hicsummatrices', 'hicexplorer_hicplotviewpoint', 'tp_sed_tool', 'hicexplorer_hiccorrectmatrix', 'hicexplorer_hicmergematrixbins', 'hicexplorer_hicpca' ..
# (775,170,669) bowtie2,hicexplorer_hicbuildmatrix,hicexplorer_hicmergematrixbins -> 'hicexplorer_hiccorrectmatrix', 'hicexplorer_hicplottads', 'hicexplorer_hicplotmatrix'
# (https://training.galaxyproject.org/training-material/topics/epigenetics/tutorials/hicexplorer/tutorial.html)

# (232) minfi_read450k -> 'minfi_getbeta'


## Genome annotation
# (https://training.galaxyproject.org/training-material/topics/genome-annotation/tutorials/annotation-with-maker/tutorial.html)
# (1157) maker -> 'gffread', 'maker_map_ids', 'jcvi_gff_stats'
# (https://training.galaxyproject.org/training-material/topics/genome-annotation/tutorials/annotation-with-prokka/tutorial.html)
# (94) prokka -> 'mlst', 'jbrowse', 'taxonomy_krona_chart' ...


## Imaging
# (https://training.galaxyproject.org/training-material/topics/imaging/tutorials/hela-screen-analysis/tutorial.html)
# (415) ip_filter_standard -> 'ip_histogram_equalization', 'ip_threshold', 'ip_count_objects'
# (415,309) ip_filter_standard,ip_threshold -> 'ip_binary_to_labelimage', 'ip_2d_split_binaryimage_by_watershed', 'ip_count_objects', 'ip_convertimage'
# (415,309,992) ip_filter_standard,ip_threshold,ip_2d_split_binaryimage_by_watershed -> 'ip_2d_filter_segmentation_by_features', 'ip_2d_feature_extraction'


## Mass spectrometry
# (630) mass_spectrometry_imaging_preprocessing -> 'mass_spectrometry_imaging_combine', 'mass_spectrometry_imaging_preprocessing'
# (630,1386) mass_spectrometry_imaging_preprocessing,mass_spectrometry_imaging_combine -> 'maldi_quant_preprocessing', 'mass_spectrometry_imaging_preprocessing', 'mass_spectrometry_imaging_qc' ...
# (711) search_gui -> peptide_shaker
# (711,1411) search_gui,peptide_shaker -> 'mz_to_sqlite', 'Remove beginning1', 'tp_replace_in_column', 'unipept', 'proteomics_moff' ...


## Single cell
# (531) raceid_main -> 'seurat','raceid_trajectory'
# (531,691) raceid_main,raceid_trajectory -> 'raceid_inspecttrajectory'
# (141,685) raceid_inspectclusters,__BUILD_LIST__ -> 'picard_MarkDuplicates', 'hisat2', 'stringtie_merge', 'cutadapt', 'tp_cat'
# (89) raceid_filtnormconf -> 'raceid_clustering', '__BUILD_LIST__'
# (89,270) raceid_filtnormconf,raceid_clustering -> 'raceid_trajectory', 'raceid_inspectclusters'
# (638) scanpy_regress_variable -> scanpy_scale_data 
# (638,739) scanpy_regress_variable,scanpy_scale_data -> 'scanpy_run_pca', 'scanpy_find_variable_genes'
# (638,739,505) scanpy_regress_variable,scanpy_scale_data,scanpy_run_pca -> 'scanpy_compute_graph', 'scanpy_plot', 'scanpy_run_tsne', 'scanpy_plot_embed'

# (https://training.galaxyproject.org/training-material/topics/transcriptomics/tutorials/scrna-preprocessing-tenx/tutorial.html)
# (728) rna_starsolo -> 'dropletutils', 'multiqc'
# (728,306) rna_starsolo,dropletutils -> 'scanpy_read_10x', 'scanpy_cluster_reduce_dimension', 'seurat_read10x', 'anndata_import', 'scanpy_plot', 'raceid_filtnormconf'


## Variant calling
# (https://training.galaxyproject.org/training-material/topics/variant-analysis/tutorials/microbial-variants/tutorial.html)
# (1076) snippy -> 'bedtools_intersectbed', 'vcfvcfintersect', 'Remove beginning1', 'snippy_core', 'qualimap_bamqc', 'freebayes', 'jbrowse', 'vcfcombine'

# https://training.galaxyproject.org/training-material/topics/variant-analysis/tutorials/somatic-variants/tutorial.html
# (57,712,991,629) trimmomatic,bwa_mem,samtools_rmdup,bamleftalign -> 'samtool_filter2', 'ivar_variants', 'freebayes', 'varscan_somatic', 'deeptools_bam_coverage', 'fastqc', 'ngsutils_bam_filter', 'bamFilter', 'rgPicFixMate', 'samtools_calmd'
# (57,712,991,629,273) trimmomatic,bwa_mem,samtools_rmdup,bamleftalign,varscan_somatic -> 'bcftools_norm', 'gemini_annotate', 'vt_normalize', 'vcffilter2', 'vcfallelicprimitives', 'snpEff'

# (https://training.galaxyproject.org/training-material/topics/variant-analysis/tutorials/dip/tutorial.html)
# (988) freebayes -> 'vcfallelicprimitives', 'bcftools_norm', 'custom_pro_db', 'vcfvcfintersect'
# (988,436) freebayes,vcfallelicprimitives -> 'vt_normalize', 'snpSift_filter', 'snpSift_annotate', 'snpEff'
# (988,436,32) freebayes,vcfallelicprimitives,snpEff -> 'gemini_load', 'mimodd_varreport', 'snpSift_extractFields'


# Transcriptomics
#(https://training.galaxyproject.org/training-material/topics/transcriptomics/tutorials/small_ncrna_clustering/tutorial.html)
# (1459,1473,1522) samtools_sort,blockclust,sort1 -> 'cshl_awk_tool', 'Show beginning1', 'tp_awk_tool', 'blockbuster'

# (https://training.galaxyproject.org/training-material/topics/transcriptomics/tutorials/ref-based/tutorial.html)
# (1344) cutadapt -> 'umi_tools_extract', 'fastq_paired_end_interlacer', 'chira_collapse', 'rna_star',
# (1344,484) cutadapt,rna_star -> 'featurecounts', 'multiqc', 'htseq_count', 'rseqc_infer_experiment', 'samtools_stats', 'bamFilter',
# (1344,484,1209) cutadapt,rna_star,featurecounts -> 'multiqc', 'deseq2', 'tp_sort_header_tool', 'bamFilter', 'collection_column_join' ...

# Single-cell HiC
# (508) schicexplorer_schicqualitycontrol -> 'schicexplorer_schicnormalize'
# (13) schicexplorer_schicnormalize -> 'schicexplorer_schicclustersvl', 'schicexplorer_schicconsensusmatrices', 'schicexplorer_schicplotclusterprofiles'
# (13,743) schicexplorer_schicnormalize,schicexplorer_schicclustersvl -> 'schicexplorer_schicplotclusterprofiles', 'schicexplorer_schicconsensusmatrices'


# Animal detection on acoustic recording
# (1224) vigiechiro_idvalid -> 'vigiechiro_bilanenrichipf', 'vigiechiro_bilanenrichirp'

# maximum number of recommendations
topk = 10
# comma-separated tool ids to get recommendations for; to know all the tool
# ids, please print the variable 'reverse_dictionary'
# further id sequences noted here: "980,1300,465,937,977" and (ctb) "45,244,180,379"
tool_seq = "142"
compute_recommendations(
    model,
    tool_seq,
    "",
    dictionary,
    reverse_dictionary,
    class_weights,
    topk=topk,
)

['sort1', 'addValue', 'wig_to_bigWig']

['join1', 'sort1', 'Filter1', 'Add_a_column1', 'addValue', 'Remove beginning1']


Current tool sequence: 

Cut1

Overall recommendations: 

['addValue', 'wig_to_bigWig', 'sort1', 'Filter1', 'join1', 'Add_a_column1', 'Remove beginning1']

Recommended tool ids:

addValue(234)
wig_to_bigWig(15)
sort1(175)
Filter1(152)
join1(83)
Add_a_column1(97)
Remove beginning1(102)
