In [36]:
import csv
import random


def format_tool_id(tool_link):
    """
    Return the short tool id contained in a tool link.

    The link is treated as "/"-separated segments and the second-to-last
    segment is returned. A link that contains no "/" is returned unchanged.
    """
    if "/" not in tool_link:
        return tool_link
    # Only the last two separators matter for picking the second-to-last segment.
    segments = tool_link.rsplit("/", 2)
    return segments[-2]

def read_workflow(wf_id, workflow_rows):
    """
    Group the connections of one workflow into a tool -> parents mapping.

    Every connection is read as a pair: the value at index 1 becomes a key of
    the result and the value at index 0 is added to that key's list, once, in
    first-seen order. `wf_id` is accepted but not used.
    """
    parents_by_tool = {}
    for row in workflow_rows:
        source, target = row[0], row[1]
        known_parents = parents_by_tool.setdefault(target, [])
        # keep each parent only once per tool
        if source not in known_parents:
            known_parents.append(source)
    return parents_by_tool

def get_roots_leaves(graph):
    """
    Find the root and leaf nodes of a graph given as {node: [parents]}.

    A root is a node that appears in some parents list but is not a key of
    `graph`; a leaf is a key of `graph` that appears in no parents list.

    Returns a tuple (roots, leaves) of two lists without duplicates. Both
    lists are in first-seen order, so the result is the same on every run
    (set-derived ordering would change with string hash randomisation).
    Nodes must be hashable.
    """
    # dict.fromkeys de-duplicates while keeping the order of first occurrence.
    all_parents = dict.fromkeys(
        parent for parents in graph.values() for parent in parents
    )
    roots = [parent for parent in all_parents if parent not in graph]
    leaves = [child for child in graph if child not in all_parents]
    return roots, leaves

def find_tool_paths_workflow(graph, start, end, path=[]):
    """
    Enumerate all paths that lead from `end` to `start` through `graph`.

    The walk begins at `end` and repeatedly steps to the nodes listed under
    the current node in `graph`; a node already on the current path is never
    revisited. Each returned path is a list made of the given `path` prefix
    followed by `end`, the intermediate nodes, and finally `start`.
    An empty list is returned when `start` cannot be reached.
    """
    # Concatenation builds a new list, so the caller's list (and the shared
    # default) is never modified.
    walked = path + [end]
    if start == end:
        return [walked]
    collected = list()
    for neighbour in graph.get(end, ()):
        if neighbour in walked:
            continue
        collected.extend(find_tool_paths_workflow(graph, start, neighbour, walked))
    return collected

def read_tabular_file(raw_file_path):
    """
    Read a tab-separated workflow connections file and count tools and paths.

    For every row, column 0 is used as the workflow id and columns 3 and 6
    hold the tool links of the two connected steps; both links are shortened
    with format_tool_id. Blank lines are skipped.

    Returns a tuple (tool_frequency, paths_freq):
      - tool_frequency maps each non-empty tool id to the number of times it
        occurs in columns 3 and 6 over all rows.
      - paths_freq maps each tool path, written as comma-joined tool ids, to
        the number of times that path was found over all workflows.
    """
    print("Reading workflows...")
    workflows = {}
    tool_frequency = {}
    # newline='' is what the csv module documentation asks for, so that the
    # reader does its own line-ending handling.
    with open(raw_file_path, 'rt', newline='') as workflow_connections_file:
        workflow_connections = csv.reader(workflow_connections_file, delimiter='\t')
        for row in workflow_connections:
            if not row:
                # csv.reader yields an empty list for a blank line
                continue
            wf_id = str(row[0])
            in_tool = format_tool_id(row[3])
            out_tool = format_tool_id(row[6])
            # register the workflow even if none of its rows gives a usable edge
            connections = workflows.setdefault(wf_id, [])
            # keep only real edges: both ends present and not a self-loop
            if out_tool and in_tool and out_tool != in_tool:
                connections.append((out_tool, in_tool))
            # every non-empty id is counted, also when the edge was dropped
            for tool in (out_tool, in_tool):
                if tool != "":
                    tool_frequency[tool] = tool_frequency.get(tool, 0) + 1
    print("Reading workflows finished")
    print("Processing workflows...")
    paths_freq = {}
    for wf_id, connections in workflows.items():
        parents_graph = read_workflow(wf_id, connections)
        roots, leaves = get_roots_leaves(parents_graph)
        for root in roots:
            for leaf in leaves:
                # paths are used in the order find_tool_paths_workflow returns
                # them (starting at `leaf`, ending at `root`); no reversal is applied
                for tool_path in find_tool_paths_workflow(parents_graph, root, leaf):
                    # ids were already shortened while reading, so the path
                    # can be keyed directly
                    path_key = ",".join(tool_path)
                    if path_key:
                        paths_freq[path_key] = paths_freq.get(path_key, 0) + 1
    print("Workflows processed: %d" % len(workflows))

    return tool_frequency, paths_freq


# Tab-separated workflow connections file, relative to the notebook.
wf_path = "../data/worflow-connection-20-04.tsv"
t_freq, p_freq = read_tabular_file(wf_path)

# Rebuild both mappings in ascending order of count; sorted() is stable, so
# entries with equal counts keep their relative order.
t_freq = dict(sorted(t_freq.items(), key=lambda item: item[1]))
p_freq = dict(sorted(p_freq.items(), key=lambda item: item[1]))

Reading workflows...
Reading workflows finished
Processing workflows...
Workflows processed: 18659


In [37]:
# Display the per-tool counts; t_freq was sorted in ascending order of count
# in the cell that built it.
t_freq

{'cshl_awk_tool1': 1,
 'ncbi_tblastx_wrapper': 1,
 'msconvert3': 1,
 'bed_to_bigBed': 1,
 'gemini_windower': 1,
 'createInterval': 1,
 'EMBOSS: pepstats71': 1,
 'numeric_clustering': 1,
 'cardinal_qc': 1,
 'bed_to_bigwig': 1,
 'igvtools_count': 1,
 'GeMoMa_Annotation_Filter': 1,
 'FROGS_affiliations_stat': 1,
 'get_feature_info': 1,
 'mothur_phylotype': 1,
 'minfi_getanno': 1,
 'minfi_methcpg': 1,
 'minfi_getM': 1,
 'cufflinks_prok': 1,
 'snpSift_dbnsfp_generic': 1,
 'velvetg_jgi': 1,
 'FeatureLinkerUnlabeled': 1,
 'packmol': 1,
 'gops_basecoverage_1': 1,
 'ggplot2_heatmap': 1,
 'mcClust': 1,
 'FastTree': 1,
 'tag_stat2': 1,
 '16Sclassifier': 1,
 'kpca1': 1,
 'scpipe': 1,
 'dgidb_annotator': 1,
 'enhanced_bowtie_wrapper': 1,
 'vsearch_chimera_detection': 1,
 'sortmerna_wrapper': 1,
 'AnnovarShed': 1,
 'qiime_alpha_diversity': 1,
 'align_back_trans': 1,
 'mtbls520_18_phylogeny': 1,
 'mtbls520_23_seasons_rda': 1,
 'mtbls520_19e_seasons_features': 1,
 'mtbls520_19d_seasons_concentration':

In [38]:
# Display the per-path counts (keys are comma-joined tool ids); p_freq was
# sorted in ascending order of count in the cell that built it.
p_freq

{'cshl_find_and_replace,CONVERTER_interval_to_bed_0,CONVERTER_interval_to_bedstrict_0,addValue,mergeCols1,Cut1': 1,
 'get_flanks1,gops_intersect_1,Extract genomic DNA 1': 1,
 'get_flanks1,Extract genomic DNA 1,fasta2tab,Show beginning1,tab2fasta,meme_meme': 1,
 'get_flanks1,gops_intersect_1,gops_subtract_1,Count1,barchart_gnuplot': 1,
 'peakcalling_macs14,cshl_awk_tool,gops_intersect_1,CONVERTER_interval_to_bed_0,Count1': 1,
 'get_flanks1,gops_intersect_1,CONVERTER_interval_to_bed_0,Count1': 1,
 'bam_to_sam,PicardASMetrics': 1,
 'peakcalling_macs14,cshl_awk_tool,gops_intersect_1,CONVERTER_interval_to_bed_0,Extract genomic DNA 1,fasta2tab,Show beginning1,tab2fasta,meme_meme': 1,
 'get_flanks1,gops_intersect_1,CONVERTER_interval_to_bed_0,Extract genomic DNA 1,fasta2tab,Show beginning1,tab2fasta,meme_meme': 1,
 'bams2ratio,heatmapper': 1,
 'Extract genomic DNA 1,fasta2tab,Show beginning1,tab2fasta': 1,
 'gops_intersect_1,cshl_uniq_tool,Cut1': 1,
 'Filter1,Paste1,Cut1,XY_Plot_1': 1,
 'Add_