In [1]:
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import matplotlib.pyplot as plt
import seaborn as sns
import re
from itertools import islice
from collections import Counter
import networkx as nx

In [5]:
taxonomy_lineage = pd.read_csv("../output/ranked_lineage.tsv", sep='\t')

In [6]:
pattern = re.compile(r'Eukaryota', flags=re.IGNORECASE)
taxonomy_lineage_eukaryota =  taxonomy_lineage[taxonomy_lineage['Lineage'].str.match(pattern, na=False)]

In [4]:
def process_row(row):
    taxonomy_path = row.Lineage.strip().split('>')
    
    current_dict = nested_dict
    for category in taxonomy_path:
        if category not in current_dict:
            current_dict[category] = {}
        current_dict = current_dict[category]

def get_values_count_per_level(nested_dict, max_level, level=1):
    global level_count_dict
    specific_level_value_count = get_specific_level_nested_dict(nested_dict, max_level, level)
    return specific_level_value_count
def get_specific_level_nested_dict(nested_dict, max_level, specific_explore=""):
    level_count_dict = {}
    stack = [(key, value, 1) for key, value in nested_dict.items()]
    while stack:
        key, value, level = stack.pop()
        if isinstance(value, dict):
            if level == max_level:
                if specific_explore != "":
                    if key == specific_explore:
                        level_count_dict = value
                        break
                else:
                    if value:
                        level_count_dict[key] = len(value)
            stack.extend((k, v, level + 1) for k, v in value.items())
    return level_count_dict

def batched(iterable, n):
        if n < 1:
            raise ValueError('n must be at least one')
        it = iter(iterable)
        while batch := tuple(islice(it, n)):
            yield batch
            
def long_data(level_count_dict):
    level_data_frame = pd.DataFrame(columns = ["level", "count", "facet"])
    for idx, batch in enumerate(batched(level_count_dict.items(), 10)):
        batched_data = pd.DataFrame(batch)
        batched_data["facet"] = idx
        batched_data.columns = ["level", "count", "facet"]
        level_data_frame = pd.concat([level_data_frame, batched_data])
    return level_data_frame

def visualize_levels(nested_dict, max_level, amount_each_facet = 15):
    specific_level_value_count = get_specific_level_nested_dict(nested_dict, max_level)
    long_level_data = long_data(specific_level_value_count)
    #sns.FacetGrid(long_level_data, col="facet", sharey=True, )
    category_order = [x for x, y in sorted(specific_level_value_count.items(), key=lambda x: x[1], reverse=True)]
    barplot = sns.barplot(x=specific_level_value_count.keys(), y = specific_level_value_count.values(), 
            order = category_order)
    barplot.set_xticklabels(category_order, rotation=90)
 
from multiprocessing import Manager

def get_species_tax_name_lineage(species_name, taxonomy_lineage_eukaryota):
    pattern = re.compile(species_name, flags=re.IGNORECASE)
    w = taxonomy_lineage_eukaryota[taxonomy_lineage_eukaryota['tax_name'].str.match(pattern, na=False)]
    if w.shape[0] <= 0: 
        print(f"Species {species_name} not matched")
    return w



In [7]:
nested_dict = {}
with ThreadPoolExecutor(max_workers=None) as executor:
    print(executor._max_workers)
    executor.map(process_row, taxonomy_lineage_eukaryota.itertuples(index=False))
linearage_nested_dict = nested_dict

8


In [85]:
from multiprocessing import Manager

liamp_shaw = pd.read_csv("../data/raw/liamp-shaw/PathogenVsHostDB-2019-05-30.csv", encoding="latin-1")
liamp_shaw_host = liamp_shaw[["HostGroup", "HostOrder", "HostFamily", "HostSpecies", "HostName"]]
liamp_shaw_host.columns =["phylum", "order", "family", "species", "name"]

host_species_resolutions = pd.DataFrame(columns=list(taxonomy_lineage.columns))
liamp_shaw_host = liamp_shaw[["HostGroup", "HostOrder", "HostFamily", "HostSpecies", "HostName"]]
liamp_shaw_host.columns =["phylum", "order", "family", "species", "name"]
species_names = pd.DataFrame(liamp_shaw_host.species.unique())
with Manager() as manager: 
    shared_data = manager.list([taxonomy_lineage_eukaryota, host_species_resolutions])
    with ThreadPoolExecutor(max_workers=None) as executor:
        executor.map(get_species_tax_name_lineage, 
                     species_names.itertuples(index=False), 
                     [shared_data] * len(species_names))

In [93]:
host_species_resolutions = pd.DataFrame(columns=list(taxonomy_lineage.columns))
for species_name in liamp_shaw_host.species.unique():
    w = get_species_tax_name_lineage(species_name, taxonomy_lineage_eukaryota)
    if w.shape[0] > 0:
        host_species_resolutions = pd.concat([host_species_resolutions, w])
host_species_resolutions.to_csv("lineage_liamp_shaw.csv")

Species Ovis aries orientalis not matched
Species Neovison vison not matched
Species Spermophilus richardsonii not matched
Species Gyps rueppellii not matched
Species Anas cyanoptera not matched
Species Grus canadensis not matched
Species Uncia uncia not matched
Species Otaria flavescens not matched
Species Astronotus ocellatus  not matched
Species Symphysodon aequifasciatus not matched
Species Rhacophorus arboreus not matched
Species Acrocodia indica not matched
Species Epinephelus septemfasciatus not matched
Species Spermophilus columbianus not matched
Species Chelidonichthys lucernus not matched
Species Litoria citropa not matched
Species Philomachus pugnax not matched
Species Puffinus tenuirostris not matched
Species Pionopsitta barrabandi not matched
Species Pagophilus groenlandicus not matched
Species Puntius conchonius not matched
Species Liza klunzingeri not matched
Species Dasyatis fluviorum not matched
Species Himantura granulata not matched
Species Liza alata not matched
Spe

In [14]:
host_species_resolutions = pd.read_csv("lineage_liamp_shaw.csv")
insect_host_species_resolutions = pd.read_csv("insect_host_lineage.csv")
liamp_shaw = pd.read_csv("../data/raw/liamp-shaw/PathogenVsHostDB-2019-05-30.csv", encoding="latin-1")
liamp_shaw_host = liamp_shaw[["HostGroup", "HostOrder", "HostFamily", "HostSpecies", "HostName"]]
liamp_shaw_host.columns =["phylum", "order", "family", "species", "name"]
species_names = pd.DataFrame(liamp_shaw_host.species.unique())

In [211]:
insect_host_species_resolutions = pd.DataFrame(columns=list(taxonomy_lineage.columns))
insect_pathogens_host = pd.read_csv("insect_interactions_pathogen_host.csv")
for species_name in insect_pathogens_host.sourceTaxonName.unique():
    w = get_species_tax_name_lineage(species_name, taxonomy_lineage_eukaryota)
    if w.shape[0] > 0:
        insect_host_species_resolutions = pd.concat([insect_host_species_resolutions, w])

Species Aphodius tasmaniae not matched
Species Rhizotrogus majalis not matched
Species Anoplognathus hirsutus not matched
Species Oryctes monoceros not matched
Species Antitrogus morbillosus not matched
Species Othnonius batesi not matched
Species Rhopaea verreauxi not matched
Species Maladera brunnea not matched
Species Rhizotrogus solstitiale not matched
Species Sericesthis germinata not matched
Species Sericesthis ocularis not matched
Species Nematus ribesii not matched
Species Pseudaletia unipuncta not matched
Species Sabulodes aegrotata not matched
Species Pericoptus truncatus not matched
Species Gremifania nigrocellulata not matched
Species Cleonus punctiventris not matched
Species Phyllophaga smithi not matched
Species Strategus antaeus not matched
Species Lixophaga diatraeae not matched
Species Liohippelates collusor not matched
Species Locustana pardalina not matched
Species Epilachna vigintioctomaculata not matched
Species Carposina niponensis not matched
Species Metaseiulus 

In [15]:
import numpy as np

unique_host_resolutions = pd.concat([insect_host_species_resolutions, 
                               host_species_resolutions[[species_name in np.array(species_names) for species_name in host_species_resolutions.tax_name]]])
nested_dict = {}
with ThreadPoolExecutor(max_workers=None) as executor:
    print(executor._max_workers)
    executor.map(process_row, unique_host_resolutions.itertuples(index=False))
condensed_species_nested_dict = nested_dict

8


In [38]:
def get_cytoscape_elements(tree_subset):
    elements = []
    stack = [(key, value) for key, value in tree_subset.items()]
    while stack:
        key, values = stack.pop(0)
        elements.append({'data': {'id': key}})
        if isinstance(values, dict):
            for k_ in values.keys():
                elements.append({'data': {'source': key, 'target': k_, 'weight': len(values[k_])}})   
            stack.extend([(k, v) for k, v in values.items()])
    return elements

elements = get_cytoscape_elements(condensed_species_nested_dict)

In [35]:
import concurrent.futures

def process_element(key, values, elements, stack):
    data = [{'data': {'id': key}}]
    if isinstance(values, dict):
        for k_, v_ in values.items():
            data.append({'data': {'source': key, 'target': k_, 'weight': len(v_)}})
        stack.extend([(key, value) for key, value in values.items()])
    elements.extend(data)
def get_cytoscape_elements(tree_subset):
    elements = []
    stack = [(key, value) for key, value in tree_subset.items()]
    with concurrent.futures.ThreadPoolExecutor() as executor:
        while stack:
            key, values = stack.pop(0)
            executor.submit(process_element, key, values, elements, stack)
    return elements

elements = get_cytoscape_elements(condensed_species_nested_dict)
print(elements)

[{'data': {'id': 'Eukaryota'}}, {'data': {'source': 'Eukaryota', 'target': 'Arthropoda', 'weight': 2}}, {'data': {'source': 'Eukaryota', 'target': 'Ascomycota', 'weight': 0}}, {'data': {'source': 'Eukaryota', 'target': 'Chordata', 'weight': 7}}, {'data': {'id': 'Arthropoda'}}, {'data': {'source': 'Arthropoda', 'target': 'Insecta', 'weight': 7}}, {'data': {'source': 'Arthropoda', 'target': 'Arachnida', 'weight': 1}}, {'data': {'id': 'Ascomycota'}}, {'data': {'id': 'Chordata'}}, {'data': {'source': 'Chordata', 'target': 'Mammalia', 'weight': 20}}, {'data': {'source': 'Chordata', 'target': 'Lepidosauria', 'weight': 1}}, {'data': {'source': 'Chordata', 'target': 'Aves', 'weight': 32}}, {'data': {'source': 'Chordata', 'target': 'Actinopteri', 'weight': 42}}, {'data': {'source': 'Chordata', 'target': 'Amphibia', 'weight': 2}}, {'data': {'source': 'Chordata', 'target': 'Hyperoartia', 'weight': 1}}, {'data': {'source': 'Chordata', 'target': 'Chondrichthyes', 'weight': 6}}, {'data': {'id': 'Ins

In [44]:
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import plotly.graph_objs as go

import dash_cytoscape as cyto

condensed_species_nested_cytoscape_graph_elements = elements
app = dash.Dash(
    __name__,
    meta_tags=[
        {
            "name": "viewport",
            "content": "width=device-width, initial-scale=1, maximum-scale=1.0, user-scalable=no",
        }
    ],
)

levels_defined = {
    "superkingdom":1,
    "phylum":2,
    "class":3,
    "order":4,
    "family":5,
    "genus":6,
    "species":7
}

reverse_mapping = {value: key for key, value in levels_defined.items()}

levels_list = list(levels_defined.keys())

def load_data(max_level, level, linearage_nested_dict, level_count_dict ={}):
    data = get_specific_level_nested_dict(linearage_nested_dict, max_level=level)
    return data

def get_subtree_level_stats(subtree_dictionary):
    level_count_dict = {}
    stack = [(subtree_dictionary, 0)] 
    while stack:
        current_dict, level = stack.pop()
        level_count = level_count_dict.get(level, 1) 
        level_count += len(current_dict)
        level_count_dict[level] = level_count

        for sub_dict in current_dict.values():
            if isinstance(sub_dict, dict):
                stack.append((sub_dict, level + 1))

    return level_count_dict


data = load_data(max_level = 1, level = 1, linearage_nested_dict=linearage_nested_dict)
host_data = load_data(max_level = 1, level = 1, linearage_nested_dict=condensed_species_nested_dict)
def build_upper_left_panel():
    return html.Div(
        id="upper-left",
        className="six columns",
        children=[
            html.P(
                className="section-title",
                children="Select the phylum levels for the taxonomy visualization",
            ),
            html.Div(
                className="control-row-1",
                children=[
                    html.Div(
                        id="state-select-outer",
                        children=[
                            html.Label("Select a taxonomoy level"),
                            dcc.Dropdown(
                                id="select_taxonomy_level",
                                options=[{"label": i, "value": i} for i in levels_list],
                                value=levels_list[0],
                            ),
                        ],
                    )
                ],
            )
        ],
    )

app.layout =html.Div(
    className="container scalable",
    children=[
        html.Div(
            id="banner",
            className="banner",
            children=[
                html.H6("Taxonomy visualization")
            ],
        ), 
        html.Div(
            id="upper_cointainer", 
            children = [
                build_upper_left_panel(),
                html.Div(
                    id="visualize levels informations. ",
                    children = [
                       html.Label("Select Level:"), 
                       dcc.Dropdown(
                            multi=True, 
                            searchable=True,
                            id='level-dropdown')
                    ],
                ), 
                dcc.Graph(id='nested-plot')
            ]),
        html.Div(
            id='level_exploir',
            children = [
                dcc.Dropdown(
                    id="specific-level-explorer-dropdown",
                    options=[{"label": i, "value": i} for i in ['Mammalia', 'Insecta']],
                    value='Mammalia',
                    style={'width': '50%'}
                ), 
                dcc.Dropdown(
                    id="add_level", 
                    options = [{"label":"","value":""}], 
                    value= "",
                    style={'width': '50%'}
                )], style={'display': 'flex', 'flexDirection': 'row'}),
        html.Div(
            children = [
            dcc.Graph(id='nested-plot-1'),
            cyto.Cytoscape(
                id='cytoscape-graph',
                layout={'name': 'cose'},
                style={'width': '50%', 'height': '400px'},
                elements={},
                stylesheet=[
                    {
                        'selector': 'node',
                        'style': {
                            'label': 'data(id)'
                        }
                    }
                ]
            )
        ], style={'display': 'flex', 'flexDirection': 'row'})
    ])
@app.callback(
    Output('nested-plot', 'figure'),
    Input('level-dropdown', 'value'),
    Input('select_taxonomy_level', 'value')
)
def update_plot(selected_level, select_taxonomy_level):
    keys = [key for key in selected_level]
    values = [data[key] for key in keys]

    host_keys = [k for k in keys if k in host_data.keys()]
    host_values = [host_data[key] for key in host_keys]
    fig = go.Figure(data=[
        go.Bar(name="Taxonomy lineage", x=keys, y=values), 
        go.Bar(name="host data", x=host_keys, y=host_values)
        ])
    fig.update_layout(title=f"Plot visualization for {select_taxonomy_level} level: ")
    return fig

@app.callback(
    [
        Output("level-dropdown", "value"),
        Output("level-dropdown", "options"), 
        Output('specific-level-explorer-dropdown', 'value'), 
        Output('specific-level-explorer-dropdown', 'options')
    ],
    Input('select_taxonomy_level', 'value')
)
def select_data_by_levels(select_taxonomy_level):
    level = levels_defined.get(select_taxonomy_level)
    global data
    global host_data
    data = get_specific_level_nested_dict(linearage_nested_dict  , max_level=level)
    host_data = get_specific_level_nested_dict(condensed_species_nested_dict  , max_level=level)
    options = [{"label": i, "value": i} for i in data.keys()]
    values = [i["value"] for i in options ]

    return ( values, options, values, options)

@app.callback(
    Output('level-dropdown', 'style'),
    Input('nested-plot', 'clickData') 
)
def update_dropdown_visibility(click_data):
    return {'display': 'flex'}

@app.callback(
    Output('nested-plot-1', 'figure'),
    Input('select_taxonomy_level', 'value'), 
    Input('specific-level-explorer-dropdown', 'value'),
)
def update_nested_plot(select_taxonomy_level, specific_level_explore):
    select_taxonomy_map_no = levels_defined.get(select_taxonomy_level)
    tree_subset = get_specific_level_nested_dict(linearage_nested_dict, max_level=select_taxonomy_map_no, 
                                                 specific_explore=specific_level_explore)
    subtree_level_stats = get_subtree_level_stats(tree_subset)
    subtree_level_stats = { reverse_mapping.get(k+1+select_taxonomy_map_no): v for k, v in subtree_level_stats.items()}
    fig = go.Figure(data=[go.Bar(x=list(subtree_level_stats.keys()), y=list(subtree_level_stats.values()))])
    fig.update_layout(title=f"Plot visualization for {specific_level_explore} level: ")
    return fig

@app.callback(
    Output('cytoscape-graph', 'elements'),
    Output('add_level', 'value'), 
    Output('add_level', 'options'),
    Input('cytoscape-graph', 'elements'),
    Input('specific-level-explorer-dropdown', 'value'), 
    Input('add_level', 'value'))
def update_elements(elements, specific_level_explore, add_level_value):
    targets = [specific_level_explore]
    subgraphs = []
    if isinstance(add_level_value, str):
        targets = [add_level_value]

    for target in targets: 
        subgraphs.extend(get_all_the_subsets(condensed_species_nested_cytoscape_graph_elements, target))

    values = [d['data']['target'] for d in subgraphs if 'target' in d['data'].keys()]
    options = [{'label': i, 'value': i} for i in values]
    return (subgraphs, values, options)

def get_all_the_subsets(condensed_species_nested_cytoscape_graph_elements, target):
    subsets = []
    for d in condensed_species_nested_cytoscape_graph_elements: 
        if 'id' in d['data'].keys():
            if d['data']['id'] == target: subsets.append(d)
        else:
            if d['data']['source'] == target: 
                  subsets.append(d)
                  subsets.append({'data': {'id': d['data']['target'] }})
    return subsets

if __name__ == '__main__':
    app.run_server(debug=True, port=3000)

In [42]:
elements


[{'data': {'id': 'Eukaryota'}},
 {'data': {'source': 'Eukaryota', 'target': 'Arthropoda', 'weight': 2}},
 {'data': {'source': 'Eukaryota', 'target': 'Ascomycota', 'weight': 0}},
 {'data': {'source': 'Eukaryota', 'target': 'Chordata', 'weight': 7}},
 {'data': {'id': 'Arthropoda'}},
 {'data': {'source': 'Arthropoda', 'target': 'Insecta', 'weight': 7}},
 {'data': {'source': 'Arthropoda', 'target': 'Arachnida', 'weight': 1}},
 {'data': {'id': 'Ascomycota'}},
 {'data': {'id': 'Chordata'}},
 {'data': {'source': 'Chordata', 'target': 'Mammalia', 'weight': 20}},
 {'data': {'source': 'Chordata', 'target': 'Lepidosauria', 'weight': 1}},
 {'data': {'source': 'Chordata', 'target': 'Aves', 'weight': 32}},
 {'data': {'source': 'Chordata', 'target': 'Actinopteri', 'weight': 42}},
 {'data': {'source': 'Chordata', 'target': 'Amphibia', 'weight': 2}},
 {'data': {'source': 'Chordata', 'target': 'Hyperoartia', 'weight': 1}},
 {'data': {'source': 'Chordata', 'target': 'Chondrichthyes', 'weight': 6}},
 {'da