In [36]:
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import matplotlib.pyplot as plt
import seaborn as sns
import re
from itertools import islice
from collections import Counter
from itertools import chain

In [2]:
taxonomy_lineage = pd.read_csv("../output/ranked_lineage.tsv", sep='\t')

In [4]:
pattern = re.compile(r'Eukaryota', flags=re.IGNORECASE)
taxonomy_lineage_eukaryota =  taxonomy_lineage[taxonomy_lineage['Lineage'].str.match(pattern, na=False)]

In [5]:
def process_row(row):
    taxonomy_path = row.Lineage.strip().split('>')
    
    current_dict = nested_dict
    
    for category in taxonomy_path:
        if category not in current_dict:
            current_dict[category] = {}
        current_dict = current_dict[category]

def get_values_count_per_level(nested_dict, max_level, level=1):
    global level_count_dict
    specific_level_value_count = get_specific_level_nested_dict(nested_dict, max_level, level)
    return specific_level_value_count

def get_specific_level_nested_dict(nested_dict, max_level, specific_explore=""):
    level_count_dict = {}
    stack = [(key, value, 1) for key, value in nested_dict.items()]
    while stack:
        key, value, level = stack.pop()
        if isinstance(value, dict):
            if level == max_level:
                if specific_explore != "":
                    if key == specific_explore:
                        level_count_dict = value
                        break
                else:
                    if value:
                        level_count_dict[key] = len(value)
            stack.extend((k, v, level + 1) for k, v in value.items())
    return level_count_dict

def batched(iterable, n):
        if n < 1:
            raise ValueError('n must be at least one')
        it = iter(iterable)
        while batch := tuple(islice(it, n)):
            yield batch
            
def long_data(level_count_dict):
    level_data_frame = pd.DataFrame(columns = ["level", "count", "facet"])
    for idx, batch in enumerate(batched(level_count_dict.items(), 10)):
        batched_data = pd.DataFrame(batch)
        batched_data["facet"] = idx
        batched_data.columns = ["level", "count", "facet"]
        level_data_frame = pd.concat([level_data_frame, batched_data])
    return level_data_frame

def visualize_levels(nested_dict, max_level, amount_each_facet = 15):
    specific_level_value_count = get_specific_level_nested_dict(nested_dict, max_level)
    long_level_data = long_data(specific_level_value_count)
    #sns.FacetGrid(long_level_data, col="facet", sharey=True, )
    category_order = [x for x, y in sorted(specific_level_value_count.items(), key=lambda x: x[1], reverse=True)]
    barplot = sns.barplot(x=specific_level_value_count.keys(), y = specific_level_value_count.values(), 
            order = category_order)
    barplot.set_xticklabels(category_order, rotation=90)
        

In [6]:
nested_dict = {}
with ThreadPoolExecutor(max_workers=None) as executor:
    print(executor._max_workers)
    executor.map(process_row, taxonomy_lineage_eukaryota.itertuples(index=False))

8


In [39]:
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import plotly.graph_objs as go
import plotly.subplots as sp
import pandas as pd

app = dash.Dash(
    __name__,
    meta_tags=[
        {
            "name": "viewport",
            "content": "width=device-width, initial-scale=1, maximum-scale=1.0, user-scalable=no",
        }
    ],
)

levels_defined = {
    "superkingdom":1,
    "phylum":2,
    "class":3,
    "order":4,
    "family":5,
    "genus":6,
    "species":7
}

levels_defined = {
    "superkingdom":1,
    "phylum":2,
    "class":3,
    "order":4,
    "family":5,
    "genus":6,
    "species":7
}
reverse_mapping = {value: key for key, value in levels_defined.items()}

levels_list = list(levels_defined.keys())

def load_data(max_level, level, level_count_dict ={}):
    data = get_specific_level_nested_dict(nested_dict, max_level=level)
    return data

def get_subtree_level_stats(subtree_dictionary):
    level_count_dict = {}
    stack = [(subtree_dictionary, 0)] 
    while stack:
        current_dict, level = stack.pop()
        level_count = level_count_dict.get(level, 1) 
        level_count += len(current_dict)
        level_count_dict[level] = level_count

        for sub_dict in current_dict.values():
            if isinstance(sub_dict, dict):
                stack.append((sub_dict, level + 1))

    return level_count_dict


data = load_data(max_level = 1, level = 1)

def build_upper_left_panel():
    return html.Div(
        id="upper-left",
        className="six columns",
        children=[
            html.P(
                className="section-title",
                children="Select the phylum levels for the taxonomy visualization",
            ),
            html.Div(
                className="control-row-1",
                children=[
                    html.Div(
                        id="state-select-outer",
                        children=[
                            html.Label("Select a taxonomoy level"),
                            dcc.Dropdown(
                                id="select_taxonomy_level",
                                options=[{"label": i, "value": i} for i in levels_list],
                                value=levels_list[0],
                            ),
                        ],
                    )
                ],
            )
        ],
    )

app.layout =html.Div(
    className="container scalable",
    children=[
        html.Div(
            id="banner",
            className="banner",
            children=[
                html.H6("Taxonomy visualization")
            ],
        ), 
        html.Div(
            id="upper_cointainer", 
            children = [
                build_upper_left_panel(),
                html.Div(
                    id="visualize levels informations. ",
                    children = [
                       html.Label("Select Level:"), 
                       dcc.Dropdown(
                            multi=True, 
                            searchable=True,
                            id='level-dropdown')
                    ],
                ), 
                dcc.Graph(id='nested-plot')
            ]),
        html.Div(
            id='level_exploir',
            children = [
                dcc.Dropdown(
                    id="specific-level-explorer-dropdown",
                    options=[{"label": i, "value": i} for i in ['Mammalia', 'Insecta']],
                    value='Mammalia',
                    ), 
                dcc.Graph(id='nested-plot-1')
            ]
        )
    ])

@app.callback(
    Output('nested-plot', 'figure'),
    Input('level-dropdown', 'value'),
    Input('select_taxonomy_level', 'value')
)
def update_plot(selected_level, select_taxonomy_level):
    keys = [key for key in selected_level]
    values = [data[key] for key in keys]
    host_selected_level = Counter(liamp_shaw_host[select_taxonomy_level])
    print(host_selected_level)
    keys_host =list(host_selected_level.keys())
    vaues_host =[ host_selected_level[k] for k in keys_host ]
    
    fig = go.Figure(data=[
        go.Bar(name = "Taxonomy lineage ", x=keys, y=values), 
        go.Bar(name ="Condensed_spp", x=keys_host, y=vaues_host)
        ])
    fig.update_layout(title=f"Plot visualization for {select_taxonomy_level} level: ")
    return fig

@app.callback(
    [
        Output("level-dropdown", "value"),
        Output("level-dropdown", "options"), 
        Output('specific-level-explorer-dropdown', 'value'), 
        Output('specific-level-explorer-dropdown', 'options')
    ],
    Input('select_taxonomy_level', 'value')
)
def select_data_by_levels(select_taxonomy_level):
    level = levels_defined.get(select_taxonomy_level)
    global data
    data = get_specific_level_nested_dict(nested_dict, max_level=level)
    options = [{"label": i, "value": i} for i in data.keys()]
    values = [i["value"] for i in options ]
    return ( values, options, values, options )

@app.callback(
    Output('level-dropdown', 'style'),
    Input('nested-plot', 'clickData') 
)
def update_dropdown_visibility(click_data):
    return {'display': 'flex'}

@app.callback(
    Output('nested-plot-1', 'figure'),
    Input('select_taxonomy_level', 'value'), 
    Input('specific-level-explorer-dropdown', 'value'),
)
def update_nested_plot(select_taxonomy_level, specific_level_explore):
    select_taxonomy_map_no = levels_defined.get(select_taxonomy_level)
    tree_subset = get_specific_level_nested_dict(nested_dict, max_level=select_taxonomy_map_no, 
                                                 specific_explore=specific_level_explore)
    subtree_level_stats = get_subtree_level_stats(tree_subset)
    subtree_level_stats = { reverse_mapping.get(k+1+select_taxonomy_map_no): v for k, v in subtree_level_stats.items()}
    fig = go.Figure(data=[go.Bar(x=list(subtree_level_stats.keys()), y=list(subtree_level_stats.values()))])
    fig.update_layout(title=f"Plot visualization for {select_taxonomy_level} level: ")
    return fig

if __name__ == '__main__':
    app.run_server(debug=True, port=3000)

[1;31m---------------------------------------------------------------------------[0m
[1;31mKeyError[0m                                  Traceback (most recent call last)
File [1;32m~/Desktop/CU/work/bacteria-archaea-traits/.venv/lib/python3.11/site-packages/pandas/core/indexes/base.py:3790[0m, in [0;36mIndex.get_loc[1;34m(
    self=Index(['phylum', 'order', 'family', 'species', 'name'], dtype='object'),
    key='superkingdom'
)[0m
[0;32m   3789[0m [38;5;28;01mtry[39;00m:
[1;32m-> 3790[0m     [38;5;28;01mreturn[39;00m [38;5;28;43mself[39;49m[38;5;241;43m.[39;49m[43m_engine[49m[38;5;241;43m.[39;49m[43mget_loc[49m[43m([49m[43mcasted_key[49m[43m)[49m
        casted_key [1;34m= 'superkingdom'[0m[1;34m
        [0mself [1;34m= Index(['phylum', 'order', 'family', 'species', 'name'], dtype='object')[0m
[0;32m   3791[0m [38;5;28;01mexcept[39;00m [38;5;167;01mKeyError[39;00m [38;5;28;01mas[39;00m err:

File [1;32mindex.pyx:152[0m, in [0;36mpandas.

In [41]:
liamp_shaw = pd.read_csv("../data/raw/liamp-shaw/PathogenVsHostDB-2019-05-30.csv", encoding="latin-1")
liamp_shaw_host = liamp_shaw[["HostGroup", "HostOrder", "HostFamily", "HostSpecies", "HostName"]]
liamp_shaw_host.columns =["phylum", "order", "family", "species", "name"]