Build the annotation files (labels, range colors and a binary dataset) that control style and layout in iTOL.

In [1]:
import os
from os.path import join
import pandas as pd
import numpy as np
# Base directory for the data paths built in the next cell: the current working
# directory of the kernel process, echoed so the reader can check it.
CURRENT_DIR = os.getcwd()
print(CURRENT_DIR)

d:\Python\aox\enzyme-mining-aox


In [2]:
# Directory layout below <CURRENT_DIR>/data/aox.
DATADIR = join(CURRENT_DIR, "data", "aox")
CACHEDIR = join(DATADIR, "cache")
RESULTDIR = join(DATADIR, "result")
SSN_DIR, ITOL_DIR = (join(DATADIR, "graph", subdir) for subdir in ("acc", "itol"))

# Registry of every file this notebook touches, keyed by a short logical name.
filenames = {
    # outputs: iTOL annotation files written by the cells at the end of the notebook
    # (each file is named <key>.txt inside ITOL_DIR)
    **{
        stem: join(ITOL_DIR, stem + ".txt")
        for stem in (
            "labels_annotation",
            "range_color_class_annotation",
            "range_color_phylum_annotation",
            "binary_annotation",
        )
    },
    # inputs: style XML files, parsed for their NODE_FILL_COLOR discrete mappings
    "phylum_style": join(SSN_DIR, "phylum_styles.xml"),
    "class_style": join(SSN_DIR, "class_styles.xml"),
    # inputs: tab-separated data tables
    "sequence_scored.index": join(CACHEDIR, "sequence_scored.index.tsv"),
    "df_align_picked": join(RESULTDIR, "sequence_picked_results.tsv"),
}

Load the sequence index, the picked alignment results and the taxonomy tables from the cached files.

In [3]:
# Sequence ids in file order (first column, no header row). The position of an
# id in this list is what the annotation cells later write as the node id.
df_distance_matrix_index = pd.read_table(filenames['sequence_scored.index'], header=None)
sequence_index = list(df_distance_matrix_index[0])

# Picked alignment results; the columns read from it in this notebook are
# 'sequence_id', 'taxid', 'active_sequence' and 'tax_score'.
df_align_picked = pd.read_table(filenames['df_align_picked'])

# Taxonomy tables stored in the cache directory, both indexed by their first column.
TAXDUMPDIR_L = join(CACHEDIR, "taxdump_db")
taxid2name = (
    pd.read_table(join(TAXDUMPDIR_L, "taxid_name.tsv"), index_col=0)['name_txt']
    .to_dict()
)
taxidrankedlineage = pd.read_table(
    join(TAXDUMPDIR_L, "taxidrankedlineage.tsv"), index_col=0
)

# One row per distinct taxid, joined to its ranked lineage. merge() defaults to
# an inner join, so taxids without a lineage row do not appear in the result.
df_align_picked_taxid = pd.DataFrame(
    df_align_picked['taxid'].unique(), columns=['taxid']
).merge(taxidrankedlineage, left_on='taxid', right_on='tax_name_tax_id')


Make the phylum and class color maps corresponding to the SSN styles.

In [4]:
import xml.etree.ElementTree as ET

# Read the discrete NODE_FILL_COLOR mapping of the phylum style file:
# attributeValue (looked up below by phylum name) -> value (the color).
tree = ET.parse(filenames['phylum_style'])
root = tree.getroot()
visual_property = root.find(".//visualProperty[@name='NODE_FILL_COLOR']")
phylum2color = dict(
    (entry.get('attributeValue'), entry.get('value'))
    for entry in visual_property.findall(".//discreteMappingEntry")
)

# One row per distinct phylum tax id among the picked taxa, with name and color.
# The dicts are indexed directly, so an id without a name or a name without a
# color raises KeyError instead of silently producing a missing value.
df_phylum_colors = pd.DataFrame(
    df_align_picked_taxid['phylum_tax_id'].unique(), columns=['phylum_tax_id']
)
df_phylum_colors['phylum'] = df_phylum_colors['phylum_tax_id'].apply(taxid2name.__getitem__)
df_phylum_colors['color'] = df_phylum_colors['phylum'].apply(phylum2color.__getitem__)

# phylum tax id -> color
phylum_tax_id2color = df_phylum_colors.set_index('phylum_tax_id')['color'].to_dict()

In [5]:
import xml.etree.ElementTree as ET

# Read the discrete NODE_FILL_COLOR mapping of the class style file:
# attributeValue (looked up below by class name) -> value (the color).
tree = ET.parse(filenames['class_style'])
root = tree.getroot()
visual_property = root.find(".//visualProperty[@name='NODE_FILL_COLOR']")
class2color = dict(
    (entry.get('attributeValue'), entry.get('value'))
    for entry in visual_property.findall(".//discreteMappingEntry")
)

# One row per distinct class tax id among the picked taxa, with name and color.
# The dicts are indexed directly, so an id without a name or a name without a
# color raises KeyError instead of silently producing a missing value.
df_class_colors = pd.DataFrame(
    df_align_picked_taxid['class_tax_id'].unique(), columns=['class_tax_id']
)
df_class_colors['class'] = df_class_colors['class_tax_id'].apply(taxid2name.__getitem__)
df_class_colors['color'] = df_class_colors['class'].apply(class2color.__getitem__)

# class tax id -> color
class_tax_id2color = df_class_colors.set_index('class_tax_id')['color'].to_dict()

In [6]:
# taxid -> class tax id and taxid -> phylum tax id, both taken from the lineage join.
taxid2class_tax_id, taxid2phylum_tax_id = (
    df_align_picked_taxid.set_index('taxid')[rank_column].to_dict()
    for rank_column in ('class_tax_id', 'phylum_tax_id')
)

# sequence id -> taxid of the picked sequence.
sid2taxid = df_align_picked.set_index('sequence_id')['taxid'].to_dict()

Write the iTOL annotation files.

In [7]:
# LABELS file: one "<position>,<sequence id>" row per entry of sequence_index.
# NOTE(review): this presumes the tree's leaves are named by that position - confirm.
with open(filenames['labels_annotation'], "w") as out:
    out.write("LABELS\n" "SEPARATOR COMMA\n" "DATA\n")
    out.writelines(f"{i},{query}\n" for i, query in enumerate(sequence_index))


In [13]:
# TREE_COLORS file with one "range" row per leaf, colored by taxonomic class.
# Row format: "<position in sequence_index>,range,<color>,<class name>".
#
# Best effort: a leaf is skipped when any lookup fails (sequence id without a
# taxid, taxid without a class, class without a name or a color). Skipped
# leaves used to vanish silently; they are now collected and reported so that
# missing coverage is visible in the cell output.
class_skipped = []
with open(filenames['range_color_class_annotation'], "w") as f:
    f.write("TREE_COLORS\n")
    f.write("SEPARATOR COMMA\n")
    f.write("DATA\n")
    for i, query in enumerate(sequence_index):
        # Only the lookups sit inside the try, so a KeyError from anything
        # else cannot be mistaken for a missing annotation.
        try:
            class_tax_id = taxid2class_tax_id[sid2taxid[query]]
            class_name = taxid2name[class_tax_id]
            color = class_tax_id2color[class_tax_id]
        except KeyError:
            class_skipped.append(query)
            continue
        f.write(f"{i},range,{color},{class_name}\n")

print(
    f"class ranges: wrote {len(sequence_index) - len(class_skipped)} of "
    f"{len(sequence_index)} leaves, skipped {len(class_skipped)} without a class color"
)

In [14]:
# TREE_COLORS file with one "range" row per leaf, colored by phylum.
# Row format: "<position in sequence_index>,range,<color>,<phylum name>".
#
# Best effort: a leaf is skipped when any lookup fails (sequence id without a
# taxid, taxid without a phylum, phylum without a name or a color). Skipped
# leaves used to vanish silently; they are now collected and reported so that
# missing coverage is visible in the cell output.
phylum_skipped = []
with open(filenames['range_color_phylum_annotation'], "w") as f:
    f.write("TREE_COLORS\n")
    f.write("SEPARATOR COMMA\n")
    f.write("DATA\n")
    for i, query in enumerate(sequence_index):
        # Only the lookups sit inside the try, so a KeyError from anything
        # else cannot be mistaken for a missing annotation.
        try:
            phylum_tax_id = taxid2phylum_tax_id[sid2taxid[query]]
            phylum_name = taxid2name[phylum_tax_id]
            color = phylum_tax_id2color[phylum_tax_id]
        except KeyError:
            phylum_skipped.append(query)
            continue
        f.write(f"{i},range,{color},{phylum_name}\n")

print(
    f"phylum ranges: wrote {len(sequence_index) - len(phylum_skipped)} of "
    f"{len(sequence_index)} leaves, skipped {len(phylum_skipped)} without a phylum color"
)

In [15]:
# Per-sequence lookups for the two binary fields. Note that sid2active_taxid
# holds the 'tax_score' column, which is compared against the threshold below.
sid2active_sequence, sid2active_taxid = (
    df_align_picked.set_index('sequence_id')[column].to_dict()
    for column in ('active_sequence', 'tax_score')
)
threshold_active_taxid = 0.01

# DATASET_BINARY file with two fields per leaf:
#   field 1 - 1 when 'active_sequence' is truthy, otherwise -1
#   field 2 - 1 when 'tax_score' >= threshold_active_taxid, otherwise -1
# NOTE(review): a NaN 'active_sequence' is truthy and would be written as 1,
# while a NaN 'tax_score' fails the comparison and is written as -1 - confirm
# that neither column contains missing values.
with open(filenames["binary_annotation"], "w") as out:
    out.write(
        "DATASET_BINARY\n"
        "SEPARATOR COMMA\n"
        "DATASET_LABEL,label1\n"
        "COLOR,#ff0000\n"
        "FIELD_SHAPES,1,2\n"
        "FIELD_LABELS,,\n"
        "#FIELD_COLORS,#ff0000,#ffff00\n"
        "DATA\n"
    )
    for i, query in enumerate(sequence_index):
        if sid2active_sequence[query]:
            active_sequence = 1
        else:
            active_sequence = -1
        if sid2active_taxid[query] >= threshold_active_taxid:
            active_organism = 1
        else:
            active_organism = -1
        out.write(f"{i},{active_sequence},{active_organism}\n")