# Notebook illustrating a problem with how phylogenetic-tree nodes are matched to the taxonomy

In [1]:
import os

import numpy as np
import pandas as pd
import qiime2 as q2
import skbio
from qiime2.plugins import phylogeny

%matplotlib inline
%load_ext autoreload
%autoreload 2

# Read data

In [2]:
# load the feature table artifact and view it as a DataFrame
# (feature IDs are the columns, see the taxonomy filter and leaf check below)
path_to_feature_table = "data/220728_monthly/all_otu_table_filt.qza"
art_feature_table = q2.Artifact.load(path_to_feature_table)
df_ft = art_feature_table.view(pd.DataFrame)
df_ft.shape

(9478, 5580)

In [3]:
# load the taxonomy artifact and view it as a DataFrame
path_to_taxonomy = "data/220728_monthly/otu_taxonomy_all.qza"
art_taxonomy = q2.Artifact.load(path_to_taxonomy)
df_taxonomy = art_taxonomy.view(pd.DataFrame)
print(df_taxonomy.shape)

# keep only the taxonomy rows whose index label is a column of the feature table
is_in_feature_table = df_taxonomy.index.isin(df_ft.columns)
df_taxonomy_f = df_taxonomy[is_in_feature_table]
print(df_taxonomy_f.shape)

(5608, 2)
(5580, 2)


In [4]:
# load the silva phylogenetic tree artifact and view it as an skbio tree
path_to_phylo = "data/220728_monthly/silva-138-99-rooted-tree.qza"
art_phylo = q2.Artifact.load(path_to_phylo)
tree_phylo = art_phylo.view(skbio.TreeNode)
# node count of the unfiltered tree
print(tree_phylo.count())

# prune the tree so that it matches the ids of the feature table
filter_results = phylogeny.actions.filter_tree(
    table=art_feature_table, tree=art_phylo
)
# exactly one result artifact is expected (unpacking fails otherwise)
(art_phylo_f,) = filter_results
tree_phylo_f = art_phylo_f.view(skbio.TreeNode)

# node count of the pruned tree
tree_phylo_f.count()

870198


11159

In [5]:
# ensure that # leaves in tree == feature table dimension (number of columns);
# the message makes a failing check self-explanatory
num_leaves = tree_phylo_f.count(tips=True)
assert num_leaves == df_ft.shape[1], (
    f"filtered tree has {num_leaves} leaves but the feature table has "
    f"{df_ft.shape[1]} features"
)

# Get matrix A

In [6]:
def _create_matrix_from_tree(tree, tax):
    # ! function copied from _process_train.py after git commit "b51c22f" in PR
    # ! #16 to have dic_node2leaf output conserved Get all leaves and create a
    # ! mapping from leaf names to indices
    leaves = list(tree.tips())
    leaf_names = [leaf.name for leaf in leaves]
    # map each leaf name to unique index
    leaf_index_map = {name: idx for idx, name in enumerate(leaf_names)}

    # Get the number of leaves and internal nodes
    num_leaves = len(leaf_names)
    # root is not included
    internal_nodes = list(tree.non_tips())

    # Create the identity matrix for the leaves: A1 (num_leaves x num_leaves)
    A1 = np.eye(num_leaves)
    # taxonomic name should include OTU name
    tax_e = tax.copy()
    tax_e["tax_ft"] = tax_e["Taxon"] + "; otu__" + tax_e.index
    a2_node_names = tax_e.loc[leaf_names, "tax_ft"].tolist()
    # Create the matrix for the internal nodes: A2 (num_leaves x
    # num_internal_nodes)
    # initialise it with zeros
    A2 = np.zeros((num_leaves, len(internal_nodes)))

    # Populate A2 with 1s for the leaves linked by each internal node
    # iterate over all internal nodes to find descendents of this node and mark
    # them accordingly
    dict_node2leaf = {}
    for j, node in enumerate(internal_nodes):
        # per node keep track of leaf names - for consensus naming
        node_leaf_names = []
        # todo: adjust names to consensus taxonomy from descendents
        # for now node names are just increasing integers - since node.name is float
        descendant_leaves = {leaf.name for leaf in node.tips()}
        for leaf_name in leaf_names:
            if leaf_name in descendant_leaves:
                node_leaf_names.append(leaf_name)
                A2[leaf_index_map[leaf_name], j] = 1

        # create consensus taxonomy from all leaf_names
        node_mapped_taxon = tax_e.loc[node_leaf_names, "tax_ft"].tolist()
        dict_node2leaf[j] = node_mapped_taxon
        str_consensus_taxon = os.path.commonprefix(node_mapped_taxon)
        # get name before last ";"
        node_consensus_taxon = str_consensus_taxon.rpartition(";")[0]

        if node_consensus_taxon in a2_node_names:
            # if consensus name already exists, add index to make it unique
            node_consensus_taxon = node_consensus_taxon + "; n__" + str(j)
        a2_node_names.append(node_consensus_taxon)

    # Concatenate A1 and A2 to create the final matrix A
    A = np.hstack((A1, A2))

    return A, a2_node_names, dict_node2leaf

In [7]:
# build the indicator matrix from the pruned tree and the filtered taxonomy
A, a2_names, dic_node2leaf = _create_matrix_from_tree(
    tree=tree_phylo_f, tax=df_taxonomy_f
)

# label the columns with the leaf / internal-node names
df_A = pd.DataFrame(data=A, columns=a2_names)
df_A.shape

(5580, 11158)

In [8]:
# preview the first rows; leaf (OTU) columns come first, internal-node columns last
df_A.head()

Unnamed: 0,d__Archaea; p__Nanoarchaeota; c__Nanoarchaeia; o__Woesearchaeales; f__SCGC_AAA011-D5; g__SCGC_AAA011-D5; s__Nanoarchaeota_archaeon; otu__ASMP01000002.125551.126982,d__Archaea; p__Euryarchaeota; c__Methanobacteria; o__Methanobacteriales; f__Methanobacteriaceae; g__Methanobrevibacter; otu__JX833581.1.1262,d__Archaea; p__Euryarchaeota; c__Methanobacteria; o__Methanobacteriales; f__Methanobacteriaceae; g__Methanobrevibacter; s__uncultured_Methanobacteriales; otu__AB535261.1.1262,d__Archaea; p__Euryarchaeota; c__Methanobacteria; o__Methanobacteriales; f__Methanobacteriaceae; g__Methanosphaera; otu__CP000102.408655.410144,d__Archaea; p__Euryarchaeota; c__Methanobacteria; o__Methanobacteriales; f__Methanobacteriaceae; g__Methanosphaera; s__uncultured_methanogenic; otu__AB905959.1.1268,d__Archaea; p__Euryarchaeota; c__Methanobacteria; o__Methanobacteriales; f__Methanobacteriaceae; g__Methanobrevibacter; otu__AY196669.1.1262,d__Archaea; p__Euryarchaeota; c__Methanobacteria; o__Methanobacteriales; f__Methanobacteriaceae; g__Methanobrevibacter; otu__AB905821.1.1267,d__Archaea; p__Thermoplasmatota; c__Thermoplasmata; o__Methanomassiliicoccales; f__Methanomethylophilaceae; otu__DQ445723.1.1210,d__Archaea; p__Thermoplasmatota; c__Thermoplasmata; o__Methanomassiliicoccales; f__Methanomethylophilaceae; otu__JF980498.1.1419,d__Bacteria; p__Patescibacteria; c__Gracilibacteria; o__Absconditabacteriales_(SR1); f__Absconditabacteriales_(SR1); g__Absconditabacteriales_(SR1); s__SR1_bacterium; otu__AOTF01000010.101107.102585,...,d__Bacteria; n__5568,d__Bacteria; p__Chloroflexi; c__Chloroflexia; o__Thermomicrobiales; f__JG30-KF-CM45; g__JG30-KF-CM45,d__Bacteria; p__Chloroflexi; c__Chloroflexia; o__Thermomicrobiales; f__JG30-KF-CM45; g__JG30-KF-CM45; n__5570,d__Bacteria; p__Chloroflexi; c__Chloroflexia; o__Thermomicrobiales; f__JG30-KF-CM45; g__JG30-KF-CM45; n__5571,d__Bacteria; p__Chloroflexi; c__Chloroflexia; o__Thermomicrobiales; f__JG30-KF-CM45; g__JG30-KF-CM45; 
n__5572,d__Bacteria; p__Chloroflexi,d__Bacteria; p__Chloroflexi; n__5574,d__Bacteria; p__Chloroflexi; n__5575,d__Bacteria; p__Chloroflexi; n__5576,d__Bacteria; n__5577
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Show problem with identical "taxonomic nodes"

In [9]:
# df_A has one row per leaf (OTU), and the leaf columns precede the
# internal-node columns, so the row count is the offset of internal node 0
nb_otus = df_A.shape[0]

# columns of the internal nodes 5573 to 5576
df_A.iloc[:, nb_otus + 5573 : nb_otus + 5577]

Unnamed: 0,d__Bacteria; p__Chloroflexi,d__Bacteria; p__Chloroflexi; n__5574,d__Bacteria; p__Chloroflexi; n__5575,d__Bacteria; p__Chloroflexi; n__5576
0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0
...,...,...,...,...
5575,1.0,1.0,0.0,1.0
5576,1.0,1.0,0.0,1.0
5577,1.0,1.0,0.0,1.0
5578,0.0,0.0,1.0,1.0


In [10]:
# column sums = number of leaves (OTUs) below each of the internal nodes 5573 to 5576
df_A.iloc[:, nb_otus + 5573 : nb_otus + 5577].sum()

d__Bacteria; p__Chloroflexi             6.0
d__Bacteria; p__Chloroflexi; n__5574    7.0
d__Bacteria; p__Chloroflexi; n__5575    2.0
d__Bacteria; p__Chloroflexi; n__5576    9.0
dtype: float64

In [11]:
# get the taxonomic names of the leaves below internal node 5573
# (column "d__Bacteria; p__Chloroflexi")
dic_node2leaf[5573]

['d__Bacteria; p__Chloroflexi; c__OLB14; o__OLB14; f__OLB14; g__OLB14; s__uncultured_bacterium; otu__KT835595.1.1453',
 'd__Bacteria; p__Chloroflexi; c__Chloroflexia; o__Thermomicrobiales; f__JG30-KF-CM45; g__JG30-KF-CM45; s__bacterium_QTYC46b; otu__JQ624352.1.1469',
 'd__Bacteria; p__Chloroflexi; c__Chloroflexia; o__Thermomicrobiales; f__JG30-KF-CM45; g__JG30-KF-CM45; s__uncultured_bacterium; otu__JQ978845.1.1460',
 'd__Bacteria; p__Chloroflexi; c__Chloroflexia; o__Thermomicrobiales; f__JG30-KF-CM45; g__JG30-KF-CM45; s__uncultured_Chloroflexi; otu__AM935498.1.1337',
 'd__Bacteria; p__Chloroflexi; c__Chloroflexia; o__Thermomicrobiales; f__JG30-KF-CM45; g__JG30-KF-CM45; s__uncultured_soil; otu__FQ659571.1.1334',
 'd__Bacteria; p__Chloroflexi; c__Chloroflexia; o__Thermomicrobiales; f__JG30-KF-CM45; g__JG30-KF-CM45; s__metagenome; otu__FPLS01036268.22.1504']

In [12]:
# taxonomic names of the leaves below internal node 5574
# (column "d__Bacteria; p__Chloroflexi; n__5574")
dic_node2leaf[5574]

['d__Bacteria; p__Chloroflexi; c__Anaerolineae; o__SBR1031; f__SBR1031; g__SBR1031; s__anaerobic_digester; otu__CZCB01001507.135.1626',
 'd__Bacteria; p__Chloroflexi; c__OLB14; o__OLB14; f__OLB14; g__OLB14; s__uncultured_bacterium; otu__KT835595.1.1453',
 'd__Bacteria; p__Chloroflexi; c__Chloroflexia; o__Thermomicrobiales; f__JG30-KF-CM45; g__JG30-KF-CM45; s__bacterium_QTYC46b; otu__JQ624352.1.1469',
 'd__Bacteria; p__Chloroflexi; c__Chloroflexia; o__Thermomicrobiales; f__JG30-KF-CM45; g__JG30-KF-CM45; s__uncultured_bacterium; otu__JQ978845.1.1460',
 'd__Bacteria; p__Chloroflexi; c__Chloroflexia; o__Thermomicrobiales; f__JG30-KF-CM45; g__JG30-KF-CM45; s__uncultured_Chloroflexi; otu__AM935498.1.1337',
 'd__Bacteria; p__Chloroflexi; c__Chloroflexia; o__Thermomicrobiales; f__JG30-KF-CM45; g__JG30-KF-CM45; s__uncultured_soil; otu__FQ659571.1.1334',
 'd__Bacteria; p__Chloroflexi; c__Chloroflexia; o__Thermomicrobiales; f__JG30-KF-CM45; g__JG30-KF-CM45; s__metagenome; otu__FPLS01036268.22.1504']

In [13]:
# taxonomic names of the leaves below internal node 5575
# (column "d__Bacteria; p__Chloroflexi; n__5575")
dic_node2leaf[5575]

['d__Bacteria; p__Chloroflexi; c__Gitt-GS-136; o__Gitt-GS-136; f__Gitt-GS-136; g__Gitt-GS-136; otu__HM299006.1.1326',
 'd__Bacteria; p__Chloroflexi; c__KD4-96; o__KD4-96; f__KD4-96; g__KD4-96; s__uncultured_bacterium; otu__KY190653.1.1395']

In [14]:
# taxonomic names of the leaves below internal node 5576
# (column "d__Bacteria; p__Chloroflexi; n__5576")
dic_node2leaf[5576]

['d__Bacteria; p__Chloroflexi; c__Anaerolineae; o__SBR1031; f__SBR1031; g__SBR1031; s__anaerobic_digester; otu__CZCB01001507.135.1626',
 'd__Bacteria; p__Chloroflexi; c__OLB14; o__OLB14; f__OLB14; g__OLB14; s__uncultured_bacterium; otu__KT835595.1.1453',
 'd__Bacteria; p__Chloroflexi; c__Chloroflexia; o__Thermomicrobiales; f__JG30-KF-CM45; g__JG30-KF-CM45; s__bacterium_QTYC46b; otu__JQ624352.1.1469',
 'd__Bacteria; p__Chloroflexi; c__Chloroflexia; o__Thermomicrobiales; f__JG30-KF-CM45; g__JG30-KF-CM45; s__uncultured_bacterium; otu__JQ978845.1.1460',
 'd__Bacteria; p__Chloroflexi; c__Chloroflexia; o__Thermomicrobiales; f__JG30-KF-CM45; g__JG30-KF-CM45; s__uncultured_Chloroflexi; otu__AM935498.1.1337',
 'd__Bacteria; p__Chloroflexi; c__Chloroflexia; o__Thermomicrobiales; f__JG30-KF-CM45; g__JG30-KF-CM45; s__uncultured_soil; otu__FQ659571.1.1334',
 'd__Bacteria; p__Chloroflexi; c__Chloroflexia; o__Thermomicrobiales; f__JG30-KF-CM45; g__JG30-KF-CM45; s__metagenome; otu__FPLS01036268.22.1504']

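Is this a problem? The internal nodes 5573, 5574, 5575 and 5576 all receive the same consensus taxonomy `d__Bacteria; p__Chloroflexi` and are only told apart by the `n__<index>` suffix, although they cover different (partly nested) sets of leaves spanning several classes.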