In [1]:
import numpy as np
from skbio import TreeNode
import qiime2 as q2
import pandas as pd
import skbio
from qiime2.plugins import phylogeny

In [16]:
def create_matrix_from_tree(tree: "TreeNode") -> np.ndarray:
    """Build the binary leaf-membership matrix ``A = [A1 | A2]`` of a tree.

    Rows correspond to the tips (leaves) of ``tree`` in the order returned by
    ``tree.tips()``. The first ``num_leaves`` columns (``A1``) form an identity
    matrix, one column per leaf. The remaining columns (``A2``) correspond to
    the nodes returned by ``tree.non_tips()``, in that order; entry ``(i, j)``
    is 1 when leaf ``i`` descends from internal node ``j`` and 0 otherwise.

    Parameters
    ----------
    tree : skbio.TreeNode
        Root of the tree. Only ``tips()``, ``non_tips()`` and the ``name``
        attribute of the tips are used. Tip names are assumed to be unique:
        rows are addressed through a name -> index mapping, so duplicated
        names would collapse onto the last matching row.

    Returns
    -------
    numpy.ndarray
        Dense float array of shape
        ``(num_leaves, num_leaves + num_internal_nodes)``.

    Notes
    -----
    ``tree.non_tips()`` is called with its defaults, which do not yield the
    node it is called on, so the root contributes no column (for the toy tree
    ``((f1, f2)n1, f3)n2`` the result has 3 + 1 columns).
    """
    # Row order of the matrix is the tip traversal order of the tree
    leaf_names = [leaf.name for leaf in tree.tips()]
    # Map each leaf name to its row index
    leaf_index_map = {name: idx for idx, name in enumerate(leaf_names)}
    num_leaves = len(leaf_names)

    # Internal nodes below the root (the root itself is not included)
    internal_nodes = list(tree.non_tips())

    # A1: identity block, one column per leaf (num_leaves x num_leaves)
    A1 = np.eye(num_leaves)

    # A2: membership block (num_leaves x num_internal_nodes), zero-initialised
    A2 = np.zeros((num_leaves, len(internal_nodes)))

    # Visit only the tips below each internal node instead of testing every
    # leaf of the tree against every node; the resulting matrix is the same.
    for j, node in enumerate(internal_nodes):
        for leaf in node.tips():
            A2[leaf_index_map[leaf.name], j] = 1

    # Final matrix: leaf columns first, internal-node columns after
    return np.hstack((A1, A2))

## Example data

In [3]:
# Leaves of the toy tree; every branch in this example has length 1.0
f1, f2, f3 = (TreeNode(name=label, length=1.0) for label in ("f1", "f2", "f3"))

# Internal node n1 groups f1 and f2; n2 joins n1 with f3
n1 = TreeNode(name="n1", length=1.0, children=[f1, f2])
n2 = TreeNode(name="n2", length=1.0, children=[n1, f3])

# n2 is the root of this tree
tree = n2
print(tree.ascii_art())

                    /-f1
          /n1------|
-n2------|          \-f2
         |
          \-f3


In [4]:
# Membership matrix of the toy tree: rows are the tips f1, f2, f3; columns are
# the three leaves followed by the single internal node n1 (root n2 is excluded).
A = create_matrix_from_tree(tree)

# Checked example: pin the hand-derived result so a regression in
# create_matrix_from_tree fails loudly instead of only changing the display.
np.testing.assert_array_equal(
    A,
    [
        [1, 0, 0, 1],
        [0, 1, 0, 1],
        [0, 0, 1, 0],
    ],
)
A

array([[1., 0., 0., 1.],
       [0., 1., 0., 1.],
       [0., 0., 1., 0.]])

## Real data: MA2

In [5]:
# Load the feature table artifact and view it as a DataFrame
# (its columns are the features; see the shape checks further down)
path_to_feature_table = "data/220728_monthly/all_otu_table_filt.qza"
art_feature_table = q2.Artifact.load(path_to_feature_table)
df_ft = art_feature_table.view(pd.DataFrame)
df_ft.shape

(9478, 5580)

In [6]:
path_to_taxonomy = "data/220728_monthly/otu_taxonomy_all.qza"
art_taxonomy = q2.Artifact.load(path_to_taxonomy)
df_taxonomy = art_taxonomy.view(pd.DataFrame)
print(df_taxonomy.shape)

# Keep only the taxonomy rows whose ID is a column of the feature table
is_in_feature_table = df_taxonomy.index.isin(df_ft.columns)
df_taxonomy_f = df_taxonomy.loc[is_in_feature_table]
print(df_taxonomy_f.shape)

(5608, 2)
(5580, 2)


In [7]:
# Load the rooted SILVA reference phylogeny (per the file name) as a TreeNode
path_to_phylo = "data/220728_monthly/silva-138-99-rooted-tree.qza"
art_phylo = q2.Artifact.load(path_to_phylo)
tree_phylo = art_phylo.view(TreeNode)
# Total number of nodes in the unfiltered tree
tree_phylo.count()

870198

In [8]:
# Prune the reference phylogeny so that it matches the IDs of the feature table
filter_results = phylogeny.actions.filter_tree(
    tree=art_phylo,
    table=art_feature_table,
)
# The action is expected to return exactly one artifact
(art_phylo_f,) = filter_results
tree_phylo_f = art_phylo_f.view(TreeNode)

# Total number of nodes left after pruning
tree_phylo_f.count()

11159

In [9]:
# The pruned tree must have exactly one tip per feature-table column
num_leaves = tree_phylo_f.count(tips=True)
assert num_leaves == df_ft.shape[1], (
    f"tree has {num_leaves} tips but the feature table has "
    f"{df_ft.shape[1]} features"
)

# Equal counts alone do not prove that the IDs agree, so compare them as sets
tip_names = {tip.name for tip in tree_phylo_f.tips()}
assert tip_names == set(df_ft.columns), (
    "tip names of the pruned tree differ from the feature table columns"
)

In [17]:
# Build the leaf / internal-node membership matrix for the pruned MA2 tree.
# Rows follow the order of tree_phylo_f.tips(); nothing here guarantees that
# this matches the column order of df_ft -- TODO confirm before combining them.
A_ma2 = create_matrix_from_tree(tree_phylo_f)
A_ma2

Root is not included


array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 1.],
       [0., 0., 0., ..., 1., 1., 1.],
       [0., 0., 0., ..., 1., 1., 1.]])

In [20]:
# Verification
# No column may consist entirely of ones
has_all_ones_column = (A_ma2 == 1.0).all(axis=0).any()
assert not has_all_ones_column

# The column count should equal feature_count + non-leaf node_count
nb_features = df_ft.shape[1]
nb_non_leaf_nodes = sum(1 for _ in tree_phylo_f.non_tips())

assert A_ma2.shape[1] == nb_features + nb_non_leaf_nodes