In [None]:
import numpy as np
import os
import pandas as pd
import qiime2 as q2
import skbio
from classo import classo_problem
from qiime2.plugins import phylogeny
from skbio import TreeNode
from q2_ritme.process_data import load_n_split_data

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
def create_matrix_from_tree(tree):
    """Build the leaf-by-node aggregation matrix ``A = [A1 | A2]`` of a tree.

    Row ``i`` of ``A`` belongs to the i-th tip yielded by ``tree.tips()``, so
    any data multiplied with ``A`` must list its features in that same order.

    Parameters
    ----------
    tree : tree node exposing ``tips()`` and ``non_tips()``
        E.g. an ``skbio.TreeNode``. Both methods are called with their
        defaults; for ``skbio.TreeNode`` this means the root itself is not
        among the internal nodes.

    Returns
    -------
    A : np.ndarray, shape (num_leaves, num_leaves + num_internal_nodes)
        ``A1`` (first ``num_leaves`` columns) is the identity matrix. ``A2``
        has one column per internal node holding a 1 in the row of every leaf
        that descends from that node and 0 elsewhere.
    a2_node_names : list of str
        Generated names of the ``A2`` columns: ``"n0"``, ``"n1"``, ...
    """
    # Map each leaf name to its row index (order of tree.tips())
    leaf_names = [leaf.name for leaf in tree.tips()]
    leaf_index_map = {name: idx for idx, name in enumerate(leaf_names)}
    num_leaves = len(leaf_names)

    internal_nodes = list(tree.non_tips())

    # A1: identity block, one column per leaf
    A1 = np.eye(num_leaves)

    # A2: one column per internal node, initialised with zeros
    A2 = np.zeros((num_leaves, len(internal_nodes)))

    a2_node_names = []
    for j, node in enumerate(internal_nodes):
        # todo: adjust names to consensus taxonomy from descendants
        # for now node names are just increasing integers - since node.name is float
        a2_node_names.append("n" + str(j))
        # Visit only the descendants of this node instead of scanning every
        # leaf of the tree for each internal node
        for leaf in node.tips():
            A2[leaf_index_map[leaf.name], j] = 1

    # Concatenate A1 and A2 to create the final matrix A
    A = np.hstack((A1, A2))

    return A, a2_node_names

## Example data

In [None]:
# Toy tree: root n2 has the children n1 and f3; n1 has the children f1 and f2.
# Every node (the root included) gets a branch length of 1.0.
f1, f2, f3 = (
    TreeNode(name=tip_name, length=1.0) for tip_name in ("f1", "f2", "f3")
)
n1 = TreeNode(name="n1", length=1.0)
n2 = TreeNode(name="n2", length=1.0)

# Attach the children bottom-up
n1.extend([f1, f2])
n2.extend([n1, f3])

# n2 is the root of this tree
tree = n2
print(tree.ascii_art())

In [None]:
# Aggregation matrix of the toy tree: one row per leaf, one identity column
# per leaf followed by one column per internal node
A_example, a2_names_ex = create_matrix_from_tree(tree)
A_example

In [None]:
# Generated names ("n" + running index) of the internal-node columns
a2_names_ex

## Real data: MA2

In [None]:
# Feature table: load the QIIME 2 artifact and view it as a DataFrame
path_to_ft = "data/220728_monthly/all_otu_table_filt.qza"
art_feature_table = q2.Artifact.load(path_to_ft)
df_ft = art_feature_table.view(pd.DataFrame)
df_ft.shape

In [None]:
# Taxonomy: load the QIIME 2 artifact and view it as a DataFrame
path_to_taxonomy = "data/220728_monthly/otu_taxonomy_all.qza"
art_taxonomy = q2.Artifact.load(path_to_taxonomy)
df_taxonomy = art_taxonomy.view(pd.DataFrame)
print(df_taxonomy.shape)

# Keep only the taxonomy rows whose ID is a column of the feature table
df_taxonomy_f = df_taxonomy.loc[df_taxonomy.index.isin(df_ft.columns)]
print(df_taxonomy_f.shape)

In [None]:
# Silva reference phylogeny: load the artifact and view it as a TreeNode
path_to_phylo = "data/220728_monthly/silva-138-99-rooted-tree.qza"
art_phylo = q2.Artifact.load(path_to_phylo)
tree_phylo = art_phylo.view(TreeNode)
# node count of the unfiltered tree
tree_phylo.count()

In [None]:
# Prune the reference tree so that it matches the IDs of the feature table
filter_results = phylogeny.actions.filter_tree(
    table=art_feature_table, tree=art_phylo
)
[art_phylo_f] = filter_results
tree_phylo_f = art_phylo_f.view(TreeNode)

# node count of the pruned tree
tree_phylo_f.count()

In [None]:
# The pruned tree must have exactly as many tips as the feature table has
# columns
num_leaves = tree_phylo_f.count(tips=True)
assert df_ft.shape[1] == num_leaves

In [None]:
# Aggregation matrix of the pruned tree. Its rows follow the order of
# tree_phylo_f.tips().
# NOTE(review): nothing here ties that order to the column order of df_ft -
# confirm the feature order of any X that is multiplied with A downstream.
A, a2_names = create_matrix_from_tree(tree_phylo_f)
A

In [None]:
# Preview only: a2_names holds one generated name per internal node of the
# pruned tree, so displaying the whole list floods the cell output
a2_names[:10]

In [None]:
# Sanity checks on A
# 1) no column may consist solely of ones
has_all_ones_column = (A == 1.0).all(axis=0).any()
assert not has_all_ones_column

# 2) A has one column per feature plus one per non-leaf node
nb_features = df_ft.shape[1]
nb_non_leaf_nodes = sum(1 for _node in tree_phylo_f.non_tips())

assert A.shape[1] == nb_features + nb_non_leaf_nodes

## Run trac with the aggregation matrix A

In [None]:
# Load metadata and feature table, then split into train/validation and test
target = "age_months"
split_kwargs = dict(
    path2md="data/220728_monthly/metadata_proc_v20240323_r0_r3_le_2yrs.tsv",
    path2ft="data/220728_monthly/all_otu_table_filt.qza",
    host_id="host_id",
    target=target,
    train_size=0.8,
    seed=12,
)
train_val, test = load_n_split_data(**split_kwargs)

In [None]:
# preprocess taxonomy aggregation
def _preprocess_taxonomy_aggregation(x, A):
    pseudo_count = 0.000001
    # ? what happens if x is relative abundances
    X = np.log(pseudo_count + x)
    nleaves = np.sum(A, axis=0)
    log_geom = X.dot(A) / nleaves

    return log_geom, nleaves

In [None]:
# perform preprocessing on train
ft_cols = [x for x in train_val.columns if x.startswith("F")]
x_train_val = train_val[ft_cols]
y_train_val = train_val[target]

# The rows of A follow the tip order of the pruned tree, whereas the columns
# of x_train_val follow the column order of train_val. X.dot(A) is only
# meaningful if both orders agree, so align A with ft_cols.
# Assumes the feature columns are named "F" + <tip name> (the same prefix the
# taxonomy index receives further down) - TODO confirm against
# load_n_split_data.
tip_cols = ["F" + str(tip.name) for tip in tree_phylo_f.tips()]
if len(tip_cols) != len(ft_cols) or set(tip_cols) != set(ft_cols):
    raise ValueError(
        "Feature columns of train_val do not match the tips of the pruned tree"
    )
if tip_cols != ft_cols:
    # Rebuild from the tree (instead of permuting the current A) so that
    # re-running this cell always yields the same matrix
    A_tip_order = create_matrix_from_tree(tree_phylo_f)[0]
    tip_row = {col: row for row, col in enumerate(tip_cols)}
    row_order = [tip_row[col] for col in ft_cols]
    # identity block for the leaves + internal-node block with re-ordered rows
    A = np.hstack(
        (np.eye(len(ft_cols)), A_tip_order[row_order, len(tip_cols) :])
    )

# todo: afterwards perform it on test
log_geom_trainval, nleaves = _preprocess_taxonomy_aggregation(x_train_val.values, A)

n, d = log_geom_trainval.shape

In [None]:
# get labels from taxonomy
# change labels to match new feature names: derive the filtered, "F"-prefixed
# taxonomy afresh from df_taxonomy instead of re-prefixing df_taxonomy_f in
# place, so that re-running this cell does not stack prefixes ("FF...")
df_taxonomy_f = df_taxonomy[df_taxonomy.index.isin(df_ft.columns)].rename(
    index=lambda x: "F" + str(x)
)

# todo: add proper A2 labels for A -> for now it's just n + count
# pick the taxa in the order of ft_cols so that label i describes feature
# column i (raises a KeyError if a feature has no taxonomy entry)
label = df_taxonomy_f.loc[ft_cols, "Taxon"].values
label_short = np.array([la.split(";")[-1].strip() for la in label])
assert len(label) == len(ft_cols)
assert len(label) == len(label_short)
label = np.append(label, a2_names)
label_short = np.append(label_short, a2_names)

assert len(label_short) == A.shape[1]
label_short

In [None]:
# Set up the classo problem used for trac, with cross-validated model selection
problem = classo_problem(log_geom_trainval, y_train_val.values, label=label_short)

formulation = problem.formulation
formulation.w = 1 / nleaves
formulation.intercept = True
formulation.concomitant = False  # not relevant for here

# ! one form of model selection needs to be chosen
selection_mode = problem.model_selection
# stability selection (beta paths for a pre-selected range of lambda): off
selection_mode.StabSel = False
# coefficients for a grid of lambdas: off
selection_mode.PATH = False
# todo: check if it is fair that trac is trained with CV internally whereas others are not
# CV checks `Nlam` lambda values between 1 and `lamin`, on a logarithmic scale
# or not depending on `logscale`.
selection_mode.CV = True

cv_params = selection_mode.CVparameters
cv_params.seed = 6  # one could change logscale, Nsubset, oneSE
# 'one-standard-error' = select simplest model (largest lambda value) in CV
# whose CV score is within 1 stddev of best score
# ! create hyperparameter for this
cv_params.oneSE = True
# ! create hyperparameter for this
cv_params.Nlam = 80
# ! create hyperparameter for this
cv_params.lamin = 0.001

# ! for ritme: no feature_transformation to be used for trac
print(problem)

In [None]:
# Solve the classo problem as configured on `problem`
problem.solve()
# todo: find out how to extract the insights from the model to disk without changing classo
print(problem.solution)

In [None]:
# alpha[0] is the learned intercept, alpha[1:] are the learned coefficients
# for all features in logGeom (n_samples, n_features)
# ! if oneSE=True -> uses lambda_1SE else lambda_min (see CV in
# ! classo>cross_validation.py)
# refit -> solves the unconstrained least squares problem with the selected
# lambda and variables
alpha = problem.solution.CV.refit

In [None]:
# ! class solution_CV: defined in @solver.py L930
# selected_param (minus its intercept entry) is used to index `label`, giving
# the labels of the selected features
selection = problem.solution.CV.selected_param[1:]  # exclude the intercept
selected_ft = label[selection]
print(selected_ft)

In [None]:
# For reference (not executed): the lambda chosen by CV can be inspected via
#   problem.solution.CV.lambda_1SE   # selected with the 1-standard-error method
#   problem.solution.CV.lambda_min   # selected without the 1-standard-error method

In [None]:
# save model: A, label, alpha (includes selected_ft)
path2out = "test_model"
# exist_ok avoids the separate existence check (and its race condition)
os.makedirs(path2out, exist_ok=True)

# storing A w labels
# NOTE(review): the row labels assume that the rows of A are in the same order
# as the first nb_features entries of `label` - confirm
df_A_with_labels = pd.DataFrame(A, columns=label, index=label[:nb_features])
df_A_with_labels.to_csv(os.path.join(path2out, "matrix_a_w_labels.csv"), index=True)

In [None]:
# storing alpha w labels
idx_alpha = ["intercept"] + label.tolist()
df_alpha_with_labels = pd.DataFrame(alpha, columns=["alpha"], index=idx_alpha)
df_alpha_with_labels.to_csv(
    os.path.join(path2out, "model_alpha_w_labels.csv"), index=True
)

# we can get selected features from alpha
# The intercept row is split off by position: filtering the whole column on
# `!= 0` and then dropping the first hit would discard a real feature whenever
# the intercept happens to be exactly 0.
coef_alpha = df_alpha_with_labels["alpha"].iloc[1:]
selected_ft_inf = ["intercept"] + coef_alpha[coef_alpha != 0].index.tolist()
assert selected_ft_inf[1:] == selected_ft.tolist()

## Perform prediction on test set

In [None]:
# derive log_geom for test
# Reuse exactly the feature columns - and their order - that the model was
# trained on instead of re-deriving them from `test`; a missing column then
# fails loudly with a KeyError rather than silently shifting coefficients.
ft_cols = x_train_val.columns.tolist()

x_test = test[ft_cols]
y_test = test[target]
# todo: read A
log_geom_test, nleaves = _preprocess_taxonomy_aggregation(x_test.values, A)

# apply model to test
# todo: read alpha
# alpha[0] is the intercept, alpha[1:] the coefficients per column of log_geom
y_test_pred = log_geom_test.dot(alpha[1:]) + alpha[0]