# Generating benchmark data with 2 covariates

## p=30

In [6]:
import pandas as pd
import toytree as tt
import numpy as np
import anndata as ad
import os
import toyplot as tp
import toyplot.svg
import seaborn as sns

import benchmarks.scripts.tree_data_generation as tgen

In [7]:
# ---- Tree and effect settings ----
d = 5                                # tree depth
num_effects = 3                      # number of effects
effect_sizes = [0.3, 0.5, 0.7, 0.9]  # effect sizes to sweep over
a_abs = 2                            # baseline parameter scale

# ---- Sampling settings ----
N = 10000           # sampling depth
theta = 499         # dispersion
num_samples = [10]  # samples per group
reps = 10           # repetitions per parameter combination

# ---- Bookkeeping ----
# Running index over all generated datasets.
id = 0

# Output directory, resolved relative to the current working directory.
dataset_path = os.path.abspath(
    "../../../tascCODA_data/benchmarks/2_covariates/datasets/"
)
print(dataset_path)

/Users/johannes.ostner/Documents/PhD/tascCODA/tascCODA_data/benchmarks/2_covariates/datasets


In [8]:
# Reproducibility: numpy's global RNG is re-seeded at the top of every cell.
np.random.seed(96)

# p is passed as the leaf count to the tree helpers; the dataset counter restarts here.
p, id = 30, 0

# Build the tree as a newick string from p and the configured depth d,
# then wrap it in a toytree object for plotting and traversal.
newick = tgen.generate_tree_levels(p, d)
tree = tt.tree(newick)

# Quick look at the tree, with every node labelled by its toytree index.
tree.draw(
    tip_labels_align=True,
    node_sizes=10,
    node_labels='idx',
)

(<toyplot.canvas.Canvas at 0x7fb40b9f0640>,
 <toyplot.coordinates.Cartesian at 0x7fb40b9c8fa0>,
 <toytree.Render.ToytreeMark at 0x7fb40b98adf0>)

In [9]:
# Pick the nodes that carry an effect (and the leaves they map to) under a
# fixed seed, then draw the tree with effect and reference nodes highlighted.
np.random.seed(76)
effect_nodes, effect_leaves = tgen.get_effect_nodes(
    newick,
    num_effects=num_effects,
    num_leaves=p,
)

print(f"nodes: {effect_nodes}")
print(f"leaves: {effect_leaves}")

# Per-node index list in the order toytree expects for per-node style lists.
# Fetched once and reused instead of re-traversing the tree for every argument.
node_idxs = tree.get_node_values("idx", 1, 1)

# Leaf with the highest index is highlighted as the reference.
ref_leaf = p - 1

# Tip label colours: red = effect leaf, blue = reference leaf, black = other.
# NOTE(review): assumes the last p entries of node_idxs are the leaves and
# that reversing them matches the tip-label order - confirm against toytree.
tlc = [
    "red" if int(i) in effect_leaves
    else "blue" if int(i) == ref_leaf
    else "black"
    for i in node_idxs[-p:]
]
tlc.reverse()

# Ancestors of the reference leaf, minus the last one returned (presumably the
# root - confirm), plus the leaf itself. The loop variable is deliberately not
# called `p`, so it cannot be confused with the leaf count used in the lookup.
ref_nodes = [anc.idx for anc in tree.idx_dict[ref_leaf].get_ancestors()][:-1]
ref_nodes.append(ref_leaf)

# Node fill: effect nodes take precedence over reference-lineage nodes.
node_colors = [
    "lightcoral" if i in effect_nodes
    else "lightblue" if i in ref_nodes
    else "lightgrey"
    for i in node_idxs
]

canvas = tp.Canvas(width=800, height=1600)
ax0 = canvas.cartesian(bounds=(0, 700, 0, 1600), padding=0)
tree.draw(
    # tip_labels=False,
    node_sizes=[20] * len(node_idxs),
    node_labels=list(node_idxs),
    node_colors=node_colors,
    node_labels_style={"font-size": 10},
    width=700,
    height=1600,
    node_style={"stroke": "black"},
    axes=ax0,
    tip_labels="name",
    tip_labels_colors=tlc,
)
# tp.svg.render(canvas, "./plots/benchmark_tree_30.svg")

effect_nodes: [33, 7, 0]
effect_leaves: [0, 7, 13, 14, 15, 16]
nodes: [33, 7, 0]
leaves: [0, 7, 13, 14, 15, 16]


(None,
 <toyplot.coordinates.Cartesian at 0x7fb40b999250>,
 <toytree.Render.ToytreeMark at 0x7fb40b997fa0>)

In [10]:
# Dataset counter for this section. NOTE: the name shadows the builtin `id`;
# it is kept because the other cells of this notebook use the same variable.
id = 0

# Second covariate x_1: coefficient 3 on leaves 13..23, 0 on all other leaves.
# NOTE(review): x1_nodes is not used in this cell; presumably it records the
# internal node spanning these leaves - confirm against the tree plot.
x1_nodes = [39]
x1_leaves = np.arange(13, 24, 1)
beta_1 = np.zeros(p)
beta_1[x1_leaves] = 3

np.random.seed(1234)
for e in effect_sizes:
    for n in num_samples:
        for r in range(reps):

            # Kept inside the innermost loop so the sequence of calls (and any
            # RNG use inside the helper) is the same as before.
            mu_0, mu_1 = tgen.generate_mu(
                a_abs=a_abs,
                num_leaves=p,
                effect_nodes=effect_nodes,
                effect_leaves=effect_leaves,
                effect_size=e,
                newick=newick
            )

            # Covariates: x_0 is the group indicator (first n rows 0, last n
            # rows 1); x_1 is drawn uniformly from [0, 1) for every sample.
            x_1 = np.random.uniform(0, 1, 2 * n)
            X = pd.DataFrame({"x_0": np.repeat([0, 1], n), "x_1": x_1})

            # Row i of log_shift is beta_1 * x_1[i], the per-leaf contribution
            # of x_1 on the log scale.
            log_shift = np.outer(x_1, beta_1)

            # Y = exp(log(mu) + beta_1 * x_1): mu_0 for the first n samples,
            # mu_1 for the last n. Same formula as a per-sample loop, but
            # evaluated for a whole group at once.
            # Assumes mu_0 / mu_1 are length-p array-likes - TODO confirm.
            Y = np.vstack([
                np.exp(np.log(np.asarray(mu_0)) + log_shift[:n]),
                np.exp(np.log(np.asarray(mu_1)) + log_shift[n:]),
            ])

            X = X.astype(np.float64)
            Y = Y.astype(np.float64)

            test_data = ad.AnnData(
                X=Y,
                obs=X,
                uns={
                    "tree_newick": newick,
                    "effect_nodes": effect_nodes,
                    "effect_leaves": effect_leaves,
                    "effect_size": e,
                    "num_samples": n,
                }
            )

            # test_data.write_h5ad(dataset_path + f"/data_{id}")
            id += 1



