# Generating benchmark data with a high effect

## p=30

In [1]:
import pandas as pd
import toytree as tt
import numpy as np
import anndata as ad
import os
import toyplot as tp
import toyplot.svg

import benchmarks.scripts.tree_data_generation as tgen

In [2]:
# --- Tree and effect configuration -------------------------------------
d = 5                                # tree depth
effect_sizes = [0.3, 0.5, 0.7, 0.9]  # one batch of datasets is generated per effect size
num_effects = 3                      # number of effects
a_abs = 2                            # baseline parameter scale

# --- Sampling configuration --------------------------------------------
N = 10_000          # sampling depth
theta = 499         # dispersion
num_samples = [10]  # samples per group
reps = 20           # repetitions per parameter combination

# --- Bookkeeping -------------------------------------------------------
# Running counter over all generated datasets. The name shadows the builtin
# `id`, but the other cells of this notebook refer to it under this name.
id = 0

# Output directory, resolved relative to the current working directory.
dataset_path = os.path.abspath(
    "../../../tascCODA_data/benchmarks/high_effect/datasets/"
)
print(dataset_path)

/Users/johannes.ostner/Documents/PhD/tascCODA/tascCODA_data/benchmarks/high_effect/datasets


In [3]:
# Settings for this section: p leaves, and the dataset counter restarts at 0.
p = 30
id = 0

# Every cell seeds NumPy's global RNG so that it is reproducible on its own.
# The seed has to be set before the tree is generated.
np.random.seed(523)
newick = tgen.generate_tree_levels(p, d)

# Parse the Newick string and show the tree with each node labelled by its idx;
# those idx values are what the later cells use to pick effect nodes.
tree = tt.tree(newick)
tree.draw(
    node_labels='idx',
    node_sizes=10,
    tip_labels_align=True,
)

(<toyplot.canvas.Canvas at 0x7fc57a818b80>,
 <toyplot.coordinates.Cartesian at 0x7fc57cb37850>,
 <toytree.Render.ToytreeMark at 0x7fc57cb49760>)

In [4]:
np.random.seed(76)

# Ground truth for this setting: node 39 carries the effect, and leaves 0..26
# are the leaves marked as affected.
effect_nodes = [39]
effect_leaves = np.arange(0, 27, 1)

print(f"nodes: {effect_nodes}")
print(f"leaves: {effect_leaves}")

# idx of every node (root and tips included), in the order toytree reports them.
node_idx = tree.get_node_values("idx", 1, 1)
# The leaf with the highest index is highlighted as the reference leaf.
ref_leaf = p - 1

# Tip label colours, built from the last p entries of the idx listing and then
# reversed: red = affected leaf, blue = reference leaf, black = everything else.
tlc = []
for leaf in node_idx[-p:]:
    leaf = int(leaf)
    if leaf in effect_leaves:
        tlc.append("red")
    elif leaf == ref_leaf:
        tlc.append("blue")
    else:
        tlc.append("black")
tlc = tlc[::-1]

# Nodes on the reference leaf's lineage: all of its ancestors except the last
# one returned (presumably the root - TODO confirm), plus the leaf itself.
ref_nodes = [ancestor.idx for ancestor in tree.idx_dict[ref_leaf].get_ancestors()]
ref_nodes = ref_nodes[:-1]
ref_nodes.append(ref_leaf)

# Node fill colours: effect nodes, then reference lineage, then the rest.
node_colors = []
for idx in node_idx:
    if idx in effect_nodes:
        node_colors.append("lightcoral")
    elif idx in ref_nodes:
        node_colors.append("lightblue")
    else:
        node_colors.append("lightgrey")

canvas = tp.Canvas(width=800, height=1600)
ax0 = canvas.cartesian(bounds=(0, 700, 0, 1600), padding=0)
tree.draw(
    axes=ax0,
    width=700,
    height=1600,
    node_sizes=[20] * len(tree.get_node_values("name", 1, 1)),
    node_labels=list(node_idx),
    node_labels_style={"font-size": 10},
    node_colors=node_colors,
    node_style={"stroke": "black"},
    tip_labels="name",
    tip_labels_colors=tlc,
)
# To export the figure:
# tp.svg.render(canvas, "./plots/benchmark_tree_30_high_effect.svg")

nodes: [39]
leaves: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26]


(None,
 <toyplot.coordinates.Cartesian at 0x7fc57cb7f4f0>,
 <toytree.Render.ToytreeMark at 0x7fc57cb7f2e0>)

In [5]:
# Build one AnnData object per (effect size, samples per group, repetition)
# combination. `id` counts the datasets; note that the name shadows the builtin.
id = 0

# A single seed for the whole cell: the order of the RNG-consuming calls below
# determines the generated data, so do not reorder them.
np.random.seed(5678)
for e in effect_sizes:
    for n in num_samples:
        for r in range(reps):

            # Presumably mu_0 / mu_1 are the parameter vectors without / with the
            # effect applied - TODO confirm against tgen.generate_mu.
            mu_0, mu_1 = tgen.generate_mu(
                a_abs=a_abs,
                num_leaves=p,
                effect_nodes=effect_nodes,
                effect_leaves=effect_leaves,
                effect_size=e,
                newick=newick
            )

            # Covariate column x_0: n zeros followed by n ones.
            X = pd.DataFrame({"x_0": np.repeat([0,1], n)})

            # One row per sample and one column per leaf. Rows 0..n-1 are drawn
            # with mu_0 and rows n..2n-1 with mu_1, matching the order of x_0.
            Y = np.zeros((n*2, p))
            for i in range(n):

                Y[i, :] = tgen.generate_one_sample(N, mu_0 , theta)
                Y[i+n, :] = tgen.generate_one_sample(N, mu_1, theta)

            # x_0 is cast from int to float; Y is already float64 (np.zeros default).
            X = X.astype(np.float64)
            Y = Y.astype(np.float64)

            # Sampled values go in as the data matrix, the covariate as obs, and
            # the settings that produced this dataset are stored in uns.
            test_data = ad.AnnData(
                X=Y,
                obs=X,
                uns={
                    "tree_newick": newick,
                    "effect_nodes": effect_nodes,
                    "effect_leaves": effect_leaves,
                    "effect_size": e,
                    "num_samples": n,
                }
            )

            # NOTE(review): the write is disabled, so as written this cell builds
            # every dataset and discards it; only the last one stays in test_data.
            # test_data.write_h5ad(dataset_path + f"/data_{id}")
            id += 1

