# Generating benchmark data

## p=10

In [21]:
import pandas as pd
import toytree as tt
import numpy as np
import anndata as ad
import os
import toyplot as tp
import toyplot.svg

import tree_aggregation.tree_data_generation as tgen


In [3]:
# --- Tree and effect settings -------------------------------------------
d = 5                                # tree depth
num_effects = 3                      # number of effects
effect_sizes = [0.3, 0.5, 0.7, 0.9]  # effect sizes iterated over by the generation loops
a_abs = 2                            # baseline parameter scale

# --- Sampling settings --------------------------------------------------
N = 10_000                           # sampling depth
theta = 499                          # dispersion
num_samples = [5, 10, 30, 50]        # samples per group
reps = 20                            # replicates per (effect size, samples per group) pair

# --- Output bookkeeping -------------------------------------------------
# Running counter through all datasets; it is used as the suffix of each
# written file name. NOTE: the name shadows the builtin `id`, but later
# cells rely on it, so it is kept.
id = 0
dataset_path = os.path.abspath("../../benchmarking_2/benchmark_0808/data/datasets2")
print(dataset_path)

/Users/johannes.ostner/Documents/PhD/tree_aggregation/benchmarking_2/benchmark_0808/data/datasets2


In [104]:
# Want everything to be reproducible - set a seed at every block
# (the seed is applied to NumPy's global RNG right before the tree is generated).
np.random.seed(521)
# p is later passed as `num_leaves` to the tgen helpers and used as the
# number of columns of the generated count matrix.
p = 10

# Tree description produced from p and the tree depth d; it is stored under
# the "tree_newick" key of every dataset written for this section.
newick = tgen.generate_tree_levels(p, d)

# Parse the tree with toytree. The draw call is the last expression of the
# cell, so the notebook displays the objects it returns.
tree = tt.tree(newick)
tree.draw(tip_labels_align=True, node_sizes=10, node_labels='idx')

(<toyplot.canvas.Canvas at 0x7f834396ab20>,
 <toyplot.coordinates.Cartesian at 0x7f8343b5b760>,
 <toytree.Render.ToytreeMark at 0x7f83442f9dc0>)

In [105]:
# Select the effect nodes/leaves under a fixed seed so the choice is reproducible.
np.random.seed(56847)
effect_nodes, effect_leaves = tgen.get_effect_nodes(newick, num_effects=num_effects, num_leaves=p)

print(f"nodes: {effect_nodes}")
print(f"leaves: {effect_leaves}")

# Node indices as reported by toytree with both positional flags enabled
# (presumably show_root / show_tips - confirm against the toytree docs).
idx_values = tree.get_node_values("idx", 1, 1)

# Tip label colors, built from the last p entries in reversed order:
# red for effect leaves, blue for leaf p-1, black for everything else.
tlc = [
    "red" if int(leaf) in effect_leaves
    else "blue" if int(leaf) == p - 1
    else "black"
    for leaf in idx_values[-p:][::-1]
]

# Reference path: the ancestors of leaf p-1 without the last entry of the
# ancestor list (presumably the root), followed by leaf p-1 itself.
ref_nodes = [ancestor.idx for ancestor in tree.idx_dict[p - 1].get_ancestors()][:-1] + [p - 1]

canvas = tp.Canvas(width=800, height=1600)
ax0 = canvas.cartesian(bounds=(0, 700, 0, 1600), padding=0)
tree.draw(
    axes=ax0,
    width=700,
    height=1600,
    node_sizes=[20] * len(tree.get_node_values("name", 1, 1)),
    node_labels=list(idx_values),
    node_labels_style={"font-size": 10},
    # Effect nodes in coral, reference path in light blue, the rest grey.
    node_colors=[
        "lightcoral" if node in effect_nodes
        else "lightblue" if node in ref_nodes
        else "lightgrey"
        for node in idx_values
    ],
    node_style={"stroke": "black"},
    tip_labels="name",
    tip_labels_colors=tlc,
)
tp.svg.render(canvas, "../../paper_stuff/benchmark_tree_10.svg")

effect_nodes: [12, 4, 0]
effect_leaves: [8, 0, 4, 7]
nodes: [12, 4, 0]
leaves: [8, 0, 4, 7]


In [5]:
# Fixed seed for the whole generation loop. The order of the calls into tgen
# below is kept exactly as it was so the random stream is unchanged.
np.random.seed(1234)

# One dataset per (effect size, samples per group, replicate) combination;
# `id` is the running index used in the output file name.
for effect_size in effect_sizes:
    for n_group in num_samples:
        for _rep in range(reps):
            # Parameter vectors for the two groups (mu_0 is used for the
            # first n_group rows, mu_1 for the last n_group rows).
            mu_0, mu_1 = tgen.generate_mu(
                a_abs=a_abs,
                num_leaves=p,
                effect_nodes=effect_nodes,
                effect_leaves=effect_leaves,
                effect_size=effect_size,
                newick=newick,
            )

            # Group indicator: n_group zeros followed by n_group ones, as float64.
            X = pd.DataFrame({"x_0": np.repeat([0, 1], n_group)}).astype(np.float64)

            # Count matrix, filled from both ends: row k gets a mu_0 sample and
            # the k-th row from the bottom gets a mu_1 sample, alternating.
            Y = np.zeros((2 * n_group, p), dtype=np.float64)
            for k in range(n_group):
                Y[k, :] = tgen.generate_one_sample(N, mu_0, theta)
                Y[2 * n_group - 1 - k, :] = tgen.generate_one_sample(N, mu_1, theta)

            test_data = ad.AnnData(
                X=Y,
                obs=X,
                uns={
                    "tree_newick": newick,
                    "effect_nodes": effect_nodes,
                    "effect_leaves": effect_leaves,
                    "effect_size": effect_size,
                    "num_samples": n_group,
                },
            )

            test_data.write_h5ad(dataset_path + f"/data_{id}")
            id += 1



## p=30

In [106]:
# Want everything to be reproducible - set a seed at every block
np.random.seed(96)
# p is later passed as `num_leaves` to the tgen helpers and used as the
# number of columns of the generated count matrix.
p = 30
# Each section writes len(effect_sizes) * len(num_samples) * reps datasets
# (4 * 4 * 20 = 320 with the current grid). This is the second section, so
# its file numbering starts after one full section. Deriving the offset
# keeps the numbering gap-free and non-overlapping if the grid changes.
id = 1 * len(effect_sizes) * len(num_samples) * reps

# Tree description produced from p and the tree depth d; it is stored under
# the "tree_newick" key of every dataset written for this section.
newick = tgen.generate_tree_levels(p, d)

# Parse the tree with toytree. The draw call is the last expression of the
# cell, so the notebook displays the objects it returns.
tree = tt.tree(newick)
tree.draw(tip_labels_align=True, node_sizes=10, node_labels='idx')

(<toyplot.canvas.Canvas at 0x7f83445eee20>,
 <toyplot.coordinates.Cartesian at 0x7f8343943760>,
 <toytree.Render.ToytreeMark at 0x7f83442e0e50>)

In [107]:
# Select the effect nodes/leaves under a fixed seed so the choice is reproducible.
np.random.seed(76)
effect_nodes, effect_leaves = tgen.get_effect_nodes(newick, num_effects=num_effects, num_leaves=p)

print(f"nodes: {effect_nodes}")
print(f"leaves: {effect_leaves}")

# Node indices as reported by toytree with both positional flags enabled
# (presumably show_root / show_tips - confirm against the toytree docs).
idx_values = tree.get_node_values("idx", 1, 1)

# Tip label colors, built from the last p entries in reversed order:
# red for effect leaves, blue for leaf p-1, black for everything else.
tlc = [
    "red" if int(leaf) in effect_leaves
    else "blue" if int(leaf) == p - 1
    else "black"
    for leaf in idx_values[-p:][::-1]
]

# Reference path: the ancestors of leaf p-1 without the last entry of the
# ancestor list (presumably the root), followed by leaf p-1 itself.
ref_nodes = [ancestor.idx for ancestor in tree.idx_dict[p - 1].get_ancestors()][:-1] + [p - 1]

canvas = tp.Canvas(width=800, height=1600)
ax0 = canvas.cartesian(bounds=(0, 700, 0, 1600), padding=0)
tree.draw(
    axes=ax0,
    width=700,
    height=1600,
    node_sizes=[20] * len(tree.get_node_values("name", 1, 1)),
    node_labels=list(idx_values),
    node_labels_style={"font-size": 10},
    # Effect nodes in coral, reference path in light blue, the rest grey.
    node_colors=[
        "lightcoral" if node in effect_nodes
        else "lightblue" if node in ref_nodes
        else "lightgrey"
        for node in idx_values
    ],
    node_style={"stroke": "black"},
    tip_labels="name",
    tip_labels_colors=tlc,
)
tp.svg.render(canvas, "../../paper_stuff/benchmark_tree_30.svg")

effect_nodes: [33, 7, 0]
effect_leaves: [0, 7, 13, 14, 15, 16]
nodes: [33, 7, 0]
leaves: [0, 7, 13, 14, 15, 16]


In [8]:
# Fixed seed for the whole generation loop. The order of the calls into tgen
# below is kept exactly as it was so the random stream is unchanged.
np.random.seed(1234)

# One dataset per (effect size, samples per group, replicate) combination;
# `id` is the running index used in the output file name.
for effect_size in effect_sizes:
    for n_group in num_samples:
        for _rep in range(reps):
            # Parameter vectors for the two groups (mu_0 is used for the
            # first n_group rows, mu_1 for the last n_group rows).
            mu_0, mu_1 = tgen.generate_mu(
                a_abs=a_abs,
                num_leaves=p,
                effect_nodes=effect_nodes,
                effect_leaves=effect_leaves,
                effect_size=effect_size,
                newick=newick,
            )

            # Group indicator: n_group zeros followed by n_group ones, as float64.
            X = pd.DataFrame({"x_0": np.repeat([0, 1], n_group)}).astype(np.float64)

            # Count matrix, filled from both ends: row k gets a mu_0 sample and
            # the k-th row from the bottom gets a mu_1 sample, alternating.
            Y = np.zeros((2 * n_group, p), dtype=np.float64)
            for k in range(n_group):
                Y[k, :] = tgen.generate_one_sample(N, mu_0, theta)
                Y[2 * n_group - 1 - k, :] = tgen.generate_one_sample(N, mu_1, theta)

            test_data = ad.AnnData(
                X=Y,
                obs=X,
                uns={
                    "tree_newick": newick,
                    "effect_nodes": effect_nodes,
                    "effect_leaves": effect_leaves,
                    "effect_size": effect_size,
                    "num_samples": n_group,
                },
            )

            test_data.write_h5ad(dataset_path + f"/data_{id}")
            id += 1


## p=50

In [108]:
# Want everything to be reproducible - set a seed at every block
np.random.seed(96)
# p is later passed as `num_leaves` to the tgen helpers and used as the
# number of columns of the generated count matrix.
p = 50
# Each section writes len(effect_sizes) * len(num_samples) * reps datasets
# (4 * 4 * 20 = 320 with the current grid). This is the third section, so
# its file numbering starts after two full sections (640 with the current
# grid). Deriving the offset keeps the numbering gap-free and
# non-overlapping if the grid changes.
id = 2 * len(effect_sizes) * len(num_samples) * reps

# Tree description produced from p and the tree depth d; it is stored under
# the "tree_newick" key of every dataset written for this section.
newick = tgen.generate_tree_levels(p, d)

# Parse the tree with toytree. The draw call is the last expression of the
# cell, so the notebook displays the objects it returns.
tree = tt.tree(newick)
tree.draw(tip_labels_align=True, node_sizes=10, node_labels='idx')

(<toyplot.canvas.Canvas at 0x7f83442d6b20>,
 <toyplot.coordinates.Cartesian at 0x7f8343bdb5e0>,
 <toytree.Render.ToytreeMark at 0x7f83445eeb20>)

In [109]:
# Select the effect nodes/leaves under a fixed seed so the choice is reproducible.
np.random.seed(4657)
effect_nodes, effect_leaves = tgen.get_effect_nodes(newick, num_effects=num_effects, num_leaves=p)

print(f"nodes: {effect_nodes}")
print(f"leaves: {effect_leaves}")

# Node indices as reported by toytree with both positional flags enabled
# (presumably show_root / show_tips - confirm against the toytree docs).
idx_values = tree.get_node_values("idx", 1, 1)

# Tip label colors, built from the last p entries in reversed order:
# red for effect leaves, blue for leaf p-1, black for everything else.
tlc = [
    "red" if int(leaf) in effect_leaves
    else "blue" if int(leaf) == p - 1
    else "black"
    for leaf in idx_values[-p:][::-1]
]

# Reference path: the ancestors of leaf p-1 without the last entry of the
# ancestor list (presumably the root), followed by leaf p-1 itself.
ref_nodes = [ancestor.idx for ancestor in tree.idx_dict[p - 1].get_ancestors()][:-1] + [p - 1]

canvas = tp.Canvas(width=800, height=1600)
ax0 = canvas.cartesian(bounds=(0, 700, 0, 1600), padding=0)
tree.draw(
    axes=ax0,
    width=700,
    height=1600,
    node_sizes=[20] * len(tree.get_node_values("name", 1, 1)),
    node_labels=list(idx_values),
    node_labels_style={"font-size": 10},
    # Effect nodes in coral, reference path in light blue, the rest grey.
    node_colors=[
        "lightcoral" if node in effect_nodes
        else "lightblue" if node in ref_nodes
        else "lightgrey"
        for node in idx_values
    ],
    node_style={"stroke": "black"},
    tip_labels="name",
    tip_labels_colors=tlc,
)
tp.svg.render(canvas, "../../paper_stuff/benchmark_tree_50.svg")

effect_nodes: [61, 40, 5]
effect_leaves: [5, 40, 44, 45, 46, 47]
nodes: [61, 40, 5]
leaves: [5, 40, 44, 45, 46, 47]


In [11]:
# Fixed seed for the whole generation loop. The order of the calls into tgen
# below is kept exactly as it was so the random stream is unchanged.
np.random.seed(1234)

# One dataset per (effect size, samples per group, replicate) combination;
# `id` is the running index used in the output file name.
for effect_size in effect_sizes:
    for n_group in num_samples:
        for _rep in range(reps):
            # Parameter vectors for the two groups (mu_0 is used for the
            # first n_group rows, mu_1 for the last n_group rows).
            mu_0, mu_1 = tgen.generate_mu(
                a_abs=a_abs,
                num_leaves=p,
                effect_nodes=effect_nodes,
                effect_leaves=effect_leaves,
                effect_size=effect_size,
                newick=newick,
            )

            # Group indicator: n_group zeros followed by n_group ones, as float64.
            X = pd.DataFrame({"x_0": np.repeat([0, 1], n_group)}).astype(np.float64)

            # Count matrix, filled from both ends: row k gets a mu_0 sample and
            # the k-th row from the bottom gets a mu_1 sample, alternating.
            Y = np.zeros((2 * n_group, p), dtype=np.float64)
            for k in range(n_group):
                Y[k, :] = tgen.generate_one_sample(N, mu_0, theta)
                Y[2 * n_group - 1 - k, :] = tgen.generate_one_sample(N, mu_1, theta)

            test_data = ad.AnnData(
                X=Y,
                obs=X,
                uns={
                    "tree_newick": newick,
                    "effect_nodes": effect_nodes,
                    "effect_leaves": effect_leaves,
                    "effect_size": effect_size,
                    "num_samples": n_group,
                },
            )

            test_data.write_h5ad(dataset_path + f"/data_{id}")
            id += 1

## p=100

In [110]:
# Want everything to be reproducible - set a seed at every block
np.random.seed(96)
# p is later passed as `num_leaves` to the tgen helpers and used as the
# number of columns of the generated count matrix.
p = 100
# Each section writes len(effect_sizes) * len(num_samples) * reps datasets
# (4 * 4 * 20 = 320 with the current grid). This is the fourth section, so
# its file numbering starts after three full sections (960 with the current
# grid). Deriving the offset keeps the numbering gap-free and
# non-overlapping if the grid changes.
id = 3 * len(effect_sizes) * len(num_samples) * reps

# Tree description produced from p and the tree depth d; it is stored under
# the "tree_newick" key of every dataset written for this section.
newick = tgen.generate_tree_levels(p, d)

# Parse the tree with toytree. The draw call is the last expression of the
# cell, so the notebook displays the objects it returns.
tree = tt.tree(newick)
tree.draw(tip_labels_align=True, node_sizes=10, node_labels='idx')

(<toyplot.canvas.Canvas at 0x7f83439ba760>,
 <toyplot.coordinates.Cartesian at 0x7f83439435e0>,
 <toytree.Render.ToytreeMark at 0x7f8343ca9d30>)

In [111]:
# Select the effect nodes/leaves under a fixed seed so the choice is reproducible.
np.random.seed(6543)
effect_nodes, effect_leaves = tgen.get_effect_nodes(newick, num_effects=num_effects, num_leaves=p)

print(f"nodes: {effect_nodes}")
print(f"leaves: {effect_leaves}")

# Node indices as reported by toytree with both positional flags enabled
# (presumably show_root / show_tips - confirm against the toytree docs).
idx_values = tree.get_node_values("idx", 1, 1)

# Tip label colors, built from the last p entries in reversed order:
# red for effect leaves, blue for leaf p-1, black for everything else.
tlc = [
    "red" if int(leaf) in effect_leaves
    else "blue" if int(leaf) == p - 1
    else "black"
    for leaf in idx_values[-p:][::-1]
]

# Reference path: the ancestors of leaf p-1 without the last entry of the
# ancestor list (presumably the root), followed by leaf p-1 itself.
ref_nodes = [ancestor.idx for ancestor in tree.idx_dict[p - 1].get_ancestors()][:-1] + [p - 1]

canvas = tp.Canvas(width=800, height=1600)
ax0 = canvas.cartesian(bounds=(0, 700, 0, 1600), padding=0)
tree.draw(
    axes=ax0,
    width=700,
    height=1600,
    # Smaller markers and labels than in the other sections' figures.
    node_sizes=[15] * len(tree.get_node_values("name", 1, 1)),
    node_labels=list(idx_values),
    node_labels_style={"font-size": 8},
    # Effect nodes in coral, reference path in light blue, the rest grey.
    node_colors=[
        "lightcoral" if node in effect_nodes
        else "lightblue" if node in ref_nodes
        else "lightgrey"
        for node in idx_values
    ],
    node_style={"stroke": "black"},
    tip_labels="name",
    tip_labels_colors=tlc,
)
tp.svg.render(canvas, "../../paper_stuff/benchmark_tree_100.svg")

effect_nodes: [113, 72, 28]
effect_leaves: [72, 55, 56, 57, 58, 28]
nodes: [113, 72, 28]
leaves: [72, 55, 56, 57, 58, 28]


In [14]:
# Fixed seed for the whole generation loop. The order of the calls into tgen
# below is kept exactly as it was so the random stream is unchanged.
np.random.seed(1234)

# One dataset per (effect size, samples per group, replicate) combination;
# `id` is the running index used in the output file name.
for effect_size in effect_sizes:
    for n_group in num_samples:
        for _rep in range(reps):
            # Parameter vectors for the two groups (mu_0 is used for the
            # first n_group rows, mu_1 for the last n_group rows).
            mu_0, mu_1 = tgen.generate_mu(
                a_abs=a_abs,
                num_leaves=p,
                effect_nodes=effect_nodes,
                effect_leaves=effect_leaves,
                effect_size=effect_size,
                newick=newick,
            )

            # Group indicator: n_group zeros followed by n_group ones, as float64.
            X = pd.DataFrame({"x_0": np.repeat([0, 1], n_group)}).astype(np.float64)

            # Count matrix, filled from both ends: row k gets a mu_0 sample and
            # the k-th row from the bottom gets a mu_1 sample, alternating.
            Y = np.zeros((2 * n_group, p), dtype=np.float64)
            for k in range(n_group):
                Y[k, :] = tgen.generate_one_sample(N, mu_0, theta)
                Y[2 * n_group - 1 - k, :] = tgen.generate_one_sample(N, mu_1, theta)

            test_data = ad.AnnData(
                X=Y,
                obs=X,
                uns={
                    "tree_newick": newick,
                    "effect_nodes": effect_nodes,
                    "effect_leaves": effect_leaves,
                    "effect_size": effect_size,
                    "num_samples": n_group,
                },
            )

            test_data.write_h5ad(dataset_path + f"/data_{id}")
            id += 1