# Generate yang22 TreeData

Generate a TreeData object using the KP data from [Lineage tracing reveals the phylodynamics, plasticity, and paths of tumor evolution](https://doi.org/10.1016/j.cell.2022.04.015).

## Setup

In [1]:
import os
from pathlib import Path

import ete3
import networkx as nx
import pandas as pd
import scanpy as sc
import treedata as td

import pycea as py

path = Path("/lab/solexa_weissman/wcolgan/pycea/datasets/yang22/")
data_path = path / "data"

%load_ext autoreload
%autoreload 2

## Helper functions

In [2]:
def read_newick(path, cells):
    """Read a Newick file and return it as a directed graph restricted to ``cells``.

    The tree is parsed with ``ete3`` and pruned to the leaves whose names also
    appear in ``cells``. Every node whose name is the empty string is renamed
    ``node0``, ``node1``, ... in the order ``traverse()`` visits it.

    Parameters
    ----------
    path
        Location of the Newick file; converted with ``str`` before parsing.
    cells
        Iterable of leaf names to keep.

    Returns
    -------
    networkx.DiGraph
        One edge per non-root node, directed from its parent to the node. The
        root adds no edge of its own, so a tree consisting only of a root
        yields a graph without nodes.
    """
    parsed = ete3.Tree(str(path))
    keep = set(cells) & set(parsed.get_leaf_names())
    parsed.prune(keep)
    graph = nx.DiGraph()
    unnamed_seen = 0
    # NOTE(review): the parent's name is read when the child is visited, so this
    # relies on traverse() yielding parents before their children - confirm.
    for node in parsed.traverse():
        if node.name == "":
            node.name = f"node{unnamed_seen}"
            unnamed_seen += 1
        if not node.is_root():
            graph.add_edge(node.up.name, node.name)
    return graph

## Load data

Load expression data

In [194]:
# Processed expression matrix that the trees and per-cell annotations are attached to.
expression_path = data_path / "expression" / "adata_processed.nt.h5ad"
tdata = td.read_h5td(expression_path)

Load trees

In [195]:
# Per-tumor tables are collected in the loop and concatenated afterwards.
fitness = []
characters = []
trees_dir = data_path / "trees"
fitness_dir = data_path / "fitnesses"
for tumor in tdata.obs.Tumor.unique():
    tree_path = trees_dir / f"{tumor}_tree.nwk"
    # Tumors without a tree file contribute nothing (no tree, characters or fitness).
    if tree_path.exists():
        tree = read_newick(tree_path, tdata.obs_names)
        tdata.obst[tumor] = tree
        tumor_characters = pd.read_csv(trees_dir / f"{tumor}_character_matrix.txt", sep="\t", index_col=0)
        characters.append(tumor_characters)
        # Fitness files are optional even when a tree exists.
        fitness_path = fitness_dir / f"mean_fitness.{tumor}.txt"
        if fitness_path.exists():
            tumor_fitness = pd.read_csv(fitness_path, sep="\t", index_col=0)
            fitness.append(tumor_fitness)

# Keep only cells with a non-null `tree` entry in obs.
# NOTE(review): presumably treedata fills obs["tree"] when trees are assigned to obst - confirm.
in_tree = tdata.obs.tree.notnull()
tdata = tdata[in_tree].copy()

# Missing character values become "-", everything is stored as strings, and rows
# are put in the same order as the remaining cells.
characters = pd.concat(characters, axis=0).fillna("-").astype(str)
characters = characters.loc[tdata.obs_names].copy()
# Only the first 30 character columns are stored. TODO confirm why 30.
tdata.obsm["characters"] = characters.iloc[:, :30]

# pandas aligns these on the cell index when assigning.
# NOTE(review): assumes each fitness file holds a single value column - confirm.
tdata.obs["fitness"] = pd.concat(fitness, axis=0)
plasticity = pd.read_csv(data_path / "plasticity_scores.tsv", sep="\t", index_col=0)
tdata.obs["plasticity"] = plasticity["scPlasticity"]
py.pp.add_depth(tdata)

## Process data

Subset to the 2,000 most highly variable genes

In [196]:
# Keep the raw counts in their own layer.
tdata.layers["counts"] = tdata.raw.X.copy()
# Build the normalized, log-transformed layer used for gene selection.
# With `layer=`, scanpy transforms that layer and leaves `.X` untouched, so the
# result has to be taken from the layer itself. Reading `.X` from the returned
# copy (as was done before) stored the unmodified `.X` under "normalized" and
# silently skipped both transformations.
tdata.layers["normalized"] = tdata.layers["counts"].copy()
sc.pp.normalize_total(tdata, layer="normalized")
sc.pp.log1p(tdata, layer="normalized")
# Select the 2000 most variable genes on the normalized layer and drop the rest.
sc.pp.highly_variable_genes(tdata, n_top_genes=2000, subset=True, layer="normalized")

Calculate priors

In [197]:
# Count how often each character state occurs across all cells and characters.
state_counts = tdata.obsm["characters"].stack().value_counts()
# "-" and "0" are left out of the prior.
informative_counts = state_counts.loc[~state_counts.index.isin(["-", "0"])]
# Relative frequency of each remaining state, stored as {state: probability}.
priors = (informative_counts / informative_counts.sum()).to_dict()
tdata.uns["priors"] = priors

Clean up

In [198]:
# Standardize the obs column names and keep only the columns that ship with the dataset.
tdata.obs = tdata.obs.rename(
    columns={"Tumor": "tumor", "Cluster-Name": "cluster", "Lane": "lane", "Mouse": "mouse"}
)[
    [
        "tumor",
        "mouse",
        "lane",
        "fitness",
        "plasticity",
        "cluster",
        "tree",
    ]
].copy()
# Drop intermediate data that should not be saved.
del tdata.raw
del tdata.layers["normalized"]
del tdata.obsp["connectivities"]
del tdata.obsp["distances"]
# Clear leftover unstructured metadata but keep the character-state priors
# stored in uns["priors"]; resetting uns to an empty dict discarded them before saving.
tdata.uns = {"priors": tdata.uns["priors"]}

## Save TreeData

In [213]:
# Write the finished TreeData object next to the input data directory.
output_path = path / "yang22.h5td"
tdata.write_h5td(output_path)