# Generate koblan25 TreeData

Generate the tumor and barcoding TreeData objects from the 4T1 data in [High-resolution spatial mapping of cell state and lineage dynamics in vivo with PEtracer](https://www.biorxiv.org/content/10.1101/2025.06.15.659774v1).

## Setup

In [47]:
from pathlib import Path

import scanpy as sc
import scipy as sp
import treedata as td

# Root folder of the koblan25 dataset; input files are read from its "data"
# subfolder and the generated .h5td files are written next to it.
path = Path("/lab/solexa_weissman/wcolgan/pycea/datasets/koblan25/")
data_path = path.joinpath("data")

# Enable IPython's autoreload extension in mode 2 so that edited modules are
# re-imported before each cell executes.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Tumor data

In [48]:
# Build the tumor TreeData for one mouse: X is replaced by the "counts" layer,
# every layer is dropped, and only the "1_collapsed" tree is kept as "tree".
mouse = 3
tdata = td.read_h5td(data_path / f"M{mouse}_tumor_tracing.h5td")

# Store the "counts" layer in X as a CSR matrix before the layers are removed.
tdata.X = sp.sparse.csr_matrix(tdata.layers["counts"])
for layer_name in tuple(tdata.layers):
    # Iterating over a snapshot of the keys keeps deletion safe.
    del tdata.layers[layer_name]

# Expose the collapsed tree under "tree", then remove the original entries.
tdata.obst["tree"] = tdata.obst["1_collapsed"]
for stale_key in ("1", "1_collapsed"):
    del tdata.obst[stale_key]

tdata.write_h5td(path / "koblan25_tumor.h5td", overwrite=True)

## Barcoding data

In [None]:
# Build the barcoding TreeData for one clone: genes are selected on
# normalized, log-transformed values, but the original X is what gets saved.
clone = 4
tdata = td.read_h5td(data_path / f"barcoded_tracing_clone_{clone}.h5td")

# Stash the original X so it can be restored once gene selection is done.
tdata.layers["counts"] = tdata.X.copy()

# Normalize each observation to a total of 1e4 and log1p-transform in place.
sc.pp.normalize_total(tdata, target_sum=1e4)
sc.pp.log1p(tdata)

# Drop genes below the 10%-of-observations threshold, then keep the top 2000
# highly variable genes.
min_cells_required = 0.1 * tdata.n_obs
sc.pp.filter_genes(tdata, min_cells=min_cells_required)
sc.pp.highly_variable_genes(tdata, n_top_genes=2000, subset=True)

# Restore the stashed values into X and remove the temporary layer.
tdata.X = tdata.layers["counts"].copy()
del tdata.layers["counts"]
tdata.write_h5td(path / "koblan25_barcoding.h5td", overwrite=True)

In [45]:
# Display the summary of the TreeData currently bound to `tdata`.
tdata

TreeData object with n_obs × n_vars = 3108 × 2000
    obs: 'cellBC', 'sample', 'clone', 'type', 'puro', 'blast', 'tree', 'puro_clade', 'combined_clade', 'blast_clade', 'fitness'
    var: 'mean_counts', 'total_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg'
    obsm: 'blast_counts', 'characters', 'puro_counts'
    obst: 'tree'