In [1]:
import numpy as np
import anndata
from scipy import sparse

In [2]:
basepath = 'data/tasic/'
adata_tasic = anndata.read_h5ad(f'{basepath}adata.h5ad')

  utils.warn_names_duplicates("var")


In [3]:
def simulate(adata,only_marker_genes_DE,read_umi_factor = 100, theta_umi = 100, theta_z = 1, seed = 42):

    cluster_ids = np.unique(adata.obs['clusters'])

    readcounts_sim = []
    umi_counts_sim = []
    clusterlabels_sim = []
    clustercolors_sim = []

    housekeeping_idx = ~adata.var['marker_idx']
    across_cluster_ps = np.array(adata.X.sum(axis=0) / adata.X.sum()).flatten()

    for cluster_i in cluster_ids:

        print(f'simulating cluster {cluster_i}..')

        cluster_idx = adata.obs['clusters'] == cluster_i
        ad_cl = adata[cluster_idx]

        ###get UMIs
        #get seqdepths per cell
        readcount_ns = np.array(ad_cl.X.sum(axis=1)).flatten()
        umi_ns = readcount_ns / read_umi_factor
        #get proportions per gene
        ps = np.array(ad_cl.X.sum(axis=0) / np.sum(readcount_ns)).flatten()
        
        if only_marker_genes_DE:
            #set ps to across-cluster ps for all non-marker genes
            ps[housekeeping_idx] = across_cluster_ps[housekeeping_idx]

        #sample UMIs from NB(n*p)
        mus = np.outer(umi_ns,ps)
        nb_p = theta_umi / (theta_umi + mus)
        np.random.seed(seed)
        umi_counts = np.random.negative_binomial(np.ones_like(nb_p)*theta_umi, nb_p)

        ###amplify UMIs to get readcounts
        #intialize readcounts
        readcounts=np.zeros(umi_counts.shape)
        #only nonzero counts need amplification observations 
        umi_nonzero_idx = umi_counts>0
        umi_nonzero_counts = umi_counts[umi_nonzero_idx]
        #index for which amplification sample(s) get to amplify which UMIs
        split_idx = np.cumsum(umi_nonzero_counts).astype(int)
        #one large sample instead of separate ones is more efficient (one sample per molecule observed)
        
        mean_z = read_umi_factor
        np.random.seed(seed)        
        zs = np.random.geometric(1/mean_z,size=sum(umi_nonzero_counts))

        #splitting up into separate groups of samples for each (gene-x-cell)-observation
        zs_per_cell_x_gene=np.split(zs,split_idx[:-1])
        #summing reads for each (gene-x-cell)-observation
        zs_sums =[sum(z) for z in zs_per_cell_x_gene]
        #mapping back to count matrix
        readcounts[umi_nonzero_idx]=zs_sums

        n_cells = sum(cluster_idx)
        clusterlabels_sim += [cluster_i]*n_cells
        clustercolors_sim += [adata.uns['clustercolors'][cluster_i]]*n_cells
        readcounts_sim += [readcounts]
        umi_counts_sim += [umi_counts]
    
    adata_sim = anndata.AnnData(X=sparse.csc_matrix(np.concatenate(readcounts_sim)))
    adata_sim.obs['clustercolor'] = clustercolors_sim
    adata_sim.obs['clusters'] = clusterlabels_sim
    adata_sim.layers['umis_sim'] = sparse.csc_matrix(np.concatenate(umi_counts_sim))
    adata_sim.uns['theta_umi']=theta_umi
    adata_sim.uns['theta_z']=theta_z
    adata_sim.uns['seed']=seed
    adata_sim.uns['read_umi_factor']=read_umi_factor
    adata_sim.var_names = adata.var_names.copy()
    adata_sim.var['marker_idx'] = adata.var['marker_idx'].copy()
    
    return adata_sim


In [4]:
theta_umi = 100
theta_z = 1

In [5]:
adata_sim = simulate(adata_tasic,theta_umi=theta_umi,theta_z=theta_z,only_marker_genes_DE=False)
adata_sim_only_marker_genes_DE = simulate(adata_tasic,theta_umi=theta_umi,theta_z=theta_z,only_marker_genes_DE=True)

simulating cluster 0..
simulating cluster 1..
simulating cluster 2..
simulating cluster 3..
simulating cluster 4..
simulating cluster 5..
simulating cluster 6..
simulating cluster 7..
simulating cluster 8..
simulating cluster 9..
simulating cluster 10..
simulating cluster 11..
simulating cluster 12..
simulating cluster 13..
simulating cluster 14..
simulating cluster 15..
simulating cluster 16..
simulating cluster 17..
simulating cluster 18..
simulating cluster 19..
simulating cluster 20..
simulating cluster 21..
simulating cluster 22..
simulating cluster 23..
simulating cluster 24..
simulating cluster 25..
simulating cluster 26..
simulating cluster 27..
simulating cluster 28..
simulating cluster 29..
simulating cluster 30..
simulating cluster 31..
simulating cluster 32..
simulating cluster 33..
simulating cluster 34..
simulating cluster 35..
simulating cluster 36..
simulating cluster 37..
simulating cluster 38..
simulating cluster 39..
simulating cluster 40..
simulating cluster 41..
si

tcmalloc: large alloc 8152080384 bytes == 0x4505e4000 @ 
tcmalloc: large alloc 1985470464 bytes == 0x63b02e000 @ 
  adata_sim = anndata.AnnData(X=sparse.csc_matrix(np.concatenate(readcounts_sim)))
tcmalloc: large alloc 8152080384 bytes == 0x4505e4000 @ 
tcmalloc: large alloc 1985470464 bytes == 0x72832c000 @ 


simulating cluster 0..
simulating cluster 1..
simulating cluster 2..
simulating cluster 3..
simulating cluster 4..
simulating cluster 5..
simulating cluster 6..
simulating cluster 7..
simulating cluster 8..
simulating cluster 9..
simulating cluster 10..
simulating cluster 11..
simulating cluster 12..
simulating cluster 13..
simulating cluster 14..
simulating cluster 15..
simulating cluster 16..
simulating cluster 17..
simulating cluster 18..
simulating cluster 19..
simulating cluster 20..
simulating cluster 21..
simulating cluster 22..
simulating cluster 23..
simulating cluster 24..
simulating cluster 25..
simulating cluster 26..
simulating cluster 27..
simulating cluster 28..
simulating cluster 29..
simulating cluster 30..
simulating cluster 31..
simulating cluster 32..
simulating cluster 33..
simulating cluster 34..
simulating cluster 35..
simulating cluster 36..
simulating cluster 37..
simulating cluster 38..
simulating cluster 39..
simulating cluster 40..
simulating cluster 41..
si

tcmalloc: large alloc 8152080384 bytes == 0x3e4c00000 @ 
tcmalloc: large alloc 2127740928 bytes == 0x79ddc8000 @ 
  adata_sim = anndata.AnnData(X=sparse.csc_matrix(np.concatenate(readcounts_sim)))
tcmalloc: large alloc 8152080384 bytes == 0x3e4c00000 @ 


In [6]:
adata_sim.write_h5ad(f'data/tasic/simulations/adata_sim_thetaUMI_{theta_umi}_GeomZ.h5ad')
adata_sim_only_marker_genes_DE.write_h5ad(f'data/tasic/simulations/adata_sim_only_marker_genes_DE_thetaUMI_{theta_umi}_GeomZ.h5ad')