Simulate an arbitrarily large single-cell object to test pyucell performance

In [1]:
import scanpy as sc
import matplotlib.pyplot as plt
from scipy import sparse
from scipy.stats import rankdata
import numpy as np
from scipy.sparse import csr_matrix, vstack
import pandas as pd
from anndata import AnnData

Install the development version of pyucell using:
%pip install git+https://github.com/carmonalab/pyucell.git@master

In [2]:
import pyucell

Load a test dataset

In [3]:
# Load scanpy's pbmc3k example dataset; it is the template that gets replicated below.
adata = sc.datasets.pbmc3k()

In [4]:
# Show the object summary (dimensions and annotation fields) as the cell output.
adata

AnnData object with n_obs × n_vars = 2700 × 32738
    var: 'gene_ids'

Generate an arbitrarily large object (500k cells here) by stacking copies of the test dataset, with the counts shuffled within each cell

In [18]:
# Build a large synthetic AnnData by stacking shuffled copies of `adata`.
#
# Each copy keeps every cell's sparsity pattern (which genes are non-zero) but
# randomly permutes the stored count values among those non-zero genes, so the
# replicated cells are not exact duplicates of one another.

SEED = 42  # fixed seed so the benchmark input is identical on every run


def _shuffle_counts_within_cells(matrix, rng):
    """Return a new CSR matrix with stored values permuted within each row.

    Parameters
    ----------
    matrix : scipy.sparse.csr_matrix
        Source matrix (rows are treated as cells). It is not modified.
    rng : numpy.random.Generator
        Source of randomness.

    Returns
    -------
    scipy.sparse.csr_matrix
        Same shape, dtype and sparsity pattern as `matrix`; only the order of
        the stored values inside each row differs.
    """
    stored_per_row = np.diff(matrix.indptr)
    row_of_value = np.repeat(np.arange(matrix.shape[0]), stored_per_row)
    # lexsort treats the LAST key as primary: values stay grouped by row and
    # are ordered by an independent random key inside each row, which yields a
    # uniform random permutation per row without a Python-level loop.
    order = np.lexsort((rng.random(row_of_value.size), row_of_value))
    return csr_matrix(
        (matrix.data[order], matrix.indices.copy(), matrix.indptr.copy()),
        shape=matrix.shape,
    )


# Extract the data matrix (sparse or dense) and normalise it to CSR
X = adata.X
if not isinstance(X, csr_matrix):
    X = csr_matrix(X)

n_target_cells = 500_000
n_per_batch = adata.n_obs
if n_per_batch == 0:
    raise ValueError("adata has no cells to replicate")
n_reps = int(np.ceil(n_target_cells / n_per_batch))

rng = np.random.default_rng(SEED)
sparse_batches = [_shuffle_counts_within_cells(X, rng) for _ in range(n_reps)]

# Stack batches vertically and trim the overshoot from rounding n_reps up
X_big = vstack(sparse_batches, format="csr")[:n_target_cells, :]

# Repeat obs metadata to match; ignore_index avoids duplicated cell barcodes
obs_big = pd.concat([adata.obs] * n_reps, ignore_index=True).iloc[:n_target_cells]
# Hand AnnData string obs names explicitly instead of an integer RangeIndex
obs_big.index = obs_big.index.astype(str)

# Create the new AnnData object
adata_big = AnnData(X=X_big, obs=obs_big, var=adata.var)

print(adata_big)

# Later cells read `adata`, so rebind it to the enlarged object.
# NOTE: because of this rebinding, re-running this cell without first
# re-running the data-loading cell uses the enlarged object (including any
# score columns added since) as the template.
adata = adata_big



AnnData object with n_obs × n_vars = 500000 × 32738
    obs: 'Tcell_UCell', 'Bcell_UCell', 'CD4T_UCell', 'CD8T_UCell'
    var: 'gene_ids'


Define two simple signatures to test

In [19]:
# Gene lists keyed by signature name; handed to the scoring calls via `signatures=`.
signatures = dict(
    Tcell=["CD3D", "CD3E", "CD2"],
    Bcell=["MS4A1", "CD79A", "CD79B"],
)


Run UCell!

In [20]:
# Baseline timing: chunk_size=500 with n_jobs left at its default; the result is rebound to `adata`.
%time adata = pyucell.compute_ucell_scores(adata, signatures=signatures, chunk_size=500)

CPU times: user 1min 59s, sys: 11.4 s, total: 2min 10s
Wall time: 2min 35s


In [None]:
# n_jobs sweep (chunk_size fixed at 500): n_jobs=16.
%time adata = pyucell.compute_ucell_scores(adata, signatures=signatures, chunk_size=500, n_jobs=16)

In [None]:
# n_jobs sweep (chunk_size fixed at 500): n_jobs=8.
%time adata = pyucell.compute_ucell_scores(adata, signatures=signatures, chunk_size=500, n_jobs=8)

In [None]:
# n_jobs sweep (chunk_size fixed at 500): n_jobs=4.
%time adata = pyucell.compute_ucell_scores(adata, signatures=signatures, chunk_size=500, n_jobs=4)

In [None]:
# n_jobs sweep (chunk_size fixed at 500): n_jobs=1, passed explicitly.
%time adata = pyucell.compute_ucell_scores(adata, signatures=signatures, chunk_size=500, n_jobs=1)

In [None]:
# chunk_size sweep (n_jobs fixed at 8): chunk_size=100.
%time adata = pyucell.compute_ucell_scores(adata, signatures=signatures, chunk_size=100, n_jobs=8)

In [None]:
# chunk_size sweep (n_jobs fixed at 8): chunk_size=1000.
%time adata = pyucell.compute_ucell_scores(adata, signatures=signatures, chunk_size=1000, n_jobs=8)

In [None]:
# chunk_size sweep (n_jobs fixed at 8): chunk_size=5000.
%time adata = pyucell.compute_ucell_scores(adata, signatures=signatures, chunk_size=5000, n_jobs=8)

In [None]:
# Same configuration (chunk_size=500, n_jobs=8) as an earlier timing cell in this notebook.
# NOTE(review): presumably a repeat measurement — confirm it is intentional.
%time adata = pyucell.compute_ucell_scores(adata, signatures=signatures, chunk_size=500, n_jobs=8)