Function which uses Poisson distribution to perturb each element in the row count matrix by 5% on average.

In [21]:
import scanpy as sc
import numpy as np

def subsample_dataset(adata, p):
    return sc.pp.subsample(adata, fraction=p, random_state=None, copy=True)

Loading, preprocessing and subsampling the dataset.

In [22]:
def remove_rare_cell_types(adata, labels, threshold):
    n = len(labels)

    # select 'good' labels - labels with at least 50 cells
    cnts = labels.value_counts()
    filtered = set(cnts[cnts >= 50])

    # filter the dataset - first we get the ids
    ids = [i for i in range(n) if labels.iloc[i] in filtered]

    # modify the dataset to remove rare cell types
    if len(ids) < n:
        adata = adata[ids, :].copy()

def get_subsampled_data(p=0.5):
    adata = sc.datasets.paul15()

    # preprocess
    remove_rare_cell_types(adata, adata.obs["paul15_clusters"], 50)
    sc.pp.filter_genes(adata, min_cells=10)
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)

    adata = subsample_dataset(adata, p)
    labels = adata.obs["paul15_clusters"]

    return adata.X, labels

Use SepSolve to select $m$ marker genes on the perturbed dataset.

In [23]:
# import sepsolve
import sys

sys.path.append('../sepsolve')
from sepsolve import get_markers

def get_markers_subsampling(num_markers):
    data, labels = get_subsampled_data(p = 0.5)
    markers = get_markers(data, labels, num_markers)
    return markers

Function which computes the stability index for the given collection of marker genes using Jaccard similarity.

In [24]:
def calculate_stability(marker_collection):
    stability = 0
    inters = set([])
    n = len(marker_collection)
    
    for i in range(n):
        for j in range(i + 1, n):
            inters = marker_collection[i].intersection(marker_collection[j])
            union = marker_collection[i].union(marker_collection[j])
            stability += len(inters) / len(union)
    
    stability *= 2 / (n * (n - 1))

    return stability


Run the perturbation test $k = 5$ times and calculate the perturbation stability index.

In [25]:
k = 5
num_markers = 25

marker_collection = []
for i in range(k):
    print("Running subsampling test", i + 1, flush=True)

    # get markers on the randomly perturbed dataset
    markers = get_markers_subsampling(num_markers)
    marker_collection.append(set(markers))

stability = calculate_stability(marker_collection)

print("Subsampling stability index:", stability)

Running subsampling test 1
Running subsampling test 2
Running subsampling test 3
Running subsampling test 4
Running subsampling test 5
Subsampling stability index: 0.7629152456738664
