Skip to content

Commit

Permalink
Merge pull request #71 from broadinstitute/sf_removebkg_v2.1
Browse files Browse the repository at this point in the history
remove-background version 0.2.0
  • Loading branch information
sjfleming committed Oct 16, 2020
2 parents 20bab46 + 63c6ff2 commit f93d67b
Show file tree
Hide file tree
Showing 27 changed files with 2,607 additions and 923 deletions.
8 changes: 4 additions & 4 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,8 @@ Citing CellBender
-----------------

If you use CellBender in your research (and we hope you will), please consider
citing `our paper <https://www.biorxiv.org/content/10.1101/791699v1>`_:
citing `our paper on bioRxiv <https://doi.org/10.1101/791699>`_.

Stephen J Fleming, John C Marioni, and Mehrtash Babadi. CellBender remove-background: a deep
generative model for unsupervised removal of background noise from scRNA-seq datasets.
bioRxiv 791699; doi: https://doi.org/10.1101/791699
Stephen J Fleming, John C Marioni, and Mehrtash Babadi. CellBender remove-background:
a deep generative model for unsupervised removal of background noise from scRNA-seq
datasets. bioRxiv 791699; doi: `https://doi.org/10.1101/791699 <https://doi.org/10.1101/791699>`_
8 changes: 8 additions & 0 deletions REQUIREMENTS-DOCKER.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
numpy
scipy
tables
pandas
pyro-ppl>=0.3.2
torch
scikit-learn
matplotlib
49 changes: 32 additions & 17 deletions cellbender/remove_background/argparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,9 @@ def add_subparser_args(subparsers: argparse) -> argparse:
"analyzed. The largest 'total_droplets' "
"droplets will have their cell "
"probabilities inferred as an output.")
subparser.add_argument("--model", nargs=None, type=str, default="full",
choices=["simple", "ambient",
"swapping", "full"],
subparser.add_argument("--model", nargs=None, type=str,
default="full",
choices=["simple", "ambient", "swapping", "full"],
dest="model",
help="Which model is being used for count data. "
" 'simple' does not model either ambient "
Expand Down Expand Up @@ -84,23 +84,21 @@ def add_subparser_args(subparsers: argparse) -> argparse:
"correct prior for empty droplet counts "
"in the rare case where empty counts "
"are extremely high (over 200).")
subparser.add_argument("--z-dim", type=int, default=20,
subparser.add_argument("--z-dim", type=int, default=100,
dest="z_dim",
help="Dimension of latent variable z.")
subparser.add_argument("--z-layers", nargs="+", type=int, default=[500],
dest="z_hidden_dims",
help="Dimension of hidden layers in the encoder "
"for z.")
subparser.add_argument("--d-layers", nargs="+", type=int,
default=[5, 2, 2],
dest="d_hidden_dims",
help="Dimension of hidden layers in the encoder "
"for d.")
subparser.add_argument("--p-layers", nargs="+", type=int,
default=[100, 10],
dest="p_hidden_dims",
help="Dimension of hidden layers in the encoder "
"for p.")
subparser.add_argument("--training-fraction",
type=float, nargs=None,
default=consts.TRAINING_FRACTION,
dest="training_fraction",
help="Training detail: the fraction of the "
"data used for training. The rest is never "
"seen by the inference algorithm. Speeds up "
"learning.")
subparser.add_argument("--empty-drop-training-fraction",
type=float, nargs=None,
default=consts.FRACTION_EMPTIES,
Expand All @@ -116,11 +114,28 @@ def add_subparser_args(subparsers: argparse) -> argparse:
"entirely. In the output count matrix, "
"the counts for these genes will be set "
"to zero.")
subparser.add_argument("--fpr", nargs="+",
type=float, default=[0.01],
dest="fpr",
help="Target false positive rate in (0, 1). A false "
"positive is a true signal count that is "
"erroneously removed. More background removal "
"is accompanied by more signal removal "
"at high values of FPR. You can specify "
"multiple values, which will create multiple "
"output files.")
subparser.add_argument("--exclude-antibody-capture",
dest="exclude_antibodies", action="store_true",
help="Including the flag --exclude-antibody-capture "
"will cause remove-background to operate on "
"gene counts only, ignoring other features.")
subparser.add_argument("--learning-rate", nargs=None,
type=float, default=1e-3,
type=float, default=1e-4,
dest="learning_rate",
help="Training detail: learning rate for "
"inference (probably "
help="Training detail: lower learning rate for "
"inference. A OneCycle learning rate schedule "
"is used, where the upper learning rate is ten "
"times this value. (For this value, probably "
"do not exceed 1e-3).")

return subparsers
29 changes: 20 additions & 9 deletions cellbender/remove_background/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,13 @@ def validate_args(self, args):
"fraction_empties must be between 0 and 1, exclusive. This is " \
"the fraction of each minibatch that is composed of empty droplets."

assert args.learning_rate < 0.1, "learning_rate must be < 0.1"
assert args.learning_rate > 0, "learning_rate must be > 0"
assert args.learning_rate < 0.1, "learning-rate must be < 0.1"
assert args.learning_rate > 0, "learning-rate must be > 0"

# Set training_fraction to consts.TRAINING_FRACTION (which is 1.).
args.training_fraction = consts.TRAINING_FRACTION
# args.training_fraction = consts.TRAINING_FRACTION
assert args.training_fraction > 0, "training-fraction must be > 0"
assert args.training_fraction <= 1., "training-fraction must be <= 1"

# If cuda is requested, make sure it is available.
if args.use_cuda:
Expand All @@ -74,13 +76,22 @@ def validate_args(self, args):
"significant speed-ups.\n\n")
sys.stdout.flush() # Write immediately

# Ensure all network layer dimensions are positive.
for n in args.z_hidden_dims:
assert n > 0, "--z-layers must be all positive integers."

# Ensure that z_hidden_dims are in encoder order.
# (The same dimensions are used in reverse order for the decoder.)
args.z_hidden_dims = sorted(args.z_hidden_dims, reverse=True)

# Set use_jit to False.
args.use_jit = False

# Ensure false positive rate is between zero and one.
for fpr in args.fpr:
assert (fpr > 0.) and (fpr < 1.), \
"False positive rate --fpr must be between 0 and 1."

self.args = args

return args
Expand Down Expand Up @@ -132,15 +143,15 @@ def run_remove_background(args):
try:
dataset_obj = \
SingleCellRNACountsDataset(input_file=args.input_file,
expected_cell_count=
args.expected_cell_count,
total_droplet_barcodes=
args.total_droplets,
expected_cell_count=args.expected_cell_count,
total_droplet_barcodes=args.total_droplets,
fraction_empties=args.fraction_empties,
model_name=args.model,
gene_blacklist=args.blacklisted_genes,
low_count_threshold=
args.low_count_threshold)
exclude_antibodies=args.exclude_antibodies,
low_count_threshold=args.low_count_threshold,
fpr=args.fpr)

except OSError:
logging.error(f"OSError: Unable to open file {args.input_file}.")
sys.exit(1)
Expand Down
55 changes: 52 additions & 3 deletions cellbender/remove_background/consts.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@

# Factor by which the mode UMI count of the empty droplet plateau is
# multiplied to come up with a UMI cutoff below which no barcodes are used.
EMPIRICAL_LOW_UMI_TO_EMPTY_DROPLET_THRESHOLD = 0.8
EMPIRICAL_LOW_UMI_TO_EMPTY_DROPLET_THRESHOLD = 0.5

# Default prior for the standard deviation of the LogNormal distribution for
# cell size, used only in the case of the 'simple' model.
SIMPLE_MODEL_D_STD_PRIOR = 0.2
D_STD_PRIOR = 0.02

# Probability cutoff for determining which droplets contain cells and which
# are empty. The droplets n with inferred probability q_n > CELL_PROB_CUTOFF
Expand All @@ -21,10 +22,58 @@
TOTAL_DROPLET_DEFAULT = 25000

# Fraction of the data used for training (versus testing).
TRAINING_FRACTION = 1.
TRAINING_FRACTION = 0.9

# Size of minibatch by default.
DEFAULT_BATCH_SIZE = 128

# Fraction of totally empty droplets that makes up each minibatch, by default.
FRACTION_EMPTIES = 0.5
FRACTION_EMPTIES = 0.5

# Prior on rho, the swapping fraction: the two concentration parameters alpha and beta.
RHO_ALPHA_PRIOR = 18.
RHO_BETA_PRIOR = 200.

# Constraints on rho posterior latents.
RHO_PARAM_MIN = 1.
RHO_PARAM_MAX = 1000.

# Prior on epsilon, the RT efficiency concentration parameter [Gamma(alpha, alpha)].
EPSILON_PRIOR = 500.

# Prior used for the global overdispersion parameter.
PHI_LOC_PRIOR = 0.2
PHI_SCALE_PRIOR = 0.2

# Initial value of global latent scale for d_cell.
D_CELL_SCALE_INIT = 0.02

# Scale used to regularize values of logit cell probability (mean zero).
P_LOGIT_SCALE = 2.

# Hidden layer sizes of non-z latent encoder neural network.
ENC_HIDDEN_DIMS = [100, 50]

# Set to False to use an approximate log_prob computation, which is much faster.
USE_EXACT_LOG_PROB = False

# If using an exact log_prob computation, we integrate numerically over this size range.
NBPC_EXACT_N_TERMS = 50

# Negative binomial Poisson convolution likelihood calculation: numerical safeguards.
NBPC_MU_EPS_SAFEGAURD = 1e-10
NBPC_ALPHA_EPS_SAFEGAURD = 1e-10
NBPC_LAM_EPS_SAFEGAURD = 1e-10

# Scale factors for loss function regularization terms: semi-supervision.
REG_SCALE_AMBIENT_EXPRESSION = 0.01
REG_SCALE_EMPTY_PROB = 1.0
REG_SCALE_CELL_PROB = 10.0

# Number of cells used to estimate posterior regularization lambda. Memory hungry.
CELLS_POSTERIOR_REG_CALC = 100

# Posterior regularization constant's upper and lower bounds.
POSTERIOR_REG_MIN = 0.1
POSTERIOR_REG_MAX = 500
POSTERIOR_REG_SEARCH_MAX_ITER = 20
12 changes: 8 additions & 4 deletions cellbender/remove_background/data/dataprep.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import torch
import torch.utils.data

from typing import Tuple, List
from typing import Tuple, List, Optional


class SparseDataset(torch.utils.data.Dataset):
Expand Down Expand Up @@ -47,15 +47,18 @@ class DataLoader:

def __init__(self,
dataset: sp.csr_matrix,
empty_drop_dataset: sp.csr_matrix,
empty_drop_dataset: Optional[sp.csr_matrix],
batch_size: int = consts.DEFAULT_BATCH_SIZE,
fraction_empties: float = consts.FRACTION_EMPTIES,
shuffle: bool = True,
use_cuda: bool = True):
self.dataset = dataset
self.ind_list = np.arange(self.dataset.shape[0])
self.empty_drop_dataset = empty_drop_dataset
self.empty_ind_list = np.arange(self.empty_drop_dataset.shape[0])
if self.empty_drop_dataset is None:
self.empty_ind_list = np.array([])
else:
self.empty_ind_list = np.arange(self.empty_drop_dataset.shape[0])
self.batch_size = batch_size
self.fraction_empties = fraction_empties
self.cell_batch_size = int(batch_size * (1. - fraction_empties))
Expand All @@ -73,7 +76,8 @@ def _reset(self):
self.ptr = 0

def __len__(self):
return int(self.ind_list.size * (1 + self.fraction_empties)) # ...ish
return int(self.ind_list.size *
(1 + (self.fraction_empties / (1 - self.fraction_empties)))) # ...ish

def __iter__(self):
return self
Expand Down
Loading

0 comments on commit f93d67b

Please sign in to comment.