Merge pull request #71 from broadinstitute/sf_removebkg_v2.1

remove-background version 0.2.0
broadinstitute · Oct 16, 2020 · f93d67b · f93d67b
2 parents 20bab46 + 63c6ff2
commit f93d67b
Show file tree

Hide file tree

Showing 27 changed files with 2,607 additions and 923 deletions.
diff --git a/README.rst b/README.rst
@@ -75,8 +75,8 @@ Citing CellBender
 -----------------
 
 If you use CellBender in your research (and we hope you will), please consider
-citing `our paper <https://www.biorxiv.org/content/10.1101/791699v1>`_:
+citing `our paper on bioRxiv <https://doi.org/10.1101/791699>`_.
 
-Stephen J Fleming, John C Marioni, and Mehrtash Babadi. CellBender remove-background: a deep
-generative model for unsupervised removal of background noise from scRNA-seq datasets.
-bioRxiv 791699; doi: https://doi.org/10.1101/791699
+Stephen J Fleming, John C Marioni, and Mehrtash Babadi. CellBender remove-background:
+a deep generative model for unsupervised removal of background noise from scRNA-seq
+datasets. bioRxiv 791699; doi: `https://doi.org/10.1101/791699 <https://doi.org/10.1101/791699>`_
diff --git a/REQUIREMENTS-DOCKER.txt b/REQUIREMENTS-DOCKER.txt
@@ -0,0 +1,8 @@
+numpy
+scipy
+tables
+pandas
+pyro-ppl>=0.3.2
+torch
+scikit-learn
+matplotlib
diff --git a/cellbender/remove_background/argparse.py b/cellbender/remove_background/argparse.py
@@ -54,9 +54,9 @@ def add_subparser_args(subparsers: argparse) -> argparse:
                                 "analyzed. The largest 'total_droplets' "
                                 "droplets will have their cell "
                                 "probabilities inferred as an output.")
-    subparser.add_argument("--model", nargs=None, type=str, default="full",
-                           choices=["simple", "ambient",
-                                    "swapping", "full"],
+    subparser.add_argument("--model", nargs=None, type=str,
+                           default="full",
+                           choices=["simple", "ambient", "swapping", "full"],
                            dest="model",
                            help="Which model is being used for count data. "
                                 " 'simple' does not model either ambient "
@@ -84,23 +84,21 @@ def add_subparser_args(subparsers: argparse) -> argparse:
                                 "correct prior for empty droplet counts "
                                 "in the rare case where empty counts "
                                 "are extremely high (over 200).")
-    subparser.add_argument("--z-dim", type=int, default=20,
+    subparser.add_argument("--z-dim", type=int, default=100,
                            dest="z_dim",
                            help="Dimension of latent variable z.")
     subparser.add_argument("--z-layers", nargs="+", type=int, default=[500],
                            dest="z_hidden_dims",
                            help="Dimension of hidden layers in the encoder "
                                 "for z.")
-    subparser.add_argument("--d-layers", nargs="+", type=int,
-                           default=[5, 2, 2],
-                           dest="d_hidden_dims",
-                           help="Dimension of hidden layers in the encoder "
-                                "for d.")
-    subparser.add_argument("--p-layers", nargs="+", type=int,
-                           default=[100, 10],
-                           dest="p_hidden_dims",
-                           help="Dimension of hidden layers in the encoder "
-                                "for p.")
+    subparser.add_argument("--training-fraction",
+                           type=float, nargs=None,
+                           default=consts.TRAINING_FRACTION,
+                           dest="training_fraction",
+                           help="Training detail: the fraction of the "
+                                "data used for training.  The rest is never "
+                                "seen by the inference algorithm.  Speeds up "
+                                "learning.")
     subparser.add_argument("--empty-drop-training-fraction",
                            type=float, nargs=None,
                            default=consts.FRACTION_EMPTIES,
@@ -116,11 +114,28 @@ def add_subparser_args(subparsers: argparse) -> argparse:
                                 "entirely.  In the output count matrix, "
                                 "the counts for these genes will be set "
                                 "to zero.")
+    subparser.add_argument("--fpr", nargs="+",
+                           type=float, default=[0.01],
+                           dest="fpr",
+                           help="Target false positive rate in (0, 1).  A false "
+                                "positive is a true signal count that is "
+                                "erroneously removed.  More background removal "
+                                "is accompanied by more signal removal "
+                                "at high values of FPR.  You can specify "
+                                "multiple values, which will create multiple "
+                                "output files.")
+    subparser.add_argument("--exclude-antibody-capture",
+                           dest="exclude_antibodies", action="store_true",
+                           help="Including the flag --exclude-antibody-capture "
+                                "will cause remove-background to operate on "
+                                "gene counts only, ignoring other features.")
     subparser.add_argument("--learning-rate", nargs=None,
-                           type=float, default=1e-3,
+                           type=float, default=1e-4,
                            dest="learning_rate",
-                           help="Training detail: learning rate for "
-                                "inference (probably "
+                           help="Training detail: lower learning rate for "
+                                "inference. A OneCycle learning rate schedule "
+                                "is used, where the upper learning rate is ten "
+                                "times this value. (For this value, probably "
                                 "do not exceed 1e-3).")
 
     return subparsers
diff --git a/cellbender/remove_background/cli.py b/cellbender/remove_background/cli.py
@@ -56,11 +56,13 @@ def validate_args(self, args):
             "fraction_empties must be between 0 and 1, exclusive.  This is " \
             "the fraction of each minibatch that is composed of empty droplets."
 
-        assert args.learning_rate < 0.1, "learning_rate must be < 0.1"
-        assert args.learning_rate > 0, "learning_rate must be > 0"
+        assert args.learning_rate < 0.1, "learning-rate must be < 0.1"
+        assert args.learning_rate > 0, "learning-rate must be > 0"
 
         # Set training_fraction to consts.TRAINING_FRACTION (which is 1.).
-        args.training_fraction = consts.TRAINING_FRACTION
+        # args.training_fraction = consts.TRAINING_FRACTION
+        assert args.training_fraction > 0, "training-fraction must be > 0"
+        assert args.training_fraction <= 1., "training-fraction must be <= 1"
 
         # If cuda is requested, make sure it is available.
         if args.use_cuda:
@@ -74,13 +76,22 @@ def validate_args(self, args):
                                  "significant speed-ups.\n\n")
                 sys.stdout.flush()  # Write immediately
 
+        # Ensure all network layer dimensions are positive.
+        for n in args.z_hidden_dims:
+            assert n > 0, "--z-layers must be all positive integers."
+
         # Ensure that z_hidden_dims are in encoder order.
         # (The same dimensions are used in reverse order for the decoder.)
         args.z_hidden_dims = sorted(args.z_hidden_dims, reverse=True)
 
         # Set use_jit to False.
         args.use_jit = False
 
+        # Ensure false positive rate is between zero and one.
+        for fpr in args.fpr:
+            assert (fpr > 0.) and (fpr < 1.), \
+                "False positive rate --fpr must be between 0 and 1."
+
         self.args = args
 
         return args
@@ -132,15 +143,15 @@ def run_remove_background(args):
     try:
         dataset_obj = \
             SingleCellRNACountsDataset(input_file=args.input_file,
-                                       expected_cell_count=
-                                       args.expected_cell_count,
-                                       total_droplet_barcodes=
-                                       args.total_droplets,
+                                       expected_cell_count=args.expected_cell_count,
+                                       total_droplet_barcodes=args.total_droplets,
                                        fraction_empties=args.fraction_empties,
                                        model_name=args.model,
                                        gene_blacklist=args.blacklisted_genes,
-                                       low_count_threshold=
-                                       args.low_count_threshold)
+                                       exclude_antibodies=args.exclude_antibodies,
+                                       low_count_threshold=args.low_count_threshold,
+                                       fpr=args.fpr)
+
     except OSError:
         logging.error(f"OSError: Unable to open file {args.input_file}.")
         sys.exit(1)

diff --git a/cellbender/remove_background/consts.py b/cellbender/remove_background/consts.py
@@ -2,11 +2,12 @@
 
 # Factor by which the mode UMI count of the empty droplet plateau is
 # multiplied to come up with a UMI cutoff below which no barcodes are used.
-EMPIRICAL_LOW_UMI_TO_EMPTY_DROPLET_THRESHOLD = 0.8
+EMPIRICAL_LOW_UMI_TO_EMPTY_DROPLET_THRESHOLD = 0.5
 
 # Default prior for the standard deviation of the LogNormal distribution for
 # cell size, used only in the case of the 'simple' model.
 SIMPLE_MODEL_D_STD_PRIOR = 0.2
+D_STD_PRIOR = 0.02
 
 # Probability cutoff for determining which droplets contain cells and which
 # are empty.  The droplets n with inferred probability q_n > CELL_PROB_CUTOFF
@@ -21,10 +22,58 @@
 TOTAL_DROPLET_DEFAULT = 25000
 
 # Fraction of the data used for training (versus testing).
-TRAINING_FRACTION = 1.
+TRAINING_FRACTION = 0.9
 
 # Size of minibatch by default.
 DEFAULT_BATCH_SIZE = 128
 
 # Fraction of totally empty droplets that makes up each minibatch, by default.
-FRACTION_EMPTIES = 0.5
+FRACTION_EMPTIES = 0.5
+
+# Prior on rho, the swapping fraction: the two concentration parameters alpha and beta.
+RHO_ALPHA_PRIOR = 18.
+RHO_BETA_PRIOR = 200.
+
+# Constraints on rho posterior latents.
+RHO_PARAM_MIN = 1.
+RHO_PARAM_MAX = 1000.
+
+# Prior on epsilon, the RT efficiency concentration parameter [Gamma(alpha, alpha)].
+EPSILON_PRIOR = 500.
+
+# Prior used for the global overdispersion parameter.
+PHI_LOC_PRIOR = 0.2
+PHI_SCALE_PRIOR = 0.2
+
+# Initial value of global latent scale for d_cell.
+D_CELL_SCALE_INIT = 0.02
+
+# Scale used to regularize values of logit cell probability (mean zero).
+P_LOGIT_SCALE = 2.
+
+# Hidden layer sizes of non-z latent encoder neural network.
+ENC_HIDDEN_DIMS = [100, 50]
+
+# Set to False to use an approximate log_prob computation, which is much faster.
+USE_EXACT_LOG_PROB = False
+
+# If using an exact log_prob computation, we integrate numerically over this size range.
+NBPC_EXACT_N_TERMS = 50
+
+# Negative binomial poisson convolution likelihood calculation: numerical safeguards.
+NBPC_MU_EPS_SAFEGAURD = 1e-10
+NBPC_ALPHA_EPS_SAFEGAURD = 1e-10
+NBPC_LAM_EPS_SAFEGAURD = 1e-10
+
+# Scale factors for loss function regularization terms: semi-supervision.
+REG_SCALE_AMBIENT_EXPRESSION = 0.01
+REG_SCALE_EMPTY_PROB = 1.0
+REG_SCALE_CELL_PROB = 10.0
+
+# Number of cells used to estimate posterior regularization lambda. Memory hungry.
+CELLS_POSTERIOR_REG_CALC = 100
+
+# Posterior regularization constant's upper and lower bounds.
+POSTERIOR_REG_MIN = 0.1
+POSTERIOR_REG_MAX = 500
+POSTERIOR_REG_SEARCH_MAX_ITER = 20
diff --git a/cellbender/remove_background/data/dataprep.py b/cellbender/remove_background/data/dataprep.py
@@ -10,7 +10,7 @@
 import torch
 import torch.utils.data
 
-from typing import Tuple, List
+from typing import Tuple, List, Optional
 
 
 class SparseDataset(torch.utils.data.Dataset):
@@ -47,15 +47,18 @@ class DataLoader:
 
     def __init__(self,
                  dataset: sp.csr_matrix,
-                 empty_drop_dataset: sp.csr_matrix,
+                 empty_drop_dataset: Optional[sp.csr_matrix],
                  batch_size: int = consts.DEFAULT_BATCH_SIZE,
                  fraction_empties: float = consts.FRACTION_EMPTIES,
                  shuffle: bool = True,
                  use_cuda: bool = True):
         self.dataset = dataset
         self.ind_list = np.arange(self.dataset.shape[0])
         self.empty_drop_dataset = empty_drop_dataset
-        self.empty_ind_list = np.arange(self.empty_drop_dataset.shape[0])
+        if self.empty_drop_dataset is None:
+            self.empty_ind_list = np.array([])
+        else:
+            self.empty_ind_list = np.arange(self.empty_drop_dataset.shape[0])
         self.batch_size = batch_size
         self.fraction_empties = fraction_empties
         self.cell_batch_size = int(batch_size * (1. - fraction_empties))
@@ -73,7 +76,8 @@ def _reset(self):
         self.ptr = 0
 
     def __len__(self):
-        return int(self.ind_list.size * (1 + self.fraction_empties))  # ...ish
+        return int(self.ind_list.size *
+                   (1 + (self.fraction_empties / (1 - self.fraction_empties))))  # ...ish
 
     def __iter__(self):
         return self