7 changes: 5 additions & 2 deletions README.md
@@ -82,7 +82,7 @@ The main program to design probes is [`design.py`](./bin/design.py).
To see details on all the arguments that the program accepts, run:

```bash
design.py -h
design.py --help
```

[`design.py`](./bin/design.py) requires one or more `dataset`s that specify input sequence data to target:
@@ -116,6 +116,9 @@ Probes are designed such that each `dataset` should be captured by probes that a
* `--add-adapters`: Add PCR adapters to the ends of each probe sequence.
This selects adapters to add to probe sequences so as to minimize overlap among probes that share an adapter, allowing probes with the same adapter to be amplified together.
(See `--adapter-a` and `--adapter-b` too.)
* `--filter-with-lsh-hamming FILTER_WITH_LSH_HAMMING`/`--filter-with-lsh-minhash FILTER_WITH_LSH_MINHASH`: Use locality-sensitive hashing to reduce the space of candidate probes.
This can significantly reduce runtime and memory requirements when the input is especially large and diverse.
See `design.py --help` for details on using these options and their downsides; a usage sketch follows this list.
* `-o OUTPUT`: Write probe sequences in FASTA format to OUTPUT.
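
As a rough sketch of using these LSH options (the dataset name and threshold value below are placeholders, not recommendations), the MinHash-based filter can be combined with an analysis report to check the coverage actually obtained:

```bash
# `my_dataset` is a hypothetical dataset; 0.6 is an illustrative threshold
design.py my_dataset --filter-with-lsh-minhash 0.6 --print-analysis -o probes.fasta
```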

### Pooling across many runs ([`pool.py`](./bin/pool.py))
@@ -125,7 +128,7 @@ It does this by searching over a space of probe sets to solve a constrained opti
To see details on all the arguments that the program accepts, run:

```bash
pool.py -h
pool.py --help
```

You need to run [`design.py`](./bin/design.py) on each dataset over a grid of parameter values that spans a reasonable domain.
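
As a minimal sketch of such a grid (the dataset name, swept values, and file naming below are placeholders), you can write one probe set per parameter choice so that [`pool.py`](./bin/pool.py) can later select among them:

```bash
# Hypothetical sweep over --mismatches for one dataset; extend the loop
# to other parameters (e.g., coverage) to span a fuller grid
for m in 0 2 4 6; do
    design.py my_dataset --mismatches "$m" -o probes_m"$m".fasta
done
```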
75 changes: 71 additions & 4 deletions bin/design.py
@@ -14,6 +14,7 @@
from catch.filter import duplicate_filter
from catch.filter import fasta_filter
from catch.filter import n_expansion_filter
from catch.filter import near_duplicate_filter
from catch.filter import probe_designer
from catch.filter import reverse_complement_filter
from catch.filter import set_cover_filter
@@ -132,15 +133,38 @@ def main(args):
skip_reverse_complements=True)
filters += [ff]

# Duplicate filter (df) -- condense all candidate probes that
# are identical down to one; this is not necessary for
# correctness, as the set cover filter achieves the same task
# implicitly, but it does significantly lower runtime by
# decreasing the input size to the set cover filter
df = duplicate_filter.DuplicateFilter()
filters += [df]
# Near duplicate filter (ndf) -- condense candidate probes that
# are near-duplicates down to one using locality-sensitive
# hashing; like the duplicate filter, this is not necessary
# but can significantly lower runtime and reduce memory usage
# (even more than the duplicate filter)
if (args.filter_with_lsh_hamming is not None and
args.filter_with_lsh_minhash is not None):
raise Exception(("Cannot use both --filter-with-lsh-hamming "
"and --filter-with-lsh-minhash"))
if args.filter_with_lsh_hamming is not None:
if args.filter_with_lsh_hamming > args.mismatches:
logger.warning(("Setting FILTER_WITH_LSH_HAMMING (%d) to be greater "
"than MISMATCHES (%d) may cause the probes to achieve less "
"than the desired coverage"), args.filter_with_lsh_hamming,
args.mismatches)
ndf = near_duplicate_filter.NearDuplicateFilterWithHammingDistance(
args.filter_with_lsh_hamming, args.probe_length)
filters += [ndf]
elif args.filter_with_lsh_minhash is not None:
ndf = near_duplicate_filter.NearDuplicateFilterWithMinHash(
args.filter_with_lsh_minhash)
filters += [ndf]
else:
df = duplicate_filter.DuplicateFilter()
filters += [df]

# Set cover filter (scf) -- solve the problem by treating it as
# an instance of the set cover problem
scf = set_cover_filter.SetCoverFilter(
mismatches=args.mismatches,
@@ -445,6 +469,49 @@ def check_coverage(val):
"replacement"))

# Technical adjustments
parser.add_argument('--filter-with-lsh-hamming',
type=int,
help=("(Optional) If set, filter candidate probes for near-"
"duplicates using LSH with a family of hash functions that "
"works with Hamming distance. FILTER_WITH_LSH_HAMMING gives "
"the maximum Hamming distance at which to call near-"
"duplicates; it should be commensurate with (but not greater "
"than) MISMATCHES. Using this may significantly improve "
"runtime and reduce memory usage by reducing the number of "
"candidate probes to consider, but may lead to a slightly "
"sub-optimal solution. It may also, particularly with "
"relatively high values of FILTER_WITH_LSH_HAMMING, cause "
"coverage obtained for each genome to be slightly less than "
"the desired coverage (COVERAGE) when that desired coverage "
"is the complete genome; it is recommended to also use "
"--print-analysis or --write-analysis-to-tsv with this "
"to see the coverage that is obtained."))
def check_filter_with_lsh_minhash(val):
fval = float(val)
if fval >= 0.0 and fval <= 1.0:
# a float in [0,1]
return fval
else:
raise argparse.ArgumentTypeError(("%s is an invalid Jaccard "
"distance") % val)
parser.add_argument('--filter-with-lsh-minhash',
type=check_filter_with_lsh_minhash,
help=("(Optional) If set, filter candidate probes for near-"
"duplicates using LSH with a MinHash family. "
"FILTER_WITH_LSH_MINHASH gives the maximum Jaccard distance "
"(1 minus Jaccard similarity) at which to call near-duplicates; "
"the Jaccard similarity is calculated by treating each probe "
"as a set of overlapping 10-mers. Its value should be "
"commensurate with parameter values determining whether a probe "
"hybridizes to a target sequence, but this can be difficult "
"to measure compared to the input for --filter-with-lsh-hamming. "
"However, this allows more sensitivity in near-duplicate "
"detection than --filter-with-lsh-hamming (e.g., if near-"
"duplicates should involve probes shifted relative to each "
"other). The same caveats mentioned in help for "
"--filter-with-lsh-hamming also apply here. Values of "
"FILTER_WITH_LSH_MINHASH above ~0.7 may start to require "
"significant memory and runtime for near-duplicate detection."))
parser.add_argument('--cover-groupings-separately',
dest="cover_groupings_separately",
action="store_true",
181 changes: 181 additions & 0 deletions catch/filter/near_duplicate_filter.py
@@ -0,0 +1,181 @@
"""Removes near-duplicates from an input list of probes.

This acts as a filter on the probes by removing ones that are
near-duplicates of another using LSH. There might be near-duplicates
in the output that are not detected, but every near-duplicate removed
should indeed be a near-duplicate as defined by the given criteria.
"""

from collections import defaultdict
import math
import operator

from catch.filter.base_filter import BaseFilter
from catch.utils import lsh

__author__ = 'Hayden Metsky <hayden@mit.edu>'


class NearDuplicateFilter(BaseFilter):
"""Filter that removes near-duplicates using LSH.

This constructs a concatenation of k hash functions, and does
this multiple times so as to achieve a desired probability of
reporting any probe as a near-duplicate of a queried probe. k
can be a constant, and the number of (concatenated) hash functions
to use is calculated to achieve the desired reporting probability.

This sorts input probes by their multiplicity; therefore, the
duplicate filter should *not* be run before this.
"""

def __init__(self, k, reporting_prob=0.95):
"""
Args:
k: number of hash functions to draw from a family of
hash functions for amplification; each hash function is then
the concatenation (h_1, h_2, ..., h_k)
reporting_prob: ensure that any probe within dist_thres of
a queried probe is detected as such; this constructs
multiple hash functions (each of which is a concatenation
of k functions drawn from the family) to achieve this
probability
"""
self.k = k
self.reporting_prob = reporting_prob

def _filter(self, input):
"""Filter with an arbitrary LSH family.

This performs near neighbor lookups using self.lsh_family. It only
calls probes near-duplicates if their distance, according to
self.dist_fn, is within self.dist_thres.

Args:
input: collection of probes to filter

Returns:
subset of input
"""
# Sort the probes by their multiplicity (descending)
occurrences = defaultdict(int)
for p in input:
occurrences[p] += 1
input_sorted = [p for p, count in
sorted(occurrences.items(), key=operator.itemgetter(1),
reverse=True)]

# Remove exact duplicates from the input
input = list(set(input))

# Construct a collection of hash tables for looking up
# near neighbors of each probe
nnl = lsh.NearNeighborLookup(self.lsh_family, self.k, self.dist_thres,
self.dist_fn, self.reporting_prob)
nnl.add(input)

# Iterate through all probes in order; for each p, remove others
# that are near-duplicates (neighbors) of p. Since we iterate
# in sorted order by multiplicity, the ones that hit more targets
# should appear earlier and will be included in the filtered output
to_include = set()
to_exclude = set()
for p in input_sorted:
# p should not have already been included because input_sorted
# should not contain duplicates
assert p not in to_include

if p in to_exclude:
# p is already being filtered out
continue

# Include p in the output and exclude all near-duplicates of it
to_include.add(p)
for near_dup in nnl.query(p):
if near_dup not in to_include:
to_exclude.add(near_dup)

# Check that every probe is either included or excluded and
# that none are both included and excluded
assert len(to_include | to_exclude) == len(input_sorted)
assert len(to_include & to_exclude) == 0

return list(to_include)


class NearDuplicateFilterWithHammingDistance(NearDuplicateFilter):
"""Filter that removes near-duplicates according to Hamming distance.
"""

def __init__(self, dist_thres, probe_length):
"""
Args:
dist_thres: only call two probes near-duplicates if their
Hamming distance is within this value; this should be
equal to or commensurate with (but not greater than)
the number of mismatches at/below which a probe is
considered to hybridize to a target sequence so that
candidate probes further apart than this value are not
collapsed as near-duplicates
probe_length: length of probes
"""
super().__init__(k=20)
self.lsh_family = lsh.HammingDistanceFamily(probe_length)
self.dist_thres = dist_thres

def hamming_dist(a, b):
# a and b are probe.Probe objects
return a.mismatches(b)
self.dist_fn = hamming_dist

def _filter(self, input):
"""Filter with LSH using family that works with Hamming distance.

Args:
input: collection of probes to filter

Returns:
subset of input
"""
return NearDuplicateFilter._filter(self, input)


class NearDuplicateFilterWithMinHash(NearDuplicateFilter):
"""Filter that removes near-duplicates using MinHash.
"""

def __init__(self, dist_thres, kmer_size=10):
"""
Args:
dist_thres: only call two probes near-duplicates if their
Jaccard distance (1 minus Jaccard similarity) is within
this value; the Jaccard similarity is measured by treating
each probe sequence as a set of k-mers and measuring
the overlap of those k-mers
kmer_size: the length of each k-mer to use with MinHash; note
that this is *not* the same as self.k
"""
super().__init__(k=3)
self.lsh_family = lsh.MinHashFamily(kmer_size)
self.dist_thres = dist_thres

def jaccard_dist(a, b):
a_kmers = [a[i:(i + kmer_size)] for i in range(len(a) - kmer_size + 1)]
b_kmers = [b[i:(i + kmer_size)] for i in range(len(b) - kmer_size + 1)]
a_kmers = set(a_kmers)
b_kmers = set(b_kmers)
jaccard_sim = float(len(a_kmers & b_kmers)) / len(a_kmers | b_kmers)
return 1.0 - jaccard_sim
self.dist_fn = jaccard_dist

def _filter(self, input):
"""Filter with LSH using MinHash family.

Args:
input: collection of probes to filter

Returns:
subset of input
"""
return NearDuplicateFilter._filter(self, input)

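For reference, the reporting-probability amplification described in the `NearDuplicateFilter` docstring follows the standard LSH construction. The sketch below is illustrative only and is not the implementation in `catch.utils.lsh`; it assumes `p`, the collision probability of a single hash function for a pair of probes within the distance threshold, is known.

```python
import math

def tables_for_reporting_prob(p, k, reporting_prob=0.95):
    """Estimate the number of hash tables needed for a reporting probability.

    Each table uses a concatenation of k hash functions, so a pair within
    the distance threshold collides in any one table with probability p**k.
    With L independent tables, the pair is reported with probability
    1 - (1 - p**k)**L; this returns the smallest integer L meeting
    reporting_prob.
    """
    per_table = p ** k
    return math.ceil(math.log(1 - reporting_prob) / math.log(1 - per_table))

# Example: with p = 0.9 and k = 20, about 24 tables give a >=95% chance of
# reporting any pair within the distance threshold.
print(tables_for_reporting_prob(0.9, 20))
```

Under this construction, a larger k makes each table more selective (fewer spurious candidates per query) but requires more tables to maintain the same reporting probability.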