7 changes: 5 additions & 2 deletions README.md
@@ -82,7 +82,7 @@ The main program to design probes is [`design.py`](./bin/design.py).
To see details on all the arguments that the program accepts, run:

```bash
design.py -h
design.py --help
```

[`design.py`](./bin/design.py) requires one or more `dataset`s that specify input sequence data to target:
@@ -116,6 +116,9 @@ Probes are designed such that each `dataset` should be captured by probes that a
* `--add-adapters`: Add PCR adapters to the ends of each probe sequence.
This selects adapters to add to probe sequences so as to minimize overlap among probes that share an adapter, allowing probes with the same adapter to be amplified together.
(See `--adapter-a` and `--adapter-b` too.)
* `--filter-with-lsh-hamming FILTER_WITH_LSH_HAMMING`/`--filter-with-lsh-minhash FILTER_WITH_LSH_MINHASH`: Use locality-sensitive hashing to reduce the space of candidate probes.
This can significantly reduce runtime and memory requirements when the input is especially large and diverse.
See `design.py --help` for details on using these options and their downsides; a usage sketch follows this list.
* `-o OUTPUT`: Write probe sequences in FASTA format to OUTPUT.
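
As a rough sketch of using these LSH options (the dataset name and threshold value below are placeholders, not recommendations), the MinHash-based filter can be combined with an analysis report to check the coverage actually obtained:

```bash
# `my_dataset` is a hypothetical dataset; 0.6 is an illustrative threshold
design.py my_dataset --filter-with-lsh-minhash 0.6 --print-analysis -o probes.fasta
```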

### Pooling across many runs ([`pool.py`](./bin/pool.py))
@@ -125,7 +128,7 @@ It does this by searching over a space of probe sets to solve a constrained opti
To see details on all the arguments that the program accepts, run:

```bash
pool.py -h
pool.py --help
```

You need to run [`design.py`](./bin/design.py) on each dataset over a grid of parameter values that spans a reasonable domain.
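
As a minimal sketch of such a grid (the dataset name, swept values, and file naming below are placeholders), you can write one probe set per parameter choice so that [`pool.py`](./bin/pool.py) can later select among them:

```bash
# Hypothetical sweep over --mismatches for one dataset; extend the loop
# to other parameters (e.g., coverage) to span a fuller grid
for m in 0 2 4 6; do
    design.py my_dataset --mismatches "$m" -o probes_m"$m".fasta
done
```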
75 changes: 71 additions & 4 deletions bin/design.py
@@ -14,6 +14,7 @@
from catch.filter import duplicate_filter
from catch.filter import fasta_filter
from catch.filter import n_expansion_filter
from catch.filter import near_duplicate_filter
from catch.filter import probe_designer
from catch.filter import reverse_complement_filter
from catch.filter import set_cover_filter
@@ -132,15 +133,38 @@ def main(args):
skip_reverse_complements=True)
filters += [ff]

# Duplicate filter (df) -- condense all candidate probes that
# are identical down to one; this is not necessary for
# correctness, as the set cover filter achieves the same task
# implicitly, but it does significantly lower runtime by
# decreasing the input size to the set cover filter
df = duplicate_filter.DuplicateFilter()
filters += [df]
# Near duplicate filter (ndf) -- condense candidate probes that
# are near-duplicates down to one using locality-sensitive
# hashing; like the duplicate filter, this is not necessary
# but can significantly lower runtime and reduce memory usage
# (even more than the duplicate filter)
if (args.filter_with_lsh_hamming is not None and
args.filter_with_lsh_minhash is not None):
raise Exception(("Cannot use both --filter-with-lsh-hamming "
"and --filter-with-lsh-minhash"))
if args.filter_with_lsh_hamming is not None:
if args.filter_with_lsh_hamming > args.mismatches:
logger.warning(("Setting FILTER_WITH_LSH_HAMMING (%d) to be greater "
"than MISMATCHES (%d) may cause the probes to achieve less "
"than the desired coverage"), args.filter_with_lsh_hamming,
args.mismatches)
ndf = near_duplicate_filter.NearDuplicateFilterWithHammingDistance(
args.filter_with_lsh_hamming, args.probe_length)
filters += [ndf]
elif args.filter_with_lsh_minhash is not None:
ndf = near_duplicate_filter.NearDuplicateFilterWithMinHash(
args.filter_with_lsh_minhash)
filters += [ndf]
else:
df = duplicate_filter.DuplicateFilter()
filters += [df]

# Set cover filter (scf) -- solve the problem by treating it as
# an instance of the set cover problem
scf = set_cover_filter.SetCoverFilter(
mismatches=args.mismatches,
@@ -445,6 +469,49 @@ def check_coverage(val):
"replacement"))

# Technical adjustments
parser.add_argument('--filter-with-lsh-hamming',
type=int,
help=("(Optional) If set, filter candidate probes for near-"
"duplicates using LSH with a family of hash functions that "
"works with Hamming distance. FILTER_WITH_LSH_HAMMING gives "
"the maximum Hamming distance at which to call near-"
"duplicates; it should be commensurate with (but not greater "
"than) MISMATCHES. Using this may significantly improve "
"runtime and reduce memory usage by reducing the number of "
"candidate probes to consider, but may lead to a slightly "
"sub-optimal solution. It may also, particularly with "
"relatively high values of FILTER_WITH_LSH_HAMMING, cause "
"coverage obtained for each genome to be slightly less than "
"the desired coverage (COVERAGE) when that desired coverage "
"is the complete genome; it is recommended to also use "
"--print-analysis or --write-analysis-to-tsv with this "
"to see the coverage that is obtained."))
def check_filter_with_lsh_minhash(val):
fval = float(val)
if fval >= 0.0 and fval <= 1.0:
# a float in [0,1]
return fval
else:
raise argparse.ArgumentTypeError(("%s is an invalid Jaccard "
"distance") % val)
parser.add_argument('--filter-with-lsh-minhash',
type=check_filter_with_lsh_minhash,
help=("(Optional) If set, filter candidate probes for near-"
"duplicates using LSH with a MinHash family. "
"FILTER_WITH_LSH_MINHASH gives the maximum Jaccard distance "
"(1 minus Jaccard similarity) at which to call near-duplicates; "
"the Jaccard similarity is calculated by treating each probe "
"as a set of overlapping 10-mers. Its value should be "
"commensurate with parameter values determining whether a probe "
"hybridizes to a target sequence, but this can be difficult "
"to measure compared to the input for --filter-with-lsh-hamming. "
"However, this allows more sensitivity in near-duplicate "
"detection than --filter-with-lsh-hamming (e.g., if near-"
"duplicates should involve probes shifted relative to each "
"other). The same caveats mentioned in help for "
"--filter-with-lsh-hamming also apply here. Values of "
"FILTER_WITH_LSH_MINHASH above ~0.7 may start to require "
"significant memory and runtime for near-duplicate detection."))
parser.add_argument('--cover-groupings-separately',
dest="cover_groupings_separately",
action="store_true",
181 changes: 181 additions & 0 deletions catch/filter/near_duplicate_filter.py
@@ -0,0 +1,181 @@
"""Removes near-duplicates from an input list of probes.

This acts as a filter on the probes by removing ones that are
near-duplicates of another using LSH. There might be near-duplicates
in the output that are not detected, but every near-duplicate removed
should indeed be a near-duplicate as defined by the given criteria.
"""

from collections import defaultdict
import math
import operator

from catch.filter.base_filter import BaseFilter
from catch.utils import lsh

__author__ = 'Hayden Metsky <hayden@mit.edu>'


class NearDuplicateFilter(BaseFilter):
"""Filter that removes near-duplicates using LSH.

This constructs a concatenation of k hash functions, and does
this multiple times so as to achieve a desired probability of
reporting any probe as a near-duplicate of a queried probe. k
can be a constant, and the number of (concatenated) hash functions
to use is calculated to achieve the desired reporting probability.

This sorts input probes by their multiplicity; therefore, the
duplicate filter should *not* be run before this.
"""

def __init__(self, k, reporting_prob=0.95):
"""
Args:
k: number of hash functions to draw from a family of
hash functions for amplification; each hash function is then
the concatenation (h_1, h_2, ..., h_k)
reporting_prob: ensure that any probe within dist_thres of
a queried probe is detected as such; this constructs
multiple hash functions (each of which is a concatenation
of k functions drawn from the family) to achieve this
probability
"""
self.k = k
self.reporting_prob = reporting_prob

def _filter(self, input):
"""Filter with an arbitrary LSH family.

This performs near neighbor lookups using self.lsh_family. It only
calls probes near-duplicates if their distance, according to
self.dist_fn, is within self.dist_thres.

Args:
input: collection of probes to filter

Returns:
subset of input
"""
# Sort the probes by their multiplicity (descending)
occurrences = defaultdict(int)
for p in input:
occurrences[p] += 1
input_sorted = [p for p, count in
sorted(occurrences.items(), key=operator.itemgetter(1),
reverse=True)]

# Remove exact duplicates from the input
input = list(set(input))

# Construct a collection of hash tables for looking up
# near neighbors of each probe
nnl = lsh.NearNeighborLookup(self.lsh_family, self.k, self.dist_thres,
self.dist_fn, self.reporting_prob)
nnl.add(input)

# Iterate through all probes in order; for each p, remove others
# that are near-duplicates (neighbors) of p. Since we iterate
# in sorted order by multiplicity, the ones that hit more targets
# should appear earlier and will be included in the filtered output
to_include = set()
to_exclude = set()
for p in input_sorted:
# p should not have already been included because input_sorted
# should not contain duplicates
assert p not in to_include

if p in to_exclude:
# p is already being filtered out
continue

# Include p in the output and exclude all near-duplicates of it
to_include.add(p)
for near_dup in nnl.query(p):
if near_dup not in to_include:
to_exclude.add(near_dup)

# Check that every probe is either included or excluded and
# that none are both included and excluded
assert len(to_include | to_exclude) == len(input_sorted)
assert len(to_include & to_exclude) == 0

return list(to_include)


class NearDuplicateFilterWithHammingDistance(NearDuplicateFilter):
"""Filter that removes near-duplicates according to Hamming distance.
"""

def __init__(self, dist_thres, probe_length):
"""
Args:
dist_thres: only call two probes near-duplicates if their
Hamming distance is within this value; this should be
equal to or commensurate with (but not greater than)
the number of mismatches at/below which a probe is
considered to hybridize to a target sequence so that
candidate probes further apart than this value are not
collapsed as near-duplicates
probe_length: length of probes
"""
super().__init__(k=20)
self.lsh_family = lsh.HammingDistanceFamily(probe_length)
self.dist_thres = dist_thres

def hamming_dist(a, b):
# a and b are probe.Probe objects
return a.mismatches(b)
self.dist_fn = hamming_dist

def _filter(self, input):
"""Filter with LSH using family that works with Hamming distance.

Args:
input: collection of probes to filter

Returns:
subset of input
"""
return NearDuplicateFilter._filter(self, input)


class NearDuplicateFilterWithMinHash(NearDuplicateFilter):
"""Filter that removes near-duplicates using MinHash.
"""

def __init__(self, dist_thres, kmer_size=10):
"""
Args:
dist_thres: only call two probes near-duplicates if their
Jaccard distance (1 minus Jaccard similarity) is within
this value; the Jaccard similarity is measured by treating
each probe sequence as a set of k-mers and measuring
the overlap of those k-mers
kmer_size: the length of each k-mer to use with MinHash; note
that this is *not* the same as self.k
"""
super().__init__(k=3)
self.lsh_family = lsh.MinHashFamily(kmer_size)
self.dist_thres = dist_thres

def jaccard_dist(a, b):
a_kmers = [a[i:(i + kmer_size)] for i in range(len(a) - kmer_size + 1)]
b_kmers = [b[i:(i + kmer_size)] for i in range(len(b) - kmer_size + 1)]
a_kmers = set(a_kmers)
b_kmers = set(b_kmers)
jaccard_sim = float(len(a_kmers & b_kmers)) / len(a_kmers | b_kmers)
return 1.0 - jaccard_sim
self.dist_fn = jaccard_dist

def _filter(self, input):
"""Filter with LSH using MinHash family.

Args:
input: collection of probes to filter

Returns:
subset of input
"""
return NearDuplicateFilter._filter(self, input)

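For reference, the reporting-probability amplification described in the `NearDuplicateFilter` docstring follows the standard LSH construction. The sketch below is illustrative only and is not the implementation in `catch.utils.lsh`; it assumes `p`, the collision probability of a single hash function for a pair of probes within the distance threshold, is known.

```python
import math

def tables_for_reporting_prob(p, k, reporting_prob=0.95):
    """Estimate the number of hash tables needed for a reporting probability.

    Each table uses a concatenation of k hash functions, so a pair within
    the distance threshold collides in any one table with probability p**k.
    With L independent tables, the pair is reported with probability
    1 - (1 - p**k)**L; this returns the smallest integer L meeting
    reporting_prob.
    """
    per_table = p ** k
    return math.ceil(math.log(1 - reporting_prob) / math.log(1 - per_table))

# Example: with p = 0.9 and k = 20, about 24 tables give a >=95% chance of
# reporting any pair within the distance threshold.
print(tables_for_reporting_prob(0.9, 20))
```

Under this construction, a larger k makes each table more selective (fewer spurious candidates per query) but requires more tables to maintain the same reporting probability.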