Skip to content

Commit

Permalink
Fix interval query (#132)
Browse files Browse the repository at this point in the history
* update docs

* added new parser that is measurably faster

* add related tests
  • Loading branch information
zcqian committed Aug 17, 2021
1 parent 3c54ace commit 5358992
Show file tree
Hide file tree
Showing 3 changed files with 129 additions and 26 deletions.
7 changes: 6 additions & 1 deletion docs/doc/variant_query_service.rst
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,12 @@ You can use **AND**/**OR**/**NOT** boolean operators and grouping to form compli
q=dbnsfp.polyphen2.hdiv.score:>0.99 AND chrom:1 AND operator
q=_exists_:dbsnp AND NOT dbsnp.vartype:indel NOT operator
q=_exists_:dbsnp AND (NOT dbsnp.vartype:indel) grouping with ()

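Genomic interval queries can be combined with other query terms as well, but only when the interval is joined to the rest of the query with the **AND** operator (before it, after it, or both). An interval cannot be used inside parentheses.::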

q=dbnsfp.genename:CDK* AND chr2:39406300-39406400
q=chr2:39406300-39406400 AND dbnsfp.genename:CDK*

Escaping reserved characters
""""""""""""""""""""""""""""
Expand Down
47 changes: 44 additions & 3 deletions src/tests/app/test_local.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,47 @@ class TestBeaconEndpoints(BiothingsWebAppTest):
class TestGenomicIntervalQuery(BiothingsWebAppTest):
    """Tests for genomic position / interval queries (``chr<N>:<pos>[-<end>]``).

    Each test issues a query through ``self.query``.  Calls made with
    ``hits=False`` are the cases expected to match nothing; presumably the
    harness asserts on that flag -- confirm against
    ``BiothingsWebAppTest.query``.
    """
    TEST_DATA_DIR_NAME = 'mv_app_test'

    def test_standalone_interval_query_pos_hg19(self):
        """A single position with the default assembly."""
        self.query(data={'q': 'chr8:7194707'})

    def test_standalone_interval_query_range_hg19(self):
        """A start-end range with the default assembly."""
        self.query(data={'q': 'chr8:7194706-7194708'})

    def test_standalone_interval_query_pos_hg38(self):
        """A single position with the assembly explicitly set to hg38."""
        self.query(data={'q': 'chrX:30718532', 'assembly': 'hg38'})

    def test_prequery(self):
        """Query text placed before the interval and joined with AND."""
        self.query(data={'q': 'cadd.chrom:9 AND chr8:7194707'}, hits=False)
        self.query(data={'q': 'cadd.chrom:8 AND chr8:7194707'})

    def test_postquery(self):
        """Query text placed after the interval and joined with AND."""
        self.query(data={'q': 'chr8:7194707 AND cadd.chrom:9'}, hits=False)
        # The AND is required: without it the trailing clause is not picked
        # up as a post-query at all and this case would exercise nothing.
        self.query(data={'q': 'chr8:7194707 AND cadd.chrom:9 OR cadd.chrom:8'})

    def test_pre_and_post_query(self):
        """Query text on both sides of the interval, including NOT clauses."""
        self.query(data={'q': 'dbnsfp.alt:A AND chr8:7194707 AND cadd.chrom:8'})
        self.query(data={'q': 'NOT dbnsfp.alt:A AND chr8:7194707 AND cadd.chrom:8'},
                   hits=False)
        self.query(data={'q': 'dbnsfp.alt:A AND chr8:7194707 AND NOT cadd.chrom:8'},
                   hits=False)

    def test_pre_and_post_query_logic(self):
        """The parts before and after the interval must be grouped with ()."""
        # We want something that breaks an implementation which joins the
        # pre- and post-query parts without parentheses and thereby loses
        # the (intended) operator affinity.
        # ES itself is very weird anyway,
        # see https://github.com/elastic/elasticsearch/issues/24847
        #
        # Explanation of the query used below: if it gets evaluated as
        #   (cadd.chrom:8 OR cadd.chrom:9) AND (cadd.chrom:8 OR cadd.chrom:9)
        # then there should be hits, but
        #   cadd.chrom:8 OR cadd.chrom:9 AND cadd.chrom:8 OR cadd.chrom:9
        # does not yield results.  Despite the strange query, it usually
        # makes sense to add the parentheses, and that is the better
        # practice according to the ES documentation anyway.
        #
        # Note the trailing space in the first fragment: adjacent string
        # literals are concatenated verbatim, so without it the query
        # would contain "cadd.chrom:9AND".
        self.query(data={
            'q': 'cadd.chrom:8 OR cadd.chrom:9 '
                 'AND chr8:7194707 AND '
                 'cadd.chrom:9 OR cadd.chrom:8'
        })
101 changes: 79 additions & 22 deletions src/web/pipeline.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,98 @@
import re
from typing import Dict, Optional

from elasticsearch_dsl import Search
from biothings.web.query import ESQueryBuilder, AsyncESQueryBackend


# Recognises one genomic position or interval token such as ``chr8:7194707``
# or ``chr8:7194706-7194708``, optionally followed by ``AND <more query>``.
# Named groups: ``chr``, ``gstart``/``gend`` (range form), ``gpos`` (single
# position form) and ``post_query_string`` (text after the trailing AND).
# Matching is case-insensitive and restricted to ASCII digit/space classes.
INTERVAL_PATTERN = re.compile(
    r"""
    chr                                      # string literal chr
    (?P<chr>[1-9]|1[0-9]|2[0-2]|X|Y|MT)      # chromosomes 1-22, X, Y, and MT
    :                                        # literal colon sign
    (
        # captures an interval
        (?P<gstart>[\d,]+)-(?P<gend>[\d,]+)  # range, we only allow comma as sep.
        |                                    # or one position
        (?P<gpos>[\d,]+)
    )
    (?!\S)                                   # token must end here: whitespace or end of
                                             # string, so chr1:1-2-3 / chr1:5abc are rejected
    (
        \s+AND\s+                            # take a hitch on the regex engine and prepare the post_query
        (?P<post_query_string>\S.*)          # a non-whitespace character followed by anything
    )?
    """,
    flags=re.ASCII | re.IGNORECASE | re.VERBOSE
)


class MVQueryBuilder(ESQueryBuilder):

@staticmethod
def _parse_interval_query(q: str) -> Optional[Dict[str, str]]:
    """
    Parse a query string and extract a genome interval/position query.

    If the query string includes a valid genome interval/position query,
    that information is extracted along with the other parts of the
    query. Using [] to denote optional parts and <> for required parts,
    such queries look like this:

        [query string AND ] chr<Chromosome>:<start>-<end> [AND query string]
        [query string AND ] chr<Chromosome>:<position> [AND query string]

    The ``AND`` in front of the interval must be a separate,
    whitespace-delimited token with a non-empty operand before it, so a
    term that merely ends in the letters "and" is not mistaken for the
    operator.

    Args:
        q: input query string

    Returns:
        None: if the input query string is not a valid interval query
        Dict[str, str]: with the following keys
            'chr': chromosome identifier as matched by INTERVAL_PATTERN
            'gstart': start of the interval, as written in the query
            'gend': end of the interval, as written in the query
                (equal to 'gstart' for a single position)
            'query': the other parts of the query string, each wrapped
                in parentheses and joined with AND; '' if there are none
    """
    # Don't even bother when we don't see 'chr'; even with the improved
    # regex this cheap check is a few times faster.  The first occurrence
    # might not be the interval itself (e.g. it can sit inside a field
    # name), but starting there still spares the regex engine some work.
    # Note this pre-check is case-sensitive: only a lowercase 'chr'
    # prefix ever reaches the regex.
    first_chr = q.find('chr')
    if first_chr < 0:
        return None
    # Start scanning at the hint; match offsets stay relative to q.
    m = INTERVAL_PATTERN.search(q, first_chr)
    if m is None:
        return None
    # NOTE(review): text following the interval that does not start with
    # "AND" is not captured by the pattern and is therefore ignored here.

    start_pos = m.start()
    pre_query = q[:start_pos].strip()
    query = []
    if pre_query:
        # The interval has to be separated from what precedes it ...
        if not q[start_pos - 1].isspace():
            return None
        # ... and what precedes it has to be "<operand> AND", with AND as
        # its own token.  rsplit on whitespace peels off the last token.
        tokens = pre_query.rsplit(None, 1)
        if len(tokens) != 2 or tokens[1].upper() != 'AND':
            return None
        query.append(f'({tokens[0]})')  # AND stripped, operand grouped

    md = m.groupdict()
    if md['gpos']:
        # single position: start and end coincide
        gstart = gend = md['gpos']
    else:
        gstart, gend = md['gstart'], md['gend']
    if md['post_query_string']:
        query.append(f"({md['post_query_string']})")

    return {
        'chr': md['chr'],
        'gstart': gstart,
        'gend': gend,
        'query': ' AND '.join(query),
    }

def default_string_query(self, q, options):

match = self._parse_interval_query(q)
if match: # interval query
search = Search()
if match['query']:
if match['query'] != '':
search = search.query("query_string", query=match['query'])
search = search.filter('match', chrom=match['chr'])
assembly = 'hg38' if options.assembly == 'hg38' else 'hg19'
Expand Down

0 comments on commit 5358992

Please sign in to comment.