Fix interval query (#132)

* update docs * added new parser that is measurably faster * add related tests
biothings · Aug 17, 2021 · 5358992 · 5358992
1 parent 3c54ace
commit 5358992
Show file tree

Hide file tree

Showing 3 changed files with 129 additions and 26 deletions.
diff --git a/docs/doc/variant_query_service.rst b/docs/doc/variant_query_service.rst
@@ -170,7 +170,12 @@ You can use **AND**/**OR**/**NOT** boolean operators and grouping to form compli
     q=dbnsfp.polyphen2.hdiv.score:>0.99 AND chrom:1                        AND operator
     q=_exists_:dbsnp AND NOT dbsnp.vartype:indel                           NOT operator
     q=_exists_:dbsnp AND (NOT dbsnp.vartype:indel)                         grouping with ()
-    
+
+Genomic interval queries can be combined with other query terms as well, but only when joined to them by **AND** operators; they cannot be used inside parentheses::
+
+    q=dbnsfp.genename:CDK* AND chr2:39406300-39406400
+    q=chr2:39406300-39406400 AND dbnsfp.genename:CDK*
+
     
 Escaping reserved characters
 """"""""""""""""""""""""""""

diff --git a/src/tests/app/test_local.py b/src/tests/app/test_local.py
@@ -136,6 +136,47 @@ class TestBeaconEndpoints(BiothingsWebAppTest):
 class TestGenomicIntervalQuery(BiothingsWebAppTest):
     TEST_DATA_DIR_NAME = 'mv_app_test'
 
-    pass
-    # TODO: Write tests along with the correct implementation
-    #  the data is already enough to write the tests
+    def test_standalone_interval_query_pos_hg19(self):
+        self.query(data={'q': 'chr8:7194707'})
+
+    def test_standalone_interval_query_range_hg19(self):
+        self.query(data={'q': 'chr8:7194706-7194708'})
+
+    def test_standalone_interval_query_pos_hg38(self):
+        self.query(data={'q': 'chrX:30718532', 'assembly': 'hg38'})
+
+    def test_prequery(self):
+        self.query(data={'q': 'cadd.chrom:9 AND chr8:7194707'}, hits=False)
+        self.query(data={'q': 'cadd.chrom:8 AND chr8:7194707'})
+
+    def test_postquery(self):
+        self.query(data={'q': 'chr8:7194707 AND cadd.chrom:9'}, hits=False)
+        self.query(data={'q': 'chr8:7194707 cadd.chrom:9 OR cadd.chrom:8'})
+
+    def test_pre_and_post_query(self):
+        self.query(data={'q': 'dbnsfp.alt:A AND chr8:7194707 AND cadd.chrom:8'})
+        self.query(data={'q': 'NOT dbnsfp.alt:A AND chr8:7194707 AND cadd.chrom:8'},
+                   hits=False)
+        self.query(data={'q': 'dbnsfp.alt:A AND chr8:7194707 AND NOT cadd.chrom:8'},
+                   hits=False)
+
+    def test_pre_and_post_query_logic(self):
+        # we want a query that trips up the old parser, which
+        # concatenated the sub-queries without wrapping them in (),
+        # breaking the (intended) operator grouping
+        # ES itself is very weird anyways,
+        # see https://github.com/elastic/elasticsearch/issues/24847
+        #
+        # Explanation on the query used below
+        # if it gets evaluated to
+        #   (cadd.chrom:8 OR cadd.chrom:9) AND (cadd.chrom:8 OR cadd.chrom:9)
+        # then there should be hits, but
+        #   cadd.chrom:8 OR cadd.chrom:9 AND cadd.chrom:8 OR cadd.chrom:9
+        # does not yield results. Although the query itself is contrived, it
+        # usually makes sense to add the parentheses, which is also the better
+        # practice according to the ES documentation
+        self.query(data={
+            'q': 'cadd.chrom:8 OR cadd.chrom:9'
+            'AND chr8:7194707 AND '
+            'cadd.chrom:9 OR cadd.chrom:8'
+        })
diff --git a/src/web/pipeline.py b/src/web/pipeline.py
@@ -1,41 +1,98 @@
 import re
+from typing import Dict, Optional
 
 from elasticsearch_dsl import Search
 from biothings.web.query import ESQueryBuilder, AsyncESQueryBackend
 
 
 INTERVAL_PATTERN = re.compile(
-    r'(?P<pre_query>.+(?P<pre_and>[Aa][Nn][Dd]))*(?P<interval>\s*chr(?P<chr>[1-9xXyYmM][0-9tT]?):(?P<gstart>[0-9,]+)-(?P<gend>[0-9,]+)\s*)(?P<post_query>(?P<post_and>[Aa][Nn][Dd]).+)*')
-SNP_PATTERN = re.compile(
-    r'(?P<pre_query>.+(?P<pre_and>[Aa][Nn][Dd]))*(?P<interval>\s*chr(?P<chr>[1-9xXyYmM][0-9tT]?):(?P<gend>(?P<gstart>[0-9,]+))\s*)(?P<post_query>(?P<post_and>[Aa][Nn][Dd]).+)*')
-PATTERNS = [INTERVAL_PATTERN, SNP_PATTERN]
+    r"""
+    chr  # string literal chr   
+        (?P<chr>[1-9]|1[0-9]|2[0-2]|X|Y|MT)  # chromasomes 1-22, X, Y, and MT
+        :  # literal colon sign
+        (
+            # captures an interval
+            (?P<gstart>[\d,]+)-(?P<gend>[\d,]+)  # range, we only allow comma as sep.
+            |  # or one position
+            (?P<gpos>[\d,]+)
+        )
+    (
+        \s+AND\s+  # take a hitch on the regex engine and prepare the post_query
+        (?P<post_query_string>\S.+)  # match a non-whitespace followed by anything
+    )?
+    """,
+    flags=re.ASCII | re.IGNORECASE | re.VERBOSE
+)
 
 
 class MVQueryBuilder(ESQueryBuilder):
-
-    def _parse_interval_query(self, q):
-
-        for pattern in PATTERNS:
-            m = re.search(pattern, q)
-            if m:
-                r = m.groupdict()
-                if r['pre_query']:
-                    r['query'] = r['pre_query'].rstrip(r['pre_and']).rstrip()
-                    if r['post_query']:
-                        r['query'] += ' ' + r['post_query']
-                elif r['post_query']:
-                    r['query'] = r['post_query'].lstrip(r['post_and']).lstrip()
-                else:
-                    r['query'] = None
-                return r
-        return False
+    @staticmethod
+    def _parse_interval_query(q: str) -> Optional[Dict[str, str]]:
+        """
+        Parse query string and extract appropriate genome interval query
+
+        If the query string includes a valid genome interval/position query,
+        such information is extracted, along with other parts of the string
+        query. Using [] to denote optional parts and <> for required parts,
+        such queries looks like this:
+            [query string AND ] chr<Chromosome>:<start>-<end> [AND query string]
+            [query string AND ] chr<Chromosome>:<position> [AND query string]
+
+        If the query string is not of this format, None is returned. If a
+        valid interval query is found, a dictionary is returned with the keys
+        'chr', 'gstart', 'gend', and 'query'.
+
+        Args:
+            q: input query string
+        Returns:
+            None: if input query string is not a valid interval query
+            Dict[str, str]: with the following keys
+                'chr': Chromosome identifier: 1-22, X, Y, MT
+                'gstart': start position of the genomic interval
+                'gend': end position of the genomic interval (same as
+                    'gstart' for a single-position query)
+                'query': other parts of the query string, concatenated with AND
+        """
+        # don't even bother when we don't see chr
+        # even with improved regex, this is a few times faster
+        start_pos = q.find('chr')  # find first occurrence of 'chr'
+        # might not be what we're looking for, but usually discards enough
+        # so the regex engine runs less
+        if start_pos < 0:
+            return None
+        m = re.search(INTERVAL_PATTERN, q[start_pos:])
+        if not m:
+            return None
+        start_pos += m.start()  # add real offset
+        pre_query = q[:start_pos].strip()
+        query = []
+        if pre_query != '':
+            # pre_query non empty and does not end in AND\s+
+            if not q[start_pos - 1].isspace():
+                return None
+            if pre_query[-3:].upper() != 'AND':
+                return None
+            query.append(f'({pre_query[:-3]})')  # strip the AND, add parenthesis
+        md = m.groupdict()
+        r = {}
+        # copy chr
+        r['chr'] = md['chr']
+        # copy start/end
+        if md['gpos']:
+            r['gstart'] = r['gend'] = md['gpos']
+        else:
+            r['gstart'] = md['gstart']
+            r['gend'] = md['gend']
+        if md['post_query_string']:
+            query.append(f"({md['post_query_string']})")
+        r['query'] = ' AND '.join(query)
+        return r
 
     def default_string_query(self, q, options):
 
         match = self._parse_interval_query(q)
         if match:  # interval query
             search = Search()
-            if match['query']:
+            if match['query'] != '':
                 search = search.query("query_string", query=match['query'])
             search = search.filter('match', chrom=match['chr'])
             assembly = 'hg38' if options.assembly == 'hg38' else 'hg19'