Fixes belbio/bel_api#96 - term completion issue

Term completion for IDs such as SP:H4_HUMAN or IL-6 was not possible (the standard tokenizer broke IL-6 into IL and 6). Fixed by changing the tokenizers used by the autocomplete analyzers in the es_mapping file.
belbio · Aug 24, 2018 · 43add12 · 43add12
1 parent 880262e
commit 43add12
Show file tree

Hide file tree

Showing 2 changed files with 24 additions and 13 deletions.
diff --git a/bel/db/es_mapping_terms.yml b/bel/db/es_mapping_terms.yml
@@ -8,10 +8,23 @@ settings:
     analyzer:
       autocomplete:
         type: custom
-        tokenizer: standard
+        tokenizer: autocomplete_tokenizer
         filter:
           - lowercase
           - autocomplete_filter
+      autocomplete_search:
+        type: custom
+        tokenizer: autocomplete_tokenizer
+        filter:
+          - lowercase
+
+    # Use a simple pattern-split tokenizer so that SP:H4_HUMAN in alt_ids can be matched.
+    # This also allows IL-6 to be used for completion; the standard tokenizer/analyzer
+    # broke it into IL and 6.
+    tokenizer:
+      autocomplete_tokenizer:
+        type: simple_pattern_split
+        pattern: " |:"
     normalizer:
       lowercase:
         type: custom
@@ -26,7 +39,7 @@ mappings:
         type: text
         store: true
         analyzer: autocomplete
-        search_analyzer: standard
+        search_analyzer: autocomplete_search
       src_id:
         type: keyword
         copy_to: autocomplete
@@ -41,6 +54,9 @@ mappings:
       alt_ids:
         type: keyword
         copy_to: autocomplete
+      alt_src_ids:
+        type: keyword
+        copy_to: autocomplete
       label:
         type: text
         copy_to: autocomplete

diff --git a/bel/resources/namespace.py b/bel/resources/namespace.py
@@ -141,13 +141,6 @@ def terms_iterator_for_elasticsearch(fo: IO, index_name: str):
             if species_list and species_id and species_id not in species_list:
                 continue
 
-            all_term_ids = set()
-            for term_id in [term['id']] + term.get('alt_ids', []):
-                all_term_ids.add(term_id)
-                all_term_ids.add(lowercase_term_id(term_id))
-
-            term['alt_ids'] = copy.copy(list(all_term_ids))
-
             yield {
                 '_op_type': 'index',
                 '_index': index_name,
@@ -166,7 +159,9 @@ def lowercase_term_id(term_id: str) -> str:
     Returns:
         str: lowercased, e.g. MESH:atherosclerosis
     """
-    (ns, val) = term_id.split(':', maxsplit=1)
-    term_id = f'{ns}:{val.lower()}'
-
-    return term_id
+    try:
+        (ns, val) = term_id.split(':', maxsplit=1)
+        term_id = f'{ns}:{val.lower()}'
+        return term_id
+    except Exception:
+        return term_id