Merge pull request #117 from biothings/0.10.x

Use 0.10.x BioThings SDK
biothings · Oct 26, 2021 · 907f87b
2 parents 8cad607 + e5b900d
commit 907f87b
Show file tree

Hide file tree

Showing 26 changed files with 94 additions and 627 deletions.
diff --git a/requirements_hub.txt b/requirements_hub.txt
@@ -1,2 +1,3 @@
+git+https://github.com/biothings/biothings.api@master#egg=biothings[hub]
 biopython # refseq
 pandas # umls
diff --git a/requirements_web.txt b/requirements_web.txt
@@ -1,6 +1,4 @@
 # biothings
-biothings[web_extra]==0.9.1
-elasticsearch==6.3.1
-elasticsearch-dsl==6.3.1
+git+https://github.com/biothings/biothings.api@master#egg=biothings[web_extra]
 #Optional
 raven
diff --git a/src/config_web.py b/src/config_web.py
@@ -11,11 +11,8 @@
 # *****************************************************************************
 # Elasticsearch Settings
 # *****************************************************************************
-# elasticsearch server transport url
 ES_HOST = 'localhost:9200'
-# elasticsearch index name
-ES_INDEX = 'genedoc_mygene_allspecies_current'
-# elasticsearch document type
+ES_INDEX = 'mygene_current'
 ES_DOC_TYPE = 'gene'
 
 # *****************************************************************************
@@ -48,13 +45,13 @@
 TAXONOMY = {
     "human": {"tax_id": "9606", "assembly": "hg38"},
     "mouse": {"tax_id": "10090", "assembly": "mm10"},
-    "rat": {"tax_id": "10116", "assembly": "rn4"},
-    "fruitfly": {"tax_id": "7227", "assembly": "dm3"},
-    "nematode": {"tax_id": "6239", "assembly": "ce10"},
-    "zebrafish": {"tax_id": "7955", "assembly": "zv9"},
-    "thale-cress": {"tax_id": "3702"},
-    "frog": {"tax_id": "8364", "assembly": "xenTro3"},
-    "pig": {"tax_id": "9823", "assembly": "susScr2"}
+    "rat": {"tax_id": "10116", "assembly": "rn6"},
+    "fruitfly": {"tax_id": "7227", "assembly": "dm6"},
+    "nematode": {"tax_id": "6239", "assembly": "ce11"},
+    "zebrafish": {"tax_id": "7955", "assembly": "danRer11"},
+    "thale-cress": {"tax_id": "3702", "assembly": "araTha1"},
+    "frog": {"tax_id": "8364", "assembly": "xenTro9"},
+    "pig": {"tax_id": "9823", "assembly": "susScr11"}
 }
 
 DATASOURCE_TRANSLATIONS = {
@@ -97,7 +94,6 @@
         'default': ['all'],
         'strict': False,
         'max': 1000,
-        'group': 'esqb',
         'translations': [
             (re.compile(pattern, re.I), translation['tax_id'])
             for (pattern, translation) in TAXONOMY.items()
@@ -108,18 +104,17 @@
         'default': None,
         'strict': False,
         'max': 1000,
-        'group': 'esqb',
         'translations': [
             (re.compile(pattern, re.I), translation['tax_id']) for
             (pattern, translation) in TAXONOMY.items()
         ]
     }
 }
 FIELD_FILTERS = {
-    'entrezonly': {'type': bool, 'default': False, 'group': 'esqb'},
-    'ensemblonly': {'type': bool, 'default': False, 'group': 'esqb'},
-    'exists': {'type': list, 'default': None, 'max': 1000, 'group': 'esqb', 'strict': False},
-    'missing': {'type': list, 'default': None, 'max': 1000, 'group': 'esqb', 'strict': False},
+    'entrezonly': {'type': bool, 'default': False},
+    'ensemblonly': {'type': bool, 'default': False},
+    'exists': {'type': list, 'default': None, 'max': 1000, 'strict': False},
+    'missing': {'type': list, 'default': None, 'max': 1000, 'strict': False},
 }
 
 DATASOURCE_TRANSLATION_TYPEDEF = [
@@ -142,7 +137,7 @@
 QUERY_KWARGS['*']['_source']['strict'] = False
 QUERY_KWARGS['GET']['q']['translations'] = DATASOURCE_TRANSLATION_TYPEDEF
 QUERY_KWARGS['POST']['scopes']['translations'] = TRIMMED_DATASOURCE_TRANSLATION_TYPEDEF
-QUERY_KWARGS['GET']['include_tax_tree'] = {'type': bool, 'default': False, 'group': 'esqb'}
+QUERY_KWARGS['GET']['include_tax_tree'] = {'type': bool, 'default': False}
 QUERY_KWARGS['POST']['scopes']['default'] = ["_id", "entrezgene", "ensembl.gene", "retired"]
 QUERY_KWARGS['POST']['q']['jsoninput'] = True
 
@@ -151,27 +146,16 @@
 # Elasticsearch Query Pipeline
 # *****************************************************************************
 ES_QUERY_BUILDER = "web.pipeline.MygeneQueryBuilder"
-ES_RESULT_TRANSFORM = "web.pipeline.MygeneTransform"
 AVAILABLE_FIELDS_EXCLUDED = ['all', 'accession_agg', 'refseq_agg']
 
-# *****************************************************************************
-# Analytics Settings
-# *****************************************************************************
-GA_ACTION_QUERY_GET = 'query_get'
-GA_ACTION_QUERY_POST = 'query_post'
-GA_ACTION_ANNOTATION_GET = 'gene_get'
-GA_ACTION_ANNOTATION_POST = 'gene_post'
-GA_TRACKER_URL = 'MyGene.info'
-
 # *****************************************************************************
 # Endpoints Specifics & Others
 # *****************************************************************************
 
 # kwargs for status check
 STATUS_CHECK = {
     'id': '1017',
-    'index': 'genedoc_mygene_allspecies_current',
-    'doc_type': 'gene'
+    'index': 'mygene_current'
 }
 
 # This essentially bypasses the es.get fallback as in myvariant...
@@ -180,10 +164,6 @@
 ANNOTATION_ID_REGEX_LIST = [(re.compile(r'^\d+$'), ['entrezgene', 'retired'])]
 ANNOTATION_DEFAULT_SCOPES = ["_id", "entrezgene", "ensembl.gene", "retired"]
 
-# for error messages
-ID_REQUIRED_MESSAGE = 'Gene ID Required'
-ID_NOT_FOUND_TEMPLATE = "Gene ID '{bid}' not found"
-
 # for docs
 INCLUDE_DOCS = False
 DOCS_STATIC_PATH = 'docs/_build/html'

diff --git a/src/hub/dataindex/indexer.py b/src/hub/dataindex/indexer.py
@@ -3,18 +3,18 @@
 
 class GeneIndexer(Indexer):
 
-    def __init__(self, build_doc, indexer_env, target_name, index_name):
-        super().__init__(build_doc, indexer_env, target_name, index_name)
-        self.index_settings["codec"] = "best_compression"  # mygene's specific
-        self.index_settings["number_of_replicas"] = 0
-        self.index_settings["number_of_shards"] = 3
-        self.index_settings["analysis"]["tokenizer"] = {
+    def __init__(self, build_doc, indexer_env, index_name):
+        super().__init__(build_doc, indexer_env, index_name)
+
+        # add a tokenizer
+        self.es_index_settings["analysis"]["tokenizer"] = {
             "refseq_tokenizer": {
                 "delimiter": ".",
                 "type": "path_hierarchy"
             }
         }
-        self.index_settings["analysis"]["analyzer"]["refseq_analyzer"] = {
+        # add an analyzer
+        self.es_index_settings["analysis"]["analyzer"]["refseq_analyzer"] = {
             "filter": "lowercase",
             "tokenizer": "refseq_tokenizer",
             "type": "custom"

diff --git a/src/hub/dataload/sources/exac/parser.py b/src/hub/dataload/sources/exac/parser.py
@@ -94,10 +94,10 @@ def load_broadinstitute_exac(data_folder):
     from ..ensembl.parser import EnsemblParser
     from biothings.utils.hub_db import get_src_dump
     ensembl_doc = get_src_dump().find_one({"_id":"ensembl"}) or {}
-    ensembl_dir = ensembl_doc.get("data_folder")
+    ensembl_dir = ensembl_doc.get('download', {}).get("data_folder")
     assert ensembl_dir, "Can't find Ensembl data directory (used for id conversion)"
-    ensembl_parser = EnsemblParser(ensembl_dir)
-    ensembl_parser._load_ensembl2entrez_li()
+    ensembl_parser = EnsemblParser('ensembl', ensembl_dir)
+    ensembl_parser._load_ensembl2entrez_li('ensembl')
     ensembl2entrez = list2dict(ensembl_parser.ensembl2entrez_li, 0, alwayslist=True)
     for line in tabfile_feeder(os.path.join(ensembl_dir,"gene_ensembl__translation__main.txt")):
         _,ensid,transid,_ = line

diff --git a/src/index.py b/src/index.py
@@ -10,10 +10,11 @@
 import os.path
 
 import config
-from biothings.web.index_base import main
+from biothings.web.launcher import main
 
 ADDON_HANDLERS = [
-    (r"/demo/?(.*)", "tornado.web.StaticFileHandler", {"path": "docs/demo", "default_filename": "index.html"}),
+    (r"/demo/?(.*)", "tornado.web.StaticFileHandler",
+     {"path": "docs/demo", "default_filename": "index.html"}),
 ]
 if config.INCLUDE_DOCS:
     if not os.path.exists(config.DOCS_STATIC_PATH):

diff --git a/src/plugins/orthology_agr/manifest.json b/src/plugins/orthology_agr/manifest.json
@@ -1,6 +1,6 @@
 {
     "version": "0.2",
-    "requires" : ["pandas"],
+    "requires" : ["numpy", "pandas"],
     "__metadata__":{
         "url": "https://www.alliancegenome.org/",
         "license_url": "https://creativecommons.org/licenses/by/4.0/",

diff --git a/src/tests/app_tests/config.py b/src/tests/app_tests/config.py
@@ -1,13 +1,8 @@
-
-"""
-    Mychem.info
-    https://mychem.info/
-    Chemical and Drug Annotation as a Service.
-"""
 import os as _os
 import sys as _sys
 import importlib.util as _imp_util
 
+
 CONFIG_FILE_NAME = "config_web.py"
 
 # find the path of the config file

diff --git a/src/tests/app_tests/test_app_tests.py b/src/tests/app_tests/test_app_tests.py
@@ -3,7 +3,6 @@
 from biothings.tests.web import BiothingsWebAppTest
 
 
-
 class TestAnnotationScopes(BiothingsWebAppTest):
     TEST_DATA_DIR_NAME = 'AnnotationTests'
 
@@ -27,7 +26,7 @@ def test_retired(self):
         res = res.json()
         assert len(res) == 1
         assert self.value_in_result(q, res, 'retired') or \
-               self.value_in_result(int(q), res, 'retired')
+            self.value_in_result(int(q), res, 'retired')
 
     # cannot test for entrez != _id (no such data)
     # not adding a mock data for this case because this seems unlikely
@@ -76,13 +75,13 @@ def test_003_entrez_int(self):
     def test_010_taxtree_true(self):
         self.query(
             hits=True,
-            q='lytic enzyme', species='1386', include_tax_tree=True
+            q='lytic enzyme', species='1385', include_tax_tree=True
         )
 
     def test_011_taxtree_false(self):
         self.query(
             hits=False,
-            q='lytic enzyme', species='1386',
+            q='lytic enzyme', species='1385',
         )
 
     def test_012_species_translation(self):
@@ -94,6 +93,17 @@ def test_012_species_translation(self):
             assert self.value_in_result(9606, hit, 'taxid') or \
                 self.value_in_result("9606", hit, 'taxid')
 
+    @pytest.mark.skip("this is also impossible to trigger. "
+                      "It gets casted before reaching the pipeline")
+    def test_013_species_type_error(self):
+        self.request('query', method='POST', expect=400,
+                     data={'q': '__all__', 'species': True})
+
+    def test_014_species_translation_fail(self):
+        self.request('query', params={
+            'q': '__all__', 'species': 'elf',  # I don't see this test case changing soon
+        }, expect=400)
+
     def test_020_interval_query_hg38(self):
         q = 'chr12:55,966,782-55,972,788'
         # should hit 1017
@@ -167,15 +177,15 @@ def test_040_species_facet_filter(self):
         taxid = 9606
         res1 = self.query(
             hits=True,
-            q=q, aggs='type_of_gene'
+            q=q, aggs='type_of_gene', size=10
         )
         res2 = self.query(
             hits=True,
-            q=q, aggs='type_of_gene', species=taxid
+            q=q, aggs='type_of_gene', species=taxid, size=10
         )
         res3 = self.query(
             hits=True,
-            q=q, aggs='type_of_gene', species_facet_filter=taxid
+            q=q, aggs='type_of_gene', species_facet_filter=taxid,size=10
         )
         assert res1['facets'] != res2['facets']
         assert res1['facets'] == res3['facets']
@@ -184,13 +194,12 @@ def test_040_species_facet_filter(self):
 
 
 class TestMetadata(BiothingsWebAppTest):
+    TEST_DATA_DIR_NAME = 'QueryTests'
+
     def test_metadata_extra(self):
         res = self.request('metadata')
         res = res.json()
-        assert 'available_fields' in res
-        assert 'app_revision' in res
         assert 'genome_assembly' in res
         assert isinstance(res['genome_assembly'], dict)
         assert 'taxonomy' in res
         assert isinstance(res['taxonomy'], dict)
-        assert 'source' in res
diff --git a/src/tests/README.md → src/tests/data_tests/README.md b/src/tests/README.md → src/tests/data_tests/README.md
diff --git a/src/tests/test_1_annotation.py → src/tests/data_tests/test_1_annotation.py b/src/tests/test_1_annotation.py → src/tests/data_tests/test_1_annotation.py
diff --git a/src/tests/test_2_query.py → src/tests/data_tests/test_2_query.py b/src/tests/test_2_query.py → src/tests/data_tests/test_2_query.py
@@ -1,3 +1,4 @@
+import pytest
 from biothings.tests.web import BiothingsDataTest
 
 
@@ -22,6 +23,7 @@ def test_203_query(self):
         # interval query
         self.query(q='chr1:151,073,054-151,383,976&species=human')
 
+    @pytest.mark.skip("Feature Removed in BioThings 0.7.0")
     def test_204_query(self):
         pass # feature removed in biothings 0.7.0
         # con = self.request('query?q=cdk2&callback=mycallback').content

diff --git a/src/tests/test_3_web.py → src/tests/data_tests/test_3_web.py b/src/tests/test_3_web.py → src/tests/data_tests/test_3_web.py
@@ -47,12 +47,12 @@ def test_331_taxonomy(self):
         assert "lineage" in res
 
     def test_332_taxonomy(self):
-        res = self.request("species/46170?include_children=true").json()
+        res = self.request("species/1280?include_children=true").json()
         assert len(res['children']) >= 305
 
     def test_333_taxonomy(self):
-        res = self.request("species/46170?include_children=true").json()
-        res2 = self.request("species/46170?include_children=true&has_gene=1").json()
+        res = self.request("species/1280?include_children=true").json()
+        res2 = self.request("species/1280?include_children=true&has_gene=1").json()
         assert len(res2['children']) >= 11
         assert len(res2['children']) <= len(res['children'])
 

diff --git a/src/tests/test_4_input.py → src/tests/data_tests/test_4_input.py b/src/tests/test_4_input.py → src/tests/data_tests/test_4_input.py
diff --git a/src/tests/test_5_control.py → src/tests/data_tests/test_5_control.py b/src/tests/test_5_control.py → src/tests/data_tests/test_5_control.py
diff --git a/src/tests/test_6_sources.py → src/tests/data_tests/test_6_sources.py b/src/tests/test_6_sources.py → src/tests/data_tests/test_6_sources.py
@@ -65,7 +65,6 @@ def test_611_refseq(self):
         assert hit["refseq"]["protein"] == "NP_001670.1"
         assert hit["refseq"]["rna"].startswith("NM_001679.")
 
-
     def test_615_accession(self):
         protein = filter_hits(self.request(
             "query?q=accession:AAH68303&fields=accession").json())
@@ -274,6 +273,7 @@ def test_670_pharos(self):
         res = self.request("gene/56141?fields=pharos").json()
         assert res["pharos"]["target_id"] == 4745
 
+
 def filter_hits(dic, field=None):
     ''' Filter hits by removing specified fields or by default meta fields '''
     res = dict(dic)

diff --git a/src/tools/update_mapping.py b/src/tools/update_mapping.py
diff --git a/src/utils/__init__.py b/src/utils/__init__.py