Skip to content

Commit

Permalink
Merge pull request #117 from biothings/0.10.x
Browse files Browse the repository at this point in the history
Use 0.10.x BioThings SDK
  • Loading branch information
zcqian committed Oct 26, 2021
2 parents 8cad607 + e5b900d commit 907f87b
Show file tree
Hide file tree
Showing 26 changed files with 94 additions and 627 deletions.
1 change: 1 addition & 0 deletions requirements_hub.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
git+https://github.com/biothings/biothings.api@master#egg=biothings[hub]
biopython # refseq
pandas # umls
4 changes: 1 addition & 3 deletions requirements_web.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
# biothings
biothings[web_extra]==0.9.1
elasticsearch==6.3.1
elasticsearch-dsl==6.3.1
git+https://github.com/biothings/biothings.api@master#egg=biothings[web_extra]
#Optional
raven
48 changes: 14 additions & 34 deletions src/config_web.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,8 @@
# *****************************************************************************
# Elasticsearch Settings
# *****************************************************************************
# elasticsearch server transport url
ES_HOST = 'localhost:9200'
# elasticsearch index name
ES_INDEX = 'genedoc_mygene_allspecies_current'
# elasticsearch document type
ES_INDEX = 'mygene_current'
ES_DOC_TYPE = 'gene'

# *****************************************************************************
Expand Down Expand Up @@ -48,13 +45,13 @@
TAXONOMY = {
"human": {"tax_id": "9606", "assembly": "hg38"},
"mouse": {"tax_id": "10090", "assembly": "mm10"},
"rat": {"tax_id": "10116", "assembly": "rn4"},
"fruitfly": {"tax_id": "7227", "assembly": "dm3"},
"nematode": {"tax_id": "6239", "assembly": "ce10"},
"zebrafish": {"tax_id": "7955", "assembly": "zv9"},
"thale-cress": {"tax_id": "3702"},
"frog": {"tax_id": "8364", "assembly": "xenTro3"},
"pig": {"tax_id": "9823", "assembly": "susScr2"}
"rat": {"tax_id": "10116", "assembly": "rn6"},
"fruitfly": {"tax_id": "7227", "assembly": "dm6"},
"nematode": {"tax_id": "6239", "assembly": "ce11"},
"zebrafish": {"tax_id": "7955", "assembly": "danRer11"},
"thale-cress": {"tax_id": "3702", "assembly": "araTha1"},
"frog": {"tax_id": "8364", "assembly": "xenTro9"},
"pig": {"tax_id": "9823", "assembly": "susScr11"}
}

DATASOURCE_TRANSLATIONS = {
Expand Down Expand Up @@ -97,7 +94,6 @@
'default': ['all'],
'strict': False,
'max': 1000,
'group': 'esqb',
'translations': [
(re.compile(pattern, re.I), translation['tax_id'])
for (pattern, translation) in TAXONOMY.items()
Expand All @@ -108,18 +104,17 @@
'default': None,
'strict': False,
'max': 1000,
'group': 'esqb',
'translations': [
(re.compile(pattern, re.I), translation['tax_id']) for
(pattern, translation) in TAXONOMY.items()
]
}
}
FIELD_FILTERS = {
'entrezonly': {'type': bool, 'default': False, 'group': 'esqb'},
'ensemblonly': {'type': bool, 'default': False, 'group': 'esqb'},
'exists': {'type': list, 'default': None, 'max': 1000, 'group': 'esqb', 'strict': False},
'missing': {'type': list, 'default': None, 'max': 1000, 'group': 'esqb', 'strict': False},
'entrezonly': {'type': bool, 'default': False},
'ensemblonly': {'type': bool, 'default': False},
'exists': {'type': list, 'default': None, 'max': 1000, 'strict': False},
'missing': {'type': list, 'default': None, 'max': 1000, 'strict': False},
}

DATASOURCE_TRANSLATION_TYPEDEF = [
Expand All @@ -142,7 +137,7 @@
QUERY_KWARGS['*']['_source']['strict'] = False
QUERY_KWARGS['GET']['q']['translations'] = DATASOURCE_TRANSLATION_TYPEDEF
QUERY_KWARGS['POST']['scopes']['translations'] = TRIMMED_DATASOURCE_TRANSLATION_TYPEDEF
QUERY_KWARGS['GET']['include_tax_tree'] = {'type': bool, 'default': False, 'group': 'esqb'}
QUERY_KWARGS['GET']['include_tax_tree'] = {'type': bool, 'default': False}
QUERY_KWARGS['POST']['scopes']['default'] = ["_id", "entrezgene", "ensembl.gene", "retired"]
QUERY_KWARGS['POST']['q']['jsoninput'] = True

Expand All @@ -151,27 +146,16 @@
# Elasticsearch Query Pipeline
# *****************************************************************************
ES_QUERY_BUILDER = "web.pipeline.MygeneQueryBuilder"
ES_RESULT_TRANSFORM = "web.pipeline.MygeneTransform"
AVAILABLE_FIELDS_EXCLUDED = ['all', 'accession_agg', 'refseq_agg']

# *****************************************************************************
# Analytics Settings
# *****************************************************************************
GA_ACTION_QUERY_GET = 'query_get'
GA_ACTION_QUERY_POST = 'query_post'
GA_ACTION_ANNOTATION_GET = 'gene_get'
GA_ACTION_ANNOTATION_POST = 'gene_post'
GA_TRACKER_URL = 'MyGene.info'

# *****************************************************************************
# Endpoints Specifics & Others
# *****************************************************************************

# kwargs for status check
STATUS_CHECK = {
'id': '1017',
'index': 'genedoc_mygene_allspecies_current',
'doc_type': 'gene'
'index': 'mygene_current'
}

# This essentially bypasses the es.get fallback as in myvariant...
Expand All @@ -180,10 +164,6 @@
ANNOTATION_ID_REGEX_LIST = [(re.compile(r'^\d+$'), ['entrezgene', 'retired'])]
ANNOTATION_DEFAULT_SCOPES = ["_id", "entrezgene", "ensembl.gene", "retired"]

# for error messages
ID_REQUIRED_MESSAGE = 'Gene ID Required'
ID_NOT_FOUND_TEMPLATE = "Gene ID '{bid}' not found"

# for docs
INCLUDE_DOCS = False
DOCS_STATIC_PATH = 'docs/_build/html'
Expand Down
14 changes: 7 additions & 7 deletions src/hub/dataindex/indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,18 @@

class GeneIndexer(Indexer):

def __init__(self, build_doc, indexer_env, target_name, index_name):
super().__init__(build_doc, indexer_env, target_name, index_name)
self.index_settings["codec"] = "best_compression" # mygene's specific
self.index_settings["number_of_replicas"] = 0
self.index_settings["number_of_shards"] = 3
self.index_settings["analysis"]["tokenizer"] = {
def __init__(self, build_doc, indexer_env, index_name):
super().__init__(build_doc, indexer_env, index_name)

# add a tokenizer
self.es_index_settings["analysis"]["tokenizer"] = {
"refseq_tokenizer": {
"delimiter": ".",
"type": "path_hierarchy"
}
}
self.index_settings["analysis"]["analyzer"]["refseq_analyzer"] = {
# add an analyzer
self.es_index_settings["analysis"]["analyzer"]["refseq_analyzer"] = {
"filter": "lowercase",
"tokenizer": "refseq_tokenizer",
"type": "custom"
Expand Down
6 changes: 3 additions & 3 deletions src/hub/dataload/sources/exac/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,10 +94,10 @@ def load_broadinstitute_exac(data_folder):
from ..ensembl.parser import EnsemblParser
from biothings.utils.hub_db import get_src_dump
ensembl_doc = get_src_dump().find_one({"_id":"ensembl"}) or {}
ensembl_dir = ensembl_doc.get("data_folder")
ensembl_dir = ensembl_doc.get('download', {}).get("data_folder")
assert ensembl_dir, "Can't find Ensembl data directory (used for id conversion)"
ensembl_parser = EnsemblParser(ensembl_dir)
ensembl_parser._load_ensembl2entrez_li()
ensembl_parser = EnsemblParser('ensembl', ensembl_dir)
ensembl_parser._load_ensembl2entrez_li('ensembl')
ensembl2entrez = list2dict(ensembl_parser.ensembl2entrez_li, 0, alwayslist=True)
for line in tabfile_feeder(os.path.join(ensembl_dir,"gene_ensembl__translation__main.txt")):
_,ensid,transid,_ = line
Expand Down
5 changes: 3 additions & 2 deletions src/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,11 @@
import os.path

import config
from biothings.web.index_base import main
from biothings.web.launcher import main

ADDON_HANDLERS = [
(r"/demo/?(.*)", "tornado.web.StaticFileHandler", {"path": "docs/demo", "default_filename": "index.html"}),
(r"/demo/?(.*)", "tornado.web.StaticFileHandler",
{"path": "docs/demo", "default_filename": "index.html"}),
]
if config.INCLUDE_DOCS:
if not os.path.exists(config.DOCS_STATIC_PATH):
Expand Down
2 changes: 1 addition & 1 deletion src/plugins/orthology_agr/manifest.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"version": "0.2",
"requires" : ["pandas"],
"requires" : ["numpy", "pandas"],
"__metadata__":{
"url": "https://www.alliancegenome.org/",
"license_url": "https://creativecommons.org/licenses/by/4.0/",
Expand Down
7 changes: 1 addition & 6 deletions src/tests/app_tests/config.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,8 @@

"""
Mychem.info
https://mychem.info/
Chemical and Drug Annotation as a Service.
"""
import os as _os
import sys as _sys
import importlib.util as _imp_util


CONFIG_FILE_NAME = "config_web.py"

# find the path of the config file
Expand Down
29 changes: 19 additions & 10 deletions src/tests/app_tests/test_app_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from biothings.tests.web import BiothingsWebAppTest



class TestAnnotationScopes(BiothingsWebAppTest):
TEST_DATA_DIR_NAME = 'AnnotationTests'

Expand All @@ -27,7 +26,7 @@ def test_retired(self):
res = res.json()
assert len(res) == 1
assert self.value_in_result(q, res, 'retired') or \
self.value_in_result(int(q), res, 'retired')
self.value_in_result(int(q), res, 'retired')

# cannot test for entrez != _id (no such data)
# not adding a mock data for this case because this seems unlikely
Expand Down Expand Up @@ -76,13 +75,13 @@ def test_003_entrez_int(self):
def test_010_taxtree_true(self):
self.query(
hits=True,
q='lytic enzyme', species='1386', include_tax_tree=True
q='lytic enzyme', species='1385', include_tax_tree=True
)

def test_011_taxtree_false(self):
self.query(
hits=False,
q='lytic enzyme', species='1386',
q='lytic enzyme', species='1385',
)

def test_012_species_translation(self):
Expand All @@ -94,6 +93,17 @@ def test_012_species_translation(self):
assert self.value_in_result(9606, hit, 'taxid') or \
self.value_in_result("9606", hit, 'taxid')

@pytest.mark.skip("this is also impossible to trigger. "
                  "It gets casted before reaching the pipeline")
def test_013_species_type_error(self):
    """POST a boolean ``species`` value to ``query`` with ``expect=400``.

    Currently skipped: according to the skip reason above, the value is
    converted before it reaches the query pipeline, so the error path
    cannot be exercised through the web layer.
    """
    # A boolean is deliberately used where a species name/taxid is expected.
    payload = {'q': '__all__', 'species': True}
    self.request('query', method='POST', expect=400, data=payload)

def test_014_species_translation_fail(self):
    """GET ``query`` with ``species='elf'`` and ``expect=400``."""
    # NOTE(review): 'elf' is presumably not a translatable species name;
    # the original author did not expect that to change soon.
    query_args = {'q': '__all__', 'species': 'elf'}
    self.request('query', params=query_args, expect=400)

def test_020_interval_query_hg38(self):
q = 'chr12:55,966,782-55,972,788'
# should hit 1017
Expand Down Expand Up @@ -167,15 +177,15 @@ def test_040_species_facet_filter(self):
taxid = 9606
res1 = self.query(
hits=True,
q=q, aggs='type_of_gene'
q=q, aggs='type_of_gene', size=10
)
res2 = self.query(
hits=True,
q=q, aggs='type_of_gene', species=taxid
q=q, aggs='type_of_gene', species=taxid, size=10
)
res3 = self.query(
hits=True,
q=q, aggs='type_of_gene', species_facet_filter=taxid
q=q, aggs='type_of_gene', species_facet_filter=taxid,size=10
)
assert res1['facets'] != res2['facets']
assert res1['facets'] == res3['facets']
Expand All @@ -184,13 +194,12 @@ def test_040_species_facet_filter(self):


class TestMetadata(BiothingsWebAppTest):
TEST_DATA_DIR_NAME = 'QueryTests'

def test_metadata_extra(self):
res = self.request('metadata')
res = res.json()
assert 'available_fields' in res
assert 'app_revision' in res
assert 'genome_assembly' in res
assert isinstance(res['genome_assembly'], dict)
assert 'taxonomy' in res
assert isinstance(res['taxonomy'], dict)
assert 'source' in res
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import pytest
from biothings.tests.web import BiothingsDataTest


Expand All @@ -22,6 +23,7 @@ def test_203_query(self):
# interval query
self.query(q='chr1:151,073,054-151,383,976&species=human')

@pytest.mark.skip("Feature Removed in BioThings 0.7.0")
def test_204_query(self):
pass # feature removed in biothings 0.7.0
# con = self.request('query?q=cdk2&callback=mycallback').content
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,12 @@ def test_331_taxonomy(self):
assert "lineage" in res

def test_332_taxonomy(self):
res = self.request("species/46170?include_children=true").json()
res = self.request("species/1280?include_children=true").json()
assert len(res['children']) >= 305

def test_333_taxonomy(self):
res = self.request("species/46170?include_children=true").json()
res2 = self.request("species/46170?include_children=true&has_gene=1").json()
res = self.request("species/1280?include_children=true").json()
res2 = self.request("species/1280?include_children=true&has_gene=1").json()
assert len(res2['children']) >= 11
assert len(res2['children']) <= len(res['children'])

Expand Down
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,6 @@ def test_611_refseq(self):
assert hit["refseq"]["protein"] == "NP_001670.1"
assert hit["refseq"]["rna"].startswith("NM_001679.")


def test_615_accession(self):
protein = filter_hits(self.request(
"query?q=accession:AAH68303&fields=accession").json())
Expand Down Expand Up @@ -274,6 +273,7 @@ def test_670_pharos(self):
res = self.request("gene/56141?fields=pharos").json()
assert res["pharos"]["target_id"] == 4745


def filter_hits(dic, field=None):
''' Filter hits by removing specified fields or by default meta fields '''
res = dict(dic)
Expand Down
10 changes: 0 additions & 10 deletions src/tools/update_mapping.py

This file was deleted.

Empty file removed src/utils/__init__.py
Empty file.

0 comments on commit 907f87b

Please sign in to comment.