Skip to content

Commit

Permalink
Merge branch 'production'
Browse files Browse the repository at this point in the history
  • Loading branch information
sirloon committed Jan 2, 2020
2 parents 60fd8c1 + 14fa35c commit 6e50a31
Show file tree
Hide file tree
Showing 6 changed files with 61 additions and 26 deletions.
8 changes: 7 additions & 1 deletion requirements_web.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# biothings
biothings[web_extra]
git+https://github.com/biothings/biothings.api.git@475c41cfc7f0650eb315cefe853e7a0331098517#egg=biothings

# optional
msgpack-python==0.4.6

# for nosetests
nose>=1.3.7

# for sentry monitoring
raven
2 changes: 1 addition & 1 deletion src/hub/dataload/sources/civic/civic_dump.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ class CivicDumper(HTTPDumper):

SRC_NAME = "civic"
SRC_ROOT_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, SRC_NAME)
API_PAGE = 'https://civic.genome.wustl.edu/api/variants/'
API_PAGE = 'https://civicdb.org/api/variants/'
SCHEDULE = "0 22 1 * *"
IGNORE_HTTP_CODE = [404] # some variants are
MAX_PARALLEL_DUMP = 1
Expand Down
4 changes: 2 additions & 2 deletions src/hub/dataload/sources/clinvar/clinvar_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ class ClinvarHG19Uploader(ClinvarBaseUploader):
name = "clinvar_hg19"
main_source = "clinvar"
__metadata__ = {
"mapper" : 'observed',
"mapper" : 'observed_skipidtoolong',
"assembly" : "hg19",
"src_meta" : SRC_META,
}
Expand All @@ -244,7 +244,7 @@ class ClinvarHG38Uploader(ClinvarBaseUploader):
name = "clinvar_hg38"
main_source = "clinvar"
__metadata__ = {
"mapper" : 'observed',
"mapper" : 'observed_skipidtoolong',
"assembly" : "hg38",
"src_meta" : SRC_META,
}
Expand Down
4 changes: 2 additions & 2 deletions src/hub/dataload/sources/dbsnp/dbsnp_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ class DBSNPHg19Uploader(DBSNPBaseUploader):
main_source = "dbsnp"
name = "dbsnp_hg19"
__metadata__ = {
"mapper" : 'observed',
"mapper" : 'observed_skipidtoolong',
"assembly" : "hg19",
"src_meta" : SRC_META
}
Expand All @@ -271,7 +271,7 @@ class DBSNPHg38Uploader(DBSNPBaseUploader):
main_source = "dbsnp"
name = "dbsnp_hg38"
__metadata__ = {
"mapper" : 'observed',
"mapper" : 'observed_skipidtoolong',
"assembly" : "hg38",
"src_meta" : SRC_META
}
Expand Down
36 changes: 16 additions & 20 deletions src/hub/dataload/sources/emv/emv_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,32 +26,28 @@ def get_mapping(klass):
"emv": {
"properties": {
"gene": {
"type": "text",
"analyzer": "string_lowercase",
"copy_to" : ["all"]
"normalizer": "keyword_lowercase_normalizer",
"type": "keyword"
},
"egl_variant": {
"type": "text",
"analyzer": "string_lowercase",
"copy_to" : ["all"]
"variant_id": {
"type": "integer"
},
"egl_protein": {
"type": "text",
"analyzer": "string_lowercase"
"exon": {
"normalizer": "keyword_lowercase_normalizer",
"type": "keyword"
},
"egl_classification": {
"type": "text",
"analyzer": "string_lowercase"
"egl_variant": {
"normalizer": "keyword_lowercase_normalizer",
"type": "keyword"
},
"hgvs": {
"type": "text",
"analyzer": "string_lowercase",
"copy_to" : ["all"]
"type": "text"
},
"clinvar_rcv": {
"type": "text",
"analyzer": "string_lowercase",
"copy_to" : ["all"]
"egl_classification": {
"type": "text"
},
"egl_protein": {
"type": "text"
}
}
}
Expand Down
33 changes: 33 additions & 0 deletions src/tests/remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from nose.core import runmodule
from nose.tools import eq_, ok_
from pprint import pformat

from biothings.tests import BiothingsTestCase

Expand Down Expand Up @@ -330,6 +331,38 @@ def test_seqhashed_long_hgvs_id(self):
h = res["hits"][0]["_id"].split("_seqhashed_")[-1]
assert h in res["hits"][0]["_seqhashed"]

def check_index_count(self, assembly):
    """Compare per-datasource counts in the metadata with counts from the index.

    Requests ``metadata?assembly=<assembly>`` and, for every stat key of every
    source listed under ``"src"`` (except ``snpeff``), derives the document
    field name and asks the query endpoint how many documents contain that
    field (``q=_exists_:<field>`` with ``size=0``). Every mismatch is
    collected so that a single assertion reports all of them at once.

    :param assembly: assembly name sent to both endpoints and removed from
        the stat keys (the tests in this file use ``"hg19"`` and ``"hg38"``).
    :raises AssertionError: if at least one datasource count in the metadata
        differs from the ``total`` returned by the query; the message lists
        the ``meta`` count, the ``index`` count and their ``diff``.
    """
    meta = self.request("metadata?assembly=%s" % assembly).json()
    # plural in meta, singular in docs; an explicit mapping avoids relying on
    # str.rstrip("s"), which strips a *set of characters*, not a suffix
    singular_names = {
        "gnomad_genomes": "gnomad_genome",
        "gnomad_exomes": "gnomad_exome",
    }
    results = {}
    for src_name, src_meta in meta["src"].items():
        if src_name == "snpeff":
            continue  # not a root src, counts always different
            # TODO: that said, count in meta could be correct...
        for stat, meta_cnt in src_meta["stats"].items():
            subsrc = stat.replace("_%s" % assembly, "")
            subsrc = singular_names.get(subsrc, subsrc)
            res = self.request(
                "query?q=_exists_:%s&size=0&assembly=%s" % (subsrc, assembly)
            ).json()
            results[subsrc] = {"meta": meta_cnt, "index": res["total"]}
    # Build fresh dicts for the report instead of mutating `results` in place.
    errs = {
        src: {
            "meta": counts["meta"],
            "index": counts["index"],
            "diff": counts["meta"] - counts["index"],
        }
        for src, counts in results.items()
        if counts["meta"] != counts["index"]
    }
    assert not errs, "Some counts don't match metadata:\n%s" % pformat(errs)


def test_index_count_hg19(self):
# Run the metadata-vs-index count comparison (check_index_count) for the hg19 assembly.
self.check_index_count("hg19")

def test_index_count_hg38(self):
# Run the metadata-vs-index count comparison (check_index_count) for the hg38 assembly.
self.check_index_count("hg38")


if __name__ == '__main__':
print()
Expand Down

0 comments on commit 6e50a31

Please sign in to comment.