Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
kevinxin90 committed Oct 29, 2019
2 parents b291673 + dfadcc4 commit f7537cf
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 3 deletions.
19 changes: 16 additions & 3 deletions src/hub/dataload/sources/dbnsfp/dbnsfp_dump.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ class DBNSFPDumper(GoogleDriveDumper):
# also, sometimes there's a "v", sometimes not...
# Regex applied to file names to extract the release string (group 1).
# Raw string: "\d" and "\." are invalid escapes in a normal string literal
# (DeprecationWarning since Python 3.6, SyntaxWarning since 3.12); the pattern
# text itself is unchanged.
RELEASE_PAT = r"dbNSFPv?(\d+\..*\d+a)\.zip"

#SCHEDULE = "0 9 * * *" # disabled until we have a new parser for rel. 4.0
SCHEDULE = "0 9 * * *" # disabled until we have a new parser for rel. 4.0

def get_newest_info(self):
ftp = FTP('dbnsfp.softgenetics.com')
Expand All @@ -42,9 +42,22 @@ def get_newest_info(self):
[drels.setdefault(rel,f) for (f,rel) in releases]
# sort items based on date
releases = sorted(drels.keys())
# check if there's a non-beta version. Tricky there, usually versions look like this:
# 4.0a, 4.0b1a, 4.0b2a
# if sorted, 4.0b2a will be the "newest", but it's a beta (b2) and 4.0a is
# actually the newest there
newest = releases[-1]
nonbetapat = re.compile("(\d+\.\d+)\w\d(\w)")
m = nonbetapat.match(newest)
if m:
nonbeta = "".join(m.groups())
if nonbeta in releases:
self.logger.info("Found non-beta version '%s'" % nonbeta)
newest = nonbeta

# get the last item in the list, which is the latest version
self.newest_file = drels[releases[-1]]
self.release = releases[-1]
self.newest_file = drels[newest]
self.release = newest

def new_release_available(self):
current_release = self.src_doc.get("download",{}).get("release")
Expand Down
20 changes: 20 additions & 0 deletions src/hub/dataload/storage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from biothings.hub.dataload.storage import BaseStorage, BasicStorage, IgnoreDuplicatedStorage
from utils.hgvs import encode_long_hgvs_id


class EncodeLongHGVSIDStorage(BaseStorage):
    """
    Storage base class that runs long HGVS ID encoding on each document.

    Intended to be combined with a concrete storage class (see the
    subclasses in this module) so that ``check_doc_func`` is applied on top
    of that storage's behavior.
    """

    def check_doc_func(self, doc):
        """
        Encode the document's long HGVS ID and flag hashed documents.

        :param doc: document to check; passed through ``encode_long_hgvs_id``
                    (presumably a dict-like variant document -- confirm
                    against ``utils.hgvs``).
        :return: the document returned by ``encode_long_hgvs_id``, with
                 ``_seqhashed._flag`` set to True when a non-empty
                 ``_seqhashed`` entry is present.
        """
        encoded = encode_long_hgvs_id(doc)
        seqhashed = encoded.get("_seqhashed")
        if not seqhashed:
            # nothing was hashed (or the entry is empty): return as-is
            return encoded

        # required to query _exists_:_seqhashed
        seqhashed["_flag"] = True
        return encoded

class MyVariantBasicStorage(EncodeLongHGVSIDStorage, BasicStorage):
    """
    BasicStorage combined with long HGVS ID encoding.

    EncodeLongHGVSIDStorage is listed first so its ``check_doc_func`` comes
    first in the method resolution order.
    """
class MyVariantIgnoreDuplicatedStorage(EncodeLongHGVSIDStorage, IgnoreDuplicatedStorage):
    """
    IgnoreDuplicatedStorage combined with long HGVS ID encoding.

    EncodeLongHGVSIDStorage is listed first so its ``check_doc_func`` comes
    first in the method resolution order.
    """

0 comments on commit f7537cf

Please sign in to comment.