Skip to content

Commit

Permalink
Merge pull request #163 from biothings/issue-142-dbNSFP
Browse files Browse the repository at this point in the history
Fix to Issue 142
  • Loading branch information
Yao Yao committed Feb 28, 2023
2 parents ad3bf65 + 47e2a1e commit 3577355
Show file tree
Hide file tree
Showing 4 changed files with 290 additions and 201 deletions.
30 changes: 30 additions & 0 deletions src/hub/dataload/sources/dbnsfp/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
## Dumper Design

This dumper uses both FTP and HTTP, according to Sebastien:

- FTP: to get the latest version
- HTTP: to actually download the file (because the FTP server is slow)

The FTP server is [dbnsfp.softgenetics.com](dbnsfp.softgenetics.com) and the latest version detection works quite straightforward.

The tricky part is parsing the HTTP download link from the dbNSFP [main page]( https://sites.google.com/site/jpopgen/dbNSFP). As this README is written, the latest academic release is `dbNSFP4.3a` and the main page lists three URLs for download, in the following order:

1. [Amazon AWS](https://www.google.com/url?q=https%3A%2F%2Fdbnsfp.s3.amazonaws.com%2FdbNSFP4.3a.zip&sa=D&sntz=1&usg=AOvVaw2jxs6oSlLKGuD0pfWzazXd)
2. [Box](https://www.google.com/url?q=https%3A%2F%2Fusf.box.com%2Fshared%2Fstatic%2F9r6iamhldji4c3vodwebh3947vgrvsng&sa=D&sntz=1&usg=AOvVaw0IxtlQigv3YxfO4zEGR3U3)
3. [Google Drive](https://drive.google.com/file/d/1p8zlODMg5RIdG2J_vU1292ZqSocS-lii/view?usp=sharing)

We decide to use the Box URLs for downloading because:

1. The Amazon AWS URLs may return a `403: AccessDenied`
2. The Google Drive URLs, when accessed by python code (e.g. our `GoogleDriveDumper`), may report `Google Drive - Quota exceeded` error and we may have to wait 24 hours and try again.

Challenges of finding the correct Box URLs are:

1. Only Amazon AWS URLs contain the filenames (e.g. `dbNSFP4.3a.zip`)
2. A Box URL might be wrapped as in `https://www.google.com/url?q=<Box_URL>`

Solutions:

1. Given a filename, we have to pinpoint the Box URL with following assumption: the first Box URL right after the Amazon AWS URL containing the filename is the target URL.
2. We can use `urllib.parse.urlparse` to find the wrapped Box URL.

225 changes: 139 additions & 86 deletions src/hub/dataload/sources/dbnsfp/dbnsfp_dump.py
Original file line number Diff line number Diff line change
@@ -1,134 +1,187 @@
import os
import os.path
import sys
import time
import re
import requests
from urllib.parse import urlparse, parse_qs
from ftplib import FTP
from bs4 import BeautifulSoup
import zipfile

import biothings, config
biothings.config_for_app(config)

from config import DATA_ARCHIVE_ROOT
from biothings.hub.dataload.dumper import GoogleDriveDumper, DumperException
from biothings.hub.dataload.dumper import HTTPDumper, DumperException
from biothings.utils.common import unzipall


class DBNSFPDumper(GoogleDriveDumper):
'''
Mixed dumper (use FTP and HTTP/GoogleDrive) to dump dbNSFP:
class DBNSFPDumper(HTTPDumper):
"""
Mixed dumper (use FTP and HTTP) to dump dbNSFP:
- FTP: to get the latest version
- HTTP: to actually get the data (because their FTP server is sooo slow)
'''
"""

SRC_NAME = "dbnsfp"
SRC_ROOT_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, SRC_NAME)
# "a" is for academic, not "c"ommercial
# also, sometimes there's a "v", sometimes not...
RELEASE_PAT = "dbNSFPv?(\d+\..*\d+a)\.zip"

SCHEDULE = "0 9 1 * *" # 9AM every 1st day of month
SCHEDULE = "0 9 1 * *" # 9AM every 1st day of month

# Look for filenames ending with "a" (for Academic), not "c" (for Commercial).
# Also, sometimes there's a "v", sometimes not...
FILENAME_PATTERN = re.compile("dbNSFPv?(\d+\..*\d+a)\.zip")
# Check if a release is a beta release.
# Tricky there, usually releases are like 4.0a, 4.0b1a, 4.0b2a.
# If sorted, 4.0b2a will be the "newest", but it's a beta (b2) and 4.0a is actually the newest there
BETA_RELEASE_PATTERN = re.compile("(\d+\.\d+)\w\d(\w)")

def get_newest_info(self):
ftp = FTP('dbnsfp.softgenetics.com')
ftp.login('dbnsfp','dbnsfp')
releases = ftp.nlst()
# get rid of readme files
pat = re.compile(self.RELEASE_PAT)
releases = [(x,pat.match(x).groups()[0]) for x in releases if pat.match(x)]
drels = {}
[drels.setdefault(rel,f) for (f,rel) in releases]
# sort items based on date
releases = sorted(drels.keys())
# check if there's a non-beta version. Tricky there, usually versions are like that:
# 4.0a, 4.0ab1, 4.0ab2
# if sorted, 4.0ab2 will be the "newest", but it's a beta (b2) and 4.0a is
# actually the newest there
newest = releases[-1]
nonbetapat = re.compile("(\d+\.\d+)\w\d(\w)")
m = nonbetapat.match(newest)
if m:
nonbeta = "".join(m.groups())
if nonbeta in releases:
self.logger.info("Found non-beta version '%s'" % nonbeta)
newest = nonbeta
release_map = dict() # a dict of <release_num, file_name>

ftp = FTP("dbnsfp.softgenetics.com")
ftp.login("dbnsfp", "dbnsfp")
for filename in ftp.nlst():
filename_match = self.FILENAME_PATTERN.match(filename)
if filename_match:
release = filename_match.groups()[0]
release_map[release] = filename

# get the last item in the list, which is the latest version
self.newest_file = drels[newest]
self.release = newest
newest_release = sorted(release_map.keys())[-1]

beta_release_match = self.BETA_RELEASE_PATTERN.match(newest_release)
if beta_release_match:
# If the newest release is a beta release, infer its stable release
stable_release = "".join(beta_release_match.groups())

# If the inferred stable release is available, use it instead of the beta release
if stable_release in release_map:
self.logger.info(f"Stable release {stable_release} detected; beta release {newest_release} discarded.")
newest_release = stable_release
# Otherwise just use the beta release
# else:
# pass

self.release = newest_release
self.newest_file = release_map[newest_release]

def new_release_available(self):
current_release = self.src_doc.get("download",{}).get("release")
current_release = self.src_doc.get("download", {}).get("release")
if not current_release or self.release > current_release:
self.logger.info("New release '%s' found" % self.release)
self.logger.info(f"New release {self.release} available, over current release {current_release}.")
return True
else:
self.logger.debug("No new release found")
self.logger.debug(f"No new release available over current release {current_release}.")
return False

def get_drive_url(self,ftpname):
# ok, so let's get the main page data. in this page there are links for both
# FTP and Google Drive. We're assuming here that just after FTP link, there's
# the corresponding one for Drive (parse will ensure we downloaded the correct
# version, and also the correct licensed one - academic only)
res = requests.get("https://sites.google.com/site/jpopgen/dbNSFP")
html = BeautifulSoup(res.text,"html.parser")
ftplink = html.findAll(attrs={"href":re.compile(ftpname)})
if ftplink:
ftplink = ftplink.pop()
else:
raise DumperException("Can't find a FTP link for '%s'" % ftpname)
# let's cross fingers here...
drivelink = ftplink.findNextSibling()
href = drivelink.get("href")
if href:
return href
else:
raise DumperException("Can't find a href in drive link element: %s" % drivelink)
@classmethod
def get_box_url(cls, filename):
"""
Given a filename, find its Box download link from parsing the index page.
dbNSFP main page provides 3 types of downloads for each release, in the following order:
1. Amazon AWS (somehow cannot access)
2. Box (direct link, or wrapped in www.google.com/url?q=<Box_URL>)
3. Google Drive (a download page, not direct link)
However only the Amazon AWS download URL will contain the filename. E.g. "dbNSFP4.3a.zip" and
"https://www.google.com/url?q=https%3A%2F%2Fdbnsfp.s3.amazonaws.com%2FdbNSFP4.3a.zip&amp;sa=D&amp;sntz=1&amp;usg=AOvVaw2jxs6oSlLKGuD0pfWzazXd".
The algorithm here is:
1. Find the Amazon AWS download URL containing the filename.
2. Find the first Box download URL right after the above Amazon AWS URL.
Note: The above algorithm may fail once the HTML structure of the main page changed.
"""

amazon_anchor_text = "Amazon"
box_anchor_text = "Box"
# google_drive_anchor_text = "googledrive"
# to find anchor elements containing text "Amazon", or "Box"
anchor_text_pattern = re.compile(f"^{amazon_anchor_text}|{box_anchor_text}$")

html_response = requests.get("https://sites.google.com/site/jpopgen/dbNSFP")
html_text = html_response.text
soup = BeautifulSoup(html_text, "html.parser")
anchors = soup.find_all("a", href=True, text=anchor_text_pattern)

amazon_anchor_index = None
for index, anchor in enumerate(anchors):
if filename in anchor["href"] and anchor.text == amazon_anchor_text:
amazon_anchor_index = index

if amazon_anchor_index is None:
raise DumperException(f"Cannot find an {amazon_anchor_text} anchor element containing filename {filename}.")

box_anchor = None
for anchor in anchors[amazon_anchor_index:]:
if anchor.text == box_anchor_text:
box_anchor = anchor
break

if box_anchor is None:
raise DumperException(f"Cannot find a {box_anchor_text} anchor element after the {amazon_anchor_text} anchor of "
f"{anchors[amazon_anchor_index]['href']}.")

box_url = box_anchor["href"]

# The Box download URL might be a "www.google.com/url" URL wrapping the true Box URL. E.g.
# "https://www.google.com/url?q=https%3A%2F%2Fusf.box.com%2Fshared%2Fstatic%2Fq1kufbnww5dy3fs2t1yp5ay0w93eufq7"
box_url_parse_result = urlparse(box_url)
if box_url_parse_result.netloc == "www.google.com":
qs_result = parse_qs(box_url_parse_result.query)
q = qs_result.get("q", None)
if q is None:
raise DumperException(f"Cannot find q in the query string of {box_url} for {filename}.")
return q[0] # The wrapped Box URL should be the only element in "q"
elif box_url_parse_result.netloc.endswith("box.com"): # direct Box.com download link
return box_url
else:
raise DumperException(f"Cannot recognized the Box download URL {box_url} for {filename}.")

def create_todump_list(self, force=False):
def create_todump_list(self, force=False, **kwargs):
self.get_newest_info()
new_localfile = os.path.join(self.new_data_folder,os.path.basename(self.newest_file))

new_localfile = os.path.join(self.new_data_folder, os.path.basename(self.newest_file))
try:
current_localfile = os.path.join(self.current_data_folder,os.path.basename(self.newest_file))
current_localfile = os.path.join(self.current_data_folder, os.path.basename(self.newest_file))
except TypeError:
# current data folder doesn't even exist
current_localfile = new_localfile

if force or not os.path.exists(current_localfile) or self.new_release_available():
# register new release (will be stored in backend)
self.release = self.release
remote = self.get_drive_url(self.newest_file)
self.to_dump.append({"remote": remote,"local":new_localfile})
remote = self.get_box_url(self.newest_file)
self.to_dump.append({"remote": remote, "local": new_localfile})

def post_download(self, remote, local):
"""
Run some sanity checks after downloading
"""

def post_download(self,remote,local):
"""
Check #1: The filename of the downloaded archive must contain the release tag.
"""
filename = os.path.basename(local)
if not self.release in filename:
raise DumperException("Weird, filename is wrong ('%s')" % filename)
# make sure we downloaded to correct one, and that it's the academic version
if self.release not in filename:
raise DumperException(f"Weird, filename is wrong ({filename}); should contain release tag {self.release}.")

"""
Check #2: The downloaded archive must contain a README whole filename must contain the release tag.
"""
zf = zipfile.ZipFile(local)
readme = None
for f in zf.filelist:
if "readme" in f.filename:
readme = f
break
if not readme:
raise DumperException("Can't find a readme in the archive (I was checking version/license)")
if not self.release in readme.filename:
raise DumperException("Version in readme filename ('%s') doesn't match expected version %s" % (readme.filename, self.release))
assert self.release.endswith("a"), "Release '%s' isn't academic version (how possible ?)" % self.release
# good to go...
if readme is None:
raise DumperException(f"Can't find a README in the archive {local} (for the purpose of checking version/license).")
if self.release not in readme.filename:
raise DumperException(f"Version in readme filename ({readme.filename}) doesn't match release tag {self.release}.")

"""
Check #3: Must be a academic release.
"""
assert self.release.endswith("a"), f"Release {self.release} isn't academic version (how possible?)"

# More checks go here...

def post_dump(self, *args, **kwargs):
self.logger.info("Unzipping files in '%s'" % self.new_data_folder)
unzipall(self.new_data_folder)


def main():
dumper = DBNSFPDumper()
dumper.dump()

if __name__ == "__main__":
main()

0 comments on commit 3577355

Please sign in to comment.