Merge pull request #163 from biothings/issue-142-dbNSFP

Fix to Issue 142
biothings · Feb 28, 2023 · 3577355 · 3577355
2 parents ad3bf65 + 47e2a1e
commit 3577355
Show file tree

Hide file tree

Showing 4 changed files with 290 additions and 201 deletions.
diff --git a/src/hub/dataload/sources/dbnsfp/README.md b/src/hub/dataload/sources/dbnsfp/README.md
@@ -0,0 +1,30 @@
+## Dumper Design
+
+This dumper uses both FTP and HTTP, according to Sebastien:
+
+- FTP: to get the latest version
+- HTTP: to actually download the file (because the FTP server is slow)
+
+The FTP server is [dbnsfp.softgenetics.com](ftp://dbnsfp.softgenetics.com), and detecting the latest version from it is quite straightforward.
+
+The tricky part is parsing the HTTP download link from the dbNSFP [main page](https://sites.google.com/site/jpopgen/dbNSFP). At the time of writing, the latest academic release is `dbNSFP4.3a`, and the main page lists three download URLs for it, in the following order:
+
+1. [Amazon AWS](https://www.google.com/url?q=https%3A%2F%2Fdbnsfp.s3.amazonaws.com%2FdbNSFP4.3a.zip&sa=D&sntz=1&usg=AOvVaw2jxs6oSlLKGuD0pfWzazXd)
+2. [Box](https://www.google.com/url?q=https%3A%2F%2Fusf.box.com%2Fshared%2Fstatic%2F9r6iamhldji4c3vodwebh3947vgrvsng&sa=D&sntz=1&usg=AOvVaw0IxtlQigv3YxfO4zEGR3U3)
+3. [Google Drive](https://drive.google.com/file/d/1p8zlODMg5RIdG2J_vU1292ZqSocS-lii/view?usp=sharing)
+
+We decided to use the Box URLs for downloading because:
+
+1. The Amazon AWS URLs may return a `403: AccessDenied`
+2. The Google Drive URLs, when accessed by Python code (e.g. our `GoogleDriveDumper`), may report a `Google Drive - Quota exceeded` error, after which we may have to wait 24 hours before trying again.
+
+The challenges in finding the correct Box URL are:
+
+1. Only Amazon AWS URLs contain the filenames (e.g. `dbNSFP4.3a.zip`)
+2. A Box URL might be wrapped as in `https://www.google.com/url?q=<Box_URL>`
+
+Solutions: 
+
+1. Given a filename, we have to pinpoint the Box URL with the following assumption: the first Box URL right after the Amazon AWS URL containing the filename is the target URL.
+2. We can use `urllib.parse.urlparse` and `urllib.parse.parse_qs` to extract the wrapped Box URL from the `q` query parameter.
+
diff --git a/src/hub/dataload/sources/dbnsfp/dbnsfp_dump.py b/src/hub/dataload/sources/dbnsfp/dbnsfp_dump.py
@@ -1,134 +1,187 @@
 import os
-import os.path
-import sys
-import time
 import re
 import requests
+from urllib.parse import urlparse, parse_qs
 from ftplib import FTP
 from bs4 import BeautifulSoup
 import zipfile
 
-import biothings, config
-biothings.config_for_app(config)
-
 from config import DATA_ARCHIVE_ROOT
-from biothings.hub.dataload.dumper import GoogleDriveDumper, DumperException
+from biothings.hub.dataload.dumper import HTTPDumper, DumperException
 from biothings.utils.common import unzipall
 
 
-class DBNSFPDumper(GoogleDriveDumper):
-    '''
-    Mixed dumper (use FTP and HTTP/GoogleDrive) to dump dbNSFP:
+class DBNSFPDumper(HTTPDumper):
+    """
+    Mixed dumper (use FTP and HTTP) to dump dbNSFP:
     - FTP: to get the latest version
     - HTTP: to actually get the data (because their FTP server is sooo slow)
-    '''
+    """
 
     SRC_NAME = "dbnsfp"
     SRC_ROOT_FOLDER = os.path.join(DATA_ARCHIVE_ROOT, SRC_NAME)
-    # "a" is for academic, not "c"ommercial
-    # also, sometimes there's a "v", sometimes not...
-    RELEASE_PAT = "dbNSFPv?(\d+\..*\d+a)\.zip"
 
-    SCHEDULE = "0 9 1 * *"  # 9AM every 1st day of month 
+    SCHEDULE = "0 9 1 * *"  # 9AM every 1st day of month
+
+    # Look for filenames ending with "a" (for Academic), not "c" (for Commercial).
+    #   Also, sometimes there's a "v", sometimes not...
+    FILENAME_PATTERN = re.compile("dbNSFPv?(\d+\..*\d+a)\.zip")
+    # Check if a release is a beta release.
+    # This is tricky: releases are usually named like 4.0a, 4.0b1a, 4.0b2a.
+    #   When sorted, 4.0b2a comes out as the "newest", but it is a beta (b2); 4.0a is actually the newest release.
+    BETA_RELEASE_PATTERN = re.compile("(\d+\.\d+)\w\d(\w)")
 
     def get_newest_info(self):
-        ftp = FTP('dbnsfp.softgenetics.com')
-        ftp.login('dbnsfp','dbnsfp')
-        releases = ftp.nlst()
-        # get rid of readme files
-        pat = re.compile(self.RELEASE_PAT)
-        releases = [(x,pat.match(x).groups()[0]) for x in releases if pat.match(x)]
-        drels = {}
-        [drels.setdefault(rel,f) for (f,rel) in releases]
-        # sort items based on date
-        releases = sorted(drels.keys())
-        # check if there's a non-beta version. Tricky there, usually versions are like that:
-        # 4.0a, 4.0ab1, 4.0ab2
-        # if sorted, 4.0ab2 will be the "newest", but it's a beta (b2) and 4.0a is
-        # actually the newest there
-        newest = releases[-1]
-        nonbetapat = re.compile("(\d+\.\d+)\w\d(\w)")
-        m = nonbetapat.match(newest)
-        if m:
-            nonbeta = "".join(m.groups())
-            if nonbeta in releases:
-                self.logger.info("Found non-beta version '%s'" % nonbeta)
-                newest = nonbeta
+        release_map = dict()  # a dict of <release_num, file_name>
+
+        ftp = FTP("dbnsfp.softgenetics.com")
+        ftp.login("dbnsfp", "dbnsfp")
+        for filename in ftp.nlst():
+            filename_match = self.FILENAME_PATTERN.match(filename)
+            if filename_match:
+                release = filename_match.groups()[0]
+                release_map[release] = filename
 
         # get the last item in the list, which is the latest version
-        self.newest_file = drels[newest]
-        self.release = newest
+        newest_release = sorted(release_map.keys())[-1]
+
+        beta_release_match = self.BETA_RELEASE_PATTERN.match(newest_release)
+        if beta_release_match:
+            # If the newest release is a beta release, infer its stable release
+            stable_release = "".join(beta_release_match.groups())
+
+            # If the inferred stable release is available, use it instead of the beta release
+            if stable_release in release_map:
+                self.logger.info(f"Stable release {stable_release} detected; beta release {newest_release} discarded.")
+                newest_release = stable_release
+            # Otherwise just use the beta release
+            # else:
+            #     pass
+
+        self.release = newest_release
+        self.newest_file = release_map[newest_release]
 
     def new_release_available(self):
-        current_release = self.src_doc.get("download",{}).get("release")
+        current_release = self.src_doc.get("download", {}).get("release")
         if not current_release or self.release > current_release:
-            self.logger.info("New release '%s' found" % self.release)
+            self.logger.info(f"New release {self.release} available, over current release {current_release}.")
             return True
         else:
-            self.logger.debug("No new release found")
+            self.logger.debug(f"No new release available over current release {current_release}.")
             return False
 
-    def get_drive_url(self,ftpname):
-        # ok, so let's get the main page data. in this page there are links for both
-        # FTP and Google Drive. We're assuming here that just after FTP link, there's
-        # the corresponding one for Drive (parse will ensure we downloaded the correct
-        # version, and also the correct licensed one - academic only)
-        res = requests.get("https://sites.google.com/site/jpopgen/dbNSFP")
-        html = BeautifulSoup(res.text,"html.parser")
-        ftplink = html.findAll(attrs={"href":re.compile(ftpname)})
-        if ftplink:
-            ftplink = ftplink.pop()
-        else:
-            raise DumperException("Can't find a FTP link for '%s'" % ftpname)
-        # let's cross fingers here...
-        drivelink = ftplink.findNextSibling()
-        href = drivelink.get("href")
-        if href:
-            return href
-        else:
-            raise DumperException("Can't find a href in drive link element: %s" % drivelink)
+    @classmethod
+    def get_box_url(cls, filename):
+        """
+        Given a filename, find its Box download link from parsing the index page.
+
+        dbNSFP main page provides 3 types of downloads for each release, in the following order:
+        1. Amazon AWS (somehow cannot access)
+        2. Box (direct link, or wrapped in www.google.com/url?q=<Box_URL>)
+        3. Google Drive (a download page, not direct link)
+
+        However only the Amazon AWS download URL will contain the filename. E.g. "dbNSFP4.3a.zip" and
+        "https://www.google.com/url?q=https%3A%2F%2Fdbnsfp.s3.amazonaws.com%2FdbNSFP4.3a.zip&amp;sa=D&amp;sntz=1&amp;usg=AOvVaw2jxs6oSlLKGuD0pfWzazXd".
+
+        The algorithm here is:
+        1. Find the Amazon AWS download URL containing the filename.
+        2. Find the first Box download URL right after the above Amazon AWS URL.
+
+        Note: The above algorithm may fail once the HTML structure of the main page changed.
+        """
+
+        amazon_anchor_text = "Amazon"
+        box_anchor_text = "Box"
+        # google_drive_anchor_text = "googledrive"
+        # to find anchor elements containing text "Amazon", or "Box"
+        anchor_text_pattern = re.compile(f"^{amazon_anchor_text}|{box_anchor_text}$")
+
+        html_response = requests.get("https://sites.google.com/site/jpopgen/dbNSFP")
+        html_text = html_response.text
+        soup = BeautifulSoup(html_text, "html.parser")
+        anchors = soup.find_all("a", href=True, text=anchor_text_pattern)
+
+        amazon_anchor_index = None
+        for index, anchor in enumerate(anchors):
+            if filename in anchor["href"] and anchor.text == amazon_anchor_text:
+                amazon_anchor_index = index
+
+        if amazon_anchor_index is None:
+            raise DumperException(f"Cannot find an {amazon_anchor_text} anchor element containing filename {filename}.")
+
+        box_anchor = None
+        for anchor in anchors[amazon_anchor_index:]:
+            if anchor.text == box_anchor_text:
+                box_anchor = anchor
+                break
 
+        if box_anchor is None:
+            raise DumperException(f"Cannot find a {box_anchor_text} anchor element after the {amazon_anchor_text} anchor of "
+                                  f"{anchors[amazon_anchor_index]['href']}.")
+
+        box_url = box_anchor["href"]
+
+        # The Box download URL might be a "www.google.com/url" URL wrapping the true Box URL. E.g.
+        # "https://www.google.com/url?q=https%3A%2F%2Fusf.box.com%2Fshared%2Fstatic%2Fq1kufbnww5dy3fs2t1yp5ay0w93eufq7"
+        box_url_parse_result = urlparse(box_url)
+        if box_url_parse_result.netloc == "www.google.com":
+            qs_result = parse_qs(box_url_parse_result.query)
+            q = qs_result.get("q", None)
+            if q is None:
+                raise DumperException(f"Cannot find q in the query string of {box_url} for {filename}.")
+            return q[0]  # The wrapped Box URL should be the only element in "q"
+        elif box_url_parse_result.netloc.endswith("box.com"):  # direct Box.com download link
+            return box_url
+        else:
+            raise DumperException(f"Cannot recognized the Box download URL {box_url} for {filename}.")
 
-    def create_todump_list(self, force=False):
+    def create_todump_list(self, force=False, **kwargs):
         self.get_newest_info()
-        new_localfile = os.path.join(self.new_data_folder,os.path.basename(self.newest_file))
+
+        new_localfile = os.path.join(self.new_data_folder, os.path.basename(self.newest_file))
         try:
-            current_localfile = os.path.join(self.current_data_folder,os.path.basename(self.newest_file))
+            current_localfile = os.path.join(self.current_data_folder, os.path.basename(self.newest_file))
         except TypeError:
             # current data folder doesn't even exist
             current_localfile = new_localfile
+
         if force or not os.path.exists(current_localfile) or self.new_release_available():
-            # register new release (will be stored in backend)
-            self.release = self.release
-            remote = self.get_drive_url(self.newest_file)
-            self.to_dump.append({"remote": remote,"local":new_localfile})
+            remote = self.get_box_url(self.newest_file)
+            self.to_dump.append({"remote": remote, "local": new_localfile})
+
+    def post_download(self, remote, local):
+        """
+        Run some sanity checks after downloading
+        """
 
-    def post_download(self,remote,local):
+        """
+        Check #1: The filename of the downloaded archive must contain the release tag.
+        """
         filename = os.path.basename(local)
-        if not self.release in filename:
-            raise DumperException("Weird, filename is wrong ('%s')" % filename)
-        # make sure we downloaded to correct one, and that it's the academic version
+        if self.release not in filename:
+            raise DumperException(f"Weird, filename is wrong ({filename}); should contain release tag {self.release}.")
+
+        """
+        Check #2: The downloaded archive must contain a README whole filename must contain the release tag.
+        """
         zf = zipfile.ZipFile(local)
         readme = None
         for f in zf.filelist:
             if "readme" in f.filename:
                 readme = f
                 break
-        if not readme:
-            raise DumperException("Can't find a readme in the archive (I was checking version/license)")
-        if not self.release in readme.filename:
-            raise DumperException("Version in readme filename ('%s') doesn't match expected version %s" % (readme.filename, self.release))
-        assert self.release.endswith("a"), "Release '%s' isn't academic version (how possible ?)" % self.release
-        # good to go...
+        if readme is None:
+            raise DumperException(f"Can't find a README in the archive {local} (for the purpose of checking version/license).")
+        if self.release not in readme.filename:
+            raise DumperException(f"Version in readme filename ({readme.filename}) doesn't match release tag {self.release}.")
+
+        """
+        Check #3: Must be a academic release. 
+        """
+        assert self.release.endswith("a"), f"Release {self.release} isn't academic version (how possible?)"
+
+        # More checks go here...
 
     def post_dump(self, *args, **kwargs):
         self.logger.info("Unzipping files in '%s'" % self.new_data_folder)
         unzipall(self.new_data_folder)
-
-
-def main():
-    dumper = DBNSFPDumper()
-    dumper.dump()
-
-if __name__ == "__main__":
-    main()