def post_dump(self, *args, **kwargs):
    """Validate downloaded files against their ``*.md5`` checksum files.

    Runs ``md5sum -c`` on every ``*.md5`` file found directly inside
    ``self.new_data_folder``. The check is executed with the data folder
    as the child's working directory (file names inside a checksum file
    are resolved relative to it), so the working directory of the current
    process is never changed.

    Args:
        *args: Accepted for interface compatibility; not used here.
        **kwargs: Accepted for interface compatibility; not used here.

    Raises:
        OSError: If the ``md5sum`` executable cannot be found in PATH.
        DumperException: If ``md5sum -c`` fails for any checksum file.
    """
    self.logger.debug("Start validating downloaded files...")
    md5sum = shutil.which("md5sum")
    if not md5sum:
        raise OSError('"md5sum" is not found in the PATH!')

    data_folder = self.new_data_folder
    # glob.escape() keeps special characters in the folder name literal;
    # only the "*.md5" part acts as a pattern.
    pattern = os.path.join(glob.escape(data_folder), "*.md5")
    md5_files = sorted(os.path.basename(path) for path in glob.glob(pattern))

    for md5_file in md5_files:
        self.logger.debug("\tValidating md5 checksum for: %s", md5_file)
        try:
            # cwd= confines the directory change to the child process
            # instead of calling os.chdir() on the whole process.
            subprocess.check_call([md5sum, "-c", md5_file], cwd=data_folder)
        except subprocess.SubprocessError as err:
            raise DumperException("Failed to validate: %s" % md5_file) from err

    if md5_files:
        self.logger.debug("All %s files are validated.", len(md5_files))