Skip to content

Commit

Permalink
Add delete file functionality to BaseDumper (#186)
Browse files Browse the repository at this point in the history
* WIP: Add delete file in post_dump
Added `BaseDumper.to_delete` property and `BaseDumper.post_dump_delete_files` method.

* updated `post_dump_delete_files`

* empty to_delete after deleting
  • Loading branch information
zcqian committed Sep 9, 2021
1 parent 8fd18d1 commit 7e16007
Showing 1 changed file with 58 additions and 0 deletions.
58 changes: 58 additions & 0 deletions biothings/hub/dataload/dumper.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import email.utils
import inspect
import os
import os.path
import stat
import pprint
import re
import subprocess
Expand Down Expand Up @@ -59,6 +61,8 @@ def __init__(self,
self.log_folder = log_folder or btconfig.LOG_FOLDER
self.archive = archive or self.ARCHIVE
self.to_dump = []
self.to_delete: List[Union[str, bytes,os.PathLike]] = []
"""Populate with list of relative path of files to delete"""
self.release = None
self.t0 = time.time()
self.logfile = None
Expand Down Expand Up @@ -164,6 +168,59 @@ def post_download(self, remotefile, localfile):
This is a good place to check file's integrity. Optional"""
pass

def post_dump_delete_files(self):
    """
    Delete the paths listed in `self.to_delete` after a dump.

    Invoke this method in post_dump to synchronously delete
    the list of paths stored in `self.to_delete`, in order.
    Each entry is a path relative to `self.new_data_folder` and may be
    a `str`, `bytes` or `os.PathLike` object.

    Non-recursive. If directories need to be removed, build the list such that
    files residing in the directory are removed first and then the directory.
    (Hint: see `os.walk(dir, topdown=False)`)

    Every entry is resolved with `os.path.realpath` (symbolic links are
    followed) before checking that it lies strictly inside the resolved
    `self.new_data_folder`. Entries that do not exist are skipped with a
    warning. `self.to_delete` is reset to an empty list only when the
    whole list was processed without raising.

    Raises:
        RuntimeError: an entry resolves to the download directory itself
            or to a location outside of it, or it is neither a regular
            file nor a directory.
        OSError: propagated from `os.stat`, `os.unlink` or `os.rmdir`
            (for instance when a directory is not empty).
    """
    base_dir: str = os.path.realpath(self.new_data_folder)
    self.logger.debug("Only delete files under %s", base_dir)
    # assume this path is good
    for rel_file_name in self.to_delete:
        # os.fsdecode() turns str / bytes / os.PathLike into str, so a
        # bytes entry can be joined with the str base directory.
        delete_path = os.path.realpath(
            os.path.join(base_dir, os.fsdecode(rel_file_name))
        )  # figure out the full path
        self.logger.debug("%s is %s", rel_file_name, delete_path)
        try:
            common_path = os.path.commonpath((base_dir, delete_path))
        except ValueError:
            # commonpath() refuses paths it cannot compare (e.g. different
            # drives on Windows); such a path cannot be inside base_dir.
            common_path = None
        self.logger.debug("Calculated common prefix path: %s", common_path)
        if common_path != base_dir or delete_path == base_dir:
            raise RuntimeError("Attempting to delete something outside the download "
                               "directory")
        try:
            s = os.stat(delete_path)
            self.logger.debug("stat(%s): %s", delete_path, s)
        except FileNotFoundError:
            self.logger.warning("Cannot delete %s (%s), does not exist",
                                rel_file_name, delete_path)
            continue
        # there is a race condition but the effects are limited
        if stat.S_ISREG(s.st_mode):
            self.logger.info("Deleting regular file %s (%s)",
                             rel_file_name, delete_path)
            try:
                os.unlink(delete_path)
            except Exception:
                self.logger.exception("Failed to delete regular file")
                raise  # bare raise keeps the original traceback
        elif stat.S_ISDIR(s.st_mode):
            self.logger.info("Deleting directory %s (%s)",
                             rel_file_name, delete_path)
            try:
                # rmdir only removes empty directories (non-recursive)
                os.rmdir(delete_path)
            except Exception:
                self.logger.exception("Failed to delete directory")
                raise  # bare raise keeps the original traceback
        else:
            raise RuntimeError(f"{rel_file_name} ({delete_path}) is not "
                               "a regular file or directory, cannot delete")
    self.to_delete = []  # reset the list in case

def post_dump(self, *args, **kwargs):
"""
Placeholder to add a custom process once the whole resource
Expand Down Expand Up @@ -488,6 +545,7 @@ class FTPDumper(BaseDumper):
FTP_TIMEOUT = 10 * 60.0 # we want dumper to time out if necessary
BLOCK_SIZE: Optional[int] = None # default is still kept at 8KB


# TODO: should we add a __del__ to make sure to close ftp connection ?
# ftplib has a context __enter__, but we don't use it that way ("with ...")

Expand Down

0 comments on commit 7e16007

Please sign in to comment.