From f5cf4dd25e869abc4aa30f1f15b97082471bbbe9 Mon Sep 17 00:00:00 2001 From: Christian Stefanescu Date: Tue, 23 Jan 2024 14:52:26 +0100 Subject: [PATCH 1/2] Respect the timeout and retry settings for converting docs Should fix #573 --- ingestors/settings.py | 4 ++-- ingestors/support/convert.py | 17 ++++++++--------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/ingestors/settings.py b/ingestors/settings.py index e396b2eb9..41ddb2a8d 100644 --- a/ingestors/settings.py +++ b/ingestors/settings.py @@ -4,8 +4,8 @@ TESTING = False -CONVERT_TIMEOUT = env.to_int("INGESTORS_CONVERT_TIMEOUT", 7200) # 2 hrs -CONVERT_RETRIES = env.to_int("INGESTORS_CONVERT_RETRIES", 256) +CONVERT_TIMEOUT = env.to_int("INGESTORS_CONVERT_TIMEOUT", 300) # seconds +CONVERT_RETRIES = env.to_int("INGESTORS_CONVERT_RETRIES", 3) # Enable (expensive!) Google Cloud API OCR_VISION_API = env.to_bool("INGESTORS_OCR_VISION_API", False) diff --git a/ingestors/support/convert.py b/ingestors/support/convert.py index 786978310..79b77dbd5 100644 --- a/ingestors/support/convert.py +++ b/ingestors/support/convert.py @@ -9,11 +9,10 @@ from ingestors.support.cache import CacheSupport from ingestors.support.temp import TempFileSupport from ingestors.exc import ProcessingException +from ingestors import settings log = logging.getLogger(__name__) -TIMEOUT = 3600 # seconds -CONVERT_RETRIES = 5 PDF_CACHE_ACCESSED = Counter( "ingestfile_pdf_cache_accessed", @@ -45,7 +44,9 @@ def document_to_pdf(self, unique_tmpdir, file_path, entity): self.tags.set(key, content_hash) return pdf_file - def _document_to_pdf(self, unique_tmpdir, file_path, entity, timeout=TIMEOUT): + def _document_to_pdf( + self, unique_tmpdir, file_path, entity, timeout=settings.CONVERT_TIMEOUT + ): """Converts an office document to PDF.""" file_name = entity_filename(entity) log.info("Converting [%s] to PDF", entity) @@ -72,17 +73,15 @@ def _document_to_pdf(self, unique_tmpdir, file_path, entity, timeout=TIMEOUT): file_path, ] try: - for attempt in range(1, CONVERT_RETRIES): + for attempt in range(1, settings.CONVERT_RETRIES): log.info( - f"Starting LibreOffice: %s with timeout %s attempt #{attempt}/{CONVERT_RETRIES}", - cmd, - timeout, + f"Starting LibreOffice: {cmd} with timeout {timeout} attempt #{attempt}/{settings.CONVERT_RETRIES}", ) try: subprocess.run(cmd, timeout=timeout, check=True) except Exception as e: log.info( - f"Could not be converted to PDF (attempt {attempt}/{CONVERT_RETRIES}): {e}" + f"Could not be converted to PDF (attempt {attempt}/{settings.CONVERT_RETRIES}): {e}" ) continue @@ -95,7 +94,7 @@ def _document_to_pdf(self, unique_tmpdir, file_path, entity, timeout=TIMEOUT): log.info(f"Successfully converted {out_file}") return out_file raise ProcessingException( - f"Could not be converted to PDF (attempt #{attempt}/{CONVERT_RETRIES})" + f"Could not be converted to PDF (attempt #{attempt}/{settings.CONVERT_RETRIES})" ) except Exception as e: raise ProcessingException("Could not be converted to PDF") from e From 7d068f89efe1c22c248ac7f3d63103007099354d Mon Sep 17 00:00:00 2001 From: Christian Stefanescu Date: Wed, 31 Jan 2024 16:23:21 +0100 Subject: [PATCH 2/2] Remove the CONVERT_RETRIES setting The custom logic for retrying document conversions is no longer necessary. We should be able to rely just on worker retries. --- ingestors/settings.py | 1 - ingestors/support/convert.py | 36 ++++++++++++++---------------------- 2 files changed, 14 insertions(+), 23 deletions(-) diff --git a/ingestors/settings.py b/ingestors/settings.py index 41ddb2a8d..4768e91e9 100644 --- a/ingestors/settings.py +++ b/ingestors/settings.py @@ -5,7 +5,6 @@ TESTING = False CONVERT_TIMEOUT = env.to_int("INGESTORS_CONVERT_TIMEOUT", 300) # seconds -CONVERT_RETRIES = env.to_int("INGESTORS_CONVERT_RETRIES", 3) # Enable (expensive!) Google Cloud API OCR_VISION_API = env.to_bool("INGESTORS_OCR_VISION_API", False) diff --git a/ingestors/support/convert.py b/ingestors/support/convert.py index 79b77dbd5..2e418b741 100644 --- a/ingestors/support/convert.py +++ b/ingestors/support/convert.py @@ -73,28 +73,20 @@ def _document_to_pdf( file_path, ] try: - for attempt in range(1, settings.CONVERT_RETRIES): - log.info( - f"Starting LibreOffice: {cmd} with timeout {timeout} attempt #{attempt}/{settings.CONVERT_RETRIES}", - ) - try: - subprocess.run(cmd, timeout=timeout, check=True) - except Exception as e: - log.info( - f"Could not be converted to PDF (attempt {attempt}/{settings.CONVERT_RETRIES}): {e}" - ) - continue + log.info(f"Starting LibreOffice: {cmd} with timeout {timeout}") + try: + subprocess.run(cmd, timeout=timeout, check=True) + except Exception as e: + raise ProcessingException("Could not be converted to PDF") from e - for file_name in os.listdir(pdf_output_dir): - if not file_name.endswith(".pdf"): - continue - out_file = os.path.join(pdf_output_dir, file_name) - if os.stat(out_file).st_size == 0: - continue - log.info(f"Successfully converted {out_file}") - return out_file - raise ProcessingException( - f"Could not be converted to PDF (attempt #{attempt}/{settings.CONVERT_RETRIES})" - ) + for file_name in os.listdir(pdf_output_dir): + if not file_name.endswith(".pdf"): + continue + out_file = os.path.join(pdf_output_dir, file_name) + if os.stat(out_file).st_size == 0: + continue + log.info(f"Successfully converted {out_file}") + return out_file + raise ProcessingException("Could not be converted to PDF") except Exception as e: raise ProcessingException("Could not be converted to PDF") from e