diff --git a/ingestors/settings.py b/ingestors/settings.py index e396b2eb9..4768e91e9 100644 --- a/ingestors/settings.py +++ b/ingestors/settings.py @@ -4,8 +4,7 @@ TESTING = False -CONVERT_TIMEOUT = env.to_int("INGESTORS_CONVERT_TIMEOUT", 7200) # 2 hrs -CONVERT_RETRIES = env.to_int("INGESTORS_CONVERT_RETRIES", 256) +CONVERT_TIMEOUT = env.to_int("INGESTORS_CONVERT_TIMEOUT", 300) # seconds # Enable (expensive!) Google Cloud API OCR_VISION_API = env.to_bool("INGESTORS_OCR_VISION_API", False) diff --git a/ingestors/support/convert.py b/ingestors/support/convert.py index 786978310..2e418b741 100644 --- a/ingestors/support/convert.py +++ b/ingestors/support/convert.py @@ -9,11 +9,10 @@ from ingestors.support.cache import CacheSupport from ingestors.support.temp import TempFileSupport from ingestors.exc import ProcessingException +from ingestors import settings log = logging.getLogger(__name__) -TIMEOUT = 3600 # seconds -CONVERT_RETRIES = 5 PDF_CACHE_ACCESSED = Counter( "ingestfile_pdf_cache_accessed", @@ -45,7 +44,9 @@ def document_to_pdf(self, unique_tmpdir, file_path, entity): self.tags.set(key, content_hash) return pdf_file - def _document_to_pdf(self, unique_tmpdir, file_path, entity, timeout=TIMEOUT): + def _document_to_pdf( + self, unique_tmpdir, file_path, entity, timeout=settings.CONVERT_TIMEOUT + ): """Converts an office document to PDF.""" file_name = entity_filename(entity) log.info("Converting [%s] to PDF", entity) @@ -72,30 +73,20 @@ def _document_to_pdf(self, unique_tmpdir, file_path, entity, timeout=TIMEOUT): file_path, ] try: - for attempt in range(1, CONVERT_RETRIES): - log.info( - f"Starting LibreOffice: %s with timeout %s attempt #{attempt}/{CONVERT_RETRIES}", - cmd, - timeout, - ) - try: - subprocess.run(cmd, timeout=timeout, check=True) - except Exception as e: - log.info( - f"Could not be converted to PDF (attempt {attempt}/{CONVERT_RETRIES}): {e}" - ) - continue + log.info(f"Starting LibreOffice: {cmd} with timeout {timeout}") + try: + subprocess.run(cmd, timeout=timeout, check=True) + except Exception as e: + raise ProcessingException("Could not be converted to PDF") from e - for file_name in os.listdir(pdf_output_dir): - if not file_name.endswith(".pdf"): - continue - out_file = os.path.join(pdf_output_dir, file_name) - if os.stat(out_file).st_size == 0: - continue - log.info(f"Successfully converted {out_file}") - return out_file - raise ProcessingException( - f"Could not be converted to PDF (attempt #{attempt}/{CONVERT_RETRIES})" - ) + for file_name in os.listdir(pdf_output_dir): + if not file_name.endswith(".pdf"): + continue + out_file = os.path.join(pdf_output_dir, file_name) + if os.stat(out_file).st_size == 0: + continue + log.info(f"Successfully converted {out_file}") + return out_file + raise ProcessingException("Could not be converted to PDF") except Exception as e: raise ProcessingException("Could not be converted to PDF") from e