Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions ingestors/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@

TESTING = False

CONVERT_TIMEOUT = env.to_int("INGESTORS_CONVERT_TIMEOUT", 7200) # 2 hrs
CONVERT_RETRIES = env.to_int("INGESTORS_CONVERT_RETRIES", 256)
CONVERT_TIMEOUT = env.to_int("INGESTORS_CONVERT_TIMEOUT", 300) # seconds

# Enable (expensive!) Google Cloud API
OCR_VISION_API = env.to_bool("INGESTORS_OCR_VISION_API", False)
Expand Down
45 changes: 18 additions & 27 deletions ingestors/support/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,10 @@
from ingestors.support.cache import CacheSupport
from ingestors.support.temp import TempFileSupport
from ingestors.exc import ProcessingException
from ingestors import settings

log = logging.getLogger(__name__)

TIMEOUT = 3600 # seconds
CONVERT_RETRIES = 5

PDF_CACHE_ACCESSED = Counter(
"ingestfile_pdf_cache_accessed",
Expand Down Expand Up @@ -45,7 +44,9 @@ def document_to_pdf(self, unique_tmpdir, file_path, entity):
self.tags.set(key, content_hash)
return pdf_file

def _document_to_pdf(self, unique_tmpdir, file_path, entity, timeout=TIMEOUT):
def _document_to_pdf(
self, unique_tmpdir, file_path, entity, timeout=settings.CONVERT_TIMEOUT
):
"""Converts an office document to PDF."""
file_name = entity_filename(entity)
log.info("Converting [%s] to PDF", entity)
Expand All @@ -72,30 +73,20 @@ def _document_to_pdf(self, unique_tmpdir, file_path, entity, timeout=TIMEOUT):
file_path,
]
try:
for attempt in range(1, CONVERT_RETRIES):
log.info(
f"Starting LibreOffice: %s with timeout %s attempt #{attempt}/{CONVERT_RETRIES}",
cmd,
timeout,
)
try:
subprocess.run(cmd, timeout=timeout, check=True)
except Exception as e:
log.info(
f"Could not be converted to PDF (attempt {attempt}/{CONVERT_RETRIES}): {e}"
)
continue
log.info(f"Starting LibreOffice: {cmd} with timeout {timeout}")
try:
subprocess.run(cmd, timeout=timeout, check=True)
except Exception as e:
raise ProcessingException("Could not be converted to PDF") from e

for file_name in os.listdir(pdf_output_dir):
if not file_name.endswith(".pdf"):
continue
out_file = os.path.join(pdf_output_dir, file_name)
if os.stat(out_file).st_size == 0:
continue
log.info(f"Successfully converted {out_file}")
return out_file
raise ProcessingException(
f"Could not be converted to PDF (attempt #{attempt}/{CONVERT_RETRIES})"
)
for file_name in os.listdir(pdf_output_dir):
if not file_name.endswith(".pdf"):
continue
out_file = os.path.join(pdf_output_dir, file_name)
if os.stat(out_file).st_size == 0:
continue
log.info(f"Successfully converted {out_file}")
return out_file
raise ProcessingException("Could not be converted to PDF")
except Exception as e:
raise ProcessingException("Could not be converted to PDF") from e