# Patch summary (three files changed):
#   Dockerfile
#       Adds one build step, placed after the editable `pip3 install` of
#       /ingestors and before the `chown -R app:app /ingestors` step, that runs
#       `argospm update && argospm install translate-en_es`.
#   ingestors/support/pdf.py
#       Adds `import argostranslate.translate` at the top of the module. In
#       `extract_pages`, the page entity's "bodyText" is now the result of
#       `argostranslate.translate.translate(page_model.text, "en", "es")`
#       instead of `page_model.text` itself. The `manager.emit_text_fragment`
#       call is unchanged context and still receives the original
#       `page_model.text`.
#   requirements.txt
#       Appends `argostranslate` with no version specifier; the neighbouring
#       entries visible in the hunk are all pinned with `==`.
#
# NOTE(review): the "en" -> "es" pair is hard-coded both in the installed
#   package name and in the translate() call; presumably every page is assumed
#   to be English -- confirm against the intended use.
# NOTE(review): the argospm step runs before the chown to app:app; if the
#   runtime user differs from the build user, verify the installed language
#   package is visible to it -- TODO confirm.
# NOTE(review): as it appears here the patch has had its line breaks collapsed
#   into a single line; they must be restored before `git apply` / `patch`
#   will accept it.
diff --git a/Dockerfile b/Dockerfile index 3dec0dcee..86ec052ac 100644 --- a/Dockerfile +++ b/Dockerfile @@ -147,6 +147,7 @@ RUN python3 -m spacy download el_core_news_sm \ COPY . /ingestors WORKDIR /ingestors RUN pip3 install --no-cache-dir -e /ingestors +RUN argospm update && argospm install translate-en_es RUN chown -R app:app /ingestors ENV ARCHIVE_TYPE=file \ diff --git a/ingestors/support/pdf.py b/ingestors/support/pdf.py index 5bf7bd41c..3295e3811 100644 --- a/ingestors/support/pdf.py +++ b/ingestors/support/pdf.py @@ -1,3 +1,4 @@ +import argostranslate.translate from dataclasses import dataclass import logging import os @@ -67,7 +68,8 @@ def extract_pages(self, pdf_model: PdfModel, entity, manager): page_entity.make_id(entity.id, page_model.number) page_entity.set("document", entity) page_entity.set("index", page_model.number) - page_entity.add("bodyText", page_model.text) + text_es = argostranslate.translate.translate(page_model.text, "en", "es") + page_entity.add("bodyText", text_es) manager.apply_context(page_entity, entity) manager.emit_entity(page_entity) manager.emit_text_fragment(entity, page_model.text, page_entity.id) diff --git a/requirements.txt b/requirements.txt index d35195495..3da736211 100644 --- a/requirements.txt +++ b/requirements.txt @@ -39,3 +39,4 @@ requests[security]==2.28.2 pymupdf==1.21.1 sentry_sdk==1.26.0 +argostranslate