From 4276cc0230f6828d50ef2f849fa3c53b9601eb32 Mon Sep 17 00:00:00 2001 From: Christian Stefanescu Date: Tue, 26 Aug 2025 08:33:08 +0200 Subject: [PATCH 1/3] bugfix: Handle password protected 7z archives --- ingestors/packages/__init__.py | 2 ++ tests/fixtures/7z_password.7z | Bin 0 -> 178 bytes tests/test_packages.py | 9 +++++++++ 3 files changed, 11 insertions(+) create mode 100644 tests/fixtures/7z_password.7z diff --git a/ingestors/packages/__init__.py b/ingestors/packages/__init__.py index f1da6b99c..df8413292 100644 --- a/ingestors/packages/__init__.py +++ b/ingestors/packages/__init__.py @@ -32,6 +32,8 @@ def unpack(self, file_path, entity, temp_dir): z.extractall(path=temp_dir) except ArchiveError as e: raise ProcessingException(f"Error: {e}") + except py7zr.PasswordRequired: + raise ProcessingException("Password protected 7z archive") class SingleFilePackageIngestor(PackageSupport, Ingestor): diff --git a/tests/fixtures/7z_password.7z b/tests/fixtures/7z_password.7z new file mode 100644 index 0000000000000000000000000000000000000000..3fc90ae205e2bb3e0eef2c1819d880312d946eb8 GIT binary patch literal 178 zcmXr7+Ou9=hJj_b==Vtq3=mKRr8C5SS2$ggzw^wl=bGi&#)F0DzAU~`m;Hdpw(DoM z;n&|DyYjdgS=bmDITaY#xfvOlRMF5LW;(1A`k-1u%wZJ0RZ^RFq8lQ literal 0 HcmV?d00001 diff --git a/tests/test_packages.py b/tests/test_packages.py index 05243aa82..bb8d34cb0 100644 --- a/tests/test_packages.py +++ b/tests/test_packages.py @@ -89,3 +89,12 @@ def test_7zip_symlink_escape(self): assert len(self.manager.entities) == 1 assert self.manager.entities[0].first("fileName") == "bad7zip.7z" assert self.manager.entities[0].first("processingStatus") == "failure" + + def test_7zip_password(self): + fixture_path, entity = self.fixture("7z_password.7z") + + self.manager.ingest(fixture_path, entity) + + assert len(self.manager.entities) == 1 + assert self.manager.entities[0].first("fileName") == "7z_password.7z" + assert self.manager.entities[0].first("processingStatus") == "failure" From 4fe6eee434df11722f457f12b6a90d4c73cb7ed2 Mon Sep 17 00:00:00 2001 From: Christian Stefanescu Date: Tue, 26 Aug 2025 08:50:38 +0200 Subject: [PATCH 2/3] bugfix: use the correct debian flavor, now that bookworm is oldstable --- Dockerfile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index d76e7af64..45a0c58ad 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,7 +6,7 @@ LABEL org.opencontainers.image.licenses MIT LABEL org.opencontainers.image.source https://github.com/alephdata/ingest-file # Enable non-free archive for `unrar`. -RUN echo "deb http://http.us.debian.org/debian stable non-free" >/etc/apt/sources.list.d/nonfree.list \ +RUN echo "deb http://http.us.debian.org/debian bookworm non-free" >/etc/apt/sources.list.d/nonfree.list \ && apt-get -qq -y update \ && apt-get -qq -y install build-essential locales \ # python deps (mostly to install their dependencies) @@ -148,10 +148,10 @@ RUN pip install --no-cache-dir --config-settings editable_mode=compat --use-pep5 RUN chown -R app:app /ingestors ENV ARCHIVE_TYPE=file \ - ARCHIVE_PATH=/data \ - FTM_STORE_URI=postgresql://aleph:aleph@postgres/aleph \ - REDIS_URL=redis://redis:6379/0 \ - TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata + ARCHIVE_PATH=/data \ + FTM_STORE_URI=postgresql://aleph:aleph@postgres/aleph \ + REDIS_URL=redis://redis:6379/0 \ + TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata # USER app CMD ingestors process From 72fbf9bfbbbc312d3a68931b9b66fc07b221c43e Mon Sep 17 00:00:00 2001 From: Christian Stefanescu Date: Tue, 26 Aug 2025 10:57:35 +0200 Subject: [PATCH 3/3] bugfix: keep fakeredis version under 2.31.0 because of a breaking API change --- Dockerfile | 8 ++++---- requirements.txt | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 45a0c58ad..29500bb0a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -148,10 +148,10 @@ RUN pip install --no-cache-dir --config-settings editable_mode=compat --use-pep5 RUN chown -R app:app /ingestors ENV ARCHIVE_TYPE=file \ - ARCHIVE_PATH=/data \ - FTM_STORE_URI=postgresql://aleph:aleph@postgres/aleph \ - REDIS_URL=redis://redis:6379/0 \ - TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata + ARCHIVE_PATH=/data \ + FTM_STORE_URI=postgresql://aleph:aleph@postgres/aleph \ + REDIS_URL=redis://redis:6379/0 \ + TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata # USER app CMD ingestors process diff --git a/requirements.txt b/requirements.txt index 738f3aba3..18998e542 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,6 +20,7 @@ py7zr==1.0.0 pytest==8.3.5 pytest-cov==6.1.0 click==8.1.8 +fakeredis==2.30.3 # File format support dbf==0.99.10