diff --git a/pyproject.toml b/pyproject.toml index 4285721b..55b372ff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ keywords = [ dependencies = [ "apify-client>=2.0.0,<3.0.0", "apify-shared>=2.0.0,<3.0.0", - "crawlee>=1.0.0,<2.0.0", + "crawlee>=1.0.2,<2.0.0", "cachetools>=5.5.0", "cryptography>=42.0.0", "impit>=0.6.1", diff --git a/src/apify/_configuration.py b/src/apify/_configuration.py index 28158b55..fe6e89c2 100644 --- a/src/apify/_configuration.py +++ b/src/apify/_configuration.py @@ -3,6 +3,7 @@ from datetime import datetime, timedelta from decimal import Decimal from logging import getLogger +from pathlib import Path from typing import Annotated, Any from pydantic import AliasChoices, BeforeValidator, Field, model_validator @@ -421,6 +422,14 @@ def disable_browser_sandbox_on_platform(self) -> Self: logger.warning('Actor is running on the Apify platform, `disable_browser_sandbox` was changed to True.') return self + @property + def canonical_input_key(self) -> str: + return str(Path(self.input_key).with_suffix('.json')) + + @property + def input_key_candidates(self) -> set[str]: + return {self.input_key, self.canonical_input_key, Path(self.canonical_input_key).stem} + @classmethod def get_global_configuration(cls) -> Configuration: """Retrieve the global instance of the configuration. diff --git a/src/apify/storage_clients/_file_system/_key_value_store_client.py b/src/apify/storage_clients/_file_system/_key_value_store_client.py index 5a339982..e320d2c9 100644 --- a/src/apify/storage_clients/_file_system/_key_value_store_client.py +++ b/src/apify/storage_clients/_file_system/_key_value_store_client.py @@ -1,14 +1,19 @@ import asyncio import json -from pathlib import Path +import logging -from typing_extensions import override +from more_itertools import flatten +from typing_extensions import Self, override from crawlee._consts import METADATA_FILENAME +from crawlee.configuration import Configuration as CrawleeConfiguration from crawlee.storage_clients._file_system import FileSystemKeyValueStoreClient +from crawlee.storage_clients.models import KeyValueStoreRecord from apify._configuration import Configuration +logger = logging.getLogger(__name__) + class ApifyFileSystemKeyValueStoreClient(FileSystemKeyValueStoreClient): """Apify-specific implementation of the `FileSystemKeyValueStoreClient`. @@ -17,6 +22,22 @@ class ApifyFileSystemKeyValueStoreClient(FileSystemKeyValueStoreClient): directory, except for the metadata file and the `INPUT.json` file. """ + @override + @classmethod + async def open( + cls, + *, + id: str | None, + name: str | None, + alias: str | None, + configuration: CrawleeConfiguration, + ) -> Self: + client = await super().open(id=id, name=name, alias=alias, configuration=configuration) + + await client._sanitize_input_json_files() # noqa: SLF001 - it's okay, this is a factory method + + return client + @override async def purge(self) -> None: """Purges the key-value store by deleting all its contents. @@ -24,16 +45,16 @@ async def purge(self) -> None: It deletes all files in the key-value store directory, except for the metadata file and the `INPUT.json` file. It also updates the metadata to reflect that the store has been purged. """ - kvs_input_key = Configuration.get_global_configuration().input_key - - # First try to find the alternative format of the input file and process it if it exists. - for file_path in self.path_to_kvs.glob('*'): - if file_path.name == f'{kvs_input_key}.json': - await self._process_input_json(file_path) + configuration = Configuration.get_global_configuration() async with self._lock: + files_to_keep = set( + flatten([key, f'{key}.{METADATA_FILENAME}'] for key in configuration.input_key_candidates) + ) + files_to_keep.add(METADATA_FILENAME) + for file_path in self.path_to_kvs.glob('*'): - if file_path.name in {METADATA_FILENAME, kvs_input_key, f'{kvs_input_key}.{METADATA_FILENAME}'}: + if file_path.name in files_to_keep: continue if file_path.is_file(): await asyncio.to_thread(file_path.unlink, missing_ok=True) @@ -43,15 +64,40 @@ async def purge(self) -> None: update_modified_at=True, ) - async def _process_input_json(self, path: Path) -> None: - """Process simple input json file to format expected by the FileSystemKeyValueStoreClient. + async def _sanitize_input_json_files(self) -> None: + """Handle missing metadata for input files.""" + configuration = Configuration.get_global_configuration() + alternative_keys = configuration.input_key_candidates - {configuration.canonical_input_key} - For example: INPUT.json -> INPUT, INPUT.json.metadata - """ - try: - f = await asyncio.to_thread(path.open) - input_data = json.load(f) - finally: - f.close() - await asyncio.to_thread(path.unlink, missing_ok=True) - await self.set_value(key=path.stem, value=input_data) + if (self.path_to_kvs / configuration.canonical_input_key).exists(): + # Refresh metadata to prevent inconsistencies + input_data = await asyncio.to_thread( + lambda: json.loads((self.path_to_kvs / configuration.canonical_input_key).read_text()) + ) + await self.set_value(key=configuration.canonical_input_key, value=input_data) + + for alternative_key in alternative_keys: + if (alternative_input_file := self.path_to_kvs / alternative_key).exists(): + logger.warning(f'Redundant input file found: {alternative_input_file}') + else: + for alternative_key in alternative_keys: + alternative_input_file = self.path_to_kvs / alternative_key + + # Only process files that actually exist + if alternative_input_file.exists(): + # Refresh metadata to prevent inconsistencies + with alternative_input_file.open() as f: + input_data = await asyncio.to_thread(lambda: json.load(f)) + await self.set_value(key=alternative_key, value=input_data) + + @override + async def get_value(self, *, key: str) -> KeyValueStoreRecord | None: + configuration = Configuration.get_global_configuration() + + if key in configuration.input_key_candidates: + for candidate in configuration.input_key_candidates: + value = await super().get_value(key=candidate) + if value is not None: + return value + + return await super().get_value(key=key) diff --git a/tests/unit/storage_clients/test_file_system.py b/tests/unit/storage_clients/test_file_system.py index 7b938416..280188b7 100644 --- a/tests/unit/storage_clients/test_file_system.py +++ b/tests/unit/storage_clients/test_file_system.py @@ -2,16 +2,16 @@ import asyncio import json -from typing import TYPE_CHECKING +from pathlib import Path +import pytest + +from crawlee import service_locator from crawlee._consts import METADATA_FILENAME from apify import Actor, Configuration from apify.storage_clients._file_system import ApifyFileSystemKeyValueStoreClient -if TYPE_CHECKING: - from pathlib import Path - async def test_purge_preserves_input_file_and_metadata() -> None: """Test that purge() preserves INPUT.json and metadata files but removes other files.""" @@ -61,19 +61,32 @@ async def test_purge_preserves_input_file_and_metadata() -> None: # Verify INPUT.json content is unchanged input_content = await asyncio.to_thread(input_file.read_text) - assert input_content == '{"test": "input"}' + assert json.loads(input_content) == json.loads('{"test": "input"}') + +@pytest.mark.parametrize('input_file_name', ['INPUT', 'INPUT.json']) +async def test_pre_existing_input_used_by_actor(input_file_name: str) -> None: + configuration = Configuration() + service_locator.set_configuration(configuration) + + # Create key-value store directory and make sure that it is empty + path_to_input = Path(configuration.storage_dir) / 'key_value_stores' / 'default' + path_to_input.mkdir(parents=True) + assert list(path_to_input.glob('*')) == [] -async def test_pre_existing_input_used_by_actor(tmp_path: Path) -> None: pre_existing_input = { 'foo': 'bar', } - configuration = Configuration.get_global_configuration() # Create pre-existing INPUT.json file - path_to_input = tmp_path / 'key_value_stores' / 'default' - path_to_input.mkdir(parents=True) - (path_to_input / f'{configuration.input_key}.json').write_text(json.dumps(pre_existing_input)) + (path_to_input / input_file_name).write_text(json.dumps(pre_existing_input)) async with Actor(): assert pre_existing_input == await Actor.get_input() + + # Make sure that the input file doesn't get renamed in the process and metadata are added + assert set(path_to_input.glob('*')) == { + path_to_input / '__metadata__.json', + path_to_input / input_file_name, + path_to_input / f'{input_file_name}.__metadata__.json', + } diff --git a/uv.lock b/uv.lock index f45887f7..7fba3239 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 3 +revision = 2 requires-python = ">=3.10" [[package]] @@ -76,7 +76,7 @@ requires-dist = [ { name = "apify-client", specifier = ">=2.0.0,<3.0.0" }, { name = "apify-shared", specifier = ">=2.0.0,<3.0.0" }, { name = "cachetools", specifier = ">=5.5.0" }, - { name = "crawlee", specifier = ">=1.0.0,<2.0.0" }, + { name = "crawlee", specifier = ">=1.0.2,<2.0.0" }, { name = "cryptography", specifier = ">=42.0.0" }, { name = "impit", specifier = ">=0.6.1" }, { name = "lazy-object-proxy", specifier = ">=1.11.0" }, @@ -516,7 +516,7 @@ toml = [ [[package]] name = "crawlee" -version = "1.0.1" +version = "1.0.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "cachetools" }, @@ -532,9 +532,9 @@ dependencies = [ { name = "typing-extensions" }, { name = "yarl" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/4d/0a/713d5d45c9833a74beaf9f298081823eeef8dcbd5c25b8f87feebe7a4574/crawlee-1.0.1.tar.gz", hash = "sha256:01f7c9bfeace31bb3a175bfb954bc12214dd2fc3d4e9da30e566659f25015a67", size = 24897569, upload-time = "2025-10-06T08:28:36.367Z" } +sdist = { url = "https://files.pythonhosted.org/packages/93/68/45641208866a60176be4c5f2ab620c2122df18db956dc86a03471181e7c3/crawlee-1.0.2.tar.gz", hash = "sha256:522b52c1362d116b95ba85820f87001713f290a3ec690568adb862a4b29d7ca4", size = 24900937, upload-time = "2025-10-08T07:59:09.983Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/bb/46/5ab3edf0497dad3c6c02979609934f36e1bb934076e4401e5b5c7b060fbd/crawlee-1.0.1-py3-none-any.whl", hash = "sha256:a910f0c2e8fec57b3c800af4f60006e1906923f80e9301f0073a85aca7d16c62", size = 303586, upload-time = "2025-10-06T08:28:32.466Z" }, + { url = "https://files.pythonhosted.org/packages/be/ac/860de31ca534adb1d6321f66c7d082ba735eff49090f67a316f8d60f1ee2/crawlee-1.0.2-py3-none-any.whl", hash = "sha256:57a63d0b22493297490a5836e6b1d47dee667004d95bbc01387dcfb00f6a8a7a", size = 304369, upload-time = "2025-10-08T07:59:07.475Z" }, ] [package.optional-dependencies]