From dd94567ed26587c19934e3888977ad762a7dddc0 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Mon, 6 Oct 2025 22:43:36 +0200 Subject: [PATCH 01/15] fix: Also load input from a file with a .json extension in file system storage --- src/apify/_configuration.py | 9 ++++++ .../_file_system/_key_value_store_client.py | 28 +++++++++++++++++-- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/src/apify/_configuration.py b/src/apify/_configuration.py index 28158b55..b5ebf442 100644 --- a/src/apify/_configuration.py +++ b/src/apify/_configuration.py @@ -3,6 +3,7 @@ from datetime import datetime, timedelta from decimal import Decimal from logging import getLogger +from pathlib import Path from typing import Annotated, Any from pydantic import AliasChoices, BeforeValidator, Field, model_validator @@ -421,6 +422,14 @@ def disable_browser_sandbox_on_platform(self) -> Self: logger.warning('Actor is running on the Apify platform, `disable_browser_sandbox` was changed to True.') return self + @property + def canonical_input_key(self) -> str: + return Path(self.input_key).stem + + @property + def input_key_candidates(self) -> set[str]: + return {self.input_key, self.canonical_input_key, Path(self.canonical_input_key).with_suffix('.json').name} + @classmethod def get_global_configuration(cls) -> Configuration: """Retrieve the global instance of the configuration. diff --git a/src/apify/storage_clients/_file_system/_key_value_store_client.py b/src/apify/storage_clients/_file_system/_key_value_store_client.py index 5a339982..83c4573c 100644 --- a/src/apify/storage_clients/_file_system/_key_value_store_client.py +++ b/src/apify/storage_clients/_file_system/_key_value_store_client.py @@ -2,10 +2,12 @@ import json from pathlib import Path +from more_itertools import flatten from typing_extensions import override from crawlee._consts import METADATA_FILENAME from crawlee.storage_clients._file_system import FileSystemKeyValueStoreClient +from crawlee.storage_clients.models import KeyValueStoreRecord from apify._configuration import Configuration @@ -24,16 +26,24 @@ async def purge(self) -> None: It deletes all files in the key-value store directory, except for the metadata file and the `INPUT.json` file. It also updates the metadata to reflect that the store has been purged. """ - kvs_input_key = Configuration.get_global_configuration().input_key + configuration = Configuration.get_global_configuration() # First try to find the alternative format of the input file and process it if it exists. for file_path in self.path_to_kvs.glob('*'): - if file_path.name == f'{kvs_input_key}.json': + if ( + file_path.name in configuration.input_key_candidates + and file_path.name != configuration.canonical_input_key + ): await self._process_input_json(file_path) async with self._lock: + files_to_keep = set( + flatten([key, f'{key}.{METADATA_FILENAME}'] for key in configuration.input_key_candidates) + ) + files_to_keep.add(METADATA_FILENAME) + for file_path in self.path_to_kvs.glob('*'): - if file_path.name in {METADATA_FILENAME, kvs_input_key, f'{kvs_input_key}.{METADATA_FILENAME}'}: + if file_path.name in files_to_keep: continue if file_path.is_file(): await asyncio.to_thread(file_path.unlink, missing_ok=True) @@ -55,3 +65,15 @@ async def _process_input_json(self, path: Path) -> None: f.close() await asyncio.to_thread(path.unlink, missing_ok=True) await self.set_value(key=path.stem, value=input_data) + + @override + async def get_value(self, *, key: str) -> KeyValueStoreRecord | None: + configuration = Configuration.get_global_configuration() + + if key in configuration.input_key_candidates: + for candidate in configuration.input_key_candidates: + value = await super().get_value(key=candidate) + if value is not None: + return value + + return await super().get_value(key=key) From e0d647b9ad0958fdf872d0e94fd7d19fdf05adba Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Mon, 6 Oct 2025 23:00:29 +0200 Subject: [PATCH 02/15] extend test, fix bug --- .../_file_system/_key_value_store_client.py | 5 +---- tests/unit/storage_clients/test_file_system.py | 8 +++++--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/apify/storage_clients/_file_system/_key_value_store_client.py b/src/apify/storage_clients/_file_system/_key_value_store_client.py index 83c4573c..f47fc368 100644 --- a/src/apify/storage_clients/_file_system/_key_value_store_client.py +++ b/src/apify/storage_clients/_file_system/_key_value_store_client.py @@ -30,10 +30,7 @@ async def purge(self) -> None: # First try to find the alternative format of the input file and process it if it exists. for file_path in self.path_to_kvs.glob('*'): - if ( - file_path.name in configuration.input_key_candidates - and file_path.name != configuration.canonical_input_key - ): + if file_path.name in configuration.input_key_candidates: await self._process_input_json(file_path) async with self._lock: diff --git a/tests/unit/storage_clients/test_file_system.py b/tests/unit/storage_clients/test_file_system.py index 7b938416..10ea6e1b 100644 --- a/tests/unit/storage_clients/test_file_system.py +++ b/tests/unit/storage_clients/test_file_system.py @@ -4,6 +4,8 @@ import json from typing import TYPE_CHECKING +import pytest + from crawlee._consts import METADATA_FILENAME from apify import Actor, Configuration @@ -64,16 +66,16 @@ async def test_purge_preserves_input_file_and_metadata() -> None: assert input_content == '{"test": "input"}' -async def test_pre_existing_input_used_by_actor(tmp_path: Path) -> None: +@pytest.mark.parametrize('input_file_name', ['INPUT', 'INPUT.json']) +async def test_pre_existing_input_used_by_actor(tmp_path: Path, input_file_name: str) -> None: pre_existing_input = { 'foo': 'bar', } - configuration = Configuration.get_global_configuration() # Create pre-existing INPUT.json file path_to_input = tmp_path / 'key_value_stores' / 'default' path_to_input.mkdir(parents=True) - (path_to_input / f'{configuration.input_key}.json').write_text(json.dumps(pre_existing_input)) + (path_to_input / input_file_name).write_text(json.dumps(pre_existing_input)) async with Actor(): assert pre_existing_input == await Actor.get_input() From 649227d8c1cf7315dd6406668a718bd8aeb53a87 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Tue, 7 Oct 2025 09:33:20 +0200 Subject: [PATCH 03/15] relax test --- tests/unit/storage_clients/test_file_system.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/storage_clients/test_file_system.py b/tests/unit/storage_clients/test_file_system.py index 10ea6e1b..e2a77f70 100644 --- a/tests/unit/storage_clients/test_file_system.py +++ b/tests/unit/storage_clients/test_file_system.py @@ -63,7 +63,7 @@ async def test_purge_preserves_input_file_and_metadata() -> None: # Verify INPUT.json content is unchanged input_content = await asyncio.to_thread(input_file.read_text) - assert input_content == '{"test": "input"}' + assert json.loads(input_content) == json.loads('{"test": "input"}') @pytest.mark.parametrize('input_file_name', ['INPUT', 'INPUT.json']) From 5b416ee518ddca0ca0cdf7321e607631970ce6a7 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Tue, 7 Oct 2025 09:49:39 +0200 Subject: [PATCH 04/15] Better naming + safety measures --- .../_file_system/_key_value_store_client.py | 24 +++++++++++++++---- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/src/apify/storage_clients/_file_system/_key_value_store_client.py b/src/apify/storage_clients/_file_system/_key_value_store_client.py index f47fc368..24013ed3 100644 --- a/src/apify/storage_clients/_file_system/_key_value_store_client.py +++ b/src/apify/storage_clients/_file_system/_key_value_store_client.py @@ -1,5 +1,6 @@ import asyncio import json +import logging from pathlib import Path from more_itertools import flatten @@ -11,6 +12,8 @@ from apify._configuration import Configuration +logger = logging.getLogger(__name__) + class ApifyFileSystemKeyValueStoreClient(FileSystemKeyValueStoreClient): """Apify-specific implementation of the `FileSystemKeyValueStoreClient`. @@ -31,7 +34,7 @@ async def purge(self) -> None: # First try to find the alternative format of the input file and process it if it exists. for file_path in self.path_to_kvs.glob('*'): if file_path.name in configuration.input_key_candidates: - await self._process_input_json(file_path) + await self._sanitize_input_json(file_path) async with self._lock: files_to_keep = set( @@ -50,18 +53,29 @@ async def purge(self) -> None: update_modified_at=True, ) - async def _process_input_json(self, path: Path) -> None: - """Process simple input json file to format expected by the FileSystemKeyValueStoreClient. + async def _sanitize_input_json(self, path: Path) -> None: + """Transform an input json file to match the naming convention expected by the FileSystemKeyValueStoreClient. For example: INPUT.json -> INPUT, INPUT.json.metadata """ + configuration = Configuration.get_global_configuration() + + f = None try: f = await asyncio.to_thread(path.open) input_data = json.load(f) finally: - f.close() + if f is not None: + f.close() + + if await self.record_exists(key=configuration.canonical_input_key): + logger.warning(f'Redundant input file found: {path}') + return + + logger.info(f'Renaming input file: {path.name} -> {configuration.canonical_input_key}') + await asyncio.to_thread(path.unlink, missing_ok=True) - await self.set_value(key=path.stem, value=input_data) + await self.set_value(key=configuration.canonical_input_key, value=input_data) @override async def get_value(self, *, key: str) -> KeyValueStoreRecord | None: From 81d74fa86b9088da102927130ab08fcb7f63bd97 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Tue, 7 Oct 2025 10:42:44 +0200 Subject: [PATCH 05/15] fix comment --- .../storage_clients/_file_system/_key_value_store_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/apify/storage_clients/_file_system/_key_value_store_client.py b/src/apify/storage_clients/_file_system/_key_value_store_client.py index 24013ed3..8b27e27b 100644 --- a/src/apify/storage_clients/_file_system/_key_value_store_client.py +++ b/src/apify/storage_clients/_file_system/_key_value_store_client.py @@ -56,7 +56,7 @@ async def purge(self) -> None: async def _sanitize_input_json(self, path: Path) -> None: """Transform an input json file to match the naming convention expected by the FileSystemKeyValueStoreClient. - For example: INPUT.json -> INPUT, INPUT.json.metadata + For example: INPUT.json -> INPUT, INPUT.__metadata__.json """ configuration = Configuration.get_global_configuration() From 1195b2c4ae11d18ba1a649f62120bdf16031345e Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Tue, 7 Oct 2025 11:20:07 +0200 Subject: [PATCH 06/15] update expected behavior --- tests/unit/storage_clients/test_file_system.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/unit/storage_clients/test_file_system.py b/tests/unit/storage_clients/test_file_system.py index e2a77f70..f0cc2bbd 100644 --- a/tests/unit/storage_clients/test_file_system.py +++ b/tests/unit/storage_clients/test_file_system.py @@ -79,3 +79,6 @@ async def test_pre_existing_input_used_by_actor(tmp_path: Path, input_file_name: async with Actor(): assert pre_existing_input == await Actor.get_input() + + # Make sure that the input file doesn't get renamed in the process + assert (path_to_input / input_file_name).exists() From 71a9e0c8cb1fa2224c5cc2b74ce2c0a389ab43d1 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Tue, 7 Oct 2025 11:42:53 +0200 Subject: [PATCH 07/15] Do not actually rename files --- .../_file_system/_key_value_store_client.py | 52 +++++++++---------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/src/apify/storage_clients/_file_system/_key_value_store_client.py b/src/apify/storage_clients/_file_system/_key_value_store_client.py index 8b27e27b..7d8d1efa 100644 --- a/src/apify/storage_clients/_file_system/_key_value_store_client.py +++ b/src/apify/storage_clients/_file_system/_key_value_store_client.py @@ -1,7 +1,6 @@ import asyncio import json import logging -from pathlib import Path from more_itertools import flatten from typing_extensions import override @@ -31,10 +30,7 @@ async def purge(self) -> None: """ configuration = Configuration.get_global_configuration() - # First try to find the alternative format of the input file and process it if it exists. - for file_path in self.path_to_kvs.glob('*'): - if file_path.name in configuration.input_key_candidates: - await self._sanitize_input_json(file_path) + await self._sanitize_input_json_files() async with self._lock: files_to_keep = set( @@ -53,29 +49,31 @@ async def purge(self) -> None: update_modified_at=True, ) - async def _sanitize_input_json(self, path: Path) -> None: - """Transform an input json file to match the naming convention expected by the FileSystemKeyValueStoreClient. - - For example: INPUT.json -> INPUT, INPUT.__metadata__.json - """ + async def _sanitize_input_json_files(self) -> None: + """Handle missing metadata for input files.""" configuration = Configuration.get_global_configuration() - - f = None - try: - f = await asyncio.to_thread(path.open) - input_data = json.load(f) - finally: - if f is not None: - f.close() - - if await self.record_exists(key=configuration.canonical_input_key): - logger.warning(f'Redundant input file found: {path}') - return - - logger.info(f'Renaming input file: {path.name} -> {configuration.canonical_input_key}') - - await asyncio.to_thread(path.unlink, missing_ok=True) - await self.set_value(key=configuration.canonical_input_key, value=input_data) + alternative_keys = configuration.input_key_candidates - {configuration.canonical_input_key} + + if (self.path_to_kvs / configuration.canonical_input_key).exists(): + # Handle missing metadata + if not await self.record_exists(key=configuration.canonical_input_key): + input_data = await asyncio.to_thread( + lambda: json.loads((self.path_to_kvs / configuration.canonical_input_key).read_text()) + ) + await self.set_value(key=configuration.canonical_input_key, value=input_data) + + for alternative_key in alternative_keys: + if (alternative_input_file := self.path_to_kvs / alternative_key).exists(): + logger.warning(f'Redundant input file found: {alternative_input_file}') + else: + for alternative_key in alternative_keys: + alternative_input_file = self.path_to_kvs / alternative_key + + # Handle missing metadata + if alternative_input_file.exists() and not await self.record_exists(key=alternative_key): + with alternative_input_file.open() as f: + input_data = await asyncio.to_thread(lambda: json.load(f)) + await self.set_value(key=configuration.canonical_input_key, value=input_data) @override async def get_value(self, *, key: str) -> KeyValueStoreRecord | None: From 8926a58cd9ab86d50bfc434616b4db3cf92528c0 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Tue, 7 Oct 2025 14:22:34 +0200 Subject: [PATCH 08/15] INPUT.json is the default --- src/apify/_configuration.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/apify/_configuration.py b/src/apify/_configuration.py index b5ebf442..fe6e89c2 100644 --- a/src/apify/_configuration.py +++ b/src/apify/_configuration.py @@ -424,11 +424,11 @@ def disable_browser_sandbox_on_platform(self) -> Self: @property def canonical_input_key(self) -> str: - return Path(self.input_key).stem + return str(Path(self.input_key).with_suffix('.json')) @property def input_key_candidates(self) -> set[str]: - return {self.input_key, self.canonical_input_key, Path(self.canonical_input_key).with_suffix('.json').name} + return {self.input_key, self.canonical_input_key, Path(self.canonical_input_key).stem} @classmethod def get_global_configuration(cls) -> Configuration: From 9843ab8e3bcdfb52fc854631f9aebaa2cd124059 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Tue, 7 Oct 2025 16:46:48 +0200 Subject: [PATCH 09/15] Always refresh metadata --- .../_file_system/_key_value_store_client.py | 20 +++++++++---------- .../unit/storage_clients/test_file_system.py | 11 +++++----- 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/src/apify/storage_clients/_file_system/_key_value_store_client.py b/src/apify/storage_clients/_file_system/_key_value_store_client.py index 7d8d1efa..5c4a4a62 100644 --- a/src/apify/storage_clients/_file_system/_key_value_store_client.py +++ b/src/apify/storage_clients/_file_system/_key_value_store_client.py @@ -55,12 +55,11 @@ async def _sanitize_input_json_files(self) -> None: alternative_keys = configuration.input_key_candidates - {configuration.canonical_input_key} if (self.path_to_kvs / configuration.canonical_input_key).exists(): - # Handle missing metadata - if not await self.record_exists(key=configuration.canonical_input_key): - input_data = await asyncio.to_thread( - lambda: json.loads((self.path_to_kvs / configuration.canonical_input_key).read_text()) - ) - await self.set_value(key=configuration.canonical_input_key, value=input_data) + # Refresh metadata to prevent inconsistencies + input_data = await asyncio.to_thread( + lambda: json.loads((self.path_to_kvs / configuration.canonical_input_key).read_text()) + ) + await self.set_value(key=configuration.canonical_input_key, value=input_data) for alternative_key in alternative_keys: if (alternative_input_file := self.path_to_kvs / alternative_key).exists(): @@ -69,11 +68,10 @@ async def _sanitize_input_json_files(self) -> None: for alternative_key in alternative_keys: alternative_input_file = self.path_to_kvs / alternative_key - # Handle missing metadata - if alternative_input_file.exists() and not await self.record_exists(key=alternative_key): - with alternative_input_file.open() as f: - input_data = await asyncio.to_thread(lambda: json.load(f)) - await self.set_value(key=configuration.canonical_input_key, value=input_data) + # Refresh metadata to prevent inconsistencies + with alternative_input_file.open() as f: + input_data = await asyncio.to_thread(lambda: json.load(f)) + await self.set_value(key=configuration.canonical_input_key, value=input_data) @override async def get_value(self, *, key: str) -> KeyValueStoreRecord | None: diff --git a/tests/unit/storage_clients/test_file_system.py b/tests/unit/storage_clients/test_file_system.py index f0cc2bbd..cb41ce19 100644 --- a/tests/unit/storage_clients/test_file_system.py +++ b/tests/unit/storage_clients/test_file_system.py @@ -2,7 +2,7 @@ import asyncio import json -from typing import TYPE_CHECKING +from pathlib import Path import pytest @@ -11,9 +11,6 @@ from apify import Actor, Configuration from apify.storage_clients._file_system import ApifyFileSystemKeyValueStoreClient -if TYPE_CHECKING: - from pathlib import Path - async def test_purge_preserves_input_file_and_metadata() -> None: """Test that purge() preserves INPUT.json and metadata files but removes other files.""" @@ -67,13 +64,15 @@ async def test_purge_preserves_input_file_and_metadata() -> None: @pytest.mark.parametrize('input_file_name', ['INPUT', 'INPUT.json']) -async def test_pre_existing_input_used_by_actor(tmp_path: Path, input_file_name: str) -> None: +async def test_pre_existing_input_used_by_actor(input_file_name: str) -> None: + configuration = Configuration.get_global_configuration() + pre_existing_input = { 'foo': 'bar', } # Create pre-existing INPUT.json file - path_to_input = tmp_path / 'key_value_stores' / 'default' + path_to_input = Path(configuration.storage_dir) / 'key_value_stores' / 'default' path_to_input.mkdir(parents=True) (path_to_input / input_file_name).write_text(json.dumps(pre_existing_input)) From b48135209134082a771c8e73002121a30757c17f Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Tue, 7 Oct 2025 17:11:53 +0200 Subject: [PATCH 10/15] Sanitize input files on open(), not purge() --- .../_file_system/_key_value_store_client.py | 21 ++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/src/apify/storage_clients/_file_system/_key_value_store_client.py b/src/apify/storage_clients/_file_system/_key_value_store_client.py index 5c4a4a62..81c001a6 100644 --- a/src/apify/storage_clients/_file_system/_key_value_store_client.py +++ b/src/apify/storage_clients/_file_system/_key_value_store_client.py @@ -3,9 +3,10 @@ import logging from more_itertools import flatten -from typing_extensions import override +from typing_extensions import Self, override from crawlee._consts import METADATA_FILENAME +from crawlee.configuration import Configuration as CrawleeConfiguration from crawlee.storage_clients._file_system import FileSystemKeyValueStoreClient from crawlee.storage_clients.models import KeyValueStoreRecord @@ -21,6 +22,22 @@ class ApifyFileSystemKeyValueStoreClient(FileSystemKeyValueStoreClient): directory, except for the metadata file and the `INPUT.json` file. """ + @override + @classmethod + async def open( + cls, + *, + id: str | None, + name: str | None, + alias: str | None, + configuration: CrawleeConfiguration, + ) -> Self: + client = await super().open(id=id, name=name, alias=alias, configuration=configuration) + + await client._sanitize_input_json_files() # noqa: SLF001 - it's okay, this is a factory method + + return client + @override async def purge(self) -> None: """Purges the key-value store by deleting all its contents. @@ -30,8 +47,6 @@ async def purge(self) -> None: """ configuration = Configuration.get_global_configuration() - await self._sanitize_input_json_files() - async with self._lock: files_to_keep = set( flatten([key, f'{key}.{METADATA_FILENAME}'] for key in configuration.input_key_candidates) From ce6fab07e05b524cd0ab14bf68174ebac8f14d73 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Tue, 7 Oct 2025 21:04:35 +0200 Subject: [PATCH 11/15] tmp pin crawlee to last beta --- pyproject.toml | 3 ++- uv.lock | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4285721b..140d4bac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,8 @@ keywords = [ dependencies = [ "apify-client>=2.0.0,<3.0.0", "apify-shared>=2.0.0,<3.0.0", - "crawlee>=1.0.0,<2.0.0", + # "crawlee>=1.0.0,<2.0.0", + "crawlee==1.0.2b4", # Temporary pin "cachetools>=5.5.0", "cryptography>=42.0.0", "impit>=0.6.1", diff --git a/uv.lock b/uv.lock index f45887f7..8f535fb9 100644 --- a/uv.lock +++ b/uv.lock @@ -76,7 +76,7 @@ requires-dist = [ { name = "apify-client", specifier = ">=2.0.0,<3.0.0" }, { name = "apify-shared", specifier = ">=2.0.0,<3.0.0" }, { name = "cachetools", specifier = ">=5.5.0" }, - { name = "crawlee", specifier = ">=1.0.0,<2.0.0" }, + { name = "crawlee", specifier = "==1.0.2b4" }, { name = "cryptography", specifier = ">=42.0.0" }, { name = "impit", specifier = ">=0.6.1" }, { name = "lazy-object-proxy", specifier = ">=1.11.0" }, @@ -516,7 +516,7 @@ toml = [ [[package]] name = "crawlee" -version = "1.0.1" +version = "1.0.2b4" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "cachetools" }, @@ -532,9 +532,9 @@ dependencies = [ { name = "typing-extensions" }, { name = "yarl" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/4d/0a/713d5d45c9833a74beaf9f298081823eeef8dcbd5c25b8f87feebe7a4574/crawlee-1.0.1.tar.gz", hash = "sha256:01f7c9bfeace31bb3a175bfb954bc12214dd2fc3d4e9da30e566659f25015a67", size = 24897569, upload-time = "2025-10-06T08:28:36.367Z" } +sdist = { url = "https://files.pythonhosted.org/packages/b5/5d/a10d3b51241178ea6c4cb8ccceb4722dfb57b3d250957f1bde2a154a9f06/crawlee-1.0.2b4.tar.gz", hash = "sha256:602ee3f4a25869ff477e4a64c132c7875315a61ff1a644135eaeaf886c25437d", size = 24900976, upload-time = "2025-10-07T18:48:23.109Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/bb/46/5ab3edf0497dad3c6c02979609934f36e1bb934076e4401e5b5c7b060fbd/crawlee-1.0.1-py3-none-any.whl", hash = "sha256:a910f0c2e8fec57b3c800af4f60006e1906923f80e9301f0073a85aca7d16c62", size = 303586, upload-time = "2025-10-06T08:28:32.466Z" }, + { url = "https://files.pythonhosted.org/packages/c1/ef/91f8cec6fb74b65bf69d2798c8338c2f21121e991c675ffa30bcf93e525e/crawlee-1.0.2b4-py3-none-any.whl", hash = "sha256:eb365053d729d9153b358c846eb354a7de082ae5075df9672285585ac5455b3d", size = 304390, upload-time = "2025-10-07T18:48:20.451Z" }, ] [package.optional-dependencies] From 680a9c975ec250e811e99fbec0d692313ba862d3 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Wed, 8 Oct 2025 11:20:24 +0200 Subject: [PATCH 12/15] add if input file exists then ... --- .../_file_system/_key_value_store_client.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/apify/storage_clients/_file_system/_key_value_store_client.py b/src/apify/storage_clients/_file_system/_key_value_store_client.py index 81c001a6..7b520e3a 100644 --- a/src/apify/storage_clients/_file_system/_key_value_store_client.py +++ b/src/apify/storage_clients/_file_system/_key_value_store_client.py @@ -83,10 +83,12 @@ async def _sanitize_input_json_files(self) -> None: for alternative_key in alternative_keys: alternative_input_file = self.path_to_kvs / alternative_key - # Refresh metadata to prevent inconsistencies - with alternative_input_file.open() as f: - input_data = await asyncio.to_thread(lambda: json.load(f)) - await self.set_value(key=configuration.canonical_input_key, value=input_data) + # Only process files that actually exist + if alternative_input_file.exists(): + # Refresh metadata to prevent inconsistencies + with alternative_input_file.open() as f: + input_data = await asyncio.to_thread(lambda: json.load(f)) + await self.set_value(key=configuration.canonical_input_key, value=input_data) @override async def get_value(self, *, key: str) -> KeyValueStoreRecord | None: From 55d7d9e5a65159de05d990f71d3918eb311e0a54 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Wed, 8 Oct 2025 11:23:25 +0200 Subject: [PATCH 13/15] Update crawlee --- pyproject.toml | 3 +-- uv.lock | 10 +++++----- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 140d4bac..55b372ff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,8 +36,7 @@ keywords = [ dependencies = [ "apify-client>=2.0.0,<3.0.0", "apify-shared>=2.0.0,<3.0.0", - # "crawlee>=1.0.0,<2.0.0", - "crawlee==1.0.2b4", # Temporary pin + "crawlee>=1.0.2,<2.0.0", "cachetools>=5.5.0", "cryptography>=42.0.0", "impit>=0.6.1", diff --git a/uv.lock b/uv.lock index 8f535fb9..7fba3239 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 3 +revision = 2 requires-python = ">=3.10" [[package]] @@ -76,7 +76,7 @@ requires-dist = [ { name = "apify-client", specifier = ">=2.0.0,<3.0.0" }, { name = "apify-shared", specifier = ">=2.0.0,<3.0.0" }, { name = "cachetools", specifier = ">=5.5.0" }, - { name = "crawlee", specifier = "==1.0.2b4" }, + { name = "crawlee", specifier = ">=1.0.2,<2.0.0" }, { name = "cryptography", specifier = ">=42.0.0" }, { name = "impit", specifier = ">=0.6.1" }, { name = "lazy-object-proxy", specifier = ">=1.11.0" }, @@ -516,7 +516,7 @@ toml = [ [[package]] name = "crawlee" -version = "1.0.2b4" +version = "1.0.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "cachetools" }, @@ -532,9 +532,9 @@ dependencies = [ { name = "typing-extensions" }, { name = "yarl" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/b5/5d/a10d3b51241178ea6c4cb8ccceb4722dfb57b3d250957f1bde2a154a9f06/crawlee-1.0.2b4.tar.gz", hash = "sha256:602ee3f4a25869ff477e4a64c132c7875315a61ff1a644135eaeaf886c25437d", size = 24900976, upload-time = "2025-10-07T18:48:23.109Z" } +sdist = { url = "https://files.pythonhosted.org/packages/93/68/45641208866a60176be4c5f2ab620c2122df18db956dc86a03471181e7c3/crawlee-1.0.2.tar.gz", hash = "sha256:522b52c1362d116b95ba85820f87001713f290a3ec690568adb862a4b29d7ca4", size = 24900937, upload-time = "2025-10-08T07:59:09.983Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/c1/ef/91f8cec6fb74b65bf69d2798c8338c2f21121e991c675ffa30bcf93e525e/crawlee-1.0.2b4-py3-none-any.whl", hash = "sha256:eb365053d729d9153b358c846eb354a7de082ae5075df9672285585ac5455b3d", size = 304390, upload-time = "2025-10-07T18:48:20.451Z" }, + { url = "https://files.pythonhosted.org/packages/be/ac/860de31ca534adb1d6321f66c7d082ba735eff49090f67a316f8d60f1ee2/crawlee-1.0.2-py3-none-any.whl", hash = "sha256:57a63d0b22493297490a5836e6b1d47dee667004d95bbc01387dcfb00f6a8a7a", size = 304369, upload-time = "2025-10-08T07:59:07.475Z" }, ] [package.optional-dependencies] From 72602198ede9a612dd115e9caec6756ef310e53b Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Wed, 8 Oct 2025 11:57:25 +0200 Subject: [PATCH 14/15] test more --- .../unit/storage_clients/test_file_system.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/tests/unit/storage_clients/test_file_system.py b/tests/unit/storage_clients/test_file_system.py index cb41ce19..280188b7 100644 --- a/tests/unit/storage_clients/test_file_system.py +++ b/tests/unit/storage_clients/test_file_system.py @@ -6,6 +6,7 @@ import pytest +from crawlee import service_locator from crawlee._consts import METADATA_FILENAME from apify import Actor, Configuration @@ -65,19 +66,27 @@ async def test_purge_preserves_input_file_and_metadata() -> None: @pytest.mark.parametrize('input_file_name', ['INPUT', 'INPUT.json']) async def test_pre_existing_input_used_by_actor(input_file_name: str) -> None: - configuration = Configuration.get_global_configuration() + configuration = Configuration() + service_locator.set_configuration(configuration) + + # Create key-value store directory and make sure that it is empty + path_to_input = Path(configuration.storage_dir) / 'key_value_stores' / 'default' + path_to_input.mkdir(parents=True) + assert list(path_to_input.glob('*')) == [] pre_existing_input = { 'foo': 'bar', } # Create pre-existing INPUT.json file - path_to_input = Path(configuration.storage_dir) / 'key_value_stores' / 'default' - path_to_input.mkdir(parents=True) (path_to_input / input_file_name).write_text(json.dumps(pre_existing_input)) async with Actor(): assert pre_existing_input == await Actor.get_input() - # Make sure that the input file doesn't get renamed in the process - assert (path_to_input / input_file_name).exists() + # Make sure that the input file doesn't get renamed in the process and metadata are added + assert set(path_to_input.glob('*')) == { + path_to_input / '__metadata__.json', + path_to_input / input_file_name, + path_to_input / f'{input_file_name}.__metadata__.json', + } From a57e2334533726e3a2227125bfb3402516439d75 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Wed, 8 Oct 2025 12:06:05 +0200 Subject: [PATCH 15/15] fix --- .../storage_clients/_file_system/_key_value_store_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/apify/storage_clients/_file_system/_key_value_store_client.py b/src/apify/storage_clients/_file_system/_key_value_store_client.py index 7b520e3a..e320d2c9 100644 --- a/src/apify/storage_clients/_file_system/_key_value_store_client.py +++ b/src/apify/storage_clients/_file_system/_key_value_store_client.py @@ -88,7 +88,7 @@ async def _sanitize_input_json_files(self) -> None: # Refresh metadata to prevent inconsistencies with alternative_input_file.open() as f: input_data = await asyncio.to_thread(lambda: json.load(f)) - await self.set_value(key=configuration.canonical_input_key, value=input_data) + await self.set_value(key=alternative_key, value=input_data) @override async def get_value(self, *, key: str) -> KeyValueStoreRecord | None: