From f3a5a45b0a3035206291a298b2b089c920126618 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Thu, 7 Nov 2024 13:19:12 +0100 Subject: [PATCH 1/2] Fix unhandled ValueError in request handler result processing --- src/crawlee/_types.py | 4 ++++ src/crawlee/storages/_dataset.py | 11 ++++++----- tests/unit/basic_crawler/test_basic_crawler.py | 11 +++++++++++ 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/src/crawlee/_types.py b/src/crawlee/_types.py index d69b297634..b57fbc894c 100644 --- a/src/crawlee/_types.py +++ b/src/crawlee/_types.py @@ -399,6 +399,10 @@ async def push_data( **kwargs: Unpack[PushDataKwargs], ) -> None: """Track a call to the `push_data` context helper.""" + from crawlee.storages._dataset import Dataset + + await Dataset.check_and_serialize(data) + self.push_data_calls.append( PushDataFunctionCall( data=data, diff --git a/src/crawlee/storages/_dataset.py b/src/crawlee/storages/_dataset.py index 1b244b61f5..ca19a772e1 100644 --- a/src/crawlee/storages/_dataset.py +++ b/src/crawlee/storages/_dataset.py @@ -257,11 +257,11 @@ async def push_data(self, data: JsonSerializable, **kwargs: Unpack[PushDataKwarg """ # Handle singular items if not isinstance(data, list): - items = await self._check_and_serialize(data) + items = await self.check_and_serialize(data) return await self._resource_client.push_items(items, **kwargs) # Handle lists - payloads_generator = (await self._check_and_serialize(item, index) for index, item in enumerate(data)) + payloads_generator = (await self.check_and_serialize(item, index) for index, item in enumerate(data)) # Invoke client in series to preserve the order of data async for items in self._chunk_by_size(payloads_generator): @@ -417,7 +417,8 @@ async def iterate_items( ): yield item - async def _check_and_serialize(self, item: JsonSerializable, index: int | None = None) -> str: + @classmethod + async def check_and_serialize(cls, item: JsonSerializable, index: int | None = None) -> str: """Serializes a given item to JSON, checks its serializability and size against a limit. Args: @@ -438,8 +439,8 @@ async def _check_and_serialize(self, item: JsonSerializable, index: int | None = raise ValueError(f'Data item{s}is not serializable to JSON.') from exc payload_size = ByteSize(len(payload.encode('utf-8'))) - if payload_size > self._EFFECTIVE_LIMIT_SIZE: - raise ValueError(f'Data item{s}is too large (size: {payload_size}, limit: {self._EFFECTIVE_LIMIT_SIZE})') + if payload_size > cls._EFFECTIVE_LIMIT_SIZE: + raise ValueError(f'Data item{s}is too large (size: {payload_size}, limit: {cls._EFFECTIVE_LIMIT_SIZE})') return payload diff --git a/tests/unit/basic_crawler/test_basic_crawler.py b/tests/unit/basic_crawler/test_basic_crawler.py index 5078102145..4de75c1518 100644 --- a/tests/unit/basic_crawler/test_basic_crawler.py +++ b/tests/unit/basic_crawler/test_basic_crawler.py @@ -645,6 +645,17 @@ async def handler(context: BasicCrawlingContext) -> None: assert exported_json_str == expected_json_str +async def test_crawler_push_data_over_limit() -> None: + crawler = BasicCrawler() + + @crawler.router.default_handler + async def handler(context: BasicCrawlingContext) -> None: + await context.push_data({'hello': 'world' * 3 * 1024 * 1024}) + + stats = await crawler.run(['http://example.tld/1']) + assert stats.requests_failed == 1 + + async def test_context_update_kv_store() -> None: crawler = BasicCrawler() From 45121ed40d90321cc71f3e4e1dbb32119cc9fc5c Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Mon, 11 Nov 2024 12:23:28 +0100 Subject: [PATCH 2/2] Add comment --- tests/unit/basic_crawler/test_basic_crawler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/basic_crawler/test_basic_crawler.py b/tests/unit/basic_crawler/test_basic_crawler.py index 4de75c1518..709637351e 100644 --- a/tests/unit/basic_crawler/test_basic_crawler.py +++ b/tests/unit/basic_crawler/test_basic_crawler.py @@ -650,6 +650,7 @@ async def test_crawler_push_data_over_limit() -> None: @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: + # Push a roughly 15MB payload - this should be enough to break the 9MB limit await context.push_data({'hello': 'world' * 3 * 1024 * 1024}) stats = await crawler.run(['http://example.tld/1'])