From 5d182fba2074c10277b601779c07c7c057a2cc7a Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 21 Jun 2024 10:26:38 +0200 Subject: [PATCH 1/2] fix: type error in statistics persist state close: #194 --- src/crawlee/statistics/statistics.py | 8 +++++--- src/crawlee/storages/request_queue.py | 4 +++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/crawlee/statistics/statistics.py b/src/crawlee/statistics/statistics.py index 1d472139c0..a4254d98ee 100644 --- a/src/crawlee/statistics/statistics.py +++ b/src/crawlee/statistics/statistics.py @@ -10,7 +10,7 @@ from crawlee._utils.recurring_task import RecurringTask from crawlee.events import LocalEventManager -from crawlee.events.types import Event +from crawlee.events.types import Event, EventPersistStateData from crawlee.statistics import FinalStatistics, StatisticsPersistedState, StatisticsState from crawlee.statistics.error_tracker import ErrorTracker from crawlee.storages import KeyValueStore @@ -126,7 +126,7 @@ async def __aexit__( self.state.crawler_finished_at = datetime.now(timezone.utc) self._events.off(event=Event.PERSIST_STATE, listener=self._persist_state) await self._periodic_logger.stop() - await self._persist_state() + await self._persist_state(event_data=EventPersistStateData(is_migrating=False)) def register_status_code(self, code: int) -> None: """Increment the number of times a status code has been received.""" @@ -233,7 +233,9 @@ async def _maybe_load_statistics(self) -> None: elif saved_state.crawler_last_started_at: self._instance_start = saved_state.crawler_last_started_at - async def _persist_state(self) -> None: + async def _persist_state(self, event_data: EventPersistStateData) -> None: + logger.debug(f'Persisting state of the Statistics (event_data={event_data}).') + if not self._persistence_enabled: return diff --git a/src/crawlee/storages/request_queue.py b/src/crawlee/storages/request_queue.py index 017b1a8916..663268e59b 100644 --- a/src/crawlee/storages/request_queue.py +++ b/src/crawlee/storages/request_queue.py @@ -52,7 +52,9 @@ class RequestQueue(BaseStorage, RequestProvider): rq = await RequestQueue.open(id='my_rq_id') """ - _API_PROCESSED_REQUESTS_DELAY = timedelta(seconds=10) + # TODO: set this back to 10 seconds once the following issue is resolved: + # https://github.com/apify/crawlee-python/issues/203 + _API_PROCESSED_REQUESTS_DELAY = timedelta(seconds=1) """Delay threshold to assume consistency of queue head operations after queue modifications.""" _MAX_CACHED_REQUESTS = 1_000_000 From 210227d9c3cc387449e3ce7f414eb6487c1f3cb2 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 21 Jun 2024 10:28:24 +0200 Subject: [PATCH 2/2] changelog --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8deffbee12..2e0b63a6a7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,10 @@ - Add storage-related helpers `get_data`, `push_data` and `export_to` to `BasicCrawler` and `BasicContext` - Add `PlaywrightCrawler`'s enqueue links helper +### Fixes + +- Fix type error in persist state of statistics + ## [0.0.4](../../releases/tag/v0.0.4) - 2024-05-30 - Another internal release, adding statistics capturing, proxy configuration and