diff --git a/CHANGELOG.md b/CHANGELOG.md index 8deffbee12..2e0b63a6a7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,10 @@ - Add storage-related helpers `get_data`, `push_data` and `export_to` to `BasicCrawler` and `BasicContext` - Add `PlaywrightCrawler`'s enqueue links helper +### Fixes + +- Fix type error in persist state of statistics + ## [0.0.4](../../releases/tag/v0.0.4) - 2024-05-30 - Another internal release, adding statistics capturing, proxy configuration and diff --git a/src/crawlee/statistics/statistics.py b/src/crawlee/statistics/statistics.py index 1d472139c0..a4254d98ee 100644 --- a/src/crawlee/statistics/statistics.py +++ b/src/crawlee/statistics/statistics.py @@ -10,7 +10,7 @@ from crawlee._utils.recurring_task import RecurringTask from crawlee.events import LocalEventManager -from crawlee.events.types import Event +from crawlee.events.types import Event, EventPersistStateData from crawlee.statistics import FinalStatistics, StatisticsPersistedState, StatisticsState from crawlee.statistics.error_tracker import ErrorTracker from crawlee.storages import KeyValueStore @@ -126,7 +126,7 @@ async def __aexit__( self.state.crawler_finished_at = datetime.now(timezone.utc) self._events.off(event=Event.PERSIST_STATE, listener=self._persist_state) await self._periodic_logger.stop() - await self._persist_state() + await self._persist_state(event_data=EventPersistStateData(is_migrating=False)) def register_status_code(self, code: int) -> None: """Increment the number of times a status code has been received.""" @@ -233,7 +233,9 @@ async def _maybe_load_statistics(self) -> None: elif saved_state.crawler_last_started_at: self._instance_start = saved_state.crawler_last_started_at - async def _persist_state(self) -> None: + async def _persist_state(self, event_data: EventPersistStateData) -> None: + logger.debug(f'Persisting state of the Statistics (event_data={event_data}).') + if not self._persistence_enabled: return diff --git a/src/crawlee/storages/request_queue.py b/src/crawlee/storages/request_queue.py index 017b1a8916..663268e59b 100644 --- a/src/crawlee/storages/request_queue.py +++ b/src/crawlee/storages/request_queue.py @@ -52,7 +52,9 @@ class RequestQueue(BaseStorage, RequestProvider): rq = await RequestQueue.open(id='my_rq_id') """ - _API_PROCESSED_REQUESTS_DELAY = timedelta(seconds=10) + # TODO: set this back to 10 seconds once the following issue is resolved: + # https://github.com/apify/crawlee-python/issues/203 + _API_PROCESSED_REQUESTS_DELAY = timedelta(seconds=1) """Delay threshold to assume consistency of queue head operations after queue modifications.""" _MAX_CACHED_REQUESTS = 1_000_000