Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@
- Add storage-related helpers `get_data`, `push_data` and `export_to` to `BasicCrawler` and `BasicContext`
- Add `PlaywrightCrawler`'s enqueue links helper

### Fixes

- Fix type error in the `PERSIST_STATE` event listener of `Statistics`

## [0.0.4](../../releases/tag/v0.0.4) - 2024-05-30

- Another internal release, adding statistics capturing, proxy configuration and
Expand Down
8 changes: 5 additions & 3 deletions src/crawlee/statistics/statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

from crawlee._utils.recurring_task import RecurringTask
from crawlee.events import LocalEventManager
from crawlee.events.types import Event
from crawlee.events.types import Event, EventPersistStateData
from crawlee.statistics import FinalStatistics, StatisticsPersistedState, StatisticsState
from crawlee.statistics.error_tracker import ErrorTracker
from crawlee.storages import KeyValueStore
Expand Down Expand Up @@ -126,7 +126,7 @@ async def __aexit__(
self.state.crawler_finished_at = datetime.now(timezone.utc)
self._events.off(event=Event.PERSIST_STATE, listener=self._persist_state)
await self._periodic_logger.stop()
await self._persist_state()
await self._persist_state(event_data=EventPersistStateData(is_migrating=False))

def register_status_code(self, code: int) -> None:
"""Increment the number of times a status code has been received."""
Expand Down Expand Up @@ -233,7 +233,9 @@ async def _maybe_load_statistics(self) -> None:
elif saved_state.crawler_last_started_at:
self._instance_start = saved_state.crawler_last_started_at

async def _persist_state(self) -> None:
async def _persist_state(self, event_data: EventPersistStateData) -> None:
logger.debug(f'Persisting state of the Statistics (event_data={event_data}).')

if not self._persistence_enabled:
return

Expand Down
4 changes: 3 additions & 1 deletion src/crawlee/storages/request_queue.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,9 @@ class RequestQueue(BaseStorage, RequestProvider):
rq = await RequestQueue.open(id='my_rq_id')
"""

_API_PROCESSED_REQUESTS_DELAY = timedelta(seconds=10)
# TODO: set this back to 10 seconds once the following issue is resolved:
# https://github.com/apify/crawlee-python/issues/203
_API_PROCESSED_REQUESTS_DELAY = timedelta(seconds=1)
"""Delay threshold to assume consistency of queue head operations after queue modifications."""

_MAX_CACHED_REQUESTS = 1_000_000
Expand Down