Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ dependencies = [
"more-itertools>=10.2.0",
"protego>=0.5.0",
"psutil>=6.0.0",
"pydantic-settings>=2.2.0,!=2.7.0,!=2.7.1,!=2.8.0",
"pydantic-settings>=2.12.0",
"pydantic>=2.11.0",
"pyee>=9.0.0",
"tldextract>=5.1.0",
Expand Down
4 changes: 3 additions & 1 deletion src/crawlee/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,9 @@ class Configuration(BaseSettings):
Settings can also be configured via environment variables, prefixed with `CRAWLEE_`.
"""

model_config = SettingsConfigDict(validate_by_name=True, validate_by_alias=True)
# TODO: https://github.com/pydantic/pydantic-settings/issues/706
# Use `SettingsConfigDict(validate_by_name=True, validate_by_alias=True)` when issue is resolved.
model_config = SettingsConfigDict(populate_by_name=True)

internal_timeout: Annotated[timedelta | None, Field(alias='crawlee_internal_timeout')] = None
"""Timeout for the internal asynchronous operations."""
Expand Down
3 changes: 0 additions & 3 deletions tests/unit/conftest.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
# TODO: Update crawlee_storage_dir args once the Pydantic bug is fixed
# https://github.com/apify/crawlee-python/issues/146

from __future__ import annotations

import logging
Expand Down
10 changes: 5 additions & 5 deletions tests/unit/crawlers/_basic/test_basic_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -1102,7 +1102,7 @@ async def test_services_crawlers_can_use_different_services() -> None:

async def test_crawler_uses_default_storages(tmp_path: Path) -> None:
configuration = Configuration(
crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg]
storage_dir=str(tmp_path),
purge_on_start=True,
)
service_locator.set_configuration(configuration)
Expand All @@ -1120,7 +1120,7 @@ async def test_crawler_uses_default_storages(tmp_path: Path) -> None:

async def test_crawler_can_use_other_storages(tmp_path: Path) -> None:
configuration = Configuration(
crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg]
storage_dir=str(tmp_path),
purge_on_start=True,
)
service_locator.set_configuration(configuration)
Expand Down Expand Up @@ -1148,11 +1148,11 @@ async def test_crawler_can_use_other_storages_of_same_type(tmp_path: Path) -> No
}

configuration_a = Configuration(
crawlee_storage_dir=str(a_path), # type: ignore[call-arg]
storage_dir=str(a_path),
purge_on_start=True,
)
configuration_b = Configuration(
crawlee_storage_dir=str(b_path), # type: ignore[call-arg]
storage_dir=str(b_path),
purge_on_start=True,
)

Expand Down Expand Up @@ -1652,7 +1652,7 @@ async def _run_crawler(requests: list[str], storage_dir: str) -> StatisticsState
Must be defined like this to be picklable for ProcessPoolExecutor."""
service_locator.set_configuration(
Configuration(
crawlee_storage_dir=storage_dir, # type: ignore[call-arg]
storage_dir=storage_dir,
purge_on_start=False,
)
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
@pytest.fixture
def configuration(tmp_path: Path) -> Configuration:
return Configuration(
crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg]
storage_dir=str(tmp_path),
)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
@pytest.fixture
def configuration(tmp_path: Path) -> Configuration:
return Configuration(
crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg]
storage_dir=str(tmp_path),
)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
@pytest.fixture
def configuration(tmp_path: Path) -> Configuration:
return Configuration(
crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg]
storage_dir=str(tmp_path),
)


Expand Down
2 changes: 1 addition & 1 deletion tests/unit/storage_clients/_sql/test_sql_dataset_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
def configuration(tmp_path: Path) -> Configuration:
"""Temporary configuration for tests."""
return Configuration(
crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg]
storage_dir=str(tmp_path),
)


Expand Down
2 changes: 1 addition & 1 deletion tests/unit/storage_clients/_sql/test_sql_kvs_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
def configuration(tmp_path: Path) -> Configuration:
"""Temporary configuration for tests."""
return Configuration(
crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg]
storage_dir=str(tmp_path),
)


Expand Down
2 changes: 1 addition & 1 deletion tests/unit/storage_clients/_sql/test_sql_rq_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
def configuration(tmp_path: Path) -> Configuration:
"""Temporary configuration for tests."""
return Configuration(
crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg]
storage_dir=str(tmp_path),
)


Expand Down
5 changes: 1 addition & 4 deletions tests/unit/storages/test_dataset.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
# TODO: Update crawlee_storage_dir args once the Pydantic bug is fixed
# https://github.com/apify/crawlee-python/issues/146

from __future__ import annotations

import json
Expand Down Expand Up @@ -492,7 +489,7 @@ async def test_export_with_multiple_kwargs(dataset: Dataset, tmp_path: Path) ->
custom_dir_name = 'some_dir'
custom_dir = tmp_path / custom_dir_name
custom_dir.mkdir()
target_configuration = Configuration(crawlee_storage_dir=str(custom_dir)) # type: ignore[call-arg]
target_configuration = Configuration(storage_dir=str(custom_dir))

# Set expected values
expected_exported_data = f'{json.dumps([{"some key": "some data"}])}'
Expand Down
5 changes: 1 addition & 4 deletions tests/unit/storages/test_key_value_store.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
# TODO: Update crawlee_storage_dir args once the Pydantic bug is fixed
# https://github.com/apify/crawlee-python/issues/146

from __future__ import annotations

import json
Expand Down Expand Up @@ -1119,7 +1116,7 @@ async def test_get_auto_saved_value_various_global_clients(
"""Ensure that persistence is working for all clients regardless of what is set in service locator."""
service_locator.set_configuration(
Configuration(
crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg]
storage_dir=str(tmp_path),
purge_on_start=True,
)
)
Expand Down
3 changes: 0 additions & 3 deletions tests/unit/storages/test_request_queue.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
# TODO: Update crawlee_storage_dir args once the Pydantic bug is fixed
# https://github.com/apify/crawlee-python/issues/146

from __future__ import annotations

import asyncio
Expand Down
27 changes: 7 additions & 20 deletions tests/unit/storages/test_storage_instance_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,32 +22,23 @@ def storage_type(request: pytest.FixtureRequest) -> type[Storage]:


async def test_unique_storage_by_storage_client(tmp_path: Path, storage_type: type[Storage]) -> None:
config = Configuration(
purge_on_start=True,
)
config.storage_dir = str(tmp_path)
config = Configuration(purge_on_start=True, storage_dir=str(tmp_path))

storage_1 = await storage_type.open(storage_client=MemoryStorageClient(), configuration=config)
storage_2 = await storage_type.open(storage_client=FileSystemStorageClient(), configuration=config)
assert storage_1 is not storage_2


async def test_same_storage_when_different_client(tmp_path: Path, storage_type: type[Storage]) -> None:
config = Configuration(
purge_on_start=True,
)
config.storage_dir = str(tmp_path)
config = Configuration(purge_on_start=True, storage_dir=str(tmp_path))

storage_1 = await storage_type.open(storage_client=MemoryStorageClient(), configuration=config)
storage_2 = await storage_type.open(storage_client=MemoryStorageClient(), configuration=config)
assert storage_1 is storage_2


async def test_unique_storage_by_storage_type(tmp_path: Path) -> None:
config = Configuration(
purge_on_start=True,
)
config.storage_dir = str(tmp_path)
config = Configuration(purge_on_start=True, storage_dir=str(tmp_path))
storage_client = MemoryStorageClient()

kvs = await KeyValueStore.open(storage_client=storage_client, configuration=config)
Expand All @@ -71,11 +62,9 @@ async def test_unique_storage_by_unique_cache_key_different_path(tmp_path: Path,
path_1.mkdir()
path_2.mkdir()

config_1 = Configuration()
config_1.storage_dir = str(path_1)
config_1 = Configuration(storage_dir=str(path_1))

config_2 = Configuration()
config_2.storage_dir = str(path_2)
config_2 = Configuration(storage_dir=str(path_2))

storage_client = FileSystemStorageClient()

Expand All @@ -87,11 +76,9 @@ async def test_unique_storage_by_unique_cache_key_different_path(tmp_path: Path,
async def test_unique_storage_by_unique_cache_key_same_path(tmp_path: Path, storage_type: type[Storage]) -> None:
"""Test that StorageInstanceManager supports unique cache keys. Different configs with the same storage_dir create
the same storage."""
config_1 = Configuration()
config_1.storage_dir = str(tmp_path)
config_1 = Configuration(storage_dir=str(tmp_path))

config_2 = Configuration()
config_2.storage_dir = str(tmp_path)
config_2 = Configuration(storage_dir=str(tmp_path))

storage_client = FileSystemStorageClient()

Expand Down
15 changes: 3 additions & 12 deletions tests/unit/test_configuration.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
# TODO: Update crawlee_storage_dir args once the Pydantic bug is fixed
# https://github.com/apify/crawlee-python/issues/146

from __future__ import annotations

from typing import TYPE_CHECKING
Expand Down Expand Up @@ -38,11 +35,7 @@ def test_global_configuration_works_reversed() -> None:

async def test_storage_not_persisted_when_non_persistable_storage_used(tmp_path: Path, server_url: URL) -> None:
"""Make the Crawler use MemoryStorageClient which can't persist state."""
service_locator.set_configuration(
Configuration(
crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg]
)
)
service_locator.set_configuration(Configuration(storage_dir=str(tmp_path)))
crawler = HttpCrawler(storage_client=MemoryStorageClient())

@crawler.router.default_handler
Expand All @@ -62,9 +55,7 @@ async def test_storage_persisted_with_explicit_statistics_with_persistable_stora
"""Make the Crawler use MemoryStorageClient which can't persist state,
but pass explicit statistics to it which will use global FileSystemStorageClient() that can persist state."""

configuration = Configuration(
crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg]
)
configuration = Configuration(storage_dir=str(tmp_path))
service_locator.set_configuration(configuration)
service_locator.set_storage_client(FileSystemStorageClient())

Expand All @@ -85,7 +76,7 @@ async def default_handler(context: HttpCrawlingContext) -> None:

async def test_storage_persisted_when_enabled(tmp_path: Path, server_url: URL) -> None:
configuration = Configuration(
crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg]
storage_dir=str(tmp_path),
)

storage_client = FileSystemStorageClient()
Expand Down
8 changes: 4 additions & 4 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading