Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 27 additions & 6 deletions src/crawlee/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,23 @@


class Configuration(BaseSettings):
"""Configuration of the Crawler.
"""Configuration settings for the Crawlee project.

Args:
internal_timeout: Timeout for internal operations such as marking a request as processed.
verbose_log: Allows verbose logging.
default_storage_id: The default storage ID.
purge_on_start: Whether to purge the storage on start.
This class stores common configurable parameters for Crawlee. Default values are provided for all settings,
so typically, no adjustments are necessary. However, you may modify settings for specific use cases,
such as changing the default storage directory, the default storage IDs, the timeout for internal
operations, and more.

Settings can also be configured via environment variables, prefixed with `CRAWLEE_`.
"""

model_config = SettingsConfigDict(populate_by_name=True)

internal_timeout: Annotated[timedelta | None, Field(alias='crawlee_internal_timeout')] = None
"""Timeout for the internal asynchronous operations."""

verbose_log: Annotated[bool, Field(alias='crawlee_verbose_log')] = False
"""Whether to enable verbose logging."""

default_browser_path: Annotated[
str | None,
Expand All @@ -39,6 +42,7 @@ class Configuration(BaseSettings):
)
),
] = None
"""This setting is currently unused. For more details, see https://github.com/apify/crawlee-python/issues/670."""

disable_browser_sandbox: Annotated[
bool,
Expand All @@ -49,6 +53,7 @@ class Configuration(BaseSettings):
)
),
] = False
"""This setting is currently unused. For more details, see https://github.com/apify/crawlee-python/issues/670."""

log_level: Annotated[
Literal['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
Expand All @@ -60,6 +65,7 @@ class Configuration(BaseSettings):
),
BeforeValidator(lambda value: str(value).upper()),
] = 'INFO'
"""The logging level."""

default_dataset_id: Annotated[
str,
Expand All @@ -71,6 +77,7 @@ class Configuration(BaseSettings):
)
),
] = 'default'
"""The default dataset ID."""

default_key_value_store_id: Annotated[
str,
Expand All @@ -82,6 +89,7 @@ class Configuration(BaseSettings):
)
),
] = 'default'
"""The default key-value store ID."""

default_request_queue_id: Annotated[
str,
Expand All @@ -93,6 +101,7 @@ class Configuration(BaseSettings):
)
),
] = 'default'
"""The default request queue ID."""

purge_on_start: Annotated[
bool,
Expand All @@ -103,8 +112,10 @@ class Configuration(BaseSettings):
)
),
] = True
"""Whether to purge the storage on start."""

write_metadata: Annotated[bool, Field(alias='crawlee_write_metadata')] = True
"""Whether to write the storage metadata."""

persist_storage: Annotated[
bool,
Expand All @@ -115,6 +126,7 @@ class Configuration(BaseSettings):
)
),
] = True
"""Whether to persist the storage."""

persist_state_interval: Annotated[
timedelta_ms,
Expand All @@ -125,6 +137,7 @@ class Configuration(BaseSettings):
)
),
] = timedelta(minutes=1)
"""This setting is currently unused. For more details, see https://github.com/apify/crawlee-python/issues/670."""

system_info_interval: Annotated[
timedelta_ms,
Expand All @@ -135,6 +148,7 @@ class Configuration(BaseSettings):
)
),
] = timedelta(seconds=1)
"""This setting is currently unused. For more details, see https://github.com/apify/crawlee-python/issues/670."""

max_used_cpu_ratio: Annotated[
float,
Expand All @@ -145,6 +159,7 @@ class Configuration(BaseSettings):
)
),
] = 0.95
"""This setting is currently unused. For more details, see https://github.com/apify/crawlee-python/issues/670."""

memory_mbytes: Annotated[
int | None,
Expand All @@ -156,6 +171,7 @@ class Configuration(BaseSettings):
)
),
] = None
"""The maximum memory in megabytes. `Snapshotter.max_memory_size` is set to this value."""

available_memory_ratio: Annotated[
float,
Expand All @@ -166,6 +182,7 @@ class Configuration(BaseSettings):
)
),
] = 0.25
"""This setting is currently unused. For more details, see https://github.com/apify/crawlee-python/issues/670."""

storage_dir: Annotated[
str,
Expand All @@ -176,6 +193,7 @@ class Configuration(BaseSettings):
),
),
] = './storage'
"""The path to the storage directory."""

chrome_executable_path: Annotated[
str | None,
Expand All @@ -186,6 +204,7 @@ class Configuration(BaseSettings):
)
),
] = None
"""This setting is currently unused. For more details, see https://github.com/apify/crawlee-python/issues/670."""

headless: Annotated[
bool,
Expand All @@ -196,6 +215,7 @@ class Configuration(BaseSettings):
)
),
] = True
"""This setting is currently unused. For more details, see https://github.com/apify/crawlee-python/issues/670."""

xvfb: Annotated[
bool,
Expand All @@ -206,6 +226,7 @@ class Configuration(BaseSettings):
)
),
] = False
"""This setting is currently unused. For more details, see https://github.com/apify/crawlee-python/issues/670."""

@classmethod
def get_global_configuration(cls) -> Self:
Expand Down