2 changes: 1 addition & 1 deletion pyproject.toml
@@ -52,7 +52,7 @@ curl-cffi = { version = ">=0.7.0", optional = true }
docutils = ">=0.21.0"
eval-type-backport = ">=0.2.0"
html5lib = { version = ">=1.0", optional = true }
httpx = { version = ">=0.27.0", extras = ["brotli"] }
httpx = { version = ">=0.27.0", extras = ["brotli", "http2"] }
inquirer = ">=3.3.0"
lxml = { version = ">=5.2.0", optional = true }
more_itertools = ">=10.2.0"
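The added "http2" extra makes httpx pull in the `h2` package, which httpx requires before it will negotiate HTTP/2. A quick sanity check is sketched below; it is a standalone illustration, not part of this PR.

# Sketch: verify that the installed httpx can actually speak HTTP/2.
# httpx's "http2" extra installs the `h2` package; without it,
# httpx.AsyncClient(http2=True) raises an ImportError.
try:
    import h2  # noqa: F401
except ImportError:
    print("HTTP/2 unavailable - install with: pip install 'httpx[http2]'")
else:
    print('HTTP/2 support is available')
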
11 changes: 9 additions & 2 deletions src/crawlee/http_clients/_base.py
@@ -20,8 +20,9 @@
class HttpResponse(Protocol):
"""This protocol defines the interface that any HTTP response object must implement."""

def read(self) -> bytes:
"""Read the content of the response body."""
@property
def http_version(self) -> str:
"""The HTTP version used in the response."""

@property
def status_code(self) -> int:
@@ -31,11 +32,17 @@ def status_code(self) -> int:
def headers(self) -> dict[str, str]:
"""The HTTP headers received in the response."""

def read(self) -> bytes:
"""Read the content of the response body."""


@dataclass(frozen=True)
class HttpCrawlingResult:
"""Result of a HTTP-only crawl.

Mainly for the purpose of composing specific crawling contexts (e.g. `BeautifulSoupCrawlingContext`,
`ParselCrawlingContext`, ...).

Args:
http_response: The HTTP response received from the server.
"""
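With `http_version` added to the protocol, any custom response adapter now has to expose it alongside `status_code`, `headers`, and `read()`. A minimal sketch of a conforming adapter follows; the wrapped `RawResponse` type and its attributes are hypothetical.

from __future__ import annotations


class _MyLibraryResponse:
    """Hypothetical adapter covering the full HttpResponse protocol surface."""

    def __init__(self, raw: RawResponse) -> None:
        self._raw = raw

    @property
    def http_version(self) -> str:
        # Normalized version string, e.g. 'HTTP/1.1' or 'HTTP/2'.
        return self._raw.version

    @property
    def status_code(self) -> int:
        return self._raw.status

    @property
    def headers(self) -> dict[str, str]:
        return dict(self._raw.headers)

    def read(self) -> bytes:
        return self._raw.body
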
39 changes: 32 additions & 7 deletions src/crawlee/http_clients/_httpx.py
@@ -1,5 +1,6 @@
from __future__ import annotations

from logging import getLogger
from typing import TYPE_CHECKING, Any, Optional, cast

import httpx
@@ -18,15 +19,18 @@
from crawlee.proxy_configuration import ProxyInfo
from crawlee.statistics import Statistics

logger = getLogger(__name__)


class _HttpxResponse:
"""Adapter class for `httpx.Response` to conform to the `HttpResponse` protocol."""

def __init__(self, response: httpx.Response) -> None:
self._response = response

def read(self) -> bytes:
return self._response.read()
@property
def http_version(self) -> str:
return self._response.http_version

@property
def status_code(self) -> int:
@@ -36,6 +40,9 @@ def status_code(self) -> int:
def headers(self) -> dict[str, str]:
return dict(self._response.headers.items())

def read(self) -> bytes:
return self._response.read()


class _HttpxTransport(httpx.AsyncHTTPTransport):
"""HTTP transport adapter that stores response cookies in a `Session`.
@@ -76,6 +83,8 @@ def __init__(
persist_cookies_per_session: bool = True,
additional_http_error_status_codes: Iterable[int] = (),
ignore_http_error_status_codes: Iterable[int] = (),
http1: bool = True,
http2: bool = True,
**async_client_kwargs: Any,
) -> None:
"""Create a new instance.
@@ -84,13 +93,17 @@ def __init__(
persist_cookies_per_session: Whether to persist cookies per HTTP session.
additional_http_error_status_codes: Additional HTTP status codes to treat as errors.
ignore_http_error_status_codes: HTTP status codes to ignore as errors.
http1: Whether to enable HTTP/1.1 support.
http2: Whether to enable HTTP/2 support.
async_client_kwargs: Additional keyword arguments for `httpx.AsyncClient`.
"""
super().__init__(
persist_cookies_per_session=persist_cookies_per_session,
additional_http_error_status_codes=additional_http_error_status_codes,
ignore_http_error_status_codes=ignore_http_error_status_codes,
)
self._http1 = http1
self._http2 = http2
self._async_client_kwargs = async_client_kwargs

self._client_by_proxy_url = dict[Optional[str], httpx.AsyncClient]()
@@ -182,11 +195,23 @@ def _get_client(self, proxy_url: str | None) -> httpx.AsyncClient:
If the client for the given proxy URL doesn't exist, it will be created and stored.
"""
if proxy_url not in self._client_by_proxy_url:
self._client_by_proxy_url[proxy_url] = httpx.AsyncClient(
transport=_HttpxTransport(),
proxy=proxy_url,
**self._async_client_kwargs,
)
# Prepare the default kwargs for the new client.
kwargs: dict[str, Any] = {
'transport': _HttpxTransport(
proxy=proxy_url,
http1=self._http1,
http2=self._http2,
),
'proxy': proxy_url,
'http1': self._http1,
'http2': self._http2,
}

# Update the default kwargs with any additional user-provided kwargs.
kwargs.update(self._async_client_kwargs)

client = httpx.AsyncClient(**kwargs)
self._client_by_proxy_url[proxy_url] = client

return self._client_by_proxy_url[proxy_url]

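A minimal usage sketch of the new flags, mirroring the tests further below; it assumes the public `crawlee.http_clients` export, and the target URL is arbitrary.

import asyncio

from crawlee.http_clients import HttpxHttpClient


async def main() -> None:
    # http1=True/http2=False pins the client to HTTP/1.1; leaving http2=True
    # lets httpx negotiate HTTP/2 when the server supports it.
    client = HttpxHttpClient(http1=True, http2=False)
    response = await client.send_request('https://crawlee.dev')
    print(response.http_version, response.status_code)


asyncio.run(main())
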
24 changes: 22 additions & 2 deletions src/crawlee/http_clients/curl_impersonate.py
@@ -12,6 +12,7 @@
"For example, if you use pip, run `pip install 'crawlee[curl-impersonate]'`.",
) from exc

from curl_cffi.const import CurlHttpVersion
from typing_extensions import override

from crawlee._utils.blocked import ROTATE_PROXY_ERRORS
@@ -36,8 +37,24 @@ class _CurlImpersonateResponse:
def __init__(self, response: Response) -> None:
self._response = response

def read(self) -> bytes:
return self._response.content
@property
def http_version(self) -> str:
if self._response.http_version == CurlHttpVersion.NONE:
return 'NONE'
if self._response.http_version == CurlHttpVersion.V1_0:
return 'HTTP/1.0'
if self._response.http_version == CurlHttpVersion.V1_1:
return 'HTTP/1.1'
if self._response.http_version in {
CurlHttpVersion.V2_0,
CurlHttpVersion.V2TLS,
CurlHttpVersion.V2_PRIOR_KNOWLEDGE,
}:
return 'HTTP/2'
if self._response.http_version == CurlHttpVersion.V3:
return 'HTTP/3'

raise ValueError(f'Unknown HTTP version: {self._response.http_version}')

@property
def status_code(self) -> int:
@@ -47,6 +64,9 @@ def status_code(self) -> int:
def headers(self) -> dict[str, str]:
return dict(self._response.headers.items())

def read(self) -> bytes:
return self._response.content


class CurlImpersonateHttpClient(BaseHttpClient):
"""HTTP client based on the `curl-cffi` library.
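The curl-cffi numeric constants are mapped to the same string format the httpx adapter returns, so downstream code can treat both clients uniformly. A short sketch follows; it requires the `curl-impersonate` extra, per the install hint above, and the target URL is arbitrary.

import asyncio

from crawlee.http_clients.curl_impersonate import CurlImpersonateHttpClient


async def main() -> None:
    client = CurlImpersonateHttpClient()
    response = await client.send_request('https://crawlee.dev')
    # Normalized to 'HTTP/1.0', 'HTTP/1.1', 'HTTP/2', 'HTTP/3' or 'NONE',
    # matching the strings returned by the httpx-based client.
    print(response.http_version)


asyncio.run(main())
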
12 changes: 12 additions & 0 deletions tests/unit/http_clients/test_httpx.py
@@ -19,6 +19,18 @@ def http_client() -> HttpxHttpClient:
return HttpxHttpClient()


async def test_http_1(httpbin: str) -> None:
http_client = HttpxHttpClient(http1=True, http2=False)
response = await http_client.send_request(httpbin)
assert response.http_version == 'HTTP/1.1'


async def test_http_2(httpbin: str) -> None:
http_client = HttpxHttpClient(http2=True)
response = await http_client.send_request(httpbin)
assert response.http_version == 'HTTP/2'


@pytest.mark.skipif(os.name == 'nt', reason='Skipped on Windows')
async def test_proxy(
http_client: HttpxHttpClient,