2 changes: 1 addition & 1 deletion pyproject.toml
@@ -52,7 +52,7 @@ curl-cffi = { version = ">=0.7.0", optional = true }
docutils = ">=0.21.0"
eval-type-backport = ">=0.2.0"
html5lib = { version = ">=1.0", optional = true }
httpx = { version = ">=0.27.0", extras = ["brotli"] }
httpx = { version = ">=0.27.0", extras = ["brotli", "http2"] }
inquirer = ">=3.3.0"
lxml = { version = ">=5.2.0", optional = true }
more_itertools = ">=10.2.0"
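The added "http2" extra makes httpx pull in the `h2` package, which httpx requires before it will negotiate HTTP/2. A quick sanity check is sketched below; it is a standalone illustration, not part of this PR.

# Sketch: verify that the installed httpx can actually speak HTTP/2.
# httpx's "http2" extra installs the `h2` package; without it,
# httpx.AsyncClient(http2=True) raises an ImportError.
try:
    import h2  # noqa: F401
except ImportError:
    print("HTTP/2 unavailable - install with: pip install 'httpx[http2]'")
else:
    print('HTTP/2 support is available')
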
11 changes: 9 additions & 2 deletions src/crawlee/http_clients/_base.py
@@ -20,8 +20,9 @@
class HttpResponse(Protocol):
"""This protocol defines the interface that any HTTP response object must implement."""

def read(self) -> bytes:
"""Read the content of the response body."""
@property
def http_version(self) -> str:
"""The HTTP version used in the response."""

@property
def status_code(self) -> int:
@@ -31,11 +32,17 @@ def status_code(self) -> int:
def headers(self) -> dict[str, str]:
"""The HTTP headers received in the response."""

def read(self) -> bytes:
"""Read the content of the response body."""


@dataclass(frozen=True)
class HttpCrawlingResult:
"""Result of a HTTP-only crawl.

Mainly for the purpose of composing specific crawling contexts (e.g. `BeautifulSoupCrawlingContext`,
`ParselCrawlingContext`, ...).

Args:
http_response: The HTTP response received from the server.
"""
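With `http_version` added to the protocol, any custom response adapter now has to expose it alongside `status_code`, `headers`, and `read()`. A minimal sketch of a conforming adapter follows; the wrapped `RawResponse` type and its attributes are hypothetical.

from __future__ import annotations


class _MyLibraryResponse:
    """Hypothetical adapter covering the full HttpResponse protocol surface."""

    def __init__(self, raw: RawResponse) -> None:
        self._raw = raw

    @property
    def http_version(self) -> str:
        # Normalized version string, e.g. 'HTTP/1.1' or 'HTTP/2'.
        return self._raw.version

    @property
    def status_code(self) -> int:
        return self._raw.status

    @property
    def headers(self) -> dict[str, str]:
        return dict(self._raw.headers)

    def read(self) -> bytes:
        return self._raw.body
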
39 changes: 32 additions & 7 deletions src/crawlee/http_clients/_httpx.py
@@ -1,5 +1,6 @@
from __future__ import annotations

from logging import getLogger
from typing import TYPE_CHECKING, Any, Optional, cast

import httpx
@@ -18,15 +19,18 @@
from crawlee.proxy_configuration import ProxyInfo
from crawlee.statistics import Statistics

logger = getLogger(__name__)


class _HttpxResponse:
"""Adapter class for `httpx.Response` to conform to the `HttpResponse` protocol."""

def __init__(self, response: httpx.Response) -> None:
self._response = response

def read(self) -> bytes:
return self._response.read()
@property
def http_version(self) -> str:
return self._response.http_version

@property
def status_code(self) -> int:
@@ -36,6 +40,9 @@ def status_code(self) -> int:
def headers(self) -> dict[str, str]:
return dict(self._response.headers.items())

def read(self) -> bytes:
return self._response.read()


class _HttpxTransport(httpx.AsyncHTTPTransport):
"""HTTP transport adapter that stores response cookies in a `Session`.
@@ -76,6 +83,8 @@ def __init__(
persist_cookies_per_session: bool = True,
additional_http_error_status_codes: Iterable[int] = (),
ignore_http_error_status_codes: Iterable[int] = (),
http1: bool = True,
http2: bool = True,
**async_client_kwargs: Any,
) -> None:
"""Create a new instance.
@@ -84,13 +93,17 @@ def __init__(
persist_cookies_per_session: Whether to persist cookies per HTTP session.
additional_http_error_status_codes: Additional HTTP status codes to treat as errors.
ignore_http_error_status_codes: HTTP status codes to ignore as errors.
http1: Whether to enable HTTP/1.1 support.
http2: Whether to enable HTTP/2 support.
async_client_kwargs: Additional keyword arguments for `httpx.AsyncClient`.
"""
super().__init__(
persist_cookies_per_session=persist_cookies_per_session,
additional_http_error_status_codes=additional_http_error_status_codes,
ignore_http_error_status_codes=ignore_http_error_status_codes,
)
self._http1 = http1
self._http2 = http2
self._async_client_kwargs = async_client_kwargs

self._client_by_proxy_url = dict[Optional[str], httpx.AsyncClient]()
@@ -182,11 +195,23 @@ def _get_client(self, proxy_url: str | None) -> httpx.AsyncClient:
If the client for the given proxy URL doesn't exist, it will be created and stored.
"""
if proxy_url not in self._client_by_proxy_url:
self._client_by_proxy_url[proxy_url] = httpx.AsyncClient(
transport=_HttpxTransport(),
proxy=proxy_url,
**self._async_client_kwargs,
)
# Prepare the default kwargs for the new client.
kwargs: dict[str, Any] = {
'transport': _HttpxTransport(
proxy=proxy_url,
http1=self._http1,
http2=self._http2,
),
'proxy': proxy_url,
'http1': self._http1,
'http2': self._http2,
}

# Update the default kwargs with any additional user-provided kwargs.
kwargs.update(self._async_client_kwargs)

client = httpx.AsyncClient(**kwargs)
self._client_by_proxy_url[proxy_url] = client

return self._client_by_proxy_url[proxy_url]

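A minimal usage sketch of the new flags, mirroring the tests further below; it assumes the public `crawlee.http_clients` export, and the target URL is arbitrary.

import asyncio

from crawlee.http_clients import HttpxHttpClient


async def main() -> None:
    # http1=True/http2=False pins the client to HTTP/1.1; leaving http2=True
    # lets httpx negotiate HTTP/2 when the server supports it.
    client = HttpxHttpClient(http1=True, http2=False)
    response = await client.send_request('https://crawlee.dev')
    print(response.http_version, response.status_code)


asyncio.run(main())
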
24 changes: 22 additions & 2 deletions src/crawlee/http_clients/curl_impersonate.py
@@ -12,6 +12,7 @@
"For example, if you use pip, run `pip install 'crawlee[curl-impersonate]'`.",
) from exc

from curl_cffi.const import CurlHttpVersion
from typing_extensions import override

from crawlee._utils.blocked import ROTATE_PROXY_ERRORS
@@ -36,8 +37,24 @@ class _CurlImpersonateResponse:
def __init__(self, response: Response) -> None:
self._response = response

def read(self) -> bytes:
return self._response.content
@property
def http_version(self) -> str:
if self._response.http_version == CurlHttpVersion.NONE:
return 'NONE'
if self._response.http_version == CurlHttpVersion.V1_0:
return 'HTTP/1.0'
if self._response.http_version == CurlHttpVersion.V1_1:
return 'HTTP/1.1'
if self._response.http_version in {
CurlHttpVersion.V2_0,
CurlHttpVersion.V2TLS,
CurlHttpVersion.V2_PRIOR_KNOWLEDGE,
}:
return 'HTTP/2'
if self._response.http_version == CurlHttpVersion.V3:
return 'HTTP/3'

raise ValueError(f'Unknown HTTP version: {self._response.http_version}')

@property
def status_code(self) -> int:
@@ -47,6 +64,9 @@ def status_code(self) -> int:
def headers(self) -> dict[str, str]:
return dict(self._response.headers.items())

def read(self) -> bytes:
return self._response.content


class CurlImpersonateHttpClient(BaseHttpClient):
"""HTTP client based on the `curl-cffi` library.
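The curl-cffi numeric constants are mapped to the same string format the httpx adapter returns, so downstream code can treat both clients uniformly. A short sketch follows; it requires the `curl-impersonate` extra, per the install hint above, and the target URL is arbitrary.

import asyncio

from crawlee.http_clients.curl_impersonate import CurlImpersonateHttpClient


async def main() -> None:
    client = CurlImpersonateHttpClient()
    response = await client.send_request('https://crawlee.dev')
    # Normalized to 'HTTP/1.0', 'HTTP/1.1', 'HTTP/2', 'HTTP/3' or 'NONE',
    # matching the strings returned by the httpx-based client.
    print(response.http_version)


asyncio.run(main())
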
12 changes: 12 additions & 0 deletions tests/unit/http_clients/test_httpx.py
@@ -19,6 +19,18 @@ def http_client() -> HttpxHttpClient:
return HttpxHttpClient()


async def test_http_1(httpbin: str) -> None:
http_client = HttpxHttpClient(http1=True, http2=False)
response = await http_client.send_request(httpbin)
assert response.http_version == 'HTTP/1.1'


async def test_http_2(httpbin: str) -> None:
http_client = HttpxHttpClient(http2=True)
response = await http_client.send_request(httpbin)
assert response.http_version == 'HTTP/2'


@pytest.mark.skipif(os.name == 'nt', reason='Skipped on Windows')
async def test_proxy(
http_client: HttpxHttpClient,