diff --git a/src/apify/scrapy/__init__.py b/src/apify/scrapy/__init__.py
index 44b52138..2ee44e2c 100644
--- a/src/apify/scrapy/__init__.py
+++ b/src/apify/scrapy/__init__.py
@@ -1,4 +1,3 @@
-from .middlewares import ApifyRetryMiddleware
 from .pipelines import ActorDatasetPushPipeline
 from .scheduler import ApifyScheduler
-from .utils import get_running_event_loop_id, open_queue_with_custom_client, to_apify_request, to_scrapy_request
+from .utils import get_basic_auth_header, get_running_event_loop_id, open_queue_with_custom_client, to_apify_request, to_scrapy_request
diff --git a/src/apify/scrapy/middlewares/__init__.py b/src/apify/scrapy/middlewares/__init__.py
new file mode 100644
index 00000000..d022da54
--- /dev/null
+++ b/src/apify/scrapy/middlewares/__init__.py
@@ -0,0 +1,2 @@
+from .apify_proxy import ApifyHttpProxyMiddleware
+from .apify_retry import ApifyRetryMiddleware
diff --git a/src/apify/scrapy/middlewares/apify_proxy.py b/src/apify/scrapy/middlewares/apify_proxy.py
new file mode 100644
index 00000000..bb5da987
--- /dev/null
+++ b/src/apify/scrapy/middlewares/apify_proxy.py
@@ -0,0 +1,144 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+from urllib.parse import ParseResult, urlparse
+
+from scrapy.core.downloader.handlers.http11 import TunnelError
+from scrapy.exceptions import NotConfigured
+
+from ...actor import Actor
+from ...proxy_configuration import ProxyConfiguration
+from ..utils import get_basic_auth_header
+
+if TYPE_CHECKING:
+    from scrapy import Request, Spider
+    from scrapy.crawler import Crawler
+
+
+class ApifyHttpProxyMiddleware:
+    """Apify HTTP proxy middleware for Scrapy.
+
+    This middleware enhances request processing by adding a 'proxy' field to the request's meta and an authentication
+    header. It draws inspiration from the `HttpProxyMiddleware` included by default in Scrapy projects. The proxy URL
+    is sourced from the settings under the `APIFY_PROXY_SETTINGS` key. The value of this key, a dictionary, should be
+    provided by the Actor input. An example of the proxy settings:
+
+        proxy_settings = {'useApifyProxy': true, 'apifyProxyGroups': []}
+    """
+
+    def __init__(self: ApifyHttpProxyMiddleware, proxy_settings: dict) -> None:
+        """Create a new instance.
+
+        Args:
+            proxy_settings: Dictionary containing proxy settings, provided by the Actor input.
+        """
+        self._proxy_settings = proxy_settings
+        self._proxy_cfg_internal: ProxyConfiguration | None = None
+
+    @classmethod
+    def from_crawler(cls: type[ApifyHttpProxyMiddleware], crawler: Crawler) -> ApifyHttpProxyMiddleware:
+        """Create an instance of ApifyHttpProxyMiddleware from a Scrapy Crawler.
+
+        Args:
+            cls: Class type.
+            crawler: Scrapy Crawler object.
+
+        Returns:
+            ApifyHttpProxyMiddleware: Instance of the class.
+        """
+        proxy_settings: dict | None = crawler.settings.get('APIFY_PROXY_SETTINGS')
+
+        if proxy_settings is None:
+            Actor.log.warning('ApifyHttpProxyMiddleware is not going to be used. Object "proxyConfiguration" is probably missing in the Actor input.')
+            raise NotConfigured
+
+        use_apify_proxy = proxy_settings.get('useApifyProxy', False)
+
+        if use_apify_proxy is not True:
+            Actor.log.warning(
+                'ApifyHttpProxyMiddleware is not going to be used. Actor input field "proxyConfiguration.useApifyProxy" is probably set to False.'
+            )
+            raise NotConfigured
+
+        return cls(proxy_settings)
+
+    async def process_request(self: ApifyHttpProxyMiddleware, request: Request, spider: Spider) -> None:
+        """Process a Scrapy request by assigning a new proxy.
+
+        Args:
+            request: Scrapy Request object.
+            spider: Scrapy Spider object.
+
+        Raises:
+            ValueError: If username and password are not provided in the proxy URL.
+
+        Returns:
+            None: The request is processed and middleware pipeline can continue.
+        """
+        Actor.log.debug(f'ApifyHttpProxyMiddleware.process_request: request={request}, spider={spider}')
+        url = await self._get_new_proxy_url()
+
+        if not (url.username and url.password):
+            raise ValueError('Username and password must be provided in the proxy URL.')
+
+        request.meta['proxy'] = url.geturl()
+        basic_auth_header = get_basic_auth_header(url.username, url.password)
+        request.headers[b'Proxy-Authorization'] = basic_auth_header
+
+        Actor.log.debug(f'ApifyHttpProxyMiddleware.process_request: updated request.meta={request.meta}')
+
+    def process_exception(
+        self: ApifyHttpProxyMiddleware,
+        request: Request,
+        exception: Exception,
+        spider: Spider,
+    ) -> None | Request:
+        """Process an exception that occurs during request processing.
+
+        Args:
+            request: Scrapy Request object.
+            exception: Exception object.
+            spider: Scrapy Spider object.
+
+        Returns:
+            If a TunnelError occurs, return the request object to halt its processing in the middleware pipeline.
+            Return None otherwise to allow the continuation of request processing.
+        """
+        Actor.log.debug(
+            f'ApifyHttpProxyMiddleware.process_exception: request={request}, exception={exception}, spider={spider}',
+        )
+
+        if isinstance(exception, TunnelError):
+            Actor.log.warning(f'ApifyHttpProxyMiddleware: TunnelError occurred for request="{request}", reason="{exception}", skipping...')
+            return request
+
+        return None
+
+    async def _get_new_proxy_url(self: ApifyHttpProxyMiddleware) -> ParseResult:
+        """Get a new proxy URL.
+
+        Raises:
+            NotConfigured: If creation of the proxy configuration fails.
+
+        Returns:
+            ParseResult: New proxy URL.
+        """
+        # Get proxy configuration, creating it if necessary
+        proxy_cfg = (
+            self._proxy_cfg_internal
+            if isinstance(self._proxy_cfg_internal, ProxyConfiguration)
+            else await Actor.create_proxy_configuration(actor_proxy_input=self._proxy_settings)
+        )
+
+        # If the proxy configuration is still not available, raise an error. However, this should not happen due
+        # to the checks in the `from_crawler` method.
+        if proxy_cfg is None:
+            Actor.log.error('Creation of proxy configuration failed. Check the field "proxyConfiguration" in the Actor input.')
+            raise NotConfigured
+
+        # Store the proxy configuration for future use
+        self._proxy_cfg_internal = proxy_cfg
+
+        # Get a new proxy URL and return it
+        new_url = await proxy_cfg.new_url()
+        return urlparse(new_url)
diff --git a/src/apify/scrapy/middlewares.py b/src/apify/scrapy/middlewares/apify_retry.py
similarity index 96%
rename from src/apify/scrapy/middlewares.py
rename to src/apify/scrapy/middlewares/apify_retry.py
index 32a1900b..96415dcb 100644
--- a/src/apify/scrapy/middlewares.py
+++ b/src/apify/scrapy/middlewares/apify_retry.py
@@ -11,14 +11,14 @@
Run "pip install apify[scrapy]".', ) from exc -from ..actor import Actor -from .utils import nested_event_loop, open_queue_with_custom_client, to_apify_request +from ...actor import Actor +from ..utils import nested_event_loop, open_queue_with_custom_client, to_apify_request if TYPE_CHECKING: from scrapy import Spider from scrapy.http import Request, Response - from ..storages import RequestQueue + from ...storages import RequestQueue class ApifyRetryMiddleware(RetryMiddleware): diff --git a/src/apify/scrapy/utils.py b/src/apify/scrapy/utils.py index 6ccd32bf..6a6e1e41 100644 --- a/src/apify/scrapy/utils.py +++ b/src/apify/scrapy/utils.py @@ -3,6 +3,10 @@ import asyncio import codecs import pickle +from base64 import b64encode +from urllib.parse import unquote + +from scrapy.utils.python import to_bytes try: from scrapy import Request, Spider @@ -19,6 +23,13 @@ nested_event_loop: asyncio.AbstractEventLoop = asyncio.new_event_loop() +def get_basic_auth_header(username: str, password: str, auth_encoding: str = 'latin-1') -> bytes: + """Generate a basic authentication header for the given username and password.""" + string = f'{unquote(username)}:{unquote(password)}' + user_pass = to_bytes(string, encoding=auth_encoding) + return b'Basic ' + b64encode(user_pass) + + def get_running_event_loop_id() -> int: """Get the ID of the currently running event loop. diff --git a/tests/unit/scrapy/__init__.py b/tests/unit/scrapy/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/scrapy/middlewares/__init__.py b/tests/unit/scrapy/middlewares/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/scrapy/middlewares/test_apify_proxy.py b/tests/unit/scrapy/middlewares/test_apify_proxy.py new file mode 100644 index 00000000..379e1b45 --- /dev/null +++ b/tests/unit/scrapy/middlewares/test_apify_proxy.py @@ -0,0 +1,152 @@ +from __future__ import annotations + +from urllib.parse import ParseResult, urlparse + +import pytest +from scrapy import Request, Spider +from scrapy.core.downloader.handlers.http11 import TunnelError +from scrapy.crawler import Crawler +from scrapy.exceptions import NotConfigured + +from apify import ProxyConfiguration +from apify.scrapy.middlewares import ApifyHttpProxyMiddleware + + +class DummySpider(Spider): + name = 'dummy_spider' + + +@pytest.fixture() +def middleware() -> ApifyHttpProxyMiddleware: + """Fixture to create an Apify HTTP proxy middleware.""" + proxy_settings = {'useApifyProxy': True} + return ApifyHttpProxyMiddleware(proxy_settings) + + +@pytest.fixture() +def crawler(monkeypatch: pytest.MonkeyPatch) -> Crawler: + """Fixture to create a Scrapy crawler.""" + crawler = Crawler(DummySpider) + monkeypatch.setattr(crawler, 'settings', {}) + return crawler + + +@pytest.fixture() +def spider() -> DummySpider: + """Fixture to create a "dummy" Scrapy spider.""" + return DummySpider() + + +@pytest.fixture() +def dummy_request() -> Request: + """Fixture to create a "dummy" Scrapy spider.""" + return Request('https://example.com') + + +@pytest.fixture() +def proxy_configuration() -> ProxyConfiguration: + """Fixture to create an Apify ProxyConfiguration object.""" + return ProxyConfiguration() + + +@pytest.mark.parametrize( + ('settings', 'expected_exception'), + [ + ({'APIFY_PROXY_SETTINGS': {'useApifyProxy': True}}, None), + ({'APIFY_PROXY_SETTINGS': {'useApifyProxy': True, 'apifyProxyGroups': []}}, None), + ({}, NotConfigured), + ({'a': 1}, NotConfigured), + ({'APIFY_PROXY_SETTINGS': {}}, NotConfigured), + 
+        ({'APIFY_PROXY_SETTINGS': {'useApifyProxy': None}}, NotConfigured),
+        ({'APIFY_PROXY_SETTINGS': {'useApifyProxy': False}}, NotConfigured),
+    ],
+)
+def test__from_crawler(
+    crawler: Crawler,
+    monkeypatch: pytest.MonkeyPatch,
+    settings: dict,
+    expected_exception: type[Exception] | None,
+) -> None:
+    monkeypatch.setattr(crawler, 'settings', settings)
+
+    if expected_exception is None:
+        middleware = ApifyHttpProxyMiddleware.from_crawler(crawler)
+        assert middleware._proxy_settings == settings['APIFY_PROXY_SETTINGS']
+
+    else:
+        with pytest.raises(expected_exception):
+            ApifyHttpProxyMiddleware.from_crawler(crawler)
+
+
+@pytest.mark.parametrize(
+    'expected_proxy_url',
+    ['http://username:password@proxy.example.com:8080', 'http://hsdfgds:52354325@proxy.apify.com:5748'],
+)
+async def test__get_new_proxy_url(
+    monkeypatch: pytest.MonkeyPatch,
+    middleware: ApifyHttpProxyMiddleware,
+    proxy_configuration: ProxyConfiguration,
+    expected_proxy_url: str,
+) -> None:
+    async def mock_new_url() -> str:
+        return expected_proxy_url
+
+    monkeypatch.setattr(proxy_configuration, 'new_url', mock_new_url)
+    middleware._proxy_cfg_internal = proxy_configuration
+    proxy_url = await middleware._get_new_proxy_url()
+    assert proxy_url == urlparse(expected_proxy_url)
+
+
+@pytest.mark.parametrize(
+    ('proxy_url', 'expected_exception', 'expected_request_header'),
+    [
+        ('http://username:password@proxy.example.com:8080', None, b'Basic dXNlcm5hbWU6cGFzc3dvcmQ='),
+        ('http://user123:pass456@proxy.apify.com:5748', None, b'Basic dXNlcjEyMzpwYXNzNDU2'),
+        ('http://@proxy.example.com:2943', ValueError, b''),
+    ],
+)
+async def test__process_request(
+    monkeypatch: pytest.MonkeyPatch,
+    middleware: ApifyHttpProxyMiddleware,
+    spider: DummySpider,
+    dummy_request: Request,
+    proxy_url: str,
+    expected_exception: type[Exception] | None,
+    expected_request_header: bytes,
+) -> None:
+    async def mock_get_new_proxy_url() -> ParseResult:
+        return urlparse(proxy_url)
+
+    monkeypatch.setattr(middleware, '_get_new_proxy_url', mock_get_new_proxy_url)
+
+    if expected_exception is None:
+        await middleware.process_request(dummy_request, spider)
+        assert dummy_request.meta['proxy'] == proxy_url
+        assert dummy_request.headers[b'Proxy-Authorization'] == expected_request_header
+    else:
+        with pytest.raises(expected_exception):
+            await middleware.process_request(dummy_request, spider)
+
+
+@pytest.mark.parametrize(
+    ('exception', 'none_returned_values_is_expected'),
+    [
+        (TunnelError(), False),
+        (ValueError(), True),
+    ],
+)
+def test__process_exception(
+    middleware: ApifyHttpProxyMiddleware,
+    spider: DummySpider,
+    dummy_request: Request,
+    exception: Exception,
+    *,
+    none_returned_values_is_expected: bool,
+) -> None:
+    returned_value = middleware.process_exception(dummy_request, exception, spider)
+
+    if none_returned_values_is_expected:
+        assert returned_value is None
+
+    else:
+        assert returned_value == dummy_request
diff --git a/tests/unit/scrapy/test_utils.py b/tests/unit/scrapy/test_utils.py
new file mode 100644
index 00000000..070d7f78
--- /dev/null
+++ b/tests/unit/scrapy/test_utils.py
@@ -0,0 +1,21 @@
+from __future__ import annotations
+
+import pytest
+
+from apify.scrapy import get_basic_auth_header
+
+
+@pytest.mark.parametrize(
+    ('username', 'password', 'expected_auth_header'),
+    [
+        ('username', 'password', b'Basic dXNlcm5hbWU6cGFzc3dvcmQ='),
+        ('john_smith', 'secret_password_123', b'Basic am9obl9zbWl0aDpzZWNyZXRfcGFzc3dvcmRfMTIz'),
+    ],
+)
+def test__get_basic_auth_header(
+    username: str,
+    password: str,
+    expected_auth_header: bytes,
+) -> None:
+    auth_header = get_basic_auth_header(username, password)
+    assert auth_header == expected_auth_header
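
Usage sketch (not part of the patch): the middleware added above expects its configuration under the `APIFY_PROXY_SETTINGS` Scrapy setting and, once configured, sets `request.meta['proxy']` plus a `Proxy-Authorization: Basic ...` header built by `get_basic_auth_header`. A minimal, assumed settings.py fragment for a Scrapy project integrated with Apify is shown below; the middleware priority (950), the choice to disable Scrapy's stock HttpProxyMiddleware, and the hard-coded settings value are illustrative assumptions, not part of this diff (in practice the value would typically come from the Actor's "proxyConfiguration" input field).

    # settings.py (hypothetical Scrapy project) -- enables the new ApifyHttpProxyMiddleware.
    DOWNLOADER_MIDDLEWARES = {
        # Disable Scrapy's built-in proxy middleware so it does not clash with the Apify one (assumption).
        'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
        # Priority 950 is an arbitrary illustrative value.
        'apify.scrapy.middlewares.ApifyHttpProxyMiddleware': 950,
    }

    # Read by ApifyHttpProxyMiddleware.from_crawler(); same shape as the Actor input's
    # "proxyConfiguration" field (here hard-coded only for illustration).
    APIFY_PROXY_SETTINGS = {'useApifyProxy': True, 'apifyProxyGroups': []}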