-
Notifications
You must be signed in to change notification settings - Fork 8
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add Scrapy
ApifyHttpProxyMiddleware
for managing proxies (#158)
- Loading branch information
Showing
9 changed files
with
335 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,3 @@ | ||
from .middlewares import ApifyRetryMiddleware | ||
from .pipelines import ActorDatasetPushPipeline | ||
from .scheduler import ApifyScheduler | ||
from .utils import get_running_event_loop_id, open_queue_with_custom_client, to_apify_request, to_scrapy_request | ||
from .utils import get_basic_auth_header, get_running_event_loop_id, open_queue_with_custom_client, to_apify_request, to_scrapy_request |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
from .apify_proxy import ApifyHttpProxyMiddleware | ||
from .apify_retry import ApifyRetryMiddleware |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,145 @@ | ||
from __future__ import annotations | ||
|
||
from typing import TYPE_CHECKING | ||
from urllib.parse import ParseResult, urlparse | ||
|
||
from scrapy.core.downloader.handlers.http11 import TunnelError | ||
from scrapy.exceptions import NotConfigured | ||
|
||
from ...actor import Actor | ||
from ...proxy_configuration import ProxyConfiguration | ||
from ..utils import get_basic_auth_header | ||
|
||
if TYPE_CHECKING: | ||
from scrapy import Request, Spider | ||
from scrapy.crawler import Crawler | ||
|
||
|
||
class ApifyHttpProxyMiddleware:
    """Apify HTTP proxy middleware for Scrapy.

    This middleware enhances request processing by adding a 'proxy' field to the request's meta and an authentication
    header. It draws inspiration from the `HttpProxyMiddleware` included by default in Scrapy projects. The proxy URL
    is sourced from the settings under the `APIFY_PROXY_SETTINGS` key. The value of this key, a dictionary, should be
    provided by the Actor input. An example of the proxy settings:

    proxy_settings = {'useApifyProxy': true, 'apifyProxyGroups': []}
    """

    def __init__(self: ApifyHttpProxyMiddleware, proxy_settings: dict) -> None:
        """Create a new instance.

        Args:
            proxy_settings: Dictionary containing proxy settings, provided by the Actor input.
        """
        self._proxy_settings = proxy_settings
        # Proxy configuration is created lazily on the first request and cached afterwards.
        self._proxy_cfg_internal: ProxyConfiguration | None = None

    @classmethod
    def from_crawler(cls: type[ApifyHttpProxyMiddleware], crawler: Crawler) -> ApifyHttpProxyMiddleware:
        """Create an instance of ApifyHttpProxyMiddleware from a Scrapy Crawler.

        Args:
            crawler: Scrapy Crawler object.

        Raises:
            NotConfigured: If the proxy settings are missing from the crawler settings, or the Apify proxy
                is not enabled. Scrapy treats this as "disable this middleware".

        Returns:
            ApifyHttpProxyMiddleware: Instance of the class.
        """
        proxy_settings: dict | None = crawler.settings.get('APIFY_PROXY_SETTINGS')

        if proxy_settings is None:
            Actor.log.warning('ApifyHttpProxyMiddleware is not going to be used. Object "proxyConfiguration" is probably missing in the Actor input.')
            raise NotConfigured

        use_apify_proxy = proxy_settings.get('useApifyProxy', False)

        # Only an explicit True enables the middleware; missing, None and False all disable it.
        if use_apify_proxy is not True:
            Actor.log.warning(
                'ApifyHttpProxyMiddleware is not going to be used. Actor input field "proxyConfiguration.useApifyProxy" is probably set to False.'
            )
            raise NotConfigured

        return cls(proxy_settings)

    async def process_request(self: ApifyHttpProxyMiddleware, request: Request, spider: Spider) -> None:
        """Process a Scrapy request by assigning a new proxy.

        Args:
            request: Scrapy Request object.
            spider: Scrapy Spider object.

        Raises:
            ValueError: If username and password are not provided in the proxy URL.

        Returns:
            None: The request is processed and middleware pipeline can continue.
        """
        Actor.log.debug(f'ApifyHttpProxyMiddleware.process_request: request={request}, spider={spider}')
        url = await self._get_new_proxy_url()

        if not (url.username and url.password):
            raise ValueError('Username and password must be provided in the proxy URL.')

        # Set the proxy for the downloader and attach the matching basic-auth header.
        request.meta['proxy'] = url.geturl()
        basic_auth_header = get_basic_auth_header(url.username, url.password)
        request.headers[b'Proxy-Authorization'] = basic_auth_header

        Actor.log.debug(f'ApifyHttpProxyMiddleware.process_request: updated request.meta={request.meta}')

    def process_exception(
        self: ApifyHttpProxyMiddleware,
        request: Request,
        exception: Exception,
        spider: Spider,
    ) -> None | Request:
        """Process an exception that occurs during request processing.

        Args:
            request: Scrapy Request object.
            exception: Exception object.
            spider: Scrapy Spider object.

        Returns:
            If a TunnelError occurs, return the request object, which stops the remaining
            process_exception chain and lets Scrapy schedule the request again (a fresh proxy
            is then assigned by `process_request`). Return None otherwise to allow other
            middlewares to handle the exception.
        """
        Actor.log.debug(
            f'ApifyHttpProxyMiddleware.process_exception: request={request}, exception={exception}, spider={spider}',
        )

        if isinstance(exception, TunnelError):
            Actor.log.warning(f'ApifyHttpProxyMiddleware: TunnelError occurred for request="{request}", reason="{exception}", skipping...')
            return request

        return None

    async def _get_new_proxy_url(self: ApifyHttpProxyMiddleware) -> ParseResult:
        """Get a new proxy URL.

        Raises:
            NotConfigured: If creation of the proxy configuration fails.

        Returns:
            ParseResult: New proxy URL.
        """
        # Reuse the cached proxy configuration; create it on the first call only.
        proxy_cfg = self._proxy_cfg_internal
        if proxy_cfg is None:
            proxy_cfg = await Actor.create_proxy_configuration(actor_proxy_input=self._proxy_settings)

        # If the proxy configuration is still not available, raise an error. However, this should not happen due
        # to the checks in the `from_crawler` method.
        if proxy_cfg is None:
            Actor.log.error('Creation of proxy configuration failed. Check the field "proxyConfiguration" in the Actor input.')
            raise NotConfigured

        # Store the proxy configuration for future use
        self._proxy_cfg_internal = proxy_cfg

        # Get a new proxy URL and return it
        new_url = await proxy_cfg.new_url()
        return urlparse(new_url)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,152 @@ | ||
from __future__ import annotations | ||
|
||
from urllib.parse import ParseResult, urlparse | ||
|
||
import pytest | ||
from scrapy import Request, Spider | ||
from scrapy.core.downloader.handlers.http11 import TunnelError | ||
from scrapy.crawler import Crawler | ||
from scrapy.exceptions import NotConfigured | ||
|
||
from apify import ProxyConfiguration | ||
from apify.scrapy.middlewares import ApifyHttpProxyMiddleware | ||
|
||
|
||
class DummySpider(Spider):
    """Minimal Scrapy spider used only as a stand-in for the middleware tests."""

    name = 'dummy_spider'
|
||
|
||
@pytest.fixture()
def middleware() -> ApifyHttpProxyMiddleware:
    """Provide an ApifyHttpProxyMiddleware configured to use the Apify proxy."""
    return ApifyHttpProxyMiddleware({'useApifyProxy': True})
|
||
|
||
@pytest.fixture()
def crawler(monkeypatch: pytest.MonkeyPatch) -> Crawler:
    """Provide a Scrapy crawler whose settings are replaced with an empty dict."""
    dummy_crawler = Crawler(DummySpider)
    # Individual tests patch in their own settings; start from an empty mapping.
    monkeypatch.setattr(dummy_crawler, 'settings', {})
    return dummy_crawler
|
||
|
||
@pytest.fixture()
def spider() -> DummySpider:
    """Provide the dummy spider instance used by the middleware tests."""
    test_spider = DummySpider()
    return test_spider
|
||
|
||
@pytest.fixture()
def dummy_request() -> Request:
    """Fixture to create a "dummy" Scrapy request to https://example.com."""
    return Request('https://example.com')
|
||
|
||
@pytest.fixture()
def proxy_configuration() -> ProxyConfiguration:
    """Provide a default Apify ProxyConfiguration instance."""
    configuration = ProxyConfiguration()
    return configuration
|
||
|
||
@pytest.mark.parametrize(
    ('settings', 'expected_exception'),
    [
        ({'APIFY_PROXY_SETTINGS': {'useApifyProxy': True}}, None),
        ({'APIFY_PROXY_SETTINGS': {'useApifyProxy': True, 'apifyProxyGroups': []}}, None),
        ({}, NotConfigured),
        ({'a': 1}, NotConfigured),
        ({'APIFY_PROXY_SETTINGS': {}}, NotConfigured),
        ({'APIFY_PROXY_SETTINGS': {'useApifyProxy': None}}, NotConfigured),
        ({'APIFY_PROXY_SETTINGS': {'useApifyProxy': False}}, NotConfigured),
    ],
)
def test__from_crawler(
    crawler: Crawler,
    monkeypatch: pytest.MonkeyPatch,
    settings: dict,
    expected_exception: type[Exception] | None,
) -> None:
    """from_crawler builds the middleware for valid settings and raises NotConfigured otherwise."""
    monkeypatch.setattr(crawler, 'settings', settings)

    # Invalid / missing proxy settings must disable the middleware via NotConfigured.
    if expected_exception is not None:
        with pytest.raises(expected_exception):
            ApifyHttpProxyMiddleware.from_crawler(crawler)
        return

    middleware = ApifyHttpProxyMiddleware.from_crawler(crawler)
    assert middleware._proxy_settings == settings['APIFY_PROXY_SETTINGS']
|
||
|
||
@pytest.mark.parametrize(
    'expected_proxy_url',
    ['http://username:password@proxy.example.com:8080', 'http://hsdfgds:52354325@proxy.apify.com:5748'],
)
async def test__get_new_proxy_url(
    monkeypatch: pytest.MonkeyPatch,
    middleware: ApifyHttpProxyMiddleware,
    proxy_configuration: ProxyConfiguration,
    expected_proxy_url: str,
) -> None:
    """_get_new_proxy_url returns the parsed URL produced by the proxy configuration."""

    async def fake_new_url() -> str:
        return expected_proxy_url

    # Inject a pre-built proxy configuration whose new_url() yields a known value.
    monkeypatch.setattr(proxy_configuration, 'new_url', fake_new_url)
    middleware._proxy_cfg_internal = proxy_configuration

    assert await middleware._get_new_proxy_url() == urlparse(expected_proxy_url)
|
||
|
||
@pytest.mark.parametrize(
    ('proxy_url', 'expected_exception', 'expected_request_header'),
    [
        ('http://username:password@proxy.example.com:8080', None, b'Basic dXNlcm5hbWU6cGFzc3dvcmQ='),
        ('http://user123:pass456@proxy.apify.com:5748', None, b'Basic dXNlcjEyMzpwYXNzNDU2'),
        ('http://@proxy.example.com:2943', ValueError, b''),
    ],
)
async def test__process_request(
    monkeypatch: pytest.MonkeyPatch,
    middleware: ApifyHttpProxyMiddleware,
    spider: DummySpider,
    dummy_request: Request,
    proxy_url: str,
    expected_exception: type[Exception] | None,
    expected_request_header: bytes,
) -> None:
    """process_request sets request.meta['proxy'] and the Proxy-Authorization header."""

    async def fake_get_new_proxy_url() -> ParseResult:
        return urlparse(proxy_url)

    monkeypatch.setattr(middleware, '_get_new_proxy_url', fake_get_new_proxy_url)

    # A proxy URL without credentials must be rejected with a ValueError.
    if expected_exception is not None:
        with pytest.raises(expected_exception):
            await middleware.process_request(dummy_request, spider)
        return

    await middleware.process_request(dummy_request, spider)
    assert dummy_request.meta['proxy'] == proxy_url
    assert dummy_request.headers[b'Proxy-Authorization'] == expected_request_header
|
||
|
||
@pytest.mark.parametrize(
    ('exception', 'none_returned_values_is_expected'),
    [
        (TunnelError(), False),
        (ValueError(), True),
    ],
)
def test__process_exception(
    middleware: ApifyHttpProxyMiddleware,
    spider: DummySpider,
    dummy_request: Request,
    exception: Exception,
    *,
    none_returned_values_is_expected: bool,
) -> None:
    """process_exception returns the request for TunnelError and None for anything else."""
    result = middleware.process_exception(dummy_request, exception, spider)

    if none_returned_values_is_expected:
        assert result is None
    else:
        assert result == dummy_request
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
from __future__ import annotations | ||
|
||
import pytest | ||
|
||
from apify.scrapy import get_basic_auth_header | ||
|
||
|
||
@pytest.mark.parametrize(
    ('username', 'password', 'expected_auth_header'),
    [
        ('username', 'password', b'Basic dXNlcm5hbWU6cGFzc3dvcmQ='),
        ('john_smith', 'secret_password_123', b'Basic am9obl9zbWl0aDpzZWNyZXRfcGFzc3dvcmRfMTIz'),
    ],
)
def test__get_basic_auth_header(
    username: str,
    password: str,
    expected_auth_header: bytes,
) -> None:
    """get_basic_auth_header encodes the credentials into a basic auth header value."""
    computed_header = get_basic_auth_header(username, password)
    assert computed_header == expected_auth_header