Add Scrapy ApifyHttpProxyMiddleware for managing proxies (#158)
vdusek committed Jan 3, 2024
1 parent 9580522 commit 2ec1037
Showing 9 changed files with 335 additions and 5 deletions.
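For orientation, here is a minimal sketch of how a Scrapy project might enable the new middleware. The DOWNLOADER_MIDDLEWARES hookup and the priority value 950 are illustrative assumptions, not part of this commit; the APIFY_PROXY_SETTINGS key and the class path are taken from the files below.

    # settings.py of a Scrapy project (hypothetical example, not from this commit)
    DOWNLOADER_MIDDLEWARES = {
        'apify.scrapy.middlewares.ApifyHttpProxyMiddleware': 950,  # priority 950 is an arbitrary choice
    }

    # This value is normally supplied by the Actor input field 'proxyConfiguration'.
    APIFY_PROXY_SETTINGS = {'useApifyProxy': True, 'apifyProxyGroups': []}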
3 changes: 1 addition & 2 deletions src/apify/scrapy/__init__.py
@@ -1,4 +1,3 @@
-from .middlewares import ApifyRetryMiddleware
 from .pipelines import ActorDatasetPushPipeline
 from .scheduler import ApifyScheduler
-from .utils import get_running_event_loop_id, open_queue_with_custom_client, to_apify_request, to_scrapy_request
+from .utils import get_basic_auth_header, get_running_event_loop_id, open_queue_with_custom_client, to_apify_request, to_scrapy_request
2 changes: 2 additions & 0 deletions src/apify/scrapy/middlewares/__init__.py
@@ -0,0 +1,2 @@
from .apify_proxy import ApifyHttpProxyMiddleware
from .apify_retry import ApifyRetryMiddleware
145 changes: 145 additions & 0 deletions src/apify/scrapy/middlewares/apify_proxy.py
@@ -0,0 +1,145 @@
from __future__ import annotations

from typing import TYPE_CHECKING
from urllib.parse import ParseResult, urlparse

from scrapy.core.downloader.handlers.http11 import TunnelError
from scrapy.exceptions import NotConfigured

from ...actor import Actor
from ...proxy_configuration import ProxyConfiguration
from ..utils import get_basic_auth_header

if TYPE_CHECKING:
    from scrapy import Request, Spider
    from scrapy.crawler import Crawler


class ApifyHttpProxyMiddleware:
    """Apify HTTP proxy middleware for Scrapy.

    This middleware enhances request processing by adding a 'proxy' field to the request's meta and an authentication
    header. It draws inspiration from the `HttpProxyMiddleware` included by default in Scrapy projects. The proxy URL
    is sourced from the settings under the `APIFY_PROXY_SETTINGS` key. The value of this key, a dictionary, should be
    provided by the Actor input. An example of the proxy settings:

        proxy_settings = {'useApifyProxy': True, 'apifyProxyGroups': []}
    """

    def __init__(self: ApifyHttpProxyMiddleware, proxy_settings: dict) -> None:
        """Create a new instance.

        Args:
            proxy_settings: Dictionary containing proxy settings, provided by the Actor input.
        """
        self._proxy_settings = proxy_settings
        self._proxy_cfg_internal: ProxyConfiguration | None = None

    @classmethod
    def from_crawler(cls: type[ApifyHttpProxyMiddleware], crawler: Crawler) -> ApifyHttpProxyMiddleware:
        """Create an instance of ApifyHttpProxyMiddleware from a Scrapy Crawler.

        Args:
            crawler: Scrapy Crawler object.

        Returns:
            ApifyHttpProxyMiddleware: Instance of the class.
        """
        proxy_settings: dict | None = crawler.settings.get('APIFY_PROXY_SETTINGS')

        if proxy_settings is None:
            Actor.log.warning('ApifyHttpProxyMiddleware is not going to be used. Object "proxyConfiguration" is probably missing in the Actor input.')
            raise NotConfigured

        use_apify_proxy = proxy_settings.get('useApifyProxy', False)

        if use_apify_proxy is not True:
            Actor.log.warning(
                'ApifyHttpProxyMiddleware is not going to be used. Actor input field "proxyConfiguration.useApifyProxy" is probably set to False.'
            )
            raise NotConfigured

        return cls(proxy_settings)

    async def process_request(self: ApifyHttpProxyMiddleware, request: Request, spider: Spider) -> None:
        """Process a Scrapy request by assigning a new proxy.

        Args:
            request: Scrapy Request object.
            spider: Scrapy Spider object.

        Raises:
            ValueError: If username and password are not provided in the proxy URL.

        Returns:
            None: The request is processed and the middleware pipeline can continue.
        """
        Actor.log.debug(f'ApifyHttpProxyMiddleware.process_request: request={request}, spider={spider}')
        url = await self._get_new_proxy_url()

        if not (url.username and url.password):
            raise ValueError('Username and password must be provided in the proxy URL.')

        request.meta['proxy'] = url.geturl()
        basic_auth_header = get_basic_auth_header(url.username, url.password)
        request.headers[b'Proxy-Authorization'] = basic_auth_header

        Actor.log.debug(f'ApifyHttpProxyMiddleware.process_request: updated request.meta={request.meta}')

    def process_exception(
        self: ApifyHttpProxyMiddleware,
        request: Request,
        exception: Exception,
        spider: Spider,
    ) -> None | Request:
        """Process an exception that occurs during request processing.

        Args:
            request: Scrapy Request object.
            exception: Exception object.
            spider: Scrapy Spider object.

        Returns:
            If a TunnelError occurs, the request is returned so that Scrapy reschedules it, halting further
            exception processing in the middleware pipeline. Otherwise None is returned, allowing other
            middlewares to process the exception.
        """
        Actor.log.debug(
            f'ApifyHttpProxyMiddleware.process_exception: request={request}, exception={exception}, spider={spider}',
        )

        if isinstance(exception, TunnelError):
            Actor.log.warning(f'ApifyHttpProxyMiddleware: TunnelError occurred for request="{request}", reason="{exception}", retrying...')
            return request

        return None

    async def _get_new_proxy_url(self: ApifyHttpProxyMiddleware) -> ParseResult:
        """Get a new proxy URL.

        Raises:
            NotConfigured: If creation of the proxy configuration fails.

        Returns:
            ParseResult: New proxy URL.
        """
        # Get the proxy configuration, creating it if necessary.
        proxy_cfg = (
            self._proxy_cfg_internal
            if isinstance(self._proxy_cfg_internal, ProxyConfiguration)
            else await Actor.create_proxy_configuration(actor_proxy_input=self._proxy_settings)
        )

        # If the proxy configuration is still not available, raise an error. However, this should not happen
        # due to the checks in the `from_crawler` method.
        if proxy_cfg is None:
            Actor.log.error('Creation of proxy configuration failed. Check the field "proxyConfiguration" in the Actor input.')
            raise NotConfigured

        # Store the proxy configuration for future use.
        self._proxy_cfg_internal = proxy_cfg

        # Get a new proxy URL and return it.
        new_url = await proxy_cfg.new_url()
        return urlparse(new_url)
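To make the behavior above concrete, here is a small standalone sketch of what process_request does once _get_new_proxy_url returns a parsed proxy URL. The proxy host and credentials are made up for illustration; get_basic_auth_header is the helper added in utils.py below, and the expected header bytes match the values used in the tests.

    from urllib.parse import urlparse

    from apify.scrapy import get_basic_auth_header

    # A parsed proxy URL, as _get_new_proxy_url would return it (illustrative values).
    url = urlparse('http://username:password@proxy.example.com:8000')

    # The middleware stores the full URL in request.meta['proxy']...
    assert url.geturl() == 'http://username:password@proxy.example.com:8000'

    # ...and attaches the matching Proxy-Authorization header.
    assert get_basic_auth_header(url.username, url.password) == b'Basic dXNlcm5hbWU6cGFzc3dvcmQ='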
6 changes: 3 additions & 3 deletions src/apify/scrapy/middlewares/apify_retry.py
@@ -11,14 +11,14 @@
         'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
     ) from exc

-from ..actor import Actor
-from .utils import nested_event_loop, open_queue_with_custom_client, to_apify_request
+from ...actor import Actor
+from ..utils import nested_event_loop, open_queue_with_custom_client, to_apify_request

 if TYPE_CHECKING:
     from scrapy import Spider
     from scrapy.http import Request, Response

-    from ..storages import RequestQueue
+    from ...storages import RequestQueue


 class ApifyRetryMiddleware(RetryMiddleware):
11 changes: 11 additions & 0 deletions src/apify/scrapy/utils.py
@@ -3,6 +3,10 @@
 import asyncio
 import codecs
 import pickle
+from base64 import b64encode
+from urllib.parse import unquote
+
+from scrapy.utils.python import to_bytes

 try:
     from scrapy import Request, Spider
@@ -19,6 +23,13 @@
 nested_event_loop: asyncio.AbstractEventLoop = asyncio.new_event_loop()


+def get_basic_auth_header(username: str, password: str, auth_encoding: str = 'latin-1') -> bytes:
+    """Generate a basic authentication header for the given username and password."""
+    string = f'{unquote(username)}:{unquote(password)}'
+    user_pass = to_bytes(string, encoding=auth_encoding)
+    return b'Basic ' + b64encode(user_pass)
+
+
 def get_running_event_loop_id() -> int:
     """Get the ID of the currently running event loop.
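The helper above implements standard HTTP Basic auth: the credentials are percent-decoded, joined with a colon, and Base64-encoded. A quick sanity check of the encoding, using the same values as the tests below:

    from base64 import b64encode

    assert b'Basic ' + b64encode(b'username:password') == b'Basic dXNlcm5hbWU6cGFzc3dvcmQ='
    assert b'Basic ' + b64encode(b'user123:pass456') == b'Basic dXNlcjEyMzpwYXNzNDU2'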
Empty file added tests/unit/scrapy/__init__.py
Empty file added tests/unit/scrapy/middlewares/__init__.py
152 changes: 152 additions & 0 deletions tests/unit/scrapy/middlewares/test_apify_proxy.py
@@ -0,0 +1,152 @@
from __future__ import annotations

from urllib.parse import ParseResult, urlparse

import pytest
from scrapy import Request, Spider
from scrapy.core.downloader.handlers.http11 import TunnelError
from scrapy.crawler import Crawler
from scrapy.exceptions import NotConfigured

from apify import ProxyConfiguration
from apify.scrapy.middlewares import ApifyHttpProxyMiddleware


class DummySpider(Spider):
    name = 'dummy_spider'


@pytest.fixture()
def middleware() -> ApifyHttpProxyMiddleware:
    """Fixture to create an Apify HTTP proxy middleware."""
    proxy_settings = {'useApifyProxy': True}
    return ApifyHttpProxyMiddleware(proxy_settings)


@pytest.fixture()
def crawler(monkeypatch: pytest.MonkeyPatch) -> Crawler:
    """Fixture to create a Scrapy crawler."""
    crawler = Crawler(DummySpider)
    monkeypatch.setattr(crawler, 'settings', {})
    return crawler


@pytest.fixture()
def spider() -> DummySpider:
    """Fixture to create a "dummy" Scrapy spider."""
    return DummySpider()


@pytest.fixture()
def dummy_request() -> Request:
    """Fixture to create a "dummy" Scrapy request."""
    return Request('https://example.com')


@pytest.fixture()
def proxy_configuration() -> ProxyConfiguration:
    """Fixture to create an Apify ProxyConfiguration object."""
    return ProxyConfiguration()


@pytest.mark.parametrize(
    ('settings', 'expected_exception'),
    [
        ({'APIFY_PROXY_SETTINGS': {'useApifyProxy': True}}, None),
        ({'APIFY_PROXY_SETTINGS': {'useApifyProxy': True, 'apifyProxyGroups': []}}, None),
        ({}, NotConfigured),
        ({'a': 1}, NotConfigured),
        ({'APIFY_PROXY_SETTINGS': {}}, NotConfigured),
        ({'APIFY_PROXY_SETTINGS': {'useApifyProxy': None}}, NotConfigured),
        ({'APIFY_PROXY_SETTINGS': {'useApifyProxy': False}}, NotConfigured),
    ],
)
def test__from_crawler(
    crawler: Crawler,
    monkeypatch: pytest.MonkeyPatch,
    settings: dict,
    expected_exception: type[Exception] | None,
) -> None:
    monkeypatch.setattr(crawler, 'settings', settings)

    if expected_exception is None:
        middleware = ApifyHttpProxyMiddleware.from_crawler(crawler)
        assert middleware._proxy_settings == settings['APIFY_PROXY_SETTINGS']
    else:
        with pytest.raises(expected_exception):
            ApifyHttpProxyMiddleware.from_crawler(crawler)


@pytest.mark.parametrize(
    'expected_proxy_url',
    ['http://username:password@proxy.example.com:8080', 'http://hsdfgds:52354325@proxy.apify.com:5748'],
)
async def test__get_new_proxy_url(
    monkeypatch: pytest.MonkeyPatch,
    middleware: ApifyHttpProxyMiddleware,
    proxy_configuration: ProxyConfiguration,
    expected_proxy_url: str,
) -> None:
    async def mock_new_url() -> str:
        return expected_proxy_url

    monkeypatch.setattr(proxy_configuration, 'new_url', mock_new_url)
    middleware._proxy_cfg_internal = proxy_configuration
    proxy_url = await middleware._get_new_proxy_url()
    assert proxy_url == urlparse(expected_proxy_url)


@pytest.mark.parametrize(
    ('proxy_url', 'expected_exception', 'expected_request_header'),
    [
        ('http://username:password@proxy.example.com:8080', None, b'Basic dXNlcm5hbWU6cGFzc3dvcmQ='),
        ('http://user123:pass456@proxy.apify.com:5748', None, b'Basic dXNlcjEyMzpwYXNzNDU2'),
        ('http://@proxy.example.com:2943', ValueError, b''),
    ],
)
async def test__process_request(
    monkeypatch: pytest.MonkeyPatch,
    middleware: ApifyHttpProxyMiddleware,
    spider: DummySpider,
    dummy_request: Request,
    proxy_url: str,
    expected_exception: type[Exception] | None,
    expected_request_header: bytes,
) -> None:
    async def mock_get_new_proxy_url() -> ParseResult:
        return urlparse(proxy_url)

    monkeypatch.setattr(middleware, '_get_new_proxy_url', mock_get_new_proxy_url)

    if expected_exception is None:
        await middleware.process_request(dummy_request, spider)
        assert dummy_request.meta['proxy'] == proxy_url
        assert dummy_request.headers[b'Proxy-Authorization'] == expected_request_header
    else:
        with pytest.raises(expected_exception):
            await middleware.process_request(dummy_request, spider)


@pytest.mark.parametrize(
    ('exception', 'none_returned_values_is_expected'),
    [
        (TunnelError(), False),
        (ValueError(), True),
    ],
)
def test__process_exception(
    middleware: ApifyHttpProxyMiddleware,
    spider: DummySpider,
    dummy_request: Request,
    exception: Exception,
    *,
    none_returned_values_is_expected: bool,
) -> None:
    returned_value = middleware.process_exception(dummy_request, exception, spider)

    if none_returned_values_is_expected:
        assert returned_value is None
    else:
        assert returned_value == dummy_request
21 changes: 21 additions & 0 deletions tests/unit/scrapy/test_utils.py
@@ -0,0 +1,21 @@
from __future__ import annotations

import pytest

from apify.scrapy import get_basic_auth_header


@pytest.mark.parametrize(
    ('username', 'password', 'expected_auth_header'),
    [
        ('username', 'password', b'Basic dXNlcm5hbWU6cGFzc3dvcmQ='),
        ('john_smith', 'secret_password_123', b'Basic am9obl9zbWl0aDpzZWNyZXRfcGFzc3dvcmRfMTIz'),
    ],
)
def test__get_basic_auth_header(
    username: str,
    password: str,
    expected_auth_header: bytes,
) -> None:
    auth_header = get_basic_auth_header(username, password)
    assert auth_header == expected_auth_header
