Skip to content

Commit

Permalink
Add ApifyHttpProxyMiddleware for managing proxies
Browse files Browse the repository at this point in the history
  • Loading branch information
vdusek committed Dec 20, 2023
1 parent 32f5d0d commit 19e2b02
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 4 deletions.
1 change: 0 additions & 1 deletion src/apify/scrapy/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from .middlewares import ApifyRetryMiddleware
from .pipelines import ActorDatasetPushPipeline
from .scheduler import ApifyScheduler
from .utils import get_running_event_loop_id, open_queue_with_custom_client, to_apify_request, to_scrapy_request
2 changes: 2 additions & 0 deletions src/apify/scrapy/middlewares/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .apify_proxy import ApifyHttpProxyMiddleware
from .apify_retry import ApifyRetryMiddleware
44 changes: 44 additions & 0 deletions src/apify/scrapy/middlewares/apify_proxy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from __future__ import annotations

import base64
from typing import TYPE_CHECKING
from urllib.parse import unquote

from scrapy.exceptions import NotConfigured
from scrapy.utils.python import to_bytes

from apify import Actor

if TYPE_CHECKING:
from scrapy import Request, Spider
from scrapy.crawler import Crawler


class ApifyHttpProxyMiddleware:
    """Scrapy downloader middleware that routes requests through an Apify proxy.

    The proxy configuration is read from the Scrapy setting ``PROXY_SETTINGS``,
    which is expected to be a dict containing the keys ``url``, ``username``
    and ``password``.
    """

    def __init__(self: ApifyHttpProxyMiddleware, proxy_settings: dict, auth_encoding: str = 'latin-1') -> None:
        """Initialize the middleware.

        Args:
            proxy_settings: Proxy configuration dict with 'url', 'username' and 'password' keys.
            auth_encoding: Encoding used when building the Proxy-Authorization header value.
                'latin-1' matches Scrapy's own HttpProxyMiddleware default.
        """
        self.auth_encoding = auth_encoding
        self.proxy_settings = proxy_settings

    @classmethod
    def from_crawler(cls: type[ApifyHttpProxyMiddleware], crawler: Crawler) -> ApifyHttpProxyMiddleware:
        """Create the middleware instance from a Scrapy crawler.

        Args:
            crawler: The crawler whose settings provide 'PROXY_SETTINGS'.

        Raises:
            NotConfigured: If 'PROXY_SETTINGS' is missing or empty, which tells
                Scrapy to disable this middleware.

        Returns:
            A configured ApifyHttpProxyMiddleware instance.
        """
        proxy_settings: dict | None = crawler.settings.get('PROXY_SETTINGS')

        if not proxy_settings:
            # Fixed message: the original said "is not NotConfigured", a garbled
            # double negative — the middleware is in fact being disabled here.
            Actor.log.warning('Field "PROXY_SETTINGS" is missing in the settings. ApifyHttpProxyMiddleware is not configured.')
            raise NotConfigured

        return cls(proxy_settings)

    def process_request(self: ApifyHttpProxyMiddleware, request: Request, spider: Spider) -> None:
        """Attach the proxy URL and Basic auth header to an outgoing request.

        Args:
            request: The Scrapy request being processed.
            spider: The spider issuing the request (unused).
        """
        if self.proxy_settings:
            request.meta['proxy'] = self.proxy_settings['url']
            creds = self._basic_auth_header()
            request.headers[b'Proxy-Authorization'] = b'Basic ' + creds

        Actor.log.info(f'ApifyHttpProxyMiddleware: request.meta={request.meta}')

    def _basic_auth_header(self: ApifyHttpProxyMiddleware) -> bytes:
        """Build the base64-encoded 'username:password' credentials for Basic auth.

        Username and password are percent-decoded before encoding, since proxy
        URLs commonly carry URL-quoted credentials.
        """
        username = self.proxy_settings['username']
        password = self.proxy_settings['password']
        # str.encode replaces scrapy.utils.python.to_bytes — identical behavior
        # for str input, one less non-stdlib helper in this hot path.
        user_pass = f'{unquote(username)}:{unquote(password)}'.encode(self.auth_encoding)
        return base64.b64encode(user_pass)
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,14 @@
'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
) from exc

from ..actor import Actor
from .utils import nested_event_loop, open_queue_with_custom_client, to_apify_request
from ...actor import Actor
from ..utils import nested_event_loop, open_queue_with_custom_client, to_apify_request

if TYPE_CHECKING:
from scrapy import Spider
from scrapy.http import Request, Response

from ..storages import RequestQueue
from ...storages import RequestQueue


class ApifyRetryMiddleware(RetryMiddleware):
Expand Down

0 comments on commit 19e2b02

Please sign in to comment.