Add Scrapy ApifyHttpProxyMiddleware for managing proxies
Closes: #255
vdusek committed Dec 21, 2023
1 parent 50eff8b commit 1146914
Showing 6 changed files with 47 additions and 17 deletions.
11 changes: 10 additions & 1 deletion templates/python-scrapy/.actor/input_schema.json
@@ -11,7 +11,16 @@
{ "url": "https://apify.com" }
],
"editor": "requestListSources"
},
"proxyConfiguration": {
"sectionCaption": "Proxy and HTTP configuration",
"title": "Proxy configuration",
"type": "object",
"description": "Specifies proxy servers that will be used by the scraper in order to hide its origin.",
"editor": "proxy",
"prefill": { "useApifyProxy": true },
"default": { "useApifyProxy": true }
}
},
"required": ["start_urls"]
"required": ["start_urls", "proxyConfiguration"]
}
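At run time this new input field arrives in the Actor input as a plain object such as { "useApifyProxy": true }. A minimal, hedged sketch of how an Actor could resolve that object into a concrete proxy URL with the Apify Python SDK (the helper names Actor.create_proxy_configuration and new_url are assumed from apify 1.x and should be verified against the pinned SDK version):

from __future__ import annotations

from apify import Actor


async def resolve_proxy_url() -> str | None:
    # Call inside `async with Actor:`; 'proxyConfiguration' holds the object defined above.
    actor_input = await Actor.get_input() or {}
    proxy_input = actor_input.get('proxyConfiguration', {'useApifyProxy': True})

    # Build a ProxyConfiguration from the input object and ask it for a concrete proxy URL.
    proxy_config = await Actor.create_proxy_configuration(actor_proxy_input=proxy_input)
    if proxy_config is None:
        return None
    return await proxy_config.new_url()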
2 changes: 1 addition & 1 deletion templates/python-scrapy/requirements.txt
@@ -1,6 +1,6 @@
# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify[scrapy] ~= 1.4.0
apify[scrapy] ~= 1.4.1
nest-asyncio ~= 1.5.8
scrapy ~= 2.11.0
3 changes: 3 additions & 0 deletions templates/python-scrapy/src/__main__.py
@@ -87,6 +87,9 @@ def new_configure_logging(*args: Any, **kwargs: Any) -> None:
# messages, especially when running on the platform.
configure_logger('httpx', 'WARNING')

# # tmp
# configure_logger('apify', 'DEBUG')


scrapy_logging.configure_logging = new_configure_logging

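The two commented-out lines are a temporary switch for verbose SDK logging while debugging the new proxy handling. A rough standard-library equivalent, shown only as a sketch (the configure_logger helper itself is defined earlier in this file):

import logging

# Enable verbose output from the Apify SDK logger while debugging, equivalent in
# spirit to uncommenting the temporary configure_logger('apify', 'DEBUG') call above.
logging.getLogger('apify').setLevel(logging.DEBUG)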
35 changes: 24 additions & 11 deletions templates/python-scrapy/src/main.py
@@ -34,6 +34,8 @@
Issue: https://github.com/apify/actor-templates/issues/202
"""

from __future__ import annotations

from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
from scrapy.utils.project import get_project_settings
@@ -47,7 +49,7 @@
LOCAL_DEFAULT_START_URLS = [{'url': 'https://apify.com'}]


def _get_scrapy_settings() -> Settings:
def _get_scrapy_settings(proxy_cfg: dict | None = None) -> Settings:
"""
Get Scrapy project settings with custom configurations.
@@ -58,16 +60,26 @@ def _get_scrapy_settings() -> Settings:
"""
settings = get_project_settings()

# Use ApifyScheduler as the scheduler
settings['SCHEDULER'] = 'apify.scrapy.scheduler.ApifyScheduler'

# Add the ActorDatasetPushPipeline into the item pipelines, assigning it the highest integer (1000),
# ensuring it is executed as the final step in the pipeline sequence
settings['ITEM_PIPELINES']['apify.scrapy.pipelines.ActorDatasetPushPipeline'] = 1000

# Disable the default RobotsTxtMiddleware, Apify's custom scheduler already handles robots.txt
settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware'] = None

# Disable the default HttpProxyMiddleware and add ApifyHttpProxyMiddleware
settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware'] = None
settings['DOWNLOADER_MIDDLEWARES']['src.middlewares.ApifyHttpProxyMiddleware'] = 950

# Disable the default RetryMiddleware and add ApifyRetryMiddleware with the highest integer (1000)
settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.retry.RetryMiddleware'] = None
settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyRetryMiddleware'] = 1000

# Use ApifyScheduler as the scheduler
settings['SCHEDULER'] = 'apify.scrapy.scheduler.ApifyScheduler'
# Store the proxy configuration
settings['APIFY_PROXY_SETTINGS'] = proxy_cfg

return settings

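With these changes, Scrapy's built-in HttpProxyMiddleware is switched off and proxying is delegated to ApifyHttpProxyMiddleware at priority 950; since Scrapy calls process_request in ascending priority order, it runs before the template's TitleDownloaderMiddleware (999) and ApifyRetryMiddleware (1000). The actual ApifyHttpProxyMiddleware implementation is not part of the hunks shown here, so the following is only an illustrative sketch of how such a middleware could consume the APIFY_PROXY_SETTINGS value stored above (the placeholder proxy URL is an assumption, not the template's real logic):

from __future__ import annotations

from scrapy import Request, Spider
from scrapy.crawler import Crawler


class ProxyMiddlewareSketch:
    """Illustrative only: attaches a proxy URL to every outgoing request."""

    def __init__(self, proxy_url: str | None) -> None:
        self._proxy_url = proxy_url

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> ProxyMiddlewareSketch:
        # Read back the proxy configuration stored by _get_scrapy_settings().
        proxy_settings = crawler.settings.get('APIFY_PROXY_SETTINGS') or {}
        # A real middleware would resolve this object into a proxy URL via the Apify SDK;
        # a static placeholder keeps the sketch self-contained.
        proxy_url = 'http://proxy.example.com:8000' if proxy_settings.get('useApifyProxy') else None
        return cls(proxy_url)

    def process_request(self, request: Request, spider: Spider) -> None:
        # Scrapy's own HttpProxyMiddleware is disabled, so the proxy is set here instead.
        if self._proxy_url is not None:
            request.meta['proxy'] = self._proxy_url
        return None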
@@ -79,18 +91,19 @@ async def main() -> None:
async with Actor:
Actor.log.info('Actor is being executed...')

# Process Actor input - you can customize logic for handling Actor input here
# The `start_urls` option from Actor input is combined with Scrapy's `start_urls` from your spiders
# Process Actor input
actor_input = await Actor.get_input() or {}
start_urls = [start_url.get('url') for start_url in actor_input.get('start_urls', LOCAL_DEFAULT_START_URLS)]

# Get Scrapy project settings with custom configurations
settings = _get_scrapy_settings()
start_urls = actor_input.get('startUrls', LOCAL_DEFAULT_START_URLS)
proxy_configuration = actor_input.get('proxyConfiguration')

# Add start URLs to the request queue
rq = await Actor.open_request_queue()
for url in start_urls:
await rq.add_request({'url': url, 'method': 'GET'})
for start_url in start_urls:
url = start_url.get('url')
await rq.add_request(request={'url': url, 'method': 'GET'})

# Get Scrapy project settings with custom configurations
settings = _get_scrapy_settings(proxy_configuration)

# Execute the spider using Scrapy CrawlerProcess
process = CrawlerProcess(settings, install_root_handler=False)
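The requests pushed to the queue above carry only url and method. The Apify request format also allows optional fields such as userData for passing metadata along with a request; a hedged illustration (field names taken from the Apify request format, not from this diff, and worth verifying against the SDK version pinned in requirements.txt):

from apify import Actor


async def enqueue_with_metadata() -> None:
    async with Actor:
        rq = await Actor.open_request_queue()
        # 'userData' is an optional field of the Apify request format for carrying
        # arbitrary metadata with a request (assumption; verify with the pinned SDK).
        await rq.add_request(request={
            'url': 'https://apify.com/store',
            'method': 'GET',
            'userData': {'label': 'LISTING'},
        })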
9 changes: 7 additions & 2 deletions templates/python-scrapy/src/middlewares.py
@@ -17,6 +17,8 @@
from scrapy.crawler import Crawler
from scrapy.http import Response

from apify import Actor

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter

@@ -76,7 +78,8 @@ def process_start_requests(
yield r

def spider_opened(self, spider: Spider) -> None:
spider.logger.info('TitleSpiderMiddleware: Spider opened: %s', spider.name)
# spider.logger.info('TitleSpiderMiddleware: Spider opened: %s', spider.name)
pass


class TitleDownloaderMiddleware:
@@ -101,6 +104,7 @@ def process_request(self, request: Request, spider: Spider) -> Request | Respons
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
# Actor.log.info(f'TitleDownloaderMiddleware.process_request was called (scrapy_request={request}, scrapy_request.meta={request.meta})...')
return None

def process_response(self, request: Request, response: Response, spider: Spider) -> Request | Response:
@@ -123,4 +127,5 @@ def process_exception(self, request: Request, exception: BaseException, spider:
pass

def spider_opened(self, spider: Spider) -> None:
spider.logger.info('TitleDownloaderMiddleware: Spider opened: %s', spider.name)
# spider.logger.info('TitleDownloaderMiddleware: Spider opened: %s', spider.name)
pass
4 changes: 2 additions & 2 deletions templates/python-scrapy/src/settings.py
@@ -19,8 +19,8 @@
'src.pipelines.TitleItemPipeline': 123,
}
SPIDER_MIDDLEWARES = {
'src.middlewares.TitleSpiderMiddleware': 543,
'src.middlewares.TitleSpiderMiddleware': 999,
}
DOWNLOADER_MIDDLEWARES = {
'src.middlewares.TitleDownloaderMiddleware': 543,
'src.middlewares.TitleDownloaderMiddleware': 999,
}

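Scrapy calls process_request on downloader middlewares in ascending priority order and process_response in descending order, so raising TitleDownloaderMiddleware from 543 to 999 slots it between src.middlewares.ApifyHttpProxyMiddleware (950) and apify.scrapy.middlewares.ApifyRetryMiddleware (1000) configured in src/main.py. A rough, illustrative view of the effective downloader-middleware entries once _get_scrapy_settings() has merged its changes into these project settings (values taken from the hunks in this commit):

# Effective DOWNLOADER_MIDDLEWARES after src/main.py patches the project settings;
# None disables an entry.
EFFECTIVE_DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,  # disabled
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,          # disabled
    'src.middlewares.ApifyHttpProxyMiddleware': 950,
    'src.middlewares.TitleDownloaderMiddleware': 999,
    'apify.scrapy.middlewares.ApifyRetryMiddleware': 1000,
}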