Add Scrapy ApifyHttpProxyMiddleware for managing proxies
Closes: #255
vdusek committed Dec 21, 2023
1 parent 50eff8b commit 1146914
Showing 6 changed files with 47 additions and 17 deletions.
11 changes: 10 additions & 1 deletion templates/python-scrapy/.actor/input_schema.json
@@ -11,7 +11,16 @@
{ "url": "https://apify.com" }
],
"editor": "requestListSources"
},
"proxyConfiguration": {
"sectionCaption": "Proxy and HTTP configuration",
"title": "Proxy configuration",
"type": "object",
"description": "Specifies proxy servers that will be used by the scraper in order to hide its origin.",
"editor": "proxy",
"prefill": { "useApifyProxy": true },
"default": { "useApifyProxy": true }
}
},
"required": ["start_urls"]
"required": ["start_urls", "proxyConfiguration"]
}
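At run time this new input field arrives in the Actor input as a plain object such as { "useApifyProxy": true }. A minimal, hedged sketch of how an Actor could resolve that object into a concrete proxy URL with the Apify Python SDK (the helper names Actor.create_proxy_configuration and new_url are assumed from apify 1.x and should be verified against the pinned SDK version):

from __future__ import annotations

from apify import Actor


async def resolve_proxy_url() -> str | None:
    # Call inside `async with Actor:`; 'proxyConfiguration' holds the object defined above.
    actor_input = await Actor.get_input() or {}
    proxy_input = actor_input.get('proxyConfiguration', {'useApifyProxy': True})

    # Build a ProxyConfiguration from the input object and ask it for a concrete proxy URL.
    proxy_config = await Actor.create_proxy_configuration(actor_proxy_input=proxy_input)
    if proxy_config is None:
        return None
    return await proxy_config.new_url()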
2 changes: 1 addition & 1 deletion templates/python-scrapy/requirements.txt
@@ -1,6 +1,6 @@
# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify[scrapy] ~= 1.4.0
apify[scrapy] ~= 1.4.1
nest-asyncio ~= 1.5.8
scrapy ~= 2.11.0
3 changes: 3 additions & 0 deletions templates/python-scrapy/src/__main__.py
@@ -87,6 +87,9 @@ def new_configure_logging(*args: Any, **kwargs: Any) -> None:
# messages, especially when running on the platform.
configure_logger('httpx', 'WARNING')

# # tmp
# configure_logger('apify', 'DEBUG')


scrapy_logging.configure_logging = new_configure_logging

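The two commented-out lines are a temporary switch for verbose SDK logging while debugging the new proxy handling. A rough standard-library equivalent, shown only as a sketch (the configure_logger helper itself is defined earlier in this file):

import logging

# Enable verbose output from the Apify SDK logger while debugging, equivalent in
# spirit to uncommenting the temporary configure_logger('apify', 'DEBUG') call above.
logging.getLogger('apify').setLevel(logging.DEBUG)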
35 changes: 24 additions & 11 deletions templates/python-scrapy/src/main.py
@@ -34,6 +34,8 @@
Issue: https://github.com/apify/actor-templates/issues/202
"""

from __future__ import annotations

from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
from scrapy.utils.project import get_project_settings
@@ -47,7 +49,7 @@
LOCAL_DEFAULT_START_URLS = [{'url': 'https://apify.com'}]


def _get_scrapy_settings() -> Settings:
def _get_scrapy_settings(proxy_cfg: dict | None = None) -> Settings:
"""
Get Scrapy project settings with custom configurations.
@@ -58,16 +60,26 @@ def _get_scrapy_settings() -> Settings:
"""
settings = get_project_settings()

# Use ApifyScheduler as the scheduler
settings['SCHEDULER'] = 'apify.scrapy.scheduler.ApifyScheduler'

# Add the ActorDatasetPushPipeline into the item pipelines, assigning it the highest integer (1000),
# ensuring it is executed as the final step in the pipeline sequence
settings['ITEM_PIPELINES']['apify.scrapy.pipelines.ActorDatasetPushPipeline'] = 1000

# Disable the default RobotsTxtMiddleware, Apify's custom scheduler already handles robots.txt
settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware'] = None

# Disable the default HttpProxyMiddleware and add ApifyHttpProxyMiddleware
settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware'] = None
settings['DOWNLOADER_MIDDLEWARES']['src.middlewares.ApifyHttpProxyMiddleware'] = 950

# Disable the default RetryMiddleware and add ApifyRetryMiddleware with the highest integer (1000)
settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.retry.RetryMiddleware'] = None
settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyRetryMiddleware'] = 1000

# Use ApifyScheduler as the scheduler
settings['SCHEDULER'] = 'apify.scrapy.scheduler.ApifyScheduler'
# Store the proxy configuration
settings['APIFY_PROXY_SETTINGS'] = proxy_cfg

return settings

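With these changes, Scrapy's built-in HttpProxyMiddleware is switched off and proxying is delegated to ApifyHttpProxyMiddleware at priority 950; since Scrapy calls process_request in ascending priority order, it runs before the template's TitleDownloaderMiddleware (999) and ApifyRetryMiddleware (1000). The actual ApifyHttpProxyMiddleware implementation is not part of the hunks shown here, so the following is only an illustrative sketch of how such a middleware could consume the APIFY_PROXY_SETTINGS value stored above (the placeholder proxy URL is an assumption, not the template's real logic):

from __future__ import annotations

from scrapy import Request, Spider
from scrapy.crawler import Crawler


class ProxyMiddlewareSketch:
    """Illustrative only: attaches a proxy URL to every outgoing request."""

    def __init__(self, proxy_url: str | None) -> None:
        self._proxy_url = proxy_url

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> ProxyMiddlewareSketch:
        # Read back the proxy configuration stored by _get_scrapy_settings().
        proxy_settings = crawler.settings.get('APIFY_PROXY_SETTINGS') or {}
        # A real middleware would resolve this object into a proxy URL via the Apify SDK;
        # a static placeholder keeps the sketch self-contained.
        proxy_url = 'http://proxy.example.com:8000' if proxy_settings.get('useApifyProxy') else None
        return cls(proxy_url)

    def process_request(self, request: Request, spider: Spider) -> None:
        # Scrapy's own HttpProxyMiddleware is disabled, so the proxy is set here instead.
        if self._proxy_url is not None:
            request.meta['proxy'] = self._proxy_url
        return None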
@@ -79,18 +91,19 @@ async def main() -> None:
async with Actor:
Actor.log.info('Actor is being executed...')

# Process Actor input - you can customize logic for handling Actor input here
# The `start_urls` option from Actor input is combined with Scrapy's `start_urls` from your spiders
# Process Actor input
actor_input = await Actor.get_input() or {}
start_urls = [start_url.get('url') for start_url in actor_input.get('start_urls', LOCAL_DEFAULT_START_URLS)]

# Get Scrapy project settings with custom configurations
settings = _get_scrapy_settings()
start_urls = actor_input.get('startUrls', LOCAL_DEFAULT_START_URLS)
proxy_configuration = actor_input.get('proxyConfiguration')

# Add start URLs to the request queue
rq = await Actor.open_request_queue()
for url in start_urls:
await rq.add_request({'url': url, 'method': 'GET'})
for start_url in start_urls:
url = start_url.get('url')
await rq.add_request(request={'url': url, 'method': 'GET'})

# Get Scrapy project settings with custom configurations
settings = _get_scrapy_settings(proxy_configuration)

# Execute the spider using Scrapy CrawlerProcess
process = CrawlerProcess(settings, install_root_handler=False)
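The requests pushed to the queue above carry only url and method. The Apify request format also allows optional fields such as userData for passing metadata along with a request; a hedged illustration (field names taken from the Apify request format, not from this diff, and worth verifying against the SDK version pinned in requirements.txt):

from apify import Actor


async def enqueue_with_metadata() -> None:
    async with Actor:
        rq = await Actor.open_request_queue()
        # 'userData' is an optional field of the Apify request format for carrying
        # arbitrary metadata with a request (assumption; verify with the pinned SDK).
        await rq.add_request(request={
            'url': 'https://apify.com/store',
            'method': 'GET',
            'userData': {'label': 'LISTING'},
        })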
9 changes: 7 additions & 2 deletions templates/python-scrapy/src/middlewares.py
@@ -17,6 +17,8 @@
from scrapy.crawler import Crawler
from scrapy.http import Response

from apify import Actor

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter

@@ -76,7 +78,8 @@ def process_start_requests(
yield r

def spider_opened(self, spider: Spider) -> None:
spider.logger.info('TitleSpiderMiddleware: Spider opened: %s', spider.name)
# spider.logger.info('TitleSpiderMiddleware: Spider opened: %s', spider.name)
pass


class TitleDownloaderMiddleware:
@@ -101,6 +104,7 @@ def process_request(self, request: Request, spider: Spider) -> Request | Respons
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
# Actor.log.info(f'TitleDownloaderMiddleware.process_request was called (scrapy_request={request}, scrapy_request.meta={request.meta})...')
return None

def process_response(self, request: Request, response: Response, spider: Spider) -> Request | Response:
@@ -123,4 +127,5 @@ def process_exception(self, request: Request, exception: BaseException, spider:
pass

def spider_opened(self, spider: Spider) -> None:
spider.logger.info('TitleDownloaderMiddleware: Spider opened: %s', spider.name)
# spider.logger.info('TitleDownloaderMiddleware: Spider opened: %s', spider.name)
pass
4 changes: 2 additions & 2 deletions templates/python-scrapy/src/settings.py
@@ -19,8 +19,8 @@
'src.pipelines.TitleItemPipeline': 123,
}
SPIDER_MIDDLEWARES = {
'src.middlewares.TitleSpiderMiddleware': 543,
'src.middlewares.TitleSpiderMiddleware': 999,
}
DOWNLOADER_MIDDLEWARES = {
'src.middlewares.TitleDownloaderMiddleware': 543,
'src.middlewares.TitleDownloaderMiddleware': 999,
}

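Scrapy calls process_request on downloader middlewares in ascending priority order and process_response in descending order, so raising TitleDownloaderMiddleware from 543 to 999 slots it between src.middlewares.ApifyHttpProxyMiddleware (950) and apify.scrapy.middlewares.ApifyRetryMiddleware (1000) configured in src/main.py. A rough, illustrative view of the effective downloader-middleware entries once _get_scrapy_settings() has merged its changes into these project settings (values taken from the hunks in this commit):

# Effective DOWNLOADER_MIDDLEWARES after src/main.py patches the project settings;
# None disables an entry.
EFFECTIVE_DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,  # disabled
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,          # disabled
    'src.middlewares.ApifyHttpProxyMiddleware': 950,
    'src.middlewares.TitleDownloaderMiddleware': 999,
    'apify.scrapy.middlewares.ApifyRetryMiddleware': 1000,
}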