From 1ca82f8ba8ee7793b32f697058e64e95cfab379f Mon Sep 17 00:00:00 2001
From: Vlada Dusek
Date: Fri, 23 Feb 2024 11:25:59 +0100
Subject: [PATCH] Update of Scrapy integration, fixing downloader middlewares (#186)

---
 CHANGELOG.md                                   |  10 +-
 pyproject.toml                                 |   2 +-
 src/apify/scrapy/middlewares/__init__.py       |   1 -
 src/apify/scrapy/middlewares/apify_retry.py    | 117 ------------------
 src/apify/scrapy/requests.py                   |  23 +++-
 src/apify/scrapy/scheduler.py                  |  13 +-
 src/apify/scrapy/utils.py                      |  13 +-
 .../scrapy/utils/test_apply_apify_settings.py  |  10 +-
 8 files changed, 47 insertions(+), 142 deletions(-)
 delete mode 100644 src/apify/scrapy/middlewares/apify_retry.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 548a2394..c01e00bc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,8 +1,14 @@
 # Changelog
 
-## [1.5.6](../../releases/tag/v1.5.6) - Unreleased
+## [1.6.0](../../releases/tag/v1.6.0) - Unreleased
 
-...
+### Fixed
+
+- Fixed the Scrapy integration: updates to `ApifyScheduler`, `to_apify_request`, and `apply_apify_settings`.
+
+### Removed
+
+- Removed `ApifyRetryMiddleware` in favor of Scrapy's default `RetryMiddleware`.
 
 ## [1.5.5](../../releases/tag/v1.5.5) - 2024-02-01
 
diff --git a/pyproject.toml b/pyproject.toml
index 43e00000..541bbf16 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "apify"
-version = "1.5.6"
+version = "1.6.0"
 description = "Apify SDK for Python"
 readme = "README.md"
 license = { text = "Apache Software License" }
diff --git a/src/apify/scrapy/middlewares/__init__.py b/src/apify/scrapy/middlewares/__init__.py
index d022da54..257252d5 100644
--- a/src/apify/scrapy/middlewares/__init__.py
+++ b/src/apify/scrapy/middlewares/__init__.py
@@ -1,2 +1 @@
 from .apify_proxy import ApifyHttpProxyMiddleware
-from .apify_retry import ApifyRetryMiddleware
diff --git a/src/apify/scrapy/middlewares/apify_retry.py b/src/apify/scrapy/middlewares/apify_retry.py
deleted file mode 100644
index a5eb5a73..00000000
--- a/src/apify/scrapy/middlewares/apify_retry.py
+++ /dev/null
@@ -1,117 +0,0 @@
-from __future__ import annotations
-
-import traceback
-from typing import TYPE_CHECKING, Any
-
-try:
-    from scrapy import Spider  # noqa: TCH002
-    from scrapy.downloadermiddlewares.retry import RetryMiddleware
-    from scrapy.http import Request, Response  # noqa: TCH002
-    from scrapy.utils.response import response_status_message
-except ImportError as exc:
-    raise ImportError(
-        'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
-    ) from exc
-
-from apify.actor import Actor
-from apify.scrapy.requests import to_apify_request
-from apify.scrapy.utils import nested_event_loop, open_queue_with_custom_client
-
-if TYPE_CHECKING:
-    from apify.storages import RequestQueue
-
-
-class ApifyRetryMiddleware(RetryMiddleware):
-    """The default Scrapy retry middleware enriched with Apify's Request Queue interaction."""
-
-    def __init__(self: ApifyRetryMiddleware, *args: Any, **kwargs: Any) -> None:
-        """Create a new instance."""
-        super().__init__(*args, **kwargs)
-        try:
-            self._rq: RequestQueue = nested_event_loop.run_until_complete(open_queue_with_custom_client())
-        except BaseException:
-            traceback.print_exc()
-            raise
-
-    def process_response(
-        self: ApifyRetryMiddleware,
-        request: Request,
-        response: Response,
-        spider: Spider,
-    ) -> Request | Response | None:
-        """Process the response and decide whether the request should be retried.
-
-        Args:
-            request: The request that was sent.
-            response: The response that was received.
-            spider: The Spider that sent the request.
-
-        Returns:
-            The response, or a new request if the request should be retried.
-        """
-        if not isinstance(request.url, str):
-            raise TypeError(f'Expected request.url to be a string, got {type(request.url)} instead.')
-
-        # Robots requests are bypassed directly, they don't go through a Scrapy Scheduler, and also through our
-        # Request Queue. Check the scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware for details.
-        if request.url.endswith('robots.txt'):
-            return response
-
-        try:
-            return nested_event_loop.run_until_complete(self._handle_retry_logic(request, response, spider))
-        except BaseException:
-            traceback.print_exc()
-            raise
-
-    def process_exception(
-        self: ApifyRetryMiddleware,
-        request: Request,
-        exception: BaseException,
-        spider: Spider,
-    ) -> Request | Response | None:
-        """Handle the exception and decide whether the request should be retried.
-
-        Args:
-            request: The request that encountered an exception.
-            exception: The exception that occurred.
-            spider: The Spider that sent the request.
-
-        Returns:
-            None: The request will not be retried.
-        """
-        Actor.log.debug(f'ApifyRetryMiddleware.process_exception was called (request={request}, exception={exception})...')
-        apify_request = to_apify_request(request, spider=spider)
-
-        # Unlike the default Scrapy RetryMiddleware, we do not attempt to retry requests on exception.
-        # It was causing issues with the Apify request queue, because the request was not marked as handled and was
-        # stucked in the request queue forever - Scrapy crawling never finished. The solution would be to completely
-        # rewrite the retry logic from default RetryMiddleware.
-        try:
-            nested_event_loop.run_until_complete(self._rq.mark_request_as_handled(apify_request))
-        except BaseException:
-            traceback.print_exc()
-            raise
-
-        return None
-
-    async def _handle_retry_logic(
-        self: ApifyRetryMiddleware,
-        request: Request,
-        response: Response,
-        spider: Spider,
-    ) -> Request | Response | None:
-        """Handle the retry logic of the request."""
-        Actor.log.debug(f'ApifyRetryMiddleware.handle_retry_logic was called (scrapy_request={request})...')
-        apify_request = to_apify_request(request, spider=spider)
-
-        if request.meta.get('dont_retry', False):
-            await self._rq.mark_request_as_handled(apify_request)
-            return response
-
-        if response.status in self.retry_http_codes:
-            await self._rq.reclaim_request(apify_request)
-            reason = response_status_message(response.status)
-            return self._retry(request, reason, spider) or response
-
-        await self._rq.mark_request_as_handled(apify_request)
-        return response
diff --git a/src/apify/scrapy/requests.py b/src/apify/scrapy/requests.py
index 3ccea9fd..6fde057c 100644
--- a/src/apify/scrapy/requests.py
+++ b/src/apify/scrapy/requests.py
@@ -16,6 +16,14 @@
 from apify.actor import Actor
 
 
+def _is_request_produced_by_middleware(scrapy_request: Request) -> bool:
+    """Returns True if the Scrapy request was produced by a downloader middleware, otherwise False.
+
+    Works for RetryMiddleware and RedirectMiddleware.
+    """
+    return bool(scrapy_request.meta.get('redirect_times')) or bool(scrapy_request.meta.get('retry_times'))
+
+
 def to_apify_request(scrapy_request: Request, spider: Spider) -> dict:
     """Convert a Scrapy request to an Apify request.
 
@@ -48,13 +56,16 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> dict:
             f'scrapy_request.headers is not an instance of the scrapy.http.headers.Headers class, scrapy_request.headers = {scrapy_request.headers}',
         )
 
-    # Add 'id' to the apify_request
-    if scrapy_request.meta.get('apify_request_id'):
-        apify_request['id'] = scrapy_request.meta['apify_request_id']
+    if _is_request_produced_by_middleware(scrapy_request):
+        apify_request['uniqueKey'] = scrapy_request.url
+    else:
+        # Add 'id' to the apify_request
+        if scrapy_request.meta.get('apify_request_id'):
+            apify_request['id'] = scrapy_request.meta['apify_request_id']
 
-    # Add 'uniqueKey' to the apify_request
-    if scrapy_request.meta.get('apify_request_unique_key'):
-        apify_request['uniqueKey'] = scrapy_request.meta['apify_request_unique_key']
+        # Add 'uniqueKey' to the apify_request
+        if scrapy_request.meta.get('apify_request_unique_key'):
+            apify_request['uniqueKey'] = scrapy_request.meta['apify_request_unique_key']
 
     # Serialize the Scrapy Request and store it in the apify_request.
     # - This process involves converting the Scrapy Request object into a dictionary, encoding it to base64,
diff --git a/src/apify/scrapy/scheduler.py b/src/apify/scrapy/scheduler.py
index 1e55fc89..dcf3fd66 100644
--- a/src/apify/scrapy/scheduler.py
+++ b/src/apify/scrapy/scheduler.py
@@ -70,6 +70,8 @@ def has_pending_requests(self: ApifyScheduler) -> bool:
     def enqueue_request(self: ApifyScheduler, request: Request) -> bool:
         """Add a request to the scheduler.
 
+        This could be called either from a spider or from a downloader middleware (e.g. redirect, retry, ...).
+
         Args:
             request: The request to add to the scheduler.
 
@@ -94,7 +96,7 @@ def enqueue_request(self: ApifyScheduler, request: Request) -> bool:
             traceback.print_exc()
             raise
 
-        Actor.log.debug(f'[{call_id}]: apify_request was added to the RQ (apify_request={apify_request})')
+        Actor.log.debug(f'[{call_id}]: rq.add_request.result={result}...')
         return bool(result['wasAlreadyPresent'])
 
     def next_request(self: ApifyScheduler) -> Request | None:
@@ -109,6 +111,7 @@ def next_request(self: ApifyScheduler) -> Request | None:
         if not isinstance(self._rq, RequestQueue):
             raise TypeError('self._rq must be an instance of the RequestQueue class')
 
+        # Fetch the next request from the Request Queue
         try:
             apify_request = nested_event_loop.run_until_complete(self._rq.fetch_next_request())
         except BaseException:
@@ -123,6 +126,14 @@ def next_request(self: ApifyScheduler) -> Request | None:
         if not isinstance(self.spider, Spider):
             raise TypeError('self.spider must be an instance of the Spider class')
 
+        # Let the Request Queue know that the request is being handled. Every request should be marked as handled;
+        # retrying is handled by Scrapy's RetryMiddleware.
+        try:
+            nested_event_loop.run_until_complete(self._rq.mark_request_as_handled(apify_request))
+        except BaseException:
+            traceback.print_exc()
+            raise
+
         scrapy_request = to_scrapy_request(apify_request, spider=self.spider)
         Actor.log.debug(
             f'[{call_id}]: apify_request was transformed to the scrapy_request which is gonna be returned (scrapy_request={scrapy_request})',
diff --git a/src/apify/scrapy/utils.py b/src/apify/scrapy/utils.py
index 66e58e9b..405e59a3 100644
--- a/src/apify/scrapy/utils.py
+++ b/src/apify/scrapy/utils.py
@@ -59,16 +59,13 @@ def apply_apify_settings(*, settings: Settings | None = None, proxy_config: dict
     # ensuring it is executed as the final step in the pipeline sequence
     settings['ITEM_PIPELINES']['apify.scrapy.pipelines.ActorDatasetPushPipeline'] = 1000
 
-    # Disable the default RobotsTxtMiddleware, Apify's custom scheduler already handles robots.txt
-    settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware'] = None
+    # Disable the default AjaxCrawlMiddleware since it can be problematic with Apify. It can return a new request
+    # during process_response, but currently we have no way of detecting it and handling it properly.
+    settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware'] = None
 
-    # Disable the default HttpProxyMiddleware and add ApifyHttpProxyMiddleware
+    # Replace the default HttpProxyMiddleware with ApifyHttpProxyMiddleware
     settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware'] = None
-    settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyHttpProxyMiddleware'] = 950
-
-    # Disable the default RetryMiddleware and add ApifyRetryMiddleware with the highest integer (1000)
-    settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.retry.RetryMiddleware'] = None
-    settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyRetryMiddleware'] = 1000
+    settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyHttpProxyMiddleware'] = 750
 
     # Store the proxy configuration
     settings['APIFY_PROXY_SETTINGS'] = proxy_config
diff --git a/tests/unit/scrapy/utils/test_apply_apify_settings.py b/tests/unit/scrapy/utils/test_apply_apify_settings.py
index 9de69379..42de1e9b 100644
--- a/tests/unit/scrapy/utils/test_apply_apify_settings.py
+++ b/tests/unit/scrapy/utils/test_apply_apify_settings.py
@@ -34,7 +34,6 @@ def test__apply_apify_settings__update_downloader_middlewares() -> None:
         'DOWNLOADER_MIDDLEWARES': {
             'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': 123,
             'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 234,
-            'scrapy.downloadermiddlewares.retry.RetryMiddleware': 345,
             'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 543,
         },
     }
@@ -42,12 +41,11 @@ def test__apply_apify_settings__update_downloader_middlewares() -> None:
     new_settings = apply_apify_settings(settings=settings)
 
     assert new_settings.get('DOWNLOADER_MIDDLEWARES') == {
-        'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': None,
-        'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
-        'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
-        'apify.scrapy.middlewares.ApifyHttpProxyMiddleware': 950,
-        'apify.scrapy.middlewares.ApifyRetryMiddleware': 1000,
+        'apify.scrapy.middlewares.ApifyHttpProxyMiddleware': 750,
+        'scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware': None,
         'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 543,
+        'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
+        'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': 123,
     }
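
A minimal sketch of the new `to_apify_request` behavior for middleware-produced requests, assuming the `apify[scrapy]` extra is installed; `ExampleSpider` and the example URL are hypothetical:

    from scrapy import Request, Spider

    from apify.scrapy.requests import to_apify_request


    class ExampleSpider(Spider):  # hypothetical, only needed to satisfy the signature
        name = 'example'


    spider = ExampleSpider()

    # RetryMiddleware and RedirectMiddleware stamp 'retry_times' / 'redirect_times'
    # into request.meta, which _is_request_produced_by_middleware() checks.
    retried = Request('https://example.com', meta={'retry_times': 1})

    apify_request = to_apify_request(retried, spider=spider)

    # For middleware-produced requests, 'uniqueKey' now falls back to the URL instead
    # of the 'apify_request_unique_key' value stored in meta.
    print(apify_request['uniqueKey'])  # expected: 'https://example.com'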
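The scheduler change can be read as the following flow, sketched here directly against the SDK's RequestQueue API rather than through `ApifyScheduler`; the `main` wrapper is an assumption, only `fetch_next_request` and `mark_request_as_handled` come from this patch:

    import asyncio

    from apify import Actor


    async def main() -> None:
        async with Actor:
            rq = await Actor.open_request_queue()

            apify_request = await rq.fetch_next_request()
            if apify_request is not None:
                # Mirrors the new ApifyScheduler.next_request(): every fetched request
                # is immediately marked as handled; a retry re-enters the queue as a
                # fresh request (URL-based uniqueKey) instead of being reclaimed.
                await rq.mark_request_as_handled(apify_request)


    asyncio.run(main())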
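Finally, a hedged usage sketch for `apply_apify_settings`, imported from `apify.scrapy.utils` as in this patch; the `proxy_config` value and the custom middleware path are hypothetical examples:

    from scrapy.settings import Settings

    from apify.scrapy.utils import apply_apify_settings

    base = Settings({'DOWNLOADER_MIDDLEWARES': {'myproject.middlewares.CustomMiddleware': 543}})

    new_settings = apply_apify_settings(
        settings=base,
        proxy_config={'useApifyProxy': True},  # assumed proxy-config shape
    )

    # ApifyHttpProxyMiddleware replaces the built-in HttpProxyMiddleware at priority 750,
    # AjaxCrawlMiddleware is disabled, and other middlewares are kept as configured.
    print(new_settings.get('DOWNLOADER_MIDDLEWARES'))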