-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Move Scrapy-related code from Actor template to SDK (#134)
- Loading branch information
Showing
22 changed files
with
465 additions
and
36 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
from .middlewares import ApifyRetryMiddleware | ||
from .pipelines import ActorDatasetPushPipeline | ||
from .scheduler import ApifyScheduler | ||
from .utils import get_running_event_loop_id, open_queue_with_custom_client, to_apify_request, to_scrapy_request |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
import traceback | ||
from typing import Union | ||
|
||
try: | ||
from scrapy import Spider | ||
from scrapy.downloadermiddlewares.retry import RetryMiddleware | ||
from scrapy.exceptions import IgnoreRequest | ||
from scrapy.http import Request, Response | ||
from scrapy.utils.response import response_status_message | ||
except ImportError as exc: | ||
raise ImportError( | ||
'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".', | ||
) from exc | ||
|
||
from ..actor import Actor | ||
from ..storages import RequestQueue | ||
from .utils import nested_event_loop, open_queue_with_custom_client, to_apify_request | ||
|
||
|
||
class ApifyRetryMiddleware(RetryMiddleware):
    """The default Scrapy retry middleware enriched with Apify's Request Queue interaction."""

    def __init__(self, *args: list, **kwargs: dict) -> None:
        """Create a new instance.

        Opens the Actor's Request Queue on a nested event loop, because Scrapy's
        middleware hooks are synchronous while the Apify SDK is asynchronous.

        Raises:
            BaseException: If the Request Queue cannot be opened.
        """
        super().__init__(*args, **kwargs)
        try:
            self._rq: RequestQueue = nested_event_loop.run_until_complete(open_queue_with_custom_client())
        except BaseException:
            traceback.print_exc()
            # Re-raise: swallowing the error would leave `self._rq` unset and every
            # later hook would fail with a confusing AttributeError instead.
            raise

    def __del__(self) -> None:
        """Before deleting the instance, close the nested event loop."""
        nested_event_loop.stop()
        nested_event_loop.close()

    def process_response(self, request: Request, response: Response, spider: Spider) -> Union[Request, Response]:
        """Process the response and decide whether the request should be retried.

        Args:
            request: The request that was sent.
            response: The response that was received.
            spider: The Spider that sent the request.

        Returns:
            The response, or a new request if the request should be retried.
        """
        # Robots requests are bypassed directly, they don't go through a Scrapy Scheduler, and also through our
        # Request Queue. Check the scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware for details.
        assert isinstance(request.url, str)
        if request.url.endswith('robots.txt'):
            return response

        try:
            return nested_event_loop.run_until_complete(self._handle_retry_logic(request, response, spider))
        except BaseException:
            traceback.print_exc()
            # Re-raise: previously a local was returned here even when the coroutine
            # failed, which raised UnboundLocalError and masked the original error.
            raise

    def process_exception(
        self,
        request: Request,
        exception: BaseException,
        spider: Spider,
    ) -> Union[Request, Response, None]:
        """Handle the exception and decide whether the request should be retried.

        Args:
            request: The request that caused the exception.
            exception: The exception that was raised.
            spider: The Spider that sent the request.

        Returns:
            Whatever the base RetryMiddleware decides (a retry request, or None).
        """
        Actor.log.debug(f'ApifyRetryMiddleware.process_exception was called (scrapy_request={request})...')
        apify_request = to_apify_request(request, spider=spider)

        if isinstance(exception, IgnoreRequest):
            try:
                nested_event_loop.run_until_complete(self._rq.mark_request_as_handled(apify_request))
            except BaseException:
                traceback.print_exc()
        else:
            # Wrapped in try/except for consistency with the branch above, so a
            # Request Queue failure is reported rather than propagated unannotated.
            try:
                nested_event_loop.run_until_complete(self._rq.reclaim_request(apify_request))
            except BaseException:
                traceback.print_exc()

        return super().process_exception(request, exception, spider)

    async def _handle_retry_logic(
        self,
        request: Request,
        response: Response,
        spider: Spider,
    ) -> Union[Request, Response]:
        """Handle the retry logic of the request.

        Marks the request as handled in the Request Queue when no retry is needed,
        or reclaims it (so it can be processed again) when a retry is warranted.

        Args:
            request: The request that was sent.
            response: The response that was received.
            spider: The Spider that sent the request.

        Returns:
            The response, or a new request if the request should be retried.
        """
        Actor.log.debug(f'ApifyRetryMiddleware.handle_retry_logic was called (scrapy_request={request})...')
        apify_request = to_apify_request(request, spider=spider)

        if request.meta.get('dont_retry', False):
            await self._rq.mark_request_as_handled(apify_request)
            return response

        if response.status in self.retry_http_codes:
            await self._rq.reclaim_request(apify_request)
            reason = response_status_message(response.status)
            return self._retry(request, reason, spider) or response

        await self._rq.mark_request_as_handled(apify_request)
        return response
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
from itemadapter import ItemAdapter | ||
|
||
try: | ||
from scrapy import Item, Spider | ||
except ImportError as exc: | ||
raise ImportError( | ||
'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".', | ||
) from exc | ||
|
||
from ..actor import Actor | ||
|
||
|
||
class ActorDatasetPushPipeline:
    """Scrapy item pipeline that stores scraped items in the Actor's default dataset.

    Intended to be enabled only when the Scrapy project runs as an Apify Actor.
    """

    async def process_item(self, item: Item, spider: Spider) -> Item:
        """Push the given Scrapy item to the Actor's default dataset and pass it on.

        Args:
            item: The scraped item.
            spider: The spider that produced the item.

        Returns:
            The unmodified item, so downstream pipelines can keep processing it.
        """
        serialized = ItemAdapter(item).asdict()
        Actor.log.debug(f'Pushing item={serialized} produced by spider={spider} to the dataset.')
        await Actor.push_data(serialized)
        return item
Oops, something went wrong.