From 4fb959e51d3c8aefbb787cccb0ed89f35d813b8f Mon Sep 17 00:00:00 2001
From: Josef Prochazka
Date: Tue, 11 Feb 2025 14:40:18 +0100
Subject: [PATCH 1/2] Set line length for docs-related code to 90

Update existing examples to be compliant.
---
 docs/01_overview/code/01_introduction.py      |  5 +-
 .../code/02_crawlee_beautifulsoup.py          |  3 +-
 docs/02_guides/code/02_crawlee_playwright.py  | 18 ++++-
 docs/02_guides/code/scrapy_src/__main__.py    | 74 ++++++++++---------
 docs/02_guides/code/scrapy_src/items.py       |  3 +-
 docs/02_guides/code/scrapy_src/main.py        | 21 +++---
 docs/02_guides/code/scrapy_src/settings.py    |  3 +-
 .../code/scrapy_src/spiders/title.py          |  3 +-
 docs/03_concepts/code/03_rq.py                | 11 ++-
 docs/03_concepts/code/05_proxy_actor_input.py |  4 +-
 docs/03_concepts/code/05_proxy_rotation.py    | 16 +++-
 docs/03_concepts/code/09_webserver.py         |  4 +-
 docs/pyproject.toml                           |  9 +++
 13 files changed, 113 insertions(+), 61 deletions(-)
 create mode 100644 docs/pyproject.toml

diff --git a/docs/01_overview/code/01_introduction.py b/docs/01_overview/code/01_introduction.py
index c5441d61..a3eaba25 100644
--- a/docs/01_overview/code/01_introduction.py
+++ b/docs/01_overview/code/01_introduction.py
@@ -10,5 +10,8 @@ async def main() -> None:
         async with httpx.AsyncClient() as client:
             response = await client.get(actor_input['url'])
             soup = BeautifulSoup(response.content, 'html.parser')
-            data = {'url': actor_input['url'], 'title': soup.title.string if soup.title else None}
+            data = {
+                'url': actor_input['url'],
+                'title': soup.title.string if soup.title else None,
+            }
             await Actor.push_data(data)
diff --git a/docs/02_guides/code/02_crawlee_beautifulsoup.py b/docs/02_guides/code/02_crawlee_beautifulsoup.py
index 489d83ae..e2dba8a1 100644
--- a/docs/02_guides/code/02_crawlee_beautifulsoup.py
+++ b/docs/02_guides/code/02_crawlee_beautifulsoup.py
@@ -25,7 +25,8 @@ async def main() -> None:
 
         # Create a crawler.
         crawler = BeautifulSoupCrawler(
-            # Limit the crawl to max requests. Remove or increase it for crawling all links.
+            # Limit the crawl to max requests.
+            # Remove or increase it for crawling all links.
             max_requests_per_crawl=50,
         )
 
diff --git a/docs/02_guides/code/02_crawlee_playwright.py b/docs/02_guides/code/02_crawlee_playwright.py
index 674c1e94..2f0f110f 100644
--- a/docs/02_guides/code/02_crawlee_playwright.py
+++ b/docs/02_guides/code/02_crawlee_playwright.py
@@ -25,7 +25,8 @@ async def main() -> None:
 
         # Create a crawler.
         crawler = PlaywrightCrawler(
-            # Limit the crawl to max requests. Remove or increase it for crawling all links.
+            # Limit the crawl to max requests.
+            # Remove or increase it for crawling all links.
             max_requests_per_crawl=50,
             headless=True,
             browser_launch_options={
@@ -43,9 +44,18 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
             data = {
                 'url': context.request.url,
                 'title': await context.page.title(),
-                'h1s': [await h1.text_content() for h1 in await context.page.locator('h1').all()],
-                'h2s': [await h2.text_content() for h2 in await context.page.locator('h2').all()],
-                'h3s': [await h3.text_content() for h3 in await context.page.locator('h3').all()],
+                'h1s': [
+                    await h1.text_content()
+                    for h1 in await context.page.locator('h1').all()
+                ],
+                'h2s': [
+                    await h2.text_content()
+                    for h2 in await context.page.locator('h2').all()
+                ],
+                'h3s': [
+                    await h3.text_content()
+                    for h3 in await context.page.locator('h3').all()
+                ],
             }
 
             # Store the extracted data to the default dataset.
diff --git a/docs/02_guides/code/scrapy_src/__main__.py b/docs/02_guides/code/scrapy_src/__main__.py
index 56d477dd..c31adfb2 100644
--- a/docs/02_guides/code/scrapy_src/__main__.py
+++ b/docs/02_guides/code/scrapy_src/__main__.py
@@ -1,19 +1,20 @@
 """Apify Actor integration for Scrapy projects.
 
-This module transforms a Scrapy project into an Apify Actor, handling the configuration of logging, patching Scrapy's
-logging system, and establishing the required environment to run the Scrapy spider within the Apify platform.
+This module transforms a Scrapy project into an Apify Actor, handling the configuration
+of logging, patching Scrapy's logging system, and establishing the required environment
+to run the Scrapy spider within the Apify platform.
 
-This file is specifically designed to be executed when the project is run as an Apify Actor using `apify run` locally
-or being run on the Apify platform. It is not being executed when running the project as a Scrapy project using
-`scrapy crawl title_spider`.
+This file is specifically designed to be executed when the project is run as an Apify
+Actor, either locally using `apify run` or on the Apify platform. It is not executed
+when running the project as a Scrapy project using `scrapy crawl title_spider`.
 
 We recommend you do not modify this file unless you really know what you are doing.
 """
 
 # ruff: noqa: E402
 
-# We need to configure the logging first before we import anything else, so that nothing else imports
-# `scrapy.utils.log` before we patch it.
+# We need to configure the logging first before we import anything else, so that nothing
+# else imports `scrapy.utils.log` before we patch it.
 from __future__ import annotations
 
 from logging import StreamHandler, getLogger
@@ -29,9 +30,10 @@
 OTHER_LOGGER_NAMES = ['filelock', 'hpack', 'httpcore', 'httpx', 'protego', 'twisted']
 ALL_LOGGER_NAMES = MAIN_LOGGER_NAMES + OTHER_LOGGER_NAMES
 
-# To change the logging level, modify the `LOG_LEVEL` field in `settings.py`. If the field is not present in the file,
-# Scrapy will default to `DEBUG`. This setting applies to all loggers. If you wish to change the logging level for
-# a specific logger, do it in this file.
+# To change the logging level, modify the `LOG_LEVEL` field in `settings.py`. If the
+# field is not present in the file, Scrapy will default to `DEBUG`. This setting applies
+# to all loggers. If you wish to change the logging level for a specific logger,
+# do it in this file.
 settings = get_project_settings()
 LOGGING_LEVEL = settings['LOG_LEVEL']
 
@@ -40,7 +42,9 @@
 apify_handler.setFormatter(ActorLogFormatter(include_logger_name=True))
 
 
-def configure_logger(logger_name: str | None, log_level: str, *handlers: StreamHandler) -> None:
+def configure_logger(
+    logger_name: str | None, log_level: str, *handlers: StreamHandler
+) -> None:
     """Configure a logger with the specified settings.
 
     Args:
@@ -56,41 +60,46 @@ def configure_logger(logger_name: str | None, log_level: str, *handlers: StreamH
         logger.addHandler(handler)
 
 
-# Apify loggers have to be set up here and in the `new_configure_logging` as well to be able to use them both from
+# Apify loggers have to be set up here and in the `new_configure_logging` as well to be
+# able to use them both from
 # the `main.py` and Scrapy components.
 for logger_name in MAIN_LOGGER_NAMES:
     configure_logger(logger_name, LOGGING_LEVEL, apify_handler)
 
-# We can't attach our log handler to the loggers normally, because Scrapy would remove them in the `configure_logging`
-# call here: https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L113 (even though
-# `disable_existing_loggers` is set to False :facepalm:). We need to monkeypatch Scrapy's `configure_logging` method
-# like this, so that our handler is attached right after Scrapy calls the `configure_logging` method, because
-# otherwise we would lose some log messages.
+# We can't attach our log handler to the loggers normally, because Scrapy would remove
+# them in the `configure_logging` call here:
+# https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L113 (even though
+# `disable_existing_loggers` is set to False :facepalm:). We need to monkeypatch Scrapy's
+# `configure_logging` method like this, so that our handler is attached right after
+# Scrapy calls the `configure_logging` method, because otherwise we would lose some log
+# messages.
 old_configure_logging = scrapy_logging.configure_logging
 
 
 def new_configure_logging(*args: Any, **kwargs: Any) -> None:
-    """Configure logging for Scrapy and root loggers to ensure consistent logging behavior.
+    """Configure logging for Scrapy and root loggers to ensure consistent log behavior.
 
-    We need to manually configure both the root logger and all Scrapy-associated loggers. Configuring only the root
-    logger is not sufficient, as Scrapy will override it with its own settings. Scrapy uses these four primary
-    loggers - https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L60:L77. Therefore, we configure here
+    We need to manually configure both the root logger and all Scrapy-associated loggers.
+    Configuring only the root logger is not sufficient, as Scrapy will override it
+    with its own settings. Scrapy uses these four primary loggers, listed at
+    https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L60:L77. We configure
     these four loggers and the root logger.
     """
     old_configure_logging(*args, **kwargs)
 
-    # We modify the root (None) logger to ensure proper display of logs from spiders when using the `self.logger`
-    # property within spiders. See details in the Spider logger property:
+    # We modify the root (None) logger to ensure proper display of logs from spiders when
+    # using the `self.logger` property within spiders. See details in the Spider logger
+    # property:
     # https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/spiders/__init__.py#L43:L46.
     configure_logger(None, LOGGING_LEVEL, apify_handler)
 
-    # We modify other loggers only by setting up their log level. A custom log handler is added
-    # only to the root logger to avoid duplicate log messages.
+    # We modify other loggers only by setting up their log level. A custom log handler
+    # is added only to the root logger to avoid duplicate log messages.
     for logger_name in ALL_LOGGER_NAMES:
         configure_logger(logger_name, LOGGING_LEVEL)
 
-    # Set the HTTPX logger explicitly to the WARNING level, because it is too verbose and spams the logs with useless
-    # messages, especially when running on the platform.
+    # Set the HTTPX logger explicitly to the WARNING level, because it is too verbose
+    # and spams the logs with useless messages, especially when running on the platform.
     configure_logger('httpx', 'WARNING')
 
 
@@ -105,12 +114,11 @@ def new_configure_logging(*args: Any, **kwargs: Any) -> None:
 from .main import main
 
 
-# For compatibility between Twisted (used by Scrapy) and AsyncIO (used by Apify) asynchronous libraries, it is
-# necessary to set the Twisted reactor to `AsyncioSelectorReactor`. This setup allows the two asynchronous libraries
-# to work together.
-#
-# Note: The reactor must be installed before applying `nest_asyncio.apply()`, otherwise, it will not work correctly
-# on Windows.
+# For compatibility between Twisted (used by Scrapy) and AsyncIO (used by Apify)
+# asynchronous libraries, it is necessary to set the Twisted reactor to
+# `AsyncioSelectorReactor`. This setup allows the two asynchronous libraries
+# to work together. Note: The reactor must be installed before applying
+# `nest_asyncio.apply()`, otherwise, it will not work correctly on Windows.
 install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
 nest_asyncio.apply()
 
diff --git a/docs/02_guides/code/scrapy_src/items.py b/docs/02_guides/code/scrapy_src/items.py
index eae7ff23..f6d8b28d 100644
--- a/docs/02_guides/code/scrapy_src/items.py
+++ b/docs/02_guides/code/scrapy_src/items.py
@@ -3,7 +3,8 @@
 This module defines Scrapy item models for scraped data. Items represent structured data
 extracted by spiders.
 
-For detailed information on creating and utilizing items, refer to the official documentation:
+For detailed information on creating and utilizing items,
+refer to the official documentation:
 https://docs.scrapy.org/en/latest/topics/items.html
 """
 
diff --git a/docs/02_guides/code/scrapy_src/main.py b/docs/02_guides/code/scrapy_src/main.py
index 1a878c5b..4bf9441b 100644
--- a/docs/02_guides/code/scrapy_src/main.py
+++ b/docs/02_guides/code/scrapy_src/main.py
@@ -1,22 +1,25 @@
 """This module defines the main entry point for the Apify Actor.
 
-This module defines the main coroutine for the Apify Scrapy Actor, executed from the __main__.py file. The coroutine
-processes the Actor's input and executes the Scrapy spider. Additionally, it updates Scrapy project settings by
-applying Apify-related settings. Which includes adding a custom scheduler, retry middleware, and an item pipeline
-for pushing data to the Apify dataset.
+This module defines the main coroutine for the Apify Scrapy Actor, executed from
+the __main__.py file. The coroutine processes the Actor's input and executes the Scrapy
+spider. Additionally, it updates Scrapy project settings by applying Apify-related
+settings, which add a custom scheduler, retry middleware, and an item
+pipeline for pushing data to the Apify dataset.
 
 Customization:
 --------------
 
-Feel free to customize this file to add specific functionality to the Actor, such as incorporating your own Scrapy
-components like spiders and handling Actor input. However, make sure you have a clear understanding of your
-modifications. For instance, removing `apply_apify_settings` break the integration between Scrapy and Apify.
+Feel free to customize this file to add specific functionality to the Actor, such
+as incorporating your own Scrapy components like spiders and handling Actor input.
+However, make sure you have a clear understanding of your modifications. For instance,
+removing `apply_apify_settings` breaks the integration between Scrapy and Apify.
 
 Documentation:
 --------------
 
-For an in-depth description of the Apify-Scrapy integration process, our Scrapy components, known limitations and
-other stuff, please refer to the following documentation page: https://docs.apify.com/cli/docs/integrating-scrapy.
+For an in-depth description of the Apify-Scrapy integration process, our Scrapy
+components, known limitations and other stuff, please refer to the following
+documentation page: https://docs.apify.com/cli/docs/integrating-scrapy.
 """
 
 from __future__ import annotations
diff --git a/docs/02_guides/code/scrapy_src/settings.py b/docs/02_guides/code/scrapy_src/settings.py
index 8a0fd3e6..c9d11107 100644
--- a/docs/02_guides/code/scrapy_src/settings.py
+++ b/docs/02_guides/code/scrapy_src/settings.py
@@ -1,6 +1,7 @@
 """Scrapy settings module.
 
-This module contains Scrapy settings for the project, defining various configurations and options.
+This module contains Scrapy settings for the project, defining various configurations
+and options.
 
 For more comprehensive details on Scrapy settings, refer to the official documentation:
 http://doc.scrapy.org/en/latest/topics/settings.html
diff --git a/docs/02_guides/code/scrapy_src/spiders/title.py b/docs/02_guides/code/scrapy_src/spiders/title.py
index 7be37b68..1c299abe 100644
--- a/docs/02_guides/code/scrapy_src/spiders/title.py
+++ b/docs/02_guides/code/scrapy_src/spiders/title.py
@@ -20,7 +20,8 @@ class TitleSpider(Spider):
 
     name = 'title_spider'
 
-    # The `start_urls` specified in this class will be merged with the `start_urls` value from your Actor input
+    # The `start_urls` specified in this class will be merged with the
+    # `start_urls` value from your Actor input
     # when the project is executed using Apify.
     start_urls = ['https://apify.com/']
 
diff --git a/docs/03_concepts/code/03_rq.py b/docs/03_concepts/code/03_rq.py
index ba6a9570..fe1ea605 100644
--- a/docs/03_concepts/code/03_rq.py
+++ b/docs/03_concepts/code/03_rq.py
@@ -19,7 +19,9 @@ async def main() -> None:
         await queue.add_request(Request.from_url('http://example.com/0'), forefront=True)
 
         # If you try to add an existing request again, it will not do anything
-        add_request_info = await queue.add_request(Request.from_url('http://different-example.com/5'))
+        add_request_info = await queue.add_request(
+            Request.from_url('http://different-example.com/5')
+        )
         Actor.log.info(f'Add request info: {add_request_info}')
 
         processed_request = await queue.get_request(add_request_info.id)
@@ -29,8 +31,8 @@ async def main() -> None:
         while not await queue.is_finished():
             # Fetch the next unhandled request in the queue
             request = await queue.fetch_next_request()
-            # This can happen due to the eventual consistency of the underlying request queue storage,
-            # best solution is just to sleep a bit
+            # This can happen due to the eventual consistency of the underlying request
+            # queue storage, best solution is just to sleep a bit.
             if request is None:
                 await asyncio.sleep(1)
                 continue
@@ -45,6 +47,7 @@ async def main() -> None:
                 Actor.log.info('Request successful.')
                 await queue.mark_request_as_handled(request)
             else:
-                # If processing the request was unsuccessful, reclaim it so it can be processed again
+                # If processing the request was unsuccessful, reclaim it so it can be
+                # processed again.
                 Actor.log.warning('Request failed, will retry!')
                 await queue.reclaim_request(request)
diff --git a/docs/03_concepts/code/05_proxy_actor_input.py b/docs/03_concepts/code/05_proxy_actor_input.py
index 3a69ea0a..3ca0344d 100644
--- a/docs/03_concepts/code/05_proxy_actor_input.py
+++ b/docs/03_concepts/code/05_proxy_actor_input.py
@@ -5,7 +5,9 @@ async def main() -> None:
     async with Actor:
         actor_input = await Actor.get_input() or {}
         proxy_settings = actor_input.get('proxySettings')
-        proxy_configuration = await Actor.create_proxy_configuration(actor_proxy_input=proxy_settings)
+        proxy_configuration = await Actor.create_proxy_configuration(
+            actor_proxy_input=proxy_settings
+        )
 
         if not proxy_configuration:
             raise RuntimeError('No proxy configuration available.')
diff --git a/docs/03_concepts/code/05_proxy_rotation.py b/docs/03_concepts/code/05_proxy_rotation.py
index c816dabf..8e6a5de0 100644
--- a/docs/03_concepts/code/05_proxy_rotation.py
+++ b/docs/03_concepts/code/05_proxy_rotation.py
@@ -17,7 +17,15 @@ async def main() -> None:
         proxy_url = await proxy_configuration.new_url()  # http://proxy-2.com
         proxy_url = await proxy_configuration.new_url()  # http://proxy-1.com
         proxy_url = await proxy_configuration.new_url()  # http://proxy-2.com
-        proxy_url = await proxy_configuration.new_url(session_id='a')  # http://proxy-1.com
-        proxy_url = await proxy_configuration.new_url(session_id='b')  # http://proxy-2.com
-        proxy_url = await proxy_configuration.new_url(session_id='b')  # http://proxy-2.com
-        proxy_url = await proxy_configuration.new_url(session_id='a')  # http://proxy-1.com
+        proxy_url = await proxy_configuration.new_url(
+            session_id='a'
+        )  # http://proxy-1.com
+        proxy_url = await proxy_configuration.new_url(
+            session_id='b'
+        )  # http://proxy-2.com
+        proxy_url = await proxy_configuration.new_url(
+            session_id='b'
+        )  # http://proxy-2.com
+        proxy_url = await proxy_configuration.new_url(
+            session_id='a'
+        )  # http://proxy-1.com
diff --git a/docs/03_concepts/code/09_webserver.py b/docs/03_concepts/code/09_webserver.py
index de6d953d..48a5c10d 100644
--- a/docs/03_concepts/code/09_webserver.py
+++ b/docs/03_concepts/code/09_webserver.py
@@ -21,7 +21,9 @@ def run_server() -> None:
     # Start the HTTP server on the provided port,
     # and save a reference to the server.
     global http_server
-    with ThreadingHTTPServer(('', Actor.config.web_server_port), RequestHandler) as server:
+    with ThreadingHTTPServer(
+        ('', Actor.config.web_server_port), RequestHandler
+    ) as server:
         Actor.log.info(f'Server running on {Actor.config.web_server_port}')
         http_server = server
         server.serve_forever()
diff --git a/docs/pyproject.toml b/docs/pyproject.toml
new file mode 100644
index 00000000..73a75678
--- /dev/null
+++ b/docs/pyproject.toml
@@ -0,0 +1,9 @@
+# Line length differs from the rest of the code to make sure that the example code visualised on the generated
+# documentation webpages is shown without a horizontal scrollbar, to make it more readable.
+
+[tool.ruff]
+# Inherit everything from the top-level project configuration file.
+extend = "../pyproject.toml"
+
+# Override just the line length.
+line-length = 90 # Maximum width that still fits the docs webpage. Longer lines would need a scrollbar.
From ce543e3a326be3a0808b1e205a0a86e6856c9274 Mon Sep 17 00:00:00 2001
From: Josef Prochazka
Date: Tue, 18 Feb 2025 08:49:59 +0100
Subject: [PATCH 2/2] Remove scrapy_src leftovers after merge from master

---
 docs/02_guides/code/scrapy_src/__main__.py | 0
 docs/02_guides/code/scrapy_src/items.py    | 0
 docs/02_guides/code/scrapy_src/main.py     | 0
 docs/02_guides/code/scrapy_src/settings.py | 0
 4 files changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 docs/02_guides/code/scrapy_src/__main__.py
 delete mode 100644 docs/02_guides/code/scrapy_src/items.py
 delete mode 100644 docs/02_guides/code/scrapy_src/main.py
 delete mode 100644 docs/02_guides/code/scrapy_src/settings.py

diff --git a/docs/02_guides/code/scrapy_src/__main__.py b/docs/02_guides/code/scrapy_src/__main__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/docs/02_guides/code/scrapy_src/items.py b/docs/02_guides/code/scrapy_src/items.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/docs/02_guides/code/scrapy_src/main.py b/docs/02_guides/code/scrapy_src/main.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/docs/02_guides/code/scrapy_src/settings.py b/docs/02_guides/code/scrapy_src/settings.py
deleted file mode 100644
index e69de29b..00000000