diff --git a/docs/experiments/request_locking.mdx b/docs/experiments/request_locking.mdx index f4f00116fff6..224b59ec7006 100644 --- a/docs/experiments/request_locking.mdx +++ b/docs/experiments/request_locking.mdx @@ -6,6 +6,21 @@ description: Parallelize crawlers with ease using request locking import ApiLink from '@site/src/components/ApiLink'; +:::tip Release announcement + +As of **May 2024** (`crawlee` version `3.10.0`), this experiment is now enabled by default! With that said, if you encounter issues you can: + +- set `requestLocking` to `false` in the `experiments` object of your crawler options +- update all imports of `RequestQueue` to `RequestQueueV1` +- open an issue on our [GitHub repository](https://github.com/apify/crawlee) + +The content below is kept for documentation purposes. +If you're interested in the changes, you can read the [blog post about the new Request Queue storage system on the Apify blog](https://blog.apify.com/new-apify-request-queue/). + +::: + +--- + :::caution This is an experimental feature. While we welcome testers, keep in mind that it is currently not recommended to use this in production. 
diff --git a/packages/basic-crawler/src/internals/basic-crawler.ts b/packages/basic-crawler/src/internals/basic-crawler.ts index 2a00b012af47..1b3ff9ddafea 100644 --- a/packages/basic-crawler/src/internals/basic-crawler.ts +++ b/packages/basic-crawler/src/internals/basic-crawler.ts @@ -39,8 +39,8 @@ import { mergeCookies, NonRetryableError, purgeDefaultStorages, + RequestQueueV1, RequestQueue, - RequestQueueV2, RequestState, RetryRequestError, Router, @@ -356,8 +356,10 @@ export interface BasicCrawlerOptions(options?: RequestOptions): Promise | null>; /** diff --git a/packages/core/src/storages/request_queue.ts b/packages/core/src/storages/request_queue.ts index ae8ed86a81f9..bc9d16a6fb11 100644 --- a/packages/core/src/storages/request_queue.ts +++ b/packages/core/src/storages/request_queue.ts @@ -72,8 +72,10 @@ const RECENTLY_HANDLED_CACHE_SIZE = 1000; * await queue.addRequest({ url: 'http://example.com/foo/bar' }, { forefront: true }); * ``` * @category Sources + * + * @deprecated RequestQueue v1 is deprecated and will be removed in the future. Please use {@apilink RequestQueue} instead. */ -export class RequestQueue extends RequestProvider { +class RequestQueue extends RequestProvider { private queryQueueHeadPromise?: Promise<{ wasLimitReached: boolean; prevLimit: number; @@ -327,6 +329,12 @@ export class RequestQueue extends RequestProvider { return super.markRequestHandled(...args); } + /** + * Reclaims a failed request back to the queue, so that it can be returned for processing later again + * by another call to {@apilink RequestQueue.fetchNextRequest}. + * The request record in the queue is updated using the provided `request` parameter. + * For example, this lets you store the number of retries or error messages for the request. 
+ */ override async reclaimRequest(...args: Parameters) { checkStorageAccess(); @@ -359,7 +367,25 @@ export class RequestQueue extends RequestProvider { this.lastActivity = new Date(); } + /** + * Opens a request queue and returns a promise resolving to an instance + * of the {@apilink RequestQueue} class. + * + * {@apilink RequestQueue} represents a queue of URLs to crawl, which is stored either on local filesystem or in the cloud. + * The queue is used for deep crawling of websites, where you start with several URLs and then + * recursively follow links to other pages. The data structure supports both breadth-first + * and depth-first crawling orders. + * + * For more details and code examples, see the {@apilink RequestQueue} class. + * + * @param [queueIdOrName] + * ID or name of the request queue to be opened. If `null` or `undefined`, + * the function returns the default request queue associated with the crawler run. + * @param [options] Open Request Queue options. + */ static override async open(...args: Parameters): Promise { return super.open(...args) as Promise; } } + +export { RequestQueue as RequestQueueV1 }; diff --git a/packages/core/src/storages/request_queue_v2.ts b/packages/core/src/storages/request_queue_v2.ts index 58af926b265f..9f407ac2b5ef 100644 --- a/packages/core/src/storages/request_queue_v2.ts +++ b/packages/core/src/storages/request_queue_v2.ts @@ -21,7 +21,41 @@ const MAX_CACHED_REQUESTS = 2_000_000; */ const RECENTLY_HANDLED_CACHE_SIZE = 1000; -class RequestQueue extends RequestProvider { +/** + * Represents a queue of URLs to crawl, which is used for deep crawling of websites + * where you start with several URLs and then recursively + * follow links to other pages. The data structure supports both breadth-first and depth-first crawling orders. + * + * Each URL is represented using an instance of the {@apilink Request} class. + * The queue can only contain unique URLs. 
More precisely, it can only contain {@apilink Request} instances + * with distinct `uniqueKey` properties. By default, `uniqueKey` is generated from the URL, but it can also be overridden. + * To add a single URL multiple times to the queue, + * corresponding {@apilink Request} objects will need to have different `uniqueKey` properties. + * + * Do not instantiate this class directly, use the {@apilink RequestQueue.open} function instead. + * + * `RequestQueue` is used by {@apilink BasicCrawler}, {@apilink CheerioCrawler}, {@apilink PuppeteerCrawler} + * and {@apilink PlaywrightCrawler} as a source of URLs to crawl. + * Unlike {@apilink RequestList}, `RequestQueue` supports dynamic adding and removing of requests. + * On the other hand, the queue is not optimized for operations that add or remove a large number of URLs in a batch. + * + * **Example usage:** + * + * ```javascript + * // Open the default request queue associated with the crawler run + * const queue = await RequestQueue.open(); + * + * // Open a named request queue + * const queueWithName = await RequestQueue.open('some-name'); + * + * // Enqueue a few requests + * await queue.addRequest({ url: 'http://example.com/aaa' }); + * await queue.addRequest({ url: 'http://example.com/bbb' }); + * await queue.addRequest({ url: 'http://example.com/foo/bar' }, { forefront: true }); + * ``` + * @category Sources + */ +export class RequestQueue extends RequestProvider { private _listHeadAndLockPromise: Promise | null = null; constructor(options: RequestProviderOptions, config = Configuration.getGlobalConfig()) { @@ -63,21 +97,7 @@ class RequestQueue extends RequestProvider { } /** - * Returns a next request in the queue to be processed, or `null` if there are no more pending requests. - * - * Once you successfully finish processing of the request, you need to call - * {@apilink RequestQueue.markRequestHandled} - * to mark the request as handled in the queue. 
If there was some error in processing the request, - * call {@apilink RequestQueue.reclaimRequest} instead, - * so that the queue will give the request to some other consumer in another call to the `fetchNextRequest` function. - * - * Note that the `null` return value doesn't mean the queue processing finished, - * it means there are currently no pending requests. - * To check whether all requests in queue were finished, - * use {@apilink RequestQueue.isFinished} instead. - * - * @returns - * Returns the request object or `null` if there are no more pending requests. + * @inheritDoc */ override async fetchNextRequest(): Promise | null> { checkStorageAccess(); @@ -143,6 +163,9 @@ class RequestQueue extends RequestProvider { return request; } + /** + * @inheritDoc + */ override async reclaimRequest(...args: Parameters): ReturnType { checkStorageAccess(); @@ -350,9 +373,10 @@ class RequestQueue extends RequestProvider { } } + /** + * @inheritDoc + */ static override async open(...args: Parameters): Promise { return super.open(...args) as Promise; } } - -export { RequestQueue as RequestQueueV2 }; diff --git a/test/core/storages/request_queue.test.ts b/test/core/storages/request_queue.test.ts index 8ad854d9e114..8eb3b3527e81 100644 --- a/test/core/storages/request_queue.test.ts +++ b/test/core/storages/request_queue.test.ts @@ -4,7 +4,7 @@ import { QUERY_HEAD_MIN_LENGTH, API_PROCESSED_REQUESTS_DELAY_MILLIS, STORAGE_CONSISTENCY_DELAY_MILLIS, - RequestQueue, + RequestQueueV1 as RequestQueue, Request, Configuration, ProxyConfiguration,