From e79688324790e5d07fc11192769cf051617e96e4 Mon Sep 17 00:00:00 2001
From: Dinesh Hardasani <45535600+Dineshhardasani@users.noreply.github.com>
Date: Mon, 31 Jul 2023 11:50:29 +0530
Subject: [PATCH] feat: add support for `sameDomainDelay` (#2003)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implemented a new feature that introduces a delay between requests to the
same domain while crawling.

closes #1993

---------

Co-authored-by: Martin Adámek
Co-authored-by: Jindřich Bär
---
 packages/basic-crawler/package.json    |  1 +
 .../src/internals/basic-crawler.ts     | 47 ++++++++++++++++++-
 yarn.lock                              |  1 +
 3 files changed, 48 insertions(+), 1 deletion(-)

diff --git a/packages/basic-crawler/package.json b/packages/basic-crawler/package.json
index 5a431c366ca..21038135452 100644
--- a/packages/basic-crawler/package.json
+++ b/packages/basic-crawler/package.json
@@ -53,6 +53,7 @@
         "@crawlee/utils": "^3.4.2",
         "got-scraping": "^3.2.9",
         "ow": "^0.28.1",
+        "tldts": "^6.0.0",
         "tslib": "^2.4.0",
         "type-fest": "^4.0.0"
     }
diff --git a/packages/basic-crawler/src/internals/basic-crawler.ts b/packages/basic-crawler/src/internals/basic-crawler.ts
index 1d26d017499..af05c13dbef 100644
--- a/packages/basic-crawler/src/internals/basic-crawler.ts
+++ b/packages/basic-crawler/src/internals/basic-crawler.ts
@@ -46,6 +46,7 @@ import { ROTATE_PROXY_ERRORS } from '@crawlee/utils';
 import type { Method, OptionsInit } from 'got-scraping';
 import { gotScraping } from 'got-scraping';
 import ow, { ArgumentError } from 'ow';
+import { getDomain } from 'tldts';
 import type { SetRequired } from 'type-fest';
 
 export interface BasicCrawlingContext<
@@ -219,6 +220,12 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCra
      */
     maxRequestRetries?: number;
 
+    /**
+     * Indicates how much time (in seconds) to wait before crawling another same domain request.
+     * @default 0
+     */
+    sameDomainDelaySecs?: number;
+
     /**
      * Maximum number of session rotations per request.
@@ -437,6 +444,8 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContex
     protected requestHandlerTimeoutMillis!: number;
     protected internalTimeoutMillis: number;
     protected maxRequestRetries: number;
+    protected sameDomainDelayMillis: number;
+    protected domainAccessedTime: Map<string, number>;
     protected maxSessionRotations: number;
     protected handledRequestsCount: number;
     protected statusMessageLoggingInterval: number;
@@ -463,6 +472,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContex
         errorHandler: ow.optional.function,
         failedRequestHandler: ow.optional.function,
         maxRequestRetries: ow.optional.number,
+        sameDomainDelaySecs: ow.optional.number,
         maxSessionRotations: ow.optional.number,
         maxRequestsPerCrawl: ow.optional.number,
         autoscaledPoolOptions: ow.optional.object,
@@ -957,3 +967,24 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContex
+    protected delayRequest(request: Request, source: RequestQueue | RequestList) {
+        const domain = getDomain(request.url);
+        if (!domain || !request) return false;
+
+        const now = Date.now();
+        const lastAccessTime = this.domainAccessedTime.get(domain);
+        if (!lastAccessTime || (now - lastAccessTime) >= this.sameDomainDelayMillis) {
+            this.domainAccessedTime.set(domain, now);
+            return false;
+        }
+
+        const delay = lastAccessTime + this.sameDomainDelayMillis - now;
+        this.log.debug(`Request ${request.url} (${request.id}) will be reclaimed after ${delay} milliseconds due to same domain delay`);
+        setTimeout(async () => {
+            this.log.debug(`Adding request ${request.url} (${request.id}) back to the queue`);
+            await source?.reclaimRequest(request);
+        }, delay);
+
+        return true;
+    }
+
     /**
      * Wrapper around requestHandler that fetches requests from RequestList/RequestQueue
      * then retries them in a case of an error, etc.
@@ -996,7 +1039,9 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContex
 
         tryCancel();
 
-        if (!request) return;
+        if (!request || this.delayRequest(request, source)) {
+            return;
+        }
 
         // Reset loadedUrl so an old one is not carried over to retries.
         request.loadedUrl = undefined;
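
A minimal usage sketch of the new option, assuming it is consumed through the
`crawlee` meta-package like the rest of `BasicCrawlerOptions`; the URLs and the
5-second delay below are illustrative only:

    import { BasicCrawler } from 'crawlee';

    const crawler = new BasicCrawler({
        // Wait at least 5 seconds between two requests that hit the same domain.
        sameDomainDelaySecs: 5,
        async requestHandler({ request, log }) {
            log.info(`Processing ${request.url}`);
        },
    });

    // Both URLs share the domain example.com, so the second request is
    // reclaimed back to the queue until the 5-second window has elapsed.
    await crawler.run(['https://example.com/a', 'https://example.com/b']);

Note that the delay is keyed by the registrable domain returned by tldts'
`getDomain()`, so requests to `sub.example.com` and `example.com` share one
delay window.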