feat: add support for sameDomainDelay (#2003)
Implemented a new feature that introduces a delay between consecutive
requests to the same domain while crawling.

closes #1993
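
A minimal usage sketch of the new option (assuming the `CheerioCrawler` export from `crawlee`; the 5-second value, URLs, and handler body are illustrative):

import { CheerioCrawler } from 'crawlee';

// Wait at least 5 seconds between two requests to the same domain.
// A request that arrives too early is not dropped; it is reclaimed
// to the queue and picked up again once the delay has elapsed.
const crawler = new CheerioCrawler({
    sameDomainDelaySecs: 5,
    async requestHandler({ request, log }) {
        log.info(`Processing ${request.url}`);
    },
});

await crawler.run([
    'https://example.com/page-1',
    'https://example.com/page-2', // crawled roughly 5 seconds after page-1
]);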

---------

Co-authored-by: Martin Adámek <banan23@gmail.com>
Co-authored-by: Jindřich Bär <jindrichbar@gmail.com>
3 people committed Jul 31, 2023
1 parent c4fb5e2 commit e796883
Showing 3 changed files with 48 additions and 1 deletion.
1 change: 1 addition & 0 deletions packages/basic-crawler/package.json
@@ -53,6 +53,7 @@
"@crawlee/utils": "^3.4.2",
"got-scraping": "^3.2.9",
"ow": "^0.28.1",
"tldts": "^6.0.0",
"tslib": "^2.4.0",
"type-fest": "^4.0.0"
}
47 changes: 46 additions & 1 deletion packages/basic-crawler/src/internals/basic-crawler.ts
@@ -46,6 +46,7 @@ import { ROTATE_PROXY_ERRORS } from '@crawlee/utils';
import type { Method, OptionsInit } from 'got-scraping';
import { gotScraping } from 'got-scraping';
import ow, { ArgumentError } from 'ow';
import { getDomain } from 'tldts';
import type { SetRequired } from 'type-fest';

export interface BasicCrawlingContext<
@@ -219,6 +220,12 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
*/
maxRequestRetries?: number;

/**
* Indicates how much time (in seconds) to wait before crawling another request to the same domain.
* @default 0
*/
sameDomainDelaySecs?: number;

/**
* Maximum number of session rotations per request.
* The crawler will automatically rotate the session in case of a proxy error or if it gets blocked by the website.
@@ -434,6 +441,8 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
protected requestHandlerTimeoutMillis!: number;
protected internalTimeoutMillis: number;
protected maxRequestRetries: number;
protected sameDomainDelayMillis: number;
protected domainAccessedTime: Map<string, number>;
protected maxSessionRotations: number;
protected handledRequestsCount: number;
protected statusMessageLoggingInterval: number;
@@ -463,6 +472,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
// TODO: remove in a future release
handleFailedRequestFunction: ow.optional.function,
maxRequestRetries: ow.optional.number,
sameDomainDelaySecs: ow.optional.number,
maxSessionRotations: ow.optional.number,
maxRequestsPerCrawl: ow.optional.number,
autoscaledPoolOptions: ow.optional.object,
@@ -494,6 +504,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
requestList,
requestQueue,
maxRequestRetries = 3,
sameDomainDelaySecs = 0,
maxSessionRotations = 10,
maxRequestsPerCrawl,
autoscaledPoolOptions = {},
@@ -533,6 +544,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
this.statusMessageLoggingInterval = statusMessageLoggingInterval;
this.statusMessageCallback = statusMessageCallback as StatusMessageCallback;
this.events = config.getEventManager();
this.domainAccessedTime = new Map();

this._handlePropertyNameChange({
newName: 'requestHandler',
@@ -589,6 +601,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
}

this.maxRequestRetries = maxRequestRetries;
this.sameDomainDelayMillis = sameDomainDelaySecs * 1000;
this.maxSessionRotations = maxSessionRotations;
this.handledRequestsCount = 0;
this.stats = new Statistics({ logMessage: `${log.getOptions().prefix} request statistics:`, config });
@@ -964,6 +977,36 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
_crawlingContext: Context,
) {}

/**
* Delays processing of the request based on the `sameDomainDelaySecs` option,
* adding it back to the queue after the timeout passes. Returns `true` if the request
* should be ignored and will be reclaimed to the queue once ready.
*/
protected delayRequest(request: Request, source: RequestQueue | RequestList) {
if (!request) {
return false;
}

const domain = getDomain(request.url);

if (!domain) {
return false;
}

const now = Date.now();
const lastAccessTime = this.domainAccessedTime.get(domain);

if (!lastAccessTime || (now - lastAccessTime) >= this.sameDomainDelayMillis) {
this.domainAccessedTime.set(domain, now);
return false;
}

const delay = lastAccessTime + this.sameDomainDelayMillis - now;
this.log.debug(`Request ${request.url} (${request.id}) will be reclaimed after ${delay} milliseconds due to same domain delay`);
setTimeout(async () => {
this.log.debug(`Adding request ${request.url} (${request.id}) back to the queue`);
await source?.reclaimRequest(request);
}, delay);

return true;
}

/**
* Wrapper around requestHandler that fetches requests from RequestList/RequestQueue
* then retries them in a case of an error, etc.
@@ -996,7 +1039,9 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext

tryCancel();

-if (!request) return;
+if (!request || this.delayRequest(request, source)) {
+    return;
+}

// Reset loadedUrl so an old one is not carried over to retries.
request.loadedUrl = undefined;
1 change: 1 addition & 0 deletions yarn.lock
@@ -750,6 +750,7 @@ __metadata:
"@crawlee/utils": ^3.4.2
got-scraping: ^3.2.9
ow: ^0.28.1
tldts: ^6.0.0
tslib: ^2.4.0
type-fest: ^4.0.0
languageName: unknown
