Skip to content

Commit

Permalink
fix: clean up inProgress cache when delaying requests via `sameDomainDelaySecs` (#2045)
Browse files Browse the repository at this point in the history
  • Loading branch information
B4nan committed Aug 23, 2023
1 parent f20b420 commit f63ccc0
Showing 1 changed file with 3 additions and 1 deletion.
4 changes: 3 additions & 1 deletion packages/basic-crawler/src/internals/basic-crawler.ts
Expand Up @@ -982,7 +982,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
* adding it back to the queue after the timeout passes. Returns `true` if the request
* should be ignored and will be reclaimed to the queue once ready.
*/
protected delayRequest(request:Request, source: RequestQueue | RequestList) {
protected delayRequest(request: Request, source: RequestQueue | RequestList) {
const domain = getDomain(request.url);

if (!domain || !request) {
Expand All @@ -997,10 +997,12 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
return false;
}

source.inProgress.delete(request.id!);
const delay = lastAccessTime + this.sameDomainDelayMillis - now;
this.log.debug(`Request ${request.url} (${request.id}) will be reclaimed after ${delay} milliseconds due to same domain delay`);
setTimeout(async () => {
this.log.debug(`Adding request ${request.url} (${request.id}) back to the queue`);
source?.inProgress.add(request.id!);
await source?.reclaimRequest(request);
}, delay);

Expand Down

0 comments on commit f63ccc0

Please sign in to comment.