diff --git a/packages/basic-crawler/src/internals/basic-crawler.ts b/packages/basic-crawler/src/internals/basic-crawler.ts
index 2677338ca3d..db0bf38ae39 100644
--- a/packages/basic-crawler/src/internals/basic-crawler.ts
+++ b/packages/basic-crawler/src/internals/basic-crawler.ts
@@ -751,7 +751,12 @@ export class BasicCrawler
             (this._getMessageFromError(error) as any)?.includes(x));
     }
 
-    protected isRequestBlocked(_crawlingContext: Context) {
+    /**
+     * Checks whether the given crawling context is getting blocked by anti-bot protection using several heuristics.
+     * Returns `false` if the request is not blocked, otherwise returns a string with a description of the block reason.
+     * @param _crawlingContext The crawling context to check.
+     */
+    protected async isRequestBlocked(_crawlingContext: Context): Promise<string | false> {
         throw new Error('the "isRequestBlocked" method is not implemented in this crawler.');
     }
 
diff --git a/packages/browser-crawler/src/internals/browser-crawler.ts b/packages/browser-crawler/src/internals/browser-crawler.ts
index f78ceaf7d17..b4af68f4463 100644
--- a/packages/browser-crawler/src/internals/browser-crawler.ts
+++ b/packages/browser-crawler/src/internals/browser-crawler.ts
@@ -441,13 +441,18 @@ export abstract class BrowserCrawler<
         }
     }
 
-    private async containsSelectors(page: CommonPage, selectors: string[]): Promise<boolean> {
-        return (await Promise.all(
+    private async containsSelectors(page: CommonPage, selectors: string[]): Promise<string[] | null> {
+        const foundSelectors = (await Promise.all(
             selectors.map((selector) => (page as any).$(selector)))
-        ).some((el: any) => el !== null);
+        )
+            .map((x, i) => [x, selectors[i]] as [any, string])
+            .filter(([x]) => x !== null)
+            .map(([, selector]) => selector);
+
+        return foundSelectors.length > 0 ? foundSelectors : null;
     }
 
-    protected override async isRequestBlocked(crawlingContext: Context): Promise<boolean> {
+    protected override async isRequestBlocked(crawlingContext: Context): Promise<string | false> {
         const { page, response } = crawlingContext;
 
         // eslint-disable-next-line dot-notation
@@ -461,11 +466,19 @@ export abstract class BrowserCrawler<
             await sleep(5000);
 
             // here we cannot test for response code, because we only have the original response, not the possible Cloudflare redirect on passed challenge.
-            return this.containsSelectors(page, RETRY_CSS_SELECTORS);
+            const foundSelectors = await this.containsSelectors(page, RETRY_CSS_SELECTORS);
+
+            if (!foundSelectors) return false;
+            return `Cloudflare challenge failed, found selectors: ${foundSelectors.join(', ')}`;
         }
 
-        return await this.containsSelectors(page, RETRY_CSS_SELECTORS)
-            || blockedStatusCodes.includes(response?.status() ?? 0);
+        const foundSelectors = await this.containsSelectors(page, RETRY_CSS_SELECTORS);
+        const blockedStatusCode = blockedStatusCodes.find((x) => x === (response?.status() ?? 0));
+
+        if (foundSelectors) return `Found selectors: ${foundSelectors.join(', ')}`;
+        if (blockedStatusCode) return `Received blocked status code: ${blockedStatusCode}`;
+
+        return false;
     }
 
     /**
@@ -534,9 +547,8 @@ export abstract class BrowserCrawler<
         }
 
         if (this.retryOnBlocked) {
-            if (await this.isRequestBlocked(crawlingContext)) {
-                throw new SessionError();
-            }
+            const error = await this.isRequestBlocked(crawlingContext);
+            if (error) throw new SessionError(error);
         }
 
         request.state = RequestState.REQUEST_HANDLER;
diff --git a/packages/http-crawler/src/internals/http-crawler.ts b/packages/http-crawler/src/internals/http-crawler.ts
index 15076503b50..47a68ee1327 100644
--- a/packages/http-crawler/src/internals/http-crawler.ts
+++ b/packages/http-crawler/src/internals/http-crawler.ts
@@ -500,8 +500,9 @@ export class HttpCrawler {
         if (HTML_AND_XML_MIME_TYPES.includes(crawlingContext.contentType.type)) {
             const $ = await crawlingContext.parseWithCheerio();
 
-            return RETRY_CSS_SELECTORS.some((selector) => $(selector).length > 0);
+            const foundSelectors = RETRY_CSS_SELECTORS.filter((selector) => $(selector).length > 0);
+
+            if (foundSelectors.length > 0) {
+                return `Found selectors: ${foundSelectors.join(', ')}`;
+            }
         }
 
         return false;
     }
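
For illustration, a minimal sketch of how a crawler subclass could build on the new `string | false` contract, assuming this change is available in the installed crawlee version. Only the `isRequestBlocked` signature and the fact that the returned reason becomes the `SessionError` message (when `retryOnBlocked` is enabled) come from the diff above; the `MyCrawler` class and its page-title heuristic are hypothetical.

import { PlaywrightCrawler, PlaywrightCrawlingContext } from 'crawlee';

class MyCrawler extends PlaywrightCrawler {
    protected override async isRequestBlocked(crawlingContext: PlaywrightCrawlingContext): Promise<string | false> {
        // Reuse the selector / status-code heuristics from the built-in implementation.
        const builtInReason = await super.isRequestBlocked(crawlingContext);
        if (builtInReason) return builtInReason;

        // Hypothetical extra heuristic: treat an "Access denied" page title as a block.
        const title = await crawlingContext.page.title();
        if (/access denied/i.test(title)) return `Blocked page title: ${title}`;

        return false;
    }
}

// With retryOnBlocked enabled, the returned reason surfaces as the SessionError message.
const crawler = new MyCrawler({
    retryOnBlocked: true,
    requestHandler: async ({ log, request }) => {
        log.info(`Processed ${request.url}`);
    },
});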