Skip to content

Commit

Permalink
feat: log cause with retryOnBlocked (#2252)
Browse files Browse the repository at this point in the history
closes #2249
  • Loading branch information
barjin committed Dec 21, 2023
1 parent 3c5f9d6 commit e19a773
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 15 deletions.
7 changes: 6 additions & 1 deletion packages/basic-crawler/src/internals/basic-crawler.ts
Expand Up @@ -751,7 +751,12 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
return ROTATE_PROXY_ERRORS.some((x: string) => (this._getMessageFromError(error) as any)?.includes(x));
}

protected isRequestBlocked(_crawlingContext: Context) {
/**
* Checks whether the given crawling context is getting blocked by anti-bot protection using several heuristics.
*
* This base implementation performs no check: it always rejects with an error stating that the method
* is not implemented. Crawlers that support blocked-request detection are expected to override it.
*
* @param _crawlingContext The crawling context to check (unused by this base implementation).
* @returns `false` if the request is not blocked, otherwise a string with a description of the block reason.
* @throws {Error} Always, in this base implementation, because the check is not implemented here.
*/
protected async isRequestBlocked(_crawlingContext: Context): Promise<string | false> {
throw new Error('the "isRequestBlocked" method is not implemented in this crawler.');
}

Expand Down
32 changes: 22 additions & 10 deletions packages/browser-crawler/src/internals/browser-crawler.ts
Expand Up @@ -441,13 +441,18 @@ export abstract class BrowserCrawler<
}
}

private async containsSelectors(page: CommonPage, selectors: string[]): Promise<boolean> {
return (await Promise.all(
/**
* Looks up each of the given selectors on the page and reports which of them matched.
*
* Every selector is queried through `page.$()`; the lookups are started together and awaited with `Promise.all`.
*
* @param page The page to query.
* @param selectors The selectors to look for.
* @returns The selectors whose lookup resolved to a non-null value, in input order, or `null` when none matched.
*/
private async containsSelectors(page: CommonPage, selectors: string[]): Promise<string[] | null> {
const foundSelectors = (await Promise.all(
selectors.map((selector) => (page as any).$(selector)))
).some((el: any) => el !== null);
)
// Pair every lookup result with the selector that produced it (`Promise.all` preserves input order).
.map((x, i) => [x, selectors[i]] as [any, string])
// Keep only the pairs whose lookup did not resolve to `null`.
.filter(([x]) => x !== null)
.map(([, selector]) => selector);

// An empty array is truthy, so return `null` instead to let callers use a plain falsy check.
return foundSelectors.length > 0 ? foundSelectors : null;
}

protected override async isRequestBlocked(crawlingContext: Context): Promise<boolean> {
protected override async isRequestBlocked(crawlingContext: Context): Promise<string | false> {
const { page, response } = crawlingContext;

// eslint-disable-next-line dot-notation
Expand All @@ -461,11 +466,19 @@ export abstract class BrowserCrawler<
await sleep(5000);

// here we cannot test for response code, because we only have the original response, not the possible Cloudflare redirect on passed challenge.
return this.containsSelectors(page, RETRY_CSS_SELECTORS);
const foundSelectors = await this.containsSelectors(page, RETRY_CSS_SELECTORS);

if (!foundSelectors) return false;
return `Cloudflare challenge failed, found selectors: ${foundSelectors.join(', ')}`;
}

return await this.containsSelectors(page, RETRY_CSS_SELECTORS)
|| blockedStatusCodes.includes(response?.status() ?? 0);
const foundSelectors = await this.containsSelectors(page, RETRY_CSS_SELECTORS);
const blockedStatusCode = blockedStatusCodes.find((x) => x === (response?.status() ?? 0));

if (foundSelectors) return `Found selectors: ${foundSelectors.join(', ')}`;
if (blockedStatusCode) return `Received blocked status code: ${blockedStatusCode}`;

return false;
}

/**
Expand Down Expand Up @@ -534,9 +547,8 @@ export abstract class BrowserCrawler<
}

if (this.retryOnBlocked) {
if (await this.isRequestBlocked(crawlingContext)) {
throw new SessionError();
}
const error = await this.isRequestBlocked(crawlingContext);
if (error) throw new SessionError(error);
}

request.state = RequestState.REQUEST_HANDLER;
Expand Down
13 changes: 9 additions & 4 deletions packages/http-crawler/src/internals/http-crawler.ts
Expand Up @@ -500,8 +500,9 @@ export class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any, H
});
}

if (this.retryOnBlocked && await this.isRequestBlocked(crawlingContext)) {
throw new SessionError();
if (this.retryOnBlocked) {
const error = await this.isRequestBlocked(crawlingContext);
if (error) throw new SessionError(error);
}

request.state = RequestState.REQUEST_HANDLER;
Expand All @@ -518,11 +519,15 @@ export class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any, H
}
}

protected override async isRequestBlocked(crawlingContext: Context) {
/**
* Checks whether the response in the given crawling context looks like it was blocked.
*
* Only responses whose content type is listed in `HTML_AND_XML_MIME_TYPES` are inspected: the body is
* parsed with Cheerio and searched for each of the `RETRY_CSS_SELECTORS`.
*
* @param crawlingContext The crawling context to check.
* @returns A description listing the matching selectors, or `false` when no selector matched
* or the content type is not inspected.
*/
protected override async isRequestBlocked(crawlingContext: Context): Promise<string | false> {
if (HTML_AND_XML_MIME_TYPES.includes(crawlingContext.contentType.type)) {
const $ = await crawlingContext.parseWithCheerio();

return RETRY_CSS_SELECTORS.some((selector) => $(selector).length > 0);
// Collect every matching selector (not just the first) so the returned message names all of them.
const foundSelectors = RETRY_CSS_SELECTORS.filter((selector) => $(selector).length > 0);

if (foundSelectors.length > 0) {
return `Found selectors: ${foundSelectors.join(', ')}`;
}
}
return false;
}
Expand Down

0 comments on commit e19a773

Please sign in to comment.