feat: retryOnBlocked detects blocked webpage (#1956)
barjin committed Jul 19, 2023
1 parent 8d68d0b commit 766fa9b
Showing 7 changed files with 245 additions and 177 deletions.
27 changes: 27 additions & 0 deletions packages/basic-crawler/src/internals/basic-crawler.ts
@@ -261,6 +261,15 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
*/
statusMessageLoggingInterval?: number;

/**
* If set to `true`, the crawler will automatically try to bypass any detected bot protection.
*
* Currently supports:
* - [**Cloudflare** Bot Management](https://www.cloudflare.com/products/bot-management/)
* - [**Google Search** Rate Limiting](https://www.google.com/sorry/)
*/
retryOnBlocked?: boolean;

/** @internal */
log?: Log;
}
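
For reference, a minimal usage sketch of the new option; the crawler class and handler below are illustrative, not part of this diff:

```ts
import { CheerioCrawler } from 'crawlee';

// Any crawler built on BasicCrawler accepts the option.
const crawler = new CheerioCrawler({
    // Retire the session and retry the request when bot protection is detected.
    retryOnBlocked: true,
    async requestHandler({ request, $ }) {
        console.log(`${request.url}: ${$('title').text()}`);
    },
});

await crawler.run(['https://crawlee.dev']);
```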
@@ -387,6 +396,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
protected crawlingContexts = new Map<string, Context>();
protected autoscaledPoolOptions: AutoscaledPoolOptions;
protected events: EventManager;
protected retryOnBlocked: boolean;
private _closeEvents?: boolean;

protected static optionsShape = {
@@ -412,6 +422,8 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
useSessionPool: ow.optional.boolean,
statusMessageLoggingInterval: ow.optional.number,

retryOnBlocked: ow.optional.boolean,

// AutoscaledPool shorthands
minConcurrency: ow.optional.number,
maxConcurrency: ow.optional.number,
@@ -443,6 +455,8 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
maxConcurrency,
maxRequestsPerMinute,

retryOnBlocked = false,

// internal
log = defaultLog.child({ prefix: this.constructor.name }),

@@ -503,6 +517,8 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
newRequestHandlerTimeout = requestHandlerTimeoutSecs * 1000;
}

this.retryOnBlocked = retryOnBlocked;

this._handlePropertyNameChange({
newName: 'requestHandlerTimeoutSecs',
oldName: 'handleRequestTimeoutSecs',
@@ -526,6 +542,13 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
...sessionPoolOptions,
log,
};
if (this.retryOnBlocked) {
this.sessionPoolOptions.blockedStatusCodes = sessionPoolOptions.blockedStatusCodes ?? [];
if (this.sessionPoolOptions.blockedStatusCodes.length !== 0) {
log.warning(`Both 'blockedStatusCodes' and 'retryOnBlocked' are set.
Please note that the 'retryOnBlocked' feature might not work as expected.`);
}
}
this.useSessionPool = useSessionPool;
this.crawlingContexts = new Map();
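
A sketch of the configuration that triggers the warning above; with `retryOnBlocked` enabled the crawler wants to inspect blocked responses itself, so explicitly set `blockedStatusCodes` may conflict (values here are illustrative):

```ts
const crawler = new CheerioCrawler({
    retryOnBlocked: true,
    sessionPoolOptions: {
        // Non-empty blockedStatusCodes alongside retryOnBlocked logs the warning,
        // since the session pool would retire sessions on these status codes before
        // the crawler's own blocking detection ever sees the response.
        blockedStatusCodes: [403],
    },
    async requestHandler() {},
});
```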

@@ -593,6 +616,10 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
this.autoscaledPoolOptions = { ...autoscaledPoolOptions, ...basicCrawlerAutoscaledPoolConfiguration };
}

protected isRequestBlocked(_crawlingContext: Context) {
throw new Error('the "isRequestBlocked" method is not implemented in this crawler.');
}

private setStatusMessage(message: string, options: SetStatusMessageOptions = {}) {
this.log.debug(`${options.isStatusMessageTerminal ? 'Terminal status message' : 'Status message'}: ${message}`);

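
The base implementation above only declares the hook and throws, so each concrete crawler supplies its own detection. A hypothetical subclass would override it roughly like this (the class and check are illustrative, not part of this diff):

```ts
import { BasicCrawler } from '@crawlee/basic';
import type { BasicCrawlingContext } from '@crawlee/basic';

// Hypothetical sketch of the contract - compare the real browser and HTTP
// implementations further down in this diff.
class MyCrawler extends BasicCrawler {
    protected override async isRequestBlocked(_crawlingContext: BasicCrawlingContext): Promise<boolean> {
        // e.g. look for a captcha marker in the fetched body
        return false;
    }
}
```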
21 changes: 21 additions & 0 deletions packages/browser-crawler/src/internals/browser-crawler.ts
@@ -37,6 +37,7 @@ import type {
import { BROWSER_CONTROLLER_EVENTS, BrowserPool } from '@crawlee/browser-pool';
import ow from 'ow';
import type { Cookie as CookieObject } from '@crawlee/types';
import { RETRY_CSS_SELECTORS, sleep } from '@crawlee/utils';
import type { BrowserLaunchContext } from './browser-launcher';

export interface BrowserCrawlingContext<
@@ -437,6 +438,19 @@ export abstract class BrowserCrawler<
}
}

protected override async isRequestBlocked(crawlingContext: Context): Promise<boolean> {
const { page, response } = crawlingContext;

// Cloudflare-specific heuristic: on a 403, wait 5 seconds for the JS challenge to load and resolve.
if (response?.status() === 403) {
await sleep(5000);
}

return (
await Promise.all(RETRY_CSS_SELECTORS.map((selector) => (page as any).$(selector)))
).some((el) => el !== null);
}

/**
* Wrapper around requestHandler that opens and closes pages etc.
*/
@@ -493,6 +507,13 @@ export abstract class BrowserCrawler<
}
}

if (this.retryOnBlocked) {
if (await this.isRequestBlocked(crawlingContext)) {
session?.retire();
throw new Error('Antibot protection detected, the session has been retired.');
}
}

request.state = RequestState.REQUEST_HANDLER;
try {
await addTimeoutToPromise(
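
The selector sweep in `isRequestBlocked` above can be reproduced outside the crawler; a standalone sketch assuming a Playwright-driven browser (the function name is illustrative):

```ts
import { chromium } from 'playwright';
import { RETRY_CSS_SELECTORS } from '@crawlee/utils';

// Illustrative standalone version of the heuristic above: the page counts as
// blocked if any known challenge selector matches at least one element.
async function looksBlocked(url: string): Promise<boolean> {
    const browser = await chromium.launch();
    try {
        const page = await browser.newPage();
        await page.goto(url);
        const matches = await Promise.all(RETRY_CSS_SELECTORS.map((selector) => page.$(selector)));
        return matches.some((el) => el !== null);
    } finally {
        await browser.close();
    }
}
```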
1 change: 1 addition & 0 deletions packages/http-crawler/package.json
@@ -57,6 +57,7 @@
"@apify/utilities": "^2.0.0",
"@crawlee/basic": "^3.4.1",
"@crawlee/types": "^3.4.1",
"@crawlee/utils": "^3.4.1",
"@types/content-type": "^1.1.5",
"cheerio": "^1.0.0-rc.12",
"content-type": "^1.0.4",
15 changes: 15 additions & 0 deletions packages/http-crawler/src/internals/http-crawler.ts
@@ -22,6 +22,7 @@ import {
Configuration,
RequestState,
} from '@crawlee/basic';
import { RETRY_CSS_SELECTORS } from '@crawlee/utils';
import type { Awaitable, Dictionary } from '@crawlee/types';
import type { RequestLike, ResponseLike } from 'content-type';
import * as cheerio from 'cheerio';
@@ -464,6 +465,11 @@ export class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any, H
});
}

if (this.retryOnBlocked && await this.isRequestBlocked(crawlingContext)) {
crawlingContext.session?.retire();
throw new Error('Antibot protection detected, the session has been retired.');
}

request.state = RequestState.REQUEST_HANDLER;
try {
await addTimeoutToPromise(
@@ -478,6 +484,15 @@ export class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any, H
}
}

protected override async isRequestBlocked(crawlingContext: Context) {
if (HTML_AND_XML_MIME_TYPES.includes(crawlingContext.contentType.type)) {
const $ = await crawlingContext.parseWithCheerio();

return RETRY_CSS_SELECTORS.some((selector) => $(selector).length > 0);
}
return false;
}

protected async _handleNavigation(crawlingContext: Context) {
const gotOptions = {} as OptionsInit;
const { request, session } = crawlingContext;
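
For the static-HTML path, the check in `isRequestBlocked` above reduces to a plain cheerio lookup; a minimal sketch (the helper name is illustrative):

```ts
import * as cheerio from 'cheerio';
import { RETRY_CSS_SELECTORS } from '@crawlee/utils';

// Illustrative re-implementation of the check above for a raw HTML body.
function bodyLooksBlocked(html: string): boolean {
    const $ = cheerio.load(html);
    return RETRY_CSS_SELECTORS.some((selector) => $(selector).length > 0);
}
```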
1 change: 1 addition & 0 deletions packages/utils/src/index.ts
@@ -1,3 +1,4 @@
export * from './internals/blocked';
export * from './internals/cheerio';
export * from './internals/chunk';
export * from './internals/extract-urls';
7 changes: 7 additions & 0 deletions packages/utils/src/internals/blocked.ts
@@ -0,0 +1,7 @@
/**
* CSS selectors for elements that should trigger a retry, as the crawler is likely getting blocked.
*/
export const RETRY_CSS_SELECTORS = [
'iframe[src^="https://challenges.cloudflare.com"]',
'div#infoDiv0 a[href*="//www.google.com/policies/terms/"]',
];
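
The first selector targets the Cloudflare challenge iframe, the second the terms-of-service link on Google's rate-limit ("sorry") interstitial. A quick sanity check against synthetic markup (the snippet is illustrative, not a captured page):

```ts
import * as cheerio from 'cheerio';
import { RETRY_CSS_SELECTORS } from '@crawlee/utils';

const $ = cheerio.load('<iframe src="https://challenges.cloudflare.com/x"></iframe>');
console.log(RETRY_CSS_SELECTORS.some((selector) => $(selector).length > 0)); // true
```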
