
feat: retryOnBlocked detects blocked webpage #1956

Merged · 6 commits · Jul 19, 2023
27 changes: 27 additions & 0 deletions packages/basic-crawler/src/internals/basic-crawler.ts
@@ -261,6 +261,15 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
*/
statusMessageLoggingInterval?: number;

/**
* If set to `true`, the crawler will automatically try to bypass any detected bot protection.
*
* Currently supports:
* - [**Cloudflare** Bot Management](https://www.cloudflare.com/products/bot-management/)
* - [**Google Search** Rate Limiting](https://www.google.com/sorry/)
*/
retryOnBlocked?: boolean;

/** @internal */
log?: Log;
}
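For context, a minimal usage sketch of the new option (the crawler class, handler body, and URL are illustrative, not part of this PR):

```ts
import { CheerioCrawler } from 'crawlee';

const crawler = new CheerioCrawler({
    // Retire the session and retry the request whenever known
    // bot-protection markup is detected in the response.
    retryOnBlocked: true,
    async requestHandler({ request, $ }) {
        console.log(`${request.url}: ${$('title').text()}`);
    },
});

await crawler.run(['https://crawlee.dev']);
```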
@@ -387,6 +396,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
protected crawlingContexts = new Map<string, Context>();
protected autoscaledPoolOptions: AutoscaledPoolOptions;
protected events: EventManager;
protected retryOnBlocked: boolean;
private _closeEvents?: boolean;

protected static optionsShape = {
@@ -412,6 +422,8 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
useSessionPool: ow.optional.boolean,
statusMessageLoggingInterval: ow.optional.number,

retryOnBlocked: ow.optional.boolean,

// AutoscaledPool shorthands
minConcurrency: ow.optional.number,
maxConcurrency: ow.optional.number,
@@ -443,6 +455,8 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
maxConcurrency,
maxRequestsPerMinute,

retryOnBlocked = false,

// internal
log = defaultLog.child({ prefix: this.constructor.name }),

@@ -503,6 +517,8 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
newRequestHandlerTimeout = requestHandlerTimeoutSecs * 1000;
}

this.retryOnBlocked = retryOnBlocked;

this._handlePropertyNameChange({
newName: 'requestHandlerTimeoutSecs',
oldName: 'handleRequestTimeoutSecs',
@@ -526,6 +542,13 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
...sessionPoolOptions,
log,
};
if (this.retryOnBlocked) {
this.sessionPoolOptions.blockedStatusCodes = sessionPoolOptions.blockedStatusCodes ?? [];
if (this.sessionPoolOptions.blockedStatusCodes.length !== 0) {
log.warning(`Both 'blockedStatusCodes' and 'retryOnBlocked' are set.
Please note that the 'retryOnBlocked' feature might not work as expected.`);
}
}
this.useSessionPool = useSessionPool;
this.crawlingContexts = new Map();
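The warning a few lines above fires for configurations like the following hypothetical sketch: retryOnBlocked empties the default blockedStatusCodes so that challenge responses (e.g. Cloudflare's 403 pages) can reach the content-based detector, and re-adding status codes can short-circuit that.

```ts
import { CheerioCrawler } from 'crawlee';

const crawler = new CheerioCrawler({
    retryOnBlocked: true,
    sessionPoolOptions: {
        // Retires the session on the status code alone, before the
        // content-based isRequestBlocked check can inspect the page.
        blockedStatusCodes: [403],
    },
    async requestHandler() { /* ... */ },
});
```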

@@ -593,6 +616,10 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
this.autoscaledPoolOptions = { ...autoscaledPoolOptions, ...basicCrawlerAutoscaledPoolConfiguration };
}

protected isRequestBlocked(_crawlingContext: Context) {
throw new Error('the "isRequestBlocked" method is not implemented in this crawler.');
}

private setStatusMessage(message: string, options: SetStatusMessageOptions = {}) {
this.log.debug(`${options.isStatusMessageTerminal ? 'Terminal status message' : 'Status message'}: ${message}`);

21 changes: 21 additions & 0 deletions packages/browser-crawler/src/internals/browser-crawler.ts
@@ -37,6 +37,7 @@ import type {
import { BROWSER_CONTROLLER_EVENTS, BrowserPool } from '@crawlee/browser-pool';
import ow from 'ow';
import type { Cookie as CookieObject } from '@crawlee/types';
import { RETRY_CSS_SELECTORS, sleep } from '@crawlee/utils';
import type { BrowserLaunchContext } from './browser-launcher';

export interface BrowserCrawlingContext<
@@ -437,6 +438,19 @@ export abstract class BrowserCrawler<
}
}

protected override async isRequestBlocked(crawlingContext: Context): Promise<boolean> {
const { page, response } = crawlingContext;

// Cloudflare-specific heuristic - if we get a 403, wait 5 seconds for the JS challenge to load and resolve.
if (response?.status() === 403) {
    await sleep(5000);
}

return (
await Promise.all(RETRY_CSS_SELECTORS.map((selector) => (page as any).$(selector)))
).some((el) => el !== null);
}

/**
* Wrapper around requestHandler that opens and closes pages etc.
*/
@@ -493,6 +507,13 @@
}
}

if (this.retryOnBlocked) {
if (await this.isRequestBlocked(crawlingContext)) {
session?.retire();
throw new Error('Antibot protection detected, the session has been retired.');
    }
}

Review comment (Member):

maybe we should throw RetryRequestError? but that could end up with infinite retries. maybe better to dynamically increase the request.maxRetries instead and have some max, e.g. 10

not sure how easy it is to get around those blocking errors just by picking new session/proxy? it sounds safer to not count this into the retry limit

request.state = RequestState.REQUEST_HANDLER;
try {
await addTimeoutToPromise(
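Since isRequestBlocked is a protected method, a subclass can layer its own heuristics on top of the built-in ones. A hypothetical sketch (the captcha selector is made up):

```ts
import { PlaywrightCrawler } from 'crawlee';
import type { PlaywrightCrawlingContext } from 'crawlee';

class MyCrawler extends PlaywrightCrawler {
    protected override async isRequestBlocked(ctx: PlaywrightCrawlingContext): Promise<boolean> {
        // Run the built-in Cloudflare / Google Search checks first.
        if (await super.isRequestBlocked(ctx)) return true;
        // Illustrative site-specific check for a custom captcha wall.
        return (await ctx.page.$('#custom-captcha-wall')) !== null;
    }
}
```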
1 change: 1 addition & 0 deletions packages/http-crawler/package.json
@@ -57,6 +57,7 @@
"@apify/utilities": "^2.0.0",
"@crawlee/basic": "^3.4.1",
"@crawlee/types": "^3.4.1",
"@crawlee/utils": "^3.4.1",
"@types/content-type": "^1.1.5",
"cheerio": "^1.0.0-rc.12",
"content-type": "^1.0.4",
15 changes: 15 additions & 0 deletions packages/http-crawler/src/internals/http-crawler.ts
@@ -22,6 +22,7 @@ import {
Configuration,
RequestState,
} from '@crawlee/basic';
import { RETRY_CSS_SELECTORS } from '@crawlee/utils';
import type { Awaitable, Dictionary } from '@crawlee/types';
import type { RequestLike, ResponseLike } from 'content-type';
import * as cheerio from 'cheerio';
@@ -464,6 +465,11 @@ export class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any, H
});
}

if (this.retryOnBlocked && await this.isRequestBlocked(crawlingContext)) {
crawlingContext.session?.retire();
throw new Error('Antibot protection detected, the session has been retired.');
}

request.state = RequestState.REQUEST_HANDLER;
try {
await addTimeoutToPromise(
@@ -478,6 +484,15 @@ export class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any, H
}
}

protected override async isRequestBlocked(crawlingContext: Context) {
if (HTML_AND_XML_MIME_TYPES.includes(crawlingContext.contentType.type)) {
const $ = await crawlingContext.parseWithCheerio();

return RETRY_CSS_SELECTORS.some((selector) => $(selector).length > 0);
}
return false;
}

protected async _handleNavigation(crawlingContext: Context) {
const gotOptions = {} as OptionsInit;
const { request, session } = crawlingContext;
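To see what the HTTP-side detection matches, the same check can be run standalone; a small sketch using cheerio (the HTML snippet is made up):

```ts
import * as cheerio from 'cheerio';
import { RETRY_CSS_SELECTORS } from '@crawlee/utils';

const html = `
  <html><body>
    <iframe src="https://challenges.cloudflare.com/cdn-cgi/challenge-platform"></iframe>
  </body></html>`;

const $ = cheerio.load(html);
// Mirrors HttpCrawler.isRequestBlocked: any selector hit means "blocked".
const blocked = RETRY_CSS_SELECTORS.some((selector) => $(selector).length > 0);
console.log(blocked); // true - the Cloudflare challenge iframe matches
```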
1 change: 1 addition & 0 deletions packages/utils/src/index.ts
@@ -1,3 +1,4 @@
export * from './internals/blocked';
export * from './internals/cheerio';
export * from './internals/chunk';
export * from './internals/extract-urls';
7 changes: 7 additions & 0 deletions packages/utils/src/internals/blocked.ts
@@ -0,0 +1,7 @@
/**
* CSS selectors for elements that should trigger a retry, as the crawler is likely getting blocked.
*/
export const RETRY_CSS_SELECTORS = [
    // Cloudflare's JS-challenge / Turnstile iframe.
    'iframe[src^="https://challenges.cloudflare.com"]',
    // Terms-of-service link shown on Google Search's /sorry/ rate-limit interstitial.
    'div#infoDiv0 a[href*="//www.google.com/policies/terms/"]',
];