Skip to content

Commit

Permalink
feat: initial retryOnBlocked implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
barjin committed Jun 26, 2023
1 parent 1844749 commit f571217
Show file tree
Hide file tree
Showing 5 changed files with 51 additions and 0 deletions.
11 changes: 11 additions & 0 deletions packages/basic-crawler/src/internals/basic-crawler.ts
Expand Up @@ -261,6 +261,11 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
*/
statusMessageLoggingInterval?: number;

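/**
* If set to `true`, the crawler checks every loaded page for known bot-protection (anti-bot challenge) markers.
* When such a marker is detected, the current session is retired and the request fails with an error
* before the request handler runs, instead of the blocked page being processed as a regular result.
*/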
retryOnBlocked?: boolean;

/** @internal */
log?: Log;
}
Expand Down Expand Up @@ -412,6 +417,8 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
useSessionPool: ow.optional.boolean,
statusMessageLoggingInterval: ow.optional.number,

retryOnBlocked: ow.optional.boolean,

// AutoscaledPool shorthands
minConcurrency: ow.optional.number,
maxConcurrency: ow.optional.number,
Expand Down Expand Up @@ -591,6 +598,10 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
this.autoscaledPoolOptions = { ...autoscaledPoolOptions, ...basicCrawlerAutoscaledPoolConfiguration };
}

/**
 * Reports whether the response held in `crawlingContext` looks like an anti-bot block page.
 *
 * This base implementation is only a placeholder: it always throws, and crawlers that
 * support blocked-page detection are expected to override it.
 *
 * @param crawlingContext Context of the request that is currently being processed.
 * @returns Never returns from this implementation; overrides resolve to `true` when a block is detected.
 * @throws {Error} Always, because the check is not implemented at this level.
 */
protected isGettingBlocked(crawlingContext: Context): Promise<boolean> {
    // Interpolating the context object itself would only print "[object Object]",
    // so the failing request is identified by its URL instead.
    throw new Error(`isGettingBlocked - method not implemented in this Crawler.\n ${crawlingContext.request.url}`);
}

private setStatusMessage(message: string, options: SetStatusMessageOptions = {}) {
this.log.debug(`${options.isStatusMessageTerminal ? 'Terminal status message' : 'Status message'}: ${message}`);

Expand Down
23 changes: 23 additions & 0 deletions packages/browser-crawler/src/internals/browser-crawler.ts
Expand Up @@ -37,6 +37,7 @@ import type {
import { BROWSER_CONTROLLER_EVENTS, BrowserPool } from '@crawlee/browser-pool';
import ow from 'ow';
import type { Cookie as CookieObject } from '@crawlee/types';
import { blockedSelectors, sleep } from '@crawlee/utils';
import type { BrowserLaunchContext } from './browser-launcher';

export interface BrowserCrawlingContext<
Expand Down Expand Up @@ -312,6 +313,7 @@ export abstract class BrowserCrawler<
protected requestHandlerTimeoutInnerMillis: number;
protected preNavigationHooks: BrowserHook<Context>[];
protected postNavigationHooks: BrowserHook<Context>[];
protected retryOnBlocked: boolean;
protected persistCookiesPerSession: boolean;

protected static override optionsShape = {
Expand Down Expand Up @@ -345,6 +347,7 @@ export abstract class BrowserCrawler<
browserPoolOptions,
preNavigationHooks = [],
postNavigationHooks = [],
retryOnBlocked = false,
// Ignored
handleRequestFunction,

Expand Down Expand Up @@ -396,6 +399,7 @@ export abstract class BrowserCrawler<
this.proxyConfiguration = proxyConfiguration;
this.preNavigationHooks = preNavigationHooks;
this.postNavigationHooks = postNavigationHooks;
this.retryOnBlocked = retryOnBlocked;

if (headless != null) {
this.launchContext.launchOptions ??= {} as LaunchOptions;
Expand Down Expand Up @@ -437,6 +441,18 @@ export abstract class BrowserCrawler<
}
}

/**
 * Checks whether the page loaded for this request shows a known anti-bot challenge.
 *
 * @param crawlingContext Context providing the `page` and the navigation `response` to inspect.
 * @returns `true` when at least one of `blockedSelectors` matches an element on the page, `false` otherwise.
 */
protected override async isGettingBlocked(crawlingContext: Context): Promise<boolean> {
    const { page, response } = crawlingContext;
    const forbiddenStatusCode = 403;
    const challengeWaitMillis = 5000;

    // On a 403 response, pause before querying the DOM.
    // NOTE(review): presumably this gives a JavaScript challenge time to render — confirm.
    if (response?.status() === forbiddenStatusCode) {
        await sleep(challengeWaitMillis);
    }

    // NOTE(review): the `any` cast suggests `$` is not part of the declared page type — confirm.
    // All selectors are queried concurrently; a `null` result means "no matching element".
    const matches = await Promise.all(
        blockedSelectors.map((selector) => (page as any).$(selector)),
    );

    return matches.some((el) => el !== null);
}

/**
* Wrapper around requestHandler that opens and closes pages etc.
*/
Expand Down Expand Up @@ -493,6 +509,13 @@ export abstract class BrowserCrawler<
}
}

if (this.retryOnBlocked) {
if (await this.isGettingBlocked(crawlingContext)) {
session?.retire();
throw new Error('Antibot protection detected, the session has been retired.');
}
}

request.state = RequestState.REQUEST_HANDLER;
try {
await addTimeoutToPromise(
Expand Down
12 changes: 12 additions & 0 deletions packages/http-crawler/src/internals/http-crawler.ts
Expand Up @@ -22,6 +22,7 @@ import {
Configuration,
RequestState,
} from '@crawlee/basic';
import { blockedSelectors } from '@crawlee/utils';
import type { Awaitable, Dictionary } from '@crawlee/types';
import type { RequestLike, ResponseLike } from 'content-type';
import * as cheerio from 'cheerio';
Expand Down Expand Up @@ -464,6 +465,11 @@ export class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any, H
});
}

if (await this.isGettingBlocked(crawlingContext)) {
crawlingContext.session?.retire();
throw new Error('Antibot protection detected, the session has been retired.');
}

request.state = RequestState.REQUEST_HANDLER;
try {
await addTimeoutToPromise(
Expand All @@ -478,6 +484,12 @@ export class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any, H
}
}

/**
 * Checks the downloaded document for elements that indicate an anti-bot challenge.
 *
 * @param crawlingContext Context whose `parseWithCheerio()` provides the parsed response body.
 * @returns `true` as soon as one of `blockedSelectors` matches at least one element, `false` otherwise.
 */
protected override async isGettingBlocked(crawlingContext: Context) {
    const $ = await crawlingContext.parseWithCheerio();

    for (const selector of blockedSelectors) {
        // The first matching selector is enough to treat the response as blocked.
        if ($(selector).length > 0) {
            return true;
        }
    }

    return false;
}

protected async _handleNavigation(crawlingContext: Context) {
const gotOptions = {} as OptionsInit;
const { request, session } = crawlingContext;
Expand Down
1 change: 1 addition & 0 deletions packages/utils/src/index.ts
@@ -1,3 +1,4 @@
export * from './internals/blocked';
export * from './internals/cheerio';
export * from './internals/chunk';
export * from './internals/extract-urls';
Expand Down
4 changes: 4 additions & 0 deletions packages/utils/src/internals/blocked.ts
@@ -0,0 +1,4 @@
/** Matches an `<iframe>` whose `src` starts with the Cloudflare challenges origin. */
const cloudflareChallengeFrame = 'iframe[src^="https://challenges.cloudflare.com"]';

/**
 * Matches a link to Google's policies/terms page nested inside `div#infoDiv0`.
 * NOTE(review): presumably this targets a Google block/captcha interstitial — confirm.
 */
const googlePoliciesTermsLink = 'div#infoDiv0 a[href*="//www.google.com/policies/terms/"]';

/**
 * CSS selectors whose presence in a loaded page is treated as a sign of anti-bot blocking.
 */
export const blockedSelectors = [
    cloudflareChallengeFrame,
    googlePoliciesTermsLink,
];

0 comments on commit f571217

Please sign in to comment.