feat: retire session on proxy error (#2002)
When Crawlee encounters a proxy-related error during navigation, it now retires the
current session and retries the request with another session (and therefore another proxy).

closes #1912
barjin committed Jul 26, 2023
1 parent 6c79dd0 commit 8c0928b
Showing 8 changed files with 221 additions and 54 deletions.
43 changes: 42 additions & 1 deletion packages/basic-crawler/src/internals/basic-crawler.ts
@@ -39,8 +39,10 @@ import {
purgeDefaultStorages,
validators,
RetryRequestError,
SessionError,
} from '@crawlee/core';
import type { Dictionary, Awaitable, BatchAddRequestsResult, SetStatusMessageOptions } from '@crawlee/types';
import { ROTATE_PROXY_ERRORS } from '@crawlee/utils';
import type { Method, OptionsInit } from 'got-scraping';
import { gotScraping } from 'got-scraping';
import ow, { ArgumentError } from 'ow';
@@ -217,6 +219,15 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
*/
maxRequestRetries?: number;

/**
* Maximum number of session rotations per request.
* The crawler will automatically rotate the session in case of a proxy error or if it gets blocked by the website.
*
* The session rotations are not counted towards the {@apilink BasicCrawlerOptions.maxRequestRetries|`maxRequestRetries`} limit.
* @default 10
*/
maxSessionRotations?: number;

/**
* Maximum number of pages that the crawler will open. The crawl will stop when this limit is reached.
* This value should always be set in order to prevent infinite loops in misconfigured crawlers.
@@ -423,6 +434,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
protected requestHandlerTimeoutMillis!: number;
protected internalTimeoutMillis: number;
protected maxRequestRetries: number;
protected maxSessionRotations: number;
protected handledRequestsCount: number;
protected statusMessageLoggingInterval: number;
protected statusMessageCallback?: StatusMessageCallback;
@@ -451,6 +463,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
// TODO: remove in a future release
handleFailedRequestFunction: ow.optional.function,
maxRequestRetries: ow.optional.number,
maxSessionRotations: ow.optional.number,
maxRequestsPerCrawl: ow.optional.number,
autoscaledPoolOptions: ow.optional.object,
sessionPoolOptions: ow.optional.object,
@@ -481,6 +494,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
requestList,
requestQueue,
maxRequestRetries = 3,
maxSessionRotations = 10,
maxRequestsPerCrawl,
autoscaledPoolOptions = {},
keepAlive,
@@ -575,6 +589,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
}

this.maxRequestRetries = maxRequestRetries;
this.maxSessionRotations = maxSessionRotations;
this.handledRequestsCount = 0;
this.stats = new Statistics({ logMessage: `${log.getOptions().prefix} request statistics:`, config });
this.sessionPoolOptions = {
@@ -655,6 +670,16 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
this.autoscaledPoolOptions = { ...autoscaledPoolOptions, ...basicCrawlerAutoscaledPoolConfiguration };
}

/**
* Checks if the given error is a proxy error by comparing its message to a list of known proxy error messages.
* Used for retrying requests that failed due to proxy errors.
*
* @param error The error to check.
*/
protected isProxyError(error: Error): boolean {
return ROTATE_PROXY_ERRORS.some((x: string) => (this._getMessageFromError(error) as any)?.includes(x));
}

protected isRequestBlocked(_crawlingContext: Context) {
throw new Error('the "isRequestBlocked" method is not implemented in this crawler.');
}
@@ -1115,6 +1140,18 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
return isRequestListFinished && isRequestQueueFinished;
}

private async _rotateSession(crawlingContext: Context) {
const { request } = crawlingContext;

if ((request.sessionRotationCount ?? 0) >= this.maxSessionRotations) {
throw new Error(`Request failed because of proxy-related errors ${request.sessionRotationCount} times. `
+ 'This might be caused by a misconfigured proxy or an invalid session pool configuration.');
}
request.sessionRotationCount ??= 0;
request.sessionRotationCount++;
crawlingContext.session?.retire();
}

/**
* Handles errors thrown by user provided requestHandler()
*/
@@ -1135,7 +1172,11 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
if (shouldRetryRequest) {
this.stats.errorTrackerRetry.add(error);

await this._tagUserHandlerError(() => this.errorHandler?.(this._augmentContextWithDeprecatedError(crawlingContext, error), error));
if (error instanceof SessionError) {
await this._rotateSession(crawlingContext);
} else {
await this._tagUserHandlerError(() => this.errorHandler?.(this._augmentContextWithDeprecatedError(crawlingContext, error), error));
}

if (!request.noRetry) {
request.retryCount++;
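For context on how the pieces above fit together from a user's perspective, here is a minimal sketch of configuring the new limit; the crawler class, proxy URL, and handler body are illustrative assumptions, not part of this commit:

```ts
import { CheerioCrawler, ProxyConfiguration } from 'crawlee';

const crawler = new CheerioCrawler({
    // Hypothetical proxy; any proxy error it causes counts towards maxSessionRotations.
    proxyConfiguration: new ProxyConfiguration({ proxyUrls: ['http://my.proxy:8000'] }),
    maxRequestRetries: 3, // ordinary errors: retried up to 3 times
    maxSessionRotations: 5, // proxy/session errors: up to 5 fresh sessions, counted separately
    async requestHandler({ request, log }) {
        log.info(`Processing ${request.url}`);
    },
});

await crawler.run(['https://example.com']);
```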
15 changes: 13 additions & 2 deletions packages/browser-crawler/src/internals/browser-crawler.ts
@@ -24,6 +24,7 @@ import {
RequestState,
resolveBaseUrlForEnqueueLinksFiltering,
validators,
SessionError,
} from '@crawlee/basic';
import type {
BrowserController,
@@ -510,8 +511,7 @@ export abstract class BrowserCrawler<

if (this.retryOnBlocked) {
if (await this.isRequestBlocked(crawlingContext)) {
session?.retire();
throw new Error('Antibot protection detected, the session has been retired.');
throw new SessionError();
}
}

@@ -581,6 +581,8 @@ export abstract class BrowserCrawler<
await this._handleNavigationTimeout(crawlingContext, error as Error);

crawlingContext.request.state = RequestState.ERROR;

this._throwIfProxyError(error as Error);
throw error;
}
tryCancel();
@@ -619,6 +621,15 @@ export abstract class BrowserCrawler<
await crawlingContext.page.close();
}

/**
* Transforms proxy-related errors to `SessionError`.
*/
protected _throwIfProxyError(error: Error) {
if (this.isProxyError(error)) {
throw new SessionError(error.message);
}
}

protected abstract _navigationHandler(crawlingContext: Context, gotoOptions: GoToOptions): Promise<Context['response'] | null | undefined>;

/**
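Putting the two hunks above together, the navigation wrapper's catch branch now reclassifies matching errors before rethrowing. A condensed sketch of that flow, not the verbatim implementation:

```ts
try {
    crawlingContext.response = await this._navigationHandler(crawlingContext, gotoOptions);
} catch (error) {
    await this._handleNavigationTimeout(crawlingContext, error as Error);
    crawlingContext.request.state = RequestState.ERROR;

    // Rethrows as SessionError when the message matches a known proxy failure,
    // which BasicCrawler turns into a session rotation rather than a normal retry.
    this._throwIfProxyError(error as Error);
    throw error;
}
```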
11 changes: 11 additions & 0 deletions packages/core/src/errors.ts
@@ -23,3 +23,14 @@ export class RetryRequestError extends Error {
super(message ?? "Request is being retried at the user's request");
}
}

/**
* Errors of `SessionError` type will trigger a session rotation.
*
* This error doesn't respect the `maxRequestRetries` option and has a separate limit of `maxSessionRotations`.
*/
export class SessionError extends RetryRequestError {
constructor(message?: string) {
super(message ?? 'Detected a session error, rotating session...');
}
}
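Since `SessionError` is exported from `@crawlee/core`, user code can raise it too, for example to treat a particular status code as a burned session. A small illustrative sketch; the 403 heuristic is an assumption, not something this commit prescribes:

```ts
import { SessionError } from '@crawlee/core';

// Hypothetical helper for a requestHandler: treat HTTP 403 as a blocked session.
function throwIfBlocked(statusCode: number): void {
    if (statusCode === 403) {
        // Retires the current session and retries the request with a fresh one;
        // counted against maxSessionRotations, not maxRequestRetries.
        throw new SessionError('Got HTTP 403, assuming the session is blocked');
    }
}

throwIfBlocked(200); // no-op
// throwIfBlocked(403); // would throw SessionError and trigger a rotation
```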
15 changes: 14 additions & 1 deletion packages/core/src/request.ts
@@ -22,6 +22,7 @@ const requestOptionalPredicates = {
payload: ow.optional.any(ow.string, ow.buffer),
noRetry: ow.optional.boolean,
retryCount: ow.optional.number,
sessionRotationCount: ow.optional.number,
maxRetries: ow.optional.number,
errorMessages: ow.optional.array.ofType(ow.string),
headers: ow.optional.object,
@@ -107,6 +108,9 @@ export class Request<UserData extends Dictionary = Dictionary> {
/** The `true` value indicates that the request will not be automatically retried on error. */
noRetry: boolean;

/** Indicates the number of times the crawling of the request has rotated the session due to a session or a proxy error. */
sessionRotationCount?: number;

/** Indicates the number of times the crawling of the request has been retried on error. */
retryCount: number;

@@ -159,6 +163,7 @@ export class Request<UserData extends Dictionary = Dictionary> {
payload,
noRetry = false,
retryCount = 0,
sessionRotationCount = 0,
maxRetries,
errorMessages = [],
headers = {},
@@ -168,7 +173,14 @@ export class Request<UserData extends Dictionary = Dictionary> {
keepUrlFragment = false,
useExtendedUniqueKey = false,
skipNavigation,
} = options as RequestOptions & { loadedUrl?: string; retryCount?: number; maxRetries?: number; errorMessages?: string[]; handledAt?: string | Date };
} = options as RequestOptions & {
loadedUrl?: string;
retryCount?: number;
sessionRotationCount?: number;
maxRetries?: number;
errorMessages?: string[];
handledAt?: string | Date;
};

let {
method = 'GET',
@@ -186,6 +198,7 @@ export class Request<UserData extends Dictionary = Dictionary> {
this.payload = payload;
this.noRetry = noRetry;
this.retryCount = retryCount;
this.sessionRotationCount = sessionRotationCount;
this.errorMessages = [...errorMessages];
this.headers = { ...headers };
this.handledAt = handledAt as unknown instanceof Date ? (handledAt as Date).toISOString() : handledAt!;
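The counter lives on the `Request` object itself, so it persists across retries of the same request. A short sketch of the bookkeeping that `_rotateSession()` performs, using the public `Request` class:

```ts
import { Request } from '@crawlee/core';

const request = new Request({ url: 'https://example.com' });

// Mirrors BasicCrawler._rotateSession(): lazy-initialize, then increment.
request.sessionRotationCount ??= 0;
request.sessionRotationCount++;

console.log(request.sessionRotationCount); // 1
```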
14 changes: 9 additions & 5 deletions packages/http-crawler/src/internals/http-crawler.ts
@@ -25,6 +25,7 @@ import {
validators,
Configuration,
RequestState,
SessionError,
} from '@crawlee/basic';
import type { Awaitable, Dictionary } from '@crawlee/types';
import { RETRY_CSS_SELECTORS } from '@crawlee/utils';
@@ -446,11 +447,11 @@ export class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any, H
crawlingContext.parseWithCheerio ??= async () => cheerio.load(parsed.body!.toString());

if (this.useSessionPool) {
this._throwOnBlockedRequest(session!, response.statusCode!);
this._throwOnBlockedRequest(crawlingContext.session!, response.statusCode!);
}

if (this.persistCookiesPerSession) {
session!.setCookiesFromResponse(response);
crawlingContext.session!.setCookiesFromResponse(response);
}

request.loadedUrl = response.url;
@@ -467,8 +468,7 @@ export class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any, H
}

if (this.retryOnBlocked && await this.isRequestBlocked(crawlingContext)) {
crawlingContext.session?.retire();
throw new Error('Antibot protection detected, the session has been retired.');
throw new SessionError();
}

request.state = RequestState.REQUEST_HANDLER;
@@ -596,7 +596,11 @@ export class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any, H
return undefined as unknown as PlainResponse;
}

throw e;
if (this.isProxyError(e as Error)) {
throw new SessionError();
} else {
throw e;
}
}
}

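The plain-HTTP path gets the same reclassification: low-level errors from `got-scraping` (e.g. `ECONNRESET`) surface as `SessionError`s. A standalone approximation of that branch, assuming only the exports shown in this commit:

```ts
import { gotScraping, type OptionsInit } from 'got-scraping';
import { SessionError } from '@crawlee/core';
import { ROTATE_PROXY_ERRORS } from '@crawlee/utils';

// Simplified stand-in for HttpCrawler's internal request wrapper.
async function requestWithSessionErrors(options: OptionsInit) {
    try {
        return await gotScraping(options);
    } catch (e) {
        const message = (e as Error).message ?? '';
        if (ROTATE_PROXY_ERRORS.some((snippet) => message.includes(snippet))) {
            throw new SessionError(); // upstream, this triggers a session rotation
        }
        throw e;
    }
}
```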
11 changes: 11 additions & 0 deletions packages/utils/src/internals/blocked.ts
@@ -5,3 +5,14 @@ export const RETRY_CSS_SELECTORS = [
'iframe[src^="https://challenges.cloudflare.com"]',
'div#infoDiv0 a[href*="//www.google.com/policies/terms/"]',
];

/**
* Substrings of proxy error messages that should trigger a session rotation, as the proxy is likely blocked or malfunctioning.
*/
export const ROTATE_PROXY_ERRORS = [
'ECONNRESET',
'ECONNREFUSED',
'ERR_PROXY_CONNECTION_FAILED',
'ERR_TUNNEL_CONNECTION_FAILED',
'Proxy responded with',
];
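A direct way to see how these strings are used: `isProxyError()` in `BasicCrawler` simply checks whether any of them occurs in the error message. An approximate standalone equivalent:

```ts
import { ROTATE_PROXY_ERRORS } from '@crawlee/utils';

// Approximation of BasicCrawler.isProxyError().
const isProxyError = (error: Error): boolean =>
    ROTATE_PROXY_ERRORS.some((snippet) => error.message.includes(snippet));

console.log(isProxyError(new Error('ERR_PROXY_CONNECTION_FAILED at http://localhost'))); // true
console.log(isProxyError(new Error('Some unrelated failure'))); // false
```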
90 changes: 66 additions & 24 deletions test/core/crawlers/browser_crawler.test.ts
@@ -786,7 +786,73 @@ describe('BrowserCrawler', () => {

delete process.env[ENV_VARS.PROXY_PASSWORD];
});

test('proxy rotation on error works as expected', async () => {
const goodProxyUrl = 'http://good.proxy';
const proxyConfiguration = new ProxyConfiguration({ proxyUrls: ['http://localhost', 'http://localhost:1234', goodProxyUrl] });

const browserCrawler = new class extends BrowserCrawlerTest {
protected override async _navigationHandler(ctx: PuppeteerCrawlingContext): Promise<HTTPResponse | null | undefined> {
const { session } = ctx;
const proxyInfo = await this.proxyConfiguration.newProxyInfo(session?.id);

if (proxyInfo.url !== goodProxyUrl) {
throw new Error('ERR_PROXY_CONNECTION_FAILED');
}

return null;
}
}({
browserPoolOptions: {
browserPlugins: [puppeteerPlugin],
},
requestList,
maxRequestRetries: 0,
maxConcurrency: 1,
useSessionPool: true,
proxyConfiguration,
requestHandler: async () => {},
});

await expect(browserCrawler.run()).resolves.not.toThrow();
});

test('proxy rotation on error stops after maxSessionRotations limit', async () => {
const proxyConfiguration = new ProxyConfiguration({ proxyUrls: ['http://localhost', 'http://localhost:1234'] });

/**
* The first increment is the base case when the proxy is retrieved for the first time.
*/
let numberOfRotations = -1;
const browserCrawler = new class extends BrowserCrawlerTest {
protected override async _navigationHandler(ctx: PuppeteerCrawlingContext): Promise<HTTPResponse | null | undefined> {
const { session } = ctx;
const proxyInfo = await this.proxyConfiguration.newProxyInfo(session?.id);

numberOfRotations++;

if (proxyInfo.url.includes('localhost')) {
throw new Error('ERR_PROXY_CONNECTION_FAILED');
}

return null;
}
}({
browserPoolOptions: {
browserPlugins: [puppeteerPlugin],
},
requestList,
maxSessionRotations: 5,
maxConcurrency: 1,
proxyConfiguration,
requestHandler: async () => {},
});

await expect(browserCrawler.run()).rejects.toThrow();
expect(numberOfRotations).toBe(5);
});
});

describe('Crawling context', () => {
const sources = ['http://example.com/'];
let requestList: RequestList;
@@ -855,29 +921,5 @@ describe('BrowserCrawler', () => {

await browserCrawler.run();
});

test('failedRequestHandler contains proxyInfo', async () => {
const proxyConfiguration = new ProxyConfiguration({ proxyUrls: ['http://localhost'] });

const browserCrawler = new BrowserCrawlerTest({
browserPoolOptions: {
browserPlugins: [puppeteerPlugin],
},
requestList,
maxRequestRetries: 0,
maxConcurrency: 1,
useSessionPool: true,
proxyConfiguration,
requestHandler: async () => {
throw new Error('some error');
},
failedRequestHandler: async (crawlingContext) => {
expect(typeof crawlingContext.proxyInfo).toEqual('object');
expect(crawlingContext.proxyInfo.hasOwnProperty('url')).toEqual(true);
},
});

await browserCrawler.run();
});
});
});
