diff --git a/packages/basic-crawler/src/internals/basic-crawler.ts b/packages/basic-crawler/src/internals/basic-crawler.ts
index de2b582b192..1d26d017499 100644
--- a/packages/basic-crawler/src/internals/basic-crawler.ts
+++ b/packages/basic-crawler/src/internals/basic-crawler.ts
@@ -39,8 +39,10 @@ import {
     purgeDefaultStorages,
     validators,
     RetryRequestError,
+    SessionError,
 } from '@crawlee/core';
 import type { Dictionary, Awaitable, BatchAddRequestsResult, SetStatusMessageOptions } from '@crawlee/types';
+import { ROTATE_PROXY_ERRORS } from '@crawlee/utils';
 import type { Method, OptionsInit } from 'got-scraping';
 import { gotScraping } from 'got-scraping';
 import ow, { ArgumentError } from 'ow';
@@ -217,6 +219,15 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCrawlingContext> {
      * @default 3
      */
     maxRequestRetries?: number;
+
+    /**
+     * Maximum number of session rotations per request.
+     * The crawler will automatically rotate the session in case of a proxy error or if it gets blocked by the website.
+     *
+     * The session rotations are not counted towards the {@apilink BasicCrawlerOptions.maxRequestRetries|`maxRequestRetries`} limit.
+     * @default 10
+     */
+    maxSessionRotations?: number;
 
     /**
      * Maximum number of pages that the crawler will open. The crawl will stop when this limit is reached.
@@ -539,6 +550,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext> {
     protected requestHandlerTimeoutMillis!: number;
     protected internalTimeoutMillis: number;
     protected maxRequestRetries: number;
+    protected maxSessionRotations: number;
     protected handledRequestsCount: number;
     protected statusMessageLoggingInterval: number;
     protected sessionPoolOptions: SessionPoolOptions;
@@ -572,6 +584,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext> {
         sessionPoolOptions: ow.optional.object,
         useSessionPool: ow.optional.boolean,
         maxRequestRetries: ow.optional.number,
+        maxSessionRotations: ow.optional.number,
         maxRequestsPerCrawl: ow.optional.number,
         autoscaledPoolOptions: ow.optional.object,
         statusMessageLoggingInterval: ow.optional.number,
@@ -635,6 +648,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext> {
             requestHandlerTimeoutSecs = 60,
             useSessionPool = true,
             maxRequestRetries = 3,
+            maxSessionRotations = 10,
             maxRequestsPerCrawl,
             autoscaledPoolOptions = {},
             keepAlive,
@@ -701,6 +715,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext> {
         this.statusMessageLoggingInterval = statusMessageLoggingInterval;
         this.events = config.getEventManager();
         this.maxRequestRetries = maxRequestRetries;
+        this.maxSessionRotations = maxSessionRotations;
         this.handledRequestsCount = 0;
         this.stats = new Statistics({ logMessage: `${log.getOptions().prefix} request statistics:`, config });
         this.sessionPoolOptions = {
@@ -1060,6 +1075,16 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext> {
         return (error as any)?.message || error;
     }
 
+    /**
+     * Checks if the given error is a proxy error by comparing its message to a list of known proxy error messages.
+     * Used for retrying requests that failed due to proxy errors.
+     *
+     * @param error The error to check.
+     */
+    protected isProxyError(error: Error): boolean {
+        return ROTATE_PROXY_ERRORS.some((x: string) => (this._getMessageFromError(error) as any)?.includes(x));
+    }
+
     protected isRequestBlocked(_crawlingContext: Context) {
         throw new Error('the "isRequestBlocked" method is not implemented in this crawler.');
     }
@@ -1115,6 +1140,18 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext> {
         }
     }
 
+    protected async _rotateSession(crawlingContext: Context) {
+        const { request } = crawlingContext;
+
+        if ((request.sessionRotationCount ?? 0) >= this.maxSessionRotations) {
+            throw new Error(`Request failed because of proxy-related errors ${request.sessionRotationCount} times. ` +
+                'This might be caused by a misconfigured proxy or an invalid session pool configuration.');
+        }
+        request.sessionRotationCount ??= 0;
+        request.sessionRotationCount++;
+        crawlingContext.session?.retire();
+    }
+
     /**
      * Handles errors thrown by user provided requestHandler()
      */
@@ -1135,7 +1172,11 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext> {
         const shouldRetryRequest = this._canRequestBeRetried(request, error);
 
         if (shouldRetryRequest) {
-            await this._tagUserHandlerError(() => this.errorHandler?.(this._augmentContextWithDeprecatedError(crawlingContext, error), error));
+            if (error instanceof SessionError) {
+                await this._rotateSession(crawlingContext);
+            } else {
+                await this._tagUserHandlerError(() => this.errorHandler?.(this._augmentContextWithDeprecatedError(crawlingContext, error), error));
+            }
 
             if (!request.noRetry) {
                 request.retryCount++;
diff --git a/packages/browser-crawler/src/internals/browser-crawler.ts b/packages/browser-crawler/src/internals/browser-crawler.ts
index 07603e07e22..c0d5be868aa 100644
--- a/packages/browser-crawler/src/internals/browser-crawler.ts
+++ b/packages/browser-crawler/src/internals/browser-crawler.ts
@@ -24,6 +24,7 @@ import {
     RequestState,
     resolveBaseUrlForEnqueueLinksFiltering,
     validators,
+    SessionError,
 } from '@crawlee/basic';
 import type {
     BrowserController,
@@ -510,8 +511,7 @@ export abstract class BrowserCrawler<
 
         if (this.retryOnBlocked) {
             if (await this.isRequestBlocked(crawlingContext)) {
-                session?.retire();
-                throw new Error('Antibot protection detected, the session has been retired.');
+                throw new SessionError();
             }
         }
 
@@ -581,6 +581,8 @@ export abstract class BrowserCrawler<
             await this._handleNavigationTimeout(crawlingContext, error as Error);
 
             crawlingContext.request.state = RequestState.ERROR;
+
+            this._throwIfProxyError(error as Error);
             throw error;
         }
         tryCancel();
@@ -619,6 +621,15 @@ export abstract class BrowserCrawler<
         await crawlingContext.page.close();
     }
 
+    /**
+     * Transforms proxy-related errors to `SessionError`.
+     */
+    protected _throwIfProxyError(error: Error) {
+        if (this.isProxyError(error)) {
+            throw new SessionError(error.message);
+        }
+    }
+
     protected abstract _navigationHandler(crawlingContext: Context, gotoOptions: GoToOptions): Promise<unknown>;
 
     /**
diff --git a/packages/core/src/errors.ts b/packages/core/src/errors.ts
index b4312f187e6..26631745e8e 100644
--- a/packages/core/src/errors.ts
+++ b/packages/core/src/errors.ts
@@ -23,3 +23,14 @@ export class RetryRequestError extends Error {
"Request is being retried at the user's request"); } } + +/** + * Errors of `SessionError` type will trigger a session rotation. + * + * This error doesn't respect the `maxRequestRetries` option and has a separate limit of `maxSessionRotations`. + */ +export class SessionError extends RetryRequestError { + constructor(message?: string) { + super(message ?? 'Detected a session error, rotating session...'); + } +} diff --git a/packages/core/src/request.ts b/packages/core/src/request.ts index 3e2a4f9d4e3..164729ae80d 100644 --- a/packages/core/src/request.ts +++ b/packages/core/src/request.ts @@ -22,6 +22,7 @@ const requestOptionalPredicates = { payload: ow.optional.any(ow.string, ow.buffer), noRetry: ow.optional.boolean, retryCount: ow.optional.number, + sessionRotationCount: ow.optional.number, maxRetries: ow.optional.number, errorMessages: ow.optional.array.ofType(ow.string), headers: ow.optional.object, @@ -107,6 +108,9 @@ export class Request { /** The `true` value indicates that the request will not be automatically retried on error. */ noRetry: boolean; + /** Indicates the number of times the crawling of the request has rotated the session due to a session or a proxy error. */ + sessionRotationCount?: number; + /** Indicates the number of times the crawling of the request has been retried on error. */ retryCount: number; @@ -159,6 +163,7 @@ export class Request { payload, noRetry = false, retryCount = 0, + sessionRotationCount = 0, maxRetries, errorMessages = [], headers = {}, @@ -168,7 +173,14 @@ export class Request { keepUrlFragment = false, useExtendedUniqueKey = false, skipNavigation, - } = options as RequestOptions & { loadedUrl?: string; retryCount?: number; maxRetries?: number; errorMessages?: string[]; handledAt?: string | Date }; + } = options as RequestOptions & { + loadedUrl?: string; + retryCount?: number; + sessionRotationCount?: number; + maxRetries?: number; + errorMessages?: string[]; + handledAt?: string | Date; + }; let { method = 'GET', @@ -186,6 +198,7 @@ export class Request { this.payload = payload; this.noRetry = noRetry; this.retryCount = retryCount; + this.sessionRotationCount = sessionRotationCount; this.errorMessages = [...errorMessages]; this.headers = { ...headers }; this.handledAt = handledAt as unknown instanceof Date ? 
         this.handledAt = handledAt as unknown instanceof Date ? (handledAt as Date).toISOString() : handledAt!;
diff --git a/packages/http-crawler/src/internals/http-crawler.ts b/packages/http-crawler/src/internals/http-crawler.ts
index ab36f4e6943..64c74c4075b 100644
--- a/packages/http-crawler/src/internals/http-crawler.ts
+++ b/packages/http-crawler/src/internals/http-crawler.ts
@@ -25,6 +25,7 @@ import {
     validators,
     Configuration,
     RequestState,
+    SessionError,
 } from '@crawlee/basic';
 import type { Awaitable, Dictionary } from '@crawlee/types';
 import { RETRY_CSS_SELECTORS } from '@crawlee/utils';
@@ -446,11 +447,11 @@ export class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any, HttpCrawler<Context>> = InternalHttpCrawlingContext> extends BasicCrawler<Context> {
         crawlingContext.parseWithCheerio = async () => cheerio.load(parsed.body!.toString());
 
         if (this.useSessionPool) {
-            this._throwOnBlockedRequest(session!, response.statusCode!);
+            this._throwOnBlockedRequest(crawlingContext.session!, response.statusCode!);
         }
 
         if (this.persistCookiesPerSession) {
-            session!.setCookiesFromResponse(response);
+            crawlingContext.session!.setCookiesFromResponse(response);
         }
 
         request.loadedUrl = response.url;
@@ -467,8 +468,7 @@ export class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any, HttpCrawler<Context>> = InternalHttpCrawlingContext> extends BasicCrawler<Context> {
 
         if (this.retryOnBlocked) {
             if (await this.isRequestBlocked(crawlingContext)) {
-                session?.retire();
-                throw new Error('Antibot protection detected, the session has been retired.');
+                throw new SessionError();
             }
         }
 
diff --git a/test/core/crawlers/browser_crawler.test.ts b/test/core/crawlers/browser_crawler.test.ts
--- a/test/core/crawlers/browser_crawler.test.ts
+++ b/test/core/crawlers/browser_crawler.test.ts
@@ -671,7 +671,73 @@ describe('BrowserCrawler', () => {
             delete process.env[ENV_VARS.PROXY_PASSWORD];
         });
 
+        test('proxy rotation on error works as expected', async () => {
+            const goodProxyUrl = 'http://good.proxy';
+            const proxyConfiguration = new ProxyConfiguration({ proxyUrls: ['http://localhost', 'http://localhost:1234', goodProxyUrl] });
+
+            const browserCrawler = new class extends BrowserCrawlerTest {
+                protected override async _navigationHandler(ctx: PuppeteerCrawlingContext): Promise<HTTPResponse | null> {
+                    const { session } = ctx;
+                    const proxyInfo = await this.proxyConfiguration.newProxyInfo(session?.id);
+
+                    if (proxyInfo.url !== goodProxyUrl) {
+                        throw new Error('ERR_PROXY_CONNECTION_FAILED');
+                    }
+
+                    return null;
+                }
+            }({
+                browserPoolOptions: {
+                    browserPlugins: [puppeteerPlugin],
+                },
+                requestList,
+                maxRequestRetries: 0,
+                maxConcurrency: 1,
+                useSessionPool: true,
+                proxyConfiguration,
+                requestHandler: async () => {},
+            });
+
+            await expect(browserCrawler.run()).resolves.not.toThrow();
+        });
+
+        test('proxy rotation on error stops after maxSessionRotations limit', async () => {
+            const proxyConfiguration = new ProxyConfiguration({ proxyUrls: ['http://localhost', 'http://localhost:1234'] });
+
+            /**
+             * The first increment is the base case when the proxy is retrieved for the first time.
+             */
+            let numberOfRotations = -1;
+
+            const browserCrawler = new class extends BrowserCrawlerTest {
+                protected override async _navigationHandler(ctx: PuppeteerCrawlingContext): Promise<HTTPResponse | null> {
+                    const { session } = ctx;
+                    const proxyInfo = await this.proxyConfiguration.newProxyInfo(session?.id);
+
+                    numberOfRotations++;
+
+                    if (proxyInfo.url.includes('localhost')) {
+                        throw new Error('ERR_PROXY_CONNECTION_FAILED');
+                    }
+
+                    return null;
+                }
+            }({
+                browserPoolOptions: {
+                    browserPlugins: [puppeteerPlugin],
+                },
+                requestList,
+                maxSessionRotations: 5,
+                maxConcurrency: 1,
+                proxyConfiguration,
+                requestHandler: async () => {},
+            });
+
+            await expect(browserCrawler.run()).rejects.toThrow();
+            expect(numberOfRotations).toBe(5);
+        });
     });
+
     describe('Crawling context', () => {
         const sources = ['http://example.com/'];
         let requestList: RequestList;
@@ -855,29 +921,5 @@ describe('BrowserCrawler', () => {
 
             await browserCrawler.run();
         });
-
-        test('failedRequestHandler contains proxyInfo', async () => {
-            const proxyConfiguration = new ProxyConfiguration({ proxyUrls: ['http://localhost'] });
-
-            const browserCrawler = new BrowserCrawlerTest({
-                browserPoolOptions: {
-                    browserPlugins: [puppeteerPlugin],
-                },
-                requestList,
-                maxRequestRetries: 0,
-                maxConcurrency: 1,
-                useSessionPool: true,
-                proxyConfiguration,
-                requestHandler: async () => {
-                    throw new Error('some error');
-                },
-                failedRequestHandler: async (crawlingContext) => {
-                    expect(typeof crawlingContext.proxyInfo).toEqual('object');
-                    expect(crawlingContext.proxyInfo.hasOwnProperty('url')).toEqual(true);
-                },
-            });
-
-            await browserCrawler.run();
-        });
     });
 });
diff --git a/test/core/crawlers/cheerio_crawler.test.ts b/test/core/crawlers/cheerio_crawler.test.ts
index 28bdd06e2e1..93e52365fbf 100644
--- a/test/core/crawlers/cheerio_crawler.test.ts
+++ b/test/core/crawlers/cheerio_crawler.test.ts
@@ -694,6 +694,55 @@ describe('CheerioCrawler', () => {
                 expect(proxyInfo).toEqual(await proxyConfiguration.newProxyInfo(session.id));
             }
         });
+
+        test('proxy rotation on error works as expected', async () => {
+            const goodProxyUrl = 'http://good.proxy';
+            const proxyConfiguration = new ProxyConfiguration({ proxyUrls: ['http://localhost', 'http://localhost:1234', goodProxyUrl] });
+
+            const crawler = new class extends CheerioCrawler {
+                protected override async _requestFunction({ proxyUrl }: any): Promise<any> {
+                    if (proxyUrl !== goodProxyUrl) {
+                        throw new Error('Proxy responded with 400 - Bad request');
+                    }
+
+                    return null;
+                }
+            }({
+                maxRequestRetries: 0,
+                maxConcurrency: 1,
+                useSessionPool: true,
+                proxyConfiguration,
+                requestHandler: async () => {},
+            });
+
+            await expect(crawler.run([serverAddress])).resolves.not.toThrow();
+        });
+
+        test('proxy rotation on error stops after maxSessionRotations limit', async () => {
+            const proxyConfiguration = new ProxyConfiguration({ proxyUrls: ['http://localhost', 'http://localhost:1234'] });
+
+            /**
+             * The first increment is the base case when the proxy is retrieved for the first time.
+             */
+            let numberOfRotations = -1;
+            const crawler = new CheerioCrawler({
+                proxyConfiguration,
+                maxSessionRotations: 5,
+                requestHandler: async () => {},
+            });
+
+            jest.spyOn(crawler, '_requestAsBrowser' as any).mockImplementation(async ({ proxyUrl }: any) => {
+                if (proxyUrl.includes('localhost')) {
+                    numberOfRotations++;
+                    throw new Error('Proxy responded with 400 - Bad request');
+                }
+
+                return null;
+            });
+
+            await expect(crawler.run([serverAddress])).rejects.toThrow();
+            expect(numberOfRotations).toBe(5);
+        });
     });
 
     describe('SessionPool', () => {
@@ -998,6 +1047,7 @@ describe('CheerioCrawler', () => {
         const cheerioCrawler = new CheerioCrawler({
             requestList: requestListNew,
             maxRequestRetries: 0,
+            maxSessionRotations: 0,
             requestHandler: () => {},
             failedRequestHandler: () => {},
             useSessionPool: true,
@@ -1012,7 +1062,11 @@ describe('CheerioCrawler', () => {
             return oldHandleRequestF.call(cheerioCrawler, opts);
         };
 
-        await cheerioCrawler.run();
+        try {
+            await cheerioCrawler.run();
+        } catch (e) {
+            // localhost proxy causes proxy errors, session rotations and finally throws, but we don't care
+        }
 
         expect(newUrlSpy).toBeCalledWith(usedSession.id);
     });
@@ -1078,26 +1132,6 @@ describe('CheerioCrawler', () => {
             });
             await cheerioCrawler.run();
         });
-
-        test('failedRequestHandler contains proxyInfo', async () => {
-            const proxyConfiguration = new ProxyConfiguration({ proxyUrls: ['http://localhost:8080'] });
-
-            const cheerioCrawler = new CheerioCrawler({
-                requestList,
-                maxRequestRetries: 0,
-                maxConcurrency: 1,
-                proxyConfiguration,
-                requestHandler: () => {
-                    throw new Error('some error');
-                },
-                failedRequestHandler: (crawlingContext) => {
-                    expect(typeof crawlingContext.proxyInfo).toEqual('object');
-                    expect(crawlingContext.proxyInfo.hasOwnProperty('url')).toEqual(true);
-                },
-                useSessionPool: true,
-            });
-            await cheerioCrawler.run();
-        });
     });
 
     describe('use', () => {
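
Usage sketch (not part of the patch): the following shows how the new `maxSessionRotations` option and the `SessionError` class introduced above are meant to be driven from user code. It assumes a `crawlee` build that includes this patch and re-exports `SessionError` from `@crawlee/core`; the proxy URLs, target URL, and the blocked-page check are placeholder assumptions.

```ts
import { CheerioCrawler, ProxyConfiguration, SessionError } from 'crawlee';

// Placeholder proxies: unreachable URLs produce the proxy-level errors that
// isProxyError() matches against ROTATE_PROXY_ERRORS, triggering a rotation.
const proxyConfiguration = new ProxyConfiguration({
    proxyUrls: ['http://localhost:9999', 'http://localhost:9998'],
});

const crawler = new CheerioCrawler({
    proxyConfiguration,
    useSessionPool: true,
    // New option from this patch: give up after 3 rotations instead of the default 10.
    maxSessionRotations: 3,
    async requestHandler({ $, request, session }) {
        // Throwing SessionError retires the current session and retries the
        // request without consuming a maxRequestRetries attempt.
        if ($('title').text().includes('Access denied')) {
            throw new SessionError();
        }
        console.log(`${request.url} crawled with session ${session?.id}`);
    },
});

await crawler.run(['https://example.com']);
```

Once `request.sessionRotationCount` reaches the limit, `_rotateSession()` fails the request with the "proxy-related errors" error instead of retiring another session, which is exactly what the `maxSessionRotations` tests above assert.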
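A second sketch of the error-classification side, since the split between retries and rotations is the core of the patch: `SessionError` subclasses `RetryRequestError`, so ordinary retry plumbing still recognizes it, but the new `instanceof SessionError` branch in `BasicCrawler` diverts it before the user's `errorHandler` runs. The blocking heuristic below is hypothetical, and it assumes the `sendRequest` helper that crawlee exposes on the crawling context; only the `instanceof` relationships and the handler-bypass behavior come from the diff.

```ts
import { BasicCrawler, RetryRequestError, SessionError } from 'crawlee';

const crawler = new BasicCrawler({
    maxRequestRetries: 2,
    maxSessionRotations: 5,
    async requestHandler({ sendRequest, request }) {
        // sendRequest() performs the HTTP call through the session-bound proxy.
        const response = await sendRequest({ throwHttpErrors: false });

        // Hypothetical heuristic: treat an HTTP 403 as a burned session.
        if (response.statusCode === 403) {
            throw new SessionError(`Blocked on ${request.url}`);
        }
    },
    errorHandler(_context, error) {
        // With this patch, SessionError never reaches this callback:
        // _requestFunctionErrorHandler() rotates the session instead.
        if (error instanceof RetryRequestError) {
            console.log(`Retrying at user's request: ${error.message}`);
        }
    },
});

await crawler.run(['https://example.com']);
```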