Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: exceeding maxSessionRotations calls failedRequestHandler #2029

Merged
merged 2 commits on Aug 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
17 changes: 8 additions & 9 deletions packages/basic-crawler/src/internals/basic-crawler.ts
Expand Up @@ -1188,10 +1188,6 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
private async _rotateSession(crawlingContext: Context) {
const { request } = crawlingContext;

if ((request.sessionRotationCount ?? 0) >= this.maxSessionRotations) {
throw new Error(`Request failed because of proxy-related errors ${request.sessionRotationCount} times. `
+ 'This might be caused by a misconfigured proxy or an invalid session pool configuration.');
}
request.sessionRotationCount ??= 0;
request.sessionRotationCount++;
crawlingContext.session?.retire();
Expand Down Expand Up @@ -1302,16 +1298,19 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
}

protected _canRequestBeRetried(request: Request, error: Error) {
// Request should never be retried, or the error encountered makes it not able to be retried, or the session rotation limit has been reached
if (request.noRetry
|| (error instanceof NonRetryableError)
|| (error instanceof SessionError && (this.maxSessionRotations <= (request.sessionRotationCount ?? 0)))
) {
return false;
}

// User requested retry (we ignore retry count here as its explicitly told by the user to retry)
if (error instanceof RetryRequestError) {
return true;
}

// Request should never be retried, or the error encountered makes it not able to be retried
if (request.noRetry || (error instanceof NonRetryableError)) {
return false;
}

// Ensure there are more retries available for the request
const maxRequestRetries = request.maxRetries ?? this.maxRequestRetries;
return request.retryCount < maxRequestRetries;
Expand Down
17 changes: 11 additions & 6 deletions test/core/crawlers/browser_crawler.test.ts
Expand Up @@ -790,6 +790,7 @@ describe('BrowserCrawler', () => {
test('proxy rotation on error works as expected', async () => {
const goodProxyUrl = 'http://good.proxy';
const proxyConfiguration = new ProxyConfiguration({ proxyUrls: ['http://localhost', 'http://localhost:1234', goodProxyUrl] });
const requestHandler = jest.fn();

const browserCrawler = new class extends BrowserCrawlerTest {
protected override async _navigationHandler(ctx: PuppeteerCrawlingContext): Promise<HTTPResponse | null | undefined> {
Expand All @@ -811,19 +812,21 @@ describe('BrowserCrawler', () => {
maxConcurrency: 1,
useSessionPool: true,
proxyConfiguration,
requestHandler: async () => {},
requestHandler,
});

await expect(browserCrawler.run()).resolves.not.toThrow();
expect(requestHandler).toHaveBeenCalledTimes(requestList.length());
});

test('proxy rotation on error stops after maxSessionRotations limit', async () => {
test('proxy rotation on error respects maxSessionRotations, calls failedRequestHandler', async () => {
const proxyConfiguration = new ProxyConfiguration({ proxyUrls: ['http://localhost', 'http://localhost:1234'] });
const failedRequestHandler = jest.fn();

/**
* The first increment is the base case when the proxy is retrieved for the first time.
*/
let numberOfRotations = -1;
let numberOfRotations = -requestList.length();
const browserCrawler = new class extends BrowserCrawlerTest {
protected override async _navigationHandler(ctx: PuppeteerCrawlingContext): Promise<HTTPResponse | null | undefined> {
const { session } = ctx;
Expand All @@ -846,10 +849,12 @@ describe('BrowserCrawler', () => {
maxConcurrency: 1,
proxyConfiguration,
requestHandler: async () => {},
failedRequestHandler,
});

await expect(browserCrawler.run()).rejects.toThrow();
expect(numberOfRotations).toBe(5);
await browserCrawler.run();
expect(failedRequestHandler).toBeCalledTimes(requestList.length());
expect(numberOfRotations).toBe(requestList.length() * 5);
});

test('proxy rotation logs the original proxy error', async () => {
Expand Down Expand Up @@ -881,7 +886,7 @@ describe('BrowserCrawler', () => {

const spy = jest.spyOn((crawler as any).log, 'warning' as any).mockImplementation(() => {});

await expect(crawler.run([serverAddress])).rejects.toThrow();
await crawler.run([serverAddress]);

expect(spy).toBeCalled();
expect(spy.mock.calls[0][0]).toEqual(expect.stringContaining(proxyError));
Expand Down
25 changes: 16 additions & 9 deletions test/core/crawlers/cheerio_crawler.test.ts
Expand Up @@ -698,37 +698,43 @@ describe('CheerioCrawler', () => {
test('proxy rotation on error works as expected', async () => {
const goodProxyUrl = 'http://good.proxy';
const proxyConfiguration = new ProxyConfiguration({ proxyUrls: ['http://localhost', 'http://localhost:1234', goodProxyUrl] });
const check = jest.fn();

const crawler = new class extends CheerioCrawler {
protected override async _requestFunction({ proxyUrl }: any): Promise<any> {
if (proxyUrl !== goodProxyUrl) {
throw new Error('Proxy responded with 400 - Bad request');
protected override async _requestFunction(...args: any[]): Promise<any> {
check(...args);

if (args[0].proxyUrl === goodProxyUrl) {
return null;
}

return null;
throw new Error('Proxy responded with 400 - Bad request');
}
}({
maxRequestRetries: 0,
maxSessionRotations: 2,
maxConcurrency: 1,
useSessionPool: true,
proxyConfiguration,
requestHandler: async () => {},
requestHandler: () => {},
});

await expect(crawler.run([serverAddress])).resolves.not.toThrow();
expect(check).toBeCalledWith(expect.objectContaining({ proxyUrl: goodProxyUrl }));
});

test('proxy rotation on error stops after maxSessionRotations limit', async () => {
test('proxy rotation on error respects maxSessionRotations, calls failedRequestHandler', async () => {
const proxyConfiguration = new ProxyConfiguration({ proxyUrls: ['http://localhost', 'http://localhost:1234'] });

/**
* The first increment is the base case when the proxy is retrieved for the first time.
*/
let numberOfRotations = -1;
const failedRequestHandler = jest.fn();
const crawler = new CheerioCrawler({
proxyConfiguration,
maxSessionRotations: 5,
requestHandler: async () => {},
failedRequestHandler,
});

jest.spyOn(crawler, '_requestAsBrowser' as any).mockImplementation(async ({ proxyUrl }: any) => {
Expand All @@ -740,7 +746,8 @@ describe('CheerioCrawler', () => {
return null;
});

await expect(crawler.run([serverAddress])).rejects.toThrow();
await crawler.run([serverAddress]);
expect(failedRequestHandler).toBeCalledTimes(1);
expect(numberOfRotations).toBe(5);
});

Expand All @@ -765,7 +772,7 @@ describe('CheerioCrawler', () => {

const spy = jest.spyOn((crawler as any).log, 'warning' as any).mockImplementation(() => {});

await expect(crawler.run([serverAddress])).rejects.toThrow();
await crawler.run([serverAddress]);

expect(spy).toBeCalled();
expect(spy.mock.calls[0][0]).toEqual(expect.stringContaining(proxyError));
Expand Down