Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add RetryRequestError + add error to the context for BC #1443

Merged
merged 10 commits into from
Aug 11, 2022
66 changes: 57 additions & 9 deletions packages/basic-crawler/src/internals/basic-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import {
Statistics,
purgeDefaultStorages,
validators,
RetryRequestError,
} from '@crawlee/core';
import type { GotOptionsInit, OptionsOfTextResponseBody, Response as GotResponse } from 'got-scraping';
import { gotScraping } from 'got-scraping';
Expand Down Expand Up @@ -952,16 +953,18 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
throw error;
}

const shouldRetryRequest = !request.noRetry && request.retryCount < this.maxRequestRetries && !(error instanceof NonRetryableError);
const shouldRetryRequest = this._canRequestBeRetried(request, error);

if (shouldRetryRequest) {
request.retryCount++;

await this._tagUserHandlerError(() => this.errorHandler?.(crawlingContext, error));
await this._tagUserHandlerError(() => this.errorHandler?.(this._augmentContextWithDeprecatedError(crawlingContext, error), error));

const { url, retryCount, id } = request;

// We don't want to see the stack trace in the logs by default, when we are going to retry the request.
// Thus, we print the full stack trace only when CRAWLEE_VERBOSE_LOG environment variable is set to true.
const message = process.env.CRAWLEE_VERBOSE_LOG ? error.stack : error;
const message = this._getMessageFromError(error);
this.log.warning(
`Reclaiming failed request back to the list or queue. ${message}`,
{ id, url, retryCount },
Expand Down Expand Up @@ -990,17 +993,62 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
}

protected async _handleFailedRequestHandler(crawlingContext: Context, error: Error): Promise<void> {
if (this.failedRequestHandler) {
await this._tagUserHandlerError(() => this.failedRequestHandler?.(crawlingContext, error));
return;
}

// Always log the last error regardless if the user provided a failedRequestHandler
const { id, url, method, uniqueKey } = crawlingContext.request;
const message = error instanceof TimeoutError && !process.env.CRAWLEE_VERBOSE_LOG ? error.message : error.stack;
const message = this._getMessageFromError(error, true);

this.log.error(
`Request failed and reached maximum retries. ${message}`,
{ id, url, method, uniqueKey },
);

if (this.failedRequestHandler) {
await this._tagUserHandlerError(() => this.failedRequestHandler?.(this._augmentContextWithDeprecatedError(crawlingContext, error), error));
}
}

/**
* Resolves the most verbose error message from a thrown error
* @param error The error received
* @returns The message to be logged
*/
protected _getMessageFromError(error: Error, forceStack = false) {
// For timeout errors we want to show the stack just in case the env variable is set
if (error instanceof TimeoutError) {
return process.env.CRAWLEE_VERBOSE_LOG ? error.stack : error.message || error;
}

return (process.env.CRAWLEE_VERBOSE_LOG || forceStack)
? error.stack ?? (error.message || error)
: error.message || error;
}

protected _canRequestBeRetried(request: Request, error: Error) {
// User requested retry (we ignore retry count here as its explicitly told by the user to retry)
if (error instanceof RetryRequestError) {
return true;
}

// Request should never be retried, or the error encountered makes it not able to be retried
if (request.noRetry || (error instanceof NonRetryableError)) {
return false;
}

// Ensure there are more retries available for the request
return request.retryCount < this.maxRequestRetries;
}

protected _augmentContextWithDeprecatedError(context: Context, error: Error) {
Object.defineProperty(context, 'error', {
get: () => {
// eslint-disable-next-line max-len
this.log.deprecated("The 'error' property of the crawling context is deprecated, and it is now passed as the second parameter in 'errorHandler' and 'failedRequestHandler'. Please update your code, as this property will be removed in a future version.");

return error;
},
});

return context;
}

/**
Expand Down
9 changes: 9 additions & 0 deletions packages/core/src/errors.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,12 @@ export class CriticalError extends NonRetryableError {}
* @ignore
*/
export class MissingRouteError extends CriticalError {}

/**
* Indicates that the request should be retried (while still respecting the maximum number of retries).
*/
export class RetryRequestError extends Error {
constructor(message?: string) {
super(message ?? "Request is being retried at the user's request");
}
}
2 changes: 1 addition & 1 deletion test/core/crawlers/browser_crawler.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -722,7 +722,7 @@ describe('BrowserCrawler', () => {
expect((crawlingContext.crawler as BrowserCrawler).browserPool).toBeInstanceOf(BrowserPool);
expect(crawlingContext.hasOwnProperty('response')).toBe(true);

expect(crawlingContext.error).toBeUndefined();
expect(crawlingContext.error).toBeInstanceOf(Error);
expect(error).toBeInstanceOf(Error);
expect(error.message).toEqual('some error');
};
Expand Down
2 changes: 1 addition & 1 deletion test/core/crawlers/cheerio_crawler.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1203,7 +1203,7 @@ describe('CheerioCrawler', () => {
expect(typeof crawlingContext.response).toBe('object');
expect(typeof crawlingContext.contentType).toBe('object');

expect(crawlingContext.error).toBeUndefined();
expect(crawlingContext.error).toBeInstanceOf(Error);
expect(error).toBeInstanceOf(Error);
expect(error.message).toEqual('some error');
};
Expand Down
12 changes: 6 additions & 6 deletions test/core/crawlers/puppeteer_crawler.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -246,15 +246,15 @@ describe('PuppeteerCrawler', () => {
const warnings = logWarningSpy.mock.calls.map((call) => [call[0], call[1].retryCount]);
expect(warnings).toEqual([
[
'Reclaiming failed request back to the list or queue. Error: Navigation timed out after 0.005 seconds.',
'Reclaiming failed request back to the list or queue. Navigation timed out after 0.005 seconds.',
1,
],
[
'Reclaiming failed request back to the list or queue. Error: Navigation timed out after 0.005 seconds.',
'Reclaiming failed request back to the list or queue. Navigation timed out after 0.005 seconds.',
2,
],
[
'Reclaiming failed request back to the list or queue. Error: Navigation timed out after 0.005 seconds.',
'Reclaiming failed request back to the list or queue. Navigation timed out after 0.005 seconds.',
3,
],
]);
Expand Down Expand Up @@ -305,15 +305,15 @@ describe('PuppeteerCrawler', () => {
const warnings = logWarningSpy.mock.calls.map((call) => [call[0], call[1].retryCount]);
expect(warnings).toEqual([
[
'Reclaiming failed request back to the list or queue. Error: Navigation timed out after 0.005 seconds.',
'Reclaiming failed request back to the list or queue. Navigation timed out after 0.005 seconds.',
1,
],
[
'Reclaiming failed request back to the list or queue. Error: Navigation timed out after 0.005 seconds.',
'Reclaiming failed request back to the list or queue. Navigation timed out after 0.005 seconds.',
2,
],
[
'Reclaiming failed request back to the list or queue. Error: Navigation timed out after 0.005 seconds.',
'Reclaiming failed request back to the list or queue. Navigation timed out after 0.005 seconds.',
3,
],
]);
Expand Down