Skip to content

Commit

Permalink
feat: Add options for custom HTTP error status codes (#2035)
Browse files Browse the repository at this point in the history
This commit introduces two new optional properties to `CheerioCrawler`
and `HttpCrawler`, allowing for finer control over how HTTP error status
codes are handled:

1. `ignoreHttpErrorStatusCodes`: An array of HTTP response status codes
that should be excluded from being considered as errors. By default,
error consideration is triggered for status codes >= 500.

2. `additionalHttpErrorStatusCodes`: An array of extra HTTP response
status codes that should be treated as errors. By default, error
consideration is triggered for status codes >= 500.

These options provide flexibility in specifying which HTTP response
status codes should be treated as errors and which should be ignored
during the crawling process.

Closes #1711
  • Loading branch information
HamzaAlwan committed Aug 18, 2023
1 parent 38a6547 commit b50ef1a
Show file tree
Hide file tree
Showing 3 changed files with 120 additions and 1 deletion.
30 changes: 29 additions & 1 deletion packages/http-crawler/src/internals/http-crawler.ts
Expand Up @@ -154,6 +154,18 @@ export interface HttpCrawlerOptions<Context extends InternalHttpCrawlingContext
* It passes the "Cookie" header to the request with the session cookies.
*/
persistCookiesPerSession?: boolean;

/**
 * An array of HTTP response [Status Codes](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status) to be excluded from error consideration.
 * By default, status codes >= 500 trigger errors; a code listed here is never treated as an error, even when it is >= 500.
 */
ignoreHttpErrorStatusCodes?: number[];

/**
 * An array of additional HTTP response [Status Codes](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status) to be treated as errors.
 * By default, only status codes >= 500 trigger errors; a code listed here triggers an error even when it is below 500.
 */
additionalHttpErrorStatusCodes?: number[];
}

/**
Expand Down Expand Up @@ -289,6 +301,8 @@ export class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any, H
protected ignoreSslErrors: boolean;
protected suggestResponseEncoding?: string;
protected forceResponseEncoding?: string;
protected additionalHttpErrorStatusCodes: Set<number>;
protected ignoreHttpErrorStatusCodes: Set<number>;
protected readonly supportedMimeTypes: Set<string>;

protected static override optionsShape = {
Expand All @@ -303,6 +317,9 @@ export class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any, H
proxyConfiguration: ow.optional.object.validate(validators.proxyConfiguration),
persistCookiesPerSession: ow.optional.boolean,

additionalHttpErrorStatusCodes: ow.optional.array.ofType(ow.number),
ignoreHttpErrorStatusCodes: ow.optional.array.ofType(ow.number),

preNavigationHooks: ow.optional.array,
postNavigationHooks: ow.optional.array,
};
Expand All @@ -327,6 +344,8 @@ export class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any, H
persistCookiesPerSession,
preNavigationHooks = [],
postNavigationHooks = [],
additionalHttpErrorStatusCodes = [],
ignoreHttpErrorStatusCodes = [],

// Ignored
handleRequestFunction,
Expand Down Expand Up @@ -375,6 +394,8 @@ export class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any, H
this.ignoreSslErrors = ignoreSslErrors;
this.suggestResponseEncoding = suggestResponseEncoding;
this.forceResponseEncoding = forceResponseEncoding;
this.additionalHttpErrorStatusCodes = new Set([...additionalHttpErrorStatusCodes]);
this.ignoreHttpErrorStatusCodes = new Set([...ignoreHttpErrorStatusCodes]);
this.proxyConfiguration = proxyConfiguration;
this.preNavigationHooks = preNavigationHooks;
this.postNavigationHooks = [
Expand Down Expand Up @@ -617,7 +638,10 @@ export class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any, H
this.stats.registerStatusCode(statusCode!);
}

if (statusCode! >= 500) {
const excludeError = this.ignoreHttpErrorStatusCodes.has(statusCode!);
const includeError = this.additionalHttpErrorStatusCodes.has(statusCode!);

if ((statusCode! >= 500 && !excludeError) || includeError) {
const body = await readStreamToString(response, encoding);

// Errors are often sent as JSON, so attempt to parse them,
Expand All @@ -629,6 +653,10 @@ export class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any, H
throw new Error(`${statusCode} - ${message}`);
}

if (includeError) {
throw new Error(`${statusCode} - Error status code was set by user.`);
}

// It's not a JSON, so it's probably some text. Get the first 100 chars of it.
throw new Error(`${statusCode} - Internal Server Error: ${body.slice(0, 100)}`);
} else if (HTML_AND_XML_MIME_TYPES.includes(type)) {
Expand Down
48 changes: 48 additions & 0 deletions test/core/crawlers/cheerio_crawler.test.ts
Expand Up @@ -507,6 +507,54 @@ describe('CheerioCrawler', () => {
});
});

test('should ignore non http error status codes set by user', async () => {
const requestList = await getRequestListForMock({
headers: {
'content-type': 'text/plain',
},
statusCode: 500,
});

const failed: Request[] = [];

const cheerioCrawler = new CheerioCrawler({
requestList,
minConcurrency: 2,
maxConcurrency: 2,
ignoreHttpErrorStatusCodes: [500],
requestHandler: () => {},
failedRequestHandler: ({ request }) => {
failed.push(request);
},
});

await cheerioCrawler.run();

expect(cheerioCrawler.autoscaledPool.minConcurrency).toBe(2);
expect(failed).toHaveLength(0);
});

// Fix: test description typo — "throw and error" -> "throw an error".
test('should throw an error on http error status codes set by user', async () => {
    const requestList = await getRequestListForMirror();
    // Collects every request that ends up in the failed handler.
    const failed: Request[] = [];

    const cheerioCrawler = new CheerioCrawler({
        requestList,
        minConcurrency: 2,
        maxConcurrency: 2,
        // 200 is normally a success; listing it here makes every response fail.
        additionalHttpErrorStatusCodes: [200],
        requestHandler: () => {},
        failedRequestHandler: ({ request }) => {
            failed.push(request);
        },
    });

    await cheerioCrawler.run();

    expect(cheerioCrawler.autoscaledPool.minConcurrency).toBe(2);
    // All 4 mirror requests respond 200, so all of them must fail.
    expect(failed).toHaveLength(4);
});

test('should work with all defaults content types', async () => {
let handledRequests = 0;
const contentTypes = ['text/html', 'application/xhtml+xml', 'text/xml', 'application/xml', 'application/json'];
Expand Down
43 changes: 43 additions & 0 deletions test/core/crawlers/http_crawler.test.ts
Expand Up @@ -47,6 +47,11 @@ router.set('/echo', (req, res) => {
req.pipe(res);
});

// Test route that replies 500 with an empty body, used to exercise server-error handling.
router.set('/500Error', (_req, res) => {
    res.writeHead(500).end();
});

let server: http.Server;
let url: string;

Expand Down Expand Up @@ -273,3 +278,41 @@ test('POST with undefined (empty) payload', async () => {

expect(results).toStrictEqual(['']);
});

test('should ignore non http error status codes set by user', async () => {
    // Collects every request that ends up in the failed handler; expected to stay empty.
    const failedRequests: any[] = [];

    const crawler = new HttpCrawler({
        minConcurrency: 2,
        maxConcurrency: 2,
        // 500 is excluded from error consideration, so the request should not fail.
        ignoreHttpErrorStatusCodes: [500],
        requestHandler: () => {},
        failedRequestHandler: ({ request }) => void failedRequests.push(request),
    });

    await crawler.run([`${url}/500Error`]);

    expect(crawler.autoscaledPool.minConcurrency).toBe(2);
    expect(failedRequests).toHaveLength(0);
});

// Fix: test description typo — "throw and error" -> "throw an error".
test('should throw an error on http error status codes set by user', async () => {
    // Collects every request that ends up in the failed handler.
    const failed: any[] = [];

    const crawler = new HttpCrawler({
        minConcurrency: 2,
        maxConcurrency: 2,
        // 200 is normally a success; listing it here makes the response fail.
        additionalHttpErrorStatusCodes: [200],
        requestHandler: () => {},
        failedRequestHandler: ({ request }) => {
            failed.push(request);
        },
    });

    await crawler.run([`${url}/hello.html`]);

    expect(crawler.autoscaledPool.minConcurrency).toBe(2);
    // The single 200 response must be treated as an error.
    expect(failed).toHaveLength(1);
});

0 comments on commit b50ef1a

Please sign in to comment.