From b50ef1ad51d6d7c7a71e7f40efdb2b1ef0f09291 Mon Sep 17 00:00:00 2001 From: Hamza Alwan Date: Fri, 18 Aug 2023 14:13:16 +0300 Subject: [PATCH] feat: Add options for custom HTTP error status codes (#2035) This commit introduces two new optional properties to `CheerioCrawler` and `HttpCrawler`, allowing for finer control over how HTTP error status codes are handled: 1. `ignoreHttpErrorStatusCodes`: An array of HTTP response status codes that should not be treated as errors. By default, status codes >= 500 trigger errors. 2. `additionalHttpErrorStatusCodes`: An array of extra HTTP response status codes that should be treated as errors. By default, status codes >= 500 trigger errors. These options provide flexibility in specifying which HTTP response codes should be treated as errors and which should be ignored during the crawling process. Closes #1711 --- .../src/internals/http-crawler.ts | 30 +++++++++++- test/core/crawlers/cheerio_crawler.test.ts | 48 +++++++++++++++++++ test/core/crawlers/http_crawler.test.ts | 43 +++++++++++++++++ 3 files changed, 120 insertions(+), 1 deletion(-) diff --git a/packages/http-crawler/src/internals/http-crawler.ts b/packages/http-crawler/src/internals/http-crawler.ts index 1de29fca87a..9f37a035d30 100644 --- a/packages/http-crawler/src/internals/http-crawler.ts +++ b/packages/http-crawler/src/internals/http-crawler.ts @@ -154,6 +154,18 @@ export interface HttpCrawlerOptions= 500 trigger errors. + */ + ignoreHttpErrorStatusCodes?: number[]; + + /** + * An array of additional HTTP response [Status Codes](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status) to be treated as errors. + * By default, status codes >= 500 trigger errors. 
+ */ + additionalHttpErrorStatusCodes?: number[]; } /** @@ -289,6 +301,8 @@ export class HttpCrawler; + protected ignoreHttpErrorStatusCodes: Set; protected readonly supportedMimeTypes: Set; protected static override optionsShape = { @@ -303,6 +317,9 @@ export class HttpCrawler= 500) { + const excludeError = this.ignoreHttpErrorStatusCodes.has(statusCode!); + const includeError = this.additionalHttpErrorStatusCodes.has(statusCode!); + + if ((statusCode! >= 500 && !excludeError) || includeError) { const body = await readStreamToString(response, encoding); // Errors are often sent as JSON, so attempt to parse them, @@ -629,6 +653,10 @@ export class HttpCrawler { }); }); + test('should ignore non http error status codes set by user', async () => { + const requestList = await getRequestListForMock({ + headers: { + 'content-type': 'text/plain', + }, + statusCode: 500, + }); + + const failed: Request[] = []; + + const cheerioCrawler = new CheerioCrawler({ + requestList, + minConcurrency: 2, + maxConcurrency: 2, + ignoreHttpErrorStatusCodes: [500], + requestHandler: () => {}, + failedRequestHandler: ({ request }) => { + failed.push(request); + }, + }); + + await cheerioCrawler.run(); + + expect(cheerioCrawler.autoscaledPool.minConcurrency).toBe(2); + expect(failed).toHaveLength(0); + }); + + test('should throw and error on http error status codes set by user', async () => { + const requestList = await getRequestListForMirror(); + const failed: Request[] = []; + + const cheerioCrawler = new CheerioCrawler({ + requestList, + minConcurrency: 2, + maxConcurrency: 2, + additionalHttpErrorStatusCodes: [200], + requestHandler: () => {}, + failedRequestHandler: ({ request }) => { + failed.push(request); + }, + }); + + await cheerioCrawler.run(); + + expect(cheerioCrawler.autoscaledPool.minConcurrency).toBe(2); + expect(failed).toHaveLength(4); + }); + test('should work with all defaults content types', async () => { let handledRequests = 0; const contentTypes = ['text/html', 
'application/xhtml+xml', 'text/xml', 'application/xml', 'application/json']; diff --git a/test/core/crawlers/http_crawler.test.ts b/test/core/crawlers/http_crawler.test.ts index 586b69d384e..b6e2671d1c9 100644 --- a/test/core/crawlers/http_crawler.test.ts +++ b/test/core/crawlers/http_crawler.test.ts @@ -47,6 +47,11 @@ router.set('/echo', (req, res) => { req.pipe(res); }); +router.set('/500Error', (req, res) => { + res.statusCode = 500; + res.end(); +}); + let server: http.Server; let url: string; @@ -273,3 +278,41 @@ test('POST with undefined (empty) payload', async () => { expect(results).toStrictEqual(['']); }); + +test('should ignore non http error status codes set by user', async () => { + const failed: any[] = []; + + const crawler = new HttpCrawler({ + minConcurrency: 2, + maxConcurrency: 2, + ignoreHttpErrorStatusCodes: [500], + requestHandler: () => {}, + failedRequestHandler: ({ request }) => { + failed.push(request); + }, + }); + + await crawler.run([`${url}/500Error`]); + + expect(crawler.autoscaledPool.minConcurrency).toBe(2); + expect(failed).toHaveLength(0); +}); + +test('should throw and error on http error status codes set by user', async () => { + const failed: any[] = []; + + const crawler = new HttpCrawler({ + minConcurrency: 2, + maxConcurrency: 2, + additionalHttpErrorStatusCodes: [200], + requestHandler: () => {}, + failedRequestHandler: ({ request }) => { + failed.push(request); + }, + }); + + await crawler.run([`${url}/hello.html`]); + + expect(crawler.autoscaledPool.minConcurrency).toBe(2); + expect(failed).toHaveLength(1); +});