From 7ea5c4185b169ec933dcd8df2e85824a7e452913 Mon Sep 17 00:00:00 2001 From: Hamza Alwan Date: Mon, 21 Aug 2023 13:20:36 +0300 Subject: [PATCH] fix: support `DELETE` requests in `HttpCrawler` (#2039) This commit addresses an issue in the @crawlee/http (HttpCrawler) and @crawlee/cheerio (CheerioCrawler) packages related to setting the DELETE method for requests. The problem caused requests to fail with a timeout, rendering the functionality unusable. Closes #1658 --- .../src/internals/http-crawler.ts | 5 +++ test/core/crawlers/cheerio_crawler.test.ts | 32 +++++++++++++++++-- test/core/crawlers/http_crawler.test.ts | 26 +++++++++++++-- 3 files changed, 59 insertions(+), 4 deletions(-) diff --git a/packages/http-crawler/src/internals/http-crawler.ts b/packages/http-crawler/src/internals/http-crawler.ts index 9f37a035d30..2451506f0ea 100644 --- a/packages/http-crawler/src/internals/http-crawler.ts +++ b/packages/http-crawler/src/internals/http-crawler.ts @@ -814,6 +814,11 @@ export class HttpCrawler { resolve(addResponsePropertiesToStream(stream)); diff --git a/test/core/crawlers/cheerio_crawler.test.ts b/test/core/crawlers/cheerio_crawler.test.ts index 0a83bc553de..fde2845166b 100644 --- a/test/core/crawlers/cheerio_crawler.test.ts +++ b/test/core/crawlers/cheerio_crawler.test.ts @@ -507,7 +507,7 @@ describe('CheerioCrawler', () => { }); }); - test('should ignore non http error status codes set by user', async () => { + test('should ignore http error status codes set by user', async () => { const requestList = await getRequestListForMock({ headers: { 'content-type': 'text/plain', @@ -534,7 +534,7 @@ describe('CheerioCrawler', () => { expect(failed).toHaveLength(0); }); - test('should throw and error on http error status codes set by user', async () => { + test('should throw an error on http error status codes set by user', async () => { const requestList = await getRequestListForMirror(); const failed: Request[] = []; @@ -1290,6 +1290,34 @@ describe('CheerioCrawler', () => { expect(cheerioCrawler.requestHandler).toBeUndefined(); }); }); + + test('should work with delete requests', async () => { + const sources: Source[] = [1, 2, 3, 4].map((num) => { + return { + url: `${serverAddress}/special/mock?a=${num}`, + method: 'DELETE', + }; + }); + const requestList = await RequestList.open(null, sources); + + const failed: Request[] = []; + + const cheerioCrawler = new CheerioCrawler({ + requestList, + maxConcurrency: 1, + maxRequestRetries: 0, + navigationTimeoutSecs: 5, + requestHandlerTimeoutSecs: 5, + requestHandler: async () => {}, + failedRequestHandler: async ({ request }) => { + failed.push(request); + }, + }); + + await cheerioCrawler.run(); + + expect(failed).toHaveLength(0); + }); }); async function getRequestListForMock(mockData: Dictionary, pathName = 'special/mock') { diff --git a/test/core/crawlers/http_crawler.test.ts b/test/core/crawlers/http_crawler.test.ts index b6e2671d1c9..c95f63e8a1a 100644 --- a/test/core/crawlers/http_crawler.test.ts +++ b/test/core/crawlers/http_crawler.test.ts @@ -279,7 +279,7 @@ test('POST with undefined (empty) payload', async () => { expect(results).toStrictEqual(['']); }); -test('should ignore non http error status codes set by user', async () => { +test('should ignore http error status codes set by user', async () => { const failed: any[] = []; const crawler = new HttpCrawler({ @@ -298,7 +298,7 @@ test('should ignore non http error status codes set by user', async () => { expect(failed).toHaveLength(0); }); -test('should throw and error on http error status codes set by user', async () => { +test('should throw an error on http error status codes set by user', async () => { const failed: any[] = []; const crawler = new HttpCrawler({ @@ -316,3 +316,25 @@ test('should throw and error on http error status codes set by user', async () = expect(crawler.autoscaledPool.minConcurrency).toBe(2); expect(failed).toHaveLength(1); }); + +test('should work with delete requests', async () => { + const failed: any[] = []; + + const cheerioCrawler = new HttpCrawler({ + maxConcurrency: 1, + maxRequestRetries: 0, + navigationTimeoutSecs: 5, + requestHandlerTimeoutSecs: 5, + requestHandler: async () => {}, + failedRequestHandler: async ({ request }) => { + failed.push(request); + }, + }); + + await cheerioCrawler.run([{ + url: `${url}`, + method: 'DELETE', + }]); + + expect(failed).toHaveLength(0); +});