From 62a9f4036dba92d07547af489ac8b6c7974faa6f Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Fri, 24 May 2024 16:37:48 +0200 Subject: [PATCH] fix: Autodetect sitemap filetype from content (#2497) - closes #2461 --- packages/utils/package.json | 1 + packages/utils/src/internals/sitemap.ts | 40 ++++++++++------- packages/utils/test/sitemap.test.ts | 59 ++++++++++++++++++++++++- yarn.lock | 57 +++++++++++++++++++++++- 4 files changed, 140 insertions(+), 17 deletions(-) diff --git a/packages/utils/package.json b/packages/utils/package.json index 0929232456a..abf687ad267 100644 --- a/packages/utils/package.json +++ b/packages/utils/package.json @@ -52,6 +52,7 @@ "@crawlee/types": "3.10.1", "@types/sax": "^1.2.7", "cheerio": "^1.0.0-rc.12", + "file-type": "^19.0.0", "got-scraping": "^4.0.3", "ow": "^0.28.1", "robots-parser": "^3.0.1", diff --git a/packages/utils/src/internals/sitemap.ts b/packages/utils/src/internals/sitemap.ts index 57f7315c366..c3b68db61bf 100644 --- a/packages/utils/src/internals/sitemap.ts +++ b/packages/utils/src/internals/sitemap.ts @@ -1,4 +1,3 @@ -import type { Duplex } from 'node:stream'; import { Readable, Writable } from 'node:stream'; import { StringDecoder } from 'node:string_decoder'; import { createGunzip } from 'node:zlib'; @@ -173,6 +172,7 @@ export class Sitemap { protected static async parse(parsingState: ParsingState, proxyUrl?: string): Promise { const { gotScraping } = await import('got-scraping'); + const { fileTypeStream } = await import('file-type'); while (parsingState.sources.length > 0) { const source = parsingState.sources.pop()!; @@ -192,26 +192,36 @@ export class Sitemap { ); if (sitemapStream.response!.statusCode === 200) { + let contentType = sitemapStream.response!.headers['content-type']; + + const streamWithType = await fileTypeStream(sitemapStream); + if (streamWithType.fileType !== undefined) { + contentType = streamWithType.fileType.mime; + } + await new Promise((resolve, reject) => { - let stream: Duplex = sitemapStream; - if (sitemapUrl.pathname.endsWith('.gz')) { + let stream: Readable = streamWithType; + + if ( + contentType !== undefined + ? contentType === 'application/gzip' + : sitemapUrl.pathname.endsWith('.gz') + ) { stream = stream.pipe(createGunzip()).on('error', reject); - sitemapUrl.pathname = sitemapUrl.pathname.substring(0, sitemapUrl.pathname.length - 3); + + if (sitemapUrl.pathname.endsWith('.gz')) { + sitemapUrl.pathname = sitemapUrl.pathname.substring( + 0, + sitemapUrl.pathname.length - 3, + ); + } } - stream.pipe( - this.createParser( - resolve, - reject, - parsingState, - sitemapStream.response!.headers['content-type'], - sitemapUrl, - ), - ); + stream.pipe(this.createParser(resolve, reject, parsingState, contentType, sitemapUrl)); }); } } catch (e) { - log.warning(`Malformed sitemap content: ${sitemapUrl}`); + log.warning(`Malformed sitemap content: ${sitemapUrl}, ${e}`); } } @@ -248,6 +258,6 @@ export class Sitemap { return new SitemapTxtParser(parsingState, () => resolve(undefined)); } - throw new Error('Unsupported sitemap content type'); + throw new Error(`Unsupported sitemap content type (contentType = ${contentType}, url = ${url?.toString()})`); } } diff --git a/packages/utils/test/sitemap.test.ts b/packages/utils/test/sitemap.test.ts index af403216b1d..8075ca4af8c 100644 --- a/packages/utils/test/sitemap.test.ts +++ b/packages/utils/test/sitemap.test.ts @@ -8,7 +8,7 @@ describe('Sitemap', () => { nock.disableNetConnect(); nock('http://not-exists.com') .persist() - .get((url) => url === '/sitemap_child.xml' || url === '/sitemap_child_2.xml') + .get(/\/sitemap_child(_[0-9]+)?.xml/) .reply( 200, [ @@ -67,6 +67,38 @@ describe('Sitemap', () => { 'base64', ), ) + .get('/non_gzipped_sitemap.xml.gz') + .reply( + 200, + [ + '', + '', + '', + 'http://not-exists.com/catalog?item=80&desc=vacation_turkey', + '2004-11-23', + '', + '', + 'http://not-exists.com/catalog?item=81&desc=vacation_maledives', + '2004-11-23', + '', + '', + ].join('\n'), + ) + .get('/sneakily_gzipped_sitemap.xml') + .reply( + 200, + Buffer.from( + [ + 'H4sIAAAAAAAAA62S306DMBTG73kK0gtvDLSFLSKWcucTzOulKR00QottGZtPbxfQEEWXqElzkvMv', + '3y/fKSlPXRsehbFSqwLgGIFQKK4rqeoCPO0eowyUNCCDaa1woR9WtgCNc30O4TiOsZVOdKy3sTY1', + 'tLzxiYVzEaL4HkzLPraa03lRaReJk7TOxlx3kMBLz08w6zpd0QShbYSwf74z1wLCG6ZqcTDihXZa', + 'uaY9E7ioBaQ3UhvpzhTFGYEfWUDgBHANgzPHWl2XF/gCJzes6x8qYXlxZL7l/dk3bGRSvuMuxEch', + 'nr/w/Eb2Ll2RVWLcvwrWMlWtWLWJcBIl6TdW/R/ZZp3soAdV/Yy2w1mOUI63tz4itCRd3Cz9882y', + 'NfMGy9bJ8CfTZkU4fXUavAGtDs17GwMAAA==', + ].join('\n'), + 'base64', + ), + ) .get('/sitemap_parent.xml') .reply( 200, @@ -270,4 +302,29 @@ describe('Sitemap', () => { ]), ); }); + + it("loads XML sitemap even though it's gzipped according to file extension", async () => { + const sitemap = await Sitemap.load('http://not-exists.com/non_gzipped_sitemap.xml.gz'); + + expect(new Set(sitemap.urls)).toEqual( + new Set([ + 'http://not-exists.com/catalog?item=80&desc=vacation_turkey', + 'http://not-exists.com/catalog?item=81&desc=vacation_maledives', + ]), + ); + }); + + it("loads gzipped sitemap even though it's not gzipped according to file extension", async () => { + const sitemap = await Sitemap.load('http://not-exists.com/sneakily_gzipped_sitemap.xml'); + + expect(new Set(sitemap.urls)).toEqual( + new Set([ + 'http://not-exists.com/', + 'http://not-exists.com/catalog?item=12&desc=vacation_hawaii', + 'http://not-exists.com/catalog?item=73&desc=vacation_new_zealand', + 'http://not-exists.com/catalog?item=74&desc=vacation_newfoundland', + 'http://not-exists.com/catalog?item=83&desc=vacation_usa', + ]), + ); + }); }); diff --git a/yarn.lock b/yarn.lock index 2a5e588e349..b45a312a9e5 100644 --- a/yarn.lock +++ b/yarn.lock @@ -827,6 +827,7 @@ __metadata: "@types/sax": "npm:^1.2.7" "@types/whatwg-mimetype": "npm:^3.0.2" cheerio: "npm:^1.0.0-rc.12" + file-type: "npm:^19.0.0" got-scraping: "npm:^4.0.3" ow: "npm:^0.28.1" robots-parser: "npm:^3.0.1" @@ -1961,6 +1962,13 @@ __metadata: languageName: node linkType: hard +"@tokenizer/token@npm:^0.3.0": + version: 0.3.0 + resolution: "@tokenizer/token@npm:0.3.0" + checksum: 10c0/7ab9a822d4b5ff3f5bca7f7d14d46bdd8432528e028db4a52be7fbf90c7f495cc1af1324691dda2813c6af8dc4b8eb29de3107d4508165f9aa5b53e7d501f155 + languageName: node + linkType: hard + "@tootallnate/once@npm:2": version: 2.0.0 resolution: "@tootallnate/once@npm:2.0.0" @@ -5642,6 +5650,17 @@ __metadata: languageName: node linkType: hard +"file-type@npm:^19.0.0": + version: 19.0.0 + resolution: "file-type@npm:19.0.0" + dependencies: + readable-web-to-node-stream: "npm:^3.0.2" + strtok3: "npm:^7.0.0" + token-types: "npm:^5.0.1" + checksum: 10c0/1884c3627f5922365f86cb19f107850fe7b72d78bef5c2affc92aa098ba414c944e3763101068236345737f44a5b6da13bb0ba79de4c4e3b1b1c68e1958643d9 + languageName: node + linkType: hard + "filelist@npm:^1.0.4": version: 1.0.4 resolution: "filelist@npm:1.0.4" @@ -6737,7 +6756,7 @@ __metadata: languageName: node linkType: hard -"ieee754@npm:^1.1.13": +"ieee754@npm:^1.1.13, ieee754@npm:^1.2.1": version: 1.2.1 resolution: "ieee754@npm:1.2.1" checksum: 10c0/b0782ef5e0935b9f12883a2e2aa37baa75da6e66ce6515c168697b42160807d9330de9a32ec1ed73149aea02e0d822e572bca6f1e22bdcbd2149e13b050b17bb @@ -9826,6 +9845,13 @@ __metadata: languageName: node linkType: hard +"peek-readable@npm:^5.0.0": + version: 5.0.0 + resolution: "peek-readable@npm:5.0.0" + checksum: 10c0/060aece3a907a157b4839aa923b61b664b59cac7296dc8d8e0ddcc39065a4f1e328dd2f171c8a49e869aabc6e076a1be59f939183fb0ababc81f3c870006d672 + languageName: node + linkType: hard + "pend@npm:~1.2.0": version: 1.2.0 resolution: "pend@npm:1.2.0" @@ -10393,6 +10419,15 @@ __metadata: languageName: node linkType: hard +"readable-web-to-node-stream@npm:^3.0.2": + version: 3.0.2 + resolution: "readable-web-to-node-stream@npm:3.0.2" + dependencies: + readable-stream: "npm:^3.6.0" + checksum: 10c0/533d5cd1580232a2c753e52a245be13fc552e6f82c5053a8a8da7ea1063d73a34f936a86b3d4433cdb4a13dd683835cfc87f230936cb96d329a1e28b6040f42e + languageName: node + linkType: hard + "redent@npm:^3.0.0": version: 3.0.0 resolution: "redent@npm:3.0.0" @@ -11481,6 +11516,16 @@ __metadata: languageName: node linkType: hard +"strtok3@npm:^7.0.0": + version: 7.0.0 + resolution: "strtok3@npm:7.0.0" + dependencies: + "@tokenizer/token": "npm:^0.3.0" + peek-readable: "npm:^5.0.0" + checksum: 10c0/63a72b10a302719242bfd31ca53955a06bb091dfec46ef14ca10c4b17ab15780ed8365cd5b270cfbde92d571f677539957add436e4bf9cccdf9977b40d762583 + languageName: node + linkType: hard + "supports-color@npm:^2.0.0": version: 2.0.0 resolution: "supports-color@npm:2.0.0" @@ -11723,6 +11768,16 @@ __metadata: languageName: node linkType: hard +"token-types@npm:^5.0.1": + version: 5.0.1 + resolution: "token-types@npm:5.0.1" + dependencies: + "@tokenizer/token": "npm:^0.3.0" + ieee754: "npm:^1.2.1" + checksum: 10c0/cb671b2b52271362816d22b7a076082b0da033cd7807992b81ae53cfd8541bd013ac29e455c3c7a8bb4f88aa1c5315a12353c3599b7f568df238d3c1723f9d8d + languageName: node + linkType: hard + "tough-cookie@npm:^4.0.0, tough-cookie@npm:^4.1.3": version: 4.1.4 resolution: "tough-cookie@npm:4.1.4"