Skip to content

Commit

Permalink
fix: Autodetect sitemap filetype from content (#2497)
Browse files Browse the repository at this point in the history
- closes #2461
  • Loading branch information
janbuchar committed May 24, 2024
1 parent 4a72a72 commit 62a9f40
Show file tree
Hide file tree
Showing 4 changed files with 140 additions and 17 deletions.
1 change: 1 addition & 0 deletions packages/utils/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
"@crawlee/types": "3.10.1",
"@types/sax": "^1.2.7",
"cheerio": "^1.0.0-rc.12",
"file-type": "^19.0.0",
"got-scraping": "^4.0.3",
"ow": "^0.28.1",
"robots-parser": "^3.0.1",
Expand Down
40 changes: 25 additions & 15 deletions packages/utils/src/internals/sitemap.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import type { Duplex } from 'node:stream';
import { Readable, Writable } from 'node:stream';
import { StringDecoder } from 'node:string_decoder';
import { createGunzip } from 'node:zlib';
Expand Down Expand Up @@ -173,6 +172,7 @@ export class Sitemap {

protected static async parse(parsingState: ParsingState, proxyUrl?: string): Promise<Sitemap> {
const { gotScraping } = await import('got-scraping');
const { fileTypeStream } = await import('file-type');

while (parsingState.sources.length > 0) {
const source = parsingState.sources.pop()!;
Expand All @@ -192,26 +192,36 @@ export class Sitemap {
);

if (sitemapStream.response!.statusCode === 200) {
let contentType = sitemapStream.response!.headers['content-type'];

const streamWithType = await fileTypeStream(sitemapStream);
if (streamWithType.fileType !== undefined) {
contentType = streamWithType.fileType.mime;
}

await new Promise((resolve, reject) => {
let stream: Duplex = sitemapStream;
if (sitemapUrl.pathname.endsWith('.gz')) {
let stream: Readable = streamWithType;

if (
contentType !== undefined
? contentType === 'application/gzip'
: sitemapUrl.pathname.endsWith('.gz')
) {
stream = stream.pipe(createGunzip()).on('error', reject);
sitemapUrl.pathname = sitemapUrl.pathname.substring(0, sitemapUrl.pathname.length - 3);

if (sitemapUrl.pathname.endsWith('.gz')) {
sitemapUrl.pathname = sitemapUrl.pathname.substring(
0,
sitemapUrl.pathname.length - 3,
);
}
}

stream.pipe(
this.createParser(
resolve,
reject,
parsingState,
sitemapStream.response!.headers['content-type'],
sitemapUrl,
),
);
stream.pipe(this.createParser(resolve, reject, parsingState, contentType, sitemapUrl));
});
}
} catch (e) {
log.warning(`Malformed sitemap content: ${sitemapUrl}`);
log.warning(`Malformed sitemap content: ${sitemapUrl}, ${e}`);
}
}

Expand Down Expand Up @@ -248,6 +258,6 @@ export class Sitemap {
return new SitemapTxtParser(parsingState, () => resolve(undefined));
}

throw new Error('Unsupported sitemap content type');
throw new Error(`Unsupported sitemap content type (contentType = ${contentType}, url = ${url?.toString()})`);
}
}
59 changes: 58 additions & 1 deletion packages/utils/test/sitemap.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ describe('Sitemap', () => {
nock.disableNetConnect();
nock('http://not-exists.com')
.persist()
.get((url) => url === '/sitemap_child.xml' || url === '/sitemap_child_2.xml')
.get(/\/sitemap_child(_[0-9]+)?.xml/)
.reply(
200,
[
Expand Down Expand Up @@ -67,6 +67,38 @@ describe('Sitemap', () => {
'base64',
),
)
.get('/non_gzipped_sitemap.xml.gz')
.reply(
200,
[
'<?xml version="1.0" encoding="UTF-8"?>',
'<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
'<url>',
'<loc>http://not-exists.com/catalog?item=80&amp;desc=vacation_turkey</loc>',
'<lastmod>2004-11-23</lastmod>',
'</url>',
'<url>',
'<loc>http://not-exists.com/catalog?item=81&amp;desc=vacation_maledives</loc>',
'<lastmod>2004-11-23</lastmod>',
'</url>',
'</urlset>',
].join('\n'),
)
.get('/sneakily_gzipped_sitemap.xml')
.reply(
200,
Buffer.from(
[
'H4sIAAAAAAAAA62S306DMBTG73kK0gtvDLSFLSKWcucTzOulKR00QottGZtPbxfQEEWXqElzkvMv',
'3y/fKSlPXRsehbFSqwLgGIFQKK4rqeoCPO0eowyUNCCDaa1woR9WtgCNc30O4TiOsZVOdKy3sTY1',
'tLzxiYVzEaL4HkzLPraa03lRaReJk7TOxlx3kMBLz08w6zpd0QShbYSwf74z1wLCG6ZqcTDihXZa',
'uaY9E7ioBaQ3UhvpzhTFGYEfWUDgBHANgzPHWl2XF/gCJzes6x8qYXlxZL7l/dk3bGRSvuMuxEch',
'nr/w/Eb2Ll2RVWLcvwrWMlWtWLWJcBIl6TdW/R/ZZp3soAdV/Yy2w1mOUI63tz4itCRd3Cz9882y',
'NfMGy9bJ8CfTZkU4fXUavAGtDs17GwMAAA==',
].join('\n'),
'base64',
),
)
.get('/sitemap_parent.xml')
.reply(
200,
Expand Down Expand Up @@ -270,4 +302,29 @@ describe('Sitemap', () => {
]),
);
});

it("loads XML sitemap even though it's gzipped according to file extension", async () => {
const sitemap = await Sitemap.load('http://not-exists.com/non_gzipped_sitemap.xml.gz');

expect(new Set(sitemap.urls)).toEqual(
new Set([
'http://not-exists.com/catalog?item=80&desc=vacation_turkey',
'http://not-exists.com/catalog?item=81&desc=vacation_maledives',
]),
);
});

it("loads gzipped sitemap even though it's not gzipped according to file extension", async () => {
const sitemap = await Sitemap.load('http://not-exists.com/sneakily_gzipped_sitemap.xml');

expect(new Set(sitemap.urls)).toEqual(
new Set([
'http://not-exists.com/',
'http://not-exists.com/catalog?item=12&desc=vacation_hawaii',
'http://not-exists.com/catalog?item=73&desc=vacation_new_zealand',
'http://not-exists.com/catalog?item=74&desc=vacation_newfoundland',
'http://not-exists.com/catalog?item=83&desc=vacation_usa',
]),
);
});
});
57 changes: 56 additions & 1 deletion yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -827,6 +827,7 @@ __metadata:
"@types/sax": "npm:^1.2.7"
"@types/whatwg-mimetype": "npm:^3.0.2"
cheerio: "npm:^1.0.0-rc.12"
file-type: "npm:^19.0.0"
got-scraping: "npm:^4.0.3"
ow: "npm:^0.28.1"
robots-parser: "npm:^3.0.1"
Expand Down Expand Up @@ -1961,6 +1962,13 @@ __metadata:
languageName: node
linkType: hard

"@tokenizer/token@npm:^0.3.0":
version: 0.3.0
resolution: "@tokenizer/token@npm:0.3.0"
checksum: 10c0/7ab9a822d4b5ff3f5bca7f7d14d46bdd8432528e028db4a52be7fbf90c7f495cc1af1324691dda2813c6af8dc4b8eb29de3107d4508165f9aa5b53e7d501f155
languageName: node
linkType: hard

"@tootallnate/once@npm:2":
version: 2.0.0
resolution: "@tootallnate/once@npm:2.0.0"
Expand Down Expand Up @@ -5642,6 +5650,17 @@ __metadata:
languageName: node
linkType: hard

"file-type@npm:^19.0.0":
version: 19.0.0
resolution: "file-type@npm:19.0.0"
dependencies:
readable-web-to-node-stream: "npm:^3.0.2"
strtok3: "npm:^7.0.0"
token-types: "npm:^5.0.1"
checksum: 10c0/1884c3627f5922365f86cb19f107850fe7b72d78bef5c2affc92aa098ba414c944e3763101068236345737f44a5b6da13bb0ba79de4c4e3b1b1c68e1958643d9
languageName: node
linkType: hard

"filelist@npm:^1.0.4":
version: 1.0.4
resolution: "filelist@npm:1.0.4"
Expand Down Expand Up @@ -6737,7 +6756,7 @@ __metadata:
languageName: node
linkType: hard

"ieee754@npm:^1.1.13":
"ieee754@npm:^1.1.13, ieee754@npm:^1.2.1":
version: 1.2.1
resolution: "ieee754@npm:1.2.1"
checksum: 10c0/b0782ef5e0935b9f12883a2e2aa37baa75da6e66ce6515c168697b42160807d9330de9a32ec1ed73149aea02e0d822e572bca6f1e22bdcbd2149e13b050b17bb
Expand Down Expand Up @@ -9826,6 +9845,13 @@ __metadata:
languageName: node
linkType: hard

"peek-readable@npm:^5.0.0":
version: 5.0.0
resolution: "peek-readable@npm:5.0.0"
checksum: 10c0/060aece3a907a157b4839aa923b61b664b59cac7296dc8d8e0ddcc39065a4f1e328dd2f171c8a49e869aabc6e076a1be59f939183fb0ababc81f3c870006d672
languageName: node
linkType: hard

"pend@npm:~1.2.0":
version: 1.2.0
resolution: "pend@npm:1.2.0"
Expand Down Expand Up @@ -10393,6 +10419,15 @@ __metadata:
languageName: node
linkType: hard

"readable-web-to-node-stream@npm:^3.0.2":
version: 3.0.2
resolution: "readable-web-to-node-stream@npm:3.0.2"
dependencies:
readable-stream: "npm:^3.6.0"
checksum: 10c0/533d5cd1580232a2c753e52a245be13fc552e6f82c5053a8a8da7ea1063d73a34f936a86b3d4433cdb4a13dd683835cfc87f230936cb96d329a1e28b6040f42e
languageName: node
linkType: hard

"redent@npm:^3.0.0":
version: 3.0.0
resolution: "redent@npm:3.0.0"
Expand Down Expand Up @@ -11481,6 +11516,16 @@ __metadata:
languageName: node
linkType: hard

"strtok3@npm:^7.0.0":
version: 7.0.0
resolution: "strtok3@npm:7.0.0"
dependencies:
"@tokenizer/token": "npm:^0.3.0"
peek-readable: "npm:^5.0.0"
checksum: 10c0/63a72b10a302719242bfd31ca53955a06bb091dfec46ef14ca10c4b17ab15780ed8365cd5b270cfbde92d571f677539957add436e4bf9cccdf9977b40d762583
languageName: node
linkType: hard

"supports-color@npm:^2.0.0":
version: 2.0.0
resolution: "supports-color@npm:2.0.0"
Expand Down Expand Up @@ -11723,6 +11768,16 @@ __metadata:
languageName: node
linkType: hard

"token-types@npm:^5.0.1":
version: 5.0.1
resolution: "token-types@npm:5.0.1"
dependencies:
"@tokenizer/token": "npm:^0.3.0"
ieee754: "npm:^1.2.1"
checksum: 10c0/cb671b2b52271362816d22b7a076082b0da033cd7807992b81ae53cfd8541bd013ac29e455c3c7a8bb4f88aa1c5315a12353c3599b7f568df238d3c1723f9d8d
languageName: node
linkType: hard

"tough-cookie@npm:^4.0.0, tough-cookie@npm:^4.1.3":
version: 4.1.4
resolution: "tough-cookie@npm:4.1.4"
Expand Down

0 comments on commit 62a9f40

Please sign in to comment.