Skip to content

Commit

Permalink
fix: sitemap content-type check breaks on content-type parameters (
Browse files Browse the repository at this point in the history
…#2442)

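According to
[RFC 1341](https://www.w3.org/Protocols/rfc1341/4_Content-Type.html), the
`Content-Type` header can carry additional parameters after the media type
(e.g. `text/xml; charset=utf-8`), so comparing the raw header value against
a bare MIME type string fails for such responses.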
  • Loading branch information
barjin committed May 6, 2024
1 parent 1de6d0f commit db7d372
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 3 deletions.
6 changes: 5 additions & 1 deletion packages/utils/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@
"ow": "^0.28.1",
"robots-parser": "^3.0.1",
"sax": "^1.3.0",
"tslib": "^2.4.0"
"tslib": "^2.4.0",
"whatwg-mimetype": "^4.0.0"
},
"devDependencies": {
"@types/whatwg-mimetype": "^3.0.2"
}
}
12 changes: 10 additions & 2 deletions packages/utils/src/internals/sitemap.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import { createGunzip } from 'node:zlib';
import log from '@apify/log';
import type { SAXStream } from 'sax';
import sax from 'sax';
import MIMEType from 'whatwg-mimetype';

class ParsingState {
sitemapUrls: string[] = [];
Expand Down Expand Up @@ -170,12 +171,19 @@ export class Sitemap {

const parser = (() => {
const contentType = sitemapStream.response!.headers['content-type'];
let mimeType: MIMEType | null;

if (['text/xml', 'application/xml'].includes(contentType ?? '') || sitemapUrl.pathname.endsWith('.xml')) {
try {
mimeType = new MIMEType(contentType ?? '');
} catch (e) {
mimeType = null;
}

if (mimeType?.isXML() || sitemapUrl.pathname.endsWith('.xml')) {
return Sitemap.createXmlParser(parsingState, () => resolve(undefined), reject);
}

if (contentType === 'text/plain' || sitemapUrl.pathname.endsWith('.txt')) {
if (mimeType?.essence === 'text/plain' || sitemapUrl.pathname.endsWith('.txt')) {
return new SitemapTxtParser(parsingState, () => resolve(undefined));
}

Expand Down
9 changes: 9 additions & 0 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -731,12 +731,14 @@ __metadata:
"@apify/ps-tree": "npm:^1.2.0"
"@crawlee/types": "npm:3.9.2"
"@types/sax": "npm:^1.2.7"
"@types/whatwg-mimetype": "npm:^3.0.2"
cheerio: "npm:^1.0.0-rc.12"
got-scraping: "npm:^4.0.3"
ow: "npm:^0.28.1"
robots-parser: "npm:^3.0.1"
sax: "npm:^1.3.0"
tslib: "npm:^2.4.0"
whatwg-mimetype: "npm:^4.0.0"
languageName: unknown
linkType: soft

Expand Down Expand Up @@ -2261,6 +2263,13 @@ __metadata:
languageName: node
linkType: hard

"@types/whatwg-mimetype@npm:^3.0.2":
version: 3.0.2
resolution: "@types/whatwg-mimetype@npm:3.0.2"
checksum: 10c0/dad39d1e4abe760a0a963c84bbdbd26b1df0eb68aff83bdf6ecbb50ad781ead777f6906d19a87007790b750f7500a12e5624d31fc6a1529d14bd19b5c3a316d1
languageName: node
linkType: hard

"@types/yargs-parser@npm:*":
version: 21.0.3
resolution: "@types/yargs-parser@npm:21.0.3"
Expand Down

0 comments on commit db7d372

Please sign in to comment.