From 24b6a4474db70819a5f7a24a78ab2ca3f65aab9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Ad=C3=A1mek?= Date: Thu, 16 Mar 2023 14:20:39 +0100 Subject: [PATCH 1/2] fix: use `new URL` for URL normalization This deprecates the custom `parseUrl` function and uses `new URL` for normalization instead, which helps with some edge cases like having `@` symbol inside the query string. Closes https://github.com/apify/crawlee/issues/1831 --- packages/utilities/src/utilities.client.ts | 44 ++++++++++++---------- test/utilities.client.test.ts | 7 +++- 2 files changed, 31 insertions(+), 20 deletions(-) diff --git a/packages/utilities/src/utilities.client.ts b/packages/utilities/src/utilities.client.ts index a5d5dc7dc..6af3a0106 100644 --- a/packages/utilities/src/utilities.client.ts +++ b/packages/utilities/src/utilities.client.ts @@ -82,6 +82,9 @@ interface Uri { fragmentKey?: Record; } +/** + * @deprecated use `new URL()` instead + */ export function parseUrl(str: string): Uri { if (typeof str !== 'string') return {}; const o = { @@ -127,28 +130,31 @@ export function normalizeUrl(url: string, keepFragment?: boolean) { return null; } - const urlObj = parseUrl(url.trim()); - if (!urlObj.protocol || !urlObj.host) { + let urlObj; + + try { + urlObj = new URL(url.replace(/ +/g, '')); + } catch { return null; } - const path = urlObj.path!.replace(/\/$/, ''); - const params = (urlObj.query - ? urlObj.query - .split('&') - .filter((param) => { - return !/^utm_/.test(param); - }) - .sort() - : [] - ); - - return `${urlObj.protocol.trim().toLowerCase() - }://${ - urlObj.host.trim().toLowerCase() - }${path.trim() - }${params.length ? `?${params.join('&').trim()}` : '' - }${keepFragment && urlObj.fragment ? `#${urlObj.fragment.trim()}` : ''}`; + const { searchParams } = urlObj; + + for (const key of [...searchParams.keys()]) { + if (key.startsWith('utm_')) { + searchParams.delete(key); + } + } + + searchParams.sort(); + + const protocol = urlObj.protocol.toLowerCase(); + const host = urlObj.host.toLowerCase(); + const path = urlObj.pathname.replace(/\/$/, ''); + const search = searchParams.toString() ? `?${searchParams}` : ''; + const hash = keepFragment ? urlObj.hash : ''; + + return `${protocol}//${host}${path}${search}${hash}`; } // Helper function for markdown rendered marked diff --git a/test/utilities.client.test.ts b/test/utilities.client.test.ts index 48df14741..9ab8dfa0b 100644 --- a/test/utilities.client.test.ts +++ b/test/utilities.client.test.ts @@ -527,7 +527,12 @@ describe('utilities.client', () => { }); it('should not touch invalid or empty params', () => { - expect(normalizeUrl('http://example.com/?neco=&dalsi')).toEqual('http://example.com?dalsi&neco='); + expect(normalizeUrl('http://example.com/?neco=&dalsi')).toEqual('http://example.com?dalsi=&neco='); + }); + + it('should work with @ inside query', () => { + expect(normalizeUrl('https://www.google.com/maps/search/restaurant/@39.102972537998426,39.54835927707177,4z?foo=bar&aaa=bbb')) + .toEqual('https://www.google.com/maps/search/restaurant/@39.102972537998426,39.54835927707177,4z?aaa=bbb&foo=bar'); }); it('should normalize real-world URLs', () => { From d88f2fbb12313f5232c41d52dfc7d8df2a79eb08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Ad=C3=A1mek?= Date: Fri, 17 Mar 2023 12:24:48 +0100 Subject: [PATCH 2/2] refactor: keep spaces in the URL before parsing --- packages/utilities/src/utilities.client.ts | 2 +- test/utilities.client.test.ts | 20 ++++++++++++++++---- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/packages/utilities/src/utilities.client.ts b/packages/utilities/src/utilities.client.ts index 6af3a0106..8be3599b5 100644 --- a/packages/utilities/src/utilities.client.ts +++ b/packages/utilities/src/utilities.client.ts @@ -133,7 +133,7 @@ export function normalizeUrl(url: string, keepFragment?: boolean) { let urlObj; try { - urlObj = new URL(url.replace(/ +/g, '')); + urlObj = new URL(url.trim()); } catch { return null; } diff --git a/test/utilities.client.test.ts b/test/utilities.client.test.ts index 9ab8dfa0b..ab26db1cb 100644 --- a/test/utilities.client.test.ts +++ b/test/utilities.client.test.ts @@ -487,6 +487,16 @@ describe('utilities.client', () => { expect(normalizeUrl('https://example.com')).toEqual('https://example.com'); }); + it('edge cases', () => { + expect(normalizeUrl('a https://example.com b')).toEqual(null); + expect(normalizeUrl('https://example.com?q=foo bar')).toEqual('https://example.com?q=foo+bar'); + expect(normalizeUrl('https://example.com?q=foo+bar')).toEqual('https://example.com?q=foo+bar'); + expect(normalizeUrl('https://google.com/maps/search/restaurant prague/@39.1029725,39.5483593,4z')) + .toEqual('https://google.com/maps/search/restaurant%20prague/@39.1029725,39.5483593,4z'); + expect(normalizeUrl('https://google.com/maps/search/restaurantprague/@39.1029725,39.5483593,4z')) + .toEqual('https://google.com/maps/search/restaurantprague/@39.1029725,39.5483593,4z'); + }); + it('should lowercase hostname and protocols', () => { expect(normalizeUrl('httpS://EXAMPLE.cOm')).toEqual('https://example.com'); expect(normalizeUrl('hTTp://www.EXAMPLE.com')).toEqual('http://www.example.com'); @@ -546,10 +556,12 @@ describe('utilities.client', () => { expect(normalizeUrl('http://notebooky.heureka.cz/f:2111:25235;2278:9720,9539;p:579,580/')).toEqual(expected); }); - it('should trim all parts of URL', () => { - expect(normalizeUrl(' http :// test # fragment ')).toEqual('http://test'); - expect(normalizeUrl(' http :// test # fragment ', true)).toEqual('http://test#fragment'); - }); + // this is no longer a valid URL and results in `null`, if we want to support it, + // we will need some regexp magic to first remove the spaces + // it('should trim all parts of URL', () => { + // expect(normalizeUrl(' http :// test # fragment ')).toEqual('http://test'); + // expect(normalizeUrl(' http :// test # fragment ', true)).toEqual('http://test#fragment'); + // }); }); describe('#buildOrVersionNumberIntToStr()', () => { it('should convert build number to string', () => {