From 24b6a4474db70819a5f7a24a78ab2ca3f65aab9a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Ad=C3=A1mek?= <banan23@gmail.com>
Date: Thu, 16 Mar 2023 14:20:39 +0100
Subject: [PATCH 1/2] fix: use `new URL` for URL normalization

This deprecates the custom `parseUrl` function and uses `new URL` for normalization instead,
which helps with some edge cases like having `@` symbol inside the query string.

Closes https://github.com/apify/crawlee/issues/1831
---
 packages/utilities/src/utilities.client.ts | 44 ++++++++++++----------
 test/utilities.client.test.ts              |  7 +++-
 2 files changed, 31 insertions(+), 20 deletions(-)

diff --git a/packages/utilities/src/utilities.client.ts b/packages/utilities/src/utilities.client.ts
index a5d5dc7dc..6af3a0106 100644
--- a/packages/utilities/src/utilities.client.ts
+++ b/packages/utilities/src/utilities.client.ts
@@ -82,6 +82,9 @@ interface Uri {
     fragmentKey?: Record<string, unknown>;
 }
 
+/**
+ * @deprecated use `new URL()` instead
+ */
 export function parseUrl(str: string): Uri {
     if (typeof str !== 'string') return {};
     const o = {
@@ -127,28 +130,31 @@ export function normalizeUrl(url: string, keepFragment?: boolean) {
         return null;
     }
 
-    const urlObj = parseUrl(url.trim());
-    if (!urlObj.protocol || !urlObj.host) {
+    let urlObj;
+
+    try {
+        urlObj = new URL(url.replace(/ +/g, ''));
+    } catch {
         return null;
     }
 
-    const path = urlObj.path!.replace(/\/$/, '');
-    const params = (urlObj.query
-        ? urlObj.query
-            .split('&')
-            .filter((param) => {
-                return !/^utm_/.test(param);
-            })
-            .sort()
-        : []
-    );
-
-    return `${urlObj.protocol.trim().toLowerCase()
-    }://${
-        urlObj.host.trim().toLowerCase()
-    }${path.trim()
-    }${params.length ? `?${params.join('&').trim()}` : ''
-    }${keepFragment && urlObj.fragment ? `#${urlObj.fragment.trim()}` : ''}`;
+    const { searchParams } = urlObj;
+
+    for (const key of [...searchParams.keys()]) {
+        if (key.startsWith('utm_')) {
+            searchParams.delete(key);
+        }
+    }
+
+    searchParams.sort();
+
+    const protocol = urlObj.protocol.toLowerCase();
+    const host = urlObj.host.toLowerCase();
+    const path = urlObj.pathname.replace(/\/$/, '');
+    const search = searchParams.toString() ? `?${searchParams}` : '';
+    const hash = keepFragment ? urlObj.hash : '';
+
+    return `${protocol}//${host}${path}${search}${hash}`;
 }
 
 // Helper function for markdown rendered marked
diff --git a/test/utilities.client.test.ts b/test/utilities.client.test.ts
index 48df14741..9ab8dfa0b 100644
--- a/test/utilities.client.test.ts
+++ b/test/utilities.client.test.ts
@@ -527,7 +527,12 @@ describe('utilities.client', () => {
         });
 
         it('should not touch invalid or empty params', () => {
-            expect(normalizeUrl('http://example.com/?neco=&dalsi')).toEqual('http://example.com?dalsi&neco=');
+            expect(normalizeUrl('http://example.com/?neco=&dalsi')).toEqual('http://example.com?dalsi=&neco=');
+        });
+
+        it('should work with @ inside query', () => {
+            expect(normalizeUrl('https://www.google.com/maps/search/restaurant/@39.102972537998426,39.54835927707177,4z?foo=bar&aaa=bbb'))
+                .toEqual('https://www.google.com/maps/search/restaurant/@39.102972537998426,39.54835927707177,4z?aaa=bbb&foo=bar');
         });
 
         it('should normalize real-world URLs', () => {

From d88f2fbb12313f5232c41d52dfc7d8df2a79eb08 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Ad=C3=A1mek?= <banan23@gmail.com>
Date: Fri, 17 Mar 2023 12:24:48 +0100
Subject: [PATCH 2/2] refactor: keep spaces in the URL before parsing

---
 packages/utilities/src/utilities.client.ts |  2 +-
 test/utilities.client.test.ts              | 20 ++++++++++++++++----
 2 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/packages/utilities/src/utilities.client.ts b/packages/utilities/src/utilities.client.ts
index 6af3a0106..8be3599b5 100644
--- a/packages/utilities/src/utilities.client.ts
+++ b/packages/utilities/src/utilities.client.ts
@@ -133,7 +133,7 @@ export function normalizeUrl(url: string, keepFragment?: boolean) {
     let urlObj;
 
     try {
-        urlObj = new URL(url.replace(/ +/g, ''));
+        urlObj = new URL(url.trim());
     } catch {
         return null;
     }
diff --git a/test/utilities.client.test.ts b/test/utilities.client.test.ts
index 9ab8dfa0b..ab26db1cb 100644
--- a/test/utilities.client.test.ts
+++ b/test/utilities.client.test.ts
@@ -487,6 +487,16 @@ describe('utilities.client', () => {
             expect(normalizeUrl('https://example.com')).toEqual('https://example.com');
         });
 
+        it('edge cases', () => {
+            expect(normalizeUrl('a   https://example.com   b')).toEqual(null);
+            expect(normalizeUrl('https://example.com?q=foo bar')).toEqual('https://example.com?q=foo+bar');
+            expect(normalizeUrl('https://example.com?q=foo+bar')).toEqual('https://example.com?q=foo+bar');
+            expect(normalizeUrl('https://google.com/maps/search/restaurant prague/@39.1029725,39.5483593,4z'))
+                .toEqual('https://google.com/maps/search/restaurant%20prague/@39.1029725,39.5483593,4z');
+            expect(normalizeUrl('https://google.com/maps/search/restaurantprague/@39.1029725,39.5483593,4z'))
+                .toEqual('https://google.com/maps/search/restaurantprague/@39.1029725,39.5483593,4z');
+        });
+
         it('should lowercase hostname and protocols', () => {
             expect(normalizeUrl('httpS://EXAMPLE.cOm')).toEqual('https://example.com');
             expect(normalizeUrl('hTTp://www.EXAMPLE.com')).toEqual('http://www.example.com');
@@ -546,10 +556,12 @@ describe('utilities.client', () => {
             expect(normalizeUrl('http://notebooky.heureka.cz/f:2111:25235;2278:9720,9539;p:579,580/')).toEqual(expected);
         });
 
-        it('should trim all parts of URL', () => {
-            expect(normalizeUrl('    http    ://     test    # fragment   ')).toEqual('http://test');
-            expect(normalizeUrl('    http   ://     test    # fragment   ', true)).toEqual('http://test#fragment');
-        });
+        // this is no longer a valid URL and results in `null`, if we want to support it,
+        // we will need some regexp magic to first remove the spaces
+        // it('should trim all parts of URL', () => {
+        //     expect(normalizeUrl('    http    ://     test    # fragment   ')).toEqual('http://test');
+        //     expect(normalizeUrl('    http   ://     test    # fragment   ', true)).toEqual('http://test#fragment');
+        // });
     });
     describe('#buildOrVersionNumberIntToStr()', () => {
         it('should convert build number to string', () => {