Merge pull request #2189 from roja-a-m/invalid-prettify-object-with-unicode

Fix - Invalid prettification of object with unicode as key
beautifier · Aug 30, 2023 · 44b7131 · 44b7131
2 parents 75a9093 + 88c0ab5
commit 44b7131
Show file tree

Hide file tree

Showing 7 changed files with 80 additions and 14 deletions.
diff --git a/js/src/javascript/acorn.js b/js/src/javascript/acorn.js
diff --git a/js/src/javascript/tokenizer.js b/js/src/javascript/tokenizer.js
@@ -484,6 +484,9 @@ function unescape_string(s) {
         matched = input_scan.match(/x([0-9A-Fa-f]{2})/g);
       } else if (input_scan.peek() === 'u') {
         matched = input_scan.match(/u([0-9A-Fa-f]{4})/g);
+        if (!matched) {
+          matched = input_scan.match(/u\{([0-9A-Fa-f]+)\}/g);
+        }
       } else {
         out += '\\';
         if (input_scan.hasNext()) {
@@ -507,7 +510,9 @@ function unescape_string(s) {
       } else if (escaped >= 0x00 && escaped < 0x20) {
         // leave 0x00...0x1f escaped
         out += '\\' + matched[0];
-        continue;
+      } else if (escaped > 0x10FFFF) {
+        // If the escape sequence is out of bounds, keep the original sequence and continue conversion
+        out += '\\' + matched[0];
       } else if (escaped === 0x22 || escaped === 0x27 || escaped === 0x5c) {
         // single-quote, apostrophe, backslash - escape these
         out += '\\' + String.fromCharCode(escaped);

diff --git a/python/jsbeautifier/javascript/acorn.py b/python/jsbeautifier/javascript/acorn.py
@@ -43,14 +43,20 @@
 # _nonASCIIidentifierStart = re.compile("[" + _nonASCIIidentifierStartChars + "]")
 # _nonASCIIidentifier = re.compile("[" + _nonASCIIidentifierStartChars + _nonASCIIidentifierChars + "]")
 
+_unicodeEscapeOrCodePoint = six.u(r"\\u[0-9a-fA-F]{4}|\\u\{[0-9a-fA-F]+\}")
+
 _identifierStart = (
-    six.u(r"(?:\\u[0-9a-fA-F]{4}|[")
+    six.u("(?:")
+    + _unicodeEscapeOrCodePoint
+    + six.u("|[")
     + _baseASCIIidentifierStartChars
     + _nonASCIIidentifierStartChars
     + six.u("])")
 )
 _identifierChars = (
-    six.u(r"(?:\\u[0-9a-fA-F]{4}|[")
+    six.u("(?:")
+    + _unicodeEscapeOrCodePoint
+    + six.u("|[")
     + _baseASCIIidentifierChars
     + _nonASCIIidentifierStartChars
     + _nonASCIIidentifierChars
@@ -61,7 +67,9 @@
 
 identifierStart = re.compile(_identifierStart)
 identifierMatch = re.compile(
-    six.u(r"(?:\\u[0-9a-fA-F]{4}|[")
+    six.u("(?:")
+    + _unicodeEscapeOrCodePoint
+    + six.u("|[")
     + _baseASCIIidentifierChars
     + _nonASCIIidentifierStartChars
     + _nonASCIIidentifierChars

diff --git a/python/jsbeautifier/javascript/tokenizer.py b/python/jsbeautifier/javascript/tokenizer.py
@@ -600,6 +600,8 @@ def unescape_string(self, s):
                 matched = input_scan.match(re.compile(r"x([0-9A-Fa-f]{2})"))
             elif input_scan.peek() == "u":
                 matched = input_scan.match(re.compile(r"u([0-9A-Fa-f]{4})"))
+                if not matched:
+                    matched = input_scan.match(re.compile(r"u\{([0-9A-Fa-f]+)\}"))
             else:
                 out += "\\"
                 if input_scan.hasNext():
@@ -620,7 +622,9 @@ def unescape_string(self, s):
             elif escaped >= 0x00 and escaped < 0x20:
                 # leave 0x00...0x1f escaped
                 out += "\\" + matched.group(0)
-                continue
+            elif escaped > 0x10FFFF:
+                # If the escape sequence is out of bounds, keep the original sequence and continue conversion
+                out += "\\" + matched.group(0)
             elif escaped == 0x22 or escaped == 0x27 or escaped == 0x5C:
                 # single-quote, apostrophe, backslash - escape these
                 out += "\\" + chr(escaped)

diff --git a/test/data/javascript/node.mustache b/test/data/javascript/node.mustache
@@ -453,21 +453,40 @@ function run_javascript_tests(test_obj, Urlencoded, js_beautify, html_beautify,
         bt('"—"');
         bt('"\\x41\\x42\\x43\\x01"', '"\\x41\\x42\\x43\\x01"');
         bt('"\\u2022"', '"\\u2022"');
+        bt('"\\u{2022}"', '"\\u{2022}"');
         bt('a = /\s+/');
         // bt('a = /\\x41/','a = /A/');
         bt('"\\u2022";a = /\s+/;"\\x41\\x42\\x43\\x01".match(/\\x41/);','"\\u2022";\na = /\s+/;\n"\\x41\\x42\\x43\\x01".match(/\\x41/);');
-        test_fragment('"\\x22\\x27",\'\\x22\\x27\',"\\x5c",\'\\x5c\',"\\xff and \\xzz","unicode \\u0000 \\u0022 \\u0027 \\u005c \\uffff \\uzzzz"', '"\\x22\\x27", \'\\x22\\x27\', "\\x5c", \'\\x5c\', "\\xff and \\xzz", "unicode \\u0000 \\u0022 \\u0027 \\u005c \\uffff \\uzzzz"');
+
+        test_fragment('"\\x41\\x42\\x01\\x43"');
+        test_fragment('"\\x41\\x42\\u0001\\x43"');
+        test_fragment('"\\x41\\x42\\u{0001}\\x43"');
+        test_fragment('"\\x20\\x40\\x4a"');
+        test_fragment('"\\xff\\x40\\x4a"');
+        test_fragment('"\\u0072\\u016B\\u0137\\u012B\\u0074\\u0069\\u0073"');
+        test_fragment('"\\u{0072}\\u{016B}\\u{110000}\\u{137}\\u012B\\x74\\u{0000069}\\u{073}"');
+        test_fragment('"Google Chrome est\\u00E1 actualizado."');
+        test_fragment(
+            '"\\x22\\x27",\'\\x22\\x27\',"\\x5c",\'\\x5c\',"\\xff and \\xzz","unicode \\u0000 \\u0022 \\u0027 \\u005c \\uffff \\uzzzz"',
+            '"\\x22\\x27", \'\\x22\\x27\', "\\x5c", \'\\x5c\', "\\xff and \\xzz", "unicode \\u0000 \\u0022 \\u0027 \\u005c \\uffff \\uzzzz"');
 
         opts.unescape_strings = true;
+        
+        test_fragment('"\\x41\\x42\\x01\\x43"', '"AB\\x01C"');
+        test_fragment('"\\x41\\x42\\u0001\\x43"', '"AB\\u0001C"');
+        test_fragment('"\\x41\\x42\\u{0001}\\x43"', '"AB\\u{0001}C"');
         test_fragment('"\\x20\\x40\\x4a"', '" @J"');
         test_fragment('"\\xff\\x40\\x4a"');
         test_fragment('"\\u0072\\u016B\\u0137\\u012B\\u0074\\u0069\\u0073"', '"\u0072\u016B\u0137\u012B\u0074\u0069\u0073"');
+        test_fragment('"\\u{0072}\\u{016B}\\u{110000}\\u{137}\\u012B\\x74\\u{0000069}\\u{073}"', '"\u0072\u016B\\u{110000}\u0137\u012B\u0074\u0069\u0073"');
         test_fragment('"Google Chrome est\\u00E1 actualizado."', '"Google Chrome está actualizado."');
-        test_fragment('"\\x22\\x27",\'\\x22\\x27\',"\\x5c",\'\\x5c\',"\\xff and \\xzz","unicode \\u0000 \\u0022 \\u0027 \\u005c \\uffff"',
-           '"\\"\\\'", \'\\"\\\'\', "\\\\", \'\\\\\', "\\xff and \\xzz", "unicode \\u0000 \\" \\\' \\\\ ' + unicode_char(0xffff) + '"');
+        test_fragment(
+            '"\\x22\\x27",\'\\x22\\x27\',"\\x5c",\'\\x5c\',"\\xff and \\xzz","unicode \\u0000 \\u0022 \\u0027 \\u005c \\uffff"',
+            '"\\"\\\'", \'\\"\\\'\', "\\\\", \'\\\\\', "\\xff and \\xzz", "unicode \\u0000 \\" \\\' \\\\ ' + unicode_char(0xffff) + '"');
 
         // For error case, return the string unchanged
-        test_fragment('"\\x22\\x27",\'\\x22\\x27\',"\\x5c",\'\\x5c\',"\\xff and \\xzz","unicode \\u0000 \\u0022 \\u0027 \\u005c \\uffff \\uzzzz"',
+        test_fragment(
+            '"\\x22\\x27",\'\\x22\\x27\',"\\x5c",\'\\x5c\',"\\xff and \\xzz","unicode \\u0000 \\u0022 \\u0027 \\u005c \\uffff \\uzzzz"',
             '"\\"\\\'", \'\\"\\\'\', "\\\\", \'\\\\\', "\\xff and \\xzz", "unicode \\u0000 \\u0022 \\u0027 \\u005c \\uffff \\uzzzz"');
 
         reset_options();

diff --git a/test/data/javascript/python.mustache b/test/data/javascript/python.mustache
@@ -78,17 +78,32 @@ class TestJSBeautifier(unittest.TestCase):
         bt('"—"')
         bt('"\\x41\\x42\\x43\\x01"', '"\\x41\\x42\\x43\\x01"')
         bt('"\\u2022"', '"\\u2022"')
+        bt('"\\u{2022}"', '"\\u{2022}"')
         bt('a = /\s+/')
         #bt('a = /\\x41/','a = /A/')
         bt('"\\u2022";a = /\s+/;"\\x41\\x42\\x43\\x01".match(/\\x41/);','"\\u2022";\na = /\s+/;\n"\\x41\\x42\\x43\\x01".match(/\\x41/);')
-        test_fragment('"\\x22\\x27",\'\\x22\\x27\',"\\x5c",\'\\x5c\',"\\xff and \\xzz","unicode \\u0000 \\u0022 \\u0027 \\u005c \\uffff \\uzzzz"', '"\\x22\\x27", \'\\x22\\x27\', "\\x5c", \'\\x5c\', "\\xff and \\xzz", "unicode \\u0000 \\u0022 \\u0027 \\u005c \\uffff \\uzzzz"')
+
+        test_fragment('"\\x41\\x42\\x01\\x43"')
+        test_fragment('"\\x41\\x42\\u0001\\x43"')
+        test_fragment('"\\x41\\x42\\u{0001}\\x43"')
+        test_fragment('"\\x20\\x40\\x4a"')
+        test_fragment('"\\xff\\x40\\x4a"')
+        test_fragment('"\\u0072\\u016B\\u0137\\u012B\\u0074\\u0069\\u0073"')
+        test_fragment('"\\u{0072}\\u{016B}\\u{110000}\\u{137}\\u012B\\x74\\u{0000069}\\u{073}"')
+        test_fragment('"Google Chrome est\\u00E1 actualizado."')
+        test_fragment(
+            '"\\x22\\x27",\'\\x22\\x27\',"\\x5c",\'\\x5c\',"\\xff and \\xzz","unicode \\u0000 \\u0022 \\u0027 \\u005c \\uffff \\uzzzz"',
+            '"\\x22\\x27", \'\\x22\\x27\', "\\x5c", \'\\x5c\', "\\xff and \\xzz", "unicode \\u0000 \\u0022 \\u0027 \\u005c \\uffff \\uzzzz"')
 
         self.options.unescape_strings = True
 
-        bt('"\\x41\\x42\\x43\\x01"', '"ABC\\x01"')
+        bt('"\\x41\\x42\\x01\\x43"', '"AB\\x01C"')
+        bt('"\\x41\\x42\\u0001\\x43"', '"AB\\u0001C"')
+        bt('"\\x41\\x42\\u{0001}\\x43"', '"AB\\u{0001}C"')
         test_fragment('"\\x20\\x40\\x4a"', '" @J"')
         test_fragment('"\\xff\\x40\\x4a"')
         test_fragment('"\\u0072\\u016B\\u0137\\u012B\\u0074\\u0069\\u0073"', six.u('"\u0072\u016B\u0137\u012B\u0074\u0069\u0073"'))
+        test_fragment('"\\u{0072}\\u{016B}\\u{110000}\\u{137}\\u012B\\x74\\u{0000069}\\u{073}"', six.u('"\u0072\u016B\\u{110000}\u0137\u012B\u0074\u0069\u0073"'))
 
         bt('a = /\s+/')
         test_fragment('"\\x22\\x27",\'\\x22\\x27\',"\\x5c",\'\\x5c\',"\\xff","unicode \\u0000 \\u0022 \\u0027 \\u005c \\uffff"',

diff --git a/test/data/javascript/tests.js b/test/data/javascript/tests.js
@@ -137,6 +137,20 @@ exports.test_data = {
       }, {
         input_: "var' + unicode_char(160) + unicode_char(3232) + '_' + unicode_char(3232) + ' = \"hi\";",
         output: "var ' + unicode_char(3232) + '_' + unicode_char(3232) + ' = \"hi\";"
+      }, {
+        comment: 'Issue #2159: Invalid prettification of object with unicode escape character as object key - test scenario: object with unicode as key',
+        input: '{\\\\u{1d4b6}:"ascr"}',
+        output: [
+          '{',
+          '    \\\\u{1d4b6}: "ascr"',
+          '}'
+        ]
+      }, {
+        unchanged: [
+          "var \\\\u{E4}\\\\u{ca0}\\\\u{0cA0}\\\\u{000000Ca0} = {",
+          "    \\\\u{ca0}rgerlich: true",
+          "};"
+        ]
       }]
     }, {
       name: "Test template and continuation strings",