Optimize String.prototype.normalize (#2848)

We currently use `unicode_normalization` to implement the `String.prototype.normalize` method. However, that crate doesn't support UTF-16 as a first-class string type, so we had to resort to a workaround: decoding the valid parts of a string into `char`s, normalizing each part, encoding the result back to UTF-16, and concatenating everything with the unpaired surrogates in between. All of this is clearly suboptimal for performance, which is why I switched to the `icu_normalizer` crate, which does support UTF-16 input, to replace our current implementation. Additionally, when the `intl` feature is enabled, users can override the default normalization data by supplying the required data through the `BoaProvider` data provider.
boa-dev · Apr 23, 2023 · 739bd5a · 739bd5a
1 parent 93b52cd
commit 739bd5a
Show file tree

Hide file tree

Showing 29 changed files with 655 additions and 115 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/boa_engine/Cargo.toml b/boa_engine/Cargo.toml
@@ -15,7 +15,9 @@ rust-version.workspace = true
 profiler = ["boa_profiler/profiler"]
 deser = ["boa_interner/serde", "boa_ast/serde"]
 intl = [
-    "dep:boa_icu_provider",
+    "boa_icu_provider/full",
+    "icu_normalizer/serde",
+    "icu_normalizer/std",
     "dep:icu_locid_transform",
     "dep:icu_locid",
     "dep:icu_datetime",
@@ -53,6 +55,7 @@ boa_profiler.workspace = true
 boa_macros.workspace = true
 boa_ast.workspace = true
 boa_parser.workspace = true
+boa_icu_provider.workspace = true
 serde = { version = "1.0.160", features = ["derive", "rc"] }
 serde_json = "1.0.96"
 rand = "0.8.5"
@@ -66,7 +69,6 @@ indexmap = "1.9.3"
 ryu-js = "0.2.2"
 chrono = { version = "0.4.24", default-features = false, features = ["clock", "std"] }
 fast-float = "0.2.0"
-unicode-normalization = "0.1.22"
 once_cell = "1.17.1"
 tap = "1.0.1"
 sptr = "0.3.2"
@@ -77,10 +79,10 @@ num_enum = "0.6.1"
 pollster = "0.3.0"
 thin-vec = "0.2.12"
 itertools = { version = "0.10.5", default-features = false }
+icu_normalizer = "1.2.0"
 
 # intl deps
-boa_icu_provider = { workspace = true, optional = true }
-icu_locid_transform = { version = "1.2.1", features = ["serde"], optional = true }
+icu_locid_transform = { version = "1.2.1", features = ["std", "serde"], optional = true }
 icu_locid = { version = "1.2.0", features = ["serde"], optional = true }
 icu_datetime = { version = "1.2.0", features = ["serde", "experimental"], optional = true }
 icu_calendar = { version = "1.2.0", optional = true }

diff --git a/boa_engine/src/builtins/string/mod.rs b/boa_engine/src/builtins/string/mod.rs
@@ -24,13 +24,23 @@ use crate::{
     Context, JsArgs, JsResult, JsString, JsValue,
 };
 use boa_profiler::Profiler;
+use icu_normalizer::{ComposingNormalizer, DecomposingNormalizer};
 use std::cmp::{max, min};
 
 use super::{BuiltInBuilder, BuiltInConstructor, IntrinsicObject};
 
 mod string_iterator;
 pub(crate) use string_iterator::StringIterator;
 
+/// The set of normalizers required for the `String.prototype.normalize` function.
+#[derive(Debug)]
+pub(crate) struct StringNormalizers {
+    pub(crate) nfc: ComposingNormalizer,
+    pub(crate) nfkc: ComposingNormalizer,
+    pub(crate) nfd: DecomposingNormalizer,
+    pub(crate) nfkd: DecomposingNormalizer,
+}
+
 #[cfg(test)]
 mod tests;
 
@@ -2024,7 +2034,6 @@ impl String {
         args: &[JsValue],
         context: &mut Context<'_>,
     ) -> JsResult<JsValue> {
-        use unicode_normalization::UnicodeNormalization;
         /// Represents the type of normalization applied to a [`JsString`]
         #[derive(Clone, Copy)]
         pub(crate) enum Normalization {
@@ -2033,79 +2042,72 @@ impl String {
             Nfkc,
             Nfkd,
         }
+
         // 1. Let O be ? RequireObjectCoercible(this value).
         let this = this.require_object_coercible()?;
 
         // 2. Let S be ? ToString(O).
         let s = this.to_string(context)?;
 
-        let f = match args.get_or_undefined(0) {
-            // 3. If form is undefined, let f be "NFC".
-            &JsValue::Undefined => js_string!("NFC"),
-            // 4. Else, let f be ? ToString(form).
-            form => form.to_string(context)?,
-        };
-
         // 6. Let ns be the String value that is the result of normalizing S
         // into the normalization form named by f as specified in
         // https://unicode.org/reports/tr15/.
-        let normalization = match f {
-            ntype if &ntype == utf16!("NFC") => Normalization::Nfc,
-            ntype if &ntype == utf16!("NFD") => Normalization::Nfd,
-            ntype if &ntype == utf16!("NFKC") => Normalization::Nfkc,
-            ntype if &ntype == utf16!("NFKD") => Normalization::Nfkd,
-            // 5. If f is not one of "NFC", "NFD", "NFKC", or "NFKD", throw a RangeError exception.
-            _ => {
-                return Err(JsNativeError::range()
-                    .with_message("The normalization form should be one of NFC, NFD, NFKC, NFKD.")
-                    .into());
-            }
-        };
-
-        let mut code_points = s.code_points();
-        let mut result = Vec::with_capacity(s.len());
-
-        let mut next_unpaired_surrogate = None;
-        let mut buf = [0; 2];
-
-        loop {
-            let only_chars = code_points.by_ref().map_while(|cpoint| match cpoint {
-                CodePoint::Unicode(c) => Some(c),
-                CodePoint::UnpairedSurrogate(s) => {
-                    next_unpaired_surrogate = Some(s);
-                    None
+        let normalization = match args.get_or_undefined(0) {
+            // 3. If form is undefined, let f be "NFC".
+            &JsValue::Undefined => Normalization::Nfc,
+            // 4. Else, let f be ? ToString(form).
+            f => match f.to_string(context)? {
+                ntype if &ntype == utf16!("NFC") => Normalization::Nfc,
+                ntype if &ntype == utf16!("NFD") => Normalization::Nfd,
+                ntype if &ntype == utf16!("NFKC") => Normalization::Nfkc,
+                ntype if &ntype == utf16!("NFKD") => Normalization::Nfkd,
+                // 5. If f is not one of "NFC", "NFD", "NFKC", or "NFKD", throw a RangeError exception.
+                _ => {
+                    return Err(JsNativeError::range()
+                        .with_message(
+                            "The normalization form should be one of NFC, NFD, NFKC, NFKD.",
+                        )
+                        .into());
                 }
-            });
+            },
+        };
 
-            match normalization {
-                Normalization::Nfc => {
-                    for mapped in only_chars.nfc() {
-                        result.extend_from_slice(mapped.encode_utf16(&mut buf));
-                    }
-                }
-                Normalization::Nfd => {
-                    for mapped in only_chars.nfd() {
-                        result.extend_from_slice(mapped.encode_utf16(&mut buf));
-                    }
-                }
-                Normalization::Nfkc => {
-                    for mapped in only_chars.nfkc() {
-                        result.extend_from_slice(mapped.encode_utf16(&mut buf));
-                    }
-                }
-                Normalization::Nfkd => {
-                    for mapped in only_chars.nfkd() {
-                        result.extend_from_slice(mapped.encode_utf16(&mut buf));
+        let normalizers = {
+            #[cfg(not(feature = "intl"))]
+            {
+                use once_cell::sync::Lazy;
+                static NORMALIZERS: Lazy<StringNormalizers> = Lazy::new(|| {
+                    let provider = &boa_icu_provider::minimal();
+                    let nfc = ComposingNormalizer::try_new_nfc_unstable(provider)
+                        .expect("minimal data should always be updated");
+                    let nfkc = ComposingNormalizer::try_new_nfkc_unstable(provider)
+                        .expect("minimal data should always be updated");
+                    let nfd = DecomposingNormalizer::try_new_nfd_unstable(provider)
+                        .expect("minimal data should always be updated");
+                    let nfkd = DecomposingNormalizer::try_new_nfkd_unstable(provider)
+                        .expect("minimal data should always be updated");
+
+                    StringNormalizers {
+                        nfc,
+                        nfkc,
+                        nfd,
+                        nfkd,
                     }
-                }
+                });
+                &*NORMALIZERS
             }
-
-            if let Some(surr) = next_unpaired_surrogate.take() {
-                result.push(surr);
-            } else {
-                break;
+            #[cfg(feature = "intl")]
+            {
+                context.icu().string_normalizers()
             }
-        }
+        };
+
+        let result = match normalization {
+            Normalization::Nfc => normalizers.nfc.normalize_utf16(&s),
+            Normalization::Nfd => normalizers.nfd.normalize_utf16(&s),
+            Normalization::Nfkc => normalizers.nfkc.normalize_utf16(&s),
+            Normalization::Nfkd => normalizers.nfkd.normalize_utf16(&s),
+        };
 
         // 7. Return ns.
         Ok(js_string!(result).into())