Skip to content

Commit

Permalink
Optimize String.prototype.normalize (#2848)
Browse files Browse the repository at this point in the history
We currently use `unicode_normalization` to implement the `String.prototype.normalize` method. However, the crate doesn't support UTF-16 as a first-class string encoding, so we had to resort to a workaround: converting the valid parts of a string to UTF-8, normalizing each part, encoding the results back to UTF-16, and concatenating everything with the unpaired surrogates in between. All of this is obviously suboptimal for performance, which is why I leveraged `icu_normalizer`, which does support UTF-16 input, to replace our current implementation.

Additionally, when the `intl` feature is enabled, this allows users to override the default normalization data by supplying the required data through the `BoaProvider` data provider.
  • Loading branch information
jedel1043 committed Apr 23, 2023
1 parent 93b52cd commit 739bd5a
Show file tree
Hide file tree
Showing 29 changed files with 655 additions and 115 deletions.
34 changes: 8 additions & 26 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 6 additions & 4 deletions boa_engine/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@ rust-version.workspace = true
profiler = ["boa_profiler/profiler"]
deser = ["boa_interner/serde", "boa_ast/serde"]
intl = [
"dep:boa_icu_provider",
"boa_icu_provider/full",
"icu_normalizer/serde",
"icu_normalizer/std",
"dep:icu_locid_transform",
"dep:icu_locid",
"dep:icu_datetime",
Expand Down Expand Up @@ -53,6 +55,7 @@ boa_profiler.workspace = true
boa_macros.workspace = true
boa_ast.workspace = true
boa_parser.workspace = true
boa_icu_provider.workspace = true
serde = { version = "1.0.160", features = ["derive", "rc"] }
serde_json = "1.0.96"
rand = "0.8.5"
Expand All @@ -66,7 +69,6 @@ indexmap = "1.9.3"
ryu-js = "0.2.2"
chrono = { version = "0.4.24", default-features = false, features = ["clock", "std"] }
fast-float = "0.2.0"
unicode-normalization = "0.1.22"
once_cell = "1.17.1"
tap = "1.0.1"
sptr = "0.3.2"
Expand All @@ -77,10 +79,10 @@ num_enum = "0.6.1"
pollster = "0.3.0"
thin-vec = "0.2.12"
itertools = { version = "0.10.5", default-features = false }
icu_normalizer = "1.2.0"

# intl deps
boa_icu_provider = { workspace = true, optional = true }
icu_locid_transform = { version = "1.2.1", features = ["serde"], optional = true }
icu_locid_transform = { version = "1.2.1", features = ["std", "serde"], optional = true }
icu_locid = { version = "1.2.0", features = ["serde"], optional = true }
icu_datetime = { version = "1.2.0", features = ["serde", "experimental"], optional = true }
icu_calendar = { version = "1.2.0", optional = true }
Expand Down
122 changes: 62 additions & 60 deletions boa_engine/src/builtins/string/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,23 @@ use crate::{
Context, JsArgs, JsResult, JsString, JsValue,
};
use boa_profiler::Profiler;
use icu_normalizer::{ComposingNormalizer, DecomposingNormalizer};
use std::cmp::{max, min};

use super::{BuiltInBuilder, BuiltInConstructor, IntrinsicObject};

mod string_iterator;
pub(crate) use string_iterator::StringIterator;

/// The set of normalizers required for the `String.prototype.normalize` function.
///
/// Holds one normalizer for each normalization form that `normalize` accepts
/// (`NFC`, `NFKC`, `NFD` and `NFKD`), so the caller can pick the field matching
/// the requested form.
#[derive(Debug)]
pub(crate) struct StringNormalizers {
/// Composing normalizer used for the `NFC` form.
pub(crate) nfc: ComposingNormalizer,
/// Composing normalizer used for the `NFKC` form.
pub(crate) nfkc: ComposingNormalizer,
/// Decomposing normalizer used for the `NFD` form.
pub(crate) nfd: DecomposingNormalizer,
/// Decomposing normalizer used for the `NFKD` form.
pub(crate) nfkd: DecomposingNormalizer,
}

#[cfg(test)]
mod tests;

Expand Down Expand Up @@ -2024,7 +2034,6 @@ impl String {
args: &[JsValue],
context: &mut Context<'_>,
) -> JsResult<JsValue> {
use unicode_normalization::UnicodeNormalization;
/// Represents the type of normalization applied to a [`JsString`]
#[derive(Clone, Copy)]
pub(crate) enum Normalization {
Expand All @@ -2033,79 +2042,72 @@ impl String {
Nfkc,
Nfkd,
}

// 1. Let O be ? RequireObjectCoercible(this value).
let this = this.require_object_coercible()?;

// 2. Let S be ? ToString(O).
let s = this.to_string(context)?;

let f = match args.get_or_undefined(0) {
// 3. If form is undefined, let f be "NFC".
&JsValue::Undefined => js_string!("NFC"),
// 4. Else, let f be ? ToString(form).
form => form.to_string(context)?,
};

// 6. Let ns be the String value that is the result of normalizing S
// into the normalization form named by f as specified in
// https://unicode.org/reports/tr15/.
let normalization = match f {
ntype if &ntype == utf16!("NFC") => Normalization::Nfc,
ntype if &ntype == utf16!("NFD") => Normalization::Nfd,
ntype if &ntype == utf16!("NFKC") => Normalization::Nfkc,
ntype if &ntype == utf16!("NFKD") => Normalization::Nfkd,
// 5. If f is not one of "NFC", "NFD", "NFKC", or "NFKD", throw a RangeError exception.
_ => {
return Err(JsNativeError::range()
.with_message("The normalization form should be one of NFC, NFD, NFKC, NFKD.")
.into());
}
};

let mut code_points = s.code_points();
let mut result = Vec::with_capacity(s.len());

let mut next_unpaired_surrogate = None;
let mut buf = [0; 2];

loop {
let only_chars = code_points.by_ref().map_while(|cpoint| match cpoint {
CodePoint::Unicode(c) => Some(c),
CodePoint::UnpairedSurrogate(s) => {
next_unpaired_surrogate = Some(s);
None
let normalization = match args.get_or_undefined(0) {
// 3. If form is undefined, let f be "NFC".
&JsValue::Undefined => Normalization::Nfc,
// 4. Else, let f be ? ToString(form).
f => match f.to_string(context)? {
ntype if &ntype == utf16!("NFC") => Normalization::Nfc,
ntype if &ntype == utf16!("NFD") => Normalization::Nfd,
ntype if &ntype == utf16!("NFKC") => Normalization::Nfkc,
ntype if &ntype == utf16!("NFKD") => Normalization::Nfkd,
// 5. If f is not one of "NFC", "NFD", "NFKC", or "NFKD", throw a RangeError exception.
_ => {
return Err(JsNativeError::range()
.with_message(
"The normalization form should be one of NFC, NFD, NFKC, NFKD.",
)
.into());
}
});
},
};

match normalization {
Normalization::Nfc => {
for mapped in only_chars.nfc() {
result.extend_from_slice(mapped.encode_utf16(&mut buf));
}
}
Normalization::Nfd => {
for mapped in only_chars.nfd() {
result.extend_from_slice(mapped.encode_utf16(&mut buf));
}
}
Normalization::Nfkc => {
for mapped in only_chars.nfkc() {
result.extend_from_slice(mapped.encode_utf16(&mut buf));
}
}
Normalization::Nfkd => {
for mapped in only_chars.nfkd() {
result.extend_from_slice(mapped.encode_utf16(&mut buf));
let normalizers = {
#[cfg(not(feature = "intl"))]
{
use once_cell::sync::Lazy;
static NORMALIZERS: Lazy<StringNormalizers> = Lazy::new(|| {
let provider = &boa_icu_provider::minimal();
let nfc = ComposingNormalizer::try_new_nfc_unstable(provider)
.expect("minimal data should always be updated");
let nfkc = ComposingNormalizer::try_new_nfkc_unstable(provider)
.expect("minimal data should always be updated");
let nfd = DecomposingNormalizer::try_new_nfd_unstable(provider)
.expect("minimal data should always be updated");
let nfkd = DecomposingNormalizer::try_new_nfkd_unstable(provider)
.expect("minimal data should always be updated");

StringNormalizers {
nfc,
nfkc,
nfd,
nfkd,
}
}
});
&*NORMALIZERS
}

if let Some(surr) = next_unpaired_surrogate.take() {
result.push(surr);
} else {
break;
#[cfg(feature = "intl")]
{
context.icu().string_normalizers()
}
}
};

let result = match normalization {
Normalization::Nfc => normalizers.nfc.normalize_utf16(&s),
Normalization::Nfd => normalizers.nfd.normalize_utf16(&s),
Normalization::Nfkc => normalizers.nfkc.normalize_utf16(&s),
Normalization::Nfkd => normalizers.nfkd.normalize_utf16(&s),
};

// 7. Return ns.
Ok(js_string!(result).into())
Expand Down

0 comments on commit 739bd5a

Please sign in to comment.