diff --git a/Cargo.lock b/Cargo.lock index d602dfe0c..e6d408959 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1536,21 +1536,6 @@ dependencies = [ "once_cell", ] -[[package]] -name = "tinyvec" -version = "1.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" -dependencies = [ - "tinyvec_macros", -] - -[[package]] -name = "tinyvec_macros" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" - [[package]] name = "tokio" version = "1.21.2" @@ -1660,15 +1645,6 @@ version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ceab39d59e4c9499d4e5a8ee0e2735b891bb7308ac83dfb4e80cad195c9f6f3" -[[package]] -name = "unicode-normalization" -version = "0.1.22" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921" -dependencies = [ - "tinyvec", -] - [[package]] name = "unicode-segmentation" version = "1.10.0" @@ -2448,7 +2424,6 @@ dependencies = [ "rayon", "serde", "serde_json", - "unicode-normalization", "unicode-xid", ] diff --git a/crates/wit-parser/Cargo.toml b/crates/wit-parser/Cargo.toml index 2b5965c6b..7b48683f7 100644 --- a/crates/wit-parser/Cargo.toml +++ b/crates/wit-parser/Cargo.toml @@ -9,7 +9,6 @@ id-arena = "2" anyhow = { workspace = true } pulldown-cmark = { workspace = true } unicode-xid = "0.2.2" -unicode-normalization = "0.1.19" [dev-dependencies] rayon = "1" diff --git a/crates/wit-parser/src/ast/lex.rs b/crates/wit-parser/src/ast/lex.rs index fe20f666d..b5a5726f7 100644 --- a/crates/wit-parser/src/ast/lex.rs +++ b/crates/wit-parser/src/ast/lex.rs @@ -3,7 +3,6 @@ use std::char; use std::convert::TryFrom; use std::fmt; use std::str; -use unicode_normalization::char::canonical_combining_class; use unicode_xid::UnicodeXID; use self::Token::*; @@ -90,7 +89,6 @@ pub enum Token { pub enum Error { InvalidCharInString(usize, char), InvalidCharInId(usize, char), - IdNotSSNFC(usize), IdPartEmpty(usize), InvalidEscape(usize, char), // InvalidHexEscape(usize, char), @@ -472,11 +470,6 @@ fn is_keylike_continue(ch: char) -> bool { } pub fn validate_id(start: usize, id: &str) -> Result<(), Error> { - // Ids must be in stream-safe NFC. - if !unicode_normalization::is_nfc_stream_safe(&id) { - return Err(Error::IdNotSSNFC(start)); - } - // IDs must have at least one part. if id.is_empty() { return Err(Error::IdPartEmpty(start)); @@ -484,38 +477,32 @@ pub fn validate_id(start: usize, id: &str) -> Result<(), Error> { // Ids consist of parts separated by '-'s. for part in id.split("-") { - // Parts must be non-empty and start with a non-combining XID start. - match part.chars().next() { + // Parts must be non-empty and contain either all ASCII lowercase or + // all ASCII uppercase. + let upper = match part.chars().next() { None => return Err(Error::IdPartEmpty(start)), Some(first) => { - // Require the first character of each part to be non-combining, - // so that if a source langauge uses `CamelCase`, they won't - // combine with the last character of the previous part. - if canonical_combining_class(first) != 0 { - return Err(Error::InvalidCharInId(start, first)); - } - - // Require the first character to be a XID start. - if !UnicodeXID::is_xid_start(first) { + if first.is_ascii_lowercase() { + false + } else if first.is_ascii_uppercase() { + true + } else { return Err(Error::InvalidCharInId(start, first)); } - - // TODO: Disallow values with 'Grapheme_Extend = Yes', to - // prevent them from combining with previous parts? - - // TODO: Disallow values with 'Grapheme_Cluster_Break = SpacingMark'? } }; - // Some XID values are not valid ID part values. for ch in part.chars() { - // Disallow uppercase and underscore, so that identifiers - // consistently use `kebab-case`, and source languages can map - // identifiers according to their own conventions (which might use - // `CamelCase` or `snake_case` or something else) without worrying - // about collisions. - if ch.is_uppercase() || ch == '_' || !UnicodeXID::is_xid_continue(ch) { - return Err(Error::InvalidCharInId(start, ch)); + if ch.is_ascii_digit() { + // Digits are accepted in both uppercase and lowercase segments. + } else if upper { + if !ch.is_ascii_uppercase() { + return Err(Error::InvalidCharInId(start, ch)); + } + } else { + if !ch.is_ascii_lowercase() { + return Err(Error::InvalidCharInId(start, ch)); + } } } } @@ -595,7 +582,6 @@ impl fmt::Display for Error { Error::InvalidCharInString(_, ch) => write!(f, "invalid character in string {:?}", ch), Error::InvalidCharInId(_, ch) => write!(f, "invalid character in identifier {:?}", ch), Error::IdPartEmpty(_) => write!(f, "identifiers must have characters between '-'s"), - Error::IdNotSSNFC(_) => write!(f, "identifiers must be in stream-safe NFC"), Error::InvalidEscape(_, ch) => write!(f, "invalid escape in string {:?}", ch), } } @@ -614,7 +600,6 @@ pub fn rewrite_error(err: &mut anyhow::Error, file: &str, contents: &str) { | Error::NewlineInString(at) | Error::InvalidCharInString(at, _) | Error::InvalidCharInId(at, _) - | Error::IdNotSSNFC(at) | Error::IdPartEmpty(at) | Error::InvalidEscape(at, _) => *at, }; @@ -627,17 +612,17 @@ fn test_validate_id() { validate_id(0, "apple").unwrap(); validate_id(0, "apple-pear").unwrap(); validate_id(0, "apple-pear-grape").unwrap(); - validate_id(0, "garçon").unwrap(); - validate_id(0, "hühnervögel").unwrap(); - validate_id(0, "москва").unwrap(); - validate_id(0, "東京").unwrap(); - validate_id(0, "東-京").unwrap(); - validate_id(0, "garçon-hühnervögel-москва-東京").unwrap(); - validate_id(0, "garçon-hühnervögel-москва-東-京").unwrap(); validate_id(0, "a0").unwrap(); validate_id(0, "a").unwrap(); validate_id(0, "a-a").unwrap(); validate_id(0, "bool").unwrap(); + validate_id(0, "APPLE").unwrap(); + validate_id(0, "APPLE-PEAR").unwrap(); + validate_id(0, "APPLE-PEAR-GRAPE").unwrap(); + validate_id(0, "apple-PEAR-grape").unwrap(); + validate_id(0, "APPLE-pear-GRAPE").unwrap(); + validate_id(0, "ENOENT").unwrap(); + validate_id(0, "is-XML").unwrap(); assert!(validate_id(0, "").is_err()); assert!(validate_id(0, "0").is_err()); @@ -652,7 +637,6 @@ fn test_validate_id() { assert!(validate_id(0, "a-").is_err()); assert!(validate_id(0, "-a").is_err()); assert!(validate_id(0, "Apple").is_err()); - assert!(validate_id(0, "APPLE").is_err()); assert!(validate_id(0, "applE").is_err()); assert!(validate_id(0, "-apple-pear").is_err()); assert!(validate_id(0, "apple-pear-").is_err()); @@ -675,11 +659,10 @@ fn test_validate_id() { assert!(validate_id(0, "_Znwj").is_err()); assert!(validate_id(0, "__i386").is_err()); assert!(validate_id(0, "__i386__").is_err()); - assert!(validate_id(0, "ENOENT").is_err()); assert!(validate_id(0, "Москва").is_err()); assert!(validate_id(0, "garçon-hühnervögel-Москва-東京").is_err()); assert!(validate_id(0, "😼").is_err(), "non-identifier"); - assert!(validate_id(0, "\u{212b}").is_err(), "not NFC"); + assert!(validate_id(0, "\u{212b}").is_err(), "non-ascii"); } #[test] @@ -716,6 +699,13 @@ fn test_tokenizer() { assert_eq!(collect("%a-a").unwrap(), vec![Token::ExplicitId]); assert_eq!(collect("%bool").unwrap(), vec![Token::ExplicitId]); assert_eq!(collect("%").unwrap(), vec![Token::ExplicitId]); + assert_eq!(collect("APPLE").unwrap(), vec![Token::Id]); + assert_eq!(collect("APPLE-PEAR").unwrap(), vec![Token::Id]); + assert_eq!(collect("APPLE-PEAR-GRAPE").unwrap(), vec![Token::Id]); + assert_eq!(collect("apple-PEAR-grape").unwrap(), vec![Token::Id]); + assert_eq!(collect("APPLE-pear-GRAPE").unwrap(), vec![Token::Id]); + assert_eq!(collect("ENOENT").unwrap(), vec![Token::Id]); + assert_eq!(collect("is-XML").unwrap(), vec![Token::Id]); assert_eq!(collect("func").unwrap(), vec![Token::Func]); assert_eq!( diff --git a/tests/codegen/conventions.wit b/tests/codegen/conventions.wit index 014b712a5..a7c165700 100644 --- a/tests/codegen/conventions.wit +++ b/tests/codegen/conventions.wit @@ -14,13 +14,16 @@ foo: func(x: ludicrous-speed) apple: func() apple-pear: func() apple-pear-grape: func() -garçon: func() -hühnervögel: func() -москва: func() -東-京: func() -garçon-hühnervögel-москва-東-京: func() a0: func() +// Comment out identifiers that collide when mapped to snake_case, for now; see +// https://github.com/WebAssembly/component-model/issues/118 +//APPLE: func() +//APPLE-pear-GRAPE: func() +//apple-PEAR-grape: func() + +is-XML: func() + %explicit: func() %explicit-kebab: func()