Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix Unicode character escape sequence parsing #959

Merged
merged 2 commits into from
Dec 11, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 0 additions & 4 deletions boa/src/syntax/lexer/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -111,14 +111,10 @@ impl<R> Tokenizer<R> for StringLiteral {
b'u' => {
// Support \u{X..X} (Unicode Codepoint)
if cursor.next_is(b'{')? {
cursor.next_byte()?.expect("{ character vanished"); // Consume the '{'.

// TODO: use bytes for a bit better performance (using stack)
let mut code_point_buf = Vec::with_capacity(6);
cursor.take_until(b'}', &mut code_point_buf)?;

cursor.next_byte()?.expect("} character vanished"); // Consume the '}'.

let code_point_str = unsafe {
str::from_utf8_unchecked(code_point_buf.as_slice())
};
Expand Down
47 changes: 39 additions & 8 deletions boa/src/syntax/lexer/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ fn check_positions() {

#[test]
fn check_positions_codepoint() {
let s = r#"console.log("hello world\u{{2764}}"); // Test"#;
let s = r#"console.log("hello world\u{2764}"); // Test"#;
// --------123456789
let mut lexer = Lexer::new(s.as_bytes());

Expand All @@ -281,19 +281,19 @@ fn check_positions_codepoint() {
// String token starts on column 13
assert_eq!(
lexer.next().unwrap().unwrap().span(),
span((1, 13), (1, 36))
span((1, 13), (1, 34))
);

// Close parenthesis token starts on column 36
// Close parenthesis token starts on column 34
assert_eq!(
lexer.next().unwrap().unwrap().span(),
span((1, 36), (1, 37))
span((1, 34), (1, 35))
);

// Semi Colon token starts on column 37
// Semi Colon token starts on column 35
assert_eq!(
lexer.next().unwrap().unwrap().span(),
span((1, 37), (1, 38))
span((1, 35), (1, 36))
);
}

Expand Down Expand Up @@ -702,10 +702,10 @@ fn codepoint_with_no_braces() {
fn illegal_code_point_following_numeric_literal() {
// Checks as per https://tc39.es/ecma262/#sec-literals-numeric-literals that a NumericLiteral cannot
// be immediately followed by an IdentifierStart, where the IdentifierStart is written here as a
// Unicode escape sequence.
let mut lexer = Lexer::new(&br#"17.4\u{{2764}}"#[..]);
let mut lexer = Lexer::new(&br#"17.4\u{2764}"#[..]);
assert!(
lexer.next().is_err(),
"IdentifierStart \\u{{2764}} following NumericLiteral not rejected as expected"
"IdentifierStart \\u{2764} following NumericLiteral not rejected as expected"
);
}

Expand All @@ -723,6 +723,37 @@ fn non_english_str() {
expect_tokens(&mut lexer, &expected);
}

#[test]
fn unicode_escape_with_braces() {
    /// Lexes `src` and asserts that the first call to `next` fails with a
    /// syntax error whose reported position is line 1, column 1.
    fn assert_syntax_error_at_start(src: &[u8]) {
        let mut lexer = Lexer::new(src);
        let err = lexer
            .next()
            .expect_err("Malformed Unicode character sequence expected");

        if let Error::Syntax(_, pos) = err {
            assert_eq!(pos, Position::new(1, 1));
        } else {
            panic!("invalid error type");
        }
    }

    // A string literal whose contents are wrapped in literal braces and contain
    // three `\u{...}` escapes must lex to a single string token with the
    // escapes decoded and the surrounding braces kept.
    let well_formed = &br#"'{\u{20ac}\u{a0}\u{a0}}'"#[..];
    let mut lexer = Lexer::new(well_formed);
    let expected = [TokenKind::StringLiteral("{\u{20ac}\u{a0}\u{a0}}".into())];
    expect_tokens(&mut lexer, &expected);

    // A doubled opening brace makes the escape malformed, whether the closing
    // brace appears once or twice.
    assert_syntax_error_at_start(&br#"\u{{a0}"#[..]);
    assert_syntax_error_at_start(&br#"\u{{a0}}"#[..]);
}

mod carriage_return {
use super::*;

Expand Down