diff --git a/src/lazy/binary/raw/value.rs b/src/lazy/binary/raw/value.rs index 6dc5081d..245f4706 100644 --- a/src/lazy/binary/raw/value.rs +++ b/src/lazy/binary/raw/value.rs @@ -413,7 +413,7 @@ impl<'data> LazyRawBinaryValue<'data> { fn read_clob(&self) -> ValueParseResult<'data, BinaryEncoding> { debug_assert!(self.encoded_value.ion_type() == IonType::Clob); let bytes = self.value_body()?; - Ok(RawValueRef::Clob(bytes)) + Ok(RawValueRef::Clob(bytes.into())) } /// Helper method called by [`Self::read`]. Reads the current value as an S-expression. diff --git a/src/lazy/raw_value_ref.rs b/src/lazy/raw_value_ref.rs index 4c63bf33..43819740 100644 --- a/src/lazy/raw_value_ref.rs +++ b/src/lazy/raw_value_ref.rs @@ -20,7 +20,7 @@ pub enum RawValueRef<'data, D: LazyDecoder<'data>> { String(StrRef<'data>), Symbol(RawSymbolTokenRef<'data>), Blob(BytesRef<'data>), - Clob(&'data [u8]), + Clob(BytesRef<'data>), SExp(D::SExp), List(D::List), Struct(D::Struct), @@ -149,7 +149,7 @@ impl<'data, D: LazyDecoder<'data>> RawValueRef<'data, D> { } } - pub fn expect_clob(self) -> IonResult<&'data [u8]> { + pub fn expect_clob(self) -> IonResult> { if let RawValueRef::Clob(c) = self { Ok(c) } else { diff --git a/src/lazy/str_ref.rs b/src/lazy/str_ref.rs index 17161e0d..850d0556 100644 --- a/src/lazy/str_ref.rs +++ b/src/lazy/str_ref.rs @@ -1,3 +1,4 @@ +use crate::lazy::bytes_ref::BytesRef; use crate::text::text_formatter::IonValueFormatter; use crate::Str; use std::borrow::Cow; @@ -80,3 +81,12 @@ impl<'data> From> for Str { Str::from(text) } } + +impl<'data> From> for BytesRef<'data> { + fn from(value: StrRef<'data>) -> Self { + match value.text { + Cow::Borrowed(text) => text.as_bytes().into(), + Cow::Owned(text) => Vec::from(text).into(), + } + } +} diff --git a/src/lazy/text/buffer.rs b/src/lazy/text/buffer.rs index 96f5665d..00306eef 100644 --- a/src/lazy/text/buffer.rs +++ b/src/lazy/text/buffer.rs @@ -17,8 +17,8 @@ use crate::lazy::encoding::TextEncoding; use crate::lazy::raw_stream_item::RawStreamItem; use crate::lazy::text::encoded_value::EncodedTextValue; use crate::lazy::text::matched::{ - MatchedBlob, MatchedDecimal, MatchedFloat, MatchedHoursAndMinutes, MatchedInt, MatchedString, - MatchedSymbol, MatchedTimestamp, MatchedTimestampOffset, MatchedValue, + MatchedBlob, MatchedClob, MatchedDecimal, MatchedFloat, MatchedHoursAndMinutes, MatchedInt, + MatchedString, MatchedSymbol, MatchedTimestamp, MatchedTimestampOffset, MatchedValue, }; use crate::lazy::text::parse_result::{InvalidInputError, IonParseError}; use crate::lazy::text::parse_result::{IonMatchResult, IonParseResult}; @@ -508,6 +508,12 @@ impl<'data> TextBufferView<'data> { EncodedTextValue::new(MatchedValue::Blob(matched_blob), self.offset(), length) }, ), + map( + match_and_length(Self::match_clob), + |(matched_clob, length)| { + EncodedTextValue::new(MatchedValue::Clob(matched_clob), self.offset(), length) + }, + ), map( match_and_length(Self::match_list), |(matched_list, length)| { @@ -983,12 +989,12 @@ impl<'data> TextBufferView<'data> { } /// Matches short- or long-form string. - fn match_string(self) -> IonParseResult<'data, MatchedString> { + pub fn match_string(self) -> IonParseResult<'data, MatchedString> { alt((Self::match_short_string, Self::match_long_string))(self) } /// Matches a short string. For example: `"foo"` - fn match_short_string(self) -> IonParseResult<'data, MatchedString> { + pub(crate) fn match_short_string(self) -> IonParseResult<'data, MatchedString> { delimited(char('"'), Self::match_short_string_body, char('"')) .map(|(_matched, contains_escaped_chars)| { if contains_escaped_chars { @@ -1002,13 +1008,13 @@ impl<'data> TextBufferView<'data> { /// Returns a matched buffer and a boolean indicating whether any escaped characters were /// found in the short string. - fn match_short_string_body(self) -> IonParseResult<'data, (Self, bool)> { + pub(crate) fn match_short_string_body(self) -> IonParseResult<'data, (Self, bool)> { Self::match_text_until_unescaped(self, b'\"') } /// Matches a long string comprised of any number of `'''`-enclosed segments interleaved /// with optional comments and whitespace. - pub fn match_long_string(self) -> IonParseResult<'data, MatchedString> { + pub(crate) fn match_long_string(self) -> IonParseResult<'data, MatchedString> { fold_many1( // Parser to keep applying repeatedly whitespace_and_then(Self::match_long_string_segment), @@ -1169,6 +1175,13 @@ impl<'data> TextBufferView<'data> { contains_escaped_chars = true; continue; } + if *byte == b'\r' { + // If the text contains an unescaped carriage return, we may need to normalize it. + // In some narrow cases, setting this flag to true may result in a sanitization buffer + // being allocated when it isn't strictly necessary. + contains_escaped_chars = true; + continue; + } if *byte == delimiter { let matched = self.slice(0, index); let remaining = self.slice_to_end(index); @@ -1435,6 +1448,66 @@ impl<'data> TextBufferView<'data> { .parse(self) } + /// Matches a clob of either short- or long-form syntax. + pub fn match_clob(self) -> IonParseResult<'data, MatchedClob> { + delimited( + tag("{{"), + preceded( + Self::match_optional_whitespace, + alt(( + value(MatchedClob::Short, Self::match_short_clob_body), + value(MatchedClob::Long, Self::match_long_clob_body), + )), + ), + preceded(Self::match_optional_whitespace, tag("}}")), + )(self) + } + + /// Matches the body (inside the `{{` and `}}`) of a short-form clob. + fn match_short_clob_body(self) -> IonMatchResult<'data> { + let (remaining, (body, _matched_string)) = consumed(Self::match_short_string)(self)?; + body.validate_clob_text()?; + Ok((remaining, body)) + } + + /// Matches the body (inside the `{{` and `}}`) of a long-form clob. + fn match_long_clob_body(self) -> IonMatchResult<'data> { + recognize(many1_count(preceded( + Self::match_optional_whitespace, + Self::match_long_clob_body_segment, + )))(self) + } + + /// Matches a single segment of a long-form clob's content. + fn match_long_clob_body_segment(self) -> IonMatchResult<'data> { + let (remaining, (body, _matched_string)) = consumed(Self::match_long_string_segment)(self)?; + body.validate_clob_text()?; + Ok((remaining, body)) + } + + /// Returns an error if the buffer contains any byte that is not legal inside a clob. + fn validate_clob_text(self) -> IonMatchResult<'data> { + for byte in self.bytes().iter().copied() { + if !Self::byte_is_legal_clob_ascii(byte) { + let message = format!("found an illegal byte '{:0x}'in clob", byte); + let error = InvalidInputError::new(self).with_description(message); + return Err(nom::Err::Failure(IonParseError::Invalid(error))); + } + } + // Return success without consuming + Ok((self, self.slice(0, 0))) + } + + /// Returns `false` if the specified byte cannot appear unescaped in a clob. + fn byte_is_legal_clob_ascii(b: u8) -> bool { + // Depending on where you look in the spec and/or `ion-tests`, you'll find conflicting + // information about which ASCII characters can appear unescaped in a clob. Some say + // "characters >= 0x20", but that excludes lots of whitespace characters that are < 0x20. + // Some say "displayable ASCII", but DEL (0x7F) is shown to be legal in one of the ion-tests. + // The definition used here has largely been inferred from the contents of `ion-tests`. + b.is_ascii() + && (u32::from(b) >= 0x20 || WHITESPACE_CHARACTERS_AS_STR.as_bytes().contains(&b)) + } /// Matches the base64 content within a blob. Ion allows the base64 content to be broken up with /// whitespace, so the matched input region may need to be stripped of whitespace before /// the data can be decoded. @@ -2189,6 +2262,48 @@ mod tests { } #[test] + fn test_match_clob() { + fn match_clob(input: &str) { + MatchTest::new(input).expect_match(match_length(TextBufferView::match_clob)); + } + fn mismatch_blob(input: &str) { + MatchTest::new(input).expect_mismatch(match_length(TextBufferView::match_clob)); + } + // Base64 encodings of utf-8 strings + let good_inputs = &[ + r#"{{""}}"#, + r#"{{''''''}}"#, + r#"{{"foo"}}"#, + r#"{{ "foo"}}"#, + r#"{{ "foo" }}"#, + r#"{{"foo" }}"#, + r#"{{'''foo'''}}"#, + r#"{{"foobar"}}"#, + r#"{{'''foo''' '''bar'''}}"#, + r#"{{ + '''foo''' + '''bar''' + '''baz''' + }}"#, + ]; + for input in good_inputs { + match_clob(input); + } + + let bad_inputs = &[ + r#"{{foo}}"#, // No quotes + r#"{{"foo}}"#, // Missing closing quote + r#"{{"foo"}"#, // Missing closing brace + r#"{{'''foo'''}"#, // Missing closing brace + r#"{{'''foo''' /*hi!*/ '''bar'''}}"#, // Interleaved comments + r#"{{'''foo''' "bar"}}"#, // Mixed quote style + r#"{{"😎🙂🙃"}}"#, // Contains unescaped non-ascii characters + ]; + for input in bad_inputs { + mismatch_blob(input); + } + } + fn test_match_text_until_unescaped_str() { let input = TextBufferView::new(r" foo bar \''' baz''' quux ".as_bytes()); let (_remaining, (matched, contains_escapes)) = diff --git a/src/lazy/text/encoded_value.rs b/src/lazy/text/encoded_value.rs index 1423e743..970c0aa7 100644 --- a/src/lazy/text/encoded_value.rs +++ b/src/lazy/text/encoded_value.rs @@ -131,6 +131,7 @@ impl EncodedTextValue { MatchedValue::String(_) => IonType::String, MatchedValue::Symbol(_) => IonType::Symbol, MatchedValue::Blob(_) => IonType::Blob, + MatchedValue::Clob(_) => IonType::Clob, MatchedValue::List => IonType::List, MatchedValue::SExp => IonType::SExp, MatchedValue::Struct => IonType::Struct, diff --git a/src/lazy/text/matched.rs b/src/lazy/text/matched.rs index 42c78a97..abd7c233 100644 --- a/src/lazy/text/matched.rs +++ b/src/lazy/text/matched.rs @@ -55,10 +55,10 @@ pub(crate) enum MatchedValue { String(MatchedString), Symbol(MatchedSymbol), Blob(MatchedBlob), + Clob(MatchedClob), List, SExp, Struct, - // TODO: ...the other types } /// A partially parsed Ion int. @@ -338,7 +338,14 @@ impl MatchedString { let body = matched_input.slice(3, matched_input.len() - 6); // There are no escaped characters, so we can just validate the string in-place. let mut sanitized = Vec::with_capacity(matched_input.len()); - escape_text(body, &mut sanitized)?; + decode_text_containing_escapes( + body, + &mut sanitized, + // Normalize newlines + true, + // Support unicode escapes + true, + )?; let text = String::from_utf8(sanitized).unwrap(); Ok(StrRef::from(text.to_string())) } @@ -363,7 +370,14 @@ impl MatchedString { )(remaining) { remaining = remaining_after_match; - escape_text(segment_body, &mut sanitized)?; + decode_text_containing_escapes( + segment_body, + &mut sanitized, + // Normalize newlines + true, + // Support unicode escapes + true, + )?; } let text = String::from_utf8(sanitized).unwrap(); Ok(StrRef::from(text)) @@ -390,36 +404,93 @@ impl MatchedString { // There are escaped characters. We need to build a new version of our string // that replaces the escaped characters with their corresponding bytes. let mut sanitized = Vec::with_capacity(matched_input.len()); - escape_text(body, &mut sanitized)?; + decode_text_containing_escapes( + body, + &mut sanitized, + // Do not normalize newlines + false, + // Support Unicode escapes + true, + )?; let text = String::from_utf8(sanitized).unwrap(); Ok(StrRef::from(text.to_string())) } } -fn escape_text(matched_input: TextBufferView, sanitized: &mut Vec) -> IonResult<()> { +fn decode_text_containing_escapes( + matched_input: TextBufferView, + sanitized: &mut Vec, + // If the text being escaped is in a long string or a clob, then unescaped \r\n and \r get + // normalized to \n. + normalize_newlines: bool, + // Clobs use string-y syntax, but do not support `\u` or `\U` Unicode escapes because the + // data they contain may not be Unicode. + support_unicode_escapes: bool, +) -> IonResult<()> { let mut remaining = matched_input; + + // For ways to optimize this in the future, look at the `memchr` crate. + let match_byte = |byte: &u8| *byte == b'\\' || *byte == b'\r'; + while !remaining.is_empty() { - let next_escape = remaining.bytes().iter().position(|byte| *byte == b'\\'); - remaining = if let Some(escape_offset) = next_escape { - // Everything up to the '\' is already clean. Write that slice to 'sanitized'. - let already_clean = remaining.slice(0, escape_offset); - sanitized.extend_from_slice(already_clean.bytes()); - // Everything starting from the '\' needs to be evaluated. - let contains_escapes = remaining.slice_to_end(escape_offset); - write_escaped(contains_escapes, sanitized)? - } else { - sanitized.extend_from_slice(remaining.bytes()); - // 'remaining' is now empty - remaining.slice_to_end(remaining.len()) - }; + let next_index_to_inspect = remaining.bytes().iter().position(match_byte); + remaining = match next_index_to_inspect { + // It's an unescaped carriage return: 0x0A. + Some(carriage_return_offset) + if remaining.bytes().get(carriage_return_offset) == Some(&b'\r') => + { + // Add all of the data up to the \r is clean. Add that slice to `sanitized`. + sanitized.extend_from_slice(remaining.slice(0, carriage_return_offset).bytes()); + if normalize_newlines { + normalize_newline(remaining, sanitized, carriage_return_offset) + } else { + // Add it to the sanitized data as-is + sanitized.push(b'\r'); + remaining.slice_to_end(carriage_return_offset + 1) + } + } + // It's an escape character: `\` + Some(escape_offset) => { + // Add all of the data up to the `\` is clean. Add that slice to `sanitized`. + sanitized.extend_from_slice(remaining.slice(0, escape_offset).bytes()); + // Everything starting from the '\' needs to be evaluated. + let contains_escapes = remaining.slice_to_end(escape_offset); + decode_escape_into_bytes(contains_escapes, sanitized, support_unicode_escapes)? + } + None => { + sanitized.extend_from_slice(remaining.bytes()); + // 'remaining' is now empty. Return an empty slice. + remaining.slice_to_end(remaining.len()) + } + } } - Ok(()) } -fn write_escaped<'data>( +// This code is only called when a \r is encountered in either a clob or long-form string +#[cold] +fn normalize_newline<'data>( + remaining: TextBufferView<'data>, + sanitized: &mut Vec, + escape_offset: usize, +) -> TextBufferView<'data> { + // Insert the normalized newline + sanitized.push(b'\n'); + // Check whether we're skipping one byte (\r) or two (\r\n). + if remaining.bytes().get(escape_offset + 1).copied() == Some(b'\n') { + // The next byte is an unescaped newline; we normalize \r\n to \n, so we need + // to skip an extra byte. + remaining.slice_to_end(escape_offset + 2) + } else { + // We only processed a single byte: `\r` + remaining.slice_to_end(escape_offset + 1) + } +} + +fn decode_escape_into_bytes<'data>( input: TextBufferView<'data>, sanitized: &mut Vec, + support_unicode_escapes: bool, ) -> IonResult> { // Note that by the time this method has been called, the parser has already confirmed that // there is an appropriate closing delimiter. Thus, if any of the branches below run out of @@ -451,9 +522,23 @@ fn write_escaped<'data>( // If the byte following the '\' is a real newline (that is: 0x0A), we discard it. b'\n' => return Ok(input_after_escape), // These cases require more sophisticated parsing, not just a 1-to-1 mapping of bytes - b'x' => return hex_digits_code_point(2, input_after_escape, sanitized), - b'u' => return hex_digits_code_point(4, input_after_escape, sanitized), - b'U' => return hex_digits_code_point(8, input_after_escape, sanitized), + b'x' => return decode_hex_digits_escape(2, input_after_escape, sanitized), + // Clobs represent text of some encoding, but it may or may not be a flavor of Unicode. + // As such, clob syntax does not support Unicode escape sequences like `\u` or `\U`. + b'u' if support_unicode_escapes => { + return decode_hex_digits_escape(4, input_after_escape, sanitized) + } + b'U' if support_unicode_escapes => { + return decode_hex_digits_escape(8, input_after_escape, sanitized) + } + b'u' | b'U' => { + return Err(IonError::Decoding( + DecodingError::new( + "Unicode escape sequences (\\u, \\U) are not legal in this context", + ) + .with_position(input.offset()), + )) + } _ => { return Err(IonError::Decoding( DecodingError::new(format!("invalid escape sequence '\\{}", escape_id)) @@ -468,7 +553,7 @@ fn write_escaped<'data>( /// Reads the next `num_digits` bytes from `input` as a `char`, then writes that `char`'s UTF8 bytes /// to `sanitized`. -fn hex_digits_code_point<'data>( +fn decode_hex_digits_escape<'data>( num_digits: usize, input: TextBufferView<'data>, sanitized: &mut Vec, @@ -500,8 +585,23 @@ fn hex_digits_code_point<'data>( .with_position(input.offset()), )); } + // Isolate the portion of the input that follows the hex digits so we can return it. + let remaining_input = input.slice_to_end(num_digits); + // We just confirmed all of the digits are ASCII hex digits, so these steps cannot fail. + // We can unwrap() in each case. let hex_digits = std::str::from_utf8(hex_digit_bytes).unwrap(); + // If this was a '\x' escape, we cannot interpret the hex digits as a Unicode scalar. We treat + // it as a byte literal instead. + if num_digits == 2 { + let byte = u8::from_str_radix(hex_digits, 16).unwrap(); + sanitized.push(byte); + return Ok(remaining_input); + } + + // From here on, we know that the escape was either `\u` or `\U`--a Unicode scalar. + // Note that this means we are not processing a clob (which doesn't support Unicode) and can + // further infer that we are working with UTF-8, the only supported encoding for strings/symbols. let code_point = u32::from_str_radix(hex_digits, 16).unwrap(); // Check to see if this is a high surrogate; if it is, our code point isn't complete. Another @@ -510,7 +610,16 @@ fn hex_digits_code_point<'data>( // 4- and 8-digit escape sequences. `\x` escapes don't have enough digits to represent a // high surrogate.) if code_point_is_a_high_surrogate(code_point) { - todo!("support surrogate pairs") + // The spec has MAY-style language around supporting high surrogates. Supporting them is + // allowed but discouraged. For the time being, we will return an error. Other implementations + // (notably ion-java) support high surrogates largely for resilience/debugging. We can consider + // adding that support if there is demand for it. + return Err(IonError::Decoding( + DecodingError::new( + "found a Unicode high surrogate; UTF-16 is not legal in Ion strings/symbols", + ) + .with_position(input.offset()), + )); } // A Rust `char` can represent any Unicode scalar value--a code point that is not part of a @@ -522,7 +631,7 @@ fn hex_digits_code_point<'data>( sanitized.extend_from_slice(utf8_encoded.as_bytes()); // Skip beyond the digits we just processed - Ok(input.slice_to_end(num_digits)) + Ok(remaining_input) } /// Returns `true` if the provided code point is a utf-16 high surrogate. @@ -602,7 +711,7 @@ impl MatchedSymbol { // that replaces the escaped characters with their corresponding bytes. let mut sanitized = Vec::with_capacity(matched_input.len()); - escape_text(body, &mut sanitized)?; + decode_text_containing_escapes(body, &mut sanitized, false, true)?; let text = String::from_utf8(sanitized).unwrap(); Ok(RawSymbolTokenRef::Text(text.into())) } @@ -856,8 +965,92 @@ impl MatchedBlob { } } +#[derive(Clone, Copy, Debug, PartialEq)] +pub enum MatchedClob { + Short, + Long, +} + +impl MatchedClob { + pub(crate) fn read<'data>( + &self, + matched_input: TextBufferView<'data>, + ) -> IonResult> { + // `matched_input` contains the entire clob, including the opening {{ and closing }}. + // We can trim those off, but each function below will need to find the nested short- or + // long-form string content. + let matched_inside_braces = matched_input.slice(2, matched_input.len() - 4); + match self { + MatchedClob::Short => self.read_short_clob(matched_inside_braces), + MatchedClob::Long => self.read_long_clob(matched_inside_braces), + } + } + fn read_short_clob<'data>( + &self, + matched_inside_braces: TextBufferView<'data>, + ) -> IonResult> { + // There can be whitespace between the leading {{ and the `"`, so we need to scan ahead + // to the `"`. + let open_quote_position = matched_inside_braces + .bytes() + .iter() + .position(|b| *b == b'"') + .unwrap(); + // Get a slice that contains all of the bytes after the opening `"`. + let remaining = matched_inside_braces.slice_to_end(open_quote_position + 1); + // Use the existing short string body parser to identify all of the bytes up to the + // unescaped closing `"`. This parser succeeded once during matching, so we know it will + // succeed again here; it's safe to unwrap(). + let (_, (body, _has_escapes)) = remaining.match_short_string_body().unwrap(); + // There are escaped characters. We need to build a new version of our string + // that replaces the escaped characters with their corresponding bytes. + let mut sanitized = Vec::with_capacity(body.len()); + decode_text_containing_escapes( + body, + &mut sanitized, + // Do not normalize newlines + false, + // Unicode escapes are not supported + false, + )?; + Ok(BytesRef::from(sanitized)) + } + fn read_long_clob<'data>( + &self, + matched_inside_braces: TextBufferView<'data>, + ) -> IonResult> { + // We're going to re-parse the input to visit each segment, copying its sanitized bytes into + // a contiguous buffer. + + // Create a new buffer to hold the sanitized data. + let mut sanitized = Vec::with_capacity(matched_inside_braces.len()); + let mut remaining = matched_inside_braces; + + // Iterate over the string segments using the match_long_string_segment parser. + // This is the same parser that matched the input initially, which means that the only + // reason it wouldn't succeed here is if the input is empty, meaning we're done reading. + while let Ok((remaining_after_match, (segment_body, _has_escapes))) = preceded( + TextBufferView::match_optional_whitespace, + TextBufferView::match_long_string_segment, + )(remaining) + { + remaining = remaining_after_match; + decode_text_containing_escapes( + segment_body, + &mut sanitized, + // Normalize newlines + true, + // Unicode escapes are not supported + false, + )?; + } + Ok(BytesRef::from(sanitized)) + } +} + #[cfg(test)] mod tests { + use crate::lazy::bytes_ref::BytesRef; use crate::lazy::text::buffer::TextBufferView; use crate::{Decimal, IonResult, Timestamp}; @@ -1039,4 +1232,136 @@ mod tests { Ok(()) } + + #[test] + fn read_strings() -> IonResult<()> { + fn expect_string(data: &str, expected: &str) { + // Ordinarily the reader is responsible for indicating that the input is complete. + // For the sake of these tests, we're going to append one more value (`0`) to the input + // stream so the parser knows that the long-form strings are complete. We then trim + // our fabricated value off of the input before reading. + let data = format!("{data}\n0"); + let buffer = TextBufferView::new(data.as_bytes()); + let (_remaining, matched) = buffer.match_string().unwrap(); + let matched_input = buffer.slice(0, buffer.len() - 2); + let actual = matched.read(matched_input).unwrap(); + assert_eq!( + actual, expected, + "Actual didn't match expected for input '{}'.\n{:?}\n!=\n{:?}", + data, actual, expected + ); + } + + let tests = [ + (r#""hello""#, "hello"), + (r"'''hello'''", "hello"), + (r"'''he''' '''llo'''", "hello"), + (r"'''he''' '''llo'''", "hello"), + (r#""😎🙂🙃""#, "😎🙂🙃"), + (r"'''😎🙂''' '''🙃'''", "😎🙂🙃"), + // The below bytes are the UTF-8 encoding of Unicode code points: U+2764 U+FE0F + (r#""\xe2\x9d\xa4\xef\xb8\x8f""#, "❤️"), + (r"'''\xe2\x9d\xa4\xef\xb8\x8f'''", "❤️"), + (r"'''\u2764\uFE0F'''", "❤️"), + (r"'''\U00002764\U0000FE0F'''", "❤️"), + // In short strings, unescaped newlines are not normalized. + ("\"foo\rbar\r\nbaz\"", "foo\rbar\r\nbaz"), + // In long-form strings, unescaped newlines converted to `\n`. + ("'''foo\rbar\r\nbaz'''", "foo\nbar\nbaz"), + ]; + + for (input, expected) in tests { + expect_string(input, expected); + } + + Ok(()) + } + + #[test] + fn read_clobs() -> IonResult<()> { + fn read_clob(data: &str) -> IonResult { + let buffer = TextBufferView::new(data.as_bytes()); + // All `read_clob` usages should be accepted by the matcher, so we can `unwrap()` the + // call to `match_clob()`. + let (_remaining, matched) = buffer.match_clob().unwrap(); + // The resulting buffer slice may be rejected during reading. + matched.read(buffer) + } + + fn expect_clob_error(data: &str) { + let actual = read_clob(data); + assert!( + actual.is_err(), + "Successfully read a clob from illegal input." + ); + } + + fn expect_clob(data: &str, expected: &str) { + let actual = read_clob(data).unwrap(); + assert_eq!( + actual, + expected.as_ref(), + "Actual didn't match expected for input '{}'.\n{:?} ({})\n!=\n{:?} ({:0x?})", + data, + actual, + std::str::from_utf8(actual.data()).unwrap(), + expected, + expected.as_bytes() + ); + } + + // These tests compare a clob containing UTF-8 data to the expected string's bytes. + // This is just an example; clobs' data does not have to be (and often would not be) UTF-8. + let tests = [ + (r#"{{""}}"#, ""), + (r#"{{''''''}}"#, ""), + (r#"{{'''''' '''''' ''''''}}"#, ""), + (r#"{{"hello"}}"#, "hello"), + (r#"{{"\x4D"}}"#, "M"), + (r#"{{"\x4d \x4d \x4d"}}"#, "M M M"), + // The below bytes are the UTF-8 encoding of Unicode code points: U+2764 U+FE0F + (r#"{{"\xe2\x9d\xa4\xef\xb8\x8f"}}"#, "❤️"), + (r#"{{'''hel''' '''lo'''}}"#, "hello"), + ( + r"{{ + '''\xe2''' + '''\x9d\xa4''' + '''\xef\xb8\x8f''' + }} + ", + "❤️", + ), + // In a long-form clob, unescaped `\r` and `\r\n` are normalized into unescaped `\n` + ("{{'''foo\rbar\r\nbaz'''}}", "foo\nbar\nbaz"), + // In a short-form clob, newlines are not normalized. + ("{{\"foo\rbar\r\nbaz\"}}", "foo\rbar\r\nbaz"), + ]; + + for (input, expected) in tests { + expect_clob(input, expected); + } + + let illegal_inputs = [ + // Clobs represent text of some encoding, but it may or may not be a flavor of Unicode. + // As such, clob syntax does not support Unicode escape sequences like `\u` or `\U`. + // Byte literals may be written using `\x`, however. + r#"{{"\u004D" }}"#, + r#"{{"\U0000004D" }}"#, + // Escape sequence that terminates early + r#"{{"\x4"}}"#, + // Escape sequence split across long-form segments + r"{{ + '''\xe''' + '''2\x9d\xa''' + '''4\xef\xb8\x8f''' + }} + ", + ]; + + for input in illegal_inputs { + expect_clob_error(input); + } + + Ok(()) + } } diff --git a/src/lazy/text/raw/reader.rs b/src/lazy/text/raw/reader.rs index 55b1e4ea..5e6254e6 100644 --- a/src/lazy/text/raw/reader.rs +++ b/src/lazy/text/raw/reader.rs @@ -94,7 +94,7 @@ mod tests { // Ion version marker $ion_1_0 - + // Typed nulls null @@ -120,7 +120,7 @@ mod tests { 3.6e0 2.5e008 -318e-2 - + // Decimals 1.5 3.14159 @@ -152,7 +152,6 @@ mod tests { '''Venus ''' '''Earth ''' '''Mars ''' - "#, ); // Escaped newlines are discarded @@ -174,9 +173,17 @@ mod tests { $10 $733 - // Blob + // Blobs {{cmF6emxlIGRhenpsZSByb290IGJlZXI=}} + // Clobs + {{"foobarbaz"}} + {{ + '''foo''' + '''bar''' + '''baz''' + }} + // List [ // First item @@ -369,6 +376,11 @@ mod tests { // {{cmF6emxlIGRhenpsZSByb290IGJlZXI=}} expect_next(reader, RawValueRef::Blob("razzle dazzle root beer".into())); + // {{"foobarbaz"}} + expect_next(reader, RawValueRef::Clob("foobarbaz".into())); + // {{'''foo''' '''bar''' '''baz'''}} + expect_next(reader, RawValueRef::Clob("foobarbaz".into())); + // [1, 2, 3] let list = reader.next()?.expect_value()?.read()?.expect_list()?; let mut sum = 0; diff --git a/src/lazy/text/value.rs b/src/lazy/text/value.rs index b5bab953..1c71a2d7 100644 --- a/src/lazy/text/value.rs +++ b/src/lazy/text/value.rs @@ -69,6 +69,7 @@ impl<'data> LazyRawValue<'data, TextEncoding> for LazyRawTextValue<'data> { MatchedValue::String(s) => RawValueRef::String(s.read(matched_input)?), MatchedValue::Symbol(s) => RawValueRef::Symbol(s.read(matched_input)?), MatchedValue::Blob(b) => RawValueRef::Blob(b.read(matched_input)?), + MatchedValue::Clob(c) => RawValueRef::Clob(c.read(matched_input)?), MatchedValue::List => { let lazy_list = LazyRawTextList { value: *self }; RawValueRef::List(lazy_list) diff --git a/src/lazy/value_ref.rs b/src/lazy/value_ref.rs index 47a35d44..2c47907d 100644 --- a/src/lazy/value_ref.rs +++ b/src/lazy/value_ref.rs @@ -25,7 +25,7 @@ pub enum ValueRef<'top, 'data, D: LazyDecoder<'data>> { String(StrRef<'data>), Symbol(SymbolRef<'top>), Blob(BytesRef<'data>), - Clob(&'data [u8]), + Clob(BytesRef<'data>), SExp(LazySExp<'top, 'data, D>), List(LazyList<'top, 'data, D>), Struct(LazyStruct<'top, 'data, D>), @@ -178,7 +178,7 @@ impl<'top, 'data, D: LazyDecoder<'data>> ValueRef<'top, 'data, D> { } } - pub fn expect_clob(self) -> IonResult<&'data [u8]> { + pub fn expect_clob(self) -> IonResult> { if let ValueRef::Clob(c) = self { Ok(c) } else { @@ -315,7 +315,7 @@ mod tests { ); assert_eq!( reader.expect_next()?.read()?, - ValueRef::Clob("Clob".as_bytes()) + ValueRef::Clob("Clob".as_bytes().into()) ); // PartialEq doesn't cover lazy containers