Adds lazy reader support for long strings (#630)

amazon-ion · Sep 1, 2023 · 4d34d2a · 4d34d2a
1 parent f728e08
commit 4d34d2a
Show file tree

Hide file tree

Showing 3 changed files with 202 additions and 15 deletions.
diff --git a/src/lazy/text/buffer.rs b/src/lazy/text/buffer.rs
@@ -7,9 +7,9 @@ use std::str::FromStr;
 use nom::branch::alt;
 use nom::bytes::streaming::{is_a, is_not, tag, take_until, take_while1, take_while_m_n};
 use nom::character::streaming::{alphanumeric1, char, digit1, one_of, satisfy};
-use nom::combinator::{consumed, fail, map, not, opt, peek, recognize, success, value};
+use nom::combinator::{consumed, map, not, opt, peek, recognize, success, value};
 use nom::error::{ErrorKind, ParseError};
-use nom::multi::{many0_count, many1_count};
+use nom::multi::{fold_many1, many0_count, many1_count};
 use nom::sequence::{delimited, pair, preceded, separated_pair, terminated, tuple};
 use nom::{CompareResult, IResult, InputLength, InputTake, Needed, Parser};
 
@@ -30,20 +30,25 @@ use crate::{IonError, IonResult, IonType, TimestampPrecision};
 
 impl<'a> Debug for TextBufferView<'a> {
     fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        const CHARS_TO_SHOW: usize = 64;
         write!(f, "TextBufferView {{")?;
         // Try to read the next several bytes from the buffer as UTF-8...
         let text_result = std::str::from_utf8(self.data);
         // ...if it works, print up to `CHARS_TO_SHOW` unicode scalars...
         if let Ok(text) = text_result {
-            write!(f, "\"{}...\"", text.chars().take(32).collect::<String>())?;
+            write!(
+                f,
+                "\"{}...\"",
+                text.chars().take(CHARS_TO_SHOW).collect::<String>()
+            )?;
         } else {
             // ...if it doesn't, print up to `CHARS_TO_SHOW` bytes in hex.
             write!(f, "Invalid UTF-8")?;
-            for byte in self.bytes().iter().take(32) {
+            for byte in self.bytes().iter().take(CHARS_TO_SHOW) {
                 write!(f, "{:x?} ", *byte)?;
             }
-            if self.bytes().len() > 32 {
-                write!(f, "...{} more bytes", self.bytes().len() - 32)?;
+            if self.bytes().len() > CHARS_TO_SHOW {
+                write!(f, "...{} more bytes", self.bytes().len() - CHARS_TO_SHOW)?;
             }
         }
         write!(f, "}}")
@@ -1001,10 +1006,41 @@ impl<'data> TextBufferView<'data> {
         Self::match_text_until_unescaped(self, b'\"')
     }
 
-    fn match_long_string(self) -> IonParseResult<'data, MatchedString> {
-        // TODO: implement long string matching
-        //       The `fail` parser is a nom builtin that never matches.
-        fail(self)
+    /// Matches a long string: one or more `'''`-enclosed segments that may be separated by comments
+    /// and whitespace. Single-segment matches also report whether the segment contained escapes.
+    pub fn match_long_string(self) -> IonParseResult<'data, MatchedString> {
+        fold_many1(
+            // Parser applied repeatedly; `fold_many1` requires it to succeed at least once
+            whitespace_and_then(Self::match_long_string_segment),
+            // Initial accumulator: (number of segments seen, whether any segment contained escapes)
+            || (0usize, false),
+            // Fold each matched segment's information into the accumulator
+            |(segment_count, string_contains_escapes),
+             (_matched_segment, segment_contains_escapes)| {
+                (
+                    segment_count + 1,
+                    string_contains_escapes || segment_contains_escapes,
+                )
+            },
+        )
+        .map(
+            |(segment_count, contains_escapes)| match (segment_count, contains_escapes) {
+                (1, false) => MatchedString::LongSingleSegmentWithoutEscapes,
+                (1, true) => MatchedString::LongSingleSegmentWithEscapes,
+                _ => MatchedString::Long, // two or more segments, with or without escapes
+            },
+        )
+        .parse(self)
+    }
+
+    /// Matches one `'''`-delimited segment, yielding its body (without delimiters) and an escapes flag.
+    pub fn match_long_string_segment(self) -> IonParseResult<'data, (Self, bool)> {
+        delimited(tag("'''"), Self::match_long_string_segment_body, tag("'''"))(self)
+    }
+
+    /// Matches everything before the first unescaped `'''`; the `bool` is true if escapes were seen.
+    fn match_long_string_segment_body(self) -> IonParseResult<'data, (Self, bool)> {
+        Self::match_text_until_unescaped_str(self, "'''")
     }
 
     /// Matches an operator symbol, which can only legally appear within an s-expression
@@ -1142,6 +1178,43 @@ impl<'data> TextBufferView<'data> {
         Err(nom::Err::Incomplete(Needed::Unknown))
     }
 
+    /// A helper method that matches input up to (but not including) the first unescaped occurrence
+    /// of `delimiter`. Returns the matched slice and whether any escape (`\`) was encountered.
+    ///
+    /// The specified delimiter cannot be empty; reading its first byte panics otherwise.
+    fn match_text_until_unescaped_str(
+        self,
+        delimiter: &str,
+    ) -> IonParseResult<'data, (Self, bool)> {
+        // The first byte in the delimiter (this indexing is what panics on an empty delimiter)
+        let delimiter_head = delimiter.as_bytes()[0];
+        // Whether we've encountered any escapes while looking for the delimiter
+        let mut contained_escapes = false;
+        // The input left to search; `self` is kept untouched so offsets can be computed against it
+        let mut remaining = self;
+        loop {
+            // Look for the first unescaped instance of the delimiter's head.
+            // If the input doesn't contain one, the `?` below propagates the resulting `Incomplete`.
+            // `match_text_until_unescaped` does NOT include the delimiter byte in the match,
+            // so `remaining_after_match` starts at the delimiter byte.
+            let (remaining_after_match, (_, segment_contained_escapes)) =
+                remaining.match_text_until_unescaped(delimiter_head)?;
+            contained_escapes |= segment_contained_escapes;
+            remaining = remaining_after_match;
+
+            // If the remaining input starts with the complete delimiter, everything before it matched.
+            if remaining.bytes().starts_with(delimiter.as_bytes()) {
+                let relative_match_end = remaining.offset() - self.offset();
+                let matched_input = self.slice(0, relative_match_end);
+                let remaining_input = self.slice_to_end(relative_match_end);
+                return Ok((remaining_input, (matched_input, contained_escapes)));
+            } else {
+                // Otherwise this byte did not begin the delimiter; step past it and keep searching.
+                remaining = remaining.slice_to_end(1);
+            }
+        }
+    }
+
     /// Matches a single base-10 digit, 0-9.
     fn match_any_digit(self) -> IonParseResult<'data, char> {
         satisfy(|c| c.is_ascii_digit())(self)
@@ -1609,11 +1682,11 @@ mod tests {
     }
 
     impl MatchTest {
-        /// Takes an `input` string and appends a trailing space to it, guaranteeing that the
+        /// Takes an `input` string and appends a trailing value to it, guaranteeing that the
         /// contents of the input are considered a complete token.
         fn new(input: &str) -> Self {
             MatchTest {
-                input: format!("{input} "), // add trailing space
+                input: format!("{input}\n0"), // append a newline and a trailing `0` value
             }
         }
 
@@ -1631,10 +1704,10 @@ mod tests {
         {
             let result = self.try_match(parser);
             let (_remaining, match_length) = result.unwrap();
-            // Inputs have a trailing space that should _not_ be part of the match
+            // Inputs have a trailing newline and `0` that should _not_ be part of the match
             assert_eq!(
                 match_length,
-                self.input.len() - 1,
+                self.input.len() - 2,
                 "\nInput: '{}'\nMatched: '{}'\n",
                 self.input,
                 &self.input[..match_length]
@@ -1903,6 +1976,21 @@ mod tests {
             r#"
             "this has an escaped quote \" right in the middle"
             "#,
+            r#" '''hi''' "#,
+            r#"
+            '''foo'''
+            '''bar'''
+            '''baz''' 
+            "#,
+            r#"
+            '''hello,''' /*comment*/ ''' world!'''
+            "#,
+            r#"
+            ''''''
+            "#, // empty string
+            r#"
+            '''''' ''''''
+            "#, // concatenated empty string
         ];
         for input in good_inputs {
             match_string(input.trim());
@@ -2099,4 +2187,13 @@ mod tests {
             mismatch_blob(input);
         }
     }
+
+    #[test]
+    fn test_match_text_until_unescaped_str() {
+        let input = TextBufferView::new(r" foo bar \''' baz''' quux ".as_bytes());
+        let (_remaining, (matched, contains_escapes)) =
+            input.match_text_until_unescaped_str(r#"'''"#).unwrap();
+        assert_eq!(matched.as_text().unwrap(), " foo bar \\''' baz"); // ends at the 2nd delimiter
+        assert!(contains_escapes); // the backslash before the first delimiter counts as an escape
+    }
 }
diff --git a/src/lazy/text/matched.rs b/src/lazy/text/matched.rs
@@ -24,6 +24,7 @@ use std::num::IntErrorKind;
 use std::str::FromStr;
 
 use nom::character::is_hex_digit;
+use nom::sequence::preceded;
 use nom::AsChar;
 use num_bigint::{BigInt, BigUint};
 use num_traits::Num;
@@ -287,6 +288,14 @@ pub(crate) enum MatchedString {
     ///     '''hello,'''
     ///     ''' world!'''
     Long,
+    /// The string uses long-format delimiters, but is a single segment. We still have to
+    /// allocate a fresh String to store the version with decoded escapes, but we don't need
+    /// to re-parse the input because there's only one segment.
+    LongSingleSegmentWithEscapes,
+    /// The string uses long-format delimiters, but is a single segment and contains no escapes.
+    /// This allows us to return a slice of the input as-is. It also greatly simplifies the
+    /// reading process because we don't need to re-parse the input.
+    LongSingleSegmentWithoutEscapes,
 }
 
 impl MatchedString {
@@ -299,8 +308,65 @@ impl MatchedString {
                 self.read_short_string_without_escapes(matched_input)
             }
             MatchedString::ShortWithEscapes => self.read_short_string_with_escapes(matched_input),
-            MatchedString::Long => todo!("long-form strings"),
+            MatchedString::LongSingleSegmentWithoutEscapes => {
+                self.read_long_string_single_segment_without_escapes(matched_input)
+            }
+            MatchedString::LongSingleSegmentWithEscapes => {
+                self.read_long_string_single_segment_with_escapes(matched_input)
+            }
+            MatchedString::Long => self.read_long_string(matched_input),
+        }
+    }
+
+    fn read_long_string_single_segment_without_escapes<'data>(
+        &self,
+        matched_input: TextBufferView<'data>,
+    ) -> IonResult<StrRef<'data>> {
+        // Skip the three opening quote bytes and leave off the three closing ones (6 bytes in all).
+        let body = matched_input.slice(3, matched_input.len() - 6);
+        // No escapes need decoding, so validate the body in place and hand back a view of the input.
+        let text = body.as_text()?;
+        let str_ref = StrRef::from(text);
+        Ok(str_ref)
+    }
+
+    fn read_long_string_single_segment_with_escapes<'data>(
+        &self,
+        matched_input: TextBufferView<'data>,
+    ) -> IonResult<StrRef<'data>> {
+        // Take a slice of the input that ignores the first and last three bytes, which are quotes.
+        let body = matched_input.slice(3, matched_input.len() - 6);
+        // The body contains escapes, so its decoded form is written into a freshly allocated buffer.
+        let mut sanitized = Vec::with_capacity(matched_input.len());
+        escape_text(body, &mut sanitized)?;
+        let text = String::from_utf8(sanitized).unwrap(); // NOTE(review): assumes valid UTF-8 output
+        Ok(StrRef::from(text)) // `text` is already an owned `String`; no extra copy is needed
+    }
+
+    fn read_long_string<'data>(
+        &self,
+        matched_input: TextBufferView<'data>,
+    ) -> IonResult<StrRef<'data>> {
+        // We're going to re-parse the input to visit each segment, copying its sanitized bytes into
+        // a contiguous buffer.
+
+        // Create a new buffer to hold the sanitized data.
+        let mut sanitized = Vec::with_capacity(matched_input.len());
+        let mut remaining = matched_input;
+
+        // Iterate over the segments with the same segment parser that matched the input initially.
+        // Segments may be separated by comments as well as whitespace, so both are skipped here;
+        // the loop then only stops matching once the input is exhausted, meaning we're done reading.
+        while let Ok((remaining_after_match, (segment_body, _has_escapes))) = preceded(
+            TextBufferView::match_optional_comments_and_whitespace,
+            TextBufferView::match_long_string_segment,
+        )(remaining)
+        {
+            remaining = remaining_after_match;
+            escape_text(segment_body, &mut sanitized)?;
         }
+        let text = String::from_utf8(sanitized).unwrap();
+        Ok(StrRef::from(text))
     }
 
     fn read_short_string_without_escapes<'data>(

diff --git a/src/lazy/text/raw/reader.rs b/src/lazy/text/raw/reader.rs
@@ -135,13 +135,23 @@ mod tests {
             
         // Strings
 
+        '''Long string without escapes'''
+
         "Hello!"
+        
+        '''Long string with escaped \''' delimiter''' 
+
         "foo bar baz"
         "😎😎😎"
         "lol\n\r\0wat"                     // Single-character escapes
         "\x48ello, \x77orld!"              // \x 2-digit hex escape
         "\u0048ello, \u0077orld!"          // \u 4-digit hex escape
         "\U00000048ello, \U00000077orld!"  // \U 8-digit hex escape
+        
+        '''Mercury '''
+        '''Venus '''
+        '''Earth '''
+        '''Mars '''
 
         "#,
         );
@@ -288,8 +298,18 @@ mod tests {
             ),
         );
 
+        // '''Long string without escapes'''
+        expect_next(
+            reader,
+            RawValueRef::String("Long string without escapes".into()),
+        );
         // "Hello"
         expect_next(reader, RawValueRef::String("Hello!".into()));
+        // '''Long string with escaped \''' delimiter'''
+        expect_next(
+            reader,
+            RawValueRef::String("Long string with escaped ''' delimiter".into()),
+        );
         // "foo bar baz"
         expect_next(reader, RawValueRef::String("foo bar baz".into()));
         // "😎😎😎"
@@ -302,6 +322,10 @@ mod tests {
         expect_next(reader, RawValueRef::String("Hello, world!".into()));
         // "\U00000048ello, \U00000077orld!"
         expect_next(reader, RawValueRef::String("Hello, world!".into()));
+        expect_next(
+            reader,
+            RawValueRef::String("Mercury Venus Earth Mars ".into()),
+        );
         // "\"Hello,\\\n world!\" "
         expect_next(reader, RawValueRef::String("Hello, world!".into()));
         // 'foo'