From e0a83d8838c88bed1da0f647829841db587bde35 Mon Sep 17 00:00:00 2001 From: Zack Slayton Date: Sat, 15 Jul 2023 23:33:03 -0400 Subject: [PATCH 01/15] Top-level nulls, bools, ints --- Cargo.toml | 2 +- src/lazy/binary/encoding.rs | 19 - src/lazy/binary/mod.rs | 1 - src/lazy/binary/raw/mod.rs | 2 +- src/lazy/binary/raw/reader.rs | 2 +- .../raw/{lazy_raw_sequence.rs => sequence.rs} | 2 +- src/lazy/binary/raw/struct.rs | 2 +- src/lazy/binary/raw/value.rs | 16 +- src/lazy/decoder.rs | 1 + src/lazy/encoding.rs | 133 ++++ src/lazy/mod.rs | 2 + src/lazy/raw_value_ref.rs | 8 + src/lazy/reader.rs | 2 +- src/lazy/sequence.rs | 2 +- src/lazy/struct.rs | 2 +- src/lazy/system_reader.rs | 2 +- src/lazy/text/as_utf8.rs | 33 + src/lazy/text/buffer.rs | 730 ++++++++++++++++++ src/lazy/text/encoded_value.rs | 207 +++++ src/lazy/text/matched.rs | 108 +++ src/lazy/text/mod.rs | 7 + src/lazy/text/parse_result.rs | 274 +++++++ src/lazy/text/raw/mod.rs | 1 + src/lazy/text/raw/reader.rs | 189 +++++ src/lazy/text/value.rs | 66 ++ src/lazy/value.rs | 2 +- src/position.rs | 26 +- src/result/decoding_error.rs | 12 + src/result/incomplete.rs | 7 +- src/result/mod.rs | 6 +- 30 files changed, 1818 insertions(+), 48 deletions(-) delete mode 100644 src/lazy/binary/encoding.rs rename src/lazy/binary/raw/{lazy_raw_sequence.rs => sequence.rs} (98%) create mode 100644 src/lazy/encoding.rs create mode 100644 src/lazy/text/as_utf8.rs create mode 100644 src/lazy/text/buffer.rs create mode 100644 src/lazy/text/encoded_value.rs create mode 100644 src/lazy/text/matched.rs create mode 100644 src/lazy/text/mod.rs create mode 100644 src/lazy/text/parse_result.rs create mode 100644 src/lazy/text/raw/mod.rs create mode 100644 src/lazy/text/raw/reader.rs create mode 100644 src/lazy/text/value.rs diff --git a/Cargo.toml b/Cargo.toml index 5f148b85..54771ff8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -66,7 +66,7 @@ num-bigint = "0.4.3" num-integer = "0.1.44" num-traits = "0.2" arrayvec = "0.7" -smallvec = "1.9.0" 
+smallvec = {version ="1.9.0", features = ["const_generics"]} digest = { version = "0.9", optional = true } sha2 = { version = "0.9", optional = true } diff --git a/src/lazy/binary/encoding.rs b/src/lazy/binary/encoding.rs deleted file mode 100644 index e26d0b51..00000000 --- a/src/lazy/binary/encoding.rs +++ /dev/null @@ -1,19 +0,0 @@ -use crate::lazy::binary::raw::annotations_iterator::RawBinaryAnnotationsIterator; -use crate::lazy::binary::raw::lazy_raw_sequence::LazyRawBinarySequence; -use crate::lazy::binary::raw::r#struct::LazyRawBinaryStruct; -use crate::lazy::binary::raw::reader::LazyRawBinaryReader; -use crate::lazy::binary::raw::value::LazyRawBinaryValue; -use crate::lazy::decoder::LazyDecoder; - -// This type derives trait implementations in order to allow types that contain it to also derive -// trait implementations. -#[derive(Clone, Debug)] -pub struct BinaryEncoding; - -impl<'data> LazyDecoder<'data> for BinaryEncoding { - type Reader = LazyRawBinaryReader<'data>; - type Value = LazyRawBinaryValue<'data>; - type Sequence = LazyRawBinarySequence<'data>; - type Struct = LazyRawBinaryStruct<'data>; - type AnnotationsIterator = RawBinaryAnnotationsIterator<'data>; -} diff --git a/src/lazy/binary/mod.rs b/src/lazy/binary/mod.rs index cfc54e78..93017274 100644 --- a/src/lazy/binary/mod.rs +++ b/src/lazy/binary/mod.rs @@ -2,6 +2,5 @@ mod encoded_value; pub mod immutable_buffer; pub mod raw; -pub(crate) mod encoding; #[cfg(test)] pub(crate) mod test_utilities; diff --git a/src/lazy/binary/raw/mod.rs b/src/lazy/binary/raw/mod.rs index 0861993f..3df82f4d 100644 --- a/src/lazy/binary/raw/mod.rs +++ b/src/lazy/binary/raw/mod.rs @@ -1,5 +1,5 @@ pub mod annotations_iterator; -pub mod lazy_raw_sequence; pub mod reader; +pub mod sequence; pub mod r#struct; pub mod value; diff --git a/src/lazy/binary/raw/reader.rs b/src/lazy/binary/raw/reader.rs index da3a983c..77297e54 100644 --- a/src/lazy/binary/raw/reader.rs +++ b/src/lazy/binary/raw/reader.rs @@ -1,7 +1,7 @@ 
-use crate::lazy::binary::encoding::BinaryEncoding; use crate::lazy::binary::immutable_buffer::ImmutableBuffer; use crate::lazy::binary::raw::value::LazyRawBinaryValue; use crate::lazy::decoder::LazyRawReader; +use crate::lazy::encoding::BinaryEncoding; use crate::lazy::raw_stream_item::RawStreamItem; use crate::result::IonFailure; use crate::IonResult; diff --git a/src/lazy/binary/raw/lazy_raw_sequence.rs b/src/lazy/binary/raw/sequence.rs similarity index 98% rename from src/lazy/binary/raw/lazy_raw_sequence.rs rename to src/lazy/binary/raw/sequence.rs index 16dbb021..66d26fef 100644 --- a/src/lazy/binary/raw/lazy_raw_sequence.rs +++ b/src/lazy/binary/raw/sequence.rs @@ -1,10 +1,10 @@ -use crate::lazy::binary::encoding::BinaryEncoding; use crate::lazy::binary::immutable_buffer::ImmutableBuffer; use crate::lazy::binary::raw::annotations_iterator::RawBinaryAnnotationsIterator; use crate::lazy::binary::raw::reader::DataSource; use crate::lazy::binary::raw::value::LazyRawBinaryValue; use crate::lazy::decoder::private::LazyContainerPrivate; use crate::lazy::decoder::LazyRawSequence; +use crate::lazy::encoding::BinaryEncoding; use crate::{IonResult, IonType}; use std::fmt; use std::fmt::{Debug, Formatter}; diff --git a/src/lazy/binary/raw/struct.rs b/src/lazy/binary/raw/struct.rs index 34ca489a..3f82ed16 100644 --- a/src/lazy/binary/raw/struct.rs +++ b/src/lazy/binary/raw/struct.rs @@ -1,10 +1,10 @@ -use crate::lazy::binary::encoding::BinaryEncoding; use crate::lazy::binary::immutable_buffer::ImmutableBuffer; use crate::lazy::binary::raw::annotations_iterator::RawBinaryAnnotationsIterator; use crate::lazy::binary::raw::reader::DataSource; use crate::lazy::binary::raw::value::LazyRawBinaryValue; use crate::lazy::decoder::private::{LazyContainerPrivate, LazyRawFieldPrivate}; use crate::lazy::decoder::{LazyRawField, LazyRawStruct}; +use crate::lazy::encoding::BinaryEncoding; use crate::lazy::raw_value_ref::RawValueRef; use crate::raw_symbol_token_ref::AsRawSymbolTokenRef; 
use crate::{IonResult, RawSymbolTokenRef}; diff --git a/src/lazy/binary/raw/value.rs b/src/lazy/binary/raw/value.rs index 0ffcb028..9ed2340e 100644 --- a/src/lazy/binary/raw/value.rs +++ b/src/lazy/binary/raw/value.rs @@ -1,13 +1,13 @@ use crate::binary::int::DecodedInt; use crate::binary::uint::DecodedUInt; use crate::lazy::binary::encoded_value::EncodedValue; -use crate::lazy::binary::encoding::BinaryEncoding; use crate::lazy::binary::immutable_buffer::ImmutableBuffer; use crate::lazy::binary::raw::annotations_iterator::RawBinaryAnnotationsIterator; -use crate::lazy::binary::raw::lazy_raw_sequence::LazyRawBinarySequence; use crate::lazy::binary::raw::r#struct::LazyRawBinaryStruct; +use crate::lazy::binary::raw::sequence::LazyRawBinarySequence; use crate::lazy::decoder::private::LazyRawValuePrivate; use crate::lazy::decoder::LazyRawValue; +use crate::lazy::encoding::BinaryEncoding; use crate::lazy::raw_value_ref::RawValueRef; use crate::result::IonFailure; use crate::types::SymbolId; @@ -35,7 +35,7 @@ impl<'a> Debug for LazyRawBinaryValue<'a> { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { write!( f, - "LazyRawValue {{\n val={:?},\n buf={:?}\n}}\n", + "LazyRawBinaryValue {{\n val={:?},\n buf={:?}\n}}\n", self.encoded_value, self.input ) } @@ -54,6 +54,10 @@ impl<'data> LazyRawValue<'data, BinaryEncoding> for LazyRawBinaryValue<'data> { self.ion_type() } + fn is_null(&self) -> bool { + self.is_null() + } + fn annotations(&self) -> RawBinaryAnnotationsIterator<'data> { self.annotations() } @@ -70,6 +74,10 @@ impl<'data> LazyRawBinaryValue<'data> { self.encoded_value.ion_type() } + pub fn is_null(&self) -> bool { + self.encoded_value.header().is_null() + } + /// Returns `true` if this value has a non-empty annotations sequence; otherwise, returns `false`. 
fn has_annotations(&self) -> bool { self.encoded_value.has_annotations() @@ -118,7 +126,7 @@ impl<'data> LazyRawBinaryValue<'data> { /// [`LazyRawBinarySequence`] or [`LazyStruct`](crate::lazy::struct::LazyStruct) /// that can be traversed to access the container's contents. pub fn read(&self) -> ValueParseResult<'data, BinaryEncoding> { - if self.encoded_value.header().is_null() { + if self.is_null() { let raw_value_ref = RawValueRef::Null(self.ion_type()); return Ok(raw_value_ref); } diff --git a/src/lazy/decoder.rs b/src/lazy/decoder.rs index 5f784c42..e53ad2d2 100644 --- a/src/lazy/decoder.rs +++ b/src/lazy/decoder.rs @@ -62,6 +62,7 @@ pub trait LazyRawValue<'data, D: LazyDecoder<'data>>: private::LazyRawValuePrivate<'data> + Clone + Debug { fn ion_type(&self) -> IonType; + fn is_null(&self) -> bool; fn annotations(&self) -> D::AnnotationsIterator; fn read(&self) -> IonResult>; } diff --git a/src/lazy/encoding.rs b/src/lazy/encoding.rs new file mode 100644 index 00000000..784879ad --- /dev/null +++ b/src/lazy/encoding.rs @@ -0,0 +1,133 @@ +use crate::lazy::binary::raw::annotations_iterator::RawBinaryAnnotationsIterator; +use crate::lazy::binary::raw::r#struct::LazyRawBinaryStruct; +use crate::lazy::binary::raw::reader::LazyRawBinaryReader; +use crate::lazy::binary::raw::sequence::LazyRawBinarySequence; +use crate::lazy::binary::raw::value::LazyRawBinaryValue; +use crate::lazy::decoder::private::{LazyContainerPrivate, LazyRawFieldPrivate}; +use crate::lazy::decoder::{LazyDecoder, LazyRawField, LazyRawSequence, LazyRawStruct}; +use crate::lazy::raw_value_ref::RawValueRef; +use crate::lazy::text::raw::reader::LazyRawTextReader; +use crate::lazy::text::value::LazyRawTextValue; +use crate::{IonResult, IonType, RawSymbolTokenRef}; +use std::marker::PhantomData; + +// These types derive trait implementations in order to allow types that containing them +// to also derive trait implementations. + +/// The Ion 1.0 binary encoding. 
+#[derive(Clone, Debug)] +pub struct BinaryEncoding; + +/// The Ion 1.0 text encoding. +#[derive(Clone, Debug)] +pub struct TextEncoding; + +impl<'data> LazyDecoder<'data> for BinaryEncoding { + type Reader = LazyRawBinaryReader<'data>; + type Value = LazyRawBinaryValue<'data>; + type Sequence = LazyRawBinarySequence<'data>; + type Struct = LazyRawBinaryStruct<'data>; + type AnnotationsIterator = RawBinaryAnnotationsIterator<'data>; +} + +// === Placeholders === +// The types below will need to be properly defined in order for the lazy text reader to be complete. +// The exist to satisfy various trait definitions. +#[derive(Debug, Clone)] +pub struct ToDoTextSequence; + +impl<'data> LazyContainerPrivate<'data, TextEncoding> for ToDoTextSequence { + fn from_value(_value: LazyRawTextValue<'data>) -> Self { + todo!() + } +} + +impl<'data> LazyRawSequence<'data, TextEncoding> for ToDoTextSequence { + type Iterator = Box>>>; + + fn annotations(&self) -> ToDoTextAnnotationsIterator<'data> { + todo!() + } + + fn ion_type(&self) -> IonType { + todo!() + } + + fn iter(&self) -> Self::Iterator { + todo!() + } + + fn as_value(&self) -> &>::Value { + todo!() + } +} + +#[derive(Debug, Clone)] +pub struct ToDoTextStruct; + +#[derive(Debug, Clone)] +pub struct ToDoTextField; + +impl<'data> LazyRawFieldPrivate<'data, TextEncoding> for ToDoTextField { + fn into_value(self) -> LazyRawTextValue<'data> { + todo!() + } +} + +impl<'data> LazyRawField<'data, TextEncoding> for ToDoTextField { + fn name(&self) -> RawSymbolTokenRef<'data> { + todo!() + } + + fn value(&self) -> &LazyRawTextValue<'data> { + todo!() + } +} + +impl<'data> LazyContainerPrivate<'data, TextEncoding> for ToDoTextStruct { + fn from_value(_value: ::Value) -> Self { + todo!() + } +} + +impl<'data> LazyRawStruct<'data, TextEncoding> for ToDoTextStruct { + type Field = ToDoTextField; + type Iterator = Box>>; + + fn annotations(&self) -> ToDoTextAnnotationsIterator<'data> { + todo!() + } + + fn find(&self, _name: &str) 
-> IonResult>> { + todo!() + } + + fn get(&self, _name: &str) -> IonResult>> { + todo!() + } + + fn iter(&self) -> Self::Iterator { + todo!() + } +} + +#[derive(Debug, Clone)] +pub struct ToDoTextAnnotationsIterator<'data> { + spooky: &'data PhantomData<()>, +} + +impl<'data> Iterator for ToDoTextAnnotationsIterator<'data> { + type Item = IonResult>; + + fn next(&mut self) -> Option { + todo!() + } +} + +impl<'data> LazyDecoder<'data> for TextEncoding { + type Reader = LazyRawTextReader<'data>; + type Value = LazyRawTextValue<'data>; + type Sequence = ToDoTextSequence; + type Struct = ToDoTextStruct; + type AnnotationsIterator = ToDoTextAnnotationsIterator<'data>; +} diff --git a/src/lazy/mod.rs b/src/lazy/mod.rs index c0c3c413..3f42baa8 100644 --- a/src/lazy/mod.rs +++ b/src/lazy/mod.rs @@ -3,6 +3,7 @@ pub mod binary; pub mod decoder; +pub(crate) mod encoding; pub mod raw_stream_item; pub mod raw_value_ref; pub mod reader; @@ -10,5 +11,6 @@ pub mod sequence; pub mod r#struct; pub mod system_reader; pub mod system_stream_item; +pub mod text; pub mod value; pub mod value_ref; diff --git a/src/lazy/raw_value_ref.rs b/src/lazy/raw_value_ref.rs index 11b09bd6..a0da98eb 100644 --- a/src/lazy/raw_value_ref.rs +++ b/src/lazy/raw_value_ref.rs @@ -69,6 +69,14 @@ impl<'data, D: LazyDecoder<'data>> RawValueRef<'data, D> { } } + pub fn expect_i64(self) -> IonResult { + if let RawValueRef::Int(i) = self { + i.expect_i64() + } else { + IonResult::decoding_error("expected an i64 (int)") + } + } + pub fn expect_float(self) -> IonResult { if let RawValueRef::Float(f) = self { Ok(f) diff --git a/src/lazy/reader.rs b/src/lazy/reader.rs index bd20656c..2f3cfbb4 100644 --- a/src/lazy/reader.rs +++ b/src/lazy/reader.rs @@ -1,8 +1,8 @@ use crate::binary::constants::v1_0::IVM; use crate::element::reader::ElementReader; use crate::element::Element; -use crate::lazy::binary::encoding::BinaryEncoding; use crate::lazy::decoder::LazyDecoder; +use crate::lazy::encoding::BinaryEncoding; use 
crate::lazy::system_reader::LazySystemReader; use crate::lazy::value::LazyValue; use crate::result::IonFailure; diff --git a/src/lazy/sequence.rs b/src/lazy/sequence.rs index 7f7f810f..b3c830f8 100644 --- a/src/lazy/sequence.rs +++ b/src/lazy/sequence.rs @@ -1,5 +1,5 @@ -use crate::lazy::binary::encoding::BinaryEncoding; use crate::lazy::decoder::{LazyDecoder, LazyRawSequence, LazyRawValue}; +use crate::lazy::encoding::BinaryEncoding; use crate::lazy::value::{AnnotationsIterator, LazyValue}; use crate::{Annotations, Element, IntoAnnotatedElement, Sequence, Value}; use crate::{IonError, IonResult, IonType, SymbolTable}; diff --git a/src/lazy/struct.rs b/src/lazy/struct.rs index f7347efd..2251b949 100644 --- a/src/lazy/struct.rs +++ b/src/lazy/struct.rs @@ -1,7 +1,7 @@ use crate::element::builders::StructBuilder; -use crate::lazy::binary::encoding::BinaryEncoding; use crate::lazy::decoder::private::{LazyRawFieldPrivate, LazyRawValuePrivate}; use crate::lazy::decoder::{LazyDecoder, LazyRawStruct}; +use crate::lazy::encoding::BinaryEncoding; use crate::lazy::value::{AnnotationsIterator, LazyValue}; use crate::lazy::value_ref::ValueRef; use crate::result::IonFailure; diff --git a/src/lazy/system_reader.rs b/src/lazy/system_reader.rs index 4936fa8a..bdf76de2 100644 --- a/src/lazy/system_reader.rs +++ b/src/lazy/system_reader.rs @@ -1,4 +1,4 @@ -use crate::lazy::binary::encoding::BinaryEncoding; +use crate::lazy::encoding::BinaryEncoding; use crate::result::IonFailure; use crate::{IonResult, IonType, RawSymbolTokenRef, SymbolTable}; diff --git a/src/lazy/text/as_utf8.rs b/src/lazy/text/as_utf8.rs new file mode 100644 index 00000000..0d1e211c --- /dev/null +++ b/src/lazy/text/as_utf8.rs @@ -0,0 +1,33 @@ +use crate::lazy::text::buffer::TextBufferView; +use crate::position::Position; +use crate::result::DecodingError; +use crate::{IonError, IonResult}; +use smallvec::SmallVec; + +/// Attempts to validate a byte sequence as UTF-8 text. 
If the data is not valid UTF-8, returns +/// an [`IonError`]. +/// +/// The provided `position` is added to the `IonError` that is constructed if the data is not valid. +pub(crate) trait AsUtf8 { + fn as_utf8(&self, position: impl Into) -> IonResult<&str>; +} + +impl AsUtf8 for SmallVec<[u8; N]> { + fn as_utf8(&self, position: impl Into) -> IonResult<&str> { + std::str::from_utf8(self.as_ref()).map_err(|_| { + let decoding_error = + DecodingError::new("encountered invalid UTF-8").with_position(position); + IonError::Decoding(decoding_error) + }) + } +} + +impl<'data> AsUtf8 for TextBufferView<'data> { + fn as_utf8(&self, position: impl Into) -> IonResult<&str> { + std::str::from_utf8(self.bytes()).map_err(|_| { + let decoding_error = + DecodingError::new("encountered invalid UTF-8").with_position(position); + IonError::Decoding(decoding_error) + }) + } +} diff --git a/src/lazy/text/buffer.rs b/src/lazy/text/buffer.rs new file mode 100644 index 00000000..54ecf4f6 --- /dev/null +++ b/src/lazy/text/buffer.rs @@ -0,0 +1,730 @@ +use crate::lazy::encoding::TextEncoding; +use crate::lazy::raw_stream_item::RawStreamItem; +use crate::lazy::text::encoded_value::EncodedTextValue; +use crate::lazy::text::matched::{MatchedInt, MatchedValue}; +use crate::lazy::text::parse_result::IonParseError; +use crate::lazy::text::parse_result::{IonMatchResult, IonParseResult}; +use crate::lazy::text::value::LazyRawTextValue; +use crate::{IonResult, IonType}; +use nom::branch::alt; +use nom::bytes::streaming::{is_a, tag, take_while1}; +use nom::character::streaming::{char, digit1, one_of}; +use nom::combinator::{map, opt, peek, recognize, success, value}; +use nom::error::{ErrorKind, ParseError}; +use nom::multi::many0_count; +use nom::sequence::{delimited, pair, preceded, separated_pair, terminated}; +use nom::{CompareResult, IResult, InputLength, InputTake, Needed, Parser}; +use std::fmt::{Debug, Formatter}; +use std::iter::{Copied, Enumerate}; +use std::ops::{RangeFrom, RangeTo}; +use 
std::slice::Iter; + +impl<'a> Debug for TextBufferView<'a> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "TextBufferView {{")?; + // Try to read the next several bytes from the buffer as UTF-8... + let text_result = std::str::from_utf8(self.data); + // ...if it works, print the first 32 unicode scalars... + if let Ok(text) = text_result { + write!(f, "\"{}...\"", text.chars().take(32).collect::())?; + } else { + // ...if it doesn't, print the first 32 bytes in hex. + write!(f, "Invalid UTF-8")?; + for byte in self.bytes().iter().take(32) { + write!(f, "{:x?} ", *byte)?; + } + if self.bytes().len() > 32 { + write!(f, "...{} more bytes", self.bytes().len() - 32)?; + } + } + write!(f, "}}") + } +} + +/// The Ion specification's enumeration of whitespace characters. +const WHITESPACE_CHARACTERS: &[char] = &[ + ' ', // Space + '\t', // Tab + '\r', // Carriage return + '\n', // Newline + '\x09', // Horizontal tab + '\x0B', // Vertical tab + '\x0C', // Form feed +]; + +/// Same as [WHITESPACE_CHARACTERS], but formatted as a string for use in some `nom` APIs +const WHITESPACE_CHARACTERS_AS_STR: &str = " \t\r\n\x09\x0B\x0C"; + +/// A slice of unsigned bytes that can be cheaply copied and which defines methods for parsing +/// the various encoding elements of a text Ion stream. +/// +/// Upon success, each parsing method on the `TextBufferView` will return the value that was read +/// and a new copy of the `TextBufferView` that starts _after_ the bytes that were parsed. +/// +/// Methods that begin with `match_` return the input slice that they matched OR a `MatchedValue` +/// that retains additional information found during the matching process. +#[derive(PartialEq, Clone, Copy)] +pub(crate) struct TextBufferView<'a> { + // `data` is a slice of remaining data in the larger input stream. + // `offset` is the absolute position in the overall input stream where that slice begins. 
+ // + // input: 00 01 02 03 04 05 06 07 08 09 + // └────┬────┘ + // data: &[u8] + // offset: 6 + data: &'a [u8], + offset: usize, +} + +pub(crate) type ParseResult<'a, T> = IonResult<(T, TextBufferView<'a>)>; + +impl<'data> TextBufferView<'data> { + /// Constructs a new `TextBufferView` that wraps `data`. + #[inline] + pub fn new(data: &[u8]) -> TextBufferView { + Self::new_with_offset(data, 0) + } + + pub fn new_with_offset(data: &[u8], offset: usize) -> TextBufferView { + TextBufferView { data, offset } + } + + /// Returns a subslice copy of the [`TextBufferView`] that starts at `offset` and continues for + /// `length` bytes. + /// + /// Note that `offset` is relative to the beginning of the buffer, not the beginning of the + /// larger stream of which the buffer is a piece. + pub fn slice(&self, offset: usize, length: usize) -> TextBufferView<'data> { + TextBufferView { + data: &self.data[offset..offset + length], + offset: self.offset + offset, + } + } + + /// Returns a subslice copy of the [`TextBufferView`] that starts at `offset` and continues + /// to the end. + /// + /// Note that `offset` is relative to the beginning of the buffer, not the beginning of the + /// larger stream of which the buffer is a piece. + pub fn slice_to_end(&self, offset: usize) -> TextBufferView<'data> { + TextBufferView { + data: &self.data[offset..], + offset: self.offset + offset, + } + } + + /// Returns a slice containing all of the buffer's bytes. + pub fn bytes(&self) -> &[u8] { + self.data + } + + /// Returns the number of bytes between the start of the original input byte array and the + /// subslice of that byte array that this `TextBufferView` represents. + pub fn offset(&self) -> usize { + self.offset + } + + /// Returns the number of bytes in the buffer. + pub fn len(&self) -> usize { + self.data.len() + } + + /// Returns `true` if there are no bytes in the buffer. Otherwise, returns `false`. 
+ pub fn is_empty(&self) -> bool { + self.data.is_empty() + } + + /// Creates a copy of this `TextBufferView` that begins `num_bytes_to_consume` further into the + /// slice. + #[inline] + pub fn consume(&self, num_bytes_to_consume: usize) -> Self { + // This assertion is always run during testing but is removed in the release build. + debug_assert!(num_bytes_to_consume <= self.len()); + Self { + data: &self.data[num_bytes_to_consume..], + offset: self.offset + num_bytes_to_consume, + } + } + + // An adapter for nom::combinator::success. + // Always succeeds and consumes none of the input. Returns an empty slice of the buffer. + pub fn match_nothing(self) -> IonMatchResult<'data> { + // Return an empty slice from the head position + success(self.slice(0, 0))(self) + } + + pub fn match_whitespace(self) -> IonMatchResult<'data> { + is_a(WHITESPACE_CHARACTERS_AS_STR)(self) + } + + pub fn match_optional_whitespace(self) -> IonMatchResult<'data> { + // Either match whitespace and return what follows or just return the input as-is. + // This will always return `Ok`, but is packaged as an IonMatchResult for compatability + alt((Self::match_whitespace, Self::match_nothing))(self) + } + + pub fn read_top_level(self) -> IonParseResult<'data, RawStreamItem<'data, TextEncoding>> { + let (remaining, value) = match self.read_value() { + Ok(value) => value, + Err(e) => return Err(e), + }; + + // TODO: Check to see if `value` is actually an IVM. + // => If it's a symbol, try the IVM parser on it and see if it succeeds. + // For now, we just return the value. + Ok((remaining, RawStreamItem::Value(value))) + } + + pub fn read_value(self) -> IonParseResult<'data, LazyRawTextValue<'data>> { + alt(( + // For `null` and `bool`, we use `read_` instead of `match_` because there's no additional + // parsing to be done. 
+ map(match_and_length(Self::read_null), |(ion_type, length)| { + EncodedTextValue::new(MatchedValue::Null(ion_type), self.offset(), length) + }), + map(match_and_length(Self::read_bool), |(value, length)| { + EncodedTextValue::new(MatchedValue::Bool(value), self.offset(), length) + }), + // For `int` and the other types, we use `match` and store the partially-processed input in the + // `matched_value` field of the `EncodedTextValue` we return. + map( + match_and_length(Self::match_int), + |(matched_int, length)| { + EncodedTextValue::new(MatchedValue::Int(matched_int), self.offset(), length) + }, + ), + // TODO: The other Ion types + )) + .map(|encoded_value| LazyRawTextValue { + encoded_value, + input: self, + }) + .parse(self) + } + + pub fn match_bool(self) -> IonMatchResult<'data> { + recognize(Self::read_bool)(self) + } + + pub fn read_bool(self) -> IonParseResult<'data, bool> { + terminated( + alt((value(true, tag("true")), value(false, tag("false")))), + Self::peek_stop_character, + )(self) + } + + pub fn match_null(self) -> IonMatchResult<'data> { + recognize(Self::read_null)(self) + } + + pub fn read_null(self) -> IonParseResult<'data, IonType> { + delimited( + tag("null"), + opt(preceded(char('.'), Self::read_ion_type)), + Self::peek_stop_character, + ) + .map(|explicit_ion_type| explicit_ion_type.unwrap_or(IonType::Null)) + .parse(self) + } + + fn match_ion_type(self) -> IonMatchResult<'data> { + recognize(Self::read_ion_type)(self) + } + + fn read_ion_type(self) -> IonParseResult<'data, IonType> { + alt(( + value(IonType::Null, tag("null")), + value(IonType::Bool, tag("bool")), + value(IonType::Int, tag("int")), + value(IonType::Float, tag("float")), + value(IonType::Decimal, tag("decimal")), + value(IonType::Timestamp, tag("timestamp")), + value(IonType::Symbol, tag("symbol")), + value(IonType::String, tag("string")), + value(IonType::Clob, tag("clob")), + value(IonType::Blob, tag("blob")), + value(IonType::List, tag("list")), + value(IonType::SExp, 
tag("sexp")), + value(IonType::Struct, tag("struct")), + ))(self) + } + + fn match_stop_character(self) -> IonMatchResult<'data> { + recognize(one_of("{}[](),\"' \t\n\r\u{0b}\u{0c}")).parse(self) + } + + fn peek_stop_character(self) -> IonMatchResult<'data> { + peek(Self::match_stop_character).parse(self) + } + + /// Matches the three parts of an int--its base, its sign, and its digits--without actually + /// constructing an Int from them. + fn match_int(self) -> IonParseResult<'data, MatchedInt> { + terminated( + // We test for base 16 and base 2 so the '0x' or '0b' isn't confused for a leading zero + // in a base 10 number, which would be illegal. + alt(( + Self::match_base_2_int, + Self::match_base_16_int, + Self::match_base_10_int, + )), + Self::peek_stop_character, + )(self) + } + + /// Matches a base-2 notation integer (e.g. `0b0`, `0B1010`, or `-0b0111`) and returns the + /// partially parsed value as a [`MatchedInt`]. + fn match_base_2_int(self) -> IonParseResult<'data, MatchedInt> { + separated_pair( + opt(char('-')), + alt((tag("0b"), tag("0B"))), + Self::match_base_2_int_digits, + ) + .map(|(maybe_sign, digits)| { + MatchedInt::new(2, maybe_sign.is_some(), digits.offset() - self.offset()) + }) + .parse(self) + } + + /// Matches the digits of a base-2 integer. + fn match_base_2_int_digits(self) -> IonMatchResult<'data> { + recognize(terminated( + // Zero or more digits-followed-by-underscores + many0_count(pair(is_a("01"), char('_'))), + // One or more digits + is_a("01"), + ))(self) + } + + /// Matches a base-10 notation integer (e.g. `0`, `255`, or `-1_024`) and returns the partially + /// parsed value as a [`MatchedInt`]. + fn match_base_10_int(self) -> IonParseResult<'data, MatchedInt> { + pair(opt(char('-')), Self::match_base_10_int_digits) + .map(|(maybe_sign, digits)| { + MatchedInt::new(10, maybe_sign.is_some(), digits.offset() - self.offset()) + }) + .parse(self) + } + + /// Matches the digits of a base-10 integer. (i.e. 
An integer without a sign.) + fn match_base_10_int_digits(self) -> IonMatchResult<'data> { + alt(( + // The number is either a zero... + recognize(char('0')), + // Or it's a non-zero followed by some number of '_'-separated digits + Self::match_base_10_digits_before_dot, + ))(self) + } + + /// Matches either: + /// * a zero + /// * a non-zero followed by some number of digits with optional underscores + fn match_base_10_digits_before_dot(self) -> IonMatchResult<'data> { + alt(( + tag("0"), + recognize(pair( + Self::match_base_10_leading_digit, + Self::match_base_10_trailing_digits, + )), + ))(self) + } + + /// Matches the first digit of a multi-digit base-10 integer. (i.e. Any digit but zero.) + fn match_base_10_leading_digit(self) -> IonMatchResult<'data> { + recognize(one_of("123456789"))(self) + } + + /// Matches any number of digits with underscores optionally appearing in the middle. + /// This parser accepts leading zeros, which is why it cannot be used for the beginning + /// of a number. + fn match_base_10_trailing_digits(self) -> IonMatchResult<'data> { + recognize(many0_count(pair(opt(char('_')), digit1)))(self) + } + + /// Matches a base-10 notation integer (e.g. `0x0`, `0X20`, or `-0xCAFE`) and returns the + /// partially parsed value as a [`MatchedInt`]. 
+ fn match_base_16_int(self) -> IonParseResult<'data, MatchedInt> { + separated_pair( + opt(char('-')), + alt((tag("0x"), tag("0X"))), + Self::match_base_16_int_trailing_digits, + ) + .map(|(maybe_sign, digits)| { + MatchedInt::new(16, maybe_sign.is_some(), digits.offset() - self.offset()) + }) + .parse(self) + } + + /// Matches the digits that follow the '0x' or '0X' in a base-16 integer + fn match_base_16_int_trailing_digits(self) -> IonMatchResult<'data> { + recognize(terminated( + // Zero or more digits-followed-by-underscores + many0_count(pair(Self::take_base_16_digits1, char('_'))), + // One or more digits + Self::take_base_16_digits1, + ))(self) + } + + /// Recognizes 1 or more consecutive base-16 digits. + // This function's "1" suffix is a style borrowed from `nom`. + fn take_base_16_digits1(self) -> IonMatchResult<'data> { + take_while1(|b: u8| b.is_ascii_hexdigit())(self) + } +} + +// === nom trait implementations === +// The trait implementations that follow are necessary for `TextBufferView` to be used as an input +// type in `nom` parsers. (`nom` only supports `&str` and `&[u8]` out of the box.) Defining our own +// input type makes it possible for us to carry around additional context during the parsing process, +// which is important for providing helpful error messages. For example: we can include the absolute +// offset of the input slice currently being read in our error messages. +// +// As `TextBufferView` is just a wrapper around a `&[u8]`, these implementations mostly delegate +// to the existing trait impls for `&[u8]`. 
+ +impl<'data> nom::InputTake for TextBufferView<'data> { + fn take(&self, count: usize) -> Self { + self.slice(0, count) + } + + fn take_split(&self, count: usize) -> (Self, Self) { + let (before, after) = self.data.split_at(count); + let buffer_before = TextBufferView::new_with_offset(before, self.offset()); + let buffer_after = TextBufferView::new_with_offset(after, self.offset() + count); + // Nom's convention is to place the remaining portion of the buffer first, which leads to + // a potentially surprising reversed tuple order. + (buffer_after, buffer_before) + } +} + +impl<'data> nom::InputLength for TextBufferView<'data> { + fn input_len(&self) -> usize { + self.len() + } +} + +impl<'data> nom::InputIter for TextBufferView<'data> { + type Item = u8; + type Iter = Enumerate; + type IterElem = Copied>; + + fn iter_indices(&self) -> Self::Iter { + self.iter_elements().enumerate() + } + + fn iter_elements(&self) -> Self::IterElem { + self.data.iter().copied() + } + + fn position

(&self, predicate: P) -> Option + where + P: Fn(Self::Item) -> bool, + { + self.data.iter().position(|b| predicate(*b)) + } + + fn slice_index(&self, count: usize) -> Result { + self.data.slice_index(count) + } +} + +impl<'a, 'b> nom::Compare<&'a str> for TextBufferView<'b> { + fn compare(&self, t: &'a str) -> CompareResult { + self.data.compare(t.as_bytes()) + } + + fn compare_no_case(&self, t: &'a str) -> CompareResult { + self.data.compare_no_case(t.as_bytes()) + } +} + +impl<'data> nom::Offset for TextBufferView<'data> { + fn offset(&self, second: &Self) -> usize { + self.data.offset(second.data) + } +} + +impl<'data> nom::Slice> for TextBufferView<'data> { + fn slice(&self, range: RangeFrom) -> Self { + self.slice_to_end(range.start) + } +} + +impl<'data> nom::Slice> for TextBufferView<'data> { + fn slice(&self, range: RangeTo) -> Self { + self.slice(0, range.end) + } +} + +impl<'data> nom::InputTakeAtPosition for TextBufferView<'data> { + type Item = u8; + + fn split_at_position>(&self, predicate: P) -> IResult + where + P: Fn(Self::Item) -> bool, + { + match self.data.iter().position(|c| predicate(*c)) { + Some(i) => Ok(self.take_split(i)), + None => Err(nom::Err::Incomplete(Needed::new(1))), + } + } + + fn split_at_position1>( + &self, + predicate: P, + e: ErrorKind, + ) -> IResult + where + P: Fn(Self::Item) -> bool, + { + match self.data.iter().position(|c| predicate(*c)) { + Some(0) => Err(nom::Err::Error(E::from_error_kind(*self, e))), + Some(i) => Ok(self.take_split(i)), + None => Err(nom::Err::Incomplete(Needed::new(1))), + } + } + + fn split_at_position_complete>( + &self, + predicate: P, + ) -> IResult + where + P: Fn(Self::Item) -> bool, + { + match self.data.iter().position(|c| predicate(*c)) { + Some(i) => Ok(self.take_split(i)), + None => Ok(self.take_split(self.input_len())), + } + } + + fn split_at_position1_complete>( + &self, + predicate: P, + e: ErrorKind, + ) -> IResult + where + P: Fn(Self::Item) -> bool, + { + match 
self.data.iter().position(|c| predicate(*c)) { + Some(0) => Err(nom::Err::Error(E::from_error_kind(*self, e))), + Some(i) => Ok(self.take_split(i)), + None => { + if self.is_empty() { + Err(nom::Err::Error(E::from_error_kind(*self, e))) + } else { + Ok(self.take_split(self.input_len())) + } + } + } + } +} + +// === end of `nom` trait implementations + +/// Augments a given parser such that it returns the matched value and the number of input bytes +/// that it matched. +fn match_and_length<'data, P, O>( + mut parser: P, +) -> impl Parser, (O, usize), IonParseError<'data>> +where + P: Parser, O, IonParseError<'data>>, +{ + move |input: TextBufferView<'data>| { + let offset_before = input.offset(); + let (remaining, matched) = match parser.parse(input) { + Ok((remaining, matched)) => (remaining, matched), + Err(e) => return Err(e), + }; + let offset_after = remaining.offset(); + let match_length = offset_after - offset_before; + Ok((remaining, (matched, match_length))) + } +} + +/// Returns the number of bytes that the provided parser matched. +fn match_length<'data, P, O>( + parser: P, +) -> impl Parser, usize, IonParseError<'data>> +where + P: Parser, O, IonParseError<'data>>, +{ + // Call `match_and_length` and discard the output + match_and_length(parser).map(|(_output, match_length)| match_length) +} + +#[cfg(test)] +mod tests { + use super::*; + + /// Stores an input string that can be tested against a given parser. + struct MatchTest { + input: String, + } + + impl MatchTest { + /// Takes an `input` string and appends a trailing space to it, guaranteeing that the + /// contents of the input are considered a complete token. 
+ fn new(input: &str) -> Self { + MatchTest { + input: format!("{input} "), // add trailing space + } + } + + fn try_match<'data, P, O>(&'data self, parser: P) -> IonParseResult<'data, usize> + where + P: Parser, O, IonParseError<'data>>, + { + let buffer = TextBufferView::new(self.input.as_bytes()); + match_length(parser).parse(buffer) + } + + fn expect_match<'data, P, O>(&'data self, parser: P) + where + P: Parser, O, IonParseError<'data>>, + { + let result = self.try_match(parser); + let (_remaining, match_length) = result.unwrap(); + // Inputs have a trailing space that should _not_ be part of the match + assert_eq!( + match_length, + self.input.len() - 1, + "\nInput: '{}'\nMatched: '{}'\n", + self.input, + &self.input[..match_length] + ); + } + + fn expect_mismatch<'data, P, O>(&'data self, parser: P) + where + P: Parser, O, IonParseError<'data>>, + { + let result = self.try_match(parser); + // We expect this to fail for one reason or another + result.unwrap_err(); + } + } + + #[test] + fn test_match_stop_char() { + MatchTest::new(" ").expect_match(match_length(TextBufferView::match_stop_character)); + } + + #[test] + fn test_match_bool() { + fn match_bool(input: &str) { + MatchTest::new(input).expect_match(match_length(TextBufferView::match_bool)); + } + fn mismatch_bool(input: &str) { + MatchTest::new(input).expect_mismatch(match_length(TextBufferView::match_bool)); + } + + match_bool("true"); + match_bool("false"); + + mismatch_bool("True"); + mismatch_bool("TRUE"); + mismatch_bool("False"); + mismatch_bool("FALSE"); + mismatch_bool("potato"); + mismatch_bool("42"); + } + + #[test] + fn test_match_null() { + fn match_null(input: &str) { + MatchTest::new(input).expect_match(match_length(TextBufferView::match_null)); + } + fn mismatch_null(input: &str) { + MatchTest::new(input).expect_mismatch(match_length(TextBufferView::match_null)); + } + let good_inputs = &[ + "null", + "null.null", + "null.bool", + "null.int", + "null.float", + "null.decimal", + 
"null.timestamp", + "null.symbol", + "null.string", + "null.clob", + "null.blob", + "null.list", + "null.sexp", + "null.struct", + ]; + for input in good_inputs { + match_null(input); + } + + let bad_inputs = &[ + "-1", + "null.hello", + "nullnull", + "nullify", + "null..int", + "string.null", + ]; + for input in bad_inputs { + mismatch_null(input); + } + } + + #[test] + fn test_match_int() { + fn match_int(input: &str) { + MatchTest::new(input).expect_match(match_length(TextBufferView::match_int)); + } + fn mismatch_int(input: &str) { + MatchTest::new(input).expect_mismatch(match_length(TextBufferView::match_int)); + } + let good_inputs = &[ + // Base 2 integers + "0b0", + "0B0", + "0b1", + "0B1", + "0b0001", + "0B1111", + "0b1111_1111", + "0B1010_1010", + // Base 10 integers + "0", + "13", + "942", + "7_216", + "1_000_000", + "9_999_999", + // Base 16 integers + "0x0", + "0x20", + "0x0A", + "0xcafe", + "0xCAFE", + "0XcaFE", + "0xC_A_F_E", + "0Xca_FE", + ]; + for input in good_inputs { + match_int(input); + let negative = format!("-{input}"); + match_int(&negative); + } + + let bad_inputs = &[ + "00", // Zero with leading zero + "0123", // Non-zero with leading zero + "--5", // Double negative + "+5", // Explicit positive + "1__000__000", // More than one underscore at a time + "_123", // Leading underscore + "0x0x5", // Multiple 0x prefixes + "0xx5", // Multiple Xs after 0 + "0x", // Base 16 prefix w/no number + "0b", // Base 2 prefix w/no number + ]; + for input in bad_inputs { + mismatch_int(input); + } + } +} diff --git a/src/lazy/text/encoded_value.rs b/src/lazy/text/encoded_value.rs new file mode 100644 index 00000000..e1a3bcc1 --- /dev/null +++ b/src/lazy/text/encoded_value.rs @@ -0,0 +1,207 @@ +use crate::lazy::text::matched::MatchedValue; +use crate::IonType; +use std::ops::Range; + +/// Represents the type, offset, and length metadata of the various components of an encoded value +/// in a text input stream. 
+/// +/// Each [`LazyRawTextValue`](crate::lazy::text::value::LazyRawTextValue) contains an `EncodedValue`, +/// allowing a user to re-read (that is: parse) the body of the value as many times as necessary +/// without re-parsing its header information each time. +#[derive(Clone, Copy, Debug, PartialEq)] +pub(crate) struct EncodedTextValue { + // Each encoded text value has up to three components, appearing in the following order: + // + // [ field_name? | annotations? | data ] + // + // Components shown with a `?` are optional. + + // The following is an example encoding of a struct field with an annotated value-- the only kind + // of Ion value that has both of the optional components--that appears 5 gigabytes into the input + // stream: + // + // ┌─── field_name_offset: 12 + // │ ┌─── annotations_offset: 5 + // │ │ ┌─── data_offset: 5_000_000_012 + // price: USD::55.99, + // └─┬─┘ └─┬─┘└─┬─┘ + // │ │ └─ data_length: 5 + // │ └─ annotations_length: 5 + // └─ field_name_length: 5 + // + // Notice that only `data_offset` is an absolute offset from the beginning of the stream; + // this is because `data` is the only field that is always guaranteed to be present. + // `field_name_offset` and `annotations_offset` are stored as the number of bytes _before_ + // `data_offset`, allowing them to be stored in fewer bytes. + + // The absolute position (in bytes) of this value's `data` component within the overall stream + // being decoded. + data_offset: usize, + // The number of bytes _before_ `data_offset` at which the field name begins. If this value + // does not have a field name, this value will be zero. + field_name_offset: u32, + // The number of bytes _before_ `data_offset` at which the annotations sequence begins. + // If this value does not have an annotations sequence, this value will be zero. + annotations_offset: u32, + + // The number of bytes used to encode the data component of this Ion value. 
+ data_length: usize, + // The number of bytes used to encode the field name preceding the data, if any. + // If there is no field name (i.e. the value is not inside a struct), this will be zero. + // If there is whitespace before the field name, this will not include it. + field_name_length: u32, + // The number of bytes used to encode the annotations sequence preceding the data, if any. + // If there is no annotations sequence, this will be zero. // If there is whitespace before the + // annotations sequence, this will not include it. + annotations_length: u32, + + // Information that was recorded about the value as it was being matched. + // For some types (e.g. bool), matching the text is the complete parsing process so the whole + // value is stored. For others (e.g. a timestamp), the various components of the value are + // recognized during matching and partial information like subfield offsets can be stored here. + matched_value: MatchedValue, +} + +impl EncodedTextValue { + pub(crate) fn new( + matched_value: MatchedValue, + offset: usize, + length: usize, + ) -> EncodedTextValue { + EncodedTextValue { + data_offset: offset, + data_length: length, + field_name_length: 0, + field_name_offset: 0, + annotations_offset: 0, + annotations_length: 0, + matched_value, + } + } + + // The field name range should contain the field name literal itself without any trailing + // whitespace or the delimiting ':'. + // Examples: + // foo + // 'foo' + // "foo" + // $10 + pub(crate) fn with_field_name(mut self, offset: usize, length: usize) -> EncodedTextValue { + self.field_name_offset = (self.data_offset - offset) as u32; + self.field_name_length = length as u32; + self + } + + // The annotations should include all of the symbol tokens, their delimiting '::'s, and any + // interstitial whitespace. It should not include any leading/trailing whitespace or the value + // itself. 
+ // Examples: + // foo::bar:: + // 'foo'::'bar':: + // foo :: 'bar' :: + pub(crate) fn with_annotations_sequence( + mut self, + offset: usize, + length: usize, + ) -> EncodedTextValue { + self.annotations_offset = (self.data_offset - offset) as u32; + self.annotations_length = length as u32; + self + } + + pub fn ion_type(&self) -> IonType { + match self.matched_value { + MatchedValue::Null(ion_type) => ion_type, + MatchedValue::Bool(_) => IonType::Bool, + MatchedValue::Int(_) => IonType::Int, + } + } + + pub fn is_null(&self) -> bool { + matches!(self.matched_value, MatchedValue::Null(_)) + } + + pub fn data_length(&self) -> usize { + self.data_length + } + + pub fn data_range(&self) -> Range { + self.data_offset..(self.data_offset + self.data_length) + } + + pub fn field_name_range(&self) -> Option> { + if self.field_name_offset == 0 { + return None; + } + let start = self.data_offset - (self.field_name_offset as usize); + let end = start + (self.field_name_length as usize); + Some(start..end) + } + + pub fn annotations_range(&self) -> Option> { + if self.annotations_offset == 0 { + return None; + } + let start = self.data_offset - (self.annotations_offset as usize); + let end = start + (self.annotations_length as usize); + Some(start..end) + } + + pub fn has_field_name(&self) -> bool { + self.field_name_offset > 0 + } + + pub fn has_annotations(&self) -> bool { + self.annotations_offset > 0 + } + + /// Returns the total number of bytes used to represent the current value, including the + /// field ID (if any), its annotations (if any), its header (type descriptor + length bytes), + /// and its value. 
+ pub fn total_length(&self) -> usize { + self.data_length + u32::max(self.annotations_offset, self.field_name_offset) as usize + } + + pub fn matched(&self) -> MatchedValue { + self.matched_value + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn total_length_data_only() { + let value = EncodedTextValue::new(MatchedValue::Null(IonType::Null), 100, 12); + assert_eq!(value.total_length(), 12); + } + + #[test] + fn total_length_data_with_field_name() { + let value = EncodedTextValue::new(MatchedValue::Null(IonType::Null), 100, 12) + .with_field_name(90, 4); + assert_eq!(value.total_length(), 22); + } + + #[test] + fn total_length_data_with_annotations() { + let value = EncodedTextValue::new(MatchedValue::Null(IonType::Null), 100, 12) + .with_annotations_sequence(90, 4); + assert_eq!(value.total_length(), 22); + } + + #[test] + fn total_length_data_with_field_name_and_annotations() { + let value = EncodedTextValue::new(MatchedValue::Null(IonType::Null), 100, 12) + .with_field_name(90, 4) + .with_annotations_sequence(94, 6); + assert_eq!(value.total_length(), 22); + + // Same test but with extra whitespace between the components + let value = EncodedTextValue::new(MatchedValue::Null(IonType::Null), 100, 12) + .with_field_name(80, 4) + .with_annotations_sequence(91, 6); + assert_eq!(value.total_length(), 32, "{:?}", value); + } +} diff --git a/src/lazy/text/matched.rs b/src/lazy/text/matched.rs new file mode 100644 index 00000000..3f846a38 --- /dev/null +++ b/src/lazy/text/matched.rs @@ -0,0 +1,108 @@ +//! Types in this module represent partially parsed values from the text Ion input stream. +//! +//! Ion readers are not necessarily interested in every value in the input. While the binary reader +//! is able to skip over uninteresting values using their length prefix, text readers must parse +//! every value in the stream in order to access the ones that follow. +//! +//! 
A somewhat naive implementation of a text reader might fully read each value in the input +//! stream eagerly, simply discarding values that the user doesn't request. This approach is +//! technically correct, but incurs the expense of validating and materializing data that will +//! ultimately be ignored. (As an example: consider a timestamp, which can have up to ~9 subfields +//! to check for syntactic and semantic correctness.) +//! +//! In contrast, when the lazy text reader is asked for the `next()` value in the stream, it uses its +//! Ion parser to identify the next slice of input that contains either a complete scalar value or +//! the beginning of a container. It stores an intermediate representation (IR) of that value using +//! one of the types defined in this module. The IR stores the value's Ion type, subfield offsets, +//! and other information that is identified in the process of parsing the next value. Later, if the +//! application asks to `read()` the value, the reader does not have to start from scratch. It can +//! use the previously recorded information to minimize the amount of information that needs to be +//! re-discovered. + +use crate::lazy::text::as_utf8::AsUtf8; +use crate::lazy::text::buffer::TextBufferView; +use crate::result::IonFailure; +use crate::{Int, IonResult, IonType}; +use num_bigint::BigInt; +use num_traits::Num; +use smallvec::SmallVec; +use std::num::IntErrorKind; + +/// A partially parsed Ion value. +#[derive(Copy, Clone, Debug, PartialEq)] +pub(crate) enum MatchedValue { + Null(IonType), + Bool(bool), + Int(MatchedInt), + // TODO: ...the other types +} + +/// A partially parsed Ion int. +#[derive(Copy, Clone, Debug, PartialEq)] +pub(crate) struct MatchedInt { + radix: u32, + digits_offset: usize, + is_negative: bool, +} + +impl MatchedInt { + // Integers that take more than 32 bytes to represent will heap allocate a larger buffer. + const STACK_ALLOC_BUFFER_CAPACITY: usize = 32; + + /// Constructs a new `MatchedInt`. 
+ pub fn new(radix: u32, is_negative: bool, digits_offset: usize) -> Self { + Self { + radix, + digits_offset, + is_negative, + } + } + + /// Whether the partially parsed int began with a `-` + pub fn is_negative(&self) -> bool { + self.is_negative + } + + /// One of: `2`, `10`, or `16`, as determined by whether the partially parsed integer began + /// with a `0b`/`0B`, `0x`/`0X`, or no prefix. + pub fn radix(&self) -> u32 { + self.radix + } + + /// Attempts to finish reading the partially parsed integer. + pub fn read(&self, matched_input: TextBufferView) -> IonResult { + let digits = matched_input.slice_to_end(self.digits_offset); + let mut sanitized: SmallVec<[u8; Self::STACK_ALLOC_BUFFER_CAPACITY]> = + SmallVec::with_capacity(Self::STACK_ALLOC_BUFFER_CAPACITY); + // Copy the input text over to the sanitization buffer, discarding any underscores. These + // are legal input, but Rust's integer `from_str_radix` method does not support them. + sanitized.extend(digits.bytes().iter().copied().filter(|b| *b != b'_')); + // Note: This UTF-8 validation step should be unnecessary as the parser only recognizes + // ASCII integer characters. If this shows up in profiling, we could consider skipping it. + let text = sanitized.as_utf8(matched_input.offset())?; + let int: Int = match i64::from_str_radix(text, self.radix()) { + Ok(i) => i.into(), + Err(parse_int_error) => { + debug_assert!( + // `from_str_radix` can fail for a variety of reasons, but our rules for matching an + // int rule out most of them (empty str, invalid digit, etc). The only ones that should + // happen are overflow and underflow. In those cases, we fall back to using `BigInt`. 
+ parse_int_error.kind() == &IntErrorKind::NegOverflow + || parse_int_error.kind() == &IntErrorKind::PosOverflow + ); + + match BigInt::from_str_radix(text, self.radix()) { + Ok(big_int) => big_int.into(), + Err(_big_parse_int_error) => { + return IonResult::decoding_error(format!( + "unexpected error while parsing int: '{}'", + std::str::from_utf8(matched_input.bytes()).unwrap_or("invalid UTF-8") + )) + } + } + } + }; + + Ok(int) + } +} diff --git a/src/lazy/text/mod.rs b/src/lazy/text/mod.rs new file mode 100644 index 00000000..a9a2cea2 --- /dev/null +++ b/src/lazy/text/mod.rs @@ -0,0 +1,7 @@ +mod as_utf8; +pub mod buffer; +pub mod encoded_value; +pub mod matched; +pub mod parse_result; +pub mod raw; +pub mod value; diff --git a/src/lazy/text/parse_result.rs b/src/lazy/text/parse_result.rs new file mode 100644 index 00000000..7da90511 --- /dev/null +++ b/src/lazy/text/parse_result.rs @@ -0,0 +1,274 @@ +//! The [`nom` parser combinator crate](https://docs.rs/nom/latest/nom/) intentionally provides +//! bare-bones error reporting by default. Each error contains only a `&str` representing the input +//! that could not be matched and an [`ErrorKind`] enum variant indicating which `nom` parser produced +//! the error. This stack-allocated type is very cheap to create, which is important because a +//! typical parse will require creating large numbers of short-lived error values. +//! +//! This module defines `IonParseError`, a custom error type that can capture more information than is +//! supported by [`nom::error::Error`]. It also defines `IonParseResult`, a type alias for an +//! [`IResult`] that parses `TextBufferView`s and produces `IonParseError`s if something goes wrong. 
+ +use crate::lazy::text::buffer::TextBufferView; +use crate::position::Position; +use crate::result::{DecodingError, IonFailure}; +use crate::{IonError, IonResult}; +use nom::error::{Error as NomError, ErrorKind, ParseError}; +use nom::{Err, IResult}; +use std::borrow::Cow; +use std::fmt::{Debug, Display}; + +/// A type alias for a [`IResult`] whose input is a `TextBufferView` and whose error type is an +/// [`InvalidInputError`]. All of the Ion parsers in the `text::parsers` module return an +/// [`IonParseResult`]. +/// +/// If the parser is successful, it will return `Ok(output_value)`. If it encounters a problem, +/// it will return a `nom::Err`. [nom::Err] is a generic enum with three possible +/// variants: +/// 1. `Incomplete(_)` indicates that there wasn't enough input data to determine whether the +/// parser should match or not. +/// 2. `Error(ion_parse_error)` indicates that the parser did not match the input text. +/// 3. `Failure(ion_parse_error)` indicates that the parser matched the text but encountered +/// a problem when trying to materialize it into the `output_value`. In such cases, returning a +/// `Failure` signals that this was the correct parser to handle the input but it could not +/// be processed successfully for some reason. For example, a parser trying to match a number of +/// hours and minutes might match the text `11:71`, but fail when it tries to turn `71` into a +/// number of minutes because it's `>=60`. We know this was the right parser, but it wasn't +/// able to process it. (This is slightly contrived; it would be possible to write a parser +/// that rejected `71` as a number of minutes based on syntax alone.) +pub(crate) type IonParseResult<'a, O> = IResult, O, IonParseError<'a>>; +// Functions that return IonParseResult parse TextBufferView-^ ^ ^ +// ...return a value of type `O` -----+ | +// ...or a nom::Err if something goes wrong ----+ + +/// As above, but for parsers that simply identify (i.e. 
'match') a slice of the input as a +/// particular item. +pub(crate) type IonMatchResult<'a> = + IResult, TextBufferView<'a>, IonParseError<'a>>; + +#[derive(Debug, PartialEq)] +pub enum IonParseError<'data> { + // When nom reports that the data was incomplete, it doesn't provide additional context. + Incomplete, + // When we encounter illegal text Ion, we'll have more information to share with the user. + Invalid(InvalidInputError<'data>), +} + +/// Describes a problem that occurred while trying to parse a given input `TextBufferView`. +/// +/// When returned as part of an `IonParseResult`, an `IonParseError` is always wrapped in +/// a [nom::Err] (see `IonParseResult`'s documentation for details). If the `nom::Err` is +/// a non-fatal `Error`, the `IonParseError`'s `description` will be `None`. If the `nom::Err` is +/// a fatal `Failure`, the `description` will be `Some(String)`. In this way, using an +/// `IonParseError` only incurs heap allocation costs when parsing is coming to an end. +#[derive(Debug, PartialEq)] +pub struct InvalidInputError<'data> { + // The input that was being parsed when the error arose + input: TextBufferView<'data>, + // A human-friendly name for what the parser was working on when the error occurred + label: Option>, + // The nature of the error--what went wrong? + description: Option>, + // A backtrace of errors that occurred leading to this one. + // XXX: This is the most expensive part of error handling and is likely not very useful. + // Consider removing it if it doesn't carry its weight. + backtrace: Vec>, + // The nom ErrorKind, which indicates which nom-provided parser encountered the error we're + // bubbling up. + nom_error_kind: Option, +} + +impl<'data> InvalidInputError<'data> { + /// Constructs a new `InvalidInputError` from the provided `input` text. 
+ pub(crate) fn new(input: TextBufferView<'data>) -> Self { + InvalidInputError { + input, + label: None, + description: None, + nom_error_kind: None, + backtrace: Vec::new(), + } + } + + /// Adds the provided `label`, a human-friendly name for what was being parsed. + pub(crate) fn with_label>>(mut self, label: D) -> Self { + self.label = Some(label.into()); + self + } + + /// Adds the provided `description` of what went wrong. + pub(crate) fn with_description>>(mut self, description: D) -> Self { + self.description = Some(description.into()); + self + } + + /// Adds the provided nom `ErrorKind`, identifying the nom parser that reported the error. + pub(crate) fn with_nom_error_kind(mut self, nom_error_kind: ErrorKind) -> Self { + self.nom_error_kind = Some(nom_error_kind); + self + } + + pub(crate) fn append_error(&mut self, error: InvalidInputError<'data>) { + self.backtrace.push(error) + } + + /// Returns a reference to the `description` text, if any. + pub fn description(&self) -> Option<&str> { + self.description.as_deref() + } + + pub fn label(&self) -> Option<&str> { + self.label.as_deref() + } + + // TODO: Decide how to expose 'input'. 
+} + +// impl<'data> From> for IonError { +// fn from(value: InvalidInputError) -> Self { +// dbg!(&value.backtrace); +// let mut message = String::from(value.description().unwrap_or("invalid text Ion syntax")); +// if let Some(label) = value.label { +// message.push_str(" while "); +// message.push_str(label.as_ref()); +// } +// let position = Position::with_offset(value.input.offset()).with_length(value.input.len()); +// let decoding_error = DecodingError::new(message).with_position(position); +// IonError::Decoding(decoding_error) +// } +// } + +impl<'data> From> for IonParseError<'data> { + fn from(value: InvalidInputError<'data>) -> Self { + IonParseError::Invalid(value) + } +} + +impl<'data> From>> for IonParseError<'data> { + fn from(value: Err>) -> Self { + match value { + Err::Incomplete(_) => IonParseError::Incomplete, + Err::Error(e) => e, + Err::Failure(e) => e, + } + } +} + +/// Allows an `IonParseError` to be constructed from a `(&str, ErrorKind)` tuple, which is the +/// data provided by core `nom` parsers if they do not match the input. +impl<'data> From<(TextBufferView<'data>, ErrorKind)> for IonParseError<'data> { + fn from((input, error_kind): (TextBufferView<'data>, ErrorKind)) -> Self { + InvalidInputError::new(input) + .with_nom_error_kind(error_kind) + .into() + } +} + +/// Allows a [nom::error::Error] to be converted into an [IonParseError] by calling `.into()`. +impl<'data> From>> for IonParseError<'data> { + fn from(nom_error: NomError>) -> Self { + InvalidInputError::new(nom_error.input) + .with_nom_error_kind(nom_error.code) + .into() + } +} + +/// Allows `IonParseError` to be used as the error type in various `nom` functions. 
+impl<'data> ParseError> for IonParseError<'data> { + fn from_error_kind(input: TextBufferView<'data>, error_kind: ErrorKind) -> Self { + InvalidInputError::new(input) + .with_nom_error_kind(error_kind) + .into() + } + + fn append(input: TextBufferView<'data>, kind: ErrorKind, mut other: Self) -> Self { + // When an error stack is being built, this method is called to give the error + // type an opportunity to aggregate the errors into a collection or a more descriptive + // message. For now, we simply allow the most recent error to take precedence. + let new_error = InvalidInputError::new(input).with_nom_error_kind(kind); + if let IonParseError::Invalid(invalid_input_error) = &mut other { + invalid_input_error.backtrace.push(new_error) + } + other + } +} + +pub(crate) trait AddContext<'data, T> { + fn with_context( + self, + label: impl Into>, + input: TextBufferView<'data>, + ) -> IonResult<(TextBufferView<'data>, T)>; +} + +impl<'data, T> AddContext<'data, T> for IonParseResult<'data, T> { + fn with_context( + self, + label: impl Into>, + input: TextBufferView<'data>, + ) -> IonResult<(TextBufferView<'data>, T)> { + match self { + // No change needed in the ok case + Ok(matched) => Ok(matched), + // If the error was an incomplete + Err(e) => { + // Nom error to IonParseError + match IonParseError::from(e) { + IonParseError::Incomplete => IonResult::incomplete(label, input.offset()), + IonParseError::Invalid(invalid_input_error) => { + dbg!(&invalid_input_error.backtrace); + let mut message = String::from( + invalid_input_error + .description() + .unwrap_or("invalid text Ion syntax"), + ); + if let Some(label) = invalid_input_error.label { + message.push_str(" while "); + message.push_str(label.as_ref()); + } + let position = Position::with_offset(invalid_input_error.input.offset()) + .with_length(invalid_input_error.input.len()); + let decoding_error = DecodingError::new(message).with_position(position); + Err(IonError::Decoding(decoding_error)) + } + } + } + } 
+ } +} + +/// Constructs a `nom::Err::Failure` that contains an `IonParseError` describing the problem +/// that was encountered. +pub(crate) fn fatal_parse_error>, O>( + input: TextBufferView, + description: D, +) -> IonParseResult { + Err(nom::Err::Failure( + InvalidInputError::new(input) + .with_description(description) + .into(), + )) +} + +/// An extension trait that allows a [std::result::Result] of any kind to be mapped to an +/// `IonParseResult` concisely. +pub(crate) trait OrFatalParseError { + fn or_fatal_parse_error(self, input: TextBufferView, label: L) + -> IonParseResult; +} + +/// See the documentation for [OrFatalParseError]. +impl OrFatalParseError for Result +where + E: Debug, +{ + fn or_fatal_parse_error( + self, + input: TextBufferView, + label: L, + ) -> IonParseResult { + match self { + Ok(value) => Ok((input, value)), + Err(error) => fatal_parse_error(input, format!("{label}: {error:?}")), + } + } +} diff --git a/src/lazy/text/raw/mod.rs b/src/lazy/text/raw/mod.rs new file mode 100644 index 00000000..1077754f --- /dev/null +++ b/src/lazy/text/raw/mod.rs @@ -0,0 +1 @@ +pub mod reader; diff --git a/src/lazy/text/raw/reader.rs b/src/lazy/text/raw/reader.rs new file mode 100644 index 00000000..dfc9f863 --- /dev/null +++ b/src/lazy/text/raw/reader.rs @@ -0,0 +1,189 @@ +use crate::lazy::decoder::LazyRawReader; +use crate::lazy::encoding::TextEncoding; +use crate::lazy::raw_stream_item::RawStreamItem; +use crate::lazy::text::buffer::TextBufferView; +use crate::lazy::text::parse_result::AddContext; +use crate::lazy::text::value::LazyRawTextValue; +use crate::result::IonFailure; +use crate::IonResult; + +/// Wraps a [`TextBufferView`], allowing the reader to advance each time an item is successfully +/// parsed from it. 
+pub(crate) struct DataSource<'data> { + // The buffer we're reading from + buffer: TextBufferView<'data>, + // Each time something is parsed from the buffer successfully, the caller will mark the number + // of bytes that may be skipped the next time `advance_to_next_item` is called. + bytes_to_skip: usize, +} + +impl<'data> DataSource<'data> { + pub(crate) fn new(buffer: TextBufferView<'data>) -> DataSource<'data> { + DataSource { + buffer, + bytes_to_skip: 0, + } + } + + pub(crate) fn buffer(&self) -> TextBufferView<'data> { + self.buffer + } + + fn advance_to_next_item(&mut self) -> IonResult> { + if self.buffer.len() < self.bytes_to_skip { + return IonResult::incomplete( + "cannot advance to next item, insufficient data in buffer", + self.buffer.offset(), + ); + } + + if self.bytes_to_skip > 0 { + Ok(self.buffer.consume(self.bytes_to_skip)) + } else { + Ok(self.buffer) + } + } + + /// Runs the provided parsing function on this DataSource's buffer. + /// If it succeeds, marks the `DataSource` as ready to advance by the 'n' bytes + /// that were consumed and returns `Some(value)`. + /// If it does not succeed, the `DataSource` remains unchanged. + pub(crate) fn try_parse_next< + F: Fn(TextBufferView<'data>) -> IonResult>>, + >( + &mut self, + parser: F, + ) -> IonResult>> { + let buffer_after = self.advance_to_next_item()?; + + let lazy_value = match parser(buffer_after) { + Ok(Some(output)) => output, + Ok(None) => return Ok(None), + Err(e) => return Err(e), + }; + + self.buffer = buffer_after; + self.bytes_to_skip = lazy_value.encoded_value.total_length(); + Ok(Some(lazy_value)) + } +} + +/// A text Ion 1.0 reader that yields [`LazyRawTextValue`]s representing the top level values found +/// in the provided input stream. +pub struct LazyRawTextReader<'data> { + data: DataSource<'data>, +} + +impl<'data> LazyRawTextReader<'data> { + /// Constructs a `LazyRawTextReader` positioned at the beginning of the provided input stream. 
+ pub fn new(data: &'data [u8]) -> LazyRawTextReader<'data> { + Self::new_with_offset(data, 0) + } + + /// Constructs a `LazyRawTextReader` positioned at the beginning of the provided input stream. + /// The provided input stream is itself a slice starting `offset` bytes from the beginning + /// of a larger data stream. This offset is used for reporting the absolute (stream-level) + /// position of values encountered in `data`. + fn new_with_offset(data: &'data [u8], offset: usize) -> LazyRawTextReader<'data> { + let data = DataSource::new(TextBufferView::new_with_offset(data, offset)); + LazyRawTextReader { data } + } + + pub fn next<'top>(&'top mut self) -> IonResult> + where + 'data: 'top, + { + let buffer = self.data.buffer; + if buffer.is_empty() { + return IonResult::incomplete("reading a top-level value", buffer.offset()); + } + let (buffer_after_whitespace, _whitespace) = buffer + .match_optional_whitespace() + .with_context("skipping whitespace between top-level values", buffer)?; + let (remaining, matched) = buffer_after_whitespace + .read_top_level() + .with_context("reading a top-level value", buffer_after_whitespace)?; + // If we successfully moved to the next value, store the remaining buffer view + self.data.buffer = remaining; + Ok(matched) + } +} + +impl<'data> LazyRawReader<'data, TextEncoding> for LazyRawTextReader<'data> { + fn new(data: &'data [u8]) -> Self { + LazyRawTextReader::new(data) + } + + fn next<'a>(&'a mut self) -> IonResult> { + self.next() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::lazy::decoder::LazyRawValue; + use crate::IonType; + + #[test] + fn test_top_level() -> IonResult<()> { + let data = r#" + null + null.bool + null.int + false + true + 500 + 0x20 + 0b0101 + "#; + let mut reader = LazyRawTextReader::new(data.as_bytes()); + + // null + let lazy_untyped_null = reader.next()?.expect_value()?; + assert!(lazy_untyped_null.is_null()); + assert_eq!(lazy_untyped_null.ion_type(), IonType::Null); + + // 
null.bool + let lazy_null_bool = reader.next()?.expect_value()?; + assert!(lazy_null_bool.is_null()); + assert_eq!(lazy_null_bool.ion_type(), IonType::Bool); + + // null.int + let lazy_null_int = reader.next()?.expect_value()?; + assert!(lazy_null_int.is_null()); + assert_eq!(lazy_null_int.ion_type(), IonType::Int); + + // false + let lazy_bool_false = reader.next()?.expect_value()?; + assert!(!lazy_bool_false.is_null()); + assert_eq!(lazy_bool_false.ion_type(), IonType::Bool); + assert!(!lazy_bool_false.read()?.expect_bool()?); + + // true + let lazy_bool_true = reader.next()?.expect_value()?; + assert!(!lazy_bool_true.is_null()); + assert_eq!(lazy_bool_true.ion_type(), IonType::Bool); + assert!(lazy_bool_true.read()?.expect_bool()?); + + // 500 + let lazy_int_decimal_500 = reader.next()?.expect_value()?; + assert!(!lazy_int_decimal_500.is_null()); + assert_eq!(lazy_int_decimal_500.ion_type(), IonType::Int); + assert_eq!(lazy_int_decimal_500.read()?.expect_i64()?, 500); + + // 0x20 + let lazy_int_hex_20 = reader.next()?.expect_value()?; + assert!(!lazy_int_hex_20.is_null()); + assert_eq!(lazy_int_hex_20.ion_type(), IonType::Int); + assert_eq!(lazy_int_hex_20.read()?.expect_i64()?, 0x20); // decimal 32 + + // 0b0101 + let lazy_int_binary_0101 = reader.next()?.expect_value()?; + assert!(!lazy_int_binary_0101.is_null()); + assert_eq!(lazy_int_binary_0101.ion_type(), IonType::Int); + assert_eq!(lazy_int_binary_0101.read()?.expect_i64()?, 0b0101); // decimal 5 + + Ok(()) + } +} diff --git a/src/lazy/text/value.rs b/src/lazy/text/value.rs new file mode 100644 index 00000000..e586f677 --- /dev/null +++ b/src/lazy/text/value.rs @@ -0,0 +1,66 @@ +use crate::lazy::decoder::private::LazyRawValuePrivate; +use crate::lazy::decoder::{LazyDecoder, LazyRawValue}; +use crate::lazy::encoding::TextEncoding; +use crate::lazy::raw_value_ref::RawValueRef; +use crate::lazy::text::buffer::TextBufferView; +use crate::lazy::text::encoded_value::EncodedTextValue; +use 
crate::lazy::text::matched::MatchedValue; +use crate::{IonResult, IonType, RawSymbolTokenRef}; +use std::fmt; +use std::fmt::{Debug, Formatter}; + +/// A value that has been identified in the text input stream but whose data has not yet been read. +/// +/// If only part of the value is in the input buffer, calls to [`LazyRawTextValue::read`] (which examines +/// bytes beyond the value's header) may return [`IonError::Incomplete`](crate::result::IonError::Incomplete). +/// +/// `LazyRawTextValue`s are "unresolved," which is to say that symbol values, annotations, and +/// struct field names may or may not include a text definition. (This is less common in Ion's text +/// format than in its binary format, but is still possible.) For a resolved lazy value that +/// includes a text definition for these items whenever one exists, see +/// [`crate::lazy::value::LazyValue`]. +#[derive(Clone)] +pub struct LazyRawTextValue<'data> { + pub(crate) encoded_value: EncodedTextValue, + pub(crate) input: TextBufferView<'data>, +} + +impl<'data> LazyRawValuePrivate<'data> for LazyRawTextValue<'data> { + fn field_name(&self) -> Option> { + todo!() + } +} + +impl<'data> LazyRawValue<'data, TextEncoding> for LazyRawTextValue<'data> { + fn ion_type(&self) -> IonType { + self.encoded_value.ion_type() + } + + fn is_null(&self) -> bool { + self.encoded_value.is_null() + } + + fn annotations(&self) -> >::AnnotationsIterator { + todo!() + } + + fn read(&self) -> IonResult> { + let matched_input = self.input.slice(0, self.encoded_value.data_length()); + let value_ref = match self.encoded_value.matched() { + MatchedValue::Null(ion_type) => RawValueRef::Null(ion_type), + MatchedValue::Bool(b) => RawValueRef::Bool(b), + MatchedValue::Int(i) => RawValueRef::Int(i.read(matched_input)?), + }; + Ok(value_ref) + } +} + +impl<'a> Debug for LazyRawTextValue<'a> { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!( + f, + "LazyRawTextValue {{\n val={:?},\n buf={:?}\n}}\n", + 
self.encoded_value, self.input + ) + } +} diff --git a/src/lazy/value.rs b/src/lazy/value.rs index d7ae1b64..8f09cdbf 100644 --- a/src/lazy/value.rs +++ b/src/lazy/value.rs @@ -1,5 +1,5 @@ -use crate::lazy::binary::encoding::BinaryEncoding; use crate::lazy::decoder::{LazyDecoder, LazyRawValue}; +use crate::lazy::encoding::BinaryEncoding; use crate::lazy::r#struct::LazyStruct; use crate::lazy::sequence::LazySequence; use crate::lazy::value_ref::ValueRef; diff --git a/src/position.rs b/src/position.rs index bb5d9648..413d82db 100644 --- a/src/position.rs +++ b/src/position.rs @@ -7,6 +7,7 @@ use std::fmt::{Display, Error}; #[derive(Clone, Debug, PartialEq, Eq)] pub struct Position { pub(crate) byte_offset: usize, + pub(crate) byte_length: Option, pub(crate) line_column: Option<(usize, usize)>, } @@ -16,16 +17,20 @@ impl Position { pub fn with_offset(offset: usize) -> Self { Position { byte_offset: offset, + byte_length: None, line_column: None, } } + pub fn with_length(mut self, length: usize) -> Self { + self.byte_length = Some(length); + self + } + /// Add line and column information to the current Position. - pub fn with_line_and_column(&self, line: usize, column: usize) -> Self { - Position { - line_column: Some((line, column)), - ..*self - } + pub fn with_line_and_column(mut self, line: usize, column: usize) -> Self { + self.line_column = Some((line, column)); + self } /// Returns the offset from the start of the Ion stream in bytes. @@ -33,17 +38,22 @@ impl Position { self.byte_offset } - /// If available returns the text position as line and column offsets. + /// If available, returns the length of the input slice in question. + pub fn byte_length(&self) -> Option { + self.byte_length + } + + /// If available, returns the text position as line and column offsets. pub fn line_and_column(&self) -> Option<(usize, usize)> { self.line_column } - /// If available returns the line component of the text position. 
+ /// If available, returns the line component of the text position. pub fn line(&self) -> Option { self.line_column.map(|(line, _column)| line) } - /// If available returns the column component of the text position. + /// If available, returns the column component of the text position. pub fn column(&self) -> Option { self.line_column.map(|(_line, column)| column) } diff --git a/src/result/decoding_error.rs b/src/result/decoding_error.rs index e2fb39af..ade5d501 100644 --- a/src/result/decoding_error.rs +++ b/src/result/decoding_error.rs @@ -1,3 +1,4 @@ +use crate::position::Position; use std::borrow::Cow; use thiserror::Error; @@ -6,12 +7,23 @@ use thiserror::Error; #[error("{description}")] pub struct DecodingError { description: Cow<'static, str>, + position: Option, } impl DecodingError { pub(crate) fn new(description: impl Into>) -> Self { DecodingError { description: description.into(), + position: None, } } + + pub(crate) fn with_position(mut self, position: impl Into) -> Self { + self.position = Some(position.into()); + self + } + + pub fn position(&self) -> Option<&Position> { + self.position.as_ref() + } } diff --git a/src/result/incomplete.rs b/src/result/incomplete.rs index c47c721d..896b9a16 100644 --- a/src/result/incomplete.rs +++ b/src/result/incomplete.rs @@ -1,4 +1,5 @@ use crate::position::Position; +use std::borrow::Cow; use thiserror::Error; /// For non-blocking readers, indicates that there was not enough data available in the input buffer @@ -6,14 +7,14 @@ use thiserror::Error; #[derive(Clone, Debug, Error, PartialEq)] #[error("ran out of input while reading {label} at offset {position}")] pub struct IncompleteError { - label: &'static str, + label: Cow<'static, str>, position: Position, } impl IncompleteError { - pub(crate) fn new(label: &'static str, position: impl Into) -> Self { + pub(crate) fn new(label: impl Into>, position: impl Into) -> Self { IncompleteError { - label, + label: label.into(), position: position.into(), } } diff --git 
a/src/result/mod.rs b/src/result/mod.rs index 8b47a476..157184ac 100644 --- a/src/result/mod.rs +++ b/src/result/mod.rs @@ -82,14 +82,14 @@ pub(crate) trait IonFailure { // an `IonError::Io` is by converting a `std::io::IoError` with the ? operator. // Because this trait is only crate-visible, methods can be added/changed as needed in // the future. - fn incomplete(label: &'static str, position: impl Into) -> Self; + fn incomplete(label: impl Into>, position: impl Into) -> Self; fn decoding_error>>(description: S) -> Self; fn encoding_error>>(description: S) -> Self; fn illegal_operation>>(operation: S) -> Self; } impl IonFailure for IonError { - fn incomplete(label: &'static str, position: impl Into) -> Self { + fn incomplete(label: impl Into>, position: impl Into) -> Self { IncompleteError::new(label, position).into() } @@ -107,7 +107,7 @@ impl IonFailure for IonError { } impl IonFailure for IonResult { - fn incomplete(label: &'static str, position: impl Into) -> Self { + fn incomplete(label: impl Into>, position: impl Into) -> Self { Err(IonError::incomplete(label, position)) } From 89f79aa1b67820c1ef364a111ceca36f8b498edd Mon Sep 17 00:00:00 2001 From: Zack Slayton Date: Tue, 25 Jul 2023 08:36:00 -0400 Subject: [PATCH 02/15] Consolidate impls of AsUtf8 w/helper fn --- src/lazy/text/as_utf8.rs | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/lazy/text/as_utf8.rs b/src/lazy/text/as_utf8.rs index 0d1e211c..9be4784c 100644 --- a/src/lazy/text/as_utf8.rs +++ b/src/lazy/text/as_utf8.rs @@ -14,20 +14,20 @@ pub(crate) trait AsUtf8 { impl AsUtf8 for SmallVec<[u8; N]> { fn as_utf8(&self, position: impl Into) -> IonResult<&str> { - std::str::from_utf8(self.as_ref()).map_err(|_| { - let decoding_error = - DecodingError::new("encountered invalid UTF-8").with_position(position); - IonError::Decoding(decoding_error) - }) + bytes_as_utf8(self.as_ref(), position) } } impl<'data> AsUtf8 for TextBufferView<'data> { fn as_utf8(&self, 
position: impl Into) -> IonResult<&str> { - std::str::from_utf8(self.bytes()).map_err(|_| { - let decoding_error = - DecodingError::new("encountered invalid UTF-8").with_position(position); - IonError::Decoding(decoding_error) - }) + bytes_as_utf8(self.bytes(), position) } } + +fn bytes_as_utf8(bytes: &[u8], position: impl Into) -> IonResult<&str> { + std::str::from_utf8(bytes).map_err(|_| { + let decoding_error = + DecodingError::new("encountered invalid UTF-8").with_position(position); + IonError::Decoding(decoding_error) + }) +} From 840be4d62783ebb754314dee018e655ca7570d73 Mon Sep 17 00:00:00 2001 From: Zack Slayton Date: Tue, 25 Jul 2023 10:05:27 -0400 Subject: [PATCH 03/15] Improved TextBufferView docs, removed DataSource --- src/lazy/text/buffer.rs | 70 ++++++++++++++++---------------- src/lazy/text/matched.rs | 1 + src/lazy/text/raw/reader.rs | 80 ++++++------------------------------- 3 files changed, 49 insertions(+), 102 deletions(-) diff --git a/src/lazy/text/buffer.rs b/src/lazy/text/buffer.rs index 54ecf4f6..c12ec11c 100644 --- a/src/lazy/text/buffer.rs +++ b/src/lazy/text/buffer.rs @@ -58,11 +58,10 @@ const WHITESPACE_CHARACTERS_AS_STR: &str = " \t\r\n\x09\x0B\x0C"; /// A slice of unsigned bytes that can be cheaply copied and which defines methods for parsing /// the various encoding elements of a text Ion stream. /// -/// Upon success, each parsing method on the `TextBufferView` will return the value that was read -/// and a new copy of the `TextBufferView` that starts _after_ the bytes that were parsed. -/// -/// Methods that begin with `match_` return the input slice that they matched OR a `MatchedValue` -/// that retains additional information found during the matching process. +/// Parsing methods have names that begin with `match_` and each return a `(match, remaining_input)` +/// pair. 
The `match` may be either the slice of the input that was matched (represented as another +/// `TextBufferView`) or a `MatchedValue` that retains information discovered during parsing that +/// will be useful if the match is later fully materialized into a value. #[derive(PartialEq, Clone, Copy)] pub(crate) struct TextBufferView<'a> { // `data` is a slice of remaining data in the larger input stream. @@ -79,17 +78,21 @@ pub(crate) struct TextBufferView<'a> { pub(crate) type ParseResult<'a, T> = IonResult<(T, TextBufferView<'a>)>; impl<'data> TextBufferView<'data> { - /// Constructs a new `TextBufferView` that wraps `data`. + /// Constructs a new `TextBufferView` that wraps `data`, setting the view's `offset` to zero. #[inline] pub fn new(data: &[u8]) -> TextBufferView { Self::new_with_offset(data, 0) } + /// Constructs a new `TextBufferView` that wraps `data`, setting the view's `offset` to the + /// specified value. This is useful when `data` is a slice from the middle of a larger stream. + /// Note that `offset` is the index of the larger stream at which `data` begins and not an + /// offset _into_ `data`. pub fn new_with_offset(data: &[u8], offset: usize) -> TextBufferView { TextBufferView { data, offset } } - /// Returns a subslice copy of the [`TextBufferView`] that starts at `offset` and continues for + /// Returns a subslice of the [`TextBufferView`] that starts at `offset` and continues for /// `length` bytes. /// /// Note that `offset` is relative to the beginning of the buffer, not the beginning of the @@ -101,7 +104,7 @@ impl<'data> TextBufferView<'data> { } } - /// Returns a subslice copy of the [`TextBufferView`] that starts at `offset` and continues + /// Returns a subslice of the [`TextBufferView`] that starts at `offset` and continues /// to the end. 
/// /// Note that `offset` is relative to the beginning of the buffer, not the beginning of the @@ -134,48 +137,44 @@ impl<'data> TextBufferView<'data> { self.data.is_empty() } - /// Creates a copy of this `TextBufferView` that begins `num_bytes_to_consume` further into the - /// slice. - #[inline] - pub fn consume(&self, num_bytes_to_consume: usize) -> Self { - // This assertion is always run during testing but is removed in the release build. - debug_assert!(num_bytes_to_consume <= self.len()); - Self { - data: &self.data[num_bytes_to_consume..], - offset: self.offset + num_bytes_to_consume, - } + pub fn match_whitespace(self) -> IonMatchResult<'data> { + is_a(WHITESPACE_CHARACTERS_AS_STR)(self) } - // An adapter for nom::combinator::success. - // Always succeeds and consumes none of the input. Returns an empty slice of the buffer. - pub fn match_nothing(self) -> IonMatchResult<'data> { - // Return an empty slice from the head position + /// Always succeeds and consumes none of the input. Returns an empty slice of the buffer. + // This method is useful for parsers that need to match an optional construct but don't want + // to return an Option<_>. For an example, see its use in `match_optional_whitespace`. + fn match_nothing(self) -> IonMatchResult<'data> { + // Use nom's `success` parser to return an empty slice from the head position success(self.slice(0, 0))(self) } - pub fn match_whitespace(self) -> IonMatchResult<'data> { - is_a(WHITESPACE_CHARACTERS_AS_STR)(self) - } - + /// Matches zero or more whitespace characters. pub fn match_optional_whitespace(self) -> IonMatchResult<'data> { // Either match whitespace and return what follows or just return the input as-is. - // This will always return `Ok`, but is packaged as an IonMatchResult for compatability + // This will always return `Ok`, but it is packaged as an IonMatchResult for compatibility + // with other parsers. 
alt((Self::match_whitespace, Self::match_nothing))(self) } - pub fn read_top_level(self) -> IonParseResult<'data, RawStreamItem<'data, TextEncoding>> { - let (remaining, value) = match self.read_value() { + /// Matches a single top-level scalar value, the beginning of a container, or an IVM. + pub fn match_top_level(self) -> IonParseResult<'data, RawStreamItem<'data, TextEncoding>> { + let (remaining, value) = match self.match_value() { Ok(value) => value, Err(e) => return Err(e), }; + // TODO: Augment this method to take an `is_complete` flag that indicates whether the absence + // of further values should return an `Incomplete` or a `RawStreamItem::EndOfStream`. + // TODO: Check to see if `value` is actually an IVM. // => If it's a symbol, try the IVM parser on it and see if it succeeds. // For now, we just return the value. Ok((remaining, RawStreamItem::Value(value))) } - pub fn read_value(self) -> IonParseResult<'data, LazyRawTextValue<'data>> { + /// Matches a single scalar value or the beginning of a container. + pub fn match_value(self) -> IonParseResult<'data, LazyRawTextValue<'data>> { alt(( // For `null` and `bool`, we use `read_` instead of `match_` because there's no additional // parsing to be done. @@ -202,10 +201,12 @@ impl<'data> TextBufferView<'data> { .parse(self) } + /// Matches a boolean value. pub fn match_bool(self) -> IonMatchResult<'data> { recognize(Self::read_bool)(self) } + /// Matches and returns a boolean value. pub fn read_bool(self) -> IonParseResult<'data, bool> { terminated( alt((value(true, tag("true")), value(false, tag("false")))), @@ -213,10 +214,12 @@ impl<'data> TextBufferView<'data> { )(self) } + /// Matches any type of null. (`null`, `null.null`, `null.int`, etc) pub fn match_null(self) -> IonMatchResult<'data> { recognize(Self::read_null)(self) } + /// Matches and returns a null value. 
pub fn read_null(self) -> IonParseResult<'data, IonType> { delimited( tag("null"), @@ -227,10 +230,7 @@ impl<'data> TextBufferView<'data> { .parse(self) } - fn match_ion_type(self) -> IonMatchResult<'data> { - recognize(Self::read_ion_type)(self) - } - + /// Matches and returns an Ion type. fn read_ion_type(self) -> IonParseResult<'data, IonType> { alt(( value(IonType::Null, tag("null")), @@ -249,10 +249,12 @@ impl<'data> TextBufferView<'data> { ))(self) } + /// Matches any one of Ion's stop characters. fn match_stop_character(self) -> IonMatchResult<'data> { recognize(one_of("{}[](),\"' \t\n\r\u{0b}\u{0c}")).parse(self) } + /// Matches--but does not consume--any one of Ion's stop characters. fn peek_stop_character(self) -> IonMatchResult<'data> { peek(Self::match_stop_character).parse(self) } diff --git a/src/lazy/text/matched.rs b/src/lazy/text/matched.rs index 3f846a38..b3f79056 100644 --- a/src/lazy/text/matched.rs +++ b/src/lazy/text/matched.rs @@ -31,6 +31,7 @@ use std::num::IntErrorKind; /// A partially parsed Ion value. #[derive(Copy, Clone, Debug, PartialEq)] pub(crate) enum MatchedValue { + // `Null` and `Bool` are fully parsed because they only involve matching a keyword. Null(IonType), Bool(bool), Int(MatchedInt), diff --git a/src/lazy/text/raw/reader.rs b/src/lazy/text/raw/reader.rs index dfc9f863..22eedf61 100644 --- a/src/lazy/text/raw/reader.rs +++ b/src/lazy/text/raw/reader.rs @@ -3,77 +3,19 @@ use crate::lazy::encoding::TextEncoding; use crate::lazy::raw_stream_item::RawStreamItem; use crate::lazy::text::buffer::TextBufferView; use crate::lazy::text::parse_result::AddContext; -use crate::lazy::text::value::LazyRawTextValue; use crate::result::IonFailure; use crate::IonResult; -/// Wraps a [`TextBufferView`], allowing the reader to advance each time an item is successfully -/// parsed from it. 
-pub(crate) struct DataSource<'data> { - // The buffer we're reading from +/// A text Ion 1.0 reader that yields [`RawStreamItem`]s representing the top level values found +/// in the provided input stream. +pub struct LazyRawTextReader<'data> { + // The current view of the data we're reading from. buffer: TextBufferView<'data>, // Each time something is parsed from the buffer successfully, the caller will mark the number - // of bytes that may be skipped the next time `advance_to_next_item` is called. + // of bytes that may be skipped the next time the reader advances. bytes_to_skip: usize, } -impl<'data> DataSource<'data> { - pub(crate) fn new(buffer: TextBufferView<'data>) -> DataSource<'data> { - DataSource { - buffer, - bytes_to_skip: 0, - } - } - - pub(crate) fn buffer(&self) -> TextBufferView<'data> { - self.buffer - } - - fn advance_to_next_item(&mut self) -> IonResult> { - if self.buffer.len() < self.bytes_to_skip { - return IonResult::incomplete( - "cannot advance to next item, insufficient data in buffer", - self.buffer.offset(), - ); - } - - if self.bytes_to_skip > 0 { - Ok(self.buffer.consume(self.bytes_to_skip)) - } else { - Ok(self.buffer) - } - } - - /// Runs the provided parsing function on this DataSource's buffer. - /// If it succeeds, marks the `DataSource` as ready to advance by the 'n' bytes - /// that were consumed and returns `Some(value)`. - /// If it does not succeed, the `DataSource` remains unchanged. 
- pub(crate) fn try_parse_next< - F: Fn(TextBufferView<'data>) -> IonResult>>, - >( - &mut self, - parser: F, - ) -> IonResult>> { - let buffer_after = self.advance_to_next_item()?; - - let lazy_value = match parser(buffer_after) { - Ok(Some(output)) => output, - Ok(None) => return Ok(None), - Err(e) => return Err(e), - }; - - self.buffer = buffer_after; - self.bytes_to_skip = lazy_value.encoded_value.total_length(); - Ok(Some(lazy_value)) - } -} - -/// A text Ion 1.0 reader that yields [`LazyRawTextValue`]s representing the top level values found -/// in the provided input stream. -pub struct LazyRawTextReader<'data> { - data: DataSource<'data>, -} - impl<'data> LazyRawTextReader<'data> { /// Constructs a `LazyRawTextReader` positioned at the beginning of the provided input stream. pub fn new(data: &'data [u8]) -> LazyRawTextReader<'data> { @@ -85,15 +27,17 @@ impl<'data> LazyRawTextReader<'data> { /// of a larger data stream. This offset is used for reporting the absolute (stream-level) /// position of values encountered in `data`. 
fn new_with_offset(data: &'data [u8], offset: usize) -> LazyRawTextReader<'data> { - let data = DataSource::new(TextBufferView::new_with_offset(data, offset)); - LazyRawTextReader { data } + LazyRawTextReader { + buffer: TextBufferView::new_with_offset(data, offset), + bytes_to_skip: 0, + } } pub fn next<'top>(&'top mut self) -> IonResult> where 'data: 'top, { - let buffer = self.data.buffer; + let buffer = self.buffer; if buffer.is_empty() { return IonResult::incomplete("reading a top-level value", buffer.offset()); } @@ -101,10 +45,10 @@ impl<'data> LazyRawTextReader<'data> { .match_optional_whitespace() .with_context("skipping whitespace between top-level values", buffer)?; let (remaining, matched) = buffer_after_whitespace - .read_top_level() + .match_top_level() .with_context("reading a top-level value", buffer_after_whitespace)?; // If we successfully moved to the next value, store the remaining buffer view - self.data.buffer = remaining; + self.buffer = remaining; Ok(matched) } } From 5db1ff0b8748487e33a9e3d3069e2c1966eaa7fb Mon Sep 17 00:00:00 2001 From: Zack Slayton Date: Thu, 27 Jul 2023 13:30:34 -0400 Subject: [PATCH 04/15] Adds lazy text floats --- src/lazy/text/buffer.rs | 156 ++++++++++++++++++++++++++++++++- src/lazy/text/encoded_value.rs | 1 + src/lazy/text/matched.rs | 57 ++++++++++-- src/lazy/text/parse_result.rs | 68 +++++++++----- src/lazy/text/raw/reader.rs | 45 ++++++++++ src/lazy/text/value.rs | 1 + 6 files changed, 297 insertions(+), 31 deletions(-) diff --git a/src/lazy/text/buffer.rs b/src/lazy/text/buffer.rs index c12ec11c..e7f7d393 100644 --- a/src/lazy/text/buffer.rs +++ b/src/lazy/text/buffer.rs @@ -1,7 +1,7 @@ use crate::lazy::encoding::TextEncoding; use crate::lazy::raw_stream_item::RawStreamItem; use crate::lazy::text::encoded_value::EncodedTextValue; -use crate::lazy::text::matched::{MatchedInt, MatchedValue}; +use crate::lazy::text::matched::{MatchedFloat, MatchedInt, MatchedValue}; use 
crate::lazy::text::parse_result::IonParseError; use crate::lazy::text::parse_result::{IonMatchResult, IonParseResult}; use crate::lazy::text::value::LazyRawTextValue; @@ -12,7 +12,7 @@ use nom::character::streaming::{char, digit1, one_of}; use nom::combinator::{map, opt, peek, recognize, success, value}; use nom::error::{ErrorKind, ParseError}; use nom::multi::many0_count; -use nom::sequence::{delimited, pair, preceded, separated_pair, terminated}; +use nom::sequence::{delimited, pair, preceded, separated_pair, terminated, tuple}; use nom::{CompareResult, IResult, InputLength, InputTake, Needed, Parser}; use std::fmt::{Debug, Formatter}; use std::iter::{Copied, Enumerate}; @@ -192,6 +192,12 @@ impl<'data> TextBufferView<'data> { EncodedTextValue::new(MatchedValue::Int(matched_int), self.offset(), length) }, ), + map( + match_and_length(Self::match_float), + |(matched_float, length)| { + EncodedTextValue::new(MatchedValue::Float(matched_float), self.offset(), length) + }, + ), // TODO: The other Ion types )) .map(|encoded_value| LazyRawTextValue { @@ -372,6 +378,111 @@ impl<'data> TextBufferView<'data> { fn take_base_16_digits1(self) -> IonMatchResult<'data> { take_while1(|b: u8| b.is_ascii_hexdigit())(self) } + + /// Matches an Ion float of any syntax + fn match_float(self) -> IonParseResult<'data, MatchedFloat> { + alt(( + Self::match_float_special_value, + Self::match_float_numeric_value, + ))(self) + } + + /// Matches special IEEE-754 floating point values, including +/- infinity and NaN. + fn match_float_special_value(self) -> IonParseResult<'data, MatchedFloat> { + alt(( + value(MatchedFloat::NotANumber, tag("nan")), + value(MatchedFloat::PositiveInfinity, tag("+inf")), + value(MatchedFloat::NegativeInfinity, tag("-inf")), + ))(self) + } + + /// Matches numeric IEEE-754 floating point values. 
+ fn match_float_numeric_value(self) -> IonParseResult<'data, MatchedFloat> { + terminated( + recognize(pair( + Self::match_number_with_optional_dot_and_digits, + Self::match_float_exponent_marker_and_digits, + )), + Self::peek_stop_character, + ) + .map(|_matched| MatchedFloat::Numeric) + .parse(self) + } + + /// Matches a number that may or may not have a decimal place and trailing fractional digits. + /// If a decimal place is present, there must also be trailing digits. + /// For example: + /// 1000 + /// 1000.559 + /// -25.2 + fn match_number_with_optional_dot_and_digits(self) -> IonMatchResult<'data> { + recognize(tuple(( + opt(tag("-")), + Self::match_base_10_digits_before_dot, + opt(Self::match_dot_followed_by_base_10_digits), + )))(self) + } + + /// In a float or decimal, matches the digits that are permitted before the decimal point. + /// This includes either a single zero, or a non-zero followed by any sequence of digits. + fn match_digits_before_dot(self) -> IonMatchResult<'data> { + alt(( + tag("0"), + recognize(pair(Self::match_leading_digit, Self::match_trailing_digits)), + ))(self) + } + + /// Matches a single non-zero base 10 digit. + fn match_leading_digit(self) -> IonMatchResult<'data> { + recognize(one_of("123456789"))(self) + } + + /// Matches any number of base 10 digits, allowing underscores at any position except the end. + fn match_trailing_digits(self) -> IonMatchResult<'data> { + recognize(many0_count(preceded(opt(char('_')), digit1)))(self) + } + + /// Recognizes a decimal point followed by any number of base-10 digits. + fn match_dot_followed_by_base_10_digits(self) -> IonMatchResult<'data> { + recognize(preceded(tag("."), opt(Self::match_digits_after_dot)))(self) + } + + /// Like `match_digits_before_dot`, but allows leading zeros. 
+ fn match_digits_after_dot(self) -> IonMatchResult<'data> { + recognize(terminated( + // Zero or more digits-followed-by-underscores + many0_count(pair(digit1, char('_'))), + // One or more digits + digit1, + ))(self) + } + + /// Matches an `e` or `E` followed by an optional sign (`+` or `-`) followed by one or more + /// base 10 digits. + fn match_float_exponent_marker_and_digits(self) -> IonMatchResult<'data> { + preceded(one_of("eE"), Self::match_exponent_sign_and_digits)(self) + } + + /// Recognizes the exponent portion of a decimal (everything after the 'd') or float + /// (everything after the 'e'). This includes: + /// * an optional '+' OR '-' + /// * any number of decimal digits, which may: + /// * have underscores in between them: `1_000_000` + /// * have one or more leading zeros: `0005` + fn match_exponent_sign_and_digits(self) -> IonMatchResult<'data> { + recognize(pair( + // Optional leading sign; if there's no sign, it's not negative. + opt(Self::match_any_sign), + Self::match_digits_after_dot, + ))(self) + } + + /// Matches `-` OR `+`. + /// + /// This is used for matching exponent signs; most places in Ion do not allow `+`. 
+ pub fn match_any_sign(self) -> IonMatchResult<'data> { + alt((tag("+"), tag("-")))(self) + } } // === nom trait implementations === @@ -602,7 +713,12 @@ mod tests { { let result = self.try_match(parser); // We expect this to fail for one reason or another - result.unwrap_err(); + assert!( + result.is_err(), + "Expected a parse failure for input: {:?}\nResult: {:?}", + self.input, + result + ); } } @@ -729,4 +845,38 @@ mod tests { mismatch_int(input); } } + + #[test] + fn test_match_float() { + fn match_float(input: &str) { + MatchTest::new(input).expect_match(match_length(TextBufferView::match_float)); + } + fn mismatch_float(input: &str) { + MatchTest::new(input).expect_mismatch(match_length(TextBufferView::match_float)); + } + + let good_inputs = &[ + "0.0e0", "0E0", "0e0", "305e1", "305e+1", "305e-1", "305e100", "305e-100", "305e+100", + "305.0e1", "0.279e3", "279e0", "279.5e0", "279.5E0", + ]; + for input in good_inputs { + match_float(input); + let negative = format!("-{input}"); + match_float(&negative); + } + + let bad_inputs = &[ + "305", // Integer + "305e", // Has exponent delimiter but no exponent + ".305e", // No digits before the decimal point + "305e0.5", // Fractional exponent + "305e-0.5", // Negative fractional exponent + "0305e1", // Leading zero + "+305e1", // Leading plus sign + "--305e1", // Multiple negative signs + ]; + for input in bad_inputs { + mismatch_float(input); + } + } } diff --git a/src/lazy/text/encoded_value.rs b/src/lazy/text/encoded_value.rs index e1a3bcc1..0c649d08 100644 --- a/src/lazy/text/encoded_value.rs +++ b/src/lazy/text/encoded_value.rs @@ -114,6 +114,7 @@ impl EncodedTextValue { MatchedValue::Null(ion_type) => ion_type, MatchedValue::Bool(_) => IonType::Bool, MatchedValue::Int(_) => IonType::Int, + MatchedValue::Float(_) => IonType::Float, } } diff --git a/src/lazy/text/matched.rs b/src/lazy/text/matched.rs index b3f79056..560d828e 100644 --- a/src/lazy/text/matched.rs +++ b/src/lazy/text/matched.rs @@ -19,14 +19,17 
@@ //! use the previously recorded information to minimize the amount of information that needs to be //! re-discovered. -use crate::lazy::text::as_utf8::AsUtf8; -use crate::lazy::text::buffer::TextBufferView; -use crate::result::IonFailure; -use crate::{Int, IonResult, IonType}; +use std::num::IntErrorKind; + use num_bigint::BigInt; use num_traits::Num; use smallvec::SmallVec; -use std::num::IntErrorKind; + +use crate::lazy::text::as_utf8::AsUtf8; +use crate::lazy::text::buffer::TextBufferView; +use crate::lazy::text::parse_result::InvalidInputError; +use crate::result::IonFailure; +use crate::{Int, IonError, IonResult, IonType}; /// A partially parsed Ion value. #[derive(Copy, Clone, Debug, PartialEq)] @@ -35,6 +38,7 @@ pub(crate) enum MatchedValue { Null(IonType), Bool(bool), Int(MatchedInt), + Float(MatchedFloat), // TODO: ...the other types } @@ -107,3 +111,46 @@ impl MatchedInt { Ok(int) } } + +/// A partially parsed Ion float. +#[derive(Copy, Clone, Debug, PartialEq)] +pub(crate) enum MatchedFloat { + /// `+inf` + PositiveInfinity, + /// `-inf` + NegativeInfinity, + /// `nan` + NotANumber, + /// Any numeric float value + Numeric, +} + +impl MatchedFloat { + // Floats that take more than 32 bytes of text to represent will heap allocate a larger buffer. 
+    const STACK_ALLOC_BUFFER_CAPACITY: usize = 32;
+
+    pub fn read(&self, matched_input: TextBufferView) -> IonResult<f64> {
+        use std::str::FromStr;
+
+        match self {
+            MatchedFloat::PositiveInfinity => return Ok(f64::INFINITY),
+            MatchedFloat::NegativeInfinity => return Ok(f64::NEG_INFINITY),
+            MatchedFloat::NotANumber => return Ok(f64::NAN),
+            MatchedFloat::Numeric => {} // fall through
+        };
+
+        let mut sanitized: SmallVec<[u8; Self::STACK_ALLOC_BUFFER_CAPACITY]> =
+            SmallVec::with_capacity(Self::STACK_ALLOC_BUFFER_CAPACITY);
+        sanitized.extend(matched_input.bytes().iter().copied().filter(|b| *b != b'_'));
+
+        let text = sanitized.as_utf8(matched_input.offset())?;
+        let float = f64::from_str(text).map_err(|e| {
+            let error: IonError = InvalidInputError::new(matched_input)
+                .with_description(format!("encountered an unexpected error ({:?})", e))
+                .with_label("parsing a float")
+                .into();
+            error
+        })?;
+        Ok(float)
+    }
+}
diff --git a/src/lazy/text/parse_result.rs b/src/lazy/text/parse_result.rs
index 7da90511..6dfb919b 100644
--- a/src/lazy/text/parse_result.rs
+++ b/src/lazy/text/parse_result.rs
@@ -143,6 +143,25 @@ impl<'data> From<InvalidInputError<'data>> for IonParseError<'data> {
 }
 
+// We cannot provide an analogous impl for `Incomplete` because it is missing necessary data.
+impl<'data> From> for IonError { + fn from(invalid_input_error: InvalidInputError) -> Self { + let mut message = String::from( + invalid_input_error + .description() + .unwrap_or("invalid Ion syntax encountered"), + ); + if let Some(label) = invalid_input_error.label { + message.push_str(" while "); + message.push_str(label.as_ref()); + } + let position = Position::with_offset(invalid_input_error.input.offset()) + .with_length(invalid_input_error.input.len()); + let decoding_error = DecodingError::new(message).with_position(position); + IonError::Decoding(decoding_error) + } +} + impl<'data> From>> for IonParseError<'data> { fn from(value: Err>) -> Self { match value { @@ -200,6 +219,31 @@ pub(crate) trait AddContext<'data, T> { ) -> IonResult<(TextBufferView<'data>, T)>; } +impl<'data, T> AddContext<'data, T> for nom::Err> { + fn with_context( + self, + label: impl Into>, + input: TextBufferView<'data>, + ) -> IonResult<(TextBufferView<'data>, T)> { + let ipe = IonParseError::from(self); + ipe.with_context(label, input) + } +} + +// Turns an IonParseError into an IonResult +impl<'data, T> AddContext<'data, T> for IonParseError<'data> { + fn with_context( + self, + label: impl Into>, + input: TextBufferView<'data>, + ) -> IonResult<(TextBufferView<'data>, T)> { + match self { + IonParseError::Incomplete => IonResult::incomplete(label, input.offset()), + IonParseError::Invalid(invalid_input_error) => Err(IonError::from(invalid_input_error)), + } + } +} + impl<'data, T> AddContext<'data, T> for IonParseResult<'data, T> { fn with_context( self, @@ -209,29 +253,7 @@ impl<'data, T> AddContext<'data, T> for IonParseResult<'data, T> { match self { // No change needed in the ok case Ok(matched) => Ok(matched), - // If the error was an incomplete - Err(e) => { - // Nom error to IonParseError - match IonParseError::from(e) { - IonParseError::Incomplete => IonResult::incomplete(label, input.offset()), - IonParseError::Invalid(invalid_input_error) => { - 
dbg!(&invalid_input_error.backtrace); - let mut message = String::from( - invalid_input_error - .description() - .unwrap_or("invalid text Ion syntax"), - ); - if let Some(label) = invalid_input_error.label { - message.push_str(" while "); - message.push_str(label.as_ref()); - } - let position = Position::with_offset(invalid_input_error.input.offset()) - .with_length(invalid_input_error.input.len()); - let decoding_error = DecodingError::new(message).with_position(position); - Err(IonError::Decoding(decoding_error)) - } - } - } + Err(e) => e.with_context(label, input), } } } diff --git a/src/lazy/text/raw/reader.rs b/src/lazy/text/raw/reader.rs index 22eedf61..94048d80 100644 --- a/src/lazy/text/raw/reader.rs +++ b/src/lazy/text/raw/reader.rs @@ -80,6 +80,12 @@ mod tests { 500 0x20 0b0101 + +inf + -inf + nan + 3.6e0 + 2.5e23 + -318e-2 "#; let mut reader = LazyRawTextReader::new(data.as_bytes()); @@ -128,6 +134,45 @@ mod tests { assert_eq!(lazy_int_binary_0101.ion_type(), IonType::Int); assert_eq!(lazy_int_binary_0101.read()?.expect_i64()?, 0b0101); // decimal 5 + // +inf + let lazy_float_pos_inf = reader.next()?.expect_value()?; + assert!(!lazy_float_pos_inf.is_null()); + assert_eq!(lazy_float_pos_inf.ion_type(), IonType::Float); + assert_eq!(lazy_float_pos_inf.read()?.expect_float()?, f64::INFINITY); + + // -inf + let lazy_float_neg_inf = reader.next()?.expect_value()?; + assert!(!lazy_float_neg_inf.is_null()); + assert_eq!(lazy_float_neg_inf.ion_type(), IonType::Float); + assert_eq!( + lazy_float_neg_inf.read()?.expect_float()?, + f64::NEG_INFINITY + ); + + // nan + let lazy_float_neg_inf = reader.next()?.expect_value()?; + assert!(!lazy_float_neg_inf.is_null()); + assert_eq!(lazy_float_neg_inf.ion_type(), IonType::Float); + assert!(lazy_float_neg_inf.read()?.expect_float()?.is_nan()); + + // 3.6e0 + let lazy_float = reader.next()?.expect_value()?; + assert!(!lazy_float.is_null()); + assert_eq!(lazy_float.ion_type(), IonType::Float); + 
assert_eq!(lazy_float.read()?.expect_float()?, 3.6f64);
+
+        // 2.5e23
+        let lazy_float = reader.next()?.expect_value()?;
+        assert!(!lazy_float.is_null());
+        assert_eq!(lazy_float.ion_type(), IonType::Float);
+        assert_eq!(lazy_float.read()?.expect_float()?, 2.5f64 * 10f64.powi(23));
+
+        // -318e-2
+        let lazy_float = reader.next()?.expect_value()?;
+        assert!(!lazy_float.is_null());
+        assert_eq!(lazy_float.ion_type(), IonType::Float);
+        assert_eq!(lazy_float.read()?.expect_float()?, -3.18);
+
         Ok(())
     }
 }
diff --git a/src/lazy/text/value.rs b/src/lazy/text/value.rs
index e586f677..1842ddd5 100644
--- a/src/lazy/text/value.rs
+++ b/src/lazy/text/value.rs
@@ -50,6 +50,7 @@ impl<'data> LazyRawValue<'data, TextEncoding> for LazyRawTextValue<'data> {
             MatchedValue::Null(ion_type) => RawValueRef::Null(ion_type),
             MatchedValue::Bool(b) => RawValueRef::Bool(b),
             MatchedValue::Int(i) => RawValueRef::Int(i.read(matched_input)?),
+            MatchedValue::Float(f) => RawValueRef::Float(f.read(matched_input)?),
         };
         Ok(value_ref)
     }

From 07d4a70c547cd731831b6a96c274112fd81dc45b Mon Sep 17 00:00:00 2001
From: Zack Slayton
Date: Thu, 27 Jul 2023 14:21:20 -0400
Subject: [PATCH 05/15] Adds LazyRawTextReader support for comments

---
 src/lazy/text/buffer.rs     | 57 ++++++++++++++++++++++++++++++++++++-
 src/lazy/text/raw/reader.rs | 20 +++++++++++--
 2 files changed, 74 insertions(+), 3 deletions(-)

diff --git a/src/lazy/text/buffer.rs b/src/lazy/text/buffer.rs
index e7f7d393..c74be95c 100644
--- a/src/lazy/text/buffer.rs
+++ b/src/lazy/text/buffer.rs
@@ -7,7 +7,7 @@ use crate::lazy::text::parse_result::{IonMatchResult, IonParseResult};
 use crate::lazy::text::value::LazyRawTextValue;
 use crate::{IonResult, IonType};
 use nom::branch::alt;
-use nom::bytes::streaming::{is_a, tag, take_while1};
+use nom::bytes::streaming::{is_a, is_not, tag, take_until, take_while1};
 use nom::character::streaming::{char, digit1, one_of};
 use nom::combinator::{map, opt, peek, recognize, success, value};
 use nom::error::{ErrorKind, 
ParseError}; @@ -157,6 +157,55 @@ impl<'data> TextBufferView<'data> { alt((Self::match_whitespace, Self::match_nothing))(self) } + /// Matches any amount of contiguous comments and whitespace, including none. + pub fn match_optional_comments_and_whitespace(self) -> IonMatchResult<'data> { + recognize(many0_count(alt(( + Self::match_whitespace, + Self::match_comment, + ))))(self) + } + + /// Matches a single + /// // Rest-of-the-line + /// or + /// /* multi + /// line */ + /// comment + pub fn match_comment(self) -> IonMatchResult<'data> { + alt(( + Self::match_rest_of_line_comment, + Self::match_multiline_comment, + ))(self) + } + + /// Matches a single rest-of-the-line comment. + fn match_rest_of_line_comment(self) -> IonMatchResult<'data> { + preceded( + // Matches a leading "//"... + tag("//"), + // ...followed by either... + alt(( + // ...one or more non-EOL characters... + is_not("\r\n"), + // ...or any EOL character. + peek(recognize(one_of("\r\n"))), + // In either case, the line ending will not be consumed. + )), + )(self) + } + + /// Matches a single multiline comment. + fn match_multiline_comment(self) -> IonMatchResult<'data> { + recognize(delimited( + // Matches a leading "/*"... + tag("/*"), + // ...any number of non-"*/" characters... + take_until("*/"), + // ...and then a closing "*/" + tag("*/"), + ))(self) + } + /// Matches a single top-level scalar value, the beginning of a container, or an IVM. 
pub fn match_top_level(self) -> IonParseResult<'data, RawStreamItem<'data, TextEncoding>> { let (remaining, value) = match self.match_value() { @@ -569,6 +618,12 @@ impl<'data> nom::Slice> for TextBufferView<'data> { } } +impl<'data> nom::FindSubstring<&str> for TextBufferView<'data> { + fn find_substring(&self, substr: &str) -> Option { + self.data.find_substring(substr) + } +} + impl<'data> nom::InputTakeAtPosition for TextBufferView<'data> { type Item = u8; diff --git a/src/lazy/text/raw/reader.rs b/src/lazy/text/raw/reader.rs index 94048d80..8b41a5e5 100644 --- a/src/lazy/text/raw/reader.rs +++ b/src/lazy/text/raw/reader.rs @@ -42,8 +42,11 @@ impl<'data> LazyRawTextReader<'data> { return IonResult::incomplete("reading a top-level value", buffer.offset()); } let (buffer_after_whitespace, _whitespace) = buffer - .match_optional_whitespace() - .with_context("skipping whitespace between top-level values", buffer)?; + .match_optional_comments_and_whitespace() + .with_context( + "skipping comments and whitespace between top-level values", + buffer, + )?; let (remaining, matched) = buffer_after_whitespace .match_top_level() .with_context("reading a top-level value", buffer_after_whitespace)?; @@ -72,14 +75,27 @@ mod tests { #[test] fn test_top_level() -> IonResult<()> { let data = r#" + /* + This test demonstrates lazily reading top-level values + of various Ion types. The values are interspersed with + different kinds of comments and whitespace. 
+ */ + + // Typed nulls null null.bool null.int + + // Booleans false true + + // Integers 500 0x20 0b0101 + + // Floats +inf -inf nan From 181e0a548ccac7734dab064e77461125d981440d Mon Sep 17 00:00:00 2001 From: Zack Slayton Date: Fri, 28 Jul 2023 17:36:01 -0400 Subject: [PATCH 06/15] Adds LazyRawTextReader support for reading strings --- src/lazy/binary/raw/value.rs | 3 +- src/lazy/mod.rs | 1 + src/lazy/raw_value_ref.rs | 27 +++- src/lazy/str_ref.rs | 82 ++++++++++++ src/lazy/system_reader.rs | 4 +- src/lazy/text/as_utf8.rs | 6 + src/lazy/text/buffer.rs | 134 +++++++++++++++++-- src/lazy/text/encoded_value.rs | 7 +- src/lazy/text/matched.rs | 234 ++++++++++++++++++++++++++++++++- src/lazy/text/parse_result.rs | 28 ++-- src/lazy/text/raw/reader.rs | 191 ++++++++++++++------------- src/lazy/text/value.rs | 7 +- src/lazy/value_ref.rs | 12 +- 13 files changed, 605 insertions(+), 131 deletions(-) create mode 100644 src/lazy/str_ref.rs diff --git a/src/lazy/binary/raw/value.rs b/src/lazy/binary/raw/value.rs index 9ed2340e..a8cb397f 100644 --- a/src/lazy/binary/raw/value.rs +++ b/src/lazy/binary/raw/value.rs @@ -9,6 +9,7 @@ use crate::lazy::decoder::private::LazyRawValuePrivate; use crate::lazy::decoder::LazyRawValue; use crate::lazy::encoding::BinaryEncoding; use crate::lazy::raw_value_ref::RawValueRef; +use crate::lazy::str_ref::StrRef; use crate::result::IonFailure; use crate::types::SymbolId; use crate::{Decimal, Int, IonError, IonResult, IonType, RawSymbolTokenRef, Timestamp}; @@ -390,7 +391,7 @@ impl<'data> LazyRawBinaryValue<'data> { let raw_bytes = self.value_body()?; let text = std::str::from_utf8(raw_bytes) .map_err(|_| IonError::decoding_error("found a string with invalid utf-8 data"))?; - Ok(RawValueRef::String(text)) + Ok(RawValueRef::String(StrRef::from(text))) } /// Helper method called by [`Self::read`]. Reads the current value as a blob. 
diff --git a/src/lazy/mod.rs b/src/lazy/mod.rs index 3f42baa8..af004584 100644 --- a/src/lazy/mod.rs +++ b/src/lazy/mod.rs @@ -8,6 +8,7 @@ pub mod raw_stream_item; pub mod raw_value_ref; pub mod reader; pub mod sequence; +pub mod str_ref; pub mod r#struct; pub mod system_reader; pub mod system_stream_item; diff --git a/src/lazy/raw_value_ref.rs b/src/lazy/raw_value_ref.rs index a0da98eb..5e76db66 100644 --- a/src/lazy/raw_value_ref.rs +++ b/src/lazy/raw_value_ref.rs @@ -1,4 +1,5 @@ use crate::lazy::decoder::LazyDecoder; +use crate::lazy::str_ref::StrRef; use crate::result::IonFailure; use crate::{Decimal, Int, IonResult, IonType, RawSymbolTokenRef, Timestamp}; use std::fmt::{Debug, Formatter}; @@ -15,7 +16,7 @@ pub enum RawValueRef<'data, D: LazyDecoder<'data>> { Float(f64), Decimal(Decimal), Timestamp(Timestamp), - String(&'data str), + String(StrRef<'data>), Symbol(RawSymbolTokenRef<'data>), Blob(&'data [u8]), Clob(&'data [u8]), @@ -24,6 +25,28 @@ pub enum RawValueRef<'data, D: LazyDecoder<'data>> { Struct(D::Struct), } +// Provides equality for scalar types, but not containers. +impl<'data, D: LazyDecoder<'data>> PartialEq for RawValueRef<'data, D> { + fn eq(&self, other: &Self) -> bool { + use RawValueRef::*; + match (self, other) { + (Null(i1), Null(i2)) => i1 == i2, + (Bool(b1), Bool(b2)) => b1 == b2, + (Int(i1), Int(i2)) => i1 == i2, + (Float(f1), Float(f2)) => f1 == f2, + (Decimal(d1), Decimal(d2)) => d1 == d2, + (Timestamp(t1), Timestamp(t2)) => t1 == t2, + (String(s1), String(s2)) => s1 == s2, + (Symbol(s1), Symbol(s2)) => s1 == s2, + (Blob(b1), Blob(b2)) => b1 == b2, + (Clob(c1), Clob(c2)) => c1 == c2, + // We cannot compare lazy containers as we cannot guarantee that their complete contents + // are available in the buffer. Is `{foo: bar}` equal to `{foo: b`? 
+ _ => false, + } + } +} + impl<'data, D: LazyDecoder<'data>> Debug for RawValueRef<'data, D> { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match self { @@ -101,7 +124,7 @@ impl<'data, D: LazyDecoder<'data>> RawValueRef<'data, D> { } } - pub fn expect_string(self) -> IonResult<&'data str> { + pub fn expect_string(self) -> IonResult> { if let RawValueRef::String(s) = self { Ok(s) } else { diff --git a/src/lazy/str_ref.rs b/src/lazy/str_ref.rs new file mode 100644 index 00000000..17161e0d --- /dev/null +++ b/src/lazy/str_ref.rs @@ -0,0 +1,82 @@ +use crate::text::text_formatter::IonValueFormatter; +use crate::Str; +use std::borrow::Cow; +use std::fmt::{Display, Formatter}; +use std::ops::Deref; + +#[derive(Clone, PartialEq, Debug)] +pub struct StrRef<'data> { + text: Cow<'data, str>, +} + +impl<'data> StrRef<'data> { + pub fn to_owned(&self) -> Str { + Str::from(self.as_ref()) + } + + pub fn into_owned(self) -> Str { + Str::from(self) + } + + pub fn text(&self) -> &str { + self.as_ref() + } +} + +impl<'data> Deref for StrRef<'data> { + type Target = str; + + fn deref(&self) -> &Self::Target { + self.text.as_ref() + } +} + +impl<'data> PartialEq for StrRef<'data> { + fn eq(&self, other: &str) -> bool { + self.text() == other + } +} + +impl<'data> PartialEq<&str> for StrRef<'data> { + fn eq(&self, other: &&str) -> bool { + self.text() == *other + } +} + +impl<'data> PartialEq> for str { + fn eq(&self, other: &StrRef<'data>) -> bool { + self == other.text() + } +} + +impl<'data> Display for StrRef<'data> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + let mut formatter = IonValueFormatter { output: f }; + formatter + .format_string(self.text()) + .map_err(|_| std::fmt::Error) + } +} + +impl<'a> From<&'a str> for StrRef<'a> { + fn from(value: &'a str) -> Self { + StrRef { + text: Cow::from(value), + } + } +} + +impl<'a> From for StrRef<'a> { + fn from(value: String) -> Self { + StrRef { + text: Cow::from(value), + } + } +} + +impl<'data> 
From> for Str { + fn from(str_ref: StrRef<'data>) -> Self { + let text: String = str_ref.text.into_owned(); + Str::from(text) + } +} diff --git a/src/lazy/system_reader.rs b/src/lazy/system_reader.rs index bdf76de2..bee0458a 100644 --- a/src/lazy/system_reader.rs +++ b/src/lazy/system_reader.rs @@ -235,8 +235,8 @@ impl<'data, D: LazyDecoder<'data>> LazySystemReader<'data, D> { fn process_symbols(pending_lst: &mut PendingLst, symbols: &D::Value) -> IonResult<()> { if let RawValueRef::List(list) = symbols.read()? { for symbol_text in list.iter() { - if let RawValueRef::String(text) = symbol_text?.read()? { - pending_lst.symbols.push(Some(text.to_owned())) + if let RawValueRef::String(str_ref) = symbol_text?.read()? { + pending_lst.symbols.push(Some(str_ref.text().to_owned())) } else { pending_lst.symbols.push(None) } diff --git a/src/lazy/text/as_utf8.rs b/src/lazy/text/as_utf8.rs index 9be4784c..69dfa46e 100644 --- a/src/lazy/text/as_utf8.rs +++ b/src/lazy/text/as_utf8.rs @@ -12,6 +12,12 @@ pub(crate) trait AsUtf8 { fn as_utf8(&self, position: impl Into) -> IonResult<&str>; } +impl AsUtf8 for [u8] { + fn as_utf8(&self, position: impl Into) -> IonResult<&str> { + bytes_as_utf8(self, position) + } +} + impl AsUtf8 for SmallVec<[u8; N]> { fn as_utf8(&self, position: impl Into) -> IonResult<&str> { bytes_as_utf8(self.as_ref(), position) diff --git a/src/lazy/text/buffer.rs b/src/lazy/text/buffer.rs index c74be95c..d7e0bc64 100644 --- a/src/lazy/text/buffer.rs +++ b/src/lazy/text/buffer.rs @@ -1,11 +1,8 @@ -use crate::lazy::encoding::TextEncoding; -use crate::lazy::raw_stream_item::RawStreamItem; -use crate::lazy::text::encoded_value::EncodedTextValue; -use crate::lazy::text::matched::{MatchedFloat, MatchedInt, MatchedValue}; -use crate::lazy::text::parse_result::IonParseError; -use crate::lazy::text::parse_result::{IonMatchResult, IonParseResult}; -use crate::lazy::text::value::LazyRawTextValue; -use crate::{IonResult, IonType}; +use std::fmt::{Debug, Formatter}; +use 
std::iter::{Copied, Enumerate}; +use std::ops::{RangeFrom, RangeTo}; +use std::slice::Iter; + use nom::branch::alt; use nom::bytes::streaming::{is_a, is_not, tag, take_until, take_while1}; use nom::character::streaming::{char, digit1, one_of}; @@ -14,10 +11,18 @@ use nom::error::{ErrorKind, ParseError}; use nom::multi::many0_count; use nom::sequence::{delimited, pair, preceded, separated_pair, terminated, tuple}; use nom::{CompareResult, IResult, InputLength, InputTake, Needed, Parser}; -use std::fmt::{Debug, Formatter}; -use std::iter::{Copied, Enumerate}; -use std::ops::{RangeFrom, RangeTo}; -use std::slice::Iter; + +use crate::lazy::encoding::TextEncoding; +use crate::lazy::raw_stream_item::RawStreamItem; +use crate::lazy::text::encoded_value::EncodedTextValue; +use crate::lazy::text::matched::{ + MatchedFloat, MatchedInt, MatchedShortString, MatchedString, MatchedValue, +}; +use crate::lazy::text::parse_result::IonParseError; +use crate::lazy::text::parse_result::{IonMatchResult, IonParseResult}; +use crate::lazy::text::value::LazyRawTextValue; +use crate::result::DecodingError; +use crate::{IonError, IonResult, IonType}; impl<'a> Debug for TextBufferView<'a> { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { @@ -137,6 +142,19 @@ impl<'data> TextBufferView<'data> { self.data.is_empty() } + /// Attempts to view the contents of the buffer as a UTF-8 `&str`. + pub fn as_text<'a>(&'a self) -> IonResult<&'data str> { + // On its surface, this method very closely resembles the `AsUtf8` trait's method. + // However, this one returns a `&'data str` instead of a `&'a str`, which is to say + // that the string that's returned lives as long as the data itself, not just the duration + // of the lifetime introduced by this method call. 
+ std::str::from_utf8(&self.data).map_err(move |_| { + let decoding_error = + DecodingError::new("encountered invalid UTF-8").with_position(self.offset()); + IonError::Decoding(decoding_error) + }) + } + pub fn match_whitespace(self) -> IonMatchResult<'data> { is_a(WHITESPACE_CHARACTERS_AS_STR)(self) } @@ -247,6 +265,16 @@ impl<'data> TextBufferView<'data> { EncodedTextValue::new(MatchedValue::Float(matched_float), self.offset(), length) }, ), + map( + match_and_length(Self::match_string), + |(matched_string, length)| { + EncodedTextValue::new( + MatchedValue::String(matched_string), + self.offset(), + length, + ) + }, + ), // TODO: The other Ion types )) .map(|encoded_value| LazyRawTextValue { @@ -532,6 +560,49 @@ impl<'data> TextBufferView<'data> { pub fn match_any_sign(self) -> IonMatchResult<'data> { alt((tag("+"), tag("-")))(self) } + + /// Matches short- or long-form string. + fn match_string(self) -> IonParseResult<'data, MatchedString> { + alt((Self::match_short_string, Self::match_long_string))(self) + } + + /// Matches a short string. For example: `"foo"` + fn match_short_string(self) -> IonParseResult<'data, MatchedString> { + delimited(char('"'), Self::match_short_string_body, char('"')) + .map(|(_matched, contains_escaped_chars)| { + MatchedString::Short(MatchedShortString::new(contains_escaped_chars)) + }) + .parse(self) + } + + /// Returns a matched buffer and a boolean indicating whether any escaped characters were + /// found in the short string. + fn match_short_string_body(self) -> IonParseResult<'data, (Self, bool)> { + let mut is_escaped = false; + let mut contains_escaped_chars = false; + for (index, byte) in self.bytes().iter().enumerate() { + if is_escaped { + // If we're escaped, the previous byte was a \ and we ignore this one. 
+ is_escaped = false; + continue; + } + if *byte == b'\\' { + is_escaped = true; + contains_escaped_chars = true; + continue; + } + if *byte == b'\"' { + let matched = self.slice(0, index); + let remaining = self.slice_to_end(index); + return Ok((remaining, (matched, contains_escaped_chars))); + } + } + Err(nom::Err::Incomplete(Needed::Unknown)) + } + + fn match_long_string(self) -> IonParseResult<'data, MatchedString> { + todo!() + } } // === nom trait implementations === @@ -934,4 +1005,43 @@ mod tests { mismatch_float(input); } } + + #[test] + fn test_match_string() { + fn match_string(input: &str) { + MatchTest::new(input).expect_match(match_length(TextBufferView::match_string)); + } + fn mismatch_string(input: &str) { + MatchTest::new(input).expect_mismatch(match_length(TextBufferView::match_string)); + } + + let good_inputs = &[ + r#" + "hello" + "#, + r#" + "😀😀😀" + "#, + r#" + "this has an escaped quote \" right in the middle" + "#, + ]; + for input in good_inputs { + match_string(input); + } + + let bad_inputs = &[ + // Missing an opening quote + r#" + hello" + "#, + // Missing a trailing quote + r#" + "hello + "#, + ]; + for input in bad_inputs { + mismatch_string(input); + } + } } diff --git a/src/lazy/text/encoded_value.rs b/src/lazy/text/encoded_value.rs index 0c649d08..17779c2d 100644 --- a/src/lazy/text/encoded_value.rs +++ b/src/lazy/text/encoded_value.rs @@ -8,7 +8,7 @@ use std::ops::Range; /// Each [`LazyRawTextValue`](crate::lazy::text::value::LazyRawTextValue) contains an `EncodedValue`, /// allowing a user to re-read (that is: parse) the body of the value as many times as necessary /// without re-parsing its header information each time. 
-#[derive(Clone, Copy, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq)] pub(crate) struct EncodedTextValue { // Each encoded text value has up to three components, appearing in the following order: // @@ -115,6 +115,7 @@ impl EncodedTextValue { MatchedValue::Bool(_) => IonType::Bool, MatchedValue::Int(_) => IonType::Int, MatchedValue::Float(_) => IonType::Float, + MatchedValue::String(_) => IonType::String, } } @@ -163,8 +164,8 @@ impl EncodedTextValue { self.data_length + u32::max(self.annotations_offset, self.field_name_offset) as usize } - pub fn matched(&self) -> MatchedValue { - self.matched_value + pub fn matched(&self) -> &MatchedValue { + &self.matched_value } } diff --git a/src/lazy/text/matched.rs b/src/lazy/text/matched.rs index 560d828e..7c21b3e2 100644 --- a/src/lazy/text/matched.rs +++ b/src/lazy/text/matched.rs @@ -19,26 +19,30 @@ //! use the previously recorded information to minimize the amount of information that needs to be //! re-discovered. +use nom::character::is_hex_digit; use std::num::IntErrorKind; +use std::ops::Range; use num_bigint::BigInt; use num_traits::Num; use smallvec::SmallVec; +use crate::lazy::str_ref::StrRef; use crate::lazy::text::as_utf8::AsUtf8; use crate::lazy::text::buffer::TextBufferView; use crate::lazy::text::parse_result::InvalidInputError; -use crate::result::IonFailure; +use crate::result::{DecodingError, IonFailure}; use crate::{Int, IonError, IonResult, IonType}; /// A partially parsed Ion value. -#[derive(Copy, Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq)] pub(crate) enum MatchedValue { // `Null` and `Bool` are fully parsed because they only involve matching a keyword. Null(IonType), Bool(bool), Int(MatchedInt), Float(MatchedFloat), + String(MatchedString), // TODO: ...the other types } @@ -154,3 +158,229 @@ impl MatchedFloat { Ok(float) } } + +#[derive(Clone, Debug, PartialEq)] +pub(crate) enum MatchedString { + /// The string only has one segment. (e.g. 
"foo") + Short(MatchedShortString), + /// The string is in multiple segments: + /// """hello,""" + /// """ world!""" + Long(MatchedLongString), +} + +#[derive(Clone, Debug, PartialEq)] +pub(crate) struct MatchedLongString { + // Keep a list of all the string segment ranges we found. + // If the user asks to read the string, we'll collate the segments into a single string. + slices: Vec>, +} + +#[derive(Clone, Copy, Debug, PartialEq)] +pub(crate) struct MatchedShortString { + contains_escaped_chars: bool, +} + +impl MatchedShortString { + pub fn new(contains_escaped_chars: bool) -> Self { + Self { + contains_escaped_chars, + } + } + pub fn contains_escaped_chars(&self) -> bool { + self.contains_escaped_chars + } +} + +impl MatchedString { + // Strings longer than 64 bytes will allocate a larger space on the heap. + const STACK_ALLOC_BUFFER_CAPACITY: usize = 64; + + pub fn read<'a, 'data>( + &'a self, + matched_input: TextBufferView<'data>, + ) -> IonResult> { + match self { + MatchedString::Short(short) => self.read_short_string(*short, matched_input), + MatchedString::Long(_) => todo!("long-form strings"), + } + } + + fn read_short_string<'a, 'data>( + &'a self, + short: MatchedShortString, + matched_input: TextBufferView<'data>, + ) -> IonResult> { + // Take a slice of the input that ignores the first and last bytes, which are quotes. + let body = matched_input.slice(1, matched_input.len() - 2); + if !short.contains_escaped_chars() { + // There are no escaped characters, so we can just validate the string in-place. + let text = body.as_text()?; + let str_ref = StrRef::from(text); + return Ok(str_ref); + } + // Otherwise, there are escaped characters. We need to build a new version of our string + // that replaces the escaped characters with their corresponding bytes. 
+ let mut sanitized = Vec::with_capacity(matched_input.len()); + + Self::escape_short_string(body, &mut sanitized)?; + let text = String::from_utf8(sanitized).unwrap(); + Ok(StrRef::from(text.to_string())) + } + + fn escape_short_string( + matched_input: TextBufferView, + sanitized: &mut Vec, + ) -> IonResult<()> { + let mut remaining = matched_input; + while !remaining.is_empty() { + let next_escape = remaining.bytes().iter().position(|byte| *byte == b'\\'); + remaining = if let Some(escape_offset) = next_escape { + // Everything up to the '\' is already clean. Write that slice to 'sanitized'. + let already_clean = remaining.slice(0, escape_offset); + sanitized.extend_from_slice(already_clean.bytes()); + // Everything starting from the '\' needs to be evaluated. + let contains_escapes = remaining.slice_to_end(escape_offset); + Self::write_escaped(contains_escapes, sanitized)? + } else { + sanitized.extend_from_slice(remaining.bytes()); + // 'remaining' is now empty + remaining.slice_to_end(remaining.len()) + }; + } + + Ok(()) + } + + fn write_escaped<'data>( + input: TextBufferView<'data>, + sanitized: &mut Vec, + ) -> IonResult> { + // Note that by the time this method has been called, the parser has already confirmed that + // there is an appropriate closing delimiter. Thus, if any of the branches below run out of + // data, it means that it's a fatal error and not just an Incomplete. + debug_assert!(!input.is_empty()); + debug_assert!(input.bytes()[0] == b'\\'); + if input.len() == 1 { + return Err(IonError::Decoding( + DecodingError::new("found an escape ('\\') with no subsequent character") + .with_position(input.offset()), + )); + } + let input_after_escape = input.slice_to_end(2); // After (e.g.) '\x' + let escape_id = input.bytes()[1]; + let substitute = match escape_id { + b'n' => b'\n', + b'r' => b'\r', + b't' => b'\t', + b'\\' => b'\\', + b'/' => b'/', + b'"' => b'"', + b'\'' => b'\'', + b'?' 
=> b'?',
+            b'0' => 0x00u8, // NUL
+            b'a' => 0x07u8, // alert BEL
+            b'b' => 0x08u8, // backspace
+            b'v' => 0x0Bu8, // vertical tab
+            b'f' => 0x0Cu8, // form feed
+            // If the byte following the '\' is a real newline (that is: 0x0A), we discard it.
+            b'\n' => return Ok(input_after_escape),
+            // These cases require more sophisticated parsing, not just a 1-to-1 mapping of bytes
+            b'x' => return Self::hex_digits_code_point(2, input_after_escape, sanitized),
+            b'u' => return Self::hex_digits_code_point(4, input_after_escape, sanitized),
+            b'U' => return Self::hex_digits_code_point(8, input_after_escape, sanitized),
+            _ => {
+                return Err(IonError::Decoding(
+                    DecodingError::new(format!("invalid escape sequence '\\{}'", escape_id))
+                        .with_position(input.offset()),
+                ))
+            }
+        };
+
+        sanitized.push(substitute);
+        Ok(input_after_escape)
+    }
+
+    fn hex_digits_code_point<'a, 'data>(
+        num_digits: usize,
+        input: TextBufferView<'data>,
+        sanitized: &'a mut Vec<u8>,
+    ) -> IonResult<TextBufferView<'data>> {
+        if input.len() < num_digits {
+            return Err(IonError::Decoding(
+                DecodingError::new(format!(
+                    "found a {}-hex-digit escape sequence with only {} digits",
+                    num_digits,
+                    input.len()
+                ))
+                .with_position(input.offset()),
+            ));
+        }
+
+        let hex_digit_bytes = &input.bytes()[..num_digits];
+
+        let all_are_hex_digits = hex_digit_bytes
+            .iter()
+            .take(num_digits)
+            .copied()
+            .all(is_hex_digit);
+        if !all_are_hex_digits {
+            return Err(IonError::Decoding(
+                DecodingError::new(format!(
+                    "found a {}-hex-digit escape sequence that contained an invalid hex digit",
+                    num_digits,
+                ))
+                .with_position(input.offset()),
+            ));
+        }
+        // We just confirmed all of the digits are ASCII hex digits, so these steps cannot fail.
+        let hex_digits = std::str::from_utf8(hex_digit_bytes).unwrap();
+        let code_point = u32::from_str_radix(hex_digits, 16).unwrap();
+
+        // Check to see if this is a high surrogate; if it is, our code point isn't complete.
Another + // unicode escape representing the low surrogate has to be next in the input to complete it. + // See the docs for this helper function for details. (Note: this will only ever be true for + // 4- and 8-digit escape sequences. `\x` escapes don't have enough digits to represent a + // high surrogate.) + if code_point_is_a_high_surrogate(code_point) { + todo!("support surrogate pairs") + } + + // A Rust `char` can represent any Unicode scalar value--a code point that is not part of a + // surrogate pair. If the value we found isn't a high surrogate, then it's a complete scalar + // value. We can safely convert it to a `char`. + let character = char::from_u32(code_point).unwrap(); + let utf8_buffer: &mut [u8; 4] = &mut [0; 4]; + let utf8_encoded = character.encode_utf8(utf8_buffer); + sanitized.extend_from_slice(utf8_encoded.as_bytes()); + + // Skip beyond the digits we just processed + Ok(input.slice_to_end(num_digits)) + } +} + +/// Returns `true` if the provided code point is a utf-16 high surrogate. +/// +/// Terse primer: Unicode text is made up of a stream of unsigned integers called 'code points'. +/// What a person might think of as a 'character' (for example: 'a', '本', or '🥸') can be made up +/// of one or more code points. +/// +/// A single code point can require up to 21 bits. Depending on which Unicode encoding you're using, +/// these 21 bits can come with different amounts of additional overhead bits: +/// * In utf-8, a code point can be 1, 2, 3, or 4 bytes, with some bits in each byte being used +/// for the code point and others being used to indicate whether more bytes are coming. +/// * In utf-16, a code point can be 2 bytes or 4 bytes. If it's four bytes, the first two bytes will +/// be a 'high surrogate' (a value between 0xD800 and 0xDFFF) to communicate that another two +/// bytes are coming to complete the code point. +/// * In utf-32, a code point is always 32 bits. This is a bit wasteful, but makes for simple +/// processing. 
+/// +/// This helper function detects high surrogates (which are only used in utf-16) so the parser +/// can know to require a second one immediately following. +/// +/// Further reading: +/// * +/// * +fn code_point_is_a_high_surrogate(value: u32) -> bool { + (0xD800..=0xDFFF).contains(&value) +} diff --git a/src/lazy/text/parse_result.rs b/src/lazy/text/parse_result.rs index 6dfb919b..5def24ca 100644 --- a/src/lazy/text/parse_result.rs +++ b/src/lazy/text/parse_result.rs @@ -68,10 +68,6 @@ pub struct InvalidInputError<'data> { label: Option>, // The nature of the error--what went wrong? description: Option>, - // A backtrace of errors that occurred leading to this one. - // XXX: This is the most expensive part of error handling and is likely not very useful. - // Consider removing it if it doesn't carry its weight. - backtrace: Vec>, // The nom ErrorKind, which indicates which nom-provided parser encountered the error we're // bubbling up. nom_error_kind: Option, @@ -85,7 +81,6 @@ impl<'data> InvalidInputError<'data> { label: None, description: None, nom_error_kind: None, - backtrace: Vec::new(), } } @@ -107,10 +102,6 @@ impl<'data> InvalidInputError<'data> { self } - pub(crate) fn append_error(&mut self, error: InvalidInputError<'data>) { - self.backtrace.push(error) - } - /// Returns a reference to the `description` text, if any. 
pub fn description(&self) -> Option<&str> { self.description.as_deref() @@ -155,6 +146,19 @@ impl<'data> From> for IonError { message.push_str(" while "); message.push_str(label.as_ref()); } + message.push_str("; buffer: "); + let input = invalid_input_error.input; + let buffer_text = if let Ok(text) = invalid_input_error.input.as_text() { + // TODO: This really should be graphemes instead of chars() + text.chars().take(32).collect::() + } else { + format!( + "{:X?}", + &invalid_input_error.input.bytes()[..(32.min(input.len()))] + ) + }; + message.push_str(buffer_text.as_str()); + message.push_str("..."); let position = Position::with_offset(invalid_input_error.input.offset()) .with_length(invalid_input_error.input.len()); let decoding_error = DecodingError::new(message).with_position(position); @@ -199,14 +203,10 @@ impl<'data> ParseError> for IonParseError<'data> { .into() } - fn append(input: TextBufferView<'data>, kind: ErrorKind, mut other: Self) -> Self { + fn append(_input: TextBufferView<'data>, _kind: ErrorKind, other: Self) -> Self { // When an error stack is being built, this method is called to give the error // type an opportunity to aggregate the errors into a collection or a more descriptive // message. For now, we simply allow the most recent error to take precedence. 
- let new_error = InvalidInputError::new(input).with_nom_error_kind(kind); - if let IonParseError::Invalid(invalid_input_error) = &mut other { - invalid_input_error.backtrace.push(new_error) - } other } } diff --git a/src/lazy/text/raw/reader.rs b/src/lazy/text/raw/reader.rs index 8b41a5e5..54777894 100644 --- a/src/lazy/text/raw/reader.rs +++ b/src/lazy/text/raw/reader.rs @@ -41,12 +41,17 @@ impl<'data> LazyRawTextReader<'data> { if buffer.is_empty() { return IonResult::incomplete("reading a top-level value", buffer.offset()); } - let (buffer_after_whitespace, _whitespace) = buffer - .match_optional_comments_and_whitespace() - .with_context( - "skipping comments and whitespace between top-level values", - buffer, - )?; + + let (buffer_after_whitespace, _whitespace) = + match buffer.match_optional_comments_and_whitespace() { + Ok((buf, ws)) => (buf, ws), + Err(nom::Err::Incomplete(_)) => return Ok(RawStreamItem::EndOfStream), + Err(e) => return IonResult::decoding_error(format!("broken: {:?}", e)), + }; + + if buffer_after_whitespace.is_empty() { + return Ok(RawStreamItem::EndOfStream); + } let (remaining, matched) = buffer_after_whitespace .match_top_level() .with_context("reading a top-level value", buffer_after_whitespace)?; @@ -70,6 +75,7 @@ impl<'data> LazyRawReader<'data, TextEncoding> for LazyRawTextReader<'data> { mod tests { use super::*; use crate::lazy::decoder::LazyRawValue; + use crate::lazy::raw_value_ref::RawValueRef; use crate::IonType; #[test] @@ -82,113 +88,120 @@ mod tests { */ // Typed nulls - null - null.bool - null.int + + null + null.bool + null.int // Booleans - false - true + + false + true // Integers - 500 - 0x20 - 0b0101 + + 500 + 0x20 + 0b0101 // Floats - +inf - -inf - nan - 3.6e0 - 2.5e23 - -318e-2 + + +inf + -inf + nan + 3.6e0 + 2.5e23 + -318e-2 + + // Strings + + "Hello!" + "foo bar baz" + "😎😎😎" + "lol\n\r\0wat" // Single-character escapes + "\x48ello, \x77orld!" // \x 2-digit hex escape + "\u0048ello, \u0077orld!" 
// \u 4-digit hex escape + "\U00000048ello, \U00000077orld!" // \U 8-digit hex escape + "#; - let mut reader = LazyRawTextReader::new(data.as_bytes()); - // null - let lazy_untyped_null = reader.next()?.expect_value()?; - assert!(lazy_untyped_null.is_null()); - assert_eq!(lazy_untyped_null.ion_type(), IonType::Null); + // Make a mutable string so we can append some things that require Rust-level escapes + let mut data = String::from(data); + // Escaped newlines are discarded + data.push_str("\"Hello,\\\n world!\""); + + fn expect_next<'a, 'data>( + reader: &'a mut LazyRawTextReader<'data>, + expected: RawValueRef<'data, TextEncoding>, + ) { + let lazy_value = reader + .next() + .expect("advancing the reader failed") + .expect_value() + .expect("expected a value"); + assert_eq!( + matches!(expected, RawValueRef::Null(_)), + lazy_value.is_null() + ); + let value_ref = lazy_value.read().expect("reading failed"); + assert_eq!(value_ref, expected, "{:?} != {:?}", value_ref, expected); + } - // null.bool - let lazy_null_bool = reader.next()?.expect_value()?; - assert!(lazy_null_bool.is_null()); - assert_eq!(lazy_null_bool.ion_type(), IonType::Bool); + let reader = &mut LazyRawTextReader::new(data.as_bytes()); + // null + expect_next(reader, RawValueRef::Null(IonType::Null)); + // null.bool + expect_next(reader, RawValueRef::Null(IonType::Bool)); // null.int - let lazy_null_int = reader.next()?.expect_value()?; - assert!(lazy_null_int.is_null()); - assert_eq!(lazy_null_int.ion_type(), IonType::Int); + expect_next(reader, RawValueRef::Null(IonType::Int)); // false - let lazy_bool_false = reader.next()?.expect_value()?; - assert!(!lazy_bool_false.is_null()); - assert_eq!(lazy_bool_false.ion_type(), IonType::Bool); - assert!(!lazy_bool_false.read()?.expect_bool()?); - + expect_next(reader, RawValueRef::Bool(false)); // true - let lazy_bool_true = reader.next()?.expect_value()?; - assert!(!lazy_bool_true.is_null()); - assert_eq!(lazy_bool_true.ion_type(), IonType::Bool); - 
assert!(lazy_bool_true.read()?.expect_bool()?); + expect_next(reader, RawValueRef::Bool(true)); // 500 - let lazy_int_decimal_500 = reader.next()?.expect_value()?; - assert!(!lazy_int_decimal_500.is_null()); - assert_eq!(lazy_int_decimal_500.ion_type(), IonType::Int); - assert_eq!(lazy_int_decimal_500.read()?.expect_i64()?, 500); - + expect_next(reader, RawValueRef::Int(500.into())); // 0x20 - let lazy_int_hex_20 = reader.next()?.expect_value()?; - assert!(!lazy_int_hex_20.is_null()); - assert_eq!(lazy_int_hex_20.ion_type(), IonType::Int); - assert_eq!(lazy_int_hex_20.read()?.expect_i64()?, 0x20); // decimal 32 - + expect_next(reader, RawValueRef::Int(0x20.into())); // 0b0101 - let lazy_int_binary_0101 = reader.next()?.expect_value()?; - assert!(!lazy_int_binary_0101.is_null()); - assert_eq!(lazy_int_binary_0101.ion_type(), IonType::Int); - assert_eq!(lazy_int_binary_0101.read()?.expect_i64()?, 0b0101); // decimal 5 + expect_next(reader, RawValueRef::Int(0b0101.into())); // +inf - let lazy_float_pos_inf = reader.next()?.expect_value()?; - assert!(!lazy_float_pos_inf.is_null()); - assert_eq!(lazy_float_pos_inf.ion_type(), IonType::Float); - assert_eq!(lazy_float_pos_inf.read()?.expect_float()?, f64::INFINITY); - + expect_next(reader, RawValueRef::Float(f64::INFINITY)); // -inf - let lazy_float_neg_inf = reader.next()?.expect_value()?; - assert!(!lazy_float_neg_inf.is_null()); - assert_eq!(lazy_float_neg_inf.ion_type(), IonType::Float); - assert_eq!( - lazy_float_neg_inf.read()?.expect_float()?, - f64::NEG_INFINITY - ); - + expect_next(reader, RawValueRef::Float(f64::NEG_INFINITY)); // nan - let lazy_float_neg_inf = reader.next()?.expect_value()?; - assert!(!lazy_float_neg_inf.is_null()); - assert_eq!(lazy_float_neg_inf.ion_type(), IonType::Float); - assert!(lazy_float_neg_inf.read()?.expect_float()?.is_nan()); - + // NaN != NaN, so we have to spell this test out a bit more + assert!(reader + .next()? + .expect_value()? + .read()? + .expect_float()? 
+ .is_nan()); // 3.6e0 - let lazy_float = reader.next()?.expect_value()?; - assert!(!lazy_float.is_null()); - assert_eq!(lazy_float.ion_type(), IonType::Float); - assert_eq!(lazy_float.read()?.expect_float()?, 3.6f64); - + expect_next(reader, RawValueRef::Float(3.6f64)); // 2.25e23 - let lazy_float = reader.next()?.expect_value()?; - assert!(!lazy_float.is_null()); - assert_eq!(lazy_float.ion_type(), IonType::Float); - assert_eq!(lazy_float.read()?.expect_float()?, 2.5f64 * 10f64.powi(23)); - - // -3.14 - let lazy_float = reader.next()?.expect_value()?; - assert!(!lazy_float.is_null()); - assert_eq!(lazy_float.ion_type(), IonType::Float); - assert_eq!(lazy_float.read()?.expect_float()?, -3.18); - + expect_next(reader, RawValueRef::Float(2.5f64 * 10f64.powi(23))); + // -3.18 + expect_next(reader, RawValueRef::Float(-3.18f64)); + // "Hello" + expect_next(reader, RawValueRef::String("Hello!".into())); + // "foo bar baz" + expect_next(reader, RawValueRef::String("foo bar baz".into())); + // "😎😎😎" + expect_next(reader, RawValueRef::String("😎😎😎".into())); + // "lol\n\r\0wat" + expect_next(reader, RawValueRef::String("lol\n\r\0wat".into())); + // "\x48ello, \x77orld!" + expect_next(reader, RawValueRef::String("Hello, world!".into())); + // "\u0048ello, \u0077orld!" + expect_next(reader, RawValueRef::String("Hello, world!".into())); + // "\U00000048ello, \U00000077orld!" 
+ expect_next(reader, RawValueRef::String("Hello, world!".into())); + // "\"Hello,\\\n world!\" " + expect_next(reader, RawValueRef::String("Hello, world!".into())); Ok(()) } } diff --git a/src/lazy/text/value.rs b/src/lazy/text/value.rs index 1842ddd5..f888b63a 100644 --- a/src/lazy/text/value.rs +++ b/src/lazy/text/value.rs @@ -47,10 +47,13 @@ impl<'data> LazyRawValue<'data, TextEncoding> for LazyRawTextValue<'data> { fn read(&self) -> IonResult> { let matched_input = self.input.slice(0, self.encoded_value.data_length()); let value_ref = match self.encoded_value.matched() { - MatchedValue::Null(ion_type) => RawValueRef::Null(ion_type), - MatchedValue::Bool(b) => RawValueRef::Bool(b), + MatchedValue::Null(ion_type) => RawValueRef::Null(*ion_type), + MatchedValue::Bool(b) => RawValueRef::Bool(*b), MatchedValue::Int(i) => RawValueRef::Int(i.read(matched_input)?), MatchedValue::Float(f) => RawValueRef::Float(f.read(matched_input)?), + // ...decimal, timestamp... + MatchedValue::String(s) => RawValueRef::String(s.read(matched_input)?), + // ...and the rest! 
}; Ok(value_ref) } diff --git a/src/lazy/value_ref.rs b/src/lazy/value_ref.rs index 54aa9272..0b4fb739 100644 --- a/src/lazy/value_ref.rs +++ b/src/lazy/value_ref.rs @@ -2,6 +2,7 @@ use crate::element::Value; use crate::lazy::decoder::LazyDecoder; use crate::lazy::r#struct::LazyStruct; use crate::lazy::sequence::LazySequence; +use crate::lazy::str_ref::StrRef; use crate::result::IonFailure; use crate::{Decimal, Int, IonError, IonResult, IonType, SymbolRef, Timestamp}; use std::fmt::{Debug, Formatter}; @@ -20,7 +21,7 @@ pub enum ValueRef<'top, 'data, D: LazyDecoder<'data>> { Float(f64), Decimal(Decimal), Timestamp(Timestamp), - String(&'data str), + String(StrRef<'data>), Symbol(SymbolRef<'top>), Blob(&'data [u8]), Clob(&'data [u8]), @@ -152,7 +153,7 @@ impl<'top, 'data, D: LazyDecoder<'data>> ValueRef<'top, 'data, D> { } } - pub fn expect_string(self) -> IonResult<&'data str> { + pub fn expect_string(self) -> IonResult> { if let ValueRef::String(s) = self { Ok(s) } else { @@ -286,7 +287,7 @@ mod tests { )?; let mut reader = LazyBinaryReader::new(&ion_data)?; let first_value = reader.expect_next()?.read()?; - assert_ne!(first_value, ValueRef::String("it's not a string")); + assert_ne!(first_value, ValueRef::String("it's not a string".into())); assert_eq!(first_value, ValueRef::Null(IonType::Null)); assert_eq!(reader.expect_next()?.read()?, ValueRef::Bool(true)); assert_eq!(reader.expect_next()?.read()?, ValueRef::Int(1.into())); @@ -303,7 +304,10 @@ mod tests { reader.expect_next()?.read()?, ValueRef::Symbol(SymbolRef::from("foo")) ); - assert_eq!(reader.expect_next()?.read()?, ValueRef::String("hello")); + assert_eq!( + reader.expect_next()?.read()?, + ValueRef::String("hello".into()) + ); assert_eq!( reader.expect_next()?.read()?, ValueRef::Blob(&[0x06, 0x5A, 0x1B]) // Base64-decoded "Blob" From 357ca8f9f8cc83dbe740a478afed6376b5e717b2 Mon Sep 17 00:00:00 2001 From: Zack Slayton Date: Fri, 28 Jul 2023 17:42:18 -0400 Subject: [PATCH 07/15] clippy fixes --- 
src/lazy/text/buffer.rs | 2 +- src/lazy/text/matched.rs | 13 +++++-------- src/lazy/text/raw/reader.rs | 4 ++-- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/src/lazy/text/buffer.rs b/src/lazy/text/buffer.rs index d7e0bc64..3ceb5de6 100644 --- a/src/lazy/text/buffer.rs +++ b/src/lazy/text/buffer.rs @@ -148,7 +148,7 @@ impl<'data> TextBufferView<'data> { // However, this one returns a `&'data str` instead of a `&'a str`, which is to say // that the string that's returned lives as long as the data itself, not just the duration // of the lifetime introduced by this method call. - std::str::from_utf8(&self.data).map_err(move |_| { + std::str::from_utf8(self.data).map_err(move |_| { let decoding_error = DecodingError::new("encountered invalid UTF-8").with_position(self.offset()); IonError::Decoding(decoding_error) diff --git a/src/lazy/text/matched.rs b/src/lazy/text/matched.rs index 7c21b3e2..38a1f6ac 100644 --- a/src/lazy/text/matched.rs +++ b/src/lazy/text/matched.rs @@ -196,18 +196,15 @@ impl MatchedString { // Strings longer than 64 bytes will allocate a larger space on the heap. 
const STACK_ALLOC_BUFFER_CAPACITY: usize = 64; - pub fn read<'a, 'data>( - &'a self, - matched_input: TextBufferView<'data>, - ) -> IonResult> { + pub fn read<'data>(&self, matched_input: TextBufferView<'data>) -> IonResult> { match self { MatchedString::Short(short) => self.read_short_string(*short, matched_input), MatchedString::Long(_) => todo!("long-form strings"), } } - fn read_short_string<'a, 'data>( - &'a self, + fn read_short_string<'data>( + &self, short: MatchedShortString, matched_input: TextBufferView<'data>, ) -> IonResult> { @@ -301,10 +298,10 @@ impl MatchedString { Ok(input_after_escape) } - fn hex_digits_code_point<'a, 'data>( + fn hex_digits_code_point<'data>( num_digits: usize, input: TextBufferView<'data>, - sanitized: &'a mut Vec, + sanitized: &mut Vec, ) -> IonResult> { if input.len() < num_digits { return Err(IonError::Decoding( diff --git a/src/lazy/text/raw/reader.rs b/src/lazy/text/raw/reader.rs index 54777894..20a1fa52 100644 --- a/src/lazy/text/raw/reader.rs +++ b/src/lazy/text/raw/reader.rs @@ -130,8 +130,8 @@ mod tests { // Escaped newlines are discarded data.push_str("\"Hello,\\\n world!\""); - fn expect_next<'a, 'data>( - reader: &'a mut LazyRawTextReader<'data>, + fn expect_next<'data>( + reader: &mut LazyRawTextReader<'data>, expected: RawValueRef<'data, TextEncoding>, ) { let lazy_value = reader From 716ff343f294ff573fa7af36208174804d83c204 Mon Sep 17 00:00:00 2001 From: Zack Slayton Date: Fri, 28 Jul 2023 20:19:31 -0400 Subject: [PATCH 08/15] Fix a couple of unit tests --- src/lazy/struct.rs | 8 ++++---- src/lazy/text/buffer.rs | 10 +++++++--- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/lazy/struct.rs b/src/lazy/struct.rs index 2251b949..4956728f 100644 --- a/src/lazy/struct.rs +++ b/src/lazy/struct.rs @@ -147,7 +147,7 @@ impl<'top, 'data, D: LazyDecoder<'data>> LazyStruct<'top, 'data, D> { ///# use ion_rs::IonResult; ///# fn main() -> IonResult<()> { /// use ion_rs::{Element, IonType}; - /// use 
ion_rs::lazy::reader::LazyBinaryReader;; + /// use ion_rs::lazy::reader::LazyBinaryReader; /// use ion_rs::lazy::value_ref::ValueRef; /// /// let ion_data = r#"{foo: "hello", bar: null.list, baz: 3, bar: 4}"#; @@ -156,7 +156,7 @@ impl<'top, 'data, D: LazyDecoder<'data>> LazyStruct<'top, 'data, D> { /// /// let lazy_struct = reader.expect_next()?.read()?.expect_struct()?; /// - /// assert_eq!(lazy_struct.get("foo")?, Some(ValueRef::String("hello"))); + /// assert_eq!(lazy_struct.get("foo")?, Some(ValueRef::String("hello".into()))); /// assert_eq!(lazy_struct.get("baz")?, Some(ValueRef::Int(3.into()))); /// assert_eq!(lazy_struct.get("bar")?, Some(ValueRef::Null(IonType::List))); ///# Ok(()) @@ -175,7 +175,7 @@ impl<'top, 'data, D: LazyDecoder<'data>> LazyStruct<'top, 'data, D> { ///# use ion_rs::IonResult; ///# fn main() -> IonResult<()> { /// use ion_rs::Element; - /// use ion_rs::lazy::reader::LazyBinaryReader;; + /// use ion_rs::lazy::reader::LazyBinaryReader; /// use ion_rs::lazy::value_ref::ValueRef; /// /// let ion_data = r#"{foo: "hello", bar: null.list, baz: 3, bar: 4}"#; @@ -184,7 +184,7 @@ impl<'top, 'data, D: LazyDecoder<'data>> LazyStruct<'top, 'data, D> { /// /// let lazy_struct = reader.expect_next()?.read()?.expect_struct()?; /// - /// assert_eq!(lazy_struct.get_expected("foo")?, ValueRef::String("hello")); + /// assert_eq!(lazy_struct.get_expected("foo")?, ValueRef::String("hello".into())); /// assert!(dbg!(lazy_struct.get_expected("Ontario")).is_err()); ///# Ok(()) ///# } diff --git a/src/lazy/text/buffer.rs b/src/lazy/text/buffer.rs index 3ceb5de6..24a6ec74 100644 --- a/src/lazy/text/buffer.rs +++ b/src/lazy/text/buffer.rs @@ -6,7 +6,7 @@ use std::slice::Iter; use nom::branch::alt; use nom::bytes::streaming::{is_a, is_not, tag, take_until, take_while1}; use nom::character::streaming::{char, digit1, one_of}; -use nom::combinator::{map, opt, peek, recognize, success, value}; +use nom::combinator::{fail, map, opt, peek, recognize, success, value}; use 
nom::error::{ErrorKind, ParseError}; use nom::multi::many0_count; use nom::sequence::{delimited, pair, preceded, separated_pair, terminated, tuple}; @@ -601,7 +601,9 @@ impl<'data> TextBufferView<'data> { } fn match_long_string(self) -> IonParseResult<'data, MatchedString> { - todo!() + // TODO: implement long string matching + // The `fail` parser is a nom builtin that never matches. + fail(self) } } @@ -1015,6 +1017,8 @@ mod tests { MatchTest::new(input).expect_mismatch(match_length(TextBufferView::match_string)); } + // These inputs have leading/trailing whitespace to make them more readable, but the string + // matcher doesn't accept whitespace. We'll trim each one before testing it. let good_inputs = &[ r#" "hello" @@ -1027,7 +1031,7 @@ mod tests { "#, ]; for input in good_inputs { - match_string(input); + match_string(input.trim()); } let bad_inputs = &[ From e29fec571b6ba9c81cd6ef4aad76a2619f8e0f38 Mon Sep 17 00:00:00 2001 From: Zack Slayton Date: Fri, 28 Jul 2023 20:28:53 -0400 Subject: [PATCH 09/15] Less ambitious float eq comparison --- src/lazy/text/raw/reader.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lazy/text/raw/reader.rs b/src/lazy/text/raw/reader.rs index 20a1fa52..d5ec559f 100644 --- a/src/lazy/text/raw/reader.rs +++ b/src/lazy/text/raw/reader.rs @@ -110,7 +110,7 @@ mod tests { -inf nan 3.6e0 - 2.5e23 + 2.5e008 -318e-2 // Strings @@ -183,7 +183,7 @@ mod tests { // 3.6e0 expect_next(reader, RawValueRef::Float(3.6f64)); // 2.25e23 - expect_next(reader, RawValueRef::Float(2.5f64 * 10f64.powi(23))); + expect_next(reader, RawValueRef::Float(2.5f64 * 10f64.powi(8))); // -3.18 expect_next(reader, RawValueRef::Float(-3.18f64)); // "Hello" From 8f79a3681e0dc82937cfdbb8582533f6c5576c1f Mon Sep 17 00:00:00 2001 From: Zack Slayton Date: Tue, 1 Aug 2023 13:18:24 -0700 Subject: [PATCH 10/15] Adds LazyRawTextReader support for reading symbols --- src/binary/binary_writer.rs | 6 +- src/lazy/text/buffer.rs | 184 
+++++++++++++++++-- src/lazy/text/encoded_value.rs | 1 + src/lazy/text/matched.rs | 312 ++++++++++++++++++++------------- src/lazy/text/raw/reader.rs | 68 ++++++- src/lazy/text/value.rs | 1 + src/lazy/value.rs | 6 +- src/raw_symbol_token_ref.rs | 16 +- src/symbol_ref.rs | 46 +++-- src/text/raw_text_writer.rs | 5 +- src/text/text_formatter.rs | 5 +- src/text/text_writer.rs | 6 +- 12 files changed, 473 insertions(+), 183 deletions(-) diff --git a/src/binary/binary_writer.rs b/src/binary/binary_writer.rs index 305604b2..186f845b 100644 --- a/src/binary/binary_writer.rs +++ b/src/binary/binary_writer.rs @@ -128,7 +128,7 @@ impl IonWriter for BinaryWriter { panic!("Cannot set symbol ID ${symbol_id} as annotation. It is undefined."); } } - RawSymbolTokenRef::Text(text) => self.get_or_create_symbol_id(text), + RawSymbolTokenRef::Text(text) => self.get_or_create_symbol_id(text.as_ref()), }; self.raw_writer.add_annotation(symbol_id); } @@ -145,7 +145,7 @@ impl IonWriter for BinaryWriter { )); } } - RawSymbolTokenRef::Text(text) => self.get_or_create_symbol_id(text), + RawSymbolTokenRef::Text(text) => self.get_or_create_symbol_id(text.as_ref()), }; self.raw_writer.write_symbol(symbol_id) } @@ -159,7 +159,7 @@ impl IonWriter for BinaryWriter { panic!("Cannot set symbol ID ${symbol_id} as field name. 
It is undefined."); } } - RawSymbolTokenRef::Text(text) => self.get_or_create_symbol_id(text), + RawSymbolTokenRef::Text(text) => self.get_or_create_symbol_id(text.as_ref()), }; self.raw_writer.set_field_name(text); } diff --git a/src/lazy/text/buffer.rs b/src/lazy/text/buffer.rs index 24a6ec74..d8a7def6 100644 --- a/src/lazy/text/buffer.rs +++ b/src/lazy/text/buffer.rs @@ -5,8 +5,8 @@ use std::slice::Iter; use nom::branch::alt; use nom::bytes::streaming::{is_a, is_not, tag, take_until, take_while1}; -use nom::character::streaming::{char, digit1, one_of}; -use nom::combinator::{fail, map, opt, peek, recognize, success, value}; +use nom::character::streaming::{char, digit1, one_of, satisfy}; +use nom::combinator::{fail, map, not, opt, peek, recognize, success, value}; use nom::error::{ErrorKind, ParseError}; use nom::multi::many0_count; use nom::sequence::{delimited, pair, preceded, separated_pair, terminated, tuple}; @@ -16,9 +16,9 @@ use crate::lazy::encoding::TextEncoding; use crate::lazy::raw_stream_item::RawStreamItem; use crate::lazy::text::encoded_value::EncodedTextValue; use crate::lazy::text::matched::{ - MatchedFloat, MatchedInt, MatchedShortString, MatchedString, MatchedValue, + MatchedFloat, MatchedInt, MatchedShortString, MatchedString, MatchedSymbol, MatchedValue, }; -use crate::lazy::text::parse_result::IonParseError; +use crate::lazy::text::parse_result::{InvalidInputError, IonParseError}; use crate::lazy::text::parse_result::{IonMatchResult, IonParseResult}; use crate::lazy::text::value::LazyRawTextValue; use crate::result::DecodingError; @@ -275,6 +275,16 @@ impl<'data> TextBufferView<'data> { ) }, ), + map( + match_and_length(Self::match_symbol), + |(matched_symbol, length)| { + EncodedTextValue::new( + MatchedValue::Symbol(matched_symbol), + self.offset(), + length, + ) + }, + ), // TODO: The other Ion types )) .map(|encoded_value| LazyRawTextValue { @@ -578,6 +588,103 @@ impl<'data> TextBufferView<'data> { /// Returns a matched buffer and a 
boolean indicating whether any escaped characters were /// found in the short string. fn match_short_string_body(self) -> IonParseResult<'data, (Self, bool)> { + Self::match_text_until_unescaped(self, b'\"') + } + + fn match_long_string(self) -> IonParseResult<'data, MatchedString> { + // TODO: implement long string matching + // The `fail` parser is a nom builtin that never matches. + fail(self) + } + + fn match_symbol(self) -> IonParseResult<'data, MatchedSymbol> { + // TODO: identifiers + alt(( + Self::match_symbol_id, + Self::match_identifier, + Self::match_quoted_symbol, + ))(self) + } + + fn match_symbol_id(self) -> IonParseResult<'data, MatchedSymbol> { + recognize(terminated( + // Discard a `$` and parse an integer representing the symbol ID. + // Note that symbol ID integers: + // * CANNOT have underscores in them. For example: `$1_0` is considered an identifier. + // * CAN have leading zeros. There's precedent for this in ion-java. + preceded(tag("$"), digit1), + // Peek at the next character to make sure it's unrelated to the symbol ID. + // The spec does not offer a formal definition of what ends a symbol ID. + // This checks for either a stop_character (which performs its own `peek()`) + // or a colon (":"), which could be a field delimiter (":") or the beginning of + // an annotation delimiter ('::'). + alt(( + // Each of the parsers passed to `alt` must have the same return type. `stop_character` + // returns a char instead of a &str, so we use `recognize()` to get a &str instead. 
+ recognize(Self::peek_stop_character), + peek(tag(":")), // Field delimiter (":") or annotation delimiter ("::") + )), + )) + .map(|_matched| MatchedSymbol::SymbolId) + .parse(self) + } + + fn match_identifier(self) -> IonParseResult<'data, MatchedSymbol> { + let (remaining, identifier_text) = recognize(terminated( + pair( + Self::identifier_initial_character, + Self::identifier_trailing_characters, + ), + not(Self::identifier_trailing_character), + ))(self)?; + // Ion defines a number of keywords that are syntactically indistinguishable from + // identifiers. Keywords take precedence; we must ensure that any identifier we find + // is not actually a keyword. + const KEYWORDS: &[&str] = &["true", "false", "nan", "null"]; + // In many situations, this check will not be necessary. Another type's parser will + // recognize the keyword as its own. (For example, `parse_boolean` would match the input + // text `false`.) However, because symbols can appear in annotations and the check for + // annotations precedes the parsing for all other types, we need this extra verification. + if KEYWORDS + .iter() + .any(|k| k.as_bytes() == identifier_text.bytes()) + { + // Finding a keyword is not a fatal error, it just means that this parser doesn't match. + return Err(nom::Err::Error(IonParseError::Invalid( + InvalidInputError::new(self), + ))); + } + Ok((remaining, MatchedSymbol::Identifier)) + } + + /// Matches any character that can appear at the start of an identifier. + fn identifier_initial_character(self) -> IonParseResult<'data, Self> { + recognize(alt((one_of("$_"), satisfy(|c| c.is_ascii_alphabetic()))))(self) + } + + /// Matches any character that is legal in an identifier, though not necessarily at the beginning. + fn identifier_trailing_character(self) -> IonParseResult<'data, Self> { + recognize(alt((one_of("$_"), satisfy(|c| c.is_ascii_alphanumeric()))))(self) + } + + /// Matches characters that are legal in an identifier, though not necessarily at the beginning. 
+ fn identifier_trailing_characters(self) -> IonParseResult<'data, Self> { + recognize(many0_count(Self::identifier_trailing_character))(self) + } + + fn match_quoted_symbol(self) -> IonParseResult<'data, MatchedSymbol> { + delimited(char('\''), Self::match_quoted_symbol_body, char('\'')) + .map(|(_matched, contains_escaped_chars)| MatchedSymbol::Quoted(contains_escaped_chars)) + .parse(self) + } + + /// Returns a matched buffer and a boolean indicating whether any escaped characters were + /// found in the short string. + fn match_quoted_symbol_body(self) -> IonParseResult<'data, (Self, bool)> { + Self::match_text_until_unescaped(self, b'\'') + } + + fn match_text_until_unescaped(self, delimiter: u8) -> IonParseResult<'data, (Self, bool)> { let mut is_escaped = false; let mut contains_escaped_chars = false; for (index, byte) in self.bytes().iter().enumerate() { @@ -591,7 +698,7 @@ impl<'data> TextBufferView<'data> { contains_escaped_chars = true; continue; } - if *byte == b'\"' { + if *byte == delimiter { let matched = self.slice(0, index); let remaining = self.slice_to_end(index); return Ok((remaining, (matched, contains_escaped_chars))); @@ -599,12 +706,6 @@ impl<'data> TextBufferView<'data> { } Err(nom::Err::Incomplete(Needed::Unknown)) } - - fn match_long_string(self) -> IonParseResult<'data, MatchedString> { - // TODO: implement long string matching - // The `fail` parser is a nom builtin that never matches. - fail(self) - } } // === nom trait implementations === @@ -840,13 +941,17 @@ mod tests { P: Parser, O, IonParseError<'data>>, { let result = self.try_match(parser); - // We expect this to fail for one reason or another - assert!( - result.is_err(), - "Expected a parse failure for input: {:?}\nResult: {:?}", - self.input, - result - ); + // We expect that only part of the input will match or that the entire + // input will be rejected outright. 
+ if let Ok((_remaining, match_length)) = result { + assert_ne!( + match_length, + self.input.len() - 1, + "parser unexpectedly matched the complete input: '{:?}\nResult: {:?}", + self.input, + result + ); + } } } @@ -1039,13 +1144,54 @@ mod tests { r#" hello" "#, - // Missing a trailing quote + // Missing a closing quote r#" "hello "#, + // Closing quote is escaped + r#" + "hello\" + "#, ]; for input in bad_inputs { mismatch_string(input); } } + + #[test] + fn test_match_symbol() { + fn match_symbol(input: &str) { + MatchTest::new(input).expect_match(match_length(TextBufferView::match_symbol)); + } + fn mismatch_symbol(input: &str) { + MatchTest::new(input).expect_mismatch(match_length(TextBufferView::match_symbol)); + } + + // These inputs have leading/trailing whitespace to make them more readable, but the string + // matcher doesn't accept whitespace. We'll trim each one before testing it. + let good_inputs = &[ + "'hello'", + "'😀😀😀'", + "'this has an escaped quote \\' right in the middle'", + "$308", + "$0", + "foo", + "name", + "$bar", + "_baz_quux", + ]; + for input in good_inputs { + match_symbol(input); + } + + let bad_inputs = &[ + "'hello", // No closing quote + "'hello\\'", // Closing quote is escaped + "$-8", // Negative SID + "nan", // Identifier that is also a keyword + ]; + for input in bad_inputs { + mismatch_symbol(input); + } + } } diff --git a/src/lazy/text/encoded_value.rs b/src/lazy/text/encoded_value.rs index 17779c2d..f0a2c096 100644 --- a/src/lazy/text/encoded_value.rs +++ b/src/lazy/text/encoded_value.rs @@ -116,6 +116,7 @@ impl EncodedTextValue { MatchedValue::Int(_) => IonType::Int, MatchedValue::Float(_) => IonType::Float, MatchedValue::String(_) => IonType::String, + MatchedValue::Symbol(_) => IonType::Symbol, } } diff --git a/src/lazy/text/matched.rs b/src/lazy/text/matched.rs index 38a1f6ac..bb619350 100644 --- a/src/lazy/text/matched.rs +++ b/src/lazy/text/matched.rs @@ -20,8 +20,10 @@ //! re-discovered. 
use nom::character::is_hex_digit; +use std::borrow::Cow; use std::num::IntErrorKind; use std::ops::Range; +use std::str::FromStr; use num_bigint::BigInt; use num_traits::Num; @@ -32,7 +34,7 @@ use crate::lazy::text::as_utf8::AsUtf8; use crate::lazy::text::buffer::TextBufferView; use crate::lazy::text::parse_result::InvalidInputError; use crate::result::{DecodingError, IonFailure}; -use crate::{Int, IonError, IonResult, IonType}; +use crate::{Int, IonError, IonResult, IonType, RawSymbolTokenRef}; /// A partially parsed Ion value. #[derive(Clone, Debug, PartialEq)] @@ -43,6 +45,7 @@ pub(crate) enum MatchedValue { Int(MatchedInt), Float(MatchedFloat), String(MatchedString), + Symbol(MatchedSymbol), // TODO: ...the other types } @@ -134,8 +137,6 @@ impl MatchedFloat { const STACK_ALLOC_BUFFER_CAPACITY: usize = 32; pub fn read(&self, matched_input: TextBufferView) -> IonResult { - use std::str::FromStr; - match self { MatchedFloat::PositiveInfinity => return Ok(f64::INFINITY), MatchedFloat::NegativeInfinity => return Ok(f64::NEG_INFINITY), @@ -220,140 +221,137 @@ impl MatchedString { // that replaces the escaped characters with their corresponding bytes. let mut sanitized = Vec::with_capacity(matched_input.len()); - Self::escape_short_string(body, &mut sanitized)?; + escape_text(body, &mut sanitized)?; let text = String::from_utf8(sanitized).unwrap(); Ok(StrRef::from(text.to_string())) } +} - fn escape_short_string( - matched_input: TextBufferView, - sanitized: &mut Vec, - ) -> IonResult<()> { - let mut remaining = matched_input; - while !remaining.is_empty() { - let next_escape = remaining.bytes().iter().position(|byte| *byte == b'\\'); - remaining = if let Some(escape_offset) = next_escape { - // Everything up to the '\' is already clean. Write that slice to 'sanitized'. - let already_clean = remaining.slice(0, escape_offset); - sanitized.extend_from_slice(already_clean.bytes()); - // Everything starting from the '\' needs to be evaluated. 
- let contains_escapes = remaining.slice_to_end(escape_offset); - Self::write_escaped(contains_escapes, sanitized)? - } else { - sanitized.extend_from_slice(remaining.bytes()); - // 'remaining' is now empty - remaining.slice_to_end(remaining.len()) - }; - } - - Ok(()) +fn escape_text(matched_input: TextBufferView, sanitized: &mut Vec) -> IonResult<()> { + let mut remaining = matched_input; + while !remaining.is_empty() { + let next_escape = remaining.bytes().iter().position(|byte| *byte == b'\\'); + remaining = if let Some(escape_offset) = next_escape { + // Everything up to the '\' is already clean. Write that slice to 'sanitized'. + let already_clean = remaining.slice(0, escape_offset); + sanitized.extend_from_slice(already_clean.bytes()); + // Everything starting from the '\' needs to be evaluated. + let contains_escapes = remaining.slice_to_end(escape_offset); + write_escaped(contains_escapes, sanitized)? + } else { + sanitized.extend_from_slice(remaining.bytes()); + // 'remaining' is now empty + remaining.slice_to_end(remaining.len()) + }; } - fn write_escaped<'data>( - input: TextBufferView<'data>, - sanitized: &mut Vec, - ) -> IonResult> { - // Note that by the time this method has been called, the parser has already confirmed that - // there is an appropriate closing delimiter. Thus, if any of the branches below run out of - // data, it means that it's a fatal error and not just an Incomplete. - debug_assert!(!input.is_empty()); - debug_assert!(input.bytes()[0] == b'\\'); - if input.len() == 1 { - return Err(IonError::Decoding( - DecodingError::new("found an escape ('\\') with no subsequent character") - .with_position(input.offset()), - )); - } - let input_after_escape = input.slice_to_end(2); // After (e.g.) '\x' - let escape_id = input.bytes()[1]; - let substitute = match escape_id { - b'n' => b'\n', - b'r' => b'\r', - b't' => b'\t', - b'\\' => b'\\', - b'/' => b'/', - b'"' => b'"', - b'\'' => b'\'', - b'?' 
=> b'?', - b'0' => 0x00u8, // NUL - b'a' => 0x07u8, // alert BEL - b'b' => 0x08u8, // backspace - b'v' => 0x0Bu8, // vertical tab - b'f' => 0x0Cu8, // form feed - // If the byte following the '\' is a real newline (that is: 0x0A), we discard it. - b'\n' => return Ok(input_after_escape), - // These cases require more sophisticated parsing, not just a 1-to-1 mapping of bytes - b'x' => return Self::hex_digits_code_point(2, input_after_escape, sanitized), - b'u' => return Self::hex_digits_code_point(4, input_after_escape, sanitized), - b'U' => return Self::hex_digits_code_point(8, input_after_escape, sanitized), - _ => { - return Err(IonError::Decoding( - DecodingError::new(format!("invalid escape sequence '\\{}", escape_id)) - .with_position(input.offset()), - )) - } - }; + Ok(()) +} - sanitized.push(substitute); - Ok(input_after_escape) +fn write_escaped<'data>( + input: TextBufferView<'data>, + sanitized: &mut Vec, +) -> IonResult> { + // Note that by the time this method has been called, the parser has already confirmed that + // there is an appropriate closing delimiter. Thus, if any of the branches below run out of + // data, it means that it's a fatal error and not just an Incomplete. + debug_assert!(!input.is_empty()); + debug_assert!(input.bytes()[0] == b'\\'); + if input.len() == 1 { + return Err(IonError::Decoding( + DecodingError::new("found an escape ('\\') with no subsequent character") + .with_position(input.offset()), + )); } - - fn hex_digits_code_point<'data>( - num_digits: usize, - input: TextBufferView<'data>, - sanitized: &mut Vec, - ) -> IonResult> { - if input.len() < num_digits { + let input_after_escape = input.slice_to_end(2); // After (e.g.) '\x' + let escape_id = input.bytes()[1]; + let substitute = match escape_id { + b'n' => b'\n', + b'r' => b'\r', + b't' => b'\t', + b'\\' => b'\\', + b'/' => b'/', + b'"' => b'"', + b'\'' => b'\'', + b'?' 
=> b'?', + b'0' => 0x00u8, // NUL + b'a' => 0x07u8, // alert BEL + b'b' => 0x08u8, // backspace + b'v' => 0x0Bu8, // vertical tab + b'f' => 0x0Cu8, // form feed + // If the byte following the '\' is a real newline (that is: 0x0A), we discard it. + b'\n' => return Ok(input_after_escape), + // These cases require more sophisticated parsing, not just a 1-to-1 mapping of bytes + b'x' => return hex_digits_code_point(2, input_after_escape, sanitized), + b'u' => return hex_digits_code_point(4, input_after_escape, sanitized), + b'U' => return hex_digits_code_point(8, input_after_escape, sanitized), + _ => { return Err(IonError::Decoding( - DecodingError::new(format!( - "found a {}-hex-digit escape sequence with only {} digits", - num_digits, - input.len() - )) - .with_position(input.offset()), - )); + DecodingError::new(format!("invalid escape sequence '\\{}", escape_id)) + .with_position(input.offset()), + )) } + }; - let hex_digit_bytes = &input.bytes()[..num_digits]; - - let all_are_hex_digits = hex_digit_bytes - .iter() - .take(num_digits) - .copied() - .all(is_hex_digit); - if !all_are_hex_digits { - return Err(IonError::Decoding( - DecodingError::new(format!( - "found a {}-hex-digit escape sequence that contained an invalid hex digit", - num_digits, - )) - .with_position(input.offset()), - )); - } - // We just confirmed all of the digits are ASCII hex digits, so these steps cannot fail. - let hex_digits = std::str::from_utf8(hex_digit_bytes).unwrap(); - let code_point = u32::from_str_radix(hex_digits, 16).unwrap(); - - // Check to see if this is a high surrogate; if it is, our code point isn't complete. Another - // unicode escape representing the low surrogate has to be next in the input to complete it. - // See the docs for this helper function for details. (Note: this will only ever be true for - // 4- and 8-digit escape sequences. `\x` escapes don't have enough digits to represent a - // high surrogate.) 
- if code_point_is_a_high_surrogate(code_point) { - todo!("support surrogate pairs") - } + sanitized.push(substitute); + Ok(input_after_escape) +} - // A Rust `char` can represent any Unicode scalar value--a code point that is not part of a - // surrogate pair. If the value we found isn't a high surrogate, then it's a complete scalar - // value. We can safely convert it to a `char`. - let character = char::from_u32(code_point).unwrap(); - let utf8_buffer: &mut [u8; 4] = &mut [0; 4]; - let utf8_encoded = character.encode_utf8(utf8_buffer); - sanitized.extend_from_slice(utf8_encoded.as_bytes()); +fn hex_digits_code_point<'data>( + num_digits: usize, + input: TextBufferView<'data>, + sanitized: &mut Vec, +) -> IonResult> { + if input.len() < num_digits { + return Err(IonError::Decoding( + DecodingError::new(format!( + "found a {}-hex-digit escape sequence with only {} digits", + num_digits, + input.len() + )) + .with_position(input.offset()), + )); + } - // Skip beyond the digits we just processed - Ok(input.slice_to_end(num_digits)) + let hex_digit_bytes = &input.bytes()[..num_digits]; + + let all_are_hex_digits = hex_digit_bytes + .iter() + .take(num_digits) + .copied() + .all(is_hex_digit); + if !all_are_hex_digits { + return Err(IonError::Decoding( + DecodingError::new(format!( + "found a {}-hex-digit escape sequence that contained an invalid hex digit", + num_digits, + )) + .with_position(input.offset()), + )); } + // We just confirmed all of the digits are ASCII hex digits, so these steps cannot fail. + let hex_digits = std::str::from_utf8(hex_digit_bytes).unwrap(); + let code_point = u32::from_str_radix(hex_digits, 16).unwrap(); + + // Check to see if this is a high surrogate; if it is, our code point isn't complete. Another + // unicode escape representing the low surrogate has to be next in the input to complete it. + // See the docs for this helper function for details. (Note: this will only ever be true for + // 4- and 8-digit escape sequences. 
`\x` escapes don't have enough digits to represent a + // high surrogate.) + if code_point_is_a_high_surrogate(code_point) { + todo!("support surrogate pairs") + } + + // A Rust `char` can represent any Unicode scalar value--a code point that is not part of a + // surrogate pair. If the value we found isn't a high surrogate, then it's a complete scalar + // value. We can safely convert it to a `char`. + let character = char::from_u32(code_point).unwrap(); + let utf8_buffer: &mut [u8; 4] = &mut [0; 4]; + let utf8_encoded = character.encode_utf8(utf8_buffer); + sanitized.extend_from_slice(utf8_encoded.as_bytes()); + + // Skip beyond the digits we just processed + Ok(input.slice_to_end(num_digits)) } /// Returns `true` if the provided code point is a utf-16 high surrogate. @@ -381,3 +379,71 @@ impl MatchedString { fn code_point_is_a_high_surrogate(value: u32) -> bool { (0xD800..=0xDFFF).contains(&value) } + +#[derive(Clone, Debug, PartialEq)] +pub(crate) enum MatchedSymbol { + /// A numeric symbol ID (e.g. `$21`) + SymbolId, + /// The symbol is an unquoted identifier (e.g. `foo`) + Identifier, + /// The symbol is delimited by single quotes. + Quoted(bool), + // TODO: Operators in S-Expressions +} + +impl MatchedSymbol { + pub fn read<'data>( + &self, + matched_input: TextBufferView<'data>, + ) -> IonResult> { + match self { + MatchedSymbol::SymbolId => self.read_symbol_id(matched_input), + MatchedSymbol::Identifier => self.read_identifier(matched_input), + MatchedSymbol::Quoted(contains_escaped_chars) => { + self.read_quoted(matched_input, *contains_escaped_chars) + } + } + } + + fn read_quoted<'data>( + &self, + matched_input: TextBufferView<'data>, + contains_escaped_chars: bool, + ) -> IonResult> { + // Take a slice of the input that ignores the first and last bytes, which are quotes. + let body = matched_input.slice(1, matched_input.len() - 2); + if !contains_escaped_chars { + // There are no escaped characters, so we can just validate the string in-place. 
+ let text = body.as_text()?; + let str_ref = RawSymbolTokenRef::Text(text.into()); + return Ok(str_ref); + } + + // Otherwise, there are escaped characters. We need to build a new version of our symbol + // that replaces the escaped characters with their corresponding bytes. + let mut sanitized = Vec::with_capacity(matched_input.len()); + + escape_text(body, &mut sanitized)?; + let text = String::from_utf8(sanitized).unwrap(); + Ok(RawSymbolTokenRef::Text(text.into())) + } + fn read_identifier<'data>( + &self, + matched_input: TextBufferView<'data>, + ) -> IonResult> { + matched_input + .as_text() + .map(|t| RawSymbolTokenRef::Text(Cow::Borrowed(t))) + } + fn read_symbol_id<'data>( + &self, + matched_input: TextBufferView<'data>, + ) -> IonResult> { + // Skip past the first byte, which has to be a `$`. + let text = matched_input.slice_to_end(1).as_text()?; + // It's not possible for the number parsing to fail because the matcher's rules + // guarantee that this string contains only decimal digits. + let sid = usize::from_str(text).expect("loading symbol ID as usize"); + Ok(RawSymbolTokenRef::SymbolId(sid)) + } +} diff --git a/src/lazy/text/raw/reader.rs b/src/lazy/text/raw/reader.rs index d5ec559f..48495d81 100644 --- a/src/lazy/text/raw/reader.rs +++ b/src/lazy/text/raw/reader.rs @@ -76,11 +76,13 @@ mod tests { use super::*; use crate::lazy::decoder::LazyRawValue; use crate::lazy::raw_value_ref::RawValueRef; - use crate::IonType; + use crate::{IonType, RawSymbolTokenRef}; #[test] fn test_top_level() -> IonResult<()> { - let data = r#" + let mut data = String::new(); + data.push_str( + r#" /* This test demonstrates lazily reading top-level values of various Ion types. The values are interspersed with @@ -123,13 +125,29 @@ mod tests { "\u0048ello, \u0077orld!" // \u 4-digit hex escape "\U00000048ello, \U00000077orld!" 
// \U 8-digit hex escape - "#; - - // Make a mutable string so we can append some things that require Rust-level escapes - let mut data = String::from(data); + "#, + ); // Escaped newlines are discarded data.push_str("\"Hello,\\\n world!\""); + data.push_str( + r#" + // Symbols + + 'foo' + 'Hello, world!' + '😎😎😎' + + firstName + date_of_birth + $variable + + $0 + $10 + $733 + "#, + ); + fn expect_next<'data>( reader: &mut LazyRawTextReader<'data>, expected: RawValueRef<'data, TextEncoding>, @@ -202,6 +220,44 @@ mod tests { expect_next(reader, RawValueRef::String("Hello, world!".into())); // "\"Hello,\\\n world!\" " expect_next(reader, RawValueRef::String("Hello, world!".into())); + // 'foo' + expect_next( + reader, + RawValueRef::Symbol(RawSymbolTokenRef::Text("foo".into())), + ); + expect_next( + reader, + RawValueRef::Symbol(RawSymbolTokenRef::Text("Hello, world!".into())), + ); + expect_next( + reader, + RawValueRef::Symbol(RawSymbolTokenRef::Text("😎😎😎".into())), + ); + // firstName + expect_next( + reader, + RawValueRef::Symbol(RawSymbolTokenRef::Text("firstName".into())), + ); + // date_of_birth + expect_next( + reader, + RawValueRef::Symbol(RawSymbolTokenRef::Text("date_of_birth".into())), + ); + // $variable + expect_next( + reader, + RawValueRef::Symbol(RawSymbolTokenRef::Text("$variable".into())), + ); + // $0 + expect_next(reader, RawValueRef::Symbol(RawSymbolTokenRef::SymbolId(0))); + // $10 + expect_next(reader, RawValueRef::Symbol(RawSymbolTokenRef::SymbolId(10))); + // $733 + expect_next( + reader, + RawValueRef::Symbol(RawSymbolTokenRef::SymbolId(733)), + ); + Ok(()) } } diff --git a/src/lazy/text/value.rs b/src/lazy/text/value.rs index f888b63a..3df2e985 100644 --- a/src/lazy/text/value.rs +++ b/src/lazy/text/value.rs @@ -53,6 +53,7 @@ impl<'data> LazyRawValue<'data, TextEncoding> for LazyRawTextValue<'data> { MatchedValue::Float(f) => RawValueRef::Float(f.read(matched_input)?), // ...decimal, timestamp... 
MatchedValue::String(s) => RawValueRef::String(s.read(matched_input)?), + MatchedValue::Symbol(s) => RawValueRef::Symbol(s.read(matched_input)?), // ...and the rest! }; Ok(value_ref) diff --git a/src/lazy/value.rs b/src/lazy/value.rs index 8f09cdbf..f25caf7e 100644 --- a/src/lazy/value.rs +++ b/src/lazy/value.rs @@ -9,6 +9,7 @@ use crate::{ Annotations, Element, IntoAnnotatedElement, IonError, IonResult, IonType, RawSymbolTokenRef, SymbolRef, SymbolTable, Value, }; +use std::borrow::Cow; /// A value in a binary Ion stream whose header has been parsed but whose body (i.e. its data) has /// not. A `LazyValue` is immutable; its data can be read any number of times. @@ -184,7 +185,8 @@ impl<'top, 'data, D: LazyDecoder<'data>> LazyValue<'top, 'data, D> { )) })? .into(), - RawSymbolTokenRef::Text(text) => text.into(), + RawSymbolTokenRef::Text(Cow::Borrowed(text)) => text.into(), + RawSymbolTokenRef::Text(Cow::Owned(text)) => text.into(), }; ValueRef::Symbol(symbol) } @@ -333,7 +335,7 @@ where )), Some(symbol) => Some(Ok(symbol.into())), }, - Ok(RawSymbolTokenRef::Text(text)) => Some(Ok(SymbolRef::with_text(text))), + Ok(RawSymbolTokenRef::Text(text)) => Some(Ok(text.into())), Err(e) => Some(Err(e)), } } diff --git a/src/raw_symbol_token_ref.rs b/src/raw_symbol_token_ref.rs index d4a00c4d..dddedc7c 100644 --- a/src/raw_symbol_token_ref.rs +++ b/src/raw_symbol_token_ref.rs @@ -1,11 +1,12 @@ use crate::raw_symbol_token::RawSymbolToken; use crate::{Symbol, SymbolId}; +use std::borrow::Cow; /// Like RawSymbolToken, but the Text variant holds a borrowed reference instead of a String. #[derive(Debug, Clone, PartialEq, Eq)] pub enum RawSymbolTokenRef<'a> { SymbolId(SymbolId), - Text(&'a str), + Text(Cow<'a, str>), } /// Implemented by types that can be viewed as a [RawSymbolTokenRef] without allocations. 
@@ -15,10 +16,7 @@ pub trait AsRawSymbolTokenRef { impl<'a> AsRawSymbolTokenRef for RawSymbolTokenRef<'a> { fn as_raw_symbol_token_ref(&self) -> RawSymbolTokenRef { - match self { - RawSymbolTokenRef::SymbolId(sid) => RawSymbolTokenRef::SymbolId(*sid), - RawSymbolTokenRef::Text(text) => RawSymbolTokenRef::Text(text), - } + self.clone() } } @@ -30,20 +28,20 @@ impl AsRawSymbolTokenRef for SymbolId { impl AsRawSymbolTokenRef for String { fn as_raw_symbol_token_ref(&self) -> RawSymbolTokenRef { - RawSymbolTokenRef::Text(self.as_str()) + RawSymbolTokenRef::Text(Cow::from(self.as_str())) } } impl AsRawSymbolTokenRef for &str { fn as_raw_symbol_token_ref(&self) -> RawSymbolTokenRef { - RawSymbolTokenRef::Text(self) + RawSymbolTokenRef::Text(Cow::from(*self)) } } impl AsRawSymbolTokenRef for Symbol { fn as_raw_symbol_token_ref(&self) -> RawSymbolTokenRef { match self.text() { - Some(text) => RawSymbolTokenRef::Text(text), + Some(text) => RawSymbolTokenRef::Text(Cow::from(text)), None => RawSymbolTokenRef::SymbolId(0), } } @@ -62,7 +60,7 @@ impl AsRawSymbolTokenRef for RawSymbolToken { fn as_raw_symbol_token_ref(&self) -> RawSymbolTokenRef { match self { RawSymbolToken::SymbolId(sid) => RawSymbolTokenRef::SymbolId(*sid), - RawSymbolToken::Text(text) => RawSymbolTokenRef::Text(text.as_str()), + RawSymbolToken::Text(text) => RawSymbolTokenRef::Text(Cow::from(text.as_str())), } } } diff --git a/src/symbol_ref.rs b/src/symbol_ref.rs index 9cd42cac..815c75fe 100644 --- a/src/symbol_ref.rs +++ b/src/symbol_ref.rs @@ -1,5 +1,5 @@ use crate::Symbol; -use std::borrow::Borrow; +use std::borrow::{Borrow, Cow}; use std::fmt::{Debug, Formatter}; use std::hash::{Hash, Hasher}; @@ -7,19 +7,19 @@ use std::hash::{Hash, Hasher}; /// static lifetime), a `SymbolRef` may have known or undefined text (i.e. `$0`). 
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone)] pub struct SymbolRef<'a> { - text: Option<&'a str>, + text: Option>, } impl<'a> Debug for SymbolRef<'a> { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", self.text.unwrap_or("$0")) + write!(f, "{}", self.text().unwrap_or("$0")) } } impl<'a> SymbolRef<'a> { /// If this symbol has known text, returns `Some(&str)`. Otherwise, returns `None`. pub fn text(&self) -> Option<&str> { - self.text + self.text.as_ref().map(|t| t.as_ref()) } /// Constructs a `SymbolRef` with unknown text. @@ -28,14 +28,17 @@ impl<'a> SymbolRef<'a> { } /// Constructs a `SymbolRef` with the specified text. - pub fn with_text(text: &str) -> SymbolRef { - SymbolRef { text: Some(text) } + pub fn with_text(text: impl Into>) -> SymbolRef<'a> { + SymbolRef { + text: Some(text.into()), + } } pub fn to_owned(self) -> Symbol { - match self.text() { + match self.text { None => Symbol::unknown_text(), - Some(text) => Symbol::owned(text), + Some(Cow::Borrowed(text)) => Symbol::owned(text), + Some(Cow::Owned(text)) => Symbol::owned(text), } } } @@ -60,14 +63,14 @@ pub trait AsSymbolRef { impl<'a, A: AsRef + 'a> AsSymbolRef for A { fn as_symbol_ref(&self) -> SymbolRef { SymbolRef { - text: Some(self.as_ref()), + text: Some(Cow::Borrowed(self.as_ref())), } } } impl<'a> Hash for SymbolRef<'a> { fn hash(&self, state: &mut H) { - match self.text { + match self.text() { None => 0.hash(state), Some(text) => text.hash(state), } @@ -76,18 +79,33 @@ impl<'a> Hash for SymbolRef<'a> { impl<'a> From<&'a str> for SymbolRef<'a> { fn from(text: &'a str) -> Self { - Self { text: Some(text) } + Self { + text: Some(Cow::Borrowed(text)), + } } } -impl<'a> From<&'a Symbol> for SymbolRef<'a> { - fn from(symbol: &'a Symbol) -> Self { +impl<'a> From for SymbolRef<'a> { + fn from(text: String) -> Self { Self { - text: symbol.text(), + text: Some(Cow::Owned(text)), } } } +impl<'a> From> for SymbolRef<'a> { + fn from(value: Cow<'a, str>) -> Self { + Self { 
text: Some(value) } + } +} + +impl<'a> From<&'a Symbol> for SymbolRef<'a> { + fn from(symbol: &'a Symbol) -> Self { + let text = symbol.text().map(Cow::Borrowed); + Self { text } + } +} + // Note that this method panics if the SymbolRef has unknown text! This is unfortunate but is required // in order to allow a HashMap to do lookups with a &str instead of a &SymbolRef impl<'a> Borrow for SymbolRef<'a> { diff --git a/src/text/raw_text_writer.rs b/src/text/raw_text_writer.rs index b0e75717..68a043a0 100644 --- a/src/text/raw_text_writer.rs +++ b/src/text/raw_text_writer.rs @@ -320,12 +320,13 @@ impl RawTextWriter { match token.as_raw_symbol_token_ref() { RawSymbolTokenRef::SymbolId(sid) => write!(output, "${sid}")?, RawSymbolTokenRef::Text(text) - if Self::token_is_keyword(text) || Self::token_resembles_symbol_id(text) => + if Self::token_is_keyword(text.as_ref()) + || Self::token_resembles_symbol_id(text.as_ref()) => { // Write the symbol text in single quotes write!(output, "'{text}'")?; } - RawSymbolTokenRef::Text(text) if Self::token_is_identifier(text) => { + RawSymbolTokenRef::Text(text) if Self::token_is_identifier(text.as_ref()) => { // Write the symbol text without quotes write!(output, "{text}")? 
} diff --git a/src/text/text_formatter.rs b/src/text/text_formatter.rs index 828e9fb5..404d556d 100644 --- a/src/text/text_formatter.rs +++ b/src/text/text_formatter.rs @@ -229,12 +229,13 @@ impl<'a, W: std::fmt::Write> IonValueFormatter<'a, W> { match token.as_raw_symbol_token_ref() { RawSymbolTokenRef::SymbolId(sid) => write!(self.output, "${sid}")?, RawSymbolTokenRef::Text(text) - if Self::token_is_keyword(text) || Self::token_resembles_symbol_id(text) => + if Self::token_is_keyword(text.as_ref()) + || Self::token_resembles_symbol_id(text.as_ref()) => { // Write the symbol text in single quotes write!(self.output, "'{text}'")?; } - RawSymbolTokenRef::Text(text) if Self::token_is_identifier(text) => { + RawSymbolTokenRef::Text(text) if Self::token_is_identifier(text.as_ref()) => { // Write the symbol text without quotes write!(self.output, "{text}")? } diff --git a/src/text/text_writer.rs b/src/text/text_writer.rs index 7fc140d0..c4829974 100644 --- a/src/text/text_writer.rs +++ b/src/text/text_writer.rs @@ -123,7 +123,7 @@ impl IonWriter for TextWriter { RawSymbolTokenRef::SymbolId(symbol_id) => { // Get the text associated with this symbol ID match self.symbol_table.text_for(symbol_id) { - Some(text) => RawSymbolTokenRef::Text(text), + Some(text) => RawSymbolTokenRef::Text(text.into()), None => RawSymbolTokenRef::SymbolId(symbol_id), } } @@ -138,7 +138,7 @@ impl IonWriter for TextWriter { RawSymbolTokenRef::SymbolId(symbol_id) => { // Get the text associated with this symbol ID match self.symbol_table.text_for(symbol_id) { - Some(text) => RawSymbolTokenRef::Text(text), + Some(text) => RawSymbolTokenRef::Text(text.into()), None => RawSymbolTokenRef::SymbolId(symbol_id), } } @@ -152,7 +152,7 @@ impl IonWriter for TextWriter { RawSymbolTokenRef::SymbolId(symbol_id) => { // Get the text associated with this symbol ID match self.symbol_table.text_for(symbol_id) { - Some(text) => RawSymbolTokenRef::Text(text), + Some(text) => RawSymbolTokenRef::Text(text.into()), 
None => RawSymbolTokenRef::SymbolId(symbol_id), } } From 4cb9b2b906e18ebc2cd11b7ff641273ff21d3726 Mon Sep 17 00:00:00 2001 From: Zack Slayton Date: Tue, 1 Aug 2023 13:37:07 -0700 Subject: [PATCH 11/15] Adds more doc comments --- src/lazy/text/buffer.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/lazy/text/buffer.rs b/src/lazy/text/buffer.rs index d8a7def6..e1c7a5e5 100644 --- a/src/lazy/text/buffer.rs +++ b/src/lazy/text/buffer.rs @@ -597,6 +597,7 @@ impl<'data> TextBufferView<'data> { fail(self) } + /// Matches a symbol ID (`$28`), an identifier (`foo`), or a quoted symbol (`'foo'`). fn match_symbol(self) -> IonParseResult<'data, MatchedSymbol> { // TODO: identifiers alt(( @@ -606,6 +607,7 @@ impl<'data> TextBufferView<'data> { ))(self) } + /// Matches a symbol ID (`$28`). fn match_symbol_id(self) -> IonParseResult<'data, MatchedSymbol> { recognize(terminated( // Discard a `$` and parse an integer representing the symbol ID. @@ -629,6 +631,7 @@ impl<'data> TextBufferView<'data> { .parse(self) } + /// Matches an identifier (`foo`). fn match_identifier(self) -> IonParseResult<'data, MatchedSymbol> { let (remaining, identifier_text) = recognize(terminated( pair( @@ -672,6 +675,7 @@ impl<'data> TextBufferView<'data> { recognize(many0_count(Self::identifier_trailing_character))(self) } + /// Matches a quoted symbol (`'foo'`). fn match_quoted_symbol(self) -> IonParseResult<'data, MatchedSymbol> { delimited(char('\''), Self::match_quoted_symbol_body, char('\'')) .map(|(_matched, contains_escaped_chars)| MatchedSymbol::Quoted(contains_escaped_chars)) @@ -684,6 +688,8 @@ impl<'data> TextBufferView<'data> { Self::match_text_until_unescaped(self, b'\'') } + /// A helper method for matching bytes until the specified delimiter. Ignores any byte + /// (including the delimiter) that is prefaced by the escape character `\`. 
fn match_text_until_unescaped(self, delimiter: u8) -> IonParseResult<'data, (Self, bool)> { let mut is_escaped = false; let mut contains_escaped_chars = false; From 54470d2358a730098d811e060960bc876bc1a10d Mon Sep 17 00:00:00 2001 From: Zack Slayton Date: Tue, 1 Aug 2023 16:03:49 -0700 Subject: [PATCH 12/15] More doc comments --- src/lazy/text/matched.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/lazy/text/matched.rs b/src/lazy/text/matched.rs index bb619350..db9bdf0b 100644 --- a/src/lazy/text/matched.rs +++ b/src/lazy/text/matched.rs @@ -297,6 +297,8 @@ fn write_escaped<'data>( Ok(input_after_escape) } +/// Reads the next `num_digits` bytes from `input` as a `char`, then writes that `char`'s UTF8 bytes +/// to `sanitized`. fn hex_digits_code_point<'data>( num_digits: usize, input: TextBufferView<'data>, @@ -386,7 +388,8 @@ pub(crate) enum MatchedSymbol { SymbolId, /// The symbol is an unquoted identifier (e.g. `foo`) Identifier, - /// The symbol is delimited by single quotes. + /// The symbol is delimited by single quotes. Holds a `bool` indicating whether the + /// matched input contained any escaped bytes. 
Quoted(bool), // TODO: Operators in S-Expressions } From 78014e7c0cb3272fb05c4b3a0fe138783b878f24 Mon Sep 17 00:00:00 2001 From: Zack Slayton Date: Thu, 3 Aug 2023 09:26:09 -0700 Subject: [PATCH 13/15] Adds `LazyRawTextReader` support for reading lists --- src/lazy/binary/raw/sequence.rs | 18 ++--- src/lazy/encoding.rs | 35 +-------- src/lazy/text/buffer.rs | 37 ++++++++++ src/lazy/text/encoded_value.rs | 3 +- src/lazy/text/matched.rs | 17 +++-- src/lazy/text/parse_result.rs | 20 +++++ src/lazy/text/raw/mod.rs | 1 + src/lazy/text/raw/reader.rs | 17 +++++ src/lazy/text/raw/sequence.rs | 126 ++++++++++++++++++++++++++++++++ src/lazy/text/value.rs | 8 +- 10 files changed, 231 insertions(+), 51 deletions(-) create mode 100644 src/lazy/text/raw/sequence.rs diff --git a/src/lazy/binary/raw/sequence.rs b/src/lazy/binary/raw/sequence.rs index 66d26fef..a5f2487d 100644 --- a/src/lazy/binary/raw/sequence.rs +++ b/src/lazy/binary/raw/sequence.rs @@ -18,11 +18,11 @@ impl<'data> LazyRawBinarySequence<'data> { self.value.ion_type() } - pub fn iter(&self) -> RawSequenceIterator<'data> { + pub fn iter(&self) -> RawBinarySequenceIterator<'data> { // Get as much of the sequence's body as is available in the input buffer. 
// Reading a child value may fail as `Incomplete` let buffer_slice = self.value.available_body(); - RawSequenceIterator::new(buffer_slice) + RawBinarySequenceIterator::new(buffer_slice) } } @@ -33,7 +33,7 @@ impl<'data> LazyContainerPrivate<'data, BinaryEncoding> for LazyRawBinarySequenc } impl<'data> LazyRawSequence<'data, BinaryEncoding> for LazyRawBinarySequence<'data> { - type Iterator = RawSequenceIterator<'data>; + type Iterator = RawBinarySequenceIterator<'data>; fn annotations(&self) -> RawBinaryAnnotationsIterator<'data> { self.value.annotations() @@ -54,7 +54,7 @@ impl<'data> LazyRawSequence<'data, BinaryEncoding> for LazyRawBinarySequence<'da impl<'a, 'data> IntoIterator for &'a LazyRawBinarySequence<'data> { type Item = IonResult>; - type IntoIter = RawSequenceIterator<'data>; + type IntoIter = RawBinarySequenceIterator<'data>; fn into_iter(self) -> Self::IntoIter { self.iter() @@ -99,19 +99,19 @@ impl<'a> Debug for LazyRawBinarySequence<'a> { } } -pub struct RawSequenceIterator<'data> { +pub struct RawBinarySequenceIterator<'data> { source: DataSource<'data>, } -impl<'data> RawSequenceIterator<'data> { - pub(crate) fn new(input: ImmutableBuffer<'data>) -> RawSequenceIterator<'data> { - RawSequenceIterator { +impl<'data> RawBinarySequenceIterator<'data> { + pub(crate) fn new(input: ImmutableBuffer<'data>) -> RawBinarySequenceIterator<'data> { + RawBinarySequenceIterator { source: DataSource::new(input), } } } -impl<'data> Iterator for RawSequenceIterator<'data> { +impl<'data> Iterator for RawBinarySequenceIterator<'data> { type Item = IonResult>; fn next(&mut self) -> Option { diff --git a/src/lazy/encoding.rs b/src/lazy/encoding.rs index 784879ad..987bd1f6 100644 --- a/src/lazy/encoding.rs +++ b/src/lazy/encoding.rs @@ -4,11 +4,12 @@ use crate::lazy::binary::raw::reader::LazyRawBinaryReader; use crate::lazy::binary::raw::sequence::LazyRawBinarySequence; use crate::lazy::binary::raw::value::LazyRawBinaryValue; use 
crate::lazy::decoder::private::{LazyContainerPrivate, LazyRawFieldPrivate}; -use crate::lazy::decoder::{LazyDecoder, LazyRawField, LazyRawSequence, LazyRawStruct}; +use crate::lazy::decoder::{LazyDecoder, LazyRawField, LazyRawStruct}; use crate::lazy::raw_value_ref::RawValueRef; use crate::lazy::text::raw::reader::LazyRawTextReader; +use crate::lazy::text::raw::sequence::LazyRawTextSequence; use crate::lazy::text::value::LazyRawTextValue; -use crate::{IonResult, IonType, RawSymbolTokenRef}; +use crate::{IonResult, RawSymbolTokenRef}; use std::marker::PhantomData; // These types derive trait implementations in order to allow types that containing them @@ -33,34 +34,6 @@ impl<'data> LazyDecoder<'data> for BinaryEncoding { // === Placeholders === // The types below will need to be properly defined in order for the lazy text reader to be complete. // The exist to satisfy various trait definitions. -#[derive(Debug, Clone)] -pub struct ToDoTextSequence; - -impl<'data> LazyContainerPrivate<'data, TextEncoding> for ToDoTextSequence { - fn from_value(_value: LazyRawTextValue<'data>) -> Self { - todo!() - } -} - -impl<'data> LazyRawSequence<'data, TextEncoding> for ToDoTextSequence { - type Iterator = Box>>>; - - fn annotations(&self) -> ToDoTextAnnotationsIterator<'data> { - todo!() - } - - fn ion_type(&self) -> IonType { - todo!() - } - - fn iter(&self) -> Self::Iterator { - todo!() - } - - fn as_value(&self) -> &>::Value { - todo!() - } -} #[derive(Debug, Clone)] pub struct ToDoTextStruct; @@ -127,7 +100,7 @@ impl<'data> Iterator for ToDoTextAnnotationsIterator<'data> { impl<'data> LazyDecoder<'data> for TextEncoding { type Reader = LazyRawTextReader<'data>; type Value = LazyRawTextValue<'data>; - type Sequence = ToDoTextSequence; + type Sequence = LazyRawTextSequence<'data>; type Struct = ToDoTextStruct; type AnnotationsIterator = ToDoTextAnnotationsIterator<'data>; } diff --git a/src/lazy/text/buffer.rs b/src/lazy/text/buffer.rs index e1c7a5e5..22f17461 100644 --- 
a/src/lazy/text/buffer.rs +++ b/src/lazy/text/buffer.rs @@ -224,6 +224,37 @@ impl<'data> TextBufferView<'data> { ))(self) } + /// Matches a single value in a list OR the end of the list, allowing for leading whitespace + /// and comments in either case. + /// + /// If a value is found, returns `Ok(Some(value))`. If the end of the list is found, returns + /// `Ok(None)`. + pub fn match_list_value(self) -> IonParseResult<'data, Option>> { + preceded( + // Some amount of whitespace/comments... + Self::match_optional_comments_and_whitespace, + // ...followed by either the end of the list... + alt(( + value(None, tag("]")), + // ...or a value... + terminated( + Self::match_value.map(Some), + // ...followed by a comma or end-of-list + Self::match_delimiter_after_list_value, + ), + )), + )(self) + } + + /// Matches syntax that is expected to follow a value in a list: any amount of whitespace and/or + /// comments followed by either a comma (consumed) or an end-of-list `]` (not consumed). + fn match_delimiter_after_list_value(self) -> IonMatchResult<'data> { + preceded( + Self::match_optional_comments_and_whitespace, + alt((tag(","), peek(tag("]")))), + )(self) + } + /// Matches a single top-level scalar value, the beginning of a container, or an IVM. 
pub fn match_top_level(self) -> IonParseResult<'data, RawStreamItem<'data, TextEncoding>> { let (remaining, value) = match self.match_value() { @@ -285,6 +316,12 @@ impl<'data> TextBufferView<'data> { ) }, ), + map( + match_and_length(tag("[")), + |(_matched_list_start, length)| { + EncodedTextValue::new(MatchedValue::List, self.offset(), length) + }, + ), // TODO: The other Ion types )) .map(|encoded_value| LazyRawTextValue { diff --git a/src/lazy/text/encoded_value.rs b/src/lazy/text/encoded_value.rs index f0a2c096..bce5d44b 100644 --- a/src/lazy/text/encoded_value.rs +++ b/src/lazy/text/encoded_value.rs @@ -8,7 +8,7 @@ use std::ops::Range; /// Each [`LazyRawTextValue`](crate::lazy::text::value::LazyRawTextValue) contains an `EncodedValue`, /// allowing a user to re-read (that is: parse) the body of the value as many times as necessary /// without re-parsing its header information each time. -#[derive(Clone, Debug, PartialEq)] +#[derive(Copy, Clone, Debug, PartialEq)] pub(crate) struct EncodedTextValue { // Each encoded text value has up to three components, appearing in the following order: // @@ -117,6 +117,7 @@ impl EncodedTextValue { MatchedValue::Float(_) => IonType::Float, MatchedValue::String(_) => IonType::String, MatchedValue::Symbol(_) => IonType::Symbol, + MatchedValue::List => IonType::List, } } diff --git a/src/lazy/text/matched.rs b/src/lazy/text/matched.rs index db9bdf0b..53fa63b4 100644 --- a/src/lazy/text/matched.rs +++ b/src/lazy/text/matched.rs @@ -22,7 +22,6 @@ use nom::character::is_hex_digit; use std::borrow::Cow; use std::num::IntErrorKind; -use std::ops::Range; use std::str::FromStr; use num_bigint::BigInt; @@ -37,7 +36,7 @@ use crate::result::{DecodingError, IonFailure}; use crate::{Int, IonError, IonResult, IonType, RawSymbolTokenRef}; /// A partially parsed Ion value. 
-#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Copy, Debug, PartialEq)] pub(crate) enum MatchedValue { // `Null` and `Bool` are fully parsed because they only involve matching a keyword. Null(IonType), @@ -46,6 +45,7 @@ pub(crate) enum MatchedValue { Float(MatchedFloat), String(MatchedString), Symbol(MatchedSymbol), + List, // TODO: ...the other types } @@ -160,7 +160,7 @@ impl MatchedFloat { } } -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Copy, Debug, PartialEq)] pub(crate) enum MatchedString { /// The string only has one segment. (e.g. "foo") Short(MatchedShortString), @@ -170,11 +170,12 @@ pub(crate) enum MatchedString { Long(MatchedLongString), } -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Copy, Debug, PartialEq)] pub(crate) struct MatchedLongString { - // Keep a list of all the string segment ranges we found. - // If the user asks to read the string, we'll collate the segments into a single string. - slices: Vec>, + // TODO: Decide what (if anything) to store here. + // Storing any collection of bytes or ranges means that this type cannot implement Copy, + // which in turn means MatchedValue and EncodedTextValue also cannot implement Copy. + // We probably also don't want to heap allocate just to match the long string. } #[derive(Clone, Copy, Debug, PartialEq)] @@ -382,7 +383,7 @@ fn code_point_is_a_high_surrogate(value: u32) -> bool { (0xD800..=0xDFFF).contains(&value) } -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Copy, Debug, PartialEq)] pub(crate) enum MatchedSymbol { /// A numeric symbol ID (e.g. 
`$21`) SymbolId, diff --git a/src/lazy/text/parse_result.rs b/src/lazy/text/parse_result.rs index 5def24ca..4225ca6e 100644 --- a/src/lazy/text/parse_result.rs +++ b/src/lazy/text/parse_result.rs @@ -211,6 +211,26 @@ impl<'data> ParseError> for IonParseError<'data> { } } +/// `Result, _>` has a method called `transpose` that converts it into an `Option>`, +/// allowing it to be easily used in places like iterators that expect that return type. +/// This trait defines a similar extension method for `Result<(TextBufferView, Option)>`. +pub(crate) trait ToIteratorOutput<'data, T> { + fn transpose(self) -> Option>; +} + +impl<'data, T> ToIteratorOutput<'data, T> for IonResult<(TextBufferView<'data>, Option)> { + fn transpose(self) -> Option> { + match self { + Ok((_remaining, Some(value))) => Some(Ok(value)), + Ok((_remaining, None)) => None, + Err(e) => Some(Err(e)), + } + } +} + +/// Converts the output of a text Ion parser (any of `IonParseResult`, `IonParseError`, +/// or `nom::Err`) into a general-purpose `IonResult`. If the implementing type +/// does not have its own `label` and `input`, the specified values will be used. 
pub(crate) trait AddContext<'data, T> { fn with_context( self, diff --git a/src/lazy/text/raw/mod.rs b/src/lazy/text/raw/mod.rs index 1077754f..a9ad6f8d 100644 --- a/src/lazy/text/raw/mod.rs +++ b/src/lazy/text/raw/mod.rs @@ -1 +1,2 @@ pub mod reader; +pub mod sequence; diff --git a/src/lazy/text/raw/reader.rs b/src/lazy/text/raw/reader.rs index 48495d81..99ef7537 100644 --- a/src/lazy/text/raw/reader.rs +++ b/src/lazy/text/raw/reader.rs @@ -145,6 +145,16 @@ mod tests { $0 $10 $733 + + [ + // First item + 1, + // Second item + 2 /*comment before comma*/, + // Third item + 3 + ] + "#, ); @@ -258,6 +268,13 @@ mod tests { RawValueRef::Symbol(RawSymbolTokenRef::SymbolId(733)), ); + let list = reader.next()?.expect_value()?.read()?.expect_list()?; + let mut sum = 0; + for value in &list { + sum += value?.read()?.expect_i64()?; + } + assert_eq!(sum, 6); + Ok(()) } } diff --git a/src/lazy/text/raw/sequence.rs b/src/lazy/text/raw/sequence.rs new file mode 100644 index 00000000..ab1f4616 --- /dev/null +++ b/src/lazy/text/raw/sequence.rs @@ -0,0 +1,126 @@ +use crate::lazy::decoder::private::LazyContainerPrivate; +use crate::lazy::decoder::{LazyDecoder, LazyRawSequence, LazyRawValue}; +use crate::lazy::encoding::TextEncoding; +use crate::lazy::text::buffer::TextBufferView; +use crate::lazy::text::parse_result::AddContext; +use crate::lazy::text::parse_result::ToIteratorOutput; +use crate::lazy::text::value::LazyRawTextValue; +use crate::{IonResult, IonType}; +use std::fmt; +use std::fmt::{Debug, Formatter}; + +#[derive(Copy, Clone)] +pub struct LazyRawTextSequence<'data> { + pub(crate) value: LazyRawTextValue<'data>, +} + +impl<'data> LazyRawTextSequence<'data> { + pub fn ion_type(&self) -> IonType { + self.value.ion_type() + } + + pub fn iter(&self) -> RawTextSequenceIterator<'data> { + // Make an iterator over the input bytes that follow the initial `[` + RawTextSequenceIterator::new(self.value.input.slice_to_end(1)) + } +} + +impl<'data> LazyContainerPrivate<'data, 
TextEncoding> for LazyRawTextSequence<'data> { + fn from_value(value: LazyRawTextValue<'data>) -> Self { + LazyRawTextSequence { value } + } +} + +impl<'data> LazyRawSequence<'data, TextEncoding> for LazyRawTextSequence<'data> { + type Iterator = RawTextSequenceIterator<'data>; + + fn annotations(&self) -> >::AnnotationsIterator { + todo!("lazy sequence annotations") + } + + fn ion_type(&self) -> IonType { + self.value.ion_type() + } + + fn iter(&self) -> Self::Iterator { + LazyRawTextSequence::iter(self) + } + + fn as_value(&self) -> &LazyRawTextValue<'data> { + &self.value + } +} + +impl<'a, 'data> IntoIterator for &'a LazyRawTextSequence<'data> { + type Item = IonResult>; + type IntoIter = RawTextSequenceIterator<'data>; + + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} + +impl<'a> Debug for LazyRawTextSequence<'a> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self.value.encoded_value.ion_type() { + IonType::SExp => { + write!(f, "(")?; + for value in self { + write!( + f, + "{:?} ", + value + .map_err(|_| fmt::Error)? + .read() + .map_err(|_| fmt::Error)? + )?; + } + write!(f, ")").unwrap(); + } + IonType::List => { + write!(f, "[")?; + for value in self { + write!( + f, + "{:?},", + value + .map_err(|_| fmt::Error)? + .read() + .map_err(|_| fmt::Error)? 
+ )?; + } + write!(f, "]").unwrap(); + } + _ => unreachable!("LazyRawSequence is only created for list and sexp"), + } + + Ok(()) + } +} + +pub struct RawTextSequenceIterator<'data> { + input: TextBufferView<'data>, +} + +impl<'data> RawTextSequenceIterator<'data> { + pub(crate) fn new(input: TextBufferView<'data>) -> RawTextSequenceIterator<'data> { + RawTextSequenceIterator { input } + } +} + +impl<'data> Iterator for RawTextSequenceIterator<'data> { + type Item = IonResult>; + + fn next(&mut self) -> Option { + match self.input.match_list_value() { + Ok((remaining, Some(value))) => { + self.input = remaining; + Some(Ok(value)) + } + Ok((_remaining, None)) => None, + Err(e) => e + .with_context("reading the next list value", self.input) + .transpose(), + } + } +} diff --git a/src/lazy/text/value.rs b/src/lazy/text/value.rs index 3df2e985..dd33b98a 100644 --- a/src/lazy/text/value.rs +++ b/src/lazy/text/value.rs @@ -5,6 +5,7 @@ use crate::lazy::raw_value_ref::RawValueRef; use crate::lazy::text::buffer::TextBufferView; use crate::lazy::text::encoded_value::EncodedTextValue; use crate::lazy::text::matched::MatchedValue; +use crate::lazy::text::raw::sequence::LazyRawTextSequence; use crate::{IonResult, IonType, RawSymbolTokenRef}; use std::fmt; use std::fmt::{Debug, Formatter}; @@ -19,7 +20,7 @@ use std::fmt::{Debug, Formatter}; /// format than in its binary format, but is still possible.) For a resolved lazy value that /// includes a text definition for these items whenever one exists, see /// [`crate::lazy::value::LazyValue`]. -#[derive(Clone)] +#[derive(Copy, Clone)] pub struct LazyRawTextValue<'data> { pub(crate) encoded_value: EncodedTextValue, pub(crate) input: TextBufferView<'data>, @@ -54,7 +55,10 @@ impl<'data> LazyRawValue<'data, TextEncoding> for LazyRawTextValue<'data> { // ...decimal, timestamp... 
MatchedValue::String(s) => RawValueRef::String(s.read(matched_input)?), MatchedValue::Symbol(s) => RawValueRef::Symbol(s.read(matched_input)?), - // ...and the rest! + MatchedValue::List => { + let lazy_sequence = LazyRawTextSequence { value: *self }; + RawValueRef::List(lazy_sequence) + } // ...and the rest! }; Ok(value_ref) } From a6a3aa8c42c801bdbf3f7cb3a51cbabf670bb017 Mon Sep 17 00:00:00 2001 From: Zack Slayton Date: Wed, 9 Aug 2023 14:24:08 -1000 Subject: [PATCH 14/15] Adds `LazyRawTextReader` support for structs --- src/lazy/decoder.rs | 8 ++ src/lazy/encoding.rs | 59 +-------- src/lazy/raw_value_ref.rs | 2 +- src/lazy/text/buffer.rs | 202 +++++++++++++++++++++++++++- src/lazy/text/encoded_value.rs | 45 ++++++- src/lazy/text/matched.rs | 86 ++++++------ src/lazy/text/raw/mod.rs | 1 + src/lazy/text/raw/reader.rs | 30 ++++- src/lazy/text/raw/sequence.rs | 103 +++++++++++++-- src/lazy/text/raw/struct.rs | 232 +++++++++++++++++++++++++++++++++ src/lazy/text/value.rs | 15 ++- 11 files changed, 656 insertions(+), 127 deletions(-) create mode 100644 src/lazy/text/raw/struct.rs diff --git a/src/lazy/decoder.rs b/src/lazy/decoder.rs index e53ad2d2..c522a073 100644 --- a/src/lazy/decoder.rs +++ b/src/lazy/decoder.rs @@ -1,5 +1,6 @@ use crate::lazy::raw_stream_item::RawStreamItem; use crate::lazy::raw_value_ref::RawValueRef; +use crate::result::IonFailure; use crate::{IonResult, IonType, RawSymbolTokenRef}; use std::fmt::Debug; @@ -86,6 +87,13 @@ pub trait LazyRawStruct<'data, D: LazyDecoder<'data>>: fn annotations(&self) -> D::AnnotationsIterator; fn find(&self, name: &str) -> IonResult>; fn get(&self, name: &str) -> IonResult>>; + fn get_expected(&self, name: &str) -> IonResult> { + if let Some(value) = self.get(name)? 
{ + Ok(value) + } else { + IonResult::decoding_error(format!("did not find expected struct field '{}'", name)) + } + } fn iter(&self) -> Self::Iterator; } diff --git a/src/lazy/encoding.rs b/src/lazy/encoding.rs index 987bd1f6..3c6fc0f2 100644 --- a/src/lazy/encoding.rs +++ b/src/lazy/encoding.rs @@ -1,16 +1,16 @@ +use std::marker::PhantomData; + use crate::lazy::binary::raw::annotations_iterator::RawBinaryAnnotationsIterator; use crate::lazy::binary::raw::r#struct::LazyRawBinaryStruct; use crate::lazy::binary::raw::reader::LazyRawBinaryReader; use crate::lazy::binary::raw::sequence::LazyRawBinarySequence; use crate::lazy::binary::raw::value::LazyRawBinaryValue; -use crate::lazy::decoder::private::{LazyContainerPrivate, LazyRawFieldPrivate}; -use crate::lazy::decoder::{LazyDecoder, LazyRawField, LazyRawStruct}; -use crate::lazy::raw_value_ref::RawValueRef; +use crate::lazy::decoder::LazyDecoder; +use crate::lazy::text::raw::r#struct::LazyRawTextStruct; use crate::lazy::text::raw::reader::LazyRawTextReader; use crate::lazy::text::raw::sequence::LazyRawTextSequence; use crate::lazy::text::value::LazyRawTextValue; use crate::{IonResult, RawSymbolTokenRef}; -use std::marker::PhantomData; // These types derive trait implementations in order to allow types that containing them // to also derive trait implementations. @@ -35,55 +35,6 @@ impl<'data> LazyDecoder<'data> for BinaryEncoding { // The types below will need to be properly defined in order for the lazy text reader to be complete. // The exist to satisfy various trait definitions. 
-#[derive(Debug, Clone)] -pub struct ToDoTextStruct; - -#[derive(Debug, Clone)] -pub struct ToDoTextField; - -impl<'data> LazyRawFieldPrivate<'data, TextEncoding> for ToDoTextField { - fn into_value(self) -> LazyRawTextValue<'data> { - todo!() - } -} - -impl<'data> LazyRawField<'data, TextEncoding> for ToDoTextField { - fn name(&self) -> RawSymbolTokenRef<'data> { - todo!() - } - - fn value(&self) -> &LazyRawTextValue<'data> { - todo!() - } -} - -impl<'data> LazyContainerPrivate<'data, TextEncoding> for ToDoTextStruct { - fn from_value(_value: ::Value) -> Self { - todo!() - } -} - -impl<'data> LazyRawStruct<'data, TextEncoding> for ToDoTextStruct { - type Field = ToDoTextField; - type Iterator = Box>>; - - fn annotations(&self) -> ToDoTextAnnotationsIterator<'data> { - todo!() - } - - fn find(&self, _name: &str) -> IonResult>> { - todo!() - } - - fn get(&self, _name: &str) -> IonResult>> { - todo!() - } - - fn iter(&self) -> Self::Iterator { - todo!() - } -} - #[derive(Debug, Clone)] pub struct ToDoTextAnnotationsIterator<'data> { spooky: &'data PhantomData<()>, @@ -101,6 +52,6 @@ impl<'data> LazyDecoder<'data> for TextEncoding { type Reader = LazyRawTextReader<'data>; type Value = LazyRawTextValue<'data>; type Sequence = LazyRawTextSequence<'data>; - type Struct = ToDoTextStruct; + type Struct = LazyRawTextStruct<'data>; type AnnotationsIterator = ToDoTextAnnotationsIterator<'data>; } diff --git a/src/lazy/raw_value_ref.rs b/src/lazy/raw_value_ref.rs index 5e76db66..d4c8a614 100644 --- a/src/lazy/raw_value_ref.rs +++ b/src/lazy/raw_value_ref.rs @@ -176,7 +176,7 @@ impl<'data, D: LazyDecoder<'data>> RawValueRef<'data, D> { if let RawValueRef::Struct(s) = self { Ok(s) } else { - IonResult::decoding_error("expected a struct") + IonResult::decoding_error(format!("expected a struct, found: {:?}", self)) } } } diff --git a/src/lazy/text/buffer.rs b/src/lazy/text/buffer.rs index 22f17461..d9788f8e 100644 --- a/src/lazy/text/buffer.rs +++ b/src/lazy/text/buffer.rs @@ -1,6 
+1,6 @@ use std::fmt::{Debug, Formatter}; use std::iter::{Copied, Enumerate}; -use std::ops::{RangeFrom, RangeTo}; +use std::ops::{Range, RangeFrom, RangeTo}; use std::slice::Iter; use nom::branch::alt; @@ -16,10 +16,12 @@ use crate::lazy::encoding::TextEncoding; use crate::lazy::raw_stream_item::RawStreamItem; use crate::lazy::text::encoded_value::EncodedTextValue; use crate::lazy::text::matched::{ - MatchedFloat, MatchedInt, MatchedShortString, MatchedString, MatchedSymbol, MatchedValue, + MatchedFloat, MatchedInt, MatchedString, MatchedSymbol, MatchedValue, }; use crate::lazy::text::parse_result::{InvalidInputError, IonParseError}; use crate::lazy::text::parse_result::{IonMatchResult, IonParseResult}; +use crate::lazy::text::raw::r#struct::{LazyRawTextField, RawTextStructIterator}; +use crate::lazy::text::raw::sequence::RawTextSequenceIterator; use crate::lazy::text::value::LazyRawTextValue; use crate::result::DecodingError; use crate::{IonError, IonResult, IonType}; @@ -246,6 +248,78 @@ impl<'data> TextBufferView<'data> { )(self) } + /// Matches a struct field name/value pair. + /// + /// If a pair is found, returns `Some(field)` and consumes the following comma if present. + /// If no pair is found (that is: the end of the struct is next), returns `None`. + pub fn match_struct_field(self) -> IonParseResult<'data, Option>> { + // A struct field can have leading whitespace, but we want the buffer slice that we match + // to begin with the field name. Here we skip any whitespace so we have another named + // slice (`input_including_field_name`) with that property. + let (input_including_field_name, _ws) = self.match_optional_comments_and_whitespace()?; + alt(( + // If the next thing in the input is a `}`, return `None`. + value(None, Self::match_struct_end), + // Otherwise, match a name/value pair and turn it into a `LazyRawTextField`. 
+ Self::match_struct_field_name_and_value.map( + move |((name_syntax, name_span), mut value)| { + // Add the field name offsets to the `EncodedTextValue` + value.encoded_value = value.encoded_value.with_field_name( + name_syntax, + name_span.start, + name_span.len(), + ); + // Replace the value's buffer slice (which starts with the value itself) with the + // buffer slice we created that begins with the field name. + value.input = input_including_field_name; + Some(LazyRawTextField { value }) + }, + ), + ))(input_including_field_name) + } + + /// Matches any amount of whitespace followed by a closing `}`. + fn match_struct_end(self) -> IonMatchResult<'data> { + whitespace_and_then(peek(tag("}"))).parse(self) + } + + /// Matches a field name/value pair. Returns the syntax used for the field name, the range of + /// input bytes where the field name is found, and the value. + pub fn match_struct_field_name_and_value( + self, + ) -> IonParseResult<'data, ((MatchedSymbol, Range), LazyRawTextValue<'data>)> { + terminated( + separated_pair( + whitespace_and_then(match_and_span(Self::match_struct_field_name)), + whitespace_and_then(tag(":")), + whitespace_and_then(Self::match_value), + ), + whitespace_and_then(alt((tag(","), peek(tag("}"))))), + )(self) + } + + /// Matches a struct field name. That is: + /// * A quoted symbol + /// * An identifier + /// * A symbol ID + /// * A short-form string + pub fn match_struct_field_name(self) -> IonParseResult<'data, MatchedSymbol> { + alt(( + Self::match_symbol, + Self::match_short_string.map(|s| { + // NOTE: We're "casting" the matched short string to a matched symbol here. + // This relies on the fact that the MatchedSymbol logic ignores + // the first and last matched byte, which are usually single + // quotes but in this case are double quotes. 
+ match s { + MatchedString::ShortWithoutEscapes => MatchedSymbol::QuotedWithoutEscapes, + MatchedString::ShortWithEscapes => MatchedSymbol::QuotedWithEscapes, + _ => unreachable!("field name parser matched long string"), + } + }), + ))(self) + } + /// Matches syntax that is expected to follow a value in a list: any amount of whitespace and/or /// comments followed by either a comma (consumed) or an end-of-list `]` (not consumed). fn match_delimiter_after_list_value(self) -> IonMatchResult<'data> { @@ -317,9 +391,15 @@ impl<'data> TextBufferView<'data> { }, ), map( - match_and_length(tag("[")), - |(_matched_list_start, length)| { - EncodedTextValue::new(MatchedValue::List, self.offset(), length) + match_and_length(Self::match_list), + |(matched_list, length)| { + EncodedTextValue::new(MatchedValue::List, matched_list.offset(), length) + }, + ), + map( + match_and_length(Self::match_struct), + |(matched_struct, length)| { + EncodedTextValue::new(MatchedValue::Struct, matched_struct.offset(), length) }, ), // TODO: The other Ion types @@ -331,6 +411,74 @@ impl<'data> TextBufferView<'data> { .parse(self) } + /// Matches a list. + /// + /// If the input does not contain the entire list, returns `IonError::Incomplete(_)`. + pub fn match_list(self) -> IonMatchResult<'data> { + // If it doesn't start with [, it isn't a list. + if self.bytes().first() != Some(&b'[') { + let error = InvalidInputError::new(self); + return Err(nom::Err::Error(IonParseError::Invalid(error))); + } + // Scan ahead to find the end of this list. + let list_body = self.slice_to_end(1); + let sequence_iter = RawTextSequenceIterator::new(b']', list_body); + let span = match sequence_iter.find_span() { + Ok(span) => span, + // If the complete container isn't available, return an incomplete. + Err(IonError::Incomplete(_)) => return Err(nom::Err::Incomplete(Needed::Unknown)), + // If invalid syntax was encountered, return a failure to prevent nom from trying + // other parser kinds. 
+ Err(e) => { + return { + let error = InvalidInputError::new(self) + .with_label("matching a list") + .with_description(format!("{}", e)); + Err(nom::Err::Failure(IonParseError::Invalid(error))) + } + } + }; + + // For the matched span, we use `self` again to include the opening `[` + let matched = self.slice(0, span.len()); + let remaining = self.slice_to_end(span.len()); + Ok((remaining, matched)) + } + + /// Matches a struct. + /// + /// If the input does not contain the entire struct, returns `IonError::Incomplete(_)`. + pub fn match_struct(self) -> IonMatchResult<'data> { + // If it doesn't start with {, it isn't a struct. + if self.bytes().first() != Some(&b'{') { + let error = InvalidInputError::new(self); + return Err(nom::Err::Error(IonParseError::Invalid(error))); + } + // Scan ahead to find the end of this struct. + let struct_body = self.slice_to_end(1); + let struct_iter = RawTextStructIterator::new(struct_body); + let span = match struct_iter.find_span() { + Ok(span) => span, + // If the complete container isn't available, return an incomplete. + Err(IonError::Incomplete(_)) => return Err(nom::Err::Incomplete(Needed::Unknown)), + // If invalid syntax was encountered, return a failure to prevent nom from trying + // other parser kinds. + Err(e) => { + return { + let error = InvalidInputError::new(self) + .with_label("matching a struct") + .with_description(format!("{}", e)); + Err(nom::Err::Failure(IonParseError::Invalid(error))) + } + } + }; + + // For the matched span, we use `self` again to include the opening `{` + let matched = self.slice(0, span.len()); + let remaining = self.slice_to_end(span.len()); + Ok((remaining, matched)) + } + /// Matches a boolean value. 
pub fn match_bool(self) -> IonMatchResult<'data> { recognize(Self::read_bool)(self) @@ -617,7 +765,11 @@ impl<'data> TextBufferView<'data> { fn match_short_string(self) -> IonParseResult<'data, MatchedString> { delimited(char('"'), Self::match_short_string_body, char('"')) .map(|(_matched, contains_escaped_chars)| { - MatchedString::Short(MatchedShortString::new(contains_escaped_chars)) + if contains_escaped_chars { + MatchedString::ShortWithEscapes + } else { + MatchedString::ShortWithoutEscapes + } }) .parse(self) } @@ -715,7 +867,13 @@ impl<'data> TextBufferView<'data> { /// Matches a quoted symbol (`'foo'`). fn match_quoted_symbol(self) -> IonParseResult<'data, MatchedSymbol> { delimited(char('\''), Self::match_quoted_symbol_body, char('\'')) - .map(|(_matched, contains_escaped_chars)| MatchedSymbol::Quoted(contains_escaped_chars)) + .map(|(_matched, contains_escaped_chars)| { + if contains_escaped_chars { + MatchedSymbol::QuotedWithEscapes + } else { + MatchedSymbol::QuotedWithoutEscapes + } + }) .parse(self) } @@ -906,6 +1064,18 @@ impl<'data> nom::InputTakeAtPosition for TextBufferView<'data> { // === end of `nom` trait implementations +fn whitespace_and_then<'data, P, O>( + parser: P, +) -> impl Parser, O, IonParseError<'data>> +where + P: Parser, O, IonParseError<'data>>, +{ + preceded( + TextBufferView::match_optional_comments_and_whitespace, + parser, + ) +} + /// Augments a given parser such that it returns the matched value and the number of input bytes /// that it matched. 
fn match_and_length<'data, P, O>( @@ -926,6 +1096,24 @@ where } } +fn match_and_span<'data, P, O>( + mut parser: P, +) -> impl Parser, (O, Range), IonParseError<'data>> +where + P: Parser, O, IonParseError<'data>>, +{ + move |input: TextBufferView<'data>| { + let offset_before = input.offset(); + let (remaining, matched) = match parser.parse(input) { + Ok((remaining, matched)) => (remaining, matched), + Err(e) => return Err(e), + }; + let offset_after = remaining.offset(); + let span = offset_before..offset_after; + Ok((remaining, (matched, span))) + } +} + /// Returns the number of bytes that the provided parser matched. fn match_length<'data, P, O>( parser: P, diff --git a/src/lazy/text/encoded_value.rs b/src/lazy/text/encoded_value.rs index bce5d44b..6a1dbece 100644 --- a/src/lazy/text/encoded_value.rs +++ b/src/lazy/text/encoded_value.rs @@ -1,5 +1,7 @@ -use crate::lazy::text::matched::MatchedValue; -use crate::IonType; +use crate::lazy::text::buffer::TextBufferView; +use crate::lazy::text::matched::{MatchedSymbol, MatchedValue}; +use crate::result::IonFailure; +use crate::{IonResult, IonType}; use std::ops::Range; /// Represents the type, offset, and length metadata of the various components of an encoded value @@ -51,7 +53,7 @@ pub(crate) struct EncodedTextValue { // If there is whitespace before the field name, this will not include it. field_name_length: u32, // The number of bytes used to encode the annotations sequence preceding the data, if any. - // If there is no annotations sequence, this will be zero. // If there is whitespace before the + // If there is no annotations sequence, this will be zero. If there is whitespace before the // annotations sequence, this will not include it. annotations_length: u32, @@ -60,6 +62,8 @@ pub(crate) struct EncodedTextValue { // value is stored. For others (e.g. a timestamp), the various components of the value are // recognized during matching and partial information like subfield offsets can be stored here. 
matched_value: MatchedValue, + + field_name_syntax: Option, } impl EncodedTextValue { @@ -76,6 +80,7 @@ impl EncodedTextValue { annotations_offset: 0, annotations_length: 0, matched_value, + field_name_syntax: None, } } @@ -86,7 +91,13 @@ impl EncodedTextValue { // 'foo' // "foo" // $10 - pub(crate) fn with_field_name(mut self, offset: usize, length: usize) -> EncodedTextValue { + pub(crate) fn with_field_name( + mut self, + field_name_syntax: MatchedSymbol, + offset: usize, + length: usize, + ) -> EncodedTextValue { + self.field_name_syntax = Some(field_name_syntax); self.field_name_offset = (self.data_offset - offset) as u32; self.field_name_length = length as u32; self @@ -118,6 +129,7 @@ impl EncodedTextValue { MatchedValue::String(_) => IonType::String, MatchedValue::Symbol(_) => IonType::Symbol, MatchedValue::List => IonType::List, + MatchedValue::Struct => IonType::Struct, } } @@ -125,6 +137,10 @@ impl EncodedTextValue { matches!(self.matched_value, MatchedValue::Null(_)) } + pub fn data_offset(&self) -> usize { + self.data_offset + } + pub fn data_length(&self) -> usize { self.data_length } @@ -133,6 +149,17 @@ impl EncodedTextValue { self.data_offset..(self.data_offset + self.data_length) } + pub fn field_name<'data>(&self, input: TextBufferView<'data>) -> IonResult<&'data str> { + if self.field_name_offset == 0 { + return IonResult::illegal_operation( + "requested field name, but value was not in a struct field", + ); + } + let relative_start = self.data_offset - input.offset() - (self.field_name_offset as usize); + let field_name_bytes = input.slice(relative_start, self.field_name_length as usize); + field_name_bytes.as_text() + } + pub fn field_name_range(&self) -> Option> { if self.field_name_offset == 0 { return None; @@ -169,6 +196,10 @@ impl EncodedTextValue { pub fn matched(&self) -> &MatchedValue { &self.matched_value } + + pub fn field_name_syntax(&self) -> Option { + self.field_name_syntax + } } #[cfg(test)] @@ -184,7 +215,7 @@ mod tests { 
#[test] fn total_length_data_with_field_name() { let value = EncodedTextValue::new(MatchedValue::Null(IonType::Null), 100, 12) - .with_field_name(90, 4); + .with_field_name(MatchedSymbol::Identifier, 90, 4); assert_eq!(value.total_length(), 22); } @@ -198,13 +229,13 @@ mod tests { #[test] fn total_length_data_with_field_name_and_annotations() { let value = EncodedTextValue::new(MatchedValue::Null(IonType::Null), 100, 12) - .with_field_name(90, 4) + .with_field_name(MatchedSymbol::Identifier, 90, 4) .with_annotations_sequence(94, 6); assert_eq!(value.total_length(), 22); // Same test but with extra whitespace between the components let value = EncodedTextValue::new(MatchedValue::Null(IonType::Null), 100, 12) - .with_field_name(80, 4) + .with_field_name(MatchedSymbol::Identifier, 80, 4) .with_annotations_sequence(91, 6); assert_eq!(value.total_length(), 32, "{:?}", value); } diff --git a/src/lazy/text/matched.rs b/src/lazy/text/matched.rs index 53fa63b4..e6daf3dc 100644 --- a/src/lazy/text/matched.rs +++ b/src/lazy/text/matched.rs @@ -46,6 +46,7 @@ pub(crate) enum MatchedValue { String(MatchedString), Symbol(MatchedSymbol), List, + Struct, // TODO: ...the other types } @@ -53,6 +54,7 @@ pub(crate) enum MatchedValue { #[derive(Copy, Clone, Debug, PartialEq)] pub(crate) struct MatchedInt { radix: u32, + // Offset of the digits from the beginning of the value digits_offset: usize, is_negative: bool, } @@ -163,7 +165,8 @@ impl MatchedFloat { #[derive(Clone, Copy, Debug, PartialEq)] pub(crate) enum MatchedString { /// The string only has one segment. (e.g. "foo") - Short(MatchedShortString), + ShortWithoutEscapes, + ShortWithEscapes, /// The string is in multiple segments: /// """hello,""" /// """ world!""" @@ -178,50 +181,41 @@ pub(crate) struct MatchedLongString { // We probably also don't want to heap allocate just to match the long string. 
} -#[derive(Clone, Copy, Debug, PartialEq)] -pub(crate) struct MatchedShortString { - contains_escaped_chars: bool, -} - -impl MatchedShortString { - pub fn new(contains_escaped_chars: bool) -> Self { - Self { - contains_escaped_chars, - } - } - pub fn contains_escaped_chars(&self) -> bool { - self.contains_escaped_chars - } -} - impl MatchedString { // Strings longer than 64 bytes will allocate a larger space on the heap. const STACK_ALLOC_BUFFER_CAPACITY: usize = 64; pub fn read<'data>(&self, matched_input: TextBufferView<'data>) -> IonResult> { match self { - MatchedString::Short(short) => self.read_short_string(*short, matched_input), + MatchedString::ShortWithoutEscapes => { + self.read_short_string_without_escapes(matched_input) + } + MatchedString::ShortWithEscapes => self.read_short_string_with_escapes(matched_input), MatchedString::Long(_) => todo!("long-form strings"), } } - fn read_short_string<'data>( + fn read_short_string_without_escapes<'data>( + &self, + matched_input: TextBufferView<'data>, + ) -> IonResult> { + // Take a slice of the input that ignores the first and last bytes, which are quotes. + let body = matched_input.slice(1, matched_input.len() - 2); + // There are no escaped characters, so we can just validate the string in-place. + let text = body.as_text()?; + let str_ref = StrRef::from(text); + Ok(str_ref) + } + + fn read_short_string_with_escapes<'data>( &self, - short: MatchedShortString, matched_input: TextBufferView<'data>, ) -> IonResult> { // Take a slice of the input that ignores the first and last bytes, which are quotes. let body = matched_input.slice(1, matched_input.len() - 2); - if !short.contains_escaped_chars() { - // There are no escaped characters, so we can just validate the string in-place. - let text = body.as_text()?; - let str_ref = StrRef::from(text); - return Ok(str_ref); - } // Otherwise, there are escaped characters. 
We need to build a new version of our string // that replaces the escaped characters with their corresponding bytes. let mut sanitized = Vec::with_capacity(matched_input.len()); - escape_text(body, &mut sanitized)?; let text = String::from_utf8(sanitized).unwrap(); Ok(StrRef::from(text.to_string())) @@ -389,9 +383,10 @@ pub(crate) enum MatchedSymbol { SymbolId, /// The symbol is an unquoted identifier (e.g. `foo`) Identifier, - /// The symbol is delimited by single quotes. Holds a `bool` indicating whether the - /// matched input contained any escaped bytes. - Quoted(bool), + /// The symbol is delimited by single quotes but contains no escape sequences. + QuotedWithoutEscapes, + /// The symbol is delimited by single quotes and has at least one escape sequence. + QuotedWithEscapes, // TODO: Operators in S-Expressions } @@ -403,27 +398,31 @@ impl MatchedSymbol { match self { MatchedSymbol::SymbolId => self.read_symbol_id(matched_input), MatchedSymbol::Identifier => self.read_identifier(matched_input), - MatchedSymbol::Quoted(contains_escaped_chars) => { - self.read_quoted(matched_input, *contains_escaped_chars) - } + MatchedSymbol::QuotedWithEscapes => self.read_quoted_with_escapes(matched_input), + MatchedSymbol::QuotedWithoutEscapes => self.read_quoted_without_escapes(matched_input), } } - fn read_quoted<'data>( + pub(crate) fn read_quoted_without_escapes<'data>( &self, matched_input: TextBufferView<'data>, - contains_escaped_chars: bool, ) -> IonResult> { // Take a slice of the input that ignores the first and last bytes, which are quotes. let body = matched_input.slice(1, matched_input.len() - 2); - if !contains_escaped_chars { - // There are no escaped characters, so we can just validate the string in-place. - let text = body.as_text()?; - let str_ref = RawSymbolTokenRef::Text(text.into()); - return Ok(str_ref); - } + // There are no escaped characters, so we can just validate the string in-place. 
+ let text = body.as_text()?; + let str_ref = RawSymbolTokenRef::Text(text.into()); + Ok(str_ref) + } - // Otherwise, there are escaped characters. We need to build a new version of our symbol + pub(crate) fn read_quoted_with_escapes<'data>( + &self, + matched_input: TextBufferView<'data>, + ) -> IonResult> { + // Take a slice of the input that ignores the first and last bytes, which are quotes. + let body = matched_input.slice(1, matched_input.len() - 2); + + // There are escaped characters. We need to build a new version of our symbol // that replaces the escaped characters with their corresponding bytes. let mut sanitized = Vec::with_capacity(matched_input.len()); @@ -431,7 +430,8 @@ impl MatchedSymbol { let text = String::from_utf8(sanitized).unwrap(); Ok(RawSymbolTokenRef::Text(text.into())) } - fn read_identifier<'data>( + + pub(crate) fn read_identifier<'data>( &self, matched_input: TextBufferView<'data>, ) -> IonResult> { diff --git a/src/lazy/text/raw/mod.rs b/src/lazy/text/raw/mod.rs index a9ad6f8d..43f7a659 100644 --- a/src/lazy/text/raw/mod.rs +++ b/src/lazy/text/raw/mod.rs @@ -1,2 +1,3 @@ pub mod reader; pub mod sequence; +pub mod r#struct; diff --git a/src/lazy/text/raw/reader.rs b/src/lazy/text/raw/reader.rs index 99ef7537..9ff7a5c0 100644 --- a/src/lazy/text/raw/reader.rs +++ b/src/lazy/text/raw/reader.rs @@ -39,7 +39,7 @@ impl<'data> LazyRawTextReader<'data> { { let buffer = self.buffer; if buffer.is_empty() { - return IonResult::incomplete("reading a top-level value", buffer.offset()); + return Ok(RawStreamItem::EndOfStream); } let (buffer_after_whitespace, _whitespace) = @@ -55,7 +55,8 @@ impl<'data> LazyRawTextReader<'data> { let (remaining, matched) = buffer_after_whitespace .match_top_level() .with_context("reading a top-level value", buffer_after_whitespace)?; - // If we successfully moved to the next value, store the remaining buffer view + // Since we successfully matched the next value, we'll update the buffer + // so a future call to 
`next()` will resume parsing the remaining input. self.buffer = remaining; Ok(matched) } @@ -73,11 +74,12 @@ impl<'data> LazyRawReader<'data, TextEncoding> for LazyRawTextReader<'data> { #[cfg(test)] mod tests { - use super::*; - use crate::lazy::decoder::LazyRawValue; + use crate::lazy::decoder::{LazyRawStruct, LazyRawValue}; use crate::lazy::raw_value_ref::RawValueRef; use crate::{IonType, RawSymbolTokenRef}; + use super::*; + #[test] fn test_top_level() -> IonResult<()> { let mut data = String::new(); @@ -155,6 +157,15 @@ mod tests { 3 ] + { + // Identifier + foo: 100, + // Quoted symbol + 'bar': 200, + // Short-form string + "baz": 300 + } + "#, ); @@ -268,6 +279,7 @@ mod tests { RawValueRef::Symbol(RawSymbolTokenRef::SymbolId(733)), ); + // [1, 2, 3] let list = reader.next()?.expect_value()?.read()?.expect_list()?; let mut sum = 0; for value in &list { @@ -275,6 +287,16 @@ mod tests { } assert_eq!(sum, 6); + // {foo: 100, bar: 200, baz: 300} + let item = reader.next()?; + let value = item.expect_value()?.read()?; + let strukt = value.expect_struct()?; + let mut sum = 0; + sum += strukt.get_expected("foo")?.expect_i64()?; + sum += strukt.get_expected("bar")?.expect_i64()?; + sum += strukt.get_expected("baz")?.expect_i64()?; + assert_eq!(sum, 600); + Ok(()) } } diff --git a/src/lazy/text/raw/sequence.rs b/src/lazy/text/raw/sequence.rs index ab1f4616..2e4e7e1c 100644 --- a/src/lazy/text/raw/sequence.rs +++ b/src/lazy/text/raw/sequence.rs @@ -1,3 +1,9 @@ +use std::fmt; +use std::fmt::{Debug, Formatter}; +use std::ops::Range; + +use nom::character::streaming::satisfy; + use crate::lazy::decoder::private::LazyContainerPrivate; use crate::lazy::decoder::{LazyDecoder, LazyRawSequence, LazyRawValue}; use crate::lazy::encoding::TextEncoding; @@ -6,8 +12,6 @@ use crate::lazy::text::parse_result::AddContext; use crate::lazy::text::parse_result::ToIteratorOutput; use crate::lazy::text::value::LazyRawTextValue; use crate::{IonResult, IonType}; -use std::fmt; -use 
std::fmt::{Debug, Formatter}; #[derive(Copy, Clone)] pub struct LazyRawTextSequence<'data> { @@ -21,7 +25,7 @@ impl<'data> LazyRawTextSequence<'data> { pub fn iter(&self) -> RawTextSequenceIterator<'data> { // Make an iterator over the input bytes that follow the initial `[` - RawTextSequenceIterator::new(self.value.input.slice_to_end(1)) + RawTextSequenceIterator::new(b']', self.value.input.slice_to_end(1)) } } @@ -98,13 +102,50 @@ impl<'a> Debug for LazyRawTextSequence<'a> { } } +#[derive(Copy, Clone, Debug)] pub struct RawTextSequenceIterator<'data> { + end_delimiter: u8, input: TextBufferView<'data>, + // If this iterator has returned an error, it should return `None` forever afterwards + has_returned_error: bool, +} + +impl<'data> RawTextSequenceIterator<'data> { + pub(crate) fn new( + end_delimiter: u8, + input: TextBufferView<'data>, + ) -> RawTextSequenceIterator<'data> { + RawTextSequenceIterator { + end_delimiter, + input, + has_returned_error: false, + } + } } impl<'data> RawTextSequenceIterator<'data> { - pub(crate) fn new(input: TextBufferView<'data>) -> RawTextSequenceIterator<'data> { - RawTextSequenceIterator { input } + pub(crate) fn find_span(&self) -> IonResult> { + // The input has already skipped past the opening delimiter. + let start = self.input.offset() - 1; + // We need to find the input slice containing the closing delimiter. It's either... + let input_after_last = if let Some(value_result) = self.last() { + let value = value_result?; + // ...the input slice that follows the last sequence value... + value.input.slice_to_end(value.encoded_value.total_length()) + } else { + // ...or there aren't values, so it's just the input after the opening delimiter. 
+ self.input + }; + let (input_after_ws, _ws) = input_after_last + .match_optional_comments_and_whitespace() + .with_context("seeking the end of a sequence", input_after_last)?; + let (input_after_end, _end_delimiter) = + satisfy(|c| c == self.end_delimiter as char)(input_after_ws).with_context( + "seeking the closing delimiter of a sequence", + input_after_ws, + )?; + let end = input_after_end.offset(); + Ok(start..end) } } @@ -112,15 +153,61 @@ impl<'data> Iterator for RawTextSequenceIterator<'data> { type Item = IonResult>; fn next(&mut self) -> Option { + if self.has_returned_error { + return None; + } match self.input.match_list_value() { Ok((remaining, Some(value))) => { self.input = remaining; Some(Ok(value)) } Ok((_remaining, None)) => None, - Err(e) => e - .with_context("reading the next list value", self.input) - .transpose(), + Err(e) => { + self.has_returned_error = true; + e.with_context("reading the next list value", self.input) + .transpose() + } + } + } +} + +#[cfg(test)] +mod tests { + use std::ops::Range; + + use crate::lazy::text::raw::reader::LazyRawTextReader; + use crate::IonResult; + + fn expect_sequence_range(ion_data: &str, expected: Range) -> IonResult<()> { + let reader = &mut LazyRawTextReader::new(ion_data.as_bytes()); + let value = reader.next()?.expect_value()?; + let actual_range = value.encoded_value.data_range(); + assert_eq!( + actual_range, expected, + "Sequence range ({:?}) did not match expected range ({:?})", + actual_range, expected + ); + Ok(()) + } + + #[test] + fn list_range() -> IonResult<()> { + // For each pair below, we'll confirm that the top-level list is found to + // occupy the specified input span. 
+ let tests = &[ + // (Ion input, expected range of the sequence) + ("[]", 0..2), + (" [] ", 2..4), + ("[1, 2]", 0..6), + ("[1, /* comment ]]] */ 2]", 0..24), + // Nested + ("[1, 2, [3, 4, 5], 6]", 0..20), + // Doubly nested + ("[1, 2, [3, [a, b, c], 5], 6]", 0..28), + ]; + for test in tests { + expect_sequence_range(test.0, test.1.clone())?; } + Ok(()) } } diff --git a/src/lazy/text/raw/struct.rs b/src/lazy/text/raw/struct.rs new file mode 100644 index 00000000..f9f3742d --- /dev/null +++ b/src/lazy/text/raw/struct.rs @@ -0,0 +1,232 @@ +use crate::lazy::decoder::private::{LazyContainerPrivate, LazyRawFieldPrivate}; +use crate::lazy::decoder::{LazyRawField, LazyRawStruct, LazyRawValue}; +use crate::lazy::encoding::{TextEncoding, ToDoTextAnnotationsIterator}; +use crate::lazy::raw_value_ref::RawValueRef; +use crate::lazy::text::buffer::TextBufferView; +use crate::lazy::text::parse_result::{AddContext, ToIteratorOutput}; +use crate::lazy::text::value::LazyRawTextValue; +use crate::raw_symbol_token_ref::AsRawSymbolTokenRef; +use crate::{IonResult, RawSymbolTokenRef}; +use nom::character::streaming::satisfy; +use std::ops::Range; + +#[derive(Clone, Copy, Debug)] +pub struct RawTextStructIterator<'data> { + input: TextBufferView<'data>, + has_returned_error: bool, +} + +impl<'data> RawTextStructIterator<'data> { + pub(crate) fn new(input: TextBufferView<'data>) -> Self { + RawTextStructIterator { + input, + has_returned_error: false, + } + } + + pub(crate) fn find_span(&self) -> IonResult> { + // The input has already skipped past the opening delimiter. + let start = self.input.offset() - 1; + // We need to find the input slice containing the closing delimiter. It's either... + let input_after_last = if let Some(field_result) = self.last() { + let field = field_result?; + // ...the input slice that follows the last field... 
+ field + .value + .input + .slice_to_end(field.value.encoded_value.total_length()) + } else { + // ...or there aren't fields, so it's just the input after the opening delimiter. + self.input + }; + let (input_after_ws, _ws) = input_after_last + .match_optional_comments_and_whitespace() + .with_context("seeking the end of a struct", input_after_last)?; + let (input_after_end, _end_delimiter) = satisfy(|c| c == b'}' as char)(input_after_ws) + .with_context("seeking the closing delimiter of a struct", input_after_ws)?; + let end = input_after_end.offset(); + Ok(start..end) + } +} + +impl<'data> Iterator for RawTextStructIterator<'data> { + type Item = IonResult>; + + fn next(&mut self) -> Option { + if self.has_returned_error { + return None; + } + match self.input.match_struct_field() { + Ok((remaining_input, Some(field))) => { + self.input = remaining_input; + Some(Ok(field)) + } + Ok((_, None)) => None, + Err(e) => { + self.has_returned_error = true; + e.with_context("reading the next struct field", self.input) + .transpose() + } + } + } +} + +#[derive(Clone, Copy, Debug)] +pub struct LazyRawTextField<'data> { + pub(crate) value: LazyRawTextValue<'data>, +} + +impl<'data> LazyRawTextField<'data> { + pub(crate) fn new(value: LazyRawTextValue<'data>) -> Self { + LazyRawTextField { value } + } + + pub fn name(&self) -> RawSymbolTokenRef<'data> { + // We're in a struct field, the field name _must_ be populated. + // If it's not (or the field name is not a valid SID or UTF-8 string), + // that's a bug. We can safely unwrap/expect here. 
+ let matched_symbol = self + .value + .encoded_value + .field_name_syntax() + .expect("field name syntax not available"); + let name_length = self + .value + .encoded_value + .field_name_range() + .expect("field name length not available") + .len(); + matched_symbol + .read(self.value.input.slice(0, name_length)) + .expect("invalid struct field name") + } + + pub fn value(&self) -> &LazyRawTextValue<'data> { + &self.value + } + + pub(crate) fn into_value(self) -> LazyRawTextValue<'data> { + self.value + } +} + +impl<'data> LazyRawFieldPrivate<'data, TextEncoding> for LazyRawTextField<'data> { + fn into_value(self) -> LazyRawTextValue<'data> { + self.value + } +} + +impl<'data> LazyRawField<'data, TextEncoding> for LazyRawTextField<'data> { + fn name(&self) -> RawSymbolTokenRef<'data> { + LazyRawTextField::name(self) + } + + fn value(&self) -> &LazyRawTextValue<'data> { + LazyRawTextField::value(self) + } +} + +#[derive(Clone, Copy, Debug)] +pub struct LazyRawTextStruct<'data> { + pub(crate) value: LazyRawTextValue<'data>, +} + +impl<'data> LazyRawTextStruct<'data> { + fn find(&self, name: &str) -> IonResult>> { + let name: RawSymbolTokenRef = name.as_raw_symbol_token_ref(); + for field_result in *self { + let field = field_result?; + let field_name = field.name(); + if field_name == name { + let value = field.value; + return Ok(Some(value)); + } + } + Ok(None) + } + + fn get(&self, name: &str) -> IonResult>> { + self.find(name)?.map(|f| f.read()).transpose() + } +} + +impl<'data> LazyContainerPrivate<'data, TextEncoding> for LazyRawTextStruct<'data> { + fn from_value(value: LazyRawTextValue<'data>) -> Self { + LazyRawTextStruct { value } + } +} + +impl<'data> LazyRawStruct<'data, TextEncoding> for LazyRawTextStruct<'data> { + type Field = LazyRawTextField<'data>; + type Iterator = RawTextStructIterator<'data>; + + fn annotations(&self) -> ToDoTextAnnotationsIterator<'data> { + todo!() + } + + fn find(&self, name: &str) -> IonResult>> { + self.find(name) + } + + fn 
get(&self, name: &str) -> IonResult>> { + self.get(name) + } + + fn iter(&self) -> Self::Iterator { + // Slice the input to skip the opening `{` + RawTextStructIterator::new(self.value.input.slice_to_end(1)) + } +} + +impl<'data> IntoIterator for LazyRawTextStruct<'data> { + type Item = IonResult>; + type IntoIter = RawTextStructIterator<'data>; + + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} + +#[cfg(test)] +mod tests { + use std::ops::Range; + + use crate::lazy::text::raw::reader::LazyRawTextReader; + use crate::IonResult; + + fn expect_struct_range(ion_data: &str, expected: Range) -> IonResult<()> { + let reader = &mut LazyRawTextReader::new(ion_data.as_bytes()); + let value = reader.next()?.expect_value()?; + let actual_range = value.encoded_value.data_range(); + assert_eq!( + actual_range, expected, + "Struct range ({:?}) did not match expected range ({:?})", + actual_range, expected + ); + println!("input ok: {}", ion_data); + Ok(()) + } + + #[test] + fn struct_range() -> IonResult<()> { + // For each pair below, we'll confirm that the top-level struct is found to + // occupy the specified input span. 
+ let tests = &[ + // (Ion input, expected range of the struct) + ("{}", 0..2), + (" {} ", 2..4), + ("{a:1}", 0..5), + ("{a: 1}", 0..6), + ("{a: 1, b: 2}", 0..12), + ("{a: 1, /* comment }}} */ b: 2}", 0..30), + // Nested + ("{a: 1, b: 2, c: {d: 3, e: 4, f: 5}, g: 6}", 0..41), + // Doubly nested + ("{a: 1, b: 2, c: {d: 3, e: {foo: bar}, f: 5}, g: 6}", 0..50), + ]; + for test in tests { + expect_struct_range(test.0, test.1.clone())?; + } + Ok(()) + } +} diff --git a/src/lazy/text/value.rs b/src/lazy/text/value.rs index dd33b98a..0059dd8b 100644 --- a/src/lazy/text/value.rs +++ b/src/lazy/text/value.rs @@ -1,3 +1,6 @@ +use std::fmt; +use std::fmt::{Debug, Formatter}; + use crate::lazy::decoder::private::LazyRawValuePrivate; use crate::lazy::decoder::{LazyDecoder, LazyRawValue}; use crate::lazy::encoding::TextEncoding; @@ -5,10 +8,9 @@ use crate::lazy::raw_value_ref::RawValueRef; use crate::lazy::text::buffer::TextBufferView; use crate::lazy::text::encoded_value::EncodedTextValue; use crate::lazy::text::matched::MatchedValue; +use crate::lazy::text::raw::r#struct::LazyRawTextStruct; use crate::lazy::text::raw::sequence::LazyRawTextSequence; use crate::{IonResult, IonType, RawSymbolTokenRef}; -use std::fmt; -use std::fmt::{Debug, Formatter}; /// A value that has been identified in the text input stream but whose data has not yet been read. 
/// @@ -46,7 +48,10 @@ impl<'data> LazyRawValue<'data, TextEncoding> for LazyRawTextValue<'data> { } fn read(&self) -> IonResult> { - let matched_input = self.input.slice(0, self.encoded_value.data_length()); + let matched_input = self.input.slice( + self.encoded_value.data_offset() - self.input.offset(), + self.encoded_value.data_length(), + ); let value_ref = match self.encoded_value.matched() { MatchedValue::Null(ion_type) => RawValueRef::Null(*ion_type), MatchedValue::Bool(b) => RawValueRef::Bool(*b), @@ -58,6 +63,10 @@ impl<'data> LazyRawValue<'data, TextEncoding> for LazyRawTextValue<'data> { MatchedValue::List => { let lazy_sequence = LazyRawTextSequence { value: *self }; RawValueRef::List(lazy_sequence) + } + MatchedValue::Struct => { + let lazy_struct = LazyRawTextStruct { value: *self }; + RawValueRef::Struct(lazy_struct) } // ...and the rest! }; Ok(value_ref) From 4fc9078592d567843cea4e00124a9722e15f11fc Mon Sep 17 00:00:00 2001 From: Zack Slayton Date: Thu, 10 Aug 2023 11:57:35 -1000 Subject: [PATCH 15/15] More doc comments --- src/lazy/text/buffer.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/lazy/text/buffer.rs b/src/lazy/text/buffer.rs index d9788f8e..b78f8584 100644 --- a/src/lazy/text/buffer.rs +++ b/src/lazy/text/buffer.rs @@ -1064,6 +1064,8 @@ impl<'data> nom::InputTakeAtPosition for TextBufferView<'data> { // === end of `nom` trait implementations +/// Takes a given parser and returns a new one that accepts any amount of leading whitespace before +/// calling the original parser. fn whitespace_and_then<'data, P, O>( parser: P, ) -> impl Parser, O, IonParseError<'data>> @@ -1096,6 +1098,8 @@ where } } +/// Augments a given parser such that it returns the matched value and the range of input bytes +/// that it matched. fn match_and_span<'data, P, O>( mut parser: P, ) -> impl Parser, (O, Range), IonParseError<'data>>