diff --git a/src/lazy/decoder.rs b/src/lazy/decoder.rs index e53ad2d2..c522a073 100644 --- a/src/lazy/decoder.rs +++ b/src/lazy/decoder.rs @@ -1,5 +1,6 @@ use crate::lazy::raw_stream_item::RawStreamItem; use crate::lazy::raw_value_ref::RawValueRef; +use crate::result::IonFailure; use crate::{IonResult, IonType, RawSymbolTokenRef}; use std::fmt::Debug; @@ -86,6 +87,13 @@ pub trait LazyRawStruct<'data, D: LazyDecoder<'data>>: fn annotations(&self) -> D::AnnotationsIterator; fn find(&self, name: &str) -> IonResult>; fn get(&self, name: &str) -> IonResult>>; + fn get_expected(&self, name: &str) -> IonResult> { + if let Some(value) = self.get(name)? { + Ok(value) + } else { + IonResult::decoding_error(format!("did not find expected struct field '{}'", name)) + } + } fn iter(&self) -> Self::Iterator; } diff --git a/src/lazy/encoding.rs b/src/lazy/encoding.rs index 987bd1f6..3c6fc0f2 100644 --- a/src/lazy/encoding.rs +++ b/src/lazy/encoding.rs @@ -1,16 +1,16 @@ +use std::marker::PhantomData; + use crate::lazy::binary::raw::annotations_iterator::RawBinaryAnnotationsIterator; use crate::lazy::binary::raw::r#struct::LazyRawBinaryStruct; use crate::lazy::binary::raw::reader::LazyRawBinaryReader; use crate::lazy::binary::raw::sequence::LazyRawBinarySequence; use crate::lazy::binary::raw::value::LazyRawBinaryValue; -use crate::lazy::decoder::private::{LazyContainerPrivate, LazyRawFieldPrivate}; -use crate::lazy::decoder::{LazyDecoder, LazyRawField, LazyRawStruct}; -use crate::lazy::raw_value_ref::RawValueRef; +use crate::lazy::decoder::LazyDecoder; +use crate::lazy::text::raw::r#struct::LazyRawTextStruct; use crate::lazy::text::raw::reader::LazyRawTextReader; use crate::lazy::text::raw::sequence::LazyRawTextSequence; use crate::lazy::text::value::LazyRawTextValue; use crate::{IonResult, RawSymbolTokenRef}; -use std::marker::PhantomData; // These types derive trait implementations in order to allow types that containing them // to also derive trait implementations. @@ -35,55 +35,6 @@ impl<'data> LazyDecoder<'data> for BinaryEncoding { // The types below will need to be properly defined in order for the lazy text reader to be complete. // The exist to satisfy various trait definitions. -#[derive(Debug, Clone)] -pub struct ToDoTextStruct; - -#[derive(Debug, Clone)] -pub struct ToDoTextField; - -impl<'data> LazyRawFieldPrivate<'data, TextEncoding> for ToDoTextField { - fn into_value(self) -> LazyRawTextValue<'data> { - todo!() - } -} - -impl<'data> LazyRawField<'data, TextEncoding> for ToDoTextField { - fn name(&self) -> RawSymbolTokenRef<'data> { - todo!() - } - - fn value(&self) -> &LazyRawTextValue<'data> { - todo!() - } -} - -impl<'data> LazyContainerPrivate<'data, TextEncoding> for ToDoTextStruct { - fn from_value(_value: ::Value) -> Self { - todo!() - } -} - -impl<'data> LazyRawStruct<'data, TextEncoding> for ToDoTextStruct { - type Field = ToDoTextField; - type Iterator = Box>>; - - fn annotations(&self) -> ToDoTextAnnotationsIterator<'data> { - todo!() - } - - fn find(&self, _name: &str) -> IonResult>> { - todo!() - } - - fn get(&self, _name: &str) -> IonResult>> { - todo!() - } - - fn iter(&self) -> Self::Iterator { - todo!() - } -} - #[derive(Debug, Clone)] pub struct ToDoTextAnnotationsIterator<'data> { spooky: &'data PhantomData<()>, @@ -101,6 +52,6 @@ impl<'data> LazyDecoder<'data> for TextEncoding { type Reader = LazyRawTextReader<'data>; type Value = LazyRawTextValue<'data>; type Sequence = LazyRawTextSequence<'data>; - type Struct = ToDoTextStruct; + type Struct = LazyRawTextStruct<'data>; type AnnotationsIterator = ToDoTextAnnotationsIterator<'data>; } diff --git a/src/lazy/raw_value_ref.rs b/src/lazy/raw_value_ref.rs index 5e76db66..d4c8a614 100644 --- a/src/lazy/raw_value_ref.rs +++ b/src/lazy/raw_value_ref.rs @@ -176,7 +176,7 @@ impl<'data, D: LazyDecoder<'data>> RawValueRef<'data, D> { if let RawValueRef::Struct(s) = self { Ok(s) } else { - IonResult::decoding_error("expected a struct") + IonResult::decoding_error(format!("expected a struct, found: {:?}", self)) } } } diff --git a/src/lazy/text/buffer.rs b/src/lazy/text/buffer.rs index a0e2e42e..f17a6340 100644 --- a/src/lazy/text/buffer.rs +++ b/src/lazy/text/buffer.rs @@ -1,6 +1,6 @@ use std::fmt::{Debug, Formatter}; use std::iter::{Copied, Enumerate}; -use std::ops::{RangeFrom, RangeTo}; +use std::ops::{Range, RangeFrom, RangeTo}; use std::slice::Iter; use nom::branch::alt; @@ -16,10 +16,12 @@ use crate::lazy::encoding::TextEncoding; use crate::lazy::raw_stream_item::RawStreamItem; use crate::lazy::text::encoded_value::EncodedTextValue; use crate::lazy::text::matched::{ - MatchedFloat, MatchedInt, MatchedShortString, MatchedString, MatchedSymbol, MatchedValue, + MatchedFloat, MatchedInt, MatchedString, MatchedSymbol, MatchedValue, }; use crate::lazy::text::parse_result::{InvalidInputError, IonParseError}; use crate::lazy::text::parse_result::{IonMatchResult, IonParseResult}; +use crate::lazy::text::raw::r#struct::{LazyRawTextField, RawTextStructIterator}; +use crate::lazy::text::raw::sequence::RawTextSequenceIterator; use crate::lazy::text::value::LazyRawTextValue; use crate::result::DecodingError; use crate::{IonError, IonResult, IonType}; @@ -246,6 +248,78 @@ impl<'data> TextBufferView<'data> { )(self) } + /// Matches a struct field name/value pair. + /// + /// If a pair is found, returns `Some(field)` and consumes the following comma if present. + /// If no pair is found (that is: the end of the struct is next), returns `None`. + pub fn match_struct_field(self) -> IonParseResult<'data, Option>> { + // A struct field can have leading whitespace, but we want the buffer slice that we match + // to begin with the field name. Here we skip any whitespace so we have another named + // slice (`input_including_field_name`) with that property. + let (input_including_field_name, _ws) = self.match_optional_comments_and_whitespace()?; + alt(( + // If the next thing in the input is a `}`, return `None`. + value(None, Self::match_struct_end), + // Otherwise, match a name/value pair and turn it into a `LazyRawTextField`. + Self::match_struct_field_name_and_value.map( + move |((name_syntax, name_span), mut value)| { + // Add the field name offsets to the `EncodedTextValue` + value.encoded_value = value.encoded_value.with_field_name( + name_syntax, + name_span.start, + name_span.len(), + ); + // Replace the value's buffer slice (which starts with the value itself) with the + // buffer slice we created that begins with the field name. + value.input = input_including_field_name; + Some(LazyRawTextField { value }) + }, + ), + ))(input_including_field_name) + } + + /// Matches any amount of whitespace followed by a closing `}`. + fn match_struct_end(self) -> IonMatchResult<'data> { + whitespace_and_then(peek(tag("}"))).parse(self) + } + + /// Matches a field name/value pair. Returns the syntax used for the field name, the range of + /// input bytes where the field name is found, and the value. + pub fn match_struct_field_name_and_value( + self, + ) -> IonParseResult<'data, ((MatchedSymbol, Range), LazyRawTextValue<'data>)> { + terminated( + separated_pair( + whitespace_and_then(match_and_span(Self::match_struct_field_name)), + whitespace_and_then(tag(":")), + whitespace_and_then(Self::match_value), + ), + whitespace_and_then(alt((tag(","), peek(tag("}"))))), + )(self) + } + + /// Matches a struct field name. That is: + /// * A quoted symbol + /// * An identifier + /// * A symbol ID + /// * A short-form string + pub fn match_struct_field_name(self) -> IonParseResult<'data, MatchedSymbol> { + alt(( + Self::match_symbol, + Self::match_short_string.map(|s| { + // NOTE: We're "casting" the matched short string to a matched symbol here. + // This relies on the fact that the MatchedSymbol logic ignores + // the first and last matched byte, which are usually single + // quotes but in this case are double quotes. + match s { + MatchedString::ShortWithoutEscapes => MatchedSymbol::QuotedWithoutEscapes, + MatchedString::ShortWithEscapes => MatchedSymbol::QuotedWithEscapes, + _ => unreachable!("field name parser matched long string"), + } + }), + ))(self) + } + /// Matches syntax that is expected to follow a value in a list: any amount of whitespace and/or /// comments followed by either a comma (consumed) or an end-of-list `]` (not consumed). fn match_delimiter_after_list_value(self) -> IonMatchResult<'data> { @@ -317,9 +391,15 @@ impl<'data> TextBufferView<'data> { }, ), map( - match_and_length(tag("[")), - |(_matched_list_start, length)| { - EncodedTextValue::new(MatchedValue::List, self.offset(), length) + match_and_length(Self::match_list), + |(matched_list, length)| { + EncodedTextValue::new(MatchedValue::List, matched_list.offset(), length) + }, + ), + map( + match_and_length(Self::match_struct), + |(matched_struct, length)| { + EncodedTextValue::new(MatchedValue::Struct, matched_struct.offset(), length) }, ), // TODO: The other Ion types @@ -331,6 +411,74 @@ impl<'data> TextBufferView<'data> { .parse(self) } + /// Matches a list. + /// + /// If the input does not contain the entire list, returns `IonError::Incomplete(_)`. + pub fn match_list(self) -> IonMatchResult<'data> { + // If it doesn't start with [, it isn't a list. + if self.bytes().first() != Some(&b'[') { + let error = InvalidInputError::new(self); + return Err(nom::Err::Error(IonParseError::Invalid(error))); + } + // Scan ahead to find the end of this list. + let list_body = self.slice_to_end(1); + let sequence_iter = RawTextSequenceIterator::new(b']', list_body); + let span = match sequence_iter.find_span() { + Ok(span) => span, + // If the complete container isn't available, return an incomplete. + Err(IonError::Incomplete(_)) => return Err(nom::Err::Incomplete(Needed::Unknown)), + // If invalid syntax was encountered, return a failure to prevent nom from trying + // other parser kinds. + Err(e) => { + return { + let error = InvalidInputError::new(self) + .with_label("matching a list") + .with_description(format!("{}", e)); + Err(nom::Err::Failure(IonParseError::Invalid(error))) + } + } + }; + + // For the matched span, we use `self` again to include the opening `[` + let matched = self.slice(0, span.len()); + let remaining = self.slice_to_end(span.len()); + Ok((remaining, matched)) + } + + /// Matches a struct. + /// + /// If the input does not contain the entire struct, returns `IonError::Incomplete(_)`. + pub fn match_struct(self) -> IonMatchResult<'data> { + // If it doesn't start with {, it isn't a struct. + if self.bytes().first() != Some(&b'{') { + let error = InvalidInputError::new(self); + return Err(nom::Err::Error(IonParseError::Invalid(error))); + } + // Scan ahead to find the end of this struct. + let struct_body = self.slice_to_end(1); + let struct_iter = RawTextStructIterator::new(struct_body); + let span = match struct_iter.find_span() { + Ok(span) => span, + // If the complete container isn't available, return an incomplete. + Err(IonError::Incomplete(_)) => return Err(nom::Err::Incomplete(Needed::Unknown)), + // If invalid syntax was encountered, return a failure to prevent nom from trying + // other parser kinds. + Err(e) => { + return { + let error = InvalidInputError::new(self) + .with_label("matching a struct") + .with_description(format!("{}", e)); + Err(nom::Err::Failure(IonParseError::Invalid(error))) + } + } + }; + + // For the matched span, we use `self` again to include the opening `{` + let matched = self.slice(0, span.len()); + let remaining = self.slice_to_end(span.len()); + Ok((remaining, matched)) + } + /// Matches a boolean value. pub fn match_bool(self) -> IonMatchResult<'data> { recognize(Self::read_bool)(self) @@ -617,7 +765,11 @@ impl<'data> TextBufferView<'data> { fn match_short_string(self) -> IonParseResult<'data, MatchedString> { delimited(char('"'), Self::match_short_string_body, char('"')) .map(|(_matched, contains_escaped_chars)| { - MatchedString::Short(MatchedShortString::new(contains_escaped_chars)) + if contains_escaped_chars { + MatchedString::ShortWithEscapes + } else { + MatchedString::ShortWithoutEscapes + } }) .parse(self) } @@ -715,7 +867,13 @@ impl<'data> TextBufferView<'data> { /// Matches a quoted symbol (`'foo'`). fn match_quoted_symbol(self) -> IonParseResult<'data, MatchedSymbol> { delimited(char('\''), Self::match_quoted_symbol_body, char('\'')) - .map(|(_matched, contains_escaped_chars)| MatchedSymbol::Quoted(contains_escaped_chars)) + .map(|(_matched, contains_escaped_chars)| { + if contains_escaped_chars { + MatchedSymbol::QuotedWithEscapes + } else { + MatchedSymbol::QuotedWithoutEscapes + } + }) .parse(self) } @@ -906,6 +1064,20 @@ impl<'data> nom::InputTakeAtPosition for TextBufferView<'data> { // === end of `nom` trait implementations +/// Takes a given parser and returns a new one that accepts any amount of leading whitespace before +/// calling the original parser. +fn whitespace_and_then<'data, P, O>( + parser: P, +) -> impl Parser, O, IonParseError<'data>> +where + P: Parser, O, IonParseError<'data>>, +{ + preceded( + TextBufferView::match_optional_comments_and_whitespace, + parser, + ) +} + /// Augments a given parser such that it returns the matched value and the number of input bytes /// that it matched. fn match_and_length<'data, P, O>( @@ -926,6 +1098,26 @@ where } } +/// Augments a given parser such that it returns the matched value and the range of input bytes +/// that it matched. +fn match_and_span<'data, P, O>( + mut parser: P, +) -> impl Parser, (O, Range), IonParseError<'data>> +where + P: Parser, O, IonParseError<'data>>, +{ + move |input: TextBufferView<'data>| { + let offset_before = input.offset(); + let (remaining, matched) = match parser.parse(input) { + Ok((remaining, matched)) => (remaining, matched), + Err(e) => return Err(e), + }; + let offset_after = remaining.offset(); + let span = offset_before..offset_after; + Ok((remaining, (matched, span))) + } +} + /// Returns the number of bytes that the provided parser matched. fn match_length<'data, P, O>( parser: P, diff --git a/src/lazy/text/encoded_value.rs b/src/lazy/text/encoded_value.rs index eeb08fa9..dfe40a94 100644 --- a/src/lazy/text/encoded_value.rs +++ b/src/lazy/text/encoded_value.rs @@ -1,5 +1,7 @@ -use crate::lazy::text::matched::MatchedValue; -use crate::IonType; +use crate::lazy::text::buffer::TextBufferView; +use crate::lazy::text::matched::{MatchedSymbol, MatchedValue}; +use crate::result::IonFailure; +use crate::{IonResult, IonType}; use std::ops::Range; /// Represents the type, offset, and length metadata of the various components of an encoded value @@ -51,7 +53,7 @@ pub(crate) struct EncodedTextValue { // If there is whitespace before the field name, this will not include it. field_name_length: u32, // The number of bytes used to encode the annotations sequence preceding the data, if any. - // If there is no annotations sequence, this will be zero. // If there is whitespace before the + // If there is no annotations sequence, this will be zero. If there is whitespace before the // annotations sequence, this will not include it. annotations_length: u32, @@ -60,6 +62,7 @@ pub(crate) struct EncodedTextValue { // value is stored. For others (e.g. a timestamp), the various components of the value are // recognized during matching and partial information like subfield offsets can be stored here. matched_value: MatchedValue, + field_name_syntax: Option, } impl EncodedTextValue { @@ -76,6 +79,7 @@ impl EncodedTextValue { annotations_offset: 0, annotations_length: 0, matched_value, + field_name_syntax: None, } } @@ -86,7 +90,13 @@ impl EncodedTextValue { // 'foo' // "foo" // $10 - pub(crate) fn with_field_name(mut self, offset: usize, length: usize) -> EncodedTextValue { + pub(crate) fn with_field_name( + mut self, + field_name_syntax: MatchedSymbol, + offset: usize, + length: usize, + ) -> EncodedTextValue { + self.field_name_syntax = Some(field_name_syntax); self.field_name_offset = (self.data_offset - offset) as u32; self.field_name_length = length as u32; self @@ -118,6 +128,7 @@ impl EncodedTextValue { MatchedValue::String(_) => IonType::String, MatchedValue::Symbol(_) => IonType::Symbol, MatchedValue::List => IonType::List, + MatchedValue::Struct => IonType::Struct, } } @@ -125,6 +136,10 @@ impl EncodedTextValue { matches!(self.matched_value, MatchedValue::Null(_)) } + pub fn data_offset(&self) -> usize { + self.data_offset + } + pub fn data_length(&self) -> usize { self.data_length } @@ -133,6 +148,17 @@ impl EncodedTextValue { self.data_offset..(self.data_offset + self.data_length) } + pub fn field_name<'data>(&self, input: TextBufferView<'data>) -> IonResult<&'data str> { + if self.field_name_offset == 0 { + return IonResult::illegal_operation( + "requested field name, but value was not in a struct field", + ); + } + let relative_start = self.data_offset - input.offset() - (self.field_name_offset as usize); + let field_name_bytes = input.slice(relative_start, self.field_name_length as usize); + field_name_bytes.as_text() + } + pub fn field_name_range(&self) -> Option> { if self.field_name_offset == 0 { return None; @@ -166,6 +192,10 @@ impl EncodedTextValue { self.data_length + u32::max(self.annotations_offset, self.field_name_offset) as usize } + pub fn field_name_syntax(&self) -> Option { + self.field_name_syntax + } + pub fn matched(&self) -> MatchedValue { self.matched_value } @@ -184,7 +214,7 @@ mod tests { #[test] fn total_length_data_with_field_name() { let value = EncodedTextValue::new(MatchedValue::Null(IonType::Null), 100, 12) - .with_field_name(90, 4); + .with_field_name(MatchedSymbol::Identifier, 90, 4); assert_eq!(value.total_length(), 22); } @@ -198,13 +228,13 @@ mod tests { #[test] fn total_length_data_with_field_name_and_annotations() { let value = EncodedTextValue::new(MatchedValue::Null(IonType::Null), 100, 12) - .with_field_name(90, 4) + .with_field_name(MatchedSymbol::Identifier, 90, 4) .with_annotations_sequence(94, 6); assert_eq!(value.total_length(), 22); // Same test but with extra whitespace between the components let value = EncodedTextValue::new(MatchedValue::Null(IonType::Null), 100, 12) - .with_field_name(80, 4) + .with_field_name(MatchedSymbol::Identifier, 80, 4) .with_annotations_sequence(91, 6); assert_eq!(value.total_length(), 32, "{:?}", value); } diff --git a/src/lazy/text/matched.rs b/src/lazy/text/matched.rs index a6aec359..a7477f26 100644 --- a/src/lazy/text/matched.rs +++ b/src/lazy/text/matched.rs @@ -36,7 +36,7 @@ use crate::result::{DecodingError, IonFailure}; use crate::{Int, IonError, IonResult, IonType, RawSymbolTokenRef}; /// A partially parsed Ion value. -#[derive(Copy, Clone, Debug, PartialEq)] +#[derive(Clone, Copy, Debug, PartialEq)] pub(crate) enum MatchedValue { // `Null` and `Bool` are fully parsed because they only involve matching a keyword. Null(IonType), @@ -46,6 +46,7 @@ pub(crate) enum MatchedValue { String(MatchedString), Symbol(MatchedSymbol), List, + Struct, // TODO: ...the other types } @@ -53,6 +54,7 @@ pub(crate) enum MatchedValue { #[derive(Copy, Clone, Debug, PartialEq)] pub(crate) struct MatchedInt { radix: u32, + // Offset of the digits from the beginning of the value digits_offset: usize, is_negative: bool, } @@ -160,60 +162,52 @@ impl MatchedFloat { } } -#[derive(Copy, Clone, Debug, PartialEq)] +#[derive(Clone, Copy, Debug, PartialEq)] pub(crate) enum MatchedString { /// The string only has one segment. (e.g. "foo") - Short(MatchedShortString), + ShortWithoutEscapes, + ShortWithEscapes, /// The string is in multiple segments: /// """hello,""" /// """ world!""" Long, } -#[derive(Clone, Copy, Debug, PartialEq)] -pub(crate) struct MatchedShortString { - contains_escaped_chars: bool, -} - -impl MatchedShortString { - pub fn new(contains_escaped_chars: bool) -> Self { - Self { - contains_escaped_chars, - } - } - pub fn contains_escaped_chars(&self) -> bool { - self.contains_escaped_chars - } -} - impl MatchedString { // Strings longer than 64 bytes will allocate a larger space on the heap. const STACK_ALLOC_BUFFER_CAPACITY: usize = 64; pub fn read<'data>(&self, matched_input: TextBufferView<'data>) -> IonResult> { match self { - MatchedString::Short(short) => self.read_short_string(*short, matched_input), + MatchedString::ShortWithoutEscapes => { + self.read_short_string_without_escapes(matched_input) + } + MatchedString::ShortWithEscapes => self.read_short_string_with_escapes(matched_input), MatchedString::Long => todo!("long-form strings"), } } - fn read_short_string<'data>( + fn read_short_string_without_escapes<'data>( &self, - short: MatchedShortString, matched_input: TextBufferView<'data>, ) -> IonResult> { // Take a slice of the input that ignores the first and last bytes, which are quotes. let body = matched_input.slice(1, matched_input.len() - 2); - if !short.contains_escaped_chars() { - // There are no escaped characters, so we can just validate the string in-place. - let text = body.as_text()?; - let str_ref = StrRef::from(text); - return Ok(str_ref); - } - // Otherwise, there are escaped characters. We need to build a new version of our string + // There are no escaped characters, so we can just validate the string in-place. + let text = body.as_text()?; + let str_ref = StrRef::from(text); + Ok(str_ref) + } + + fn read_short_string_with_escapes<'data>( + &self, + matched_input: TextBufferView<'data>, + ) -> IonResult> { + // Take a slice of the input that ignores the first and last bytes, which are quotes. + let body = matched_input.slice(1, matched_input.len() - 2); + // There are escaped characters. We need to build a new version of our string // that replaces the escaped characters with their corresponding bytes. let mut sanitized = Vec::with_capacity(matched_input.len()); - escape_text(body, &mut sanitized)?; let text = String::from_utf8(sanitized).unwrap(); Ok(StrRef::from(text.to_string())) @@ -375,15 +369,16 @@ fn code_point_is_a_high_surrogate(value: u32) -> bool { (0xD800..=0xDFFF).contains(&value) } -#[derive(Copy, Clone, Debug, PartialEq)] +#[derive(Clone, Copy, Debug, PartialEq)] pub(crate) enum MatchedSymbol { /// A numeric symbol ID (e.g. `$21`) SymbolId, /// The symbol is an unquoted identifier (e.g. `foo`) Identifier, - /// The symbol is delimited by single quotes. Holds a `bool` indicating whether the - /// matched input contained any escaped bytes. - Quoted(bool), + /// The symbol is delimited by single quotes but contains no escape sequences. + QuotedWithoutEscapes, + /// The symbol is delimited by single quotes and has at least one escape sequence. + QuotedWithEscapes, // TODO: Operators in S-Expressions } @@ -395,27 +390,31 @@ impl MatchedSymbol { match self { MatchedSymbol::SymbolId => self.read_symbol_id(matched_input), MatchedSymbol::Identifier => self.read_identifier(matched_input), - MatchedSymbol::Quoted(contains_escaped_chars) => { - self.read_quoted(matched_input, *contains_escaped_chars) - } + MatchedSymbol::QuotedWithEscapes => self.read_quoted_with_escapes(matched_input), + MatchedSymbol::QuotedWithoutEscapes => self.read_quoted_without_escapes(matched_input), } } - fn read_quoted<'data>( + pub(crate) fn read_quoted_without_escapes<'data>( &self, matched_input: TextBufferView<'data>, - contains_escaped_chars: bool, ) -> IonResult> { // Take a slice of the input that ignores the first and last bytes, which are quotes. let body = matched_input.slice(1, matched_input.len() - 2); - if !contains_escaped_chars { - // There are no escaped characters, so we can just validate the string in-place. - let text = body.as_text()?; - let str_ref = RawSymbolTokenRef::Text(text.into()); - return Ok(str_ref); - } + // There are no escaped characters, so we can just validate the string in-place. + let text = body.as_text()?; + let str_ref = RawSymbolTokenRef::Text(text.into()); + Ok(str_ref) + } - // Otherwise, there are escaped characters. We need to build a new version of our symbol + pub(crate) fn read_quoted_with_escapes<'data>( + &self, + matched_input: TextBufferView<'data>, + ) -> IonResult> { + // Take a slice of the input that ignores the first and last bytes, which are quotes. + let body = matched_input.slice(1, matched_input.len() - 2); + + // There are escaped characters. We need to build a new version of our symbol // that replaces the escaped characters with their corresponding bytes. let mut sanitized = Vec::with_capacity(matched_input.len()); @@ -423,7 +422,8 @@ impl MatchedSymbol { let text = String::from_utf8(sanitized).unwrap(); Ok(RawSymbolTokenRef::Text(text.into())) } - fn read_identifier<'data>( + + pub(crate) fn read_identifier<'data>( &self, matched_input: TextBufferView<'data>, ) -> IonResult> { @@ -431,6 +431,7 @@ impl MatchedSymbol { .as_text() .map(|t| RawSymbolTokenRef::Text(Cow::Borrowed(t))) } + fn read_symbol_id<'data>( &self, matched_input: TextBufferView<'data>, diff --git a/src/lazy/text/raw/mod.rs b/src/lazy/text/raw/mod.rs index a9ad6f8d..43f7a659 100644 --- a/src/lazy/text/raw/mod.rs +++ b/src/lazy/text/raw/mod.rs @@ -1,2 +1,3 @@ pub mod reader; pub mod sequence; +pub mod r#struct; diff --git a/src/lazy/text/raw/reader.rs b/src/lazy/text/raw/reader.rs index 6106af62..51d87874 100644 --- a/src/lazy/text/raw/reader.rs +++ b/src/lazy/text/raw/reader.rs @@ -39,18 +39,24 @@ impl<'data> LazyRawTextReader<'data> { { let buffer = self.buffer; if buffer.is_empty() { - return IonResult::incomplete("reading a top-level value", buffer.offset()); + return Ok(RawStreamItem::EndOfStream); + } + + let (buffer_after_whitespace, _whitespace) = + match buffer.match_optional_comments_and_whitespace() { + Ok((buf, ws)) => (buf, ws), + Err(nom::Err::Incomplete(_)) => return Ok(RawStreamItem::EndOfStream), + Err(e) => return IonResult::decoding_error(format!("broken: {:?}", e)), + }; + + if buffer_after_whitespace.is_empty() { + return Ok(RawStreamItem::EndOfStream); } - let (buffer_after_whitespace, _whitespace) = buffer - .match_optional_comments_and_whitespace() - .with_context( - "skipping comments and whitespace between top-level values", - buffer, - )?; let (remaining, matched) = buffer_after_whitespace .match_top_level() .with_context("reading a top-level value", buffer_after_whitespace)?; - // If we successfully moved to the next value, store the remaining buffer view + // Since we successfully matched the next value, we'll update the buffer + // so a future call to `next()` will resume parsing the remaining input. self.buffer = remaining; Ok(matched) } @@ -68,11 +74,12 @@ impl<'data> LazyRawReader<'data, TextEncoding> for LazyRawTextReader<'data> { #[cfg(test)] mod tests { - use super::*; - use crate::lazy::decoder::LazyRawValue; + use crate::lazy::decoder::{LazyRawStruct, LazyRawValue}; use crate::lazy::raw_value_ref::RawValueRef; use crate::{IonType, RawSymbolTokenRef}; + use super::*; + #[test] fn test_top_level() -> IonResult<()> { let mut data = String::new(); @@ -149,6 +156,15 @@ mod tests { // Third item 3 ] + + { + // Identifier + foo: 100, + // Quoted symbol + 'bar': 200, + // Short-form string + "baz": 300 + } "#, ); @@ -263,6 +279,8 @@ mod tests { RawValueRef::Symbol(RawSymbolTokenRef::SymbolId(733)), ); + // [1, 2, 3] + let list = reader.next()?.expect_value()?.read()?.expect_list()?; let mut sum = 0; for value in &list { @@ -270,6 +288,16 @@ mod tests { } assert_eq!(sum, 6); + // {foo: 100, bar: 200, baz: 300} + let item = reader.next()?; + let value = item.expect_value()?.read()?; + let strukt = value.expect_struct()?; + let mut sum = 0; + sum += strukt.get_expected("foo")?.expect_i64()?; + sum += strukt.get_expected("bar")?.expect_i64()?; + sum += strukt.get_expected("baz")?.expect_i64()?; + assert_eq!(sum, 600); + Ok(()) } } diff --git a/src/lazy/text/raw/sequence.rs b/src/lazy/text/raw/sequence.rs index ab1f4616..2e4e7e1c 100644 --- a/src/lazy/text/raw/sequence.rs +++ b/src/lazy/text/raw/sequence.rs @@ -1,3 +1,9 @@ +use std::fmt; +use std::fmt::{Debug, Formatter}; +use std::ops::Range; + +use nom::character::streaming::satisfy; + use crate::lazy::decoder::private::LazyContainerPrivate; use crate::lazy::decoder::{LazyDecoder, LazyRawSequence, LazyRawValue}; use crate::lazy::encoding::TextEncoding; @@ -6,8 +12,6 @@ use crate::lazy::text::parse_result::AddContext; use crate::lazy::text::parse_result::ToIteratorOutput; use crate::lazy::text::value::LazyRawTextValue; use crate::{IonResult, IonType}; -use std::fmt; -use std::fmt::{Debug, Formatter}; #[derive(Copy, Clone)] pub struct LazyRawTextSequence<'data> { @@ -21,7 +25,7 @@ impl<'data> LazyRawTextSequence<'data> { pub fn iter(&self) -> RawTextSequenceIterator<'data> { // Make an iterator over the input bytes that follow the initial `[` - RawTextSequenceIterator::new(self.value.input.slice_to_end(1)) + RawTextSequenceIterator::new(b']', self.value.input.slice_to_end(1)) } } @@ -98,13 +102,50 @@ impl<'a> Debug for LazyRawTextSequence<'a> { } } +#[derive(Copy, Clone, Debug)] pub struct RawTextSequenceIterator<'data> { + end_delimiter: u8, input: TextBufferView<'data>, + // If this iterator has returned an error, it should return `None` forever afterwards + has_returned_error: bool, +} + +impl<'data> RawTextSequenceIterator<'data> { + pub(crate) fn new( + end_delimiter: u8, + input: TextBufferView<'data>, + ) -> RawTextSequenceIterator<'data> { + RawTextSequenceIterator { + end_delimiter, + input, + has_returned_error: false, + } + } } impl<'data> RawTextSequenceIterator<'data> { - pub(crate) fn new(input: TextBufferView<'data>) -> RawTextSequenceIterator<'data> { - RawTextSequenceIterator { input } + pub(crate) fn find_span(&self) -> IonResult> { + // The input has already skipped past the opening delimiter. + let start = self.input.offset() - 1; + // We need to find the input slice containing the closing delimiter. It's either... + let input_after_last = if let Some(value_result) = self.last() { + let value = value_result?; + // ...the input slice that follows the last sequence value... + value.input.slice_to_end(value.encoded_value.total_length()) + } else { + // ...or there aren't values, so it's just the input after the opening delimiter. + self.input + }; + let (input_after_ws, _ws) = input_after_last + .match_optional_comments_and_whitespace() + .with_context("seeking the end of a sequence", input_after_last)?; + let (input_after_end, _end_delimiter) = + satisfy(|c| c == self.end_delimiter as char)(input_after_ws).with_context( + "seeking the closing delimiter of a sequence", + input_after_ws, + )?; + let end = input_after_end.offset(); + Ok(start..end) } } @@ -112,15 +153,61 @@ impl<'data> Iterator for RawTextSequenceIterator<'data> { type Item = IonResult>; fn next(&mut self) -> Option { + if self.has_returned_error { + return None; + } match self.input.match_list_value() { Ok((remaining, Some(value))) => { self.input = remaining; Some(Ok(value)) } Ok((_remaining, None)) => None, - Err(e) => e - .with_context("reading the next list value", self.input) - .transpose(), + Err(e) => { + self.has_returned_error = true; + e.with_context("reading the next list value", self.input) + .transpose() + } + } + } +} + +#[cfg(test)] +mod tests { + use std::ops::Range; + + use crate::lazy::text::raw::reader::LazyRawTextReader; + use crate::IonResult; + + fn expect_sequence_range(ion_data: &str, expected: Range) -> IonResult<()> { + let reader = &mut LazyRawTextReader::new(ion_data.as_bytes()); + let value = reader.next()?.expect_value()?; + let actual_range = value.encoded_value.data_range(); + assert_eq!( + actual_range, expected, + "Sequence range ({:?}) did not match expected range ({:?})", + actual_range, expected + ); + Ok(()) + } + + #[test] + fn list_range() -> IonResult<()> { + // For each pair below, we'll confirm that the top-level list is found to + // occupy the specified input span. + let tests = &[ + // (Ion input, expected range of the sequence) + ("[]", 0..2), + (" [] ", 2..4), + ("[1, 2]", 0..6), + ("[1, /* comment ]]] */ 2]", 0..24), + // Nested + ("[1, 2, [3, 4, 5], 6]", 0..20), + // Doubly nested + ("[1, 2, [3, [a, b, c], 5], 6]", 0..28), + ]; + for test in tests { + expect_sequence_range(test.0, test.1.clone())?; } + Ok(()) } } diff --git a/src/lazy/text/raw/struct.rs b/src/lazy/text/raw/struct.rs new file mode 100644 index 00000000..f9f3742d --- /dev/null +++ b/src/lazy/text/raw/struct.rs @@ -0,0 +1,232 @@ +use crate::lazy::decoder::private::{LazyContainerPrivate, LazyRawFieldPrivate}; +use crate::lazy::decoder::{LazyRawField, LazyRawStruct, LazyRawValue}; +use crate::lazy::encoding::{TextEncoding, ToDoTextAnnotationsIterator}; +use crate::lazy::raw_value_ref::RawValueRef; +use crate::lazy::text::buffer::TextBufferView; +use crate::lazy::text::parse_result::{AddContext, ToIteratorOutput}; +use crate::lazy::text::value::LazyRawTextValue; +use crate::raw_symbol_token_ref::AsRawSymbolTokenRef; +use crate::{IonResult, RawSymbolTokenRef}; +use nom::character::streaming::satisfy; +use std::ops::Range; + +#[derive(Clone, Copy, Debug)] +pub struct RawTextStructIterator<'data> { + input: TextBufferView<'data>, + has_returned_error: bool, +} + +impl<'data> RawTextStructIterator<'data> { + pub(crate) fn new(input: TextBufferView<'data>) -> Self { + RawTextStructIterator { + input, + has_returned_error: false, + } + } + + pub(crate) fn find_span(&self) -> IonResult> { + // The input has already skipped past the opening delimiter. + let start = self.input.offset() - 1; + // We need to find the input slice containing the closing delimiter. It's either... + let input_after_last = if let Some(field_result) = self.last() { + let field = field_result?; + // ...the input slice that follows the last field... + field + .value + .input + .slice_to_end(field.value.encoded_value.total_length()) + } else { + // ...or there aren't fields, so it's just the input after the opening delimiter. + self.input + }; + let (input_after_ws, _ws) = input_after_last + .match_optional_comments_and_whitespace() + .with_context("seeking the end of a struct", input_after_last)?; + let (input_after_end, _end_delimiter) = satisfy(|c| c == b'}' as char)(input_after_ws) + .with_context("seeking the closing delimiter of a struct", input_after_ws)?; + let end = input_after_end.offset(); + Ok(start..end) + } +} + +impl<'data> Iterator for RawTextStructIterator<'data> { + type Item = IonResult>; + + fn next(&mut self) -> Option { + if self.has_returned_error { + return None; + } + match self.input.match_struct_field() { + Ok((remaining_input, Some(field))) => { + self.input = remaining_input; + Some(Ok(field)) + } + Ok((_, None)) => None, + Err(e) => { + self.has_returned_error = true; + e.with_context("reading the next struct field", self.input) + .transpose() + } + } + } +} + +#[derive(Clone, Copy, Debug)] +pub struct LazyRawTextField<'data> { + pub(crate) value: LazyRawTextValue<'data>, +} + +impl<'data> LazyRawTextField<'data> { + pub(crate) fn new(value: LazyRawTextValue<'data>) -> Self { + LazyRawTextField { value } + } + + pub fn name(&self) -> RawSymbolTokenRef<'data> { + // We're in a struct field, the field name _must_ be populated. + // If it's not (or the field name is not a valid SID or UTF-8 string), + // that's a bug. We can safely unwrap/expect here. + let matched_symbol = self + .value + .encoded_value + .field_name_syntax() + .expect("field name syntax not available"); + let name_length = self + .value + .encoded_value + .field_name_range() + .expect("field name length not available") + .len(); + matched_symbol + .read(self.value.input.slice(0, name_length)) + .expect("invalid struct field name") + } + + pub fn value(&self) -> &LazyRawTextValue<'data> { + &self.value + } + + pub(crate) fn into_value(self) -> LazyRawTextValue<'data> { + self.value + } +} + +impl<'data> LazyRawFieldPrivate<'data, TextEncoding> for LazyRawTextField<'data> { + fn into_value(self) -> LazyRawTextValue<'data> { + self.value + } +} + +impl<'data> LazyRawField<'data, TextEncoding> for LazyRawTextField<'data> { + fn name(&self) -> RawSymbolTokenRef<'data> { + LazyRawTextField::name(self) + } + + fn value(&self) -> &LazyRawTextValue<'data> { + LazyRawTextField::value(self) + } +} + +#[derive(Clone, Copy, Debug)] +pub struct LazyRawTextStruct<'data> { + pub(crate) value: LazyRawTextValue<'data>, +} + +impl<'data> LazyRawTextStruct<'data> { + fn find(&self, name: &str) -> IonResult>> { + let name: RawSymbolTokenRef = name.as_raw_symbol_token_ref(); + for field_result in *self { + let field = field_result?; + let field_name = field.name(); + if field_name == name { + let value = field.value; + return Ok(Some(value)); + } + } + Ok(None) + } + + fn get(&self, name: &str) -> IonResult>> { + self.find(name)?.map(|f| f.read()).transpose() + } +} + +impl<'data> LazyContainerPrivate<'data, TextEncoding> for LazyRawTextStruct<'data> { + fn from_value(value: LazyRawTextValue<'data>) -> Self { + LazyRawTextStruct { value } + } +} + +impl<'data> LazyRawStruct<'data, TextEncoding> for LazyRawTextStruct<'data> { + type Field = LazyRawTextField<'data>; + type Iterator = RawTextStructIterator<'data>; + + fn annotations(&self) -> ToDoTextAnnotationsIterator<'data> { + todo!() + } + + fn find(&self, name: &str) -> IonResult>> { + self.find(name) + } + + fn get(&self, name: &str) -> IonResult>> { + self.get(name) + } + + fn iter(&self) -> Self::Iterator { + // Slice the input to skip the opening `{` + RawTextStructIterator::new(self.value.input.slice_to_end(1)) + } +} + +impl<'data> IntoIterator for LazyRawTextStruct<'data> { + type Item = IonResult>; + type IntoIter = RawTextStructIterator<'data>; + + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} + +#[cfg(test)] +mod tests { + use std::ops::Range; + + use crate::lazy::text::raw::reader::LazyRawTextReader; + use crate::IonResult; + + fn expect_struct_range(ion_data: &str, expected: Range) -> IonResult<()> { + let reader = &mut LazyRawTextReader::new(ion_data.as_bytes()); + let value = reader.next()?.expect_value()?; + let actual_range = value.encoded_value.data_range(); + assert_eq!( + actual_range, expected, + "Struct range ({:?}) did not match expected range ({:?})", + actual_range, expected + ); + println!("input ok: {}", ion_data); + Ok(()) + } + + #[test] + fn struct_range() -> IonResult<()> { + // For each pair below, we'll confirm that the top-level list is found to + // occupy the specified input span. + let tests = &[ + // (Ion input, expected range of the struct) + ("{}", 0..2), + (" {} ", 2..4), + ("{a:1}", 0..5), + ("{a: 1}", 0..6), + ("{a: 1, b: 2}", 0..12), + ("{a: 1, /* comment }}} */ b: 2}", 0..30), + // Nested + ("{a: 1, b: 2, c: {d: 3, e: 4, f: 5}, g: 6}", 0..41), + // Doubly nested + ("{a: 1, b: 2, c: {d: 3, e: {foo: bar}, f: 5}, g: 6}", 0..50), + ]; + for test in tests { + expect_struct_range(test.0, test.1.clone())?; + } + Ok(()) + } +} diff --git a/src/lazy/text/value.rs b/src/lazy/text/value.rs index f06fb650..ec3b8290 100644 --- a/src/lazy/text/value.rs +++ b/src/lazy/text/value.rs @@ -1,3 +1,6 @@ +use std::fmt; +use std::fmt::{Debug, Formatter}; + use crate::lazy::decoder::private::LazyRawValuePrivate; use crate::lazy::decoder::{LazyDecoder, LazyRawValue}; use crate::lazy::encoding::TextEncoding; @@ -5,10 +8,9 @@ use crate::lazy::raw_value_ref::RawValueRef; use crate::lazy::text::buffer::TextBufferView; use crate::lazy::text::encoded_value::EncodedTextValue; use crate::lazy::text::matched::MatchedValue; +use crate::lazy::text::raw::r#struct::LazyRawTextStruct; use crate::lazy::text::raw::sequence::LazyRawTextSequence; use crate::{IonResult, IonType, RawSymbolTokenRef}; -use std::fmt; -use std::fmt::{Debug, Formatter}; /// A value that has been identified in the text input stream but whose data has not yet been read. /// @@ -46,7 +48,10 @@ impl<'data> LazyRawValue<'data, TextEncoding> for LazyRawTextValue<'data> { } fn read(&self) -> IonResult> { - let matched_input = self.input.slice(0, self.encoded_value.data_length()); + let matched_input = self.input.slice( + self.encoded_value.data_offset() - self.input.offset(), + self.encoded_value.data_length(), + ); let value_ref = match self.encoded_value.matched() { MatchedValue::Null(ion_type) => RawValueRef::Null(ion_type), MatchedValue::Bool(b) => RawValueRef::Bool(b), @@ -58,6 +63,10 @@ impl<'data> LazyRawValue<'data, TextEncoding> for LazyRawTextValue<'data> { MatchedValue::List => { let lazy_sequence = LazyRawTextSequence { value: *self }; RawValueRef::List(lazy_sequence) + } + MatchedValue::Struct => { + let lazy_struct = LazyRawTextStruct { value: *self }; + RawValueRef::Struct(lazy_struct) } // ...and the rest! }; Ok(value_ref)