Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds lazy reader support for reading clobs #638

Merged
merged 37 commits into from
Sep 7, 2023
Merged
Show file tree
Hide file tree
Changes from 34 commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
e0a83d8
Top-level nulls, bools, ints
Jul 16, 2023
89f79aa
Consolidate impls of AsUtf8 w/helper fn
Jul 25, 2023
840be4d
Improved TextBufferView docs, removed DataSource
Jul 25, 2023
5db1ff0
Adds lazy text floats
Jul 27, 2023
07d4a70
Adds LazyRawTextReader support for comments
Jul 27, 2023
181e0a5
Adds LazyRawTextReader support for reading strings
Jul 28, 2023
357ca8f
clippy fixes
Jul 28, 2023
716ff34
Fix a couple of unit tests
Jul 29, 2023
e29fec5
Less ambitious float eq comparison
Jul 29, 2023
8f79a36
Adds LazyRawTextReader support for reading symbols
Aug 1, 2023
4cb9b2b
Adds more doc comments
Aug 1, 2023
54470d2
More doc comments
Aug 1, 2023
78014e7
Adds `LazyRawTextReader` support for reading lists
Aug 3, 2023
a6a3aa8
Adds `LazyRawTextReader` support for structs
Aug 10, 2023
4fc9078
More doc comments
Aug 10, 2023
11174ac
Adds `LazyRawTextReader` support for reading IVMs
Aug 10, 2023
719dbaa
Initial impl of a LazyRawAnyReader
Aug 11, 2023
f603872
Improved comments.
Aug 11, 2023
4696ca5
Adds LazyRawTextReader support for annotations
Aug 11, 2023
c7129ac
Adds lazy reader support for timestamps
Aug 14, 2023
44435ea
Lazy reader support for s-expressions
Aug 18, 2023
d50e05b
Fixed doc comments
Aug 18, 2023
8283422
Fix internal doc link
Aug 18, 2023
0f01099
Adds lazy reader support for decimals
Aug 19, 2023
b60f1fe
Fixed bad unit test example case
Aug 20, 2023
915c83a
clippy fixes
Aug 20, 2023
fe922ff
Adds lazy reader support for blobs
Aug 20, 2023
066ddd8
Adds lazy reader support for long strings
Aug 21, 2023
c58e5f0
Merged long string matcher tests into overall string tests
Aug 21, 2023
6b5ce1c
wip
Aug 21, 2023
e45ec35
Merge main, complete support for clobs
Sep 1, 2023
a3f8a21
clippy suggestion
Sep 1, 2023
62be7c9
Adds lazy reader support for clobs
Sep 3, 2023
0eacd3a
clippy suggestion
Sep 3, 2023
175009d
Fix newline normalization, add unit tests
Sep 3, 2023
3421393
comment cleanup
Sep 3, 2023
45cbf40
Merge branch 'main' into lazy-clobs
zslayton Sep 7, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/lazy/binary/raw/value.rs
Original file line number Diff line number Diff line change
Expand Up @@ -413,7 +413,7 @@ impl<'data> LazyRawBinaryValue<'data> {
fn read_clob(&self) -> ValueParseResult<'data, BinaryEncoding> {
debug_assert!(self.encoded_value.ion_type() == IonType::Clob);
let bytes = self.value_body()?;
Ok(RawValueRef::Clob(bytes))
Ok(RawValueRef::Clob(bytes.into()))
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🗺️ Reading a clob now returns a BytesRef<'_> instead of a &[u8] to accommodate the escape decoding process that happens in text clobs. This change mirrors the one made for blobs in #629.

}

/// Helper method called by [`Self::read`]. Reads the current value as an S-expression.
Expand Down
4 changes: 2 additions & 2 deletions src/lazy/raw_value_ref.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ pub enum RawValueRef<'data, D: LazyDecoder<'data>> {
String(StrRef<'data>),
Symbol(RawSymbolTokenRef<'data>),
Blob(BytesRef<'data>),
Clob(&'data [u8]),
Clob(BytesRef<'data>),
SExp(D::SExp),
List(D::List),
Struct(D::Struct),
Expand Down Expand Up @@ -149,7 +149,7 @@ impl<'data, D: LazyDecoder<'data>> RawValueRef<'data, D> {
}
}

pub fn expect_clob(self) -> IonResult<&'data [u8]> {
pub fn expect_clob(self) -> IonResult<BytesRef<'data>> {
if let RawValueRef::Clob(c) = self {
Ok(c)
} else {
Expand Down
10 changes: 10 additions & 0 deletions src/lazy/str_ref.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use crate::lazy::bytes_ref::BytesRef;
use crate::text::text_formatter::IonValueFormatter;
use crate::Str;
use std::borrow::Cow;
Expand Down Expand Up @@ -80,3 +81,12 @@ impl<'data> From<StrRef<'data>> for Str {
Str::from(text)
}
}

impl<'data> From<StrRef<'data>> for BytesRef<'data> {
    /// Reinterprets the text held by a `StrRef` as a `BytesRef` over the same UTF-8 bytes.
    ///
    /// Owned text hands over its backing byte vector; borrowed text is passed along as a
    /// byte slice with the same `'data` lifetime.
    fn from(value: StrRef<'data>) -> Self {
        match value.text {
            Cow::Owned(owned) => owned.into_bytes().into(),
            Cow::Borrowed(borrowed) => borrowed.as_bytes().into(),
        }
    }
}
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🗺️ This impl converts a `String` into its underlying `Vec<u8>`, or a `&str` into its underlying `&[u8]`.

120 changes: 114 additions & 6 deletions src/lazy/text/buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ use crate::lazy::encoding::TextEncoding;
use crate::lazy::raw_stream_item::RawStreamItem;
use crate::lazy::text::encoded_value::EncodedTextValue;
use crate::lazy::text::matched::{
MatchedBlob, MatchedDecimal, MatchedFloat, MatchedHoursAndMinutes, MatchedInt, MatchedString,
MatchedSymbol, MatchedTimestamp, MatchedTimestampOffset, MatchedValue,
MatchedBlob, MatchedClob, MatchedDecimal, MatchedFloat, MatchedHoursAndMinutes, MatchedInt,
MatchedString, MatchedSymbol, MatchedTimestamp, MatchedTimestampOffset, MatchedValue,
};
use crate::lazy::text::parse_result::{InvalidInputError, IonParseError};
use crate::lazy::text::parse_result::{IonMatchResult, IonParseResult};
Expand Down Expand Up @@ -508,6 +508,12 @@ impl<'data> TextBufferView<'data> {
EncodedTextValue::new(MatchedValue::Blob(matched_blob), self.offset(), length)
},
),
map(
match_and_length(Self::match_clob),
|(matched_clob, length)| {
EncodedTextValue::new(MatchedValue::Clob(matched_clob), self.offset(), length)
},
),
map(
match_and_length(Self::match_list),
|(matched_list, length)| {
Expand Down Expand Up @@ -983,12 +989,12 @@ impl<'data> TextBufferView<'data> {
}

/// Matches short- or long-form string.
fn match_string(self) -> IonParseResult<'data, MatchedString> {
pub fn match_string(self) -> IonParseResult<'data, MatchedString> {
alt((Self::match_short_string, Self::match_long_string))(self)
}

/// Matches a short string. For example: `"foo"`
fn match_short_string(self) -> IonParseResult<'data, MatchedString> {
pub(crate) fn match_short_string(self) -> IonParseResult<'data, MatchedString> {
delimited(char('"'), Self::match_short_string_body, char('"'))
.map(|(_matched, contains_escaped_chars)| {
if contains_escaped_chars {
Expand All @@ -1002,13 +1008,13 @@ impl<'data> TextBufferView<'data> {

/// Returns a matched buffer and a boolean indicating whether any escaped characters were
/// found in the short string.
fn match_short_string_body(self) -> IonParseResult<'data, (Self, bool)> {
pub(crate) fn match_short_string_body(self) -> IonParseResult<'data, (Self, bool)> {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🗺️ The clob reading logic re-uses the short- and long-form string matchers to isolate the content within the larger match.

Self::match_text_until_unescaped(self, b'\"')
}

/// Matches a long string comprised of any number of `'''`-enclosed segments interleaved
/// with optional comments and whitespace.
pub fn match_long_string(self) -> IonParseResult<'data, MatchedString> {
pub(crate) fn match_long_string(self) -> IonParseResult<'data, MatchedString> {
fold_many1(
// Parser to keep applying repeatedly
whitespace_and_then(Self::match_long_string_segment),
Expand Down Expand Up @@ -1435,6 +1441,66 @@ impl<'data> TextBufferView<'data> {
.parse(self)
}

/// Matches a clob written in either short-form (`{{"..."}}`) or long-form
/// (`{{'''...'''}}`) syntax.
///
/// Optional whitespace is accepted after the opening `{{` and before the closing `}}`.
/// On success, the returned [`MatchedClob`] variant records which form was found.
pub fn match_clob(self) -> IonParseResult<'data, MatchedClob> {
    // The short form is attempted first; the long form is the fallback.
    let body = alt((
        value(MatchedClob::Short, Self::match_short_clob_body),
        value(MatchedClob::Long, Self::match_long_clob_body),
    ));
    let opening = tag("{{");
    let closing = preceded(Self::match_optional_whitespace, tag("}}"));
    delimited(
        opening,
        preceded(Self::match_optional_whitespace, body),
        closing,
    )(self)
}

/// Matches the body (inside the `{{` and `}}`) of a short-form clob.
fn match_short_clob_body(self) -> IonMatchResult<'data> {
let (remaining, (body, _matched_string)) = consumed(Self::match_short_string)(self)?;
body.validate_clob_text()?;
Ok((remaining, body))
}

/// Matches the content found between the `{{` and `}}` of a long-form clob: one or more
/// long-form segments, each of which may be preceded by optional whitespace.
///
/// Returns the full span of input covered by all of the segments.
fn match_long_clob_body(self) -> IonMatchResult<'data> {
    let segment = preceded(
        Self::match_optional_whitespace,
        Self::match_long_clob_body_segment,
    );
    // At least one segment is required; only the overall matched span is reported.
    recognize(many1_count(segment))(self)
}

/// Matches a single segment of a long-form clob's content.
fn match_long_clob_body_segment(self) -> IonMatchResult<'data> {
let (remaining, (body, _matched_string)) = consumed(Self::match_long_string_segment)(self)?;
body.validate_clob_text()?;
Ok((remaining, body))
}

/// Returns an error if the buffer contains any byte that is not legal inside a clob.
///
/// Legality of each byte is decided by [`Self::byte_is_legal_clob_ascii`]. The first
/// offending byte is reported (in hexadecimal) in the error's description, and the error
/// is raised as a `nom::Err::Failure`.
///
/// On success no input is consumed: the original buffer is returned as the remaining
/// input, paired with an empty slice as the "match".
fn validate_clob_text(self) -> IonMatchResult<'data> {
    let first_illegal_byte = self
        .bytes()
        .iter()
        .copied()
        .find(|&byte| !Self::byte_is_legal_clob_ascii(byte));
    if let Some(byte) = first_illegal_byte {
        // Show the byte as zero-padded, `0x`-prefixed hex so it cannot be mistaken for decimal.
        let message = format!("found an illegal byte '0x{:02x}' in clob", byte);
        let error = InvalidInputError::new(self).with_description(message);
        return Err(nom::Err::Failure(IonParseError::Invalid(error)));
    }
    // Return success without consuming
    Ok((self, self.slice(0, 0)))
}

/// Returns `true` if the specified byte is allowed to appear unescaped in a clob, and
/// `false` otherwise.
fn byte_is_legal_clob_ascii(b: u8) -> bool {
    // Depending on where you look in the spec and/or `ion-tests`, you'll find conflicting
    // information about which ASCII characters can appear unescaped in a clob. Some say
    // "characters >= 0x20", but that excludes lots of whitespace characters that are < 0x20.
    // Some say "displayable ASCII", but DEL (0x7F) is shown to be legal in one of the ion-tests.
    // The definition used here has largely been inferred from the contents of `ion-tests`.
    if !b.is_ascii() {
        return false;
    }
    if b >= 0x20 {
        return true;
    }
    // Below 0x20, only the recognized whitespace characters are permitted.
    WHITESPACE_CHARACTERS_AS_STR.as_bytes().contains(&b)
}
/// Matches the base64 content within a blob. Ion allows the base64 content to be broken up with
/// whitespace, so the matched input region may need to be stripped of whitespace before
/// the data can be decoded.
Expand Down Expand Up @@ -2189,6 +2255,48 @@ mod tests {
}

#[test]
fn test_match_clob() {
    // Asserts that `input` is matched as a clob.
    fn match_clob(input: &str) {
        MatchTest::new(input).expect_match(match_length(TextBufferView::match_clob));
    }
    // Asserts that `input` is rejected by the clob matcher.
    fn mismatch_clob(input: &str) {
        MatchTest::new(input).expect_mismatch(match_length(TextBufferView::match_clob));
    }
    // Well-formed short- and long-form clobs, with and without interior whitespace
    let good_inputs = &[
        r#"{{""}}"#,
        r#"{{''''''}}"#,
        r#"{{"foo"}}"#,
        r#"{{  "foo"}}"#,
        r#"{{  "foo"  }}"#,
        r#"{{"foo"  }}"#,
        r#"{{'''foo'''}}"#,
        r#"{{"foobar"}}"#,
        r#"{{'''foo''' '''bar'''}}"#,
        r#"{{
            '''foo'''
            '''bar'''
            '''baz'''
        }}"#,
    ];
    for input in good_inputs {
        match_clob(input);
    }

    let bad_inputs = &[
        r#"{{foo}}"#,                         // No quotes
        r#"{{"foo}}"#,                        // Missing closing quote
        r#"{{"foo"}"#,                        // Missing closing brace
        r#"{{'''foo'''}"#,                    // Missing closing brace
        r#"{{'''foo''' /*hi!*/ '''bar'''}}"#, // Interleaved comments
        r#"{{'''foo''' "bar"}}"#,             // Mixed quote style
        r#"{{"😎🙂🙃"}}"#,                     // Contains unescaped non-ascii characters
    ];
    for input in bad_inputs {
        mismatch_clob(input);
    }
}

// TODO(review): no `#[test]` attribute is visible above the following function; confirm it is
// still registered with the test harness.
fn test_match_text_until_unescaped_str() {
let input = TextBufferView::new(r" foo bar \''' baz''' quux ".as_bytes());
let (_remaining, (matched, contains_escapes)) =
Expand Down
1 change: 1 addition & 0 deletions src/lazy/text/encoded_value.rs
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ impl EncodedTextValue {
MatchedValue::String(_) => IonType::String,
MatchedValue::Symbol(_) => IonType::Symbol,
MatchedValue::Blob(_) => IonType::Blob,
MatchedValue::Clob(_) => IonType::Clob,
MatchedValue::List => IonType::List,
MatchedValue::SExp => IonType::SExp,
MatchedValue::Struct => IonType::Struct,
Expand Down