Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Additional tests for parquet reader utf8 validation #6023

Merged
merged 1 commit into from
Jul 11, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
202 changes: 146 additions & 56 deletions parquet/src/arrow/arrow_reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2425,89 +2425,179 @@ mod tests {

fn test_invalid_utf8_string_array_inner<O: OffsetSizeTrait>() {
let cases = [
(
invalid_utf8_first_char::<O>(),
"Parquet argument error: Parquet error: encountered non UTF-8 data",
),
(
invalid_utf8_later_char::<O>(),
"Parquet argument error: Parquet error: encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 6",
),
invalid_utf8_first_char::<O>(),
invalid_utf8_first_char_long_strings::<O>(),
invalid_utf8_later_char::<O>(),
invalid_utf8_later_char_long_strings::<O>(),
invalid_utf8_later_char_really_long_strings::<O>(),
invalid_utf8_later_char_really_long_strings2::<O>(),
];
for (array, expected_error) in cases {
// data is not valid utf8 we can not construct a correct StringArray
// safely, so purposely create an invalid StringArray
let array = unsafe {
GenericStringArray::<O>::new_unchecked(
array.offsets().clone(),
array.values().clone(),
array.nulls().cloned(),
)
};
let data_type = array.data_type().clone();
let data = write_to_parquet(Arc::new(array));
let err = read_from_parquet(data).unwrap_err();
assert_eq!(err.to_string(), expected_error, "data type: {data_type:?}")
for array in &cases {
for encoding in STRING_ENCODINGS {
alamb marked this conversation as resolved.
Show resolved Hide resolved
// data is not valid utf8 we can not construct a correct StringArray
// safely, so purposely create an invalid StringArray
let array = unsafe {
GenericStringArray::<O>::new_unchecked(
array.offsets().clone(),
array.values().clone(),
array.nulls().cloned(),
)
};
let data_type = array.data_type().clone();
let data = write_to_parquet_with_encoding(Arc::new(array), *encoding);
let err = read_from_parquet(data).unwrap_err();
let expected_err =
"Parquet argument error: Parquet error: encountered non UTF-8 data";
assert!(
err.to_string().contains(expected_err),
"data type: {data_type:?}, expected: {expected_err}, got: {err}"
);
}
}
}

#[test]
fn test_invalid_utf8_string_view_array() {
let cases = [
(
invalid_utf8_first_char::<i32>(),
"Parquet argument error: Parquet error: encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 11",
),
(
invalid_utf8_later_char::<i32>(),
"Parquet argument error: Parquet error: encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 14",
),
invalid_utf8_first_char::<i32>(),
invalid_utf8_first_char_long_strings::<i32>(),
invalid_utf8_later_char::<i32>(),
invalid_utf8_later_char_long_strings::<i32>(),
invalid_utf8_later_char_really_long_strings::<i32>(),
invalid_utf8_later_char_really_long_strings2::<i32>(),
];
for (array, expected_error) in cases {
let array = arrow_cast::cast(&array, &ArrowDataType::BinaryView).unwrap();
let array = array.as_binary_view();

// data is not valid utf8 we can not construct a correct StringArray
// safely, so purposely create an invalid StringArray
let array = unsafe {
StringViewArray::new_unchecked(
array.views().clone(),
array.data_buffers().to_vec(),
array.nulls().cloned(),
)
};
let data_type = array.data_type().clone();
let data = write_to_parquet(Arc::new(array));
let err = read_from_parquet(data).unwrap_err();
assert_eq!(err.to_string(), expected_error, "data type: {data_type:?}")

for encoding in STRING_ENCODINGS {
for array in &cases {
let array = arrow_cast::cast(&array, &ArrowDataType::BinaryView).unwrap();
let array = array.as_binary_view();

// data is not valid utf8 we can not construct a correct StringArray
// safely, so purposely create an invalid StringViewArray
let array = unsafe {
StringViewArray::new_unchecked(
array.views().clone(),
array.data_buffers().to_vec(),
array.nulls().cloned(),
)
};

let data_type = array.data_type().clone();
let data = write_to_parquet_with_encoding(Arc::new(array), *encoding);
let err = read_from_parquet(data).unwrap_err();
let expected_err =
"Parquet argument error: Parquet error: encountered non UTF-8 data";
assert!(
err.to_string().contains(expected_err),
"data type: {data_type:?}, expected: {expected_err}, got: {err}"
);
}
}
}

/// Encodings suitable for string data
const STRING_ENCODINGS: &[Option<Encoding>] = &[
None,
Some(Encoding::PLAIN),
Some(Encoding::DELTA_LENGTH_BYTE_ARRAY),
Some(Encoding::DELTA_BYTE_ARRAY),
];

/// Invalid Utf-8 sequence in the first character
/// <https://stackoverflow.com/questions/1301402/example-invalid-utf8-string>
const INVALID_UTF8_FIRST_CHAR: &[u8] = &[0xa0, 0xa1, 0x20, 0x20];

/// Invalid Utf=8 sequence in NOT the first character
/// <https://stackoverflow.com/questions/1301402/example-invalid-utf8-string>
const INVALID_UTF8_LATER_CHAR: &[u8] = &[0x20, 0x20, 0x20, 0xa0, 0xa1, 0x20, 0x20];

/// returns a BinaryArray with invalid UTF8 data in the first character
fn invalid_utf8_first_char<O: OffsetSizeTrait>() -> GenericBinaryArray<O> {
// invalid sequence in the first character
// https://stackoverflow.com/questions/1301402/example-invalid-utf8-string
let valid: &[u8] = b" ";
let invalid: &[u8] = &[0xa0, 0xa1, 0x20, 0x20];
let invalid = INVALID_UTF8_FIRST_CHAR;
GenericBinaryArray::<O>::from_iter(vec![None, Some(valid), None, Some(invalid)])
}

/// Returns a BinaryArray with invalid UTF8 data in the first character of a
/// string larger than 12 bytes which is handled specially when reading
/// `ByteViewArray`s
fn invalid_utf8_first_char_long_strings<O: OffsetSizeTrait>() -> GenericBinaryArray<O> {
let valid: &[u8] = b" ";
let mut invalid = vec![];
invalid.extend_from_slice(b"ThisStringIsCertainlyLongerThan12Bytes");
invalid.extend_from_slice(INVALID_UTF8_FIRST_CHAR);
GenericBinaryArray::<O>::from_iter(vec![None, Some(valid), None, Some(&invalid)])
}

/// returns a BinaryArray with invalid UTF8 data in a character other than
/// the first (this is checked in a special codepath)
fn invalid_utf8_later_char<O: OffsetSizeTrait>() -> GenericBinaryArray<O> {
// invalid sequence in NOT the first character
// https://stackoverflow.com/questions/1301402/example-invalid-utf8-string
let valid: &[u8] = b" ";
let invalid: &[u8] = &[0x20, 0x20, 0x20, 0xa0, 0xa1, 0x20, 0x20];
let invalid: &[u8] = INVALID_UTF8_LATER_CHAR;
GenericBinaryArray::<O>::from_iter(vec![None, Some(valid), None, Some(invalid)])
}

// writes the array into a single column parquet file
fn write_to_parquet(array: ArrayRef) -> Vec<u8> {
/// returns a BinaryArray with invalid UTF8 data in a character other than
/// the first in a string larger than 12 bytes which is handled specially
/// when reading `ByteViewArray`s (this is checked in a special codepath)
fn invalid_utf8_later_char_long_strings<O: OffsetSizeTrait>() -> GenericBinaryArray<O> {
let valid: &[u8] = b" ";
let mut invalid = vec![];
invalid.extend_from_slice(b"ThisStringIsCertainlyLongerThan12Bytes");
invalid.extend_from_slice(INVALID_UTF8_LATER_CHAR);
GenericBinaryArray::<O>::from_iter(vec![None, Some(valid), None, Some(&invalid)])
}

/// returns a BinaryArray with invalid UTF8 data in a character other than
/// the first in a string larger than 128 bytes which is handled specially
/// when reading `ByteViewArray`s (this is checked in a special codepath)
fn invalid_utf8_later_char_really_long_strings<O: OffsetSizeTrait>() -> GenericBinaryArray<O> {
let valid: &[u8] = b" ";
let mut invalid = vec![];
for _ in 0..10 {
// each instance is 38 bytes
invalid.extend_from_slice(b"ThisStringIsCertainlyLongerThan12Bytes");
}
invalid.extend_from_slice(INVALID_UTF8_LATER_CHAR);
GenericBinaryArray::<O>::from_iter(vec![None, Some(valid), None, Some(&invalid)])
}

/// returns a BinaryArray with small invalid UTF8 data followed by a large
/// invalid UTF8 data in a character other than the first in a string larger
fn invalid_utf8_later_char_really_long_strings2<O: OffsetSizeTrait>() -> GenericBinaryArray<O> {
let valid: &[u8] = b" ";
let mut valid_long = vec![];
for _ in 0..10 {
// each instance is 38 bytes
valid_long.extend_from_slice(b"ThisStringIsCertainlyLongerThan12Bytes");
}
let invalid = INVALID_UTF8_LATER_CHAR;
GenericBinaryArray::<O>::from_iter(vec![
None,
Some(valid),
Some(invalid),
None,
Some(&valid_long),
Some(valid),
])
}

/// writes the array into a single column parquet file with the specified
/// encoding.
///
/// If no encoding is specified, use default (dictionary) encoding
fn write_to_parquet_with_encoding(array: ArrayRef, encoding: Option<Encoding>) -> Vec<u8> {
let batch = RecordBatch::try_from_iter(vec![("c", array)]).unwrap();
let mut data = vec![];
let schema = batch.schema();
let props = None;
let props = encoding.map(|encoding| {
WriterProperties::builder()
// must disable dictionary encoding to actually use encoding
.set_dictionary_enabled(false)
.set_encoding(encoding)
.build()
});

{
let mut writer = ArrowWriter::try_new(&mut data, schema, props).unwrap();
writer.write(&batch).unwrap();
Expand Down
Loading