Skip to content

Commit

Permalink
Additional tests for parquet reader utf8 validation
Browse files Browse the repository at this point in the history
  • Loading branch information
alamb committed Jul 8, 2024
1 parent 8355823 commit 5e9996f
Showing 1 changed file with 118 additions and 41 deletions.
159 changes: 118 additions & 41 deletions parquet/src/arrow/arrow_reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2429,25 +2429,39 @@ mod tests {
invalid_utf8_first_char::<O>(),
"Parquet argument error: Parquet error: encountered non UTF-8 data",
),
(
invalid_utf8_first_char_long_strings::<O>(),
"Parquet argument error: Parquet error: encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 41",
),
(
invalid_utf8_later_char::<O>(),
"Parquet argument error: Parquet error: encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 6",
),
(
invalid_utf8_later_char_long_strings::<O>(),
"Parquet argument error: Parquet error: encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 44",
),
(
invalid_utf8_later_char_really_long_strings::<O>(),
"Parquet argument error: Parquet error: encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 386",
),
];
for (array, expected_error) in cases {
// data is not valid utf8 we can not construct a correct StringArray
// safely, so purposely create an invalid StringArray
let array = unsafe {
GenericStringArray::<O>::new_unchecked(
array.offsets().clone(),
array.values().clone(),
array.nulls().cloned(),
)
};
let data_type = array.data_type().clone();
let data = write_to_parquet(Arc::new(array));
let err = read_from_parquet(data).unwrap_err();
assert_eq!(err.to_string(), expected_error, "data type: {data_type:?}")
for (array, expected_error) in &cases {
for encoding in STRING_ENCODINGS {
// The data is not valid UTF-8, so a correct StringArray cannot be constructed
// safely; purposely create an invalid StringArray with `new_unchecked` instead.
let array = unsafe {
GenericStringArray::<O>::new_unchecked(
array.offsets().clone(),
array.values().clone(),
array.nulls().cloned(),
)
};
let data_type = array.data_type().clone();
let data = write_to_parquet_with_encoding(Arc::new(array), *encoding);
let err = read_from_parquet(data).unwrap_err();
assert_eq!(err.to_string(), *expected_error, "data type: {data_type:?}")
}
}
}

Expand All @@ -2458,58 +2472,121 @@ mod tests {
invalid_utf8_first_char::<i32>(),
"Parquet argument error: Parquet error: encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 11",
),
(
invalid_utf8_first_char_long_strings::<i32>(),
"Parquet argument error: Parquet error: encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 49",
),
(
invalid_utf8_later_char::<i32>(),
"Parquet argument error: Parquet error: encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 14",
),
(
invalid_utf8_later_char_long_strings::<i32>(),
"Parquet argument error: Parquet error: encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 52",
),
(
invalid_utf8_later_char_really_long_strings::<i32>(),
"Parquet argument error: Parquet error: encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 383",
),
];
for (array, expected_error) in cases {
let array = arrow_cast::cast(&array, &ArrowDataType::BinaryView).unwrap();
let array = array.as_binary_view();

// data is not valid utf8 we can not construct a correct StringArray
// safely, so purposely create an invalid StringArray
let array = unsafe {
StringViewArray::new_unchecked(
array.views().clone(),
array.data_buffers().to_vec(),
array.nulls().cloned(),
)
};
let data_type = array.data_type().clone();
let data = write_to_parquet(Arc::new(array));
let err = read_from_parquet(data).unwrap_err();
assert_eq!(err.to_string(), expected_error, "data type: {data_type:?}")

for encoding in STRING_ENCODINGS {
for (array, expected_error) in &cases {
let array = arrow_cast::cast(&array, &ArrowDataType::BinaryView).unwrap();
let array = array.as_binary_view();

// The data is not valid UTF-8, so a correct StringViewArray cannot be constructed
// safely; purposely create an invalid StringViewArray with `new_unchecked` instead.
let array = unsafe {
StringViewArray::new_unchecked(
array.views().clone(),
array.data_buffers().to_vec(),
array.nulls().cloned(),
)
};

let data_type = array.data_type().clone();
let data = write_to_parquet_with_encoding(Arc::new(array), *encoding);
let err = read_from_parquet(data).unwrap_err();
assert_eq!(err.to_string(), *expected_error, "data type: {data_type:?}")
}
}
}

/// Encodings suitable for string data.
///
/// The invalid UTF-8 tests in this module loop over this list and write the
/// same input once per encoding via `write_to_parquet_with_encoding`.
const STRING_ENCODINGS: &[Encoding] = &[
Encoding::PLAIN,
Encoding::DELTA_LENGTH_BYTE_ARRAY,
Encoding::DELTA_BYTE_ARRAY,
];

/// Invalid UTF-8 sequence in the first character
/// <https://stackoverflow.com/questions/1301402/example-invalid-utf8-string>
///
/// `0xa0` is a continuation byte (`0b10xx_xxxx`), which can never start a
/// UTF-8 sequence, so the very first byte is already invalid.
const INVALID_UTF8_FIRST_CHAR: &[u8] = &[0xa0, 0xa1, 0x20, 0x20];

/// Invalid UTF-8 sequence in a character other than the first
/// <https://stackoverflow.com/questions/1301402/example-invalid-utf8-string>
///
/// Three valid ASCII spaces (`0x20`) precede the stray continuation byte `0xa0`.
const INVALID_UTF8_LATER_CHAR: &[u8] = &[0x20, 0x20, 0x20, 0xa0, 0xa1, 0x20, 0x20];

/// Returns a BinaryArray with invalid UTF-8 data in the first character.
///
/// The array holds four rows: null, a valid value, null, and
/// `INVALID_UTF8_FIRST_CHAR`.
fn invalid_utf8_first_char<O: OffsetSizeTrait>() -> GenericBinaryArray<O> {
    let valid: &[u8] = b" ";
    // The invalid bytes come from the shared constant; the inline literal that
    // was bound here before was immediately shadowed and therefore dead.
    let invalid = INVALID_UTF8_FIRST_CHAR;
    GenericBinaryArray::<O>::from_iter(vec![None, Some(valid), None, Some(invalid)])
}

/// Builds a BinaryArray whose only non-UTF-8 value is a 38 byte ASCII prefix
/// immediately followed by `INVALID_UTF8_FIRST_CHAR`.
///
/// Strings larger than 12 bytes are handled specially when reading
/// `ByteViewArray`s, which is the case this input is meant to exercise.
fn invalid_utf8_first_char_long_strings<O: OffsetSizeTrait>() -> GenericBinaryArray<O> {
    // NOTE(review): the offsets expected by the tests (e.g. "index 41" = 3 + 38)
    // suggest this value is 3 bytes long — confirm the literal's whitespace is intact.
    let short: &[u8] = b" ";
    let prefix: &[u8] = b"ThisStringIsCertainlyLongerThan12Bytes";
    let corrupt = [prefix, INVALID_UTF8_FIRST_CHAR].concat();
    // Row layout: null, valid value, null, invalid value.
    GenericBinaryArray::<O>::from_iter([None, Some(short), None, Some(corrupt.as_slice())])
}

/// Returns a BinaryArray with invalid UTF-8 data in a character other than
/// the first (this is checked in a special codepath).
///
/// The array holds four rows: null, a valid value, null, and
/// `INVALID_UTF8_LATER_CHAR`.
fn invalid_utf8_later_char<O: OffsetSizeTrait>() -> GenericBinaryArray<O> {
    // NOTE(review): the offsets expected by the tests (e.g. "index 6" = 3 + 3)
    // suggest this value is 3 bytes long — confirm the literal's whitespace is intact.
    let valid: &[u8] = b" ";
    // The invalid bytes come from the shared constant; the inline literal that
    // was bound here before was immediately shadowed and therefore dead.
    let invalid: &[u8] = INVALID_UTF8_LATER_CHAR;
    GenericBinaryArray::<O>::from_iter(vec![None, Some(valid), None, Some(invalid)])
}

// writes the array into a single column parquet file
fn write_to_parquet(array: ArrayRef) -> Vec<u8> {
/// Builds a BinaryArray whose only non-UTF-8 value is a 38 byte ASCII prefix
/// followed by `INVALID_UTF8_LATER_CHAR`, so the invalid byte is not the first
/// character of the value (this is checked in a special codepath).
///
/// Strings larger than 12 bytes are handled specially when reading
/// `ByteViewArray`s, which is the case this input is meant to exercise.
fn invalid_utf8_later_char_long_strings<O: OffsetSizeTrait>() -> GenericBinaryArray<O> {
    // NOTE(review): the offsets expected by the tests (e.g. "index 44" = 3 + 38 + 3)
    // suggest this value is 3 bytes long — confirm the literal's whitespace is intact.
    let short: &[u8] = b" ";
    let prefix: &[u8] = b"ThisStringIsCertainlyLongerThan12Bytes";
    let corrupt = [prefix, INVALID_UTF8_LATER_CHAR].concat();
    // Row layout: null, valid value, null, invalid value.
    GenericBinaryArray::<O>::from_iter([None, Some(short), None, Some(corrupt.as_slice())])
}

/// Builds a BinaryArray whose only non-UTF-8 value is 380 bytes of ASCII
/// (ten copies of a 38 byte chunk) followed by `INVALID_UTF8_LATER_CHAR`.
///
/// Strings larger than 128 bytes are handled specially when reading
/// `ByteViewArray`s (this is checked in a special codepath).
fn invalid_utf8_later_char_really_long_strings<O: OffsetSizeTrait>() -> GenericBinaryArray<O> {
    // NOTE(review): the offsets expected by the tests (e.g. "index 386" = 3 + 380 + 3)
    // suggest this value is 3 bytes long — confirm the literal's whitespace is intact.
    let short: &[u8] = b" ";
    // Each chunk is 38 bytes, so ten of them give a 380 byte valid prefix.
    let mut corrupt = b"ThisStringIsCertainlyLongerThan12Bytes".repeat(10);
    corrupt.extend_from_slice(INVALID_UTF8_LATER_CHAR);
    // Row layout: null, valid value, null, invalid value.
    GenericBinaryArray::<O>::from_iter([None, Some(short), None, Some(corrupt.as_slice())])
}

/// writes the array into a single column parquet file with the specified encoding
fn write_to_parquet_with_encoding(array: ArrayRef, encoding: Encoding) -> Vec<u8> {
let batch = RecordBatch::try_from_iter(vec![("c", array)]).unwrap();
let mut data = vec![];
let schema = batch.schema();
let props = None;
let props = WriterProperties::builder().set_encoding(encoding).build();
{
let mut writer = ArrowWriter::try_new(&mut data, schema, props).unwrap();
let mut writer = ArrowWriter::try_new(&mut data, schema, Some(props)).unwrap();
writer.write(&batch).unwrap();
writer.flush().unwrap();
writer.close().unwrap();
Expand Down

0 comments on commit 5e9996f

Please sign in to comment.