diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index cc369cec0ea..705ba9233a0 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -2429,25 +2429,39 @@ mod tests { invalid_utf8_first_char::(), "Parquet argument error: Parquet error: encountered non UTF-8 data", ), + ( + invalid_utf8_first_char_long_strings::(), + "Parquet argument error: Parquet error: encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 41", + ), ( invalid_utf8_later_char::(), "Parquet argument error: Parquet error: encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 6", ), + ( + invalid_utf8_later_char_long_strings::(), + "Parquet argument error: Parquet error: encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 44", + ), + ( + invalid_utf8_later_char_really_long_strings::(), + "Parquet argument error: Parquet error: encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 386", + ), ]; - for (array, expected_error) in cases { - // data is not valid utf8 we can not construct a correct StringArray - // safely, so purposely create an invalid StringArray - let array = unsafe { - GenericStringArray::::new_unchecked( - array.offsets().clone(), - array.values().clone(), - array.nulls().cloned(), - ) - }; - let data_type = array.data_type().clone(); - let data = write_to_parquet(Arc::new(array)); - let err = read_from_parquet(data).unwrap_err(); - assert_eq!(err.to_string(), expected_error, "data type: {data_type:?}") + for (array, expected_error) in &cases { + for encoding in STRING_ENCODINGS { + // data is not valid utf8 we can not construct a correct StringArray + // safely, so purposely create an invalid StringArray + let array = unsafe { + GenericStringArray::::new_unchecked( + array.offsets().clone(), + array.values().clone(), + array.nulls().cloned(), + ) + }; + let data_type = array.data_type().clone(); + let data = 
write_to_parquet_with_encoding(Arc::new(array), *encoding); + let err = read_from_parquet(data).unwrap_err(); + assert_eq!(err.to_string(), *expected_error, "data type: {data_type:?}") + } } } @@ -2458,58 +2472,121 @@ mod tests { invalid_utf8_first_char::(), "Parquet argument error: Parquet error: encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 11", ), + ( + invalid_utf8_first_char_long_strings::(), + "Parquet argument error: Parquet error: encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 49", + ), ( invalid_utf8_later_char::(), "Parquet argument error: Parquet error: encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 14", ), + ( + invalid_utf8_later_char_long_strings::(), + "Parquet argument error: Parquet error: encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 52", + ), + ( + invalid_utf8_later_char_really_long_strings::(), + "Parquet argument error: Parquet error: encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 383", + ), ]; - for (array, expected_error) in cases { - let array = arrow_cast::cast(&array, &ArrowDataType::BinaryView).unwrap(); - let array = array.as_binary_view(); - - // data is not valid utf8 we can not construct a correct StringArray - // safely, so purposely create an invalid StringArray - let array = unsafe { - StringViewArray::new_unchecked( - array.views().clone(), - array.data_buffers().to_vec(), - array.nulls().cloned(), - ) - }; - let data_type = array.data_type().clone(); - let data = write_to_parquet(Arc::new(array)); - let err = read_from_parquet(data).unwrap_err(); - assert_eq!(err.to_string(), expected_error, "data type: {data_type:?}") + + for encoding in STRING_ENCODINGS { + for (array, expected_error) in &cases { + let array = arrow_cast::cast(&array, &ArrowDataType::BinaryView).unwrap(); + let array = array.as_binary_view(); + + // data is not valid utf8 we can not construct a correct StringArray + // safely, so 
purposely create an invalid StringViewArray + let array = unsafe { + StringViewArray::new_unchecked( + array.views().clone(), + array.data_buffers().to_vec(), + array.nulls().cloned(), + ) + }; + + let data_type = array.data_type().clone(); + let data = write_to_parquet_with_encoding(Arc::new(array), *encoding); + let err = read_from_parquet(data).unwrap_err(); + assert_eq!(err.to_string(), *expected_error, "data type: {data_type:?}") + } } } + /// Encodings suitable for string data + const STRING_ENCODINGS: &[Encoding] = &[ + Encoding::PLAIN, + Encoding::DELTA_LENGTH_BYTE_ARRAY, + Encoding::DELTA_BYTE_ARRAY, + ]; + + /// Invalid Utf-8 sequence in the first character + /// + const INVALID_UTF8_FIRST_CHAR: &[u8] = &[0xa0, 0xa1, 0x20, 0x20]; + + /// Invalid Utf-8 sequence in NOT the first character + /// + const INVALID_UTF8_LATER_CHAR: &[u8] = &[0x20, 0x20, 0x20, 0xa0, 0xa1, 0x20, 0x20]; + /// returns a BinaryArray with invalid UTF8 data in the first character fn invalid_utf8_first_char() -> GenericBinaryArray { - // invalid sequence in the first character - // https://stackoverflow.com/questions/1301402/example-invalid-utf8-string let valid: &[u8] = b" "; - let invalid: &[u8] = &[0xa0, 0xa1, 0x20, 0x20]; + let invalid = INVALID_UTF8_FIRST_CHAR; GenericBinaryArray::::from_iter(vec![None, Some(valid), None, Some(invalid)]) } + /// Returns a BinaryArray with invalid UTF8 data in the first character of a + /// string larger than 12 bytes which is handled specially when reading + /// `ByteViewArray`s + fn invalid_utf8_first_char_long_strings() -> GenericBinaryArray { + let valid: &[u8] = b" "; + let mut invalid = vec![]; + invalid.extend_from_slice(b"ThisStringIsCertainlyLongerThan12Bytes"); + invalid.extend_from_slice(INVALID_UTF8_FIRST_CHAR); + GenericBinaryArray::::from_iter(vec![None, Some(valid), None, Some(&invalid)]) + } + /// returns a BinaryArray with invalid UTF8 data in a character other than /// the first (this is checked in a special codepath) fn 
invalid_utf8_later_char() -> GenericBinaryArray { - // invalid sequence in NOT the first character - // https://stackoverflow.com/questions/1301402/example-invalid-utf8-string let valid: &[u8] = b" "; - let invalid: &[u8] = &[0x20, 0x20, 0x20, 0xa0, 0xa1, 0x20, 0x20]; + let invalid: &[u8] = INVALID_UTF8_LATER_CHAR; GenericBinaryArray::::from_iter(vec![None, Some(valid), None, Some(invalid)]) } - // writes the array into a single column parquet file - fn write_to_parquet(array: ArrayRef) -> Vec { + /// returns a BinaryArray with invalid UTF8 data in a character other than + /// the first in a string larger than 12 bytes which is handled specially + /// when reading `ByteViewArray`s (this is checked in a special codepath) + fn invalid_utf8_later_char_long_strings() -> GenericBinaryArray { + let valid: &[u8] = b" "; + let mut invalid = vec![]; + invalid.extend_from_slice(b"ThisStringIsCertainlyLongerThan12Bytes"); + invalid.extend_from_slice(INVALID_UTF8_LATER_CHAR); + GenericBinaryArray::::from_iter(vec![None, Some(valid), None, Some(&invalid)]) + } + + /// returns a BinaryArray with invalid UTF8 data in a character other than + /// the first in a string larger than 128 bytes which is handled specially + /// when reading `ByteViewArray`s (this is checked in a special codepath) + fn invalid_utf8_later_char_really_long_strings() -> GenericBinaryArray { + let valid: &[u8] = b" "; + let mut invalid = vec![]; + for _ in 0..10 { + // each instance is 38 bytes + invalid.extend_from_slice(b"ThisStringIsCertainlyLongerThan12Bytes"); + } + invalid.extend_from_slice(INVALID_UTF8_LATER_CHAR); + GenericBinaryArray::::from_iter(vec![None, Some(valid), None, Some(&invalid)]) + } + + /// writes the array into a single column parquet file with the specified encoding + fn write_to_parquet_with_encoding(array: ArrayRef, encoding: Encoding) -> Vec { let batch = RecordBatch::try_from_iter(vec![("c", array)]).unwrap(); let mut data = vec![]; let schema = batch.schema(); - let props = 
None; + let props = WriterProperties::builder().set_encoding(encoding).build(); { - let mut writer = ArrowWriter::try_new(&mut data, schema, props).unwrap(); + let mut writer = ArrowWriter::try_new(&mut data, schema, Some(props)).unwrap(); writer.write(&batch).unwrap(); writer.flush().unwrap(); writer.close().unwrap();