From 8825d38a76cc8a6ebfa1c3994ad191042e4ae5d3 Mon Sep 17 00:00:00 2001 From: Remzi Yang <59198230+HaoYang670@users.noreply.github.com> Date: Mon, 16 May 2022 20:17:04 +0800 Subject: [PATCH] Fix: Null buffer accounts for `offset` in `substring` kernel. (#1704) * add a failed test Signed-off-by: remzi <13716567376yh@gmail.com> * add tests and fix bug for binary Signed-off-by: remzi <13716567376yh@gmail.com> * fix bug and add tests for string array Signed-off-by: remzi <13716567376yh@gmail.com> * generate offsets from usize Signed-off-by: remzi <13716567376yh@gmail.com> --- arrow/src/compute/kernels/substring.rs | 99 ++++++++++++++++++++++++-- 1 file changed, 94 insertions(+), 5 deletions(-) diff --git a/arrow/src/compute/kernels/substring.rs b/arrow/src/compute/kernels/substring.rs index 0ef488d972..866e427073 100644 --- a/arrow/src/compute/kernels/substring.rs +++ b/arrow/src/compute/kernels/substring.rs @@ -156,7 +156,6 @@ fn binary_substring( length: Option, ) -> Result { let offsets = array.value_offsets(); - let null_bit_buffer = array.data_ref().null_buffer().cloned(); let values = array.value_data(); let data = values.as_slice(); let zero = OffsetSize::zero(); @@ -201,7 +200,10 @@ fn binary_substring( GenericBinaryArray::::get_data_type(), array.len(), None, - null_bit_buffer, + array + .data_ref() + .null_buffer() + .map(|b| b.bit_slice(array.offset(), array.len())), 0, vec![Buffer::from_slice_ref(&new_offsets), new_values.into()], vec![], @@ -266,7 +268,6 @@ fn utf8_substring( length: Option, ) -> Result { let offsets = array.value_offsets(); - let null_bit_buffer = array.data_ref().null_buffer().cloned(); let values = array.value_data(); let data = values.as_slice(); let zero = OffsetSize::zero(); @@ -330,7 +331,10 @@ fn utf8_substring( GenericStringArray::::get_data_type(), array.len(), None, - null_bit_buffer, + array + .data_ref() + .null_buffer() + .map(|b| b.bit_slice(array.offset(), array.len())), 0, vec![Buffer::from_slice_ref(&new_offsets), new_values.into()], vec![], @@ -555,6 +559,49 @@ mod tests { without_nulls_generic_binary::() } + fn generic_binary_with_non_zero_offset() -> Result<()> { + let values = 0_u8..15; + let offsets = &[ + O::zero(), + O::from_usize(5).unwrap(), + O::from_usize(10).unwrap(), + O::from_usize(15).unwrap(), + ]; + // set the first and third element to be valid + let bitmap = [0b101_u8]; + + let data = ArrayData::builder(GenericBinaryArray::::get_data_type()) + .len(2) + .add_buffer(Buffer::from_slice_ref(offsets)) + .add_buffer(Buffer::from_iter(values)) + .null_bit_buffer(Buffer::from(bitmap)) + .offset(1) + .build()?; + // array is `[null, [10, 11, 12, 13, 14]]` + let array = GenericBinaryArray::::from(data); + // result is `[null, [11, 12, 13, 14]]` + let result = substring(&array, 1, None)?; + let result = result + .as_any() + .downcast_ref::>() + .unwrap(); + let expected = + GenericBinaryArray::::from_opt_vec(vec![None, Some(&[11_u8, 12, 13, 14])]); + assert_eq!(result, &expected); + + Ok(()) + } + + #[test] + fn binary_with_non_zero_offset() -> Result<()> { + generic_binary_with_non_zero_offset::() + } + + #[test] + fn large_binary_with_non_zero_offset() -> Result<()> { + generic_binary_with_non_zero_offset::() + } + #[test] #[allow(clippy::type_complexity)] fn with_nulls_fixed_size_binary() -> Result<()> { @@ -811,7 +858,7 @@ mod tests { } #[test] - fn offset_fixed_size_binary() -> Result<()> { + fn fixed_size_binary_with_non_zero_offset() -> Result<()> { let values: [u8; 15] = *b"hellotherearrow"; // set the first and third element to be valid let bits_v = [0b101_u8]; @@ -994,6 +1041,48 @@ mod tests { without_nulls_generic_string::() } + fn generic_string_with_non_zero_offset() -> Result<()> { + let values = "hellotherearrow"; + let offsets = &[ + O::zero(), + O::from_usize(5).unwrap(), + O::from_usize(10).unwrap(), + O::from_usize(15).unwrap(), + ]; + // set the first and third element to be valid + let bitmap = [0b101_u8]; + + let data = ArrayData::builder(GenericStringArray::::get_data_type()) + .len(2) + .add_buffer(Buffer::from_slice_ref(offsets)) + .add_buffer(Buffer::from(values)) + .null_bit_buffer(Buffer::from(bitmap)) + .offset(1) + .build()?; + // array is `[null, "arrow"]` + let array = GenericStringArray::::from(data); + // result is `[null, "rrow"]` + let result = substring(&array, 1, None)?; + let result = result + .as_any() + .downcast_ref::>() + .unwrap(); + let expected = GenericStringArray::::from(vec![None, Some("rrow")]); + assert_eq!(result, &expected); + + Ok(()) + } + + #[test] + fn string_with_non_zero_offset() -> Result<()> { + generic_string_with_non_zero_offset::() + } + + #[test] + fn large_string_with_non_zero_offset() -> Result<()> { + generic_string_with_non_zero_offset::() + } + #[test] fn dictionary() -> Result<()> { _dictionary::()?;