From aa30da8117d212b178353ec4363ae6540a864778 Mon Sep 17 00:00:00 2001 From: remzi <13716567376yh@gmail.com> Date: Sun, 20 Mar 2022 10:09:03 +0800 Subject: [PATCH 1/5] support length on binary array (not test) rewrite unary_offset using macro Signed-off-by: remzi <13716567376yh@gmail.com> --- arrow/src/compute/kernels/length.rs | 107 +++++++++++++++++----------- 1 file changed, 66 insertions(+), 41 deletions(-) diff --git a/arrow/src/compute/kernels/length.rs b/arrow/src/compute/kernels/length.rs index 7a884ee26248..2d4def22c974 100644 --- a/arrow/src/compute/kernels/length.rs +++ b/arrow/src/compute/kernels/length.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! Defines kernel for length of a string array +//! Defines kernel for length of string arrays and binary arrays use crate::{ array::*, @@ -27,47 +27,54 @@ use crate::{ error::{ArrowError, Result}, }; -fn unary_offsets_string( - array: &GenericStringArray, - data_type: DataType, - op: F, +macro_rules! unary_offsets { + ($array: expr, $data_type: expr, $op: expr) => {{ + // note: offsets are stored as u8, but they can be interpreted as OffsetSize + let offsets = &$array.data_ref().buffers()[0]; + // this is a 30% improvement over iterating over u8s and building OffsetSize, which + // justifies the usage of `unsafe`. + let slice: &[O] = &unsafe { offsets.typed_data::() }[$array.offset()..]; + + let lengths = slice.windows(2).map(|offset| $op(offset[1] - offset[0])); + + // JUSTIFICATION + // Benefit + // ~60% speedup + // Soundness + // `values` come from a slice iterator with a known size. + let buffer = unsafe { Buffer::from_trusted_len_iter(lengths) }; + + let null_bit_buffer = $array + .data_ref() + .null_buffer() + .map(|b| b.bit_slice($array.offset(), $array.len())); + + let data = unsafe { + ArrayData::new_unchecked( + $data_type, + $array.len(), + None, + null_bit_buffer, + 0, + vec![buffer], + vec![], + ) + }; + make_array(data) + }}; +} + +fn octet_length_binary( + array: &dyn Array, ) -> ArrayRef where - O: StringOffsetSizeTrait + ArrowNativeType, - F: Fn(O) -> O, + T::Native: BinaryOffsetSizeTrait, { - // note: offsets are stored as u8, but they can be interpreted as OffsetSize - let offsets = &array.data_ref().buffers()[0]; - // this is a 30% improvement over iterating over u8s and building OffsetSize, which - // justifies the usage of `unsafe`. - let slice: &[O] = &unsafe { offsets.typed_data::() }[array.offset()..]; - - let lengths = slice.windows(2).map(|offset| op(offset[1] - offset[0])); - - // JUSTIFICATION - // Benefit - // ~60% speedup - // Soundness - // `values` come from a slice iterator with a known size. - let buffer = unsafe { Buffer::from_trusted_len_iter(lengths) }; - - let null_bit_buffer = array - .data_ref() - .null_buffer() - .map(|b| b.bit_slice(array.offset(), array.len())); - - let data = unsafe { - ArrayData::new_unchecked( - data_type, - array.len(), - None, - null_bit_buffer, - 0, - vec![buffer], - vec![], - ) - }; - make_array(data) + let array = array + .as_any() + .downcast_ref::>() + .unwrap(); + unary_offsets!(array, T::DATA_TYPE, |x| x) } fn octet_length( @@ -80,7 +87,21 @@ where .as_any() .downcast_ref::>() .unwrap(); - unary_offsets_string::(array, T::DATA_TYPE, |x| x) + unary_offsets!(array, T::DATA_TYPE, |x| x) +} + +fn bit_length_impl_binary( + array: &dyn Array, +) -> ArrayRef +where + T::Native: BinaryOffsetSizeTrait, +{ + let array = array + .as_any() + .downcast_ref::>() + .unwrap(); + let bits_in_bytes = O::from_usize(8).unwrap(); + unary_offsets!(array, T::DATA_TYPE, |x| x * bits_in_bytes) } fn bit_length_impl( @@ -94,7 +115,7 @@ where .downcast_ref::>() .unwrap(); let bits_in_bytes = O::from_usize(8).unwrap(); - unary_offsets_string::(array, T::DATA_TYPE, |x| x * bits_in_bytes) + unary_offsets!(array, T::DATA_TYPE, |x| x * bits_in_bytes) } /// Returns an array of Int32/Int64 denoting the number of bytes in each string in the array. @@ -106,6 +127,8 @@ pub fn length(array: &dyn Array) -> Result { match array.data_type() { DataType::Utf8 => Ok(octet_length::(array)), DataType::LargeUtf8 => Ok(octet_length::(array)), + DataType::Binary => Ok(octet_length_binary::(array)), + DataType::LargeBinary => Ok(octet_length_binary::(array)), _ => Err(ArrowError::ComputeError(format!( "length not supported for {:?}", array.data_type() @@ -122,6 +145,8 @@ pub fn bit_length(array: &dyn Array) -> Result { match array.data_type() { DataType::Utf8 => Ok(bit_length_impl::(array)), DataType::LargeUtf8 => Ok(bit_length_impl::(array)), + DataType::Binary => Ok(bit_length_impl_binary::(array)), + DataType::LargeBinary => Ok(bit_length_impl_binary::(array)), _ => Err(ArrowError::ComputeError(format!( "bit_length not supported for {:?}", array.data_type() From 3c33b8617d8a5707f42cc278c233909f30e63ece Mon Sep 17 00:00:00 2001 From: remzi <13716567376yh@gmail.com> Date: Sun, 20 Mar 2022 20:36:14 +0800 Subject: [PATCH 2/5] add tests Signed-off-by: remzi <13716567376yh@gmail.com> --- arrow/src/compute/kernels/length.rs | 131 +++++++++++++++++++++++----- 1 file changed, 111 insertions(+), 20 deletions(-) diff --git a/arrow/src/compute/kernels/length.rs b/arrow/src/compute/kernels/length.rs index 2d4def22c974..2b83fc0371aa 100644 --- a/arrow/src/compute/kernels/length.rs +++ b/arrow/src/compute/kernels/length.rs @@ -17,11 +17,7 @@ //! Defines kernel for length of string arrays and binary arrays -use crate::{ - array::*, - buffer::Buffer, - datatypes::{ArrowNativeType, ArrowPrimitiveType}, -}; +use crate::{array::*, buffer::Buffer, datatypes::ArrowPrimitiveType}; use crate::{ datatypes::{DataType, Int32Type, Int64Type}, error::{ArrowError, Result}, @@ -138,7 +134,7 @@ pub fn length(array: &dyn Array) -> Result { /// Returns an array of Int32/Int64 denoting the number of bits in each string in the array. /// -/// * this only accepts StringArray/Utf8 and LargeString/LargeUtf8 +/// * this only accepts StringArray/Utf8, LargeString/LargeUtf8, BinaryArray and LargeBinaryArray /// * bit_length of null is null. /// * bit_length is in number of bits pub fn bit_length(array: &dyn Array) -> Result { @@ -158,11 +154,11 @@ pub fn bit_length(array: &dyn Array) -> Result { mod tests { use super::*; - fn length_cases() -> Vec<(Vec<&'static str>, usize, Vec)> { - fn double_vec(v: Vec) -> Vec { - [&v[..], &v[..]].concat() - } + fn double_vec(v: Vec) -> Vec { + [&v[..], &v[..]].concat() + } + fn length_cases_string() -> Vec<(Vec<&'static str>, usize, Vec)> { // a large array let mut values = vec!["one", "on", "o", ""]; let mut expected = vec![3, 2, 1, 0]; @@ -179,10 +175,21 @@ mod tests { ] } + macro_rules! length_binary_helper { + ($offset_ty: ty, $result_ty: ty, $kernel: ident, $value: expr, $expected: expr) => {{ + let array = GenericBinaryArray::<$offset_ty>::from($value); + let result = $kernel(&array)?; + let result = result.as_any().downcast_ref::<$result_ty>().unwrap(); + let expected: $result_ty = $expected.into(); + assert_eq!(expected.data(), result.data()); + Ok(()) + }}; + } + #[test] #[cfg_attr(miri, ignore)] // running forever fn length_test_string() -> Result<()> { - length_cases() + length_cases_string() .into_iter() .try_for_each(|(input, len, expected)| { let array = StringArray::from(input); @@ -199,7 +206,7 @@ mod tests { #[test] #[cfg_attr(miri, ignore)] // running forever fn length_test_large_string() -> Result<()> { - length_cases() + length_cases_string() .into_iter() .try_for_each(|(input, len, expected)| { let array = LargeStringArray::from(input); @@ -213,9 +220,23 @@ mod tests { }) } + #[test] + fn length_test_binary() -> Result<()> { + let value: Vec<&[u8]> = vec![b"1", b"22", b"333"]; + let result: Vec = vec![1, 2, 3]; + length_binary_helper!(i32, Int32Array, length, value, result) + } + + #[test] + fn length_test_large_binary() -> Result<()> { + let value: Vec<&[u8]> = vec![b"1", b"22", b"333"]; + let result: Vec = vec![1, 2, 3]; + length_binary_helper!(i64, Int64Array, length, value, result) + } + type OptionStr = Option<&'static str>; - fn length_null_cases() -> Vec<(Vec, usize, Vec>)> { + fn length_null_cases_string() -> Vec<(Vec, usize, Vec>)> { vec![( vec![Some("one"), None, Some("three"), Some("four")], 4, @@ -225,7 +246,7 @@ mod tests { #[test] fn length_null_string() -> Result<()> { - length_null_cases() + length_null_cases_string() .into_iter() .try_for_each(|(input, len, expected)| { let array = StringArray::from(input); @@ -241,7 +262,7 @@ mod tests { #[test] fn length_null_large_string() -> Result<()> { - length_null_cases() + length_null_cases_string() .into_iter() .try_for_each(|(input, len, expected)| { let array = LargeStringArray::from(input); @@ -260,6 +281,20 @@ mod tests { }) } + #[test] + fn length_null_binary() -> Result<()> { + let value: Vec> = vec![Some(b"1"), None, Some(b"22"), Some(b"333")]; + let result: Vec> = vec![Some(1), None, Some(2), Some(3)]; + length_binary_helper!(i32, Int32Array, length, value, result) + } + + #[test] + fn length_null_large_binary() -> Result<()> { + let value: Vec> = vec![Some(b"1"), None, Some(b"22"), Some(b"333")]; + let result: Vec> = vec![Some(1), None, Some(2), Some(3)]; + length_binary_helper!(i64, Int64Array, length, value, result) + } + /// Tests that length is not valid for u64. #[test] fn length_wrong_type() { @@ -282,11 +317,22 @@ mod tests { Ok(()) } - fn bit_length_cases() -> Vec<(Vec<&'static str>, usize, Vec)> { - fn double_vec(v: Vec) -> Vec { - [&v[..], &v[..]].concat() - } + #[test] + fn binary_length_offsets() -> Result<()> { + let value: Vec> = + vec![Some(b"hello"), Some(b" "), Some(b"world"), None]; + let a = BinaryArray::from(value); + let b = a.slice(1, 3); + let result = length(b.as_ref())?; + let result: &Int32Array = as_primitive_array(&result); + + let expected = Int32Array::from(vec![Some(1), Some(5), None]); + assert_eq!(&expected, result); + Ok(()) + } + + fn bit_length_cases() -> Vec<(Vec<&'static str>, usize, Vec)> { // a large array let mut values = vec!["one", "on", "o", ""]; let mut expected = vec![24, 16, 8, 0]; @@ -337,6 +383,20 @@ mod tests { }) } + #[test] + fn bit_length_binary() -> Result<()> { + let value: Vec<&[u8]> = vec![b"one", b"three", b"four"]; + let expected: Vec = vec![24, 40, 32]; + length_binary_helper!(i32, Int32Array, bit_length, value, expected) + } + + #[test] + fn bit_length_large_binary() -> Result<()> { + let value: Vec<&[u8]> = vec![b"one", b"three", b"four"]; + let expected: Vec = vec![24, 40, 32]; + length_binary_helper!(i64, Int64Array, bit_length, value, expected) + } + fn bit_length_null_cases() -> Vec<(Vec, usize, Vec>)> { vec![( vec![Some("one"), None, Some("three"), Some("four")], @@ -382,6 +442,22 @@ mod tests { }) } + #[test] + fn bit_length_null_binary() -> Result<()> { + let value: Vec> = + vec![Some(b"one"), None, Some(b"three"), Some(b"four")]; + let expected = vec![Some(24), None, Some(40), Some(32)]; + length_binary_helper!(i32, Int32Array, bit_length, value, expected) + } + + #[test] + fn bit_length_null_large_binary() -> Result<()> { + let value: Vec> = + vec![Some(b"one"), None, Some(b"three"), Some(b"four")]; + let expected: Vec> = vec![Some(24), None, Some(40), Some(32)]; + length_binary_helper!(i64, Int64Array, bit_length, value, expected) + } + /// Tests that bit_length is not valid for u64. #[test] fn bit_length_wrong_type() { @@ -392,7 +468,7 @@ mod tests { /// Tests with an offset #[test] - fn bit_length_offsets() -> Result<()> { + fn bit_length_offsets_string() -> Result<()> { let a = StringArray::from(vec![Some("hello"), Some(" "), Some("world"), None]); let b = a.slice(1, 3); let result = bit_length(b.as_ref())?; @@ -403,4 +479,19 @@ mod tests { Ok(()) } + + #[test] + fn bit_length_offsets_binary() -> Result<()> { + let value: Vec> = + vec![Some(b"hello"), Some(b" "), Some(b"world"), None]; + let a = BinaryArray::from(value); + let b = a.slice(1, 3); + let result = bit_length(b.as_ref())?; + let result: &Int32Array = as_primitive_array(&result); + + let expected = Int32Array::from(vec![Some(8), Some(40), None]); + assert_eq!(&expected, result); + + Ok(()) + } } From aa4235c2b26d3b3b371d76772949a2741f0244ed Mon Sep 17 00:00:00 2001 From: remzi <13716567376yh@gmail.com> Date: Sun, 20 Mar 2022 20:46:45 +0800 Subject: [PATCH 3/5] add non-utf8 test cases Signed-off-by: remzi <13716567376yh@gmail.com> --- arrow/src/compute/kernels/length.rs | 40 ++++++++++++++--------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/arrow/src/compute/kernels/length.rs b/arrow/src/compute/kernels/length.rs index 2b83fc0371aa..6cd7f4d40d71 100644 --- a/arrow/src/compute/kernels/length.rs +++ b/arrow/src/compute/kernels/length.rs @@ -222,15 +222,15 @@ mod tests { #[test] fn length_test_binary() -> Result<()> { - let value: Vec<&[u8]> = vec![b"1", b"22", b"333"]; - let result: Vec = vec![1, 2, 3]; + let value: Vec<&[u8]> = vec![b"zero", b"one", &[0xff, 0xf8]]; + let result: Vec = vec![4, 3, 2]; length_binary_helper!(i32, Int32Array, length, value, result) } #[test] fn length_test_large_binary() -> Result<()> { - let value: Vec<&[u8]> = vec![b"1", b"22", b"333"]; - let result: Vec = vec![1, 2, 3]; + let value: Vec<&[u8]> = vec![b"zero", &[0xff, 0xf8], b"two"]; + let result: Vec = vec![4, 2, 3]; length_binary_helper!(i64, Int64Array, length, value, result) } @@ -283,15 +283,15 @@ mod tests { #[test] fn length_null_binary() -> Result<()> { - let value: Vec> = vec![Some(b"1"), None, Some(b"22"), Some(b"333")]; - let result: Vec> = vec![Some(1), None, Some(2), Some(3)]; + let value: Vec> = vec![Some(b"zero"), None, Some(&[0xff, 0xf8]), Some(b"three")]; + let result: Vec> = vec![Some(4), None, Some(2), Some(5)]; length_binary_helper!(i32, Int32Array, length, value, result) } #[test] fn length_null_large_binary() -> Result<()> { - let value: Vec> = vec![Some(b"1"), None, Some(b"22"), Some(b"333")]; - let result: Vec> = vec![Some(1), None, Some(2), Some(3)]; + let value: Vec> = vec![Some(&[0xff, 0xf8]), None, Some(b"two"), Some(b"three")]; + let result: Vec> = vec![Some(2), None, Some(3), Some(5)]; length_binary_helper!(i64, Int64Array, length, value, result) } @@ -320,13 +320,13 @@ mod tests { #[test] fn binary_length_offsets() -> Result<()> { let value: Vec> = - vec![Some(b"hello"), Some(b" "), Some(b"world"), None]; + vec![Some(b"hello"), Some(b" "), Some(&[0xff, 0xf8]), None]; let a = BinaryArray::from(value); let b = a.slice(1, 3); let result = length(b.as_ref())?; let result: &Int32Array = as_primitive_array(&result); - let expected = Int32Array::from(vec![Some(1), Some(5), None]); + let expected = Int32Array::from(vec![Some(1), Some(2), None]); assert_eq!(&expected, result); Ok(()) @@ -385,15 +385,15 @@ mod tests { #[test] fn bit_length_binary() -> Result<()> { - let value: Vec<&[u8]> = vec![b"one", b"three", b"four"]; - let expected: Vec = vec![24, 40, 32]; + let value: Vec<&[u8]> = vec![b"one", &[0xff, 0xf8], b"three"]; + let expected: Vec = vec![24, 16, 40]; length_binary_helper!(i32, Int32Array, bit_length, value, expected) } #[test] fn bit_length_large_binary() -> Result<()> { - let value: Vec<&[u8]> = vec![b"one", b"three", b"four"]; - let expected: Vec = vec![24, 40, 32]; + let value: Vec<&[u8]> = vec![b"zero", b" ", &[0xff, 0xf8]]; + let expected: Vec = vec![32, 8, 16]; length_binary_helper!(i64, Int64Array, bit_length, value, expected) } @@ -445,16 +445,16 @@ mod tests { #[test] fn bit_length_null_binary() -> Result<()> { let value: Vec> = - vec![Some(b"one"), None, Some(b"three"), Some(b"four")]; - let expected = vec![Some(24), None, Some(40), Some(32)]; + vec![Some(b"one"), None, Some(b"three"), Some(&[0xff, 0xf8])]; + let expected = vec![Some(24), None, Some(40), Some(16)]; length_binary_helper!(i32, Int32Array, bit_length, value, expected) } #[test] fn bit_length_null_large_binary() -> Result<()> { let value: Vec> = - vec![Some(b"one"), None, Some(b"three"), Some(b"four")]; - let expected: Vec> = vec![Some(24), None, Some(40), Some(32)]; + vec![Some(b"one"), None, Some(&[0xff, 0xf8]), Some(b"four")]; + let expected: Vec> = vec![Some(24), None, Some(16), Some(32)]; length_binary_helper!(i64, Int64Array, bit_length, value, expected) } @@ -483,13 +483,13 @@ mod tests { #[test] fn bit_length_offsets_binary() -> Result<()> { let value: Vec> = - vec![Some(b"hello"), Some(b" "), Some(b"world"), None]; + vec![Some(b"hello"), Some(&[]), Some(b"world"), None]; let a = BinaryArray::from(value); let b = a.slice(1, 3); let result = bit_length(b.as_ref())?; let result: &Int32Array = as_primitive_array(&result); - let expected = Int32Array::from(vec![Some(8), Some(40), None]); + let expected = Int32Array::from(vec![Some(0), Some(40), None]); assert_eq!(&expected, result); Ok(()) From af7c4e5bf08dc6d6b417ac0fba41eb1c06568fec Mon Sep 17 00:00:00 2001 From: remzi <13716567376yh@gmail.com> Date: Sun, 20 Mar 2022 20:53:08 +0800 Subject: [PATCH 4/5] fix some doc Signed-off-by: remzi <13716567376yh@gmail.com> --- arrow/src/compute/kernels/length.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arrow/src/compute/kernels/length.rs b/arrow/src/compute/kernels/length.rs index 6cd7f4d40d71..4efd4697b201 100644 --- a/arrow/src/compute/kernels/length.rs +++ b/arrow/src/compute/kernels/length.rs @@ -116,7 +116,7 @@ where /// Returns an array of Int32/Int64 denoting the number of bytes in each string in the array. /// -/// * this only accepts StringArray/Utf8 and LargeString/LargeUtf8 +/// * this only accepts StringArray/Utf8, LargeString/LargeUtf8, BinaryArray and LargeBinaryArray /// * length of null is null. /// * length is in number of bytes pub fn length(array: &dyn Array) -> Result { @@ -283,14 +283,16 @@ mod tests { #[test] fn length_null_binary() -> Result<()> { - let value: Vec> = vec![Some(b"zero"), None, Some(&[0xff, 0xf8]), Some(b"three")]; + let value: Vec> = + vec![Some(b"zero"), None, Some(&[0xff, 0xf8]), Some(b"three")]; let result: Vec> = vec![Some(4), None, Some(2), Some(5)]; length_binary_helper!(i32, Int32Array, length, value, result) } #[test] fn length_null_large_binary() -> Result<()> { - let value: Vec> = vec![Some(&[0xff, 0xf8]), None, Some(b"two"), Some(b"three")]; + let value: Vec> = + vec![Some(&[0xff, 0xf8]), None, Some(b"two"), Some(b"three")]; let result: Vec> = vec![Some(2), None, Some(3), Some(5)]; length_binary_helper!(i64, Int64Array, length, value, result) } @@ -446,7 +448,7 @@ mod tests { fn bit_length_null_binary() -> Result<()> { let value: Vec> = vec![Some(b"one"), None, Some(b"three"), Some(&[0xff, 0xf8])]; - let expected = vec![Some(24), None, Some(40), Some(16)]; + let expected: Vec> = vec![Some(24), None, Some(40), Some(16)]; length_binary_helper!(i32, Int32Array, bit_length, value, expected) } From 71838222f2ea752b496f9f2e8ddd50323e894fec Mon Sep 17 00:00:00 2001 From: remzi <13716567376yh@gmail.com> Date: Thu, 24 Mar 2022 20:47:49 +0800 Subject: [PATCH 5/5] update doc simplify the way to get offsets. No performance penalty Signed-off-by: remzi <13716567376yh@gmail.com> --- arrow/src/compute/kernels/length.rs | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/arrow/src/compute/kernels/length.rs b/arrow/src/compute/kernels/length.rs index 4efd4697b201..19be3369404b 100644 --- a/arrow/src/compute/kernels/length.rs +++ b/arrow/src/compute/kernels/length.rs @@ -25,11 +25,7 @@ use crate::{ macro_rules! unary_offsets { ($array: expr, $data_type: expr, $op: expr) => {{ - // note: offsets are stored as u8, but they can be interpreted as OffsetSize - let offsets = &$array.data_ref().buffers()[0]; - // this is a 30% improvement over iterating over u8s and building OffsetSize, which - // justifies the usage of `unsafe`. - let slice: &[O] = &unsafe { offsets.typed_data::() }[$array.offset()..]; + let slice = $array.value_offsets(); let lengths = slice.windows(2).map(|offset| $op(offset[1] - offset[0])); @@ -114,7 +110,7 @@ where unary_offsets!(array, T::DATA_TYPE, |x| x * bits_in_bytes) } -/// Returns an array of Int32/Int64 denoting the number of bytes in each string in the array. +/// Returns an array of Int32/Int64 denoting the number of bytes in each value in the array. /// /// * this only accepts StringArray/Utf8, LargeString/LargeUtf8, BinaryArray and LargeBinaryArray /// * length of null is null. @@ -132,7 +128,7 @@ pub fn length(array: &dyn Array) -> Result { } } -/// Returns an array of Int32/Int64 denoting the number of bits in each string in the array. +/// Returns an array of Int32/Int64 denoting the number of bits in each value in the array. /// /// * this only accepts StringArray/Utf8, LargeString/LargeUtf8, BinaryArray and LargeBinaryArray /// * bit_length of null is null.