Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support the length kernel on Binary Array #1465

Merged
merged 5 commits into from
Mar 24, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
242 changes: 178 additions & 64 deletions arrow/src/compute/kernels/length.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,59 +15,58 @@
// specific language governing permissions and limitations
// under the License.

//! Defines kernel for length of a string array
//! Defines kernel for length of string arrays and binary arrays

use crate::{
array::*,
buffer::Buffer,
datatypes::{ArrowNativeType, ArrowPrimitiveType},
};
use crate::{array::*, buffer::Buffer, datatypes::ArrowPrimitiveType};
use crate::{
datatypes::{DataType, Int32Type, Int64Type},
error::{ArrowError, Result},
};

fn unary_offsets_string<O, F>(
array: &GenericStringArray<O>,
data_type: DataType,
op: F,
macro_rules! unary_offsets {
($array: expr, $data_type: expr, $op: expr) => {{
let slice = $array.value_offsets();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

❤️ that is much nicer


let lengths = slice.windows(2).map(|offset| $op(offset[1] - offset[0]));

// JUSTIFICATION
// Benefit
// ~60% speedup
// Soundness
// `lengths` comes from a slice iterator (`windows(2)`) with a known size.
let buffer = unsafe { Buffer::from_trusted_len_iter(lengths) };

let null_bit_buffer = $array
.data_ref()
.null_buffer()
.map(|b| b.bit_slice($array.offset(), $array.len()));

let data = unsafe {
ArrayData::new_unchecked(
$data_type,
$array.len(),
None,
null_bit_buffer,
0,
vec![buffer],
vec![],
)
};
make_array(data)
}};
}

fn octet_length_binary<O: BinaryOffsetSizeTrait, T: ArrowPrimitiveType>(
array: &dyn Array,
) -> ArrayRef
where
O: StringOffsetSizeTrait + ArrowNativeType,
F: Fn(O) -> O,
T::Native: BinaryOffsetSizeTrait,
{
// note: offsets are stored as u8, but they can be interpreted as OffsetSize
let offsets = &array.data_ref().buffers()[0];
// this is a 30% improvement over iterating over u8s and building OffsetSize, which
// justifies the usage of `unsafe`.
let slice: &[O] = &unsafe { offsets.typed_data::<O>() }[array.offset()..];

let lengths = slice.windows(2).map(|offset| op(offset[1] - offset[0]));

// JUSTIFICATION
// Benefit
// ~60% speedup
// Soundness
// `values` come from a slice iterator with a known size.
let buffer = unsafe { Buffer::from_trusted_len_iter(lengths) };

let null_bit_buffer = array
.data_ref()
.null_buffer()
.map(|b| b.bit_slice(array.offset(), array.len()));

let data = unsafe {
ArrayData::new_unchecked(
data_type,
array.len(),
None,
null_bit_buffer,
0,
vec![buffer],
vec![],
)
};
make_array(data)
let array = array
.as_any()
.downcast_ref::<GenericBinaryArray<O>>()
.unwrap();
unary_offsets!(array, T::DATA_TYPE, |x| x)
}

fn octet_length<O: StringOffsetSizeTrait, T: ArrowPrimitiveType>(
Expand All @@ -80,7 +79,21 @@ where
.as_any()
.downcast_ref::<GenericStringArray<O>>()
.unwrap();
unary_offsets_string::<O, _>(array, T::DATA_TYPE, |x| x)
unary_offsets!(array, T::DATA_TYPE, |x| x)
}

/// Bit-length kernel for binary arrays with offset type `O`.
///
/// Downcasts `array` to `GenericBinaryArray<O>` and hands it to
/// `unary_offsets!`, scaling every per-value byte length by 8. The output
/// array is built with data type `T::DATA_TYPE`.
///
/// # Panics
///
/// Panics if `array` is not a `GenericBinaryArray<O>`.
fn bit_length_impl_binary<O: BinaryOffsetSizeTrait, T: ArrowPrimitiveType>(
    array: &dyn Array,
) -> ArrayRef
where
    T::Native: BinaryOffsetSizeTrait,
{
    let binary = array
        .as_any()
        .downcast_ref::<GenericBinaryArray<O>>()
        .unwrap();
    // Express the bits-per-byte factor in the offset type so the
    // multiplication below stays within `O`.
    let bits_per_byte = O::from_usize(8).unwrap();
    unary_offsets!(binary, T::DATA_TYPE, |byte_len| byte_len * bits_per_byte)
}

fn bit_length_impl<O: StringOffsetSizeTrait, T: ArrowPrimitiveType>(
Expand All @@ -94,34 +107,38 @@ where
.downcast_ref::<GenericStringArray<O>>()
.unwrap();
let bits_in_bytes = O::from_usize(8).unwrap();
unary_offsets_string::<O, _>(array, T::DATA_TYPE, |x| x * bits_in_bytes)
unary_offsets!(array, T::DATA_TYPE, |x| x * bits_in_bytes)
}

/// Returns an array of Int32/Int64 denoting the number of bytes in each string in the array.
/// Returns an array of Int32/Int64 denoting the number of bytes in each value in the array.
///
/// * this only accepts StringArray/Utf8 and LargeString/LargeUtf8
/// * this only accepts StringArray/Utf8, LargeString/LargeUtf8, BinaryArray and LargeBinaryArray
/// * length of null is null.
/// * length is in number of bytes
pub fn length(array: &dyn Array) -> Result<ArrayRef> {
    // Dispatch on the logical type: 32-bit offsets yield Int32 lengths,
    // 64-bit ("Large") offsets yield Int64 lengths.
    let lengths = match array.data_type() {
        DataType::Utf8 => octet_length::<i32, Int32Type>(array),
        DataType::LargeUtf8 => octet_length::<i64, Int64Type>(array),
        DataType::Binary => octet_length_binary::<i32, Int32Type>(array),
        DataType::LargeBinary => octet_length_binary::<i64, Int64Type>(array),
        unsupported => {
            return Err(ArrowError::ComputeError(format!(
                "length not supported for {:?}",
                unsupported
            )));
        }
    };
    Ok(lengths)
}

/// Returns an array of Int32/Int64 denoting the number of bits in each string in the array.
/// Returns an array of Int32/Int64 denoting the number of bits in each value in the array.
///
/// * this only accepts StringArray/Utf8 and LargeString/LargeUtf8
/// * this only accepts StringArray/Utf8, LargeString/LargeUtf8, BinaryArray and LargeBinaryArray
/// * bit_length of null is null.
/// * bit_length is in number of bits
pub fn bit_length(array: &dyn Array) -> Result<ArrayRef> {
match array.data_type() {
DataType::Utf8 => Ok(bit_length_impl::<i32, Int32Type>(array)),
DataType::LargeUtf8 => Ok(bit_length_impl::<i64, Int64Type>(array)),
DataType::Binary => Ok(bit_length_impl_binary::<i32, Int32Type>(array)),
DataType::LargeBinary => Ok(bit_length_impl_binary::<i64, Int64Type>(array)),
_ => Err(ArrowError::ComputeError(format!(
"bit_length not supported for {:?}",
array.data_type()
Expand All @@ -133,11 +150,11 @@ pub fn bit_length(array: &dyn Array) -> Result<ArrayRef> {
mod tests {
use super::*;

fn length_cases() -> Vec<(Vec<&'static str>, usize, Vec<i32>)> {
fn double_vec<T: Clone>(v: Vec<T>) -> Vec<T> {
[&v[..], &v[..]].concat()
}
/// Returns a new vector holding the elements of `v` followed by a second
/// copy of the same elements, in the same order.
fn double_vec<T: Clone>(v: Vec<T>) -> Vec<T> {
    v.iter().chain(v.iter()).cloned().collect()
}

fn length_cases_string() -> Vec<(Vec<&'static str>, usize, Vec<i32>)> {
// a large array
let mut values = vec!["one", "on", "o", ""];
let mut expected = vec![3, 2, 1, 0];
Expand All @@ -154,10 +171,21 @@ mod tests {
]
}

/// Test helper: builds a `GenericBinaryArray<$offset_ty>` from `$value`, runs
/// `$kernel` on it, downcasts the output to `$result_ty`, and asserts that its
/// array data equals that of `$expected` converted into `$result_ty`.
///
/// Expands to a block that uses `?` and evaluates to `Ok(())`, so it must be
/// the tail expression of a function returning `Result<()>`.
macro_rules! length_binary_helper {
    ($offset_ty: ty, $result_ty: ty, $kernel: ident, $value: expr, $expected: expr) => {{
        let input = GenericBinaryArray::<$offset_ty>::from($value);
        let output = $kernel(&input)?;
        let actual = output.as_any().downcast_ref::<$result_ty>().unwrap();
        let wanted: $result_ty = $expected.into();
        assert_eq!(wanted.data(), actual.data());
        Ok(())
    }};
}

#[test]
#[cfg_attr(miri, ignore)] // running forever
fn length_test_string() -> Result<()> {
length_cases()
length_cases_string()
.into_iter()
.try_for_each(|(input, len, expected)| {
let array = StringArray::from(input);
Expand All @@ -174,7 +202,7 @@ mod tests {
#[test]
#[cfg_attr(miri, ignore)] // running forever
fn length_test_large_string() -> Result<()> {
length_cases()
length_cases_string()
.into_iter()
.try_for_each(|(input, len, expected)| {
let array = LargeStringArray::from(input);
Expand All @@ -188,9 +216,23 @@ mod tests {
})
}

/// `length` on a `BinaryArray` without nulls; the last value is a byte pair
/// that is not valid UTF-8.
#[test]
fn length_test_binary() -> Result<()> {
    let input: Vec<&[u8]> = vec![b"zero", b"one", &[0xff, 0xf8]];
    let byte_lengths: Vec<i32> = vec![4, 3, 2];
    length_binary_helper!(i32, Int32Array, length, input, byte_lengths)
}

#[test]
fn length_test_large_binary() -> Result<()> {
let value: Vec<&[u8]> = vec![b"zero", &[0xff, 0xf8], b"two"];
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💯 for non UTF8

let result: Vec<i64> = vec![4, 2, 3];
length_binary_helper!(i64, Int64Array, length, value, result)
}

type OptionStr = Option<&'static str>;

fn length_null_cases() -> Vec<(Vec<OptionStr>, usize, Vec<Option<i32>>)> {
fn length_null_cases_string() -> Vec<(Vec<OptionStr>, usize, Vec<Option<i32>>)> {
vec![(
vec![Some("one"), None, Some("three"), Some("four")],
4,
Expand All @@ -200,7 +242,7 @@ mod tests {

#[test]
fn length_null_string() -> Result<()> {
length_null_cases()
length_null_cases_string()
.into_iter()
.try_for_each(|(input, len, expected)| {
let array = StringArray::from(input);
Expand All @@ -216,7 +258,7 @@ mod tests {

#[test]
fn length_null_large_string() -> Result<()> {
length_null_cases()
length_null_cases_string()
.into_iter()
.try_for_each(|(input, len, expected)| {
let array = LargeStringArray::from(input);
Expand All @@ -235,6 +277,22 @@ mod tests {
})
}

/// `length` on a `BinaryArray` containing a null: the null slot must stay
/// null in the output.
#[test]
fn length_null_binary() -> Result<()> {
    let input: Vec<Option<&[u8]>> =
        vec![Some(b"zero"), None, Some(&[0xff, 0xf8]), Some(b"three")];
    let byte_lengths: Vec<Option<i32>> = vec![Some(4), None, Some(2), Some(5)];
    length_binary_helper!(i32, Int32Array, length, input, byte_lengths)
}

/// `length` on a 64-bit-offset binary array containing a null: the result is
/// an `Int64Array` with the null slot preserved.
#[test]
fn length_null_large_binary() -> Result<()> {
    let input: Vec<Option<&[u8]>> =
        vec![Some(&[0xff, 0xf8]), None, Some(b"two"), Some(b"three")];
    let byte_lengths: Vec<Option<i64>> = vec![Some(2), None, Some(3), Some(5)];
    length_binary_helper!(i64, Int64Array, length, input, byte_lengths)
}

/// Tests that length is not valid for u64.
#[test]
fn length_wrong_type() {
Expand All @@ -257,11 +315,22 @@ mod tests {
Ok(())
}

fn bit_length_cases() -> Vec<(Vec<&'static str>, usize, Vec<i32>)> {
fn double_vec<T: Clone>(v: Vec<T>) -> Vec<T> {
[&v[..], &v[..]].concat()
}
/// `length` on a sliced `BinaryArray`: only the three values selected by
/// `slice(1, 3)` may contribute to the output.
#[test]
fn binary_length_offsets() -> Result<()> {
    let expected = Int32Array::from(vec![Some(1), Some(2), None]);

    let source: Vec<Option<&[u8]>> =
        vec![Some(b"hello"), Some(b" "), Some(&[0xff, 0xf8]), None];
    let full = BinaryArray::from(source);
    // Drop the first value so the kernel runs on an array with a non-zero offset.
    let sliced = full.slice(1, 3);

    let lengths = length(sliced.as_ref())?;
    let actual: &Int32Array = as_primitive_array(&lengths);
    assert_eq!(&expected, actual);

    Ok(())
}

fn bit_length_cases() -> Vec<(Vec<&'static str>, usize, Vec<i32>)> {
// a large array
let mut values = vec!["one", "on", "o", ""];
let mut expected = vec![24, 16, 8, 0];
Expand Down Expand Up @@ -312,6 +381,20 @@ mod tests {
})
}

/// `bit_length` on a `BinaryArray` without nulls: each result is the byte
/// length times eight.
#[test]
fn bit_length_binary() -> Result<()> {
    let input: Vec<&[u8]> = vec![b"one", &[0xff, 0xf8], b"three"];
    let bit_lengths: Vec<i32> = vec![24, 16, 40];
    length_binary_helper!(i32, Int32Array, bit_length, input, bit_lengths)
}

/// `bit_length` on a 64-bit-offset binary array without nulls; the result is
/// an `Int64Array`.
#[test]
fn bit_length_large_binary() -> Result<()> {
    let input: Vec<&[u8]> = vec![b"zero", b" ", &[0xff, 0xf8]];
    let bit_lengths: Vec<i64> = vec![32, 8, 16];
    length_binary_helper!(i64, Int64Array, bit_length, input, bit_lengths)
}

fn bit_length_null_cases() -> Vec<(Vec<OptionStr>, usize, Vec<Option<i32>>)> {
vec![(
vec![Some("one"), None, Some("three"), Some("four")],
Expand Down Expand Up @@ -357,6 +440,22 @@ mod tests {
})
}

/// `bit_length` on a `BinaryArray` containing a null: the null slot must stay
/// null in the output.
#[test]
fn bit_length_null_binary() -> Result<()> {
    let input: Vec<Option<&[u8]>> =
        vec![Some(b"one"), None, Some(b"three"), Some(&[0xff, 0xf8])];
    let bit_lengths: Vec<Option<i32>> = vec![Some(24), None, Some(40), Some(16)];
    length_binary_helper!(i32, Int32Array, bit_length, input, bit_lengths)
}

/// `bit_length` on a 64-bit-offset binary array containing a null: the result
/// is an `Int64Array` with the null slot preserved.
#[test]
fn bit_length_null_large_binary() -> Result<()> {
    let input: Vec<Option<&[u8]>> =
        vec![Some(b"one"), None, Some(&[0xff, 0xf8]), Some(b"four")];
    let bit_lengths: Vec<Option<i64>> = vec![Some(24), None, Some(16), Some(32)];
    length_binary_helper!(i64, Int64Array, bit_length, input, bit_lengths)
}

/// Tests that bit_length is not valid for u64.
#[test]
fn bit_length_wrong_type() {
Expand All @@ -367,7 +466,7 @@ mod tests {

/// Tests with an offset
#[test]
fn bit_length_offsets() -> Result<()> {
fn bit_length_offsets_string() -> Result<()> {
let a = StringArray::from(vec![Some("hello"), Some(" "), Some("world"), None]);
let b = a.slice(1, 3);
let result = bit_length(b.as_ref())?;
Expand All @@ -378,4 +477,19 @@ mod tests {

Ok(())
}

/// `bit_length` on a sliced `BinaryArray`: the slice starts at an empty value,
/// so the first expected result is zero bits.
#[test]
fn bit_length_offsets_binary() -> Result<()> {
    let expected = Int32Array::from(vec![Some(0), Some(40), None]);

    let source: Vec<Option<&[u8]>> =
        vec![Some(b"hello"), Some(&[]), Some(b"world"), None];
    let full = BinaryArray::from(source);
    // Drop the first value so the kernel runs on an array with a non-zero offset.
    let sliced = full.slice(1, 3);

    let bits = bit_length(sliced.as_ref())?;
    let actual: &Int32Array = as_primitive_array(&bits);
    assert_eq!(&expected, actual);

    Ok(())
}
}