diff --git a/arrow-array/src/array/list_view_array.rs b/arrow-array/src/array/list_view_array.rs new file mode 100644 index 00000000000..905d837bfd0 --- /dev/null +++ b/arrow-array/src/array/list_view_array.rs @@ -0,0 +1,1254 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::array::{make_array, print_long_array}; +use crate::builder::{GenericListViewBuilder, PrimitiveBuilder}; +use crate::iterator::GenericListViewArrayIter; +use crate::{ + new_empty_array, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType, FixedSizeListArray, + OffsetSizeTrait, +}; +use arrow_buffer::{NullBuffer, ScalarBuffer}; +use arrow_data::{ArrayData, ArrayDataBuilder}; +use arrow_schema::{ArrowError, DataType, FieldRef}; +use std::any::Any; +use std::ops::Add; +use std::sync::Arc; + +/// A [`GenericListViewArray`] of variable size lists, storing offsets as `i32`. +/// +// See [`ListViewBuilder`](crate::builder::ListViewBuilder) for how to construct a [`ListViewArray`] +pub type ListViewArray = GenericListViewArray; + +/// A [`GenericListViewArray`] of variable size lists, storing offsets as `i64`. +/// +// See [`LargeListViewArray`](crate::builder::LargeListViewBuilder) for how to construct a [`LargeListViewArray`] +pub type LargeListViewArray = GenericListViewArray; + +/// +/// Different than [`crate::GenericListArray`] as it stores both an offset and length +/// meaning that take / filter operations can be implemented without copying the underlying data. +/// +/// [Variable-size List Layout: ListView Layout]: https://arrow.apache.org/docs/format/Columnar.html#listview-layout +pub struct GenericListViewArray { + data_type: DataType, + nulls: Option, + values: ArrayRef, + value_offsets: ScalarBuffer, + value_sizes: ScalarBuffer, +} + +impl Clone for GenericListViewArray { + fn clone(&self) -> Self { + Self { + data_type: self.data_type.clone(), + nulls: self.nulls.clone(), + values: self.values.clone(), + value_offsets: self.value_offsets.clone(), + value_sizes: self.value_sizes.clone(), + } + } +} + +impl GenericListViewArray { + /// The data type constructor of listview array. + /// The input is the schema of the child array and + /// the output is the [`DataType`], ListView or LargeListView. + pub const DATA_TYPE_CONSTRUCTOR: fn(FieldRef) -> DataType = if OffsetSize::IS_LARGE { + DataType::LargeListView + } else { + DataType::ListView + }; + + /// Create a new [`GenericListViewArray`] from the provided parts + /// + /// # Errors + /// + /// Errors if + /// + /// * `offsets.len() != sizes.len()` + /// * `offsets.len() != nulls.len()` + /// * `offsets.last() > values.len()` + /// * `!field.is_nullable() && values.is_nullable()` + /// * `field.data_type() != values.data_type()` + /// * `0 <= offsets[i] <= length of the child array` + /// * `0 <= offsets[i] + size[i] <= length of the child array` + pub fn try_new( + field: FieldRef, + offsets: ScalarBuffer, + sizes: ScalarBuffer, + values: ArrayRef, + nulls: Option, + ) -> Result { + + let len = offsets.len(); + if let Some(n) = nulls.as_ref() { + if n.len() != len { + return Err(ArrowError::InvalidArgumentError(format!( + "Incorrect length of null buffer for {}ListViewArray, expected {len} got {}", + OffsetSize::PREFIX, + n.len(), + ))); + } + } + if len != sizes.len() { + return Err(ArrowError::InvalidArgumentError(format!( + "Length of offsets buffer and sizes buffer must be equal for {}ListViewArray, got {} and {}", + OffsetSize::PREFIX, len, sizes.len() + ))); + } + + for i in 0..offsets.len() { + let length = values.len(); + let offset = offsets[i].as_usize(); + let size = sizes[i].as_usize(); + if offset > length { + return Err(ArrowError::InvalidArgumentError(format!( + "Invalid offset value for {}ListViewArray, offset: {}, length: {}", + OffsetSize::PREFIX, + offset, + length + ))); + } + if offset.saturating_add(size) > values.len() { + return Err(ArrowError::InvalidArgumentError(format!( + "Invalid offset and size values for {}ListViewArray, offset: {}, size: {}, values.len(): {}", + OffsetSize::PREFIX, offset, size, values.len() + ))); + } + } + + + + if !field.is_nullable() && values.is_nullable() { + return Err(ArrowError::InvalidArgumentError(format!( + "Non-nullable field of {}ListViewArray {:?} cannot contain nulls", + OffsetSize::PREFIX, + field.name() + ))); + } + + if field.data_type() != values.data_type() { + return Err(ArrowError::InvalidArgumentError(format!( + "{}ListViewArray expected data type {} got {} for {:?}", + OffsetSize::PREFIX, + field.data_type(), + values.data_type(), + field.name() + ))); + } + + Ok(Self { + data_type: Self::DATA_TYPE_CONSTRUCTOR(field), + nulls, + values, + value_offsets: offsets, + value_sizes: sizes, + }) + } + + /// Create a new [`GenericListViewArray`] from the provided parts + /// + /// # Panics + /// + /// Panics if [`Self::try_new`] returns an error + pub fn new( + field: FieldRef, + offsets: ScalarBuffer, + sizes: ScalarBuffer, + values: ArrayRef, + nulls: Option, + ) -> Self { + Self::try_new(field, offsets, sizes, values, nulls).unwrap() + } + + /// Create a new [`GenericListViewArray`] of length `len` where all values are null + pub fn new_null(field: FieldRef, len: usize) -> Self { + let values = new_empty_array(field.data_type()); + Self { + data_type: Self::DATA_TYPE_CONSTRUCTOR(field), + nulls: Some(NullBuffer::new_null(len)), + value_offsets: ScalarBuffer::new_zeroed(len), + value_sizes: ScalarBuffer::new_zeroed(len), + values, + } + } + + /// Deconstruct this array into its constituent parts + pub fn into_parts( + self, + ) -> ( + FieldRef, + ScalarBuffer, + ScalarBuffer, + ArrayRef, + Option, + ) { + let f = match self.data_type { + DataType::ListView(f) | DataType::LargeListView(f) => f, + _ => unreachable!(), + }; + ( + f, + self.value_offsets, + self.value_sizes, + self.values, + self.nulls, + ) + } + + /// Returns a reference to the offsets of this list + /// + /// Unlike [`Self::value_offsets`] this returns the [`ScalarBuffer`] + /// allowing for zero-copy cloning + #[inline] + pub fn offsets(&self) -> &ScalarBuffer { + &self.value_offsets + } + + /// Returns a reference to the values of this list + #[inline] + pub fn values(&self) -> &ArrayRef { + &self.values + } + + /// Returns a reference to the sizes of this list + /// + /// Unlike [`Self::value_sizes`] this returns the [`ScalarBuffer`] + /// allowing for zero-copy cloning + #[inline] + pub fn sizes(&self) -> &ScalarBuffer { + &self.value_sizes + } + + /// Returns a clone of the value type of this list. + pub fn value_type(&self) -> DataType { + self.values.data_type().clone() + } + + /// Returns ith value of this list view array. + /// # Safety + /// Caller must ensure that the index is within the array bounds + pub unsafe fn value_unchecked(&self, i: usize) -> ArrayRef { + let end = self.value_offsets().get_unchecked(i + 1).as_usize(); + let start = self.value_offsets().get_unchecked(i).as_usize(); + self.values.slice(start, end - start) + } + + /// Returns ith value of this list view array. + pub fn value(&self, i: usize) -> ArrayRef { + let offset = self.value_offsets()[i].as_usize(); + let length = self.value_sizes()[i].as_usize(); + self.values.slice(offset, length) + } + + /// Returns the offset values in the offsets buffer + #[inline] + pub fn value_offsets(&self) -> &[OffsetSize] { + &self.value_offsets + } + + /// Returns the sizes values in the offsets buffer + #[inline] + pub fn value_sizes(&self) -> &[OffsetSize] { + &self.value_sizes + } + + /// Returns the length for value at index `i`. + #[inline] + pub fn value_length(&self, i: usize) -> OffsetSize { + self.value_sizes[i] + } + + /// constructs a new iterator + pub fn iter<'a>(&'a self) -> GenericListViewArrayIter<'a, OffsetSize> { + GenericListViewArrayIter::<'a, OffsetSize>::new(self) + } + + #[inline] + fn get_type(data_type: &DataType) -> Option<&DataType> { + match (OffsetSize::IS_LARGE, data_type) { + (true, DataType::LargeListView(child)) | (false, DataType::ListView(child)) => { + Some(child.data_type()) + } + _ => None, + } + } + + /// Returns a zero-copy slice of this array with the indicated offset and length. + pub fn slice(&self, offset: usize, length: usize) -> Self { + Self { + data_type: self.data_type.clone(), + nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)), + values: self.values.clone(), + value_offsets: self.value_offsets.slice(offset, length), + value_sizes: self.value_sizes.slice(offset, length), + } + } + + /// Creates a [`GenericListViewArray`] from an iterator of primitive values + /// + pub fn from_iter_primitive(iter: I) -> Self + where + T: ArrowPrimitiveType, + P: IntoIterator::Native>>, + I: IntoIterator>, + { + let iter = iter.into_iter(); + let size_hint = iter.size_hint().0; + let mut builder = + GenericListViewBuilder::with_capacity(PrimitiveBuilder::::new(), size_hint); + + for i in iter { + match i { + Some(p) => { + let mut size = 0usize; + for t in p { + builder.values().append_option(t); + size = size.add(1); + } + builder.append(true, size); + } + None => builder.append(false, 0), + } + } + builder.finish() + } +} + +impl<'a, OffsetSize: OffsetSizeTrait> ArrayAccessor for &'a GenericListViewArray { + type Item = ArrayRef; + + fn value(&self, index: usize) -> Self::Item { + GenericListViewArray::value(self, index) + } + + unsafe fn value_unchecked(&self, index: usize) -> Self::Item { + GenericListViewArray::value(self, index) + } +} + +impl Array for GenericListViewArray { + fn as_any(&self) -> &dyn Any { + self + } + + fn to_data(&self) -> ArrayData { + self.clone().into() + } + + fn into_data(self) -> ArrayData { + self.into() + } + + fn data_type(&self) -> &DataType { + &self.data_type + } + + fn slice(&self, offset: usize, length: usize) -> ArrayRef { + Arc::new(self.slice(offset, length)) + } + + fn len(&self) -> usize { + self.values.len() + } + + fn is_empty(&self) -> bool { + self.values.is_empty() + } + + fn offset(&self) -> usize { + 0 + } + + fn nulls(&self) -> Option<&NullBuffer> { + self.nulls.as_ref() + } + + fn get_buffer_memory_size(&self) -> usize { + let mut size = self.values.get_buffer_memory_size(); + size += self.value_offsets.inner().capacity(); + size += self.value_sizes.inner().capacity(); + if let Some(n) = self.nulls.as_ref() { + size += n.buffer().capacity(); + } + size + } + + fn get_array_memory_size(&self) -> usize { + let mut size = std::mem::size_of::() + self.values.get_array_memory_size(); + size += self.value_offsets.inner().capacity(); + size += self.value_sizes.inner().capacity(); + if let Some(n) = self.nulls.as_ref() { + size += n.buffer().capacity(); + } + size + } +} + +impl std::fmt::Debug for GenericListViewArray { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + let prefix = OffsetSize::PREFIX; + write!(f, "{prefix}ListViewArray\n[\n")?; + print_long_array(self, f, |array, index, f| { + std::fmt::Debug::fmt(&array.value(index), f) + })?; + write!(f, "]") + } +} + +impl From> for ArrayData { + fn from(array: GenericListViewArray) -> Self { + let len = array.len(); + let builder = ArrayDataBuilder::new(array.data_type) + .len(len) + .nulls(array.nulls) + .buffers(vec![ + array.value_offsets.into_inner(), + array.value_sizes.into_inner(), + ]) + .child_data(vec![array.values.to_data()]); + + unsafe { builder.build_unchecked() } + } +} + +impl From for GenericListViewArray { + fn from(data: ArrayData) -> Self { + Self::try_new_from_array_data(data) + .expect("Expected infallible creation of GenericListViewArray from ArrayDataRef failed") + } +} + +impl From for GenericListViewArray { + fn from(value: FixedSizeListArray) -> Self { + let (field, size) = match value.data_type() { + DataType::FixedSizeList(f, size) => (f, *size as usize), + _ => unreachable!(), + }; + let iter = std::iter::repeat(size).take(value.len()); + let mut offsets = Vec::with_capacity(iter.size_hint().0); + offsets.push(OffsetSize::usize_as(0)); + let mut acc = 0_usize; + let iter = std::iter::repeat(size).take(value.len()); + let mut sizes = Vec::with_capacity(iter.size_hint().0); + for size in iter { + acc = acc.checked_add(size).expect("usize overflow"); + offsets.push(OffsetSize::usize_as(acc)); + sizes.push(OffsetSize::usize_as(size)); + } + OffsetSize::from_usize(acc).expect("offset overflow"); + let sizes = ScalarBuffer::from(sizes); + let offsets = ScalarBuffer::from(offsets); + Self { + data_type: Self::DATA_TYPE_CONSTRUCTOR(field.clone()), + nulls: value.nulls().cloned(), + values: value.values().clone(), + value_offsets: offsets, + value_sizes: sizes, + } + } +} + +impl GenericListViewArray { + fn try_new_from_array_data(data: ArrayData) -> Result { + if data.buffers().len() != 2 { + return Err(ArrowError::InvalidArgumentError(format!( + "ListViewArray data should contain two buffer (value offsets & value size), had {}", + data.buffers().len() + ))); + } + + if data.child_data().len() != 1 { + return Err(ArrowError::InvalidArgumentError(format!( + "ListViewArray should contain a single child array (values array), had {}", + data.child_data().len() + ))); + } + + let values = data.child_data()[0].clone(); + + if let Some(child_data_type) = Self::get_type(data.data_type()) { + if values.data_type() != child_data_type { + return Err(ArrowError::InvalidArgumentError(format!( + "[Large]ListViewArray's child datatype {:?} does not \ + correspond to the List's datatype {:?}", + values.data_type(), + child_data_type + ))); + } + } else { + return Err(ArrowError::InvalidArgumentError(format!( + "[Large]ListViewArray's datatype must be [Large]ListViewArray(). It is {:?}", + data.data_type() + ))); + } + + let values = make_array(values); + // SAFETY: + // ArrayData is valid, and verified type above + let value_offsets = ScalarBuffer::new(data.buffers()[0].clone(), data.offset(), data.len()); + let value_sizes = ScalarBuffer::new(data.buffers()[1].clone(), data.offset(), data.len()); + + Ok(Self { + data_type: data.data_type().clone(), + nulls: data.nulls().cloned(), + values, + value_offsets, + value_sizes, + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::builder::{FixedSizeListBuilder, Int32Builder, ListViewBuilder}; + use crate::cast::AsArray; + use crate::types::Int32Type; + use crate::{Int32Array, Int64Array}; + use arrow_buffer::{bit_util, Buffer, ScalarBuffer}; + use arrow_schema::DataType::LargeListView; + use arrow_schema::Field; + + fn create_from_buffers() -> ListViewArray { + // [[0, 1, 2], [3, 4, 5], [6, 7]] + let values = Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7]); + let offsets = ScalarBuffer::from(vec![0, 3, 6]); + let field = Arc::new(Field::new("item", DataType::Int32, true)); + let sizes = ScalarBuffer::from(vec![3, 3, 2]); + + ListViewArray::new(field, offsets, sizes, Arc::new(values), None) + } + + #[test] + fn test_from_iter_primitive() { + let data = vec![ + Some(vec![Some(0), Some(1), Some(2)]), + Some(vec![Some(3), Some(4), Some(5)]), + Some(vec![Some(6), Some(7)]), + ]; + let list_array = ListViewArray::from_iter_primitive::(data); + let another = create_from_buffers(); + assert_eq!(list_array, another) + } + + #[test] + fn test_empty_list_view_array() { + // Construct an empty value array + let value_data = ArrayData::builder(DataType::Int32) + .len(0) + .add_buffer(Buffer::from([])) + .build() + .unwrap(); + + // Construct an empty offset buffer + let value_offsets = Buffer::from([]); + // Construct an empty size buffer + let value_sizes = Buffer::from([]); + + // Construct a list view array from the above two + let list_data_type = + DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = ArrayData::builder(list_data_type) + .len(0) + .add_buffer(value_offsets) + .add_buffer(value_sizes) + .add_child_data(value_data) + .build() + .unwrap(); + + let list_array = ListViewArray::from(list_data); + assert_eq!(list_array.len(), 0) + } + + #[test] + fn test_list_view_array() { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int32) + .len(8) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) + .build() + .unwrap(); + + // Construct a buffer for value offsets, for the nested array: + // [[0, 1, 2], [3, 4, 5], [6, 7]] + let value_offsets = Buffer::from_slice_ref([0, 3, 6, 8]); + // Construct a buffer for value sizes, for the nested array: + let value_sizes = Buffer::from_slice_ref([3, 3, 2, 0]); + + // Construct a list view array from the above two + let list_data_type = + DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = ArrayData::builder(list_data_type.clone()) + .len(3) + .add_buffer(value_offsets.clone()) + .add_buffer(value_sizes.clone()) + .add_child_data(value_data.clone()) + .build() + .unwrap(); + let list_array = ListViewArray::from(list_data); + + let values = list_array.values(); + assert_eq!(value_data, values.to_data()); + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(3, list_array.len()); + assert_eq!(0, list_array.null_count()); + assert_eq!(6, list_array.value_offsets()[2]); + assert_eq!(2, list_array.value_sizes()[2]); + assert_eq!(2, list_array.value_length(2)); + assert_eq!( + 0, + list_array + .value(0) + .as_any() + .downcast_ref::() + .unwrap() + .value(0) + ); + assert_eq!( + 0, + unsafe { list_array.value_unchecked(0) } + .as_any() + .downcast_ref::() + .unwrap() + .value(0) + ); + for i in 0..3 { + assert!(list_array.is_valid(i)); + assert!(!list_array.is_null(i)); + } + + // Now test with a non-zero offset (skip first element) + // [[3, 4, 5], [6, 7]] + let list_data = ArrayData::builder(list_data_type) + .len(2) + .offset(1) + .add_buffer(value_offsets) + .add_buffer(value_sizes) + .add_child_data(value_data.clone()) + .build() + .unwrap(); + let list_array = ListViewArray::from(list_data); + + let values = list_array.values(); + assert_eq!(value_data, values.to_data()); + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(2, list_array.len()); + assert_eq!(0, list_array.null_count()); + assert_eq!(6, list_array.value_offsets()[1]); + assert_eq!(2, list_array.value_sizes()[1]); + assert_eq!(2, list_array.value_length(1)); + assert_eq!( + 3, + list_array + .value(0) + .as_any() + .downcast_ref::() + .unwrap() + .value(0) + ); + assert_eq!( + 3, + unsafe { list_array.value_unchecked(0) } + .as_any() + .downcast_ref::() + .unwrap() + .value(0) + ); + } + + #[test] + fn test_large_list_view_array() { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int32) + .len(8) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) + .build() + .unwrap(); + + // Construct a buffer for value offsets, for the nested array: + // [[0, 1, 2], [3, 4, 5], [6, 7]] + let value_offsets = Buffer::from_slice_ref([0i64, 3, 6, 8]); + + // Construct a buffer for value sizes, for the nested array: + let value_sizes = Buffer::from_slice_ref([3i64, 3, 2, 0]); + + // Construct a list view array from the above two + let list_data_type = LargeListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = ArrayData::builder(list_data_type.clone()) + .len(3) + .add_buffer(value_offsets.clone()) + .add_buffer(value_sizes.clone()) + .add_child_data(value_data.clone()) + .build() + .unwrap(); + let list_array = LargeListViewArray::from(list_data); + + let values = list_array.values(); + assert_eq!(value_data, values.to_data()); + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(3, list_array.len()); + assert_eq!(0, list_array.null_count()); + assert_eq!(6, list_array.value_offsets()[2]); + assert_eq!(2, list_array.value_sizes()[2]); + assert_eq!(2, list_array.value_length(2)); + assert_eq!( + 0, + list_array + .value(0) + .as_any() + .downcast_ref::() + .unwrap() + .value(0) + ); + assert_eq!( + 0, + unsafe { list_array.value_unchecked(0) } + .as_any() + .downcast_ref::() + .unwrap() + .value(0) + ); + for i in 0..3 { + assert!(list_array.is_valid(i)); + assert!(!list_array.is_null(i)); + } + + // Now test with a non-zero offset + // [[3, 4, 5], [6, 7]] + let list_data = ArrayData::builder(list_data_type) + .len(2) + .offset(1) + .add_buffer(value_offsets) + .add_buffer(value_sizes) + .add_child_data(value_data.clone()) + .build() + .unwrap(); + let list_array = LargeListViewArray::from(list_data); + + let values = list_array.values(); + assert_eq!(value_data, values.to_data()); + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(2, list_array.len()); + assert_eq!(0, list_array.null_count()); + assert_eq!(6, list_array.value_offsets()[1]); + assert_eq!(2, list_array.value_sizes()[1]); + assert_eq!(2, list_array.value_length(1)); + assert_eq!( + 3, + list_array + .value(0) + .as_any() + .downcast_ref::() + .unwrap() + .value(0) + ); + assert_eq!( + 3, + unsafe { list_array.value_unchecked(0) } + .as_any() + .downcast_ref::() + .unwrap() + .value(0) + ); + } + + #[test] + fn test_list_view_array_slice() { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int32) + .len(10) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) + .build() + .unwrap(); + + // Construct a buffer for value offsets, for the nested array: + // [[0, 1], null, null, [2, 3], [4, 5], null, [6, 7, 8], null, [9]] + let value_offsets = Buffer::from_slice_ref([0, 2, 2, 2, 4, 6, 6, 9, 9, 10]); + // Construct a buffer for value sizes + let value_sizes = Buffer::from_slice_ref([2, 0, 0, 2, 2, 0, 3, 0, 1]); + // 01011001 00000001 + let mut null_bits: [u8; 2] = [0; 2]; + bit_util::set_bit(&mut null_bits, 0); + bit_util::set_bit(&mut null_bits, 3); + bit_util::set_bit(&mut null_bits, 4); + bit_util::set_bit(&mut null_bits, 6); + bit_util::set_bit(&mut null_bits, 8); + + // Construct a list view array from the above two + let list_data_type = + DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = ArrayData::builder(list_data_type) + .len(9) + .add_buffer(value_offsets) + .add_buffer(value_sizes) + .add_child_data(value_data.clone()) + .null_bit_buffer(Some(Buffer::from(null_bits))) + .build() + .unwrap(); + let list_array = ListViewArray::from(list_data); + + let values = list_array.values(); + assert_eq!(value_data, values.to_data()); + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(9, list_array.len()); + assert_eq!(4, list_array.null_count()); + assert_eq!(2, list_array.value_offsets()[3]); + assert_eq!(2, list_array.value_sizes()[3]); + assert_eq!(2, list_array.value_length(3)); + + let sliced_array = list_array.slice(1, 6); + assert_eq!(6, sliced_array.len()); + assert_eq!(3, sliced_array.null_count()); + + for i in 0..sliced_array.len() { + if bit_util::get_bit(&null_bits, 1 + i) { + assert!(sliced_array.is_valid(i)); + } else { + assert!(sliced_array.is_null(i)); + } + } + + // Check offset and length for each non-null value. + let sliced_list_array = sliced_array + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(2, sliced_list_array.value_offsets()[2]); + assert_eq!(2, sliced_list_array.value_sizes()[2]); + assert_eq!(2, sliced_list_array.value_length(2)); + + assert_eq!(4, sliced_list_array.value_offsets()[3]); + assert_eq!(2, sliced_list_array.value_sizes()[3]); + assert_eq!(2, sliced_list_array.value_length(3)); + + assert_eq!(6, sliced_list_array.value_offsets()[5]); + assert_eq!(3, sliced_list_array.value_sizes()[5]); + assert_eq!(3, sliced_list_array.value_length(5)); + } + + #[test] + fn test_large_list_view_array_slice() { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int32) + .len(10) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) + .build() + .unwrap(); + + // Construct a buffer for value offsets, for the nested array: + // [[0, 1], null, null, [2, 3], [4, 5], null, [6, 7, 8], null, [9]] + let value_offsets = Buffer::from_slice_ref([0i64, 2, 2, 2, 4, 6, 6, 9, 9, 10]); + // Construct a buffer for value sizes + let value_sizes = Buffer::from_slice_ref([2i64, 0, 0, 2, 2, 0, 3, 0, 1]); + // 01011001 00000001 + let mut null_bits: [u8; 2] = [0; 2]; + bit_util::set_bit(&mut null_bits, 0); + bit_util::set_bit(&mut null_bits, 3); + bit_util::set_bit(&mut null_bits, 4); + bit_util::set_bit(&mut null_bits, 6); + bit_util::set_bit(&mut null_bits, 8); + + // Construct a list view array from the above two + let list_data_type = LargeListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = ArrayData::builder(list_data_type) + .len(9) + .add_buffer(value_offsets) + .add_buffer(value_sizes) + .add_child_data(value_data.clone()) + .null_bit_buffer(Some(Buffer::from(null_bits))) + .build() + .unwrap(); + let list_array = LargeListViewArray::from(list_data); + + let values = list_array.values(); + assert_eq!(value_data, values.to_data()); + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(9, list_array.len()); + assert_eq!(4, list_array.null_count()); + assert_eq!(2, list_array.value_offsets()[3]); + assert_eq!(2, list_array.value_sizes()[3]); + assert_eq!(2, list_array.value_length(3)); + + let sliced_array = list_array.slice(1, 6); + assert_eq!(6, sliced_array.len()); + assert_eq!(3, sliced_array.null_count()); + + for i in 0..sliced_array.len() { + if bit_util::get_bit(&null_bits, 1 + i) { + assert!(sliced_array.is_valid(i)); + } else { + assert!(sliced_array.is_null(i)); + } + } + + // Check offset and length for each non-null value. + let sliced_list_array = sliced_array + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(2, sliced_list_array.value_offsets()[2]); + assert_eq!(2, sliced_list_array.value_length(2)); + assert_eq!(2, sliced_list_array.value_sizes()[2]); + + assert_eq!(4, sliced_list_array.value_offsets()[3]); + assert_eq!(2, sliced_list_array.value_length(3)); + assert_eq!(2, sliced_list_array.value_sizes()[3]); + + assert_eq!(6, sliced_list_array.value_offsets()[5]); + assert_eq!(3, sliced_list_array.value_length(5)); + assert_eq!(2, sliced_list_array.value_sizes()[3]); + } + + #[test] + #[should_panic(expected = "index out of bounds: the len is 10 but the index is 10")] + fn test_list_view_array_index_out_of_bound() { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int32) + .len(10) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) + .build() + .unwrap(); + + // Construct a buffer for value offsets, for the nested array: + // [[0, 1], null, null, [2, 3], [4, 5], null, [6, 7, 8], null, [9]] + let value_offsets = Buffer::from_slice_ref([0i64, 2, 2, 2, 4, 6, 6, 9, 9, 10]); + // Construct a buffer for value sizes + let value_sizes = Buffer::from_slice_ref([2i64, 0, 0, 2, 2, 0, 3, 0, 1]); + // 01011001 00000001 + let mut null_bits: [u8; 2] = [0; 2]; + bit_util::set_bit(&mut null_bits, 0); + bit_util::set_bit(&mut null_bits, 3); + bit_util::set_bit(&mut null_bits, 4); + bit_util::set_bit(&mut null_bits, 6); + bit_util::set_bit(&mut null_bits, 8); + + // Construct a list array from the above two + let list_data_type = LargeListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = ArrayData::builder(list_data_type) + .len(9) + .add_buffer(value_offsets) + .add_buffer(value_sizes) + .add_child_data(value_data) + .null_bit_buffer(Some(Buffer::from(null_bits))) + .build() + .unwrap(); + let list_array = LargeListViewArray::from(list_data); + assert_eq!(9, list_array.len()); + + list_array.value(10); + } + #[test] + #[should_panic( + expected = "ListViewArray data should contain two buffer (value offsets & value size), had 0" + )] + #[cfg(not(feature = "force_validate"))] + fn test_list_view_array_invalid_buffer_len() { + let value_data = unsafe { + ArrayData::builder(DataType::Int32) + .len(8) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) + .build_unchecked() + }; + let list_data_type = + DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = unsafe { + ArrayData::builder(list_data_type) + .len(3) + .add_child_data(value_data) + .build_unchecked() + }; + drop(ListViewArray::from(list_data)); + } + + #[test] + #[should_panic( + expected = "ListViewArray data should contain two buffer (value offsets & value size), had 1" + )] + #[cfg(not(feature = "force_validate"))] + fn test_list_view_array_invalid_child_array_len() { + let value_offsets = Buffer::from_slice_ref([0, 2, 5, 7]); + let list_data_type = + DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = unsafe { + ArrayData::builder(list_data_type) + .len(3) + .add_buffer(value_offsets) + .build_unchecked() + }; + drop(ListViewArray::from(list_data)); + } + + #[test] + #[should_panic( + expected = "[Large]ListViewArray's datatype must be [Large]ListViewArray(). It is ListView" + )] + fn test_from_array_data_validation() { + let mut builder = ListViewBuilder::new(Int32Builder::new()); + builder.values().append_value(1); + builder.append(true, 0); + let array = builder.finish(); + let _ = LargeListViewArray::from(array.into_data()); + } + + #[test] + fn test_list_view_array_offsets_need_not_start_at_zero() { + let value_data = ArrayData::builder(DataType::Int32) + .len(8) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) + .build() + .unwrap(); + + let value_offsets = Buffer::from_slice_ref([2, 2, 5, 7]); + let value_sizes = Buffer::from_slice_ref([0, 0, 3, 2]); + + let list_data_type = + DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = ArrayData::builder(list_data_type) + .len(3) + .add_buffer(value_offsets) + .add_buffer(value_sizes) + .add_child_data(value_data) + .build() + .unwrap(); + + let list_array = ListViewArray::from(list_data); + assert_eq!(list_array.value_length(0), 0); + assert_eq!(list_array.value_length(1), 0); + assert_eq!(list_array.value_length(2), 3); + } + + #[test] + #[should_panic(expected = "Memory pointer is not aligned with the specified scalar type")] + #[cfg(not(feature = "force_validate"))] + fn test_list_view_array_alignment() { + let offset_buf = Buffer::from_slice_ref([0_u64]); + let offset_buf2 = offset_buf.slice(1); + + let size_buf = Buffer::from_slice_ref([0_u64]); + let size_buf2 = size_buf.slice(1); + + let values: [i32; 8] = [0; 8]; + let value_data = unsafe { + ArrayData::builder(DataType::Int32) + .add_buffer(Buffer::from_slice_ref(values)) + .build_unchecked() + }; + + let list_data_type = + DataType::ListView(Arc::new(Field::new("item", DataType::Int32, false))); + let list_data = unsafe { + ArrayData::builder(list_data_type) + .add_buffer(offset_buf2) + .add_buffer(size_buf2) + .add_child_data(value_data) + .build_unchecked() + }; + drop(ListViewArray::from(list_data)); + } + + #[test] + fn list_array_equality() { + // test scaffold + fn do_comparison( + lhs_data: Vec>>>, + rhs_data: Vec>>>, + should_equal: bool, + ) { + let lhs = ListViewArray::from_iter_primitive::(lhs_data.clone()); + let rhs = ListViewArray::from_iter_primitive::(rhs_data.clone()); + assert_eq!(lhs == rhs, should_equal); + + let lhs = LargeListViewArray::from_iter_primitive::(lhs_data); + let rhs = LargeListViewArray::from_iter_primitive::(rhs_data); + assert_eq!(lhs == rhs, should_equal); + } + + do_comparison( + vec![ + Some(vec![Some(0), Some(1), Some(2)]), + None, + Some(vec![Some(3), None, Some(5)]), + Some(vec![Some(6), Some(7)]), + ], + vec![ + Some(vec![Some(0), Some(1), Some(2)]), + None, + Some(vec![Some(3), None, Some(5)]), + Some(vec![Some(6), Some(7)]), + ], + true, + ); + + do_comparison( + vec![ + None, + None, + Some(vec![Some(3), None, Some(5)]), + Some(vec![Some(6), Some(7)]), + ], + vec![ + Some(vec![Some(0), Some(1), Some(2)]), + None, + Some(vec![Some(3), None, Some(5)]), + Some(vec![Some(6), Some(7)]), + ], + false, + ); + + do_comparison( + vec![ + None, + None, + Some(vec![Some(3), None, Some(5)]), + Some(vec![Some(6), Some(7)]), + ], + vec![ + None, + None, + Some(vec![Some(3), None, Some(5)]), + Some(vec![Some(0), Some(0)]), + ], + false, + ); + + do_comparison( + vec![None, None, Some(vec![Some(1)])], + vec![None, None, Some(vec![Some(2)])], + false, + ); + } + + #[test] + fn test_empty_offsets() { + let f = Arc::new(Field::new("element", DataType::Int32, true)); + let string = ListViewArray::from( + ArrayData::builder(DataType::ListView(f.clone())) + .buffers(vec![Buffer::from(&[]), Buffer::from(&[])]) + .add_child_data(ArrayData::new_empty(&DataType::Int32)) + .build() + .unwrap(), + ); + assert_eq!(string.value_offsets(), &[0]); + assert_eq!(string.value_sizes(), &[0]); + + let string = LargeListViewArray::from( + ArrayData::builder(DataType::LargeListView(f)) + .buffers(vec![Buffer::from(&[]), Buffer::from(&[])]) + .add_child_data(ArrayData::new_empty(&DataType::Int32)) + .build() + .unwrap(), + ); + assert_eq!(string.len(), 0); + assert_eq!(string.value_offsets(), &[0]); + assert_eq!(string.value_sizes(), &[0]); + } + + #[test] + fn test_try_new() { + let offsets = ScalarBuffer::from(vec![0, 1, 4, 5]); + let sizes = ScalarBuffer::from(vec![1, 3, 1, 0]); + let values = Int32Array::new(vec![1, 2, 3, 4, 5].into(), None); + let values = Arc::new(values) as ArrayRef; + + let field = Arc::new(Field::new("element", DataType::Int32, false)); + ListViewArray::new( + field.clone(), + offsets.clone(), + sizes.clone(), + values.clone(), + None, + ); + + let nulls = NullBuffer::new_null(4); + ListViewArray::new( + field.clone(), + offsets, + sizes.clone(), + values.clone(), + Some(nulls), + ); + + let nulls = NullBuffer::new_null(4); + let offsets = ScalarBuffer::from(vec![0, 1, 2, 3, 4]); + let sizes = ScalarBuffer::from(vec![1, 1, 1, 1, 0]); + let err = LargeListViewArray::try_new( + field, + offsets.clone(), + sizes.clone(), + values.clone(), + Some(nulls), + ) + .unwrap_err(); + + assert_eq!( + err.to_string(), + "Invalid argument error: Incorrect length of null buffer for LargeListViewArray, expected 5 got 4" + ); + + let field = Arc::new(Field::new("element", DataType::Int64, false)); + let err = LargeListViewArray::try_new( + field.clone(), + offsets.clone(), + sizes.clone(), + values.clone(), + None, + ) + .unwrap_err(); + + assert_eq!( + err.to_string(), + "Invalid argument error: LargeListViewArray expected data type Int64 got Int32 for \"element\"" + ); + + let nulls = NullBuffer::new_null(7); + let values = Int64Array::new(vec![0; 7].into(), Some(nulls)); + let values = Arc::new(values); + + let err = LargeListViewArray::try_new( + field, + offsets.clone(), + sizes.clone(), + values.clone(), + None, + ) + .unwrap_err(); + + assert_eq!( + err.to_string(), + "Invalid argument error: Non-nullable field of LargeListViewArray \"element\" cannot contain nulls" + ); + } + + #[test] + fn test_from_fixed_size_list() { + let mut builder = FixedSizeListBuilder::new(Int32Builder::new(), 3); + builder.values().append_slice(&[1, 2, 3]); + builder.append(true); + builder.values().append_slice(&[0, 0, 0]); + builder.append(false); + builder.values().append_slice(&[4, 5, 6]); + builder.append(true); + let list: ListViewArray = builder.finish().into(); + let values: Vec<_> = list + .iter() + .map(|x| x.map(|x| x.as_primitive::().values().to_vec())) + .collect(); + assert_eq!(values, vec![Some(vec![1, 2, 3]), None, Some(vec![4, 5, 6])]) + } +} diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index b115ff9c14c..041e598560b 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -65,13 +65,14 @@ mod union_array; pub use union_array::*; mod run_array; - pub use run_array::*; mod byte_view_array; - pub use byte_view_array::*; +mod list_view_array; +pub use list_view_array::*; + /// An array in the [arrow columnar format](https://arrow.apache.org/docs/format/Columnar.html) pub trait Array: std::fmt::Debug + Send + Sync { /// Returns the array as [`Any`] so that it can be @@ -519,6 +520,12 @@ impl PartialEq for GenericListArray { } } +impl PartialEq for GenericListViewArray { + fn eq(&self, other: &Self) -> bool { + self.to_data().eq(&other.to_data()) + } +} + impl PartialEq for MapArray { fn eq(&self, other: &Self) -> bool { self.to_data().eq(&other.to_data()) @@ -606,7 +613,9 @@ pub fn make_array(data: ArrayData) -> ArrayRef { DataType::LargeUtf8 => Arc::new(LargeStringArray::from(data)) as ArrayRef, DataType::Utf8View => Arc::new(StringViewArray::from(data)) as ArrayRef, DataType::List(_) => Arc::new(ListArray::from(data)) as ArrayRef, + DataType::ListView(_) => Arc::new(ListViewArray::from(data)) as ArrayRef, DataType::LargeList(_) => Arc::new(LargeListArray::from(data)) as ArrayRef, + DataType::LargeListView(_) => Arc::new(LargeListViewArray::from(data)) as ArrayRef, DataType::Struct(_) => Arc::new(StructArray::from(data)) as ArrayRef, DataType::Map(_, _) => Arc::new(MapArray::from(data)) as ArrayRef, DataType::Union(_, _) => Arc::new(UnionArray::from(data)) as ArrayRef, diff --git a/arrow-array/src/builder/generic_list_builder.rs b/arrow-array/src/builder/generic_list_builder.rs index 25903bcf546..d6fff12e1ab 100644 --- a/arrow-array/src/builder/generic_list_builder.rs +++ b/arrow-array/src/builder/generic_list_builder.rs @@ -548,7 +548,7 @@ mod tests { } #[test] - fn test_boxed_primitive_aray_builder() { + fn test_boxed_primitive_array_builder() { let values_builder = make_builder(&DataType::Int32, 5); let mut builder = ListBuilder::new(values_builder); diff --git a/arrow-array/src/builder/generic_list_view_builder.rs b/arrow-array/src/builder/generic_list_view_builder.rs new file mode 100644 index 00000000000..1e8c8fe125d --- /dev/null +++ b/arrow-array/src/builder/generic_list_view_builder.rs @@ -0,0 +1,708 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::builder::ArrayBuilder; +use crate::{ArrayRef, GenericListViewArray, OffsetSizeTrait}; +use arrow_buffer::{Buffer, BufferBuilder, NullBufferBuilder, ScalarBuffer}; +use arrow_schema::{Field, FieldRef}; +use std::any::Any; +use std::sync::Arc; + +/// Builder for [`GenericListViewArray`] +/// +#[derive(Debug)] +pub struct GenericListViewBuilder { + offsets_builder: BufferBuilder, + sizes_builder: BufferBuilder, + null_buffer_builder: NullBufferBuilder, + values_builder: T, + field: Option, +} + +impl Default for GenericListViewBuilder { + fn default() -> Self { + Self::new(T::default()) + } +} + +impl ArrayBuilder + for GenericListViewBuilder +where + T: 'static, +{ + /// Returns the builder as a non-mutable `Any` reference. + fn as_any(&self) -> &dyn Any { + self + } + + /// Returns the builder as a mutable `Any` reference. + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + /// Returns the boxed builder as a box of `Any`. + fn into_box_any(self: Box) -> Box { + self + } + + /// Returns the number of array slots in the builder + fn len(&self) -> usize { + self.null_buffer_builder.len() + } + + /// Builds the array and reset this builder. + fn finish(&mut self) -> ArrayRef { + Arc::new(self.finish()) + } + + /// Builds the array without resetting the builder. + fn finish_cloned(&self) -> ArrayRef { + Arc::new(self.finish_cloned()) + } +} + +impl GenericListViewBuilder { + /// Creates a new [`GenericListViewBuilder`] from a given values array builder + pub fn new(values_builder: T) -> Self { + let capacity = values_builder.len(); + Self::with_capacity(values_builder, capacity) + } + + /// Creates a new [`GenericListViewBuilder`] from a given values array builder + /// `capacity` is the number of items to pre-allocate space for in this builder + pub fn with_capacity(values_builder: T, capacity: usize) -> Self { + let offsets_builder = BufferBuilder::::new(capacity); + let sizes_builder = BufferBuilder::::new(capacity); + Self { + offsets_builder, + null_buffer_builder: NullBufferBuilder::new(capacity), + values_builder, + sizes_builder, + field: None, + } + } + + /// Override the field passed to [`GenericListViewArray::new`] + /// + /// By default a nullable field is created with the name `item` + /// + /// Note: [`Self::finish`] and [`Self::finish_cloned`] will panic if the + /// field's data type does not match that of `T` + pub fn with_field(self, field: impl Into) -> Self { + Self { + field: Some(field.into()), + ..self + } + } +} + +impl GenericListViewBuilder +where + T: 'static, +{ + /// Returns the child array builder as a mutable reference. + /// + /// This mutable reference can be used to append values into the child array builder, + /// but you must call [`append`](#method.append) to delimit each distinct list value. + pub fn values(&mut self) -> &mut T { + &mut self.values_builder + } + + /// Returns the child array builder as an immutable reference + pub fn values_ref(&self) -> &T { + &self.values_builder + } + + /// Finish the current variable-length list array slot + /// + /// # Panics + /// + /// Panics if the length of [`Self::values`] exceeds `OffsetSize::MAX` + #[inline] + pub fn append(&mut self, is_valid: bool, size: usize) { + self.offsets_builder + .append(OffsetSize::from_usize(self.values_builder.len() - size).unwrap()); + self.sizes_builder.append(OffsetSize::from_usize(self.values_builder.len()).unwrap()); + self.null_buffer_builder.append(is_valid); + } + + /// Append value into this [`GenericListViewBuilder`] + #[inline] + pub fn append_value(&mut self, i: I) + where + T: Extend>, + I: IntoIterator>, + { + self.extend(std::iter::once(Some(i))) + } + + /// Append a null to this [`GenericListViewBuilder`] + /// + /// See [`Self::append_value`] for an example use. + #[inline] + pub fn append_null(&mut self) { + self.offsets_builder + .append(OffsetSize::from_usize(self.values_builder.len()).unwrap()); + self.sizes_builder + .append(OffsetSize::from_usize(0).unwrap()); + self.null_buffer_builder.append_null(); + } + + /// Appends an optional value into this [`GenericListViewBuilder`] + /// + /// If `Some` calls [`Self::append_value`] otherwise calls [`Self::append_null`] + #[inline] + pub fn append_option(&mut self, i: Option) + where + T: Extend>, + I: IntoIterator>, + { + match i { + Some(i) => self.append_value(i), + None => self.append_null(), + } + } + + /// Builds the [`GenericListViewArray`] and reset this builder. + pub fn finish(&mut self) -> GenericListViewArray { + let values = self.values_builder.finish(); + let nulls = self.null_buffer_builder.finish(); + + let offsets = self.offsets_builder.finish(); + // Safety: Safe by construction + let offsets = ScalarBuffer::from(offsets); + + let sizes = self.sizes_builder.finish(); + let sizes = ScalarBuffer::from(sizes); + + let field = match &self.field { + Some(f) => f.clone(), + None => Arc::new(Field::new("item", values.data_type().clone(), true)), + }; + GenericListViewArray::new(field, offsets, sizes, values, nulls) + } + + /// Builds the [`GenericListViewArray`] without resetting the builder. + pub fn finish_cloned(&self) -> GenericListViewArray { + let values = self.values_builder.finish_cloned(); + let nulls = self.null_buffer_builder.finish_cloned(); + + let offsets = Buffer::from_slice_ref(self.offsets_builder.as_slice()); + // Safety: safe by construction + let offsets = ScalarBuffer::from(offsets); + + let sizes = Buffer::from_slice_ref(self.sizes_builder.as_slice()); + let sizes = ScalarBuffer::from(sizes); + + let field = match &self.field { + Some(f) => f.clone(), + None => Arc::new(Field::new("item", values.data_type().clone(), true)), + }; + + GenericListViewArray::new(field, offsets, sizes, values, nulls) + } + + /// Returns the current offsets buffer as a slice + pub fn offsets_slice(&self) -> &[OffsetSize] { + self.offsets_builder.as_slice() + } +} + +impl Extend> for GenericListViewBuilder +where + O: OffsetSizeTrait, + B: ArrayBuilder + Extend, + V: IntoIterator, +{ + #[inline] + fn extend>>(&mut self, iter: T) { + for v in iter { + match v { + Some(elements) => { + let origin_size = self.values_builder.len(); + self.values_builder.extend(elements); + let size = self.values_builder.len() - origin_size; + self.append(true, size); + } + None => self.append(false, 0), + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::builder::{make_builder, Int32Builder, ListViewBuilder}; + use crate::cast::AsArray; + use crate::types::Int32Type; + use crate::{Array, Int32Array}; + use arrow_schema::DataType; + + fn _test_generic_list_view_array_builder() { + let values_builder = Int32Builder::with_capacity(10); + let mut builder = GenericListViewBuilder::::new(values_builder); + + // [[0, 1, 2], [3, 4, 5], [6, 7]] + builder.values().append_value(0); + builder.values().append_value(1); + builder.values().append_value(2); + builder.append(true, 3); + builder.values().append_value(3); + builder.values().append_value(4); + builder.values().append_value(5); + builder.append(true, 3); + builder.values().append_value(6); + builder.values().append_value(7); + builder.append(true, 2); + let list_array = builder.finish(); + + let list_values = list_array.values().as_primitive::(); + assert_eq!(list_values.values(), &[0, 1, 2, 3, 4, 5, 6, 7]); + assert_eq!(list_array.value_offsets(), [0, 3, 6].map(O::usize_as)); + assert_eq!(list_array.value_sizes(), [3, 3, 2].map(O::usize_as)); + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(3, list_array.len()); + assert_eq!(0, list_array.null_count()); + assert_eq!(O::from_usize(6).unwrap(), list_array.value_offsets()[2]); + assert_eq!(O::from_usize(2).unwrap(), list_array.value_sizes()[2]); + assert_eq!(O::from_usize(2).unwrap(), list_array.value_length(2)); + for i in 0..2 { + assert!(list_array.is_valid(i)); + assert!(!list_array.is_null(i)); + } + } + + #[test] + fn test_list_view_array_builder() { + _test_generic_list_view_array_builder::() + } + + #[test] + fn test_large_list_view_array_builder() { + _test_generic_list_view_array_builder::() + } + + fn _test_generic_list_view_array_builder_nulls() { + let values_builder = Int32Builder::with_capacity(10); + let mut builder = GenericListViewBuilder::::new(values_builder); + + // [[0, 1, 2], null, [3, null, 5], [6, 7]] + builder.values().append_value(0); + builder.values().append_value(1); + builder.values().append_value(2); + builder.append(true, 3); + builder.append(false, 0); + builder.values().append_value(3); + builder.values().append_null(); + builder.values().append_value(5); + builder.append(true, 3); + builder.values().append_value(6); + builder.values().append_value(7); + builder.append(true, 2); + + let list_array = builder.finish(); + + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(4, list_array.len()); + assert_eq!(1, list_array.null_count()); + assert_eq!(O::from_usize(3).unwrap(), list_array.value_offsets()[2]); + assert_eq!(O::from_usize(3).unwrap(), list_array.value_sizes()[2]); + assert_eq!(O::from_usize(3).unwrap(), list_array.value_length(2)); + } + + #[test] + fn test_list_view_array_builder_nulls() { + _test_generic_list_view_array_builder_nulls::() + } + + #[test] + fn test_large_list_view_array_builder_nulls() { + _test_generic_list_view_array_builder_nulls::() + } + + #[test] + fn test_list_view_array_builder_finish() { + let values_builder = Int32Array::builder(5); + let mut builder = ListViewBuilder::new(values_builder); + + builder.values().append_slice(&[1, 2, 3]); + builder.append(true, 1); + builder.values().append_slice(&[4, 5, 6]); + builder.append(true, 1); + + let mut arr = builder.finish(); + assert_eq!(2, arr.len()); + assert!(builder.is_empty()); + + builder.values().append_slice(&[7, 8, 9]); + builder.append(true, 1); + arr = builder.finish(); + assert_eq!(1, arr.len()); + assert!(builder.is_empty()); + } + + #[test] + fn test_list_view_array_builder_finish_cloned() { + let values_builder = Int32Array::builder(5); + let mut builder = ListViewBuilder::new(values_builder); + + builder.values().append_slice(&[1, 2, 3]); + builder.append(true, 1); + builder.values().append_slice(&[4, 5, 6]); + builder.append(true, 1); + + let mut arr = builder.finish_cloned(); + assert_eq!(2, arr.len()); + assert!(!builder.is_empty()); + + builder.values().append_slice(&[7, 8, 9]); + builder.append(true, 1); + arr = builder.finish(); + assert_eq!(3, arr.len()); + assert!(builder.is_empty()); + } + + #[test] + fn test_list_view_list_view_array_builder() { + let primitive_builder = Int32Builder::with_capacity(10); + let values_builder = ListViewBuilder::new(primitive_builder); + let mut builder = ListViewBuilder::new(values_builder); + + // [[[1, 2], [3, 4]], [[5, 6, 7], null, [8]], null, [[9, 10]]] + builder.values().values().append_value(1); + builder.values().values().append_value(2); + builder.values().append(true, 2); + builder.values().values().append_value(3); + builder.values().values().append_value(4); + builder.values().append(true, 2); + builder.append(true, 2); + + builder.values().values().append_value(5); + builder.values().values().append_value(6); + builder.values().values().append_value(7); + builder.values().append(true, 3); + builder.values().append(false, 0); + builder.values().values().append_value(8); + builder.values().append(true, 1); + builder.append(true, 3); + + builder.append(false, 0); + + builder.values().values().append_value(9); + builder.values().values().append_value(10); + builder.values().append(true, 2); + builder.append(true, 1); + + let l1 = builder.finish(); + + assert_eq!(4, l1.len()); + assert_eq!(1, l1.null_count()); + + assert_eq!(l1.value_offsets(), &[0, 2, 5, 5]); + assert_eq!(l1.value_sizes(), &[2, 3, 0, 1]); + + let l2 = l1.values().as_list_view::(); + + assert_eq!(6, l2.len()); + assert_eq!(1, l2.null_count()); + assert_eq!(l2.value_offsets(), &[0, 2, 4, 7, 7, 8]); + assert_eq!(l2.value_sizes(), &[2, 2, 3, 0, 1, 2]); + + let i1 = l2.values().as_primitive::(); + assert_eq!(10, i1.len()); + assert_eq!(0, i1.null_count()); + assert_eq!(i1.values(), &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); + } + + #[test] + fn test_extend() { + let mut builder = ListViewBuilder::new(Int32Builder::new()); + builder.extend([ + Some(vec![Some(1), Some(2), Some(7), None]), + Some(vec![]), + Some(vec![Some(4), Some(5)]), + None, + ]); + + let array = builder.finish(); + assert_eq!(array.value_offsets(), [0, 4, 4, 6]); + assert_eq!(array.value_sizes(), [4, 0, 2, 0]); + assert_eq!(array.null_count(), 1); + assert!(array.is_null(3)); + let elements = array.values().as_primitive::(); + assert_eq!(elements.values(), &[1, 2, 7, 0, 4, 5]); + assert_eq!(elements.null_count(), 1); + assert!(elements.is_null(3)); + } + + #[test] + fn test_boxed_primitive_array_builder() { + let values_builder = make_builder(&DataType::Int32, 5); + let mut builder = ListViewBuilder::new(values_builder); + + builder + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_slice(&[1, 2, 3]); + builder.append(true, 1); + + builder + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_slice(&[4, 5, 6]); + builder.append(true, 1); + + let arr = builder.finish(); + assert_eq!(2, arr.len()); + + let elements = arr.values().as_primitive::(); + assert_eq!(elements.values(), &[1, 2, 3, 4, 5, 6]); + } + + #[test] + fn test_boxed_list_view_list_view_array_builder() { + // This test is same as `test_list_list_array_builder` but uses boxed builders. + let values_builder = make_builder( + &DataType::ListView(Arc::new(Field::new("item", DataType::Int32, true))), + 10, + ); + test_boxed_generic_list_view_generic_list_view_array_builder::(values_builder); + } + + #[test] + fn test_boxed_large_list_view_large_list_view_array_builder() { + // This test is same as `test_list_list_array_builder` but uses boxed builders. + let values_builder = make_builder( + &DataType::LargeListView(Arc::new(Field::new("item", DataType::Int32, true))), + 10, + ); + test_boxed_generic_list_view_generic_list_view_array_builder::(values_builder); + } + + fn test_boxed_generic_list_view_generic_list_view_array_builder< + O: OffsetSizeTrait + PartialEq, + >( + values_builder: Box, + ) { + let mut builder: GenericListViewBuilder> = + GenericListViewBuilder::>::new(values_builder); + + // [[[1, 2], [3, 4]], [[5, 6, 7], null, [8]], null, [[9, 10]]] + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(1); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(2); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .append(true, 2); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(3); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(4); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .append(true, 2); + builder.append(true, 2); + + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(5); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(6); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an (Large)ListViewBuilder") + .append_value(7); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .append(true, 3); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .append(false, 0); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(8); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .append(true, 1); + builder.append(true, 3); + + builder.append(false, 0); + + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(9); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .values() + .as_any_mut() + .downcast_mut::() + .expect("should be an Int32Builder") + .append_value(10); + builder + .values() + .as_any_mut() + .downcast_mut::>>() + .expect("should be an (Large)ListViewBuilder") + .append(true, 2); + builder.append(true, 1); + + let l1 = builder.finish(); + assert_eq!(4, l1.len()); + assert_eq!(1, l1.null_count()); + assert_eq!(l1.value_offsets(), &[0, 2, 5, 5].map(O::usize_as)); + assert_eq!(l1.value_sizes(), &[2, 3, 0, 1].map(O::usize_as)); + + let l2 = l1.values().as_list_view::(); + assert_eq!(6, l2.len()); + assert_eq!(1, l2.null_count()); + assert_eq!(l2.value_offsets(), &[0, 2, 4, 7, 7, 8].map(O::usize_as)); + assert_eq!(l2.value_sizes(), &[2, 2, 3, 0, 1, 2].map(O::usize_as)); + + let i1 = l2.values().as_primitive::(); + assert_eq!(10, i1.len()); + assert_eq!(0, i1.null_count()); + assert_eq!(i1.values(), &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); + } + + #[test] + fn test_with_field() { + let field = Arc::new(Field::new("bar", DataType::Int32, false)); + let mut builder = ListViewBuilder::new(Int32Builder::new()).with_field(field.clone()); + builder.append_value([Some(1), Some(2), Some(3)]); + builder.append_null(); // This is fine as nullability refers to nullability of values + builder.append_value([Some(4)]); + let array = builder.finish(); + assert_eq!(array.len(), 3); + assert_eq!(array.data_type(), &DataType::ListView(field.clone())); + + builder.append_value([Some(4), Some(5)]); + let array = builder.finish(); + assert_eq!(array.data_type(), &DataType::ListView(field)); + assert_eq!(array.len(), 1); + } + + #[test] + #[should_panic( + expected = "Non-nullable field of ListViewArray \\\"item\\\" cannot contain nulls" + )] + fn test_checks_nullability() { + let field = Arc::new(Field::new("item", DataType::Int32, false)); + let mut builder = ListViewBuilder::new(Int32Builder::new()).with_field(field.clone()); + builder.append_value([Some(1), None]); + builder.finish(); + } + + #[test] + #[should_panic(expected = "ListViewArray expected data type Int64 got Int32")] + fn test_checks_data_type() { + let field = Arc::new(Field::new("item", DataType::Int64, false)); + let mut builder = ListViewBuilder::new(Int32Builder::new()).with_field(field.clone()); + builder.append_value([Some(1)]); + builder.finish(); + } +} diff --git a/arrow-array/src/builder/mod.rs b/arrow-array/src/builder/mod.rs index e4ab7ae4ba2..dccba9b35f8 100644 --- a/arrow-array/src/builder/mod.rs +++ b/arrow-array/src/builder/mod.rs @@ -180,7 +180,9 @@ mod generic_byte_run_builder; pub use generic_byte_run_builder::*; mod generic_bytes_view_builder; pub use generic_bytes_view_builder::*; +mod generic_list_view_builder; mod union_builder; +pub use generic_list_view_builder::*; pub use union_builder::*; @@ -315,3 +317,9 @@ pub type StringBuilder = GenericStringBuilder; /// Builder for [`LargeStringArray`](crate::array::LargeStringArray) pub type LargeStringBuilder = GenericStringBuilder; + +/// Builder for [`ListViewArray`](crate::array::ListViewArray) +pub type ListViewBuilder = GenericListViewBuilder; + +/// Builder for [`LargeListViewArray`](crate::array::LargeListViewArray) +pub type LargeListViewBuilder = GenericListViewBuilder; diff --git a/arrow-array/src/builder/struct_builder.rs b/arrow-array/src/builder/struct_builder.rs index 1e2e402f745..b3adea27e6e 100644 --- a/arrow-array/src/builder/struct_builder.rs +++ b/arrow-array/src/builder/struct_builder.rs @@ -253,6 +253,16 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box { + let builder = make_builder(field.data_type(), capacity); + Box::new(ListViewBuilder::with_capacity(builder, capacity).with_field(field.clone())) + } + DataType::LargeListView(field) => { + let builder = make_builder(field.data_type(), capacity); + Box::new( + LargeListViewBuilder::with_capacity(builder, capacity).with_field(field.clone()), + ) + } DataType::Map(field, _) => match field.data_type() { DataType::Struct(fields) => { let map_field_names = MapFieldNames { diff --git a/arrow-array/src/cast.rs b/arrow-array/src/cast.rs index 2e21f3e7e64..12b18e9817f 100644 --- a/arrow-array/src/cast.rs +++ b/arrow-array/src/cast.rs @@ -795,6 +795,14 @@ pub trait AsArray: private::Sealed { self.as_list_opt().expect("list array") } + /// Downcast this to a [`GenericListViewArray`] returning `None` if not possible + fn as_list_view_opt(&self) -> Option<&GenericListViewArray>; + + /// Downcast this to a [`GenericListViewArray`] panicking if not possible + fn as_list_view(&self) -> &GenericListViewArray { + self.as_list_view_opt().expect("list view array") + } + /// Downcast this to a [`FixedSizeBinaryArray`] returning `None` if not possible fn as_fixed_size_binary_opt(&self) -> Option<&FixedSizeBinaryArray>; @@ -860,6 +868,10 @@ impl AsArray for dyn Array + '_ { self.as_any().downcast_ref() } + fn as_list_view_opt(&self) -> Option<&GenericListViewArray> { + self.as_any().downcast_ref() + } + fn as_fixed_size_binary_opt(&self) -> Option<&FixedSizeBinaryArray> { self.as_any().downcast_ref() } @@ -907,6 +919,10 @@ impl AsArray for ArrayRef { self.as_ref().as_list_opt() } + fn as_list_view_opt(&self) -> Option<&GenericListViewArray> { + self.as_ref().as_list_view_opt() + } + fn as_fixed_size_binary_opt(&self) -> Option<&FixedSizeBinaryArray> { self.as_ref().as_fixed_size_binary_opt() } diff --git a/arrow-array/src/iterator.rs b/arrow-array/src/iterator.rs index 3f9cc0d525c..5b36d19359e 100644 --- a/arrow-array/src/iterator.rs +++ b/arrow-array/src/iterator.rs @@ -19,7 +19,7 @@ use crate::array::{ ArrayAccessor, BooleanArray, FixedSizeBinaryArray, GenericBinaryArray, GenericListArray, - GenericStringArray, PrimitiveArray, + GenericListViewArray, GenericStringArray, PrimitiveArray, }; use crate::{FixedSizeListArray, MapArray}; use arrow_buffer::NullBuffer; @@ -141,6 +141,8 @@ pub type FixedSizeBinaryIter<'a> = ArrayIter<&'a FixedSizeBinaryArray>; pub type FixedSizeListIter<'a> = ArrayIter<&'a FixedSizeListArray>; /// an iterator that returns Some(T) or None, that can be used on any ListArray pub type GenericListArrayIter<'a, O> = ArrayIter<&'a GenericListArray>; +/// an iterator that returns Some(T) or None, that can be used on any ListArray +pub type GenericListViewArrayIter<'a, O> = ArrayIter<&'a GenericListViewArray>; /// an iterator that returns Some(T) or None, that can be used on any MapArray pub type MapArrayIter<'a> = ArrayIter<&'a MapArray>; diff --git a/arrow-buffer/src/buffer/mod.rs b/arrow-buffer/src/buffer/mod.rs index d33e68795e4..0278dc2c332 100644 --- a/arrow-buffer/src/buffer/mod.rs +++ b/arrow-buffer/src/buffer/mod.rs @@ -32,4 +32,5 @@ pub use boolean::*; mod null; pub use null::*; mod run; + pub use run::*; diff --git a/arrow-buffer/src/buffer/scalar.rs b/arrow-buffer/src/buffer/scalar.rs index 2019cc79830..549cd891eff 100644 --- a/arrow-buffer/src/buffer/scalar.rs +++ b/arrow-buffer/src/buffer/scalar.rs @@ -94,6 +94,19 @@ impl ScalarBuffer { pub fn ptr_eq(&self, other: &Self) -> bool { self.buffer.ptr_eq(&other.buffer) } + + /// Returns the length of this buffer in units of `T` + pub fn new_zeroed(len: usize) -> Self { + let len_bytes = len.checked_mul(std::mem::size_of::()).expect("overflow"); + let buffer = MutableBuffer::from_len_zeroed(len_bytes); + Self::from(buffer) + } + + /// Create a new [`ScalarBuffer`] containing a single 0 value + pub fn new_empty() -> Self { + let buffer = MutableBuffer::from_len_zeroed(std::mem::size_of::()); + Self::from(buffer.into_buffer()) + } } impl Deref for ScalarBuffer { diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index e227b168eee..630b31d1854 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -119,17 +119,21 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuff [buffer, empty_buffer] } DataType::ListView(_) => [ + // offset buffer MutableBuffer::new(capacity * mem::size_of::()), + // size buffer MutableBuffer::new(capacity * mem::size_of::()), ], DataType::LargeList(_) => { - // offset buffer always starts with a zero + // offset buffer let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::()); buffer.push(0i64); [buffer, empty_buffer] } DataType::LargeListView(_) => [ + // offset buffer MutableBuffer::new(capacity * mem::size_of::()), + // size buffer MutableBuffer::new(capacity * mem::size_of::()), ], DataType::FixedSizeBinary(size) => { @@ -929,6 +933,26 @@ impl ArrayData { Ok(()) } + fn validate_sizes( + &self, + values_length: usize, + ) -> Result<(), ArrowError> { + let sizes: &[T] = self.typed_buffer(1, self.len)?; + if sizes.is_empty() { + return Ok(()); + } + + let total_size: usize = sizes.iter().map(|x| x.to_usize().unwrap()).sum(); + if total_size > values_length { + return Err(ArrowError::InvalidArgumentError(format!( + "Total size of {} is larger than values length {}", + total_size, values_length, + ))); + } + + Ok(()) + } + /// Validates the layout of `child_data` ArrayData structures fn validate_child_data(&self) -> Result<(), ArrowError> { match &self.data_type { @@ -937,11 +961,21 @@ impl ArrayData { self.validate_offsets::(values_data.len)?; Ok(()) } + DataType::ListView(field) => { + let values_data = self.get_single_valid_child_data(field.data_type())?; + self.validate_sizes::(values_data.len)?; + Ok(()) + } DataType::LargeList(field) => { let values_data = self.get_single_valid_child_data(field.data_type())?; self.validate_offsets::(values_data.len)?; Ok(()) } + DataType::LargeListView(field) => { + let values_data = self.get_single_valid_child_data(field.data_type())?; + self.validate_sizes::(values_data.len)?; + Ok(()) + } DataType::FixedSizeList(field, list_size) => { let values_data = self.get_single_valid_child_data(field.data_type())?; @@ -1546,9 +1580,8 @@ pub fn layout(data_type: &DataType) -> DataTypeLayout { DataType::BinaryView | DataType::Utf8View => DataTypeLayout::new_view(), DataType::FixedSizeList(_, _) => DataTypeLayout::new_empty(), // all in child data DataType::List(_) => DataTypeLayout::new_fixed_width::(), - DataType::ListView(_) | DataType::LargeListView(_) => { - unimplemented!("ListView/LargeListView not implemented") - } + DataType::ListView(_) => DataTypeLayout::new_list_view::(), + DataType::LargeListView(_) => DataTypeLayout::new_list_view::(), DataType::LargeList(_) => DataTypeLayout::new_fixed_width::(), DataType::Map(_, _) => DataTypeLayout::new_fixed_width::(), DataType::Struct(_) => DataTypeLayout::new_empty(), // all in child data, @@ -1652,6 +1685,24 @@ impl DataTypeLayout { variadic: true, } } + + /// Describes a list view type + pub fn new_list_view() -> Self { + Self { + buffers: vec![ + BufferSpec::FixedWidth { + byte_width: mem::size_of::(), + alignment: mem::align_of::(), + }, + BufferSpec::FixedWidth { + byte_width: mem::size_of::(), + alignment: mem::align_of::(), + }, + ], + can_contain_null_mask: true, + variadic: true, + } + } } /// Layout specification for a single data type buffer diff --git a/arrow-data/src/equal/list_view.rs b/arrow-data/src/equal/list_view.rs new file mode 100644 index 00000000000..ffa3ff4e8d1 --- /dev/null +++ b/arrow-data/src/equal/list_view.rs @@ -0,0 +1,65 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::data::count_nulls; +use crate::ArrayData; +use arrow_buffer::ArrowNativeType; +use num::Integer; + +use super::equal_range; + +pub(super) fn list_view_equal( + lhs: &ArrayData, + rhs: &ArrayData, + lhs_start: usize, + rhs_start: usize, + len: usize, +) -> bool { + let lhs_offsets = lhs.buffer::(0); + let rhs_offsets = rhs.buffer::(0); + let lhs_sizes = lhs.buffer::(1); + let rhs_sizes = rhs.buffer::(1); + for i in 0..len { + // compare offsets and sizes + let lhs_pos = lhs_start + i; + let rhs_pos = rhs_start + i; + let lhs_offset_start = lhs_offsets[lhs_pos].to_usize().unwrap(); + let rhs_offset_start = rhs_offsets[rhs_pos].to_usize().unwrap(); + let lhs_size = lhs_sizes[lhs_pos].to_usize().unwrap(); + let rhs_size = rhs_sizes[rhs_pos].to_usize().unwrap(); + if lhs_size != rhs_size { + return false; + } + // compare nulls + let lhs_null_count = count_nulls(lhs.nulls(), lhs_offset_start, lhs_size); + let rhs_null_count = count_nulls(rhs.nulls(), rhs_offset_start, lhs_size); + if lhs_null_count != rhs_null_count { + return false; + } + // compare values + if !equal_range( + &lhs.child_data()[0], + &rhs.child_data()[0], + lhs_offset_start, + lhs_offset_start, + lhs_size, + ) { + return false; + } + } + true +} diff --git a/arrow-data/src/equal/mod.rs b/arrow-data/src/equal/mod.rs index dba6a0186a5..65bd6eea704 100644 --- a/arrow-data/src/equal/mod.rs +++ b/arrow-data/src/equal/mod.rs @@ -20,6 +20,7 @@ //! depend on dynamic casting of `Array`. use crate::data::ArrayData; +use crate::equal::list_view::list_view_equal; use arrow_buffer::i256; use arrow_schema::{DataType, IntervalUnit}; use half::f16; @@ -30,6 +31,7 @@ mod dictionary; mod fixed_binary; mod fixed_list; mod list; +mod list_view; mod null; mod primitive; mod run; @@ -102,10 +104,9 @@ fn equal_values( byte_view_equal(lhs, rhs, lhs_start, rhs_start, len) } DataType::List(_) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::ListView(_) | DataType::LargeListView(_) => { - unimplemented!("ListView/LargeListView not yet implemented") - } + DataType::ListView(_) => list_view_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::LargeList(_) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::LargeListView(_) => list_view_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::FixedSizeList(_, _) => fixed_list_equal(lhs, rhs, lhs_start, rhs_start, len), DataType::Struct(_) => struct_equal(lhs, rhs, lhs_start, rhs_start, len), DataType::Union(_, _) => union_equal(lhs, rhs, lhs_start, rhs_start, len),