From b4fe87d5dc03a12cd26704ab87e9fee4b741751d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 3 Apr 2025 11:59:15 -0400 Subject: [PATCH 1/2] Improve documentation on ArrayData::offsets --- arrow-data/src/data.rs | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 10b954777d4..dd2da61ed34 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -201,26 +201,45 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuff #[derive(Debug, Clone)] pub struct ArrayData { - /// The data type for this array data + /// The data type data_type: DataType, - /// The number of elements in this array data + /// The number of elements len: usize, - /// The offset into this array data, in number of items + /// The offset in number of items (not bytes). + /// + /// The offset applies to [`Self::child_data`] and [`Self::buffers`]. It + /// does NOT apply to [`Self::nulls`]. offset: usize, - /// The buffers for this array data. Note that depending on the array types, this - /// could hold different kinds of buffers (e.g., value buffer, value offset buffer) - /// at different positions. + /// The buffers that store the actual data for this array, as defined + /// in the [Arrow Spec]. + /// + /// Depending on the array types, [`Self::buffers`] can hold different + /// kinds of buffers (e.g., value buffer, value offset buffer) at different + /// positions. + /// + /// This ArrayData's first logical element begins at `offset` + /// + /// [Arrow Spec](https://arrow.apache.org/docs/format/Columnar.html#physical-memory-layout) buffers: Vec, - /// The child(ren) of this array. Only non-empty for nested types, currently - /// `ListArray` and `StructArray`. + /// The child(ren) of this array. + /// + /// Only non-empty for nested types, such as `ListArray` and + /// `StructArray`. + /// + /// The first logical element in each child element begins at `offset`. child_data: Vec, - /// The null bitmap. A `None` value for this indicates all values are non-null in - /// this array. + /// The null bitmap. + /// + /// `None` indicates all values are non-null in this array. + /// + /// [`Self::offset]` does not apply to the null bitmap. While the + /// BooleanBuffer may be sliced (have its own offset) internally, this + /// `NullBuffer` always represents exactly `len` elements. nulls: Option, } @@ -555,6 +574,7 @@ impl ArrayData { } /// Returns the `buffer` as a slice of type `T` starting at self.offset + /// /// # Panics /// This function panics if: /// * the buffer is not byte-aligned with type T, or From c2b4a6dc2a01e2765076c7bf461d4bb1a0771545 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 3 Apr 2025 16:40:30 -0400 Subject: [PATCH 2/2] Apply suggestions from code review Co-authored-by: Weston Pace --- arrow-data/src/data.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index dd2da61ed34..4c117184de7 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -220,7 +220,9 @@ pub struct ArrayData { /// kinds of buffers (e.g., value buffer, value offset buffer) at different /// positions. /// - /// This ArrayData's first logical element begins at `offset` + /// The buffer may be larger than needed. Some items at the beginning may be skipped if + /// there is an `offset`. Some items at the end may be skipped if the buffer is longer than + /// we need to satisfy `len`. /// /// [Arrow Spec](https://arrow.apache.org/docs/format/Columnar.html#physical-memory-layout) buffers: Vec, @@ -231,6 +233,9 @@ pub struct ArrayData { /// `StructArray`. /// /// The first logical element in each child element begins at `offset`. + /// + /// If the child element also has an offset then these offsets are + /// cumulative. child_data: Vec, /// The null bitmap.