Skip to content

Commit

Permalink
Correct array memory usage calculation for dictionary arrays (#505)
Browse files Browse the repository at this point in the history
* Correct array memory usage calculation for dictionary arrays

* update comments

Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>

* update comments

Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>

* Adjust memory usage calculation and move related tests to the same file

* Comment about bitmap size

* Clippy fixes

* Adjust calculation for validity bitmap

Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
  • Loading branch information
jhorstmann and alamb authored Jun 29, 2021
1 parent 99b1c90 commit f1a831f
Show file tree
Hide file tree
Showing 11 changed files with 121 additions and 164 deletions.
118 changes: 115 additions & 3 deletions arrow/src/array/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -197,11 +197,21 @@ pub trait Array: fmt::Debug + Send + Sync + JsonEqual {
self.data_ref().null_count()
}

/// Returns the total number of bytes of memory occupied by the buffers owned by this array.
fn get_buffer_memory_size(&self) -> usize;
/// Returns the total number of bytes of memory pointed to by this array.
/// The buffers store bytes in the Arrow memory format, and include the data as well as the validity map.
fn get_buffer_memory_size(&self) -> usize {
self.data_ref().get_buffer_memory_size()
}

/// Returns the total number of bytes of memory occupied physically by this array.
fn get_array_memory_size(&self) -> usize;
/// This value will always be greater than returned by `get_buffer_memory_size()` and
/// includes the overhead of the data structures that contain the pointers to the various buffers.
fn get_array_memory_size(&self) -> usize {
// both data.get_array_memory_size and size_of_val(self) include ArrayData fields,
// to only count additional fields of this array substract size_of(ArrayData)
self.data_ref().get_array_memory_size() + std::mem::size_of_val(self)
- std::mem::size_of::<ArrayData>()
}

/// returns two pointers that represent this array in the C Data Interface (FFI)
fn to_raw(
Expand Down Expand Up @@ -575,6 +585,7 @@ where
#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_empty_primitive() {
let array = new_empty_array(&DataType::Int32);
Expand Down Expand Up @@ -661,4 +672,105 @@ mod tests {
null_array.data().buffers()[0].len()
);
}

#[test]
fn test_memory_size_null() {
let null_arr = NullArray::new(32);

assert_eq!(0, null_arr.get_buffer_memory_size());
assert_eq!(
std::mem::size_of::<NullArray>(),
null_arr.get_array_memory_size()
);
assert_eq!(
std::mem::size_of::<NullArray>(),
std::mem::size_of::<ArrayData>(),
);
}

#[test]
fn test_memory_size_primitive() {
let arr = PrimitiveArray::<Int64Type>::from_iter_values(0..128);
let empty =
PrimitiveArray::<Int64Type>::from(ArrayData::new_empty(arr.data_type()));

// substract empty array to avoid magic numbers for the size of additional fields
assert_eq!(
arr.get_array_memory_size() - empty.get_array_memory_size(),
128 * std::mem::size_of::<i64>()
);
}

#[test]
fn test_memory_size_primitive_nullable() {
let arr: PrimitiveArray<Int64Type> = (0..128).map(Some).collect();
let empty_with_bitmap = PrimitiveArray::<Int64Type>::from(
ArrayData::builder(arr.data_type().clone())
.add_buffer(MutableBuffer::new(0).into())
.null_bit_buffer(MutableBuffer::new_null(0).into())
.build(),
);

// expected size is the size of the PrimitiveArray struct,
// which includes the optional validity buffer
// plus one buffer on the heap
assert_eq!(
std::mem::size_of::<PrimitiveArray<Int64Type>>()
+ std::mem::size_of::<Buffer>(),
empty_with_bitmap.get_array_memory_size()
);

// substract empty array to avoid magic numbers for the size of additional fields
// the size of the validity bitmap is rounded up to 64 bytes
assert_eq!(
arr.get_array_memory_size() - empty_with_bitmap.get_array_memory_size(),
128 * std::mem::size_of::<i64>() + 64
);
}

#[test]
fn test_memory_size_dictionary() {
let values = PrimitiveArray::<Int64Type>::from_iter_values(0..16);
let keys = PrimitiveArray::<Int16Type>::from_iter_values(
(0..256).map(|i| (i % values.len()) as i16),
);

let dict_data = ArrayData::builder(DataType::Dictionary(
Box::new(keys.data_type().clone()),
Box::new(values.data_type().clone()),
))
.len(keys.len())
.buffers(keys.data_ref().buffers().to_vec())
.child_data(vec![ArrayData::builder(DataType::Int64)
.len(values.len())
.buffers(values.data_ref().buffers().to_vec())
.build()])
.build();

let empty_data = ArrayData::new_empty(&DataType::Dictionary(
Box::new(DataType::Int16),
Box::new(DataType::Int64),
));

let arr = DictionaryArray::<Int16Type>::from(dict_data);
let empty = DictionaryArray::<Int16Type>::from(empty_data);

let expected_keys_size = 256 * std::mem::size_of::<i16>();
assert_eq!(
arr.keys().get_array_memory_size() - empty.keys().get_array_memory_size(),
expected_keys_size
);

let expected_values_size = 16 * std::mem::size_of::<i64>();
assert_eq!(
arr.values().get_array_memory_size() - empty.values().get_array_memory_size(),
expected_values_size
);

let expected_size = expected_keys_size + expected_values_size;
assert_eq!(
arr.get_array_memory_size() - empty.get_array_memory_size(),
expected_size
);
}
}
31 changes: 0 additions & 31 deletions arrow/src/array/array_binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@

use std::convert::{From, TryInto};
use std::fmt;
use std::mem;
use std::{any::Any, iter::FromIterator};

use super::{
Expand Down Expand Up @@ -199,16 +198,6 @@ impl<OffsetSize: BinaryOffsetSizeTrait> Array for GenericBinaryArray<OffsetSize>
fn data(&self) -> &ArrayData {
&self.data
}

/// Returns the total number of bytes of memory occupied by the buffers owned by this [$name].
fn get_buffer_memory_size(&self) -> usize {
self.data.get_buffer_memory_size()
}

/// Returns the total number of bytes of memory occupied physically by this [$name].
fn get_array_memory_size(&self) -> usize {
self.data.get_array_memory_size() + mem::size_of_val(self)
}
}

impl<OffsetSize: BinaryOffsetSizeTrait> From<ArrayData>
Expand Down Expand Up @@ -600,16 +589,6 @@ impl Array for FixedSizeBinaryArray {
fn data(&self) -> &ArrayData {
&self.data
}

/// Returns the total number of bytes of memory occupied by the buffers owned by this [FixedSizeBinaryArray].
fn get_buffer_memory_size(&self) -> usize {
self.data.get_buffer_memory_size()
}

/// Returns the total number of bytes of memory occupied physically by this [FixedSizeBinaryArray].
fn get_array_memory_size(&self) -> usize {
self.data.get_array_memory_size() + mem::size_of_val(self)
}
}

/// A type of `DecimalArray` whose elements are binaries.
Expand Down Expand Up @@ -780,16 +759,6 @@ impl Array for DecimalArray {
fn data(&self) -> &ArrayData {
&self.data
}

/// Returns the total number of bytes of memory occupied by the buffers owned by this [DecimalArray].
fn get_buffer_memory_size(&self) -> usize {
self.data.get_buffer_memory_size()
}

/// Returns the total number of bytes of memory occupied physically by this [DecimalArray].
fn get_array_memory_size(&self) -> usize {
self.data.get_array_memory_size() + mem::size_of_val(self)
}
}

#[cfg(test)]
Expand Down
11 changes: 0 additions & 11 deletions arrow/src/array/array_boolean.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
use std::borrow::Borrow;
use std::convert::From;
use std::iter::{FromIterator, IntoIterator};
use std::mem;
use std::{any::Any, fmt};

use super::*;
Expand Down Expand Up @@ -114,16 +113,6 @@ impl Array for BooleanArray {
fn data(&self) -> &ArrayData {
&self.data
}

/// Returns the total number of bytes of memory occupied by the buffers owned by this [BooleanArray].
fn get_buffer_memory_size(&self) -> usize {
self.data.get_buffer_memory_size()
}

/// Returns the total number of bytes of memory occupied physically by this [BooleanArray].
fn get_array_memory_size(&self) -> usize {
self.data.get_array_memory_size() + mem::size_of_val(self)
}
}

impl From<Vec<bool>> for BooleanArray {
Expand Down
13 changes: 0 additions & 13 deletions arrow/src/array/array_dictionary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
use std::any::Any;
use std::fmt;
use std::iter::IntoIterator;
use std::mem;
use std::{convert::From, iter::FromIterator};

use super::{
Expand Down Expand Up @@ -209,18 +208,6 @@ impl<T: ArrowPrimitiveType> Array for DictionaryArray<T> {
fn data(&self) -> &ArrayData {
&self.data
}

fn get_buffer_memory_size(&self) -> usize {
// Since both `keys` and `values` derive (are references from) `data`, we only need to account for `data`.
self.data.get_buffer_memory_size()
}

fn get_array_memory_size(&self) -> usize {
self.data.get_array_memory_size()
+ self.keys.get_array_memory_size()
+ self.values.get_array_memory_size()
+ mem::size_of_val(self)
}
}

impl<T: ArrowPrimitiveType> fmt::Debug for DictionaryArray<T> {
Expand Down
23 changes: 0 additions & 23 deletions arrow/src/array/array_list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@

use std::any::Any;
use std::fmt;
use std::mem;

use num::Num;

Expand Down Expand Up @@ -261,16 +260,6 @@ impl<OffsetSize: 'static + OffsetSizeTrait> Array for GenericListArray<OffsetSiz
fn data(&self) -> &ArrayData {
&self.data
}

/// Returns the total number of bytes of memory occupied by the buffers owned by this [ListArray].
fn get_buffer_memory_size(&self) -> usize {
self.data.get_buffer_memory_size()
}

/// Returns the total number of bytes of memory occupied physically by this [ListArray].
fn get_array_memory_size(&self) -> usize {
self.data.get_array_memory_size() + mem::size_of_val(self)
}
}

impl<OffsetSize: OffsetSizeTrait> fmt::Debug for GenericListArray<OffsetSize> {
Expand Down Expand Up @@ -444,18 +433,6 @@ impl Array for FixedSizeListArray {
fn data(&self) -> &ArrayData {
&self.data
}

/// Returns the total number of bytes of memory occupied by the buffers owned by this [FixedSizeListArray].
fn get_buffer_memory_size(&self) -> usize {
self.data.get_buffer_memory_size() + self.values().get_buffer_memory_size()
}

/// Returns the total number of bytes of memory occupied physically by this [FixedSizeListArray].
fn get_array_memory_size(&self) -> usize {
self.data.get_array_memory_size()
+ self.values().get_array_memory_size()
+ mem::size_of_val(self)
}
}

impl fmt::Debug for FixedSizeListArray {
Expand Down
16 changes: 0 additions & 16 deletions arrow/src/array/array_primitive.rs
Original file line number Diff line number Diff line change
Expand Up @@ -150,16 +150,6 @@ impl<T: ArrowPrimitiveType> Array for PrimitiveArray<T> {
fn data(&self) -> &ArrayData {
&self.data
}

/// Returns the total number of bytes of memory occupied by the buffers owned by this [PrimitiveArray].
fn get_buffer_memory_size(&self) -> usize {
self.data.get_buffer_memory_size()
}

/// Returns the total number of bytes of memory occupied physically by this [PrimitiveArray].
fn get_array_memory_size(&self) -> usize {
self.data.get_array_memory_size() + mem::size_of::<RawPtrBox<T::Native>>()
}
}

fn as_datetime<T: ArrowPrimitiveType>(v: i64) -> Option<NaiveDateTime> {
Expand Down Expand Up @@ -508,9 +498,6 @@ mod tests {
assert!(arr.is_valid(i));
assert_eq!(i as i32, arr.value(i));
}

assert_eq!(64, arr.get_buffer_memory_size());
assert_eq!(136, arr.get_array_memory_size());
}

#[test]
Expand All @@ -530,9 +517,6 @@ mod tests {
assert!(!arr.is_valid(i));
}
}

assert_eq!(128, arr.get_buffer_memory_size());
assert_eq!(216, arr.get_array_memory_size());
}

#[test]
Expand Down
11 changes: 0 additions & 11 deletions arrow/src/array/array_string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@

use std::convert::From;
use std::fmt;
use std::mem;
use std::{any::Any, iter::FromIterator};

use super::{
Expand Down Expand Up @@ -286,16 +285,6 @@ impl<OffsetSize: StringOffsetSizeTrait> Array for GenericStringArray<OffsetSize>
fn data(&self) -> &ArrayData {
&self.data
}

/// Returns the total number of bytes of memory occupied by the buffers owned by this [$name].
fn get_buffer_memory_size(&self) -> usize {
self.data.get_buffer_memory_size()
}

/// Returns the total number of bytes of memory occupied physically by this [$name].
fn get_array_memory_size(&self) -> usize {
self.data.get_array_memory_size() + mem::size_of_val(self)
}
}

impl<OffsetSize: StringOffsetSizeTrait> From<ArrayData>
Expand Down
11 changes: 0 additions & 11 deletions arrow/src/array/array_struct.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ use std::any::Any;
use std::convert::{From, TryFrom};
use std::fmt;
use std::iter::IntoIterator;
use std::mem;

use super::{make_array, Array, ArrayData, ArrayRef};
use crate::datatypes::DataType;
Expand Down Expand Up @@ -178,16 +177,6 @@ impl Array for StructArray {
fn len(&self) -> usize {
self.data_ref().len()
}

/// Returns the total number of bytes of memory occupied by the buffers owned by this [StructArray].
fn get_buffer_memory_size(&self) -> usize {
self.data.get_buffer_memory_size()
}

/// Returns the total number of bytes of memory occupied physically by this [StructArray].
fn get_array_memory_size(&self) -> usize {
self.data.get_array_memory_size() + mem::size_of_val(self)
}
}

impl From<Vec<(Field, ArrayRef)>> for StructArray {
Expand Down
Loading

0 comments on commit f1a831f

Please sign in to comment.