Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Correct array memory usage calculation for dictionary arrays #505

Merged
merged 7 commits into from
Jun 29, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
118 changes: 115 additions & 3 deletions arrow/src/array/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -197,11 +197,21 @@ pub trait Array: fmt::Debug + Send + Sync + JsonEqual {
self.data_ref().null_count()
}

/// Returns the total number of bytes of memory occupied by the buffers owned by this array.
fn get_buffer_memory_size(&self) -> usize;
/// Returns the total number of bytes of memory pointed to by this array.
/// The buffers store bytes in the Arrow memory format, and include the data as well as the validity map.
fn get_buffer_memory_size(&self) -> usize {
self.data_ref().get_buffer_memory_size()
}

/// Returns the total number of bytes of memory occupied physically by this array.
jhorstmann marked this conversation as resolved.
Show resolved Hide resolved
fn get_array_memory_size(&self) -> usize;
/// This value will always be greater than returned by `get_buffer_memory_size()` and
/// includes the overhead of the data structures that contain the pointers to the various buffers.
fn get_array_memory_size(&self) -> usize {
// both data.get_array_memory_size and size_of_val(self) include ArrayData fields,
// to only count additional fields of this array substract size_of(ArrayData)
self.data_ref().get_array_memory_size() + std::mem::size_of_val(self)
- std::mem::size_of::<ArrayData>()
}

/// returns two pointers that represent this array in the C Data Interface (FFI)
fn to_raw(
Expand Down Expand Up @@ -575,6 +585,7 @@ where
#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_empty_primitive() {
let array = new_empty_array(&DataType::Int32);
Expand Down Expand Up @@ -661,4 +672,105 @@ mod tests {
null_array.data().buffers()[0].len()
);
}

#[test]
fn test_memory_size_null() {
let null_arr = NullArray::new(32);

assert_eq!(0, null_arr.get_buffer_memory_size());
assert_eq!(
std::mem::size_of::<NullArray>(),
null_arr.get_array_memory_size()
);
assert_eq!(
std::mem::size_of::<NullArray>(),
std::mem::size_of::<ArrayData>(),
);
}

#[test]
fn test_memory_size_primitive() {
let arr = PrimitiveArray::<Int64Type>::from_iter_values(0..128);
let empty =
PrimitiveArray::<Int64Type>::from(ArrayData::new_empty(arr.data_type()));

// substract empty array to avoid magic numbers for the size of additional fields
assert_eq!(
arr.get_array_memory_size() - empty.get_array_memory_size(),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is a cool calculation 👍

128 * std::mem::size_of::<i64>()
);
}

#[test]
fn test_memory_size_primitive_nullable() {
let arr: PrimitiveArray<Int64Type> = (0..128).map(Some).collect();
let empty_with_bitmap = PrimitiveArray::<Int64Type>::from(
ArrayData::builder(arr.data_type().clone())
.add_buffer(MutableBuffer::new(0).into())
.null_bit_buffer(MutableBuffer::new_null(0).into())
.build(),
);

// expected size is the size of the PrimitiveArray struct,
// which includes the optional validity buffer
// plus one buffer on the heap
assert_eq!(
std::mem::size_of::<PrimitiveArray<Int64Type>>()
+ std::mem::size_of::<Buffer>(),
empty_with_bitmap.get_array_memory_size()
);

// substract empty array to avoid magic numbers for the size of additional fields
// the size of the validity bitmap is rounded up to 64 bytes
assert_eq!(
arr.get_array_memory_size() - empty_with_bitmap.get_array_memory_size(),
128 * std::mem::size_of::<i64>() + 64
);
}

#[test]
fn test_memory_size_dictionary() {
let values = PrimitiveArray::<Int64Type>::from_iter_values(0..16);
let keys = PrimitiveArray::<Int16Type>::from_iter_values(
(0..256).map(|i| (i % values.len()) as i16),
);

let dict_data = ArrayData::builder(DataType::Dictionary(
Box::new(keys.data_type().clone()),
Box::new(values.data_type().clone()),
))
.len(keys.len())
.buffers(keys.data_ref().buffers().to_vec())
.child_data(vec![ArrayData::builder(DataType::Int64)
.len(values.len())
.buffers(values.data_ref().buffers().to_vec())
.build()])
.build();

let empty_data = ArrayData::new_empty(&DataType::Dictionary(
Box::new(DataType::Int16),
Box::new(DataType::Int64),
));

let arr = DictionaryArray::<Int16Type>::from(dict_data);
let empty = DictionaryArray::<Int16Type>::from(empty_data);

let expected_keys_size = 256 * std::mem::size_of::<i16>();
assert_eq!(
arr.keys().get_array_memory_size() - empty.keys().get_array_memory_size(),
expected_keys_size
);

let expected_values_size = 16 * std::mem::size_of::<i64>();
assert_eq!(
arr.values().get_array_memory_size() - empty.values().get_array_memory_size(),
expected_values_size
);

let expected_size = expected_keys_size + expected_values_size;
assert_eq!(
arr.get_array_memory_size() - empty.get_array_memory_size(),
expected_size
);
}
}
31 changes: 0 additions & 31 deletions arrow/src/array/array_binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@

use std::convert::{From, TryInto};
use std::fmt;
use std::mem;
use std::{any::Any, iter::FromIterator};

use super::{
Expand Down Expand Up @@ -199,16 +198,6 @@ impl<OffsetSize: BinaryOffsetSizeTrait> Array for GenericBinaryArray<OffsetSize>
fn data(&self) -> &ArrayData {
&self.data
}

/// Returns the total number of bytes of memory occupied by the buffers owned by this [$name].
fn get_buffer_memory_size(&self) -> usize {
self.data.get_buffer_memory_size()
}

/// Returns the total number of bytes of memory occupied physically by this [$name].
fn get_array_memory_size(&self) -> usize {
self.data.get_array_memory_size() + mem::size_of_val(self)
}
}

impl<OffsetSize: BinaryOffsetSizeTrait> From<ArrayData>
Expand Down Expand Up @@ -600,16 +589,6 @@ impl Array for FixedSizeBinaryArray {
fn data(&self) -> &ArrayData {
&self.data
}

/// Returns the total number of bytes of memory occupied by the buffers owned by this [FixedSizeBinaryArray].
fn get_buffer_memory_size(&self) -> usize {
self.data.get_buffer_memory_size()
}

/// Returns the total number of bytes of memory occupied physically by this [FixedSizeBinaryArray].
fn get_array_memory_size(&self) -> usize {
self.data.get_array_memory_size() + mem::size_of_val(self)
}
}

/// A type of `DecimalArray` whose elements are binaries.
Expand Down Expand Up @@ -780,16 +759,6 @@ impl Array for DecimalArray {
fn data(&self) -> &ArrayData {
&self.data
}

/// Returns the total number of bytes of memory occupied by the buffers owned by this [DecimalArray].
fn get_buffer_memory_size(&self) -> usize {
self.data.get_buffer_memory_size()
}

/// Returns the total number of bytes of memory occupied physically by this [DecimalArray].
fn get_array_memory_size(&self) -> usize {
self.data.get_array_memory_size() + mem::size_of_val(self)
}
}

#[cfg(test)]
Expand Down
11 changes: 0 additions & 11 deletions arrow/src/array/array_boolean.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
use std::borrow::Borrow;
use std::convert::From;
use std::iter::{FromIterator, IntoIterator};
use std::mem;
use std::{any::Any, fmt};

use super::*;
Expand Down Expand Up @@ -114,16 +113,6 @@ impl Array for BooleanArray {
fn data(&self) -> &ArrayData {
&self.data
}

/// Returns the total number of bytes of memory occupied by the buffers owned by this [BooleanArray].
fn get_buffer_memory_size(&self) -> usize {
self.data.get_buffer_memory_size()
}

/// Returns the total number of bytes of memory occupied physically by this [BooleanArray].
fn get_array_memory_size(&self) -> usize {
self.data.get_array_memory_size() + mem::size_of_val(self)
}
}

impl From<Vec<bool>> for BooleanArray {
Expand Down
13 changes: 0 additions & 13 deletions arrow/src/array/array_dictionary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
use std::any::Any;
use std::fmt;
use std::iter::IntoIterator;
use std::mem;
use std::{convert::From, iter::FromIterator};

use super::{
Expand Down Expand Up @@ -209,18 +208,6 @@ impl<T: ArrowPrimitiveType> Array for DictionaryArray<T> {
fn data(&self) -> &ArrayData {
&self.data
}

fn get_buffer_memory_size(&self) -> usize {
// Since both `keys` and `values` derive (are references from) `data`, we only need to account for `data`.
self.data.get_buffer_memory_size()
}

fn get_array_memory_size(&self) -> usize {
self.data.get_array_memory_size()
+ self.keys.get_array_memory_size()
+ self.values.get_array_memory_size()
+ mem::size_of_val(self)
}
}

impl<T: ArrowPrimitiveType> fmt::Debug for DictionaryArray<T> {
Expand Down
23 changes: 0 additions & 23 deletions arrow/src/array/array_list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@

use std::any::Any;
use std::fmt;
use std::mem;

use num::Num;

Expand Down Expand Up @@ -261,16 +260,6 @@ impl<OffsetSize: 'static + OffsetSizeTrait> Array for GenericListArray<OffsetSiz
fn data(&self) -> &ArrayData {
&self.data
}

/// Returns the total number of bytes of memory occupied by the buffers owned by this [ListArray].
fn get_buffer_memory_size(&self) -> usize {
self.data.get_buffer_memory_size()
}

/// Returns the total number of bytes of memory occupied physically by this [ListArray].
fn get_array_memory_size(&self) -> usize {
self.data.get_array_memory_size() + mem::size_of_val(self)
}
}

impl<OffsetSize: OffsetSizeTrait> fmt::Debug for GenericListArray<OffsetSize> {
Expand Down Expand Up @@ -444,18 +433,6 @@ impl Array for FixedSizeListArray {
fn data(&self) -> &ArrayData {
&self.data
}

/// Returns the total number of bytes of memory occupied by the buffers owned by this [FixedSizeListArray].
fn get_buffer_memory_size(&self) -> usize {
self.data.get_buffer_memory_size() + self.values().get_buffer_memory_size()
}

/// Returns the total number of bytes of memory occupied physically by this [FixedSizeListArray].
fn get_array_memory_size(&self) -> usize {
self.data.get_array_memory_size()
+ self.values().get_array_memory_size()
+ mem::size_of_val(self)
}
}

impl fmt::Debug for FixedSizeListArray {
Expand Down
16 changes: 0 additions & 16 deletions arrow/src/array/array_primitive.rs
Original file line number Diff line number Diff line change
Expand Up @@ -150,16 +150,6 @@ impl<T: ArrowPrimitiveType> Array for PrimitiveArray<T> {
fn data(&self) -> &ArrayData {
&self.data
}

/// Returns the total number of bytes of memory occupied by the buffers owned by this [PrimitiveArray].
fn get_buffer_memory_size(&self) -> usize {
self.data.get_buffer_memory_size()
}

/// Returns the total number of bytes of memory occupied physically by this [PrimitiveArray].
fn get_array_memory_size(&self) -> usize {
self.data.get_array_memory_size() + mem::size_of::<RawPtrBox<T::Native>>()
}
}

fn as_datetime<T: ArrowPrimitiveType>(v: i64) -> Option<NaiveDateTime> {
Expand Down Expand Up @@ -508,9 +498,6 @@ mod tests {
assert!(arr.is_valid(i));
assert_eq!(i as i32, arr.value(i));
}

assert_eq!(64, arr.get_buffer_memory_size());
assert_eq!(136, arr.get_array_memory_size());
}

#[test]
Expand All @@ -530,9 +517,6 @@ mod tests {
assert!(!arr.is_valid(i));
}
}

assert_eq!(128, arr.get_buffer_memory_size());
assert_eq!(216, arr.get_array_memory_size());
}

#[test]
Expand Down
11 changes: 0 additions & 11 deletions arrow/src/array/array_string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@

use std::convert::From;
use std::fmt;
use std::mem;
use std::{any::Any, iter::FromIterator};

use super::{
Expand Down Expand Up @@ -286,16 +285,6 @@ impl<OffsetSize: StringOffsetSizeTrait> Array for GenericStringArray<OffsetSize>
fn data(&self) -> &ArrayData {
&self.data
}

/// Returns the total number of bytes of memory occupied by the buffers owned by this [$name].
fn get_buffer_memory_size(&self) -> usize {
self.data.get_buffer_memory_size()
}

/// Returns the total number of bytes of memory occupied physically by this [$name].
fn get_array_memory_size(&self) -> usize {
self.data.get_array_memory_size() + mem::size_of_val(self)
}
}

impl<OffsetSize: StringOffsetSizeTrait> From<ArrayData>
Expand Down
11 changes: 0 additions & 11 deletions arrow/src/array/array_struct.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ use std::any::Any;
use std::convert::{From, TryFrom};
use std::fmt;
use std::iter::IntoIterator;
use std::mem;

use super::{make_array, Array, ArrayData, ArrayRef};
use crate::datatypes::DataType;
Expand Down Expand Up @@ -178,16 +177,6 @@ impl Array for StructArray {
fn len(&self) -> usize {
self.data_ref().len()
}

/// Returns the total number of bytes of memory occupied by the buffers owned by this [StructArray].
fn get_buffer_memory_size(&self) -> usize {
self.data.get_buffer_memory_size()
}

/// Returns the total number of bytes of memory occupied physically by this [StructArray].
fn get_array_memory_size(&self) -> usize {
self.data.get_array_memory_size() + mem::size_of_val(self)
}
}

impl From<Vec<(Field, ArrayRef)>> for StructArray {
Expand Down