Skip to content

Commit

Permalink
Reduce memory usage of concat for (Large)Utf8 arrays (#348)
Browse files Browse the repository at this point in the history
* reduce memory needed for concat

* reuse code for str allocation buffer
  • Loading branch information
ritchie46 authored and alamb committed Jun 4, 2021
1 parent cd95d7c commit 36511f7
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 1 deletion.
41 changes: 40 additions & 1 deletion arrow/src/array/transform/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,13 @@ use crate::{
error::{ArrowError, Result},
util::bit_util,
};
use std::mem;

use super::{
data::{into_buffers, new_buffers},
ArrayData,
};
use crate::array::StringOffsetSizeTrait;

mod boolean;
mod fixed_binary;
Expand Down Expand Up @@ -324,6 +326,37 @@ fn build_extend_nulls(data_type: &DataType) -> ExtendNulls {
})
}

/// Builds the two buffers (offsets, values) for a (Large)Utf8 array with
/// exactly the capacity needed to hold the concatenation of `arrays`,
/// avoiding reallocation during the extend phase.
fn preallocate_str_buffer<Offset: StringOffsetSizeTrait>(
    capacity: usize,
    arrays: &[&ArrayData],
) -> [MutableBuffer; 2] {
    // The offsets buffer holds `capacity` offsets plus the leading zero.
    let mut offsets = MutableBuffer::new((capacity + 1) * mem::size_of::<Offset>());
    // safety: `unsafe` code assumes that this buffer is initialized with one element
    if Offset::is_large() {
        offsets.push(0i64);
    } else {
        offsets.push(0i32);
    }

    // Sum the number of value bytes each array actually contributes: the
    // value buffer's length minus the byte offset at which this array's
    // data starts (its first offset, after accounting for the array slice).
    let mut values_len = 0usize;
    for data in arrays {
        let value_buffer_len = data.buffers()[1].len();
        // `buffer::<Offset>(0)` is sliced to the array's offset, so index 0
        // is the byte position where this array's string data begins.
        let start = data.buffer::<Offset>(0)[0].to_usize().unwrap();
        values_len += value_buffer_len - start;
    }

    [offsets, MutableBuffer::new(values_len * mem::size_of::<u8>())]
}

impl<'a> MutableArrayData<'a> {
/// returns a new [MutableArrayData] with capacity to `capacity` slots and specialized to create an
/// [ArrayData] from multiple `arrays`.
Expand All @@ -341,7 +374,13 @@ impl<'a> MutableArrayData<'a> {
use_nulls = true;
};

let [buffer1, buffer2] = new_buffers(data_type, capacity);
// We can prevent reallocation by precomputing the needed size.
// This is faster and more memory efficient.
let [buffer1, buffer2] = match data_type {
DataType::LargeUtf8 => preallocate_str_buffer::<i64>(capacity, &arrays),
DataType::Utf8 => preallocate_str_buffer::<i32>(capacity, &arrays),
_ => new_buffers(data_type, capacity),
};

let child_data = match &data_type {
DataType::Null
Expand Down
19 changes: 19 additions & 0 deletions arrow/src/compute/kernels/concat.rs
Original file line number Diff line number Diff line change
Expand Up @@ -452,4 +452,23 @@ mod tests {
let concat = concat_dictionary(input_1, input_2);
assert_eq!(concat, expected);
}

#[test]
fn test_concat_string_sizes() -> Result<()> {
    // Two arrays of 150 x "foo" plus a small mixed array; the values
    // buffer of the result should be sized from the exact byte count.
    let first: LargeStringArray = (0..150).map(|_| Some("foo")).collect();
    let second: LargeStringArray = (0..150).map(|_| Some("foo")).collect();
    let third = LargeStringArray::from(vec![Some("foo"), Some("bar"), None, Some("baz")]);
    // 150 * 3 = 450
    // 150 * 3 = 450
    // 3 * 3 = 9
    // ------------+
    // 909
    // closest 64 byte aligned cap = 960

    let result = concat(&[&first, &second, &third])?;
    // this would have been 1280 if we did not precompute the value lengths.
    assert_eq!(result.data().buffers()[1].capacity(), 960);

    Ok(())
}
}

0 comments on commit 36511f7

Please sign in to comment.