Skip to content

Commit

Permalink
Reduce memory usage of concat for (Large)Utf8 arrays (#348)
Browse files Browse the repository at this point in the history
* reduce memory needed for concat

* reuse code for str allocation buffer
  • Loading branch information
ritchie46 authored and alamb committed Jun 4, 2021
1 parent cd95d7c commit 36511f7
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 1 deletion.
41 changes: 40 additions & 1 deletion arrow/src/array/transform/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,13 @@ use crate::{
error::{ArrowError, Result},
util::bit_util,
};
use std::mem;

use super::{
data::{into_buffers, new_buffers},
ArrayData,
};
use crate::array::StringOffsetSizeTrait;

mod boolean;
mod fixed_binary;
Expand Down Expand Up @@ -324,6 +326,37 @@ fn build_extend_nulls(data_type: &DataType) -> ExtendNulls {
})
}

/// Builds the two buffers (offsets, values) for a (Large)Utf8 array with
/// exactly the capacity needed to hold the concatenation of `arrays`,
/// avoiding reallocation during the extend phase.
fn preallocate_str_buffer<Offset: StringOffsetSizeTrait>(
    capacity: usize,
    arrays: &[&ArrayData],
) -> [MutableBuffer; 2] {
    // The offsets buffer holds `capacity` offsets plus the leading zero.
    let mut offsets = MutableBuffer::new((capacity + 1) * mem::size_of::<Offset>());
    // safety: `unsafe` code assumes that this buffer is initialized with one element
    if Offset::is_large() {
        offsets.push(0i64);
    } else {
        offsets.push(0i32);
    }

    // Sum the number of value bytes each array actually contributes: the
    // value buffer's length minus the byte offset at which this array's
    // data starts (its first offset, after accounting for the array slice).
    let mut values_len = 0usize;
    for data in arrays {
        let value_buffer_len = data.buffers()[1].len();
        // `buffer::<Offset>(0)` is sliced to the array's offset, so index 0
        // is the byte position where this array's string data begins.
        let start = data.buffer::<Offset>(0)[0].to_usize().unwrap();
        values_len += value_buffer_len - start;
    }

    [offsets, MutableBuffer::new(values_len * mem::size_of::<u8>())]
}

impl<'a> MutableArrayData<'a> {
/// returns a new [MutableArrayData] with capacity to `capacity` slots and specialized to create an
/// [ArrayData] from multiple `arrays`.
Expand All @@ -341,7 +374,13 @@ impl<'a> MutableArrayData<'a> {
use_nulls = true;
};

let [buffer1, buffer2] = new_buffers(data_type, capacity);
// We can prevent reallocation by precomputing the needed size.
// This is faster and more memory efficient.
let [buffer1, buffer2] = match data_type {
DataType::LargeUtf8 => preallocate_str_buffer::<i64>(capacity, &arrays),
DataType::Utf8 => preallocate_str_buffer::<i32>(capacity, &arrays),
_ => new_buffers(data_type, capacity),
};

let child_data = match &data_type {
DataType::Null
Expand Down
19 changes: 19 additions & 0 deletions arrow/src/compute/kernels/concat.rs
Original file line number Diff line number Diff line change
Expand Up @@ -452,4 +452,23 @@ mod tests {
let concat = concat_dictionary(input_1, input_2);
assert_eq!(concat, expected);
}

#[test]
fn test_concat_string_sizes() -> Result<()> {
    // Two arrays of 150 x "foo" plus a small mixed array; the values
    // buffer of the result should be sized from the exact byte count.
    let first: LargeStringArray = (0..150).map(|_| Some("foo")).collect();
    let second: LargeStringArray = (0..150).map(|_| Some("foo")).collect();
    let third = LargeStringArray::from(vec![Some("foo"), Some("bar"), None, Some("baz")]);
    // 150 * 3 = 450
    // 150 * 3 = 450
    // 3 * 3 = 9
    // ------------+
    // 909
    // closest 64 byte aligned cap = 960

    let result = concat(&[&first, &second, &third])?;
    // this would have been 1280 if we did not precompute the value lengths.
    assert_eq!(result.data().buffers()[1].capacity(), 960);

    Ok(())
}
}

0 comments on commit 36511f7

Please sign in to comment.