Skip to content

Commit

Permalink
reuse code for str allocation buffer
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed May 26, 2021
1 parent 9b30c2b commit b3bfb5d
Showing 1 changed file with 34 additions and 48 deletions.
82 changes: 34 additions & 48 deletions arrow/src/array/transform/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ use super::{
data::{into_buffers, new_buffers},
ArrayData,
};
use crate::array::StringOffsetSizeTrait;

mod boolean;
mod fixed_binary;
Expand Down Expand Up @@ -325,6 +326,37 @@ fn build_extend_nulls(data_type: &DataType) -> ExtendNulls {
})
}

/// Pre-allocates the two buffers (offsets and string values) used when building a
/// string array out of `arrays`, so that later extends do not need to reallocate.
///
/// * `capacity` - number of slots the offsets buffer is sized for; one additional
///   offset is reserved for the leading zero.
/// * `arrays` - the source arrays; buffer `0` of each is read as `Offset` values and
///   buffer `1` as the string bytes.
///
/// Returns `[offsets, values]`. The offsets buffer already contains a single zero
/// offset. The values buffer is empty, with capacity for the estimated total number
/// of string bytes.
///
/// The values size is only a capacity hint: an array whose offsets buffer is empty,
/// or whose first offset cannot be converted to `usize` or exceeds the values buffer
/// length, contributes a fallback estimate instead of causing a panic.
///
/// # Panics
///
/// Panics if any array in `arrays` has fewer than two buffers.
fn preallocate_str_buffer<Offset: StringOffsetSizeTrait>(
    capacity: usize,
    arrays: &[&ArrayData],
) -> [MutableBuffer; 2] {
    // offsets
    let mut offsets = MutableBuffer::new((1 + capacity) * mem::size_of::<Offset>());
    // safety: `unsafe` code assumes that this buffer is initialized with one element
    if Offset::is_large() {
        offsets.push(0i64);
    } else {
        offsets.push(0i32);
    }

    let str_values_size = arrays
        .iter()
        .map(|data| {
            // get the length of the value buffer
            let buf_len = data.buffers()[1].len();
            // find the offset of the buffer
            // this returns a slice of offsets, starting from the offset of the array
            // so we can take the first value; an empty offsets buffer (possible for a
            // zero-length array) is treated as starting at offset 0
            let first_offset = data
                .buffer::<Offset>(0)
                .first()
                .and_then(|offset| offset.to_usize())
                .unwrap_or(0);
            // never underflow: this is only used to size an allocation
            buf_len.saturating_sub(first_offset)
        })
        .sum::<usize>();

    [
        offsets,
        MutableBuffer::new(str_values_size * mem::size_of::<u8>()),
    ]
}

impl<'a> MutableArrayData<'a> {
/// returns a new [MutableArrayData] with capacity to `capacity` slots and specialized to create an
/// [ArrayData] from multiple `arrays`.
Expand All @@ -345,54 +377,8 @@ impl<'a> MutableArrayData<'a> {
// We can prevent reallocation by precomputing the needed size.
// This is faster and more memory efficient.
let [buffer1, buffer2] = match data_type {
DataType::LargeUtf8 => {
// offsets
let mut buffer =
MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
// safety: `unsafe` code assumes that this buffer is initialized with one element
buffer.push(0i64);
let str_values_size = arrays
.iter()
.map(|data| {
// get the length of the value buffer
let buf_len = data.buffers()[1].len();
// find the offset of the buffer
// this returns a slice of offsets, starting from the offset of the array
// so we can take the first value
let offset = data.buffer::<i64>(0)[0];
buf_len - offset as usize
})
.sum::<usize>();

[
buffer,
MutableBuffer::new(str_values_size * mem::size_of::<u8>()),
]
}
DataType::Utf8 => {
// offsets
let mut buffer =
MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
// safety: `unsafe` code assumes that this buffer is initialized with one element
buffer.push(0i32);
let str_values_size = arrays
.iter()
.map(|data| {
// get the length of the value buffer
let buf_len = data.buffers()[1].len();
// find the offset of the buffer
// this returns a slice of offsets, starting from the offset of the array
// so we can take the first value
let offset = data.buffer::<i32>(0)[0];
buf_len - offset as usize
})
.sum::<usize>();

[
buffer,
MutableBuffer::new(str_values_size * mem::size_of::<u8>()),
]
}
DataType::LargeUtf8 => preallocate_str_buffer::<i64>(capacity, &arrays),
DataType::Utf8 => preallocate_str_buffer::<i32>(capacity, &arrays),
_ => new_buffers(data_type, capacity),
};

Expand Down

0 comments on commit b3bfb5d

Please sign in to comment.