From efb78836fa2fc1b45563db31f19bbd761d825d0d Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Fri, 25 Dec 2020 08:14:07 +0000 Subject: [PATCH] Optimized creation of string array from iterator. --- rust/arrow/src/array/array_string.rs | 33 +++++++++++++++------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/rust/arrow/src/array/array_string.rs b/rust/arrow/src/array/array_string.rs index 38a9dfc50ede9..28a587d7374e5 100644 --- a/rust/arrow/src/array/array_string.rs +++ b/rust/arrow/src/array/array_string.rs @@ -123,25 +123,28 @@ impl GenericStringArray { } pub(crate) fn from_vec(v: Vec<&str>) -> Self { - let mut offsets = Vec::with_capacity(v.len() + 1); - let mut values = Vec::new(); + let mut offsets = + MutableBuffer::new((v.len() + 1) * std::mem::size_of::<OffsetSize>()); + let mut values = MutableBuffer::new(0); + let mut length_so_far = OffsetSize::zero(); - offsets.push(length_so_far); + offsets.extend_from_slice(length_so_far.to_byte_slice()); + for s in &v { length_so_far += OffsetSize::from_usize(s.len()).unwrap(); - offsets.push(length_so_far); + offsets.extend_from_slice(length_so_far.to_byte_slice()); values.extend_from_slice(s.as_bytes()); } let array_data = ArrayData::builder(OffsetSize::DATA_TYPE) .len(v.len()) - .add_buffer(Buffer::from(offsets.to_byte_slice())) - .add_buffer(Buffer::from(&values[..])) + .add_buffer(offsets.into()) + .add_buffer(values.into()) .build(); Self::from(array_data) } pub(crate) fn from_opt_vec(v: Vec<Option<&str>>) -> Self { - GenericStringArray::from_iter(v.into_iter()) + v.into_iter().collect() } } @@ -155,32 +158,32 @@ where let (_, data_len) = iter.size_hint(); let data_len = data_len.expect("Iterator must be sized"); // panic if no upper bound. 
- let mut offsets = Vec::with_capacity(data_len + 1); - let mut values = Vec::new(); + let mut offsets = + MutableBuffer::new((data_len + 1) * std::mem::size_of::<OffsetSize>()); + let mut values = MutableBuffer::new(0); let mut null_buf = MutableBuffer::new_null(data_len); + let null_slice = null_buf.as_slice_mut(); let mut length_so_far = OffsetSize::zero(); - offsets.push(length_so_far); + offsets.extend_from_slice(length_so_far.to_byte_slice()); for (i, s) in iter.enumerate() { if let Some(s) = s { let s = s.as_ref(); // set null bit - let null_slice = null_buf.as_slice_mut(); bit_util::set_bit(null_slice, i); length_so_far += OffsetSize::from_usize(s.len()).unwrap(); - offsets.push(length_so_far); values.extend_from_slice(s.as_bytes()); } else { - offsets.push(length_so_far); values.extend_from_slice(b""); } + offsets.extend_from_slice(length_so_far.to_byte_slice()); } let array_data = ArrayData::builder(OffsetSize::DATA_TYPE) .len(data_len) - .add_buffer(Buffer::from(offsets.to_byte_slice())) - .add_buffer(Buffer::from(&values[..])) + .add_buffer(offsets.into()) + .add_buffer(values.into()) .null_bit_buffer(null_buf.into()) .build(); Self::from(array_data)