From cb3babc9d1a68fc7deffa3a1577a37da0cf443e1 Mon Sep 17 00:00:00 2001 From: Xiangpeng Hao Date: Wed, 10 Jul 2024 06:33:36 -0400 Subject: [PATCH] Improve performance reading `ByteViewArray` from parquet by removing an implicit copy (#6031) * update byte view array to not implicit copy * Add small comments --- parquet/src/arrow/array_reader/byte_view_array.rs | 11 ++++++++--- parquet/src/arrow/buffer/view_buffer.rs | 1 - 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/parquet/src/arrow/array_reader/byte_view_array.rs b/parquet/src/arrow/array_reader/byte_view_array.rs index dc4ce3f9c1b..d1a0313dc1e 100644 --- a/parquet/src/arrow/array_reader/byte_view_array.rs +++ b/parquet/src/arrow/array_reader/byte_view_array.rs @@ -71,7 +71,6 @@ struct ByteViewArrayReader { } impl ByteViewArrayReader { - #[allow(unused)] fn new( pages: Box, data_type: ArrowType, @@ -316,7 +315,10 @@ impl ByteViewArrayDecoderPlain { } pub fn read(&mut self, output: &mut ViewBuffer, len: usize) -> Result { - let block_id = output.append_block(self.buf.clone().into()); + // Here we convert `bytes::Bytes` into `arrow_buffer::Bytes`, which is zero copy + // Then we convert `arrow_buffer::Bytes` into `arrow_buffer::Buffer`, which is also zero copy + let buf = arrow_buffer::Buffer::from_bytes(self.buf.clone().into()); + let block_id = output.append_block(buf); let to_read = len.min(self.max_remaining_values); @@ -546,7 +548,10 @@ impl ByteViewArrayDecoderDeltaLength { let src_lengths = &self.lengths[self.length_offset..self.length_offset + to_read]; - let block_id = output.append_block(self.data.clone().into()); + // Here we convert `bytes::Bytes` into `arrow_buffer::Bytes`, which is zero copy + // Then we convert `arrow_buffer::Bytes` into `arrow_buffer::Buffer`, which is also zero copy + let bytes = arrow_buffer::Buffer::from_bytes(self.data.clone().into()); + let block_id = output.append_block(bytes); let mut current_offset = self.data_offset; let initial_offset = current_offset; diff 
--git a/parquet/src/arrow/buffer/view_buffer.rs b/parquet/src/arrow/buffer/view_buffer.rs index ae83ac31777..2256f4877d6 100644 --- a/parquet/src/arrow/buffer/view_buffer.rs +++ b/parquet/src/arrow/buffer/view_buffer.rs @@ -68,7 +68,6 @@ impl ViewBuffer { } /// Converts this into an [`ArrayRef`] with the provided `data_type` and `null_buffer` - #[allow(unused)] pub fn into_array(self, null_buffer: Option, data_type: &ArrowType) -> ArrayRef { let len = self.views.len(); let views = Buffer::from_vec(self.views);