Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ARROW-10946: [Rust] Simplified bit chunk iterator #8942

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion rust/arrow/src/buffer.rs
Expand Up @@ -183,7 +183,7 @@ impl Buffer {
/// in larger chunks and starting at arbitrary bit offsets.
/// Note that both `offset` and `length` are measured in bits.
pub fn bit_chunks(&self, offset: usize, len: usize) -> BitChunks {
BitChunks::new(&self, offset, len)
BitChunks::new(&self.data.as_slice()[self.offset..], offset, len)
}

/// Returns the number of 1-bits in this buffer.
Expand Down
20 changes: 7 additions & 13 deletions rust/arrow/src/util/bit_chunk_iterator.rs
Expand Up @@ -14,14 +14,12 @@
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use crate::buffer::Buffer;
use crate::util::bit_util::ceil;
use std::fmt::Debug;

#[derive(Debug)]
pub struct BitChunks<'a> {
buffer: &'a Buffer,
raw_data: *const u8,
buffer: &'a [u8],
/// offset inside a byte, guaranteed to be between 0 and 7 (inclusive)
bit_offset: usize,
/// number of complete u64 chunks
Expand All @@ -31,22 +29,19 @@ pub struct BitChunks<'a> {
}

impl<'a> BitChunks<'a> {
pub fn new(buffer: &'a Buffer, offset: usize, len: usize) -> Self {
pub fn new(buffer: &'a [u8], offset: usize, len: usize) -> Self {
assert!(ceil(offset + len, 8) <= buffer.len() * 8);

let byte_offset = offset / 8;
let bit_offset = offset % 8;

let raw_data = unsafe { buffer.raw_data().add(byte_offset) };

let chunk_bits = 8 * std::mem::size_of::<u64>();

let chunk_len = len / chunk_bits;
let remainder_len = len & (chunk_bits - 1);

BitChunks::<'a> {
buffer: &buffer,
raw_data,
buffer: &buffer[byte_offset..],
bit_offset,
chunk_len,
remainder_len,
Expand All @@ -56,8 +51,7 @@ impl<'a> BitChunks<'a> {

#[derive(Debug)]
pub struct BitChunkIterator<'a> {
buffer: &'a Buffer,
raw_data: *const u8,
buffer: &'a [u8],
bit_offset: usize,
chunk_len: usize,
index: usize,
Expand All @@ -83,7 +77,8 @@ impl<'a> BitChunks<'a> {
let byte_len = ceil(bit_len + bit_offset, 8);
// pointer to remainder bytes after all complete chunks
let base = unsafe {
self.raw_data
self.buffer
.as_ptr()
.add(self.chunk_len * std::mem::size_of::<u64>())
};

Expand All @@ -102,7 +97,6 @@ impl<'a> BitChunks<'a> {
pub const fn iter(&self) -> BitChunkIterator<'a> {
BitChunkIterator::<'a> {
buffer: self.buffer,
raw_data: self.raw_data,
bit_offset: self.bit_offset,
chunk_len: self.chunk_len,
index: 0,
Expand Down Expand Up @@ -131,7 +125,7 @@ impl Iterator for BitChunkIterator<'_> {

// cast to *const u64 should be fine since we are using read_unaligned below
#[allow(clippy::cast_ptr_alignment)]
let raw_data = self.raw_data as *const u64;
let raw_data = self.buffer.as_ptr() as *const u64;

// bit-packed buffers are stored starting with the least-significant byte first
// so when reading as u64 on a big-endian machine, the bytes need to be swapped
Expand Down