Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 101 additions & 6 deletions arrow-buffer/src/buffer/mutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,10 @@ impl MutableBuffer {
/// Allocate a new [MutableBuffer] with initial capacity to be at least `capacity`.
///
/// See [`MutableBuffer::with_capacity`].
///
/// # Panics
///
/// See [`MutableBuffer::with_capacity`].
#[inline]
pub fn new(capacity: usize) -> Self {
Self::with_capacity(capacity)
Expand Down Expand Up @@ -156,6 +160,10 @@ impl MutableBuffer {
/// let data = buffer.as_slice_mut();
/// assert_eq!(data[126], 0u8);
/// ```
///
/// # Panics
///
/// Panics if `len` is too large to construct a valid allocation [`Layout`]
pub fn from_len_zeroed(len: usize) -> Self {
let layout = Layout::from_size_align(len, ALIGNMENT).unwrap();
let data = match layout.size() {
Expand Down Expand Up @@ -199,6 +207,10 @@ impl MutableBuffer {

/// creates a new [MutableBuffer] with capacity and length capable of holding `len` bits.
/// This is useful to create a buffer for packed bitmaps.
///
/// # Panics
///
/// See [`MutableBuffer::from_len_zeroed`].
pub fn new_null(len: usize) -> Self {
let num_bytes = bit_util::ceil(len, 8);
MutableBuffer::from_len_zeroed(num_bytes)
Expand All @@ -210,6 +222,10 @@ impl MutableBuffer {
/// This is useful when one wants to clear (or set) the bits and then manipulate
/// the buffer directly (e.g., modifying the buffer by holding a mutable reference
/// from `data_mut()`).
///
/// # Panics
///
/// Panics if `end` exceeds the buffer capacity.
pub fn with_bitset(mut self, end: usize, val: bool) -> Self {
assert!(end <= self.layout.size());
let v = if val { 255 } else { 0 };
Expand All @@ -225,6 +241,10 @@ impl MutableBuffer {
/// This is used to initialize the bits in a buffer, however, it has no impact on the
/// `len` of the buffer and so can be used to initialize the memory region from
/// `len` to `capacity`.
///
/// # Panics
///
/// Panics if the byte range `start..start + count` exceeds the buffer capacity.
pub fn set_null_bits(&mut self, start: usize, count: usize) {
assert!(
start.saturating_add(count) <= self.layout.size(),
Expand All @@ -250,11 +270,19 @@ impl MutableBuffer {
/// let buffer: Buffer = buffer.into();
/// assert_eq!(buffer.len(), 253);
/// ```
///
/// # Panics
///
/// Panics if `self.len + additional` overflows `usize`, or if the required capacity is too
/// large to round up to the next 64-byte boundary and construct a valid allocation layout.
// For performance reasons, this must be inlined so that the `if` is executed inside the caller, and not as an extra call that just
// exits.
#[inline(always)]
pub fn reserve(&mut self, additional: usize) {
let required_cap = self.len + additional;
let required_cap = self
.len
.checked_add(additional)
.expect("buffer length overflow");
if required_cap > self.layout.size() {
let new_capacity = bit_util::round_upto_multiple_of_64(required_cap);
let new_capacity = std::cmp::max(new_capacity, self.layout.size() * 2);
Expand All @@ -274,6 +302,12 @@ impl MutableBuffer {
/// buffer.repeat_slice_n_times(bytes_to_repeat, 3);
/// assert_eq!(buffer.as_slice(), b"ababab");
/// ```
///
/// # Panics
///
/// Panics if the repeated slice byte length overflows `usize`, if the resulting buffer
/// length overflows `usize`, or if reserving the required capacity fails for the same
/// reasons as [`MutableBuffer::reserve`].
pub fn repeat_slice_n_times<T: ArrowNativeType>(
&mut self,
slice_to_repeat: &[T],
Expand Down Expand Up @@ -391,6 +425,11 @@ impl MutableBuffer {
/// buffer.resize(253, 2); // allocates for the first time
/// assert_eq!(buffer.as_slice()[252], 2u8);
/// ```
///
/// # Panics
///
/// Panics if growing the buffer requires reserving a capacity that fails for the same
/// reasons as [`MutableBuffer::reserve`].
// For performance reasons, this must be inlined so that the `if` is executed inside the caller, and not as an extra call that just
// exits.
#[inline(always)]
Expand Down Expand Up @@ -426,6 +465,11 @@ impl MutableBuffer {
/// buffer.shrink_to_fit();
/// assert!(buffer.capacity() >= 64 && buffer.capacity() < 128);
/// ```
///
/// # Panics
///
/// Panics if the current length is too large to round up to the next 64-byte boundary and
/// construct a valid allocation layout.
pub fn shrink_to_fit(&mut self) {
let new_capacity = bit_util::round_upto_multiple_of_64(self.len);
if new_capacity < self.layout.size() {
Expand Down Expand Up @@ -505,8 +549,8 @@ impl MutableBuffer {
///
/// # Panics
///
/// This function panics if the underlying buffer is not aligned
/// correctly for type `T`.
/// This function panics if the underlying buffer is not aligned correctly for type `T`, or
/// if its length is not a multiple of `size_of::<T>()`.
pub fn typed_data_mut<T: ArrowNativeType>(&mut self) -> &mut [T] {
// SAFETY
// ArrowNativeType is trivially transmutable, is sealed to prevent potentially incorrect
Expand All @@ -520,8 +564,8 @@ impl MutableBuffer {
///
/// # Panics
///
/// This function panics if the underlying buffer is not aligned
/// correctly for type `T`.
/// This function panics if the underlying buffer is not aligned correctly for type `T`, or
/// if its length is not a multiple of `size_of::<T>()`.
pub fn typed_data<T: ArrowNativeType>(&self) -> &[T] {
// SAFETY
// ArrowNativeType is trivially transmutable, is sealed to prevent potentially incorrect
Expand All @@ -539,6 +583,11 @@ impl MutableBuffer {
/// buffer.extend_from_slice(&[2u32, 0]);
/// assert_eq!(buffer.len(), 8) // u32 has 4 bytes
/// ```
///
/// # Panics
///
/// Panics if extending the buffer requires reserving a capacity that fails for the same
/// reasons as [`MutableBuffer::reserve`].
#[inline]
pub fn extend_from_slice<T: ArrowNativeType>(&mut self, items: &[T]) {
let additional = mem::size_of_val(items);
Expand All @@ -562,6 +611,11 @@ impl MutableBuffer {
/// buffer.push(256u32);
/// assert_eq!(buffer.len(), 4) // u32 has 4 bytes
/// ```
///
/// # Panics
///
/// Panics if extending the buffer requires reserving a capacity that fails for the same
/// reasons as [`MutableBuffer::reserve`].
#[inline]
pub fn push<T: ToByteSlice>(&mut self, item: T) {
let additional = std::mem::size_of::<T>();
Expand All @@ -587,13 +641,26 @@ impl MutableBuffer {
}

/// Extends the buffer by `additional` bytes equal to `0u8`, incrementing its capacity if needed.
///
/// # Panics
///
/// Panics if `self.len + additional` overflows `usize`, or if growing the buffer requires
/// reserving a capacity that fails for the same reasons as [`MutableBuffer::reserve`].
#[inline]
pub fn extend_zeros(&mut self, additional: usize) {
self.resize(self.len + additional, 0);
let new_len = self
.len
.checked_add(additional)
.expect("buffer length overflow");
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Introducing internal panics is not ideal, tho clearly better than UB. How should library users code defensively to avoid panic, and how can we make their life easier?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree panics are not good and should be avoided when possible

We had a bit of a philosophical debate about this earlier (when to panic vs Error) and the conclusion we came to got codified in this doc, which I think is relevant here: https://github.com/apache/arrow-rs#guidelines-for-panic-vs-result

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Basically, do we force all downstream consumers to check for what is very likely an error that will never happen? I think the answer depends on opinion

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah... arguably the guidelines say we should be returning an error here (because asking for too many entries is a form of invalid input), but it certainly complicates the API. I don't know if there's a way to provide a fallible version of this API, for paranoid consumers to use?

I do agree it should be a very rare error, but I've also been unpleasantly surprised at how often 32-bit StringArray offsets blow up in practice.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are 64-bit values right? Probably ok to leave it as a panic because last I knew most hardware cannot physically index more than 48 bits of virtual memory and most operating systems cap the size of any one memory mapping to a few TB of contiguous virtual address space (even one not backed by memory).

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think usize is 64 bits on 64-bit architectures and 32 bits on 32-bit architectures

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do agree it should be a very rare error, but I've also been unpleasantly surprised at how often 32-bit StringArray offsets blow up in practice.

Yeah, i32 (2GB strings) is shockingly common

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know if there's a way to provide a fallible version of this API, for paranoid consumers to use?

I mean we could add a try_extend_zeros or something 🤔

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I agree it might be nice to add some try_ functions and then document that the current versions might panic.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I filed a ticket to consider adding new try_ variants

I also pushed a bunch of documentation updates to this PR to document that the APIs panic in certain cases

self.resize(new_len, 0);
}

/// # Safety
/// The caller must ensure that the buffer was properly initialized up to `len`.
///
/// # Panics
///
/// Panics if `len` exceeds the buffer capacity.
#[inline]
pub unsafe fn set_len(&mut self, len: usize) {
assert!(len <= self.capacity());
Expand Down Expand Up @@ -641,6 +708,13 @@ impl MutableBuffer {
/// `offset` indicates the starting offset in bits in this buffer to begin writing to
/// and must be less than or equal to the current length of this buffer.
/// All bits not written to (but readable due to byte alignment) will be zeroed out.
///
/// # Panics
///
/// Panics if `iter` does not report an exact size via `size_hint`, or if it yields fewer
/// items than reported, or if extending the buffer requires reserving a capacity that fails
/// for the same reasons as [`MutableBuffer::reserve`].
///
/// # Safety
/// Callers must ensure that `iter` reports an exact size via `size_hint`.
#[inline]
Expand Down Expand Up @@ -866,6 +940,13 @@ impl MutableBuffer {
/// let buffer = unsafe { MutableBuffer::from_trusted_len_iter(iter) };
/// assert_eq!(buffer.len(), 4) // u32 has 4 bytes
/// ```
///
/// # Panics
///
/// Panics if the iterator does not report an upper bound via `size_hint`, or if the
/// reported length does not match the number of items produced, or if allocating the
/// required buffer fails for the same reasons as [`MutableBuffer::new`].
///
/// # Safety
/// This method assumes that the iterator's size is correct and is undefined behavior
/// to use it on an iterator that reports an incorrect length.
Expand Down Expand Up @@ -910,6 +991,12 @@ impl MutableBuffer {
/// let buffer = unsafe { MutableBuffer::from_trusted_len_iter_bool(iter) };
/// assert_eq!(buffer.len(), 1) // 3 booleans have 1 byte
/// ```
///
/// # Panics
///
/// Panics if the iterator does not report an upper bound via `size_hint`, or if it yields
/// fewer items than reported.
///
/// # Safety
/// This method assumes that the iterator's size is correct and is undefined behavior
/// to use it on an iterator that reports an incorrect length.
Expand All @@ -928,6 +1015,14 @@ impl MutableBuffer {
/// Creates a [`MutableBuffer`] from an [`Iterator`] with a trusted (upper) length or errors
/// if any of the items of the iterator is an error.
/// Prefer this to `collect` whenever possible, as it is faster ~60% faster.
///
/// # Panics
///
/// Panics if the iterator does not report an upper bound via `size_hint`, or if the
/// reported length does not match the number of items produced before an error-free finish,
/// or if allocating the required buffer fails for the same reasons as
/// [`MutableBuffer::new`].
///
/// # Safety
/// This method assumes that the iterator's size is correct and is undefined behavior
/// to use it on an iterator that reports an incorrect length.
Expand Down
24 changes: 24 additions & 0 deletions arrow-buffer/src/builder/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -439,4 +439,28 @@ mod tests {
let slice = builder.as_slice_mut();
assert_eq!(slice.len(), 222);
}

#[test]
#[should_panic(expected = "buffer length overflow")]
fn reserve_length_overflow() {
let mut builder = BufferBuilder::<u8>::new(1);
builder.append(0);
builder.reserve(usize::MAX);
}

#[test]
#[should_panic(expected = "buffer length overflow")]
fn append_n_zeroed_length_overflow() {
let mut builder = BufferBuilder::<u64>::new(1);
builder.append_n_zeroed(1);
builder.append_n_zeroed(usize::MAX / mem::size_of::<u64>());
}

#[test]
#[should_panic(expected = "buffer length overflow")]
fn advance_length_overflow() {
let mut builder = BufferBuilder::<u64>::new(1);
builder.advance(1);
builder.advance(usize::MAX / mem::size_of::<u64>());
}
}
Loading