Skip to content

Commit

Permalink
GH-36417: [C++] Add Buffer::data_as, Buffer::mutable_data_as (#36418)
Browse files Browse the repository at this point in the history
### Rationale for this change

There's a lot of boilerplate in casting buffer bytes. A templated accessor will decrease that.

### What changes are included in this PR?

- `{Buffer, BufferBuilder, BufferSpan}::{data_as<T>, mutable_data_as<T>}`
- rewriting a few files' worth of reinterpret_casts to use these accessors to showcase usage
- `Buffer::FromVector`, a std::vector analog to Buffer::FromString
- some tangential cleanup of unnecessary copies in the Array classes

### Are these changes tested?

Not really. The rewritten files exercise the new accessors

### Are there any user-facing changes?

No, but I've noticed `Buffer::operator[]` again and that should probably be deprecated.

* Closes: #36417

Authored-by: Benjamin Kietzman <bengilgit@gmail.com>
Signed-off-by: Antoine Pitrou <antoine@python.org>
  • Loading branch information
bkietz committed Jul 12, 2023
1 parent f18d8b3 commit 95a8bfb
Show file tree
Hide file tree
Showing 12 changed files with 141 additions and 71 deletions.
4 changes: 2 additions & 2 deletions cpp/src/arrow/array/array_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ class ARROW_EXPORT Array {
/// \see GetNullCount
int64_t ComputeLogicalNullCount() const;

std::shared_ptr<DataType> type() const { return data_->type; }
const std::shared_ptr<DataType>& type() const { return data_->type; }
Type::type type_id() const { return data_->type->id(); }

/// Buffer for the validity (null) bitmap, if any. Note that Union types
Expand Down Expand Up @@ -251,7 +251,7 @@ class ARROW_EXPORT PrimitiveArray : public FlatArray {
int64_t null_count = kUnknownNullCount, int64_t offset = 0);

/// Does not account for any slice offset
std::shared_ptr<Buffer> values() const { return data_->buffers[1]; }
const std::shared_ptr<Buffer>& values() const { return data_->buffers[1]; }

protected:
PrimitiveArray() : raw_values_(NULLPTR) {}
Expand Down
5 changes: 3 additions & 2 deletions cpp/src/arrow/array/array_dict.cc
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ using internal::CopyBitmap;
// ----------------------------------------------------------------------
// DictionaryArray

std::shared_ptr<Array> DictionaryArray::indices() const { return indices_; }
const std::shared_ptr<Array>& DictionaryArray::indices() const { return indices_; }

int64_t DictionaryArray::GetValueIndex(int64_t i) const {
const uint8_t* indices_data = data_->buffers[1]->data();
Expand Down Expand Up @@ -106,8 +106,9 @@ DictionaryArray::DictionaryArray(const std::shared_ptr<DataType>& type,
SetData(data);
}

std::shared_ptr<Array> DictionaryArray::dictionary() const {
const std::shared_ptr<Array>& DictionaryArray::dictionary() const {
if (!dictionary_) {
// TODO(GH-36503) this isn't thread safe
dictionary_ = MakeArray(data_->dictionary);
}
return dictionary_;
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/arrow/array/array_dict.h
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,8 @@ class ARROW_EXPORT DictionaryArray : public Array {

/// \brief Return the dictionary for this array, which is stored as
/// a member of the ArrayData internal structure
std::shared_ptr<Array> dictionary() const;
std::shared_ptr<Array> indices() const;
const std::shared_ptr<Array>& dictionary() const;
const std::shared_ptr<Array>& indices() const;

/// \brief Return the ith value of indices, cast to int64_t. Not recommended
/// for use in performance-sensitive code. Does not validate whether the
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/arrow/array/array_nested.cc
Original file line number Diff line number Diff line change
Expand Up @@ -470,11 +470,11 @@ const FixedSizeListType* FixedSizeListArray::list_type() const {
return checked_cast<const FixedSizeListType*>(data_->type.get());
}

std::shared_ptr<DataType> FixedSizeListArray::value_type() const {
const std::shared_ptr<DataType>& FixedSizeListArray::value_type() const {
return list_type()->value_type();
}

std::shared_ptr<Array> FixedSizeListArray::values() const { return values_; }
const std::shared_ptr<Array>& FixedSizeListArray::values() const { return values_; }

Result<std::shared_ptr<Array>> FixedSizeListArray::FromArrays(
const std::shared_ptr<Array>& values, int32_t list_size) {
Expand Down
18 changes: 9 additions & 9 deletions cpp/src/arrow/array/array_nested.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,12 +71,12 @@ class BaseListArray : public Array {
/// \brief Return array object containing the list's values
///
/// Note that this buffer does not account for any slice offset or length.
std::shared_ptr<Array> values() const { return values_; }
const std::shared_ptr<Array>& values() const { return values_; }

/// Note that this buffer does not account for any slice offset or length.
std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; }
const std::shared_ptr<Buffer>& value_offsets() const { return data_->buffers[1]; }

std::shared_ptr<DataType> value_type() const { return list_type_->value_type(); }
const std::shared_ptr<DataType>& value_type() const { return list_type_->value_type(); }

/// Return pointer to raw value offsets accounting for any slice offset
const offset_type* raw_value_offsets() const {
Expand Down Expand Up @@ -269,10 +269,10 @@ class ARROW_EXPORT MapArray : public ListArray {
const MapType* map_type() const { return map_type_; }

/// \brief Return array object containing all map keys
std::shared_ptr<Array> keys() const { return keys_; }
const std::shared_ptr<Array>& keys() const { return keys_; }

/// \brief Return array object containing all mapped items
std::shared_ptr<Array> items() const { return items_; }
const std::shared_ptr<Array>& items() const { return items_; }

/// Validate child data before constructing the actual MapArray.
static Status ValidateChildData(
Expand Down Expand Up @@ -310,9 +310,9 @@ class ARROW_EXPORT FixedSizeListArray : public Array {
const FixedSizeListType* list_type() const;

/// \brief Return array object containing the list's values
std::shared_ptr<Array> values() const;
const std::shared_ptr<Array>& values() const;

std::shared_ptr<DataType> value_type() const;
const std::shared_ptr<DataType>& value_type() const;

// The following functions will not perform boundschecking
int64_t value_offset(int64_t i) const {
Expand Down Expand Up @@ -432,7 +432,7 @@ class ARROW_EXPORT UnionArray : public Array {
using type_code_t = int8_t;

/// Note that this buffer does not account for any slice offset
std::shared_ptr<Buffer> type_codes() const { return data_->buffers[1]; }
const std::shared_ptr<Buffer>& type_codes() const { return data_->buffers[1]; }

const type_code_t* raw_type_codes() const { return raw_type_codes_ + data_->offset; }

Expand Down Expand Up @@ -571,7 +571,7 @@ class ARROW_EXPORT DenseUnionArray : public UnionArray {
}

/// Note that this buffer does not account for any slice offset
std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[2]; }
const std::shared_ptr<Buffer>& value_offsets() const { return data_->buffers[2]; }

int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; }

Expand Down
9 changes: 9 additions & 0 deletions cpp/src/arrow/array/data.h
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,15 @@ struct ARROW_EXPORT BufferSpan {
int64_t size = 0;
// Pointer back to buffer that owns this memory
const std::shared_ptr<Buffer>* owner = NULLPTR;

template <typename T>
const T* data_as() const {
return reinterpret_cast<const T*>(data);
}
template <typename T>
T* mutable_data_as() {
return reinterpret_cast<T*>(data);
}
};

/// \brief EXPERIMENTAL: A non-owning ArrayData reference that is cheaply
Expand Down
14 changes: 9 additions & 5 deletions cpp/src/arrow/buffer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include <cstdint>
#include <utility>

#include "arrow/memory_pool_internal.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/util/bit_util.h"
Expand All @@ -43,6 +44,8 @@ Result<std::shared_ptr<Buffer>> Buffer::CopySlice(const int64_t start,
return std::move(new_buffer);
}

Buffer::Buffer() : Buffer(memory_pool::internal::kZeroSizeArea, 0) {}

namespace {

Status CheckBufferSlice(const Buffer& buffer, int64_t offset, int64_t length) {
Expand Down Expand Up @@ -147,11 +150,12 @@ Result<std::shared_ptr<Buffer>> Buffer::ViewOrCopy(

class StlStringBuffer : public Buffer {
public:
explicit StlStringBuffer(std::string data)
: Buffer(nullptr, 0), input_(std::move(data)) {
data_ = reinterpret_cast<const uint8_t*>(input_.c_str());
size_ = static_cast<int64_t>(input_.size());
capacity_ = size_;
explicit StlStringBuffer(std::string data) : input_(std::move(data)) {
if (!input_.empty()) {
data_ = reinterpret_cast<const uint8_t*>(input_.c_str());
size_ = static_cast<int64_t>(input_.size());
capacity_ = size_;
}
}

private:
Expand Down
53 changes: 49 additions & 4 deletions cpp/src/arrow/buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ namespace arrow {
/// The following invariant is always true: Size <= Capacity
class ARROW_EXPORT Buffer {
public:
ARROW_DISALLOW_COPY_AND_ASSIGN(Buffer);

/// \brief Construct from buffer and size without copying memory
///
/// \param[in] data a memory buffer
Expand Down Expand Up @@ -136,6 +138,32 @@ class ARROW_EXPORT Buffer {
/// \return a new Buffer instance
static std::shared_ptr<Buffer> FromString(std::string data);

/// \brief Construct an immutable buffer that takes ownership of the contents
/// of an std::vector (without copying it). Only vectors of TrivialType objects
/// (integers, floating point numbers, ...) can be wrapped by this function.
///
/// \param[in] vec a vector to own
/// \return a new Buffer instance
template <typename T>
static std::shared_ptr<Buffer> FromVector(std::vector<T> vec) {
static_assert(std::is_trivial_v<T>,
"Buffer::FromVector can only wrap vectors of trivial objects");

if (vec.empty()) {
return std::shared_ptr<Buffer>{new Buffer()};
}

auto* data = reinterpret_cast<uint8_t*>(vec.data());
auto size_in_bytes = static_cast<int64_t>(vec.size() * sizeof(T));
return std::shared_ptr<Buffer>{
new Buffer{data, size_in_bytes},
// Keep the vector's buffer alive inside the shared_ptr's destructor until after
// we have deleted the Buffer. Note we can't use this trick in FromString since
// std::string's data is inline for short strings so moving invalidates pointers
// into the string's buffer.
[vec = std::move(vec)](Buffer* buffer) { delete buffer; }};
}

/// \brief Create buffer referencing typed memory with some length without
/// copying
/// \param[in] data the typed memory as C array
Expand Down Expand Up @@ -182,6 +210,15 @@ class ARROW_EXPORT Buffer {
return ARROW_PREDICT_TRUE(is_cpu_) ? data_ : NULLPTR;
}

/// \brief Return a pointer to the buffer's data cast to a specific type
///
/// The buffer has to be a CPU buffer (`is_cpu()` is true).
/// Otherwise, an assertion may be thrown or a null pointer may be returned.
template <typename T>
const T* data_as() const {
return reinterpret_cast<const T*>(data());
}

/// \brief Return a writable pointer to the buffer's data
///
/// The buffer has to be a mutable CPU buffer (`is_cpu()` and `is_mutable()`
Expand All @@ -199,6 +236,16 @@ class ARROW_EXPORT Buffer {
: NULLPTR;
}

/// \brief Return a writable pointer to the buffer's data cast to a specific type
///
/// The buffer has to be a mutable CPU buffer (`is_cpu()` and `is_mutable()`
/// are true). Otherwise, an assertion may be thrown or a null pointer may
/// be returned.
template <typename T>
T* mutable_data_as() {
return reinterpret_cast<T*>(mutable_data());
}

/// \brief Return the device address of the buffer's data
uintptr_t address() const { return reinterpret_cast<uintptr_t>(data_); }

Expand Down Expand Up @@ -298,17 +345,15 @@ class ARROW_EXPORT Buffer {
std::shared_ptr<MemoryManager> memory_manager_;

protected:
Buffer();

void CheckMutable() const;
void CheckCPU() const;

void SetMemoryManager(std::shared_ptr<MemoryManager> mm) {
memory_manager_ = std::move(mm);
is_cpu_ = memory_manager_->is_cpu();
}

private:
Buffer() = delete;
ARROW_DISALLOW_COPY_AND_ASSIGN(Buffer);
};

/// \defgroup buffer-slicing-functions Functions for slicing buffers
Expand Down
14 changes: 14 additions & 0 deletions cpp/src/arrow/buffer_builder.h
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,11 @@ class ARROW_EXPORT BufferBuilder {
return Status::OK();
}

/// \brief Append the given data to the buffer
///
/// The buffer is automatically expanded if necessary.
Status Append(std::string_view v) { return Append(v.data(), v.size()); }

/// \brief Append copies of a value to the buffer
///
/// The buffer is automatically expanded if necessary.
Expand All @@ -138,6 +143,7 @@ class ARROW_EXPORT BufferBuilder {
memcpy(data_ + size_, data, static_cast<size_t>(length));
size_ += length;
}
void UnsafeAppend(std::string_view v) { UnsafeAppend(v.data(), v.size()); }

void UnsafeAppend(const int64_t num_copies, uint8_t value) {
memset(data_ + size_, value, static_cast<size_t>(num_copies));
Expand Down Expand Up @@ -196,6 +202,14 @@ class ARROW_EXPORT BufferBuilder {
int64_t length() const { return size_; }
const uint8_t* data() const { return data_; }
uint8_t* mutable_data() { return data_; }
template <typename T>
const T* data_as() const {
return reinterpret_cast<const T*>(data_);
}
template <typename T>
T* mutable_data_as() {
return reinterpret_cast<T*>(data_);
}

private:
std::shared_ptr<ResizableBuffer> buffer_;
Expand Down

0 comments on commit 95a8bfb

Please sign in to comment.