Skip to content

Commit

Permalink
ARROW-5935: [C++] ArrayBuilder::type() should be kept accurate
Browse files Browse the repository at this point in the history
`ArrayBuilder::type()` should always be identical to the type of the array which would be produced by `ArrayBuilder::Finish()`, even for builders whose type may change. This includes:
- adaptive int builders, which may increase the size of integers built
- dictionary builders, which may increase the size of their indices
- union builders, which may append a new child
- nested builders whose children include a builder with mutable type

Closes #4930 from bkietz/5935-ArrayBuilders-with-mutable-type-are-not- and squashes the following commits:

7e8730b <Benjamin Kietzman> explicit constructor
61ec25c <Benjamin Kietzman> also revert change to AdaptiveUIntBuilder::Append
4dfb55b <Benjamin Kietzman> revert change to AdaptiveIntBuilder::Append
7c91b0c <Benjamin Kietzman> msvc: explicit cast to int
9d3f17f <Benjamin Kietzman> msvc: explicit cast to int
2d0a776 <Benjamin Kietzman> correct BasicUnionBuilder::type()
b1483fa <Benjamin Kietzman> revert modernization of union_ and struct_
3337566 <Benjamin Kietzman> correct StructBuilder::type()
a932317 <Benjamin Kietzman> fix broken union builder::type() method
f4e1cc5 <Benjamin Kietzman> fix broken usage of dereferenced type
9022d70 <Benjamin Kietzman> refactor to virtual ArrayBuilder::type()
6d3e703 <Benjamin Kietzman> add assert(type_ != nullptr), test for struct(union)
b39ea30 <Benjamin Kietzman> lint: nullptr in headers
4e35165 <Benjamin Kietzman> add explicit parens
08d2c8f <Benjamin Kietzman> remove redundant switch in FinishInternal
721e782 <Benjamin Kietzman> refactor to less frantic type updating
3593b4b <Benjamin Kietzman> iwyu: <utility>
25bc3b5 <Benjamin Kietzman> refactor to keep ArrayBuilder::type_ correct
29e3b73 <Benjamin Kietzman> revert change to ArrayBuilder::Advance()
0707c57 <Benjamin Kietzman> loop with const reference
1a81ce4 <Benjamin Kietzman> builders with mutable type will report this via type_ == nullptr

Authored-by: Benjamin Kietzman <bengilgit@gmail.com>
Signed-off-by: Neal Richardson <neal.p.richardson@gmail.com>
  • Loading branch information
bkietz authored and nealrichardson committed Sep 19, 2019
1 parent 7ef8e05 commit d4e489d
Show file tree
Hide file tree
Showing 23 changed files with 415 additions and 330 deletions.
142 changes: 62 additions & 80 deletions cpp/src/arrow/array/builder_adaptive.cc
Expand Up @@ -35,13 +35,7 @@ namespace arrow {

using internal::AdaptiveIntBuilderBase;

AdaptiveIntBuilderBase::AdaptiveIntBuilderBase(MemoryPool* pool)
: ArrayBuilder(int64(), pool),
data_(nullptr),
raw_data_(nullptr),
int_size_(1),
pending_pos_(0),
pending_has_nulls_(false) {}
AdaptiveIntBuilderBase::AdaptiveIntBuilderBase(MemoryPool* pool) : ArrayBuilder(pool) {}

void AdaptiveIntBuilderBase::Reset() {
ArrayBuilder::Reset();
Expand All @@ -66,34 +60,60 @@ Status AdaptiveIntBuilderBase::Resize(int64_t capacity) {
return ArrayBuilder::Resize(capacity);
}

AdaptiveIntBuilder::AdaptiveIntBuilder(MemoryPool* pool) : AdaptiveIntBuilderBase(pool) {}

Status AdaptiveIntBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
RETURN_NOT_OK(CommitPendingData());
std::shared_ptr<DataType> AdaptiveUIntBuilder::type() const {
auto int_size = int_size_;
if (pending_pos_ != 0) {
const uint8_t* valid_bytes = pending_has_nulls_ ? pending_valid_ : nullptr;
int_size =
internal::DetectUIntWidth(pending_data_, valid_bytes, pending_pos_, int_size_);
}
switch (int_size) {
case 1:
return uint8();
case 2:
return uint16();
case 4:
return uint32();
case 8:
return uint64();
default:
DCHECK(false);
}
return nullptr;
}

std::shared_ptr<DataType> output_type;
switch (int_size_) {
std::shared_ptr<DataType> AdaptiveIntBuilder::type() const {
auto int_size = int_size_;
if (pending_pos_ != 0) {
const uint8_t* valid_bytes = pending_has_nulls_ ? pending_valid_ : nullptr;
int_size = internal::DetectIntWidth(reinterpret_cast<const int64_t*>(pending_data_),
valid_bytes, pending_pos_, int_size_);
}
switch (int_size) {
case 1:
output_type = int8();
break;
return int8();
case 2:
output_type = int16();
break;
return int16();
case 4:
output_type = int32();
break;
return int32();
case 8:
output_type = int64();
break;
return int64();
default:
return Status::NotImplemented("Only ints of size 1,2,4,8 are supported");
DCHECK(false);
}
return nullptr;
}

AdaptiveIntBuilder::AdaptiveIntBuilder(MemoryPool* pool) : AdaptiveIntBuilderBase(pool) {}

Status AdaptiveIntBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
RETURN_NOT_OK(CommitPendingData());

std::shared_ptr<Buffer> null_bitmap;
RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
RETURN_NOT_OK(TrimBuffer(length_ * int_size_, data_.get()));

*out = ArrayData::Make(output_type, length_, {null_bitmap, data_}, null_count_);
*out = ArrayData::Make(type(), length_, {null_bitmap, data_}, null_count_);

data_ = nullptr;
capacity_ = length_ = null_count_ = 0;
Expand Down Expand Up @@ -191,9 +211,8 @@ AdaptiveIntBuilder::ExpandIntSizeInternal() {
return Status::OK();
}

#define __LESS(a, b) (a) < (b)
template <typename new_type, typename old_type>
typename std::enable_if<__LESS(sizeof(old_type), sizeof(new_type)), Status>::type
typename std::enable_if<(sizeof(old_type) < sizeof(new_type)), Status>::type
AdaptiveIntBuilder::ExpandIntSizeInternal() {
int_size_ = sizeof(new_type);
RETURN_NOT_OK(Resize(data_->size() / sizeof(old_type)));
Expand All @@ -207,23 +226,18 @@ AdaptiveIntBuilder::ExpandIntSizeInternal() {

return Status::OK();
}
#undef __LESS

template <typename new_type>
Status AdaptiveIntBuilder::ExpandIntSizeN() {
switch (int_size_) {
case 1:
RETURN_NOT_OK((ExpandIntSizeInternal<new_type, int8_t>()));
break;
return ExpandIntSizeInternal<new_type, int8_t>();
case 2:
RETURN_NOT_OK((ExpandIntSizeInternal<new_type, int16_t>()));
break;
return ExpandIntSizeInternal<new_type, int16_t>();
case 4:
RETURN_NOT_OK((ExpandIntSizeInternal<new_type, int32_t>()));
break;
return ExpandIntSizeInternal<new_type, int32_t>();
case 8:
RETURN_NOT_OK((ExpandIntSizeInternal<new_type, int64_t>()));
break;
return ExpandIntSizeInternal<new_type, int64_t>();
default:
DCHECK(false);
}
Expand All @@ -233,17 +247,13 @@ Status AdaptiveIntBuilder::ExpandIntSizeN() {
Status AdaptiveIntBuilder::ExpandIntSize(uint8_t new_int_size) {
switch (new_int_size) {
case 1:
RETURN_NOT_OK((ExpandIntSizeN<int8_t>()));
break;
return ExpandIntSizeN<int8_t>();
case 2:
RETURN_NOT_OK((ExpandIntSizeN<int16_t>()));
break;
return ExpandIntSizeN<int16_t>();
case 4:
RETURN_NOT_OK((ExpandIntSizeN<int32_t>()));
break;
return ExpandIntSizeN<int32_t>();
case 8:
RETURN_NOT_OK((ExpandIntSizeN<int64_t>()));
break;
return ExpandIntSizeN<int64_t>();
default:
DCHECK(false);
}
Expand All @@ -256,29 +266,11 @@ AdaptiveUIntBuilder::AdaptiveUIntBuilder(MemoryPool* pool)
Status AdaptiveUIntBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
RETURN_NOT_OK(CommitPendingData());

std::shared_ptr<DataType> output_type;
switch (int_size_) {
case 1:
output_type = uint8();
break;
case 2:
output_type = uint16();
break;
case 4:
output_type = uint32();
break;
case 8:
output_type = uint64();
break;
default:
return Status::NotImplemented("Only ints of size 1,2,4,8 are supported");
}

std::shared_ptr<Buffer> null_bitmap;
RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
RETURN_NOT_OK(TrimBuffer(length_ * int_size_, data_.get()));

*out = ArrayData::Make(output_type, length_, {null_bitmap, data_}, null_count_);
*out = ArrayData::Make(type(), length_, {null_bitmap, data_}, null_count_);

data_ = nullptr;
capacity_ = length_ = null_count_ = 0;
Expand Down Expand Up @@ -346,9 +338,8 @@ AdaptiveUIntBuilder::ExpandIntSizeInternal() {
return Status::OK();
}

#define __LESS(a, b) (a) < (b)
template <typename new_type, typename old_type>
typename std::enable_if<__LESS(sizeof(old_type), sizeof(new_type)), Status>::type
typename std::enable_if<(sizeof(old_type) < sizeof(new_type)), Status>::type
AdaptiveUIntBuilder::ExpandIntSizeInternal() {
int_size_ = sizeof(new_type);
RETURN_NOT_OK(Resize(data_->size() / sizeof(old_type)));
Expand All @@ -361,23 +352,18 @@ AdaptiveUIntBuilder::ExpandIntSizeInternal() {

return Status::OK();
}
#undef __LESS

template <typename new_type>
Status AdaptiveUIntBuilder::ExpandIntSizeN() {
switch (int_size_) {
case 1:
RETURN_NOT_OK((ExpandIntSizeInternal<new_type, uint8_t>()));
break;
return ExpandIntSizeInternal<new_type, uint8_t>();
case 2:
RETURN_NOT_OK((ExpandIntSizeInternal<new_type, uint16_t>()));
break;
return ExpandIntSizeInternal<new_type, uint16_t>();
case 4:
RETURN_NOT_OK((ExpandIntSizeInternal<new_type, uint32_t>()));
break;
return ExpandIntSizeInternal<new_type, uint32_t>();
case 8:
RETURN_NOT_OK((ExpandIntSizeInternal<new_type, uint64_t>()));
break;
return ExpandIntSizeInternal<new_type, uint64_t>();
default:
DCHECK(false);
}
Expand All @@ -387,17 +373,13 @@ Status AdaptiveUIntBuilder::ExpandIntSizeN() {
Status AdaptiveUIntBuilder::ExpandIntSize(uint8_t new_int_size) {
switch (new_int_size) {
case 1:
RETURN_NOT_OK((ExpandIntSizeN<uint8_t>()));
break;
return ExpandIntSizeN<uint8_t>();
case 2:
RETURN_NOT_OK((ExpandIntSizeN<uint16_t>()));
break;
return ExpandIntSizeN<uint16_t>();
case 4:
RETURN_NOT_OK((ExpandIntSizeN<uint32_t>()));
break;
return ExpandIntSizeN<uint32_t>();
case 8:
RETURN_NOT_OK((ExpandIntSizeN<uint64_t>()));
break;
return ExpandIntSizeN<uint64_t>();
default:
DCHECK(false);
}
Expand Down
20 changes: 10 additions & 10 deletions cpp/src/arrow/array/builder_adaptive.h
Expand Up @@ -58,14 +58,14 @@ class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder {
virtual Status CommitPendingData() = 0;

std::shared_ptr<ResizableBuffer> data_;
uint8_t* raw_data_;
uint8_t int_size_;
uint8_t* raw_data_ = NULLPTR;
uint8_t int_size_ = sizeof(uint8_t);

static constexpr int32_t pending_size_ = 1024;
uint8_t pending_valid_[pending_size_];
uint64_t pending_data_[pending_size_];
int32_t pending_pos_;
bool pending_has_nulls_;
int32_t pending_pos_ = 0;
bool pending_has_nulls_ = false;
};

} // namespace internal
Expand Down Expand Up @@ -100,6 +100,8 @@ class ARROW_EXPORT AdaptiveUIntBuilder : public internal::AdaptiveIntBuilderBase

Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

std::shared_ptr<DataType> type() const override;

protected:
Status CommitPendingData() override;
Status ExpandIntSize(uint8_t new_int_size);
Expand All @@ -110,11 +112,9 @@ class ARROW_EXPORT AdaptiveUIntBuilder : public internal::AdaptiveIntBuilderBase
template <typename new_type, typename old_type>
typename std::enable_if<sizeof(old_type) >= sizeof(new_type), Status>::type
ExpandIntSizeInternal();
#define __LESS(a, b) (a) < (b)
template <typename new_type, typename old_type>
typename std::enable_if<__LESS(sizeof(old_type), sizeof(new_type)), Status>::type
typename std::enable_if<(sizeof(old_type) < sizeof(new_type)), Status>::type
ExpandIntSizeInternal();
#undef __LESS

template <typename new_type>
Status ExpandIntSizeN();
Expand Down Expand Up @@ -152,6 +152,8 @@ class ARROW_EXPORT AdaptiveIntBuilder : public internal::AdaptiveIntBuilderBase

Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

std::shared_ptr<DataType> type() const override;

protected:
Status CommitPendingData() override;
Status ExpandIntSize(uint8_t new_int_size);
Expand All @@ -162,11 +164,9 @@ class ARROW_EXPORT AdaptiveIntBuilder : public internal::AdaptiveIntBuilderBase
template <typename new_type, typename old_type>
typename std::enable_if<sizeof(old_type) >= sizeof(new_type), Status>::type
ExpandIntSizeInternal();
#define __LESS(a, b) (a) < (b)
template <typename new_type, typename old_type>
typename std::enable_if<__LESS(sizeof(old_type), sizeof(new_type)), Status>::type
typename std::enable_if<(sizeof(old_type) < sizeof(new_type)), Status>::type
ExpandIntSizeInternal();
#undef __LESS

template <typename new_type>
Status ExpandIntSizeN();
Expand Down
7 changes: 3 additions & 4 deletions cpp/src/arrow/array/builder_base.h
Expand Up @@ -53,8 +53,7 @@ constexpr int64_t kListMaximumElements = std::numeric_limits<int32_t>::max() - 1
/// For example, ArrayBuilder* pointing to BinaryBuilder should be downcast before use.
class ARROW_EXPORT ArrayBuilder {
public:
explicit ArrayBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
: type_(type), pool_(pool), null_bitmap_builder_(pool) {}
explicit ArrayBuilder(MemoryPool* pool) : pool_(pool), null_bitmap_builder_(pool) {}

virtual ~ArrayBuilder() = default;

Expand Down Expand Up @@ -124,7 +123,8 @@ class ARROW_EXPORT ArrayBuilder {
/// \return Status
Status Finish(std::shared_ptr<Array>* out);

std::shared_ptr<DataType> type() const { return type_; }
/// \brief Return the type of the built Array
virtual std::shared_ptr<DataType> type() const = 0;

protected:
/// Append to null bitmap
Expand Down Expand Up @@ -202,7 +202,6 @@ class ARROW_EXPORT ArrayBuilder {
return Status::OK();
}

std::shared_ptr<DataType> type_;
MemoryPool* pool_;

TypedBufferBuilder<bool> null_bitmap_builder_;
Expand Down
17 changes: 2 additions & 15 deletions cpp/src/arrow/array/builder_binary.cc
Expand Up @@ -40,25 +40,12 @@ namespace arrow {

using internal::checked_cast;

// ----------------------------------------------------------------------
// String and binary

BinaryBuilder::BinaryBuilder(MemoryPool* pool) : BaseBinaryBuilder(binary(), pool) {}

StringBuilder::StringBuilder(MemoryPool* pool) : BinaryBuilder(utf8(), pool) {}

LargeBinaryBuilder::LargeBinaryBuilder(MemoryPool* pool)
: BaseBinaryBuilder(large_binary(), pool) {}

LargeStringBuilder::LargeStringBuilder(MemoryPool* pool)
: LargeBinaryBuilder(large_utf8(), pool) {}

// ----------------------------------------------------------------------
// Fixed width binary

FixedSizeBinaryBuilder::FixedSizeBinaryBuilder(const std::shared_ptr<DataType>& type,
MemoryPool* pool)
: ArrayBuilder(type, pool),
: ArrayBuilder(pool),
byte_width_(checked_cast<const FixedSizeBinaryType&>(*type).byte_width()),
byte_builder_(pool) {}

Expand Down Expand Up @@ -103,7 +90,7 @@ Status FixedSizeBinaryBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {

std::shared_ptr<Buffer> null_bitmap;
RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
*out = ArrayData::Make(type_, length_, {null_bitmap, data}, null_count_);
*out = ArrayData::Make(type(), length_, {null_bitmap, data}, null_count_);

capacity_ = length_ = null_count_ = 0;
return Status::OK();
Expand Down

0 comments on commit d4e489d

Please sign in to comment.