diff --git a/cpp/src/arrow/array/builder_adaptive.cc b/cpp/src/arrow/array/builder_adaptive.cc index 76183ab0d9fb6..c537b0e5c38fc 100644 --- a/cpp/src/arrow/array/builder_adaptive.cc +++ b/cpp/src/arrow/array/builder_adaptive.cc @@ -35,13 +35,7 @@ namespace arrow { using internal::AdaptiveIntBuilderBase; -AdaptiveIntBuilderBase::AdaptiveIntBuilderBase(MemoryPool* pool) - : ArrayBuilder(int64(), pool), - data_(nullptr), - raw_data_(nullptr), - int_size_(1), - pending_pos_(0), - pending_has_nulls_(false) {} +AdaptiveIntBuilderBase::AdaptiveIntBuilderBase(MemoryPool* pool) : ArrayBuilder(pool) {} void AdaptiveIntBuilderBase::Reset() { ArrayBuilder::Reset(); @@ -66,34 +60,60 @@ Status AdaptiveIntBuilderBase::Resize(int64_t capacity) { return ArrayBuilder::Resize(capacity); } -AdaptiveIntBuilder::AdaptiveIntBuilder(MemoryPool* pool) : AdaptiveIntBuilderBase(pool) {} - -Status AdaptiveIntBuilder::FinishInternal(std::shared_ptr* out) { - RETURN_NOT_OK(CommitPendingData()); +std::shared_ptr AdaptiveUIntBuilder::type() const { + auto int_size = int_size_; + if (pending_pos_ != 0) { + const uint8_t* valid_bytes = pending_has_nulls_ ? pending_valid_ : nullptr; + int_size = + internal::DetectUIntWidth(pending_data_, valid_bytes, pending_pos_, int_size_); + } + switch (int_size) { + case 1: + return uint8(); + case 2: + return uint16(); + case 4: + return uint32(); + case 8: + return uint64(); + default: + DCHECK(false); + } + return nullptr; +} - std::shared_ptr output_type; - switch (int_size_) { +std::shared_ptr AdaptiveIntBuilder::type() const { + auto int_size = int_size_; + if (pending_pos_ != 0) { + const uint8_t* valid_bytes = pending_has_nulls_ ? pending_valid_ : nullptr; + int_size = internal::DetectIntWidth(reinterpret_cast(pending_data_), + valid_bytes, pending_pos_, int_size_); + } + switch (int_size) { case 1: - output_type = int8(); - break; + return int8(); case 2: - output_type = int16(); - break; + return int16(); case 4: - output_type = int32(); - break; + return int32(); case 8: - output_type = int64(); - break; + return int64(); default: - return Status::NotImplemented("Only ints of size 1,2,4,8 are supported"); + DCHECK(false); } + return nullptr; +} + +AdaptiveIntBuilder::AdaptiveIntBuilder(MemoryPool* pool) : AdaptiveIntBuilderBase(pool) {} + +Status AdaptiveIntBuilder::FinishInternal(std::shared_ptr* out) { + RETURN_NOT_OK(CommitPendingData()); std::shared_ptr null_bitmap; RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); RETURN_NOT_OK(TrimBuffer(length_ * int_size_, data_.get())); - *out = ArrayData::Make(output_type, length_, {null_bitmap, data_}, null_count_); + *out = ArrayData::Make(type(), length_, {null_bitmap, data_}, null_count_); data_ = nullptr; capacity_ = length_ = null_count_ = 0; @@ -191,9 +211,8 @@ AdaptiveIntBuilder::ExpandIntSizeInternal() { return Status::OK(); } -#define __LESS(a, b) (a) < (b) template -typename std::enable_if<__LESS(sizeof(old_type), sizeof(new_type)), Status>::type +typename std::enable_if<(sizeof(old_type) < sizeof(new_type)), Status>::type AdaptiveIntBuilder::ExpandIntSizeInternal() { int_size_ = sizeof(new_type); RETURN_NOT_OK(Resize(data_->size() / sizeof(old_type))); @@ -207,23 +226,18 @@ AdaptiveIntBuilder::ExpandIntSizeInternal() { return Status::OK(); } -#undef __LESS template Status AdaptiveIntBuilder::ExpandIntSizeN() { switch (int_size_) { case 1: - RETURN_NOT_OK((ExpandIntSizeInternal())); - break; + return ExpandIntSizeInternal(); case 2: - RETURN_NOT_OK((ExpandIntSizeInternal())); - break; + return ExpandIntSizeInternal(); case 4: - RETURN_NOT_OK((ExpandIntSizeInternal())); - break; + return ExpandIntSizeInternal(); case 8: - RETURN_NOT_OK((ExpandIntSizeInternal())); - break; + return ExpandIntSizeInternal(); default: DCHECK(false); } @@ -233,17 +247,13 @@ Status AdaptiveIntBuilder::ExpandIntSizeN() { Status AdaptiveIntBuilder::ExpandIntSize(uint8_t new_int_size) { switch (new_int_size) { case 1: - RETURN_NOT_OK((ExpandIntSizeN())); - break; + return ExpandIntSizeN(); case 2: - RETURN_NOT_OK((ExpandIntSizeN())); - break; + return ExpandIntSizeN(); case 4: - RETURN_NOT_OK((ExpandIntSizeN())); - break; + return ExpandIntSizeN(); case 8: - RETURN_NOT_OK((ExpandIntSizeN())); - break; + return ExpandIntSizeN(); default: DCHECK(false); } @@ -256,29 +266,11 @@ AdaptiveUIntBuilder::AdaptiveUIntBuilder(MemoryPool* pool) Status AdaptiveUIntBuilder::FinishInternal(std::shared_ptr* out) { RETURN_NOT_OK(CommitPendingData()); - std::shared_ptr output_type; - switch (int_size_) { - case 1: - output_type = uint8(); - break; - case 2: - output_type = uint16(); - break; - case 4: - output_type = uint32(); - break; - case 8: - output_type = uint64(); - break; - default: - return Status::NotImplemented("Only ints of size 1,2,4,8 are supported"); - } - std::shared_ptr null_bitmap; RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); RETURN_NOT_OK(TrimBuffer(length_ * int_size_, data_.get())); - *out = ArrayData::Make(output_type, length_, {null_bitmap, data_}, null_count_); + *out = ArrayData::Make(type(), length_, {null_bitmap, data_}, null_count_); data_ = nullptr; capacity_ = length_ = null_count_ = 0; @@ -346,9 +338,8 @@ AdaptiveUIntBuilder::ExpandIntSizeInternal() { return Status::OK(); } -#define __LESS(a, b) (a) < (b) template -typename std::enable_if<__LESS(sizeof(old_type), sizeof(new_type)), Status>::type +typename std::enable_if<(sizeof(old_type) < sizeof(new_type)), Status>::type AdaptiveUIntBuilder::ExpandIntSizeInternal() { int_size_ = sizeof(new_type); RETURN_NOT_OK(Resize(data_->size() / sizeof(old_type))); @@ -361,23 +352,18 @@ AdaptiveUIntBuilder::ExpandIntSizeInternal() { return Status::OK(); } -#undef __LESS template Status AdaptiveUIntBuilder::ExpandIntSizeN() { switch (int_size_) { case 1: - RETURN_NOT_OK((ExpandIntSizeInternal())); - break; + return ExpandIntSizeInternal(); case 2: - RETURN_NOT_OK((ExpandIntSizeInternal())); - break; + return ExpandIntSizeInternal(); case 4: - RETURN_NOT_OK((ExpandIntSizeInternal())); - break; + return ExpandIntSizeInternal(); case 8: - RETURN_NOT_OK((ExpandIntSizeInternal())); - break; + return ExpandIntSizeInternal(); default: DCHECK(false); } @@ -387,17 +373,13 @@ Status AdaptiveUIntBuilder::ExpandIntSizeN() { Status AdaptiveUIntBuilder::ExpandIntSize(uint8_t new_int_size) { switch (new_int_size) { case 1: - RETURN_NOT_OK((ExpandIntSizeN())); - break; + return ExpandIntSizeN(); case 2: - RETURN_NOT_OK((ExpandIntSizeN())); - break; + return ExpandIntSizeN(); case 4: - RETURN_NOT_OK((ExpandIntSizeN())); - break; + return ExpandIntSizeN(); case 8: - RETURN_NOT_OK((ExpandIntSizeN())); - break; + return ExpandIntSizeN(); default: DCHECK(false); } diff --git a/cpp/src/arrow/array/builder_adaptive.h b/cpp/src/arrow/array/builder_adaptive.h index 7f24109526b4b..ae205eb1667a2 100644 --- a/cpp/src/arrow/array/builder_adaptive.h +++ b/cpp/src/arrow/array/builder_adaptive.h @@ -58,14 +58,14 @@ class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder { virtual Status CommitPendingData() = 0; std::shared_ptr data_; - uint8_t* raw_data_; - uint8_t int_size_; + uint8_t* raw_data_ = NULLPTR; + uint8_t int_size_ = sizeof(uint8_t); static constexpr int32_t pending_size_ = 1024; uint8_t pending_valid_[pending_size_]; uint64_t pending_data_[pending_size_]; - int32_t pending_pos_; - bool pending_has_nulls_; + int32_t pending_pos_ = 0; + bool pending_has_nulls_ = false; }; } // namespace internal @@ -100,6 +100,8 @@ class ARROW_EXPORT AdaptiveUIntBuilder : public internal::AdaptiveIntBuilderBase Status FinishInternal(std::shared_ptr* out) override; + std::shared_ptr type() const override; + protected: Status CommitPendingData() override; Status ExpandIntSize(uint8_t new_int_size); @@ -110,11 +112,9 @@ class ARROW_EXPORT AdaptiveUIntBuilder : public internal::AdaptiveIntBuilderBase template typename std::enable_if= sizeof(new_type), Status>::type ExpandIntSizeInternal(); -#define __LESS(a, b) (a) < (b) template - typename std::enable_if<__LESS(sizeof(old_type), sizeof(new_type)), Status>::type + typename std::enable_if<(sizeof(old_type) < sizeof(new_type)), Status>::type ExpandIntSizeInternal(); -#undef __LESS template Status ExpandIntSizeN(); @@ -152,6 +152,8 @@ class ARROW_EXPORT AdaptiveIntBuilder : public internal::AdaptiveIntBuilderBase Status FinishInternal(std::shared_ptr* out) override; + std::shared_ptr type() const override; + protected: Status CommitPendingData() override; Status ExpandIntSize(uint8_t new_int_size); @@ -162,11 +164,9 @@ class ARROW_EXPORT AdaptiveIntBuilder : public internal::AdaptiveIntBuilderBase template typename std::enable_if= sizeof(new_type), Status>::type ExpandIntSizeInternal(); -#define __LESS(a, b) (a) < (b) template - typename std::enable_if<__LESS(sizeof(old_type), sizeof(new_type)), Status>::type + typename std::enable_if<(sizeof(old_type) < sizeof(new_type)), Status>::type ExpandIntSizeInternal(); -#undef __LESS template Status ExpandIntSizeN(); diff --git a/cpp/src/arrow/array/builder_base.h b/cpp/src/arrow/array/builder_base.h index a2b9f598d48ce..915ade5f6d26a 100644 --- a/cpp/src/arrow/array/builder_base.h +++ b/cpp/src/arrow/array/builder_base.h @@ -53,8 +53,7 @@ constexpr int64_t kListMaximumElements = std::numeric_limits::max() - 1 /// For example, ArrayBuilder* pointing to BinaryBuilder should be downcast before use. class ARROW_EXPORT ArrayBuilder { public: - explicit ArrayBuilder(const std::shared_ptr& type, MemoryPool* pool) - : type_(type), pool_(pool), null_bitmap_builder_(pool) {} + explicit ArrayBuilder(MemoryPool* pool) : pool_(pool), null_bitmap_builder_(pool) {} virtual ~ArrayBuilder() = default; @@ -124,7 +123,8 @@ class ARROW_EXPORT ArrayBuilder { /// \return Status Status Finish(std::shared_ptr* out); - std::shared_ptr type() const { return type_; } + /// \brief Return the type of the built Array + virtual std::shared_ptr type() const = 0; protected: /// Append to null bitmap @@ -202,7 +202,6 @@ class ARROW_EXPORT ArrayBuilder { return Status::OK(); } - std::shared_ptr type_; MemoryPool* pool_; TypedBufferBuilder null_bitmap_builder_; diff --git a/cpp/src/arrow/array/builder_binary.cc b/cpp/src/arrow/array/builder_binary.cc index 8a9296ee6248e..8c079305549a5 100644 --- a/cpp/src/arrow/array/builder_binary.cc +++ b/cpp/src/arrow/array/builder_binary.cc @@ -40,25 +40,12 @@ namespace arrow { using internal::checked_cast; -// ---------------------------------------------------------------------- -// String and binary - -BinaryBuilder::BinaryBuilder(MemoryPool* pool) : BaseBinaryBuilder(binary(), pool) {} - -StringBuilder::StringBuilder(MemoryPool* pool) : BinaryBuilder(utf8(), pool) {} - -LargeBinaryBuilder::LargeBinaryBuilder(MemoryPool* pool) - : BaseBinaryBuilder(large_binary(), pool) {} - -LargeStringBuilder::LargeStringBuilder(MemoryPool* pool) - : LargeBinaryBuilder(large_utf8(), pool) {} - // ---------------------------------------------------------------------- // Fixed width binary FixedSizeBinaryBuilder::FixedSizeBinaryBuilder(const std::shared_ptr& type, MemoryPool* pool) - : ArrayBuilder(type, pool), + : ArrayBuilder(pool), byte_width_(checked_cast(*type).byte_width()), byte_builder_(pool) {} @@ -103,7 +90,7 @@ Status FixedSizeBinaryBuilder::FinishInternal(std::shared_ptr* out) { std::shared_ptr null_bitmap; RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); - *out = ArrayData::Make(type_, length_, {null_bitmap, data}, null_count_); + *out = ArrayData::Make(type(), length_, {null_bitmap, data}, null_count_); capacity_ = length_ = null_count_ = 0; return Status::OK(); diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index 921ae9e9d8040..25e0045a1293a 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -46,8 +46,11 @@ class BaseBinaryBuilder : public ArrayBuilder { using TypeClass = TYPE; using offset_type = typename TypeClass::offset_type; + explicit BaseBinaryBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) + : ArrayBuilder(pool), offsets_builder_(pool), value_data_builder_(pool) {} + BaseBinaryBuilder(const std::shared_ptr& type, MemoryPool* pool) - : ArrayBuilder(type, pool), offsets_builder_(pool), value_data_builder_(pool) {} + : BaseBinaryBuilder(pool) {} Status Append(const uint8_t* value, offset_type length) { ARROW_RETURN_NOT_OK(Reserve(1)); @@ -260,7 +263,7 @@ class BaseBinaryBuilder : public ArrayBuilder { ARROW_RETURN_NOT_OK(value_data_builder_.Finish(&value_data)); ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); - *out = ArrayData::Make(type_, length_, {null_bitmap, offsets, value_data}, + *out = ArrayData::Make(type(), length_, {null_bitmap, offsets, value_data}, null_count_, 0); Reset(); return Status::OK(); @@ -333,7 +336,7 @@ class BaseBinaryBuilder : public ArrayBuilder { /// \brief Builder class for variable-length binary data class ARROW_EXPORT BinaryBuilder : public BaseBinaryBuilder { public: - explicit BinaryBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + using BaseBinaryBuilder::BaseBinaryBuilder; /// \cond FALSE using ArrayBuilder::Finish; @@ -341,8 +344,7 @@ class ARROW_EXPORT BinaryBuilder : public BaseBinaryBuilder { Status Finish(std::shared_ptr* out) { return FinishTyped(out); } - protected: - using BaseBinaryBuilder::BaseBinaryBuilder; + std::shared_ptr type() const override { return binary(); } }; /// \class StringBuilder @@ -350,20 +352,21 @@ class ARROW_EXPORT BinaryBuilder : public BaseBinaryBuilder { class ARROW_EXPORT StringBuilder : public BinaryBuilder { public: using BinaryBuilder::BinaryBuilder; - explicit StringBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); /// \cond FALSE using ArrayBuilder::Finish; /// \endcond Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + std::shared_ptr type() const override { return utf8(); } }; /// \class LargeBinaryBuilder /// \brief Builder class for large variable-length binary data class ARROW_EXPORT LargeBinaryBuilder : public BaseBinaryBuilder { public: - explicit LargeBinaryBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + using BaseBinaryBuilder::BaseBinaryBuilder; /// \cond FALSE using ArrayBuilder::Finish; @@ -371,8 +374,7 @@ class ARROW_EXPORT LargeBinaryBuilder : public BaseBinaryBuilder* out) { return FinishTyped(out); } - protected: - using BaseBinaryBuilder::BaseBinaryBuilder; + std::shared_ptr type() const override { return large_binary(); } }; /// \class LargeStringBuilder @@ -380,13 +382,14 @@ class ARROW_EXPORT LargeBinaryBuilder : public BaseBinaryBuilder* out) { return FinishTyped(out); } + + std::shared_ptr type() const override { return large_utf8(); } }; // ---------------------------------------------------------------------- @@ -484,6 +487,10 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { return std::numeric_limits::max() - 1; } + std::shared_ptr type() const override { + return fixed_size_binary(byte_width_); + } + protected: int32_t byte_width_; BufferBuilder byte_builder_; diff --git a/cpp/src/arrow/array/builder_decimal.cc b/cpp/src/arrow/array/builder_decimal.cc index 1086cb4001dc1..228a19ae3eb8d 100644 --- a/cpp/src/arrow/array/builder_decimal.cc +++ b/cpp/src/arrow/array/builder_decimal.cc @@ -44,7 +44,8 @@ namespace arrow { Decimal128Builder::Decimal128Builder(const std::shared_ptr& type, MemoryPool* pool) - : FixedSizeBinaryBuilder(type, pool) {} + : FixedSizeBinaryBuilder(type, pool), + decimal_type_(internal::checked_pointer_cast(type)) {} Status Decimal128Builder::Append(Decimal128 value) { RETURN_NOT_OK(FixedSizeBinaryBuilder::Reserve(1)); @@ -68,7 +69,7 @@ Status Decimal128Builder::FinishInternal(std::shared_ptr* out) { std::shared_ptr null_bitmap; RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); - *out = ArrayData::Make(type_, length_, {null_bitmap, data}, null_count_); + *out = ArrayData::Make(type(), length_, {null_bitmap, data}, null_count_); capacity_ = length_ = null_count_ = 0; return Status::OK(); } diff --git a/cpp/src/arrow/array/builder_decimal.h b/cpp/src/arrow/array/builder_decimal.h index 1b8d3b4951e09..9379549a0f967 100644 --- a/cpp/src/arrow/array/builder_decimal.h +++ b/cpp/src/arrow/array/builder_decimal.h @@ -48,6 +48,11 @@ class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder { /// \endcond Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + + std::shared_ptr type() const override { return decimal_type_; } + + protected: + std::shared_ptr decimal_type_; }; using DecimalBuilder = Decimal128Builder; diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h index 4a93d9540a596..665867a2a9365 100644 --- a/cpp/src/arrow/array/builder_dict.h +++ b/cpp/src/arrow/array/builder_dict.h @@ -104,24 +104,26 @@ class DictionaryBuilderBase : public ArrayBuilder { template DictionaryBuilderBase( typename std::enable_if::value, - const std::shared_ptr&>::type type, + const std::shared_ptr&>::type value_type, MemoryPool* pool = default_memory_pool()) - : ArrayBuilder(type, pool), - memo_table_(new DictionaryMemoTable(type)), + : ArrayBuilder(pool), + memo_table_(new internal::DictionaryMemoTable(value_type)), delta_offset_(0), byte_width_(-1), - values_builder_(pool) {} + indices_builder_(pool), + value_type_(value_type) {} template explicit DictionaryBuilderBase( typename std::enable_if::value, - const std::shared_ptr&>::type type, + const std::shared_ptr&>::type value_type, MemoryPool* pool = default_memory_pool()) - : ArrayBuilder(type, pool), - memo_table_(new DictionaryMemoTable(type)), + : ArrayBuilder(pool), + memo_table_(new internal::DictionaryMemoTable(value_type)), delta_offset_(0), - byte_width_(static_cast(*type).byte_width()), - values_builder_(pool) {} + byte_width_(static_cast(*value_type).byte_width()), + indices_builder_(pool), + value_type_(value_type) {} template explicit DictionaryBuilderBase( @@ -131,11 +133,12 @@ class DictionaryBuilderBase : public ArrayBuilder { DictionaryBuilderBase(const std::shared_ptr& dictionary, MemoryPool* pool = default_memory_pool()) - : ArrayBuilder(dictionary->type(), pool), - memo_table_(new DictionaryMemoTable(dictionary)), + : ArrayBuilder(pool), + memo_table_(new internal::DictionaryMemoTable(dictionary)), delta_offset_(0), byte_width_(-1), - values_builder_(pool) {} + indices_builder_(pool), + value_type_(dictionary->type()) {} ~DictionaryBuilderBase() override = default; @@ -144,7 +147,7 @@ class DictionaryBuilderBase : public ArrayBuilder { ARROW_RETURN_NOT_OK(Reserve(1)); auto memo_index = memo_table_->GetOrInsert(value); - ARROW_RETURN_NOT_OK(values_builder_.Append(memo_index)); + ARROW_RETURN_NOT_OK(indices_builder_.Append(memo_index)); length_ += 1; return Status::OK(); @@ -169,14 +172,14 @@ class DictionaryBuilderBase : public ArrayBuilder { length_ += 1; null_count_ += 1; - return values_builder_.AppendNull(); + return indices_builder_.AppendNull(); } Status AppendNulls(int64_t length) final { length_ += length; null_count_ += length; - return values_builder_.AppendNulls(length); + return indices_builder_.AppendNulls(length); } /// \brief Insert values into the dictionary's memo, but do not append any @@ -210,7 +213,7 @@ class DictionaryBuilderBase : public ArrayBuilder { Status AppendArray( typename std::enable_if::value, const Array&>::type array) { - if (!type_->Equals(*array.type())) { + if (!value_type_->Equals(*array.type())) { return Status::Invalid( "Cannot append FixedSizeBinary array with non-matching type"); } @@ -228,8 +231,8 @@ class DictionaryBuilderBase : public ArrayBuilder { void Reset() override { ArrayBuilder::Reset(); - values_builder_.Reset(); - memo_table_.reset(new DictionaryMemoTable(type_)); + indices_builder_.Reset(); + memo_table_.reset(new internal::DictionaryMemoTable(value_type_)); delta_offset_ = 0; } @@ -242,14 +245,14 @@ class DictionaryBuilderBase : public ArrayBuilder { // XXX should we let the user pass additional size heuristics? delta_offset_ = 0; } - ARROW_RETURN_NOT_OK(values_builder_.Resize(capacity)); - capacity_ = values_builder_.capacity(); + ARROW_RETURN_NOT_OK(indices_builder_.Resize(capacity)); + capacity_ = indices_builder_.capacity(); return Status::OK(); } Status FinishInternal(std::shared_ptr* out) override { // Finalize indices array - ARROW_RETURN_NOT_OK(values_builder_.FinishInternal(out)); + ARROW_RETURN_NOT_OK(indices_builder_.FinishInternal(out)); // Generate dictionary array from hash table contents std::shared_ptr dictionary_data; @@ -258,12 +261,12 @@ class DictionaryBuilderBase : public ArrayBuilder { memo_table_->GetArrayData(pool_, delta_offset_, &dictionary_data)); // Set type of array data to the right dictionary type - (*out)->type = dictionary((*out)->type, type_); + (*out)->type = type(); (*out)->dictionary = MakeArray(dictionary_data); // Update internals for further uses of this DictionaryBuilder delta_offset_ = memo_table_->size(); - values_builder_.Reset(); + indices_builder_.Reset(); return Status::OK(); } @@ -277,6 +280,10 @@ class DictionaryBuilderBase : public ArrayBuilder { /// is the dictionary builder in the delta building mode bool is_building_delta() { return delta_offset_ > 0; } + std::shared_ptr type() const override { + return ::arrow::dictionary(indices_builder_.type(), value_type_); + } + protected: std::unique_ptr memo_table_; @@ -284,36 +291,37 @@ class DictionaryBuilderBase : public ArrayBuilder { // Only used for FixedSizeBinaryType int32_t byte_width_; - BuilderType values_builder_; + BuilderType indices_builder_; + std::shared_ptr value_type_; }; template class DictionaryBuilderBase : public ArrayBuilder { public: - DictionaryBuilderBase(const std::shared_ptr& type, + DictionaryBuilderBase(const std::shared_ptr& value_type, MemoryPool* pool = default_memory_pool()) - : ArrayBuilder(type, pool), values_builder_(pool) {} + : ArrayBuilder(pool), indices_builder_(pool) {} explicit DictionaryBuilderBase(MemoryPool* pool = default_memory_pool()) - : ArrayBuilder(null(), pool), values_builder_(pool) {} + : ArrayBuilder(pool), indices_builder_(pool) {} DictionaryBuilderBase(const std::shared_ptr& dictionary, MemoryPool* pool = default_memory_pool()) - : ArrayBuilder(dictionary->type(), pool), values_builder_(pool) {} + : ArrayBuilder(pool), indices_builder_(pool) {} /// \brief Append a scalar null value Status AppendNull() final { length_ += 1; null_count_ += 1; - return values_builder_.AppendNull(); + return indices_builder_.AppendNull(); } Status AppendNulls(int64_t length) final { length_ += length; null_count_ += length; - return values_builder_.AppendNulls(length); + return indices_builder_.AppendNulls(length); } /// \brief Append a whole dense array to the builder @@ -328,17 +336,15 @@ class DictionaryBuilderBase : public ArrayBuilder { ARROW_RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); capacity = std::max(capacity, kMinBuilderCapacity); - ARROW_RETURN_NOT_OK(values_builder_.Resize(capacity)); - capacity_ = values_builder_.capacity(); + ARROW_RETURN_NOT_OK(indices_builder_.Resize(capacity)); + capacity_ = indices_builder_.capacity(); return Status::OK(); } Status FinishInternal(std::shared_ptr* out) override { - std::shared_ptr dictionary = std::make_shared(0); - - ARROW_RETURN_NOT_OK(values_builder_.FinishInternal(out)); - (*out)->type = std::make_shared((*out)->type, type_); - (*out)->dictionary = dictionary; + ARROW_RETURN_NOT_OK(indices_builder_.FinishInternal(out)); + (*out)->type = dictionary((*out)->type, null()); + (*out)->dictionary.reset(new NullArray(0)); return Status::OK(); } @@ -349,8 +355,12 @@ class DictionaryBuilderBase : public ArrayBuilder { Status Finish(std::shared_ptr* out) { return FinishTyped(out); } + std::shared_ptr type() const override { + return ::arrow::dictionary(indices_builder_.type(), null()); + } + protected: - BuilderType values_builder_; + BuilderType indices_builder_; }; } // namespace internal @@ -368,10 +378,10 @@ class DictionaryBuilder : public internal::DictionaryBuilderBasevalues_builder_.null_count(); - ARROW_RETURN_NOT_OK(this->values_builder_.AppendValues(values, length, valid_bytes)); + int64_t null_count_before = this->indices_builder_.null_count(); + ARROW_RETURN_NOT_OK(this->indices_builder_.AppendValues(values, length, valid_bytes)); this->length_ += length; - this->null_count_ += this->values_builder_.null_count() - null_count_before; + this->null_count_ += this->indices_builder_.null_count() - null_count_before; return Status::OK(); } }; @@ -390,10 +400,10 @@ class Dictionary32Builder : public internal::DictionaryBuilderBasevalues_builder_.null_count(); - ARROW_RETURN_NOT_OK(this->values_builder_.AppendValues(values, length, valid_bytes)); + int64_t null_count_before = this->indices_builder_.null_count(); + ARROW_RETURN_NOT_OK(this->indices_builder_.AppendValues(values, length, valid_bytes)); this->length_ += length; - this->null_count_ += this->values_builder_.null_count() - null_count_before; + this->null_count_ += this->indices_builder_.null_count() - null_count_before; return Status::OK(); } }; diff --git a/cpp/src/arrow/array/builder_nested.cc b/cpp/src/arrow/array/builder_nested.cc index 47f10112185d5..04466c0be2560 100644 --- a/cpp/src/arrow/array/builder_nested.cc +++ b/cpp/src/arrow/array/builder_nested.cc @@ -40,7 +40,10 @@ namespace arrow { MapBuilder::MapBuilder(MemoryPool* pool, const std::shared_ptr& key_builder, std::shared_ptr const& item_builder, const std::shared_ptr& type) - : ArrayBuilder(type, pool), key_builder_(key_builder), item_builder_(item_builder) { + : ArrayBuilder(pool), key_builder_(key_builder), item_builder_(item_builder) { + auto map_type = internal::checked_cast(type.get()); + keys_sorted_ = map_type->keys_sorted(); + list_builder_ = std::make_shared( pool, key_builder, list(field("key", key_builder->type(), false))); } @@ -59,6 +62,7 @@ Status MapBuilder::Resize(int64_t capacity) { void MapBuilder::Reset() { list_builder_->Reset(); + item_builder_->Reset(); ArrayBuilder::Reset(); } @@ -72,9 +76,10 @@ Status MapBuilder::FinishInternal(std::shared_ptr* out) { RETURN_NOT_OK(item_builder_->FinishInternal(&items_data)); auto keys_data = (*out)->child_data[0]; - (*out)->type = type_; - (*out)->child_data[0] = ArrayData::Make(type_->child(0)->type(), keys_data->length, - {nullptr}, {keys_data, items_data}, 0, 0); + (*out)->type = type(); + (*out)->child_data[0] = + ArrayData::Make((*out)->type->child(0)->type(), keys_data->length, {nullptr}, + {keys_data, items_data}, 0, 0); ArrayBuilder::Reset(); return Status::OK(); } @@ -115,20 +120,20 @@ Status MapBuilder::AppendNulls(int64_t length) { // FixedSizeListBuilder FixedSizeListBuilder::FixedSizeListBuilder( - MemoryPool* pool, std::shared_ptr const& value_builder, - int32_t list_size) - : ArrayBuilder(fixed_size_list(value_builder->type(), list_size), pool), - list_size_(list_size), - value_builder_(value_builder) {} - -FixedSizeListBuilder::FixedSizeListBuilder( - MemoryPool* pool, std::shared_ptr const& value_builder, + MemoryPool* pool, const std::shared_ptr& value_builder, const std::shared_ptr& type) - : ArrayBuilder(type, pool), + : ArrayBuilder(pool), + value_field_(type->child(0)), list_size_( internal::checked_cast(type.get())->list_size()), value_builder_(value_builder) {} +FixedSizeListBuilder::FixedSizeListBuilder( + MemoryPool* pool, const std::shared_ptr& value_builder, + int32_t list_size) + : FixedSizeListBuilder(pool, value_builder, + fixed_size_list(value_builder->type(), list_size)) {} + void FixedSizeListBuilder::Reset() { ArrayBuilder::Reset(); value_builder_->Reset(); @@ -172,16 +177,9 @@ Status FixedSizeListBuilder::FinishInternal(std::shared_ptr* out) { } RETURN_NOT_OK(value_builder_->FinishInternal(&items)); - // If the type has not been specified in the constructor, infer it - // This is the case if the value_builder contains a DenseUnionBuilder - const auto& list_type = internal::checked_cast(*type_); - if (!list_type.value_type()) { - type_ = std::make_shared(value_builder_->type(), - list_type.list_size()); - } std::shared_ptr null_bitmap; RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); - *out = ArrayData::Make(type_, length_, {null_bitmap}, {std::move(items)}, null_count_); + *out = ArrayData::Make(type(), length_, {null_bitmap}, {std::move(items)}, null_count_); Reset(); return Status::OK(); } @@ -191,7 +189,7 @@ Status FixedSizeListBuilder::FinishInternal(std::shared_ptr* out) { StructBuilder::StructBuilder(const std::shared_ptr& type, MemoryPool* pool, std::vector> field_builders) - : ArrayBuilder(type, pool) { + : ArrayBuilder(pool), type_(type) { children_ = std::move(field_builders); } @@ -221,21 +219,20 @@ Status StructBuilder::FinishInternal(std::shared_ptr* out) { RETURN_NOT_OK(children_[i]->FinishInternal(&child_data[i])); } - // If the type has not been specified in the constructor, infer it - // This is the case if one of the children contains a DenseUnionBuilder - if (!type_) { - std::vector> fields; - for (const auto& field_builder : children_) { - fields.push_back(field("", field_builder->type())); - } - type_ = struct_(fields); - } - - *out = ArrayData::Make(type_, length_, {null_bitmap}, null_count_); + *out = ArrayData::Make(type(), length_, {null_bitmap}, null_count_); (*out)->child_data = std::move(child_data); capacity_ = length_ = null_count_ = 0; return Status::OK(); } +std::shared_ptr StructBuilder::type() const { + DCHECK_EQ(type_->children().size(), children_.size()); + std::vector> fields(children_.size()); + for (int i = 0; i < static_cast(fields.size()); ++i) { + fields[i] = type_->child(i)->WithType(children_[i]->type()); + } + return struct_(std::move(fields)); +} + } // namespace arrow diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index 06fca40780332..67a925a58840e 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -40,13 +40,14 @@ class BaseListBuilder : public ArrayBuilder { /// Use this constructor to incrementally build the value array along with offsets and /// null bitmap. BaseListBuilder(MemoryPool* pool, std::shared_ptr const& value_builder, - const std::shared_ptr& type = NULLPTR) - : ArrayBuilder(type ? type - : std::static_pointer_cast( - std::make_shared(value_builder->type())), - pool), + const std::shared_ptr& type) + : ArrayBuilder(pool), offsets_builder_(pool), - value_builder_(value_builder) {} + value_builder_(value_builder), + value_field_(type->child(0)->WithType(NULLPTR)) {} + + BaseListBuilder(MemoryPool* pool, std::shared_ptr const& value_builder) + : BaseListBuilder(pool, value_builder, list(value_builder->type())) {} Status Resize(int64_t capacity) override { if (capacity > maximum_elements()) { @@ -62,7 +63,6 @@ class BaseListBuilder : public ArrayBuilder { void Reset() override { ArrayBuilder::Reset(); - values_.reset(); offsets_builder_.Reset(); value_builder_->Reset(); } @@ -106,30 +106,20 @@ class BaseListBuilder : public ArrayBuilder { ARROW_RETURN_NOT_OK(AppendNextOffset()); // Offset padding zeroed by BufferBuilder - std::shared_ptr offsets; + std::shared_ptr offsets, null_bitmap; ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&offsets)); + ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); - std::shared_ptr items; - if (values_) { - items = values_->data(); - } else { - if (value_builder_->length() == 0) { - // Try to make sure we get a non-null values buffer (ARROW-2744) - ARROW_RETURN_NOT_OK(value_builder_->Resize(0)); - } - ARROW_RETURN_NOT_OK(value_builder_->FinishInternal(&items)); + if (value_builder_->length() == 0) { + // Try to make sure we get a non-null values buffer (ARROW-2744) + ARROW_RETURN_NOT_OK(value_builder_->Resize(0)); } - // If the type has not been specified in the constructor, infer it - // This is the case if the value_builder contains a DenseUnionBuilder - if (!arrow::internal::checked_cast(*type_).value_type()) { - type_ = std::static_pointer_cast( - std::make_shared(value_builder_->type())); - } - std::shared_ptr null_bitmap; - ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); - *out = ArrayData::Make(type_, length_, {null_bitmap, offsets}, null_count_); - (*out)->child_data.emplace_back(std::move(items)); + std::shared_ptr items; + ARROW_RETURN_NOT_OK(value_builder_->FinishInternal(&items)); + + *out = ArrayData::Make(type(), length_, {null_bitmap, offsets}, {std::move(items)}, + null_count_); Reset(); return Status::OK(); } @@ -141,10 +131,14 @@ class BaseListBuilder : public ArrayBuilder { return std::numeric_limits::max() - 1; } + std::shared_ptr type() const override { + return std::make_shared(value_field_->WithType(value_builder_->type())); + } + protected: TypedBufferBuilder offsets_builder_; std::shared_ptr value_builder_; - std::shared_ptr values_; + std::shared_ptr value_field_; Status CheckNextOffset() const { const int64_t num_values = value_builder_->length(); @@ -215,13 +209,14 @@ class ARROW_EXPORT LargeListBuilder : public BaseListBuilder { /// Key uniqueness and ordering are not validated. class ARROW_EXPORT MapBuilder : public ArrayBuilder { public: - /// Use this constructor to incrementally build the key and item arrays along with - /// offsets and null bitmap. + /// Use this constructor to define the built array's type explicitly. If key_builder + /// or item_builder has indeterminate type, this builder will also. MapBuilder(MemoryPool* pool, const std::shared_ptr& key_builder, const std::shared_ptr& item_builder, const std::shared_ptr& type); - /// Derive built type from key and item builders' types + /// Use this constructor to infer the built array's type. If key_builder or + /// item_builder has indeterminate type, this builder will also. MapBuilder(MemoryPool* pool, const std::shared_ptr& key_builder, const std::shared_ptr& item_builder, bool keys_sorted = false); @@ -255,7 +250,12 @@ class ARROW_EXPORT MapBuilder : public ArrayBuilder { ArrayBuilder* key_builder() const { return key_builder_.get(); } ArrayBuilder* item_builder() const { return item_builder_.get(); } + std::shared_ptr type() const override { + return map(key_builder_->type(), item_builder_->type(), keys_sorted_); + } + protected: + bool keys_sorted_ = false; std::shared_ptr list_builder_; std::shared_ptr key_builder_; std::shared_ptr item_builder_; @@ -268,10 +268,14 @@ class ARROW_EXPORT MapBuilder : public ArrayBuilder { /// \brief Builder class for fixed-length list array value types class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder { public: + /// Use this constructor to define the built array's type explicitly. If value_builder + /// has indeterminate type, this builder will also. FixedSizeListBuilder(MemoryPool* pool, std::shared_ptr const& value_builder, int32_t list_size); + /// Use this constructor to infer the built array's type. If value_builder has + /// indeterminate type, this builder will also. FixedSizeListBuilder(MemoryPool* pool, std::shared_ptr const& value_builder, const std::shared_ptr& type); @@ -316,7 +320,12 @@ class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder { ArrayBuilder* value_builder() const { return value_builder_.get(); } + std::shared_ptr type() const override { + return fixed_size_list(value_field_->WithType(value_builder_->type()), list_size_); + } + protected: + std::shared_ptr value_field_; const int32_t list_size_; std::shared_ptr value_builder_; }; @@ -331,6 +340,7 @@ class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder { /// called to maintain data-structure consistency. class ARROW_EXPORT StructBuilder : public ArrayBuilder { public: + /// If any of field_builders has indeterminate type, this builder will also StructBuilder(const std::shared_ptr& type, MemoryPool* pool, std::vector> field_builders); @@ -369,6 +379,11 @@ class ARROW_EXPORT StructBuilder : public ArrayBuilder { ArrayBuilder* field_builder(int i) const { return children_[i].get(); } int num_fields() const { return static_cast(children_.size()); } + + std::shared_ptr type() const override; + + private: + std::shared_ptr type_; }; } // namespace arrow diff --git a/cpp/src/arrow/array/builder_primitive.cc b/cpp/src/arrow/array/builder_primitive.cc index 858429747ac8f..15d25d10530ce 100644 --- a/cpp/src/arrow/array/builder_primitive.cc +++ b/cpp/src/arrow/array/builder_primitive.cc @@ -45,7 +45,7 @@ Status NullBuilder::FinishInternal(std::shared_ptr* out) { } BooleanBuilder::BooleanBuilder(MemoryPool* pool) - : ArrayBuilder(boolean(), pool), data_builder_(pool) {} + : ArrayBuilder(pool), data_builder_(pool) {} BooleanBuilder::BooleanBuilder(const std::shared_ptr& type, MemoryPool* pool) : BooleanBuilder(pool) { diff --git a/cpp/src/arrow/array/builder_primitive.h b/cpp/src/arrow/array/builder_primitive.h index fecb8ca224682..6f34f32901522 100644 --- a/cpp/src/arrow/array/builder_primitive.h +++ b/cpp/src/arrow/array/builder_primitive.h @@ -29,8 +29,7 @@ namespace arrow { class ARROW_EXPORT NullBuilder : public ArrayBuilder { public: - explicit NullBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) - : ArrayBuilder(null(), pool) {} + explicit NullBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) : ArrayBuilder(pool) {} /// \brief Append the specified number of null elements Status AppendNulls(int64_t length) final { @@ -51,6 +50,8 @@ class ARROW_EXPORT NullBuilder : public ArrayBuilder { using ArrayBuilder::Finish; /// \endcond + std::shared_ptr type() const override { return null(); } + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } }; @@ -61,13 +62,15 @@ class NumericBuilder : public ArrayBuilder { using TypeClass = T; using value_type = typename T::c_type; using ArrayType = typename TypeTraits::ArrayType; - using ArrayBuilder::ArrayBuilder; template explicit NumericBuilder( typename std::enable_if::is_parameter_free, MemoryPool*>::type pool ARROW_MEMORY_POOL_DEFAULT) - : ArrayBuilder(TypeTraits::type_singleton(), pool) {} + : ArrayBuilder(pool), type_(TypeTraits::type_singleton()) {} + + NumericBuilder(const std::shared_ptr& type, MemoryPool* pool) + : ArrayBuilder(pool), type_(type) {} /// Append a single scalar and increase the size if necessary. Status Append(const value_type val) { @@ -162,7 +165,7 @@ class NumericBuilder : public ArrayBuilder { std::shared_ptr data, null_bitmap; ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); ARROW_RETURN_NOT_OK(data_builder_.Finish(&data)); - *out = ArrayData::Make(type_, length_, {null_bitmap, data}, null_count_); + *out = ArrayData::Make(type(), length_, {null_bitmap, data}, null_count_); capacity_ = length_ = null_count_ = 0; return Status::OK(); } @@ -244,7 +247,10 @@ class NumericBuilder : public ArrayBuilder { data_builder_.UnsafeAppend(0); } + std::shared_ptr type() const override { return type_; } + protected: + std::shared_ptr type_; TypedBufferBuilder data_builder_; }; @@ -271,7 +277,8 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { explicit BooleanBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); - explicit BooleanBuilder(const std::shared_ptr& type, MemoryPool* pool); + BooleanBuilder(const std::shared_ptr& type, + MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory Status AppendNulls(int64_t length) final { @@ -425,6 +432,8 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { void Reset() override; Status Resize(int64_t capacity) override; + std::shared_ptr type() const override { return boolean(); } + protected: TypedBufferBuilder data_builder_; }; diff --git a/cpp/src/arrow/array/builder_time.h b/cpp/src/arrow/array/builder_time.h index f57e0aea8c440..83597336f354c 100644 --- a/cpp/src/arrow/array/builder_time.h +++ b/cpp/src/arrow/array/builder_time.h @@ -42,8 +42,7 @@ class ARROW_EXPORT DayTimeIntervalBuilder : public ArrayBuilder { DayTimeIntervalBuilder(std::shared_ptr type, MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) - : ArrayBuilder(type, pool), - builder_(fixed_size_binary(sizeof(DayMilliseconds)), pool) {} + : ArrayBuilder(pool), builder_(fixed_size_binary(sizeof(DayMilliseconds)), pool) {} void Reset() override { builder_.Reset(); } Status Resize(int64_t capacity) override { return builder_.Resize(capacity); } @@ -64,6 +63,8 @@ class ARROW_EXPORT DayTimeIntervalBuilder : public ArrayBuilder { return result; } + std::shared_ptr type() const override { return day_time_interval(); } + private: FixedSizeBinaryBuilder builder_; }; diff --git a/cpp/src/arrow/array/builder_union.cc b/cpp/src/arrow/array/builder_union.cc index 6e933ab9e612d..fdb53a6cf5def 100644 --- a/cpp/src/arrow/array/builder_union.cc +++ b/cpp/src/arrow/array/builder_union.cc @@ -26,41 +26,19 @@ namespace arrow { using internal::checked_cast; +using internal::checked_pointer_cast; Status BasicUnionBuilder::FinishInternal(std::shared_ptr* out) { std::shared_ptr types, null_bitmap; RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); RETURN_NOT_OK(types_builder_.Finish(&types)); - // If the type has not been specified in the constructor, gather type_codes - std::vector type_codes; - if (type_ == nullptr) { - for (size_t i = 0; i < children_.size(); ++i) { - if (type_id_to_children_[i] != nullptr) { - type_codes.push_back(static_cast(i)); - } - } - } else { - type_codes = checked_cast(*type_).type_codes(); - } - DCHECK_EQ(type_codes.size(), children_.size()); - std::vector> child_data(children_.size()); for (size_t i = 0; i < children_.size(); ++i) { RETURN_NOT_OK(children_[i]->FinishInternal(&child_data[i])); } - // If the type has not been specified in the constructor, infer it - if (type_ == nullptr) { - std::vector> fields; - auto field_names_it = field_names_.begin(); - for (auto&& data : child_data) { - fields.push_back(field(*field_names_it++, data->type)); - } - type_ = union_(fields, type_codes, mode_); - } - - *out = ArrayData::Make(type_, length(), {null_bitmap, types, nullptr}, null_count_); + *out = ArrayData::Make(type(), length(), {null_bitmap, types, nullptr}, null_count_); (*out)->child_data = std::move(child_data); return Status::OK(); } @@ -69,42 +47,62 @@ BasicUnionBuilder::BasicUnionBuilder( MemoryPool* pool, UnionMode::type mode, const std::vector>& children, const std::shared_ptr& type) - : ArrayBuilder(type, pool), mode_(mode), types_builder_(pool) { - auto union_type = checked_cast(type.get()); - DCHECK_NE(union_type, nullptr); - DCHECK_EQ(union_type->mode(), mode); - + : ArrayBuilder(pool), + child_fields_(children.size()), + mode_(mode), + types_builder_(pool) { + DCHECK_EQ(type->id(), Type::UNION); + const auto& union_type = checked_cast(*type); + DCHECK_EQ(union_type.mode(), mode); + DCHECK_EQ(children.size(), union_type.type_codes().size()); + + type_codes_ = union_type.type_codes(); children_ = children; - type_id_to_children_.resize(union_type->max_type_code() + 1, nullptr); - DCHECK_LT(type_id_to_children_.size(), - static_cast( - std::numeric_limits::max())); - auto field_it = type->children().begin(); - auto children_it = children.begin(); - for (auto type_id : union_type->type_codes()) { - type_id_to_children_[type_id] = *children_it++; - field_names_.push_back((*field_it++)->name()); + type_id_to_children_.resize(union_type.max_type_code() + 1, nullptr); + DCHECK_LT(type_id_to_children_.size(), std::numeric_limits::max()); + + for (size_t i = 0; i < children.size(); ++i) { + child_fields_[i] = union_type.child(static_cast(i)); + + auto type_id = union_type.type_codes()[i]; + type_id_to_children_[type_id] = children[i].get(); } - DCHECK_EQ(children_it, children.end()); - DCHECK_EQ(field_it, type->children().end()); } +BasicUnionBuilder::BasicUnionBuilder(MemoryPool* pool, UnionMode::type mode) + : BasicUnionBuilder(pool, mode, {}, union_({}, mode)) {} + int8_t BasicUnionBuilder::AppendChild(const std::shared_ptr& new_child, const std::string& field_name) { - // force type inferrence in Finish - type_ = nullptr; - - field_names_.push_back(field_name); children_.push_back(new_child); + auto new_type_id = NextTypeId(); + + type_id_to_children_[new_type_id] = new_child.get(); + + child_fields_.push_back(field(field_name, nullptr)); + + type_codes_.push_back(static_cast(new_type_id)); + + return new_type_id; +} + +std::shared_ptr BasicUnionBuilder::type() const { + std::vector> child_fields(child_fields_.size()); + for (size_t i = 0; i < child_fields.size(); ++i) { + child_fields[i] = child_fields_[i]->WithType(children_[i]->type()); + } + return union_(std::move(child_fields), type_codes_, mode_); +} + +int8_t BasicUnionBuilder::NextTypeId() { // Find type_id such that type_id_to_children_[type_id] == nullptr // and use that for the new child. Start searching at dense_type_id_ // since type_id_to_children_ is densely packed up at least up to dense_type_id_ for (; static_cast(dense_type_id_) < type_id_to_children_.size(); ++dense_type_id_) { if (type_id_to_children_[dense_type_id_] == nullptr) { - type_id_to_children_[dense_type_id_] = new_child; return dense_type_id_++; } } @@ -114,7 +112,7 @@ int8_t BasicUnionBuilder::AppendChild(const std::shared_ptr& new_c std::numeric_limits::max())); // type_id_to_children_ is already densely packed, so just append the new child - type_id_to_children_.push_back(new_child); + type_id_to_children_.resize(type_id_to_children_.size() + 1); return dense_type_id_++; } diff --git a/cpp/src/arrow/array/builder_union.h b/cpp/src/arrow/array/builder_union.h index f6f6870d79dda..7bbb26561258b 100644 --- a/cpp/src/arrow/array/builder_union.h +++ b/cpp/src/arrow/array/builder_union.h @@ -19,6 +19,7 @@ #include #include +#include #include #include "arrow/array.h" @@ -48,12 +49,13 @@ class ARROW_EXPORT BasicUnionBuilder : public ArrayBuilder { int8_t AppendChild(const std::shared_ptr& new_child, const std::string& field_name = ""); + std::shared_ptr type() const override; + protected: /// Use this constructor to initialize the UnionBuilder with no child builders, /// allowing type to be inferred. You will need to call AppendChild for each of the /// children builders you want to use. - explicit BasicUnionBuilder(MemoryPool* pool, UnionMode::type mode) - : ArrayBuilder(NULLPTR, pool), mode_(mode), types_builder_(pool) {} + BasicUnionBuilder(MemoryPool* pool, UnionMode::type mode); /// Use this constructor to specify the type explicitly. /// You can still add child builders to the union after using this constructor @@ -61,12 +63,16 @@ class ARROW_EXPORT BasicUnionBuilder : public ArrayBuilder { const std::vector>& children, const std::shared_ptr& type); + int8_t NextTypeId(); + + std::vector> child_fields_; + std::vector type_codes_; UnionMode::type mode_; - std::vector> type_id_to_children_; + + std::vector type_id_to_children_; // for all type_id < dense_type_id_, type_id_to_children_[type_id] != nullptr int8_t dense_type_id_ = 0; TypedBufferBuilder types_builder_; - std::vector field_names_; }; /// \class DenseUnionBuilder diff --git a/cpp/src/arrow/array_dict_test.cc b/cpp/src/arrow/array_dict_test.cc index a1561822d040c..8b8d32cefac4a 100644 --- a/cpp/src/arrow/array_dict_test.cc +++ b/cpp/src/arrow/array_dict_test.cc @@ -792,10 +792,10 @@ TEST(TestDecimalDictionaryBuilder, DoubleTableSize) { const auto& decimal_type = arrow::decimal(21, 0); // Build the dictionary Array - DictionaryBuilder builder(decimal_type); + DictionaryBuilder dict_builder(decimal_type); // Build expected data - FixedSizeBinaryBuilder fsb_builder(decimal_type); + Decimal128Builder decimal_builder(decimal_type); Int16Builder int_builder; // Fill with 1024 different values @@ -816,29 +816,29 @@ TEST(TestDecimalDictionaryBuilder, DoubleTableSize) { 12, static_cast(i / 128), static_cast(i % 128)}; - ASSERT_OK(builder.Append(bytes)); - ASSERT_OK(fsb_builder.Append(bytes)); + ASSERT_OK(dict_builder.Append(bytes)); + ASSERT_OK(decimal_builder.Append(bytes)); ASSERT_OK(int_builder.Append(static_cast(i))); } // Fill with an already existing value const uint8_t known_value[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 0, 1}; for (int64_t i = 0; i < 1024; i++) { - ASSERT_OK(builder.Append(known_value)); + ASSERT_OK(dict_builder.Append(known_value)); ASSERT_OK(int_builder.Append(1)); } // Finalize result std::shared_ptr result; - ASSERT_OK(builder.Finish(&result)); + ASSERT_OK(dict_builder.Finish(&result)); // Finalize expected data - std::shared_ptr fsb_array; - ASSERT_OK(fsb_builder.Finish(&fsb_array)); + std::shared_ptr decimal_array; + ASSERT_OK(decimal_builder.Finish(&decimal_array)); std::shared_ptr int_array; ASSERT_OK(int_builder.Finish(&int_array)); - DictionaryArray expected(dictionary(int16(), decimal_type), int_array, fsb_array); + DictionaryArray expected(dictionary(int16(), decimal_type), int_array, decimal_array); ASSERT_TRUE(expected.Equals(result)); } @@ -1060,7 +1060,7 @@ TEST(TestDictionary, TransposeNulls) { CheckTranspose(sliced, {1, 3, 2}, out_dict_type, out_dict, expected_indices); } -TEST(TestDictionary, DISABLED_ListOfDictionary) { +TEST(TestDictionary, ListOfDictionary) { std::unique_ptr root_builder; ASSERT_OK(MakeBuilder(default_memory_pool(), list(dictionary(int8(), utf8())), &root_builder)); @@ -1070,17 +1070,17 @@ TEST(TestDictionary, DISABLED_ListOfDictionary) { ASSERT_OK(list_builder->Append()); std::vector expected; - for (char a : "abc") { - for (char d : "def") { - for (char g : "ghi") { - for (char j : "jkl") { - for (char m : "mno") { - for (char p : "pqr") { + for (char a : util::string_view("abc")) { + for (char d : util::string_view("def")) { + for (char g : util::string_view("ghi")) { + for (char j : util::string_view("jkl")) { + for (char m : util::string_view("mno")) { + for (char p : util::string_view("pqr")) { if ((static_cast(a) + d + g + j + m + p) % 16 == 0) { ASSERT_OK(list_builder->Append()); } // 3**6 distinct strings; too large for int8 - char str[6] = {a, d, g, j, m, p}; + char str[] = {a, d, g, j, m, p, '\0'}; ASSERT_OK(dict_builder->Append(str)); expected.push_back(str); } @@ -1089,6 +1089,9 @@ TEST(TestDictionary, DISABLED_ListOfDictionary) { } } } + + ASSERT_TRUE(list_builder->type()->Equals(list(dictionary(int16(), utf8())))); + std::shared_ptr expected_dict; ArrayFromVector(expected, &expected_dict); diff --git a/cpp/src/arrow/array_list_test.cc b/cpp/src/arrow/array_list_test.cc index 52308d391155b..13f6423ea0622 100644 --- a/cpp/src/arrow/array_list_test.cc +++ b/cpp/src/arrow/array_list_test.cc @@ -713,7 +713,7 @@ TEST_F(TestFixedSizeListArray, TestZeroLength) { ASSERT_OK(result_->Validate()); } -TEST_F(TestFixedSizeListArray, TestBuilderPreserveFieleName) { +TEST_F(TestFixedSizeListArray, TestBuilderPreserveFieldName) { auto list_type_with_name = fixed_size_list(field("counts", int32()), list_size()); std::unique_ptr tmp; diff --git a/cpp/src/arrow/array_test.cc b/cpp/src/arrow/array_test.cc index 3d23c60d6c75c..ea41e242041b4 100644 --- a/cpp/src/arrow/array_test.cc +++ b/cpp/src/arrow/array_test.cc @@ -555,6 +555,17 @@ void TestPrimitiveBuilder::Check(const std::unique_ptr ASSERT_EQ(0, builder->null_count()); } +TEST(TestPrimitiveAdHoc, TestType) { + Int8Builder i8(default_memory_pool()); + ASSERT_TRUE(i8.type()->Equals(int8())); + + DictionaryBuilder d_i8(utf8()); + ASSERT_TRUE(d_i8.type()->Equals(dictionary(int8(), utf8()))); + + Dictionary32Builder d32_i8(utf8()); + ASSERT_TRUE(d32_i8.type()->Equals(dictionary(int32(), utf8()))); +} + TEST(NumericBuilderAccessors, TestSettersGetters) { int64_t datum = 42; int64_t new_datum = 43; @@ -1487,6 +1498,7 @@ class TestAdaptiveIntBuilder : public TestBuilder { }; TEST_F(TestAdaptiveIntBuilder, TestInt8) { + ASSERT_EQ(builder_->type()->id(), Type::INT8); ASSERT_OK(builder_->Append(0)); ASSERT_OK(builder_->Append(127)); ASSERT_OK(builder_->Append(-128)); @@ -1500,7 +1512,9 @@ TEST_F(TestAdaptiveIntBuilder, TestInt8) { TEST_F(TestAdaptiveIntBuilder, TestInt16) { ASSERT_OK(builder_->Append(0)); + ASSERT_EQ(builder_->type()->id(), Type::INT8); ASSERT_OK(builder_->Append(128)); + ASSERT_EQ(builder_->type()->id(), Type::INT16); Done(); std::vector expected_values({0, 128}); @@ -1526,10 +1540,26 @@ TEST_F(TestAdaptiveIntBuilder, TestInt16) { AssertArraysEqual(*expected_, *result_); } +TEST_F(TestAdaptiveIntBuilder, TestInt16Nulls) { + ASSERT_OK(builder_->Append(0)); + ASSERT_EQ(builder_->type()->id(), Type::INT8); + ASSERT_OK(builder_->Append(128)); + ASSERT_EQ(builder_->type()->id(), Type::INT16); + ASSERT_OK(builder_->AppendNull()); + ASSERT_EQ(builder_->type()->id(), Type::INT16); + Done(); + + std::vector expected_values({0, 128, 0}); + ArrayFromVector({1, 1, 0}, expected_values, &expected_); + ASSERT_ARRAYS_EQUAL(*expected_, *result_); +} + TEST_F(TestAdaptiveIntBuilder, TestInt32) { ASSERT_OK(builder_->Append(0)); + ASSERT_EQ(builder_->type()->id(), Type::INT8); ASSERT_OK( builder_->Append(static_cast(std::numeric_limits::max()) + 1)); + ASSERT_EQ(builder_->type()->id(), Type::INT32); Done(); std::vector expected_values( @@ -1559,8 +1589,10 @@ TEST_F(TestAdaptiveIntBuilder, TestInt32) { TEST_F(TestAdaptiveIntBuilder, TestInt64) { ASSERT_OK(builder_->Append(0)); + ASSERT_EQ(builder_->type()->id(), Type::INT8); ASSERT_OK( builder_->Append(static_cast(std::numeric_limits::max()) + 1)); + ASSERT_EQ(builder_->type()->id(), Type::INT64); Done(); std::vector expected_values( @@ -1707,12 +1739,14 @@ TEST_F(TestAdaptiveUIntBuilder, TestUInt8) { TEST_F(TestAdaptiveUIntBuilder, TestUInt16) { ASSERT_OK(builder_->Append(0)); + ASSERT_EQ(builder_->type()->id(), Type::UINT8); ASSERT_OK(builder_->Append(256)); + ASSERT_EQ(builder_->type()->id(), Type::UINT16); Done(); std::vector expected_values({0, 256}); ArrayFromVector(expected_values, &expected_); - ASSERT_TRUE(expected_->Equals(result_)); + ASSERT_ARRAYS_EQUAL(*expected_, *result_); SetUp(); ASSERT_OK(builder_->Append(std::numeric_limits::max())); @@ -1723,10 +1757,26 @@ TEST_F(TestAdaptiveUIntBuilder, TestUInt16) { ASSERT_TRUE(expected_->Equals(result_)); } +TEST_F(TestAdaptiveUIntBuilder, TestUInt16Nulls) { + ASSERT_OK(builder_->Append(0)); + ASSERT_EQ(builder_->type()->id(), Type::UINT8); + ASSERT_OK(builder_->Append(256)); + ASSERT_EQ(builder_->type()->id(), Type::UINT16); + ASSERT_OK(builder_->AppendNull()); + ASSERT_EQ(builder_->type()->id(), Type::UINT16); + Done(); + + std::vector expected_values({0, 256, 0}); + ArrayFromVector({1, 1, 0}, expected_values, &expected_); + ASSERT_ARRAYS_EQUAL(*expected_, *result_); +} + TEST_F(TestAdaptiveUIntBuilder, TestUInt32) { ASSERT_OK(builder_->Append(0)); + ASSERT_EQ(builder_->type()->id(), Type::UINT8); ASSERT_OK( builder_->Append(static_cast(std::numeric_limits::max()) + 1)); + ASSERT_EQ(builder_->type()->id(), Type::UINT32); Done(); std::vector expected_values( @@ -1745,8 +1795,10 @@ TEST_F(TestAdaptiveUIntBuilder, TestUInt32) { TEST_F(TestAdaptiveUIntBuilder, TestUInt64) { ASSERT_OK(builder_->Append(0)); + ASSERT_EQ(builder_->type()->id(), Type::UINT8); ASSERT_OK( builder_->Append(static_cast(std::numeric_limits::max()) + 1)); + ASSERT_EQ(builder_->type()->id(), Type::UINT64); Done(); std::vector expected_values( diff --git a/cpp/src/arrow/array_union_test.cc b/cpp/src/arrow/array_union_test.cc index 2dea3b5252f8a..e70fcc25a22f0 100644 --- a/cpp/src/arrow/array_union_test.cc +++ b/cpp/src/arrow/array_union_test.cc @@ -272,10 +272,6 @@ class UnionBuilderTest : public ::testing::Test { ASSERT_OK(list_builder.Finish(actual)); ArrayFromVector(expected_types_vector, &expected_types); - - ASSERT_EQ(I8, 0); - ASSERT_EQ(STR, 1); - ASSERT_EQ(DBL, 2); } std::vector expected_types_vector; @@ -403,4 +399,14 @@ TEST_F(SparseUnionBuilderTest, InferredType) { ASSERT_EQ(expected->type()->ToString(), actual->type()->ToString()); ASSERT_ARRAYS_EQUAL(*expected, *actual); } + +TEST_F(SparseUnionBuilderTest, StructWithUnion) { + auto union_builder = std::make_shared(default_memory_pool()); + StructBuilder builder(struct_({field("u", union_({}))}), default_memory_pool(), + {union_builder}); + ASSERT_EQ(union_builder->AppendChild(std::make_shared(), "i"), 0); + ASSERT_TRUE( + builder.type()->Equals(struct_({field("u", union_({field("i", int32())}, {0}))}))); +} + } // namespace arrow diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index b13ce200f2661..0a085aceb5eaf 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -111,11 +111,13 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, BUILDER_CASE(LARGE_BINARY, LargeBinaryBuilder); BUILDER_CASE(FIXED_SIZE_BINARY, FixedSizeBinaryBuilder); BUILDER_CASE(DECIMAL, Decimal128Builder); + case Type::DICTIONARY: { const auto& dict_type = static_cast(*type); DictionaryBuilderCase visitor = {pool, dict_type.value_type(), nullptr, out}; return visitor.Make(); } + case Type::INTERVAL: { const auto& interval_type = internal::checked_cast(*type); if (interval_type.interval_type() == IntervalType::MONTHS) { @@ -128,6 +130,7 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, } break; } + case Type::LIST: { std::unique_ptr value_builder; std::shared_ptr value_type = @@ -136,6 +139,7 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, out->reset(new ListBuilder(pool, std::move(value_builder), type)); return Status::OK(); } + case Type::LARGE_LIST: { std::unique_ptr value_builder; std::shared_ptr value_type = @@ -144,6 +148,7 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, out->reset(new LargeListBuilder(pool, std::move(value_builder), type)); return Status::OK(); } + case Type::MAP: { const auto& map_type = internal::checked_cast(*type); std::unique_ptr key_builder, item_builder; @@ -155,9 +160,9 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, } case Type::FIXED_SIZE_LIST: { + const auto& list_type = internal::checked_cast(*type); std::unique_ptr value_builder; - std::shared_ptr value_type = - internal::checked_cast(*type).value_type(); + auto value_type = list_type.value_type(); RETURN_NOT_OK(MakeBuilder(pool, value_type, &value_builder)); out->reset(new FixedSizeListBuilder(pool, std::move(value_builder), type)); return Status::OK(); @@ -167,7 +172,7 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, const std::vector>& fields = type->children(); std::vector> field_builders; - for (auto it : fields) { + for (const auto& it : fields) { std::unique_ptr builder; RETURN_NOT_OK(MakeBuilder(pool, it->type(), &builder)); field_builders.emplace_back(std::move(builder)); @@ -181,7 +186,7 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, const std::vector>& fields = type->children(); std::vector> field_builders; - for (auto it : fields) { + for (const auto& it : fields) { std::unique_ptr builder; RETURN_NOT_OK(MakeBuilder(pool, it->type(), &builder)); field_builders.emplace_back(std::move(builder)); @@ -194,10 +199,8 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, return Status::OK(); } - default: { - return Status::NotImplemented("MakeBuilder: cannot construct builder for type ", - type->ToString()); - } + default: + break; } return Status::NotImplemented("MakeBuilder: cannot construct builder for type ", type->ToString()); diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index a9aa4d9eb0cc5..0a39bfec9396d 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -49,6 +49,7 @@ namespace arrow { using internal::checked_cast; +using internal::checked_pointer_cast; namespace py { @@ -765,18 +766,18 @@ class StructConverter Status Init(ArrayBuilder* builder) { this->builder_ = builder; this->typed_builder_ = checked_cast(builder); - const auto& struct_type = checked_cast(*builder->type()); + auto struct_type = checked_pointer_cast(builder->type()); num_fields_ = this->typed_builder_->num_fields(); - DCHECK_EQ(num_fields_, struct_type.num_children()); + DCHECK_EQ(num_fields_, struct_type->num_children()); field_name_list_.reset(PyList_New(num_fields_)); RETURN_IF_PYERROR(); // Initialize the child converters and field names for (int i = 0; i < num_fields_; i++) { - const std::string& field_name(struct_type.child(i)->name()); - std::shared_ptr field_type(struct_type.child(i)->type()); + const std::string& field_name(struct_type->child(i)->name()); + std::shared_ptr field_type(struct_type->child(i)->type()); std::unique_ptr value_converter; RETURN_NOT_OK( @@ -869,7 +870,7 @@ class DecimalConverter Status Init(ArrayBuilder* builder) override { RETURN_NOT_OK(BASE::Init(builder)); - decimal_type_ = checked_cast(this->typed_builder_->type().get()); + decimal_type_ = checked_pointer_cast(this->typed_builder_->type()); return Status::OK(); } @@ -880,7 +881,7 @@ class DecimalConverter } private: - const DecimalType* decimal_type_ = nullptr; + std::shared_ptr decimal_type_ = nullptr; }; #define NUMERIC_CONVERTER(TYPE_ENUM, TYPE) \ diff --git a/cpp/src/arrow/python/serialize.cc b/cpp/src/arrow/python/serialize.cc index 9946d9a146016..571502db1fc6c 100644 --- a/cpp/src/arrow/python/serialize.cc +++ b/cpp/src/arrow/python/serialize.cc @@ -263,7 +263,10 @@ class SequenceBuilder { class DictBuilder { public: explicit DictBuilder(MemoryPool* pool = nullptr) : keys_(pool), vals_(pool) { - builder_.reset(new StructBuilder(nullptr, pool, {keys_.builder(), vals_.builder()})); + builder_.reset( + new StructBuilder(struct_({field("keys", union_({}, UnionMode::DENSE)), + field("vals", union_({}, UnionMode::DENSE))}), + pool, {keys_.builder(), vals_.builder()})); } // Builder for the keys of the dictionary diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 6dbce9d7c69bd..c71bc0990b7c5 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -1115,21 +1115,21 @@ std::shared_ptr union_(const std::vector>& chil const std::vector& field_names, const std::vector& given_type_codes, UnionMode::type mode) { - std::vector> types; + std::vector> fields; std::vector type_codes(given_type_codes); uint8_t counter = 0; for (const auto& child : children) { if (field_names.size() == 0) { - types.push_back(field(std::to_string(counter), child->type())); + fields.push_back(field(std::to_string(counter), child->type())); } else { - types.push_back(field(field_names[counter], child->type())); + fields.push_back(field(std::move(field_names[counter]), child->type())); } if (given_type_codes.size() == 0) { type_codes.push_back(counter); } counter++; } - return union_(types, type_codes, mode); + return union_(fields, std::move(type_codes), mode); } std::shared_ptr dictionary(const std::shared_ptr& index_type,