Skip to content

Commit

Permalink
ARROW-7764: [C++] Don't keep a null bitmap in ArrayData if null_count…
Browse files Browse the repository at this point in the history
… == 0

This spotted a couple bugs in kernels where null bitmaps were always iterated even if null.

Closes #6447 from pitrou/ARROW-7764-optimize-no-nulls and squashes the following commits:

577c265 <Antoine Pitrou> Remove pointless DecodeArrowNonNull overrides.
ba1e3a4 <Antoine Pitrou> Fix R failure
5575798 <Antoine Pitrou> ARROW-7764:  Don't keep a null bitmap in ArrayData if null_count == 0

Authored-by: Antoine Pitrou <antoine@python.org>
Signed-off-by: Antoine Pitrou <antoine@python.org>
  • Loading branch information
pitrou committed Mar 2, 2020
1 parent b85e17f commit 882cd34
Show file tree
Hide file tree
Showing 13 changed files with 251 additions and 158 deletions.
40 changes: 22 additions & 18 deletions cpp/src/arrow/array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -54,36 +54,40 @@ using internal::CountSetBits;

std::shared_ptr<ArrayData> ArrayData::Make(const std::shared_ptr<DataType>& type,
int64_t length,
std::vector<std::shared_ptr<Buffer>>&& buffers,
std::vector<std::shared_ptr<Buffer>> buffers,
int64_t null_count, int64_t offset) {
// In case there are no nulls, don't keep an allocated null bitmap around
if (buffers.size() > 0 && null_count == 0) {
buffers[0] = nullptr;
}
return std::make_shared<ArrayData>(type, length, std::move(buffers), null_count,
offset);
}

std::shared_ptr<ArrayData> ArrayData::Make(
const std::shared_ptr<DataType>& type, int64_t length,
const std::vector<std::shared_ptr<Buffer>>& buffers, int64_t null_count,
int64_t offset) {
return std::make_shared<ArrayData>(type, length, buffers, null_count, offset);
}

std::shared_ptr<ArrayData> ArrayData::Make(
const std::shared_ptr<DataType>& type, int64_t length,
const std::vector<std::shared_ptr<Buffer>>& buffers,
const std::vector<std::shared_ptr<ArrayData>>& child_data, int64_t null_count,
std::vector<std::shared_ptr<Buffer>> buffers,
std::vector<std::shared_ptr<ArrayData>> child_data, int64_t null_count,
int64_t offset) {
return std::make_shared<ArrayData>(type, length, buffers, child_data, null_count,
offset);
// In case there are no nulls, don't keep an allocated null bitmap around
if (buffers.size() > 0 && null_count == 0) {
buffers[0] = nullptr;
}
return std::make_shared<ArrayData>(type, length, std::move(buffers),
std::move(child_data), null_count, offset);
}

std::shared_ptr<ArrayData> ArrayData::Make(
const std::shared_ptr<DataType>& type, int64_t length,
const std::vector<std::shared_ptr<Buffer>>& buffers,
const std::vector<std::shared_ptr<ArrayData>>& child_data,
const std::shared_ptr<Array>& dictionary, int64_t null_count, int64_t offset) {
auto data =
std::make_shared<ArrayData>(type, length, buffers, child_data, null_count, offset);
data->dictionary = dictionary;
std::vector<std::shared_ptr<Buffer>> buffers,
std::vector<std::shared_ptr<ArrayData>> child_data, std::shared_ptr<Array> dictionary,
int64_t null_count, int64_t offset) {
if (buffers.size() > 0 && null_count == 0) {
buffers[0] = nullptr;
}
auto data = std::make_shared<ArrayData>(type, length, std::move(buffers),
std::move(child_data), null_count, offset);
data->dictionary = std::move(dictionary);
return data;
}

Expand Down
34 changes: 11 additions & 23 deletions cpp/src/arrow/array.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,50 +92,38 @@ struct ARROW_EXPORT ArrayData {
: type(type), length(length), null_count(null_count), offset(offset) {}

ArrayData(const std::shared_ptr<DataType>& type, int64_t length,
const std::vector<std::shared_ptr<Buffer>>& buffers,
std::vector<std::shared_ptr<Buffer>> buffers,
int64_t null_count = kUnknownNullCount, int64_t offset = 0)
: ArrayData(type, length, null_count, offset) {
this->buffers = buffers;
}

ArrayData(const std::shared_ptr<DataType>& type, int64_t length,
const std::vector<std::shared_ptr<Buffer>>& buffers,
const std::vector<std::shared_ptr<ArrayData>>& child_data,
int64_t null_count = kUnknownNullCount, int64_t offset = 0)
: ArrayData(type, length, null_count, offset) {
this->buffers = buffers;
this->child_data = child_data;
this->buffers = std::move(buffers);
}

ArrayData(const std::shared_ptr<DataType>& type, int64_t length,
std::vector<std::shared_ptr<Buffer>>&& buffers,
std::vector<std::shared_ptr<Buffer>> buffers,
std::vector<std::shared_ptr<ArrayData>> child_data,
int64_t null_count = kUnknownNullCount, int64_t offset = 0)
: ArrayData(type, length, null_count, offset) {
this->buffers = std::move(buffers);
this->child_data = std::move(child_data);
}

static std::shared_ptr<ArrayData> Make(const std::shared_ptr<DataType>& type,
int64_t length,
std::vector<std::shared_ptr<Buffer>>&& buffers,
std::vector<std::shared_ptr<Buffer>> buffers,
int64_t null_count = kUnknownNullCount,
int64_t offset = 0);

static std::shared_ptr<ArrayData> Make(
const std::shared_ptr<DataType>& type, int64_t length,
const std::vector<std::shared_ptr<Buffer>>& buffers,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);

static std::shared_ptr<ArrayData> Make(
const std::shared_ptr<DataType>& type, int64_t length,
const std::vector<std::shared_ptr<Buffer>>& buffers,
const std::vector<std::shared_ptr<ArrayData>>& child_data,
std::vector<std::shared_ptr<Buffer>> buffers,
std::vector<std::shared_ptr<ArrayData>> child_data,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);

static std::shared_ptr<ArrayData> Make(
const std::shared_ptr<DataType>& type, int64_t length,
const std::vector<std::shared_ptr<Buffer>>& buffers,
const std::vector<std::shared_ptr<ArrayData>>& child_data,
const std::shared_ptr<Array>& dictionary, int64_t null_count = kUnknownNullCount,
std::vector<std::shared_ptr<Buffer>> buffers,
std::vector<std::shared_ptr<ArrayData>> child_data,
std::shared_ptr<Array> dictionary, int64_t null_count = kUnknownNullCount,
int64_t offset = 0);

static std::shared_ptr<ArrayData> Make(const std::shared_ptr<DataType>& type,
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/array/dict_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ Status DictionaryArray::Transpose(MemoryPool* pool, const std::shared_ptr<DataTy

// Shift null buffer if the original offset is non-zero
std::shared_ptr<Buffer> null_bitmap;
if (data_->offset != 0) {
if (data_->offset != 0 && data_->null_count != 0) {
ARROW_ASSIGN_OR_RAISE(
null_bitmap, CopyBitmap(pool, null_bitmap_data_, data_->offset, data_->length));
} else {
Expand Down
6 changes: 5 additions & 1 deletion cpp/src/arrow/array_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -594,7 +594,11 @@ void TestPrimitiveBuilder<PBoolean>::Check(const std::unique_ptr<BooleanBuilder>
AssertArraysEqual(*result, *expected);

// buffers are correctly sized
ASSERT_EQ(result->data()->buffers[0]->size(), BitUtil::BytesForBits(size));
if (result->data()->buffers[0]) {
ASSERT_EQ(result->data()->buffers[0]->size(), BitUtil::BytesForBits(size));
} else {
ASSERT_EQ(result->data()->null_count, 0);
}
ASSERT_EQ(result->data()->buffers[1]->size(), BitUtil::BytesForBits(size));

// Builder is now reset
Expand Down
19 changes: 14 additions & 5 deletions cpp/src/arrow/compute/kernels/aggregate_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -60,16 +60,25 @@ static SumResult<ArrowType> NaiveSumPartial(const Array& array) {
ResultType result;

auto data = array.data();
internal::BitmapReader reader(array.null_bitmap_data(), array.offset(), array.length());
const auto& array_numeric = reinterpret_cast<const ArrayType&>(array);
const auto values = array_numeric.raw_values();
for (int64_t i = 0; i < array.length(); i++) {
if (reader.IsSet()) {

if (array.null_count() != 0) {
internal::BitmapReader reader(array.null_bitmap_data(), array.offset(),
array.length());
for (int64_t i = 0; i < array.length(); i++) {
if (reader.IsSet()) {
result.first += values[i];
result.second++;
}

reader.Next();
}
} else {
for (int64_t i = 0; i < array.length(); i++) {
result.first += values[i];
result.second++;
}

reader.Next();
}

return result;
Expand Down
17 changes: 11 additions & 6 deletions cpp/src/arrow/compute/kernels/minmax.cc
Original file line number Diff line number Diff line change
Expand Up @@ -84,19 +84,24 @@ class MinMaxAggregateFunction final
Status Consume(const Array& array, StateType* state) const override {
StateType local;

internal::BitmapReader reader(array.null_bitmap_data(), array.offset(),
array.length());
const auto values =
checked_cast<const typename TypeTraits<ArrowType>::ArrayType&>(array)
.raw_values();
for (int64_t i = 0; i < array.length(); i++) {
if (reader.IsSet()) {
if (array.null_count() != 0) {
internal::BitmapReader reader(array.null_bitmap_data(), array.offset(),
array.length());
for (int64_t i = 0; i < array.length(); i++) {
if (reader.IsSet()) {
local.MergeOne(values[i]);
}
reader.Next();
}
} else {
for (int64_t i = 0; i < array.length(); i++) {
local.MergeOne(values[i]);
}
reader.Next();
}
*state = local;

return Status::OK();
}

Expand Down
5 changes: 4 additions & 1 deletion cpp/src/arrow/json/parser_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,10 @@ void AssertUnconvertedArraysEqual(const Array& expected, const Array& actual) {
}
case Type::LIST: {
ASSERT_EQ(expected.type_id(), Type::LIST);
AssertBufferEqual(*expected.null_bitmap(), *actual.null_bitmap());
ASSERT_EQ(expected.null_count(), actual.null_count());
if (expected.null_count() != 0) {
AssertBufferEqual(*expected.null_bitmap(), *actual.null_bitmap());
}
const auto& expected_offsets = expected.data()->buffers[1];
const auto& actual_offsets = actual.data()->buffers[1];
AssertBufferEqual(*expected_offsets, *actual_offsets);
Expand Down
Loading

0 comments on commit 882cd34

Please sign in to comment.