Skip to content
This repository has been archived by the owner on May 10, 2024. It is now read-only.

Commit

Permalink
PARQUET-1346: [C++] Protect against empty Arrow arrays with null values
Browse files Browse the repository at this point in the history
Author: Antoine Pitrou <antoine@python.org>

Closes #474 from pitrou/PARQUET-1346-null-values and squashes the following commits:

08bad23 [Antoine Pitrou] Do not ignore return value
a1c0378 [Antoine Pitrou] Fix uninitialized value
dfb42d2 [Antoine Pitrou] Try to fix lint
b514d0d [Antoine Pitrou] Try to fix compile failures on Travis-CI (due to old Arrow?)
3951f25 [Antoine Pitrou] PARQUET-1346: [C++] Protect against empty Arrow arrays with null values
  • Loading branch information
pitrou authored and xhochy committed Jul 12, 2018
1 parent d9c262a commit e6739e9
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 5 deletions.
15 changes: 13 additions & 2 deletions src/parquet/arrow/arrow-reader-writer-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -536,6 +536,13 @@ class TestParquetIO : public ::testing::Test {
*out = MakeSimpleTable(lists->Slice(3, size - 6), nullable_lists);
}

// Prepare table of empty lists, with null values array (ARROW-2744)
void PrepareEmptyListsTable(int64_t size, std::shared_ptr<Table>* out) {
std::shared_ptr<Array> lists;
ASSERT_OK(MakeEmptyListsArray(size, &lists));
*out = MakeSimpleTable(lists, true /* nullable_lists */);
}

void PrepareListOfListTable(int64_t size, bool nullable_parent_lists,
bool nullable_lists, bool nullable_elements,
int64_t null_count, std::shared_ptr<Table>* out) {
Expand Down Expand Up @@ -713,6 +720,12 @@ TYPED_TEST(TestParquetIO, SingleColumnTableOptionalReadWrite) {
ASSERT_NO_FATAL_FAILURE(this->CheckRoundTrip(table));
}

TYPED_TEST(TestParquetIO, SingleEmptyListsColumnReadWrite) {
std::shared_ptr<Table> table;
ASSERT_NO_FATAL_FAILURE(this->PrepareEmptyListsTable(SMALL_SIZE, &table));
ASSERT_NO_FATAL_FAILURE(this->CheckRoundTrip(table));
}

TYPED_TEST(TestParquetIO, SingleNullableListNullableColumnReadWrite) {
std::shared_ptr<Table> table;
ASSERT_NO_FATAL_FAILURE(this->PrepareListTable(SMALL_SIZE, true, true, 10, &table));
Expand Down Expand Up @@ -1524,8 +1537,6 @@ void MakeDoubleTable(int num_columns, int num_rows, int nchunks,

void MakeListArray(int num_rows, std::shared_ptr<::DataType>* out_type,
std::shared_ptr<Array>* out_array) {
::arrow::Int32Builder offset_builder;

std::vector<int32_t> length_draws;
randint(num_rows, 0, 100, &length_draws);

Expand Down
27 changes: 27 additions & 0 deletions src/parquet/arrow/test-util.h
Original file line number Diff line number Diff line change
Expand Up @@ -394,6 +394,33 @@ Status MakeListArray(const std::shared_ptr<Array>& values, int64_t size,
return Status::OK();
}

// Make an array containing only empty lists, with a null values array
Status MakeEmptyListsArray(int64_t size, std::shared_ptr<Array>* out_array) {
// Allocate an offsets buffer containing only zeroes
std::shared_ptr<Buffer> offsets_buffer;
const int64_t offsets_nbytes = (size + 1) * sizeof(int32_t);
RETURN_NOT_OK(::arrow::AllocateBuffer(::arrow::default_memory_pool(), offsets_nbytes,
&offsets_buffer));
memset(offsets_buffer->mutable_data(), 0, offsets_nbytes);

auto value_field = ::arrow::field("item", ::arrow::float64(),
false /* nullable_values */);
auto list_type = ::arrow::list(value_field);

std::vector<std::shared_ptr<Buffer>> child_buffers = {nullptr /* null bitmap */,
nullptr /* values */ };
auto child_data = ::arrow::ArrayData::Make(value_field->type(), 0,
std::move(child_buffers));

std::vector<std::shared_ptr<Buffer>> buffers = {nullptr /* bitmap */,
offsets_buffer };
auto array_data = ::arrow::ArrayData::Make(list_type, size, std::move(buffers));
array_data->child_data.push_back(child_data);

*out_array = ::arrow::MakeArray(array_data);
return Status::OK();
}

static std::shared_ptr<::arrow::Column> MakeColumn(const std::string& name,
const std::shared_ptr<Array>& array,
bool nullable) {
Expand Down
17 changes: 14 additions & 3 deletions src/parquet/arrow/writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -411,8 +411,13 @@ Status ArrowColumnWriter::TypedWriteBatch(const Array& array, int64_t num_levels
using ArrowCType = typename ArrowType::c_type;

const auto& data = static_cast<const PrimitiveArray&>(array);
auto values =
reinterpret_cast<const ArrowCType*>(data.values()->data()) + data.offset();
const ArrowCType* values = nullptr;
// The values buffer may be null if the array is empty (ARROW-2744)
if (data.values() != nullptr) {
values = reinterpret_cast<const ArrowCType*>(data.values()->data()) + data.offset();
} else {
DCHECK_EQ(data.length(), 0);
}

if (writer_->descr()->schema_node()->is_required() || (data.null_count() == 0)) {
// no nulls, just dump the data
Expand Down Expand Up @@ -706,7 +711,13 @@ Status ArrowColumnWriter::TypedWriteBatch<BooleanType, ::arrow::BooleanType>(
RETURN_NOT_OK(ctx_->GetScratchData<bool>(array.length(), &buffer));

const auto& data = static_cast<const BooleanArray&>(array);
auto values = reinterpret_cast<const uint8_t*>(data.values()->data());
const uint8_t* values = nullptr;
// The values buffer may be null if the array is empty (ARROW-2744)
if (data.values() != nullptr) {
values = reinterpret_cast<const uint8_t*>(data.values()->data());
} else {
DCHECK_EQ(data.length(), 0);
}

int buffer_idx = 0;
int64_t offset = array.offset();
Expand Down

0 comments on commit e6739e9

Please sign in to comment.