ARROW-8011: [C++] Fix buffer size when reading Parquet data to Arrow
In some cases (the so-called "zero copy" reads), Arrow arrays read from Parquet would be backed by too big buffers, leading to possible private data leaks.
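The mechanism can be illustrated with a small standalone sketch (not part of this patch, and assuming the current Result-returning arrow::AllocateResizableBuffer API): the record reader grows its value buffer to a rounded-up capacity while decoding, and the fix trims the buffer back to the bytes actually written before releasing it, so consumers never see trailing uninitialized heap memory.

#include <cstdint>
#include <iostream>
#include <memory>

#include <arrow/buffer.h>
#include <arrow/memory_pool.h>
#include <arrow/result.h>
#include <arrow/status.h>

// Sketch of the fix's mechanism: over-allocate (as the reader does while
// decoding), then shrink to the bytes actually written before handing the
// buffer out. Without the Resize, the released buffer would report the full
// 1024-byte allocation and expose uninitialized heap bytes past the data.
arrow::Status Demo() {
  arrow::MemoryPool* pool = arrow::default_memory_pool();
  ARROW_ASSIGN_OR_RAISE(std::unique_ptr<arrow::ResizableBuffer> values,
                        arrow::AllocateResizableBuffer(1024, pool));

  const int64_t bytes_written = 100;  // pretend only 100 bytes were decoded
  ARROW_RETURN_NOT_OK(values->Resize(bytes_written, /*shrink_to_fit=*/true));
  std::cout << "buffer size after trim: " << values->size() << std::endl;  // 100
  return arrow::Status::OK();
}

int main() { return Demo().ok() ? 0 : 1; }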

Closes #6562 from pitrou/ARROW-8011-parquet-buffer-size and squashes the following commits:

949702d <Antoine Pitrou> ARROW-8011:  Fix buffer size when reading Parquet data to Arrow

Authored-by: Antoine Pitrou <antoine@python.org>
Signed-off-by: Wes McKinney <wesm+git@apache.org>
pitrou authored and wesm committed Mar 9, 2020
1 parent c03f2f6 commit cc74740
Showing 2 changed files with 36 additions and 3 deletions.
13 changes: 10 additions & 3 deletions cpp/src/parquet/column_reader.cc
@@ -911,6 +911,12 @@ class TypedRecordReader : public ColumnReaderImplBase<DType>,
    return this->num_buffered_values_ - this->num_decoded_values_;
  }

  // Compute the values capacity in bytes for the given number of elements
  int64_t bytes_for_values(int64_t nitems) const {
    int type_size = GetTypeByteSize(this->descr_->physical_type());
    return nitems * type_size;
  }

  int64_t ReadRecords(int64_t num_records) override {
    // Delimit records, then read values at the end
    int64_t records_read = 0;
@@ -987,6 +993,7 @@ class TypedRecordReader : public ColumnReaderImplBase<DType>,
  std::shared_ptr<ResizableBuffer> ReleaseValues() override {
    if (uses_values_) {
      auto result = values_;
      PARQUET_THROW_NOT_OK(result->Resize(bytes_for_values(values_written_), true));
      values_ = AllocateBuffer(this->pool_);
      return result;
    } else {
@@ -997,6 +1004,7 @@ class TypedRecordReader : public ColumnReaderImplBase<DType>,
  std::shared_ptr<ResizableBuffer> ReleaseIsValid() override {
    if (nullable_values_) {
      auto result = valid_bits_;
      PARQUET_THROW_NOT_OK(result->Resize(BitUtil::BytesForBits(values_written_), true));
      valid_bits_ = AllocateBuffer(this->pool_);
      return result;
    } else {
@@ -1078,12 +1086,11 @@ class TypedRecordReader : public ColumnReaderImplBase<DType>,
      new_values_capacity = BitUtil::NextPower2(new_values_capacity + 1);
    }

    int type_size = GetTypeByteSize(this->descr_->physical_type());

    // XXX(wesm): A hack to avoid memory allocation when reading directly
    // into builder classes
    if (uses_values_) {
      PARQUET_THROW_NOT_OK(values_->Resize(new_values_capacity * type_size, false));
      PARQUET_THROW_NOT_OK(
          values_->Resize(bytes_for_values(new_values_capacity), false));
    }

    values_capacity_ = new_values_capacity;
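For context on the two size computations the patch applies before releasing buffers: the values buffer needs nitems times the physical type's byte width (bytes_for_values), while the validity bitmap is bit-packed and needs one bit per value, rounded up to whole bytes (BitUtil::BytesForBits). A standalone sketch of that arithmetic (illustrative only, with a local stand-in for BytesForBits):

#include <cstdint>
#include <iostream>

// Local stand-in for BitUtil::BytesForBits: round a bit count up to whole bytes.
int64_t BytesForBits(int64_t bits) { return (bits + 7) / 8; }

int main() {
  const int64_t values_written = 20;          // e.g. 20 values decoded
  const int64_t type_size = sizeof(int64_t);  // physical type INT64

  // The exact sizes the buffers are trimmed to in ReleaseValues / ReleaseIsValid.
  std::cout << "values buffer:   " << values_written * type_size << " bytes\n";    // 160
  std::cout << "validity bitmap: " << BytesForBits(values_written) << " bytes\n";  // 3
  return 0;
}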
26 changes: 26 additions & 0 deletions python/pyarrow/tests/test_parquet.py
@@ -3465,3 +3465,29 @@ def test_fastparquet_cross_compatibility(tempdir):
# (no arrow schema in parquet metadata)
df['f'] = df['f'].astype(object)
tm.assert_frame_equal(table_fp.to_pandas(), df)


@pytest.mark.parametrize('array_factory', [
    lambda: pa.array([0, None] * 10),
    lambda: pa.array([0, None] * 10).dictionary_encode(),
    lambda: pa.array(["", None] * 10),
    lambda: pa.array(["", None] * 10).dictionary_encode(),
])
@pytest.mark.parametrize('use_dictionary', [False, True])
@pytest.mark.parametrize('read_dictionary', [False, True])
def test_buffer_contents(array_factory, use_dictionary, read_dictionary):
    # Test that null values are deterministically initialized to zero
    # after a roundtrip through Parquet.
    # See ARROW-8006 and ARROW-8011.
    orig_table = pa.Table.from_pydict({"col": array_factory()})
    bio = io.BytesIO()
    pq.write_table(orig_table, bio, use_dictionary=use_dictionary)
    bio.seek(0)
    read_dictionary = ['col'] if read_dictionary else None
    table = pq.read_table(bio, use_threads=False,
                          read_dictionary=read_dictionary)

    for col in table.columns:
        [chunk] = col.chunks
        buf = chunk.buffers()[1]
        assert buf.to_pybytes() == buf.size * b"\0"
