ARROW-8011: [C++] Fix buffer size when reading Parquet data to Arrow
In some cases (the so-called "zero copy" reads), Arrow arrays read from Parquet would be backed by too big buffers, leading to possible private data leaks.
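The mechanism can be illustrated with a small standalone sketch (not part of this patch, and assuming the current Result-returning arrow::AllocateResizableBuffer API): the record reader grows its value buffer to a rounded-up capacity while decoding, and the fix trims the buffer back to the bytes actually written before releasing it, so consumers never see trailing uninitialized heap memory.

#include <cstdint>
#include <iostream>
#include <memory>

#include <arrow/buffer.h>
#include <arrow/memory_pool.h>
#include <arrow/result.h>
#include <arrow/status.h>

// Sketch of the fix's mechanism: over-allocate (as the reader does while
// decoding), then shrink to the bytes actually written before handing the
// buffer out. Without the Resize, the released buffer would report the full
// 1024-byte allocation and expose uninitialized heap bytes past the data.
arrow::Status Demo() {
  arrow::MemoryPool* pool = arrow::default_memory_pool();
  ARROW_ASSIGN_OR_RAISE(std::unique_ptr<arrow::ResizableBuffer> values,
                        arrow::AllocateResizableBuffer(1024, pool));

  const int64_t bytes_written = 100;  // pretend only 100 bytes were decoded
  ARROW_RETURN_NOT_OK(values->Resize(bytes_written, /*shrink_to_fit=*/true));
  std::cout << "buffer size after trim: " << values->size() << std::endl;  // 100
  return arrow::Status::OK();
}

int main() { return Demo().ok() ? 0 : 1; }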

Closes #6562 from pitrou/ARROW-8011-parquet-buffer-size and squashes the following commits:

949702d <Antoine Pitrou> ARROW-8011:  Fix buffer size when reading Parquet data to Arrow

Authored-by: Antoine Pitrou <antoine@python.org>
Signed-off-by: Wes McKinney <wesm+git@apache.org>
pitrou authored and wesm committed Mar 9, 2020
1 parent c03f2f6 commit cc74740
Showing 2 changed files with 36 additions and 3 deletions.
13 changes: 10 additions & 3 deletions cpp/src/parquet/column_reader.cc
@@ -911,6 +911,12 @@ class TypedRecordReader : public ColumnReaderImplBase<DType>,
    return this->num_buffered_values_ - this->num_decoded_values_;
  }

  // Compute the values capacity in bytes for the given number of elements
  int64_t bytes_for_values(int64_t nitems) const {
    int type_size = GetTypeByteSize(this->descr_->physical_type());
    return nitems * type_size;
  }

  int64_t ReadRecords(int64_t num_records) override {
    // Delimit records, then read values at the end
    int64_t records_read = 0;
@@ -987,6 +993,7 @@ class TypedRecordReader : public ColumnReaderImplBase<DType>,
  std::shared_ptr<ResizableBuffer> ReleaseValues() override {
    if (uses_values_) {
      auto result = values_;
      PARQUET_THROW_NOT_OK(result->Resize(bytes_for_values(values_written_), true));
      values_ = AllocateBuffer(this->pool_);
      return result;
    } else {
@@ -997,6 +1004,7 @@ class TypedRecordReader : public ColumnReaderImplBase<DType>,
  std::shared_ptr<ResizableBuffer> ReleaseIsValid() override {
    if (nullable_values_) {
      auto result = valid_bits_;
      PARQUET_THROW_NOT_OK(result->Resize(BitUtil::BytesForBits(values_written_), true));
      valid_bits_ = AllocateBuffer(this->pool_);
      return result;
    } else {
@@ -1078,12 +1086,11 @@ class TypedRecordReader : public ColumnReaderImplBase<DType>,
      new_values_capacity = BitUtil::NextPower2(new_values_capacity + 1);
    }

    int type_size = GetTypeByteSize(this->descr_->physical_type());

    // XXX(wesm): A hack to avoid memory allocation when reading directly
    // into builder classes
    if (uses_values_) {
      PARQUET_THROW_NOT_OK(values_->Resize(new_values_capacity * type_size, false));
      PARQUET_THROW_NOT_OK(
          values_->Resize(bytes_for_values(new_values_capacity), false));
    }

    values_capacity_ = new_values_capacity;
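For context on the two size computations the patch applies before releasing buffers: the values buffer needs nitems times the physical type's byte width (bytes_for_values), while the validity bitmap is bit-packed and needs one bit per value, rounded up to whole bytes (BitUtil::BytesForBits). A standalone sketch of that arithmetic (illustrative only, with a local stand-in for BytesForBits):

#include <cstdint>
#include <iostream>

// Local stand-in for BitUtil::BytesForBits: round a bit count up to whole bytes.
int64_t BytesForBits(int64_t bits) { return (bits + 7) / 8; }

int main() {
  const int64_t values_written = 20;          // e.g. 20 values decoded
  const int64_t type_size = sizeof(int64_t);  // physical type INT64

  // The exact sizes the buffers are trimmed to in ReleaseValues / ReleaseIsValid.
  std::cout << "values buffer:   " << values_written * type_size << " bytes\n";    // 160
  std::cout << "validity bitmap: " << BytesForBits(values_written) << " bytes\n";  // 3
  return 0;
}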
26 changes: 26 additions & 0 deletions python/pyarrow/tests/test_parquet.py
@@ -3465,3 +3465,29 @@ def test_fastparquet_cross_compatibility(tempdir):
# (no arrow schema in parquet metadata)
df['f'] = df['f'].astype(object)
tm.assert_frame_equal(table_fp.to_pandas(), df)


@pytest.mark.parametrize('array_factory', [
    lambda: pa.array([0, None] * 10),
    lambda: pa.array([0, None] * 10).dictionary_encode(),
    lambda: pa.array(["", None] * 10),
    lambda: pa.array(["", None] * 10).dictionary_encode(),
])
@pytest.mark.parametrize('use_dictionary', [False, True])
@pytest.mark.parametrize('read_dictionary', [False, True])
def test_buffer_contents(array_factory, use_dictionary, read_dictionary):
    # Test that null values are deterministically initialized to zero
    # after a roundtrip through Parquet.
    # See ARROW-8006 and ARROW-8011.
    orig_table = pa.Table.from_pydict({"col": array_factory()})
    bio = io.BytesIO()
    pq.write_table(orig_table, bio, use_dictionary=use_dictionary)
    bio.seek(0)
    read_dictionary = ['col'] if read_dictionary else None
    table = pq.read_table(bio, use_threads=False,
                          read_dictionary=read_dictionary)

    for col in table.columns:
        [chunk] = col.chunks
        buf = chunk.buffers()[1]
        assert buf.to_pybytes() == buf.size * b"\0"
