GH-35859: [Python] Actually change the default row group size to 1Mi (#36012)

### Rationale for this change

In #34280 the default row group size was changed to 1Mi. However, this was accidentally reverted (for Python, but not C++) in #34435.

The problem is that there is both an "absolute max row group size for the writer" and a "row group size to use for this table". The pyarrow user is unable to set the former property.

The previous behavior in pyarrow was: if no value is given in the call to write_table, don't specify anything and let the absolute max apply.

The first fix (#34280) changed the absolute max to 1Mi, but this made it impossible for the user to request a larger row group size. The second fix (#34435) changed the absolute max back to 64Mi, but this meant the default never actually changed.
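
For illustration, a minimal sketch of the two knobs involved (file names and sizes here are made up; only `row_group_size` is exposed through `pyarrow.parquet.write_table`, the writer-level maximum is not):

```python
import pyarrow as pa
import pyarrow.parquet as pq

# A small illustrative table; real tables are typically much larger.
table = pa.table({"x": list(range(10_000))})

# Per-call row group size: the only knob the pyarrow user can set.
pq.write_table(table, "explicit.parquet", row_group_size=1_000)
print(pq.read_metadata("explicit.parquet").num_row_groups)  # 10

# No per-call size: previously the writer-level absolute max (not exposed
# in pyarrow) was the only limit on row group size, so a table this small
# ended up as a single row group.
pq.write_table(table, "default.parquet")
print(pq.read_metadata("default.parquet").num_row_groups)  # 1
```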

### What changes are included in this PR?

This change leaves the absolute max at 64Mi. However, if the user does not specify a row group size, we no longer just use the table size; we now use 1Mi (or the table size, if it is smaller).
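
As a rough Python model of the new selection logic (the real code is in `_parquet.pyx` plus the C++ writer; this is only a sketch using the constants added in the diff below):

```python
_DEFAULT_ROW_GROUP_SIZE = 1024 * 1024       # 1Mi rows: the new default per this PR
_MAX_ROW_GROUP_SIZE = 64 * 1024 * 1024      # 64Mi rows: the writer's absolute max


def effective_row_group_size(num_rows, row_group_size=None):
    """Approximate how write_table picks a row group size after this change."""
    if row_group_size is None or row_group_size == -1:
        # New behavior: default to 1Mi rows instead of "the whole table".
        requested = min(num_rows, _DEFAULT_ROW_GROUP_SIZE)
    elif row_group_size == 0:
        raise ValueError("Row group size cannot be 0")
    else:
        requested = row_group_size
    # The writer still latches anything larger to its absolute max.
    return min(requested, _MAX_ROW_GROUP_SIZE)


assert effective_row_group_size(10_000) == 10_000
assert effective_row_group_size(100 * 1024 * 1024) == 1024 * 1024
assert effective_row_group_size(100 * 1024 * 1024, 128 * 1024 * 1024) == 64 * 1024 * 1024
```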

### Are these changes tested?

Yes, a unit test was added.

### Are there any user-facing changes?

Yes, the default row group size now truly changes to 1Mi (a quick way to observe this is sketched below). This change was already announced as part of #34280.
* Closes: #35859
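
A small way to observe the new default (assuming a pyarrow build that includes this change; the file name is illustrative):

```python
import pyarrow as pa
import pyarrow.parquet as pq

# One row more than the new 1Mi-row default.
n = 1024 * 1024 + 1
table = pa.table({"x": pa.array(range(n), type=pa.int32())})

pq.write_table(table, "defaults.parquet")   # no row_group_size given
meta = pq.read_metadata("defaults.parquet")
print(meta.num_row_groups)         # 2 with this change (previously 1)
print(meta.row_group(0).num_rows)  # 1048576
print(meta.row_group(1).num_rows)  # 1
```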

Authored-by: Weston Pace <weston.pace@gmail.com>
Signed-off-by: Weston Pace <weston.pace@gmail.com>
westonpace committed Jun 22, 2023
1 parent 2ce4a38 commit 0d1f723
Showing 2 changed files with 16 additions and 5 deletions.
python/pyarrow/_parquet.pyx (6 changes: 4 additions & 2 deletions)

@@ -42,6 +42,8 @@ from pyarrow.lib import (ArrowException, NativeFile, BufferOutputStream,
 
 cimport cpython as cp
 
+_DEFAULT_ROW_GROUP_SIZE = 1024*1024
+_MAX_ROW_GROUP_SIZE = 64*1024*1024
 
 cdef class Statistics(_Weakrefable):
     """Statistics for a single column in a single row group."""
@@ -1595,7 +1597,7 @@ cdef shared_ptr[WriterProperties] _create_writer_properties(
     # The user can always specify a smaller row group size (and the default
     # is smaller) when calling write_table. If the call to write_table uses
     # a size larger than this then it will be latched to this value.
-    props.max_row_group_length(64*1024*1024)
+    props.max_row_group_length(_MAX_ROW_GROUP_SIZE)
 
     properties = props.build()
 
@@ -1767,7 +1769,7 @@ cdef class ParquetWriter(_Weakrefable):
         int64_t c_row_group_size
 
         if row_group_size is None or row_group_size == -1:
-            c_row_group_size = ctable.num_rows()
+            c_row_group_size = min(ctable.num_rows(), _DEFAULT_ROW_GROUP_SIZE)
         elif row_group_size == 0:
             raise ValueError('Row group size cannot be 0')
         else:
python/pyarrow/tests/parquet/test_parquet_writer.py (15 changes: 12 additions & 3 deletions)

@@ -223,15 +223,19 @@ def check_chunk_size(data_size, chunk_size, expect_num_chunks):
         table = pa.Table.from_arrays([
             _range_integers(data_size, 'b')
         ], names=['x'])
-        pq.write_table(table, tempdir / 'test.parquet', row_group_size=chunk_size)
+        if chunk_size is None:
+            pq.write_table(table, tempdir / 'test.parquet')
+        else:
+            pq.write_table(table, tempdir / 'test.parquet', row_group_size=chunk_size)
         metadata = pq.read_metadata(tempdir / 'test.parquet')
+        expected_chunk_size = default_chunk_size if chunk_size is None else chunk_size
         assert metadata.num_row_groups == expect_num_chunks
-        latched_chunk_size = min(chunk_size, abs_max_chunk_size)
+        latched_chunk_size = min(expected_chunk_size, abs_max_chunk_size)
         # First chunks should be full size
         for chunk_idx in range(expect_num_chunks - 1):
             assert metadata.row_group(chunk_idx).num_rows == latched_chunk_size
         # Last chunk may be smaller
-        remainder = data_size - (chunk_size * (expect_num_chunks - 1))
+        remainder = data_size - (expected_chunk_size * (expect_num_chunks - 1))
         if remainder == 0:
             assert metadata.row_group(
                 expect_num_chunks - 1).num_rows == latched_chunk_size
@@ -246,6 +250,11 @@ def check_chunk_size(data_size, chunk_size, expect_num_chunks):
     # by the absolute max chunk size
     check_chunk_size(abs_max_chunk_size * 2, abs_max_chunk_size * 2, 2)
 
+    # These tests don't pass a chunk_size to write_table and so the chunk size
+    # should be default_chunk_size
+    check_chunk_size(default_chunk_size, None, 1)
+    check_chunk_size(default_chunk_size + 1, None, 2)
+
 
 @pytest.mark.pandas
 @pytest.mark.parametrize("filesystem", [
