Skip to content

Commit

Permalink
ARROW-12203: [C++][Python] Switch default Parquet version to 2.4 (#13280)
Browse files Browse the repository at this point in the history

Change the default parquet version to 2.4

Authored-by: Raúl Cumplido <raulcumplido@gmail.com>
Signed-off-by: Antoine Pitrou <antoine@python.org>
  • Loading branch information
raulcd committed Jun 1, 2022
1 parent e8cb0bd commit 797c88a
Show file tree
Hide file tree
Showing 6 changed files with 12 additions and 13 deletions.
3 changes: 1 addition & 2 deletions cpp/src/parquet/arrow/arrow_schema_test.cc
Expand Up @@ -836,8 +836,7 @@ TEST_F(TestConvertArrowSchema, ArrowFields) {
{"int8", ::arrow::int8(), LogicalType::Int(8, true), ParquetType::INT32, -1},
{"uint16", ::arrow::uint16(), LogicalType::Int(16, false), ParquetType::INT32, -1},
{"int16", ::arrow::int16(), LogicalType::Int(16, true), ParquetType::INT32, -1},
{"uint32", ::arrow::uint32(), LogicalType::None(), ParquetType::INT64,
-1}, // Parquet 1.0
{"uint32", ::arrow::uint32(), LogicalType::Int(32, false), ParquetType::INT32, -1},
{"int32", ::arrow::int32(), LogicalType::None(), ParquetType::INT32, -1},
{"uint64", ::arrow::uint64(), LogicalType::Int(64, false), ParquetType::INT64, -1},
{"int64", ::arrow::int64(), LogicalType::None(), ParquetType::INT64, -1},
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/parquet/properties.h
Expand Up @@ -166,7 +166,7 @@ class PARQUET_EXPORT WriterProperties {
write_batch_size_(DEFAULT_WRITE_BATCH_SIZE),
max_row_group_length_(DEFAULT_MAX_ROW_GROUP_LENGTH),
pagesize_(kDefaultDataPageSize),
version_(ParquetVersion::PARQUET_1_0),
version_(ParquetVersion::PARQUET_2_4),
data_page_version_(ParquetDataPageVersion::V1),
created_by_(DEFAULT_CREATED_BY) {}
virtual ~Builder() {}
Expand Down Expand Up @@ -246,7 +246,7 @@ class PARQUET_EXPORT WriterProperties {
}

/// Specify the Parquet file version.
/// Default PARQUET_1_0.
/// Default PARQUET_2_4.
Builder* version(ParquetVersion::type version) {
version_ = version;
return this;
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/parquet/properties_test.cc
Expand Up @@ -43,7 +43,7 @@ TEST(TestWriterProperties, Basics) {

ASSERT_EQ(kDefaultDataPageSize, props->data_pagesize());
ASSERT_EQ(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT, props->dictionary_pagesize_limit());
ASSERT_EQ(ParquetVersion::PARQUET_1_0, props->version());
ASSERT_EQ(ParquetVersion::PARQUET_2_4, props->version());
ASSERT_EQ(ParquetDataPageVersion::V1, props->data_page_version());
}

Expand Down
6 changes: 3 additions & 3 deletions python/pyarrow/_parquet.pyx
Expand Up @@ -714,7 +714,7 @@ cdef class FileMetaData(_Weakrefable):
"""
Parquet format version used in file (str, such as '1.0', '2.4').
If version is missing or unparsable, will default to assuming '1.0'.
If version is missing or unparsable, will default to assuming '2.4'.
"""
cdef ParquetVersion version = self._metadata.version()
if version == ParquetVersion_V1:
Expand All @@ -726,9 +726,9 @@ cdef class FileMetaData(_Weakrefable):
elif version == ParquetVersion_V2_6:
return '2.6'
else:
warnings.warn('Unrecognized file version, assuming 1.0: {}'
warnings.warn('Unrecognized file version, assuming 2.4: {}'
.format(version))
return '1.0'
return '2.4'

@property
def created_by(self):
Expand Down
8 changes: 4 additions & 4 deletions python/pyarrow/parquet/__init__.py
Expand Up @@ -674,7 +674,7 @@ def _sanitize_table(table, new_schema, flavor):
return table


_parquet_writer_arg_docs = """version : {"1.0", "2.4", "2.6"}, default "1.0"
_parquet_writer_arg_docs = """version : {"1.0", "2.4", "2.6"}, default "2.4"
Determine which Parquet logical types are available for use, whether the
reduced set from the Parquet 1.x.x format or the expanded logical types
added in later format versions.
Expand Down Expand Up @@ -862,7 +862,7 @@ class ParquetWriter:

def __init__(self, where, schema, filesystem=None,
flavor=None,
version='1.0',
version='2.4',
use_dictionary=True,
compression='snappy',
write_statistics=True,
Expand Down Expand Up @@ -2854,7 +2854,7 @@ def read_pandas(source, columns=None, **kwargs):
_DNF_filter_doc, "")


def write_table(table, where, row_group_size=None, version='1.0',
def write_table(table, where, row_group_size=None, version='2.4',
use_dictionary=True, compression='snappy',
write_statistics=True,
use_deprecated_int96_timestamps=None,
Expand Down Expand Up @@ -3336,7 +3336,7 @@ def read_metadata(where, memory_map=False, decryption_properties=None):
num_columns: 2
num_rows: 3
num_row_groups: 1
format_version: 1.0
format_version: 2.6
serialized_size: 561
"""
return ParquetFile(where, memory_map=memory_map,
Expand Down
2 changes: 1 addition & 1 deletion python/pyarrow/tests/parquet/test_pandas.py
Expand Up @@ -256,7 +256,7 @@ def test_pandas_parquet_pyfile_roundtrip(tempdir, use_legacy_dataset):
arrow_table = pa.Table.from_pandas(df)

with filename.open('wb') as f:
_write_table(arrow_table, f, version="1.0")
_write_table(arrow_table, f, version="2.4")

data = io.BytesIO(filename.read_bytes())

Expand Down

0 comments on commit 797c88a

Please sign in to comment.