Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-36882: [C++][Parquet] Default RLE for bool values in the parquet version 2.x #36955

Merged
merged 3 commits into from
Aug 31, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions cpp/src/parquet/column_writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2329,6 +2329,12 @@ std::shared_ptr<ColumnWriter> ColumnWriter::Make(ColumnChunkMetaDataBuilder* met
const bool use_dictionary = properties->dictionary_enabled(descr->path()) &&
descr->physical_type() != Type::BOOLEAN;
Encoding::type encoding = properties->encoding(descr->path());
if (encoding == Encoding::UNKNOWN) {
encoding = (descr->physical_type() == Type::BOOLEAN &&
properties->version() != ParquetVersion::PARQUET_1_0)
? Encoding::RLE
: Encoding::PLAIN;
}
if (use_dictionary) {
encoding = properties->dictionary_index_encoding();
}
Expand Down
53 changes: 38 additions & 15 deletions cpp/src/parquet/column_writer_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -203,8 +203,15 @@ class TestPrimitiveWriter : public PrimitiveTypedTest<TestType> {

if (this->type_num() == Type::BOOLEAN) {
// Dictionary encoding is not allowed for boolean type
// There are 2 encodings (PLAIN, RLE) in a non dictionary encoding case
std::set<Encoding::type> expected({Encoding::PLAIN, Encoding::RLE});
std::set<Encoding::type> expected;
if (version == ParquetVersion::PARQUET_1_0) {
// There are 2 encodings (PLAIN, RLE) in a non dictionary encoding case for
// version 1.0. Note that RLE is used for DL/RL.
expected = {Encoding::PLAIN, Encoding::RLE};
} else {
// There is only 1 encoding (RLE) in a fallback case for version 2.0
expected = {Encoding::RLE};
}
ASSERT_EQ(encodings, expected);
} else if (version == ParquetVersion::PARQUET_1_0) {
// There are 3 encodings (PLAIN_DICTIONARY, PLAIN, RLE) in a fallback case
Expand All @@ -223,7 +230,8 @@ class TestPrimitiveWriter : public PrimitiveTypedTest<TestType> {
std::vector<parquet::PageEncodingStats> encoding_stats =
this->metadata_encoding_stats();
if (this->type_num() == Type::BOOLEAN) {
ASSERT_EQ(encoding_stats[0].encoding, Encoding::PLAIN);
ASSERT_EQ(encoding_stats[0].encoding,
version == ParquetVersion::PARQUET_1_0 ? Encoding::PLAIN : Encoding::RLE);
ASSERT_EQ(encoding_stats[0].page_type, PageType::DATA_PAGE);
} else if (version == ParquetVersion::PARQUET_1_0) {
std::vector<Encoding::type> expected(
Expand Down Expand Up @@ -757,21 +765,36 @@ TEST_F(TestValuesWriterInt32Type, OptionalNullValueChunk) {
ASSERT_EQ(0, this->values_read_);
}

class TestBooleanValuesWriter : public TestPrimitiveWriter<BooleanType> {
public:
void TestWithEncoding(ParquetVersion::type version, Encoding::type encoding) {
this->SetUpSchema(Repetition::REQUIRED);
auto writer = this->BuildWriter(SMALL_SIZE, ColumnProperties(), version,
/*enable_checksum*/ false);
for (int i = 0; i < SMALL_SIZE; i++) {
bool value = (i % 2 == 0) ? true : false;
writer->WriteBatch(1, nullptr, nullptr, &value);
}
writer->Close();
this->ReadColumn();
for (int i = 0; i < SMALL_SIZE; i++) {
ASSERT_EQ((i % 2 == 0) ? true : false, this->values_out_[i]) << i;
}
const auto& encodings = this->metadata_encodings();
auto iter = std::find(encodings.begin(), encodings.end(), encoding);
ASSERT_TRUE(iter != encodings.end());
Comment on lines +783 to +785
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why can't we just assert the value of encodings? There should be only one encoding, right?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, in the case of parquet version 1.0, both PLAIN and RLE exists. The reason is here: https://github.com/apache/arrow/blob/main/cpp/src/parquet/metadata.cc#L1487-L1492

}
};

// PARQUET-764
// Correct bitpacking for boolean write at non-byte boundaries
using TestBooleanValuesWriter = TestPrimitiveWriter<BooleanType>;
TEST_F(TestBooleanValuesWriter, AlternateBooleanValues) {
this->SetUpSchema(Repetition::REQUIRED);
auto writer = this->BuildWriter();
for (int i = 0; i < SMALL_SIZE; i++) {
bool value = (i % 2 == 0) ? true : false;
writer->WriteBatch(1, nullptr, nullptr, &value);
}
writer->Close();
this->ReadColumn();
for (int i = 0; i < SMALL_SIZE; i++) {
ASSERT_EQ((i % 2 == 0) ? true : false, this->values_out_[i]) << i;
}
TestWithEncoding(ParquetVersion::PARQUET_1_0, Encoding::PLAIN);
}

// Default encoding for boolean is RLE when using V2 pages
TEST_F(TestBooleanValuesWriter, RleEncodedBooleanValues) {
TestWithEncoding(ParquetVersion::PARQUET_2_4, Encoding::RLE);
}

// PARQUET-979
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/parquet/properties.h
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ static constexpr int64_t DEFAULT_WRITE_BATCH_SIZE = 1024;
static constexpr int64_t DEFAULT_MAX_ROW_GROUP_LENGTH = 1024 * 1024;
static constexpr bool DEFAULT_ARE_STATISTICS_ENABLED = true;
static constexpr int64_t DEFAULT_MAX_STATISTICS_SIZE = 4096;
static constexpr Encoding::type DEFAULT_ENCODING = Encoding::PLAIN;
static constexpr Encoding::type DEFAULT_ENCODING = Encoding::UNKNOWN;
static const char DEFAULT_CREATED_BY[] = CREATED_BY_VERSION;
static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = Compression::UNCOMPRESSED;
static constexpr bool DEFAULT_IS_PAGE_INDEX_ENABLED = false;
Expand Down
2 changes: 1 addition & 1 deletion python/pyarrow/tests/parquet/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ def test_parquet_metadata_api():
assert col_meta.is_stats_set is True
assert isinstance(col_meta.statistics, pq.Statistics)
assert col_meta.compression == 'SNAPPY'
assert set(col_meta.encodings) == {'PLAIN', 'RLE'}
assert set(col_meta.encodings) == {'RLE'}
assert col_meta.has_dictionary_page is False
assert col_meta.dictionary_page_offset is None
assert col_meta.data_page_offset > 0
Expand Down
Loading