diff --git a/be/src/format/parquet/vparquet_group_reader.cpp b/be/src/format/parquet/vparquet_group_reader.cpp
index 6863926ddba56d..7fb2c6fe67c770 100644
--- a/be/src/format/parquet/vparquet_group_reader.cpp
+++ b/be/src/format/parquet/vparquet_group_reader.cpp
@@ -277,7 +277,8 @@ bool RowGroupReader::is_dictionary_encoded(const tparquet::ColumnMetaData& colum
     if (column_metadata.__isset.encoding_stats) {
         // Condition #1 above
         for (const tparquet::PageEncodingStats& enc_stat : column_metadata.encoding_stats) {
-            if (enc_stat.page_type == tparquet::PageType::DATA_PAGE &&
+            if ((enc_stat.page_type == tparquet::PageType::DATA_PAGE ||
+                 enc_stat.page_type == tparquet::PageType::DATA_PAGE_V2) &&
                 (enc_stat.encoding != tparquet::Encoding::PLAIN_DICTIONARY &&
                  enc_stat.encoding != tparquet::Encoding::RLE_DICTIONARY) &&
                 enc_stat.count > 0) {
diff --git a/be/test/format/parquet/parquet_thrift_test.cpp b/be/test/format/parquet/parquet_thrift_test.cpp
index 2253b6c12cce5f..7171fe3b63cd16 100644
--- a/be/test/format/parquet/parquet_thrift_test.cpp
+++ b/be/test/format/parquet/parquet_thrift_test.cpp
@@ -30,6 +30,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -457,4 +458,42 @@ TEST_F(ParquetThriftReaderTest, dict_decoder) {
     read_parquet_data_and_check("./be/test/exec/test_data/parquet_scanner/dict-decoder.parquet",
                                 "./be/test/exec/test_data/parquet_scanner/dict-decoder.txt", 12);
 }
+
+// Regression test for DATA_PAGE_V2 handling in is_dictionary_encoded(): a
+// BYTE_ARRAY column chunk whose encoding_stats list both an RLE_DICTIONARY
+// page entry and a PLAIN page entry must not be reported as dictionary-encoded.
+TEST_F(ParquetThriftReaderTest, is_dictionary_encoded_rejects_plain_data_page_v2) {
+    tparquet::ColumnMetaData column_metadata;
+    column_metadata.type = tparquet::Type::BYTE_ARRAY;
+    column_metadata.__isset.encoding_stats = true;
+
+    // Two V2 data pages that use the dictionary encoding.
+    tparquet::PageEncodingStats dict_page;
+    dict_page.page_type = tparquet::PageType::DATA_PAGE_V2;
+    dict_page.encoding = tparquet::Encoding::RLE_DICTIONARY;
+    dict_page.count = 2;
+
+    // One V2 data page that is PLAIN encoded, with a positive page count.
+    tparquet::PageEncodingStats plain_page;
+    plain_page.page_type = tparquet::PageType::DATA_PAGE_V2;
+    plain_page.encoding = tparquet::Encoding::PLAIN;
+    plain_page.count = 1;
+
+    column_metadata.encoding_stats = {dict_page, plain_page};
+
+    // is_dictionary_encoded() is called on a RowGroupReader instance, so build
+    // one from null/empty/default arguments; the call under test is handed
+    // only column_metadata.
+    tparquet::RowGroup row_group;
+    row_group.num_rows = 0;
+    RowGroupReader::PositionDeleteContext position_delete_ctx(row_group.num_rows, 0);
+    RowGroupReader::LazyReadContext lazy_read_ctx;
+    std::set column_ids;
+    std::set filter_column_ids;
+    RowGroupReader row_group_reader(nullptr, {}, 0, row_group, nullptr, nullptr,
+                                    position_delete_ctx, lazy_read_ctx, nullptr, column_ids,
+                                    filter_column_ids);
+
+    EXPECT_FALSE(row_group_reader.is_dictionary_encoded(column_metadata));
+}
 } // namespace doris
diff --git a/regression-test/data/query_p0/test_parquet_dict.out b/regression-test/data/query_p0/test_parquet_dict.out
new file mode 100644
index 00000000000000..7bbcaf3bf1c0d8
--- /dev/null
+++ b/regression-test/data/query_p0/test_parquet_dict.out
@@ -0,0 +1,4 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !s3_tvf --
+68535cc98406454081424bf8247d783d
+
diff --git a/regression-test/suites/query_p0/test_parquet_dict.groovy b/regression-test/suites/query_p0/test_parquet_dict.groovy
new file mode 100644
index 00000000000000..c50a89fde3f0b4
--- /dev/null
+++ b/regression-test/suites/query_p0/test_parquet_dict.groovy
@@ -0,0 +1,31 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_parquet_dict", "p0") {
+    // Reads test_page_v2.parquet through the FILE table-valued function and
+    // filters on user_id; the expected result is recorded under the s3_tvf tag
+    // in test_parquet_dict.out. Credentials come from the regression config.
+    String ak = context.config.otherConfigs.get("ak")
+    String sk = context.config.otherConfigs.get("sk")
+    qt_s3_tvf """ SELECT * FROM FILE (
+        "uri" = "https://doris-regression-hk.oss-cn-hongkong.aliyuncs.com/regression/query_p0/test_page_v2.parquet",
+        "s3.access_key"= "${ak}",
+        "s3.secret_key" = "${sk}",
+        "format" = "parquet"
+    ) where user_id='68535cc98406454081424bf8247d783d' ;
+    """
+}