From 383339f017cfc4a0e8f3055aca74cc56bbe0cfe1 Mon Sep 17 00:00:00 2001 From: Gabriel Date: Fri, 22 May 2026 11:15:08 +0800 Subject: [PATCH 1/5] [fix](parquet) Fix wrong condition --- be/src/format/parquet/byte_array_plain_decoder.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/be/src/format/parquet/byte_array_plain_decoder.cpp b/be/src/format/parquet/byte_array_plain_decoder.cpp index 1d9f2cfea604d7..d4f7028d3f1984 100644 --- a/be/src/format/parquet/byte_array_plain_decoder.cpp +++ b/be/src/format/parquet/byte_array_plain_decoder.cpp @@ -33,7 +33,7 @@ Status ByteArrayPlainDecoder::skip_values(size_t num_values) { uint32_t length = decode_fixed32_le(reinterpret_cast(_data->data) + _offset); _offset += 4; - if (UNLIKELY(_offset + length) > _data->size) { + if (UNLIKELY(cast_set(_offset + length) > _data->size)) { return Status::IOError("Can't skip enough bytes in plain decoder"); } _offset += length; @@ -68,7 +68,7 @@ Status ByteArrayPlainDecoder::_decode_values(MutableColumnPtr& doris_column, Dat uint32_t length = decode_fixed32_le(reinterpret_cast(_data->data) + _offset); _offset += 4; - if (UNLIKELY(_offset + length) > _data->size) { + if (UNLIKELY(cast_set(_offset + length) > _data->size)) { return Status::IOError("Can't read enough bytes in plain decoder"); } string_values.emplace_back(_data->data + _offset, length); @@ -89,7 +89,7 @@ Status ByteArrayPlainDecoder::_decode_values(MutableColumnPtr& doris_column, Dat uint32_t length = decode_fixed32_le(reinterpret_cast(_data->data) + _offset); _offset += 4; - if (UNLIKELY(_offset + length) > _data->size) { + if (UNLIKELY(cast_set(_offset + length) > _data->size)) { return Status::IOError("Can't read enough bytes in plain decoder"); } _offset += length; From 1f7887396b50493038df3c3a294212e688bc3172 Mon Sep 17 00:00:00 2001 From: Gabriel Date: Fri, 22 May 2026 11:31:26 +0800 Subject: [PATCH 2/5] update --- be/src/format/parquet/byte_array_plain_decoder.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/be/src/format/parquet/byte_array_plain_decoder.cpp b/be/src/format/parquet/byte_array_plain_decoder.cpp index d4f7028d3f1984..d344e214632f37 100644 --- a/be/src/format/parquet/byte_array_plain_decoder.cpp +++ b/be/src/format/parquet/byte_array_plain_decoder.cpp @@ -33,7 +33,7 @@ Status ByteArrayPlainDecoder::skip_values(size_t num_values) { uint32_t length = decode_fixed32_le(reinterpret_cast(_data->data) + _offset); _offset += 4; - if (UNLIKELY(cast_set(_offset + length) > _data->size)) { + if (UNLIKELY(cast_set(_offset) + length > _data->size)) { return Status::IOError("Can't skip enough bytes in plain decoder"); } _offset += length; @@ -68,7 +68,7 @@ Status ByteArrayPlainDecoder::_decode_values(MutableColumnPtr& doris_column, Dat uint32_t length = decode_fixed32_le(reinterpret_cast(_data->data) + _offset); _offset += 4; - if (UNLIKELY(cast_set(_offset + length) > _data->size)) { + if (UNLIKELY(cast_set(_offset) + length > _data->size)) { return Status::IOError("Can't read enough bytes in plain decoder"); } string_values.emplace_back(_data->data + _offset, length); @@ -89,7 +89,7 @@ Status ByteArrayPlainDecoder::_decode_values(MutableColumnPtr& doris_column, Dat uint32_t length = decode_fixed32_le(reinterpret_cast(_data->data) + _offset); _offset += 4; - if (UNLIKELY(cast_set(_offset + length) > _data->size)) { + if (UNLIKELY(cast_set(_offset) + length > _data->size)) { return Status::IOError("Can't read enough bytes in plain decoder"); } _offset += length; From d650ddea6ffc750f78e49fe93cd8b2c4fc733eef Mon Sep 17 00:00:00 2001 From: Gabriel Date: Fri, 22 May 2026 15:06:57 +0800 Subject: [PATCH 3/5] update --- be/src/format/parquet/byte_array_dict_decoder.cpp | 7 +++++++ .../external_table_p0/tvf/test_hdfs_parquet_group6.groovy | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/be/src/format/parquet/byte_array_dict_decoder.cpp b/be/src/format/parquet/byte_array_dict_decoder.cpp index 11655b865d1b90..cd9f01102007f8 100644 --- a/be/src/format/parquet/byte_array_dict_decoder.cpp +++ b/be/src/format/parquet/byte_array_dict_decoder.cpp @@ -41,7 +41,14 @@ Status ByteArrayDictDecoder::set_dict(DorisUniqueBufferPtr& dict, int32 size_t total_length = 0; for (int i = 0; i < num_values; ++i) { + if (UNLIKELY(offset_cursor > cast_set(length))) { + return Status::Corruption("Wrong data length in dictionary"); + } uint32_t l = decode_fixed32_le(_dict.get() + offset_cursor); + if (UNLIKELY(offset_cursor + l > cast_set(length) || + l > cast_set(length))) { + return Status::Corruption("Wrong data length in dictionary"); + } offset_cursor += 4; offset_cursor += l; total_length += l; diff --git a/regression-test/suites/external_table_p0/tvf/test_hdfs_parquet_group6.groovy b/regression-test/suites/external_table_p0/tvf/test_hdfs_parquet_group6.groovy index 3789054a3e7dc7..96ec42256fbf36 100644 --- a/regression-test/suites/external_table_p0/tvf/test_hdfs_parquet_group6.groovy +++ b/regression-test/suites/external_table_p0/tvf/test_hdfs_parquet_group6.groovy @@ -677,8 +677,8 @@ suite("test_hdfs_parquet_group6", "p0,external") { sql """ select * from HDFS( "uri" = "${uri}", "hadoop.username" = "${hdfsUserName}", - "format" = "parquet") limit 10; """ - exception "Can't read byte array length from plain decoder" + "format" = "parquet"); """ + exception "Can't read enough bytes in plain decode" } From 937ff16f008211bfca95eb119b7bb0b2e293bd7a Mon Sep 17 00:00:00 2001 From: Gabriel Date: Fri, 22 May 2026 16:40:02 +0800 Subject: [PATCH 4/5] update --- be/src/format/parquet/byte_array_dict_decoder.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/be/src/format/parquet/byte_array_dict_decoder.cpp b/be/src/format/parquet/byte_array_dict_decoder.cpp index cd9f01102007f8..76a8d5ff9bf0cc 100644 --- a/be/src/format/parquet/byte_array_dict_decoder.cpp +++ b/be/src/format/parquet/byte_array_dict_decoder.cpp @@ -41,14 +41,10 @@ Status ByteArrayDictDecoder::set_dict(DorisUniqueBufferPtr& dict, int32 size_t total_length = 0; for (int i = 0; i < num_values; ++i) { - if (UNLIKELY(offset_cursor > cast_set(length))) { + if (UNLIKELY(offset_cursor + 4 > cast_set(length))) { return Status::Corruption("Wrong data length in dictionary"); } uint32_t l = decode_fixed32_le(_dict.get() + offset_cursor); - if (UNLIKELY(offset_cursor + l > cast_set(length) || - l > cast_set(length))) { - return Status::Corruption("Wrong data length in dictionary"); - } offset_cursor += 4; offset_cursor += l; total_length += l; From 2785888f4ff92783aa1c8dd4a4d9d6f4625d334b Mon Sep 17 00:00:00 2001 From: Gabriel Date: Fri, 22 May 2026 17:09:51 +0800 Subject: [PATCH 5/5] [fix](parquet) Guard byte array decoder bounds checks ### What problem does this PR solve? Issue Number: close #xxx Related PR: #xxx Problem Summary: Parquet byte array decoders still used addition-based bounds checks for malformed length-prefixed data. The plain decoder could overflow _offset + length before rejecting a value, and the dictionary decoder could decode or copy from truncated pages before validating that enough bytes remained for the length prefix and payload. This change validates remaining bytes with subtraction before decoding or advancing offsets, rejects negative dictionary page lengths, and adds unit coverage for truncated prefixes, truncated payloads, and overflow-sized values. ### Release note None ### Check List (For Author) - Test: Manual test - Ran git diff --check - Could not run build-support/clang-format.sh because llvm@16/clang-format is not installed in this environment - Could not run BE UT because run-be-ut.sh requires JDK-17, but this environment only has JDK-11 - Behavior changed: No - Does this need documentation: No --- .../parquet/byte_array_dict_decoder.cpp | 28 ++++++++---- .../parquet/byte_array_plain_decoder.cpp | 41 +++++++++--------- .../parquet/byte_array_dict_decoder_test.cpp | 17 ++++++++ .../parquet/byte_array_plain_decoder_test.cpp | 43 +++++++++++++++++++ 4 files changed, 100 insertions(+), 29 deletions(-) diff --git a/be/src/format/parquet/byte_array_dict_decoder.cpp b/be/src/format/parquet/byte_array_dict_decoder.cpp index 76a8d5ff9bf0cc..c7ca433223ed7a 100644 --- a/be/src/format/parquet/byte_array_dict_decoder.cpp +++ b/be/src/format/parquet/byte_array_dict_decoder.cpp @@ -36,16 +36,24 @@ Status ByteArrayDictDecoder::set_dict(DorisUniqueBufferPtr& dict, int32 return Status::Corruption("Wrong dictionary data for byte array type, dict is null."); } _dict_items.reserve(num_values); - uint32_t offset_cursor = 0; + if (UNLIKELY(length < 0)) { + return Status::Corruption("Wrong data length in dictionary"); + } + const size_t dict_length = cast_set(length); + size_t offset_cursor = 0; char* dict_item_address = reinterpret_cast(_dict.get()); size_t total_length = 0; for (int i = 0; i < num_values; ++i) { - if (UNLIKELY(offset_cursor + 4 > cast_set(length))) { + if (UNLIKELY(offset_cursor > dict_length || + dict_length - offset_cursor < sizeof(uint32_t))) { return Status::Corruption("Wrong data length in dictionary"); } uint32_t l = decode_fixed32_le(_dict.get() + offset_cursor); - offset_cursor += 4; + offset_cursor += sizeof(uint32_t); + if (UNLIKELY(l > dict_length - offset_cursor)) { + return Status::Corruption("Wrong data length in dictionary"); + } offset_cursor += l; total_length += l; } @@ -56,20 +64,24 @@ Status ByteArrayDictDecoder::set_dict(DorisUniqueBufferPtr& dict, int32 size_t offset = 0; offset_cursor = 0; for (int i = 0; i < num_values; ++i) { + if (UNLIKELY(offset_cursor > dict_length || + dict_length - offset_cursor < sizeof(uint32_t))) { + return Status::Corruption("Wrong data length in dictionary"); + } uint32_t l = decode_fixed32_le(_dict.get() + offset_cursor); - offset_cursor += 4; + offset_cursor += sizeof(uint32_t); + if (UNLIKELY(l > dict_length - offset_cursor)) { + return Status::Corruption("Wrong data length in dictionary"); + } memcpy(&_dict_data[offset], dict_item_address + offset_cursor, l); _dict_items.emplace_back(&_dict_data[offset], l); offset_cursor += l; offset += l; - if (offset_cursor > length) { - return Status::Corruption("Wrong data length in dictionary"); - } if (l > _max_value_length) { _max_value_length = l; } } - if (offset_cursor != length) { + if (offset_cursor != dict_length) { return Status::Corruption("Wrong dictionary data for byte array type"); } return Status::OK(); diff --git a/be/src/format/parquet/byte_array_plain_decoder.cpp b/be/src/format/parquet/byte_array_plain_decoder.cpp index d344e214632f37..d9048c5549cc44 100644 --- a/be/src/format/parquet/byte_array_plain_decoder.cpp +++ b/be/src/format/parquet/byte_array_plain_decoder.cpp @@ -25,15 +25,22 @@ #include "core/string_ref.h" namespace doris { +namespace { +Status read_length(const Slice* data, uint32_t* offset, uint32_t* length) { + if (UNLIKELY(*offset > data->size || data->size - *offset < sizeof(uint32_t))) { + return Status::IOError("Can't read byte array length from plain decoder"); + } + *length = decode_fixed32_le(reinterpret_cast(data->data) + *offset); + *offset += sizeof(uint32_t); + return Status::OK(); +} +} // namespace + Status ByteArrayPlainDecoder::skip_values(size_t num_values) { for (int i = 0; i < num_values; ++i) { - if (UNLIKELY(_offset + 4 > _data->size)) { - return Status::IOError("Can't read byte array length from plain decoder"); - } - uint32_t length = - decode_fixed32_le(reinterpret_cast(_data->data) + _offset); - _offset += 4; - if (UNLIKELY(cast_set(_offset) + length > _data->size)) { + uint32_t length = 0; + RETURN_IF_ERROR(read_length(_data, &_offset, &length)); + if (UNLIKELY(_offset > _data->size || length > _data->size - _offset)) { return Status::IOError("Can't skip enough bytes in plain decoder"); } _offset += length; @@ -62,13 +69,9 @@ Status ByteArrayPlainDecoder::_decode_values(MutableColumnPtr& doris_column, Dat std::vector string_values; string_values.reserve(run_length); for (size_t i = 0; i < run_length; ++i) { - if (UNLIKELY(_offset + 4 > _data->size)) { - return Status::IOError("Can't read byte array length from plain decoder"); - } - uint32_t length = - decode_fixed32_le(reinterpret_cast(_data->data) + _offset); - _offset += 4; - if (UNLIKELY(cast_set(_offset) + length > _data->size)) { + uint32_t length = 0; + RETURN_IF_ERROR(read_length(_data, &_offset, &length)); + if (UNLIKELY(_offset > _data->size || length > _data->size - _offset)) { return Status::IOError("Can't read enough bytes in plain decoder"); } string_values.emplace_back(_data->data + _offset, length); @@ -83,13 +86,9 @@ Status ByteArrayPlainDecoder::_decode_values(MutableColumnPtr& doris_column, Dat } case ColumnSelectVector::FILTERED_CONTENT: { for (int i = 0; i < run_length; ++i) { - if (UNLIKELY(_offset + 4 > _data->size)) { - return Status::IOError("Can't read byte array length from plain decoder"); - } - uint32_t length = - decode_fixed32_le(reinterpret_cast(_data->data) + _offset); - _offset += 4; - if (UNLIKELY(cast_set(_offset) + length > _data->size)) { + uint32_t length = 0; + RETURN_IF_ERROR(read_length(_data, &_offset, &length)); + if (UNLIKELY(_offset > _data->size || length > _data->size - _offset)) { return Status::IOError("Can't read enough bytes in plain decoder"); } _offset += length; diff --git a/be/test/format/parquet/byte_array_dict_decoder_test.cpp b/be/test/format/parquet/byte_array_dict_decoder_test.cpp index 3d7545f00f2b6f..29213246db3f92 100644 --- a/be/test/format/parquet/byte_array_dict_decoder_test.cpp +++ b/be/test/format/parquet/byte_array_dict_decoder_test.cpp @@ -511,4 +511,21 @@ TEST_F(ByteArrayDictDecoderTest, test_skip_value) { } } +TEST_F(ByteArrayDictDecoderTest, test_set_dict_rejects_truncated_length_prefix) { + auto dict_data = make_unique_buffer(2); + dict_data[0] = 1; + dict_data[1] = 0; + + ByteArrayDictDecoder decoder; + ASSERT_FALSE(decoder.set_dict(dict_data, 2, 1).ok()); +} + +TEST_F(ByteArrayDictDecoderTest, test_set_dict_rejects_truncated_payload_after_prefix) { + auto dict_data = make_unique_buffer(4); + encode_fixed32_le(dict_data.get(), 1); + + ByteArrayDictDecoder decoder; + ASSERT_FALSE(decoder.set_dict(dict_data, 4, 1).ok()); +} + } // namespace doris diff --git a/be/test/format/parquet/byte_array_plain_decoder_test.cpp b/be/test/format/parquet/byte_array_plain_decoder_test.cpp index 5e8ff531cc9c7f..ad78492a345ccd 100644 --- a/be/test/format/parquet/byte_array_plain_decoder_test.cpp +++ b/be/test/format/parquet/byte_array_plain_decoder_test.cpp @@ -244,4 +244,47 @@ TEST_F(ByteArrayPlainDecoderTest, test_skip_value) { EXPECT_EQ(result_column->get_data_at(0).to_string(), "cherry"); } +TEST_F(ByteArrayPlainDecoderTest, test_decode_truncated_length_prefix) { + uint8_t data[] = {0x01, 0x00}; + _data_slice = Slice(data, sizeof(data)); + + ByteArrayPlainDecoder decoder; + ASSERT_TRUE(decoder.set_data(&_data_slice).ok()); + + MutableColumnPtr column = ColumnString::create(); + DataTypePtr data_type = std::make_shared(); + size_t num_values = 1; + std::vector run_length_null_map(1, num_values); + std::vector filter_data(num_values, 1); + FilterMap filter_map; + ASSERT_TRUE(filter_map.init(filter_data.data(), filter_data.size(), false).ok()); + ColumnSelectVector select_vector; + ASSERT_TRUE(select_vector.init(run_length_null_map, num_values, nullptr, &filter_map, 0).ok()); + + ASSERT_FALSE(decoder.decode_values(column, data_type, select_vector, false).ok()); +} + +TEST_F(ByteArrayPlainDecoderTest, test_decode_rejects_overflow_length) { + uint8_t data[8] = {}; + encode_fixed32_le(data, 0); + encode_fixed32_le(data + 4, UINT32_MAX); + _data_slice = Slice(data, sizeof(data)); + + ByteArrayPlainDecoder decoder; + ASSERT_TRUE(decoder.set_data(&_data_slice).ok()); + ASSERT_TRUE(decoder.skip_values(1).ok()); + + MutableColumnPtr column = ColumnString::create(); + DataTypePtr data_type = std::make_shared(); + size_t num_values = 1; + std::vector run_length_null_map(1, num_values); + std::vector filter_data(num_values, 1); + FilterMap filter_map; + ASSERT_TRUE(filter_map.init(filter_data.data(), filter_data.size(), false).ok()); + ColumnSelectVector select_vector; + ASSERT_TRUE(select_vector.init(run_length_null_map, num_values, nullptr, &filter_map, 0).ok()); + + ASSERT_FALSE(decoder.decode_values(column, data_type, select_vector, false).ok()); +} + } // namespace doris