diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc
index 12b2837fbfd1e..23366b2daafd5 100644
--- a/cpp/src/parquet/column_writer.cc
+++ b/cpp/src/parquet/column_writer.cc
@@ -271,7 +271,12 @@ class SerializedPageWriter : public PageWriter {
   }
 
   int64_t WriteDictionaryPage(const DictionaryPage& page) override {
-    int64_t uncompressed_size = page.size();
+    int64_t uncompressed_size = page.buffer()->size();
+    if (uncompressed_size > std::numeric_limits<int32_t>::max()) {
+      throw ParquetException(
+          "Uncompressed dictionary page size overflows INT32_MAX. Size:",
+          uncompressed_size);
+    }
     std::shared_ptr<Buffer> compressed_data;
     if (has_compressor()) {
       auto buffer = std::static_pointer_cast<ResizableBuffer>(
@@ -288,6 +293,11 @@
     dict_page_header.__set_is_sorted(page.is_sorted());
 
     const uint8_t* output_data_buffer = compressed_data->data();
+    if (compressed_data->size() > std::numeric_limits<int32_t>::max()) {
+      throw ParquetException(
+          "Compressed dictionary page size overflows INT32_MAX. Size: ",
+          uncompressed_size);
+    }
     int32_t output_data_len = static_cast<int32_t>(compressed_data->size());
 
     if (data_encryptor_.get()) {
@@ -371,18 +381,29 @@
     const int64_t uncompressed_size = page.uncompressed_size();
     std::shared_ptr<Buffer> compressed_data = page.buffer();
     const uint8_t* output_data_buffer = compressed_data->data();
-    int32_t output_data_len = static_cast<int32_t>(compressed_data->size());
+    int64_t output_data_len = compressed_data->size();
+
+    if (output_data_len > std::numeric_limits<int32_t>::max()) {
+      throw ParquetException("Compressed data page size overflows INT32_MAX. Size:",
+                             output_data_len);
+    }
 
     if (data_encryptor_.get()) {
       PARQUET_THROW_NOT_OK(encryption_buffer_->Resize(
           data_encryptor_->CiphertextSizeDelta() + output_data_len, false));
       UpdateEncryption(encryption::kDataPage);
-      output_data_len = data_encryptor_->Encrypt(compressed_data->data(), output_data_len,
+      output_data_len = data_encryptor_->Encrypt(compressed_data->data(),
+                                                 static_cast<int32_t>(output_data_len),
                                                  encryption_buffer_->mutable_data());
       output_data_buffer = encryption_buffer_->data();
     }
 
     format::PageHeader page_header;
+
+    if (uncompressed_size > std::numeric_limits<int32_t>::max()) {
+      throw ParquetException("Uncompressed data page size overflows INT32_MAX. Size:",
+                             uncompressed_size);
+    }
     page_header.__set_uncompressed_page_size(static_cast<int32_t>(uncompressed_size));
     page_header.__set_compressed_page_size(static_cast<int32_t>(output_data_len));
 
@@ -421,7 +442,7 @@
     if (offset_index_builder_ != nullptr) {
       const int64_t compressed_size = output_data_len + header_size;
       if (compressed_size > std::numeric_limits<int32_t>::max()) {
-        throw ParquetException("Compressed page size overflows to INT32_MAX.");
+        throw ParquetException("Compressed page size overflows INT32_MAX.");
       }
       if (!page.first_row_index().has_value()) {
         throw ParquetException("First row index is not set in data page.");
diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc
index 59fc848d7fd57..97421629d2ca6 100644
--- a/cpp/src/parquet/column_writer_test.cc
+++ b/cpp/src/parquet/column_writer_test.cc
@@ -15,9 +15,11 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include <gmock/gmock.h>
 #include <gtest/gtest.h>
 
+#include <limits>
 #include <memory>
 #include <utility>
 
 #include "arrow/io/buffered.h"
@@ -25,6 +27,7 @@
 #include "arrow/util/bit_util.h"
 #include "arrow/util/bitmap_builders.h"
 
+#include "parquet/column_page.h"
 #include "parquet/column_reader.h"
 #include "parquet/column_writer.h"
 #include "parquet/file_reader.h"
@@ -479,6 +482,9 @@ using TestValuesWriterInt64Type = TestPrimitiveWriter<Int64Type>;
 using TestByteArrayValuesWriter = TestPrimitiveWriter<ByteArrayType>;
 using TestFixedLengthByteArrayValuesWriter = TestPrimitiveWriter<FLBAType>;
 
+using ::testing::HasSubstr;
+using ::testing::ThrowsMessage;
+
 TYPED_TEST(TestPrimitiveWriter, RequiredPlain) {
   this->TestRequiredWithEncoding(Encoding::PLAIN);
 }
@@ -889,6 +895,45 @@ TEST_F(TestByteArrayValuesWriter, CheckDefaultStats) {
   ASSERT_TRUE(this->metadata_is_stats_set());
 }
 
+TEST(TestPageWriter, ThrowsOnPagesTooLarge) {
+  NodePtr item = schema::Int32("item");  // optional item
+  NodePtr list(GroupNode::Make("b", Repetition::REPEATED, {item}, ConvertedType::LIST));
+  NodePtr bag(GroupNode::Make("bag", Repetition::OPTIONAL, {list}));  // optional list
+  std::vector<NodePtr> fields = {bag};
+  NodePtr root = GroupNode::Make("schema", Repetition::REPEATED, fields);
+
+  SchemaDescriptor schema;
+  schema.Init(root);
+
+  auto sink = CreateOutputStream();
+  auto props = WriterProperties::Builder().build();
+
+  auto metadata = ColumnChunkMetaDataBuilder::Make(props, schema.Column(0));
+  std::unique_ptr<PageWriter> pager =
+      PageWriter::Open(sink, Compression::UNCOMPRESSED, metadata.get());
+
+  uint8_t data;
+  std::shared_ptr<Buffer> buffer =
+      std::make_shared<Buffer>(&data, std::numeric_limits<int32_t>::max() + int64_t{1});
+  DataPageV1 over_compressed_limit(buffer, /*num_values=*/100, Encoding::BIT_PACKED,
+                                   Encoding::BIT_PACKED, Encoding::BIT_PACKED,
+                                   /*uncompressed_size=*/100);
+  EXPECT_THAT([&]() { pager->WriteDataPage(over_compressed_limit); },
+              ThrowsMessage<ParquetException>(HasSubstr("overflows INT32_MAX")));
+  DictionaryPage dictionary_over_compressed_limit(buffer, /*num_values=*/100,
+                                                  Encoding::PLAIN);
+  EXPECT_THAT([&]() { pager->WriteDictionaryPage(dictionary_over_compressed_limit); },
+              ThrowsMessage<ParquetException>(HasSubstr("overflows INT32_MAX")));
+
+  buffer = std::make_shared<Buffer>(&data, 1);
+  DataPageV1 over_uncompressed_limit(
+      buffer, /*num_values=*/100, Encoding::BIT_PACKED, Encoding::BIT_PACKED,
+      Encoding::BIT_PACKED,
+      /*uncompressed_size=*/std::numeric_limits<int32_t>::max() + int64_t{1});
+  EXPECT_THAT([&]() { pager->WriteDataPage(over_compressed_limit); },
+              ThrowsMessage<ParquetException>(HasSubstr("overflows INT32_MAX")));
+}
+
 TEST(TestColumnWriter, RepeatedListsUpdateSpacedBug) {
   // In ARROW-3930 we discovered a bug when writing from Arrow when we had data
   // that looks like this:
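
For readers skimming the patch, the sketch below is a minimal standalone illustration (not Arrow code; `CheckedCastToInt32` and the error text are hypothetical) of the check-then-narrow pattern the patch applies: a 64-bit byte count is validated against INT32_MAX before it is cast into an int32_t page-header field, so oversized pages fail with an exception instead of silently truncating.

```cpp
#include <cstdint>
#include <iostream>
#include <limits>
#include <stdexcept>
#include <string>

// Hypothetical helper: Parquet's Thrift page headers store sizes as int32_t,
// so any 64-bit size must be range-checked before narrowing.
int32_t CheckedCastToInt32(int64_t size, const char* what) {
  if (size > std::numeric_limits<int32_t>::max()) {
    throw std::runtime_error(std::string(what) + " overflows INT32_MAX");
  }
  return static_cast<int32_t>(size);
}

int main() {
  try {
    // A small size narrows without loss.
    int32_t ok = CheckedCastToInt32(int64_t{100}, "compressed page size");
    std::cout << "ok: " << ok << "\n";
    // One byte past INT32_MAX throws instead of wrapping around.
    CheckedCastToInt32(std::numeric_limits<int32_t>::max() + int64_t{1},
                       "compressed page size");
  } catch (const std::exception& e) {
    std::cout << "threw: " << e.what() << "\n";
  }
  return 0;
}
```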