Skip to content

Commit

Permalink
PARQUET-1829: [C++] Fix crashes on invalid input (OSS-Fuzz)
Browse files Browse the repository at this point in the history
Closes #6728 from pitrou/PARQUET-1829-oss-fuzz

Authored-by: Antoine Pitrou <antoine@python.org>
Signed-off-by: Wes McKinney <wesm+git@apache.org>
  • Loading branch information
pitrou authored and wesm committed Mar 26, 2020
1 parent f83a1ec commit a35cf92
Show file tree
Hide file tree
Showing 5 changed files with 32 additions and 10 deletions.
2 changes: 2 additions & 0 deletions cpp/src/arrow/util/bit_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -529,6 +529,8 @@ class BitmapReader {
}
}

int64_t position() const { return position_; }

private:
const uint8_t* bitmap_;
int64_t position_;
Expand Down
10 changes: 8 additions & 2 deletions cpp/src/arrow/util/rle_encoding.h
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,7 @@ inline int RleDecoder::GetBatchSpaced(int batch_size, int null_count,
arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, batch_size);

while (values_read < batch_size) {
DCHECK_LT(bit_reader.position(), batch_size);
bool is_valid = bit_reader.IsSet();
bit_reader.Next();

Expand All @@ -350,6 +351,7 @@ inline int RleDecoder::GetBatchSpaced(int batch_size, int null_count,
repeat_count_--;

while (repeat_count_ > 0 && (values_read + repeat_batch) < batch_size) {
DCHECK_LT(bit_reader.position(), batch_size);
if (bit_reader.IsSet()) {
repeat_count_--;
} else {
Expand Down Expand Up @@ -379,6 +381,7 @@ inline int RleDecoder::GetBatchSpaced(int batch_size, int null_count,

// Read the first bitset to the end
while (literals_read < literal_batch) {
DCHECK_LT(bit_reader.position(), batch_size);
if (bit_reader.IsSet()) {
*out = indices[literals_read];
literals_read++;
Expand Down Expand Up @@ -479,6 +482,7 @@ inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary,
arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, batch_size);

while (values_read < batch_size) {
DCHECK_LT(bit_reader.position(), batch_size);
bool is_valid = bit_reader.IsSet();
bit_reader.Next();

Expand All @@ -497,6 +501,7 @@ inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary,
repeat_count_--;

while (repeat_count_ > 0 && (values_read + repeat_batch) < batch_size) {
DCHECK_LT(bit_reader.position(), batch_size);
if (bit_reader.IsSet()) {
repeat_count_--;
} else {
Expand Down Expand Up @@ -531,6 +536,7 @@ inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary,

// Read the first bitset to the end
while (literals_read < literal_batch) {
DCHECK_LT(bit_reader.position(), batch_size);
if (bit_reader.IsSet()) {
int idx = indices[literals_read];
if (ARROW_PREDICT_FALSE(!IndexInRange(idx, dictionary_length))) {
Expand Down Expand Up @@ -571,12 +577,12 @@ bool RleDecoder::NextCounts() {
bool is_literal = indicator_value & 1;
uint32_t count = indicator_value >> 1;
if (is_literal) {
if (ARROW_PREDICT_FALSE(count > static_cast<uint32_t>(INT32_MAX) / 8)) {
if (ARROW_PREDICT_FALSE(count == 0 || count > static_cast<uint32_t>(INT32_MAX) / 8)) {
return false;
}
literal_count_ = count * 8;
} else {
if (ARROW_PREDICT_FALSE(count > static_cast<uint32_t>(INT32_MAX))) {
if (ARROW_PREDICT_FALSE(count == 0 || count > static_cast<uint32_t>(INT32_MAX))) {
return false;
}
repeat_count_ = count;
Expand Down
16 changes: 16 additions & 0 deletions cpp/src/parquet/column_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,9 @@ void LevelDecoder::SetDataV2(int32_t num_bytes, int16_t max_level,
int num_buffered_values, const uint8_t* data) {
// Repetition and definition levels always uses RLE encoding
// in the DataPageV2 format.
if (num_bytes < 0) {
throw ParquetException("Invalid page header (corrupt data page?)");
}
encoding_ = Encoding::RLE;
num_values_remaining_ = num_buffered_values;
bit_width_ = BitUtil::Log2(max_level + 1);
Expand Down Expand Up @@ -343,13 +346,20 @@ std::shared_ptr<Page> SerializedPageReader::NextPage() {
current_page_header_.dictionary_page_header;

bool is_sorted = dict_header.__isset.is_sorted ? dict_header.is_sorted : false;
if (dict_header.num_values < 0) {
throw ParquetException("Invalid page header (negative number of values)");
}

return std::make_shared<DictionaryPage>(page_buffer, dict_header.num_values,
LoadEnumSafe(&dict_header.encoding),
is_sorted);
} else if (page_type == PageType::DATA_PAGE) {
++page_ordinal_;
const format::DataPageHeader& header = current_page_header_.data_page_header;

if (header.num_values < 0) {
throw ParquetException("Invalid page header (negative number of values)");
}
EncodedStatistics page_statistics = ExtractStatsFromHeader(header);
seen_num_rows_ += header.num_values;

Expand All @@ -361,6 +371,10 @@ std::shared_ptr<Page> SerializedPageReader::NextPage() {
} else if (page_type == PageType::DATA_PAGE_V2) {
++page_ordinal_;
const format::DataPageHeaderV2& header = current_page_header_.data_page_header_v2;

if (header.num_values < 0) {
throw ParquetException("Invalid page header (negative number of values)");
}
bool is_compressed = header.__isset.is_compressed ? header.is_compressed : false;
EncodedStatistics page_statistics = ExtractStatsFromHeader(header);
seen_num_rows_ += header.num_values;
Expand Down Expand Up @@ -1282,8 +1296,10 @@ class TypedRecordReader : public ColumnReaderImplBase<DType>,
this->max_def_level_, this->max_rep_level_, &values_with_nulls, &null_count,
valid_bits_->mutable_data(), values_written_);
values_to_read = values_with_nulls - null_count;
DCHECK_GE(values_to_read, 0);
ReadValuesSpaced(values_with_nulls, null_count);
} else {
DCHECK_GE(values_to_read, 0);
ReadValuesDense(values_to_read);
}
if (this->max_def_level_ > 0) {
Expand Down
12 changes: 5 additions & 7 deletions cpp/src/parquet/encoding.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1636,13 +1636,11 @@ void DictDecoderImpl<ByteArrayType>::SetDict(TypedDecoder<ByteArrayType>* dictio
for (int i = 0; i < dictionary_length_; ++i) {
total_size += dict_values[i].len;
}
if (total_size > 0) {
PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size,
/*shrink_to_fit=*/false));
PARQUET_THROW_NOT_OK(
byte_array_offsets_->Resize((dictionary_length_ + 1) * sizeof(int32_t),
/*shrink_to_fit=*/false));
}
PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size,
/*shrink_to_fit=*/false));
PARQUET_THROW_NOT_OK(
byte_array_offsets_->Resize((dictionary_length_ + 1) * sizeof(int32_t),
/*shrink_to_fit=*/false));

int32_t offset = 0;
uint8_t* bytes_data = byte_array_data_->mutable_data();
Expand Down

0 comments on commit a35cf92

Please sign in to comment.