Skip to content

Commit

Permalink
PARQUET-2119: [C++] Fix DeltaBitPackDecoder fuzzer found issue
Browse files Browse the repository at this point in the history
DeltaBitPackDecoder was using num_values_(which includes) null to compute batch size instead of total_value_count_. This lead to a failed check when comparing those counts. Changed to just use total_value_count_ and get rid of the check. Alternatively, we could throw an exception instead of the check; I think we only enter this state if the file is malformed (assuming no bugs elsewhere).

Also modified decode arrow to check the return value of DecodeInternal; this might be pointless as its callers only compare the returned value count as a DCHECK. It might be a good idea to standardize at what stage of decoding values should be compared to expected values and the behavior when they are not equal (preferably an exception and not a check).

Closes #12365 from tachyonwill/delta_check

Lead-authored-by: William Butler <tachyonwill@gmail.com>
Co-authored-by: William Butler <wab@google.com>
Signed-off-by: Antoine Pitrou <antoine@python.org>
  • Loading branch information
2 people authored and pitrou committed Feb 8, 2022
1 parent 43efadb commit 09c8554
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 12 deletions.
26 changes: 15 additions & 11 deletions cpp/src/parquet/encoding.cc
Expand Up @@ -2109,9 +2109,9 @@ class DeltaBitPackDecoder : public DecoderImpl, virtual public TypedDecoder<DTyp
ParquetException::NYI("Delta bit pack DecodeArrow with null slots");
}
std::vector<T> values(num_values);
GetInternal(values.data(), num_values);
PARQUET_THROW_NOT_OK(out->AppendValues(values));
return num_values;
int decoded_count = GetInternal(values.data(), num_values);
PARQUET_THROW_NOT_OK(out->AppendValues(values.data(), decoded_count));
return decoded_count;
}

int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
Expand All @@ -2121,12 +2121,12 @@ class DeltaBitPackDecoder : public DecoderImpl, virtual public TypedDecoder<DTyp
ParquetException::NYI("Delta bit pack DecodeArrow with null slots");
}
std::vector<T> values(num_values);
GetInternal(values.data(), num_values);
PARQUET_THROW_NOT_OK(out->Reserve(num_values));
for (T value : values) {
PARQUET_THROW_NOT_OK(out->Append(value));
int decoded_count = GetInternal(values.data(), num_values);
PARQUET_THROW_NOT_OK(out->Reserve(decoded_count));
for (int i = 0; i < decoded_count; ++i) {
PARQUET_THROW_NOT_OK(out->Append(values[i]));
}
return num_values;
return decoded_count;
}

private:
Expand Down Expand Up @@ -2181,12 +2181,11 @@ class DeltaBitPackDecoder : public DecoderImpl, virtual public TypedDecoder<DTyp
}

int GetInternal(T* buffer, int max_values) {
max_values = std::min(max_values, this->num_values_);
max_values = static_cast<int>(std::min<int64_t>(max_values, total_value_count_));
if (max_values == 0) {
return 0;
}

DCHECK_LE(static_cast<uint32_t>(max_values), total_value_count_);
int i = 0;
while (i < max_values) {
if (ARROW_PREDICT_FALSE(values_current_mini_block_ == 0)) {
Expand Down Expand Up @@ -2424,7 +2423,12 @@ class DeltaByteArrayDecoder : public DecoderImpl,
return max_values;
}

suffix_decoder_.Decode(buffer, max_values);
int suffix_read = suffix_decoder_.Decode(buffer, max_values);
if (ARROW_PREDICT_FALSE(suffix_read != max_values)) {
ParquetException::EofException("Read " + std::to_string(suffix_read) +
", expecting " + std::to_string(max_values) +
" from suffix decoder");
}

int64_t data_size = 0;
const int32_t* prefix_len_ptr =
Expand Down
2 changes: 1 addition & 1 deletion testing

0 comments on commit 09c8554

Please sign in to comment.