Skip to content

Commit

Permalink
PARQUET-1810: [C++] Fix undefined behaviour on invalid enum values (O…
Browse files Browse the repository at this point in the history
…SS-Fuzz)

Should fix the following issues:
* https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=20686
* https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=20696
* https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=20733
* https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=20849
* https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=21010

Closes #6537 from pitrou/PARQUET-1810-invalid-enum-values and squashes the following commits:

83e43fe <Antoine Pitrou> PARQUET-1810:  Fix undefined behaviour on invalid enum values (OSS-Fuzz)

Authored-by: Antoine Pitrou <antoine@python.org>
Signed-off-by: Wes McKinney <wesm+git@apache.org>
  • Loading branch information
pitrou authored and wesm committed Mar 5, 2020
1 parent 90385fe commit 367cd2e
Show file tree
Hide file tree
Showing 6 changed files with 183 additions and 110 deletions.
18 changes: 10 additions & 8 deletions cpp/src/parquet/column_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -294,17 +294,19 @@ std::shared_ptr<Page> SerializedPageReader::NextPage() {
page_buffer = decompression_buffer_;
}

if (current_page_header_.type == format::PageType::DICTIONARY_PAGE) {
const PageType::type page_type = LoadEnumSafe(&current_page_header_.type);

if (page_type == PageType::DICTIONARY_PAGE) {
crypto_ctx_.start_decrypt_with_dictionary_page = false;
const format::DictionaryPageHeader& dict_header =
current_page_header_.dictionary_page_header;

bool is_sorted = dict_header.__isset.is_sorted ? dict_header.is_sorted : false;

return std::make_shared<DictionaryPage>(page_buffer, dict_header.num_values,
FromThrift(dict_header.encoding),
LoadEnumSafe(&dict_header.encoding),
is_sorted);
} else if (current_page_header_.type == format::PageType::DATA_PAGE) {
} else if (page_type == PageType::DATA_PAGE) {
++page_ordinal_;
const format::DataPageHeader& header = current_page_header_.data_page_header;

Expand All @@ -328,10 +330,10 @@ std::shared_ptr<Page> SerializedPageReader::NextPage() {
seen_num_rows_ += header.num_values;

return std::make_shared<DataPageV1>(
page_buffer, header.num_values, FromThrift(header.encoding),
FromThrift(header.definition_level_encoding),
FromThrift(header.repetition_level_encoding), page_statistics);
} else if (current_page_header_.type == format::PageType::DATA_PAGE_V2) {
page_buffer, header.num_values, LoadEnumSafe(&header.encoding),
LoadEnumSafe(&header.definition_level_encoding),
LoadEnumSafe(&header.repetition_level_encoding), page_statistics);
} else if (page_type == PageType::DATA_PAGE_V2) {
++page_ordinal_;
const format::DataPageHeaderV2& header = current_page_header_.data_page_header_v2;
bool is_compressed = header.__isset.is_compressed ? header.is_compressed : false;
Expand All @@ -340,7 +342,7 @@ std::shared_ptr<Page> SerializedPageReader::NextPage() {

return std::make_shared<DataPageV2>(
page_buffer, header.num_values, header.num_nulls, header.num_rows,
FromThrift(header.encoding), header.definition_levels_byte_length,
LoadEnumSafe(&header.encoding), header.definition_levels_byte_length,
header.repetition_levels_byte_length, is_compressed);
} else {
// We don't know what this page type is. We're allowed to skip non-data
Expand Down
12 changes: 6 additions & 6 deletions cpp/src/parquet/metadata.cc
Original file line number Diff line number Diff line change
Expand Up @@ -211,12 +211,12 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
}
}
}
for (auto encoding : column_metadata_->encodings) {
encodings_.push_back(FromThrift(encoding));
for (const auto& encoding : column_metadata_->encodings) {
encodings_.push_back(LoadEnumSafe(&encoding));
}
for (auto encoding_stats : column_metadata_->encoding_stats) {
encoding_stats_.push_back({FromThrift(encoding_stats.page_type),
FromThrift(encoding_stats.encoding),
encoding_stats_.push_back({LoadEnumSafe(&encoding_stats.page_type),
LoadEnumSafe(&encoding_stats.encoding),
encoding_stats.count});
}
possible_stats_ = nullptr;
Expand All @@ -225,7 +225,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
inline int64_t file_offset() const { return column_->file_offset; }
inline const std::string& file_path() const { return column_->file_path; }

inline Type::type type() const { return FromThrift(column_metadata_->type); }
inline Type::type type() const { return LoadEnumSafe(&column_metadata_->type); }

inline int64_t num_values() const { return column_metadata_->num_values; }

Expand Down Expand Up @@ -257,7 +257,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
}

inline Compression::type compression() const {
return FromThrift(column_metadata_->codec);
return LoadEnumSafe(&column_metadata_->codec);
}

const std::vector<Encoding::type>& encodings() const { return encodings_; }
Expand Down
78 changes: 12 additions & 66 deletions cpp/src/parquet/schema.cc
Original file line number Diff line number Diff line change
Expand Up @@ -400,71 +400,19 @@ std::unique_ptr<Node> GroupNode::FromParquet(const void* opaque_element,
if (element->__isset.logicalType) {
// updated writer with logical type present
group_node = std::unique_ptr<GroupNode>(
new GroupNode(element->name, FromThrift(element->repetition_type), fields,
new GroupNode(element->name, LoadEnumSafe(&element->repetition_type), fields,
LogicalType::FromThrift(element->logicalType), field_id));
} else {
group_node = std::unique_ptr<GroupNode>(new GroupNode(
element->name, FromThrift(element->repetition_type), fields,
(element->__isset.converted_type ? FromThrift(element->converted_type)
element->name, LoadEnumSafe(&element->repetition_type), fields,
(element->__isset.converted_type ? LoadEnumSafe(&element->converted_type)
: ConvertedType::NONE),
field_id));
}

return std::unique_ptr<Node>(group_node.release());
}

namespace {

// If the parquet file is corrupted it is possible the type value decoded
// will not be in the range of format::Type::type, which is undefined behavior.
// This method prevents this by loading the value as the underlying type and checking
// to make sure it is in range.
template <typename ApiType>
struct SafeLoader {
using ApiTypeEnum = typename ApiType::type;
using ApiTypeRawEnum = typename std::underlying_type<ApiTypeEnum>::type;

template <typename ThriftType>
inline static ApiTypeRawEnum LoadRaw(ThriftType* in) {
static_assert(
sizeof(ApiTypeEnum) >= sizeof(ThriftType),
"parquet type should always be the same size of larger then thrift type");
typename std::underlying_type<ThriftType>::type raw_value;
memcpy(&raw_value, in, sizeof(ThriftType));
return static_cast<ApiTypeRawEnum>(raw_value);
}

template <typename ThriftType, bool IsUnsigned = true>
inline static ApiTypeEnum LoadChecked(
typename std::enable_if<IsUnsigned, ThriftType>::type* in) {
auto raw_value = LoadRaw(in);
if (ARROW_PREDICT_FALSE(raw_value >=
static_cast<ApiTypeRawEnum>(ApiType::UNDEFINED))) {
return ApiType::UNDEFINED;
}
return FromThrift(static_cast<ThriftType>(raw_value));
}

template <typename ThriftType, bool IsUnsigned = false>
inline static ApiTypeEnum LoadChecked(
typename std::enable_if<!IsUnsigned, ThriftType>::type* in) {
auto raw_value = LoadRaw(in);
if (ARROW_PREDICT_FALSE(raw_value >=
static_cast<ApiTypeRawEnum>(ApiType::UNDEFINED) ||
raw_value < 0)) {
return ApiType::UNDEFINED;
}
return FromThrift(static_cast<ThriftType>(raw_value));
}

template <typename ThriftType>
inline static ApiTypeEnum Load(ThriftType* in) {
return LoadChecked<ThriftType, std::is_unsigned<ApiTypeRawEnum>::value>(in);
}
};

} // namespace

std::unique_ptr<Node> PrimitiveNode::FromParquet(const void* opaque_element,
int field_id) {
const format::SchemaElement* element =
Expand All @@ -477,23 +425,21 @@ std::unique_ptr<Node> PrimitiveNode::FromParquet(const void* opaque_element,
std::unique_ptr<PrimitiveNode> primitive_node;
if (element->__isset.logicalType) {
// updated writer with logical type present
primitive_node = std::unique_ptr<PrimitiveNode>(new PrimitiveNode(
element->name, SafeLoader<Repetition>::Load(&(element->repetition_type)),
LogicalType::FromThrift(element->logicalType),
SafeLoader<Type>::Load(&(element->type)), element->type_length, field_id));
primitive_node = std::unique_ptr<PrimitiveNode>(
new PrimitiveNode(element->name, LoadEnumSafe(&element->repetition_type),
LogicalType::FromThrift(element->logicalType),
LoadEnumSafe(&element->type), element->type_length, field_id));
} else if (element->__isset.converted_type) {
// legacy writer with logical type present
primitive_node = std::unique_ptr<PrimitiveNode>(new PrimitiveNode(
element->name, SafeLoader<Repetition>::Load(&(element->repetition_type)),
SafeLoader<Type>::Load(&(element->type)),
SafeLoader<ConvertedType>::Load(&(element->converted_type)), element->type_length,
element->precision, element->scale, field_id));
element->name, LoadEnumSafe(&element->repetition_type),
LoadEnumSafe(&element->type), LoadEnumSafe(&element->converted_type),
element->type_length, element->precision, element->scale, field_id));
} else {
// logical type not present
primitive_node = std::unique_ptr<PrimitiveNode>(new PrimitiveNode(
element->name, SafeLoader<Repetition>::Load(&(element->repetition_type)),
NoLogicalType::Make(), SafeLoader<Type>::Load(&(element->type)),
element->type_length, field_id));
element->name, LoadEnumSafe(&element->repetition_type), NoLogicalType::Make(),
LoadEnumSafe(&element->type), element->type_length, field_id));
}

// Return as unique_ptr to the base type
Expand Down
172 changes: 144 additions & 28 deletions cpp/src/parquet/thrift_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,29 +63,164 @@ using ::std::shared_ptr;
#endif

// ----------------------------------------------------------------------
// Convert Thrift enums to / from parquet enums
// Convert Thrift enums to Parquet enums

static inline Type::type FromThrift(format::Type::type type) {
// Unsafe enum converters (input is not checked for validity)

static inline Type::type FromThriftUnsafe(format::Type::type type) {
return static_cast<Type::type>(type);
}

static inline ConvertedType::type FromThrift(format::ConvertedType::type type) {
static inline ConvertedType::type FromThriftUnsafe(format::ConvertedType::type type) {
// item 0 is NONE
return static_cast<ConvertedType::type>(static_cast<int>(type) + 1);
}

static inline Repetition::type FromThrift(format::FieldRepetitionType::type type) {
static inline Repetition::type FromThriftUnsafe(format::FieldRepetitionType::type type) {
return static_cast<Repetition::type>(type);
}

static inline Encoding::type FromThrift(format::Encoding::type type) {
static inline Encoding::type FromThriftUnsafe(format::Encoding::type type) {
return static_cast<Encoding::type>(type);
}

static inline PageType::type FromThrift(format::PageType::type type) {
static inline PageType::type FromThriftUnsafe(format::PageType::type type) {
return static_cast<PageType::type>(type);
}

static inline Compression::type FromThriftUnsafe(format::CompressionCodec::type type) {
switch (type) {
case format::CompressionCodec::UNCOMPRESSED:
return Compression::UNCOMPRESSED;
case format::CompressionCodec::SNAPPY:
return Compression::SNAPPY;
case format::CompressionCodec::GZIP:
return Compression::GZIP;
case format::CompressionCodec::LZO:
return Compression::LZO;
case format::CompressionCodec::BROTLI:
return Compression::BROTLI;
case format::CompressionCodec::LZ4:
return Compression::LZ4;
case format::CompressionCodec::ZSTD:
return Compression::ZSTD;
default:
DCHECK(false) << "Cannot reach here";
return Compression::UNCOMPRESSED;
}
}

namespace internal {

template <typename T>
struct ThriftEnumTypeTraits {};

template <>
struct ThriftEnumTypeTraits<::parquet::format::Type::type> {
using ParquetEnum = Type;
};

template <>
struct ThriftEnumTypeTraits<::parquet::format::ConvertedType::type> {
using ParquetEnum = ConvertedType;
};

template <>
struct ThriftEnumTypeTraits<::parquet::format::FieldRepetitionType::type> {
using ParquetEnum = Repetition;
};

template <>
struct ThriftEnumTypeTraits<::parquet::format::Encoding::type> {
using ParquetEnum = Encoding;
};

template <>
struct ThriftEnumTypeTraits<::parquet::format::PageType::type> {
using ParquetEnum = PageType;
};

// If the parquet file is corrupted it is possible the enum value decoded
// will not be in the range of defined values, which is undefined behaviour.
// This facility prevents this by loading the value as the underlying type
// and checking to make sure it is in range.

template <typename EnumType,
typename EnumTypeRaw = typename std::underlying_type<EnumType>::type>
inline static EnumTypeRaw LoadEnumRaw(const EnumType* in) {
EnumTypeRaw raw_value;
// Use memcpy(), as a regular cast would be undefined behaviour on invalid values
memcpy(&raw_value, in, sizeof(EnumType));
return raw_value;
}

template <typename ApiType>
struct SafeLoader {
using ApiTypeEnum = typename ApiType::type;
using ApiTypeRawEnum = typename std::underlying_type<ApiTypeEnum>::type;

template <typename ThriftType>
inline static ApiTypeRawEnum LoadRaw(const ThriftType* in) {
static_assert(sizeof(ApiTypeEnum) == sizeof(ThriftType),
"parquet type should always be the same size as thrift type");
return static_cast<ApiTypeRawEnum>(LoadEnumRaw(in));
}

template <typename ThriftType, bool IsUnsigned = true>
inline static ApiTypeEnum LoadChecked(
const typename std::enable_if<IsUnsigned, ThriftType>::type* in) {
auto raw_value = LoadRaw(in);
if (ARROW_PREDICT_FALSE(raw_value >=
static_cast<ApiTypeRawEnum>(ApiType::UNDEFINED))) {
return ApiType::UNDEFINED;
}
return FromThriftUnsafe(static_cast<ThriftType>(raw_value));
}

template <typename ThriftType, bool IsUnsigned = false>
inline static ApiTypeEnum LoadChecked(
const typename std::enable_if<!IsUnsigned, ThriftType>::type* in) {
auto raw_value = LoadRaw(in);
if (ARROW_PREDICT_FALSE(raw_value >=
static_cast<ApiTypeRawEnum>(ApiType::UNDEFINED) ||
raw_value < 0)) {
return ApiType::UNDEFINED;
}
return FromThriftUnsafe(static_cast<ThriftType>(raw_value));
}

template <typename ThriftType>
inline static ApiTypeEnum Load(const ThriftType* in) {
return LoadChecked<ThriftType, std::is_unsigned<ApiTypeRawEnum>::value>(in);
}
};

} // namespace internal

// Safe enum loader: will check for invalid enum value before converting

template <typename ThriftType,
typename ParquetEnum =
typename internal::ThriftEnumTypeTraits<ThriftType>::ParquetEnum>
inline typename ParquetEnum::type LoadEnumSafe(const ThriftType* in) {
return internal::SafeLoader<ParquetEnum>::Load(in);
}

inline typename Compression::type LoadEnumSafe(const format::CompressionCodec::type* in) {
const auto raw_value = internal::LoadEnumRaw(in);
// Check bounds manually, as Compression::type doesn't have the same values
// as format::CompressionCodec.
const auto min_value =
static_cast<decltype(raw_value)>(format::CompressionCodec::UNCOMPRESSED);
const auto max_value = static_cast<decltype(raw_value)>(format::CompressionCodec::ZSTD);
if (raw_value < min_value || raw_value > max_value) {
return Compression::UNCOMPRESSED;
}
return FromThriftUnsafe(*in);
}

// Safe non-enum converters

static inline AadMetadata FromThrift(format::AesGcmV1 aesGcmV1) {
return AadMetadata{aesGcmV1.aad_prefix, aesGcmV1.aad_file_unique,
aesGcmV1.supply_aad_prefix};
Expand All @@ -111,6 +246,9 @@ static inline EncryptionAlgorithm FromThrift(format::EncryptionAlgorithm encrypt
return encryption_algorithm;
}

// ----------------------------------------------------------------------
// Convert Thrift enums from Parquet enums

static inline format::Type::type ToThrift(Type::type type) {
return static_cast<format::Type::type>(type);
}
Expand All @@ -129,28 +267,6 @@ static inline format::Encoding::type ToThrift(Encoding::type type) {
return static_cast<format::Encoding::type>(type);
}

static inline Compression::type FromThrift(format::CompressionCodec::type type) {
switch (type) {
case format::CompressionCodec::UNCOMPRESSED:
return Compression::UNCOMPRESSED;
case format::CompressionCodec::SNAPPY:
return Compression::SNAPPY;
case format::CompressionCodec::GZIP:
return Compression::GZIP;
case format::CompressionCodec::LZO:
return Compression::LZO;
case format::CompressionCodec::BROTLI:
return Compression::BROTLI;
case format::CompressionCodec::LZ4:
return Compression::LZ4;
case format::CompressionCodec::ZSTD:
return Compression::ZSTD;
default:
DCHECK(false) << "Cannot reach here";
return Compression::UNCOMPRESSED;
}
}

static inline format::CompressionCodec::type ToThrift(Compression::type type) {
switch (type) {
case Compression::UNCOMPRESSED:
Expand Down
Loading

0 comments on commit 367cd2e

Please sign in to comment.