Skip to content

Commit

Permalink
PARQUET-1770: [C++][CI] Add fuzz target for reading Parquet files
Browse files Browse the repository at this point in the history
This fuzz target goes through the Parquet Arrow file reader.

Closes #6405 from pitrou/ARROW-6273-parquet-fuzz and squashes the following commits:

d71778a <Antoine Pitrou> Restore dynamic_cast
5a0fae3 <Antoine Pitrou> Improve data generation with more varied columns
2c08e48 <Antoine Pitrou> PARQUET-1770:  Add fuzz target for reading Parquet files

Authored-by: Antoine Pitrou <antoine@python.org>
Signed-off-by: Antoine Pitrou <antoine@python.org>
  • Loading branch information
pitrou committed Feb 12, 2020
1 parent 220c437 commit b9fbc21
Show file tree
Hide file tree
Showing 17 changed files with 402 additions and 100 deletions.
8 changes: 8 additions & 0 deletions cpp/build-support/fuzzing/generate_corpuses.sh
Expand Up @@ -30,10 +30,18 @@ CORPUS_DIR=/tmp/corpus
# Root of the C++ source tree, resolved relative to this script's location.
ARROW=$(cd "$(dirname "${BASH_SOURCE[0]}")/../.."; pwd)
# Directory holding the built corpus generators; the zip files are written here too.
OUT=$1

# NOTE: name of seed corpus output file should be "<FUZZ TARGET>_seed_corpus.zip"
# where "<FUZZ TARGET>" is the exact name of the fuzz target executable the
# seed corpus is generated for.

# Seed corpus for the arrow-ipc-stream-fuzz target
rm -rf "${CORPUS_DIR}"
"${OUT}/arrow-ipc-generate-fuzz-corpus" -stream "${CORPUS_DIR}"
"${ARROW}/build-support/fuzzing/pack_corpus.py" "${CORPUS_DIR}" "${OUT}/arrow-ipc-stream-fuzz_seed_corpus.zip"

# Seed corpus for the arrow-ipc-file-fuzz target
rm -rf "${CORPUS_DIR}"
"${OUT}/arrow-ipc-generate-fuzz-corpus" -file "${CORPUS_DIR}"
"${ARROW}/build-support/fuzzing/pack_corpus.py" "${CORPUS_DIR}" "${OUT}/arrow-ipc-file-fuzz_seed_corpus.zip"

# Seed corpus for the parquet-arrow-fuzz target
rm -rf "${CORPUS_DIR}"
"${OUT}/parquet-arrow-generate-fuzz-corpus" "${CORPUS_DIR}"
"${ARROW}/build-support/fuzzing/pack_corpus.py" "${CORPUS_DIR}" "${OUT}/parquet-arrow-fuzz_seed_corpus.zip"
16 changes: 5 additions & 11 deletions cpp/cmake_modules/BuildUtils.cmake
Expand Up @@ -687,17 +687,17 @@ endfunction()
#
# Fuzzing
#
# Add new fuzzing test executable.
# Add new fuzz target executable.
#
# The single source file must define a function:
# extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size)
#
# The source file must not define a main function!
#
function(ADD_ARROW_FUZZING REL_FUZZING_NAME)
function(ADD_FUZZ_TARGET REL_FUZZING_NAME)
set(options)
set(one_value_args)
set(multi_value_args PREFIX)
set(one_value_args PREFIX)
set(multi_value_args LINK_LIBS)
cmake_parse_arguments(ARG
"${options}"
"${one_value_args}"
Expand All @@ -720,12 +720,6 @@ function(ADD_ARROW_FUZZING REL_FUZZING_NAME)
set(FUZZING_NAME "${ARG_PREFIX}-${FUZZING_NAME}")
endif()

if(ARROW_BUILD_STATIC)
set(FUZZ_LINK_LIBS arrow_static)
else()
set(FUZZ_LINK_LIBS arrow_shared)
endif()

# For OSS-Fuzz
# (https://google.github.io/oss-fuzz/advanced-topics/ideal-integration/)
if(DEFINED ENV{LIB_FUZZING_ENGINE})
Expand All @@ -735,7 +729,7 @@ function(ADD_ARROW_FUZZING REL_FUZZING_NAME)
endif()

add_executable(${FUZZING_NAME} "${REL_FUZZING_NAME}.cc")
target_link_libraries(${FUZZING_NAME} ${FUZZ_LINK_LIBS})
target_link_libraries(${FUZZING_NAME} ${LINK_LIBS})
target_compile_options(${FUZZING_NAME} PRIVATE ${FUZZ_LDFLAGS})
set_target_properties(${FUZZING_NAME}
PROPERTIES LINK_FLAGS ${FUZZ_LDFLAGS} LABELS "fuzzing")
Expand Down
29 changes: 29 additions & 0 deletions cpp/src/arrow/CMakeLists.txt
Expand Up @@ -57,6 +57,35 @@ function(ADD_ARROW_TEST REL_TEST_NAME)
${ARG_UNPARSED_ARGUMENTS})
endfunction()

# Add a fuzz target that links against the Arrow library.
#
# Wrapper around add_fuzz_target(): takes an optional PREFIX (defaulting to
# "arrow"), picks arrow_static when ARROW_BUILD_STATIC is set and arrow_shared
# otherwise, and passes any remaining arguments through unchanged.
function(ADD_ARROW_FUZZ_TARGET REL_FUZZING_NAME)
  cmake_parse_arguments(ARG "" "PREFIX" "" ${ARGN})

  # Start from the default prefix; an explicit PREFIX argument overrides it.
  set(PREFIX "arrow")
  if(ARG_PREFIX)
    set(PREFIX ${ARG_PREFIX})
  endif()

  # Choose the Arrow library flavour matching the build configuration.
  if(NOT ARROW_BUILD_STATIC)
    set(LINK_LIBS arrow_shared)
  else()
    set(LINK_LIBS arrow_static)
  endif()

  add_fuzz_target(${REL_FUZZING_NAME}
                  PREFIX ${PREFIX}
                  LINK_LIBS ${LINK_LIBS}
                  ${ARG_UNPARSED_ARGUMENTS})
endfunction()

function(ADD_ARROW_BENCHMARK REL_TEST_NAME)
set(options)
set(one_value_args PREFIX)
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/arrow/ipc/CMakeLists.txt
Expand Up @@ -67,5 +67,5 @@ if(ARROW_FUZZING)
${ARROW_TEST_LINK_LIBS})
endif()

add_arrow_fuzzing(file_fuzz PREFIX "arrow-ipc")
add_arrow_fuzzing(stream_fuzz PREFIX "arrow-ipc")
add_arrow_fuzz_target(file_fuzz PREFIX "arrow-ipc")
add_arrow_fuzz_target(stream_fuzz PREFIX "arrow-ipc")
69 changes: 48 additions & 21 deletions cpp/src/arrow/testing/random.cc
Expand Up @@ -31,6 +31,7 @@
#include "arrow/type_fwd.h"
#include "arrow/type_traits.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/logging.h"

namespace arrow {
namespace random {
Expand All @@ -41,11 +42,13 @@ struct GenerateOptions {
: min_(min), max_(max), seed_(seed), probability_(probability) {}

void GenerateData(uint8_t* buffer, size_t n) {
GenerateTypedData(reinterpret_cast<ValueType*>(buffer), n);
}

void GenerateTypedData(ValueType* data, size_t n) {
std::default_random_engine rng(seed_++);
DistributionType dist(min_, max_);

ValueType* data = reinterpret_cast<ValueType*>(buffer);

// A static cast is required due to the int16 -> int8 handling.
std::generate(data, data + n,
[&dist, &rng] { return static_cast<ValueType>(dist(rng)); });
Expand Down Expand Up @@ -146,10 +149,9 @@ PRIMITIVE_RAND_FLOAT_IMPL(Float64, double, DoubleType)
#undef PRIMITIVE_RAND_IMPL

template <typename TypeClass>
static std::shared_ptr<arrow::Array> GenerateBinaryArray(RandomArrayGenerator* gen,
int64_t size, int32_t min_length,
int32_t max_length,
double null_probability) {
static std::shared_ptr<Array> GenerateBinaryArray(RandomArrayGenerator* gen, int64_t size,
int32_t min_length, int32_t max_length,
double null_probability) {
using offset_type = typename TypeClass::offset_type;
using BuilderType = typename TypeTraits<TypeClass>::BuilderType;
using OffsetArrowType = typename CTypeTraits<offset_type>::ArrowType;
Expand Down Expand Up @@ -179,40 +181,42 @@ static std::shared_ptr<arrow::Array> GenerateBinaryArray(RandomArrayGenerator* g
}
}

std::shared_ptr<arrow::Array> result;
std::shared_ptr<Array> result;
ABORT_NOT_OK(builder.Finish(&result));
return result;
}

std::shared_ptr<arrow::Array> RandomArrayGenerator::String(int64_t size,
int32_t min_length,
int32_t max_length,
double null_probability) {
std::shared_ptr<Array> RandomArrayGenerator::String(int64_t size, int32_t min_length,
int32_t max_length,
double null_probability) {
return GenerateBinaryArray<StringType>(this, size, min_length, max_length,
null_probability);
}

std::shared_ptr<arrow::Array> RandomArrayGenerator::LargeString(int64_t size,
int32_t min_length,
int32_t max_length,
double null_probability) {
std::shared_ptr<Array> RandomArrayGenerator::LargeString(int64_t size, int32_t min_length,
int32_t max_length,
double null_probability) {
return GenerateBinaryArray<LargeStringType>(this, size, min_length, max_length,
null_probability);
}

std::shared_ptr<arrow::Array> RandomArrayGenerator::BinaryWithRepeats(
int64_t size, int64_t unique, int32_t min_length, int32_t max_length,
double null_probability) {
std::shared_ptr<Array> RandomArrayGenerator::BinaryWithRepeats(int64_t size,
int64_t unique,
int32_t min_length,
int32_t max_length,
double null_probability) {
auto strings =
StringWithRepeats(size, unique, min_length, max_length, null_probability);
std::shared_ptr<Array> out;
ABORT_NOT_OK(strings->View(binary(), &out));
return out;
}

std::shared_ptr<arrow::Array> RandomArrayGenerator::StringWithRepeats(
int64_t size, int64_t unique, int32_t min_length, int32_t max_length,
double null_probability) {
std::shared_ptr<Array> RandomArrayGenerator::StringWithRepeats(int64_t size,
int64_t unique,
int32_t min_length,
int32_t max_length,
double null_probability) {
// Generate a random string dictionary without any nulls
auto array = String(unique, min_length, max_length, /*null_probability=*/0);
auto dictionary = std::dynamic_pointer_cast<StringArray>(array);
Expand All @@ -236,5 +240,28 @@ std::shared_ptr<arrow::Array> RandomArrayGenerator::StringWithRepeats(
ABORT_NOT_OK(builder.Finish(&result));
return result;
}

/// \brief Generate a sorted Int32Array of `size` offsets with no nulls.
///
/// Random int32 values are generated with `first_offset` and `last_offset`
/// as the distribution bounds, sorted in non-decreasing order, and then the
/// first and last elements are overwritten with `first_offset` and
/// `last_offset` respectively. When `size` is 1 the single element ends up
/// equal to `last_offset`; when `size` is 0 an empty array is returned.
///
/// \param[in] size number of offsets to generate
/// \param[in] first_offset value stored in the first element
/// \param[in] last_offset value stored in the last element
/// \return the generated offsets array
std::shared_ptr<Array> RandomArrayGenerator::Offsets(int64_t size, int32_t first_offset,
                                                     int32_t last_offset) {
  using GenOpt = GenerateOptions<int32_t, std::uniform_int_distribution<int32_t>>;
  GenOpt options(seed(), first_offset, last_offset, /*null_probability=*/0);

  // Slot 0 is left null (no validity bitmap); slot 1 holds the int32 values.
  BufferVector buffers{2};

  ABORT_NOT_OK(AllocateBuffer(sizeof(int32_t) * size, &buffers[1]));
  auto data = reinterpret_cast<int32_t*>(buffers[1]->mutable_data());
  options.GenerateTypedData(data, size);
  // Ensure offsets are in increasing order
  std::sort(data, data + size);
  // Only touch the boundary elements when they exist: with size == 0 there
  // is no data[0] and data[size - 1] would index before the buffer.
  if (size > 0) {
    // Ensure first and last offsets are as required
    DCHECK_GE(data[0], first_offset);
    DCHECK_LE(data[size - 1], last_offset);
    data[0] = first_offset;
    data[size - 1] = last_offset;
  }

  auto array_data = ArrayData::Make(int32(), size, buffers, /*null_count=*/0);
  return std::make_shared<Int32Array>(array_data);
}

} // namespace random
} // namespace arrow

0 comments on commit b9fbc21

Please sign in to comment.