Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ARROW-7633: [C++][CI] Create fuzz targets for tensors and sparse tensors #6302

Closed
wants to merge 14 commits into from
4 changes: 4 additions & 0 deletions cpp/build-support/fuzzing/generate_corpuses.sh
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ rm -rf ${CORPUS_DIR}
${OUT}/arrow-ipc-generate-fuzz-corpus -file ${CORPUS_DIR}
${ARROW_CPP}/build-support/fuzzing/pack_corpus.py ${CORPUS_DIR} ${OUT}/arrow-ipc-file-fuzz_seed_corpus.zip

# Seed corpus for the tensor stream fuzz target: start from an empty corpus
# directory, let the generator write its tensor samples there, then hand the
# directory to pack_corpus.py together with the destination zip path.
rm -rf ${CORPUS_DIR}
${OUT}/arrow-ipc-generate-tensor-fuzz-corpus -stream ${CORPUS_DIR}
${ARROW_CPP}/build-support/fuzzing/pack_corpus.py ${CORPUS_DIR} ${OUT}/arrow-ipc-tensor-stream-fuzz_seed_corpus.zip

rm -rf ${CORPUS_DIR}
${OUT}/parquet-arrow-generate-fuzz-corpus ${CORPUS_DIR}
cp ${ARROW_CPP}/submodules/parquet-testing/data/*.parquet ${CORPUS_DIR}
Expand Down
5 changes: 5 additions & 0 deletions cpp/src/arrow/ipc/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,12 @@ if(ARROW_FUZZING)
add_executable(arrow-ipc-generate-fuzz-corpus generate_fuzz_corpus.cc)
target_link_libraries(arrow-ipc-generate-fuzz-corpus ${ARROW_UTIL_LIB}
${ARROW_TEST_LINK_LIBS})

# Command line tool that writes serialized example tensors, used as seed
# inputs for the tensor stream fuzz target.
add_executable(arrow-ipc-generate-tensor-fuzz-corpus generate_tensor_fuzz_corpus.cc)
target_link_libraries(arrow-ipc-generate-tensor-fuzz-corpus ${ARROW_UTIL_LIB}
${ARROW_TEST_LINK_LIBS})
endif()

add_arrow_fuzz_target(file_fuzz PREFIX "arrow-ipc")
add_arrow_fuzz_target(stream_fuzz PREFIX "arrow-ipc")
add_arrow_fuzz_target(tensor_stream_fuzz PREFIX "arrow-ipc")
134 changes: 134 additions & 0 deletions cpp/src/arrow/ipc/generate_tensor_fuzz_corpus.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// A command line executable that generates a bunch of files, each containing
// one example tensor serialized with the IPC tensor writer. Those are used as
// fuzzing seeds to make fuzzing more efficient.

#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iostream>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "arrow/io/file.h"
#include "arrow/io/memory.h"
#include "arrow/ipc/test_common.h"
#include "arrow/ipc/writer.h"
#include "arrow/result.h"
#include "arrow/tensor.h"
#include "arrow/util/io_util.h"

namespace arrow {
namespace ipc {

using ::arrow::internal::PlatformFilename;

// Convert `dir` into a PlatformFilename, run CreateDir on it and hand the
// filename back to the caller. A failure in either step is propagated to the
// caller instead of a filename.
Result<PlatformFilename> PrepareDirectory(const std::string& dir) {
  ARROW_ASSIGN_OR_RAISE(auto platform_dir, PlatformFilename::FromString(dir));
  RETURN_NOT_OK(::arrow::internal::CreateDir(platform_dir));
  return std::move(platform_dir);
}

// Run `fn` against a freshly created in-memory output stream and return the
// bytes it produced as a single buffer. An error from stream creation, from
// `fn`, or from finishing the stream is propagated to the caller.
Result<std::shared_ptr<Buffer>> MakeSerializedBuffer(
    std::function<Status(const std::shared_ptr<io::BufferOutputStream>&)> fn) {
  // Initial capacity requested for the in-memory stream.
  ARROW_ASSIGN_OR_RAISE(auto out_stream, io::BufferOutputStream::Create(1024));
  RETURN_NOT_OK(fn(out_stream));
  return out_stream->Finish();
}

// Serialize `tensor` with ipc::WriteTensor into an in-memory buffer.
Result<std::shared_ptr<Buffer>> SerializeTensor(const std::shared_ptr<Tensor>& tensor) {
  auto write_tensor =
      [&tensor](const std::shared_ptr<io::BufferOutputStream>& out) -> Status {
    // WriteTensor reports these lengths through out-parameters; they are not
    // needed here and are simply discarded.
    int32_t metadata_length;
    int64_t body_length;
    return ipc::WriteTensor(*tensor, out.get(), &metadata_length, &body_length);
  };
  return MakeSerializedBuffer(write_tensor);
}

// Build the example tensors used as fuzzing seeds.
//
// For every signed and unsigned integer type (8 to 64 bits) two random tensors
// of shape {5, 3, 7} are created: one requested with row_major_p = true and
// one with row_major_p = false. Every tensor is generated with its own seed.
//
// Returns the tensors in generation order, or the first error reported by
// test::MakeRandomTensor.
Result<std::vector<std::shared_ptr<Tensor>>> Tensors() {
  const std::vector<int64_t> shape = {5, 3, 7};
  const std::shared_ptr<DataType> types[] = {int8(),  int16(),  int32(),  int64(),
                                             uint8(), uint16(), uint32(), uint64()};

  std::vector<std::shared_ptr<Tensor>> tensors;
  uint32_t seed = 0;
  // Bind by const reference: copying a shared_ptr per iteration would only
  // cost a needless atomic reference-count update.
  for (const auto& type : types) {
    std::shared_ptr<Tensor> row_major;
    RETURN_NOT_OK(
        test::MakeRandomTensor(type, shape, /*row_major_p=*/true, &row_major, seed++));
    tensors.push_back(std::move(row_major));

    std::shared_ptr<Tensor> column_major;
    RETURN_NOT_OK(test::MakeRandomTensor(type, shape, /*row_major_p=*/false,
                                         &column_major, seed++));
    tensors.push_back(std::move(column_major));
  }
  return tensors;
}

// Serialize every example tensor and write each one to its own file inside
// `dir_fn`. Files are named "tensor-1", "tensor-2", ... and every path is
// echoed to stderr before the file is written.
Status GenerateTensors(const PlatformFilename& dir_fn) {
  ARROW_ASSIGN_OR_RAISE(auto tensors, Tensors());

  // Number of samples named so far; the first sample gets number 1.
  int samples_named = 0;
  for (const auto& tensor : tensors) {
    ARROW_ASSIGN_OR_RAISE(auto serialized, SerializeTensor(tensor));

    const std::string sample_name = "tensor-" + std::to_string(++samples_named);
    ARROW_ASSIGN_OR_RAISE(auto sample_fn, dir_fn.Join(sample_name));
    const std::string sample_path = sample_fn.ToString();
    std::cerr << sample_path << std::endl;

    ARROW_ASSIGN_OR_RAISE(auto out_file, io::FileOutputStream::Open(sample_path));
    RETURN_NOT_OK(out_file->Write(serialized));
    RETURN_NOT_OK(out_file->Close());
  }
  return Status::OK();
}

// Prepare the output directory `out_dir`, then generate the tensor samples
// into it. The first error encountered is returned.
Status DoMain(const std::string& out_dir) {
  ARROW_ASSIGN_OR_RAISE(auto prepared_dir, PrepareDirectory(out_dir));
  return GenerateTensors(prepared_dir);
}

// Print the command line synopsis to stderr and terminate the process with
// exit code 2. Never returns.
ARROW_NORETURN void Usage() {
  std::cerr << "Usage: arrow-ipc-generate-tensor-fuzz-corpus ";
  std::cerr << "-stream <output directory>" << std::endl;
  std::exit(2);
}

int Main(int argc, char** argv) {
if (argc != 3) {
Usage();
}

auto opt = std::string(argv[1]);
if (opt != "-stream") {
Usage();
}

auto out_dir = std::string(argv[2]);

Status st = DoMain(out_dir);
if (!st.ok()) {
std::cerr << st.ToString() << std::endl;
return 1;
}
return 0;
}

} // namespace ipc
} // namespace arrow

// Process entry point: forwards the command line to arrow::ipc::Main and uses
// its result as the exit code.
int main(int argc, char** argv) {
  const int exit_code = arrow::ipc::Main(argc, argv);
  return exit_code;
}
11 changes: 8 additions & 3 deletions cpp/src/arrow/ipc/metadata_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1349,17 +1349,22 @@ Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr<DataType>* type
return Status::IOError("Header-type of flatbuffer-encoded Message is not Tensor.");
}

int ndim = static_cast<int>(tensor->shape()->size());
flatbuffers::uoffset_t ndim = tensor->shape()->size();
mrkn marked this conversation as resolved.
Show resolved Hide resolved

for (int i = 0; i < ndim; ++i) {
for (flatbuffers::uoffset_t i = 0; i < ndim; ++i) {
mrkn marked this conversation as resolved.
Show resolved Hide resolved
auto dim = tensor->shape()->Get(i);

shape->push_back(dim->size());
dim_names->push_back(StringFromFlatbuffers(dim->name()));
}

if (tensor->strides() && tensor->strides()->size() > 0) {
for (int i = 0; i < ndim; ++i) {
if (tensor->strides()->size() != ndim) {
return Status::IOError(
"The sizes of shape and strides in a tensor are mismatched.");
}

for (decltype(ndim) i = 0; i < ndim; ++i) {
strides->push_back(tensor->strides()->Get(i));
}
}
Expand Down
17 changes: 17 additions & 0 deletions cpp/src/arrow/ipc/reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1747,6 +1747,23 @@ Status FuzzIpcFile(const uint8_t* data, int64_t size) {
return Status::OK();
}

// Fuzzing entry point for tensor streams: wrap the `size` bytes at `data` in a
// buffer reader, then keep reading tensors from it and validating each one.
// Reading stops when ReadTensor yields a null tensor; the first read or
// validation error is returned.
Status FuzzIpcTensorStream(const uint8_t* data, int64_t size) {
  auto input = std::make_shared<Buffer>(data, size);
  io::BufferReader reader(input);

  std::shared_ptr<Tensor> current;
  do {
    ARROW_ASSIGN_OR_RAISE(current, ReadTensor(&reader));
    if (current != nullptr) {
      RETURN_NOT_OK(current->Validate());
    }
  } while (current != nullptr);

  return Status::OK();
}

} // namespace internal
} // namespace ipc
} // namespace arrow
2 changes: 2 additions & 0 deletions cpp/src/arrow/ipc/reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -485,6 +485,8 @@ Result<std::shared_ptr<SparseTensor>> ReadSparseTensorPayload(const IpcPayload&
ARROW_EXPORT
Status FuzzIpcStream(const uint8_t* data, int64_t size);
ARROW_EXPORT
Status FuzzIpcTensorStream(const uint8_t* data, int64_t size);
ARROW_EXPORT
Status FuzzIpcFile(const uint8_t* data, int64_t size);

} // namespace internal
Expand Down
29 changes: 29 additions & 0 deletions cpp/src/arrow/ipc/tensor_stream_fuzz.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include <memory>

#include "arrow/ipc/reader.h"
#include "arrow/status.h"
#include "arrow/util/macros.h"

// Fuzzer callback: feed the raw input bytes to the tensor stream fuzzing
// routine. The resulting status is deliberately discarded and 0 is always
// returned.
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
  const auto length = static_cast<int64_t>(size);
  auto result = arrow::ipc::internal::FuzzIpcTensorStream(data, length);
  ARROW_UNUSED(result);
  return 0;
}
99 changes: 99 additions & 0 deletions cpp/src/arrow/ipc/test_common.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@

#include <algorithm>
#include <cstdint>
#include <functional>
#include <memory>
#include <numeric>
#include <string>
#include <type_traits>
#include <vector>

#include "arrow/array.h"
Expand All @@ -30,6 +32,7 @@
#include "arrow/pretty_print.h"
#include "arrow/record_batch.h"
#include "arrow/status.h"
#include "arrow/tensor.h"
#include "arrow/testing/extension_type.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/testing/random.h"
Expand Down Expand Up @@ -1000,6 +1003,102 @@ Status MakeDictExtension(std::shared_ptr<RecordBatch>* out) {
return Status::OK();
}

namespace {

// Fill data[0..n) with values drawn from DistributionType(min, max), using a
// std::default_random_engine seeded with `seed`. Each drawn value is cast to
// CValueType, so the distribution's own result type may differ from it.
template <typename CValueType, typename SeedType, typename DistributionType>
void FillRandomData(CValueType* data, size_t n, CValueType min, CValueType max,
                    SeedType seed) {
  std::default_random_engine engine(seed);
  DistributionType distribution(min, max);
  CValueType* const end = data + n;
  for (CValueType* cursor = data; cursor != end; ++cursor) {
    *cursor = static_cast<CValueType>(distribution(engine));
  }
}

// Signed integers: uniform values in [-1000, 1000].
template <typename CValueType, typename SeedType>
enable_if_t<std::is_integral<CValueType>::value && std::is_signed<CValueType>::value,
            void>
FillRandomData(CValueType* data, size_t n, SeedType seed) {
  using Uniform = std::uniform_int_distribution<CValueType>;
  FillRandomData<CValueType, SeedType, Uniform>(data, n, -1000, 1000, seed);
}

// Unsigned integers: uniform values in [0, 1000].
template <typename CValueType, typename SeedType>
enable_if_t<std::is_integral<CValueType>::value && std::is_unsigned<CValueType>::value,
            void>
FillRandomData(CValueType* data, size_t n, SeedType seed) {
  using Uniform = std::uniform_int_distribution<CValueType>;
  FillRandomData<CValueType, SeedType, Uniform>(data, n, 0, 1000, seed);
}

// Floating point: uniform real values drawn from
// std::uniform_real_distribution(-1000, 1000).
template <typename CValueType, typename SeedType>
enable_if_t<std::is_floating_point<CValueType>::value, void> FillRandomData(
    CValueType* data, size_t n, SeedType seed) {
  using Uniform = std::uniform_real_distribution<CValueType>;
  FillRandomData<CValueType, SeedType, Uniform>(data, n, -1000, 1000, seed);
}

}  // namespace

/// \brief Create a tensor of the given numeric type and shape, filled with
/// pseudo-random values.
///
/// \param[in] type element type; must be one of the integer or floating point
///   types handled below, otherwise Status::Invalid is returned
/// \param[in] shape size of each dimension
/// \param[in] row_major_p if true, strides are computed with
///   ComputeRowMajorStrides, otherwise with ComputeColumnMajorStrides
/// \param[out] out the created tensor
/// \param[in] seed seed of the random engine used to generate the values
/// \return Status::Invalid for an unsupported type, otherwise any error from
///   the buffer allocation or from Tensor::Make
Status MakeRandomTensor(const std::shared_ptr<DataType>& type,
                        const std::vector<int64_t>& shape, bool row_major_p,
                        std::shared_ptr<Tensor>* out, uint32_t seed) {
  // Reject unsupported types before anything else: the checked_cast below is
  // only valid for fixed-width types, so it must not run on arbitrary input.
  switch (type->id()) {
    case Type::INT8:
    case Type::UINT8:
    case Type::INT16:
    case Type::UINT16:
    case Type::INT32:
    case Type::UINT32:
    case Type::INT64:
    case Type::UINT64:
    case Type::HALF_FLOAT:
    case Type::FLOAT:
    case Type::DOUBLE:
      break;
    default:
      return Status::Invalid(type->ToString(), " is not valid data type for a tensor");
  }

  const auto& element_type = internal::checked_cast<const FixedWidthType&>(*type);
  std::vector<int64_t> strides;
  if (row_major_p) {
    internal::ComputeRowMajorStrides(element_type, shape, &strides);
  } else {
    internal::ComputeColumnMajorStrides(element_type, shape, &strides);
  }

  const int64_t element_size = element_type.bit_width() / CHAR_BIT;
  // Total number of elements: the product of all dimension sizes.
  const int64_t len =
      std::accumulate(shape.begin(), shape.end(), int64_t(1), std::multiplies<int64_t>());

  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> buf, AllocateBuffer(element_size * len));
  uint8_t* raw = buf->mutable_data();

  switch (type->id()) {
    // The 8-bit cases draw from a 16-bit distribution and narrow the result:
    // std::uniform_int_distribution is not defined for character types.
    case Type::INT8:
      FillRandomData<int8_t, uint32_t, std::uniform_int_distribution<int16_t>>(
          reinterpret_cast<int8_t*>(raw), len, -128, 127, seed);
      break;
    case Type::UINT8:
      FillRandomData<uint8_t, uint32_t, std::uniform_int_distribution<uint16_t>>(
          reinterpret_cast<uint8_t*>(raw), len, 0, 255, seed);
      break;
    case Type::INT16:
      FillRandomData(reinterpret_cast<int16_t*>(raw), len, seed);
      break;
    case Type::UINT16:
      FillRandomData(reinterpret_cast<uint16_t*>(raw), len, seed);
      break;
    case Type::INT32:
      FillRandomData(reinterpret_cast<int32_t*>(raw), len, seed);
      break;
    case Type::UINT32:
      FillRandomData(reinterpret_cast<uint32_t*>(raw), len, seed);
      break;
    case Type::INT64:
      FillRandomData(reinterpret_cast<int64_t*>(raw), len, seed);
      break;
    case Type::UINT64:
      FillRandomData(reinterpret_cast<uint64_t*>(raw), len, seed);
      break;
    case Type::HALF_FLOAT:
      // Half floats are filled through their 16-bit storage as int16_t values.
      FillRandomData(reinterpret_cast<int16_t*>(raw), len, seed);
      break;
    case Type::FLOAT:
      FillRandomData(reinterpret_cast<float*>(raw), len, seed);
      break;
    case Type::DOUBLE:
      FillRandomData(reinterpret_cast<double*>(raw), len, seed);
      break;
    default:
      // Not reachable after the validation above; kept as a safety net.
      return Status::Invalid(type->ToString(), " is not valid data type for a tensor");
  }

  return Tensor::Make(type, buf, shape, strides).Value(out);
}

} // namespace test
} // namespace ipc
} // namespace arrow
6 changes: 6 additions & 0 deletions cpp/src/arrow/ipc/test_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

#include <cstdint>
#include <memory>
#include <vector>

#include "arrow/array.h"
#include "arrow/record_batch.h"
Expand Down Expand Up @@ -161,6 +162,11 @@ Status MakeUuid(std::shared_ptr<RecordBatch>* out);
ARROW_TESTING_EXPORT
Status MakeDictExtension(std::shared_ptr<RecordBatch>* out);

/// \brief Create a tensor of the given type and shape filled with random values
///
/// \param[in] type element type of the tensor
/// \param[in] shape size of each dimension
/// \param[in] row_major_p selects row-major (true) or column-major (false) strides
/// \param[out] out the created tensor
/// \param[in] seed seed for the random value generation (defaults to 0)
ARROW_TESTING_EXPORT
Status MakeRandomTensor(const std::shared_ptr<DataType>& type,
const std::vector<int64_t>& shape, bool row_major_p,
std::shared_ptr<Tensor>* out, uint32_t seed = 0);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Two things:

  1. Why not put this in arrow/testing/random.h?
  2. Return Result<Tensor> instead?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because I followed MakeRandomArray. It is in arrow/ipc/test_common.h.


} // namespace test
} // namespace ipc
} // namespace arrow
Loading