From f5a9a59a10a0022aba13bb1b7caa4b8e5c164c1a Mon Sep 17 00:00:00 2001 From: Ben Harkins <60872452+benibus@users.noreply.github.com> Date: Tue, 8 Aug 2023 13:40:08 -0400 Subject: [PATCH] GH-36892: [C++] Fix performance regressions in `FieldPath::Get` (#37032) ### Rationale for this change https://github.com/apache/arrow/pull/35197 appears to have introduced significant performance regressions in `FieldPath::Get` - indicated [here](https://conbench.ursa.dev/compare/runs/9cf73ac83f0a44179e6538b2c1c7babd...3d76cb5ffb8849bf8c3ea9b32d08b3b7/), in a benchmark that uses a wide (10K column) dataframe. ### What changes are included in this PR? - Adds basic benchmarks for `FieldPath::Get` across various input types, as they didn't previously exist - Addresses several performance issues. These came in the form of extremely high upfront costs for the `RecordBatch` and `ArrayData` overloads specifically - Some minor refactoring of `NestedSelector` ### Are these changes tested? Yes (covered by existing tests) ### Are there any user-facing changes? No * Closes: #36892 Lead-authored-by: benibus Co-authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/arrow/type.cc | 102 +++++++++++++++++--------- cpp/src/arrow/type_benchmark.cc | 125 ++++++++++++++++++++++++++++++++ 2 files changed, 191 insertions(+), 36 deletions(-) diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 68dc2aabe96ad..9267f1e499720 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -1066,17 +1066,29 @@ std::string FieldPath::ToString() const { return repr; } -static Status NonStructError() { - return Status::NotImplemented("Get child data of non-struct array"); -} +struct NestedSelectorUtil { + static Status NonStructError() { + return Status::NotImplemented("Get child data of non-struct array"); + } + + template + static const DataType* GetType(const T& input) { + if constexpr (std::is_same_v) { + return input.type.get(); + } else { + return input.type().get(); + } + } +}; -// Utility class for retrieving a child field/column from a top-level Field, Array, or -// ChunkedArray. The "root" value can either be a single parent or a vector of its -// children. +// Utility class for retrieving a child field/column from a top-level Field, Array, +// ArrayData, or ChunkedArray. The "root" value can either be a single parent or a vector +// of its children. template class NestedSelector { public: using ArrowType = T; + using Util = NestedSelectorUtil; explicit NestedSelector(const std::vector>& children) : parent_or_children_(&children) {} @@ -1095,7 +1107,18 @@ class NestedSelector { Result GetChild(int i) const { std::shared_ptr child; if (auto parent = get_parent()) { - ARROW_ASSIGN_OR_RAISE(child, GetChild(*parent, i, pool_)); + const DataType* type = Util::GetType(*parent); + // We avoid this check for schema fields since it's inconsequential (plus there are + // tests elsewhere that rely on it not happening) + if constexpr (!std::is_same_v) { + if (ARROW_PREDICT_FALSE(type->id() != Type::STRUCT)) { + return Util::NonStructError(); + } + } + // Bounds-check the index *once* using the parent's type + if (ARROW_PREDICT_TRUE(i >= 0 && i < type->num_fields())) { + ARROW_ASSIGN_OR_RAISE(child, GetChild(*parent, i, pool_)); + } } else if (auto children = get_children()) { if (ARROW_PREDICT_TRUE(i >= 0 && static_cast(i) < children->size())) { child = (*children)[i]; @@ -1129,10 +1152,10 @@ class NestedSelector { *os << "column types: { "; if (auto children = get_children()) { for (const auto& child : *children) { - *os << *child->type() << ", "; + *os << *Util::GetType(*child) << ", "; } } else if (auto parent = get_parent()) { - for (const auto& field : parent->type()->fields()) { + for (const auto& field : Util::GetType(*parent)->fields()) { *os << *field->type() << ", "; } } @@ -1155,21 +1178,33 @@ class NestedSelector { } static Result> GetChild(const Field& field, int i, MemoryPool*) { - if (ARROW_PREDICT_FALSE(i < 0 || i >= field.type()->num_fields())) { - return nullptr; - } return field.type()->field(i); } - static Result> GetChild(const Array& array, int i, - MemoryPool* pool) { - if (ARROW_PREDICT_FALSE(array.type_id() != Type::STRUCT)) { - return NonStructError(); - } - if (ARROW_PREDICT_FALSE(i < 0 || i >= array.num_fields())) { - return nullptr; + static Result> GetChild(const ArrayData& data, int i, + MemoryPool* pool) { + std::shared_ptr child_data; + if constexpr (IsFlattening) { + // First, convert to an Array so we can use StructArray::GetFlattenedField + auto array = MakeArray(data.Copy()); + ARROW_ASSIGN_OR_RAISE(auto child_array, GetChild(*array, i, pool)); + child_data = child_array->data(); + } else { + // We could achieve the same result by converting to an Array (via MakeArray), + // calling StructArray::field(i), and pulling out the new ArrayData. However, this + // process can be very expensive when there are many columns - so we just + // reimplement the functionality that we need + child_data = data.child_data[i]; + if (data.offset != 0 || data.child_data[i]->length != data.length) { + child_data = child_data->Slice(data.offset, data.length); + } } + return std::move(child_data); + } + + static Result> GetChild(const Array& array, int i, + MemoryPool* pool) { const auto& struct_array = checked_cast(array); if constexpr (IsFlattening) { return struct_array.GetFlattenedField(i, pool); @@ -1181,22 +1216,15 @@ class NestedSelector { static Result> GetChild(const ChunkedArray& chunked_array, int i, MemoryPool* pool) { const auto& type = *chunked_array.type(); - if (ARROW_PREDICT_FALSE(type.id() != Type::STRUCT)) { - return NonStructError(); - } - if (ARROW_PREDICT_FALSE(i < 0 || i >= type.num_fields())) { - return nullptr; - } ArrayVector chunks; chunks.reserve(chunked_array.num_chunks()); for (const auto& parent_chunk : chunked_array.chunks()) { ARROW_ASSIGN_OR_RAISE(auto chunk, GetChild(*parent_chunk, i, pool)); - if (!chunk) return nullptr; chunks.push_back(std::move(chunk)); } - return ChunkedArray::Make(std::move(chunks), type.field(i)->type()); + return std::make_shared(std::move(chunks), type.field(i)->type()); } std::shared_ptr owned_parent_; @@ -1289,7 +1317,11 @@ Result> FieldPath::GetAll(const Schema& schm, } Result> FieldPath::Get(const RecordBatch& batch) const { - return FieldPathGetImpl::Get(this, ZeroCopySelector(batch.columns())); + // Deliberately calling `column_data` here because `RecordBatch::columns` is nontrivial + ARROW_ASSIGN_OR_RAISE( + auto data, + FieldPathGetImpl::Get(this, ZeroCopySelector(batch.column_data()))); + return MakeArray(data); } Result> FieldPath::Get(const Table& table) const { @@ -1301,11 +1333,7 @@ Result> FieldPath::Get(const Array& array) const { } Result> FieldPath::Get(const ArrayData& data) const { - // We indirect from ArrayData to Array rather than vice-versa because, when selecting a - // nested column, the StructArray::field method does the work of adjusting the data's - // offset/length if necessary. - ARROW_ASSIGN_OR_RAISE(auto array, Get(*MakeArray(data.Copy()))); - return array->data(); + return FieldPathGetImpl::Get(this, ZeroCopySelector(data)); } Result> FieldPath::Get( @@ -1320,8 +1348,7 @@ Result> FieldPath::GetFlattened(const Array& array, Result> FieldPath::GetFlattened(const ArrayData& data, MemoryPool* pool) const { - ARROW_ASSIGN_OR_RAISE(auto array, GetFlattened(*MakeArray(data.Copy()), pool)); - return array->data(); + return FieldPathGetImpl::Get(this, FlatteningSelector(data, pool)); } Result> FieldPath::GetFlattened( @@ -1332,7 +1359,10 @@ Result> FieldPath::GetFlattened( Result> FieldPath::GetFlattened(const RecordBatch& batch, MemoryPool* pool) const { - return FieldPathGetImpl::Get(this, FlatteningSelector(batch.columns(), pool)); + ARROW_ASSIGN_OR_RAISE( + auto data, FieldPathGetImpl::Get( + this, FlatteningSelector(batch.column_data(), pool))); + return MakeArray(data); } Result> FieldPath::GetFlattened(const Table& table, diff --git a/cpp/src/arrow/type_benchmark.cc b/cpp/src/arrow/type_benchmark.cc index de90577ffdf64..17dccfcb33138 100644 --- a/cpp/src/arrow/type_benchmark.cc +++ b/cpp/src/arrow/type_benchmark.cc @@ -18,15 +18,19 @@ #include #include #include +#include #include #include #include #include "benchmark/benchmark.h" +#include "arrow/array.h" #include "arrow/result.h" #include "arrow/status.h" +#include "arrow/table.h" #include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" #include "arrow/type.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/macros.h" @@ -418,6 +422,120 @@ static void ErrorSchemeExceptionNoInline( state.SetItemsProcessed(state.iterations() * integers.size()); } +// ---------------------------------------------------------------------- +// FieldPath::Get benchmarks + +static std::shared_ptr GenerateTestSchema(int num_columns) { + FieldVector fields(num_columns); + for (int i = 0; i < num_columns; ++i) { + auto name = std::string("f") + std::to_string(i); + fields[i] = field(std::move(name), int64()); + } + return schema(std::move(fields)); +} + +static std::shared_ptr GenerateTestArray(int num_columns) { + constexpr int64_t kLength = 100; + + auto rand = random::RandomArrayGenerator(0xbeef); + auto schm = GenerateTestSchema(num_columns); + + ArrayVector columns(num_columns); + for (auto& column : columns) { + column = rand.Int64(kLength, 0, std::numeric_limits::max()); + } + + return *StructArray::Make(columns, schm->fields()); +} + +static std::shared_ptr ToBatch(const std::shared_ptr& array) { + return *RecordBatch::FromStructArray(array); +} + +static std::shared_ptr ToChunked(const std::shared_ptr& array, + double chunk_proportion = 1.0) { + auto struct_array = internal::checked_pointer_cast(array); + const auto num_rows = struct_array->length(); + const auto chunk_length = static_cast(std::ceil(num_rows * chunk_proportion)); + + ArrayVector chunks; + for (int64_t offset = 0; offset < num_rows;) { + int64_t slice_length = std::min(chunk_length, num_rows - offset); + chunks.push_back(*struct_array->SliceSafe(offset, slice_length)); + offset += slice_length; + } + + return *ChunkedArray::Make(std::move(chunks)); +} + +static std::shared_ptr ToTable(const std::shared_ptr& array, + double chunk_proportion = 1.0) { + return *Table::FromChunkedStructArray(ToChunked(array, chunk_proportion)); +} + +template +static void BenchmarkFieldPathGet(benchmark::State& state, // NOLINT non-const reference + const T& input, int num_columns, + std::optional num_chunks = {}) { + // Reassigning a single FieldPath var within each iteration's scope seems to be costly + // enough to influence the timings, so we preprocess them. + std::vector paths(num_columns); + for (int i = 0; i < num_columns; ++i) { + paths[i] = {i}; + } + + for (auto _ : state) { + for (const auto& path : paths) { + benchmark::DoNotOptimize(path.Get(input).ValueOrDie()); + } + } + + state.SetItemsProcessed(state.iterations() * num_columns); + state.counters["num_columns"] = num_columns; + if (num_chunks.has_value()) { + state.counters["num_chunks"] = num_chunks.value(); + } +} + +static void FieldPathGetFromWideArray( + benchmark::State& state) { // NOLINT non-const reference + constexpr int kNumColumns = 10000; + auto array = GenerateTestArray(kNumColumns); + BenchmarkFieldPathGet(state, *array, kNumColumns); +} + +static void FieldPathGetFromWideArrayData( + benchmark::State& state) { // NOLINT non-const reference + constexpr int kNumColumns = 10000; + auto array = GenerateTestArray(kNumColumns); + BenchmarkFieldPathGet(state, *array->data(), kNumColumns); +} + +static void FieldPathGetFromWideBatch( + benchmark::State& state) { // NOLINT non-const reference + constexpr int kNumColumns = 10000; + auto batch = ToBatch(GenerateTestArray(kNumColumns)); + BenchmarkFieldPathGet(state, *batch, kNumColumns); +} + +static void FieldPathGetFromWideChunkedArray( + benchmark::State& state) { // NOLINT non-const reference + constexpr int kNumColumns = 10000; + // Percentage representing the size of each chunk relative to the total length (smaller + // proportion means more chunks) + const double chunk_proportion = state.range(0) / 100.0; + auto chunked_array = ToChunked(GenerateTestArray(kNumColumns), chunk_proportion); + BenchmarkFieldPathGet(state, *chunked_array, kNumColumns, chunked_array->num_chunks()); +} + +static void FieldPathGetFromWideTable( + benchmark::State& state) { // NOLINT non-const reference + constexpr int kNumColumns = 10000; + const double chunk_proportion = state.range(0) / 100.0; + auto table = ToTable(GenerateTestArray(kNumColumns), chunk_proportion); + BenchmarkFieldPathGet(state, *table, kNumColumns, table->column(0)->num_chunks()); +} + BENCHMARK(TypeEqualsSimple); BENCHMARK(TypeEqualsComplex); BENCHMARK(TypeEqualsWithMetadata); @@ -436,4 +554,11 @@ BENCHMARK(ErrorSchemeStatusNoInline); BENCHMARK(ErrorSchemeResultNoInline); BENCHMARK(ErrorSchemeExceptionNoInline); +BENCHMARK(FieldPathGetFromWideArray); +BENCHMARK(FieldPathGetFromWideArrayData); +BENCHMARK(FieldPathGetFromWideBatch); + +BENCHMARK(FieldPathGetFromWideChunkedArray)->Arg(2)->Arg(10)->Arg(25)->Arg(100); +BENCHMARK(FieldPathGetFromWideTable)->Arg(2)->Arg(10)->Arg(25)->Arg(100); + } // namespace arrow