diff --git a/be/benchmark/benchmark_column_view.hpp b/be/benchmark/benchmark_column_view.hpp new file mode 100644 index 00000000000000..02ce3294b5b46a --- /dev/null +++ b/be/benchmark/benchmark_column_view.hpp @@ -0,0 +1,484 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// ============================================================ +// Benchmark: ColumnView vs hand-written column access (Int64) +// +// ColumnView (see column_execute_util.h) provides a unified interface +// to read column values regardless of whether the underlying column is +// Plain, ColumnConst, ColumnNullable, or Const(Nullable). +// +// This benchmark measures whether ColumnView introduces measurable +// overhead compared to hand-written (direct) column access code. 
+// ============================================================ + +#include + +#include + +#include "core/assert_cast.h" +#include "core/column/column_const.h" +#include "core/column/column_execute_util.h" +#include "core/column/column_nullable.h" +#include "core/column/column_vector.h" +#include "core/data_type/primitive_type.h" + +namespace doris { + +static constexpr size_t NUM_ROWS = 4096; // rows per column and inner-loop trip count + +// ============================================================ +// Column factory helpers +// ============================================================ +// Returns an Int64 column with NUM_ROWS rows; row i holds the value i + 1. +static ColumnPtr make_plain_column() { + auto col = ColumnInt64::create(); + col->reserve(NUM_ROWS); + for (size_t i = 0; i < NUM_ROWS; ++i) { + col->insert_value(static_cast(i + 1)); + } + return col; +} +// Returns a ColumnConst of NUM_ROWS rows wrapping a one-element Int64 column holding 42. +static ColumnPtr make_const_column() { + auto inner = ColumnInt64::create(); + inner->insert_value(42); + return ColumnConst::create(std::move(inner), NUM_ROWS); +} +// Wraps a new plain column in ColumnNullable; the null map is presumably all zeros (no nulls). +static ColumnPtr make_nullable_column() { + return ColumnNullable::create(make_plain_column()->assume_mutable(), + ColumnUInt8::create(NUM_ROWS, 0)); +} + +// ============================================================ +// Helper: extract Int64 data from various column forms +// ============================================================ +// Plain column baseline: holds a reference to the column's data, so the column must outlive it. +struct PlainAccessor { + const ColumnInt64::Container& data; + + explicit PlainAccessor(const ColumnPtr& col) + : data(assert_cast(*col).get_data()) {} + + int64_t get(size_t i) const { return data[i]; } +}; +// Const column baseline: copies element 0 of the wrapped data column; get() ignores the index. +struct ConstAccessor { + const int64_t value; + + explicit ConstAccessor(const ColumnPtr& col) + : value(assert_cast( + assert_cast(*col).get_data_column()) + .get_data()[0]) {} + + int64_t get(size_t /*i*/) const { return value; } +}; +// Nullable baseline: references the nested data and null map; get() does not check is_null(). +struct NullableAccessor { + const ColumnInt64::Container& data; + const NullMap& null_map; + + explicit NullableAccessor(const ColumnPtr& col) + : data(assert_cast( + assert_cast(*col).get_nested_column()) + .get_data()), 
null_map(assert_cast(*col).get_null_map_data()) {} + + int64_t get(size_t i) const { return data[i]; } + bool is_null(size_t i) const { return null_map[i]; } +}; +/* Const(Nullable) baseline: caches element 0 of data and null map; unused by benchmarks here. */ +struct ConstNullableAccessor { + const int64_t value; + const bool is_null_value; + + explicit ConstNullableAccessor(const ColumnPtr& col) + : value(assert_cast( + assert_cast( + assert_cast(*col).get_data_column()) + .get_nested_column()) + .get_data()[0]), + is_null_value(assert_cast( + assert_cast(*col).get_data_column()) + .get_null_map_data()[0]) {} + + int64_t get(size_t /*i*/) const { return value; } + bool is_null(size_t /*i*/) const { return is_null_value; } +}; + +// ============================================================ +// Unary benchmarks: sum = Σ a[i] +// ============================================================ +// Handwritten_* use the accessor structs; ColumnView_* go through ColumnView. Setup is untimed. +// ---- Unary: Plain ---- + +static void Handwritten_Unary_Plain(benchmark::State& state) { + const auto col_a = make_plain_column(); + PlainAccessor a(col_a); + for (auto _ : state) { + int64_t sum = 0; + for (size_t i = 0; i < NUM_ROWS; ++i) { + sum += a.get(i); + } + benchmark::DoNotOptimize(sum); + } +} +BENCHMARK(Handwritten_Unary_Plain)->Unit(benchmark::kNanosecond); + +static void ColumnView_Unary_Plain(benchmark::State& state) { + const auto col_a = make_plain_column(); + const auto view_a = ColumnView::create(col_a); + for (auto _ : state) { + int64_t sum = 0; + for (size_t i = 0; i < NUM_ROWS; ++i) { + sum += view_a.value_at(i); + } + benchmark::DoNotOptimize(sum); + } +} +BENCHMARK(ColumnView_Unary_Plain)->Unit(benchmark::kNanosecond); + +// ---- Unary: Nullable ---- +// A row is added to the sum only when it is not null. +static void Handwritten_Unary_Nullable(benchmark::State& state) { + const auto col_a = make_nullable_column(); + NullableAccessor a(col_a); + for (auto _ : state) { + int64_t sum = 0; + for (size_t i = 0; i < NUM_ROWS; ++i) { + if (!a.is_null(i)) { + sum += a.get(i); + } + } + benchmark::DoNotOptimize(sum); + } +} +BENCHMARK(Handwritten_Unary_Nullable)->Unit(benchmark::kNanosecond); + +static 
void ColumnView_Unary_Nullable(benchmark::State& state) { + const auto col_a = make_nullable_column(); + const auto view_a = ColumnView::create(col_a); + for (auto _ : state) { + int64_t sum = 0; + for (size_t i = 0; i < NUM_ROWS; ++i) { + if (!view_a.is_null_at(i)) { + sum += view_a.value_at(i); + } + } + benchmark::DoNotOptimize(sum); + } +} +BENCHMARK(ColumnView_Unary_Nullable)->Unit(benchmark::kNanosecond); + +// ============================================================ +// Binary benchmarks: sum = Σ (a[i] + b[i]) +// ============================================================ + +// ---- Binary: (Plain, Plain) ---- +// a and b are two separately built plain columns with identical contents. +static void Handwritten_Binary_Plain_Plain(benchmark::State& state) { + const auto col_a = make_plain_column(); + const auto col_b = make_plain_column(); + PlainAccessor a(col_a); + PlainAccessor b(col_b); + for (auto _ : state) { + int64_t sum = 0; + for (size_t i = 0; i < NUM_ROWS; ++i) { + sum += a.get(i) + b.get(i); + } + benchmark::DoNotOptimize(sum); + } +} +BENCHMARK(Handwritten_Binary_Plain_Plain)->Unit(benchmark::kNanosecond); + +static void ColumnView_Binary_Plain_Plain(benchmark::State& state) { + const auto col_a = make_plain_column(); + const auto col_b = make_plain_column(); + const auto view_a = ColumnView::create(col_a); + const auto view_b = ColumnView::create(col_b); + for (auto _ : state) { + int64_t sum = 0; + for (size_t i = 0; i < NUM_ROWS; ++i) { + sum += view_a.value_at(i) + view_b.value_at(i); + } + benchmark::DoNotOptimize(sum); + } +} +BENCHMARK(ColumnView_Binary_Plain_Plain)->Unit(benchmark::kNanosecond); + +// ---- Binary: (Plain, Const) ---- +// b is const: the hand-written variant adds the same cached value for every row. +static void Handwritten_Binary_Plain_Const(benchmark::State& state) { + const auto col_a = make_plain_column(); + const auto col_b = make_const_column(); + PlainAccessor a(col_a); + ConstAccessor b(col_b); + for (auto _ : state) { + int64_t sum = 0; + for (size_t i = 0; i < NUM_ROWS; ++i) { + sum += a.get(i) + b.get(i); + } + benchmark::DoNotOptimize(sum); + } +} 
+BENCHMARK(Handwritten_Binary_Plain_Const)->Unit(benchmark::kNanosecond); + +static void ColumnView_Binary_Plain_Const(benchmark::State& state) { + const auto col_a = make_plain_column(); + const auto col_b = make_const_column(); + const auto view_a = ColumnView::create(col_a); + const auto view_b = ColumnView::create(col_b); + for (auto _ : state) { + int64_t sum = 0; + for (size_t i = 0; i < NUM_ROWS; ++i) { + sum += view_a.value_at(i) + view_b.value_at(i); + } + benchmark::DoNotOptimize(sum); + } +} +BENCHMARK(ColumnView_Binary_Plain_Const)->Unit(benchmark::kNanosecond); + +// ---- Binary: (Plain, Nullable) ---- +// a[i] is always added; b[i] is added only when b is not null at row i. +static void Handwritten_Binary_Plain_Nullable(benchmark::State& state) { + const auto col_a = make_plain_column(); + const auto col_b = make_nullable_column(); + PlainAccessor a(col_a); + NullableAccessor b(col_b); + for (auto _ : state) { + int64_t sum = 0; + for (size_t i = 0; i < NUM_ROWS; ++i) { + int64_t val = a.get(i); + if (!b.is_null(i)) { + val += b.get(i); + } + sum += val; + } + benchmark::DoNotOptimize(sum); + } +} +BENCHMARK(Handwritten_Binary_Plain_Nullable)->Unit(benchmark::kNanosecond); + +static void ColumnView_Binary_Plain_Nullable(benchmark::State& state) { + const auto col_a = make_plain_column(); + const auto col_b = make_nullable_column(); + const auto view_a = ColumnView::create(col_a); + const auto view_b = ColumnView::create(col_b); + for (auto _ : state) { + int64_t sum = 0; + for (size_t i = 0; i < NUM_ROWS; ++i) { + int64_t val = view_a.value_at(i); + if (!view_b.is_null_at(i)) { + val += view_b.value_at(i); + } + sum += val; + } + benchmark::DoNotOptimize(sum); + } +} +BENCHMARK(ColumnView_Binary_Plain_Nullable)->Unit(benchmark::kNanosecond); + +// ---- Binary: (Nullable, Nullable) ---- +// A row contributes a[i] + b[i] only when neither input is null at that row. +static void Handwritten_Binary_Nullable_Nullable(benchmark::State& state) { + const auto col_a = make_nullable_column(); + const auto col_b = make_nullable_column(); + NullableAccessor a(col_a); + NullableAccessor b(col_b); + for (auto _ 
: state) { + int64_t sum = 0; + for (size_t i = 0; i < NUM_ROWS; ++i) { + if (!a.is_null(i) && !b.is_null(i)) { + sum += a.get(i) + b.get(i); + } + } + benchmark::DoNotOptimize(sum); + } +} +BENCHMARK(Handwritten_Binary_Nullable_Nullable)->Unit(benchmark::kNanosecond); + +static void ColumnView_Binary_Nullable_Nullable(benchmark::State& state) { + const auto col_a = make_nullable_column(); + const auto col_b = make_nullable_column(); + const auto view_a = ColumnView::create(col_a); + const auto view_b = ColumnView::create(col_b); + for (auto _ : state) { + int64_t sum = 0; + for (size_t i = 0; i < NUM_ROWS; ++i) { + if (!view_a.is_null_at(i) && !view_b.is_null_at(i)) { + sum += view_a.value_at(i) + view_b.value_at(i); + } + } + benchmark::DoNotOptimize(sum); + } +} +BENCHMARK(ColumnView_Binary_Nullable_Nullable)->Unit(benchmark::kNanosecond); + +// ============================================================ +// Ternary benchmarks: sum = Σ (a[i] + b[i] + c[i]) +// ============================================================ + +// ---- Ternary: (Plain, Plain, Plain) ---- +// All three inputs are separately built plain columns with identical contents. +static void Handwritten_Ternary_Plain_Plain_Plain(benchmark::State& state) { + const auto col_a = make_plain_column(); + const auto col_b = make_plain_column(); + const auto col_c = make_plain_column(); + PlainAccessor a(col_a); + PlainAccessor b(col_b); + PlainAccessor c(col_c); + for (auto _ : state) { + int64_t sum = 0; + for (size_t i = 0; i < NUM_ROWS; ++i) { + sum += a.get(i) + b.get(i) + c.get(i); + } + benchmark::DoNotOptimize(sum); + } +} +BENCHMARK(Handwritten_Ternary_Plain_Plain_Plain)->Unit(benchmark::kNanosecond); + +static void ColumnView_Ternary_Plain_Plain_Plain(benchmark::State& state) { + const auto col_a = make_plain_column(); + const auto col_b = make_plain_column(); + const auto col_c = make_plain_column(); + const auto view_a = ColumnView::create(col_a); + const auto view_b = ColumnView::create(col_b); + const auto view_c = ColumnView::create(col_c); + for (auto _ : state) { 
+ int64_t sum = 0; + for (size_t i = 0; i < NUM_ROWS; ++i) { + sum += view_a.value_at(i) + view_b.value_at(i) + view_c.value_at(i); + } + benchmark::DoNotOptimize(sum); + } +} +BENCHMARK(ColumnView_Ternary_Plain_Plain_Plain)->Unit(benchmark::kNanosecond); + +// ---- Ternary: (Const, Const, Plain) ---- +// a and b are const columns; only c varies from row to row. +static void Handwritten_Ternary_Const_Const_Plain(benchmark::State& state) { + const auto col_a = make_const_column(); + const auto col_b = make_const_column(); + const auto col_c = make_plain_column(); + ConstAccessor a(col_a); + ConstAccessor b(col_b); + PlainAccessor c(col_c); + for (auto _ : state) { + int64_t sum = 0; + for (size_t i = 0; i < NUM_ROWS; ++i) { + sum += a.get(i) + b.get(i) + c.get(i); + } + benchmark::DoNotOptimize(sum); + } +} +BENCHMARK(Handwritten_Ternary_Const_Const_Plain)->Unit(benchmark::kNanosecond); + +static void ColumnView_Ternary_Const_Const_Plain(benchmark::State& state) { + const auto col_a = make_const_column(); + const auto col_b = make_const_column(); + const auto col_c = make_plain_column(); + const auto view_a = ColumnView::create(col_a); + const auto view_b = ColumnView::create(col_b); + const auto view_c = ColumnView::create(col_c); + for (auto _ : state) { + int64_t sum = 0; + for (size_t i = 0; i < NUM_ROWS; ++i) { + sum += view_a.value_at(i) + view_b.value_at(i) + view_c.value_at(i); + } + benchmark::DoNotOptimize(sum); + } +} +BENCHMARK(ColumnView_Ternary_Const_Const_Plain)->Unit(benchmark::kNanosecond); + +// ---- Ternary: (Plain, Const, Plain) ---- +// Only the middle input b is const; a and c vary from row to row. +static void Handwritten_Ternary_Plain_Const_Plain(benchmark::State& state) { + const auto col_a = make_plain_column(); + const auto col_b = make_const_column(); + const auto col_c = make_plain_column(); + PlainAccessor a(col_a); + ConstAccessor b(col_b); + PlainAccessor c(col_c); + for (auto _ : state) { + int64_t sum = 0; + for (size_t i = 0; i < NUM_ROWS; ++i) { + sum += a.get(i) + b.get(i) + c.get(i); + } + benchmark::DoNotOptimize(sum); + } +} 
+BENCHMARK(Handwritten_Ternary_Plain_Const_Plain)->Unit(benchmark::kNanosecond); + +static void ColumnView_Ternary_Plain_Const_Plain(benchmark::State& state) { + const auto col_a = make_plain_column(); + const auto col_b = make_const_column(); + const auto col_c = make_plain_column(); + const auto view_a = ColumnView::create(col_a); + const auto view_b = ColumnView::create(col_b); + const auto view_c = ColumnView::create(col_c); + for (auto _ : state) { + int64_t sum = 0; + for (size_t i = 0; i < NUM_ROWS; ++i) { + sum += view_a.value_at(i) + view_b.value_at(i) + view_c.value_at(i); + } + benchmark::DoNotOptimize(sum); + } +} +BENCHMARK(ColumnView_Ternary_Plain_Const_Plain)->Unit(benchmark::kNanosecond); + +// ---- Ternary: (Nullable, Nullable, Nullable) ---- +// A row contributes only when all three inputs are non-null; the checks are combined with &&. +static void Handwritten_Ternary_Nullable_Nullable_Nullable(benchmark::State& state) { + const auto col_a = make_nullable_column(); + const auto col_b = make_nullable_column(); + const auto col_c = make_nullable_column(); + NullableAccessor a(col_a); + NullableAccessor b(col_b); + NullableAccessor c(col_c); + for (auto _ : state) { + int64_t sum = 0; + for (size_t i = 0; i < NUM_ROWS; ++i) { + if (!a.is_null(i) && !b.is_null(i) && !c.is_null(i)) { + sum += a.get(i) + b.get(i) + c.get(i); + } + } + benchmark::DoNotOptimize(sum); + } +} +BENCHMARK(Handwritten_Ternary_Nullable_Nullable_Nullable)->Unit(benchmark::kNanosecond); + +static void ColumnView_Ternary_Nullable_Nullable_Nullable(benchmark::State& state) { + const auto col_a = make_nullable_column(); + const auto col_b = make_nullable_column(); + const auto col_c = make_nullable_column(); + const auto view_a = ColumnView::create(col_a); + const auto view_b = ColumnView::create(col_b); + const auto view_c = ColumnView::create(col_c); + for (auto _ : state) { + int64_t sum = 0; + for (size_t i = 0; i < NUM_ROWS; ++i) { + if (!view_a.is_null_at(i) && !view_b.is_null_at(i) && !view_c.is_null_at(i)) { + sum += view_a.value_at(i) + view_b.value_at(i) + 
view_c.value_at(i); + } + } + benchmark::DoNotOptimize(sum); + } +} +BENCHMARK(ColumnView_Ternary_Nullable_Nullable_Nullable)->Unit(benchmark::kNanosecond); + +} // namespace doris diff --git a/be/benchmark/benchmark_fastunion.hpp b/be/benchmark/benchmark_fastunion.hpp index ba469b75fa6ae3..ae574321aec348 100644 --- a/be/benchmark/benchmark_fastunion.hpp +++ b/be/benchmark/benchmark_fastunion.hpp @@ -19,7 +19,7 @@ #include -#include "util/bitmap_value.h" +#include "core/value/bitmap_value.h" using Roaring64Map = doris::detail::Roaring64Map; diff --git a/be/benchmark/benchmark_hll_merge.hpp b/be/benchmark/benchmark_hll_merge.hpp index d923d208fe4446..2fc6c47eaa68b8 100644 --- a/be/benchmark/benchmark_hll_merge.hpp +++ b/be/benchmark/benchmark_hll_merge.hpp @@ -17,7 +17,7 @@ #include -#include "olap/hll.h" +#include "core/value/hll.h" #include "util/hash_util.hpp" namespace doris { diff --git a/be/benchmark/benchmark_main.cpp b/be/benchmark/benchmark_main.cpp index 5c516440a0c8ea..f76dbb370e5351 100644 --- a/be/benchmark/benchmark_main.cpp +++ b/be/benchmark/benchmark_main.cpp @@ -20,14 +20,15 @@ #include "benchmark_bit_pack.hpp" #include "benchmark_bits.hpp" #include "benchmark_block_bloom_filter.hpp" +#include "benchmark_column_view.hpp" #include "benchmark_fastunion.hpp" #include "benchmark_hll_merge.hpp" #include "benchmark_string.hpp" #include "binary_cast_benchmark.hpp" #include "core/block/block.h" -#include "vec/columns/column_string.h" -#include "vec/data_types/data_type.h" -#include "vec/data_types/data_type_string.h" +#include "core/column/column_string.h" +#include "core/data_type/data_type.h" +#include "core/data_type/data_type_string.h" namespace doris { // change if need diff --git a/be/benchmark/benchmark_string.hpp b/be/benchmark/benchmark_string.hpp index fc993e757445a6..bab869b693de5b 100644 --- a/be/benchmark/benchmark_string.hpp +++ b/be/benchmark/benchmark_string.hpp @@ -20,8 +20,8 @@ #include #include -#include 
"vec/functions/function_string.cpp" -#include "vec/functions/string_hex_util.h" +#include "exprs/function/function_string.cpp" +#include "exprs/function/string_hex_util.h" namespace doris { diff --git a/be/benchmark/binary_cast_benchmark.hpp b/be/benchmark/binary_cast_benchmark.hpp index cc5874a82ca44c..fec9ddb949aecd 100644 --- a/be/benchmark/binary_cast_benchmark.hpp +++ b/be/benchmark/binary_cast_benchmark.hpp @@ -21,7 +21,7 @@ #include #include -#include "util/binary_cast.hpp" +#include "core/binary_cast.hpp" namespace doris { diff --git a/be/src/core/column/column_execute_util.h b/be/src/core/column/column_execute_util.h index d8f33782c48c1a..187f439d2f72b2 100644 --- a/be/src/core/column/column_execute_util.h +++ b/be/src/core/column/column_execute_util.h @@ -61,6 +61,30 @@ struct ColumnElementView { // 3. Nullable(ColumnInt32) // 4. Const(Nullable(ColumnInt32)) (although this case is rare, it can still occur; many of our previous code did not consider this) // You can use is_null_at and value_at to get the data at the corresponding position +// +// ====== Performance Guide: When to Use ColumnView ====== +// +// 1. Expensive per-element operations (e.g. geo functions, complex string ops): +// Use ColumnView freely — its overhead is negligible relative to the work. +// +// 2. Cheap per-element operations that the compiler can inline (e.g. simple arithmetic): +// +// a) Inputs are NOT nullable (e.g. the function framework already strips nullable): +// Safe to use. The compiler optimizes the is_const branch into code equivalent +// to hand-written direct array access (verified via assembly and benchmarks). +// +// b) Inputs involve nullable columns: +// - Unary operations: safe to use, the compiler still optimizes effectively. +// - Binary / ternary operations: the combined is_null_at checks across multiple +// columns inhibit compiler vectorization and branch optimization, causing +// significant regression (~1.4x for binary, ~1.8x for ternary in benchmarks). 
+// In this case, hand-written column access is recommended for best performance. +// +// In summary, ColumnView is designed to eliminate the combinatorial explosion of +// handling 4 column forms. It is suitable for the vast majority of use cases. +// Only the specific combination of "cheap computation + nullable + multi-column" +// requires weighing whether to hand-write the access code. +// ====== End of Performance Guide ====== template struct ColumnView {