diff --git a/be/src/core/block/block.cpp b/be/src/core/block/block.cpp index a0598dcc8121d5..4c0de4418c181a 100644 --- a/be/src/core/block/block.cpp +++ b/be/src/core/block/block.cpp @@ -41,6 +41,7 @@ #include "core/column/column_const.h" #include "core/column/column_nothing.h" #include "core/column/column_nullable.h" +#include "core/column/column_variant.h" #include "core/column/column_vector.h" #include "core/data_type/data_type_factory.hpp" #include "core/data_type/data_type_nullable.h" @@ -59,6 +60,33 @@ namespace doris::segment_v2 { enum CompressionTypePB : int; } // namespace doris::segment_v2 namespace doris { +namespace { + +ColumnPtr clone_finalized_variant_for_serialization(const ColumnWithTypeAndName& column) { + if (remove_nullable(column.type)->get_primitive_type() != PrimitiveType::TYPE_VARIANT) { + return column.column; + } + + const auto& column_ptr = column.column; + const auto* nullable = check_and_get_column(*column_ptr); + const IColumn* nested_column = + nullable != nullptr ? &nullable->get_nested_column() : column_ptr.get(); + const auto* variant = check_and_get_column(*nested_column); + if (variant == nullptr || variant->is_finalized()) { + return column_ptr; + } + + auto finalized_variant = variant->clone_finalized(); + if (nullable == nullptr) { + return finalized_variant; + } + + auto null_map = nullable->get_null_map_column_ptr()->clone_resized(nullable->size()); + return ColumnNullable::create(std::move(finalized_variant), std::move(null_map)); +} + +} // namespace + template void clear_blocks(moodycamel::ConcurrentQueue& blocks, RuntimeProfile::Counter* memory_used_counter = nullptr) { @@ -1012,10 +1040,23 @@ Status Block::serialize(int be_exec_version, PBlock* pblock, bool allow_transfer_large_data) const { RETURN_IF_ERROR(BeExecVersionManager::check_be_exec_version(be_exec_version)); pblock->set_be_exec_version(be_exec_version); + Block block_for_serialization; + const Block* serialized_block = this; + for (size_t i = 0; i < data.size(); ++i) { + auto serialized_column = clone_finalized_variant_for_serialization(data[i]); + if (serialized_column.get() == data[i].column.get()) { + continue; + } + if (serialized_block == this) { + block_for_serialization = *this; + serialized_block = &block_for_serialization; + } + block_for_serialization.replace_by_position(i, std::move(serialized_column)); + } // calc uncompressed size for allocation size_t content_uncompressed_size = 0; - for (const auto& c : *this) { + for (const auto& c : *serialized_block) { PColumnMeta* pcm = pblock->add_column_metas(); c.to_pb_column_meta(pcm); DCHECK(pcm->type() != PGenericType::UNKNOWN) << " forget to set pb type"; @@ -1038,7 +1079,7 @@ Status Block::serialize(int be_exec_version, PBlock* pblock, } char* buf = column_values.data(); - for (const auto& c : *this) { + for (const auto& c : *serialized_block) { buf = c.type->serialize(*(c.column), buf, pblock->be_exec_version()); } *uncompressed_bytes = content_uncompressed_size; diff --git a/be/src/core/column/column_nullable.cpp b/be/src/core/column/column_nullable.cpp index d3ce21fc8e39ab..08e08362a9f507 100644 --- a/be/src/core/column/column_nullable.cpp +++ b/be/src/core/column/column_nullable.cpp @@ -183,17 +183,14 @@ void ColumnNullable::update_crc32c_batch(uint32_t* __restrict hashes, const uint8_t* __restrict /* null_map */) const { const auto* __restrict real_null_data = get_null_map_column().get_data().data(); if (_nested_column->support_replace_column_null_data()) { - // nullmap process is slow, replace null data to default value to avoid nullmap process - // This is an intentional in-place mutation inside a logically-const hash computation: - // null positions are overwritten with defaults so the inner hash loop needs no null checks. - // The invariant is that a column instance is not hashed concurrently through the same - // owner while this per-block hash path runs. Shared aliases are detached by mutate() - // before this normalized nested column is written back. - auto nested_mut = std::move(*static_cast(_nested_column)).mutate(); - nested_mut->replace_column_null_data(real_null_data); - static_cast(const_cast(_nested_column)) = - std::move(nested_mut); - _nested_column->update_crc32c_batch(hashes, nullptr); + if (!has_null()) { + _nested_column->update_crc32c_batch(hashes, nullptr); + return; + } + auto nested_column = is_exclusive() ? _nested_column->assert_mutable() + : _nested_column->clone_resized(_nested_column->size()); + nested_column->replace_column_null_data(real_null_data); + nested_column->update_crc32c_batch(hashes, nullptr); } else { auto s = size(); for (int i = 0; i < s; ++i) { diff --git a/be/src/core/column/column_variant.cpp b/be/src/core/column/column_variant.cpp index 88cd501acb3bf6..0e4dde6733ccb9 100644 --- a/be/src/core/column/column_variant.cpp +++ b/be/src/core/column/column_variant.cpp @@ -80,6 +80,14 @@ namespace doris { namespace { +IColumn::WrappedPtr clone_column_deep(const IColumn::WrappedPtr& column) { + auto full_column = column->convert_to_full_column_if_const(); + auto cloned = full_column->clone_resized(full_column->size()); + cloned->for_each_subcolumn( + [](IColumn::WrappedPtr& subcolumn) { subcolumn = clone_column_deep(subcolumn); }); + return cloned; +} + DataTypePtr create_array_of_type(PrimitiveType type, size_t num_dimensions, bool is_nullable, int precision = -1, int scale = -1) { DataTypePtr result = type == PrimitiveType::INVALID_TYPE @@ -2841,4 +2849,33 @@ MutableColumnPtr ColumnVariant::clone() const { return res; } +MutableColumnPtr ColumnVariant::clone_finalized() const { + auto res = ColumnVariant::create(_max_subcolumns_count, _enable_doc_mode); + Subcolumns new_subcolumns; + for (const auto& subcolumn : subcolumns) { + auto new_subcolumn = subcolumn->data; + for (auto& part : new_subcolumn.data) { + part = clone_column_deep(part); + } + if (subcolumn->data.is_root) { + new_subcolumns.create_root(std::move(new_subcolumn)); + } else if (!new_subcolumns.add(subcolumn->path, std::move(new_subcolumn))) { + throw doris::Exception(ErrorCode::INTERNAL_ERROR, + "add path {} is error in clone_finalized()", + subcolumn->path.get_path()); + } + } + if (!new_subcolumns.get_root()) { + throw doris::Exception(ErrorCode::INTERNAL_ERROR, "root is nullptr in clone_finalized()"); + } + res->subcolumns = std::move(new_subcolumns); + res->serialized_sparse_column = clone_column_deep(serialized_sparse_column); + res->serialized_doc_value_column = clone_column_deep(serialized_doc_value_column); + res->set_num_rows(num_rows); + + ENABLE_CHECK_CONSISTENCY(res.get()); + res->finalize(FinalizeMode::READ_MODE); + return res; +} + } // namespace doris diff --git a/be/src/core/column/column_variant.h b/be/src/core/column/column_variant.h index 1d5c4eed1378a2..a21d8a3622814c 100644 --- a/be/src/core/column/column_variant.h +++ b/be/src/core/column/column_variant.h @@ -440,11 +440,7 @@ class ColumnVariant final : public COWHelper { bool is_finalized() const; - MutableColumnPtr clone_finalized() const { - auto finalized = IColumn::mutate(get_ptr()); - static_cast(finalized.get())->finalize(FinalizeMode::READ_MODE); - return finalized; - } + MutableColumnPtr clone_finalized() const; MutableColumnPtr clone() const override; diff --git a/be/src/core/data_type/data_type_variant.cpp b/be/src/core/data_type/data_type_variant.cpp index 133226def49d7c..fff6ff53408cc2 100644 --- a/be/src/core/data_type/data_type_variant.cpp +++ b/be/src/core/data_type/data_type_variant.cpp @@ -61,14 +61,16 @@ bool DataTypeVariant::equals(const IDataType& rhs) const { int64_t DataTypeVariant::get_uncompressed_serialized_bytes(const IColumn& column, int be_exec_version) const { - const auto& column_variant = assert_cast(column); - if (!column_variant.is_finalized()) { - // Icolumn originates from MutablePtr or block, and therefore can be modified. - // todo: We should reconsider the logic here, why are we using finalize() in this context? - const_cast(column_variant).finalize(); + const auto* column_variant = assert_cast(&column); + MutableColumnPtr finalized_column; + if (!column_variant->is_finalized()) { + // Local exchange can share the same block across downstream tasks. Serialize a private + // finalized copy so serialization never mutates shared variant columns. + finalized_column = column_variant->clone_finalized(); + column_variant = assert_cast(finalized_column.get()); } - const auto& subcolumns = column_variant.get_subcolumns(); + const auto& subcolumns = column_variant->get_subcolumns(); size_t size = 0; size += sizeof(uint32_t); @@ -95,26 +97,28 @@ int64_t DataTypeVariant::get_uncompressed_serialized_bytes(const IColumn& column // sparse column // TODO make compability with sparse column size += ColumnVariant::get_binary_column_type()->get_uncompressed_serialized_bytes( - *column_variant.get_sparse_column(), be_exec_version); + *column_variant->get_sparse_column(), be_exec_version); size += ColumnVariant::get_binary_column_type()->get_uncompressed_serialized_bytes( - *column_variant.get_doc_value_column(), be_exec_version); + *column_variant->get_doc_value_column(), be_exec_version); return size; } char* DataTypeVariant::serialize(const IColumn& column, char* buf, int be_exec_version) const { - const auto& column_variant = assert_cast(column); - if (!column_variant.is_finalized()) { - // Icolumn originates from block, and therefore can be modified. - // todo: We should reconsider the logic here, why are we using finalize() in this context? - const_cast(column_variant).finalize(); + const auto* column_variant = assert_cast(&column); + MutableColumnPtr finalized_column; + if (!column_variant->is_finalized()) { + // Local exchange can share the same block across downstream tasks. Serialize a private + // finalized copy so serialization never mutates shared variant columns. + finalized_column = column_variant->clone_finalized(); + column_variant = assert_cast(finalized_column.get()); } #ifndef NDEBUG // DCHECK size - column_variant.check_consistency(); + column_variant->check_consistency(); #endif - const auto& subcolumns = column_variant.get_subcolumns(); + const auto& subcolumns = column_variant->get_subcolumns(); char* size_pos = buf; buf += sizeof(uint32_t); @@ -147,15 +151,15 @@ char* DataTypeVariant::serialize(const IColumn& column, char* buf, int be_exec_v // Safe case unaligned_store(size_pos, static_cast(num_of_columns)); // serialize num of rows, only take effect when subcolumns empty - unaligned_store(buf, static_cast(column_variant.rows())); + unaligned_store(buf, static_cast(column_variant->rows())); buf += sizeof(uint32_t); // serialize sparse column // TODO make compability with sparse column - buf = ColumnVariant::get_binary_column_type()->serialize(*column_variant.get_sparse_column(), - buf, be_exec_version); - buf = ColumnVariant::get_binary_column_type()->serialize(*column_variant.get_doc_value_column(), + buf = ColumnVariant::get_binary_column_type()->serialize(*column_variant->get_sparse_column(), buf, be_exec_version); + buf = ColumnVariant::get_binary_column_type()->serialize( + *column_variant->get_doc_value_column(), buf, be_exec_version); return buf; } diff --git a/be/src/exec/operator/nested_loop_join_probe_operator.cpp b/be/src/exec/operator/nested_loop_join_probe_operator.cpp index c0203a74f6f186..05b05d237fbd9f 100644 --- a/be/src/exec/operator/nested_loop_join_probe_operator.cpp +++ b/be/src/exec/operator/nested_loop_join_probe_operator.cpp @@ -49,8 +49,47 @@ ColumnPtr align_eval_column_nullable(const ColumnWithTypeAndName& target, const return column; } +IColumn::WrappedPtr clone_column_deep(const IColumn::WrappedPtr& column) { + auto full_column = column->convert_to_full_column_if_const(); + auto cloned = full_column->clone_resized(full_column->size()); + cloned->for_each_subcolumn( + [](IColumn::WrappedPtr& subcolumn) { subcolumn = clone_column_deep(subcolumn); }); + return cloned; +} + +Status copy_lazy_probe_block_rows(const Block& src, Block* dst, + const std::set& lazy_eval_column_ids, + const std::set& materialize_column_ids) { + RETURN_IF_CATCH_EXCEPTION({ + ColumnsWithTypeAndName copied_columns; + copied_columns.reserve(src.columns()); + const auto rows = src.rows(); + for (size_t column_idx = 0; column_idx < src.columns(); ++column_idx) { + const auto& src_column = src.get_by_position(column_idx); + const auto column_id = cast_set(column_idx); + const bool should_copy_column = + lazy_eval_column_ids.find(column_id) != lazy_eval_column_ids.end() || + materialize_column_ids.find(column_id) != materialize_column_ids.end(); + ColumnPtr column; + if (should_copy_column) { + column = clone_column_deep(src_column.column); + } else { + column = src_column.type->create_column_const_with_default_value(rows); + } + copied_columns.emplace_back(std::move(column), src_column.type, src_column.name); + } + *dst = Block(std::move(copied_columns)); + }); + return Status::OK(); +} + void append_many_from_source(MutableColumnPtr& dst_column, const ColumnWithTypeAndName& src_column, size_t row, size_t rows) { + if (src_column.column->is_nullable() && src_column.column->is_null_at(row)) { + DCHECK(dst_column->is_nullable()); + dst_column->insert_many_defaults(rows); + return; + } if (!src_column.column->is_nullable() && dst_column->is_nullable()) { const auto origin_size = dst_column->size(); auto* nullable_column = assert_cast(dst_column.get()); @@ -67,6 +106,23 @@ void append_filtered_from_source(MutableColumnPtr& dst_column, if (selected_rows == 0) { return; } + if (src_column.column->is_nullable()) { + DCHECK(dst_column->is_nullable()); + size_t appended_rows = 0; + for (size_t row = 0; row < filter.size() && appended_rows < selected_rows; ++row) { + if (!filter[row]) { + continue; + } + if (src_column.column->is_null_at(row)) { + dst_column->insert_default(); + } else { + dst_column->insert_from(*src_column.column, row); + } + ++appended_rows; + } + DCHECK_EQ(appended_rows, selected_rows); + return; + } auto filtered_column = src_column.column->filter(filter, selected_rows); if (!src_column.column->is_nullable() && dst_column->is_nullable()) { const auto origin_size = dst_column->size(); @@ -131,6 +187,7 @@ Status NestedLoopJoinProbeLocalState::close(RuntimeState* state) { return Status::OK(); } _child_block->clear(); + _lazy_probe_block.clear(); return JoinProbeLocalState::close( state); @@ -889,7 +946,7 @@ Status NestedLoopJoinProbeLocalState::generate_inner_join_block_data(RuntimeStat _probe_side_process_count = 0; DCHECK(!_need_more_input_data || !_matched_rows_done); auto& p = _parent->cast(); - auto* probe_block = _child_block.get(); + auto* probe_block = p._enable_lazy_materialize ? &_lazy_probe_block : _child_block.get(); if (p._enable_lazy_materialize) { if (!_matched_rows_done && !_need_more_input_data) { @@ -931,7 +988,7 @@ Status NestedLoopJoinProbeLocalState::generate_other_join_block_data(RuntimeStat DCHECK(!_need_more_input_data || !_matched_rows_done); auto& p = _parent->cast(); - auto* probe_block = _child_block.get(); + auto* probe_block = p._enable_lazy_materialize ? &_lazy_probe_block : _child_block.get(); if (p._enable_lazy_materialize) { if (!_matched_rows_done && !_need_more_input_data) { @@ -1230,7 +1287,6 @@ Status NestedLoopJoinProbeOperatorX::push(doris::RuntimeState* state, Block* blo bool eos) const { auto& local_state = get_local_state(state); COUNTER_UPDATE(local_state._probe_rows_counter, block->rows()); - COUNTER_SET(local_state._memory_used_counter, block->allocated_bytes()); SCOPED_PEAK_MEM(&local_state.estimate_memory_usage()); local_state._cur_probe_row_visited_flags.resize(block->rows()); std::fill(local_state._cur_probe_row_visited_flags.begin(), @@ -1243,6 +1299,14 @@ Status NestedLoopJoinProbeOperatorX::push(doris::RuntimeState* state, Block* blo local_state._probe_block_pos = 0; local_state._need_more_input_data = false; local_state._shared_state->probe_side_eos = eos; + if (_enable_lazy_materialize) { + RETURN_IF_ERROR(copy_lazy_probe_block_rows(*block, &local_state._lazy_probe_block, + _lazy_eval_column_ids, _materialize_column_ids)); + } + COUNTER_SET(local_state._memory_used_counter, + block->allocated_bytes() + + (_enable_lazy_materialize ? local_state._lazy_probe_block.allocated_bytes() + : 0)); if (!_is_output_probe_side_only) { auto func = [&](auto&& join_op_variants, auto set_build_side_flag, diff --git a/be/src/exec/operator/nested_loop_join_probe_operator.h b/be/src/exec/operator/nested_loop_join_probe_operator.h index 890b6e2cbe1c7a..646a22bff7890d 100644 --- a/be/src/exec/operator/nested_loop_join_probe_operator.h +++ b/be/src/exec/operator/nested_loop_join_probe_operator.h @@ -233,6 +233,7 @@ class NestedLoopJoinProbeLocalState final } bool _matched_rows_done; + Block _lazy_probe_block; int _probe_block_start_pos = 0; int _probe_block_pos; // current scan pos in _probe_block int _probe_side_process_count = 0; diff --git a/be/src/exprs/function/cast/cast_to_variant.h b/be/src/exprs/function/cast/cast_to_variant.h index 6c6ed1743fcdf0..0efc29047b15f9 100644 --- a/be/src/exprs/function/cast/cast_to_variant.h +++ b/be/src/exprs/function/cast/cast_to_variant.h @@ -32,45 +32,52 @@ inline Status cast_from_variant_impl(FunctionContext* context, Block& block, auto& col_with_type_and_name = block.get_by_position(arguments[0]); auto& col_from = col_with_type_and_name.column; const IColumn* variant_column = col_from.get(); - if (const auto* nullable = check_and_get_column(*variant_column)) { + const auto* nullable = check_and_get_column(*variant_column); + if (nullable != nullptr) { variant_column = &nullable->get_nested_column(); } + const auto* variant = assert_cast(variant_column); + ColumnPtr col_to = data_type_to->create_column(); - if (!assert_cast(*variant_column).is_finalized()) { - // ColumnVariant should be finalized before parsing, finalize maybe modify original column structure - auto mutable_column = IColumn::mutate(std::move(col_with_type_and_name.column)); - if (auto* nullable = check_and_get_column(*mutable_column)) { - const auto& const_nullable = *nullable; - auto nested_column = IColumn::mutate(const_nullable.get_nested_column_ptr()); - assert_cast(*nested_column).finalize(); - ColumnPtr nested_column_ptr = std::move(nested_column); - nullable->change_nested_column(nested_column_ptr); + ColumnPtr finalized_input_column; + if (!variant->is_finalized()) { + // Local exchange can share the same input block across multiple downstream tasks. + // Finalize a private copy so variant casts never mutate shared input columns. + auto finalized_variant = variant->clone_finalized(); + variant = assert_cast(finalized_variant.get()); + if (nullable != nullptr) { + auto cloned_null_map = + nullable->get_null_map_column_ptr()->clone_resized(input_rows_count); + finalized_input_column = ColumnNullable::create(std::move(finalized_variant), + std::move(cloned_null_map)); } else { - assert_cast(*mutable_column).finalize(); + finalized_input_column = std::move(finalized_variant); } - col_with_type_and_name.column = std::move(mutable_column); - } - - variant_column = col_with_type_and_name.column.get(); - if (const auto* nullable = check_and_get_column(*variant_column)) { - variant_column = &nullable->get_nested_column(); } - const auto& variant = assert_cast(*variant_column); - ColumnPtr col_to = data_type_to->create_column(); + auto execute_on_finalized_input = [&](auto&& executor) -> Status { + if (!finalized_input_column) { + return executor(block); + } + Block finalized_block = block; + finalized_block.replace_by_position(arguments[0], finalized_input_column); + RETURN_IF_ERROR(executor(finalized_block)); + block.replace_by_position(result, finalized_block.get_by_position(result).column); + return Status::OK(); + }; // It's important to convert as many elements as possible in this context. For instance, // if the root of this variant column is a number column, converting it to a number column // is acceptable. However, if the destination type is a string and root is none scalar root, then // we should convert the entire tree to a string. - bool is_root_valuable = variant.is_scalar_variant() || - (!variant.is_null_root() && - variant.get_root_type()->get_primitive_type() != INVALID_TYPE && + bool is_root_valuable = variant->is_scalar_variant() || + (!variant->is_null_root() && + variant->get_root_type()->get_primitive_type() != INVALID_TYPE && !is_string_type(data_type_to->get_primitive_type()) && data_type_to->get_primitive_type() != TYPE_JSONB); if (is_root_valuable) { - ColumnPtr nested = variant.get_root(); - auto nested_from_type = variant.get_root_type(); + ColumnPtr nested = variant->get_root(); + auto nested_from_type = variant->get_root_type(); // DCHECK(nested_from_type->is_nullable()); DCHECK(!data_type_to->is_nullable()); auto new_context = context == nullptr ? nullptr : context->clone(); @@ -105,16 +112,21 @@ inline Status cast_from_variant_impl(FunctionContext* context, Block& block, {0, 1}, input_rows_count); } } else { - if (variant.only_have_default_values()) { + if (variant->only_have_default_values()) { col_to->assert_mutable()->insert_many_defaults(input_rows_count); col_to = make_nullable(col_to, true); } else if (is_string_type(data_type_to->get_primitive_type())) { // serialize to string - return CastToStringFunction::execute_impl(context, block, arguments, result, - input_rows_count); + return execute_on_finalized_input([&](Block& finalized_block) { + return CastToStringFunction::execute_impl(context, finalized_block, arguments, + result, input_rows_count); + }); } else if (data_type_to->get_primitive_type() == TYPE_JSONB) { // serialize to json by parsing - return cast_from_generic_to_jsonb(context, block, arguments, result, input_rows_count); + return execute_on_finalized_input([&](Block& finalized_block) { + return cast_from_generic_to_jsonb(context, finalized_block, arguments, result, + input_rows_count); + }); } else if (!data_type_to->is_nullable() && !is_string_type(data_type_to->get_primitive_type())) { // other types diff --git a/be/test/core/column/column_nullable_test.cpp b/be/test/core/column/column_nullable_test.cpp index 799cc6a9826059..109e1087621c97 100644 --- a/be/test/core/column/column_nullable_test.cpp +++ b/be/test/core/column/column_nullable_test.cpp @@ -22,6 +22,7 @@ #include #include "common/status.h" +#include "core/assert_cast.h" #include "core/column/column_nullable_test.h" #include "core/column/predicate_column.h" #include "core/data_type/data_type.h" @@ -131,6 +132,36 @@ TEST(ColumnNullableTest, SharedCreatePreservesImmutableSubcolumns) { EXPECT_EQ(null_map_alias->size(), 1); } +TEST(ColumnNullableTest, UpdateCrc32cBatchDoesNotReplaceSharedNestedNullData) { + auto nested = ColumnInt64::create(); + nested->insert_value(10); + nested->insert_value(20); + nested->insert_value(30); + ColumnPtr nested_alias = std::move(nested); + + auto null_map = ColumnUInt8::create(); + null_map->insert_value(0); + null_map->insert_value(1); + null_map->insert_value(0); + ColumnPtr null_map_alias = std::move(null_map); + + ColumnPtr nullable_column = ColumnNullable::create(nested_alias, null_map_alias); + ColumnPtr nullable_alias = nullable_column; + const auto& nullable = assert_cast(*nullable_column); + ASSERT_EQ(nullable_alias->size(), nullable.size()); + std::vector hashes(nullable.size()); + nullable.update_crc32c_batch(hashes.data(), nullptr); + + const auto& nested_after = assert_cast(nullable.get_nested_column()); + EXPECT_EQ(nested_after.get_element(0), 10); + EXPECT_EQ(nested_after.get_element(1), 20); + EXPECT_EQ(nested_after.get_element(2), 30); + const auto& nested_alias_after = assert_cast(*nested_alias); + EXPECT_EQ(nested_alias_after.get_element(0), 10); + EXPECT_EQ(nested_alias_after.get_element(1), 20); + EXPECT_EQ(nested_alias_after.get_element(2), 30); +} + TEST(ColumnNullableTest, append_data_by_selector) { auto srt_column = ColumnHelper::create_nullable_column( {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, diff --git a/be/test/core/column/column_variant_test.cpp b/be/test/core/column/column_variant_test.cpp index dff9e2c0ae5a77..4db42b8a3efce1 100644 --- a/be/test/core/column/column_variant_test.cpp +++ b/be/test/core/column/column_variant_test.cpp @@ -27,8 +27,11 @@ #include #include #include +#include +#include "agent/be_exec_version_manager.h" #include "common/cast_set.h" +#include "core/block/block.h" #include "core/column/column_variant.cpp" #include "core/column/common_column_test.h" #include "core/column/subcolumn_tree.h" @@ -40,9 +43,11 @@ #include "core/types.h" #include "core/value/jsonb_value.h" #include "exec/common/variant_util.h" +#include "gen_cpp/data.pb.h" #include "storage/olap_common.h" #include "testutil/test_util.h" #include "testutil/variant_util.h" +#include "util/block_compression.h" using namespace doris; namespace doris { @@ -2025,6 +2030,58 @@ TEST_F(ColumnVariantTest, clone_finalized) { test_func(std::move(cloned_object)); } +TEST_F(ColumnVariantTest, clone_finalized_deep_copies_columns) { + auto source_column = VariantUtil::construct_advanced_varint_column(); + source_column->finalize(ColumnVariant::FinalizeMode::READ_MODE); + + auto cloned = source_column->clone_finalized(); + auto* cloned_variant = assert_cast(cloned.get()); + EXPECT_TRUE(cloned_variant->is_finalized()); + + for (const auto& source_subcolumn : source_column->get_subcolumns()) { + const auto* cloned_subcolumn = + cloned_variant->get_subcolumns().find_exact(source_subcolumn->path); + ASSERT_NE(cloned_subcolumn, nullptr); + EXPECT_NE(source_subcolumn->data.get_finalized_column_ptr().get(), + cloned_subcolumn->data.get_finalized_column_ptr().get()) + << source_subcolumn->path.get_path(); + } + EXPECT_NE(source_column->get_sparse_column().get(), cloned_variant->get_sparse_column().get()); + EXPECT_NE(source_column->get_doc_value_column().get(), + cloned_variant->get_doc_value_column().get()); +} + +TEST_F(ColumnVariantTest, serialize_does_not_finalize_source_column) { + auto source_column = VariantUtil::construct_advanced_varint_column(); + ASSERT_FALSE(source_column->is_finalized()); + + const int be_exec_version = BeExecVersionManager::get_newest_version(); + const auto size = + dt_variant->get_uncompressed_serialized_bytes(*source_column, be_exec_version); + EXPECT_FALSE(source_column->is_finalized()); + + auto buffer = std::make_unique(size); + dt_variant->serialize(*source_column, buffer.get(), be_exec_version); + EXPECT_FALSE(source_column->is_finalized()); +} + +TEST_F(ColumnVariantTest, block_serialize_does_not_finalize_source_column) { + auto source_column = VariantUtil::construct_advanced_varint_column(); + ASSERT_FALSE(source_column->is_finalized()); + + Block block({{source_column->get_ptr(), dt_variant, "variant_col"}}); + PBlock pblock; + size_t uncompressed_bytes = 0; + size_t compressed_bytes = 0; + int64_t compress_time = 0; + auto status = block.serialize(BeExecVersionManager::get_newest_version(), &pblock, + &uncompressed_bytes, &compressed_bytes, &compress_time, + segment_v2::NO_COMPRESSION); + ASSERT_TRUE(status.ok()) << status; + EXPECT_FALSE(source_column->is_finalized()); + EXPECT_GT(pblock.column_values().size(), 0); +} + TEST_F(ColumnVariantTest, sanitize) { auto test_func = [](const auto& source_column) { auto src_size = source_column->size(); diff --git a/be/test/exprs/function/cast/function_variant_cast_test.cpp b/be/test/exprs/function/cast/function_variant_cast_test.cpp index 960637bf1507d0..51034ad6e0319e 100644 --- a/be/test/exprs/function/cast/function_variant_cast_test.cpp +++ b/be/test/exprs/function/cast/function_variant_cast_test.cpp @@ -284,6 +284,63 @@ TEST(FunctionVariantCast, CastFromVariant) { } } +TEST(FunctionVariantCast, CastFromVariantDoesNotFinalizeSourceColumn) { + auto variant_type = std::make_shared(); + auto int32_type = std::make_shared(); + auto string_type = std::make_shared(); + auto variant_col = construct_basic_varint_column(); + + ASSERT_FALSE(variant_col->is_finalized()); + + { + ColumnsWithTypeAndName arguments {{variant_col->get_ptr(), variant_type, "variant_col"}, + {nullptr, int32_type, "int32_type"}}; + + auto function = + SimpleFunctionFactory::instance().get_function("CAST", arguments, int32_type); + ASSERT_NE(function, nullptr); + + Block block {arguments}; + size_t result_column = block.columns(); + block.insert({nullptr, int32_type, "result"}); + + RuntimeState state; + auto ctx = FunctionContext::create_context(&state, {}, {}); + ASSERT_TRUE( + function->execute(ctx.get(), block, {0}, result_column, variant_col->size()).ok()); + + EXPECT_FALSE(variant_col->is_finalized()); + + auto result_col = block.get_by_position(result_column).column; + ASSERT_NE(result_col.get(), nullptr); + ASSERT_EQ(result_col->size(), variant_col->size()); + } + + { + ColumnsWithTypeAndName arguments {{variant_col->get_ptr(), variant_type, "variant_col"}, + {nullptr, string_type, "string_type"}}; + + auto function = + SimpleFunctionFactory::instance().get_function("CAST", arguments, string_type); + ASSERT_NE(function, nullptr); + + Block block {arguments}; + size_t result_column = block.columns(); + block.insert({nullptr, string_type, "result"}); + + RuntimeState state; + auto ctx = FunctionContext::create_context(&state, {}, {}); + ASSERT_TRUE( + function->execute(ctx.get(), block, {0}, result_column, variant_col->size()).ok()); + + EXPECT_FALSE(variant_col->is_finalized()); + + auto result_col = block.get_by_position(result_column).column; + ASSERT_NE(result_col.get(), nullptr); + ASSERT_EQ(result_col->size(), variant_col->size()); + } +} + TEST(FunctionVariantCast, CastVariantWithNull) { auto variant_type = std::make_shared(); auto int32_type = std::make_shared();