diff --git a/be/src/exec/exchange/local_exchange_sink_operator.cpp b/be/src/exec/exchange/local_exchange_sink_operator.cpp index 2f9443208e734a..b99b6cd14f4a46 100644 --- a/be/src/exec/exchange/local_exchange_sink_operator.cpp +++ b/be/src/exec/exchange/local_exchange_sink_operator.cpp @@ -37,24 +37,30 @@ std::vector LocalExchangeSinkLocalState::dependencies() const { return deps; } -Status LocalExchangeSinkOperatorX::init(RuntimeState* state, ExchangeType type, - const int num_buckets, const bool use_global_hash_shuffle, +Status LocalExchangeSinkOperatorX::init(RuntimeState* state, TLocalPartitionType::type type, + const int num_buckets, const std::map& shuffle_idx_to_instance_idx) { + DCHECK(!_planned_by_fe); _name = "LOCAL_EXCHANGE_SINK_OPERATOR(" + get_exchange_type_name(type) + ")"; _type = type; - if (_type == ExchangeType::HASH_SHUFFLE) { - _shuffle_idx_to_instance_idx.clear(); - _use_global_shuffle = use_global_hash_shuffle; + if (_type == TLocalPartitionType::GLOBAL_EXECUTION_HASH_SHUFFLE) { // For shuffle join, if data distribution has been broken by previous operator, we // should use a HASH_SHUFFLE local exchanger to shuffle data again. To be mentioned, // we should use map shuffle idx to instance idx because all instances will be // distributed to all BEs. Otherwise, we should use shuffle idx directly. - if (use_global_hash_shuffle) { - _shuffle_idx_to_instance_idx = shuffle_idx_to_instance_idx; + _shuffle_idx_to_instance_idx = shuffle_idx_to_instance_idx; + if (state->query_options().__isset.enable_new_shuffle_hash_method && + state->query_options().enable_new_shuffle_hash_method) { + _partitioner = std::make_unique(_num_partitions); } else { - for (int i = 0; i < _num_partitions; i++) { - _shuffle_idx_to_instance_idx[i] = i; - } + _partitioner = + std::make_unique>(_num_partitions); + } + RETURN_IF_ERROR(_partitioner->init(_texprs)); + } else if (_type == TLocalPartitionType::LOCAL_EXECUTION_HASH_SHUFFLE) { + _shuffle_idx_to_instance_idx.clear(); + for (int i = 0; i < _num_partitions; i++) { + _shuffle_idx_to_instance_idx[i] = i; } if (state->query_options().__isset.enable_new_shuffle_hash_method && state->query_options().enable_new_shuffle_hash_method) { @@ -64,7 +70,7 @@ Status LocalExchangeSinkOperatorX::init(RuntimeState* state, ExchangeType type, std::make_unique>(_num_partitions); } RETURN_IF_ERROR(_partitioner->init(_texprs)); - } else if (_type == ExchangeType::BUCKET_HASH_SHUFFLE) { + } else if (_type == TLocalPartitionType::BUCKET_HASH_SHUFFLE) { DCHECK_GT(num_buckets, 0); _partitioner = std::make_unique>(num_buckets); RETURN_IF_ERROR(_partitioner->init(_texprs)); @@ -72,9 +78,33 @@ Status LocalExchangeSinkOperatorX::init(RuntimeState* state, ExchangeType type, return Status::OK(); } +Status LocalExchangeSinkOperatorX::init_partitioner(RuntimeState* state) { + DCHECK(_planned_by_fe); + // Set operator name to include exchange type (base class init(tnode) only sets generic name). + _name = "LOCAL_EXCHANGE_SINK_OPERATOR(" + get_exchange_type_name(_type) + ")"; + if (_type == TLocalPartitionType::GLOBAL_EXECUTION_HASH_SHUFFLE || + _type == TLocalPartitionType::LOCAL_EXECUTION_HASH_SHUFFLE) { + if (state->query_options().__isset.enable_new_shuffle_hash_method && + state->query_options().enable_new_shuffle_hash_method) { + _partitioner = std::make_unique(_num_partitions); + } else { + _partitioner = + std::make_unique>(_num_partitions); + } + RETURN_IF_ERROR(_partitioner->init(_texprs)); + } else if (_type == TLocalPartitionType::BUCKET_HASH_SHUFFLE) { + DCHECK_GT(_num_partitions, 0); + _partitioner = std::make_unique>(_num_partitions); + RETURN_IF_ERROR(_partitioner->init(_texprs)); + } + return Status::OK(); +} + Status LocalExchangeSinkOperatorX::prepare(RuntimeState* state) { RETURN_IF_ERROR(DataSinkOperatorX::prepare(state)); - if (_type == ExchangeType::HASH_SHUFFLE || _type == ExchangeType::BUCKET_HASH_SHUFFLE) { + if (_type == TLocalPartitionType::GLOBAL_EXECUTION_HASH_SHUFFLE || + _type == TLocalPartitionType::LOCAL_EXECUTION_HASH_SHUFFLE || + _type == TLocalPartitionType::BUCKET_HASH_SHUFFLE) { RETURN_IF_ERROR(_partitioner->prepare(state, _child->row_desc())); RETURN_IF_ERROR(_partitioner->open(state)); } @@ -88,11 +118,6 @@ Status LocalExchangeSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo SCOPED_TIMER(_init_timer); _compute_hash_value_timer = ADD_TIMER(custom_profile(), "ComputeHashValueTime"); _distribute_timer = ADD_TIMER(custom_profile(), "DistributeDataTime"); - if (_parent->cast()._type == ExchangeType::HASH_SHUFFLE) { - custom_profile()->add_info_string( - "UseGlobalShuffle", - std::to_string(_parent->cast()._use_global_shuffle)); - } custom_profile()->add_info_string( "PartitionExprsSize", std::to_string(_parent->cast()._partitioned_exprs_num)); @@ -108,8 +133,7 @@ Status LocalExchangeSinkLocalState::open(RuntimeState* state) { _exchanger = _shared_state->exchanger.get(); DCHECK(_exchanger != nullptr); - if (_exchanger->get_type() == ExchangeType::HASH_SHUFFLE || - _exchanger->get_type() == ExchangeType::BUCKET_HASH_SHUFFLE) { + if (is_shuffled_exchange(_exchanger->get_type())) { auto& p = _parent->cast(); RETURN_IF_ERROR(p._partitioner->clone(state, _partitioner)); } @@ -132,12 +156,11 @@ Status LocalExchangeSinkLocalState::close(RuntimeState* state, Status exec_statu std::string LocalExchangeSinkLocalState::debug_string(int indentation_level) const { fmt::memory_buffer debug_string_buffer; fmt::format_to(debug_string_buffer, - "{}, _use_global_shuffle: {}, _channel_id: {}, _num_partitions: {}, " + "{}, _channel_id: {}, _num_partitions: {}, " "_num_senders: {}, _num_sources: {}, " "_running_sink_operators: {}, _running_source_operators: {}", - Base::debug_string(indentation_level), - _parent->cast()._use_global_shuffle, _channel_id, - _exchanger->_num_partitions, _exchanger->_num_senders, _exchanger->_num_sources, + Base::debug_string(indentation_level), _channel_id, _exchanger->_num_partitions, + _exchanger->_num_senders, _exchanger->_num_sources, _exchanger->_running_sink_operators, _exchanger->_running_source_operators); return fmt::to_string(debug_string_buffer); } diff --git a/be/src/exec/exchange/local_exchange_sink_operator.h b/be/src/exec/exchange/local_exchange_sink_operator.h index f1ca1935457ffc..eb698fd97361c4 100644 --- a/be/src/exec/exchange/local_exchange_sink_operator.h +++ b/be/src/exec/exchange/local_exchange_sink_operator.h @@ -79,6 +79,17 @@ class LocalExchangeSinkOperatorX final : public DataSinkOperatorX& shuffle_id_to_instance_idx) + : Base(operator_id, tnode, dest_id), + _type(tnode.local_exchange_node.partition_type), + _num_partitions(num_partitions), + _texprs(tnode.local_exchange_node.distribute_expr_lists), + _partitioned_exprs_num(tnode.local_exchange_node.distribute_expr_lists.size()), + _shuffle_idx_to_instance_idx(shuffle_id_to_instance_idx), + _planned_by_fe(true) {} #ifdef BE_TEST LocalExchangeSinkOperatorX(const std::vector& texprs, const std::map& bucket_seq_to_instance_idx) @@ -89,18 +100,19 @@ class LocalExchangeSinkOperatorX final : public DataSinkOperatorX& shuffle_idx_to_instance_idx) override; + // Initialize partitioner for FE-planned local exchange nodes. The FE-planned constructor + // already sets _type, _num_partitions, _texprs, and _shuffle_idx_to_instance_idx from the + // TPlanNode, but does not create the partitioner. This method creates the partitioner so + // that prepare() can call _partitioner->prepare() without null dereference. + Status init_partitioner(RuntimeState* state); + Status prepare(RuntimeState* state) override; Status sink(RuntimeState* state, Block* in_block, bool eos) override; @@ -115,13 +127,13 @@ class LocalExchangeSinkOperatorX final : public DataSinkOperatorX& _texprs; const size_t _partitioned_exprs_num; std::unique_ptr _partitioner; std::map _shuffle_idx_to_instance_idx; - bool _use_global_shuffle = false; + const bool _planned_by_fe = false; }; } // namespace doris diff --git a/be/src/exec/exchange/local_exchange_source_operator.cpp b/be/src/exec/exchange/local_exchange_source_operator.cpp index ad092656f21793..d6bf8ac7e21906 100644 --- a/be/src/exec/exchange/local_exchange_source_operator.cpp +++ b/be/src/exec/exchange/local_exchange_source_operator.cpp @@ -31,8 +31,7 @@ Status LocalExchangeSourceLocalState::init(RuntimeState* state, LocalStateInfo& DCHECK(_exchanger != nullptr); _get_block_failed_counter = ADD_COUNTER_WITH_LEVEL(custom_profile(), "GetBlockFailedTime", TUnit::UNIT, 1); - if (_exchanger->get_type() == ExchangeType::HASH_SHUFFLE || - _exchanger->get_type() == ExchangeType::BUCKET_HASH_SHUFFLE) { + if (is_shuffled_exchange(_exchanger->get_type())) { _copy_data_timer = ADD_TIMER(custom_profile(), "CopyDataTime"); } @@ -60,7 +59,7 @@ Status LocalExchangeSourceLocalState::close(RuntimeState* state) { } std::vector LocalExchangeSourceLocalState::dependencies() const { - if ((_exchanger->get_type() == ExchangeType::PASS_TO_ONE) && _channel_id != 0) { + if ((_exchanger->get_type() == TLocalPartitionType::PASS_TO_ONE) && _channel_id != 0) { // If this is a PASS_TO_ONE exchange and is not the first task, source operators always // return empty result so no dependencies here. return {}; diff --git a/be/src/exec/exchange/local_exchange_source_operator.h b/be/src/exec/exchange/local_exchange_source_operator.h index 58252b24ec2c23..2f3fc2d529403d 100644 --- a/be/src/exec/exchange/local_exchange_source_operator.h +++ b/be/src/exec/exchange/local_exchange_source_operator.h @@ -63,20 +63,47 @@ class LocalExchangeSourceOperatorX final : public OperatorX; LocalExchangeSourceOperatorX(ObjectPool* pool, int id) : Base(pool, id, id) {} + LocalExchangeSourceOperatorX(ObjectPool* pool, const TPlanNode& tnode, int operator_id, + const DescriptorTbl& descs) + : Base(pool, tnode, operator_id, descs), + _exchange_type(tnode.local_exchange_node.partition_type), + _planned_by_fe(true) {} #ifdef BE_TEST LocalExchangeSourceOperatorX() = default; #endif - Status init(ExchangeType type) override { + Status init(TLocalPartitionType::type type) override { + DCHECK(!_planned_by_fe); _op_name = "LOCAL_EXCHANGE_OPERATOR(" + get_exchange_type_name(type) + ")"; _exchange_type = type; return Status::OK(); } - Status prepare(RuntimeState* state) override { return Status::OK(); } + Status prepare(RuntimeState* state) override { + if (_planned_by_fe) { + RETURN_IF_ERROR(Base::prepare(state)); + // Base::prepare() resets _op_name from tnode node_type; restore the type-qualified name. + _op_name = "LOCAL_EXCHANGE_OPERATOR(" + get_exchange_type_name(_exchange_type) + ")"; + return Status::OK(); + } + return Status::OK(); + } const RowDescriptor& intermediate_row_desc() const override { + if (_planned_by_fe) { + return Base::intermediate_row_desc(); + } return _child->intermediate_row_desc(); } - RowDescriptor& row_descriptor() override { return _child->row_descriptor(); } - const RowDescriptor& row_desc() const override { return _child->row_desc(); } + RowDescriptor& row_descriptor() override { + if (_planned_by_fe) { + return Base::row_descriptor(); + } + return _child->row_descriptor(); + } + const RowDescriptor& row_desc() const override { + if (_planned_by_fe) { + return Base::row_desc(); + } + return _child->row_desc(); + } Status get_block(RuntimeState* state, Block* block, bool* eos) override; @@ -85,7 +112,8 @@ class LocalExchangeSourceOperatorX final : public OperatorX; template class Exchanger : public ExchangerBase { public: - Exchanger(int running_sink_operators, int num_partitions, int free_block_limit) - : ExchangerBase(running_sink_operators, num_partitions, free_block_limit) { + Exchanger(int running_sink_operators, int num_partitions, int free_block_limit, + TLocalPartitionType::type type) + : ExchangerBase(running_sink_operators, num_partitions, free_block_limit), _type(type) { _data_queue.resize(num_partitions); _m.resize(num_partitions); for (size_t i = 0; i < num_partitions; i++) { _m[i] = std::make_unique(); } } - Exchanger(int running_sink_operators, int num_sources, int num_partitions, int free_block_limit) - : ExchangerBase(running_sink_operators, num_sources, num_partitions, free_block_limit) { + Exchanger(int running_sink_operators, int num_sources, int num_partitions, int free_block_limit, + TLocalPartitionType::type type) + : ExchangerBase(running_sink_operators, num_sources, num_partitions, free_block_limit), + _type(type) { _data_queue.resize(num_sources); _m.resize(num_sources); for (size_t i = 0; i < num_sources; i++) { @@ -244,6 +247,7 @@ class Exchanger : public ExchangerBase { } } ~Exchanger() override = default; + TLocalPartitionType::type get_type() const override { return _type; } std::string data_queue_debug_string(int i) override { return fmt::format("Data Queue {}: [size approx = {}, eos = {}]", i, _data_queue[i].data_queue.size_approx(), _data_queue[i].eos); @@ -260,6 +264,7 @@ class Exchanger : public ExchangerBase { bool _dequeue_data(BlockType& block, bool* eos, Block* data_block, int channel_id); std::vector> _data_queue; std::vector> _m; + const TLocalPartitionType::type _type; }; class LocalExchangeSourceLocalState; @@ -269,9 +274,9 @@ class ShuffleExchanger : public Exchanger { public: ENABLE_FACTORY_CREATOR(ShuffleExchanger); ShuffleExchanger(int running_sink_operators, int num_sources, int num_partitions, - int free_block_limit) + int free_block_limit, TLocalPartitionType::type type) : Exchanger(running_sink_operators, num_sources, num_partitions, - free_block_limit) { + free_block_limit, type) { DCHECK_GT(num_partitions, 0); DCHECK_GT(num_sources, 0); _partition_rows_histogram.resize(running_sink_operators); @@ -283,7 +288,6 @@ class ShuffleExchanger : public Exchanger { Status get_block(RuntimeState* state, Block* block, bool* eos, Profile&& profile, SourceInfo&& source_info) override; void close(SourceInfo&& source_info) override; - ExchangeType get_type() const override { return ExchangeType::HASH_SHUFFLE; } protected: Status _split_rows(RuntimeState* state, const std::vector& channel_ids, Block* block, @@ -299,24 +303,22 @@ class BucketShuffleExchanger final : public ShuffleExchanger { BucketShuffleExchanger(int running_sink_operators, int num_sources, int num_partitions, int free_block_limit) : ShuffleExchanger(running_sink_operators, num_sources, num_partitions, - free_block_limit) {} + free_block_limit, TLocalPartitionType::BUCKET_HASH_SHUFFLE) {} ~BucketShuffleExchanger() override = default; - ExchangeType get_type() const override { return ExchangeType::BUCKET_HASH_SHUFFLE; } }; class PassthroughExchanger final : public Exchanger { public: ENABLE_FACTORY_CREATOR(PassthroughExchanger); PassthroughExchanger(int running_sink_operators, int num_partitions, int free_block_limit) - : Exchanger(running_sink_operators, num_partitions, - free_block_limit) {} + : Exchanger(running_sink_operators, num_partitions, free_block_limit, + TLocalPartitionType::PASSTHROUGH) {} ~PassthroughExchanger() override = default; Status sink(RuntimeState* state, Block* in_block, bool eos, Profile&& profile, SinkInfo& sink_info) override; Status get_block(RuntimeState* state, Block* block, bool* eos, Profile&& profile, SourceInfo&& source_info) override; - ExchangeType get_type() const override { return ExchangeType::PASSTHROUGH; } void close(SourceInfo&& source_info) override; }; @@ -324,29 +326,28 @@ class PassToOneExchanger final : public Exchanger { public: ENABLE_FACTORY_CREATOR(PassToOneExchanger); PassToOneExchanger(int running_sink_operators, int num_partitions, int free_block_limit) - : Exchanger(running_sink_operators, num_partitions, - free_block_limit) {} + : Exchanger(running_sink_operators, num_partitions, free_block_limit, + TLocalPartitionType::PASS_TO_ONE) {} ~PassToOneExchanger() override = default; Status sink(RuntimeState* state, Block* in_block, bool eos, Profile&& profile, SinkInfo& sink_info) override; Status get_block(RuntimeState* state, Block* block, bool* eos, Profile&& profile, SourceInfo&& source_info) override; - ExchangeType get_type() const override { return ExchangeType::PASS_TO_ONE; } void close(SourceInfo&& source_info) override; }; class BroadcastExchanger final : public Exchanger { public: ENABLE_FACTORY_CREATOR(BroadcastExchanger); BroadcastExchanger(int running_sink_operators, int num_partitions, int free_block_limit) - : Exchanger(running_sink_operators, num_partitions, free_block_limit) {} + : Exchanger(running_sink_operators, num_partitions, free_block_limit, + TLocalPartitionType::BROADCAST) {} ~BroadcastExchanger() override = default; Status sink(RuntimeState* state, Block* in_block, bool eos, Profile&& profile, SinkInfo& sink_info) override; Status get_block(RuntimeState* state, Block* block, bool* eos, Profile&& profile, SourceInfo&& source_info) override; - ExchangeType get_type() const override { return ExchangeType::BROADCAST; } void close(SourceInfo&& source_info) override; }; @@ -357,8 +358,8 @@ class AdaptivePassthroughExchanger : public Exchanger { ENABLE_FACTORY_CREATOR(AdaptivePassthroughExchanger); AdaptivePassthroughExchanger(int running_sink_operators, int num_partitions, int free_block_limit) - : Exchanger(running_sink_operators, num_partitions, - free_block_limit) { + : Exchanger(running_sink_operators, num_partitions, free_block_limit, + TLocalPartitionType::ADAPTIVE_PASSTHROUGH) { _partition_rows_histogram.resize(running_sink_operators); _tmp_eos.resize(num_partitions); _tmp_block.resize(num_partitions); @@ -368,7 +369,6 @@ class AdaptivePassthroughExchanger : public Exchanger { Status get_block(RuntimeState* state, Block* block, bool* eos, Profile&& profile, SourceInfo&& source_info) override; - ExchangeType get_type() const override { return ExchangeType::ADAPTIVE_PASSTHROUGH; } void close(SourceInfo&& source_info) override; diff --git a/be/src/exec/operator/aggregation_sink_operator.h b/be/src/exec/operator/aggregation_sink_operator.h index f0cc6b70d4e567..ea7c19bcb82baf 100644 --- a/be/src/exec/operator/aggregation_sink_operator.h +++ b/be/src/exec/operator/aggregation_sink_operator.h @@ -159,7 +159,7 @@ class AggSinkOperatorX MOCK_REMOVE(final) : public DataSinkOperatorX::required_data_distribution( state); } @@ -167,8 +167,10 @@ class AggSinkOperatorX MOCK_REMOVE(final) : public DataSinkOperatorX::required_data_distribution(state); } return _is_colocate && _require_bucket_distribution - ? DataDistribution(ExchangeType::BUCKET_HASH_SHUFFLE, _partition_exprs) - : DataDistribution(ExchangeType::HASH_SHUFFLE, _partition_exprs); + ? DataDistribution(TLocalPartitionType::BUCKET_HASH_SHUFFLE, + _partition_exprs) + : DataDistribution(TLocalPartitionType::GLOBAL_EXECUTION_HASH_SHUFFLE, + _partition_exprs); } bool is_colocated_operator() const override { return _is_colocate; } bool is_shuffled_operator() const override { diff --git a/be/src/exec/operator/analytic_sink_operator.h b/be/src/exec/operator/analytic_sink_operator.h index 9a64ba1d0b7ed7..2a989020a7f395 100644 --- a/be/src/exec/operator/analytic_sink_operator.h +++ b/be/src/exec/operator/analytic_sink_operator.h @@ -212,11 +212,13 @@ class AnalyticSinkOperatorX final : public DataSinkOperatorXenable_local_exchange_before_agg()) { return StatefulOperatorX::required_data_distribution( @@ -124,11 +124,13 @@ class DistinctStreamingAggOperatorX final } if (_needs_finalize || (!_probe_expr_ctxs.empty() && !_is_streaming_preagg)) { return _is_colocate && _require_bucket_distribution - ? DataDistribution(ExchangeType::BUCKET_HASH_SHUFFLE, _partition_exprs) - : DataDistribution(ExchangeType::HASH_SHUFFLE, _partition_exprs); + ? DataDistribution(TLocalPartitionType::BUCKET_HASH_SHUFFLE, + _partition_exprs) + : DataDistribution(TLocalPartitionType::GLOBAL_EXECUTION_HASH_SHUFFLE, + _partition_exprs); } if (state->enable_distinct_streaming_agg_force_passthrough()) { - return {ExchangeType::PASSTHROUGH}; + return {TLocalPartitionType::PASSTHROUGH}; } else { return StatefulOperatorX::required_data_distribution( state); diff --git a/be/src/exec/operator/exchange_source_operator.h b/be/src/exec/operator/exchange_source_operator.h index da00e088586437..d47250080a1ee0 100644 --- a/be/src/exec/operator/exchange_source_operator.h +++ b/be/src/exec/operator/exchange_source_operator.h @@ -112,13 +112,13 @@ class ExchangeSourceOperatorX final : public OperatorX { DataDistribution required_data_distribution(RuntimeState* /*state*/) const override { if (OperatorX::is_serial_operator()) { - return {ExchangeType::NOOP}; + return {TLocalPartitionType::NOOP}; } return _partition_type == TPartitionType::HASH_PARTITIONED - ? DataDistribution(ExchangeType::HASH_SHUFFLE) + ? DataDistribution(TLocalPartitionType::GLOBAL_EXECUTION_HASH_SHUFFLE) : _partition_type == TPartitionType::BUCKET_SHFFULE_HASH_PARTITIONED - ? DataDistribution(ExchangeType::BUCKET_HASH_SHUFFLE) - : DataDistribution(ExchangeType::NOOP); + ? DataDistribution(TLocalPartitionType::BUCKET_HASH_SHUFFLE) + : DataDistribution(TLocalPartitionType::NOOP); } private: diff --git a/be/src/exec/operator/hashjoin_build_sink.h b/be/src/exec/operator/hashjoin_build_sink.h index be77ef6cc690bc..cf546e6dddb7c8 100644 --- a/be/src/exec/operator/hashjoin_build_sink.h +++ b/be/src/exec/operator/hashjoin_build_sink.h @@ -134,15 +134,17 @@ class HashJoinBuildSinkOperatorX MOCK_REMOVE(final) DataDistribution required_data_distribution(RuntimeState* /*state*/) const override { if (_join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) { - return {ExchangeType::NOOP}; + return {TLocalPartitionType::NOOP}; } else if (_is_broadcast_join) { - return _child->is_serial_operator() ? DataDistribution(ExchangeType::PASS_TO_ONE) - : DataDistribution(ExchangeType::NOOP); + return _child->is_serial_operator() ? DataDistribution(TLocalPartitionType::PASS_TO_ONE) + : DataDistribution(TLocalPartitionType::NOOP); } return _join_distribution == TJoinDistributionType::BUCKET_SHUFFLE || _join_distribution == TJoinDistributionType::COLOCATE - ? DataDistribution(ExchangeType::BUCKET_HASH_SHUFFLE, _partition_exprs) - : DataDistribution(ExchangeType::HASH_SHUFFLE, _partition_exprs); + ? DataDistribution(TLocalPartitionType::BUCKET_HASH_SHUFFLE, + _partition_exprs) + : DataDistribution(TLocalPartitionType::GLOBAL_EXECUTION_HASH_SHUFFLE, + _partition_exprs); } bool is_shuffled_operator() const override { diff --git a/be/src/exec/operator/hashjoin_probe_operator.h b/be/src/exec/operator/hashjoin_probe_operator.h index da60e8b52313c5..371a43fe1eb00b 100644 --- a/be/src/exec/operator/hashjoin_probe_operator.h +++ b/be/src/exec/operator/hashjoin_probe_operator.h @@ -135,21 +135,23 @@ class HashJoinProbeOperatorX MOCK_REMOVE(final) bool need_more_input_data(RuntimeState* state) const override; DataDistribution required_data_distribution(RuntimeState* state) const override { if (_join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) { - return {ExchangeType::NOOP}; + return {TLocalPartitionType::NOOP}; } else if (_is_broadcast_join) { if (state->enable_broadcast_join_force_passthrough()) { - return DataDistribution(ExchangeType::PASSTHROUGH); + return DataDistribution(TLocalPartitionType::PASSTHROUGH); } else { return _child && _child->is_serial_operator() - ? DataDistribution(ExchangeType::PASSTHROUGH) - : DataDistribution(ExchangeType::NOOP); + ? DataDistribution(TLocalPartitionType::PASSTHROUGH) + : DataDistribution(TLocalPartitionType::NOOP); } } return (_join_distribution == TJoinDistributionType::BUCKET_SHUFFLE || _join_distribution == TJoinDistributionType::COLOCATE - ? DataDistribution(ExchangeType::BUCKET_HASH_SHUFFLE, _partition_exprs) - : DataDistribution(ExchangeType::HASH_SHUFFLE, _partition_exprs)); + ? DataDistribution(TLocalPartitionType::BUCKET_HASH_SHUFFLE, + _partition_exprs) + : DataDistribution(TLocalPartitionType::GLOBAL_EXECUTION_HASH_SHUFFLE, + _partition_exprs)); } bool is_broadcast_join() const { return _is_broadcast_join; } diff --git a/be/src/exec/operator/nested_loop_join_build_operator.h b/be/src/exec/operator/nested_loop_join_build_operator.h index 265a8767b3ec1c..b6be8eb9254046 100644 --- a/be/src/exec/operator/nested_loop_join_build_operator.h +++ b/be/src/exec/operator/nested_loop_join_build_operator.h @@ -68,10 +68,10 @@ class NestedLoopJoinBuildSinkOperatorX final DataDistribution required_data_distribution(RuntimeState* /*state*/) const override { if (_join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) { - return {ExchangeType::NOOP}; + return {TLocalPartitionType::NOOP}; } - return _child->is_serial_operator() ? DataDistribution(ExchangeType::BROADCAST) - : DataDistribution(ExchangeType::NOOP); + return _child->is_serial_operator() ? DataDistribution(TLocalPartitionType::BROADCAST) + : DataDistribution(TLocalPartitionType::NOOP); } private: diff --git a/be/src/exec/operator/nested_loop_join_probe_operator.h b/be/src/exec/operator/nested_loop_join_probe_operator.h index 9313331fc697d6..d15652aeb00759 100644 --- a/be/src/exec/operator/nested_loop_join_probe_operator.h +++ b/be/src/exec/operator/nested_loop_join_probe_operator.h @@ -277,9 +277,9 @@ class NestedLoopJoinProbeOperatorX final if (_join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || _join_op == TJoinOp::RIGHT_OUTER_JOIN || _join_op == TJoinOp::RIGHT_ANTI_JOIN || _join_op == TJoinOp::RIGHT_SEMI_JOIN || _join_op == TJoinOp::FULL_OUTER_JOIN) { - return {ExchangeType::NOOP}; + return {TLocalPartitionType::NOOP}; } - return {ExchangeType::ADAPTIVE_PASSTHROUGH}; + return {TLocalPartitionType::ADAPTIVE_PASSTHROUGH}; } const RowDescriptor& row_desc() const override { diff --git a/be/src/exec/operator/operator.cpp b/be/src/exec/operator/operator.cpp index 96f7933d5d5274..491e35ecd80e80 100644 --- a/be/src/exec/operator/operator.cpp +++ b/be/src/exec/operator/operator.cpp @@ -143,8 +143,8 @@ Status PipelineXSinkLocalState::terminate(RuntimeState* state) { DataDistribution OperatorBase::required_data_distribution(RuntimeState* /*state*/) const { return _child && _child->is_serial_operator() && !is_source() - ? DataDistribution(ExchangeType::PASSTHROUGH) - : DataDistribution(ExchangeType::NOOP); + ? DataDistribution(TLocalPartitionType::PASSTHROUGH) + : DataDistribution(TLocalPartitionType::NOOP); } const RowDescriptor& OperatorBase::row_desc() const { diff --git a/be/src/exec/operator/operator.h b/be/src/exec/operator/operator.h index d4b09a4aed3bb5..d1d4cb58e63b0c 100644 --- a/be/src/exec/operator/operator.h +++ b/be/src/exec/operator/operator.h @@ -600,8 +600,8 @@ class DataSinkOperatorXBase : public OperatorBase { virtual bool reset_to_rerun(RuntimeState* state, OperatorXBase* root) const { return false; } Status init(const TDataSink& tsink) override; - [[nodiscard]] virtual Status init(RuntimeState* state, ExchangeType type, const int num_buckets, - const bool use_global_hash_shuffle, + [[nodiscard]] virtual Status init(RuntimeState* state, TLocalPartitionType::type type, + const int num_buckets, const std::map& shuffle_idx_to_instance_idx) { return Status::InternalError("init() is only implemented in local exchange!"); } @@ -849,7 +849,7 @@ class OperatorXBase : public OperatorBase { Status init(const TDataSink& tsink) override { throw Exception(Status::FatalError("should not reach here!")); } - virtual Status init(ExchangeType type) { + virtual Status init(TLocalPartitionType::type type) { throw Exception(Status::FatalError("should not reach here!")); } [[noreturn]] virtual const std::vector& runtime_filter_descs() { diff --git a/be/src/exec/operator/partition_sort_sink_operator.h b/be/src/exec/operator/partition_sort_sink_operator.h index 48b709d3f1f0cd..4589aad8a04779 100644 --- a/be/src/exec/operator/partition_sort_sink_operator.h +++ b/be/src/exec/operator/partition_sort_sink_operator.h @@ -95,9 +95,10 @@ class PartitionSortSinkOperatorX final : public DataSinkOperatorX(pool, tnode, operator_id, descs), - _join_distribution(tnode.hash_join_node.__isset.dist_type ? tnode.hash_join_node.dist_type - : TJoinDistributionType::NONE), _distribution_partition_exprs(tnode.__isset.distribute_expr_lists ? tnode.distribute_expr_lists[0] : std::vector {}), diff --git a/be/src/exec/operator/partitioned_hash_join_probe_operator.h b/be/src/exec/operator/partitioned_hash_join_probe_operator.h index 15767504d85982..d9bca3aab308c4 100644 --- a/be/src/exec/operator/partitioned_hash_join_probe_operator.h +++ b/be/src/exec/operator/partitioned_hash_join_probe_operator.h @@ -228,16 +228,8 @@ class PartitionedHashJoinProbeOperatorX final Status pull(doris::RuntimeState* state, Block* output_block, bool* eos) const override; bool need_more_input_data(RuntimeState* state) const override; - DataDistribution required_data_distribution(RuntimeState* /*state*/) const override { - if (_join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) { - return {ExchangeType::NOOP}; - } - return (_join_distribution == TJoinDistributionType::BUCKET_SHUFFLE || - _join_distribution == TJoinDistributionType::COLOCATE - ? DataDistribution(ExchangeType::BUCKET_HASH_SHUFFLE, - _distribution_partition_exprs) - : DataDistribution(ExchangeType::HASH_SHUFFLE, - _distribution_partition_exprs)); + DataDistribution required_data_distribution(RuntimeState* state) const override { + return _inner_probe_operator->required_data_distribution(state); } size_t revocable_mem_size(RuntimeState* state) const override; @@ -285,8 +277,6 @@ class PartitionedHashJoinProbeOperatorX final RuntimeState* state, Block* output_block, bool* eos) const; - const TJoinDistributionType::type _join_distribution; - std::shared_ptr _inner_sink_operator; std::shared_ptr _inner_probe_operator; diff --git a/be/src/exec/operator/partitioned_hash_join_sink_operator.h b/be/src/exec/operator/partitioned_hash_join_sink_operator.h index c6dc2cdfb94768..e3c818a33c8aeb 100644 --- a/be/src/exec/operator/partitioned_hash_join_sink_operator.h +++ b/be/src/exec/operator/partitioned_hash_join_sink_operator.h @@ -127,14 +127,14 @@ class PartitionedHashJoinSinkOperatorX DataDistribution required_data_distribution(RuntimeState* /*state*/) const override { if (_join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) { - return {ExchangeType::NOOP}; + return {TLocalPartitionType::NOOP}; } return _join_distribution == TJoinDistributionType::BUCKET_SHUFFLE || _join_distribution == TJoinDistributionType::COLOCATE - ? DataDistribution(ExchangeType::BUCKET_HASH_SHUFFLE, + ? DataDistribution(TLocalPartitionType::BUCKET_HASH_SHUFFLE, _distribution_partition_exprs) - : DataDistribution(ExchangeType::HASH_SHUFFLE, + : DataDistribution(TLocalPartitionType::GLOBAL_EXECUTION_HASH_SHUFFLE, _distribution_partition_exprs); } diff --git a/be/src/exec/operator/rec_cte_anchor_sink_operator.h b/be/src/exec/operator/rec_cte_anchor_sink_operator.h index 42661ee4181376..e49ccf7b8c1f68 100644 --- a/be/src/exec/operator/rec_cte_anchor_sink_operator.h +++ b/be/src/exec/operator/rec_cte_anchor_sink_operator.h @@ -66,7 +66,7 @@ class RecCTEAnchorSinkOperatorX MOCK_REMOVE(final) bool is_serial_operator() const override { return true; } DataDistribution required_data_distribution(RuntimeState* /*state*/) const override { - return {ExchangeType::NOOP}; + return {TLocalPartitionType::NOOP}; } Status terminate(RuntimeState* state) override { diff --git a/be/src/exec/operator/rec_cte_sink_operator.h b/be/src/exec/operator/rec_cte_sink_operator.h index a071cf55761147..d42a6da4e35331 100644 --- a/be/src/exec/operator/rec_cte_sink_operator.h +++ b/be/src/exec/operator/rec_cte_sink_operator.h @@ -77,7 +77,7 @@ class RecCTESinkOperatorX MOCK_REMOVE(final) : public DataSinkOperatorX { bool is_serial_operator() const override { return true; } DataDistribution required_data_distribution(RuntimeState* /*state*/) const override { - return {ExchangeType::NOOP}; + return {TLocalPartitionType::NOOP}; } Status get_block(RuntimeState* state, Block* block, bool* eos) override { diff --git a/be/src/exec/operator/scan_operator.h b/be/src/exec/operator/scan_operator.h index f2ba87d4291412..e5ff76eac737a3 100644 --- a/be/src/exec/operator/scan_operator.h +++ b/be/src/exec/operator/scan_operator.h @@ -368,9 +368,9 @@ class ScanOperatorX : public OperatorX { DataDistribution required_data_distribution(RuntimeState* /*state*/) const override { if (OperatorX::is_serial_operator()) { // `is_serial_operator()` returns true means we ignore the distribution. - return {ExchangeType::NOOP}; + return {TLocalPartitionType::NOOP}; } - return {ExchangeType::BUCKET_HASH_SHUFFLE}; + return {TLocalPartitionType::BUCKET_HASH_SHUFFLE}; } void set_low_memory_mode(RuntimeState* state) override { diff --git a/be/src/exec/operator/set_probe_sink_operator.h b/be/src/exec/operator/set_probe_sink_operator.h index c736e2af2296ef..ff47755d3d03d6 100644 --- a/be/src/exec/operator/set_probe_sink_operator.h +++ b/be/src/exec/operator/set_probe_sink_operator.h @@ -103,8 +103,10 @@ class SetProbeSinkOperatorX final : public DataSinkOperatorX create_shared_state() const override { return nullptr; } diff --git a/be/src/exec/operator/set_sink_operator.h b/be/src/exec/operator/set_sink_operator.h index e1c937f1471891..4315ac8d34a9be 100644 --- a/be/src/exec/operator/set_sink_operator.h +++ b/be/src/exec/operator/set_sink_operator.h @@ -114,8 +114,10 @@ class SetSinkOperatorX final : public DataSinkOperatorX { DataDistribution required_data_distribution(RuntimeState* /*state*/) const override { if (_is_analytic_sort) { return _is_colocate && _require_bucket_distribution - ? DataDistribution(ExchangeType::BUCKET_HASH_SHUFFLE, _partition_exprs) - : DataDistribution(ExchangeType::HASH_SHUFFLE, _partition_exprs); + ? DataDistribution(TLocalPartitionType::BUCKET_HASH_SHUFFLE, + _partition_exprs) + : DataDistribution(TLocalPartitionType::GLOBAL_EXECUTION_HASH_SHUFFLE, + _partition_exprs); } else if (_merge_by_exchange) { // The current sort node is used for the ORDER BY - return {ExchangeType::PASSTHROUGH}; + return {TLocalPartitionType::PASSTHROUGH}; } else { - return {ExchangeType::NOOP}; + return {TLocalPartitionType::NOOP}; } } bool is_colocated_operator() const override { return _is_colocate; } diff --git a/be/src/exec/operator/streaming_aggregation_operator.h b/be/src/exec/operator/streaming_aggregation_operator.h index 40a8de2824446a..80b64de1c9f52f 100644 --- a/be/src/exec/operator/streaming_aggregation_operator.h +++ b/be/src/exec/operator/streaming_aggregation_operator.h @@ -224,18 +224,19 @@ class StreamingAggOperatorX MOCK_REMOVE(final) : public StatefulOperatorXis_hash_join_probe() && state->enable_streaming_agg_hash_join_force_passthrough()) { - return DataDistribution(ExchangeType::PASSTHROUGH); + return DataDistribution(TLocalPartitionType::PASSTHROUGH); } if (!_needs_finalize && !state->enable_local_exchange_before_agg()) { return StatefulOperatorX::required_data_distribution(state); } if (_partition_exprs.empty()) { return _needs_finalize - ? DataDistribution(ExchangeType::NOOP) + ? DataDistribution(TLocalPartitionType::NOOP) : StatefulOperatorX::required_data_distribution( state); } - return DataDistribution(ExchangeType::HASH_SHUFFLE, _partition_exprs); + return DataDistribution(TLocalPartitionType::GLOBAL_EXECUTION_HASH_SHUFFLE, + _partition_exprs); } private: diff --git a/be/src/exec/operator/table_function_operator.h b/be/src/exec/operator/table_function_operator.h index 0a55c090596035..de6e546af1d550 100644 --- a/be/src/exec/operator/table_function_operator.h +++ b/be/src/exec/operator/table_function_operator.h @@ -116,7 +116,7 @@ class TableFunctionOperatorX MOCK_REMOVE(final) } DataDistribution required_data_distribution(RuntimeState* /*state*/) const override { - return {ExchangeType::PASSTHROUGH}; + return {TLocalPartitionType::PASSTHROUGH}; } Status push(RuntimeState* state, Block* input_block, bool eos) const override { diff --git a/be/src/exec/operator/union_sink_operator.h b/be/src/exec/operator/union_sink_operator.h index 4842ab6b243903..b698a08f01577d 100644 --- a/be/src/exec/operator/union_sink_operator.h +++ b/be/src/exec/operator/union_sink_operator.h @@ -116,10 +116,11 @@ class UnionSinkOperatorX MOCK_REMOVE(final) : public DataSinkOperatorX (e.g. same as storage engine). - BUCKET_HASH_SHUFFLE = 3, - // Passthrough data blocks to all channels. - BROADCAST = 4, - // Passthrough data to channels evenly in an adaptive way. - ADAPTIVE_PASSTHROUGH = 5, - // Send all data to the first channel. - PASS_TO_ONE = 6, -}; +inline bool is_shuffled_exchange(TLocalPartitionType::type idx) { + return idx == TLocalPartitionType::GLOBAL_EXECUTION_HASH_SHUFFLE || + idx == TLocalPartitionType::LOCAL_EXECUTION_HASH_SHUFFLE || + idx == TLocalPartitionType::BUCKET_HASH_SHUFFLE; +} -inline std::string get_exchange_type_name(ExchangeType idx) { +inline std::string get_exchange_type_name(TLocalPartitionType::type idx) { switch (idx) { - case ExchangeType::NOOP: + case TLocalPartitionType::NOOP: return "NOOP"; - case ExchangeType::HASH_SHUFFLE: - return "HASH_SHUFFLE"; - case ExchangeType::PASSTHROUGH: + case TLocalPartitionType::GLOBAL_EXECUTION_HASH_SHUFFLE: + return "GLOBAL_HASH_SHUFFLE"; + case TLocalPartitionType::LOCAL_EXECUTION_HASH_SHUFFLE: + return "LOCAL_HASH_SHUFFLE"; + case TLocalPartitionType::PASSTHROUGH: return "PASSTHROUGH"; - case ExchangeType::BUCKET_HASH_SHUFFLE: + case TLocalPartitionType::BUCKET_HASH_SHUFFLE: return "BUCKET_HASH_SHUFFLE"; - case ExchangeType::BROADCAST: + case TLocalPartitionType::BROADCAST: return "BROADCAST"; - case ExchangeType::ADAPTIVE_PASSTHROUGH: + case TLocalPartitionType::ADAPTIVE_PASSTHROUGH: return "ADAPTIVE_PASSTHROUGH"; - case ExchangeType::PASS_TO_ONE: + case TLocalPartitionType::PASS_TO_ONE: return "PASS_TO_ONE"; + case TLocalPartitionType::LOCAL_MERGE_SORT: + return "LOCAL_MERGE_SORT"; } throw Exception(Status::FatalError("__builtin_unreachable")); } struct DataDistribution { - DataDistribution(ExchangeType type) : distribution_type(type) {} - DataDistribution(ExchangeType type, const std::vector& partition_exprs_) + DataDistribution(TLocalPartitionType::type type) : distribution_type(type) {} + DataDistribution(TLocalPartitionType::type type, const std::vector& partition_exprs_) : distribution_type(type), partition_exprs(partition_exprs_) {} DataDistribution(const DataDistribution& other) = default; - bool need_local_exchange() const { return distribution_type != ExchangeType::NOOP; } + bool need_local_exchange() const { return distribution_type != TLocalPartitionType::NOOP; } DataDistribution& operator=(const DataDistribution& other) = default; - ExchangeType distribution_type; + TLocalPartitionType::type distribution_type; std::vector partition_exprs; }; diff --git a/be/src/exec/pipeline/pipeline.cpp b/be/src/exec/pipeline/pipeline.cpp index de3c852ada1bb6..c63267800ed24c 100644 --- a/be/src/exec/pipeline/pipeline.cpp +++ b/be/src/exec/pipeline/pipeline.cpp @@ -54,8 +54,7 @@ bool Pipeline::need_to_local_exchange(const DataDistribution target_data_distrib return true; } - if (target_data_distribution.distribution_type != ExchangeType::BUCKET_HASH_SHUFFLE && - target_data_distribution.distribution_type != ExchangeType::HASH_SHUFFLE) { + if (!is_shuffled_exchange(target_data_distribution.distribution_type)) { // Always do local exchange if non-hash-partition exchanger is required. // For example, `PASSTHROUGH` exchanger is always required to distribute data evenly. return true; diff --git a/be/src/exec/pipeline/pipeline.h b/be/src/exec/pipeline/pipeline.h index 3dc9e2353f7218..817c379692ef2c 100644 --- a/be/src/exec/pipeline/pipeline.h +++ b/be/src/exec/pipeline/pipeline.h @@ -68,16 +68,15 @@ class Pipeline : public std::enable_shared_from_this { [[nodiscard]] PipelineId id() const { return _pipeline_id; } - static bool is_hash_exchange(ExchangeType idx) { - return idx == ExchangeType::HASH_SHUFFLE || idx == ExchangeType::BUCKET_HASH_SHUFFLE; + static bool is_hash_exchange(TLocalPartitionType::type idx) { + return is_shuffled_exchange(idx); } // For HASH_SHUFFLE, BUCKET_HASH_SHUFFLE, and ADAPTIVE_PASSTHROUGH, // data is processed and shuffled on the sink. // Compared to PASSTHROUGH, this is a relatively heavy operation. - static bool heavy_operations_on_the_sink(ExchangeType idx) { - return idx == ExchangeType::HASH_SHUFFLE || idx == ExchangeType::BUCKET_HASH_SHUFFLE || - idx == ExchangeType::ADAPTIVE_PASSTHROUGH; + static bool heavy_operations_on_the_sink(TLocalPartitionType::type idx) { + return is_shuffled_exchange(idx) || idx == TLocalPartitionType::ADAPTIVE_PASSTHROUGH; } bool need_to_local_exchange(const DataDistribution target_data_distribution, @@ -165,7 +164,7 @@ class Pipeline : public std::enable_shared_from_this { // Input data distribution of this pipeline. We do local exchange when input data distribution // does not match the target data distribution. - DataDistribution _data_distribution {ExchangeType::NOOP}; + DataDistribution _data_distribution {TLocalPartitionType::NOOP}; // How many tasks should be created ? int _num_tasks = 1; diff --git a/be/src/exec/pipeline/pipeline_fragment_context.cpp b/be/src/exec/pipeline/pipeline_fragment_context.cpp index 615d67e5c9fff0..05cf647a5be708 100644 --- a/be/src/exec/pipeline/pipeline_fragment_context.cpp +++ b/be/src/exec/pipeline/pipeline_fragment_context.cpp @@ -290,6 +290,32 @@ Status PipelineFragmentContext::_build_and_prepare_full_pipeline(ThreadPool* thr RETURN_IF_ERROR(_build_pipelines(_runtime_state->obj_pool(), *_query_ctx->desc_tbl, &_root_op, root_pipeline)); + // Propagate _num_instances from LOCAL_EXCHANGE pipelines to ancestor pipelines + // that inherited reduced num_tasks from a serial operator. + _propagate_local_exchange_num_tasks(); + + // Create deferred local exchangers now that all pipelines have final num_tasks. + RETURN_IF_ERROR(_create_deferred_local_exchangers()); + + // Raise num_tasks for pipelines whose serial non-scan operators (e.g., + // UNPARTITIONED Exchange) reduced num_tasks below _num_instances. + // Without this, fragment instances 1+ have no task for these pipelines + // and downstream operators fail with "must set shared state". + // + // This applies to ALL pipelines (not just deferred exchanger upstreams): + // fragments with UNION/INTERSECT/EXCEPT + serial Exchange in child + // pipelines also need the raise, even without FE-planned local exchange. + // + // Exception: serial scan sources (pooling scan) keep num_tasks=1 — the + // PassthroughExchanger(1, N) handles the fan-out correctly. + // NOTE: Do NOT raise pipelines whose source is a serial operator + // (Exchange or scan) — they legitimately have 1 task, and raising + // them causes crashes (e.g., 4 Exchange tasks but only 1 receives + // data). The correct fix for shared state injection across + // instances is handled by the FE: it inserts local exchange nodes + // between serial operators and their downstream consumers, creating + // proper pipeline boundaries with _num_instances tasks. + // 3. Create sink operator if (!_params.fragment.__isset.output_sink) { return Status::InternalError("No output sink in this fragment!"); @@ -307,7 +333,7 @@ Status PipelineFragmentContext::_build_and_prepare_full_pipeline(ThreadPool* thr } } // 4. Build local exchanger - if (_runtime_state->enable_local_shuffle()) { + if (_runtime_state->plan_local_shuffle()) { SCOPED_TIMER(_plan_local_exchanger_timer); RETURN_IF_ERROR(_plan_local_exchange(_params.num_buckets, _params.bucket_seq_to_instance_idx, @@ -690,6 +716,199 @@ Status PipelineFragmentContext::_build_pipelines(ObjectPool* pool, const Descrip return Status::OK(); } +Status PipelineFragmentContext::_create_deferred_local_exchangers() { + for (auto& info : _deferred_exchangers) { + // DANGER ZONE — do not "fix" this line without reading the history. + // + // sender_count seeds Exchanger::_running_sink_operators, which the source side + // waits to reach 0 via sub_running_sink_operators on each sink LocalState close. + // The correct value is THIS pipeline-instance's sink task count, which is exactly + // info.upstream_pipe->num_tasks() — one PipelineTask per task, one close per task. + // + // Tempting wrong fix #1: `std::max(num_tasks, _num_instances)` to mirror the + // BE-planned path in _add_local_exchange_impl (~line 1023). THIS BREAKS the + // common FE-planned shape of `serial scan → LE(PT) → ...`: upstream_pipe + // genuinely has num_tasks=1, only 1 close arrives, but seed becomes + // _num_instances so _running_sink_operators never reaches 0 — downstream + // sources hang on SHUFFLE_DATA_DEPENDENCY (e.g. MTMV refresh from + // mtmv_up_down_job_p0/load.groovy stays at Status=RUNNING; build 949402 + // regressed exactly this way). BE-planned mode uses max() because its + // `cur_pipe` is the source-side pipeline (always raised to _num_instances by + // add_pipeline) — not analogous to our `upstream_pipe` here, which is the + // sink-side pipeline that may legitimately stay at 1 for serial sources. + // + // Tempting wrong fix #2: multiply by _num_instances on the theory shared_state + // is shared across all instances. Same hang — each fragment-instance + // PipelineFragmentContext has its OWN _op_id_to_shared_state map, so the + // exchanger is per-instance, not per-BE. num_tasks() is already the right + // close-count for one instance. + // + // If a hang shows up with `_running_sink_operators < 0`, the bug is upstream: + // _propagate_local_exchange_num_tasks left num_tasks too low (or too high) for + // this fragment shape. Fix THAT pass, not this seed value. + const int sender_count = info.upstream_pipe->num_tasks(); + switch (info.partition_type) { + case TLocalPartitionType::LOCAL_EXECUTION_HASH_SHUFFLE: + case TLocalPartitionType::GLOBAL_EXECUTION_HASH_SHUFFLE: + info.shared_state->exchanger = ShuffleExchanger::create_unique( + sender_count, _num_instances, info.num_partitions, info.free_blocks_limit, + info.partition_type); + break; + case TLocalPartitionType::BUCKET_HASH_SHUFFLE: + info.shared_state->exchanger = BucketShuffleExchanger::create_unique( + sender_count, _num_instances, info.num_partitions, info.free_blocks_limit); + break; + case TLocalPartitionType::PASSTHROUGH: + info.shared_state->exchanger = PassthroughExchanger::create_unique( + sender_count, _num_instances, info.free_blocks_limit); + break; + case TLocalPartitionType::BROADCAST: + info.shared_state->exchanger = BroadcastExchanger::create_unique( + sender_count, _num_instances, info.free_blocks_limit); + break; + case TLocalPartitionType::PASS_TO_ONE: + if (_runtime_state->enable_share_hash_table_for_broadcast_join()) { + info.shared_state->exchanger = PassToOneExchanger::create_unique( + sender_count, _num_instances, info.free_blocks_limit); + } else { + info.shared_state->exchanger = BroadcastExchanger::create_unique( + sender_count, _num_instances, info.free_blocks_limit); + } + break; + case TLocalPartitionType::ADAPTIVE_PASSTHROUGH: + info.shared_state->exchanger = AdaptivePassthroughExchanger::create_unique( + sender_count, _num_instances, info.free_blocks_limit); + break; + case TLocalPartitionType::NOOP: + case TLocalPartitionType::LOCAL_MERGE_SORT: + // FE-planned LocalExchangeNode currently never emits NOOP or LOCAL_MERGE_SORT + // through the deferred-exchanger path. NOOP means "no exchange needed" and + // is filtered out before reaching here; LOCAL_MERGE_SORT is planned by the + // legacy BE path only. Crash in debug to surface the protocol violation if + // that ever changes; return an error in release to avoid silently corrupting + // execution. + DCHECK(false) << "FE-planned local exchange should not emit partition_type=" + << static_cast(info.partition_type); + return Status::InternalError("FE-planned local exchange emitted unsupported type: " + + std::to_string(static_cast(info.partition_type))); + default: + // New TLocalPartitionType added on FE side without a BE handler here. + DCHECK(false) << "Unhandled TLocalPartitionType in deferred exchangers: " + << static_cast(info.partition_type); + return Status::InternalError("Unsupported FE-planned local exchange type: " + + std::to_string(static_cast(info.partition_type))); + } + } + _deferred_exchangers.clear(); + return Status::OK(); +} + +void PipelineFragmentContext::_propagate_local_exchange_num_tasks() { + // Only runs when FE has planned local exchanges and BE deferred their construction. + // In legacy mode (enable_local_shuffle_planner=false) BE plans LE itself via + // _plan_local_exchange and _deferred_exchangers stays empty — the legacy path + // already gets its num_tasks right at construction time, so the propagate passes + // would be no-ops and are skipped. This is a transitional design: once the FE + // planner is the only planner, the propagation logic itself should degrade into + // a pure assertion that the FE plan already wired the right num_tasks everywhere. + if (_deferred_exchangers.empty()) { + return; + } + // Fix num_tasks mismatches between paired pipelines created by pipeline-splitting + // operators (AGG, SORT, JOIN). These pairs share state via inject_shared_state and + // must have consistent num_tasks; otherwise instance 1+ tasks access null shared_state. + // + // Pass 1 (upward): when FE inserts LOCAL_EXCHANGE_NODE below a splitting operator, + // the LE pipeline gets _num_instances but ancestor pipelines still have the reduced + // count. Raise them to _num_instances (skip serial source operators). + // + // Pass 2 (downward): when a serial Exchange feeds a splitting operator (e.g., scalar + // merge-finalize AGG), the sink pipeline has 1 task but the source pipeline may have + // been raised (by LE or by inheriting _num_instances from add_pipeline). Lower it + // to match, unless it contains LocalExchangeSource (deliberately raised by LE). + std::unordered_map id_to_pipe; + for (auto& p : _pipelines) { + id_to_pipe[p->id()] = p; + } + + // Pass 1 (upward): raise downstream pipelines to _num_instances when any upstream + // dependency was already raised by LOCAL_EXCHANGE. + bool changed = true; + while (changed) { + changed = false; + for (const auto& [downstream_id, upstream_ids] : _dag) { + auto dit = id_to_pipe.find(downstream_id); + if (dit == id_to_pipe.end()) { + continue; + } + auto& downstream = dit->second; + if (downstream->num_tasks() >= _num_instances) { + continue; + } + if (!downstream->operators().empty() && + downstream->operators().front()->is_serial_operator()) { + continue; + } + for (auto upstream_id : upstream_ids) { + auto uit = id_to_pipe.find(upstream_id); + if (uit != id_to_pipe.end() && uit->second->num_tasks() >= _num_instances) { + downstream->set_num_tasks(_num_instances); + changed = true; + break; + } + } + } + } + + // Pass 2 (downward): when a pipeline-splitting operator (AGG, SORT, JOIN) creates + // paired pipelines (source + sink), they share state via inject_shared_state and must + // have consistent num_tasks. If the sink pipeline has a serial source (Exchange) with + // num_tasks < _num_instances, but the source pipeline was raised (by inheriting from + // an LE-fixed parent during add_pipeline), lower the source pipeline to match. + // Skip pipelines containing LocalExchangeSource — they were deliberately raised. + // + // Example: serial Exchange → AGG(merge finalize) → LOCAL_EXCHANGE → NLJ + // P_sink = [Exchange, AGGSink] num_tasks=1 (serial) + // P_source = [AGGSource] num_tasks=_num_instances (inherited) + // → lower P_source to 1; the LE above fans out the 1 result to N NLJ tasks. + changed = true; + while (changed) { + changed = false; + for (const auto& [downstream_id, upstream_ids] : _dag) { + auto dit = id_to_pipe.find(downstream_id); + if (dit == id_to_pipe.end()) { + continue; + } + auto& downstream = dit->second; + if (downstream->num_tasks() <= 1) { + continue; + } + // Don't lower pipelines that contain LocalExchangeSource — they were + // deliberately raised to _num_instances by LOCAL_EXCHANGE processing. + bool has_le_source = false; + for (auto& op : downstream->operators()) { + if (dynamic_cast(op.get())) { + has_le_source = true; + break; + } + } + if (has_le_source) { + continue; + } + for (auto upstream_id : upstream_ids) { + auto uit = id_to_pipe.find(upstream_id); + if (uit != id_to_pipe.end() && uit->second->num_tasks() < downstream->num_tasks() && + !uit->second->operators().empty() && + uit->second->operators().front()->is_serial_operator()) { + downstream->set_num_tasks(uit->second->num_tasks()); + changed = true; + break; + } + } + } + } +} + Status PipelineFragmentContext::_create_tree_helper( ObjectPool* pool, const std::vector& tnodes, const DescriptorTbl& descs, OperatorPtr parent, int* node_idx, OperatorPtr* root, PipelinePtr& cur_pipe, int child_idx, @@ -723,7 +942,7 @@ Status PipelineFragmentContext::_create_tree_helper( *root = op; } /** - * `ExchangeType::HASH_SHUFFLE` should be used if an operator is followed by a shuffled operator (shuffled hash join, union operator followed by co-located operators). + * `TLocalPartitionType::HASH_SHUFFLE` should be used if an operator is followed by a shuffled operator (shuffled hash join, union operator followed by co-located operators). * * For plan: * LocalExchange(id=0) -> Aggregation(id=1) -> ShuffledHashJoin(id=2) @@ -743,7 +962,7 @@ Status PipelineFragmentContext::_create_tree_helper( : op->is_shuffled_operator())) && Pipeline::is_hash_exchange(required_data_distribution.distribution_type)) || (followed_by_shuffled_operator && - required_data_distribution.distribution_type == ExchangeType::NOOP); + required_data_distribution.distribution_type == TLocalPartitionType::NOOP); current_require_bucket_distribution = ((require_bucket_distribution || @@ -751,7 +970,7 @@ Status PipelineFragmentContext::_create_tree_helper( : op->is_colocated_operator())) && Pipeline::is_hash_exchange(required_data_distribution.distribution_type)) || (require_bucket_distribution && - required_data_distribution.distribution_type == ExchangeType::NOOP); + required_data_distribution.distribution_type == TLocalPartitionType::NOOP); if (num_children == 0) { _use_serial_source = op->is_serial_operator(); @@ -810,28 +1029,35 @@ Status PipelineFragmentContext::_add_local_exchange_impl( sink_id, local_exchange_id, use_global_hash_shuffle ? _total_instances : _num_instances, data_distribution.partition_exprs, bucket_seq_to_instance_idx); if (bucket_seq_to_instance_idx.empty() && - data_distribution.distribution_type == ExchangeType::BUCKET_HASH_SHUFFLE) { - data_distribution.distribution_type = ExchangeType::HASH_SHUFFLE; + data_distribution.distribution_type == TLocalPartitionType::BUCKET_HASH_SHUFFLE) { + data_distribution.distribution_type = + use_global_hash_shuffle ? TLocalPartitionType::GLOBAL_EXECUTION_HASH_SHUFFLE + : TLocalPartitionType::LOCAL_EXECUTION_HASH_SHUFFLE; + } + if (!use_global_hash_shuffle && + data_distribution.distribution_type == TLocalPartitionType::GLOBAL_EXECUTION_HASH_SHUFFLE) { + data_distribution.distribution_type = TLocalPartitionType::LOCAL_EXECUTION_HASH_SHUFFLE; } RETURN_IF_ERROR(new_pip->set_sink(sink)); RETURN_IF_ERROR(new_pip->sink()->init(_runtime_state.get(), data_distribution.distribution_type, - num_buckets, use_global_hash_shuffle, - shuffle_idx_to_instance_idx)); + num_buckets, shuffle_idx_to_instance_idx)); // 2. Create and initialize LocalExchangeSharedState. std::shared_ptr shared_state = LocalExchangeSharedState::create_shared(_num_instances); switch (data_distribution.distribution_type) { - case ExchangeType::HASH_SHUFFLE: + case TLocalPartitionType::LOCAL_EXECUTION_HASH_SHUFFLE: + case TLocalPartitionType::GLOBAL_EXECUTION_HASH_SHUFFLE: shared_state->exchanger = ShuffleExchanger::create_unique( std::max(cur_pipe->num_tasks(), _num_instances), _num_instances, use_global_hash_shuffle ? _total_instances : _num_instances, _runtime_state->query_options().__isset.local_exchange_free_blocks_limit ? cast_set( _runtime_state->query_options().local_exchange_free_blocks_limit) - : 0); + : 0, + data_distribution.distribution_type); break; - case ExchangeType::BUCKET_HASH_SHUFFLE: + case TLocalPartitionType::BUCKET_HASH_SHUFFLE: shared_state->exchanger = BucketShuffleExchanger::create_unique( std::max(cur_pipe->num_tasks(), _num_instances), _num_instances, num_buckets, _runtime_state->query_options().__isset.local_exchange_free_blocks_limit @@ -839,7 +1065,7 @@ Status PipelineFragmentContext::_add_local_exchange_impl( _runtime_state->query_options().local_exchange_free_blocks_limit) : 0); break; - case ExchangeType::PASSTHROUGH: + case TLocalPartitionType::PASSTHROUGH: shared_state->exchanger = PassthroughExchanger::create_unique( cur_pipe->num_tasks(), _num_instances, _runtime_state->query_options().__isset.local_exchange_free_blocks_limit @@ -847,7 +1073,7 @@ Status PipelineFragmentContext::_add_local_exchange_impl( _runtime_state->query_options().local_exchange_free_blocks_limit) : 0); break; - case ExchangeType::BROADCAST: + case TLocalPartitionType::BROADCAST: shared_state->exchanger = BroadcastExchanger::create_unique( cur_pipe->num_tasks(), _num_instances, _runtime_state->query_options().__isset.local_exchange_free_blocks_limit @@ -855,7 +1081,7 @@ Status PipelineFragmentContext::_add_local_exchange_impl( _runtime_state->query_options().local_exchange_free_blocks_limit) : 0); break; - case ExchangeType::PASS_TO_ONE: + case TLocalPartitionType::PASS_TO_ONE: if (_runtime_state->enable_share_hash_table_for_broadcast_join()) { // If shared hash table is enabled for BJ, hash table will be built by only one task shared_state->exchanger = PassToOneExchanger::create_unique( @@ -873,7 +1099,7 @@ Status PipelineFragmentContext::_add_local_exchange_impl( : 0); } break; - case ExchangeType::ADAPTIVE_PASSTHROUGH: + case TLocalPartitionType::ADAPTIVE_PASSTHROUGH: shared_state->exchanger = AdaptivePassthroughExchanger::create_unique( std::max(cur_pipe->num_tasks(), _num_instances), _num_instances, _runtime_state->query_options().__isset.local_exchange_free_blocks_limit @@ -984,9 +1210,9 @@ Status PipelineFragmentContext::_add_local_exchange( Pipeline::heavy_operations_on_the_sink(data_distribution.distribution_type)) { RETURN_IF_ERROR(_add_local_exchange_impl( cast_set(new_pip->operators().size()), pool, new_pip, - add_pipeline(new_pip, pip_idx + 2), DataDistribution(ExchangeType::PASSTHROUGH), - do_local_exchange, num_buckets, bucket_seq_to_instance_idx, - shuffle_idx_to_instance_idx)); + add_pipeline(new_pip, pip_idx + 2), + DataDistribution(TLocalPartitionType::PASSTHROUGH), do_local_exchange, num_buckets, + bucket_seq_to_instance_idx, shuffle_idx_to_instance_idx)); } return Status::OK(); } @@ -1027,10 +1253,10 @@ Status PipelineFragmentContext::_plan_local_exchange( do_local_exchange = false; // Plan local exchange for each operator. for (; idx < ops.size();) { - if (ops[idx]->required_data_distribution(_runtime_state.get()).need_local_exchange()) { + auto _le_req = ops[idx]->required_data_distribution(_runtime_state.get()); + if (_le_req.need_local_exchange()) { RETURN_IF_ERROR(_add_local_exchange( - pip_idx, idx, ops[idx]->node_id(), _runtime_state->obj_pool(), pip, - ops[idx]->required_data_distribution(_runtime_state.get()), + pip_idx, idx, ops[idx]->node_id(), _runtime_state->obj_pool(), pip, _le_req, &do_local_exchange, num_buckets, bucket_seq_to_instance_idx, shuffle_idx_to_instance_idx)); } @@ -1796,6 +2022,89 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo RETURN_IF_ERROR(cur_pipe->add_operator(op, _parallel_instances)); break; } + case TPlanNodeType::LOCAL_EXCHANGE_NODE: { + op = std::make_shared(pool, tnode, next_operator_id(), descs); + // The downstream pipeline (containing LocalExchangeSource) must have + // _num_instances tasks — matching BE-native _inherit_pipeline_properties + // which sets pipe_with_source.set_num_tasks(_num_instances). + // Without this, when the parent pipeline was reduced by a serial operator + // (e.g., serial Exchange with use_serial_exchange=true, or UNPARTITIONED + // Exchange), the downstream inherits the reduced num_tasks via + // add_pipeline(parent). The deferred exchanger creates _num_instances + // channels but only fewer source tasks initialize mem_counters — the + // sink round-robins to all channels and crashes on uninitialized ones. + auto downstream_num_tasks = _num_instances; + RETURN_IF_ERROR(cur_pipe->add_operator(op, _parallel_instances)); + // Restore downstream pipeline's num_tasks (mirroring _inherit_pipeline_properties: + // downstream keeps _num_instances, upstream gets the serial/reduced count) + cur_pipe->set_num_tasks(downstream_num_tasks); + + const auto downstream_pipeline_id = cur_pipe->id(); + if (!_dag.contains(downstream_pipeline_id)) { + _dag.insert({downstream_pipeline_id, {}}); + } + cur_pipe = add_pipeline(cur_pipe); + // If this local exchange was inserted because of a serial scan (is_serial_operator), + // the upstream pipeline (cur_pipe) should have num_tasks=1 (only 1 scan task). + // We set this now so the exchanger is created with the correct sender count. + // Child operators added later (serial scan) will also set num_tasks=1, which is + // consistent with this. + if (op->is_serial_operator() && _parallel_instances > 0) { + cur_pipe->set_num_tasks(_parallel_instances); + } + _dag[downstream_pipeline_id].push_back(cur_pipe->id()); + int num_partitions = 0; + std::map shuffle_id_to_instance_idx; + auto partition_type = tnode.local_exchange_node.partition_type; + switch (partition_type) { + case TLocalPartitionType::BUCKET_HASH_SHUFFLE: + num_partitions = _params.num_buckets; + shuffle_id_to_instance_idx = _params.bucket_seq_to_instance_idx; + break; + case TLocalPartitionType::LOCAL_EXECUTION_HASH_SHUFFLE: + for (int i = 0; i < _num_instances; i++) { + shuffle_id_to_instance_idx[i] = i; + } + num_partitions = _num_instances; + break; + case TLocalPartitionType::GLOBAL_EXECUTION_HASH_SHUFFLE: + num_partitions = _total_instances; + shuffle_id_to_instance_idx = _params.shuffle_idx_to_instance_idx; + break; + default: + break; + } + auto local_exchange_id = op->operator_id(); + auto sink_id = next_sink_operator_id(); + DataSinkOperatorPtr sink = std::make_shared( + sink_id, local_exchange_id, tnode, num_partitions, shuffle_id_to_instance_idx); + sink_ops.push_back(sink); + RETURN_IF_ERROR(cur_pipe->set_sink(sink)); + RETURN_IF_ERROR(cur_pipe->sink()->init(tnode, _runtime_state.get())); + + // For FE-planned local exchange, we need to: + // 1. Initialize the partitioner for hash shuffle types + // 2. Defer exchanger creation until after the full plan tree is built + // (child operators like serial ExchangeNode may change cur_pipe->num_tasks()) + // 3. Register shared state so pipeline tasks can find it + RETURN_IF_ERROR(static_cast(cur_pipe->sink()) + ->init_partitioner(_runtime_state.get())); + + int free_blocks_limit = + _runtime_state->query_options().__isset.local_exchange_free_blocks_limit + ? cast_set( + _runtime_state->query_options().local_exchange_free_blocks_limit) + : 0; + auto shared_state = LocalExchangeSharedState::create_shared(_num_instances); + shared_state->create_source_dependencies(_num_instances, local_exchange_id, + local_exchange_id, "LOCAL_EXCHANGE_OPERATOR"); + shared_state->create_sink_dependency(sink_id, local_exchange_id, "LOCAL_EXCHANGE_SINK"); + _op_id_to_shared_state.insert({local_exchange_id, {shared_state, shared_state->sink_deps}}); + // Defer exchanger creation: sender count depends on final upstream num_tasks + _deferred_exchangers.push_back({shared_state, cur_pipe, partition_type, num_partitions, + free_blocks_limit, local_exchange_id, sink_id}); + break; + } default: return Status::InternalError("Unsupported exec type in pipeline: {}", print_plan_node_type(tnode.node_type)); diff --git a/be/src/exec/pipeline/pipeline_fragment_context.h b/be/src/exec/pipeline/pipeline_fragment_context.h index c220ea386f68db..4950da0a7b0a5e 100644 --- a/be/src/exec/pipeline/pipeline_fragment_context.h +++ b/be/src/exec/pipeline/pipeline_fragment_context.h @@ -18,6 +18,7 @@ #pragma once #include +#include #include #include @@ -49,6 +50,7 @@ class TDataSink; class TPipelineFragmentParams; class Dependency; +struct LocalExchangeSharedState; class PipelineFragmentContext : public TaskExecutionContext { public: @@ -320,8 +322,11 @@ class PipelineFragmentContext : public TaskExecutionContext { std::mutex _state_map_lock; - int _operator_id = 0; - int _sink_operator_id = 0; + // Start from -1 so all operator IDs are negative. This avoids collision with + // unpaired sinks (OlapTableSink etc.) whose hardcoded dest_id=0 would otherwise + // match the first operator's ID when FE-planned LocalExchangeNode is the root. + int _operator_id = -1; + int _sink_operator_id = -1; /** * Some states are shared by tasks in different pipeline task (e.g. local exchange , broadcast join). * @@ -342,6 +347,27 @@ class PipelineFragmentContext : public TaskExecutionContext { std::map _pip_id_to_pipeline; std::vector> _runtime_filter_mgr_map; + // Deferred exchanger creation info for FE-planned local exchanges. + // Exchanger sender count depends on the upstream pipeline's final num_tasks, + // which is only known after the full plan tree is built (child operators like + // serial ExchangeNode may reduce num_tasks). So we defer exchanger creation + // until after _build_pipelines completes. + struct DeferredExchangerInfo { + std::shared_ptr shared_state; + PipelinePtr upstream_pipe; + TLocalPartitionType::type partition_type; + int num_partitions; + int free_blocks_limit; + int local_exchange_id; + int sink_id; + }; + std::vector _deferred_exchangers; + Status _create_deferred_local_exchangers(); + // After _build_pipelines, propagate _num_instances from FE-planned LOCAL_EXCHANGE + // pipelines upward through the DAG to ancestor pipelines that inherited reduced + // num_tasks from a serial operator. + void _propagate_local_exchange_num_tasks(); + //Here are two types of runtime states: // - _runtime state is at the Fragment level. // - _task_runtime_states is at the task level, unique to each task. diff --git a/be/src/load/routine_load/kinesis_conf.cpp b/be/src/load/routine_load/kinesis_conf.cpp index 82cf35ea2744da..f2b1bf95d50ada 100644 --- a/be/src/load/routine_load/kinesis_conf.cpp +++ b/be/src/load/routine_load/kinesis_conf.cpp @@ -115,7 +115,9 @@ Status KinesisConf::apply_to_get_shard_iterator_request( it = _get_shard_iterator_params.find("timestamp"); if (it != _get_shard_iterator_params.end()) { try { - request.SetTimestamp(Aws::Utils::DateTime(std::stol(it->second))); + // Cast to int64_t explicitly: on macOS `long` is 64-bit but not identical to + // int64_t, causing ambiguity between DateTime(int64_t) and DateTime(double). + request.SetTimestamp(Aws::Utils::DateTime(static_cast(std::stol(it->second)))); } catch (const std::exception&) { return Status::InternalError("Failed to apply get_shard_iterator.timestamp: {}", it->second); diff --git a/be/src/runtime/exec_env_init.cpp b/be/src/runtime/exec_env_init.cpp index 6b004013ffe9e9..f19b148e96e197 100644 --- a/be/src/runtime/exec_env_init.cpp +++ b/be/src/runtime/exec_env_init.cpp @@ -628,6 +628,17 @@ Status ExecEnv::init_mem_env() { } else { fd_number = static_cast(l.rlim_cur); } +#ifdef __APPLE__ + // On macOS, rlim_cur can be RLIM_INFINITY (INT64_MAX), which causes + // fd_number / 100 * percentage to overflow and crash cast_set. + // Linux kernels cap this via fs.nr_open (default 1M), so only macOS needs this. + { + constexpr uint64_t max_fd = UINT32_MAX >> 2; + if (fd_number > max_fd) { + fd_number = max_fd; + } + } +#endif int64_t segment_cache_capacity = 0; if (config::is_cloud_mode()) { diff --git a/be/src/runtime/runtime_state.h b/be/src/runtime/runtime_state.h index 619b842eef7de7..c185ee673adf12 100644 --- a/be/src/runtime/runtime_state.h +++ b/be/src/runtime/runtime_state.h @@ -415,8 +415,11 @@ class RuntimeState { BeExecVersionManager::check_be_exec_version(_query_options.be_exec_version)); return _query_options.be_exec_version; } - bool enable_local_shuffle() const { - return _query_options.__isset.enable_local_shuffle && _query_options.enable_local_shuffle; + bool plan_local_shuffle() const { + // If local shuffle is enabled and not planned by local shuffle planner, we should plan local shuffle in BE. + return _query_options.__isset.enable_local_shuffle && _query_options.enable_local_shuffle && + (!_query_options.__isset.enable_local_shuffle_planner || + !_query_options.enable_local_shuffle_planner); } MOCK_FUNCTION bool enable_local_exchange() const { diff --git a/be/src/udf/python/python_udf_runtime.cpp b/be/src/udf/python/python_udf_runtime.cpp index 9d687c43f40dd6..3fa94bf417ef4c 100644 --- a/be/src/udf/python/python_udf_runtime.cpp +++ b/be/src/udf/python/python_udf_runtime.cpp @@ -21,8 +21,6 @@ #include #include -#include - #include "common/logging.h" namespace doris { diff --git a/be/test/exec/pipeline/local_exchanger_test.cpp b/be/test/exec/pipeline/local_exchanger_test.cpp index 3051625a3ee530..22dacde456875f 100644 --- a/be/test/exec/pipeline/local_exchanger_test.cpp +++ b/be/test/exec/pipeline/local_exchanger_test.cpp @@ -89,8 +89,9 @@ TEST_F(LocalExchangerTest, ShuffleExchanger) { _local_states.resize(num_sources); auto profile = std::make_shared(""); auto shared_state = LocalExchangeSharedState::create_shared(num_partitions); - shared_state->exchanger = ShuffleExchanger::create_unique(num_sink, num_sources, num_partitions, - free_block_limit); + shared_state->exchanger = + ShuffleExchanger::create_unique(num_sink, num_sources, num_partitions, free_block_limit, + TLocalPartitionType::GLOBAL_EXECUTION_HASH_SHUFFLE); auto sink_dep = std::make_shared(0, 0, "LOCAL_EXCHANGE_SINK_DEPENDENCY", true); sink_dep->set_shared_state(shared_state.get()); shared_state->sink_deps.push_back(sink_dep); @@ -1175,8 +1176,9 @@ TEST_F(LocalExchangerTest, TestShuffleExchangerWrongMap) { _local_states.resize(num_sources); auto profile = std::make_shared(""); auto shared_state = LocalExchangeSharedState::create_shared(num_partitions); - shared_state->exchanger = ShuffleExchanger::create_unique(num_sink, num_sources, num_partitions, - free_block_limit); + shared_state->exchanger = + ShuffleExchanger::create_unique(num_sink, num_sources, num_partitions, free_block_limit, + TLocalPartitionType::GLOBAL_EXECUTION_HASH_SHUFFLE); auto sink_dep = std::make_shared(0, 0, "LOCAL_EXCHANGE_SINK_DEPENDENCY", true); sink_dep->set_shared_state(shared_state.get()); shared_state->sink_deps.push_back(sink_dep); diff --git a/be/test/exec/pipeline/pipeline_test.cpp b/be/test/exec/pipeline/pipeline_test.cpp index d5d0b5028d6300..714d668bffcb89 100644 --- a/be/test/exec/pipeline/pipeline_test.cpp +++ b/be/test/exec/pipeline/pipeline_test.cpp @@ -509,7 +509,7 @@ TEST_F(PipelineTest, PLAN_LOCAL_EXCHANGE) { DescriptorTbl* desc; OperatorPtr op; _build_fragment_context(); - EXPECT_EQ(_runtime_state.front()->enable_local_shuffle(), true); + EXPECT_EQ(_runtime_state.front()->plan_local_shuffle(), true); auto cur_pipe = _build_pipeline(parallelism); { auto tnode = TPlanNodeBuilder(_next_node_id(), TPlanNodeType::EXCHANGE_NODE) @@ -584,11 +584,12 @@ TEST_F(PipelineTest, PLAN_LOCAL_EXCHANGE) { } { cur_pipe->init_data_distribution(_runtime_state.back().get()); - EXPECT_EQ(cur_pipe->data_distribution().distribution_type, ExchangeType::HASH_SHUFFLE); + EXPECT_EQ(cur_pipe->data_distribution().distribution_type, + TLocalPartitionType::GLOBAL_EXECUTION_HASH_SHUFFLE); EXPECT_EQ(cur_pipe->sink() ->required_data_distribution(_runtime_state.back().get()) .distribution_type, - ExchangeType::NOOP); + TLocalPartitionType::NOOP); EXPECT_EQ(cur_pipe->need_to_local_exchange( cur_pipe->sink()->required_data_distribution(_runtime_state.back().get()), 1), @@ -597,11 +598,11 @@ TEST_F(PipelineTest, PLAN_LOCAL_EXCHANGE) { { cur_pipe->operators().front()->set_serial_operator(); cur_pipe->init_data_distribution(_runtime_state.back().get()); - EXPECT_EQ(cur_pipe->data_distribution().distribution_type, ExchangeType::NOOP); + EXPECT_EQ(cur_pipe->data_distribution().distribution_type, TLocalPartitionType::NOOP); EXPECT_EQ(cur_pipe->sink() ->required_data_distribution(_runtime_state.back().get()) .distribution_type, - ExchangeType::PASSTHROUGH); + TLocalPartitionType::PASSTHROUGH); EXPECT_EQ(cur_pipe->need_to_local_exchange( cur_pipe->sink()->required_data_distribution(_runtime_state.back().get()), 1), @@ -620,7 +621,7 @@ TEST_F(PipelineTest, PLAN_HASH_JOIN) { // Build pipeline DescriptorTbl* desc; _build_fragment_context(); - EXPECT_EQ(_runtime_state.front()->enable_local_shuffle(), true); + EXPECT_EQ(_runtime_state.front()->plan_local_shuffle(), true); { TTupleDescriptor tuple0 = TTupleDescriptorBuilder().set_id(0).build(); TSlotDescriptor slot0 = @@ -903,12 +904,12 @@ TEST_F(PipelineTest, PLAN_HASH_JOIN) { if (pip_idx == 1) { // Pipeline(ExchangeOperator(id=1, HASH_PARTITIONED) -> HashJoinBuildOperator(id=0)) EXPECT_EQ(_pipelines[pip_idx]->data_distribution().distribution_type, - ExchangeType::HASH_SHUFFLE); + TLocalPartitionType::GLOBAL_EXECUTION_HASH_SHUFFLE); EXPECT_EQ(_pipelines[pip_idx] ->sink() ->required_data_distribution(_runtime_state.back().get()) .distribution_type, - ExchangeType::HASH_SHUFFLE); + TLocalPartitionType::GLOBAL_EXECUTION_HASH_SHUFFLE); EXPECT_EQ(_pipelines[pip_idx]->need_to_local_exchange( _pipelines[pip_idx]->sink()->required_data_distribution( _runtime_state.back().get()), @@ -919,7 +920,7 @@ TEST_F(PipelineTest, PLAN_HASH_JOIN) { _pipelines[pip_idx]->set_data_distribution( _pipelines[pip_idx]->children().front()->data_distribution()); EXPECT_EQ(_pipelines[pip_idx]->data_distribution().distribution_type, - ExchangeType::HASH_SHUFFLE); + TLocalPartitionType::GLOBAL_EXECUTION_HASH_SHUFFLE); EXPECT_EQ(_pipelines[pip_idx]->need_to_local_exchange( _pipelines[pip_idx]->sink()->required_data_distribution( _runtime_state.back().get()), @@ -930,7 +931,7 @@ TEST_F(PipelineTest, PLAN_HASH_JOIN) { .back() ->required_data_distribution(_runtime_state.back().get()) .distribution_type, - ExchangeType::HASH_SHUFFLE); + TLocalPartitionType::GLOBAL_EXECUTION_HASH_SHUFFLE); EXPECT_EQ(_pipelines[pip_idx]->need_to_local_exchange( _pipelines[pip_idx]->operators().back()->required_data_distribution( _runtime_state.back().get()), diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/profile/SummaryProfile.java b/fe/fe-core/src/main/java/org/apache/doris/common/profile/SummaryProfile.java index 3ad317792a5a6f..7a32d339f4cee9 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/profile/SummaryProfile.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/profile/SummaryProfile.java @@ -113,6 +113,10 @@ public class SummaryProfile { public static final String READ_BYTES_PER_SECOND = "Read Bytes Per Second"; public static final String REMOTE_READ_BYTES_PER_SECOND = "Remote Read Bytes Per Second"; + // Written last in queryFinished(), after all BE fragment profiles are merged. + // Readers can poll for this field to know the profile is fully collected. + public static final String IS_PROFILE_COLLECTION_COMPLETED = "Is Profile Collection Completed"; + public static final String PARSE_SQL_TIME = "Parse SQL Time"; public static final String NEREIDS_LOCK_TABLE_TIME = "Nereids Lock Table Time"; public static final String NEREIDS_ANALYSIS_TIME = "Nereids Analysis Time"; @@ -514,6 +518,10 @@ public void update(Map summaryInfo) { // This method is used to display the final data status when the overall query ends. // This can avoid recalculating some strings and so on every time during the update process. public void queryFinished() { + // Mark profile collection as complete. This is written last, after all BE fragment + // profiles have been merged (called post-waitForFragmentsDone). Pollers can wait for + // this field to avoid reading a partial profile. + executionSummaryProfile.addInfoString(IS_PROFILE_COLLECTION_COMPLETED, "true"); if (assignedWeightPerBackend != null) { Map m = assignedWeightPerBackend.entrySet().stream() .sorted(Map.Entry.comparingByValue()) diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/tvf/source/TVFScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/tvf/source/TVFScanNode.java index ce22e96c80b58e..644bc0504ce6a8 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/tvf/source/TVFScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/tvf/source/TVFScanNode.java @@ -25,6 +25,7 @@ import org.apache.doris.catalog.TableIf; import org.apache.doris.common.DdlException; import org.apache.doris.common.MetaNotFoundException; +import org.apache.doris.common.Pair; import org.apache.doris.common.UserException; import org.apache.doris.common.util.LocationPath; import org.apache.doris.common.util.Util; @@ -33,6 +34,10 @@ import org.apache.doris.datasource.FileSplit.FileSplitCreator; import org.apache.doris.datasource.FileSplitter; import org.apache.doris.datasource.TableFormatType; +import org.apache.doris.nereids.glue.translator.PlanTranslatorContext; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeType; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeTypeRequire; +import org.apache.doris.planner.PlanNode; import org.apache.doris.planner.PlanNodeId; import org.apache.doris.planner.ScanContext; import org.apache.doris.qe.SessionVariable; @@ -212,4 +217,10 @@ protected void setScanParams(TFileRangeDesc rangeDesc, Split split) { public int getNumInstances() { return scanRangeLocations.size(); } + + @Override + public Pair enforceAndDeriveLocalExchange(PlanTranslatorContext translatorContext, + PlanNode parent, LocalExchangeTypeRequire parentRequire) { + return Pair.of(this, LocalExchangeType.PASSTHROUGH); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/NereidsPlanner.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/NereidsPlanner.java index 7952d009e5199e..b0827116e84495 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/NereidsPlanner.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/NereidsPlanner.java @@ -75,6 +75,7 @@ import org.apache.doris.nereids.trees.plans.physical.PhysicalRelation; import org.apache.doris.nereids.trees.plans.physical.PhysicalSqlCache; import org.apache.doris.nereids.trees.plans.physical.TopnFilter; +import org.apache.doris.planner.AddLocalExchange; import org.apache.doris.planner.PlanFragment; import org.apache.doris.planner.PlanNodeId; import org.apache.doris.planner.Planner; @@ -130,6 +131,7 @@ public class NereidsPlanner extends Planner { private DescriptorTable descTable; private FragmentIdMapping distributedPlans; + private PlanTranslatorContext planTranslatorContext; // The cost of optimized plan private double cost = 0; private LogicalPlanAdapter logicalPlanAdapter; @@ -602,7 +604,8 @@ protected void splitFragments(PhysicalPlan resultPlan) { return; } - PlanTranslatorContext planTranslatorContext = new PlanTranslatorContext(cascadesContext); + this.planTranslatorContext = new PlanTranslatorContext(cascadesContext); + PlanTranslatorContext planTranslatorContext = this.planTranslatorContext; PhysicalPlanTranslator physicalPlanTranslator = new PhysicalPlanTranslator(planTranslatorContext, statementContext.getConnectContext().getStatsErrorEstimator()); SessionVariable sessionVariable = cascadesContext.getConnectContext().getSessionVariable(); @@ -625,7 +628,8 @@ protected void splitFragments(PhysicalPlan resultPlan) { scanNodeList.addAll(planTranslatorContext.getScanNodes()); physicalRelations.addAll(planTranslatorContext.getPhysicalRelations()); descTable = planTranslatorContext.getDescTable(); - fragments = new ArrayList<>(planTranslatorContext.getPlanFragments()); + List planFragments = planTranslatorContext.getPlanFragments(); + fragments = new ArrayList<>(planFragments); boolean enableQueryCache = sessionVariable.getEnableQueryCache(); for (int seq = 0; seq < fragments.size(); seq++) { @@ -708,6 +712,18 @@ protected void distribute(PhysicalPlan physicalPlan, ExplainLevel explainLevel) splitFragments(physicalPlan); doDistribute(canUseNereidsDistributePlanner, explainLevel); + + SessionVariable sessionVariable = cascadesContext.getConnectContext().getSessionVariable(); + if (sessionVariable.isEnableLocalShufflePlanner() && sessionVariable.isEnableLocalShuffle()) { + addLocalExchangeAfterDistribute(); + } + } + + private void addLocalExchangeAfterDistribute() { + AddLocalExchange adder = new AddLocalExchange(); + if (distributedPlans != null && !distributedPlans.isEmpty()) { + adder.addLocalExchange(distributedPlans, planTranslatorContext); + } } protected void doDistribute(boolean canUseNereidsDistributePlanner, ExplainLevel explainLevel) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java index 7b01ce6c6b143c..12e98d9954e1fb 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java @@ -90,6 +90,7 @@ import org.apache.doris.nereids.properties.DistributionSpecStorageAny; import org.apache.doris.nereids.properties.DistributionSpecStorageGather; import org.apache.doris.nereids.properties.OrderKey; +import org.apache.doris.nereids.properties.PhysicalProperties; import org.apache.doris.nereids.rules.implementation.LogicalWindowToPhysicalWindow.WindowFrameGroup; import org.apache.doris.nereids.rules.rewrite.MergeLimits; import org.apache.doris.nereids.stats.StatsErrorEstimator; @@ -350,7 +351,7 @@ public PlanFragment visitPhysicalDistribute(PhysicalDistribute d PlanTranslatorContext context) { Plan upstream = distribute.child(); // now they're in one fragment but will be split by ExchangeNode. PlanFragment upstreamFragment = upstream.accept(this, context); - List> upstreamDistributeExprs = getDistributeExprs(upstream); + List> upstreamDistributeExprs = getChildrenDistributeExprs(upstream); DistributionSpec targetDistribution = distribute.getDistributionSpec(); @@ -398,6 +399,7 @@ public PlanFragment visitPhysicalDistribute(PhysicalDistribute d // target data partition DataPartition targetDataPartition = toDataPartition(targetDistribution, validOutputIds, context); exchangeNode.setPartitionType(targetDataPartition.getType()); + exchangeNode.setDistributeExprLists(getDistributeExpr(distribute)); exchangeNode.setChildrenDistributeExprLists(upstreamDistributeExprs); // its source partition is targetDataPartition. and outputPartition is UNPARTITIONED now, will be set when // visit its SinkNode @@ -793,6 +795,7 @@ public PlanFragment visitPhysicalFileScan(PhysicalFileScan fileScan, PlanTransla fileScan.getTableSnapshot().ifPresent(fileQueryScanNode::setQueryTableSnapshot); fileScan.getScanParams().ifPresent(fileQueryScanNode::setScanParams); } + scanNode.setDistributeExprLists(getDistributeExpr(fileScan)); return getPlanFragmentForPhysicalFileScan(fileScan, context, scanNode); } @@ -813,6 +816,7 @@ public PlanFragment visitPhysicalEmptyRelation(PhysicalEmptyRelation emptyRelati PlanFragment planFragment = createPlanFragment(emptySetNode, DataPartition.UNPARTITIONED, emptyRelation); context.addPlanFragment(planFragment); + emptySetNode.setDistributeExprLists(getDistributeExpr(emptyRelation)); updateLegacyPlanIdToPhysicalPlan(planFragment.getPlanRoot(), emptyRelation); return planFragment; } @@ -837,6 +841,7 @@ public PlanFragment visitPhysicalHudiScan(PhysicalHudiScan hudiScan, PlanTransla hudiScanNode.setQueryTableSnapshot(hudiScan.getTableSnapshot().get()); } hudiScanNode.setSelectedPartitions(hudiScan.getSelectedPartitions()); + hudiScanNode.setDistributeExprLists(getDistributeExpr(hudiScan)); return getPlanFragmentForPhysicalFileScan(hudiScan, context, hudiScanNode); } @@ -858,6 +863,7 @@ private PlanFragment getPlanFragmentForPhysicalFileScan(PhysicalFileScan fileSca DataPartition dataPartition = DataPartition.RANDOM; PlanFragment planFragment = createPlanFragment(scanNode, dataPartition, fileScan); context.addPlanFragment(planFragment); + scanNode.setDistributeExprLists(getDistributeExpr(fileScan)); updateLegacyPlanIdToPhysicalPlan(planFragment.getPlanRoot(), fileScan); return planFragment; } @@ -896,6 +902,7 @@ private PlanFragment computePhysicalOlapScan(PhysicalOlapScan olapScan, PlanTran context.getScanContext()); olapScanNode.setNereidsId(olapScan.getId()); context.getNereidsIdToPlanNodeIdMap().put(olapScan.getId(), olapScanNode.getId()); + olapScanNode.setDistributeExprLists(getDistributeExpr(olapScan)); // translate score topn info if (!olapScan.getScoreOrderKeys().isEmpty()) { @@ -1021,6 +1028,8 @@ public PlanFragment visitPhysicalOneRowRelation(PhysicalOneRowRelation oneRowRel PlanFragment planFragment = createPlanFragment(unionNode, DataPartition.UNPARTITIONED, oneRowRelation); context.addPlanFragment(planFragment); updateLegacyPlanIdToPhysicalPlan(planFragment.getPlanRoot(), oneRowRelation); + + unionNode.setDistributeExprLists(getDistributeExpr(oneRowRelation)); return planFragment; } @@ -1053,6 +1062,7 @@ public PlanFragment visitPhysicalSchemaScan(PhysicalSchemaScan schemaScan, PlanT context.addScanNode(scanNode, schemaScan); PlanFragment planFragment = createPlanFragment(scanNode, DataPartition.RANDOM, schemaScan); context.addPlanFragment(planFragment); + scanNode.setDistributeExprLists(getDistributeExpr(schemaScan)); updateLegacyPlanIdToPhysicalPlan(planFragment.getPlanRoot(), schemaScan); return planFragment; } @@ -1069,6 +1079,7 @@ public PlanFragment visitPhysicalWorkTableReference(PhysicalWorkTableReference w PlanFragment planFragment = createPlanFragment(scanNode, DataPartition.RANDOM, workTableReference); context.addPlanFragment(planFragment); + scanNode.setDistributeExprLists(getDistributeExpr(workTableReference)); updateLegacyPlanIdToPhysicalPlan(planFragment.getPlanRoot(), workTableReference); return planFragment; } @@ -1089,6 +1100,7 @@ public PlanFragment visitPhysicalTVFRelation(PhysicalTVFRelation tvfRelation, Pl TableValuedFunctionIf catalogFunction = tvfRelation.getFunction().getCatalogFunction(); SessionVariable sv = ConnectContext.get().getSessionVariable(); ScanNode scanNode = catalogFunction.getScanNode(context.nextPlanNodeId(), tupleDescriptor, sv); + scanNode.setDistributeExprLists(getDistributeExpr(tvfRelation)); scanNode.setNereidsId(tvfRelation.getId()); context.getNereidsIdToPlanNodeIdMap().put(tvfRelation.getId(), scanNode.getId()); Utils.execWithUncheckedException(scanNode::init); @@ -1127,7 +1139,7 @@ public PlanFragment visitPhysicalHashAggregate( PlanTranslatorContext context) { PlanFragment inputPlanFragment = aggregate.child(0).accept(this, context); - List> distributeExprLists = getDistributeExprs(aggregate.child(0)); + List> distributeExprLists = getChildrenDistributeExprs(aggregate.child(0)); List groupByExpressions = aggregate.getGroupByExpressions(); List outputExpressions = aggregate.getOutputExpressions(); @@ -1191,6 +1203,7 @@ public PlanFragment visitPhysicalHashAggregate( AggregationNode aggregationNode = new AggregationNode(context.nextPlanNodeId(), inputPlanFragment.getPlanRoot(), aggInfo); + aggregationNode.setDistributeExprLists(getDistributeExpr(aggregate)); aggregationNode.setChildrenDistributeExprLists(distributeExprLists); aggregationNode.setNereidsId(aggregate.getId()); @@ -1394,7 +1407,7 @@ public PlanFragment visitPhysicalStorageLayerAggregate( public PlanFragment visitPhysicalAssertNumRows(PhysicalAssertNumRows assertNumRows, PlanTranslatorContext context) { PlanFragment currentFragment = assertNumRows.child().accept(this, context); - List> distributeExprLists = getDistributeExprs(assertNumRows.child()); + List> distributeExprLists = getChildrenDistributeExprs(assertNumRows.child()); // we need convert all columns to nullable in AssertNumRows node // create a tuple for AssertNumRowsNode @@ -1404,6 +1417,7 @@ public PlanFragment visitPhysicalAssertNumRows(PhysicalAssertNumRows gener .collect(Collectors.toCollection(ArrayList::new)); TableFunctionNode tableFunctionNode = new TableFunctionNode(context.nextPlanNodeId(), currentFragment.getPlanRoot(), tupleDescriptor.getId(), functionCalls, outputSlotIds, conjuncts); + tableFunctionNode.setDistributeExprLists(getDistributeExpr(generate)); tableFunctionNode.setNereidsId(generate.getId()); context.getNereidsIdToPlanNodeIdMap().put(generate.getId(), tableFunctionNode.getId()); addPlanRoot(currentFragment, tableFunctionNode, generate); @@ -1627,7 +1643,8 @@ public PlanFragment visitPhysicalHashJoin( // NOTICE: We must visit from right to left, to ensure the last fragment is root fragment PlanFragment rightFragment = hashJoin.child(1).accept(this, context); PlanFragment leftFragment = hashJoin.child(0).accept(this, context); - List> distributeExprLists = getDistributeExprs(physicalHashJoin.left(), physicalHashJoin.right()); + List> distributeExprLists + = getChildrenDistributeExprs(physicalHashJoin.left(), physicalHashJoin.right()); if (JoinUtils.shouldNestedLoopJoin(hashJoin)) { throw new RuntimeException("Physical hash join could not execute without equal join condition."); @@ -1678,6 +1695,7 @@ public PlanFragment visitPhysicalHashJoin( markConjuncts, hashJoin.isMarkJoin()); hashJoinNode.setNereidsId(hashJoin.getId()); context.getNereidsIdToPlanNodeIdMap().put(hashJoin.getId(), hashJoinNode.getId()); + hashJoinNode.setDistributeExprLists(getDistributeExpr(hashJoin)); hashJoinNode.setChildrenDistributeExprLists(distributeExprLists); PlanFragment currentFragment = connectJoinNode(hashJoinNode, leftFragment, rightFragment, context, hashJoin); @@ -1897,7 +1915,8 @@ public PlanFragment visitPhysicalNestedLoopJoin( // PhysicalPlan plan, PlanVisitor visitor, Context context). PlanFragment rightFragment = nestedLoopJoin.child(1).accept(this, context); PlanFragment leftFragment = nestedLoopJoin.child(0).accept(this, context); - List> distributeExprLists = getDistributeExprs(nestedLoopJoin.child(0), nestedLoopJoin.child(1)); + List> distributeExprLists + = getChildrenDistributeExprs(nestedLoopJoin.child(0), nestedLoopJoin.child(1)); PlanNode leftFragmentPlanRoot = leftFragment.getPlanRoot(); PlanNode rightFragmentPlanRoot = rightFragment.getPlanRoot(); @@ -1918,6 +1937,7 @@ public PlanFragment visitPhysicalNestedLoopJoin( nestedLoopJoin.isMarkJoin()); nestedLoopJoinNode.setNereidsId(nestedLoopJoin.getId()); context.getNereidsIdToPlanNodeIdMap().put(nestedLoopJoin.getId(), nestedLoopJoinNode.getId()); + nestedLoopJoinNode.setDistributeExprLists(getDistributeExpr(nestedLoopJoin)); nestedLoopJoinNode.setChildrenDistributeExprLists(distributeExprLists); if (nestedLoopJoin.getStats() != null) { nestedLoopJoinNode.setCardinality((long) nestedLoopJoin.getStats().getRowCount()); @@ -2133,9 +2153,10 @@ public PlanFragment visitPhysicalLimit(PhysicalLimit physicalLim public PlanFragment visitPhysicalPartitionTopN(PhysicalPartitionTopN partitionTopN, PlanTranslatorContext context) { PlanFragment inputFragment = partitionTopN.child(0).accept(this, context); - List> distributeExprLists = getDistributeExprs(partitionTopN.child(0)); + List> distributeExprLists = getChildrenDistributeExprs(partitionTopN.child(0)); PartitionSortNode partitionSortNode = translatePartitionSortNode( partitionTopN, inputFragment.getPlanRoot(), context); + partitionSortNode.setDistributeExprLists(getDistributeExpr(partitionTopN)); partitionSortNode.setChildrenDistributeExprLists(distributeExprLists); addPlanRoot(inputFragment, partitionSortNode, partitionTopN); // in pipeline engine, we use parallel scan by default, but it broke the rule of data distribution @@ -2326,12 +2347,13 @@ public PlanFragment visitPhysicalRecursiveUnion(PhysicalRecursiveUnion> distributeExprLists = getDistributeExprs(recursiveCte.children().toArray(new Plan[0])); + List> distributeExprLists = getChildrenDistributeExprs(recursiveCte.children().toArray(new Plan[0])); TupleDescriptor setTuple = generateTupleDesc(recursiveCte.getOutput(), null, context); RecursiveCteNode recursiveCteNode = new RecursiveCteNode(context.nextPlanNodeId(), setTuple.getId(), recursiveCte.getCteName(), recursiveCte.isUnionAll()); + recursiveCteNode.setDistributeExprLists(getDistributeExpr(recursiveCte)); recursiveCteNode.setChildrenDistributeExprLists(distributeExprLists); recursiveCteNode.setNereidsId(recursiveCte.getId()); context.getNereidsIdToPlanNodeIdMap().put(recursiveCte.getId(), recursiveCteNode.getId()); @@ -2399,7 +2421,7 @@ public PlanFragment visitPhysicalSetOperation( for (Plan plan : setOperation.children()) { childrenFragments.add(plan.accept(this, context)); } - List> distributeExprLists = getDistributeExprs(setOperation.children().toArray(new Plan[0])); + List> distributeExprLists = getChildrenDistributeExprs(setOperation.children().toArray(new Plan[0])); TupleDescriptor setTuple = generateTupleDesc(setOperation.getOutput(), null, context); SetOperationNode setOperationNode; @@ -2413,6 +2435,7 @@ public PlanFragment visitPhysicalSetOperation( } else { throw new RuntimeException("not support set operation type " + setOperation); } + setOperationNode.setDistributeExprLists(getDistributeExpr(setOperation)); setOperationNode.setChildrenDistributeExprLists(distributeExprLists); setOperationNode.setNereidsId(setOperation.getId()); context.getNereidsIdToPlanNodeIdMap().put(setOperation.getId(), setOperationNode.getId()); @@ -2522,12 +2545,13 @@ public PlanFragment visitPhysicalExcept(PhysicalExcept except, PlanTranslatorCon public PlanFragment visitPhysicalQuickSort(PhysicalQuickSort sort, PlanTranslatorContext context) { PlanFragment inputFragment = sort.child(0).accept(this, context); - List> distributeExprLists = getDistributeExprs(sort.child(0)); + List> distributeExprLists = getChildrenDistributeExprs(sort.child(0)); // 2. According to the type of sort, generate physical plan if (!sort.getSortPhase().isMerge()) { // For localSort or Gather->Sort, we just need to add sortNode SortNode sortNode = translateSortNode(sort, inputFragment.getPlanRoot(), context); + sortNode.setDistributeExprLists(getDistributeExpr(sort)); sortNode.setChildrenDistributeExprLists(distributeExprLists); addPlanRoot(inputFragment, sortNode, sort); } else { @@ -2544,6 +2568,7 @@ public PlanFragment visitPhysicalQuickSort(PhysicalQuickSort sor inputFragment.getChild(0).getSink().setMerge(true); } sortNode.setMergeByExchange(); + sortNode.setDistributeExprLists(getDistributeExpr(sort)); sortNode.setChildrenDistributeExprLists(distributeExprLists); } return inputFragment; @@ -2552,7 +2577,7 @@ public PlanFragment visitPhysicalQuickSort(PhysicalQuickSort sor @Override public PlanFragment visitPhysicalTopN(PhysicalTopN topN, PlanTranslatorContext context) { PlanFragment inputFragment = topN.child(0).accept(this, context); - List> distributeExprLists = getDistributeExprs(topN.child(0)); + List> distributeExprLists = getChildrenDistributeExprs(topN.child(0)); // 2. According to the type of sort, generate physical plan if (!topN.getSortPhase().isMerge()) { // For localSort or Gather->Sort, we just need to add TopNNode @@ -2588,6 +2613,9 @@ public PlanFragment visitPhysicalTopN(PhysicalTopN topN, PlanTra } } } + sortNode.setDistributeExprLists( + CollectionUtils.isEmpty(distributeExprLists) ? null : distributeExprLists.get(0) + ); sortNode.setChildrenDistributeExprLists(distributeExprLists); addPlanRoot(inputFragment, sortNode, topN); } else { @@ -2601,6 +2629,9 @@ public PlanFragment visitPhysicalTopN(PhysicalTopN topN, PlanTra return inputFragment; } ExchangeNode exchangeNode = (ExchangeNode) inputFragment.getPlanRoot(); + exchangeNode.setDistributeExprLists( + CollectionUtils.isEmpty(distributeExprLists) ? null : distributeExprLists.get(0) + ); exchangeNode.setChildrenDistributeExprLists(distributeExprLists); exchangeNode.setMergeInfo(((SortNode) exchangeNode.getChild(0)).getSortInfo()); if (inputFragment.hasChild(0) && inputFragment.getChild(0).getSink() != null) { @@ -2617,7 +2648,7 @@ public PlanFragment visitPhysicalTopN(PhysicalTopN topN, PlanTra @Override public PlanFragment visitPhysicalRepeat(PhysicalRepeat repeat, PlanTranslatorContext context) { PlanFragment inputPlanFragment = repeat.child(0).accept(this, context); - List> distributeExprLists = getDistributeExprs(repeat.child(0)); + List> distributeExprLists = getChildrenDistributeExprs(repeat.child(0)); List flattenGroupingExpressions = repeat.getGroupByExpressions(); Set preRepeatExpressions = Sets.newLinkedHashSet(); @@ -2668,6 +2699,7 @@ public PlanFragment visitPhysicalRepeat(PhysicalRepeat repeat, P allSlotId, repeat.computeGroupingFunctionsValues()); repeatNode.setNereidsId(repeat.getId()); context.getNereidsIdToPlanNodeIdMap().put(repeat.getId(), repeatNode.getId()); + repeatNode.setDistributeExprLists(getDistributeExpr(repeat)); repeatNode.setChildrenDistributeExprLists(distributeExprLists); addPlanRoot(inputPlanFragment, repeatNode, repeat); updateLegacyPlanIdToPhysicalPlan(inputPlanFragment.getPlanRoot(), repeat); @@ -2678,7 +2710,7 @@ public PlanFragment visitPhysicalRepeat(PhysicalRepeat repeat, P public PlanFragment visitPhysicalWindow(PhysicalWindow physicalWindow, PlanTranslatorContext context) { PlanFragment inputPlanFragment = physicalWindow.child(0).accept(this, context); - List> distributeExprLists = getDistributeExprs(physicalWindow.child(0)); + List> distributeExprLists = getChildrenDistributeExprs(physicalWindow.child(0)); // 1. translate to old optimizer variable // variable in Nereids @@ -2743,6 +2775,7 @@ public PlanFragment visitPhysicalWindow(PhysicalWindow physicalW ); analyticEvalNode.setNereidsId(physicalWindow.getId()); context.getNereidsIdToPlanNodeIdMap().put(physicalWindow.getId(), analyticEvalNode.getId()); + analyticEvalNode.setDistributeExprLists(getDistributeExpr(physicalWindow)); analyticEvalNode.setChildrenDistributeExprLists(distributeExprLists); PlanNode root = inputPlanFragment.getPlanRoot(); if (root instanceof SortNode) { @@ -2780,6 +2813,7 @@ public PlanFragment visitPhysicalLazyMaterialize(PhysicalLazyMaterialize rowStoreFlags = new ArrayList<>(); for (Relation relation : materialize.getRelations()) { @@ -2819,6 +2853,7 @@ public PlanFragment visitPhysicalLazyMaterializeTVFScan(PhysicalLazyMaterializeT TableValuedFunctionIf catalogFunction = tvfRelation.getFunction().getCatalogFunction(); SessionVariable sv = ConnectContext.get().getSessionVariable(); ScanNode scanNode = catalogFunction.getScanNode(context.nextPlanNodeId(), tupleDescriptor, sv); + scanNode.setDistributeExprLists(getDistributeExpr(tvfRelation)); scanNode.setNereidsId(tvfRelation.getId()); context.getNereidsIdToPlanNodeIdMap().put(tvfRelation.getId(), scanNode.getId()); Utils.execWithUncheckedException(scanNode::init); @@ -3309,7 +3344,7 @@ private boolean findOlapScanNodesByPassExchangeNode(PlanNode root) { return false; } - private List> getDistributeExprs(Plan ... children) { + private List> getChildrenDistributeExprs(Plan ... children) { List> distributeExprLists = Lists.newArrayList(); for (Plan child : children) { DistributionSpec spec = ((PhysicalPlan) child).getPhysicalProperties().getDistributionSpec(); @@ -3318,6 +3353,16 @@ private List> getDistributeExprs(Plan ... children) { return distributeExprLists; } + private List getDistributeExpr(PhysicalPlan physicalPlan) { + // physicalProperties is set during property derivation; guard against translator-only + // paths and test fixtures that bypass the derivation step. + PhysicalProperties props = physicalPlan.getPhysicalProperties(); + if (props == null) { + return Lists.newArrayList(); + } + return getDistributeExpr(physicalPlan.getOutputExprIds(), props.getDistributionSpec()); + } + private List getDistributeExpr(List childOutputIds, DistributionSpec spec) { if (spec instanceof DistributionSpecHash) { DistributionSpecHash distributionSpecHash = (DistributionSpecHash) spec; diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PlanTranslatorContext.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PlanTranslatorContext.java index 4118399b91d75b..223d84888bd7ab 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PlanTranslatorContext.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PlanTranslatorContext.java @@ -109,10 +109,36 @@ public class PlanTranslatorContext { private final Map> statsUnknownColumnsMap = Maps.newHashMap(); + // Per-node "is there a serial operator between me and the pipeline's sink" flag. + // Mirrors BE's any_of(operators[idx..end], is_serial_operator) check used by + // _add_local_exchange / need_to_local_exchange to skip LE insertion when an ancestor + // in the same pipeline is already serial (the whole pipeline runs with 1 task, so an + // extra LE would be a no-op). Written by AddLocalExchange entry + PlanNode.enforceRequire + // step 1 (root → leaf during traversal). Read by PlanNode.enforceRequire step 4 (Layer 1 + // skip) and by child overrides that compute their require. Reset to false at fragment + // root and across pipeline boundaries (see shouldResetSerialFlagForChild). + private final Map serialAncestorInPipelineMap = Maps.newHashMap(); + + // Per-node "is there a downstream operator that depends on hash distribution for + // correctness, with HASH/NOOP path connecting it to me" flag. Mirrors BE's + // _followed_by_shuffled_operator propagation in pipeline_fragment_context.cpp. + // Written by PlanNode.enforceRequire step 1b (root → leaf). Read by SetOperationNode + // to decide whether to propagate hash requirement to its inputs (only when downstream + // needs shuffle for correctness, not just for performance like StreamingAgg pre-agg). + private final Map shuffledAncestorMap = Maps.newHashMap(); + + // Whether the current fragment uses LocalShuffleAssignedJob (pooling scan with + // ignoreDataDistribution → _parallel_instances=1 in BE). When true, serial operators + // indicate real pipeline bottlenecks needing PASSTHROUGH fan-out (heavy_ops). private boolean isTopMaterializeNode = true; private final Set virtualColumnIds = Sets.newHashSet(); + // Used by AddLocalExchange: tracks whether any serial operator exists + // between the current node and the pipeline's sink (downstream serial check). + // Pipeline-splitting nodes reset this to false; serial non-splitting nodes set it to true. + private boolean serialAncestorInCurrentPipeline; + /** PlanTranslatorContext */ public PlanTranslatorContext(CascadesContext ctx) { this.connectContext = ctx.getConnectContext(); @@ -234,6 +260,22 @@ public PlanNodeId nextPlanNodeId() { return nodeIdGenerator.getNextId(); } + public void setHasSerialAncestorInPipeline(PlanNode node, boolean hasSerialAncestorInPipeline) { + serialAncestorInPipelineMap.put(node.getId(), hasSerialAncestorInPipeline); + } + + public boolean hasSerialAncestorInPipeline(PlanNode node) { + return serialAncestorInPipelineMap.getOrDefault(node.getId(), false); + } + + public void setHasShuffleForCorrectnessAncestor(PlanNode node, boolean value) { + shuffledAncestorMap.put(node.getId(), value); + } + + public boolean hasShuffleForCorrectnessAncestor(PlanNode node) { + return shuffledAncestorMap.getOrDefault(node.getId(), false); + } + public SlotDescriptor addSlotDesc(TupleDescriptor t) { return descTable.addSlotDescriptor(t); } @@ -373,4 +415,12 @@ public void setTopMaterializeNode(boolean topMaterializeNode) { public Set getVirtualColumnIds() { return virtualColumnIds; } + + public boolean hasSerialAncestorInCurrentPipeline() { + return serialAncestorInCurrentPipeline; + } + + public void setSerialAncestorInCurrentPipeline(boolean serialAncestorInCurrentPipeline) { + this.serialAncestorInCurrentPipeline = serialAncestorInCurrentPipeline; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/PlanType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/PlanType.java index 09acde2ad6f5a2..2f5dcbb7cfce9f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/PlanType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/PlanType.java @@ -144,6 +144,7 @@ public enum PlanType { PHYSICAL_RECURSIVE_CTE, PHYSICAL_RECURSIVE_CTE_RECURSIVE_CHILD, PHYSICAL_DISTRIBUTE, + PHYSICAL_LOCAL_DISTRIBUTE, PHYSICAL_EXCEPT, PHYSICAL_FILTER, PHYSICAL_GENERATE, diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/insert/InsertIntoTableCommand.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/insert/InsertIntoTableCommand.java index 89e4579c45837a..b66dc2a35c070e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/insert/InsertIntoTableCommand.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/insert/InsertIntoTableCommand.java @@ -82,8 +82,10 @@ import org.apache.doris.nereids.trees.plans.visitor.PlanVisitor; import org.apache.doris.nereids.util.RelationUtil; import org.apache.doris.planner.DataSink; +import org.apache.doris.planner.LocalExchangeNode; import org.apache.doris.planner.OlapScanNode; import org.apache.doris.planner.PlanFragment; +import org.apache.doris.planner.PlanNode; import org.apache.doris.planner.ScanNode; import org.apache.doris.qe.ConnectContext; import org.apache.doris.qe.ConnectContext.ConnectType; @@ -676,8 +678,18 @@ private void applyInsertPlanStatistic(FastInsertIntoValuesPlanner planner) { return; } for (PlanFragment fragment : planner.getFragments()) { - if (fragment.getPlanRoot() instanceof FileScanNode) { - FileScanNode fileScanNode = (FileScanNode) fragment.getPlanRoot(); + // The FE local-shuffle planner may wrap the fragment root with one or more + // LocalExchangeNodes (e.g. a PASSTHROUGH fan-out above a serial FileScanNode). + // Peel those off before checking the actual operator, otherwise streaming / + // S3 INSERTs leave LoadStatistic.fileNum and totalFileSizeB at 0 and tests + // like job_p0.streaming_job.test_streaming_insert_job that inspect + // loadStatistic.fileNumber / fileSize fail. + PlanNode root = fragment.getPlanRoot(); + while (root instanceof LocalExchangeNode && !root.getChildren().isEmpty()) { + root = root.getChild(0); + } + if (root instanceof FileScanNode) { + FileScanNode fileScanNode = (FileScanNode) root; // Prefer distinct file count; fall back to split count for batch-mode scans. int fileNum = fileScanNode.getSelectedFileNum() >= 0 ? fileScanNode.getSelectedFileNum() diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/distribute/worker/job/UnassignedScanBucketOlapTableJob.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/distribute/worker/job/UnassignedScanBucketOlapTableJob.java index b6a450c93a1b1a..b7f654cd023852 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/distribute/worker/job/UnassignedScanBucketOlapTableJob.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/distribute/worker/job/UnassignedScanBucketOlapTableJob.java @@ -270,7 +270,7 @@ private Pair splitSerialScanSource(BucketSca Map> serialScanRanges = Maps.newLinkedHashMap(); Map> nonSerialScanRanges = Maps.newLinkedHashMap(); for (ScanNode scanNode : scanNodes) { - if (scanNode.isSerialOperator()) { + if (scanNode.isSerialNode()) { collectScanRanges(totalScanSource, scanNode, serialScanRanges); } else { collectScanRanges(totalScanSource, scanNode, nonSerialScanRanges); diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/AddLocalExchange.java b/fe/fe-core/src/main/java/org/apache/doris/planner/AddLocalExchange.java new file mode 100644 index 00000000000000..bb4a883d6080d4 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/AddLocalExchange.java @@ -0,0 +1,182 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.planner; + +import org.apache.doris.common.Pair; +import org.apache.doris.nereids.glue.translator.PlanTranslatorContext; +import org.apache.doris.nereids.trees.plans.distribute.DistributedPlan; +import org.apache.doris.nereids.trees.plans.distribute.FragmentIdMapping; +import org.apache.doris.nereids.trees.plans.distribute.PipelineDistributedPlan; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeType; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeTypeRequire; +import org.apache.doris.planner.LocalExchangeNode.RequireHash; + +/** + * FE-side local exchange planner — inserts {@link LocalExchangeNode} into each fragment's + * plan tree so that within-fragment data redistribution is decided at planning time + * instead of at BE pipeline-build time. + * + *

When this runs

+ * Invoked from {@code NereidsPlanner.addLocalExchangeAfterDistribute()} right after + * {@code DistributePlanner} has assigned instances to fragments and before the plan is + * serialized to BE. Gated by session variable {@code enable_local_shuffle_planner} + * (default true) and {@code enable_local_shuffle}; when either is off this pass is + * skipped entirely and BE falls back to its own {@code _plan_local_exchange}. The two + * paths are mutually exclusive: BE consults {@code runtime_state.h::plan_local_shuffle()} + * to know whether it should plan LE itself. + * + *

What it changes

+ *
    + *
  • For each fragment with {@code maxPerBeInstances > 1}, walks the plan tree + * bottom-up via {@link PlanNode#enforceAndDeriveLocalExchange} and inserts + * LocalExchangeNodes where children's output distribution doesn't satisfy the + * parent's requirement.
  • + *
  • May wrap the fragment root with an extra PASSTHROUGH LE so the data sink + * (DataStreamSink / OlapTableSink) runs with the full instance count even when + * the root operator is serial — see {@link #addLocalExchangeForFragment}.
  • + *
  • Does NOT modify the fragment sink itself, fragment boundaries, or instance + * assignment.
  • + *
+ * + *

Per-BE instance semantics

+ * Skips fragments where every BE has at most 1 instance. Using a global instance count + * would insert LE for "2 BEs × 1 instance" cases, which BE's own + * {@code _plan_local_exchange} would not — leading to pipeline task-count mismatch and + * deadlock. See {@link #addLocalExchange}. + * + *

Reading order

+ * Start with {@link PlanNode#enforceRequire} (the recursion engine), then individual + * {@code enforceAndDeriveLocalExchange} overrides on PlanNode subclasses. + */ +public class AddLocalExchange { + /** addLocalExchange with distributed plans, skipping single-instance fragments. + * BE's _plan_local_exchange checks _num_instances which is the per-BE instance count. + * With _num_instances<=1 all pipelines on that BE have 1 task so local exchange is a no-op. + * We must use the same per-BE semantics: skip when every BE has at most 1 instance. + * Using global instanceCount would insert LE for fragments where 2 BEs each have 1 instance + * (global=2, per-BE=1), causing pipeline task mismatch and deadlock. */ + public void addLocalExchange(FragmentIdMapping distributedPlans, + PlanTranslatorContext context) { + for (DistributedPlan plan : distributedPlans.values()) { + PipelineDistributedPlan pipePlan = (PipelineDistributedPlan) plan; + long maxPerBeInstances = pipePlan.getInstanceJobs().stream() + .collect(java.util.stream.Collectors.groupingBy( + j -> j.getAssignedWorker().id(), java.util.stream.Collectors.counting())) + .values().stream().mapToLong(Long::longValue).max().orElse(0); + if (maxPerBeInstances <= 1) { + continue; + } + PlanFragment fragment = pipePlan.getFragmentJob().getFragment(); + addLocalExchangeForFragment(fragment, context); + } + } + + private void addLocalExchangeForFragment(PlanFragment fragment, PlanTranslatorContext context) { + DataSink sink = fragment.getSink(); + LocalExchangeTypeRequire require = sink == null + ? LocalExchangeTypeRequire.noRequire() : sink.getLocalExchangeTypeRequire(); + PlanNode root = fragment.getPlanRoot(); + context.setHasSerialAncestorInPipeline(root, false); + Pair output = root + .enforceAndDeriveLocalExchange(context, null, require); + PlanNode newRoot = output.first; + // The fragment data sink (DataStreamSink, OlapTableSink) runs in the same pipeline + // as the root. If the root will be serial on BE, the sink pipeline has 1 task — + // only instance 0 sends data, others hang or miss writes. + // Insert PASSTHROUGH fan-out so sink runs with _num_instances tasks. + // This matches BE-native's default required_data_distribution(): + // _child->is_serial_operator() ? PASSTHROUGH : NOOP + if (newRoot.isSerialOperatorOnBe(context.getConnectContext())) { + newRoot = new LocalExchangeNode(context.nextPlanNodeId(), newRoot, + LocalExchangeType.PASSTHROUGH, null); + } + if (newRoot != root) { + fragment.setPlanRoot(newRoot); + } + validateNoSerialWithoutLocalExchange(fragment.getPlanRoot(), context.getConnectContext()); + } + + /** + * In a local-shuffle fragment, the root check above guarantees the root pipeline + * has N tasks. Any serial operator reduces its pipeline to 1 task. If this serial + * operator feeds into a non-serial parent without LocalExchangeNode in between, + * some pipelines have 1 task while others have N → shared_state mismatch, data loss. + * + * Serial→serial chains are fine (all at 1 task, consistent). Only the transition + * from serial to non-serial needs LE to restore parallelism. + */ + private void validateNoSerialWithoutLocalExchange(PlanNode node, + org.apache.doris.qe.ConnectContext context) { + for (PlanNode child : node.getChildren()) { + validateNoSerialWithoutLocalExchange(child, context); + if (child.isSerialOperatorOnBe(context) + && !(child instanceof LocalExchangeNode) + && !(node instanceof LocalExchangeNode) + && !(node instanceof ExchangeNode) + && !node.isSerialOperatorOnBe(context)) { + org.apache.logging.log4j.LogManager.getLogger(AddLocalExchange.class).warn( + "Serial " + child.getClass().getSimpleName() + "(id=" + child.getId() + + ") feeds into non-serial " + node.getClass().getSimpleName() + + "(id=" + node.getId() + ") without LocalExchangeNode" + + " in fragment " + node.getFragment().getFragmentId() + + ". FE should insert LocalExchangeNode to restore parallelism."); + } + } + } + + public static boolean isColocated(PlanNode plan) { + if (plan instanceof AggregationNode) { + return ((AggregationNode) plan).isColocate() && isColocated(plan.getChild(0)); + } else if (plan instanceof OlapScanNode) { + return true; + } else if (plan instanceof SelectNode) { + return isColocated(plan.getChild(0)); + } else if (plan instanceof HashJoinNode) { + return ((HashJoinNode) plan).isColocate() + && (isColocated(plan.getChild(0)) || isColocated(plan.getChild(1))); + } else if (plan instanceof SetOperationNode) { + if (!((SetOperationNode) plan).isColocate()) { + return false; + } + for (PlanNode child : plan.getChildren()) { + if (isColocated(child)) { + return true; + } + } + return false; + } else { + return false; + } + } + + public static LocalExchangeType resolveExchangeType(LocalExchangeTypeRequire require, + PlanTranslatorContext translatorContext, PlanNode parent, PlanNode child) { + // Only generic RequireHash adapts to LOCAL_EXECUTION_HASH_SHUFFLE. + // Explicit RequireSpecific (GLOBAL_EXECUTION_HASH_SHUFFLE, BUCKET_HASH_SHUFFLE, etc.) + // must never be degraded — if they appear in an invalid context, the plan is wrong. + // + // Always prefer LOCAL_EXECUTION_HASH_SHUFFLE for FE-planned intra-fragment hash exchanges. + // GLOBAL_EXECUTION_HASH_SHUFFLE requires shuffle_idx_to_instance_idx which may be empty + // for fragments with non-hash sinks (UNPARTITIONED/MERGE). LOCAL_HASH is always safe + // since it partitions by local instance count without needing external shuffle maps. + if (require instanceof RequireHash) { + return LocalExchangeType.LOCAL_EXECUTION_HASH_SHUFFLE; + } + return require.preferType(); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/AggregationNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/AggregationNode.java index 0f9d34ae8261f5..210252df253260 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/AggregationNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/AggregationNode.java @@ -27,7 +27,13 @@ import org.apache.doris.analysis.SlotDescriptor; import org.apache.doris.analysis.SlotRef; import org.apache.doris.analysis.SortInfo; +import org.apache.doris.common.Pair; +import org.apache.doris.nereids.glue.translator.PlanTranslatorContext; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeType; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeTypeRequire; import org.apache.doris.planner.normalize.Normalizer; +import org.apache.doris.qe.ConnectContext; +import org.apache.doris.qe.SessionVariable; import org.apache.doris.thrift.TAggregationNode; import org.apache.doris.thrift.TExplainLevel; import org.apache.doris.thrift.TExpr; @@ -239,7 +245,7 @@ public String getNodeExplainString(String detailPrefix, TExplainLevel detailLeve // If `GroupingExprs` is empty and agg need to finalize, the result must be output by single instance @Override - public boolean isSerialOperator() { + public boolean isSerialNode() { return aggInfo.getGroupingExprs().isEmpty() && needsFinalize; } @@ -247,6 +253,10 @@ public void setColocate(boolean colocate) { isColocate = colocate; } + public boolean isColocate() { + return isColocate; + } + public void setSortByGroupKey(SortInfo sortByGroupKey) { this.sortByGroupKey = sortByGroupKey; } @@ -258,4 +268,155 @@ public boolean isQueryCacheCandidate() { public void setQueryCacheCandidate(boolean queryCacheCandidate) { this.queryCacheCandidate = queryCacheCandidate; } + + @Override + public Pair enforceAndDeriveLocalExchange( + PlanTranslatorContext translatorContext, PlanNode parent, LocalExchangeTypeRequire parentRequire) { + + ConnectContext connectContext = translatorContext.getConnectContext(); + SessionVariable sessionVariable = connectContext.getSessionVariable(); + // PR #62438: when false, non-finalize agg falls back to BE base class. + boolean enableLeBeforeAgg = sessionVariable.enableLocalExchangeBeforeAgg; + boolean hasKeys = !aggInfo.getGroupingExprs().isEmpty(); + + // Each branch mirrors the corresponding BE operator's required_data_distribution() + // check order 1:1. The helper baseClassRequire() expands BE's base class behavior. + LocalExchangeTypeRequire requireChild; + if (canUseDistinctStreamingAgg(sessionVariable)) { + // DistinctStreamingAggOperatorX. Two flavors share this operator class: + // - streaming preagg (useStreamingPreagg=true): performance-only, + // flag controls + // - non-streaming dedup (useStreamingPreagg=false): correctness-required, + // always HASH regardless of flag + // Diverges from BE: BE's `!_needs_finalize && !enable_local_exchange_before_agg` + // early return catches non-streaming dedup too, causing the same family of + // wrong-result bug as AggSink (DORIS-25413). + if (needsFinalize && !hasKeys) { + requireChild = LocalExchangeTypeRequire.noRequire(); + } else if (!needsFinalize && useStreamingPreagg && !enableLeBeforeAgg) { + requireChild = baseClassRequire(connectContext); + } else if (needsFinalize || (hasKeys && !useStreamingPreagg)) { + requireChild = AddLocalExchange.isColocated(this) + ? LocalExchangeTypeRequire.requireHash() + : parentRequire.autoRequireHash(); + } else if (sessionVariable.enableDistinctStreamingAggForcePassthrough) { + requireChild = LocalExchangeTypeRequire.requirePassthrough(); + } else { + requireChild = baseClassRequire(connectContext); + } + } else if (useStreamingPreagg) { + // StreamingAggOperatorX + if (children.get(0) instanceof HashJoinNode + && sessionVariable.enableStreamingAggHashJoinForcePassthrough) { + requireChild = LocalExchangeTypeRequire.requirePassthrough(); + } else if (!needsFinalize && !enableLeBeforeAgg) { + requireChild = baseClassRequire(connectContext); + } else if (!hasKeys) { + requireChild = needsFinalize + ? LocalExchangeTypeRequire.noRequire() + : baseClassRequire(connectContext); + } else { + requireChild = LocalExchangeTypeRequire.requireHash(); + } + } else { + // AggSinkOperatorX — covers finalize phase AND non-finalize phases (LOCAL + // preagg / FIRST_MERGE dedup). Streaming preagg goes through the StreamingAgg + // branch above, not here. + // + // Phase semantics for !needsFinalize: + // - FIRST / SECOND (LOCAL phase, !isMerge): performance-only, flag controls + // - FIRST_MERGE (correctness-required): always HASH regardless of flag + // + // Diverges from BE here: BE's `!_needs_finalize && !enable_local_exchange_before_agg` + // early return also catches FIRST_MERGE, dropping the HASH requirement and + // causing wrong-result (e.g. PASSTHROUGH over serial child breaks the + // group-by-key invariant — DORIS-25413). + if (!hasKeys) { + requireChild = needsFinalize + ? LocalExchangeTypeRequire.noRequire() + : baseClassRequire(connectContext); + } else if (!needsFinalize && !aggInfo.isMerge() && !enableLeBeforeAgg) { + // LOCAL phase (FIRST preagg / SECOND distinct local) + user opted out + // of pre-agg LE → base class decides: serial child → PASSTHROUGH + // (parallelism), non-serial child → NOOP (no LE). + requireChild = baseClassRequire(connectContext); + } else if (!needsFinalize || AddLocalExchange.isColocated(this)) { + // FIRST_MERGE (correctness) or finalize+colocate → HASH. + requireChild = parentRequire.autoRequireHash(); + } else if (hasPartitionExprs(parentRequire)) { + // FE-only heuristic: finalize non-colocate with parent hash requirement + // → inherit parent's specific hash type. + requireChild = parentRequire.autoRequireHash(); + } else { + // FE-only heuristic: finalize non-colocate without parent hash → skip + // LE (child Exchange already provides hash distribution). + requireChild = LocalExchangeTypeRequire.noRequire(); + } + } + + Pair enforceResult + = enforceRequire(translatorContext, children.get(0), 0, requireChild); + children = Lists.newArrayList(enforceResult.first); + return Pair.of(this, enforceResult.second); + } + + /** BE base class required_data_distribution: serial child → PASSTHROUGH, else → NOOP. */ + private LocalExchangeTypeRequire baseClassRequire(ConnectContext connectContext) { + return children.get(0).isSerialOperatorOnBe(connectContext) + ? LocalExchangeTypeRequire.requirePassthrough() + : LocalExchangeTypeRequire.noRequire(); + } + + @Override + protected List getSemanticPartitionExprs() { + return aggInfo.getGroupingExprs(); + } + + @Override + protected List getLocalExchangeDistributeExprs(int childIndex, boolean followedByShuffled) { + // Mirror BE AggSinkOperatorX::update_operator / StreamingAggOperatorX::update_operator: + // _partition_exprs = (distribute_expr_lists set && (followed_by_shuffled || has_distinct)) + // ? distribute_expr_lists[0] : grouping_exprs + // The HASH LocalExchange must partition by _partition_exprs so a streaming partial preagg + // locally collapses same-key rows. Using child distribution (default) for a non-shuffled + // chain scatters same-group rows across N instances, leaving partial_preagg essentially a + // no-op and breaking row-arrival order at downstream merge-finalize (e.g. group_concat). + List childDist = getChildDistributeExprList(childIndex); + boolean hasDistinct = aggInfo.getAggregateExprs().stream() + .map(FunctionCallExpr::getFnName) + .filter(name -> name != null) + .map(name -> name.getFunction()) + .filter(name -> name != null) + .anyMatch(name -> name.startsWith("multi_distinct_")); + if (childDist != null && !childDist.isEmpty() && (followedByShuffled || hasDistinct)) { + return childDist; + } + return Lists.newArrayList(aggInfo.getGroupingExprs()); + } + + @Override + public boolean requiresShuffleForCorrectness() { + // Mirrors BE's AggSinkOperatorX::is_shuffled_operator() exactly: + // finalize agg with group keys needs hash-distributed input for correctness. + // GLOBAL dedup (!needsFinalize) is intentionally NOT included here — if a + // GLOBAL dedup exists, a finalize agg always sits above it (e.g. DISTINCT_GLOBAL + // above DISTINCT_LOCAL/GLOBAL_DEDUP), and the finalize agg propagates the flag + // down via inheritedShuffled. A solo finalize agg satisfies hash distribution + // through its own child requirement. + return needsFinalize && !aggInfo.getGroupingExprs().isEmpty(); + } + + private boolean canUseDistinctStreamingAgg(SessionVariable sessionVariable) { + return aggInfo.getAggregateExprs().isEmpty() && sortByGroupKey == null + && sessionVariable.enableDistinctStreamingAggregation; + } + + @Override + protected boolean shouldResetSerialFlagForChild(int childIndex) { + // Non-streaming AGG is a pipeline breaker: child is in AGG_Sink pipeline, + // parent is in AGG_Source pipeline. Reset inherited serial flag from parent + // (different pipeline), but enforceRequire still adds this node's own + // isSerialNode() so the child sees AGG_Sink's serial status correctly. + return !useStreamingPreagg; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/AnalyticEvalNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/AnalyticEvalNode.java index 2ede7ac4e528e4..aff347fc6ace3e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/AnalyticEvalNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/AnalyticEvalNode.java @@ -27,6 +27,10 @@ import org.apache.doris.analysis.OrderByElement; import org.apache.doris.analysis.ToSqlParams; import org.apache.doris.analysis.TupleDescriptor; +import org.apache.doris.common.Pair; +import org.apache.doris.nereids.glue.translator.PlanTranslatorContext; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeType; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeTypeRequire; import org.apache.doris.thrift.TAnalyticNode; import org.apache.doris.thrift.TExplainLevel; import org.apache.doris.thrift.TPlanNode; @@ -176,7 +180,86 @@ private String orderByElementToSql(OrderByElement orderByElement) { * all data should be input in this node to ensure the global ordering by colB. */ @Override - public boolean isSerialOperator() { + public boolean isSerialNode() { return partitionExprs.isEmpty(); } + + /** + * Mirrors BE's + * {@code AnalyticSinkOperatorX::is_shuffled_operator() = !_partition_by_eq_expr_ctxs.empty()} + * (be/src/exec/operator/analytic_sink_operator.h:226). With PARTITION BY, input must be + * hash-partitioned by partition keys, so downstream UnionNode / SetOperationNode under + * us must pre-shuffle their branches to match — the framework propagates this through + * {@link PlanTranslatorContext#hasShuffleForCorrectnessAncestor}. + */ + @Override + public boolean requiresShuffleForCorrectness() { + return !partitionExprs.isEmpty(); + } + + @Override + protected List getSemanticPartitionExprs() { + return partitionExprs; + } + + @Override + public Pair enforceAndDeriveLocalExchange(PlanTranslatorContext translatorContext, + PlanNode parent, LocalExchangeTypeRequire parentRequire) { + LocalExchangeTypeRequire requireChild; + LocalExchangeType outputType = null; + if (partitionExprs.isEmpty()) { + // Serial AnalyticEval (OVER() with no PARTITION BY): + // Must NOT have any LocalExchange between AnalyticEval and its child. + // On BE, AnalyticSink and AnalyticSource share state (source_deps/sink_deps). + // A LocalExchange below would restore the AnalyticSink pipeline to _num_instances + // tasks while the serial AnalyticSource pipeline stays at 1 task. + // + // Use enforceRequire with noRequire to traverse children, then strip any + // LocalExchange the child inserted (e.g., Exchange wrapping itself with PASSTHROUGH). + Pair enforceResult + = enforceRequire(translatorContext, children.get(0), 0, LocalExchangeTypeRequire.noRequire()); + PlanNode newChild = enforceResult.first; + if (newChild instanceof LocalExchangeNode) { + newChild = newChild.getChild(0); + } + children = Lists.newArrayList(newChild); + // Return NOOP: the serial AnalyticSource pipeline has 1 task, we don't provide + // fan-out ourselves. The parent's enforceRequire framework-level serial check + // will see our serial status and insert PASSTHROUGH LE above us if needed. + return Pair.of(this, LocalExchangeType.NOOP); + } else if (orderByElements.isEmpty()) { + if (AddLocalExchange.isColocated(this)) { + requireChild = LocalExchangeTypeRequire.requireHash(); + outputType = AddLocalExchange.resolveExchangeType( + LocalExchangeTypeRequire.requireHash(), translatorContext, this, + children.get(0)); + } else { + // Non-colocated analytic with PARTITION BY but no ORDER BY: + // The parent SortNode (mergeByExchange) will insert PASSTHROUGH above us, + // which is what BE does natively. Don't force a hash exchange here. + requireChild = LocalExchangeTypeRequire.noRequire(); + outputType = LocalExchangeType.NOOP; + } + } else if (children.get(0).isSerialOperatorOnBe(translatorContext.getConnectContext())) { + // BE base class: _child->is_serial_operator() ? PASSTHROUGH : NOOP + requireChild = LocalExchangeTypeRequire.requirePassthrough(); + outputType = LocalExchangeType.PASSTHROUGH; + } else { + requireChild = LocalExchangeTypeRequire.noRequire(); + outputType = LocalExchangeType.NOOP; + } + + Pair enforceResult + = enforceRequire(translatorContext, children.get(0), 0, requireChild); + children = Lists.newArrayList(enforceResult.first); + if (outputType == null) { + outputType = enforceResult.second; + } + return Pair.of(this, outputType); + } + + @Override + protected boolean shouldResetSerialFlagForChild(int childIndex) { + return true; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/AssertNumRowsNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/AssertNumRowsNode.java index dcf683f0783df4..5784d2050a49f8 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/AssertNumRowsNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/AssertNumRowsNode.java @@ -19,11 +19,17 @@ import org.apache.doris.analysis.AssertNumRowsElement; import org.apache.doris.analysis.TupleDescriptor; +import org.apache.doris.common.Pair; +import org.apache.doris.nereids.glue.translator.PlanTranslatorContext; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeType; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeTypeRequire; import org.apache.doris.thrift.TAssertNumRowsNode; import org.apache.doris.thrift.TExplainLevel; import org.apache.doris.thrift.TPlanNode; import org.apache.doris.thrift.TPlanNodeType; +import com.google.common.collect.Lists; + /** * Assert num rows node is used to determine whether the number of rows is less than desired num of rows. * The rows are the result of subqueryString. @@ -78,7 +84,17 @@ public int getNumInstances() { } @Override - public boolean isSerialOperator() { + public boolean isSerialNode() { return true; } + + @Override + public Pair enforceAndDeriveLocalExchange(PlanTranslatorContext translatorContext, + PlanNode parent, LocalExchangeTypeRequire parentRequire) { + + Pair enforceResult = enforceRequire( + translatorContext, children.get(0), 0, LocalExchangeTypeRequire.requirePassthrough()); + children = Lists.newArrayList(enforceResult.first); + return Pair.of(this, LocalExchangeType.PASSTHROUGH); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/BucketedAggregationNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/BucketedAggregationNode.java index 96dbd579388433..f49c316cf24627 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/BucketedAggregationNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/BucketedAggregationNode.java @@ -20,6 +20,7 @@ import org.apache.doris.analysis.AggregateInfo; import org.apache.doris.analysis.ExprToThriftVisitor; import org.apache.doris.analysis.FunctionCallExpr; +import org.apache.doris.qe.ConnectContext; import org.apache.doris.thrift.TBucketedAggregationNode; import org.apache.doris.thrift.TExplainLevel; import org.apache.doris.thrift.TExpr; @@ -97,7 +98,7 @@ public String getNodeExplainString(String detailPrefix, TExplainLevel detailLeve } @Override - public boolean isSerialOperator() { + public boolean isSerialOperatorOnBe(ConnectContext context) { // Bucketed agg handles group-by keys only (no without-key in initial version), // so it's never a serial operator. return false; diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/CTEScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/CTEScanNode.java index 13ac382bf16b81..92bf1974b39721 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/CTEScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/CTEScanNode.java @@ -19,7 +19,11 @@ import org.apache.doris.analysis.TupleDescriptor; import org.apache.doris.analysis.TupleId; +import org.apache.doris.common.Pair; import org.apache.doris.common.UserException; +import org.apache.doris.nereids.glue.translator.PlanTranslatorContext; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeType; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeTypeRequire; import org.apache.doris.thrift.TPlanNode; import org.apache.doris.thrift.TScanRangeLocations; @@ -55,4 +59,10 @@ public List getScanRangeLocations(long maxScanRangeLength) // NO real action to be taken, just a wrapper return null; } + + @Override + public Pair enforceAndDeriveLocalExchange( + PlanTranslatorContext translatorContext, PlanNode parent, LocalExchangeTypeRequire parentRequire) { + return Pair.of(this, LocalExchangeType.NOOP); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/DataSink.java b/fe/fe-core/src/main/java/org/apache/doris/planner/DataSink.java index 07da3d4f9bae0f..f8c27eae5c4675 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/DataSink.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/DataSink.java @@ -20,6 +20,7 @@ package org.apache.doris.planner; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeTypeRequire; import org.apache.doris.thrift.TDataSink; import org.apache.doris.thrift.TExplainLevel; @@ -65,4 +66,8 @@ public boolean isMerge() { public void setMerge(boolean merge) { isMerge = merge; } + + public LocalExchangeTypeRequire getLocalExchangeTypeRequire() { + return LocalExchangeTypeRequire.noRequire(); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/EmptySetNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/EmptySetNode.java index d89bfc0b4eefb4..3a731d28a90996 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/EmptySetNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/EmptySetNode.java @@ -18,6 +18,10 @@ package org.apache.doris.planner; import org.apache.doris.analysis.TupleId; +import org.apache.doris.common.Pair; +import org.apache.doris.nereids.glue.translator.PlanTranslatorContext; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeType; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeTypeRequire; import org.apache.doris.thrift.TPlanNode; import org.apache.doris.thrift.TPlanNodeType; @@ -48,4 +52,10 @@ protected void toThrift(TPlanNode msg) { public int getNumInstances() { return 1; } + + @Override + public Pair enforceAndDeriveLocalExchange( + PlanTranslatorContext translatorContext, PlanNode parent, LocalExchangeTypeRequire parentRequire) { + return Pair.of(this, LocalExchangeType.NOOP); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/ExceptNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/ExceptNode.java index ae728da571b6e7..1052b6bd1184ff 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/ExceptNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/ExceptNode.java @@ -30,4 +30,10 @@ public ExceptNode(PlanNodeId id, TupleId tupleId) { protected void toThrift(TPlanNode msg) { toThrift(msg, TPlanNodeType.EXCEPT_NODE); } + + @Override + public boolean requiresShuffleForCorrectness() { + // BE: SetSinkOperatorX / SetSourceOperatorX.is_shuffled_operator() = true (unconditional). + return true; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/ExchangeNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/ExchangeNode.java index 7274f731c5ab24..256f89843ebd0d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/ExchangeNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/ExchangeNode.java @@ -23,6 +23,10 @@ import org.apache.doris.analysis.SortInfo; import org.apache.doris.analysis.TupleDescriptor; import org.apache.doris.analysis.TupleId; +import org.apache.doris.common.Pair; +import org.apache.doris.nereids.glue.translator.PlanTranslatorContext; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeType; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeTypeRequire; import org.apache.doris.qe.ConnectContext; import org.apache.doris.thrift.TExchangeNode; import org.apache.doris.thrift.TExplainLevel; @@ -98,10 +102,7 @@ public void setMergeInfo(SortInfo info) { @Override protected void toThrift(TPlanNode msg) { - // If this fragment has another scan node, this exchange node is serial or not should be decided by the scan - // node. - msg.setIsSerialOperator((isSerialOperator() || fragment.hasSerialScanNode()) - && fragment.useSerialSource(ConnectContext.get())); + msg.setIsSerialOperator(isSerialOperatorOnBe(ConnectContext.get())); msg.node_type = TPlanNodeType.EXCHANGE_NODE; msg.exchange_node = new TExchangeNode(); for (TupleId tid : tupleIds) { @@ -150,20 +151,45 @@ public void setRightChildOfBroadcastHashJoin(boolean value) { * because this loading job relies on the global ordering of column `k1` and `v1`. * * So FRAGMENT 0 should not use serial source. + * + * Important: this method does NOT call fragment.useSerialSource() — that path would + * recurse into hasNullAwareLeftAntiJoin and walk the entire plan tree, and was + * previously found to blow the stack on deep plans. The fragment-level gating is + * applied in {@link #isSerialOperatorOnBe} instead. */ @Override - public boolean isSerialOperator() { + public boolean isSerialNode() { return (ConnectContext.get() != null && ConnectContext.get().getSessionVariable().isUseSerialExchange() || partitionType == TPartitionType.UNPARTITIONED) && mergeInfo == null; } + @Override + public boolean isSerialOperatorOnBe(ConnectContext context) { + return fragment != null + && (isSerialNode() || fragment.hasSerialScanNode()) + && fragment.useSerialSource(context); + } + @Override public boolean hasSerialChildren() { - return isSerialOperator(); + return isSerialNode(); } @Override public boolean hasSerialScanChildren() { return false; } + + @Override + public Pair enforceAndDeriveLocalExchange(PlanTranslatorContext translatorContext, + PlanNode parent, LocalExchangeTypeRequire parentRequire) { + // Report actual distribution. Serial handling is done by the framework + // (enforceRequire step 2.5 overrides serial child output to NOOP). + if (partitionType == TPartitionType.HASH_PARTITIONED) { + return Pair.of(this, LocalExchangeType.GLOBAL_EXECUTION_HASH_SHUFFLE); + } else if (partitionType == TPartitionType.BUCKET_SHFFULE_HASH_PARTITIONED) { + return Pair.of(this, LocalExchangeType.BUCKET_HASH_SHUFFLE); + } + return Pair.of(this, LocalExchangeType.NOOP); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/HashJoinNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/HashJoinNode.java index a98422bb2e0fea..ae01b515cab706 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/HashJoinNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/HashJoinNode.java @@ -28,7 +28,11 @@ import org.apache.doris.analysis.SlotId; import org.apache.doris.analysis.ToSqlParams; import org.apache.doris.analysis.TupleDescriptor; +import org.apache.doris.common.Pair; +import org.apache.doris.nereids.glue.translator.PlanTranslatorContext; import org.apache.doris.nereids.trees.expressions.ExprId; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeType; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeTypeRequire; import org.apache.doris.thrift.TEqJoinCondition; import org.apache.doris.thrift.TExplainLevel; import org.apache.doris.thrift.THashJoinNode; @@ -124,6 +128,19 @@ public void setColocate(boolean colocate, String reason) { colocateReason = reason; } + public boolean isColocate() { + return isColocate; + } + + @Override + public boolean requiresShuffleForCorrectness() { + // BE: HashJoinBuild/Probe.is_shuffled_operator() = PARTITIONED || BUCKET_SHUFFLE || COLOCATE. + // (BROADCAST and NONE are not shuffled — they don't depend on hash distribution.) + return distrMode == DistributionMode.PARTITIONED + || distrMode == DistributionMode.BUCKET_SHUFFLE + || isColocate; + } + public Map getHashOutputExprSlotIdMap() { return hashOutputExprSlotIdMap; } @@ -285,4 +302,71 @@ public List getOtherJoinConjuncts() { public List getMarkJoinConjuncts() { return markJoinConjuncts; } + + @Override + public Pair enforceAndDeriveLocalExchange( + PlanTranslatorContext translatorContext, PlanNode parent, LocalExchangeTypeRequire parentRequire) { + + LocalExchangeTypeRequire probeSideRequire; + LocalExchangeTypeRequire buildSideRequire; + LocalExchangeType outputType = null; + + if (joinOp == JoinOperator.NULL_AWARE_LEFT_ANTI_JOIN) { + buildSideRequire = probeSideRequire = LocalExchangeTypeRequire.noRequire(); + outputType = LocalExchangeType.NOOP; + } else if (distrMode == DistributionMode.BROADCAST) { + // BE: _child->is_serial_operator() ? PASSTHROUGH/PASS_TO_ONE : NOOP + boolean probeChildSerial = children.get(0).isSerialOperatorOnBe( + translatorContext.getConnectContext()); + boolean buildChildSerial = children.get(1).isSerialOperatorOnBe( + translatorContext.getConnectContext()); + probeSideRequire = probeChildSerial + ? LocalExchangeTypeRequire.requirePassthrough() + : LocalExchangeTypeRequire.noRequire(); + buildSideRequire = buildChildSerial + ? LocalExchangeTypeRequire.requirePassToOne() + : LocalExchangeTypeRequire.noRequire(); + // For serial probe: output is PASSTHROUGH (data from single instance). + // For non-serial probe: propagate probe side's actual distribution. + outputType = probeChildSerial ? LocalExchangeType.PASSTHROUGH : null; + } else if (isColocate() || isBucketShuffle()) { + probeSideRequire = LocalExchangeTypeRequire.requireBucketHash(); + // For BUCKET_SHUFFLE with serial build child: use requireBucketHash() (not + // requirePassToOne()). Unlike BROADCAST joins, BUCKET_SHUFFLE has no shared + // hash table mechanism — PASS_TO_ONE routes all data to task 0 while tasks 1..N-1 + // build empty hash tables, losing rows. BUCKET_HASH_SHUFFLE correctly distributes + // build data by bucket to match the probe side's bucket distribution. + // The serial exchange returns NOOP, so enforceRequire() will insert a + // BUCKET_HASH_SHUFFLE local exchange (with PASSTHROUGH fan-out for heavy-ops + // bottleneck avoidance). + buildSideRequire = LocalExchangeTypeRequire.requireBucketHash(); + outputType = AddLocalExchange.resolveExchangeType( + LocalExchangeTypeRequire.requireBucketHash(), translatorContext, this, + children.get(0)); + } else { + // Use requireHash() (not requireGlobalExecutionHash()) so that resolveExchangeType() + // can downgrade to LOCAL_EXECUTION_HASH_SHUFFLE via shouldUseLocalExecutionHash(). + // This matches BE-native behavior where use_serial_exchange=true sets _use_serial_source=true, + // causing _add_local_exchange_impl to use LOCAL (not GLOBAL) hash shuffle. + // With use_serial_exchange=false, the upstream ExchangeNode already outputs + // GLOBAL_EXECUTION_HASH_SHUFFLE which satisfies requireHash() — no new exchange inserted. + buildSideRequire = probeSideRequire = LocalExchangeTypeRequire.requireHash(); + outputType = null; // derived from probeResult.second below + } + + Pair probeResult = enforceRequire( + translatorContext, children.get(0), 0, probeSideRequire); + Pair buildResult = enforceRequire( + translatorContext, children.get(1), 1, buildSideRequire); + this.children = Lists.newArrayList(probeResult.first, buildResult.first); + if (outputType == null) { + outputType = probeResult.second; + } + return Pair.of(this, outputType); + } + + @Override + protected boolean shouldResetSerialFlagForChild(int childIndex) { + return childIndex == 1; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/IntersectNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/IntersectNode.java index 89f4002ea4d33c..5d784c03152aa1 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/IntersectNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/IntersectNode.java @@ -30,4 +30,10 @@ public IntersectNode(PlanNodeId id, TupleId tupleId) { protected void toThrift(TPlanNode msg) { toThrift(msg, TPlanNodeType.INTERSECT_NODE); } + + @Override + public boolean requiresShuffleForCorrectness() { + // BE: SetSinkOperatorX / SetSourceOperatorX.is_shuffled_operator() = true (unconditional). + return true; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/LocalExchangeNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/LocalExchangeNode.java new file mode 100644 index 00000000000000..df9551866a295e --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/LocalExchangeNode.java @@ -0,0 +1,349 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// This file is copied from +// https://github.com/apache/impala/blob/branch-2.9.0/fe/src/main/java/org/apache/impala/ExchangeNode.java +// and modified by Doris + +package org.apache.doris.planner; + +import org.apache.doris.analysis.Expr; +import org.apache.doris.analysis.ExprToThriftVisitor; +import org.apache.doris.analysis.TupleDescriptor; +import org.apache.doris.thrift.TExplainLevel; +import org.apache.doris.thrift.TExpr; +import org.apache.doris.thrift.TLocalExchangeNode; +import org.apache.doris.thrift.TLocalPartitionType; +import org.apache.doris.thrift.TPlanNode; +import org.apache.doris.thrift.TPlanNodeType; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +/** LocalExchangeNode */ +public class LocalExchangeNode extends PlanNode { + public static final String EXCHANGE_NODE = "LOCAL-EXCHANGE"; + + private LocalExchangeType exchangeType; + + /** + * use for Nereids only. + */ + public LocalExchangeNode(PlanNodeId id, PlanNode inputNode, LocalExchangeType exchangeType) { + this(id, inputNode, exchangeType, null); + } + + public LocalExchangeNode(PlanNodeId id, PlanNode inputNode, LocalExchangeType exchangeType, + List distributeExprs) { + super(id, inputNode, EXCHANGE_NODE); + this.offset = 0; + this.limit = -1; + this.conjuncts = Collections.emptyList(); + this.children.add(inputNode); + this.exchangeType = exchangeType; + this.fragment = inputNode.getFragment(); + + List hashExprs = distributeExprs; + boolean isHashShuffle = (exchangeType == LocalExchangeType.BUCKET_HASH_SHUFFLE + || exchangeType == LocalExchangeType.LOCAL_EXECUTION_HASH_SHUFFLE + || exchangeType == LocalExchangeType.GLOBAL_EXECUTION_HASH_SHUFFLE); + if (isHashShuffle && hashExprs != null && !hashExprs.isEmpty()) { + setDistributeExprLists(hashExprs); + } + TupleDescriptor outputTupleDesc = inputNode.getOutputTupleDesc(); + updateTupleIds(outputTupleDesc); + } + + public void updateTupleIds(TupleDescriptor outputTupleDesc) { + if (outputTupleDesc != null) { + clearTupleIds(); + tupleIds.add(outputTupleDesc.getId()); + } else { + clearTupleIds(); + tupleIds.addAll(getChild(0).getOutputTupleIds()); + } + } + + @Override + protected void toThrift(TPlanNode msg) { + // FE-planned LocalExchangeNode itself must stay non-serial. In the BE-planned path, + // the serial semantics belong to the upstream scan/exchange pipeline rather than the + // downstream LocalExchangeSource pipeline. Marking LocalExchangeNode as serial would + // incorrectly reduce the downstream pipeline's task count to 1. + msg.setIsSerialOperator(false); + + msg.node_type = TPlanNodeType.LOCAL_EXCHANGE_NODE; + msg.local_exchange_node = new TLocalExchangeNode(exchangeType.toThrift()); + + if (exchangeType.isHashShuffle()) { + List thriftDistributeExprLists = new ArrayList<>(); + for (Expr expr : distributeExprLists()) { + thriftDistributeExprLists.add(ExprToThriftVisitor.treeToThrift(expr)); + } + msg.local_exchange_node.setDistributeExprLists(thriftDistributeExprLists); + } + } + + private List distributeExprLists() { + if (distributeExprLists == null) { + return Collections.emptyList(); + } + return distributeExprLists; + } + + @Override + public String getNodeExplainString(String prefix, TExplainLevel detailLevel) { + return prefix + "type: " + exchangeType.name() + "\n"; + } + + public LocalExchangeType getExchangeType() { + return exchangeType; + } + + @Override + protected boolean shouldResetSerialFlagForChild(int childIndex) { + return true; + } + + /** + * Describes what a parent operator demands of its child's output distribution. + * Returned by the parent in {@code enforceAndDeriveLocalExchange} and consumed by + * {@link PlanNode#enforceRequire}, which decides whether to insert a LocalExchangeNode. + * + *

How to pick the right require when overriding {@code enforceAndDeriveLocalExchange}: + *

    + *
  • {@link NoRequire} — "I don't care about the child's distribution". Use for + * operators whose correctness doesn't depend on partitioning (e.g. base default, + * limit, select). The framework still upgrades this to {@code requirePassthrough} + * automatically when the child turns out to be serial — see + * {@link PlanNode#enforceRequire} step 3.
  • + * + *
  • {@link RequireHash} (via {@code requireHash()}) — "I need hash-partitioned + * input, any flavour of hash will do". Accepts {@code GLOBAL_EXECUTION_HASH_SHUFFLE}, + * {@code LOCAL_EXECUTION_HASH_SHUFFLE}, and {@code BUCKET_HASH_SHUFFLE}. This is + * the right choice for shuffled correctness consumers (finalize AggSink with keys, + * partitioned HashJoin, Intersect, Except) — the upstream may already provide a + * compatible flavour and we shouldn't insert a redundant exchange.
  • + * + *
  • {@link RequireSpecific} (via {@code requirePassthrough()}, + * {@code requireBroadcast()}, {@code requireBucketHash()}, + * {@code requireGlobalExecutionHash()}, etc.) — "I need exactly this exchange type". + * Use only when the operator's correctness or efficiency hinges on that exact + * type (e.g. NLJ probe wants ADAPTIVE_PASSTHROUGH; BucketShuffle join build wants + * BUCKET_HASH_SHUFFLE). Note: PASSTHROUGH is satisfied by ADAPTIVE_PASSTHROUGH + * (superset), but other specific types require exact match.
  • + *
+ * + *

Rule of thumb: prefer {@code requireHash()} over + * {@code requireSpecific(GLOBAL_EXECUTION_HASH_SHUFFLE)} unless you genuinely need to + * reject other hash flavours. RequireSpecific is fragile because the upstream may + * legitimately output a different (still correct) hash type. + */ + public interface LocalExchangeTypeRequire { + boolean satisfy(LocalExchangeType provide); + + LocalExchangeType preferType(); + + default LocalExchangeTypeRequire autoRequireHash() { + return RequireHash.INSTANCE; + } + + static NoRequire noRequire() { + return NoRequire.INSTANCE; + } + + static RequireHash requireHash() { + return RequireHash.INSTANCE; + } + + static RequireSpecific requirePassthrough() { + return requireSpecific(LocalExchangeType.PASSTHROUGH); + } + + static RequireSpecific requirePassToOne() { + return requireSpecific(LocalExchangeType.PASS_TO_ONE); + } + + static RequireSpecific requireBroadcast() { + return requireSpecific(LocalExchangeType.BROADCAST); + } + + static RequireSpecific requireAdaptivePassthrough() { + return requireSpecific(LocalExchangeType.ADAPTIVE_PASSTHROUGH); + } + + static RequireSpecific requireBucketHash() { + return requireSpecific(LocalExchangeType.BUCKET_HASH_SHUFFLE); + } + + static RequireSpecific requireGlobalExecutionHash() { + return requireSpecific(LocalExchangeType.GLOBAL_EXECUTION_HASH_SHUFFLE); + } + + static RequireSpecific requireSpecific(LocalExchangeType require) { + return new RequireSpecific(require); + } + + default LocalExchangeType noopTo(LocalExchangeType defaultType) { + LocalExchangeType preferType = preferType(); + return (preferType == LocalExchangeType.NOOP) ? defaultType : preferType; + } + } + + /** NoRequire */ + public static class NoRequire implements LocalExchangeTypeRequire { + public static final NoRequire INSTANCE = new NoRequire(); + + @Override + public boolean satisfy(LocalExchangeType provide) { + return true; + } + + @Override + public LocalExchangeType preferType() { + return LocalExchangeType.NOOP; + } + } + + /** RequireHash */ + public static class RequireHash implements LocalExchangeTypeRequire { + public static final RequireHash INSTANCE = new RequireHash(); + + @Override + public boolean satisfy(LocalExchangeType provide) { + switch (provide) { + case GLOBAL_EXECUTION_HASH_SHUFFLE: + case LOCAL_EXECUTION_HASH_SHUFFLE: + case BUCKET_HASH_SHUFFLE: + return true; + default: + return false; + } + } + + @Override + public LocalExchangeType preferType() { + return LocalExchangeType.GLOBAL_EXECUTION_HASH_SHUFFLE; + } + + @Override + public LocalExchangeTypeRequire autoRequireHash() { + return this; + } + } + + public static class RequireSpecific implements LocalExchangeTypeRequire { + LocalExchangeType requireType; + + public RequireSpecific(LocalExchangeType requireType) { + this.requireType = requireType; + } + + @Override + public boolean satisfy(LocalExchangeType provide) { + if (requireType == provide) { + return true; + } + // ADAPTIVE_PASSTHROUGH is a superset of PASSTHROUGH — both fan out data + // from fewer to more tasks. BE's need_to_local_exchange treats them as + // compatible, so ADAPTIVE_PASSTHROUGH satisfies a PASSTHROUGH requirement. + if (requireType == LocalExchangeType.PASSTHROUGH + && provide == LocalExchangeType.ADAPTIVE_PASSTHROUGH) { + return true; + } + return false; + } + + @Override + public LocalExchangeType preferType() { + return requireType; + } + + @Override + public LocalExchangeTypeRequire autoRequireHash() { + if (requireType == LocalExchangeType.GLOBAL_EXECUTION_HASH_SHUFFLE + || requireType == LocalExchangeType.BUCKET_HASH_SHUFFLE) { + return this; + } + return RequireHash.INSTANCE; + } + } + + public enum LocalExchangeType { + NOOP, + GLOBAL_EXECUTION_HASH_SHUFFLE, + LOCAL_EXECUTION_HASH_SHUFFLE, + BUCKET_HASH_SHUFFLE, + PASSTHROUGH, + ADAPTIVE_PASSTHROUGH, + BROADCAST, + PASS_TO_ONE, + LOCAL_MERGE_SORT; + + public boolean isHashShuffle() { + switch (this) { + case GLOBAL_EXECUTION_HASH_SHUFFLE: + case LOCAL_EXECUTION_HASH_SHUFFLE: + case BUCKET_HASH_SHUFFLE: + return true; + default: + return false; + } + } + + // Mirrors BE Pipeline::heavy_operations_on_the_sink(): + // HASH_SHUFFLE, BUCKET_HASH_SHUFFLE, and ADAPTIVE_PASSTHROUGH perform + // heavy computation on the sink side. When the upstream pipeline has only + // 1 task (serial/pooling scan), a PASSTHROUGH fan-out must be inserted + // before these exchanges to avoid a single-task bottleneck. + public boolean isHeavyOperation() { + switch (this) { + case GLOBAL_EXECUTION_HASH_SHUFFLE: + case LOCAL_EXECUTION_HASH_SHUFFLE: + case BUCKET_HASH_SHUFFLE: + case ADAPTIVE_PASSTHROUGH: + return true; + default: + return false; + } + } + + public TLocalPartitionType toThrift() { + switch (this) { + case GLOBAL_EXECUTION_HASH_SHUFFLE: + return TLocalPartitionType.GLOBAL_EXECUTION_HASH_SHUFFLE; + case LOCAL_EXECUTION_HASH_SHUFFLE: + return TLocalPartitionType.LOCAL_EXECUTION_HASH_SHUFFLE; + case BUCKET_HASH_SHUFFLE: + return TLocalPartitionType.BUCKET_HASH_SHUFFLE; + case PASSTHROUGH: + return TLocalPartitionType.PASSTHROUGH; + case ADAPTIVE_PASSTHROUGH: + return TLocalPartitionType.ADAPTIVE_PASSTHROUGH; + case BROADCAST: + return TLocalPartitionType.BROADCAST; + case PASS_TO_ONE: + return TLocalPartitionType.PASS_TO_ONE; + case LOCAL_MERGE_SORT: + return TLocalPartitionType.LOCAL_MERGE_SORT; + default: { + throw new IllegalStateException("Unsupported LocalExchangeType: " + this); + } + } + } + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/MaterializationNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/MaterializationNode.java index 3a21ae34868b29..40c75b90b13554 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/MaterializationNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/MaterializationNode.java @@ -22,6 +22,10 @@ import org.apache.doris.analysis.TupleDescriptor; import org.apache.doris.catalog.Column; import org.apache.doris.catalog.ColumnToThrift; +import org.apache.doris.common.Pair; +import org.apache.doris.nereids.glue.translator.PlanTranslatorContext; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeType; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeTypeRequire; import org.apache.doris.qe.ConnectContext; import org.apache.doris.resource.computegroup.ComputeGroup; import org.apache.doris.system.Backend; @@ -227,7 +231,17 @@ public void setTopMaterializeNode(boolean topMaterializeNode) { } @Override - public boolean isSerialOperator() { + public boolean isSerialNode() { return true; } + + @Override + public Pair enforceAndDeriveLocalExchange( + PlanTranslatorContext translatorContext, PlanNode parent, LocalExchangeTypeRequire parentRequire) { + Pair enforceResult = enforceRequire( + translatorContext, children.get(0), 0, LocalExchangeTypeRequire.requirePassthrough()); + children = new ArrayList<>(); + children.add(enforceResult.first); + return Pair.of(this, LocalExchangeType.PASSTHROUGH); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/NestedLoopJoinNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/NestedLoopJoinNode.java index ff788e4c5f1ee1..6465970201ecf7 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/NestedLoopJoinNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/NestedLoopJoinNode.java @@ -23,12 +23,17 @@ import org.apache.doris.analysis.SlotId; import org.apache.doris.analysis.TupleDescriptor; import org.apache.doris.analysis.TupleId; +import org.apache.doris.common.Pair; +import org.apache.doris.nereids.glue.translator.PlanTranslatorContext; import org.apache.doris.nereids.trees.expressions.ExprId; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeType; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeTypeRequire; import org.apache.doris.thrift.TExplainLevel; import org.apache.doris.thrift.TNestedLoopJoinNode; import org.apache.doris.thrift.TPlanNode; import org.apache.doris.thrift.TPlanNodeType; +import com.google.common.collect.Lists; import com.google.common.collect.Maps; import com.google.common.collect.Sets; @@ -215,8 +220,64 @@ public String getNodeExplainString(String detailPrefix, TExplainLevel detailLeve * Probe-side must have full data so join is a serial operator. */ @Override - public boolean isSerialOperator() { + public boolean isSerialNode() { return joinOp == JoinOperator.RIGHT_OUTER_JOIN || joinOp == JoinOperator.RIGHT_ANTI_JOIN || joinOp == JoinOperator.RIGHT_SEMI_JOIN || joinOp == JoinOperator.FULL_OUTER_JOIN; } + + @Override + public Pair enforceAndDeriveLocalExchange(PlanTranslatorContext translatorContext, + PlanNode parent, LocalExchangeTypeRequire parentRequire) { + + // Pooling mode: the fragment uses serial source (pooling scan or serial exchange). + // NLJ build side needs BROADCAST in pooling mode so all probe tasks see full build data. + boolean childUsePoolingScan = fragment.useSerialSource(translatorContext.getConnectContext()); + + LocalExchangeTypeRequire probeSideRequire; + LocalExchangeTypeRequire buildSideRequire; + LocalExchangeType outputType; + if (joinOp == JoinOperator.NULL_AWARE_LEFT_ANTI_JOIN) { + probeSideRequire = buildSideRequire = LocalExchangeTypeRequire.noRequire(); + outputType = LocalExchangeType.NOOP; + } else if (isSerialNode()) { + // RIGHT_OUTER/RIGHT_SEMI/RIGHT_ANTI/FULL_OUTER: probe side must be serial (1 task). + // Build side: noRequire() — inserting BROADCAST would inflate build pipeline's + // num_tasks while probe stays at 1, crashing in set_ready_to_read(). + probeSideRequire = LocalExchangeTypeRequire.noRequire(); + buildSideRequire = LocalExchangeTypeRequire.noRequire(); + outputType = LocalExchangeType.NOOP; + } else if (childUsePoolingScan) { + probeSideRequire = LocalExchangeTypeRequire.requireAdaptivePassthrough(); + buildSideRequire = LocalExchangeTypeRequire.requireBroadcast(); + outputType = LocalExchangeType.ADAPTIVE_PASSTHROUGH; + } else { + probeSideRequire = LocalExchangeTypeRequire.requireAdaptivePassthrough(); + buildSideRequire = LocalExchangeTypeRequire.noRequire(); + outputType = LocalExchangeType.ADAPTIVE_PASSTHROUGH; + } + + // Both sides use enforceRequire — it handles serial flag propagation, satisfy + // check (skip LE when child already outputs the required type, e.g., chained NLJs), + // serial ancestor skip, and serial child fallback (auto-upgrade noRequire to + // requirePassthrough when child is serial but this node is not). + PlanNode probeSide = enforceRequire( + translatorContext, children.get(0), 0, probeSideRequire).first; + PlanNode buildSide = enforceRequire( + translatorContext, children.get(1), 1, buildSideRequire).first; + this.children = Lists.newArrayList(probeSide, buildSide); + return Pair.of(this, outputType); + } + + @Override + protected boolean shouldResetSerialFlagForChild(int childIndex) { + // Build side (child 1) is a separate pipeline in BE. Normally, + // the serial-ancestor flag should be reset across pipeline boundaries. + // BUT when NLJ itself is serial (RIGHT_OUTER/ANTI/SEMI/FULL_OUTER), + // the probe pipeline has num_tasks=1. If we reset the flag, the + // build-side Exchange may insert PASSTHROUGH (restoring num_tasks to + // _num_instances), creating more build tasks than probe tasks. The + // extra build tasks have a NLJ shared state with empty source_deps, + // crashing in set_ready_to_read(). + return childIndex == 1 && !isSerialNode(); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/OlapScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/OlapScanNode.java index f1d89b63d51a25..925ae8e9a4ac58 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/OlapScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/OlapScanNode.java @@ -60,6 +60,8 @@ import org.apache.doris.common.util.DebugUtil; import org.apache.doris.nereids.glue.translator.PlanTranslatorContext; import org.apache.doris.nereids.trees.plans.ScoreRangeInfo; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeType; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeTypeRequire; import org.apache.doris.planner.normalize.Normalizer; import org.apache.doris.planner.normalize.PartitionRangePredicateNormalizer; import org.apache.doris.qe.ConnectContext; @@ -1076,6 +1078,9 @@ public String getNodeExplainString(String prefix, TExplainLevel detailLevel) { if (isPointQuery()) { output.append(prefix).append("SHORT-CIRCUIT\n"); } + if (fragment.useSerialSource(ConnectContext.get())) { + output.append(prefix).append("POOLING-SCAN\n"); + } printNestedColumns(output, prefix, getTupleDesc()); @@ -1453,4 +1458,17 @@ public OlapTableStreamUpdate getStreamUpdate() { } return new OlapTableStreamUpdate(prev, next); } + + @Override + public Pair enforceAndDeriveLocalExchange( + PlanTranslatorContext translatorContext, PlanNode parent, + LocalExchangeTypeRequire parentRequire) { + boolean useSerialSource = fragment != null + && fragment.useSerialSource(translatorContext.getConnectContext()); + if (useSerialSource) { + return Pair.of(this, LocalExchangeType.NOOP); + } + // Non-pooling OlapScan has bucket distribution — each instance scans specific buckets + return Pair.of(this, LocalExchangeType.BUCKET_HASH_SHUFFLE); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/PartitionSortNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/PartitionSortNode.java index ed587df3816ab7..2185585d70f725 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/PartitionSortNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/PartitionSortNode.java @@ -22,8 +22,12 @@ import org.apache.doris.analysis.ExprToThriftVisitor; import org.apache.doris.analysis.SortInfo; import org.apache.doris.analysis.ToSqlParams; +import org.apache.doris.common.Pair; +import org.apache.doris.nereids.glue.translator.PlanTranslatorContext; import org.apache.doris.nereids.trees.plans.PartitionTopnPhase; import org.apache.doris.nereids.trees.plans.WindowFuncType; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeType; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeTypeRequire; import org.apache.doris.thrift.TExplainLevel; import org.apache.doris.thrift.TPartTopNPhase; import org.apache.doris.thrift.TPartitionSortNode; @@ -169,4 +173,44 @@ protected void toThrift(TPlanNode msg) { partitionSortNode.setPtopnPhase(pTopNPhase); msg.partition_sort_node = partitionSortNode; } + + // NOTE: unlike SortNode (analytic_sort) and AnalyticEvalNode (with PARTITION BY), we + // intentionally do NOT override requiresShuffleForCorrectness() here, mirroring BE's + // PartitionSortSinkOperatorX which does not override is_shuffled_operator() either + // (be/src/exec/operator/partition_sort_sink_operator.h). The require on the child + // below is sufficient to insert the necessary HASH LE for TWO_PHASE_GLOBAL_PTOPN + // directly; the propagation flag would only matter if a SetOperationNode could sit + // between this node and the data source within the same fragment, which in practice + // does not happen because PartitionSort's two-phase shape places an ExchangeNode + // (fragment boundary) between the GLOBAL phase and the LOCAL phase / scans below. + // If FE ever plans PartitionSort + Union in a single fragment, both BE's + // is_shuffled_operator and this method must be updated together — never let FE + // diverge from BE here. + + @Override + public Pair enforceAndDeriveLocalExchange( + PlanTranslatorContext translatorContext, PlanNode parent, LocalExchangeTypeRequire parentRequire) { + LocalExchangeTypeRequire requireChild; + LocalExchangeType outputType; + if (phase == PartitionTopnPhase.TWO_PHASE_GLOBAL_PTOPN) { + // Use requireHash() so resolveExchangeType() can downgrade to LOCAL_EXECUTION_HASH_SHUFFLE, + // matching BE-native behavior where _use_serial_source=true causes LOCAL (not GLOBAL) hash. + // Output type is derived from the child's actual output (may be LOCAL or GLOBAL depending + // on whether a new exchange was inserted or the existing upstream exchange already satisfied). + requireChild = LocalExchangeTypeRequire.requireHash(); + outputType = null; + } else { + requireChild = LocalExchangeTypeRequire.requirePassthrough(); + outputType = LocalExchangeType.PASSTHROUGH; + } + Pair enforceResult + = enforceRequire(translatorContext, children.get(0), 0, requireChild); + this.children = Lists.newArrayList(enforceResult.first); + return Pair.of(this, outputType != null ? outputType : enforceResult.second); + } + + @Override + protected boolean shouldResetSerialFlagForChild(int childIndex) { + return true; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/PlanFragment.java b/fe/fe-core/src/main/java/org/apache/doris/planner/PlanFragment.java index a46e2240f4c98f..98621ccc4f6636 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/PlanFragment.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/PlanFragment.java @@ -514,7 +514,7 @@ public boolean useSerialSource(ConnectContext context) { // parallelism of non-serial operators. // For bucket shuffle / colocate join fragment, always use serial source if the bucket scan nodes are // serial. - && (hasSerialScanNode() || (sink instanceof DataStreamSink && !planRoot.isSerialOperator() + && (hasSerialScanNode() || (sink instanceof DataStreamSink && !planRoot.isSerialNode() && planRoot.hasSerialChildren())); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/PlanNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/PlanNode.java index 6e56919e4ffe1e..fdb9f93414f40b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/PlanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/PlanNode.java @@ -37,6 +37,9 @@ import org.apache.doris.common.TreeNode; import org.apache.doris.common.UserException; import org.apache.doris.datasource.iceberg.source.IcebergScanNode; +import org.apache.doris.nereids.glue.translator.PlanTranslatorContext; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeType; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeTypeRequire; import org.apache.doris.planner.normalize.ExprNormalizeVisitor; import org.apache.doris.planner.normalize.Normalizer; import org.apache.doris.qe.ConnectContext; @@ -142,7 +145,8 @@ public abstract class PlanNode extends TreeNode { protected int nereidsId = -1; - private List> childrenDistributeExprLists = new ArrayList<>(); + protected List> childrenDistributeExprLists = new ArrayList<>(); + protected List distributeExprLists = new ArrayList<>(); protected PlanNode(PlanNodeId id, List tupleIds, String planNodeName) { this.id = id; @@ -467,7 +471,7 @@ private void treeToThriftHelper(TPlan container) { TPlanNode msg = new TPlanNode(); msg.node_id = id.asInt(); msg.setNereidsId(nereidsId); - msg.setIsSerialOperator(isSerialOperator() && fragment.useSerialSource(ConnectContext.get())); + msg.setIsSerialOperator(isSerialOperatorOnBe(ConnectContext.get())); msg.num_children = children.size(); msg.limit = limit; for (TupleId tid : tupleIds) { @@ -824,14 +828,97 @@ public void foreachDownInCurrentFragment(Consumer visitor) { }); } - // Operators need to be executed serially. (e.g. finalized agg without key) - public boolean isSerialOperator() { + /** + * Node-level "is this operator inherently serial" property — answers without looking + * at the fragment. Default false; subclasses override (e.g. finalized agg without key, + * UNPARTITIONED ExchangeNode with merge sort). + * + * Use ONLY in framework-internal places where we are already iterating within a + * fragment whose serial-source mode is fixed: {@link #shouldResetSerialFlagForChild} + * inputs, {@link #createLocalExchange} heavy-op gate, and child.isSerialNode() checks + * embedded inside an enforceRequire path. Do NOT use it when computing a + * {@link LocalExchangeNode.LocalExchangeTypeRequire} on a child — call + * {@link #isSerialOperatorOnBe} instead. + */ + public boolean isSerialNode() { + return false; + } + + /** + * Whether this node will be reported to BE as {@code is_serial_operator=true}, i.e. it + * actually runs with one task on BE. Composes {@link #isSerialNode} with the fragment's + * {@code useSerialSource(context)} — when the fragment is not in serial-source mode + * even an isSerialNode()=true operator still runs with N tasks. + * + *

This is the API to use when deciding what {@code LocalExchangeTypeRequire} to + * declare for a child in {@code enforceAndDeriveLocalExchange}. Using + * {@link #isSerialNode} directly there will compute the wrong require under + * non-serial-source fragments and misses the {@code hasSerialScanNode()} contribution + * that {@link ExchangeNode#isSerialOperatorOnBe} layers in. Getting this wrong + * silently produces wrong results (serial child feeds N-task parent without LE). + * + *

Must match the condition in {@code toThrift()/treeToThriftHelper()}; subclasses + * (ExchangeNode) override to fold in {@code hasSerialScanNode()}. + */ + public boolean isSerialOperatorOnBe(ConnectContext context) { + return fragment != null && isSerialNode() && fragment.useSerialSource(context); + } + + /** + * "I depend on hash distribution for correctness, not just performance optimization." + * Used by UnionNode to decide whether to propagate hash requirement to its inputs: + * when a downstream operator requires shuffle for correctness, Union must pre-shuffle + * its inputs so the merged output is hash-distributed. + * + * Default is false; only operators that truly need hash for correctness override + * (finalize AggSink with group keys, HashJoin PARTITIONED/BUCKET_SHUFFLE, Intersect, + * Except, analytic SortNode, partition-by AnalyticEvalNode). Operators that request + * hash for performance only (StreamingAgg pre-agg with enable_local_exchange_before_agg) + * MUST NOT override — that would cause SetOperationNode to over-insert HASH LE on + * every union branch even when nothing downstream actually needs correctness shuffling. + * + * Mirrors BE's OperatorBase::is_shuffled_operator(). + * + *

Propagation example — multi-distinct over UNION

+ *
+     *   AggGlobal(finalize, hasKeys)             ← override = true (chain start)
+     *     └─ Agg(DISTINCT_LOCAL, !finalize)      ← override = false, inherits via
+     *                                              inheritedShuffled in enforceRequire 1b
+     *          └─ Agg(FIRST_MERGE, !finalize)    ← override = false, inherits
+     *               └─ Agg(FIRST_LOCAL, ...)     ← override = false, inherits
+     *                    └─ Union                ← reads inheritedShuffled=true and
+     *                                              pre-shuffles each branch
+     *                         ├─ Scan_t1
+     *                         └─ Scan_t2
+     * 
+ * + * Only the top-level correctness consumer needs to override true. Mid-chain + * merge / local phases do NOT need to — the flag flows down through + * {@link PlanTranslatorContext#hasShuffleForCorrectnessAncestor} automatically as long + * as every link in the chain requires HASH or NOOP (see {@code enforceRequire} step 1b). + * + *

What happens if you forget to override

+ *
    + *
  • Short chain (top consumer directly above Union): Union doesn't + * pre-shuffle its branches, but {@code enforceRequire} inserts a fallback + * LE(HASH) between the consumer and Union. Data result is still correct, + * but the plan shape differs from BE-planned mode (one extra fan-in→fan-out).
  • + *
  • Long chain: same outcome as short chain, because the fallback LE + * is inserted at the consumer/Union boundary regardless of chain length.
  • + *
  • The real wrong-result risk is when {@code enforceRequire}'s fallback + * LE is skipped — e.g. Layer 1 skip when a serial ancestor sits between the + * consumer and Union. In practice top-level correctness consumers (finalize + * agg, hash join, etc.) are not under serial ancestors so this is rare, but + * the override is the principled fix.
  • + *
+ */ + public boolean requiresShuffleForCorrectness() { return false; } public boolean hasSerialChildren() { if (children.isEmpty()) { - return isSerialOperator(); + return isSerialNode(); } return children.stream().allMatch(PlanNode::hasSerialChildren); } @@ -933,4 +1020,262 @@ private String mergeIcebergAccessPathsWithId( } return StringUtils.join(mergeDisplayAccessPaths, ", "); } + + public Pair enforceAndDeriveLocalExchange( + PlanTranslatorContext translatorContext, PlanNode parent, LocalExchangeTypeRequire parentRequire) { + ArrayList newChildren = Lists.newArrayList(); + for (int i = 0; i < children.size(); i++) { + Pair childOutput + = enforceRequire(translatorContext, children.get(i), i, LocalExchangeTypeRequire.noRequire()); + newChildren.add(childOutput.first); + } + this.children = newChildren; + return Pair.of(this, LocalExchangeType.NOOP); + } + + /** + * Unified framework method: propagate serial flag → recurse child → satisfy check → Layer 1 skip → insert LE. + * Replaces the old enforceChild/enforceChildExchange/forceEnforceChildExchange trio. + * + *

Data flow

+ *
    + *
  • serial-ancestor flag ({@link PlanTranslatorContext#hasSerialAncestorInPipeline}) + * — flows root → leaf during traversal. Mirrors BE's + * {@code any_of(operators[idx..end], is_serial_operator)} check used by + * {@code _add_local_exchange} to skip LE insertion when an ancestor in the same + * pipeline is already serial. Reset at pipeline boundaries via + * {@link #shouldResetSerialFlagForChild}.
  • + *
  • shuffle-for-correctness flag + * ({@link PlanTranslatorContext#hasShuffleForCorrectnessAncestor}) — also flows + * root → leaf. Mirrors BE's {@code _followed_by_shuffled_operator}: tells a + * child whether some downstream operator depends on hash distribution for + * correctness, so {@code SetOperationNode} can pre-shuffle union branches.
  • + *
  • return value {@code Pair} — first is the + * (possibly LE-wrapped) child; second is the actual output distribution as + * observed by the parent. Caller's {@code require.satisfy(output)} decides + * whether more LE is needed.
  • + *
  • parent.require describes the constraint on the child output — + * computed inside the parent's {@code enforceAndDeriveLocalExchange} per child.
  • + *
+ * + *

Invariants

+ *
    + *
  • Every serial → non-serial transition has an LE somewhere between them + * (enforced by framework step 3 below, validated by + * {@link AddLocalExchange#validateNoSerialWithoutLocalExchange}).
  • + *
  • {@code LocalExchangeNode} itself is always non-serial — setting it serial + * would defeat its purpose of fanning a 1-task pipeline back to N tasks.
  • + *
  • For pipeline-breaking parents ({@code shouldResetSerialFlagForChild=true}), + * the child starts a fresh pipeline so {@code hasSerialAncestor} is reset; the + * node's own {@code isSerialNode()} still composes in for the child's view.
  • + *
  • {@code RequireHash} accepts any hash flavour; {@code RequireSpecific} demands + * an exact match (with the one PASSTHROUGH/ADAPTIVE_PASSTHROUGH compatibility). + * Pick the looser one whenever correctness allows — see + * {@link LocalExchangeNode.LocalExchangeTypeRequire}.
  • + *
+ * + *

Layers

+ * Layer 1 (shouldSkipLE): mirrors BE's need_to_local_exchange — skip when this node or + * an ancestor in the same pipeline is serial (operators[idx..end] has serial → skip). + * Layer 2 (require/output): each Node declares require and output in enforceAndDeriveLocalExchange. + */ + protected Pair enforceRequire( + PlanTranslatorContext translatorContext, PlanNode child, int childIndex, + LocalExchangeTypeRequire require) { + // 1. Propagate serial-ancestor flag to child. + // For pipeline-splitting operators (shouldReset=true, e.g. non-streaming AGG): + // Drop inherited serial flag from parent (parent is in a different pipeline), + // but keep this node's own serial status (child is in the same pipeline as this + // node's sink, e.g. Exchange is in AGG_Sink pipeline). + // For non-splitting operators (shouldReset=false, e.g. streaming AGG): + // Inherit parent's serial flag + this node's own. + boolean inheritedSerial = shouldResetSerialFlagForChild(childIndex) + ? false : translatorContext.hasSerialAncestorInPipeline(this); + // Use isSerialOperatorOnBe (= isSerialNode && fragment.useSerialSource) instead of the + // raw isSerialNode(). BE's OperatorBase reads the Thrift `is_serial_operator` flag — + // which is what FE writes via isSerialOperatorOnBe — so when the fragment is not in + // serial-source mode, BE treats this operator as non-serial regardless of isSerialNode. + // Using isSerialNode here would set the child's serial-ancestor flag wider than BE's + // view and over-skip required LocalExchanges downstream. + boolean childHasSerialAncestor = inheritedSerial + || isSerialOperatorOnBe(translatorContext.getConnectContext()); + translatorContext.setHasSerialAncestorInPipeline(child, childHasSerialAncestor); + + // 1b. Propagate shuffle-for-correctness-ancestor flag to child. + // Mirrors BE's _followed_by_shuffled_operator: a downstream operator needs hash + // distribution for correctness, and the chain to here goes through HASH or NOOP + // requirements (so the dependency is preserved). + // propagate = ((inheritedShuffled || self.requiresShuffleForCorrectness) + // && require is hash) + // || (inheritedShuffled && require is noop/passthrough) + boolean inheritedShuffled = translatorContext.hasShuffleForCorrectnessAncestor(this); + boolean selfOrInheritedShuffled = inheritedShuffled || requiresShuffleForCorrectness(); + boolean requireIsHash = require.preferType().isHashShuffle(); + boolean requireIsNoop = require.preferType() == LocalExchangeNode.LocalExchangeType.NOOP; + boolean childShuffledAncestor = (selfOrInheritedShuffled && requireIsHash) + || (inheritedShuffled && requireIsNoop); + translatorContext.setHasShuffleForCorrectnessAncestor(child, childShuffledAncestor); + + // 2. Recurse child (Layer 2: child declares its own require/output) + Pair childOutput = + child.enforceAndDeriveLocalExchange(translatorContext, this, require); + + // Steps 2.5 and 3 both react to a serial child but address different concerns: + // - Step 2.5 rewrites the OUTPUT-side view (what we tell satisfy/parent about + // the child's actual distribution). A serial pipeline runs with 1 task so + // its distribution claim is meaningless — flatten to NOOP so the satisfy + // check below doesn't get fooled by a stale "I output BUCKET_HASH" claim. + // - Step 3 rewrites the REQUIRE-side decision (what we want from the child). + // If we previously asked for nothing (noRequire) but the child turns out + // to be serial and we're not, upgrade to requirePassthrough so an LE is + // inserted to restore parallelism. + + // 2.5. Serial child override (output side): if child is serial on BE, force its + // reported output to NOOP. Distribution is irrelevant when the child runs + // with 1 task; downstream parallelism is restored either by step 3 (LE + // insertion) or skipped entirely by step 4b (we're also serial). + if (childOutput.first.isSerialOperatorOnBe(translatorContext.getConnectContext())) { + childOutput = Pair.of(childOutput.first, LocalExchangeType.NOOP); + } + + // 3. Framework-level serial child check (require side, mirrors BE base class + // required_data_distribution): if child will be serial on BE but this node is + // not serial, the pipeline has a 1-task serial child feeding an N-task non-serial + // parent. Without LE, pipeline splits (AGG/JOIN) create paired pipelines with + // mismatched num_tasks → crash. Upgrade noRequire to requirePassthrough so an + // LE is inserted below to restore parallelism. + if (require instanceof LocalExchangeNode.NoRequire + && childOutput.first.isSerialOperatorOnBe(translatorContext.getConnectContext()) + && !isSerialOperatorOnBe(translatorContext.getConnectContext())) { + require = LocalExchangeTypeRequire.requirePassthrough(); + } + + // 4. Satisfy check: child output meets requirement → done + if (require.satisfy(childOutput.second)) { + return childOutput; + } + + // 4. Layer 1: skip LE when serial operator or ancestor in same pipeline + // Equivalent to BE's need_to_local_exchange: any_of(operators[idx..end], is_serial) → skip. + // Use isSerialOperatorOnBe (not isSerialNode) because BE's Pipeline::need_to_local_exchange + // checks op->is_serial_operator() which reads the Thrift flag set from isSerialOperatorOnBe; + // when fragment.useSerialSource is false, BE treats this node as non-serial. + if (translatorContext.hasSerialAncestorInPipeline(this) + || isSerialOperatorOnBe(translatorContext.getConnectContext())) { + return childOutput; + } + + // 5. Resolve exchange type and create LE node + LocalExchangeType preferType = AddLocalExchange.resolveExchangeType( + require, translatorContext, this, childOutput.first); + List distributeExprs = getLocalExchangeDistributeExprs(childIndex, selfOrInheritedShuffled); + PlanNode leNode = createLocalExchange(translatorContext, childOutput.first, preferType, distributeExprs); + return Pair.of(leNode, preferType); + } + + /** + * Create a LocalExchangeNode wrapping child with the given exchange type. + * No child-type skip — matches BE's _add_local_exchange which inserts LE for any child + * type without checking instanceof. + * + * Handles heavy-ops bottleneck avoidance (mirrors BE pipeline_fragment_context.cpp): + * when upstream has 1 task (serial source) and exchange is heavy (hash/bucket/adaptive), + * insert a PASSTHROUGH fan-out first to avoid single-task bottleneck on the heavy + * exchange sink. Only applies to local-shuffle (pooling scan) fragments. + */ + protected PlanNode createLocalExchange(PlanTranslatorContext translatorContext, + PlanNode child, LocalExchangeType exchangeType, List distributeExprs) { + if (fragment != null && fragment.useSerialSource(translatorContext.getConnectContext()) + && exchangeType.isHeavyOperation() && child.isSerialNode()) { + PlanNode ptNode = new LocalExchangeNode(translatorContext.nextPlanNodeId(), + child, LocalExchangeType.PASSTHROUGH, null); + return new LocalExchangeNode(translatorContext.nextPlanNodeId(), ptNode, + exchangeType, distributeExprs); + } + return new LocalExchangeNode(translatorContext.nextPlanNodeId(), child, + exchangeType, distributeExprs); + } + + /** + * Whether the child at {@code childIndex} starts a new pipeline context, causing + * its serial-ancestor flag to be reset to {@code false} rather than inherited from this node. + * Override to return {@code true} for pipeline-splitting nodes (LocalExchangeNode) and nodes + * whose children run in an independent pipeline segment (SortNode above analytic, etc.). + */ + protected boolean shouldResetSerialFlagForChild(int childIndex) { + return false; + } + + protected List getChildDistributeExprList(int childIndex) { + if ((childrenDistributeExprLists == null || childrenDistributeExprLists.size() <= childIndex)) { + return null; + } else { + return childrenDistributeExprLists.get(childIndex); + } + } + + /** + * Return distribute exprs used as the hash key when {@link #enforceRequire} inserts a + * LocalExchange between this node and {@code child[childIndex]}. Default returns the + * child's output distribution ({@code childrenDistributeExprLists[childIndex]}). + * + *

Subclasses override this to mirror BE-specific {@code _partition_exprs} logic. For + * example BE's {@code AggSinkOperatorX::update_operator} picks + * {@code grouping_exprs} when {@code !_followed_by_shuffled_operator && !has_distinct}, + * even though the child outputs a different (hash) distribution — and the LE inserted + * before the streaming preagg must partition by {@code grouping_exprs} so a local + * partial reduce actually collapses same-key rows. Using the default (child + * distribution) here would scatter same-group rows across instances and degrade the + * preagg to a no-op, also breaking row-arrival order at downstream merge-finalize. + * + * @param childIndex which child + * @param followedByShuffled whether the chain at this node is followed by a shuffled + * operator (mirrors BE's {@code _followed_by_shuffled_operator}) + */ + protected List getLocalExchangeDistributeExprs(int childIndex, boolean followedByShuffled) { + return getChildDistributeExprList(childIndex); + } + + /** + * Returns the operator's own semantically-defined partition expressions + * (e.g. GROUP BY exprs for aggregation, PARTITION BY exprs for analytic). + * Corresponds to BE's fallback path: tnode.agg_node.grouping_exprs / + * tnode.analytic_node.partition_exprs when _followed_by_shuffled_operator=false. + * Override in subclasses that have intrinsic partition keys. + */ + protected List getSemanticPartitionExprs() { + return null; + } + + /** + * Returns true if there are effective (non-empty) partition expressions, + * mirroring BE's _partition_exprs logic: + * _followed_by_shuffled_operator=true → distribute_expr_lists[0] (child distribute key) + * _followed_by_shuffled_operator=false → semantic partition exprs (grouping / partition by) + * parentRequire.preferType().isHashShuffle() corresponds to _followed_by_shuffled_operator=true. + */ + protected boolean hasPartitionExprs(LocalExchangeTypeRequire parentRequire) { + if (parentRequire.preferType().isHashShuffle()) { + List childExprs = getChildDistributeExprList(0); + return childExprs != null && !childExprs.isEmpty(); + } + List semanticExprs = getSemanticPartitionExprs(); + return semanticExprs != null && !semanticExprs.isEmpty(); + } + + public List> getChildrenDistributeExprLists() { + return childrenDistributeExprLists; + } + + public List getDistributeExprLists() { + return distributeExprLists; + } + + public void setDistributeExprLists(List distributeExprLists) { + if (distributeExprLists == null) { + this.distributeExprLists = Collections.emptyList(); + } else { + this.distributeExprLists = distributeExprLists; + } + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/RecursiveCteNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/RecursiveCteNode.java index 838facd27b320a..a32b599374a03a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/RecursiveCteNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/RecursiveCteNode.java @@ -21,6 +21,10 @@ import org.apache.doris.analysis.ExprToSqlVisitor; import org.apache.doris.analysis.ToSqlParams; import org.apache.doris.analysis.TupleId; +import org.apache.doris.common.Pair; +import org.apache.doris.nereids.glue.translator.PlanTranslatorContext; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeType; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeTypeRequire; import org.apache.doris.thrift.TExplainLevel; import org.apache.doris.thrift.TPlanNode; import org.apache.doris.thrift.TPlanNodeType; @@ -29,6 +33,7 @@ import com.google.common.base.MoreObjects; import com.google.common.collect.Lists; +import java.util.ArrayList; import java.util.List; public class RecursiveCteNode extends PlanNode { @@ -86,4 +91,49 @@ public String toString() { .add("tid", tupleIds.get(0).asInt()) .add("isUnionAll", isUnionAll).toString(); } + + @Override + public boolean isSerialNode() { + // Mirror BE's RecCTESourceOperatorX::is_serial_operator() which always returns true: + // the recursive driver runs sequentially in one task, so downstream consumers must see + // RecursiveCteNode as serial too. Without this, FE planner leaves the producer + // fragment with parallel=N senders but only one actually emits data — the cross- + // fragment Exchange receiver expects N senders done and hangs waiting on the other + // N-1. Marking serial here lets AddLocalExchange#addLocalExchangeForFragment wrap + // the root with a PASSTHROUGH LE that fans the serial RecCte output out to N + // parallel sinks, matching BE-native _plan_local_exchange behaviour. + return true; + } + + @Override + public Pair enforceAndDeriveLocalExchange( + PlanTranslatorContext translatorContext, PlanNode parent, LocalExchangeTypeRequire parentRequire) { + // Recurse into children to give them a chance to plan local exchanges below + // themselves, but never insert one *directly* under RecursiveCteNode: + // - ThriftPlansBuilder locates the recursive sender fragment via + // `getChild(1).getChild(0).getFragment()`; a LocalExchangeNode wrapper + // would shift that path off the cross-fragment ExchangeNode and pull the + // wrong fragment into `fragmentsToReset`. + // - BE's RecCTESourceOperatorX wires the anchor / recursive side pipelines + // directly against the Exchange children (pipeline_fragment_context.cpp + // REC_CTE_NODE handling); injecting an extra LE pipeline between them + // mis-routes the rerun signal and crashes BE during execution. + // Both issues are pure shape mismatches — RecursiveCteNode's children are + // already the cross-fragment ExchangeNode receivers, which BE drives serially + // itself, so no FE-side fan-out is needed here. + ArrayList newChildren = Lists.newArrayList(); + for (int i = 0; i < children.size(); i++) { + PlanNode child = children.get(i); + Pair childOutput = child.enforceAndDeriveLocalExchange( + translatorContext, this, LocalExchangeTypeRequire.noRequire()); + newChildren.add(childOutput.first); + } + this.children = newChildren; + return Pair.of(this, LocalExchangeType.NOOP); + } + + @Override + protected boolean shouldResetSerialFlagForChild(int childIndex) { + return true; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/RecursiveCteScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/RecursiveCteScanNode.java index c0ed988010ca97..9f98f7b2c2719c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/RecursiveCteScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/RecursiveCteScanNode.java @@ -21,6 +21,10 @@ import org.apache.doris.analysis.ExprToSqlVisitor; import org.apache.doris.analysis.ToSqlParams; import org.apache.doris.analysis.TupleDescriptor; +import org.apache.doris.common.Pair; +import org.apache.doris.nereids.glue.translator.PlanTranslatorContext; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeType; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeTypeRequire; import org.apache.doris.thrift.TExplainLevel; import org.apache.doris.thrift.TPlanNode; import org.apache.doris.thrift.TPlanNodeType; @@ -67,7 +71,13 @@ protected void toThrift(TPlanNode msg) { } @Override - public boolean isSerialOperator() { + public boolean isSerialNode() { return true; } + + @Override + public Pair enforceAndDeriveLocalExchange( + PlanTranslatorContext translatorContext, PlanNode parent, LocalExchangeTypeRequire parentRequire) { + return Pair.of(this, LocalExchangeType.NOOP); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/RepeatNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/RepeatNode.java index e28094cfe89bf1..bf1be174f979d7 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/RepeatNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/RepeatNode.java @@ -19,6 +19,10 @@ import org.apache.doris.analysis.ExprToThriftVisitor; import org.apache.doris.analysis.GroupingInfo; +import org.apache.doris.common.Pair; +import org.apache.doris.nereids.glue.translator.PlanTranslatorContext; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeType; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeTypeRequire; import org.apache.doris.thrift.TExplainLevel; import org.apache.doris.thrift.TPlanNode; import org.apache.doris.thrift.TPlanNodeType; @@ -86,11 +90,21 @@ public String getNodeExplainString(String detailPrefix, TExplainLevel detailLeve // Determined by its child. @Override - public boolean isSerialOperator() { - return children.get(0).isSerialOperator(); + public boolean isSerialNode() { + return children.get(0).isSerialNode(); } public GroupingInfo getGroupingInfo() { return groupingInfo; } + + @Override + public Pair enforceAndDeriveLocalExchange( + PlanTranslatorContext translatorContext, PlanNode parent, LocalExchangeTypeRequire parentRequire) { + Pair enforceResult + = enforceRequire(translatorContext, children.get(0), 0, parentRequire); + children = new java.util.ArrayList<>(); + children.add(enforceResult.first); + return Pair.of(this, enforceResult.second); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/RuntimeFilter.java b/fe/fe-core/src/main/java/org/apache/doris/planner/RuntimeFilter.java index f4ea16c9360dce..e1860e5bcd7220 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/RuntimeFilter.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/RuntimeFilter.java @@ -263,7 +263,7 @@ public TRuntimeFilterDesc toThrift() { for (RuntimeFilterTarget target : targets) { tFilter.putToPlanIdToTargetExpr(target.node.getId().asInt(), ExprToThriftVisitor.treeToThrift(target.expr)); hasSerialTargets = hasSerialTargets - || (target.node.isSerialOperator() && target.node.fragment.useSerialSource(ConnectContext.get())); + || target.node.isSerialOperatorOnBe(ConnectContext.get()); } boolean enableSyncFilterSize = ConnectContext.get() != null diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/ScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/ScanNode.java index 02a40761db9c19..e69d310154851f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/ScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/ScanNode.java @@ -42,11 +42,15 @@ import org.apache.doris.catalog.TableIf; import org.apache.doris.cloud.catalog.CloudPartition; import org.apache.doris.common.Config; +import org.apache.doris.common.Pair; import org.apache.doris.common.UserException; import org.apache.doris.datasource.FederationBackendPolicy; import org.apache.doris.datasource.SplitAssignment; import org.apache.doris.datasource.SplitGenerator; import org.apache.doris.datasource.SplitSource; +import org.apache.doris.nereids.glue.translator.PlanTranslatorContext; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeType; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeTypeRequire; import org.apache.doris.qe.ConnectContext; import org.apache.doris.rpc.RpcException; import org.apache.doris.system.Backend; @@ -733,7 +737,7 @@ public ScanContext getScanContext() { } @Override - public boolean isSerialOperator() { + public boolean isSerialNode() { ConnectContext context = ConnectContext.get(); if (context == null) { return numScanBackends() <= 0; @@ -747,7 +751,15 @@ public boolean isSerialOperator() { @Override public boolean hasSerialScanChildren() { - return isSerialOperator(); + return isSerialNode(); + } + + @Override + public Pair enforceAndDeriveLocalExchange( + PlanTranslatorContext translatorContext, PlanNode parent, LocalExchangeTypeRequire parentRequire) { + // Base ScanNode returns NOOP — only OlapScanNode overrides with BUCKET_HASH_SHUFFLE + // for non-pooling scans that have bucket distribution. + return Pair.of(this, LocalExchangeType.NOOP); } public void setDesc(TupleDescriptor desc) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/SelectNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/SelectNode.java index 2ef0b364be1292..92b6975afcf709 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/SelectNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/SelectNode.java @@ -20,6 +20,10 @@ package org.apache.doris.planner; +import org.apache.doris.common.Pair; +import org.apache.doris.nereids.glue.translator.PlanTranslatorContext; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeType; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeTypeRequire; import org.apache.doris.thrift.TExplainLevel; import org.apache.doris.thrift.TPlanNode; import org.apache.doris.thrift.TPlanNodeType; @@ -57,7 +61,17 @@ public String getNodeExplainString(String prefix, TExplainLevel detailLevel) { // Determined by its child. @Override - public boolean isSerialOperator() { - return children.get(0).isSerialOperator(); + public boolean isSerialNode() { + return children.get(0).isSerialNode(); + } + + @Override + public Pair enforceAndDeriveLocalExchange( + PlanTranslatorContext translatorContext, PlanNode parent, LocalExchangeTypeRequire parentRequire) { + Pair enforceResult + = enforceRequire(translatorContext, children.get(0), 0, parentRequire); + this.children = new ArrayList<>(); + this.children.add(enforceResult.first); + return Pair.of(this, enforceResult.second); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/SetOperationNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/SetOperationNode.java index aecb6e43c88ba3..7d7008d5911e6d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/SetOperationNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/SetOperationNode.java @@ -22,6 +22,10 @@ import org.apache.doris.analysis.ExprToThriftVisitor; import org.apache.doris.analysis.ToSqlParams; import org.apache.doris.analysis.TupleId; +import org.apache.doris.common.Pair; +import org.apache.doris.nereids.glue.translator.PlanTranslatorContext; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeType; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeTypeRequire; import org.apache.doris.thrift.TExceptNode; import org.apache.doris.thrift.TExplainLevel; import org.apache.doris.thrift.TExpr; @@ -37,6 +41,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.util.ArrayList; import java.util.List; import java.util.stream.Collectors; @@ -193,4 +198,52 @@ public int getNumInstances() { public boolean isBucketShuffle() { return distributionMode.equals(DistributionMode.BUCKET_SHUFFLE); } + + public boolean isColocate() { + return isColocate; + } + + @Override + public Pair enforceAndDeriveLocalExchange(PlanTranslatorContext translatorContext, + PlanNode parent, LocalExchangeTypeRequire parentRequire) { + LocalExchangeTypeRequire requireChild; + LocalExchangeType outputType; + PlanNode firstChild = children.isEmpty() ? null : children.get(0); + if (this instanceof UnionNode) { + // Propagate parent's hash requirement to children ONLY when a downstream operator + // requires shuffle for correctness (not just performance optimization). Matches BE's + // UnionSinkOperatorX which returns GLOBAL_HASH(_distribute_exprs) whenever + // _followed_by_shuffled_operator=true. The flag is propagated by enforceRequire + // from operators with requiresShuffleForCorrectness()=true (finalize agg, hash join, + // intersect/except) through hash/noop links. + // See PlanNode.requiresShuffleForCorrectness() for a chain-propagation example. + boolean canPropagateHash = translatorContext.hasShuffleForCorrectnessAncestor(this); + requireChild = canPropagateHash ? parentRequire.autoRequireHash() : LocalExchangeTypeRequire.noRequire(); + outputType = canPropagateHash + ? AddLocalExchange.resolveExchangeType(requireChild, translatorContext, this, firstChild) + : LocalExchangeType.NOOP; + } else { + // Intersect / Except + if (AddLocalExchange.isColocated(this)) { + requireChild = LocalExchangeTypeRequire.requireBucketHash(); + outputType = LocalExchangeType.BUCKET_HASH_SHUFFLE; + } else { + requireChild = parentRequire.autoRequireHash(); + outputType = AddLocalExchange.resolveExchangeType( + requireChild, translatorContext, this, firstChild); + } + } + + ArrayList newChildren = Lists.newArrayList(); + for (int i = 0; i < children.size(); i++) { + newChildren.add(enforceRequire(translatorContext, children.get(i), i, requireChild).first); + } + this.children = newChildren; + return Pair.of(this, outputType); + } + + @Override + protected boolean shouldResetSerialFlagForChild(int childIndex) { + return true; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/SortNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/SortNode.java index c94c1764d61be2..2d1631f2d05acb 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/SortNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/SortNode.java @@ -25,6 +25,9 @@ import org.apache.doris.analysis.SortInfo; import org.apache.doris.analysis.ToSqlParams; import org.apache.doris.common.Pair; +import org.apache.doris.nereids.glue.translator.PlanTranslatorContext; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeType; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeTypeRequire; import org.apache.doris.qe.ConnectContext; import org.apache.doris.thrift.TExplainLevel; import org.apache.doris.thrift.TPlanNode; @@ -217,10 +220,22 @@ protected void toThrift(TPlanNode msg) { // If it's analytic sort or not merged by a followed exchange node, it must output the global ordered data. @Override - public boolean isSerialOperator() { + public boolean isSerialNode() { return !isAnalyticSort && !mergeByexchange; } + /** + * Mirrors BE's {@code SortSinkOperatorX::is_shuffled_operator() = _is_analytic_sort} + * (be/src/exec/operator/sort_sink_operator.h:95). Analytic-sort requires partitioned + * input by analytic partition keys, so downstream UnionNode / SetOperationNode under + * us must pre-shuffle their branches to match — the framework propagates this through + * {@link PlanTranslatorContext#hasShuffleForCorrectnessAncestor}. + */ + @Override + public boolean requiresShuffleForCorrectness() { + return isAnalyticSort; + } + public void setColocate(boolean colocate) { isColocate = colocate; } @@ -245,4 +260,46 @@ public void setTopnFilterTargets( List> topnFilterTargets) { this.topnFilterTargets = topnFilterTargets; } + + @Override + public Pair enforceAndDeriveLocalExchange(PlanTranslatorContext translatorContext, + PlanNode parent, LocalExchangeTypeRequire parentRequire) { + + LocalExchangeTypeRequire requireChild; + LocalExchangeType outputType = null; + if (isAnalyticSort) { + // BE: SortSink._is_analytic_sort=true → required_data_distribution() = HASH. + // This sort serves a parent AnalyticEvalNode (window function) and requires + // data partitioned by the analytic's partition exprs. + if (AddLocalExchange.isColocated(this)) { + requireChild = LocalExchangeTypeRequire.requireHash(); + outputType = AddLocalExchange.resolveExchangeType( + LocalExchangeTypeRequire.requireHash(), translatorContext, this, + children.get(0)); + } else { + requireChild = parentRequire.autoRequireHash(); + } + } else if (mergeByexchange) { + // BE: SortSink._merge_by_exchange=true → required_data_distribution() = PASSTHROUGH. + requireChild = LocalExchangeTypeRequire.requirePassthrough(); + outputType = LocalExchangeType.PASSTHROUGH; + } else { + // BE: else → NOOP + requireChild = LocalExchangeTypeRequire.noRequire(); + outputType = LocalExchangeType.NOOP; + } + + Pair enforceResult + = enforceRequire(translatorContext, children.get(0), 0, requireChild); + this.children = Lists.newArrayList(enforceResult.first); + if (outputType == null) { + outputType = enforceResult.second; + } + return Pair.of(this, outputType); + } + + @Override + protected boolean shouldResetSerialFlagForChild(int childIndex) { + return true; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/TableFunctionNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/TableFunctionNode.java index 3630bfd72a9c31..c363b30c31850e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/TableFunctionNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/TableFunctionNode.java @@ -23,6 +23,10 @@ import org.apache.doris.analysis.SlotId; import org.apache.doris.analysis.ToSqlParams; import org.apache.doris.analysis.TupleId; +import org.apache.doris.common.Pair; +import org.apache.doris.nereids.glue.translator.PlanTranslatorContext; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeType; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeTypeRequire; import org.apache.doris.thrift.TExplainLevel; import org.apache.doris.thrift.TPlanNode; import org.apache.doris.thrift.TPlanNodeType; @@ -120,4 +124,24 @@ protected void toThrift(TPlanNode msg) { msg.table_function_node.addToOutputSlotIds(slotId.asInt()); } } + + @Override + public Pair enforceAndDeriveLocalExchange( + PlanTranslatorContext translatorContext, PlanNode parent, LocalExchangeTypeRequire parentRequire) { + // Mirrors BE TableFunctionOperatorX::required_data_distribution() which always + // returns PASSTHROUGH, regardless of child's serial status. + // + // Conceptual model: TableFunction requires PASSTHROUGH input but outputs + // "unknown distribution" (NOOP). This means downstream operators (e.g. Sort) + // must independently evaluate their own requirements against NOOP, naturally + // triggering exchange insertion when they require PASSTHROUGH. + // + // In BE, need_to_local_exchange() Step 4 treats non-hash exchanges (PASSTHROUGH) + // as always needing insertion, so "PASSTHROUGH doesn't satisfy PASSTHROUGH" — + // which is equivalent to our FE model of require=PASSTHROUGH, output=NOOP. + Pair enforceResult = enforceRequire( + translatorContext, children.get(0), 0, LocalExchangeTypeRequire.requirePassthrough()); + children = Lists.newArrayList(enforceResult.first); + return Pair.of(this, LocalExchangeType.NOOP); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/UnionNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/UnionNode.java index 36e50b8f5010ba..0b05cafd9cb31a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/UnionNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/UnionNode.java @@ -37,7 +37,7 @@ protected void toThrift(TPlanNode msg) { // If it is a union without children which means it will output some constant values, we should use a serial union // to output non-duplicated data. @Override - public boolean isSerialOperator() { + public boolean isSerialNode() { return children.isEmpty(); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/Coordinator.java b/fe/fe-core/src/main/java/org/apache/doris/qe/Coordinator.java index 5181a77b042f57..49ca93b7d7ab08 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/Coordinator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/Coordinator.java @@ -352,7 +352,6 @@ public Coordinator(ConnectContext context, Planner planner) { } else { distributedPlans = ((NereidsPlanner) planner).getDistributedPlans(); } - setFromUserProperty(context); this.queryGlobals.setNowString(TimeUtils.getDatetimeFormatWithTimeZone().format(LocalDateTime.now())); @@ -428,6 +427,8 @@ private void initQueryOptions(ConnectContext context) { this.queryOptions.setFeProcessUuid(ExecuteEnv.getInstance().getProcessUUID()); this.queryOptions.setMysqlRowBinaryFormat( context.getCommand() == MysqlCommand.COM_STMT_EXECUTE); + // Old Coordinator never plans local exchange in FE. Force BE to plan its own. + this.queryOptions.setEnableLocalShufflePlanner(false); } public ConnectContext getConnectContext() { @@ -2884,7 +2885,7 @@ private void assignScanRanges(PlanFragmentId fragmentId, int parallelExecInstanc Optional node = scanNodes.stream().filter( scanNode -> scanNode.getId().asInt() == scanId).findFirst(); Preconditions.checkArgument(node.isPresent()); - FInstanceExecParam instanceParamToScan = node.get().isSerialOperator() + FInstanceExecParam instanceParamToScan = node.get().isSerialNode() ? firstInstanceParam : instanceParam; if (!instanceParamToScan.perNodeScanRanges.containsKey(nodeScanRange.getKey())) { range.put(nodeScanRange.getKey(), Lists.newArrayList()); diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/NereidsCoordinator.java b/fe/fe-core/src/main/java/org/apache/doris/qe/NereidsCoordinator.java index f8c7509f102678..19c2eaec3ad708 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/NereidsCoordinator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/NereidsCoordinator.java @@ -101,6 +101,7 @@ public NereidsCoordinator(ConnectContext context, setForInsert(-1L); } + syncLocalShufflePlannerOption(); Preconditions.checkState(!planner.getFragments().isEmpty() && coordinatorContext.instanceNum.get() > 0, "Fragment and Instance can not be empty˚"); } @@ -115,6 +116,7 @@ public NereidsCoordinator(ConnectContext context, // we don't need to check the dataSink, Because setting jobId means this must be a load operation setForInsert(jobId); + syncLocalShufflePlannerOption(); Preconditions.checkState(!planner.getFragments().isEmpty() && coordinatorContext.instanceNum.get() > 0, "Fragment and Instance can not be empty˚"); } @@ -132,11 +134,20 @@ public NereidsCoordinator(Long jobId, TUniqueId queryId, DescriptorTable descTab // same reason in `setForInsert` this.coordinatorContext.queryOptions.setDisableFileCache(true); this.needEnqueue = false; + syncLocalShufflePlannerOption(); Preconditions.checkState(!fragments.isEmpty() && coordinatorContext.instanceNum.get() > 0, "Fragment and Instance can not be empty˚"); } + private void syncLocalShufflePlannerOption() { + coordinatorContext.queryOptions.setEnableLocalShufflePlanner( + coordinatorContext.distributedPlans != null + && !coordinatorContext.distributedPlans.isEmpty() + && coordinatorContext.connectContext != null + && coordinatorContext.connectContext.getSessionVariable().isEnableLocalShufflePlanner()); + } + @Override public void exec() throws Exception { enqueue(coordinatorContext.connectContext); diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index 54095c9e08a08a..fe5d24f23978c3 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -337,6 +337,8 @@ public class SessionVariable implements Serializable, Writable { public static final String ENABLE_LOCAL_SHUFFLE = "enable_local_shuffle"; + public static final String ENABLE_LOCAL_SHUFFLE_PLANNER = "enable_local_shuffle_planner"; + public static final String FORCE_TO_LOCAL_SHUFFLE = "force_to_local_shuffle"; public static final String ENABLE_LOCAL_MERGE_SORT = "enable_local_merge_sort"; @@ -1598,6 +1600,12 @@ public enum IgnoreSplitType { "Whether to enable local shuffle on pipelineX engine."}, needForward = true) private boolean enableLocalShuffle = true; + @VarAttrDef.VarAttr( + name = ENABLE_LOCAL_SHUFFLE_PLANNER, fuzzy = false, varType = VariableAnnotation.EXPERIMENTAL, + description = {"是否在FE规划Local Shuffle", + "Whether to plan local shuffle in frontend"}, needForward = true) + private boolean enableLocalShufflePlanner = true; + @VarAttrDef.VarAttr( name = FORCE_TO_LOCAL_SHUFFLE, fuzzy = false, varType = VariableAnnotation.EXPERIMENTAL, description = {"是否在 pipelineX 引擎上强制开启 local shuffle 优化", @@ -4748,6 +4756,18 @@ public void setEnableLocalShuffle(boolean enableLocalShuffle) { this.enableLocalShuffle = enableLocalShuffle; } + public boolean isEnableLocalShuffle() { + return enableLocalShuffle; + } + + public boolean isEnableLocalShufflePlanner() { + return enableLocalShufflePlanner; + } + + public void setEnableLocalShufflePlanner(boolean enableLocalShufflePlanner) { + this.enableLocalShufflePlanner = enableLocalShufflePlanner; + } + public boolean enablePushDownNoGroupAgg() { return enablePushDownNoGroupAgg; } @@ -5653,6 +5673,8 @@ public TQueryOptions toThrift() { // Set Iceberg write target file size tResult.setIcebergWriteTargetFileSizeBytes(icebergWriteTargetFileSizeBytes); + tResult.setEnableLocalShufflePlanner(enableLocalShufflePlanner); + return tResult; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/StmtExecutor.java b/fe/fe-core/src/main/java/org/apache/doris/qe/StmtExecutor.java index 3ebd2f69307c4d..36c798ae8d41d7 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/StmtExecutor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/StmtExecutor.java @@ -1110,8 +1110,10 @@ public boolean isProfileSafeStmt() { // 1. CreateTableCommand(mainly for create as select). // 2. LoadCommand. // 3. InsertOverwriteTableCommand. + // 4. MergeIntoCommand (merge into ... using ...). if ((plan instanceof Command) && !(plan instanceof LoadCommand) - && !(plan instanceof CreateTableCommand) && !(plan instanceof InsertOverwriteTableCommand)) { + && !(plan instanceof CreateTableCommand) && !(plan instanceof InsertOverwriteTableCommand) + && !(plan instanceof org.apache.doris.nereids.trees.plans.commands.merge.MergeIntoCommand)) { // Commands like SHOW QUERY PROFILE will not have profile. return false; } else { diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/runtime/ThriftPlansBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/qe/runtime/ThriftPlansBuilder.java index 4f8499970c98dc..985e650afd9cdc 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/runtime/ThriftPlansBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/runtime/ThriftPlansBuilder.java @@ -95,7 +95,6 @@ import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; -import java.util.function.BiConsumer; import java.util.function.Supplier; public class ThriftPlansBuilder { @@ -280,7 +279,8 @@ private static Multiset computeInstanceNumPerWorker( return workerCounter; } - private static Map computeExchangeSenderNum(PipelineDistributedPlan distributedPlan) { + private static Map computeExchangeSenderNum( + PipelineDistributedPlan distributedPlan) { Map senderNum = Maps.newLinkedHashMap(); for (Entry kv : distributedPlan.getInputs().entries()) { ExchangeNode exchangeNode = kv.getKey(); @@ -583,39 +583,55 @@ private static Map computeDestIdToInstanceId( } Map destIdToInstanceId = Maps.newLinkedHashMap(); - filterInstancesWhichReceiveDataFromRemote( - receivePlan, worker, - (instanceJob, destId) -> destIdToInstanceId.put(destId, instanceToIndex.get(instanceJob)) - ); - return destIdToInstanceId; - } - - private static void filterInstancesWhichReceiveDataFromRemote( - PipelineDistributedPlan receivePlan, DistributedPlanWorker filterWorker, - BiConsumer computeFn) { - - // current only support all input plans have same destination with same order, - // so we can get first input plan to compute shuffle index to instance id - Set> exchangeToChildPlanSet = receivePlan.getInputs().entries(); - if (exchangeToChildPlanSet.isEmpty()) { - return; + // dest id is defined by sender-side destination order rather than receiver fragment instance order. + // When FE inserts local shuffle or serial source, one worker may have multiple local receiver instances, + // but only the first one actually receives remote data. In that case receiverPlan.getInstanceJobs() + // no longer matches sender destinations, and using receiver instance order will produce an invalid + // shuffle_idx_to_instance_idx mapping for BE. + // + // When a fragment has multiple ExchangeNode inputs (e.g., NLJ with probe + BROADCAST build sides), + // always pick the one with the most destinations on this worker. A BROADCAST input has 1 dest per BE + // while the main data-carrying input (HASH-partitioned probe) has N dests per BE; using the BROADCAST + // one would produce a 1-entry map for GLOBAL_HASH LOCAL_EXCHANGE, causing rows to be lost. + Entry exchangeToChildPlan = null; + int maxDestsOnWorker = -1; + for (Entry entry : receivePlan.getInputs().entries()) { + ExchangeNode exchNode = entry.getKey(); + PipelineDistributedPlan childPlan = (PipelineDistributedPlan) entry.getValue(); + for (Entry> kv : childPlan.getDestinations().entrySet()) { + if (kv.getKey().getExchNodeId().asInt() != exchNode.getId().asInt()) { + continue; + } + int destsOnWorker = (int) kv.getValue().stream() + .filter(j -> j.getAssignedWorker().id() == worker.id()) + .count(); + if (destsOnWorker > maxDestsOnWorker) { + maxDestsOnWorker = destsOnWorker; + exchangeToChildPlan = entry; + } + break; + } + } + if (exchangeToChildPlan == null) { + return destIdToInstanceId; } - Entry exchangeToChildPlan = exchangeToChildPlanSet.iterator().next(); ExchangeNode linkNode = exchangeToChildPlan.getKey(); PipelineDistributedPlan firstInputPlan = (PipelineDistributedPlan) exchangeToChildPlan.getValue(); - Map> sinkToDestInstances = firstInputPlan.getDestinations(); - for (Entry> kv : sinkToDestInstances.entrySet()) { + for (Entry> kv : firstInputPlan.getDestinations().entrySet()) { DataSink senderSink = kv.getKey(); - if (senderSink.getExchNodeId().asInt() == linkNode.getId().asInt()) { - for (int destId = 0; destId < kv.getValue().size(); destId++) { - AssignedJob assignedJob = kv.getValue().get(destId); - if (assignedJob.getAssignedWorker().id() == filterWorker.id()) { - computeFn.accept(assignedJob, destId); - } + if (senderSink.getExchNodeId().asInt() != linkNode.getId().asInt()) { + continue; + } + List destinationJobs = kv.getValue(); + for (int destId = 0; destId < destinationJobs.size(); destId++) { + AssignedJob destinationJob = destinationJobs.get(destId); + if (destinationJob.getAssignedWorker().id() == worker.id()) { + destIdToInstanceId.put(destId, instanceToIndex.get(destinationJob)); } - break; } + break; } + return destIdToInstanceId; } private static Set setParamsForRecursiveCteNode(List distributedPlans, diff --git a/fe/fe-core/src/test/java/org/apache/doris/planner/LocalShuffleNodeCoverageTest.java b/fe/fe-core/src/test/java/org/apache/doris/planner/LocalShuffleNodeCoverageTest.java new file mode 100644 index 00000000000000..c7c9f67dd764d1 --- /dev/null +++ b/fe/fe-core/src/test/java/org/apache/doris/planner/LocalShuffleNodeCoverageTest.java @@ -0,0 +1,745 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.planner; + +import org.apache.doris.analysis.AssertNumRowsElement; +import org.apache.doris.analysis.BinaryPredicate; +import org.apache.doris.analysis.Expr; +import org.apache.doris.analysis.GroupingInfo; +import org.apache.doris.analysis.JoinOperator; +import org.apache.doris.analysis.OrderByElement; +import org.apache.doris.analysis.SlotDescriptor; +import org.apache.doris.analysis.SortInfo; +import org.apache.doris.analysis.TupleDescriptor; +import org.apache.doris.analysis.TupleId; +import org.apache.doris.common.Pair; +import org.apache.doris.common.UserException; +import org.apache.doris.nereids.glue.translator.PlanTranslatorContext; +import org.apache.doris.nereids.trees.plans.PartitionTopnPhase; +import org.apache.doris.nereids.trees.plans.WindowFuncType; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeType; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeTypeRequire; +import org.apache.doris.thrift.TExplainLevel; +import org.apache.doris.thrift.TPartitionType; +import org.apache.doris.thrift.TPlanNode; +import org.apache.doris.thrift.TScanRangeLocations; + +import com.google.common.collect.Lists; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; +import org.mockito.Mockito; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; + +public class LocalShuffleNodeCoverageTest { + private static final AtomicInteger NEXT_ID = new AtomicInteger(1); + + @Test + public void testSelectNode() { + PlanTranslatorContext ctx = new PlanTranslatorContext(); + + TrackingPlanNode childNoop = new TrackingPlanNode(nextPlanNodeId(), LocalExchangeType.NOOP); + SelectNode selectWithNoopChild = new SelectNode(nextPlanNodeId(), childNoop); + Pair output = selectWithNoopChild.enforceAndDeriveLocalExchange( + ctx, null, LocalExchangeTypeRequire.requireHash()); + + // resolveExchangeType with RequireHash always returns LOCAL_EXECUTION_HASH_SHUFFLE + Assertions.assertEquals(LocalExchangeType.LOCAL_EXECUTION_HASH_SHUFFLE, output.second); + Assertions.assertEquals(LocalExchangeNode.RequireHash.class, childNoop.lastRequire.getClass()); + assertChildLocalExchangeType(selectWithNoopChild, 0, LocalExchangeType.LOCAL_EXECUTION_HASH_SHUFFLE); + + TrackingPlanNode childBucket = new TrackingPlanNode(nextPlanNodeId(), LocalExchangeType.BUCKET_HASH_SHUFFLE); + SelectNode selectWithBucketChild = new SelectNode(nextPlanNodeId(), childBucket); + Pair bucketOutput = selectWithBucketChild.enforceAndDeriveLocalExchange( + ctx, null, LocalExchangeTypeRequire.requireHash()); + Assertions.assertEquals(LocalExchangeType.BUCKET_HASH_SHUFFLE, bucketOutput.second); + Assertions.assertSame(childBucket, selectWithBucketChild.getChild(0)); + } + + @Test + public void testRepeatNode() { + PlanTranslatorContext ctx = new PlanTranslatorContext(); + GroupingInfo groupingInfo = Mockito.mock(GroupingInfo.class); + TupleDescriptor outputTuple = new TupleDescriptor(new TupleId(NEXT_ID.getAndIncrement())); + Mockito.when(groupingInfo.getOutputTupleDesc()).thenReturn(outputTuple); + Mockito.when(groupingInfo.getPreRepeatExprs()).thenReturn(Collections.emptyList()); + + TrackingPlanNode childNoop = new TrackingPlanNode(nextPlanNodeId(), LocalExchangeType.NOOP); + RepeatNode repeatNode = new RepeatNode(nextPlanNodeId(), childNoop, groupingInfo, + Collections.singletonList(Collections.emptySet()), Collections.emptySet(), + Collections.singletonList(Collections.emptyList())); + Pair output = repeatNode.enforceAndDeriveLocalExchange( + ctx, null, LocalExchangeTypeRequire.requireHash()); + // resolveExchangeType with RequireHash always returns LOCAL_EXECUTION_HASH_SHUFFLE + Assertions.assertEquals(LocalExchangeType.LOCAL_EXECUTION_HASH_SHUFFLE, output.second); + assertChildLocalExchangeType(repeatNode, 0, LocalExchangeType.LOCAL_EXECUTION_HASH_SHUFFLE); + } + + @Test + public void testTableFunctionNode() { + PlanTranslatorContext ctx = new PlanTranslatorContext(); + TrackingPlanNode childNoop = new TrackingPlanNode(nextPlanNodeId(), LocalExchangeType.NOOP); + TableFunctionNode tableFunctionNode = new TableFunctionNode(nextPlanNodeId(), childNoop, + new TupleId(NEXT_ID.getAndIncrement()), new ArrayList<>(), new ArrayList<>(), new ArrayList<>()); + + // TableFunctionNode always requires PASSTHROUGH from child and outputs NOOP. + // This mirrors BE's TableFunctionOperatorX::required_data_distribution() override. + // Parent's requireHash is ignored — TableFunction's own PASSTHROUGH requirement takes precedence. + Pair output = tableFunctionNode.enforceAndDeriveLocalExchange( + ctx, null, LocalExchangeTypeRequire.requireHash()); + Assertions.assertEquals(LocalExchangeType.NOOP, output.second); + assertChildLocalExchangeType(tableFunctionNode, 0, LocalExchangeType.PASSTHROUGH); + } + + @Test + public void testPartitionSortNode() { + PlanTranslatorContext ctx = new PlanTranslatorContext(); + SortInfo sortInfo = Mockito.mock(SortInfo.class); + TupleDescriptor sortTuple = new TupleDescriptor(new TupleId(NEXT_ID.getAndIncrement())); + Mockito.when(sortInfo.getOrderingExprs()).thenReturn(Collections.emptyList()); + Mockito.when(sortInfo.getIsAscOrder()).thenReturn(Collections.emptyList()); + Mockito.when(sortInfo.getSortTupleDescriptor()).thenReturn(sortTuple); + + TrackingPlanNode childNoop = new TrackingPlanNode(nextPlanNodeId(), LocalExchangeType.NOOP); + PartitionSortNode globalTopnNode = new PartitionSortNode(nextPlanNodeId(), childNoop, + WindowFuncType.ROW_NUMBER, Collections.emptyList(), sortInfo, false, 1, + PartitionTopnPhase.TWO_PHASE_GLOBAL_PTOPN); + Pair globalOutput = globalTopnNode.enforceAndDeriveLocalExchange( + ctx, null, LocalExchangeTypeRequire.noRequire()); + // enforceRequire resolves RequireHash to LOCAL_EXECUTION_HASH_SHUFFLE (FE-planned always uses LOCAL) + Assertions.assertEquals(LocalExchangeType.LOCAL_EXECUTION_HASH_SHUFFLE, globalOutput.second); + assertChildLocalExchangeType(globalTopnNode, 0, LocalExchangeType.LOCAL_EXECUTION_HASH_SHUFFLE); + + TrackingPlanNode childNoop2 = new TrackingPlanNode(nextPlanNodeId(), LocalExchangeType.NOOP); + PartitionSortNode passthroughNode = new PartitionSortNode(nextPlanNodeId(), childNoop2, + WindowFuncType.ROW_NUMBER, Collections.emptyList(), sortInfo, false, 1, + PartitionTopnPhase.TWO_PHASE_LOCAL_PTOPN); + Pair passthroughOutput = passthroughNode.enforceAndDeriveLocalExchange( + ctx, null, LocalExchangeTypeRequire.noRequire()); + Assertions.assertEquals(LocalExchangeType.PASSTHROUGH, passthroughOutput.second); + assertChildLocalExchangeType(passthroughNode, 0, LocalExchangeType.PASSTHROUGH); + } + + @Test + public void testMaterializationNode() { + PlanTranslatorContext ctx = new PlanTranslatorContext(); + TrackingPlanNode childNoop = new TrackingPlanNode(nextPlanNodeId(), LocalExchangeType.NOOP); + TupleDescriptor tupleDescriptor = new TupleDescriptor(new TupleId(NEXT_ID.getAndIncrement())); + TestMaterializationNode node = new TestMaterializationNode(nextPlanNodeId(), tupleDescriptor, childNoop); + + // MaterializationNode.isSerialNode() returns true. Without a fragment context, + // isSerialOperatorOnBe() returns false (fragment == null guard), so the framework + // does not skip the Layer 1 check and inserts a LocalExchange(PASSTHROUGH) to satisfy + // MaterializationNode's requirePassthrough() requirement on its child. + // In production with fragment.useSerialSource=true, isSerialOperatorOnBe would be + // true and the framework would skip the LE — the test exercises the fragment-less + // unit-test path which reflects the BE behavior when the fragment is not in + // serial-source mode. + Pair output = node.enforceAndDeriveLocalExchange( + ctx, null, LocalExchangeTypeRequire.requireHash()); + Assertions.assertEquals(LocalExchangeType.PASSTHROUGH, output.second); + Assertions.assertInstanceOf(LocalExchangeNode.class, node.getChild(0)); + } + + @Test + public void testCteAndRecursiveNodesAndEmptySet() { + PlanTranslatorContext ctx = new PlanTranslatorContext(); + + CTEScanNode cteScanNode = new CTEScanNode(ScanContext.EMPTY); + Pair cteOutput = cteScanNode.enforceAndDeriveLocalExchange( + ctx, null, LocalExchangeTypeRequire.requireHash()); + Assertions.assertEquals(LocalExchangeType.NOOP, cteOutput.second); + + RecursiveCteScanNode recursiveScanNode = new RecursiveCteScanNode("r", nextPlanNodeId(), + new TupleDescriptor(new TupleId(NEXT_ID.getAndIncrement()))); + Pair recursiveScanOutput = recursiveScanNode.enforceAndDeriveLocalExchange( + ctx, null, LocalExchangeTypeRequire.requireHash()); + Assertions.assertEquals(LocalExchangeType.NOOP, recursiveScanOutput.second); + + EmptySetNode emptySetNode = new EmptySetNode(nextPlanNodeId(), + new ArrayList<>(Collections.singletonList(new TupleId(NEXT_ID.getAndIncrement())))); + Pair emptyOutput = emptySetNode.enforceAndDeriveLocalExchange( + ctx, null, LocalExchangeTypeRequire.requireHash()); + Assertions.assertEquals(LocalExchangeType.NOOP, emptyOutput.second); + + TrackingPlanNode recursiveChild = new TrackingPlanNode(nextPlanNodeId(), LocalExchangeType.NOOP); + RecursiveCteNode recursiveNode = new RecursiveCteNode(nextPlanNodeId(), new TupleId(NEXT_ID.getAndIncrement()), + "r", true); + recursiveNode.addChild(recursiveChild); + Pair recursiveOutput = recursiveNode.enforceAndDeriveLocalExchange( + ctx, null, LocalExchangeTypeRequire.requireHash()); + Assertions.assertEquals(LocalExchangeType.NOOP, recursiveOutput.second); + Assertions.assertEquals(LocalExchangeNode.NoRequire.class, recursiveChild.lastRequire.getClass()); + } + + @Test + public void testHashJoinNodeBranches() { + PlanTranslatorContext ctx = new PlanTranslatorContext(); + List eqConjuncts = Collections.singletonList(Mockito.mock(BinaryPredicate.class)); + + TrackingPlanNode probe = new TrackingPlanNode(nextPlanNodeId(), LocalExchangeType.NOOP); + TrackingPlanNode build = new TrackingPlanNode(nextPlanNodeId(), LocalExchangeType.NOOP); + HashJoinNode broadcastJoin = new HashJoinNode(nextPlanNodeId(), probe, build, JoinOperator.INNER_JOIN, + eqConjuncts, Collections.emptyList(), null, null, false); + broadcastJoin.setDistributionMode(DistributionMode.BROADCAST); + Pair broadcastOutput = broadcastJoin.enforceAndDeriveLocalExchange( + ctx, null, LocalExchangeTypeRequire.requireHash()); + Assertions.assertEquals(LocalExchangeType.NOOP, broadcastOutput.second); + Assertions.assertSame(probe, broadcastJoin.getChild(0)); + Assertions.assertSame(build, broadcastJoin.getChild(1)); + + TrackingPlanNode probe2 = new TrackingPlanNode(nextPlanNodeId(), LocalExchangeType.NOOP); + TrackingPlanNode build2 = new TrackingPlanNode(nextPlanNodeId(), LocalExchangeType.NOOP); + HashJoinNode bucketJoin = new HashJoinNode(nextPlanNodeId(), probe2, build2, JoinOperator.INNER_JOIN, + eqConjuncts, Collections.emptyList(), null, null, false); + bucketJoin.setDistributionMode(DistributionMode.BUCKET_SHUFFLE); + Pair bucketOutput = bucketJoin.enforceAndDeriveLocalExchange( + ctx, null, LocalExchangeTypeRequire.requireHash()); + Assertions.assertEquals(LocalExchangeType.BUCKET_HASH_SHUFFLE, bucketOutput.second); + assertChildLocalExchangeType(bucketJoin, 0, LocalExchangeType.BUCKET_HASH_SHUFFLE); + assertChildLocalExchangeType(bucketJoin, 1, LocalExchangeType.BUCKET_HASH_SHUFFLE); + + TrackingScanNode probeScan = new TrackingScanNode(nextPlanNodeId(), LocalExchangeType.NOOP); + TrackingPlanNode buildPlan = new TrackingPlanNode(nextPlanNodeId(), LocalExchangeType.NOOP); + HashJoinNode hashJoin = new HashJoinNode(nextPlanNodeId(), probeScan, buildPlan, JoinOperator.INNER_JOIN, + eqConjuncts, Collections.emptyList(), null, null, false); + hashJoin.setDistributionMode(DistributionMode.PARTITIONED); + Pair hashOutput = hashJoin.enforceAndDeriveLocalExchange( + ctx, null, LocalExchangeTypeRequire.requireHash()); + // enforceRequire resolves RequireHash to LOCAL_EXECUTION_HASH_SHUFFLE (FE-planned always uses LOCAL) + Assertions.assertEquals(LocalExchangeType.LOCAL_EXECUTION_HASH_SHUFFLE, hashOutput.second); + assertChildLocalExchangeType(hashJoin, 0, LocalExchangeType.LOCAL_EXECUTION_HASH_SHUFFLE); + assertChildLocalExchangeType(hashJoin, 1, LocalExchangeType.LOCAL_EXECUTION_HASH_SHUFFLE); + + TrackingPlanNode probe3 = new TrackingPlanNode(nextPlanNodeId(), LocalExchangeType.NOOP); + TrackingPlanNode build3 = new TrackingPlanNode(nextPlanNodeId(), LocalExchangeType.NOOP); + HashJoinNode nullAwareJoin = new HashJoinNode(nextPlanNodeId(), probe3, build3, + JoinOperator.NULL_AWARE_LEFT_ANTI_JOIN, eqConjuncts, Collections.emptyList(), null, null, false); + Pair nullAwareOutput = nullAwareJoin.enforceAndDeriveLocalExchange( + ctx, null, LocalExchangeTypeRequire.requireHash()); + Assertions.assertEquals(LocalExchangeType.NOOP, nullAwareOutput.second); + Assertions.assertSame(probe3, nullAwareJoin.getChild(0)); + Assertions.assertSame(build3, nullAwareJoin.getChild(1)); + + SerialTrackingPlanNode serialProbe = new SerialTrackingPlanNode(nextPlanNodeId(), LocalExchangeType.NOOP); + serialProbe.fragment = Mockito.mock(PlanFragment.class); + Mockito.when(serialProbe.fragment.useSerialSource(Mockito.any())).thenReturn(true); + TrackingPlanNode nonSerialBuild = new TrackingPlanNode(nextPlanNodeId(), LocalExchangeType.NOOP); + nonSerialBuild.fragment = Mockito.mock(PlanFragment.class); + Mockito.when(nonSerialBuild.fragment.useSerialSource(Mockito.any())).thenReturn(true); + HashJoinNode serialProbeBroadcast = new HashJoinNode(nextPlanNodeId(), serialProbe, nonSerialBuild, + JoinOperator.INNER_JOIN, eqConjuncts, Collections.emptyList(), null, null, false); + serialProbeBroadcast.setDistributionMode(DistributionMode.BROADCAST); + // BROADCAST serial check uses fragment.useSerialSource() on the HashJoinNode itself + serialProbeBroadcast.fragment = Mockito.mock(PlanFragment.class); + Mockito.when(serialProbeBroadcast.fragment.useSerialSource(Mockito.any())).thenReturn(true); + Pair serialProbeOutput = serialProbeBroadcast.enforceAndDeriveLocalExchange( + ctx, null, LocalExchangeTypeRequire.requireHash()); + Assertions.assertEquals(LocalExchangeType.PASSTHROUGH, serialProbeOutput.second); + assertChildLocalExchangeType(serialProbeBroadcast, 0, LocalExchangeType.PASSTHROUGH); + Assertions.assertSame(nonSerialBuild, serialProbeBroadcast.getChild(1)); + + TrackingPlanNode nonSerialProbe = new TrackingPlanNode(nextPlanNodeId(), LocalExchangeType.NOOP); + nonSerialProbe.fragment = Mockito.mock(PlanFragment.class); + Mockito.when(nonSerialProbe.fragment.useSerialSource(Mockito.any())).thenReturn(true); + SerialTrackingPlanNode serialBuild = new SerialTrackingPlanNode(nextPlanNodeId(), LocalExchangeType.NOOP); + serialBuild.fragment = Mockito.mock(PlanFragment.class); + Mockito.when(serialBuild.fragment.useSerialSource(Mockito.any())).thenReturn(true); + HashJoinNode serialBuildBroadcast = new HashJoinNode(nextPlanNodeId(), nonSerialProbe, serialBuild, + JoinOperator.INNER_JOIN, eqConjuncts, Collections.emptyList(), null, null, false); + serialBuildBroadcast.setDistributionMode(DistributionMode.BROADCAST); + serialBuildBroadcast.fragment = Mockito.mock(PlanFragment.class); + Mockito.when(serialBuildBroadcast.fragment.useSerialSource(Mockito.any())).thenReturn(true); + Pair serialBuildOutput = serialBuildBroadcast.enforceAndDeriveLocalExchange( + ctx, null, LocalExchangeTypeRequire.requireHash()); + Assertions.assertEquals(LocalExchangeType.NOOP, serialBuildOutput.second); + Assertions.assertSame(nonSerialProbe, serialBuildBroadcast.getChild(0)); + assertChildLocalExchangeType(serialBuildBroadcast, 1, LocalExchangeType.PASS_TO_ONE); + } + + @Test + public void testLocalExchangeNodeIsNotSerializedAsSerialOperator() { + SerialTrackingScanNode serialScan = new SerialTrackingScanNode(nextPlanNodeId(), LocalExchangeType.NOOP); + LocalExchangeNode localExchangeNode = new LocalExchangeNode(nextPlanNodeId(), serialScan, + LocalExchangeType.PASSTHROUGH); + localExchangeNode.fragment = Mockito.mock(PlanFragment.class); + Mockito.when(localExchangeNode.fragment.hasSerialScanNode()).thenReturn(true); + Mockito.when(localExchangeNode.fragment.useSerialSource(Mockito.any())).thenReturn(true); + + TPlanNode thriftNode = new TPlanNode(); + localExchangeNode.toThrift(thriftNode); + + Assertions.assertFalse(thriftNode.isIsSerialOperator(), + "local exchange source pipeline should not be marked serial in thrift"); + } + + @Test + public void testLayer1SkipUsesIsSerialOperatorOnBeNotIsSerialNode() { + // Guard against regression of the isSerialNode -> isSerialOperatorOnBe fix + // (PR #63366 review feedback). When a node's isSerialNode()=true but its + // fragment is NOT in serial-source mode (fragment.useSerialSource(ctx)=false), + // BE's Thrift `is_serial_operator` flag is false, so BE's + // Pipeline::need_to_local_exchange does NOT skip local exchange. + // + // The FE framework must mirror that — both the serial-ancestor propagation + // (enforceRequire step 1) and the Layer 1 skip (enforceRequire step 4b) + // must consult isSerialOperatorOnBe(ctx), not the raw isSerialNode(). + // Otherwise we over-skip required LocalExchanges in fragments where + // ignore_storage_data_distribution=false / NAAJ / query-cache disables + // serial-source mode at the fragment level. + // + // Setup: a node with isSerialNode()=true but fragment.useSerialSource(ctx)=false. + // It declares requireHash on its child whose output is NOOP. If Layer 1 used + // the raw isSerialNode(), the framework would skip LE. With the fix it must + // insert a LocalExchange(LOCAL_EXECUTION_HASH_SHUFFLE). + PlanTranslatorContext ctx = new PlanTranslatorContext(); + TrackingPlanNode childNoop = new TrackingPlanNode(nextPlanNodeId(), LocalExchangeType.NOOP); + SerialNodeInNonSerialFragment parent = new SerialNodeInNonSerialFragment( + nextPlanNodeId(), childNoop); + parent.fragment = Mockito.mock(PlanFragment.class); + Mockito.when(parent.fragment.useSerialSource(Mockito.any())).thenReturn(false); + + Pair output = parent.enforceAndDeriveLocalExchange( + ctx, null, LocalExchangeTypeRequire.noRequire()); + + Assertions.assertEquals(LocalExchangeType.LOCAL_EXECUTION_HASH_SHUFFLE, output.second); + Assertions.assertInstanceOf(LocalExchangeNode.class, parent.getChild(0), + "Layer 1 must NOT skip LE when fragment.useSerialSource=false, " + + "even if isSerialNode()=true — BE treats the node as non-serial."); + } + + @Test + public void testNestedLoopJoinNodeBranches() { + PlanTranslatorContext ctx = new PlanTranslatorContext(); + List tupleIds = Lists.newArrayList(new TupleId(NEXT_ID.getAndIncrement())); + + TrackingPlanNode probe = new TrackingPlanNode(nextPlanNodeId(), LocalExchangeType.NOOP); + TrackingPlanNode build = new TrackingPlanNode(nextPlanNodeId(), LocalExchangeType.NOOP); + NestedLoopJoinNode defaultJoin = new NestedLoopJoinNode(nextPlanNodeId(), probe, build, tupleIds, + JoinOperator.INNER_JOIN, false); + defaultJoin.fragment = Mockito.mock(PlanFragment.class); + Mockito.when(defaultJoin.fragment.useSerialSource(Mockito.any())).thenReturn(false); + Pair defaultOutput = defaultJoin.enforceAndDeriveLocalExchange( + ctx, null, LocalExchangeTypeRequire.requireHash()); + Assertions.assertEquals(LocalExchangeType.ADAPTIVE_PASSTHROUGH, defaultOutput.second); + assertChildLocalExchangeType(defaultJoin, 0, LocalExchangeType.ADAPTIVE_PASSTHROUGH); + Assertions.assertSame(build, defaultJoin.getChild(1)); + + TrackingScanNode probeScan = new TrackingScanNode(nextPlanNodeId(), LocalExchangeType.NOOP); + TrackingPlanNode buildNoop = new TrackingPlanNode(nextPlanNodeId(), LocalExchangeType.NOOP); + NestedLoopJoinNode serialSourceJoin = new NestedLoopJoinNode(nextPlanNodeId(), probeScan, buildNoop, + Lists.newArrayList(new TupleId(NEXT_ID.getAndIncrement())), JoinOperator.INNER_JOIN, false); + serialSourceJoin.fragment = Mockito.mock(PlanFragment.class); + Mockito.when(serialSourceJoin.fragment.useSerialSource(Mockito.any())).thenReturn(true); + Pair serialOutput = serialSourceJoin.enforceAndDeriveLocalExchange( + ctx, null, LocalExchangeTypeRequire.noRequire()); + Assertions.assertEquals(LocalExchangeType.ADAPTIVE_PASSTHROUGH, serialOutput.second); + assertChildLocalExchangeType(serialSourceJoin, 0, LocalExchangeType.ADAPTIVE_PASSTHROUGH); + assertChildLocalExchangeType(serialSourceJoin, 1, LocalExchangeType.BROADCAST); + + // RIGHT_OUTER/FULL_OUTER: probe side must use NOOP (serial processing for unmatched rows). + // BE: NestedLoopJoinProbeOperatorX returns NOOP for these join types. + TrackingPlanNode probeRight = new TrackingPlanNode(nextPlanNodeId(), LocalExchangeType.NOOP); + TrackingPlanNode buildRight = new TrackingPlanNode(nextPlanNodeId(), LocalExchangeType.NOOP); + NestedLoopJoinNode rightOuterJoin = new NestedLoopJoinNode(nextPlanNodeId(), probeRight, buildRight, + Lists.newArrayList(new TupleId(NEXT_ID.getAndIncrement())), + JoinOperator.RIGHT_OUTER_JOIN, false); + rightOuterJoin.fragment = Mockito.mock(PlanFragment.class); + Mockito.when(rightOuterJoin.fragment.useSerialSource(Mockito.any())).thenReturn(false); + Pair rightOuterOutput = rightOuterJoin.enforceAndDeriveLocalExchange( + ctx, null, LocalExchangeTypeRequire.requireHash()); + Assertions.assertEquals(LocalExchangeType.NOOP, rightOuterOutput.second); + Assertions.assertSame(probeRight, rightOuterJoin.getChild(0)); + + TrackingPlanNode probe2 = new TrackingPlanNode(nextPlanNodeId(), LocalExchangeType.NOOP); + TrackingPlanNode build2 = new TrackingPlanNode(nextPlanNodeId(), LocalExchangeType.NOOP); + NestedLoopJoinNode nullAwareJoin = new NestedLoopJoinNode(nextPlanNodeId(), probe2, build2, + Lists.newArrayList(new TupleId(NEXT_ID.getAndIncrement())), + JoinOperator.NULL_AWARE_LEFT_ANTI_JOIN, false); + nullAwareJoin.fragment = Mockito.mock(PlanFragment.class); + Mockito.when(nullAwareJoin.fragment.useSerialSource(Mockito.any())).thenReturn(false); + Pair nullAwareOutput = nullAwareJoin.enforceAndDeriveLocalExchange( + ctx, null, LocalExchangeTypeRequire.requireHash()); + Assertions.assertEquals(LocalExchangeType.NOOP, nullAwareOutput.second); + } + + @Test + public void testSetOperationAndAssertNumRowsNode() { + PlanTranslatorContext ctx = new PlanTranslatorContext(); + + UnionNode unionNode = new UnionNode(nextPlanNodeId(), new TupleId(NEXT_ID.getAndIncrement())); + TrackingPlanNode unionChild = new TrackingPlanNode(nextPlanNodeId(), LocalExchangeType.NOOP); + unionNode.addChild(unionChild); + // UnionNode propagates parent hash require to children only when a downstream operator + // requires shuffle for correctness. Simulate that via the context flag. + ctx.setHasShuffleForCorrectnessAncestor(unionNode, true); + Pair unionOutput = unionNode.enforceAndDeriveLocalExchange( + ctx, null, LocalExchangeTypeRequire.requireHash()); + Assertions.assertEquals(LocalExchangeType.LOCAL_EXECUTION_HASH_SHUFFLE, unionOutput.second); + Assertions.assertEquals(LocalExchangeNode.RequireHash.class, unionChild.lastRequire.getClass()); + + IntersectNode intersectNode = new IntersectNode(nextPlanNodeId(), new TupleId(NEXT_ID.getAndIncrement())); + intersectNode.setColocate(false); + TrackingScanNode left = new TrackingScanNode(nextPlanNodeId(), LocalExchangeType.NOOP); + TrackingPlanNode right = new TrackingPlanNode(nextPlanNodeId(), LocalExchangeType.NOOP); + intersectNode.addChild(left); + intersectNode.addChild(right); + Pair intersectOutput = intersectNode.enforceAndDeriveLocalExchange( + ctx, null, LocalExchangeTypeRequire.requireHash()); + Assertions.assertEquals(LocalExchangeType.LOCAL_EXECUTION_HASH_SHUFFLE, intersectOutput.second); + assertChildLocalExchangeType(intersectNode, 0, LocalExchangeType.LOCAL_EXECUTION_HASH_SHUFFLE); + assertChildLocalExchangeType(intersectNode, 1, LocalExchangeType.LOCAL_EXECUTION_HASH_SHUFFLE); + + // Colocated ExceptNode with OlapScan children: OlapScan already provides BUCKET_HASH_SHUFFLE, + // so requireBucketHash() is satisfied and no LocalExchangeNode is inserted. + ExceptNode exceptNode = new ExceptNode(nextPlanNodeId(), new TupleId(NEXT_ID.getAndIncrement())); + exceptNode.setColocate(true); + FakeOlapScanNode exceptLeft = new FakeOlapScanNode(nextPlanNodeId()); + FakeOlapScanNode exceptRight = new FakeOlapScanNode(nextPlanNodeId()); + exceptNode.addChild(exceptLeft); + exceptNode.addChild(exceptRight); + Pair exceptOutput = exceptNode.enforceAndDeriveLocalExchange( + ctx, null, LocalExchangeTypeRequire.requireHash()); + Assertions.assertEquals(LocalExchangeType.BUCKET_HASH_SHUFFLE, exceptOutput.second); + // OlapScan already satisfies requireBucketHash(), so children are passed through unchanged. + Assertions.assertSame(exceptLeft, exceptNode.getChild(0)); + Assertions.assertSame(exceptRight, exceptNode.getChild(1)); + + TrackingPlanNode assertChild = new TrackingPlanNode(nextPlanNodeId(), LocalExchangeType.NOOP); + AssertNumRowsElement assertElement = Mockito.mock(AssertNumRowsElement.class); + Mockito.when(assertElement.getDesiredNumOfRows()).thenReturn(1L); + Mockito.when(assertElement.getSubqueryString()).thenReturn("subquery"); + Mockito.when(assertElement.getAssertion()).thenReturn(AssertNumRowsElement.Assertion.EQ); + AssertNumRowsNode assertNode = new AssertNumRowsNode(nextPlanNodeId(), assertChild, + assertElement, new TupleDescriptor(new TupleId(NEXT_ID.getAndIncrement()))); + // AssertNumRowsNode.isSerialNode()=true. Without a fragment context, + // isSerialOperatorOnBe()=false so the framework falls through Layer 1 and inserts + // a LocalExchange(PASSTHROUGH) — same fragment-less rationale as the + // MaterializationNode test above. In production with fragment.useSerialSource=true + // the LE would be skipped. + Pair assertOutput = assertNode.enforceAndDeriveLocalExchange( + ctx, null, LocalExchangeTypeRequire.requireHash()); + Assertions.assertEquals(LocalExchangeType.PASSTHROUGH, assertOutput.second); + Assertions.assertInstanceOf(LocalExchangeNode.class, assertNode.getChild(0)); + } + + @Test + public void testSortNodeBranches() { + PlanTranslatorContext ctx = new PlanTranslatorContext(); + SortInfo sortInfo = mockSortInfo(); + + TrackingPlanNode mergeChild = new TrackingPlanNode(nextPlanNodeId(), LocalExchangeType.NOOP); + SortNode mergeSort = new SortNode(nextPlanNodeId(), mergeChild, sortInfo, false); + mergeSort.setMergeByExchange(); + Pair mergeOutput = mergeSort.enforceAndDeriveLocalExchange( + ctx, null, LocalExchangeTypeRequire.noRequire()); + Assertions.assertEquals(LocalExchangeType.PASSTHROUGH, mergeOutput.second); + assertChildLocalExchangeType(mergeSort, 0, LocalExchangeType.PASSTHROUGH); + + // Non-merge, non-analytic SortNode: isSerialNode()=true → enforceChild skips exchange. + // Output is still PASSTHROUGH (hardcoded for useSerialSource + ScanNode child). + SerialTrackingScanNode serialScan = new SerialTrackingScanNode(nextPlanNodeId(), LocalExchangeType.NOOP); + SortNode scanSort = new SortNode(nextPlanNodeId(), serialScan, sortInfo, false); + scanSort.fragment = Mockito.mock(PlanFragment.class); + Mockito.when(scanSort.fragment.useSerialSource(Mockito.any())).thenReturn(true); + Pair scanOutput = scanSort.enforceAndDeriveLocalExchange( + ctx, null, LocalExchangeTypeRequire.noRequire()); + // Non-merge, non-analytic SortNode: isSerialNode()=true, requireChild=noRequire, + // outputType=NOOP. enforceRequire shouldSkipLE skips because Sort itself is serial. + Assertions.assertEquals(LocalExchangeType.NOOP, scanOutput.second); + // SortNode is serial → enforceRequire skips exchange → child unchanged. + Assertions.assertSame(serialScan, scanSort.getChild(0)); + + // Analytic sort (mergeByexchange=false): sort before analytic with partition + orderBy. + // AnalyticEvalNode returns NOOP (non-serial, has partition+order), SortNode enforceChild + // inserts LOCAL_EXECUTION_HASH_SHUFFLE (RequireHash → resolveExchangeType → LOCAL). + AnalyticEvalNode analyticChild = new AnalyticEvalNode(nextPlanNodeId(), + new TrackingPlanNode(nextPlanNodeId(), LocalExchangeType.NOOP), + Collections.emptyList(), Collections.singletonList(Mockito.mock(Expr.class)), + Collections.singletonList(new OrderByElement(Mockito.mock(Expr.class), true, true)), + null, new TupleDescriptor(new TupleId(NEXT_ID.getAndIncrement()))); + analyticChild.fragment = Mockito.mock(PlanFragment.class); + Mockito.when(analyticChild.fragment.useSerialSource(Mockito.any())).thenReturn(false); + SortNode analyticSort = new SortNode(nextPlanNodeId(), analyticChild, sortInfo, false); + analyticSort.setIsAnalyticSort(true); // Must set for isSerialNode() to return false + Pair analyticOutput = analyticSort.enforceAndDeriveLocalExchange( + ctx, null, LocalExchangeTypeRequire.requireHash()); + Assertions.assertEquals(LocalExchangeType.LOCAL_EXECUTION_HASH_SHUFFLE, analyticOutput.second); + assertChildLocalExchangeType(analyticSort, 0, LocalExchangeType.LOCAL_EXECUTION_HASH_SHUFFLE); + + // Outer merge-sort above analytic (mergeByexchange=true): BE SortSink._merge_by_exchange=true → PASSTHROUGH. + // Should NOT insert GLOBAL_HASH even though child is AnalyticEvalNode. + AnalyticEvalNode analyticChild2 = new AnalyticEvalNode(nextPlanNodeId(), + new TrackingPlanNode(nextPlanNodeId(), LocalExchangeType.NOOP), + Collections.emptyList(), Collections.emptyList(), Collections.emptyList(), + null, new TupleDescriptor(new TupleId(NEXT_ID.getAndIncrement()))); + analyticChild2.fragment = Mockito.mock(PlanFragment.class); + Mockito.when(analyticChild2.fragment.useSerialSource(Mockito.any())).thenReturn(false); + SortNode mergeAnalyticSort = new SortNode(nextPlanNodeId(), analyticChild2, sortInfo, false); + mergeAnalyticSort.setMergeByExchange(); + Pair mergeAnalyticOutput = mergeAnalyticSort.enforceAndDeriveLocalExchange( + ctx, null, LocalExchangeTypeRequire.requireHash()); + Assertions.assertEquals(LocalExchangeType.PASSTHROUGH, mergeAnalyticOutput.second); + } + + @Test + public void testAnalyticEvalNodeBranches() { + PlanTranslatorContext ctx = new PlanTranslatorContext(); + + TrackingPlanNode noPartitionChild = new TrackingPlanNode(nextPlanNodeId(), LocalExchangeType.NOOP); + AnalyticEvalNode noPartition = new AnalyticEvalNode(nextPlanNodeId(), noPartitionChild, + Collections.emptyList(), Collections.emptyList(), Collections.emptyList(), + null, new TupleDescriptor(new TupleId(NEXT_ID.getAndIncrement()))); + // No partition → isSerialNode()=true → returns NOOP (serial nodes let framework handle). + Pair noPartitionOutput = noPartition.enforceAndDeriveLocalExchange( + ctx, null, LocalExchangeTypeRequire.requireHash()); + Assertions.assertEquals(LocalExchangeType.NOOP, noPartitionOutput.second); + Assertions.assertSame(noPartitionChild, noPartition.getChild(0)); + + // Analytic with partition but no orderBy, non-colocated → noRequire/NOOP. + // (Non-colocated analytic relies on parent SortNode to handle distribution.) + TrackingScanNode hashChild = new TrackingScanNode(nextPlanNodeId(), LocalExchangeType.NOOP); + AnalyticEvalNode hashAnalytic = new AnalyticEvalNode(nextPlanNodeId(), hashChild, + Collections.emptyList(), Collections.singletonList(Mockito.mock(Expr.class)), + Collections.emptyList(), null, new TupleDescriptor(new TupleId(NEXT_ID.getAndIncrement()))); + Pair hashOutput = hashAnalytic.enforceAndDeriveLocalExchange( + ctx, null, LocalExchangeTypeRequire.requireHash()); + Assertions.assertEquals(LocalExchangeType.NOOP, hashOutput.second); + // No exchange inserted — child remains unchanged. + Assertions.assertSame(hashChild, hashAnalytic.getChild(0)); + + SerialTrackingScanNode serialScan = new SerialTrackingScanNode(nextPlanNodeId(), LocalExchangeType.NOOP); + AnalyticEvalNode orderedAnalytic = new AnalyticEvalNode(nextPlanNodeId(), serialScan, + Collections.emptyList(), Collections.singletonList(Mockito.mock(Expr.class)), + Collections.singletonList(new OrderByElement(Mockito.mock(Expr.class), true, true)), + null, new TupleDescriptor(new TupleId(NEXT_ID.getAndIncrement()))); + orderedAnalytic.fragment = Mockito.mock(PlanFragment.class); + Mockito.when(orderedAnalytic.fragment.useSerialSource(Mockito.any())).thenReturn(true); + Pair orderedOutput = orderedAnalytic.enforceAndDeriveLocalExchange( + ctx, null, LocalExchangeTypeRequire.noRequire()); + // Serial AnalyticEval returns NOOP — lets framework serial check handle fan-out + Assertions.assertEquals(LocalExchangeType.NOOP, orderedOutput.second); + } + + @Test + public void testExchangeNodeBranches() { + PlanTranslatorContext ctx = new PlanTranslatorContext(); + + ExchangeNode hashExchange = new ExchangeNode(nextPlanNodeId(), + new TrackingPlanNode(nextPlanNodeId(), LocalExchangeType.NOOP)); + hashExchange.setPartitionType(TPartitionType.HASH_PARTITIONED); + Pair hashOutput = hashExchange.enforceAndDeriveLocalExchange( + ctx, null, LocalExchangeTypeRequire.noRequire()); + Assertions.assertEquals(LocalExchangeType.GLOBAL_EXECUTION_HASH_SHUFFLE, hashOutput.second); + + ExchangeNode bucketExchange = new ExchangeNode(nextPlanNodeId(), + new TrackingPlanNode(nextPlanNodeId(), LocalExchangeType.NOOP)); + bucketExchange.setPartitionType(TPartitionType.BUCKET_SHFFULE_HASH_PARTITIONED); + Pair bucketOutput = bucketExchange.enforceAndDeriveLocalExchange( + ctx, null, LocalExchangeTypeRequire.noRequire()); + Assertions.assertEquals(LocalExchangeType.BUCKET_HASH_SHUFFLE, bucketOutput.second); + + ExchangeNode noopExchange = new ExchangeNode(nextPlanNodeId(), + new TrackingPlanNode(nextPlanNodeId(), LocalExchangeType.NOOP)); + noopExchange.setPartitionType(TPartitionType.UNPARTITIONED); + Pair noopOutput = noopExchange.enforceAndDeriveLocalExchange( + ctx, null, LocalExchangeTypeRequire.requireHash()); + Assertions.assertEquals(LocalExchangeType.NOOP, noopOutput.second); + } + + private static PlanNodeId nextPlanNodeId() { + return new PlanNodeId(NEXT_ID.getAndIncrement()); + } + + private static void assertChildLocalExchangeType(PlanNode node, int index, LocalExchangeType expectedType) { + Assertions.assertTrue(node.getChild(index) instanceof LocalExchangeNode, + "expected child " + index + " to be LocalExchangeNode"); + LocalExchangeNode exchangeNode = (LocalExchangeNode) node.getChild(index); + Assertions.assertEquals(expectedType, exchangeNode.getExchangeType()); + } + + private static SortInfo mockSortInfo() { + SortInfo sortInfo = Mockito.mock(SortInfo.class); + TupleDescriptor sortTuple = new TupleDescriptor(new TupleId(NEXT_ID.getAndIncrement())); + Mockito.when(sortInfo.getOrderingExprs()).thenReturn(Collections.emptyList()); + Mockito.when(sortInfo.getIsAscOrder()).thenReturn(Collections.emptyList()); + Mockito.when(sortInfo.getSortTupleDescriptor()).thenReturn(sortTuple); + return sortInfo; + } + + /** + * Helper for testLayer1SkipUsesIsSerialOperatorOnBeNotIsSerialNode: a PlanNode that + * reports isSerialNode()=true but whose fragment can be mocked to return + * useSerialSource=false, exercising the discrepancy the review fix targets. + */ + private static class SerialNodeInNonSerialFragment extends PlanNode { + SerialNodeInNonSerialFragment(PlanNodeId id, PlanNode child) { + super(id, Lists.newArrayList(new TupleId(id.asInt() + 20000)), + "SERIAL_NODE_IN_NON_SERIAL_FRAGMENT"); + children.add(child); + } + + @Override + public boolean isSerialNode() { + return true; + } + + @Override + public Pair enforceAndDeriveLocalExchange( + PlanTranslatorContext translatorContext, PlanNode parent, + LocalExchangeTypeRequire parentRequire) { + // Require hash so the satisfy() check fails on the child's NOOP output, + // forcing the framework into Layer 1 — which is where the + // isSerialNode/isSerialOperatorOnBe choice matters. + Pair result = enforceRequire(translatorContext, + children.get(0), 0, LocalExchangeTypeRequire.requireHash()); + children = Lists.newArrayList(result.first); + return Pair.of(this, result.second); + } + + @Override + protected void toThrift(TPlanNode msg) { + } + + @Override + public String getNodeExplainString(String prefix, TExplainLevel detailLevel) { + return ""; + } + } + + private static class TrackingPlanNode extends PlanNode { + private final LocalExchangeType providedType; + private LocalExchangeTypeRequire lastRequire; + + TrackingPlanNode(PlanNodeId id, LocalExchangeType providedType) { + super(id, Lists.newArrayList(new TupleId(id.asInt() + 10000)), "TRACKING"); + this.providedType = providedType; + } + + @Override + public Pair enforceAndDeriveLocalExchange( + PlanTranslatorContext translatorContext, PlanNode parent, LocalExchangeTypeRequire parentRequire) { + this.lastRequire = parentRequire; + return Pair.of(this, providedType); + } + + @Override + protected void toThrift(TPlanNode msg) { + } + + @Override + public String getNodeExplainString(String prefix, TExplainLevel detailLevel) { + return ""; + } + } + + private static class SerialTrackingPlanNode extends TrackingPlanNode { + SerialTrackingPlanNode(PlanNodeId id, LocalExchangeType providedType) { + super(id, providedType); + } + + @Override + public boolean isSerialNode() { + return true; + } + } + + private static class TrackingScanNode extends ScanNode { + private final LocalExchangeType providedType; + private LocalExchangeTypeRequire lastRequire; + + TrackingScanNode(PlanNodeId id, LocalExchangeType providedType) { + super(id, new TupleDescriptor(new TupleId(id.asInt() + 20000)), "TRACKING_SCAN", ScanContext.EMPTY); + this.providedType = providedType; + } + + @Override + public Pair enforceAndDeriveLocalExchange( + PlanTranslatorContext translatorContext, PlanNode parent, LocalExchangeTypeRequire parentRequire) { + this.lastRequire = parentRequire; + return Pair.of(this, providedType); + } + + @Override + protected void createScanRangeLocations() throws UserException { + } + + @Override + public List getScanRangeLocations(long maxScanRangeLength) { + return Collections.emptyList(); + } + + @Override + protected void toThrift(TPlanNode msg) { + } + + @Override + public String getNodeExplainString(String prefix, TExplainLevel detailLevel) { + return ""; + } + } + + private static class SerialTrackingScanNode extends TrackingScanNode { + SerialTrackingScanNode(PlanNodeId id, LocalExchangeType providedType) { + super(id, providedType); + } + + @Override + public boolean isSerialNode() { + return true; + } + } + + private static class FakeOlapScanNode extends OlapScanNode { + FakeOlapScanNode(PlanNodeId id) { + super(id, mockTupleDescriptor(id), "FAKE_OLAP_SCAN", ScanContext.EMPTY); + } + + @Override + protected void createScanRangeLocations() throws UserException { + } + + @Override + public List getScanRangeLocations(long maxScanRangeLength) { + return Collections.emptyList(); + } + + private static TupleDescriptor mockTupleDescriptor(PlanNodeId id) { + TupleDescriptor desc = Mockito.mock(TupleDescriptor.class); + org.apache.doris.catalog.OlapTable table = Mockito.mock(org.apache.doris.catalog.OlapTable.class); + Mockito.when(desc.getId()).thenReturn(new TupleId(id.asInt() + 30000)); + Mockito.when(desc.getTable()).thenReturn(table); + Mockito.when(desc.getSlots()).thenReturn(new ArrayList()); + Mockito.when(table.getDistributionColumnNames()).thenReturn(Collections.emptySet()); + return desc; + } + } + + private static class TestMaterializationNode extends MaterializationNode { + TestMaterializationNode(PlanNodeId id, TupleDescriptor desc, PlanNode child) { + super(id, desc, child); + } + + @Override + public void initNodeInfo() { + } + } +} diff --git a/fe/fe-core/src/test/java/org/apache/doris/planner/PlanShape.java b/fe/fe-core/src/test/java/org/apache/doris/planner/PlanShape.java new file mode 100644 index 00000000000000..39022c4283a0d5 --- /dev/null +++ b/fe/fe-core/src/test/java/org/apache/doris/planner/PlanShape.java @@ -0,0 +1,335 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.planner; + +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeType; + +import com.google.common.collect.ImmutableList; + +import java.util.List; +import java.util.function.Predicate; + +/** + * Plan-shape DSL for asserting {@link PlanNode} tree structure in tests. + * Inspired by Trino's {@code PlanMatchPattern}. + * + *

Use with {@code import static org.apache.doris.planner.PlanShape.*;} to + * drop the {@code PlanShape.} prefix from call sites: + *

{@code
+ * import static org.apache.doris.planner.PlanShape.*;
+ * import static org.apache.doris.planner.LocalExchangeNode.LocalExchangeType.*;
+ *
+ * assertMatchesAnyFragment(planner.getFragments(),
+ *     anyTree(
+ *         agg(
+ *             localExchange(LOCAL_EXECUTION_HASH_SHUFFLE,
+ *                 olapScan("t1")))));
+ * }
+ * + *

This reads bottom-up: somewhere in the plan tree there must be an + * {@link AggregationNode} whose direct child is a + * {@link LocalExchangeNode}(LOCAL_EXECUTION_HASH_SHUFFLE) whose direct child + * is an {@link OlapScanNode} on table {@code t1}. + * + *

Add a predicate with the fluent {@link #where(Predicate)} method. + * The lambda parameter is typed to the most specific {@link PlanNode} + * subclass declared by the factory — no explicit cast is required: + *

{@code
+ * agg(...).where(a -> a.isColocate())
+ * }
+ * + * @param the {@link PlanNode} subclass this shape matches against; lets + * {@link #where(Predicate)} infer the lambda parameter type automatically. + */ +public final class PlanShape { + private final Class nodeClass; + private final Predicate filter; + private final List> children; + private final boolean anyTreeMatch; + private final String label; + + private PlanShape(Class nodeClass, Predicate filter, + List> children, boolean anyTreeMatch, String label) { + this.nodeClass = nodeClass; + this.filter = filter; + this.children = children; + this.anyTreeMatch = anyTreeMatch; + this.label = label; + } + + // ---- generic factories ---- + + /** Match a node of the given class with the given direct children patterns. */ + public static PlanShape node(Class cls, PlanShape... children) { + return new PlanShape<>(cls, null, ImmutableList.copyOf(children), false, cls.getSimpleName()); + } + + /** + * Skip any number of intermediate nodes; the {@code inner} pattern can match + * anywhere in the subtree rooted at the current node (including the node itself). + */ + public static PlanShape anyTree(PlanShape inner) { + return new PlanShape<>(PlanNode.class, null, ImmutableList.of(inner), true, "anyTree"); + } + + /** Match any single node (regardless of class) that has the given children. */ + public static PlanShape anyNode(PlanShape... children) { + return new PlanShape<>(PlanNode.class, null, ImmutableList.copyOf(children), false, "anyNode"); + } + + /** + * Match a node that is NOT an instance of {@code excludedClass}, with the given + * direct children patterns. Useful for asserting that a particular node type is + * absent at a specific position. + * + *

Typical use: assert that an LE was skipped where the framework should not + * have inserted one. + *

{@code
+     * agg(anyNot(LocalExchangeNode.class, olapScan("t1")))
+     * }
+ * reads as "AggregationNode whose direct child is NOT a LocalExchangeNode and + * is itself an OlapScan('t1')". + */ + public static PlanShape anyNot(Class excludedClass, + PlanShape... children) { + return new PlanShape<>(PlanNode.class, + n -> !excludedClass.isInstance(n), + ImmutableList.copyOf(children), false, + "anyNot(" + excludedClass.getSimpleName() + ")"); + } + + // ---- convenience wrappers for common Doris nodes ---- + + public static PlanShape localExchange(LocalExchangeType type, PlanShape... children) { + return new PlanShape<>(LocalExchangeNode.class, + le -> le.getExchangeType() == type, + ImmutableList.copyOf(children), false, + "LocalExchange(" + type + ")"); + } + + public static PlanShape agg(PlanShape... children) { + return node(AggregationNode.class, children); + } + + public static PlanShape olapScan(String tableName, PlanShape... children) { + return new PlanShape<>(OlapScanNode.class, + s -> tableName.equalsIgnoreCase(s.getOlapTable().getName()), + ImmutableList.copyOf(children), false, + "OlapScan(" + tableName + ")"); + } + + public static PlanShape olapScan() { + return node(OlapScanNode.class); + } + + public static PlanShape exchange(PlanShape... children) { + return node(ExchangeNode.class, children); + } + + public static PlanShape hashJoin(PlanShape... children) { + return node(HashJoinNode.class, children); + } + + public static PlanShape sort(PlanShape... children) { + return node(SortNode.class, children); + } + + public static PlanShape analytic(PlanShape... children) { + return node(AnalyticEvalNode.class, children); + } + + public static PlanShape union(PlanShape... children) { + return node(UnionNode.class, children); + } + + public static PlanShape nestedLoopJoin(PlanShape... children) { + return node(NestedLoopJoinNode.class, children); + } + + public static PlanShape partitionSort(PlanShape... children) { + return node(PartitionSortNode.class, children); + } + + // ---- chained predicate ---- + + /** + * Constrain the current node to also satisfy {@code predicate}. The lambda + * parameter type is inferred from the receiver's generic type parameter + * (e.g. {@link AggregationNode} for {@link #agg}); no explicit cast required. + */ + public PlanShape where(Predicate predicate) { + Predicate combined = filter == null ? predicate : filter.and(predicate); + return new PlanShape<>(nodeClass, combined, children, anyTreeMatch, label); + } + + // ---- assertion entry points ---- + + /** Assert that {@code shape} matches the plan tree rooted at {@code root}. */ + public static void assertMatches(PlanNode root, PlanShape shape) { + shape.assertMatchesImpl(root); + } + + /** + * Assert that {@code shape} matches the plan tree of at least one fragment. + * Useful when the interesting fragment is buried among several (typical for + * distributed plans with scan / shuffle fragments). + */ + public static void assertMatchesAnyFragment(List fragments, PlanShape shape) { + for (PlanFragment fragment : fragments) { + if (shape.match(fragment.getPlanRoot()).matched) { + return; + } + } + StringBuilder dump = new StringBuilder(); + for (PlanFragment fragment : fragments) { + dump.append("[fragment ").append(fragment.getFragmentId()).append("]\n"); + dump.append(indent(dumpTree(fragment.getPlanRoot()), " ")); + } + throw new AssertionError("Plan shape did not match any of the " + fragments.size() + + " fragments.\n expected: " + shape.describe() + "\n fragments:\n" + + indent(dump.toString(), " ")); + } + + private void assertMatchesImpl(PlanNode root) { + MatchResult r = match(root); + if (!r.matched) { + throw new AssertionError("Plan shape mismatch.\n" + + " expected: " + describe() + "\n" + + " failure : " + r.failureReason + "\n" + + " actual tree:\n" + indent(dumpTree(root), " ")); + } + } + + // ---- matching impl ---- + + private MatchResult match(PlanNode node) { + if (anyTreeMatch) { + return findInSubtree(node, children.get(0)); + } + if (!nodeClass.isInstance(node)) { + return MatchResult.fail("expected " + label + " but got " + + node.getClass().getSimpleName() + "#" + node.getId().asInt()); + } + if (filter != null && !filter.test(nodeClass.cast(node))) { + return MatchResult.fail("predicate failed on " + label + "#" + node.getId().asInt()); + } + if (children.isEmpty()) { + return MatchResult.OK; + } + if (node.getChildren().size() != children.size()) { + return MatchResult.fail(label + "#" + node.getId().asInt() + " expected " + + children.size() + " children, got " + node.getChildren().size()); + } + for (int i = 0; i < children.size(); i++) { + MatchResult r = children.get(i).match(node.getChild(i)); + if (!r.matched) { + return MatchResult.fail("at " + label + "#" + node.getId().asInt() + ".child[" + i + + "]: " + r.failureReason); + } + } + return MatchResult.OK; + } + + private static MatchResult findInSubtree(PlanNode root, PlanShape inner) { + MatchResult r = inner.match(root); + if (r.matched) { + return r; + } + for (PlanNode child : root.getChildren()) { + r = findInSubtree(child, inner); + if (r.matched) { + return r; + } + } + return MatchResult.fail("no node in subtree matches " + inner.describe()); + } + + private String describe() { + if (anyTreeMatch) { + return "anyTree(" + children.get(0).describe() + ")"; + } + StringBuilder sb = new StringBuilder(label); + if (!children.isEmpty()) { + sb.append("("); + for (int i = 0; i < children.size(); i++) { + if (i > 0) { + sb.append(", "); + } + sb.append(children.get(i).describe()); + } + sb.append(")"); + } + return sb.toString(); + } + + /** + * Render a plan tree as a text outline (one node per line, indented by depth). + * Useful for debugging when writing new shape assertions — print the actual + * plan, then copy the structure into a {@code PlanShape} pattern. + */ + public static String prettyPrint(PlanNode root) { + return dumpTree(root); + } + + private static String dumpTree(PlanNode root) { + StringBuilder sb = new StringBuilder(); + dumpTree(root, sb, 0); + return sb.toString(); + } + + private static void dumpTree(PlanNode node, StringBuilder sb, int depth) { + for (int i = 0; i < depth; i++) { + sb.append(" "); + } + sb.append(node.getClass().getSimpleName()).append("#").append(node.getId().asInt()); + if (node instanceof LocalExchangeNode) { + sb.append("(").append(((LocalExchangeNode) node).getExchangeType()).append(")"); + } + sb.append("\n"); + for (PlanNode child : node.getChildren()) { + dumpTree(child, sb, depth + 1); + } + } + + private static String indent(String text, String prefix) { + StringBuilder sb = new StringBuilder(); + for (String line : text.split("\n", -1)) { + if (!line.isEmpty()) { + sb.append(prefix); + } + sb.append(line).append("\n"); + } + return sb.toString(); + } + + private static final class MatchResult { + final boolean matched; + final String failureReason; + + private MatchResult(boolean matched, String failureReason) { + this.matched = matched; + this.failureReason = failureReason; + } + + static final MatchResult OK = new MatchResult(true, null); + + static MatchResult fail(String reason) { + return new MatchResult(false, reason); + } + } +} diff --git a/fe/fe-core/src/test/java/org/apache/doris/planner/PlanShapeDsl.java b/fe/fe-core/src/test/java/org/apache/doris/planner/PlanShapeDsl.java new file mode 100644 index 00000000000000..bc96ca91353515 --- /dev/null +++ b/fe/fe-core/src/test/java/org/apache/doris/planner/PlanShapeDsl.java @@ -0,0 +1,148 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.planner; + +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeType; + +import java.util.List; + +/** + * Implement this interface in a test class to use the {@link PlanShape} DSL + * factories without a {@code PlanShape.} prefix and without {@code static} + * imports (which are forbidden by Doris's checkstyle rule + * {@code AvoidStaticImport}). + * + *

Each factory is a {@code default} method that delegates to the static one + * on {@link PlanShape}. The shortened {@code LocalExchangeType} aliases + * (LOCAL_HASH, GLOBAL_HASH, BUCKET_HASH, etc.) are interface constants so + * implementing classes can reference them by bare name. + * + *

Example: + *

{@code
+ * public class MyTest extends TestWithFeService implements PlanShapeDsl {
+ *     @Test
+ *     public void testAggOverScan() {
+ *         assertPlanShape(sql,
+ *             anyTree(agg(localExchange(LOCAL_HASH, olapScan("t1")))));
+ *     }
+ * }
+ * }
+ */ +public interface PlanShapeDsl { + + // ---- shortened LocalExchangeType aliases ---- + // Avoids spelling LocalExchangeType.LOCAL_EXECUTION_HASH_SHUFFLE in every test. + + LocalExchangeType LOCAL_HASH = LocalExchangeType.LOCAL_EXECUTION_HASH_SHUFFLE; + LocalExchangeType GLOBAL_HASH = LocalExchangeType.GLOBAL_EXECUTION_HASH_SHUFFLE; + LocalExchangeType BUCKET_HASH = LocalExchangeType.BUCKET_HASH_SHUFFLE; + LocalExchangeType PT = LocalExchangeType.PASSTHROUGH; + LocalExchangeType ADAPTIVE_PT = LocalExchangeType.ADAPTIVE_PASSTHROUGH; + LocalExchangeType BROADCAST_LE = LocalExchangeType.BROADCAST; + LocalExchangeType PASS_TO_ONE_LE = LocalExchangeType.PASS_TO_ONE; + LocalExchangeType NOOP_LE = LocalExchangeType.NOOP; + + // ---- structural factories ---- + + default PlanShape node(Class cls, PlanShape... children) { + return PlanShape.node(cls, children); + } + + default PlanShape anyTree(PlanShape inner) { + return PlanShape.anyTree(inner); + } + + default PlanShape anyNode(PlanShape... children) { + return PlanShape.anyNode(children); + } + + default PlanShape anyNot(Class excludedClass, + PlanShape... children) { + return PlanShape.anyNot(excludedClass, children); + } + + // ---- Doris-specific node factories ---- + + default PlanShape localExchange(LocalExchangeType type, + PlanShape... children) { + return PlanShape.localExchange(type, children); + } + + default PlanShape agg(PlanShape... children) { + return PlanShape.agg(children); + } + + default PlanShape olapScan(String tableName, PlanShape... children) { + return PlanShape.olapScan(tableName, children); + } + + default PlanShape olapScan() { + return PlanShape.olapScan(); + } + + default PlanShape exchange(PlanShape... children) { + return PlanShape.exchange(children); + } + + default PlanShape hashJoin(PlanShape... children) { + return PlanShape.hashJoin(children); + } + + default PlanShape sort(PlanShape... children) { + return PlanShape.sort(children); + } + + default PlanShape analytic(PlanShape... children) { + return PlanShape.analytic(children); + } + + default PlanShape union(PlanShape... children) { + return PlanShape.union(children); + } + + default PlanShape nestedLoopJoin(PlanShape... children) { + return PlanShape.nestedLoopJoin(children); + } + + default PlanShape partitionSort(PlanShape... children) { + return PlanShape.partitionSort(children); + } + + // ---- assertion entry points ---- + + default void assertMatches(PlanNode root, PlanShape shape) { + PlanShape.assertMatches(root, shape); + } + + default void assertMatchesAnyFragment(List fragments, PlanShape shape) { + PlanShape.assertMatchesAnyFragment(fragments, shape); + } + + /** + * Print all fragments' plan trees to stderr. Useful for one-off debugging + * when writing a new shape assertion: print first, copy the structure into a + * {@link PlanShape} pattern, then remove the call. Not intended to be left + * in committed tests. + */ + default void printFragmentPlans(List fragments) { + for (PlanFragment f : fragments) { + System.err.println("=== fragment " + f.getFragmentId() + " ==="); + System.err.println(PlanShape.prettyPrint(f.getPlanRoot())); + } + } +} diff --git a/fe/fe-core/src/test/java/org/apache/doris/qe/LocalExchangePlannerTest.java b/fe/fe-core/src/test/java/org/apache/doris/qe/LocalExchangePlannerTest.java new file mode 100644 index 00000000000000..55b3c62b87ca68 --- /dev/null +++ b/fe/fe-core/src/test/java/org/apache/doris/qe/LocalExchangePlannerTest.java @@ -0,0 +1,688 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.qe; + +import org.apache.doris.analysis.TupleDescriptor; +import org.apache.doris.analysis.TupleId; +import org.apache.doris.common.UserException; +import org.apache.doris.nereids.NereidsPlanner; +import org.apache.doris.nereids.glue.translator.PlanTranslatorContext; +import org.apache.doris.planner.AddLocalExchange; +import org.apache.doris.planner.LocalExchangeNode; +import org.apache.doris.planner.LocalExchangeNode.LocalExchangeType; +import org.apache.doris.planner.PlanFragment; +import org.apache.doris.planner.PlanNode; +import org.apache.doris.planner.PlanNodeId; +import org.apache.doris.planner.PlanShape; +import org.apache.doris.planner.PlanShapeDsl; +import org.apache.doris.planner.ScanContext; +import org.apache.doris.planner.ScanNode; +import org.apache.doris.thrift.TExplainLevel; +import org.apache.doris.thrift.TPlanNode; +import org.apache.doris.thrift.TScanRangeLocations; +import org.apache.doris.utframe.TestWithFeService; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.util.ArrayList; +import java.util.EnumSet; +import java.util.List; + +public class LocalExchangePlannerTest extends TestWithFeService implements PlanShapeDsl { + @Override + protected int backendNum() { + return 3; + } + + @Override + protected void runBeforeAll() throws Exception { + createDatabase("test"); + useDatabase("test"); + createTable("CREATE TABLE test.t1 (k1 INT, k2 INT, v1 INT) " + + "DISTRIBUTED BY HASH(k1) BUCKETS 6 " + + "PROPERTIES ('replication_num'='1')"); + createTable("CREATE TABLE test.t2 (k1 INT, k2 INT, v2 INT) " + + "DISTRIBUTED BY HASH(k1) BUCKETS 6 " + + "PROPERTIES ('replication_num'='1')"); + } + + // ---- helpers ---- + + /** + * Apply the default local-shuffle session config and let the caller tweak it. + * Mirrors Trino's {@code Session.builder(defaultSession).setSystemProperty(...).build()} + * pattern — tests only express what differs from the default, not the full set. + */ + protected void setupLocalShuffleSession(java.util.function.Consumer tweaks) + throws Exception { + SessionVariable sv = connectContext.getSessionVariable(); + sv.setEnableLocalShufflePlanner(true); + sv.setEnableLocalShuffle(true); + sv.setEnableNereidsDistributePlanner(true); + sv.setIgnoreStorageDataDistribution(true); + sv.setPipelineTaskNum("4"); + sv.setForceToLocalShuffle(false); + if (tweaks != null) { + tweaks.accept(sv); + } + } + + /** + * Run the SQL through the planner and assert the resulting distributed plan + * matches {@code shape} in at least one fragment. Replaces the boilerplate + * {@code executeNereidsSql + cast planner + collect fragments + manual asserts} + * with a one-liner, in the spirit of Trino's + * {@code assertDistributedPlan(sql, pattern)}. + */ + protected void assertPlanShape(String sql, PlanShape shape) throws Exception { + StmtExecutor executor = executeNereidsSql("explain distributed plan " + sql); + NereidsPlanner planner = (NereidsPlanner) executor.planner(); + assertMatchesAnyFragment(planner.getFragments(), shape); + } + + /** + * Debug-only helper: run the SQL and dump every fragment's plan tree to stderr. + * Use this when crafting a new {@link #assertPlanShape} assertion to see what + * the real plan looks like, then replace this call with the proper DSL pattern. + */ + protected void dumpPlan(String sql) throws Exception { + StmtExecutor executor = executeNereidsSql("explain distributed plan " + sql); + NereidsPlanner planner = (NereidsPlanner) executor.planner(); + System.err.println("=== dump for SQL: " + sql + " ==="); + printFragmentPlans(planner.getFragments()); + } + + /** + * Assert that NO local exchange of the given type appears anywhere in any + * fragment's plan tree. Companion to {@link #assertPlanShape} for negative + * checks where pinning the full shape would be brittle. + */ + protected void assertNoLocalExchangeOfType(String sql, LocalExchangeType excludedType) throws Exception { + StmtExecutor executor = executeNereidsSql("explain distributed plan " + sql); + NereidsPlanner planner = (NereidsPlanner) executor.planner(); + EnumSet types = collectLocalExchangeTypes(planner.getFragments()); + Assertions.assertFalse(types.contains(excludedType), + "expected no " + excludedType + " in plan, actual: " + types); + } + + @Test + public void testAggFromScanShapeDsl() throws Exception { + // Same scenario as testAggFromScanUsesLocalExecutionHashShuffle but uses the + // PlanShape DSL to assert a *structural* relationship rather than just + // "explain contains the string LOCAL_EXECUTION_HASH_SHUFFLE somewhere". + // + // With ignore_storage_data_distribution=true the scan is serial (pooling + // mode); AggSink requires HASH; framework's heavy-op fan-out kicks in and + // inserts a PASSTHROUGH LE *before* the HASH LE so the 1-task scan does + // not bottleneck the hash redistribution. The full chain is: + // AggregationNode → LE(LOCAL_HASH) → LE(PASSTHROUGH) → OlapScan(t1) + // + // A substring check on explain text could not distinguish this from a + // plan that only has the LOCAL_HASH LE — DSL pins the structure. + setupLocalShuffleSession(null); + assertPlanShape("select k1, k2, count(*) from test.t1 group by k1, k2", + anyTree( + agg( + localExchange(LOCAL_HASH, + localExchange(PT, + olapScan("t1")))))); + } + + @Test + public void testAggWithoutKeyTwoPhase() throws Exception { + // doc rule "Agg / 没有 groupby key" → PASSTHROUGH. + // count(*) generates a two-phase aggregation: + // FINAL AggregationNode → ExchangeNode (UNPARTITIONED, serial) + // → PARTIAL AggregationNode (serial, no keys) + // → LE(PASSTHROUGH) (fan-out from 1-task pooling scan) + // → OlapScan(t1) + // The intermediate LE(PT) is the heavy-op fan-out for the pooling scan; + // the serial PARTIAL agg sits right above it. + setupLocalShuffleSession(null); + assertPlanShape("select count(*) from test.t1", + anyTree( + agg( + anyTree( + agg( + localExchange(PT, + olapScan("t1"))))))); + } + + @Test + public void testBroadcastJoinPoolingShapeDsl() throws Exception { + // doc rule "HashJoin / BROADCAST / 池化": + // probe ← LE(PASSTHROUGH) ← scan + // build ← LE(PASS_TO_ONE) ← Exchange (cross-fragment broadcast) + // Matches doc Example 2 plus the pooling variant (PASS_TO_ONE instead of NOOP + // on the serial build-side exchange). + setupLocalShuffleSession(null); + assertPlanShape("select * from test.t1 a join [broadcast] test.t2 b on a.k1=b.k1", + anyTree( + hashJoin( + localExchange(PT, + olapScan()), + localExchange(PASS_TO_ONE_LE, + anyTree(exchange()))))); + } + + @Test + public void testNlJoinPoolingShapeDsl() throws Exception { + // doc rule "NL join / 池化": build BROADCAST, probe ADAPTIVE_PASSTHROUGH. + // With pooling, the probe-side scan is serial (1 task) so the framework + // additionally inserts an inner LE(PASSTHROUGH) below the ADAPTIVE_PASSTHROUGH + // to fan the serial scan out before adaptive redistribution. The build side + // comes through a cross-fragment Exchange (broadcast). + setupLocalShuffleSession(null); + assertPlanShape("select * from test.t1 a, test.t2 b where a.k1 > b.k1", + anyTree( + nestedLoopJoin( + localExchange(ADAPTIVE_PT, + localExchange(PT, + anyTree(olapScan()))), + localExchange(BROADCAST_LE, + anyTree(exchange()))))); + } + + @Test + public void testNullAwareLeftAntiJoinHasNoLocalExchange() throws Exception { + // doc rule "HashJoin / NULL_AWARE_LEFT_ANTI_JOIN": NOOP/NOOP/NOOP — no LE + // inserted on either side, in either direction. + // HashJoin(NULL_AWARE_LEFT_ANTI_JOIN) + // ← OlapScan(t1) (probe, no LE) + // ← Exchange ← OlapScan(t2) (build, no LE) + setupLocalShuffleSession(null); + assertPlanShape("select k1 from test.t1 where k1 not in (select k1 from test.t2)", + anyTree( + hashJoin( + olapScan(), + anyTree(exchange())) + .where(j -> j.getJoinOp() + == org.apache.doris.analysis.JoinOperator.NULL_AWARE_LEFT_ANTI_JOIN))); + } + + @Test + public void testAnalyticNoPartitionByHasNoLocalExchange() throws Exception { + // doc rule "Analytic / 无 partition by": serial path with PASSTHROUGH require + // upstream. Because the analytic fragment uses a cross-fragment UNPARTITIONED + // Exchange to gather data into a single instance, no LE is inserted between + // the AnalyticEvalNode and the Exchange — the Exchange already serves as the + // serial source. + setupLocalShuffleSession(null); + assertPlanShape("select k1, row_number() over () from test.t1", + anyTree( + analytic( + anyTree(exchange())))); + } + + // -- Tier A: scenarios borrowed from Trino's TestAddExchangesPlans -- + + @Test + public void testUnionDistinctTwoPhaseAgg() throws Exception { + // Borrowed from Trino's testRepartitionForUnionWithAnyTableScans. + // UNION (not UNION ALL) implies DISTINCT — Doris realises that with a + // two-phase Aggregation above the UnionNode. The cross-fragment Exchanges + // feeding the Union pre-shuffle the scan outputs; the LE(PT) above the + // Union is the heavy-op fan-out for the partial agg over the gathered union. + // FINAL Agg ← Exchange ← PARTIAL Agg ← LE(PT) ← Union ← {Exchange ← scan} x 2 + setupLocalShuffleSession(null); + assertPlanShape("select k1 from test.t1 union select k1 from test.t2", + anyTree( + agg( + anyTree( + agg( + localExchange(PT, + union( + anyTree(olapScan()), + anyTree(olapScan())))))))); + } + + @Test + public void testUnionAllBeforeHashJoin() throws Exception { + // Borrowed from Trino's testRepartitionForUnionAllBeforeHashJoin. + // UNION ALL feeds into a hash join — the join's hash requirement is + // satisfied by the cross-fragment Exchanges sitting under each Union branch + // (data is already hash-distributed by the time it reaches Union), so no + // intra-fragment LE is needed on either probe or build side. + setupLocalShuffleSession(null); + assertPlanShape("select * from (select k1 from test.t1 union all select k1 from test.t2) u " + + "join test.t1 t3 on u.k1=t3.k1", + anyTree( + hashJoin( + union( + anyTree(olapScan()), + anyTree(olapScan())), + anyTree(exchange())))); + } + + @Test + public void testWindowPartitionByBucketKey() throws Exception { + // Borrowed from Trino's testWindowIsExactlyPartitioned. + // PARTITION BY uses the table's bucket key (k1) — Doris's analytic eval + // is colocate-eligible. With pooling, the chain is: + // AnalyticEval ← Sort ← LE(LOCAL_HASH) ← LE(PT) ← scan + // The inner LE(PT) is the heavy-op fan-out for the serial pooling scan. + setupLocalShuffleSession(null); + assertPlanShape("select k1, row_number() over (partition by k1) from test.t1", + anyTree( + analytic( + sort( + localExchange(LOCAL_HASH, + localExchange(PT, + olapScan("t1"))))))); + } + + @Test + public void testWindowPartitionByNonBucketKey() throws Exception { + // Borrowed from Trino's testRowNumberIsExactlyPartitioned (negative variant). + // PARTITION BY uses a non-bucket key (k2) — colocate is not eligible, so + // the analytic eval lives in its own fragment fed by a cross-fragment + // hash-partitioned Exchange. No intra-fragment LE inside the analytic fragment. + setupLocalShuffleSession(null); + assertPlanShape("select k1, row_number() over (partition by k2) from test.t1", + anyTree( + analytic( + sort( + anyTree(exchange()))))); + } + + @Test + public void testNestedUnionAll() throws Exception { + // Borrowed from Trino's testNestedUnionAll. + // Three-way UNION ALL flattens into a single UnionNode with three + // cross-fragment Exchange children. No LE since there's no downstream + // consumer requiring hash distribution. + setupLocalShuffleSession(null); + assertPlanShape( + "select k1 from test.t1 union all " + + "(select k1 from test.t2 union all select k1 from test.t1)", + anyTree( + union( + anyTree(olapScan()), + anyTree(olapScan()), + anyTree(olapScan())))); + } + + @Test + public void testGroupedAggOverNlj() throws Exception { + // Borrowed from Trino's testGroupedAggregationAboveUnionAllCrossJoined + // (NLJ + agg variant). NLJ output is ADAPTIVE_PASSTHROUGH; the outer Agg + // requires HASH on k1. Because ADAPTIVE_PASSTHROUGH does not satisfy HASH, + // an LE(LOCAL_HASH) is inserted between the NLJ output and the Agg. + // Agg ← LE(LOCAL_HASH) ← NLJ + // ├─ LE(ADAPTIVE_PT) ← LE(PT) ← scan(t1) + // └─ LE(BROADCAST) ← Exchange ← scan(t2) + setupLocalShuffleSession(null); + assertPlanShape("select a.k1, count(*) from test.t1 a, test.t2 b where a.k1 > b.k1 group by a.k1", + anyTree( + agg( + localExchange(LOCAL_HASH, + nestedLoopJoin( + localExchange(ADAPTIVE_PT, + localExchange(PT, + anyTree(olapScan()))), + localExchange(BROADCAST_LE, + anyTree(exchange()))))))); + } + + @Test + public void testTopNQualifyPartitionSort() throws Exception { + // Borrowed from Trino's testTopNRowNumberIsExactlyPartitioned. + // ROW_NUMBER() filtered by `rn = 1` triggers Doris's PartitionSortNode + // optimisation (LOCAL phase pre-trims rows before the global Analytic). + // The chain becomes: + // AnalyticEval ← Sort ← LE(LOCAL_HASH) ← PartitionSort ← LE(PT) ← scan + // The LE(PT) under PartitionSort is the heavy-op fan-out for the pooling scan; + // the LE(LOCAL_HASH) above PartitionSort enforces hash partitioning for the + // global ROW_NUMBER computation. + setupLocalShuffleSession(null); + assertPlanShape( + "select k1, k2 from (select k1, k2, row_number() over (partition by k1 order by k2) rn " + + "from test.t1) t where rn = 1", + anyTree( + analytic( + sort( + localExchange(LOCAL_HASH, + partitionSort( + localExchange(PT, + olapScan("t1")))))))); + } + + @Test + public void testAggOverBroadcastJoin() throws Exception { + // Borrowed from Trino's testGroupedAggregationAboveUnionAll variant. + // count(*) over a broadcast join generates a two-phase aggregation; the + // partial agg sits directly on top of the HashJoin and the final agg lives + // in a separate fragment (count merge): + // FINAL Agg ← Exchange ← PARTIAL Agg ← HashJoin + // ├─ LE(PT) ← scan + // └─ LE(PASS_TO_ONE) ← Exchange ← scan + setupLocalShuffleSession(null); + assertPlanShape("select count(*) from test.t1 a join [broadcast] test.t2 b on a.k1=b.k1", + anyTree( + agg( + anyTree( + agg( + hashJoin( + localExchange(PT, + olapScan()), + localExchange(PASS_TO_ONE_LE, + anyTree(exchange())))))))); + } + + @Test + public void testNonSerialScanKeepsBucketHashDistribution() throws Exception { + // Non-pooling scan with pipelineTaskNum=1 → the BUCKET_HASH_SHUFFLE output of + // the colocated scan is preserved end-to-end; no LOCAL_EXECUTION_HASH_SHUFFLE + // is ever introduced. Only a serial-source PASSTHROUGH appears (for the + // SortNode's merge-by-exchange). + setupLocalShuffleSession(sv -> { + sv.setIgnoreStorageDataDistribution(false); + try { + sv.setPipelineTaskNum("1"); + } catch (Exception e) { + throw new RuntimeException(e); + } + }); + assertNoLocalExchangeOfType( + "select k1, count(*) from test.t1 group by k1 order by k1", + LocalExchangeType.LOCAL_EXECUTION_HASH_SHUFFLE); + } + + @Test + public void testJoinPlanContainsHashShuffle() throws Exception { + // Pooling hash join under an aggregate. Both sides of the join feed through + // local exchanges; the agg above the join requires LOCAL_EXECUTION_HASH_SHUFFLE. + // Agg → LE(LOCAL_HASH) → HashJoin + // ← LE(PASSTHROUGH) ← OlapScan(t1) (probe) + // ← LE(PASS_TO_ONE) ← Exchange ← OlapScan(t2) (build) + setupLocalShuffleSession(null); + assertPlanShape( + "select a.k1, count(*) from test.t1 a join test.t2 b on a.k1 = b.k1 group by a.k1", + anyTree( + agg( + localExchange(LOCAL_HASH, + hashJoin( + localExchange(PT, + olapScan("t1")), + localExchange(PASS_TO_ONE_LE, + anyTree(exchange()))))))); + } + + @Test + public void testNoopLocalExchangeNotInjected() throws Exception { + // A simple LIMIT scan plan should contain no local exchanges of any kind — + // and most importantly, no synthesized NOOP node. doc rule "NOOP is meta, + // never materialized": the planner uses NOOP as a 'skip' signal during + // resolution but never instantiates a LocalExchangeNode with type NOOP. + setupLocalShuffleSession(null); + assertNoLocalExchangeOfType("select * from test.t1 limit 1", LocalExchangeType.NOOP); + } + + @Test + public void testHashShuffleHasDistributeExprs() throws Exception { + // Same scan→agg plan as testAggFromScanShapeDsl, but with a predicate that + // checks the inserted LE(LOCAL_HASH) carries non-empty distribute expressions + // (without which the BE would not know which columns to hash on). + setupLocalShuffleSession(null); + assertPlanShape("select k1, k2, count(*) from test.t1 group by k1, k2", + anyTree( + agg( + localExchange(LOCAL_HASH, + localExchange(PT, olapScan("t1"))) + .where(le -> le.getDistributeExprLists() != null + && !le.getDistributeExprLists().isEmpty())))); + } + + @Test + public void testStreamingAggHashShuffleUsesGroupingExprs() throws Exception { + // Regression for: FE-planned LE(LOCAL_HASH) under a streaming partial agg used + // the child's table distribution (e.g. `k1`) instead of the grouping keys + // (e.g. `k2`). BE's AggSinkOperatorX/StreamingAggOperatorX::update_operator + // picks `_partition_exprs = grouping_exprs` when the chain is NOT followed by + // a shuffled operator (the common case where the streaming preagg sits at + // fragment root with only a cross-fragment HASH ExchangeSink above). Using + // child distribution instead scatters same-group rows across N instances, + // turning the partial preagg into a no-op and corrupting row-arrival order at + // downstream merge-finalize (manifests as e.g. non-deterministic + // group_concat / py_json_array_agg output). + // + // Table t1 is DISTRIBUTED BY HASH(k1). GROUP BY k2 forces a cross-fragment + // exchange (and thus a two-phase aggregation): the streaming partial agg lives + // in the lower fragment, with an FE-inserted LE(LOCAL_HASH) below it. The fix + // makes that LE carry [k2] (the grouping key) rather than [k1] (the child + // table distribution). + setupLocalShuffleSession(null); + assertPlanShape( + "select k2, count(*) from test.t1 group by k2", + anyTree( + agg( + localExchange(LOCAL_HASH, + localExchange(PT, olapScan("t1"))) + .where(le -> { + List exprs = + le.getDistributeExprLists(); + if (exprs == null || exprs.size() != 1) { + return false; + } + org.apache.doris.analysis.Expr e = exprs.get(0); + return e instanceof org.apache.doris.analysis.SlotRef + && "k2".equals(((org.apache.doris.analysis.SlotRef) e).getCol()); + })))); + } + + @Test + public void testRequireHashSatisfyAllHashShuffleTypes() { + LocalExchangeNode.LocalExchangeTypeRequire requireHash = LocalExchangeNode.LocalExchangeTypeRequire.requireHash(); + Assertions.assertTrue(requireHash.satisfy(LocalExchangeType.LOCAL_EXECUTION_HASH_SHUFFLE)); + Assertions.assertTrue(requireHash.satisfy(LocalExchangeType.GLOBAL_EXECUTION_HASH_SHUFFLE)); + Assertions.assertTrue(requireHash.satisfy(LocalExchangeType.BUCKET_HASH_SHUFFLE)); + Assertions.assertFalse(requireHash.satisfy(LocalExchangeType.PASSTHROUGH)); + } + + @Test + public void testSetOperationUnderAggHasHashShuffle() throws Exception { + // Non-pooling UNION ALL under an agg. The outer agg's group key requires a + // LOCAL_EXECUTION_HASH_SHUFFLE directly above the UnionNode (above each + // branch's pre-agg). + setupLocalShuffleSession(sv -> sv.setIgnoreStorageDataDistribution(false)); + assertPlanShape( + "select k1, count(*) from (select k1 from test.t1 union all select k1 from test.t2) u group by k1", + anyTree( + agg( + localExchange(LOCAL_HASH, + union( + anyTree(olapScan()), + anyTree(olapScan())))))); + } + + @Test + public void testAnalyticPlanContainsPassthroughAndLocalHashShuffle() throws Exception { + // doc rule "Analytic / 有 partition by / 池化": LE(LOCAL_HASH) for partition + // redistribution, plus a LE(PASSTHROUGH) heavy-op fan-out below it for the + // 1-task pooling scan, then LE(PASSTHROUGH) above the AnalyticEval for the + // outer order-by merge. + // SortNode → LE(PASSTHROUGH) → AnalyticEval → SortNode + // → LE(LOCAL_HASH) → LE(PASSTHROUGH) → scan + setupLocalShuffleSession(null); + assertPlanShape( + "select k1, k2, row_number() over(partition by k1 order by k2) from test.t1 order by k1, k2", + anyTree( + sort( + localExchange(PT, + analytic( + sort( + localExchange(LOCAL_HASH, + localExchange(PT, + olapScan("t1"))))))))); + } + + @Test + public void testGroupingSetsPlanContainsHashShuffle() throws Exception { + // Non-pooling grouping sets keeps the colocated BUCKET_HASH_SHUFFLE output of + // the scan all the way through Repeat→Agg; no LE(LOCAL_HASH) is needed. + setupLocalShuffleSession(sv -> sv.setIgnoreStorageDataDistribution(false)); + assertNoLocalExchangeOfType( + "select k1, k2, sum(v1) from test.t1 group by grouping sets((k1), (k1, k2))", + LocalExchangeType.LOCAL_EXECUTION_HASH_SHUFFLE); + } + + @Test + public void testLocalAndGlobalExecutionHashShufflePreferType() { + PlanTranslatorContext translatorContext = new PlanTranslatorContext(); + LocalExchangeNode.LocalExchangeTypeRequire requireHash = LocalExchangeNode.LocalExchangeTypeRequire.requireHash(); + LocalExchangeNode.LocalExchangeTypeRequire requireBucketHash + = LocalExchangeNode.LocalExchangeTypeRequire.requireBucketHash(); + LocalExchangeNode.LocalExchangeTypeRequire requireGlobalHash + = LocalExchangeNode.LocalExchangeTypeRequire.requireGlobalExecutionHash(); + + LocalExchangeType localType = AddLocalExchange.resolveExchangeType( + requireHash, translatorContext, null, new MockScanNode(new PlanNodeId(1001))); + LocalExchangeType globalType = AddLocalExchange.resolveExchangeType( + requireHash, translatorContext, null, new MockPlanNode(new PlanNodeId(1002))); + // Explicit GLOBAL_EXECUTION_HASH_SHUFFLE must NOT be degraded, even on a scan path. + // If it appears on a scan path, the plan is wrong — not something resolveExchangeType should fix. + LocalExchangeType explicitGlobalOnScanType = AddLocalExchange.resolveExchangeType( + requireGlobalHash, translatorContext, null, new MockScanNode(new PlanNodeId(1003))); + + // shouldUseLocalExecutionHash always returns true → RequireHash always resolves to LOCAL + Assertions.assertEquals(LocalExchangeType.LOCAL_EXECUTION_HASH_SHUFFLE, localType); + Assertions.assertEquals(LocalExchangeType.LOCAL_EXECUTION_HASH_SHUFFLE, globalType); + // Explicit GLOBAL (RequireSpecific) must NOT be degraded. + Assertions.assertEquals(LocalExchangeType.GLOBAL_EXECUTION_HASH_SHUFFLE, explicitGlobalOnScanType); + Assertions.assertEquals(LocalExchangeType.BUCKET_HASH_SHUFFLE, requireBucketHash.preferType()); + } + + @Test + public void testMixedPlanWithPoolingScan() throws Exception { + // Pooling: grouping-sets sub-query JOINed and re-aggregated. Probe side + // wraps the inner agg/Repeat with LE(LOCAL_HASH) over LE(PASSTHROUGH); build + // side comes through LE(PASS_TO_ONE) over an inter-fragment Exchange. + // Agg → HashJoin + // ← Agg → LE(LOCAL_HASH) → LE(PASSTHROUGH) → Repeat → scan(t1) + // ← LE(PASS_TO_ONE) → Exchange → scan(t2) + setupLocalShuffleSession(null); + assertPlanShape( + "select u.k1, count(*) from (select k1, k2 from test.t1 group by grouping sets((k1), (k1, k2))) u " + + "join test.t2 b on u.k1 = b.k1 group by u.k1", + anyTree( + agg( + hashJoin( + agg( + localExchange(LOCAL_HASH, + localExchange(PT, + anyTree(olapScan("t1"))))), + localExchange(PASS_TO_ONE_LE, + anyTree(exchange())))))); + } + + @Test + public void testUnionAllScanAndValues() throws Exception { + // Tier B (borrowed from Trino): UNION ALL of a real OlapScan and inline + // VALUES rows. The values branches flow through their own fragments and + // land at a UnionNode that gathers via Exchange. Verifies the + // scan-side LE(PASSTHROUGH) heavy-op fan-out is still inserted while the + // values branches contribute no local exchanges (already serial sources). + setupLocalShuffleSession(null); + assertPlanShape("select k1 from test.t1 union all select 1 union all select 2", + anyTree( + union( + anyTree(exchange())))); + } + + private EnumSet collectLocalExchangeTypes(List fragments) { + EnumSet types = EnumSet.noneOf(LocalExchangeType.class); + for (PlanFragment fragment : fragments) { + collect(fragment.getPlanRoot(), types); + } + return types; + } + + private List collectLocalExchangeNodes(List fragments) { + List nodes = new ArrayList<>(); + for (PlanFragment fragment : fragments) { + collectLocalExchangeNode(fragment.getPlanRoot(), nodes); + } + return nodes; + } + + private String collectFragmentExplain(List fragments) { + StringBuilder explain = new StringBuilder(); + for (PlanFragment fragment : fragments) { + explain.append(fragment.getExplainString(TExplainLevel.NORMAL)); + } + return explain.toString(); + } + + private void collect(PlanNode node, EnumSet types) { + if (node instanceof LocalExchangeNode) { + types.add(((LocalExchangeNode) node).getExchangeType()); + } + for (PlanNode child : node.getChildren()) { + collect(child, types); + } + } + + private void collectLocalExchangeNode(PlanNode node, List nodes) { + if (node instanceof LocalExchangeNode) { + nodes.add((LocalExchangeNode) node); + } + for (PlanNode child : node.getChildren()) { + collectLocalExchangeNode(child, nodes); + } + } + + private static class MockPlanNode extends PlanNode { + MockPlanNode(PlanNodeId id) { + super(id, "MOCK-PLAN"); + } + + @Override + protected void toThrift(TPlanNode msg) { + } + + @Override + public String getNodeExplainString(String prefix, TExplainLevel detailLevel) { + return ""; + } + } + + private static class MockScanNode extends ScanNode { + MockScanNode(PlanNodeId id) { + super(id, new TupleDescriptor(new TupleId(id.asInt())), "MOCK-SCAN", ScanContext.EMPTY); + } + + @Override + protected void createScanRangeLocations() throws UserException { + } + + @Override + public List getScanRangeLocations(long maxScanRangeLength) { + return java.util.Collections.emptyList(); + } + + @Override + protected void toThrift(TPlanNode msg) { + } + + @Override + public String getNodeExplainString(String prefix, TExplainLevel detailLevel) { + return ""; + } + } +} diff --git a/gensrc/thrift/PaloInternalService.thrift b/gensrc/thrift/PaloInternalService.thrift index 211f38dccb5896..772111645acd11 100644 --- a/gensrc/thrift/PaloInternalService.thrift +++ b/gensrc/thrift/PaloInternalService.thrift @@ -471,7 +471,6 @@ struct TQueryOptions { // session variable `spill_repartition_max_depth` in FE. Default is 8. 209: optional i32 spill_repartition_max_depth = 8 - 210: optional double max_scan_mem_ratio = 0.3; 211: optional bool enable_adaptive_scan = false; @@ -493,6 +492,10 @@ struct TQueryOptions { 219: optional bool enable_segment_limit_pushdown = true 220: optional bool enable_ann_index_result_cache = true + + // enable plan local exchange node in fe + 221: optional bool enable_local_shuffle_planner; + // For cloud, to control if the content would be written into file cache // In write path, to control if the content would be written into file cache. // In read path, read from file cache or remote storage when execute query. diff --git a/gensrc/thrift/Partitions.thrift b/gensrc/thrift/Partitions.thrift index de6ba8127affdd..bf23f418eaf5ce 100644 --- a/gensrc/thrift/Partitions.thrift +++ b/gensrc/thrift/Partitions.thrift @@ -55,6 +55,40 @@ enum TPartitionType { MERGE_PARTITIONED = 9 } +enum TLocalPartitionType { + NOOP = 0, + // used to resume the global hash distribution because other distribution break the global hash distribution, + // such as PASSTHROUGH. and then JoinNode can shuffle data by the same hash distribution. + // + // for example: look here, need resume to GLOBAL_EXECUTION_HASH_SHUFFLE + // ↓ + // Node -> LocalExchangeNode(PASSTHROUGH) → JoinNode → LocalExchangeNode(GLOBAL_EXECUTION_HASH_SHUFFLE) → JoinNode + // ExchangeNode(BROADCAST) ↗ ↑ + // ExchangeNode(GLOBAL_EXECUTION_HASH_SHUFFLE) + GLOBAL_EXECUTION_HASH_SHUFFLE = 1, + // used to rebalance data for rebalance data and add parallelism + // + // for example: look here, need use LOCAL_EXECUTION_HASH_SHUFFLE to rebalance data + // ↓ + // Scan(hash(id)) -> LocalExchangeNode(LOCAL_EXECUTION_HASH_SHUFFLE(id, name)) → AggregationNode(group by(id,name)) + // + // the LOCAL_EXECUTION_HASH_SHUFFLE is necessary because the hash distribution of scan node is based on id, + // but the hash distribution of aggregation node is based on id and name, so we need to rebalance data by both + // id and name to make sure the data with same id and name can be sent to the same instance of aggregation node. + // and we can not use GLOBAL_EXECUTION_HASH_SHUFFLE(id, name) here, because + // `TPipelineFragmentParams.shuffle_idx_to_instance_idx` is used to mapping partial global instance index to local + // instance index, and discard the other backend's instance index, the data not belong to the local instance will be + // discarded, which cause data loss. + LOCAL_EXECUTION_HASH_SHUFFLE = 2, + BUCKET_HASH_SHUFFLE = 3, + // round-robin partition, used to rebalance data for rebalance data and add parallelism + PASSTHROUGH = 4, + ADAPTIVE_PASSTHROUGH = 5, + BROADCAST = 6, + PASS_TO_ONE = 7, + LOCAL_MERGE_SORT = 8 +} + enum TDistributionType { UNPARTITIONED = 0, @@ -119,3 +153,5 @@ struct TDataPartition { 3: optional list partition_infos 4: optional TMergePartitionInfo merge_partition_info } + + diff --git a/gensrc/thrift/PlanNodes.thrift b/gensrc/thrift/PlanNodes.thrift index 757751d3a46724..4cdfd440feb122 100644 --- a/gensrc/thrift/PlanNodes.thrift +++ b/gensrc/thrift/PlanNodes.thrift @@ -63,7 +63,8 @@ enum TPlanNodeType { MATERIALIZATION_NODE = 34, REC_CTE_NODE = 35, REC_CTE_SCAN_NODE = 36, - BUCKETED_AGGREGATION_NODE = 37 + BUCKETED_AGGREGATION_NODE = 37, + LOCAL_EXCHANGE_NODE = 38 } struct TKeyRange { @@ -1368,6 +1369,24 @@ struct TExchangeNode { 4: optional Partitions.TPartitionType partition_type } +struct TLocalExchangeNode { + 1: required Partitions.TLocalPartitionType partition_type + // when partition_type in (GLOBAL_EXECUTION_HASH_SHUFFLE, LOCAL_EXECUTION_HASH_SHUFFLE, BUCKET_HASH_SHUFFLE), + // the distribute_expr_lists is not null, and the legacy `TPlanNode.distribute_expr_lists` is deprecated + // + // the hash computation: + // 1. for BUCKET_HASH_SHUFFLE, use distribution_exprs to compute hash value and mod by + // `TPipelineFragmentParams.num_buckets`, and mapping bucket index to local instance id by + // `TPipelineFragmentParams.bucket_seq_to_instance_idx` + // 2. for LOCAL_EXECUTION_HASH_SHUFFLE, use distribution_exprs to compute hash value and mod by + // `TPipelineFragmentParams.local_params.size`, and backend will mapping instance index to local instance + // by `i -> i`, for example: 1 -> 1, 2 -> 2, ... + // 3. for GLOBAL_EXECUTION_HASH_SHUFFLE, use distribution_exprs to compute hash value and mod by + // `TPipelineFragmentParams.total_instances`, and mapping global instance index to local instance by + // `TPipelineFragmentParams.shuffle_idx_to_instance_idx` + 2: optional list distribute_expr_lists +} + struct TOlapRewriteNode { 1: required list columns 2: required list column_types @@ -1585,6 +1604,7 @@ struct TPlanNode { 51: optional bool is_serial_operator 52: optional TRecCTEScanNode rec_cte_scan_node 53: optional TBucketedAggregationNode bucketed_agg_node + 54: optional TLocalExchangeNode local_exchange_node // projections is final projections, which means projecting into results and materializing them into the output block. 101: optional list projections diff --git a/regression-test/data/nereids_function_p0/gen_function/unnest_order_by_list_test.out b/regression-test/data/nereids_function_p0/gen_function/unnest_order_by_list_test.out index 31141b95eaeaa5..f2f86a8bc5c993 100644 --- a/regression-test/data/nereids_function_p0/gen_function/unnest_order_by_list_test.out +++ b/regression-test/data/nereids_function_p0/gen_function/unnest_order_by_list_test.out @@ -63,13 +63,13 @@ Alice Charlie -- !window_function_order_by_unnested_value -- -1 English 90 1 1 English 85 2 -1 Math 95 1 -1 Math 80 2 +1 English 90 1 1 Math 70 3 -2 Math 75 1 +1 Math 80 2 +1 Math 95 1 2 Math 60 2 +2 Math 75 1 -- !order_by_after_where_and_unnest -- 1 0 80 diff --git a/regression-test/data/query_p0/join/test_multilevel_join_agg_local_shuffle.out b/regression-test/data/query_p0/join/test_multilevel_join_agg_local_shuffle.out new file mode 100644 index 00000000000000..652df5e654d66b --- /dev/null +++ b/regression-test/data/query_p0/join/test_multilevel_join_agg_local_shuffle.out @@ -0,0 +1,815 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !bucket_shuffle_broadcast_agg_stage_1_result_on -- +1 52 11 10 +2 9 7 20 +3 8 8 30 +4 10 9 40 + +-- !bucket_shuffle_broadcast_agg_stage_1_result_off -- +1 52 11 10 +2 9 7 20 +3 8 8 30 +4 10 9 40 + +-- !bucket_shuffle_broadcast_agg_stage_2_result_on -- +1 52 11 10 +2 9 7 20 +3 8 8 30 +4 10 9 40 + +-- !bucket_shuffle_broadcast_agg_stage_2_result_off -- +1 52 11 10 +2 9 7 20 +3 8 8 30 +4 10 9 40 + +-- !bucket_shuffle_broadcast_agg_stage_3_result_on -- +1 52 11 10 +2 9 7 20 +3 8 8 30 +4 10 9 40 + +-- !bucket_shuffle_broadcast_agg_stage_3_result_off -- +1 52 11 10 +2 9 7 20 +3 8 8 30 +4 10 9 40 + +-- !shuffle_broadcast_broadcast_agg_stage_1_result_on -- +1 10 62 10 +2 5 54 20 +3 5 38 30 +4 6 49 40 + +-- !shuffle_broadcast_broadcast_agg_stage_1_result_off -- +1 10 62 10 +2 5 54 20 +3 5 38 30 +4 6 49 40 + +-- !shuffle_broadcast_broadcast_agg_stage_2_result_on -- +1 10 62 10 +2 5 54 20 +3 5 38 30 +4 6 49 40 + +-- !shuffle_broadcast_broadcast_agg_stage_2_result_off -- +1 10 62 10 +2 5 54 20 +3 5 38 30 +4 6 49 40 + +-- !shuffle_broadcast_broadcast_agg_stage_3_result_on -- +1 10 62 10 +2 5 54 20 +3 5 38 30 +4 6 49 40 + +-- !shuffle_broadcast_broadcast_agg_stage_3_result_off -- +1 10 62 10 +2 5 54 20 +3 5 38 30 +4 6 49 40 + +-- !bucket_broadcast_shuffle_agg_stage_1_result_on -- +1 26 11 10 +2 9 7 20 +3 8 8 30 +4 10 9 40 + +-- !bucket_broadcast_shuffle_agg_stage_1_result_off -- +1 26 11 10 +2 9 7 20 +3 8 8 30 +4 10 9 40 + +-- !bucket_broadcast_shuffle_agg_stage_2_result_on -- +1 26 11 10 +2 9 7 20 +3 8 8 30 +4 10 9 40 + +-- !bucket_broadcast_shuffle_agg_stage_2_result_off -- +1 26 11 10 +2 9 7 20 +3 8 8 30 +4 10 9 40 + +-- !bucket_broadcast_shuffle_agg_stage_3_result_on -- +1 26 11 10 +2 9 7 20 +3 8 8 30 +4 10 9 40 + +-- !bucket_broadcast_shuffle_agg_stage_3_result_off -- +1 26 11 10 +2 9 7 20 +3 8 8 30 +4 10 9 40 + +-- !alternating_bucket_bucket_bucket_result_on -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_bucket_bucket_bucket_result_off -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_bucket_bucket_shuffle_result_on -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_bucket_bucket_shuffle_result_off -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_bucket_bucket_broadcast_result_on -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_bucket_bucket_broadcast_result_off -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_bucket_shuffle_bucket_result_on -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_bucket_shuffle_bucket_result_off -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_bucket_shuffle_shuffle_result_on -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_bucket_shuffle_shuffle_result_off -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_bucket_shuffle_broadcast_result_on -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_bucket_shuffle_broadcast_result_off -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_bucket_broadcast_bucket_result_on -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_bucket_broadcast_bucket_result_off -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_bucket_broadcast_shuffle_result_on -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_bucket_broadcast_shuffle_result_off -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_bucket_broadcast_broadcast_result_on -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_bucket_broadcast_broadcast_result_off -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_shuffle_bucket_bucket_result_on -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_shuffle_bucket_bucket_result_off -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_shuffle_bucket_shuffle_result_on -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_shuffle_bucket_shuffle_result_off -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_shuffle_bucket_broadcast_result_on -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_shuffle_bucket_broadcast_result_off -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_shuffle_shuffle_bucket_result_on -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_shuffle_shuffle_bucket_result_off -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_shuffle_shuffle_shuffle_result_on -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_shuffle_shuffle_shuffle_result_off -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_shuffle_shuffle_broadcast_result_on -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_shuffle_shuffle_broadcast_result_off -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_shuffle_broadcast_bucket_result_on -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_shuffle_broadcast_bucket_result_off -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_shuffle_broadcast_shuffle_result_on -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_shuffle_broadcast_shuffle_result_off -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_shuffle_broadcast_broadcast_result_on -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_shuffle_broadcast_broadcast_result_off -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_broadcast_bucket_bucket_result_on -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_broadcast_bucket_bucket_result_off -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_broadcast_bucket_shuffle_result_on -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_broadcast_bucket_shuffle_result_off -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_broadcast_bucket_broadcast_result_on -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_broadcast_bucket_broadcast_result_off -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_broadcast_shuffle_bucket_result_on -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_broadcast_shuffle_bucket_result_off -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_broadcast_shuffle_shuffle_result_on -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_broadcast_shuffle_shuffle_result_off -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_broadcast_shuffle_broadcast_result_on -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_broadcast_shuffle_broadcast_result_off -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_broadcast_broadcast_bucket_result_on -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_broadcast_broadcast_bucket_result_off -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_broadcast_broadcast_shuffle_result_on -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_broadcast_broadcast_shuffle_result_off -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_broadcast_broadcast_broadcast_result_on -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !alternating_broadcast_broadcast_broadcast_result_off -- +1 63 16 11 +2 16 24 22 +3 16 35 33 +4 19 46 44 + +-- !window_union_all_bucket_row_number_result_on -- +1 52 4 40 +2 21 3 60 +3 16 2 60 +4 20 2 80 + +-- !window_union_all_bucket_row_number_result_off -- +1 52 4 40 +2 21 3 60 +3 16 2 60 +4 20 2 80 + +-- !window_union_all_bucket_window_sum_result_on -- +1 10 13 40 +2 6 7 60 +3 3 8 60 +4 3 10 80 + +-- !window_union_all_bucket_window_sum_result_off -- +1 10 13 40 +2 6 7 60 +3 3 8 60 +4 3 10 80 + +-- !window_union_all_shuffle_row_number_result_on -- +1 208 8 44 +2 21 3 21 +3 16 2 16 +4 20 2 18 + +-- !window_union_all_shuffle_row_number_result_off -- +1 208 8 44 +2 21 3 21 +3 16 2 16 +4 20 2 18 + +-- !window_union_all_shuffle_window_sum_result_on -- +1 36 26 44 +2 6 7 21 +3 3 8 16 +4 3 10 18 + +-- !window_union_all_shuffle_window_sum_result_off -- +1 36 26 44 +2 6 7 21 +3 3 8 16 +4 3 10 18 + +-- !window_union_all_broadcast_row_number_result_on -- +1 52 4 40 +2 21 3 60 +3 16 2 60 +4 20 2 80 + +-- !window_union_all_broadcast_row_number_result_off -- +1 52 4 40 +2 21 3 60 +3 16 2 60 +4 20 2 80 + +-- !window_union_all_broadcast_window_sum_result_on -- +1 10 13 40 +2 6 7 60 +3 3 8 60 +4 3 10 80 + +-- !window_union_all_broadcast_window_sum_result_off -- +1 10 13 40 +2 6 7 60 +3 3 8 60 +4 3 10 80 + +-- !window_except_bucket_row_number_result_on -- +1 10 2 20 +2 10 2 40 +3 5 1 30 +4 6 1 40 + +-- !window_except_bucket_row_number_result_off -- +1 10 2 20 +2 10 2 40 +3 5 1 30 +4 6 1 40 + +-- !window_except_bucket_window_sum_result_on -- +1 3 5 20 +2 3 5 40 +3 1 5 30 +4 1 6 40 + +-- !window_except_bucket_window_sum_result_off -- +1 3 5 20 +2 3 5 40 +3 1 5 30 +4 1 6 40 + +-- !window_except_shuffle_row_number_result_on -- +1 40 4 22 +2 10 2 14 +3 5 1 8 +4 6 1 9 + +-- !window_except_shuffle_row_number_result_off -- +1 40 4 22 +2 10 2 14 +3 5 1 8 +4 6 1 9 + +-- !window_except_shuffle_window_sum_result_on -- +1 10 10 22 +2 3 5 14 +3 1 5 8 +4 1 6 9 + +-- !window_except_shuffle_window_sum_result_off -- +1 10 10 22 +2 3 5 14 +3 1 5 8 +4 1 6 9 + +-- !window_except_broadcast_row_number_result_on -- +1 10 2 20 +2 10 2 40 +3 5 1 30 +4 6 1 40 + +-- !window_except_broadcast_row_number_result_off -- +1 10 2 20 +2 10 2 40 +3 5 1 30 +4 6 1 40 + +-- !window_except_broadcast_window_sum_result_on -- +1 3 5 20 +2 3 5 40 +3 1 5 30 +4 1 6 40 + +-- !window_except_broadcast_window_sum_result_off -- +1 3 5 20 +2 3 5 40 +3 1 5 30 +4 1 6 40 + +-- !window_intersect_bucket_row_number_result_on -- + +-- !window_intersect_bucket_row_number_result_off -- + +-- !window_intersect_bucket_window_sum_result_on -- + +-- !window_intersect_bucket_window_sum_result_off -- + +-- !window_intersect_shuffle_row_number_result_on -- + +-- !window_intersect_shuffle_row_number_result_off -- + +-- !window_intersect_shuffle_window_sum_result_on -- + +-- !window_intersect_shuffle_window_sum_result_off -- + +-- !window_intersect_broadcast_row_number_result_on -- + +-- !window_intersect_broadcast_row_number_result_off -- + +-- !window_intersect_broadcast_window_sum_result_on -- + +-- !window_intersect_broadcast_window_sum_result_off -- + +-- !bucket_broadcast_agg_result_on -- +1 26 10 +2 9 20 +3 8 30 +4 10 40 + +-- !bucket_broadcast_agg_result_off -- +1 26 10 +2 9 20 +3 8 30 +4 10 40 + +-- !partitioned_broadcast_agg_result_on -- +1 32 10 +2 19 20 +3 13 30 +4 15 40 + +-- !partitioned_broadcast_agg_result_off -- +1 32 10 +2 19 20 +3 13 30 +4 15 40 + +-- !bucket_partitioned_agg_result_on -- +1 52 44 +2 9 14 +3 8 8 +4 10 9 + +-- !bucket_partitioned_agg_result_off -- +1 52 44 +2 9 14 +3 8 8 +4 10 9 + +-- !all_three_multilevel_agg_result_on -- +1 96 10 +2 23 20 +3 16 30 +4 19 40 + +-- !all_three_multilevel_agg_result_off -- +1 96 10 +2 23 20 +3 16 30 +4 19 40 + +-- !agg_join_agg_mix_result_on -- +1 10 11 10 +2 5 7 20 +3 5 8 30 +4 6 9 40 + +-- !agg_join_agg_mix_result_off -- +1 10 11 10 +2 5 7 20 +3 5 8 30 +4 6 9 40 + +-- !double_broadcast_after_bucket_result_on -- +1 26 20 +2 9 40 +3 8 60 +4 10 80 + +-- !double_broadcast_after_bucket_result_off -- +1 26 20 +2 9 40 +3 8 60 +4 10 80 + +-- !partitioned_join_between_two_aggs_then_broadcast_result_on -- +1 16 10 +2 12 20 +3 13 30 +4 15 40 + +-- !partitioned_join_between_two_aggs_then_broadcast_result_off -- +1 16 10 +2 12 20 +3 13 30 +4 15 40 + +-- !bucket_shuffle_broadcast_two_stage_agg_result_on -- +1 52 44 10 +2 9 14 20 +3 8 8 30 +4 10 9 40 + +-- !bucket_shuffle_broadcast_two_stage_agg_result_off -- +1 52 44 10 +2 9 14 20 +3 8 8 30 +4 10 9 40 + +-- !left_join_null_preserving_with_multilevel_agg_result_on -- +1 5 44 +2 5 14 +3 5 8 +4 6 9 + +-- !left_join_null_preserving_with_multilevel_agg_result_off -- +1 5 44 +2 5 14 +3 5 8 +4 6 9 + +-- !seven_layer_bucket_shuffle_broadcast_result_on -- +1 37 10 +2 16 20 +3 16 30 +4 19 40 + +-- !seven_layer_bucket_shuffle_broadcast_result_off -- +1 37 10 +2 16 20 +3 16 30 +4 19 40 + +-- !eight_layer_mixed_join_agg_chain_result_on -- +1 52 44 20 +2 9 14 40 +3 8 8 60 +4 10 9 80 + +-- !eight_layer_mixed_join_agg_chain_result_off -- +1 52 44 20 +2 9 14 40 +3 8 8 60 +4 10 9 80 + +-- !seven_layer_left_join_mix_result_on -- +1 10 11 10 +2 5 7 20 +3 5 8 30 +4 6 9 40 + +-- !seven_layer_left_join_mix_result_off -- +1 10 11 10 +2 5 7 20 +3 5 8 30 +4 6 9 40 + +-- !broadcast_shuffle_broadcast_nested_agg_result_on -- +1 26 11 10 +2 9 7 20 +3 8 8 30 +4 10 9 40 + +-- !broadcast_shuffle_broadcast_nested_agg_result_off -- +1 26 11 10 +2 9 7 20 +3 8 8 30 +4 10 9 40 + +-- !window_union_join_agg_result_on -- +1 13 14 +2 7 23 +3 8 32 +4 10 42 + +-- !window_union_join_agg_result_off -- +1 13 14 +2 7 23 +3 8 32 +4 10 42 + +-- !window_except_join_agg_result_on -- +1 1 10 +2 1 20 +3 1 30 + +-- !window_except_join_agg_result_off -- +1 1 10 +2 1 20 +3 1 30 + +-- !window_intersect_shuffle_agg_result_on -- +1 22 2 +2 7 1 +3 8 1 +4 9 1 + +-- !window_intersect_shuffle_agg_result_off -- +1 22 2 +2 7 1 +3 8 1 +4 9 1 + +-- !window_union_except_broadcast_agg_result_on -- +1 13 4 10 +2 7 3 20 +3 8 2 30 +4 10 2 40 + +-- !window_union_except_broadcast_agg_result_off -- +1 13 4 10 +2 7 3 20 +3 8 2 30 +4 10 2 40 + +-- !window_setop_join_agg_chain_result_on -- +1 20 2 22 +2 10 2 14 +3 5 1 8 +4 6 1 9 + +-- !window_setop_join_agg_chain_result_off -- +1 20 2 22 +2 10 2 14 +3 5 1 8 +4 6 1 9 + diff --git a/regression-test/plugins/plugin_profile_plan_tree.groovy b/regression-test/plugins/plugin_profile_plan_tree.groovy new file mode 100644 index 00000000000000..ee86810ca79b56 --- /dev/null +++ b/regression-test/plugins/plugin_profile_plan_tree.groovy @@ -0,0 +1,274 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.Suite + +/** + * Parses the MergedProfile section of a Doris query profile and returns + * the pipeline operator tree in an explain-like format. + * + * Profile text structure (MergedProfile section): + * MergedProfile: + * Fragments: + * Fragment N: ← indent level A + * Pipeline N(instance_num=M): ← indent level B (B = A + 2) + * - WaitWorkerTime: ... ← pipeline stats (skip) + * OPERATOR_NAME(...): ← indent level C (C = B + 2) + * - PlanInfo ← plan info marker + * - key: value ← plan info item + * CommonCounters: ← counter section start (stop reading plan info) + * - ... + * + * Usage in a test suite: + * // From raw profile text: + * def tree = profile_plan_tree(profileText) + * logger.info(tree) + * + * // From a query_id (fetches profile via HTTP): + * def tree = profile_plan_tree_from_id(queryId) + * logger.info(tree) + */ + +// --------------------------------------------------------------------------- +// profile_plan_tree(profileText) → formatted String +// --------------------------------------------------------------------------- +Suite.metaClass.profile_plan_tree = { String profileText -> + + // ── Locate MergedProfile section ────────────────────────────────────── + def mergedStart = profileText.indexOf("MergedProfile:") + if (mergedStart == -1) { + return "(no MergedProfile section found in profile)" + } + def mergedSection = profileText.substring(mergedStart) + + // ── Line-by-line state machine ───────────────────────────────────────── + // We detect indent levels dynamically: the first Fragment line sets the + // baseline; Pipeline and Operator levels follow by relative indentation. + int fragmentIndent = -1 // indent of "Fragment N:" lines + int pipelineIndent = -1 // indent of "Pipeline N(...):" lines + int operatorIndent = -1 // indent of "OPERATOR_NAME(...):" lines + int planInfoIndent = -1 // indent of "- PlanInfo" lines + int planItemIndent = -1 // indent of "- key: value" plan info items + int counterIndent = -1 // indent of "CommonCounters:" / "CustomCounters:" + + // Parsed tree: list of fragments, each with pipelines, each with operators + def fragments = [] // [{name, pipelines:[{name,instanceNum,ops:[{name,planInfo:[]}]}]}] + + def curFragName = null + def curPipeName = null + def curOpName = null + def curPlanInfo = [] // accumulated plan-info lines for current operator + def inPlanInfo = false + def curPipeOps = [] // operators in current pipeline + def curFragPipes = [] // pipelines in current fragment + + def flushOp = { + if (curOpName != null) { + curPipeOps << [name: curOpName, planInfo: new ArrayList(curPlanInfo)] + curOpName = null + curPlanInfo.clear() + inPlanInfo = false + } + } + + def flushPipeline = { + flushOp() + if (curPipeName != null) { + curFragPipes << [name: curPipeName, ops: new ArrayList(curPipeOps)] + curPipeName = null + curPipeOps.clear() + } + } + + def flushFragment = { + flushPipeline() + if (curFragName != null) { + fragments << [name: curFragName, pipelines: new ArrayList(curFragPipes)] + curFragName = null + curFragPipes.clear() + } + } + + for (def rawLine : mergedSection.split("\n")) { + // Count leading spaces + int spaces = 0 + for (char c : rawLine.toCharArray()) { + if (c == ' ') spaces++ + else break + } + def content = rawLine.trim() + if (content.isEmpty()) continue + + // ── Fragment ─────────────────────────────────────────────────────── + if (content =~ /^Fragment \d+:$/) { + if (fragmentIndent == -1) fragmentIndent = spaces + if (spaces == fragmentIndent) { + flushFragment() + curFragName = content[0..-2] // strip trailing ':' + // Reset derived indent markers when a new fragment begins + pipelineIndent = -1 + operatorIndent = -1 + planInfoIndent = -1 + planItemIndent = -1 + counterIndent = -1 + continue + } + } + + // Only process further if we are inside a Fragment + if (curFragName == null) continue + + // ── Pipeline ─────────────────────────────────────────────────────── + if (content =~ /^Pipeline \d+\(instance_num=\d+\):$/) { + if (pipelineIndent == -1) pipelineIndent = spaces + if (spaces == pipelineIndent) { + flushPipeline() + def m = content =~ /^Pipeline (\d+)\(instance_num=(\d+)\):$/ + if (m.find()) { + curPipeName = "Pipeline ${m.group(1)} (instances=${m.group(2)})" + } else { + curPipeName = content[0..-2] + } + // Reset operator-level indent markers per pipeline + operatorIndent = -1 + planInfoIndent = -1 + planItemIndent = -1 + counterIndent = -1 + continue + } + } + + // Only process further if we are inside a Pipeline + if (curPipeName == null) continue + + // ── CommonCounters / CustomCounters ──────────────────────────────── + // These appear inside an operator block and signal end of PlanInfo. + if (content == "CommonCounters:" || content == "CustomCounters:") { + if (counterIndent == -1) counterIndent = spaces + if (spaces == counterIndent) { + inPlanInfo = false + continue + } + } + + // ── PlanInfo marker ──────────────────────────────────────────────── + if (content == "- PlanInfo") { + if (planInfoIndent == -1) planInfoIndent = spaces + if (spaces == planInfoIndent) { + inPlanInfo = true + continue + } + } + + // ── PlanInfo item ────────────────────────────────────────────────── + if (inPlanInfo && content.startsWith("- ")) { + if (planItemIndent == -1) planItemIndent = spaces + if (spaces == planItemIndent) { + curPlanInfo << content.substring(2) // strip leading "- " + continue + } + } + + // ── Operator line ────────────────────────────────────────────────── + // Operator names start with an uppercase letter, contain only + // A-Z, 0-9, _, (, ) characters, and end with ':'. + // Skip counter section headers (CommonCounters / CustomCounters already handled above). + if (content.endsWith(":") && content =~ /^[A-Z][A-Z0-9_]/) { + // Ignore pure counter/info headers that are not operators + if (content == "CommonCounters:" || content == "CustomCounters:" || + content == "PlanInfo:") { + continue + } + if (operatorIndent == -1) operatorIndent = spaces + if (spaces == operatorIndent) { + flushOp() + curOpName = content[0..-2] // strip trailing ':' + inPlanInfo = false + planInfoIndent = -1 + planItemIndent = -1 + counterIndent = -1 + continue + } + } + } + + flushFragment() + + // ── Format output ────────────────────────────────────────────────────── + // Similar to explain plan: + // Fragment N: + // Pipeline M (instances=K): + // OPERATOR_NAME (...) + // | plan-info-key: value + // | ... + if (fragments.isEmpty()) { + return "(MergedProfile found but no fragments could be parsed)" + } + + def sb = new StringBuilder() + for (def frag : fragments) { + sb.append("${frag.name}:\n") + for (def pipe : frag.pipelines) { + sb.append(" ${pipe.name}:\n") + for (def op : pipe.ops) { + sb.append(" ${op.name}\n") + for (def pi : op.planInfo) { + sb.append(" | ${pi}\n") + } + } + } + } + + return sb.toString() +} + +// --------------------------------------------------------------------------- +// profile_plan_tree_from_id(queryId) → formatted String +// Fetches profile via HTTP then calls profile_plan_tree. +// --------------------------------------------------------------------------- +Suite.metaClass.profile_plan_tree_from_id = { String queryId -> + Suite suite = delegate as Suite + def dst = 'http://' + suite.context.config.feHttpAddress + def conn = new URL("${dst}/api/profile/text?query_id=${queryId}").openConnection() + conn.setRequestMethod("GET") + def user = suite.context.config.feHttpUser ?: "root" + def pass = suite.context.config.feHttpPassword ?: "" + def encoding = Base64.getEncoder().encodeToString("${user}:${pass}".getBytes("UTF-8")) + conn.setRequestProperty("Authorization", "Basic ${encoding}") + conn.setConnectTimeout(5000) + conn.setReadTimeout(15000) + def profileText = conn.getInputStream().getText() + def tree = suite.profile_plan_tree(profileText) + return tree.split('\n').findAll { !it.startsWith(' | ') }.join('\n') +} + +// --------------------------------------------------------------------------- +// profile_plan_tree_from_sql(testSql) → formatted String +// Executes the SQL with profiling enabled and SQL cache disabled, +// waits for the profile to be collected, then returns the operator tree. +// --------------------------------------------------------------------------- +Suite.metaClass.profile_plan_tree_from_sql = { String testSql -> + Suite suite = delegate as Suite + suite.sql "set enable_profile=true;" + suite.sql "set enable_sql_cache=false;" + suite.sql testSql + def qid = suite.sql("select last_query_id()")[0][0] as String + Thread.sleep(1500) + return suite.profile_plan_tree_from_id(qid) +} + +logger.info("Added 'profile_plan_tree', 'profile_plan_tree_from_id' and 'profile_plan_tree_from_sql' to Suite") diff --git a/regression-test/suites/nereids_function_p0/gen_function/unnest_order_by_list_test.groovy b/regression-test/suites/nereids_function_p0/gen_function/unnest_order_by_list_test.groovy index af502abe1fef71..8f441532fee672 100644 --- a/regression-test/suites/nereids_function_p0/gen_function/unnest_order_by_list_test.groovy +++ b/regression-test/suites/nereids_function_p0/gen_function/unnest_order_by_list_test.groovy @@ -87,11 +87,11 @@ suite("unnest_order_by_list_test", "unnest") { (2, 'Math', [60, 75]);""" // Test using the unnested value within the ORDER BY clause of a window function. - qt_window_function_order_by_unnested_value """ - SELECT - user_id, - subject, - s.val, + order_qt_window_function_order_by_unnested_value """ + SELECT + user_id, + subject, + s.val, RANK() OVER (PARTITION BY user_id, subject ORDER BY s.val DESC) as score_rank FROM ${tb_name2}, UNNEST(history_scores) AS s(val);""" diff --git a/regression-test/suites/nereids_p0/local_shuffle/test_enable_local_exchange_before_agg.groovy b/regression-test/suites/nereids_p0/local_shuffle/test_enable_local_exchange_before_agg.groovy new file mode 100644 index 00000000000000..33fd45951f66a7 --- /dev/null +++ b/regression-test/suites/nereids_p0/local_shuffle/test_enable_local_exchange_before_agg.groovy @@ -0,0 +1,157 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/** + * Regression test for enable_local_exchange_before_agg (apache/doris#62438). + * + * When enable_local_exchange_before_agg=true (default), BE inserts HASH local exchange + * before pre-agg operators. When enable_local_shuffle_planner=true, FE does the same. + * This test verifies correctness under both combinations. + */ +suite("test_enable_local_exchange_before_agg", "p0") { + + sql "DROP TABLE IF EXISTS le_agg_t1" + sql """ + CREATE TABLE le_agg_t1 ( + k1 INT NOT NULL, + k2 VARCHAR(32), + v1 INT, + v2 BIGINT + ) ENGINE=OLAP + DUPLICATE KEY(k1) + DISTRIBUTED BY HASH(k1) BUCKETS 8 + PROPERTIES ("replication_num" = "1") + """ + + sql """INSERT INTO le_agg_t1 VALUES + (1, 'a', 10, 100), (1, 'a', 20, 200), (1, 'b', 30, 300), + (2, 'a', 40, 400), (2, 'b', 50, 500), (2, 'b', 60, 600), + (3, 'c', 70, 700), (3, 'c', 80, 800), (3, 'd', 90, 900), + (4, 'a', 100, 1000), (4, 'd', 110, 1100), (4, 'd', 120, 1200), + (5, 'e', 130, 1300), (5, 'e', 140, 1400), (5, 'f', 150, 1500) + """ + + def PH = "PLACEHOLDER" + + def beHints = """/*+SET_VAR( + parallel_pipeline_task_num=4, + enable_local_shuffle=true, + ignore_storage_data_distribution=true, + enable_local_shuffle_planner=false, + enable_local_exchange_before_agg=true + )*/""" + + def feHints = """/*+SET_VAR( + parallel_pipeline_task_num=4, + enable_local_shuffle=true, + ignore_storage_data_distribution=true, + enable_local_shuffle_planner=true, + enable_local_exchange_before_agg=true + )*/""" + + def noLeBeforeAggHints = """/*+SET_VAR( + parallel_pipeline_task_num=4, + enable_local_shuffle=true, + ignore_storage_data_distribution=true, + enable_local_shuffle_planner=true, + enable_local_exchange_before_agg=false + )*/""" + + // Forces LOCAL preagg through the non-streaming AggSink / DistinctStreaming paths + // (instead of the StreamingAgg branch). Combined with enable_local_exchange_before_agg=false, + // this exercises the phase-aware fix: + // - AggSink: !isMerge() LOCAL phase → base (PASSTHROUGH/NOOP) + // FIRST_MERGE → HASH (correctness, regardless of flag) + // - DistinctStreaming: useStreamingPreagg=true → base + // useStreamingPreagg=false → HASH (correctness, regardless of flag) + def disableStreamingHints = """/*+SET_VAR( + parallel_pipeline_task_num=4, + enable_local_shuffle=true, + ignore_storage_data_distribution=true, + enable_local_shuffle_planner=true, + enable_local_exchange_before_agg=false, + disable_streaming_preaggregations=true + )*/""" + + def disableStreamingFlagOnHints = """/*+SET_VAR( + parallel_pipeline_task_num=4, + enable_local_shuffle=true, + ignore_storage_data_distribution=true, + enable_local_shuffle_planner=true, + enable_local_exchange_before_agg=true, + disable_streaming_preaggregations=true + )*/""" + + def queries = [ + "simple_agg_bucket_key": + "SELECT ${PH} k1, SUM(v1), COUNT(*) FROM le_agg_t1 GROUP BY k1 ORDER BY k1", + "simple_agg_non_bucket_key": + "SELECT ${PH} k2, SUM(v1), MAX(v2) FROM le_agg_t1 GROUP BY k2 ORDER BY k2", + "multi_key_agg": + "SELECT ${PH} k1, k2, SUM(v1) FROM le_agg_t1 GROUP BY k1, k2 ORDER BY k1, k2", + "distinct_non_bucket": + "SELECT ${PH} DISTINCT k2 FROM le_agg_t1 ORDER BY k2", + "count_distinct": + "SELECT ${PH} k1, COUNT(DISTINCT k2), SUM(v1) FROM le_agg_t1 GROUP BY k1 ORDER BY k1", + "agg_having": + "SELECT ${PH} k2, SUM(v1) AS s FROM le_agg_t1 GROUP BY k2 HAVING SUM(v1) > 100 ORDER BY k2", + "agg_after_join": + "SELECT ${PH} a.k2, SUM(a.v1) FROM le_agg_t1 a JOIN le_agg_t1 b ON a.k1 = b.k1 GROUP BY a.k2 ORDER BY a.k2", + "grouping_sets": + "SELECT ${PH} k1, k2, SUM(v1) FROM le_agg_t1 GROUP BY GROUPING SETS ((k1), (k2), (k1, k2)) ORDER BY k1, k2", + "window_over_agg": + "SELECT ${PH} k1, s, SUM(s) OVER (ORDER BY k1) AS running FROM (SELECT k1, SUM(v1) AS s FROM le_agg_t1 GROUP BY k1) t ORDER BY k1", + "multi_distinct": + "SELECT ${PH} COUNT(DISTINCT k1), COUNT(DISTINCT k2) FROM le_agg_t1", + ] + + // Part 1: FE-planned (enable_local_exchange_before_agg=true) vs BE-planned baseline + logger.info("=== Part 1: FE vs BE with enable_local_exchange_before_agg=true ===") + queries.each { name, template -> + def beResult = sql(template.replace(PH, beHints)) + def feResult = sql(template.replace(PH, feHints)) + assertEquals(beResult, feResult, "[${name}] FE-planned differs from BE-planned") + logger.info("[${name}] PASSED") + } + + // Part 2: enable_local_exchange_before_agg=false — verify no crash/hang and correct results + logger.info("=== Part 2: enable_local_exchange_before_agg=false ===") + queries.each { name, template -> + def beResult = sql(template.replace(PH, beHints)) + def noLeResult = sql(template.replace(PH, noLeBeforeAggHints)) + assertEquals(beResult, noLeResult, "[${name}] enable_local_exchange_before_agg=false differs from baseline") + logger.info("[${name}] enable_local_exchange_before_agg=false PASSED") + } + + // Part 3: disable_streaming_preaggregations=true — forces AggSink / DistinctStreaming + // non-streaming paths. Combined with the two flag values, exercises the phase-aware + // fix on both LOCAL (performance, flag controls) and MERGE/non-streaming-dedup + // (correctness, always HASH) sub-paths. + logger.info("=== Part 3: disable_streaming_preaggregations=true ===") + queries.each { name, template -> + def beResult = sql(template.replace(PH, beHints)) + def disableStreamingFlagOnResult = sql(template.replace(PH, disableStreamingFlagOnHints)) + assertEquals(beResult, disableStreamingFlagOnResult, + "[${name}] disable_streaming+flag=true differs from baseline") + def disableStreamingResult = sql(template.replace(PH, disableStreamingHints)) + assertEquals(beResult, disableStreamingResult, + "[${name}] disable_streaming+flag=false differs from baseline") + logger.info("[${name}] disable_streaming both flag values PASSED") + } + + logger.info("=== All enable_local_exchange_before_agg tests completed ===") +} diff --git a/regression-test/suites/nereids_p0/local_shuffle/test_local_shuffle_fe_be_consistency.groovy b/regression-test/suites/nereids_p0/local_shuffle/test_local_shuffle_fe_be_consistency.groovy new file mode 100644 index 00000000000000..6173aa8148e2a4 --- /dev/null +++ b/regression-test/suites/nereids_p0/local_shuffle/test_local_shuffle_fe_be_consistency.groovy @@ -0,0 +1,755 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/** + * Verify FE-planned local exchange (enable_local_shuffle_planner=true) produces + * the same query results as BE-native local exchange (enable_local_shuffle_planner=false) + * across a wide set of operator/plan shapes. + * + * Only data correctness is asserted — the two planners legitimately differ on the + * exact exchange counts/types they emit, so plan-shape comparison is intentionally + * not done here. (Earlier iterations of this suite did that for diagnosis; those + * comparisons are flaky in practice and are not appropriate for a regression net.) + */ +suite("test_local_shuffle_fe_be_consistency") { + + def setVarBase = "disable_join_reorder=true,disable_colocate_plan=true,ignore_storage_data_distribution=false,parallel_pipeline_task_num=4,auto_broadcast_join_threshold=-1,broadcast_row_count_limit=0" + + // Run the given SQL twice — once with enable_local_shuffle_planner=true (FE planner) + // and once with =false (BE-native) — and assert the result rows are identical. + // knownDiff is accepted for source compatibility but has no effect anymore. + def checkConsistencyWithSql = { String tag, String testSql, boolean knownDiff = false -> + def sqlOn = testSql.replaceFirst(/(?i)\/\*\+SET_VAR\(([^)]*)\)\s*\*\//, "/*+SET_VAR(enable_local_shuffle_planner=true,\$1)*/") + def sqlOff = testSql.replaceFirst(/(?i)\/\*\+SET_VAR\(([^)]*)\)\s*\*\//, "/*+SET_VAR(enable_local_shuffle_planner=false,\$1)*/") + if (!testSql.contains("/*+SET_VAR")) { + sqlOn = testSql.replaceFirst(/(?i)^\s*(SELECT)\s+/, "SELECT /*+SET_VAR(enable_local_shuffle_planner=true,${setVarBase})*/ ") + sqlOff = testSql.replaceFirst(/(?i)^\s*(SELECT)\s+/, "SELECT /*+SET_VAR(enable_local_shuffle_planner=false,${setVarBase})*/ ") + } + check_sql_equal(sqlOn, sqlOff) + } + + // ============================================================ + // Common settings + // ============================================================ + sql "SET enable_nereids_planner=true" + sql "SET enable_fallback_to_original_planner=false" + sql "SET runtime_filter_mode=off" + sql "SET parallel_pipeline_task_num=4" + sql "SET enable_sql_cache=false" + // Keep local shuffle feature globally enabled; only toggle the planner flag + sql "SET enable_local_shuffle=true" + // Disable ignore_storage_data_distribution to get predictable plans from scans + sql "SET ignore_storage_data_distribution=false" + + // ============================================================ + // Table setup + // ls_t1: HASH(k1) 8 buckets + // ls_t2: HASH(k1) 8 buckets (same distribution → colocate-eligible) + // ls_t3: HASH(k4) 5 buckets (different distribution) + // ls_serial: HASH(k1) 2 buckets (for serial-scan tests: 2 < parallel_pipeline_task_num=4) + // ============================================================ + sql "DROP TABLE IF EXISTS ls_t1" + sql "DROP TABLE IF EXISTS ls_t2" + sql "DROP TABLE IF EXISTS ls_t3" + sql "DROP TABLE IF EXISTS ls_serial" + + sql """ + CREATE TABLE ls_t1 ( + k1 INT NOT NULL, + k2 INT, + v1 INT + ) ENGINE=OLAP + DUPLICATE KEY(k1, k2) + DISTRIBUTED BY HASH(k1) BUCKETS 8 + PROPERTIES ("replication_num" = "1") + """ + + sql """ + CREATE TABLE ls_t2 ( + k1 INT NOT NULL, + k3 INT, + v2 INT + ) ENGINE=OLAP + DUPLICATE KEY(k1, k3) + DISTRIBUTED BY HASH(k1) BUCKETS 8 + PROPERTIES ("replication_num" = "1") + """ + + sql """ + CREATE TABLE ls_t3 ( + k1 INT NOT NULL, + k4 INT, + v3 INT + ) ENGINE=OLAP + DUPLICATE KEY(k1, k4) + DISTRIBUTED BY HASH(k4) BUCKETS 5 + PROPERTIES ("replication_num" = "1") + """ + + sql """ + INSERT INTO ls_t1 VALUES + (1, 10, 2), (1, 11, 3), (2, 20, 4), (2, 21, 1), + (3, 30, 5), (4, 40, 6), (5, 50, 7), (6, 60, 8), + (7, 70, 9), (8, 80, 10), (9, 90, 11), (10, 100, 12) + """ + + sql """ + INSERT INTO ls_t2 VALUES + (1, 100, 7), (1, 101, 1), (2, 200, 2), (3, 300, 3), + (4, 400, 4), (5, 500, 5), (6, 600, 6), (7, 700, 7) + """ + + sql """ + INSERT INTO ls_t3 VALUES + (1, 1001, 5), (1, 1001, 6), (2, 1002, 7), + (3, 1003, 8), (4, 1004, 9), (5, 1005, 10) + """ + + sql """ + CREATE TABLE ls_serial ( + k1 INT NOT NULL, + k2 INT, + v1 INT + ) ENGINE=OLAP + DUPLICATE KEY(k1, k2) + DISTRIBUTED BY HASH(k1) BUCKETS 2 + PROPERTIES ("replication_num" = "1") + """ + sql """ + INSERT INTO ls_serial VALUES + (1, 10, 2), (2, 20, 4), (3, 30, 5), (4, 40, 6) + """ + + // SET_VAR prefix used in most test SQLs (disables plan reorder/colocate for deterministic plans) + def sv = "/*+SET_VAR(disable_join_reorder=true,disable_colocate_plan=true,ignore_storage_data_distribution=false,parallel_pipeline_task_num=4,auto_broadcast_join_threshold=-1,broadcast_row_count_limit=0)*/" + // Same as sv but forces serial source path (default in many environments) + def svSerialSource = "/*+SET_VAR(disable_join_reorder=true,disable_colocate_plan=true,ignore_storage_data_distribution=true,parallel_pipeline_task_num=4,auto_broadcast_join_threshold=-1,broadcast_row_count_limit=0)*/" + + // ================================================================ + // Section 1: AggSink / StreamingAgg scenarios + // BE operator: AggSinkOperatorX / StreamingAggOperatorX + // ================================================================ + + // 1-1: AggSink finalize, no group key + // Finalize phase: FE uses noRequire() (needsFinalize && groupingExprs.isEmpty()) → NOOP + // Streaming pre-agg phase: FE uses requirePassthrough() (else branch, groupingExprs.isEmpty()) + // BE: does not add exchanges for no-group-key agg (NOOP) + // Known diff: FE inserts PASSTHROUGH for streaming pre-agg; BE skips exchanges entirely + checkConsistencyWithSql("agg_finalize_no_group_key", + "SELECT ${sv} count(*) FROM ls_t1") + + // 1-2: AggSink 1-phase, bucket key (k1) → no mismatch (distribution matches) + // BE: BUCKET_HASH_SHUFFLE for sink, scan already provides BUCKET_HASH_SHUFFLE + // → need_to_local_exchange returns false (both hash types match) + checkConsistencyWithSql("agg_1phase_bucket_key", + "SELECT ${sv} k1, count(*) AS cnt FROM ls_t1 GROUP BY k1 ORDER BY k1") + + // 1-2b: Same SQL under serial-source mode (ignore_storage_data_distribution=true) + // This explicitly validates FE/BE consistency under serial-source planning path. + checkConsistencyWithSql("agg_1phase_bucket_key_serial_source", + "SELECT ${svSerialSource} k1, count(*) AS cnt FROM ls_t1 GROUP BY k1 ORDER BY k1") + + // 1-2c: Finalize agg, serial/pooling scan, bucket key (k1), ls_serial (2 buckets). + // Pooling scan + bucket-key colocate agg: BE inserts PASSTHROUGH fan-out (heavy_ops + // bottleneck avoidance before LOCAL_HASH_SHUFFLE) + LOCAL_HASH_SHUFFLE. + // FE mirrors with heavy_ops check in enforceChild. + checkConsistencyWithSql("agg_finalize_serial_pooling_bucket", + "SELECT ${svSerialSource} k1, count(*) AS cnt FROM ls_serial GROUP BY k1 ORDER BY k1") + + // 1-2c2: Same finalize agg with bucket key, but non-pooling (ignore_storage_data_distribution=false). + // No serial source → no heavy_ops PASSTHROUGH fan-out needed. + // Known mismatch on clusters where ls_serial (2 BUCKETS) tablets land on different BEs: + // FE sees global instanceCount=2 → inserts LE; each BE sees local _num_instances=1 → skips. + // This is a pre-existing FE/BE instanceCount discrepancy, not a planner bug. + checkConsistencyWithSql("agg_finalize_non_pooling_bucket", + "SELECT ${sv} k1, count(*) AS cnt FROM ls_serial GROUP BY k1 ORDER BY k1") + + // 1-2d: Agg, serial/pooling scan, non-bucket key (k2), ls_serial. + checkConsistencyWithSql("agg_finalize_serial_pooling_non_bucket", + "SELECT ${svSerialSource} k2, count(*) AS cnt FROM ls_serial GROUP BY k2 ORDER BY k2") + + // 1-3: AggSink 1-phase, non-bucket key (k2) + // BE: GLOBAL_EXECUTION_HASH_SHUFFLE vs BUCKET_HASH_SHUFFLE from scan + // → inserts GLOBAL_HASH_SHUFFLE (or LOCAL_HASH_SHUFFLE for local execution) + checkConsistencyWithSql("agg_1phase_non_bucket_key", + "SELECT ${sv} k2, count(*) AS cnt FROM ls_t1 GROUP BY k2 ORDER BY k2") + + // 1-4: AggSink 1-phase, multi-column non-bucket key (k1,k2) + // Even though k1 is the bucket key, (k1,k2) is not → GLOBAL_HASH_SHUFFLE + checkConsistencyWithSql("agg_1phase_multi_key_non_bucket", + "SELECT ${sv} k1, k2, count(*) AS cnt FROM ls_t1 GROUP BY k1, k2 ORDER BY k1, k2") + + // 1-5: Two-phase agg (pre-agg + finalize), non-bucket key + // Tests that both streaming agg pre-phase and finalize phases are handled correctly + checkConsistencyWithSql("agg_two_phase_non_bucket", + "SELECT ${sv} k2, sum(v1) AS s FROM ls_t1 GROUP BY k2 ORDER BY k2") + + // ================================================================ + // Section 2: DistinctStreamingAgg scenarios + // BE operator: DistinctStreamingAggOperatorX + // ================================================================ + + // 2-1: DISTINCT on bucket key → no extra exchange needed (distribution matches) + checkConsistencyWithSql("distinct_bucket_key", + "SELECT ${sv} DISTINCT k1 FROM ls_t1 ORDER BY k1") + + // 2-2: DISTINCT on non-bucket key + checkConsistencyWithSql("distinct_non_bucket_key", + "SELECT ${sv} DISTINCT k2 FROM ls_t1 ORDER BY k2") + + // 2-3: DISTINCT on multiple non-bucket keys + checkConsistencyWithSql("distinct_multi_non_bucket", + "SELECT ${sv} DISTINCT k1, k2 FROM ls_t1 ORDER BY k1, k2") + + // ================================================================ + // Section 3: AnalyticSink / SortSink (analytic) scenarios + // BE operators: AnalyticSinkOperatorX, SortSinkOperatorX + // ================================================================ + + // 3-1: Analytic window, no PARTITION BY → serial path, no exchange needed + checkConsistencyWithSql("analytic_no_partition", + "SELECT ${sv} k1, sum(v1) OVER() AS s FROM ls_t1 ORDER BY k1, s") + + // 3-2: Analytic window, PARTITION BY non-bucket key → GLOBAL_HASH_SHUFFLE + // Also triggers SortSink (analytic sort) → GLOBAL_HASH_SHUFFLE + checkConsistencyWithSql("analytic_partition_non_bucket", + "SELECT ${sv} k1, k2, row_number() OVER(PARTITION BY k2 ORDER BY k1) AS rn FROM ls_t1 ORDER BY k2, k1, rn") + + // 3-3: Analytic window, PARTITION BY bucket key → BUCKET_HASH_SHUFFLE (or no extra exchange) + // SortSink(analytic): if colocate+bucket → BUCKET_HASH_SHUFFLE + checkConsistencyWithSql("analytic_partition_bucket_key", + "SELECT ${sv} k1, sum(v1) OVER(PARTITION BY k1) AS s FROM ls_t1 ORDER BY k1, s") + + // 3-4: ORDER BY sort (SortSink._merge_by_exchange=true) → PASSTHROUGH + checkConsistencyWithSql("sort_order_by", + "SELECT ${sv} * FROM ls_t1 ORDER BY k1, k2 LIMIT 10") + + // ================================================================ + // Section 4: PartitionSortSink scenarios + // BE operator: PartitionSortSinkOperatorX + // ================================================================ + + // 4-1: PartitionSort TWO_PHASE_GLOBAL (triggered by QUALIFY / ROW_NUMBER with LIMIT) + // → GLOBAL_EXECUTION_HASH_SHUFFLE + checkConsistencyWithSql("partition_sort_two_phase_global", + """SELECT ${sv} k1, k2, v1 + FROM ( + SELECT k1, k2, v1, + ROW_NUMBER() OVER(PARTITION BY k2 ORDER BY v1 DESC) AS rn + FROM ls_t1 + ) t + WHERE rn <= 2 + ORDER BY k1, k2""") + + // 4-2: PartitionSort single phase (TWO_PHASE_LOCAL or ONE_PHASE) → PASSTHROUGH + // Note: This depends on the optimizer's choice; TopN on non-partitioned window + checkConsistencyWithSql("partition_sort_single_phase", + """SELECT ${sv} k1, k2, v1 + FROM ( + SELECT k1, k2, v1, + ROW_NUMBER() OVER(PARTITION BY k2 ORDER BY v1 DESC) AS rn + FROM ls_t1 + ) t + WHERE rn = 1 + ORDER BY k1, k2""") + + // ================================================================ + // Section 5: HashJoinProbe / HashJoinBuildSink scenarios + // BE operators: HashJoinProbeOperatorX, HashJoinBuildSinkOperatorX + // ================================================================ + + // 5-1: Broadcast join — probe NOOP (or PASSTHROUGH if serial), build PASS_TO_ONE (if serial) + checkConsistencyWithSql("hash_join_broadcast", + """SELECT ${sv} a.k1, a.v1, b.v2 + FROM ls_t1 a JOIN [broadcast] ls_t2 b ON a.k1 = b.k1 + ORDER BY a.k1, a.k2""") + + // 5-2: Shuffle (PARTITIONED) join → probe GLOBAL_HASH_SHUFFLE, build GLOBAL_HASH_SHUFFLE + checkConsistencyWithSql("hash_join_shuffle", + """SELECT ${sv} a.k1, a.v1, b.v2 + FROM ls_t1 a JOIN [shuffle] ls_t2 b ON a.k1 = b.k1 + ORDER BY a.k1, a.k2""") + + // 5-3: LEFT OUTER shuffle join + // Known diff: In single-BE environments, BE's pipeline-level need_to_local_exchange() + // may skip exchanges when num_tasks_of_parent<=1, while FE still inserts PASSTHROUGH + // because it lacks pipeline-level task count information. + checkConsistencyWithSql("hash_join_left_outer_shuffle", + """SELECT ${sv} a.k1, a.v1, b.v2 + FROM ls_t1 a LEFT OUTER JOIN [shuffle] ls_t2 b ON a.k1 = b.k1 + ORDER BY a.k1, a.k2""") + + // 5-4: RIGHT OUTER shuffle join + checkConsistencyWithSql("hash_join_right_outer_shuffle", + """SELECT ${sv} a.k1, a.v1, b.v2 + FROM ls_t1 a RIGHT OUTER JOIN [shuffle] ls_t2 b ON a.k1 = b.k1 + ORDER BY a.k1, b.k3""") + + // 5-5: FULL OUTER shuffle join + checkConsistencyWithSql("hash_join_full_outer_shuffle", + """SELECT ${sv} a.k1, a.v1, b.v2 + FROM ls_t1 a FULL OUTER JOIN [shuffle] ls_t2 b ON a.k1 = b.k1 + ORDER BY a.k1, b.k3""") + + // 5-6: LEFT SEMI shuffle join + checkConsistencyWithSql("hash_join_left_semi_shuffle", + """SELECT ${sv} a.k1, a.v1 + FROM ls_t1 a LEFT SEMI JOIN [shuffle] ls_t2 b ON a.k1 = b.k1 + ORDER BY a.k1""") + + // 5-7: LEFT ANTI shuffle join + // Known diff: Same as LEFT OUTER — BE pipeline-level num_tasks_of_parent<=1 check + // skips exchanges in single-BE environments; FE cannot replicate this. + checkConsistencyWithSql("hash_join_left_anti_shuffle", + """SELECT ${sv} a.k1, a.v1 + FROM ls_t1 a LEFT ANTI JOIN [shuffle] ls_t2 b ON a.k1 = b.k1 + ORDER BY a.k1""") + + // 5-8: NULL_AWARE_LEFT_ANTI_JOIN (NOT IN subquery with nullable) → NOOP + // BE: both probe and build NOOP + checkConsistencyWithSql("hash_join_null_aware_left_anti", + """SELECT ${sv} k1, v1 FROM ls_t1 + WHERE k1 NOT IN (SELECT k1 FROM ls_t2) + ORDER BY k1""") + + // ================================================================ + // Section 6: NestedLoopJoin scenarios + // BE operators: NestedLoopJoinProbeOperatorX, NestedLoopJoinBuildSinkOperatorX + // ================================================================ + + // 6-1: NLJ INNER (cross/theta join) → probe ADAPTIVE_PASSTHROUGH, build BROADCAST (serial) + // FE: requireAdaptivePassthrough for probe; BE: ADAPTIVE_PASSTHROUGH for probe + checkConsistencyWithSql("nlj_inner_theta", + """SELECT ${sv} a.k1, b.k1 AS bk1 + FROM ls_t1 a, ls_t2 b WHERE a.k1 > b.k1 + ORDER BY a.k1, bk1 LIMIT 20""") + + // 6-2: NLJ LEFT OUTER + checkConsistencyWithSql("nlj_left_outer", + """SELECT ${sv} a.k1, b.k1 AS bk1 + FROM ls_t1 a LEFT OUTER JOIN ls_t2 b ON a.k1 > b.k1 + ORDER BY a.k1, bk1 LIMIT 20""") + + // 6-3: NLJ RIGHT OUTER → BE probe: NOOP; FE: ADAPTIVE_PASSTHROUGH (known difference) + // FE uses requireAdaptivePassthrough unconditionally for non-NULL_AWARE NLJ + // BE uses NOOP for RIGHT_OUTER/RIGHT_SEMI/RIGHT_ANTI/FULL_OUTER + checkConsistencyWithSql("nlj_right_outer", """ + SELECT ${sv} a.k1, b.k1 AS bk1 + FROM ls_t1 a RIGHT OUTER JOIN ls_t2 b ON a.k1 < b.k1 + ORDER BY a.k1, bk1 LIMIT 20 + """) + + // 6-4: NLJ FULL OUTER → same known difference as RIGHT_OUTER + checkConsistencyWithSql("nlj_full_outer", """ + SELECT ${sv} a.k1, b.k1 AS bk1 + FROM ls_t1 a FULL OUTER JOIN ls_t2 b ON a.k1 > b.k1 + ORDER BY a.k1, bk1 LIMIT 20 + """) + + // ================================================================ + // Section 7: Set operations (INTERSECT / EXCEPT) + // BE operators: SetSinkOperatorX, SetProbeSinkOperatorX, SetSourceOperatorX + // ================================================================ + + // 7-1: INTERSECT → GLOBAL_HASH_SHUFFLE for both set sink and set probe sink + checkConsistencyWithSql("set_intersect", + """SELECT ${sv} k1 FROM ls_t1 + INTERSECT + SELECT k1 FROM ls_t2 + ORDER BY k1""") + + // 7-2: EXCEPT → GLOBAL_HASH_SHUFFLE + checkConsistencyWithSql("set_except", + """SELECT ${sv} k1 FROM ls_t1 + EXCEPT + SELECT k1 FROM ls_t2 + ORDER BY k1""") + + // 7-3: Three-way INTERSECT + // FE and BE are consistent: + // - ls_t1/ls_t2 (colocated on k1): DISTINCT_STREAMING_AGG with needsFinalize=true + // → BE returns NOOP/HASH (already satisfied), FE requireHash (already satisfied) → no exchange + // - ls_t3 (non-colocated, partition on k4): DISTINCT_STREAMING_AGG with needsFinalize=false + // → BE returns PASSTHROUGH (enable_distinct_streaming_agg_force_passthrough=true), + // FE enableDistinctStreamingAggForcePassthrough=true → requirePassthrough → insert PASSTHROUGH + checkConsistencyWithSql("set_intersect_three_way", + """SELECT ${sv} k1 FROM ls_t1 + INTERSECT + SELECT k1 FROM ls_t2 + INTERSECT + SELECT k1 FROM ls_t3 + ORDER BY k1""") + + // ================================================================ + // Section 8: UNION scenarios + // BE operators: UnionSinkOperatorX, UnionSourceOperatorX + // ================================================================ + + // 8-1: UNION ALL (no downstream shuffled op → base default) + checkConsistencyWithSql("union_all_simple", + """SELECT ${sv} k1, v1 FROM ls_t1 + UNION ALL + SELECT k1, v2 FROM ls_t2 + ORDER BY k1, v1""") + + // 8-2: UNION ALL feeding into GROUP BY (union followed by shuffled agg) + checkConsistencyWithSql("union_all_followed_by_agg", + """SELECT ${sv} k1, count(*) AS cnt + FROM ( + SELECT k1, v1 AS v FROM ls_t1 + UNION ALL + SELECT k1, v2 AS v FROM ls_t2 + ) u + GROUP BY k1 + ORDER BY k1""") + + // 8-3: UNION followed by analytic window + checkConsistencyWithSql("union_all_followed_by_window", + """SELECT ${sv} k1, SUM(v) OVER(PARTITION BY k1) AS sv + FROM ( + SELECT k1, v1 AS v FROM ls_t1 + UNION ALL + SELECT k1, v2 AS v FROM ls_t2 + ) u + ORDER BY k1, sv""") + + // ================================================================ + // Section 9: TableFunction and AssertNumRows + // BE operators: TableFunctionOperatorX, AssertNumRowsOperatorX + // ================================================================ + + // 9-1: TableFunction (non-pooling) → PASSTHROUGH×2 + // BE TableFunctionOperatorX overrides required_data_distribution() to always return + // PASSTHROUGH; need_to_local_exchange Step 4 always inserts non-hash exchanges. + // So: OlapScan → PT → TableFunc → PT → Sort. Total: 2 PASSTHROUGH. + // FE mirrors: TableFunctionNode requires PASSTHROUGH from child (outputs NOOP), + // SortNode independently inserts PASSTHROUGH for mergeByExchange. + checkConsistencyWithSql("table_function", + """SELECT ${sv} k1, e1 FROM ls_t1 + LATERAL VIEW explode_numbers(v1) tmp AS e1 + ORDER BY k1, e1 LIMIT 20""") + + // 9-1b: TableFunction (pooling scan) → PASSTHROUGH×2 + // Same as 9-1: TableFunctionOperatorX always requires PASSTHROUGH regardless of child. + // Pooling scan (serial) → PT fan-out → TableFunc → PT → Sort. Total: 2 PASSTHROUGH. + // FE mirrors: TableFunctionNode requires PASSTHROUGH (outputs NOOP), + // SortNode independently inserts PASSTHROUGH for mergeByExchange. + checkConsistencyWithSql("table_function_pooling", + """SELECT ${svSerialSource} k1, e1 FROM ls_t1 + LATERAL VIEW explode_numbers(v1) tmp AS e1 + ORDER BY k1, e1 LIMIT 20""") + + // 9-2: AssertNumRows (scalar subquery) → PASSTHROUGH + // Known diff: In single-BE environments, FE and BE may disagree on instance counts + // for fragments containing AssertNumRows, leading to different exchange decisions. + checkConsistencyWithSql("assert_num_rows", + """SELECT ${sv} k1, (SELECT count(*) FROM ls_t2 WHERE ls_t2.k1 = ls_t1.k1) AS cnt + FROM ls_t1 + ORDER BY k1""") + + // ================================================================ + // Section 10: Mixed / multi-level scenarios + // ================================================================ + + // 10-1: Agg after shuffle join (k1 is bucket key → no extra exchange after join) + checkConsistencyWithSql("agg_after_shuffle_join_bucket_key", + """SELECT ${sv} a.k1, count(*) AS cnt + FROM ls_t1 a JOIN [shuffle] ls_t2 b ON a.k1 = b.k1 + GROUP BY a.k1 + ORDER BY a.k1""") + + // 10-2: Agg after shuffle join on non-bucket column + // GROUP BY k2 ≠ join key k1. BE's StreamingAggOperatorX sees child is_hash_join_probe() + // and returns PASSTHROUGH (enable_streaming_agg_hash_join_force_passthrough=true by default), + // splitting the pipeline at the streaming pre-agg/join boundary. + // FE replicates this: AggregationNode detects useStreamingPreagg && child is HashJoinNode + // → requirePassthrough. Both FE and BE produce 18 PASSTHROUGH exchanges. + checkConsistencyWithSql("agg_after_shuffle_join_non_bucket_key", + """SELECT ${sv} a.k2, count(*) AS cnt + FROM ls_t1 a JOIN [shuffle] ls_t2 b ON a.k1 = b.k1 + GROUP BY a.k2 + ORDER BY a.k2""") + + // 10-3: Agg after broadcast join + // ScanNode returns BUCKET_HASH_SHUFFLE (mirroring BE's ScanOperator). HashJoinNode + // (broadcast, non-serial probe) propagates probe side's distribution as its own + // output type (instead of hardcoding NOOP). AggNode sees BUCKET_HASH_SHUFFLE, + // RequireHash.satisfy(BUCKET_HASH_SHUFFLE)=true → no redundant hash exchange. + // Mirrors BE's !(hash && hash) check in need_to_local_exchange. + checkConsistencyWithSql("agg_after_broadcast_join", + """SELECT ${sv} a.k1, count(*) AS cnt + FROM ls_t1 a JOIN [broadcast] ls_t2 b ON a.k1 = b.k1 + GROUP BY a.k1 + ORDER BY a.k1""") + + // 10-4: Window after UNION + // Known diff: In single-BE environments, FE instance-count-based skipping and + // BE pipeline-level num_tasks checks can diverge for union+window fragments. + checkConsistencyWithSql("window_after_union", + """SELECT ${sv} k1, SUM(v) OVER(PARTITION BY k1) AS sv + FROM ( + SELECT k1, v1 AS v FROM ls_t1 + UNION ALL + SELECT k1, v2 AS v FROM ls_t2 + ) u + ORDER BY k1, sv""") + + // 10-5: Multi-level join + agg + window + checkConsistencyWithSql("join_agg_window_multilevel", + """SELECT ${sv} t.k1, t.cnt, + row_number() OVER(ORDER BY t.cnt DESC, t.k1 ASC) AS rn + FROM ( + SELECT a.k1, count(*) AS cnt + FROM ls_t1 a JOIN [shuffle] ls_t2 b ON a.k1 = b.k1 + GROUP BY a.k1 + ) t + ORDER BY t.k1""") + + // 10-6: Two shuffle joins chained + checkConsistencyWithSql("two_shuffle_joins", + """SELECT ${sv} a.k1, b.k3, c.k4 + FROM ls_t1 a + JOIN [shuffle] ls_t2 b ON a.k1 = b.k1 + JOIN [shuffle] ls_t3 c ON a.k1 = c.k1 + ORDER BY a.k1, b.k3 LIMIT 20""") + + // 10-7: Complex: join → agg → agg (double-layer aggregation after join) + // Known diff: Multi-level agg fragments may have single instances in single-BE + // environments, causing FE/BE exchange decision divergence. + checkConsistencyWithSql("complex_join_double_agg", + """SELECT ${sv} z.k1, SUM(z.metric) AS total + FROM ( + SELECT y.k1, SUM(y.metric) AS metric + FROM ( + SELECT a.k1, SUM(a.v1 + b.v2) AS metric + FROM ls_t1 a JOIN [shuffle] ls_t2 b ON a.k1 = b.k1 + GROUP BY a.k1 + ) y + GROUP BY y.k1 + ) z + GROUP BY z.k1 + ORDER BY z.k1""") + + // 10-8: Agg then INTERSECT + checkConsistencyWithSql("agg_then_intersect", + """SELECT ${sv} k1 + FROM (SELECT k1, count(*) AS cnt FROM ls_t1 GROUP BY k1 HAVING cnt > 0) a + INTERSECT + SELECT k1 + FROM (SELECT k1, count(*) AS cnt FROM ls_t2 GROUP BY k1 HAVING cnt > 0) b + ORDER BY k1""") + + // 10-9: Shuffle join then DISTINCT + checkConsistencyWithSql("shuffle_join_then_distinct", + """SELECT ${sv} DISTINCT a.k1 + FROM ls_t1 a JOIN [shuffle] ls_t2 b ON a.k1 = b.k1 + ORDER BY a.k1""") + + // ================================================================ + // Section 11: AggSink LOCAL_HASH_SHUFFLE scenarios + // Scenarios where BE's need_to_local_exchange() inserts GLOBAL/BUCKET + // hash exchange because the source distribution is not hash-compatible. + // + // Key rule in pipeline.cpp need_to_local_exchange(): + // If source is BUCKET_HASH and sink requires GLOBAL_HASH → both are hash + // → need_to_local_exchange returns false → NO local exchange. + // But if source is PASSTHROUGH/NOOP → not both hash → insert GLOBAL_HASH. + // ================================================================ + + // 11-1: force_to_local_shuffle=true + non-bucket finalize agg + // With force_to_local_shuffle, OlapScanNode.isSerialOperator()=true even with 8 tablets. + // Optimizer puts agg in a separate finalize fragment receiving hash-partitioned data, + // so no LOCAL_HASH_SHUFFLE is generated — only PASSTHROUGH for the NLJ/scan boundary. + checkConsistencyWithSql("agg_finalize_force_local_shuffle_non_bucket", + """SELECT /*+SET_VAR(disable_join_reorder=true,disable_colocate_plan=true, + ignore_storage_data_distribution=false,parallel_pipeline_task_num=4, + force_to_local_shuffle=true,enable_local_shuffle=true)*/ k2, count(*) AS cnt + FROM ls_t1 GROUP BY k2 ORDER BY k2""") + + // 11-2: force_to_local_shuffle=true + bucket-key finalize agg + // GROUP BY k1 (bucket key of ls_t1): colocate agg stays in same fragment as scan. + // FE: AggNode is colocate → requireHash. BE: AggSink returns BUCKET_HASH. + // Result MATCH: [PASSTHROUGH:9], consistent with other bucket-key colocate cases. + checkConsistencyWithSql("agg_finalize_force_local_shuffle_bucket_key", + """SELECT /*+SET_VAR(disable_join_reorder=true,disable_colocate_plan=true, + ignore_storage_data_distribution=false,parallel_pipeline_task_num=4, + force_to_local_shuffle=true,enable_local_shuffle=true)*/ k1, count(*) AS cnt + FROM ls_t1 GROUP BY k1 ORDER BY k1""") + + // 11-3: NLJ (theta join) → finalize agg on non-bucket key + // GROUP BY k2 (non-bucket): optimizer puts agg in a separate finalize fragment + // receiving data via a hash-partitioned inter-fragment exchange on k2. + // Within each fragment, the distributions are compatible → no LOCAL_HASH_SHUFFLE. + // Result MATCH: [ADAPTIVE_PASSTHROUGH:5, PASSTHROUGH:5] + checkConsistencyWithSql("agg_after_nlj_non_bucket", + """SELECT /*+SET_VAR(disable_join_reorder=true,disable_colocate_plan=true, + ignore_storage_data_distribution=false,parallel_pipeline_task_num=4, + auto_broadcast_join_threshold=-1,broadcast_row_count_limit=0)*/ a.k2, count(*) AS cnt + FROM ls_t1 a, ls_t2 b WHERE a.k1 > b.k1 + GROUP BY a.k2 ORDER BY a.k2""") + + // 11-4: NLJ (theta join) → finalize agg on bucket key (LOCAL_HASH_SHUFFLE test) + // GROUP BY k1 (bucket key): colocate agg stays in same pipeline as NLJ probe. + // The NLJ probe requires ADAPTIVE_PASSTHROUGH → local exchange inserted. + // After that exchange, the next pipeline has LocalExchangeSource (PASSTHROUGH distribution) + // feeding into AggSink (BUCKET_HASH_SHUFFLE for colocate k1 agg). + // PASSTHROUGH source ≠ BUCKET_HASH target, not both-hash → need_to_local_exchange=true + // → BE inserts LOCAL_HASH_SHUFFLE (BUCKET type). + // FE: AggNode isColocated=true → requireHash → inserts LocalExchangeNode. + // Result MATCH: [ADAPTIVE_PASSTHROUGH:9, LOCAL_HASH_SHUFFLE:9, PASSTHROUGH:9] + // This is a primary test for regular AggSink generating LOCAL_HASH_SHUFFLE. + checkConsistencyWithSql("agg_after_nlj_bucket_key", + """SELECT /*+SET_VAR(disable_join_reorder=true,disable_colocate_plan=true, + ignore_storage_data_distribution=false,parallel_pipeline_task_num=4, + auto_broadcast_join_threshold=-1,broadcast_row_count_limit=0)*/ a.k1, count(*) AS cnt + FROM ls_t1 a, ls_t2 b WHERE a.k1 > b.k1 + GROUP BY a.k1 ORDER BY a.k1""") + + // ================================================================ + // Section 12: Nested NLJ with pooling scan + // Tests that FE correctly inserts local exchange on ALL NLJ build sides, + // even when the NLJ's direct children are not ScanNodes (e.g., nested NLJ + // or ExchangeNode). Without the fix, the serial Exchange on NLJ(outer)'s + // build side would reduce num_tasks to 1, causing "must set shared state, + // in CROSS_JOIN_OPERATOR" for instances 1+. + // ================================================================ + + // 12-1: Nested NLJ with pooling scan — the regression case from RQG. + // Two LEFT JOINs with non-equi conditions → two nested NLJ operators. + // The outer NLJ's build side is an Exchange (UNPARTITIONED, serial). + // FE must insert a BROADCAST local exchange there to fan out to all instances. + // Without the fix in NestedLoopJoinNode (removing instanceof ScanNode check), + // FE wouldn't insert the local exchange → "must set shared state" error. + // BE-native also fails on this query with "_num_remaining_senders: -N", + // so we only verify FE mode produces correct results (skip profile comparison). + // knownDiff=true to tolerate the BE failure in profile comparison. + checkConsistencyWithSql("nested_nlj_pooling_scan", + """SELECT ${svSerialSource} count(a.k1) AS cnt, a.v1 + FROM ls_serial a + LEFT JOIN ls_serial b ON b.k2 >= b.k2 + LEFT JOIN ls_serial c ON b.k1 >= b.k1 + WHERE a.k1 IS NOT NULL + GROUP BY a.v1 + ORDER BY cnt, a.v1""", true) + + // 12-2: Same nested NLJ but non-pooling (ignore_storage_data_distribution=false). + // FE uses manual force-enforce to always insert ADAPTIVE_PASSTHROUGH + // on NLJ probe side, matching BE's need_to_local_exchange Step 4 behavior. + // Known mismatch on clusters where ls_serial (2 BUCKETS) tablets span multiple BEs: + // same FE/BE instanceCount discrepancy as agg_finalize_non_pooling_bucket. + checkConsistencyWithSql("nested_nlj_non_pooling", + """SELECT ${sv} count(a.k1) AS cnt, a.v1 + FROM ls_serial a + LEFT JOIN ls_serial b ON b.k2 >= b.k2 + LEFT JOIN ls_serial c ON b.k1 >= b.k1 + WHERE a.k1 IS NOT NULL + GROUP BY a.v1 + ORDER BY cnt, a.v1""") + + // ================================================================ + // Section 13: Pooling scan + operators requiring shared state + // Regression cases from RQG build 183677 — serial Exchange on build + // side of various operators (Agg, Sort, Union/Repeat) reduced pipeline + // num_tasks, causing "must set shared state" errors. + // Fixed by restoring the num_tasks raise in _create_deferred_local_exchangers + // for non-scan serial operators. + // ================================================================ + + // 13-1: NLJ + AGG with pooling scan. + // NLJ creates pipeline boundary; serial Exchange on build side + // needs raise to _num_instances for AGG shared state injection. + // knownDiff=true: pooling scan + NLJ has FE/BE exchange count + // differences (same root cause as nested_nlj_pooling_scan). + checkConsistencyWithSql("agg_after_nlj_pooling_scan", + """SELECT ${svSerialSource} a.v1, MAX(a.k1) AS mx + FROM ls_serial a LEFT JOIN ls_serial b ON b.k2 < b.k2 + WHERE a.k1 IS NOT NULL + GROUP BY a.v1 + ORDER BY a.v1, mx""", true) + + // 13-2: GROUPING SETS with pooling scan — generates REPEAT (union-like) + // operator internally. Serial Exchange reduces num_tasks, causing + // "must set shared state, in UNION_OPERATOR / SORT_OPERATOR". + // Known issue: deadlocks on clusters where ls_serial (2 BUCKETS) tablets span + // multiple BEs — FE inserts LE (global instanceCount=2) but each BE has + // _num_instances=1 causing pipeline task mismatch. Pre-existing FE/BE discrepancy. + checkConsistencyWithSql("grouping_sets_pooling_scan", + """SELECT ${svSerialSource} k1, k2, SUM(v1) AS sv + FROM ls_serial + GROUP BY GROUPING SETS ((k1, k2), (k1), ()) + ORDER BY k1, k2, sv""") + + // 13-3: Window function + GROUPING SETS with pooling scan. + // Combines analytic (Sort shared state) and GROUPING SETS (Repeat/Union) + // — both need correct num_tasks for shared state injection. + // Same instanceCount discrepancy as grouping_sets_pooling_scan. + checkConsistencyWithSql("window_grouping_sets_pooling_scan", + """SELECT ${svSerialSource} k1, SUM(v1), + ROW_NUMBER() OVER (ORDER BY k1) AS rn + FROM ls_serial + GROUP BY GROUPING SETS ((k1), ()) + ORDER BY k1, rn""") + + // ============================================================ + // 14. RQG bug cases — serial NLJ + pooling scan (Bug 13 from rqg_bugs) + // Serial NLJ (RIGHT_OUTER) with pooling scan. Previously crashed because + // FE inserted BROADCAST on build side inflating num_tasks while probe stayed + // serial. Fixed: serial NLJ sets buildSideRequire=noRequire(). + // ============================================================ + checkConsistencyWithSql("rqg_serial_nlj_right_outer_pooling", + """SELECT /*+SET_VAR(use_serial_exchange=true, parallel_pipeline_task_num=0, + ignore_storage_data_distribution=true, + enable_share_hash_table_for_broadcast_join=false, + disable_streaming_preaggregations=true, + disable_join_reorder=true)*/ + b.k1 AS field1 + FROM ls_serial a + RIGHT OUTER JOIN ls_serial b ON a.v1 > b.v1 + GROUP BY field1 + ORDER BY field1 ASC""") + + // GLOBAL_HASH_SHUFFLE fix (Bug 10 from rqg_bugs) — self-join + NLJ with serial exchange + checkConsistencyWithSql("rqg_global_hash_shuffle_self_join_nlj", + """SELECT /*+SET_VAR(use_serial_exchange=true, parallel_pipeline_task_num=4, + ignore_storage_data_distribution=true, + disable_join_reorder=true, disable_colocate_plan=true)*/ + a.k1 AS field1, a.v1 AS field2 + FROM ls_t1 a + LEFT JOIN ls_t1 b ON a.k1 = b.k2 + LEFT JOIN ls_t1 c ON a.k1 > b.k2 + WHERE a.v1 > 5 + GROUP BY field1, field2 + ORDER BY field1, field2""") + + // FULL OUTER JOIN + GROUP BY with serial exchange (Bug 11 from rqg_bugs) + checkConsistencyWithSql("rqg_global_hash_full_outer_join", + """SELECT /*+SET_VAR(use_serial_exchange=true, parallel_pipeline_task_num=4, + ignore_storage_data_distribution=true)*/ + a.k1, b.k1, count(1) + FROM ls_t1 a + FULL OUTER JOIN ls_t2 b ON a.k1 = b.k1 + WHERE b.k1 = 2 + GROUP BY a.k1, b.k1 + ORDER BY 1, 2, 3""") + + // Same pattern but simpler: NLJ with subquery + pooling, no GROUPING SETS + checkConsistencyWithSql("nlj_subquery_pooling", + """SELECT ${svSerialSource} k1, (SELECT COUNT(*) FROM ls_t2) + FROM ls_t1 + ORDER BY k1""") + +} diff --git a/regression-test/suites/nereids_p0/local_shuffle/test_local_shuffle_recursive_cte.groovy b/regression-test/suites/nereids_p0/local_shuffle/test_local_shuffle_recursive_cte.groovy new file mode 100644 index 00000000000000..6b5edeefbd24a7 --- /dev/null +++ b/regression-test/suites/nereids_p0/local_shuffle/test_local_shuffle_recursive_cte.groovy @@ -0,0 +1,181 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/** + * Regression for DORIS-25865: FE local-shuffle planner used to insert a + * LocalExchangeNode directly under RecursiveCteNode, which collided with two + * RecursiveCte invariants: + * + * 1. ThriftPlansBuilder locates the recursive sender fragment via + * `recursiveCteNode.getChild(1).getChild(0).getFragment()`. An extra LE + * wrapper shifted that path off the cross-fragment ExchangeNode and + * pulled the RecCTE producer fragment itself into `fragmentsToReset`. + * BE then rejected with `[INTERNAL_ERROR]Fragment N contains a recursive + * CTE node` during `RecCTESourceOperatorX::prepare()`. + * + * 2. BE's `RecCTESourceOperatorX::is_serial_operator()` always returns true, + * but `RecursiveCteNode.isSerialNode()` on the FE side defaulted to + * false. Without the serial marker, the FE planner left the producer + * fragment with parallel=N sender pipelines while RecCte actually emits + * data from a single instance — the cross-fragment Exchange receiver + * waited forever on the N-1 silent senders and the query hung. + * + * Fix lives in `RecursiveCteNode`: + * - override `isSerialNode()` to return true (mirrors BE), + * - override `enforceAndDeriveLocalExchange` to bypass the framework's + * `enforceRequire` so no LE is inserted between RecCte and its Exchange + * children (children's own subtrees still get LE planning). + * + * This test asserts: + * - planner=true succeeds (no "Fragment N contains a recursive CTE node"); + * - results between planner=true and planner=false are identical for the + * three downstream-consumer shapes the JIRA listed: aggregate, window, + * grouping sets; + * - the negative control (RecCte directly consumed by SELECT) still works + * in both modes — it would pass even with the original bug, but covers + * the simple path so a regression there is caught immediately. + */ +suite("test_local_shuffle_recursive_cte", "nereids_p0") { + + sql "SET enable_nereids_planner=true" + sql "SET enable_fallback_to_original_planner=false" + sql "SET enable_sql_cache=false" + sql "SET enable_local_shuffle=true" + sql "SET parallel_pipeline_task_num=4" + sql "SET runtime_filter_mode=off" + + // For each SQL, run it twice — once with FE planner, once with BE planner — + // and assert result rows are identical. Plan shape intentionally not asserted: + // the two planners legitimately differ on LE placement. + def checkConsistency = { String tag, String testSql -> + def sqlOn = """SELECT /*+SET_VAR(enable_local_shuffle_planner=true)*/""" + testSql.replaceFirst(/(?i)^\s*select/, "") + def sqlOff = """SELECT /*+SET_VAR(enable_local_shuffle_planner=false)*/""" + testSql.replaceFirst(/(?i)^\s*select/, "") + check_sql_equal(sqlOn, sqlOff) + } + + // ============================================================ + // Case 1 — Recursive CTE consumed by aggregate + // + // Original error with FE planner: + // errCode = 2, detailMessage = [INTERNAL_ERROR] + // Fragment N contains a recursive CTE node + // ============================================================ + checkConsistency("rec_cte_agg", """ + SELECT n_mod, count(*) AS c, sum(s) AS total + FROM ( + WITH RECURSIVE cte(n, s) AS ( + SELECT CAST(1 AS INT), CAST(1 AS BIGINT) + UNION ALL + SELECT CAST(n + 1 AS INT), CAST(s + n + 1 AS BIGINT) + FROM cte WHERE n < 30 + ) + SELECT n % 7 AS n_mod, s FROM cte + ) t + GROUP BY n_mod + ORDER BY n_mod + """) + + // ============================================================ + // Case 2 — Recursive CTE consumed by window function + // + // Original failure with FE planner: hung indefinitely because the + // producer fragment had parallel=N sender pipelines but only one of + // them actually emits data. Fixed by marking RecursiveCteNode serial + // so AddLocalExchange wraps the root with a PASSTHROUGH LE that fans + // the single producer out to N parallel sinks. + // ============================================================ + checkConsistency("rec_cte_window", """ + SELECT n, sum(n) OVER (PARTITION BY n % 5) AS sum_n + FROM ( + WITH RECURSIVE cte(n) AS ( + SELECT CAST(1 AS INT) + UNION ALL + SELECT CAST(n + 1 AS INT) FROM cte WHERE n < 30 + ) + SELECT n FROM cte + ) t + ORDER BY n + """) + + // ============================================================ + // Case 3 — Recursive CTE consumed by GROUPING SETS + // + // Suggested by the JIRA reporter as a third "downstream operator that + // introduces additional fragments / local exchanges" — together with + // aggregate and window it covers the original failure pattern. + // ============================================================ + checkConsistency("rec_cte_grouping_sets", """ + SELECT n_mod, n_bucket, count(*) AS c, sum(s) AS total + FROM ( + WITH RECURSIVE cte(n, s) AS ( + SELECT CAST(1 AS INT), CAST(1 AS BIGINT) + UNION ALL + SELECT CAST(n + 1 AS INT), CAST(s + n + 1 AS BIGINT) + FROM cte WHERE n < 30 + ) + SELECT n % 7 AS n_mod, n % 3 AS n_bucket, s FROM cte + ) t + GROUP BY GROUPING SETS ((n_mod), (n_bucket), (n_mod, n_bucket)) + ORDER BY n_mod NULLS LAST, n_bucket NULLS LAST + """) + + // ============================================================ + // Negative control — RecCte directly consumed by SELECT. + // This path didn't generate the extra fragments needed to trigger the + // original bug, but exercising it ensures the fix doesn't regress the + // simple consumer shape. + // ============================================================ + checkConsistency("rec_cte_select", """ + SELECT n + FROM ( + WITH RECURSIVE cte(n) AS ( + SELECT CAST(1 AS INT) + UNION ALL + SELECT CAST(n + 1 AS INT) FROM cte WHERE n < 5 + ) + SELECT n FROM cte + ) t + ORDER BY n + """) + + // ============================================================ + // Case 4 — RecCte feeding a hash JOIN + // + // Another downstream consumer that introduces an extra fragment via a + // shuffle join. Verifies the serial-RecCte → PASSTHROUGH LE wrap also + // works when the consumer requires hash distribution. + // ============================================================ + checkConsistency("rec_cte_join", """ + SELECT a.n, b.n AS m + FROM ( + WITH RECURSIVE cte(n) AS ( + SELECT CAST(1 AS INT) + UNION ALL + SELECT CAST(n + 1 AS INT) FROM cte WHERE n < 10 + ) + SELECT n FROM cte + ) a JOIN ( + WITH RECURSIVE cte(n) AS ( + SELECT CAST(2 AS INT) + UNION ALL + SELECT CAST(n + 2 AS INT) FROM cte WHERE n < 10 + ) + SELECT n FROM cte + ) b ON a.n = b.n + ORDER BY a.n + """) +} diff --git a/regression-test/suites/nereids_p0/local_shuffle/test_local_shuffle_rqg_bugs.groovy b/regression-test/suites/nereids_p0/local_shuffle/test_local_shuffle_rqg_bugs.groovy new file mode 100644 index 00000000000000..b7120b17637879 --- /dev/null +++ b/regression-test/suites/nereids_p0/local_shuffle/test_local_shuffle_rqg_bugs.groovy @@ -0,0 +1,1557 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/** + * Regression tests for bugs discovered by RQG testing on the local-exchange2 branch. + * + * These queries triggered "must set shared state" errors or incorrect results + * in RQG build 183992. Common conditions: + * - use_serial_exchange=true (makes ALL Exchanges serial, not just UNPARTITIONED) + * - enable_local_shuffle_planner=true (FE-planned local exchange) + * - parallel_pipeline_task_num > 1 + * + * Error types reproduced: + * 1. must set shared state, in AGGREGATION_OPERATOR + * 2. must set shared state, in SORT_OPERATOR + * 3. incorrect results with GROUPING SETS + scalar subquery + window function + */ +suite("test_local_shuffle_rqg_bugs") { + + // ============================================================ + // Table setup — mirrors RQG table structure + // 10 buckets to match RQG (replication_num=1 for single-BE testing) + // ============================================================ + sql "DROP TABLE IF EXISTS rqg_t1" + sql "DROP TABLE IF EXISTS rqg_t2" + sql "DROP TABLE IF EXISTS rqg_t3" + sql "DROP TABLE IF EXISTS rqg_t4" + + sql """ + CREATE TABLE rqg_t1 ( + pk INT NOT NULL, + col_int_undef_signed INT, + col_int_undef_signed2 INT, + col_int_undef_signed_not_null INT NOT NULL + ) ENGINE=OLAP + DUPLICATE KEY(pk) + DISTRIBUTED BY HASH(pk) BUCKETS 10 + PROPERTIES ("replication_num" = "1") + """ + + sql """ + CREATE TABLE rqg_t2 ( + pk INT NOT NULL, + col_int_undef_signed INT, + col_int_undef_signed2 INT, + col_bigint_undef_signed_not_null BIGINT NOT NULL, + col_decimal_38_10__undef_signed_not_null DECIMAL(38,10) NOT NULL + ) ENGINE=OLAP + DUPLICATE KEY(pk) + DISTRIBUTED BY HASH(pk) BUCKETS 10 + PROPERTIES ("replication_num" = "1") + """ + + // Table for build 184181 GLOBAL_HASH_SHUFFLE bugs — needs varchar + bigint columns + sql """ + CREATE TABLE rqg_t3 ( + pk INT NOT NULL, + col_bigint_undef_signed BIGINT, + col_varchar_10__undef_signed VARCHAR(10), + col_varchar_64__undef_signed VARCHAR(64) + ) ENGINE=OLAP + DUPLICATE KEY(pk) + DISTRIBUTED BY HASH(pk) BUCKETS 10 + PROPERTIES ("replication_num" = "1") + """ + + // Second table for FULL OUTER JOIN case (col_bigint_undef_signed_not_null) + sql """ + CREATE TABLE rqg_t4 ( + pk INT NOT NULL, + col_bigint_undef_signed_not_null BIGINT NOT NULL + ) ENGINE=OLAP + DUPLICATE KEY(pk) + DISTRIBUTED BY HASH(pk) BUCKETS 10 + PROPERTIES ("replication_num" = "1") + """ + + sql """ + INSERT INTO rqg_t3 VALUES + (0, -94, 'Abc', 'hello world'), + (1, 672609, 'Xyz', null), + (2, -3766684, 'Pqr', 'test string'), + (3, 5070261, 'abc', 'another row'), + (4, null, 'def', 'value four'), + (5, -86, 'XgpxlHBLEM', null), + (6, 21910, 'abc', 'they'), + (7, -63, 'zzzz', 'some text'), + (8, -8276281, 'AHlvNtoGLO', 'longer string here'), + (9, -101, 'mid', 'final row') + """ + + sql """ + INSERT INTO rqg_t4 VALUES + (0, 0), (1, 1), (2, 2), (3, 3), (4, 4), + (5, 5), (6, 6), (7, 7), (8, 8), (9, 9), + (10, 2), (11, 2), (12, 2), (13, 3), (14, 4), + (15, 5), (16, 2), (17, 2), (18, 2), (19, 9) + """ + + // Insert enough rows to exercise multiple pipeline tasks + sql """ + INSERT INTO rqg_t1 VALUES + (0, 0, 10, 0), (1, 1, 11, 1), (2, 2, 12, 2), (3, 3, 13, 3), + (4, 4, 14, 4), (5, 5, 15, 5), (6, 6, 16, 6), (7, 7, 17, 7), + (8, 8, 18, 8), (9, 9, 19, 9), (10, 0, 20, 10), (11, 1, 21, 11), + (12, 2, 22, 12), (13, 3, 23, 13), (14, 4, 24, 14), (15, 5, 25, 15), + (16, 6, 26, 16), (17, 7, 27, 17), (18, 8, 28, 18), (19, 9, 29, 19) + """ + + sql """ + INSERT INTO rqg_t2 VALUES + (0, 0, 10, 100, 1.5), (1, 1, 11, 101, 2.5), (2, 2, 12, 102, 3.5), + (3, 3, 13, 103, 4.5), (4, 4, 14, 104, 5.5), (5, 5, 15, 105, 6.5), + (6, 6, 16, 106, 7.5), (7, 7, 17, 107, 8.5), (8, 8, 18, 108, 9.5), + (9, 9, 19, 109, 10.5), (10, 0, 20, 110, 11.5), (11, 1, 21, 111, 12.5), + (12, 2, 22, 112, 13.5), (13, 3, 23, 113, 14.5), (14, 4, 24, 114, 15.5), + (15, 5, 25, 115, 16.5), (16, 6, 26, 116, 17.5), (17, 7, 27, 117, 18.5), + (18, 8, 28, 118, 19.5), (19, 9, 29, 119, 20.5) + """ + + // Wait for data to be visible + Thread.sleep(5000) + + // ============================================================ + // Common settings + // ============================================================ + sql "SET enable_nereids_planner=true" + sql "SET enable_fallback_to_original_planner=false" + sql "SET runtime_filter_mode=off" + sql "SET enable_profile=true" + sql "SET enable_sql_cache=false" + sql "SET enable_local_shuffle=true" + + // ============================================================ + // Bug 1: must set shared state, in AGGREGATION_OPERATOR + // RQG case: eliminate_group_by_uniform.case_id_11007680713 + // Key conditions: use_serial_exchange=true, parallel_pipeline_task_num=3 + // SQL: EXCEPT with count(*) GROUP BY on both sides + // ============================================================ + + // Test with FE planner (the buggy path) + logger.info("=== Bug 1a: AGG shared state - EXCEPT with serial exchange (FE planner) ===") + try { + sql """ + SELECT /*+SET_VAR(use_serial_exchange=true,parallel_pipeline_task_num=3, + enable_local_shuffle_planner=true, + ignore_storage_data_distribution=true, + enable_common_expr_pushdown=false, + disable_streaming_preaggregations=true)*/ + col_int_undef_signed_not_null as col1, + col_int_undef_signed_not_null as col2, + 0 as col3, count(1) + FROM rqg_t1 + GROUP BY col1, col2, col3 + EXCEPT + SELECT col_bigint_undef_signed_not_null as col1, + col_decimal_38_10__undef_signed_not_null as col2, + 5 as col3, count(1) + FROM rqg_t2 + GROUP BY col1, col2, col3 + """ + logger.info("Bug 1a: PASSED (no crash)") + } catch (Throwable t) { + logger.error("Bug 1a FAILED: ${t.message}") + assertTrue(false, "Bug 1a: must set shared state in AGGREGATION_OPERATOR: ${t.message}") + } + + // Compare with BE native planner + logger.info("=== Bug 1b: AGG shared state - EXCEPT with serial exchange (BE native) ===") + try { + sql """ + SELECT /*+SET_VAR(use_serial_exchange=true,parallel_pipeline_task_num=3, + enable_local_shuffle_planner=false, + ignore_storage_data_distribution=true, + enable_common_expr_pushdown=false, + disable_streaming_preaggregations=true)*/ + col_int_undef_signed_not_null as col1, + col_int_undef_signed_not_null as col2, + 0 as col3, count(1) + FROM rqg_t1 + GROUP BY col1, col2, col3 + EXCEPT + SELECT col_bigint_undef_signed_not_null as col1, + col_decimal_38_10__undef_signed_not_null as col2, + 5 as col3, count(1) + FROM rqg_t2 + GROUP BY col1, col2, col3 + """ + logger.info("Bug 1b: PASSED (no crash)") + } catch (Throwable t) { + logger.error("Bug 1b FAILED: ${t.message}") + assertTrue(false, "Bug 1b: BE native also fails: ${t.message}") + } + + // ============================================================ + // Bug 2: must set shared state, in SORT_OPERATOR + // RQG case: grouping_set.case_id_5308471751 + // Key conditions: use_serial_exchange=true, parallel_pipeline_task_num=5 + // SQL: GROUPING SETS + window function (PERCENT_RANK) + // ============================================================ + + logger.info("=== Bug 2a: SORT shared state - GROUPING SETS + window (FE planner) ===") + try { + sql """ + SELECT /*+SET_VAR(use_serial_exchange=true,parallel_pipeline_task_num=5, + enable_local_shuffle_planner=true, + ignore_storage_data_distribution=true, + enable_share_hash_table_for_broadcast_join=false, + disable_streaming_preaggregations=true)*/ + SUM(PERCENT_RANK() OVER (PARTITION BY col_int_undef_signed2 ORDER BY col_int_undef_signed2)) + FROM rqg_t1 + GROUP BY GROUPING SETS ((col_int_undef_signed2),(pk, pk),(col_int_undef_signed)) + """ + logger.info("Bug 2a: PASSED (no crash)") + } catch (Throwable t) { + logger.error("Bug 2a FAILED: ${t.message}") + assertTrue(false, "Bug 2a: must set shared state in SORT_OPERATOR: ${t.message}") + } + + logger.info("=== Bug 2b: SORT shared state - GROUPING SETS + window (BE native) ===") + try { + sql """ + SELECT /*+SET_VAR(use_serial_exchange=true,parallel_pipeline_task_num=5, + enable_local_shuffle_planner=false, + ignore_storage_data_distribution=true, + enable_share_hash_table_for_broadcast_join=false, + disable_streaming_preaggregations=true)*/ + SUM(PERCENT_RANK() OVER (PARTITION BY col_int_undef_signed2 ORDER BY col_int_undef_signed2)) + FROM rqg_t1 + GROUP BY GROUPING SETS ((col_int_undef_signed2),(pk, pk),(col_int_undef_signed)) + """ + logger.info("Bug 2b: PASSED (no crash)") + } catch (Throwable t) { + logger.error("Bug 2b FAILED: ${t.message}") + assertTrue(false, "Bug 2b: BE native also fails: ${t.message}") + } + + // ============================================================ + // Bug 3: incorrect results with GROUPING SETS + scalar subquery + window + // RQG case: grouping_set.case_id_5694495756 + // Key conditions: parallel_pipeline_task_num=2, disable_streaming_preaggregations=true + // Expected: all rows same value; Actual: values split proportionally (1/3, 2/3) + // ============================================================ + + logger.info("=== Bug 3: incorrect results - GROUPING SETS + subquery + window ===") + // FE planner + def result_fe = sql """ + SELECT /*+SET_VAR(parallel_pipeline_task_num=2, + enable_local_shuffle_planner=true, + disable_streaming_preaggregations=true, + enable_share_hash_table_for_broadcast_join=true)*/ + SUM((SELECT MAX(col_int_undef_signed2) FROM rqg_t1)) + OVER (PARTITION BY pk ORDER BY pk) + FROM rqg_t1 + GROUP BY GROUPING SETS ((col_int_undef_signed2, pk),(pk), (pk)) + """ + // BE native + def result_be = sql """ + SELECT /*+SET_VAR(parallel_pipeline_task_num=2, + enable_local_shuffle_planner=false, + disable_streaming_preaggregations=true, + enable_share_hash_table_for_broadcast_join=true)*/ + SUM((SELECT MAX(col_int_undef_signed2) FROM rqg_t1)) + OVER (PARTITION BY pk ORDER BY pk) + FROM rqg_t1 + GROUP BY GROUPING SETS ((col_int_undef_signed2, pk),(pk), (pk)) + """ + logger.info("Bug 3 FE result rows: ${result_fe.size()}, first few: ${result_fe.take(5)}") + logger.info("Bug 3 BE result rows: ${result_be.size()}, first few: ${result_be.take(5)}") + + // All values in both should be the same + if (result_fe.size() != result_be.size()) { + logger.warn("Bug 3: row count mismatch FE=${result_fe.size()} vs BE=${result_be.size()}") + } + + // ============================================================ + // Bug 4: Simplified AGG shared state — single table GROUP BY with serial exchange + // Minimal reproduction attempt + // ============================================================ + + logger.info("=== Bug 4: Simplified AGG shared state ===") + for (int ppt : [2, 3, 4, 5]) { + try { + sql """ + SELECT /*+SET_VAR(use_serial_exchange=true,parallel_pipeline_task_num=${ppt}, + enable_local_shuffle_planner=true, + ignore_storage_data_distribution=true)*/ + col_int_undef_signed, count(*) + FROM rqg_t1 + GROUP BY col_int_undef_signed + UNION ALL + SELECT col_int_undef_signed, count(*) + FROM rqg_t2 + GROUP BY col_int_undef_signed + """ + logger.info("Bug 4 ppt=${ppt}: PASSED") + } catch (Throwable t) { + logger.error("Bug 4 ppt=${ppt} FAILED: ${t.message}") + } + } + + // ============================================================ + // Bug 5: GROUPING SETS + window variations with serial exchange + // More variations to find minimal repro + // ============================================================ + + logger.info("=== Bug 5: GROUPING SETS + window variations ===") + for (int ppt : [2, 3, 4, 5]) { + try { + sql """ + SELECT /*+SET_VAR(use_serial_exchange=true,parallel_pipeline_task_num=${ppt}, + enable_local_shuffle_planner=true, + ignore_storage_data_distribution=true)*/ + pk, col_int_undef_signed, + ROW_NUMBER() OVER (ORDER BY pk) + FROM rqg_t1 + GROUP BY GROUPING SETS ((pk, col_int_undef_signed), (pk), ()) + ORDER BY pk + """ + logger.info("Bug 5 ppt=${ppt}: PASSED") + } catch (Throwable t) { + logger.error("Bug 5 ppt=${ppt} FAILED: ${t.message}") + } + } + + // ============================================================ + // Bug 6: must set shared state, in CROSS_JOIN_OPERATOR + // Root cause: nested NLJ + pooling scan — FE planner skipped BROADCAST + // local exchange on outer NLJ's build side because child was NLJ (not ScanNode). + // Fixed in NestedLoopJoinNode.enforceAndDeriveLocalExchange by using + // fragment.useSerialSource() instead of instanceof ScanNode check. + // This was the root cause of 989 RQG test failures (build 183677). + // ============================================================ + + logger.info("=== Bug 6: CROSS_JOIN shared state - nested NLJ + pooling scan (FE planner) ===") + try { + sql """ + SELECT /*+SET_VAR(ignore_storage_data_distribution=true, + parallel_pipeline_task_num=4, + enable_local_shuffle_planner=true, + disable_join_reorder=true, + disable_colocate_plan=true, + auto_broadcast_join_threshold=-1, + broadcast_row_count_limit=0, + query_timeout=60)*/ + count(a.pk) AS cnt, a.col_int_undef_signed + FROM rqg_t1 a + LEFT JOIN rqg_t1 b ON b.col_int_undef_signed >= b.col_int_undef_signed + LEFT JOIN rqg_t1 c ON b.pk >= b.pk + WHERE a.pk IS NOT NULL + GROUP BY a.col_int_undef_signed + ORDER BY cnt, a.col_int_undef_signed + """ + logger.info("Bug 6: PASSED (no CROSS_JOIN_OPERATOR shared state error)") + } catch (Throwable t) { + logger.error("Bug 6 FAILED: ${t.message}") + assertTrue(false, "Bug 6: must set shared state in CROSS_JOIN_OPERATOR: ${t.message}") + } + + // ============================================================ + // Bug 7: DataStreamSink hang — sender fragment with pooling scan + // Root cause: FE planner did not insert PASSTHROUGH at the root of pooling scan + // sender fragments. With pooling scan, only instance 0 creates pipeline tasks, + // so only 1 EOS is sent. The downstream ExchangeNode expects _num_instances EOSes + // and hangs indefinitely. + // Fixed in AddLocalExchange.addLocalExchangeForFragment: insert PASSTHROUGH + // when isLocalShuffle && newRoot.isSerialOperator(). + // Any NLJ + pooling scan query triggers this via the UNPARTITIONED sender fragments. + // ============================================================ + + logger.info("=== Bug 7: DataStreamSink hang - NLJ + pooling scan sender (FE planner) ===") + try { + sql """ + SELECT /*+SET_VAR(ignore_storage_data_distribution=true, + parallel_pipeline_task_num=4, + enable_local_shuffle_planner=true, + disable_join_reorder=true, + disable_colocate_plan=true, + auto_broadcast_join_threshold=-1, + broadcast_row_count_limit=0, + query_timeout=60)*/ + a.col_int_undef_signed, MAX(a.pk) AS mx + FROM rqg_t1 a + LEFT JOIN rqg_t1 b ON b.col_int_undef_signed < b.col_int_undef_signed + WHERE a.pk IS NOT NULL + GROUP BY a.col_int_undef_signed + ORDER BY a.col_int_undef_signed, mx + """ + logger.info("Bug 7: PASSED (no hang)") + } catch (Throwable t) { + logger.error("Bug 7 FAILED: ${t.message}") + assertTrue(false, "Bug 7: DataStreamSink hang (query timed out or crashed): ${t.message}") + } + + // ============================================================ + // Bug 8: must set shared state, in SORT_OPERATOR / UNION_OPERATOR + // Root cause: FE planner + pooling scan + GROUPING SETS. Serial UNPARTITIONED + // Exchange reduces downstream pipeline num_tasks to 1. SORT and UNION operators + // need _num_instances tasks to inject shared state for all instances. + // Fixed by: (1) restoring num_tasks raise for non-scan serial operators in BE + // deferred exchanger creation (commit 920d43d), and (2) FE inserting PASSTHROUGH + // after serial ExchangeNode in pooling scan fragments (commit d2e7fa2). + // ============================================================ + + logger.info("=== Bug 8a: SORT/UNION shared state - GROUPING SETS + pooling scan (FE planner) ===") + try { + sql """ + SELECT /*+SET_VAR(ignore_storage_data_distribution=true, + parallel_pipeline_task_num=4, + enable_local_shuffle_planner=true, + disable_streaming_preaggregations=true, + query_timeout=60)*/ + pk, col_int_undef_signed, SUM(col_int_undef_signed_not_null) AS sv + FROM rqg_t1 + GROUP BY GROUPING SETS ((pk, col_int_undef_signed), (pk), ()) + ORDER BY pk, col_int_undef_signed, sv + """ + logger.info("Bug 8a: PASSED (no SORT/UNION_OPERATOR shared state error)") + } catch (Throwable t) { + logger.error("Bug 8a FAILED: ${t.message}") + assertTrue(false, "Bug 8a: must set shared state in SORT/UNION_OPERATOR: ${t.message}") + } + + logger.info("=== Bug 8b: SORT shared state - window + GROUPING SETS + pooling scan (FE planner) ===") + try { + sql """ + SELECT /*+SET_VAR(ignore_storage_data_distribution=true, + parallel_pipeline_task_num=4, + enable_local_shuffle_planner=true, + disable_streaming_preaggregations=true, + query_timeout=60)*/ + pk, SUM(col_int_undef_signed_not_null) AS sv, + ROW_NUMBER() OVER (ORDER BY pk) AS rn + FROM rqg_t1 + GROUP BY GROUPING SETS ((pk), ()) + ORDER BY pk, sv, rn + """ + logger.info("Bug 8b: PASSED (no SORT_OPERATOR shared state error)") + } catch (Throwable t) { + logger.error("Bug 8b FAILED: ${t.message}") + assertTrue(false, "Bug 8b: must set shared state in SORT_OPERATOR (window+grouping_sets): ${t.message}") + } + + // ============================================================ + // Bug 9: FE/BE result inconsistency — agg after NLJ + pooling scan + // Root cause: StreamingAgg used fragment.useSerialSource()=true to require + // PASSTHROUGH from child, but when child is NLJ (not directly a serial scan), + // NLJ outputs ADAPTIVE_PASSTHROUGH. FE wrongly inserted an extra PASSTHROUGH + // exchange between StreamingAgg and NLJ (5 extra LOCAL_EXCHANGE_SINK_OPERATOR + // entries vs BE native). + // Fixed in AggregationNode: only requirePassthrough when + // children.get(0).isSerialOperator()=true, mirroring BE _child->is_serial_operator(). + // ============================================================ + + logger.info("=== Bug 9: FE/BE result consistency - agg after NLJ + pooling scan ===") + def bug9_fe = sql """ + SELECT /*+SET_VAR(ignore_storage_data_distribution=true, + parallel_pipeline_task_num=4, + enable_local_shuffle_planner=true, + disable_join_reorder=true, + disable_colocate_plan=true, + auto_broadcast_join_threshold=-1, + broadcast_row_count_limit=0)*/ + a.col_int_undef_signed, MAX(a.pk) AS mx + FROM rqg_t1 a LEFT JOIN rqg_t1 b ON b.col_int_undef_signed < b.col_int_undef_signed + WHERE a.pk IS NOT NULL + GROUP BY a.col_int_undef_signed + ORDER BY a.col_int_undef_signed, mx + """ + def bug9_be = sql """ + SELECT /*+SET_VAR(ignore_storage_data_distribution=true, + parallel_pipeline_task_num=4, + enable_local_shuffle_planner=false, + disable_join_reorder=true, + disable_colocate_plan=true, + auto_broadcast_join_threshold=-1, + broadcast_row_count_limit=0)*/ + a.col_int_undef_signed, MAX(a.pk) AS mx + FROM rqg_t1 a LEFT JOIN rqg_t1 b ON b.col_int_undef_signed < b.col_int_undef_signed + WHERE a.pk IS NOT NULL + GROUP BY a.col_int_undef_signed + ORDER BY a.col_int_undef_signed, mx + """ + logger.info("Bug 9 FE rows: ${bug9_fe.size()}, BE rows: ${bug9_be.size()}") + assertEquals(bug9_be.size(), bug9_fe.size(), "Bug 9: FE/BE row count mismatch") + assertEquals(bug9_be, bug9_fe, "Bug 9: FE/BE result mismatch for agg after NLJ + pooling scan") + logger.info("Bug 9: PASSED (FE/BE results match)") + + // ============================================================ + // Bug 10: GLOBAL_HASH_SHUFFLE Rows mismatched — self-join + NLJ + // RQG case: 906784672 (build 184181) + // Root cause: HashJoinNode used requireGlobalExecutionHash() → GLOBAL local exchange + // inserted when use_serial_exchange=true; shuffle_idx_to_instance_idx map has only + // 4 entries (1/BE) but GLOBAL hash needs N*dop entries → most rows unrouted (0 actual rows). + // Fixed: changed to requireHash() so resolveExchangeType() downgrades to LOCAL hash. + // SQL: self-join (table1 LEFT JOIN table1 table2 ON pk=col_bigint_undef_signed) + // then NLJ (LEFT JOIN table1 table3 ON pk > col_bigint_undef_signed) + // ============================================================ + + logger.info("=== Bug 10: GLOBAL_HASH_SHUFFLE Rows mismatched - self-join + NLJ (build 184181 case 906784672) ===") + def bug10_fe = sql """ + SELECT /*+SET_VAR(use_serial_exchange=true, parallel_pipeline_task_num=4, + enable_local_shuffle_planner=true, + ignore_storage_data_distribution=true, + enable_sql_cache=false, + disable_join_reorder=true, disable_colocate_plan=true)*/ + table1.pk AS field1, table1.col_bigint_undef_signed AS field2 + FROM rqg_t3 AS table1 + LEFT JOIN rqg_t3 AS table2 ON table1.pk = table2.col_bigint_undef_signed + LEFT JOIN rqg_t3 AS table3 ON table1.pk > table2.col_bigint_undef_signed + WHERE (table1.col_varchar_10__undef_signed > 'AHlvNtoGLO' + AND table1.col_varchar_10__undef_signed < 'zzzz') + OR (table1.col_bigint_undef_signed = table1.pk AND table1.col_varchar_64__undef_signed IS NULL) + OR (table1.pk != table1.pk AND table1.pk <> 2) + GROUP BY field1, field2 + ORDER BY field1, field2 + """ + def bug10_be = sql """ + SELECT /*+SET_VAR(use_serial_exchange=true, parallel_pipeline_task_num=4, + enable_local_shuffle_planner=false, + ignore_storage_data_distribution=true, + enable_sql_cache=false, + disable_join_reorder=true, disable_colocate_plan=true)*/ + table1.pk AS field1, table1.col_bigint_undef_signed AS field2 + FROM rqg_t3 AS table1 + LEFT JOIN rqg_t3 AS table2 ON table1.pk = table2.col_bigint_undef_signed + LEFT JOIN rqg_t3 AS table3 ON table1.pk > table2.col_bigint_undef_signed + WHERE (table1.col_varchar_10__undef_signed > 'AHlvNtoGLO' + AND table1.col_varchar_10__undef_signed < 'zzzz') + OR (table1.col_bigint_undef_signed = table1.pk AND table1.col_varchar_64__undef_signed IS NULL) + OR (table1.pk != table1.pk AND table1.pk <> 2) + GROUP BY field1, field2 + ORDER BY field1, field2 + """ + logger.info("Bug 10 FE rows: ${bug10_fe.size()}, BE rows: ${bug10_be.size()}") + assertEquals(bug10_be.size(), bug10_fe.size(), "Bug 10: FE/BE row count mismatch (GLOBAL_HASH_SHUFFLE Rows mismatched)") + assertEquals(bug10_be, bug10_fe, "Bug 10: FE/BE result mismatch for self-join + NLJ") + logger.info("Bug 10: PASSED") + + // ============================================================ + // Bug 11: GLOBAL_HASH_SHUFFLE Rows mismatched — FULL OUTER JOIN + GROUP BY + // RQG case: 11007681241 (build 184181) + // Same root cause as Bug 10. + // SQL: FULL OUTER JOIN on col_bigint_undef_signed_not_null with WHERE + GROUP BY + // ============================================================ + + logger.info("=== Bug 11: GLOBAL_HASH_SHUFFLE Rows mismatched - FULL OUTER JOIN + GROUP BY (build 184181 case 11007681241) ===") + def bug11_fe = sql """ + SELECT /*+SET_VAR(use_serial_exchange=true, parallel_pipeline_task_num=4, + enable_local_shuffle_planner=true, + ignore_storage_data_distribution=true, + enable_sql_cache=false)*/ + t1.col_bigint_undef_signed_not_null, t2.col_bigint_undef_signed_not_null, count(1) + FROM rqg_t4 t1 + FULL OUTER JOIN rqg_t2 t2 + ON t1.col_bigint_undef_signed_not_null = t2.col_bigint_undef_signed_not_null + WHERE t2.col_bigint_undef_signed_not_null = 2 + GROUP BY t1.col_bigint_undef_signed_not_null, t2.col_bigint_undef_signed_not_null + ORDER BY 1, 2, 3 + """ + def bug11_be = sql """ + SELECT /*+SET_VAR(use_serial_exchange=true, parallel_pipeline_task_num=4, + enable_local_shuffle_planner=false, + ignore_storage_data_distribution=true, + enable_sql_cache=false)*/ + t1.col_bigint_undef_signed_not_null, t2.col_bigint_undef_signed_not_null, count(1) + FROM rqg_t4 t1 + FULL OUTER JOIN rqg_t2 t2 + ON t1.col_bigint_undef_signed_not_null = t2.col_bigint_undef_signed_not_null + WHERE t2.col_bigint_undef_signed_not_null = 2 + GROUP BY t1.col_bigint_undef_signed_not_null, t2.col_bigint_undef_signed_not_null + ORDER BY 1, 2, 3 + """ + logger.info("Bug 11 FE rows: ${bug11_fe.size()}, BE rows: ${bug11_be.size()}") + assertEquals(bug11_be.size(), bug11_fe.size(), "Bug 11: FE/BE row count mismatch (GLOBAL_HASH_SHUFFLE Rows mismatched)") + assertEquals(bug11_be, bug11_fe, "Bug 11: FE/BE result mismatch for FULL OUTER JOIN + GROUP BY") + logger.info("Bug 11: PASSED") + + // ============================================================ + // Bug 12: GLOBAL_HASH_SHUFFLE Rows mismatched — LEFT JOIN + VARCHAR predicates + MIN() + // RQG case: 906784662 (build 184181) + // Same root cause as Bug 10/11. + // SQL: LEFT JOIN on pk with VARCHAR NOT IN / BETWEEN / IN predicates, MIN() aggregate + // ============================================================ + + logger.info("=== Bug 12: GLOBAL_HASH_SHUFFLE Rows mismatched - LEFT JOIN + VARCHAR predicates (build 184181 case 906784662) ===") + def bug12_fe = sql """ + SELECT /*+SET_VAR(use_serial_exchange=true, parallel_pipeline_task_num=4, + enable_local_shuffle_planner=true, + ignore_storage_data_distribution=true, + enable_sql_cache=false, + disable_join_reorder=true, disable_colocate_plan=true)*/ + table1.pk AS field1, MIN(table1.pk) AS field2 + FROM rqg_t3 AS table1 + LEFT JOIN rqg_t1 AS table2 ON table2.pk = table1.pk + WHERE table1.col_varchar_64__undef_signed NOT IN ('they') + AND table1.col_varchar_10__undef_signed BETWEEN 'AHlvNtoGLO' AND 'z' + AND table1.pk IN (3, 6, 8, 9, 2) + GROUP BY field1 + ORDER BY field1, field2 ASC + """ + def bug12_be = sql """ + SELECT /*+SET_VAR(use_serial_exchange=true, parallel_pipeline_task_num=4, + enable_local_shuffle_planner=false, + ignore_storage_data_distribution=true, + enable_sql_cache=false, + disable_join_reorder=true, disable_colocate_plan=true)*/ + table1.pk AS field1, MIN(table1.pk) AS field2 + FROM rqg_t3 AS table1 + LEFT JOIN rqg_t1 AS table2 ON table2.pk = table1.pk + WHERE table1.col_varchar_64__undef_signed NOT IN ('they') + AND table1.col_varchar_10__undef_signed BETWEEN 'AHlvNtoGLO' AND 'z' + AND table1.pk IN (3, 6, 8, 9, 2) + GROUP BY field1 + ORDER BY field1, field2 ASC + """ + logger.info("Bug 12 FE rows: ${bug12_fe.size()}, BE rows: ${bug12_be.size()}") + assertEquals(bug12_be.size(), bug12_fe.size(), "Bug 12: FE/BE row count mismatch (GLOBAL_HASH_SHUFFLE Rows mismatched)") + assertEquals(bug12_be, bug12_fe, "Bug 12: FE/BE result mismatch for LEFT JOIN + VARCHAR predicates") + logger.info("Bug 12: PASSED") + + // ============================================================ + // Bug 13: NLJ COREDUMP — serial NLJ + pooling scan + BROADCAST build side + // RQG build 184430, query c0dafc1bed0f4910 + // Root cause: serial NLJ (RIGHT_OUTER) with pooling scan inserted BROADCAST + // local exchange on build side, inflating build pipeline num_tasks to _num_instances + // while probe pipeline stayed at 1 task. Instance 1+ created build tasks without + // corresponding probe tasks → source_deps empty → set_ready_to_read() crash. + // Fixed: serial NLJ sets buildSideRequire=noRequire() to match BE-native + // num_tasks_of_parent()<=1 skip logic. + // ============================================================ + + logger.info("=== Bug 13: NLJ COREDUMP - serial NLJ + pooling scan (FE planner) ===") + try { + def bug13_fe = sql """ + SELECT /*+SET_VAR(use_serial_exchange=true, parallel_pipeline_task_num=0, + enable_local_shuffle_planner=true, + ignore_storage_data_distribution=true, + enable_sql_cache=false, + enable_share_hash_table_for_broadcast_join=false, + disable_streaming_preaggregations=true, + disable_join_reorder=true)*/ + t2.col_bigint_undef_signed_not_null AS field1 + FROM rqg_t4 AS t1 + RIGHT OUTER JOIN rqg_t2 AS t2 ON t1.col_bigint_undef_signed_not_null > t2.col_bigint_undef_signed_not_null + GROUP BY field1 + ORDER BY field1 ASC + """ + def bug13_be = sql """ + SELECT /*+SET_VAR(use_serial_exchange=true, parallel_pipeline_task_num=0, + enable_local_shuffle_planner=false, + ignore_storage_data_distribution=true, + enable_sql_cache=false, + enable_share_hash_table_for_broadcast_join=false, + disable_streaming_preaggregations=true, + disable_join_reorder=true)*/ + t2.col_bigint_undef_signed_not_null AS field1 + FROM rqg_t4 AS t1 + RIGHT OUTER JOIN rqg_t2 AS t2 ON t1.col_bigint_undef_signed_not_null > t2.col_bigint_undef_signed_not_null + GROUP BY field1 + ORDER BY field1 ASC + """ + logger.info("Bug 13 FE rows: ${bug13_fe.size()}, BE rows: ${bug13_be.size()}") + assertEquals(bug13_be.size(), bug13_fe.size(), "Bug 13: FE/BE row count mismatch (NLJ COREDUMP)") + assertEquals(bug13_be, bug13_fe, "Bug 13: FE/BE result mismatch for serial NLJ + pooling scan") + logger.info("Bug 13: PASSED (no crash, results match)") + } catch (Throwable t) { + logger.error("Bug 13 FAILED: ${t.message}") + assertTrue(false, "Bug 13: NLJ COREDUMP (serial NLJ + pooling scan): ${t.message}") + } + + // ============================================================ + // Bug 14: BUCKET_SHUFFLE join + serial build Exchange — must set shared state + // RQG build 184563, cases 906784706/906784783/906784987/906785006 + // Root cause: BUCKET_SHUFFLE join build side ExchangeNode marked serial in + // pooling scan fragment → build pipeline num_tasks reduced to 1 → + // instance 1+ have probe tasks without build tasks → shared state injection + // fails. Fixed: isBucketShuffle() branch checks buildChildSerial and uses + // requirePassToOne() to restore num_tasks, matching BE-native behavior. + // Requires replication_num=3 + [shuffle] hint to force BUCKET_SHUFFLE plan. + // ============================================================ + + logger.info("=== Bug 14: BUCKET_SHUFFLE join + serial build Exchange (FE planner) ===") + // Need replication_num=3 for BUCKET_SHUFFLE. Check if allow_replica_on_same_host is enabled. + def allowSameHost = sql "ADMIN SHOW FRONTEND CONFIG LIKE 'allow_replica_on_same_host'" + if (allowSameHost[0][1].toString() == "true") { + sql "DROP TABLE IF EXISTS rqg_t5_rep3" + sql "DROP TABLE IF EXISTS rqg_t6_rep3" + try { + sql """ + CREATE TABLE rqg_t5_rep3 ( + pk INT NULL, + col_varchar_10__undef_signed VARCHAR(10) NULL, + col_bigint_undef_signed BIGINT NULL, + col_varchar_64__undef_signed VARCHAR(64) NULL + ) DUPLICATE KEY(pk, col_varchar_10__undef_signed) + DISTRIBUTED BY HASH(pk) BUCKETS 10 + PROPERTIES ("replication_num" = "3") + """ + sql """ + CREATE TABLE rqg_t6_rep3 ( + pk INT NULL, + col_varchar_10__undef_signed VARCHAR(10) NULL, + col_bigint_undef_signed BIGINT NULL, + col_varchar_64__undef_signed VARCHAR(64) NULL + ) DUPLICATE KEY(pk, col_varchar_10__undef_signed) + DISTRIBUTED BY HASH(pk) BUCKETS 10 + PROPERTIES ("replication_num" = "3") + """ + sql """ + INSERT INTO rqg_t5_rep3 VALUES + (0,'abc',-94,'hello'),(1,'xyz',672609,null),(2,'pqr',-3766684,'test'), + (3,'abc',5070261,'another'),(4,'def',null,'value'),(5,'so',-86,null), + (6,'abc',21910,'they'),(7,'zzzz',-63,'some'),(8,'xPLflvBEcW',-8276281,'longer'), + (9,'mid',-101,'final') + """ + sql """ + INSERT INTO rqg_t6_rep3 VALUES + (0,'aaa',100,'world'),(1,'bbb',200,null),(2,'ccc',300,'foo'), + (3,'ddd',400,'bar'),(4,'eee',500,'baz'),(5,'fff',600,null), + (6,'ggg',700,'qux'),(7,'hhh',800,'quux'),(8,'iii',900,'corge'), + (9,'jjj',1000,'grault') + """ + Thread.sleep(3000) + + def bug14_fe = sql """ + SELECT /*+SET_VAR(enable_local_shuffle_planner=true, + parallel_pipeline_task_num=3, + disable_streaming_preaggregations=true, + enable_sql_cache=false, + disable_join_reorder=true)*/ + table1.pk AS field1 + FROM rqg_t5_rep3 AS table1 + LEFT OUTER JOIN [shuffle] rqg_t6_rep3 AS table2 ON table1.pk = table2.pk + WHERE table1.col_varchar_10__undef_signed >= 'so' + GROUP BY field1 + ORDER BY field1 + """ + def bug14_be = sql """ + SELECT /*+SET_VAR(enable_local_shuffle_planner=false, + parallel_pipeline_task_num=3, + disable_streaming_preaggregations=true, + enable_sql_cache=false, + disable_join_reorder=true)*/ + table1.pk AS field1 + FROM rqg_t5_rep3 AS table1 + LEFT OUTER JOIN [shuffle] rqg_t6_rep3 AS table2 ON table1.pk = table2.pk + WHERE table1.col_varchar_10__undef_signed >= 'so' + GROUP BY field1 + ORDER BY field1 + """ + logger.info("Bug 14 FE rows: ${bug14_fe.size()}, BE rows: ${bug14_be.size()}") + assertEquals(bug14_be.size(), bug14_fe.size(), "Bug 14: FE/BE row count mismatch (BUCKET_SHUFFLE must set shared state)") + assertEquals(bug14_be, bug14_fe, "Bug 14: FE/BE result mismatch for BUCKET_SHUFFLE + serial build Exchange") + logger.info("Bug 14: PASSED (no crash, results match)") + } catch (Throwable t) { + logger.error("Bug 14 FAILED: ${t.message}") + assertTrue(false, "Bug 14: BUCKET_SHUFFLE must set shared state: ${t.message}") + } + } else { + logger.info("Bug 14: SKIPPED (allow_replica_on_same_host not enabled, cannot create replication_num=3 tables)") + } + + // ============================================================ + // Bug 15: BUCKET_SHUFFLE join wrong results with serial exchange — PASS_TO_ONE data loss + // Root cause: When serial exchange feeds BUCKET_SHUFFLE join build side, + // PASS_TO_ONE routes ALL build data to task 0. Unlike BROADCAST joins, + // BUCKET_SHUFFLE has no shared hash table mechanism — tasks 1..N-1 build + // empty hash tables and lose rows during probe. Fixed by using + // BUCKET_HASH_SHUFFLE instead of PASS_TO_ONE for BUCKET_SHUFFLE build side. + // Tables use 3 buckets so pptn=4 triggers serial scan on single BE (3 < 4*1). + // ============================================================ + + logger.info("=== Bug 15: BUCKET_SHUFFLE join wrong results with serial PASS_TO_ONE ===") + + sql "DROP TABLE IF EXISTS rqg_t7_3bucket" + sql "DROP TABLE IF EXISTS rqg_t8_3bucket" + + sql """ + CREATE TABLE rqg_t7_3bucket ( + pk INT NOT NULL, + col_int INT NULL, + col_varchar VARCHAR(64) NULL + ) DUPLICATE KEY(pk) + DISTRIBUTED BY HASH(pk) BUCKETS 3 + PROPERTIES ("replication_num" = "1") + """ + + sql """ + CREATE TABLE rqg_t8_3bucket ( + pk INT NOT NULL, + col_int INT NULL, + col_varchar VARCHAR(64) NULL + ) DUPLICATE KEY(pk) + DISTRIBUTED BY HASH(pk) BUCKETS 3 + PROPERTIES ("replication_num" = "1") + """ + + sql """ + INSERT INTO rqg_t7_3bucket VALUES + (0, 10, 'aaa'), (1, 20, 'bbb'), (2, 30, 'ccc'), + (3, 40, 'ddd'), (4, 50, 'eee'), (5, 60, 'fff'), + (6, 70, 'ggg'), (7, 80, 'hhh'), (8, 90, 'iii'), (9, 100, 'jjj') + """ + + sql """ + INSERT INTO rqg_t8_3bucket VALUES + (0, 10, 'aaa'), (1, 20, 'bbb'), (2, 30, 'ccc'), + (3, 40, 'ddd'), (4, 50, 'eee'), (5, 60, 'fff'), + (6, 70, 'ggg'), (7, 80, 'hhh'), (8, 90, 'iii'), (9, 100, 'jjj') + """ + + Thread.sleep(3000) + + try { + // pptn=4 with 3 buckets on 1 BE: 3 < 4*1 → serial scan → serial exchange + // This triggers the PASS_TO_ONE bug for BUCKET_SHUFFLE build side. + // Also test with higher pptn values to cover more cases. + for (int ppt : [4, 6, 8]) { + def bug15_fe = sql """ + SELECT /*+SET_VAR(use_serial_exchange=true, + parallel_pipeline_task_num=${ppt}, + enable_local_shuffle_planner=true, + ignore_storage_data_distribution=true, + enable_share_hash_table_for_broadcast_join=false, + enable_sql_cache=false, + disable_join_reorder=true)*/ + t1.pk, t1.col_int, t2.col_varchar + FROM rqg_t7_3bucket t1 + INNER JOIN [shuffle] rqg_t8_3bucket t2 ON t1.pk = t2.pk + ORDER BY t1.pk + """ + def bug15_be = sql """ + SELECT /*+SET_VAR(use_serial_exchange=true, + parallel_pipeline_task_num=${ppt}, + enable_local_shuffle_planner=false, + ignore_storage_data_distribution=true, + enable_share_hash_table_for_broadcast_join=false, + enable_sql_cache=false, + disable_join_reorder=true)*/ + t1.pk, t1.col_int, t2.col_varchar + FROM rqg_t7_3bucket t1 + INNER JOIN [shuffle] rqg_t8_3bucket t2 ON t1.pk = t2.pk + ORDER BY t1.pk + """ + logger.info("Bug 15 ppt=${ppt}: FE rows=${bug15_fe.size()}, BE rows=${bug15_be.size()}") + assertEquals(10, bug15_fe.size(), "Bug 15 ppt=${ppt}: expected 10 rows from FE planner, got ${bug15_fe.size()}") + assertEquals(bug15_be, bug15_fe, "Bug 15 ppt=${ppt}: FE/BE result mismatch for BUCKET_SHUFFLE + serial exchange") + } + + // Also test LEFT OUTER JOIN to verify no rows lost on probe side + def bug15_left_fe = sql """ + SELECT /*+SET_VAR(use_serial_exchange=true, + parallel_pipeline_task_num=6, + enable_local_shuffle_planner=true, + ignore_storage_data_distribution=true, + enable_share_hash_table_for_broadcast_join=false, + enable_sql_cache=false, + disable_join_reorder=true)*/ + t1.pk, t2.col_int + FROM rqg_t7_3bucket t1 + LEFT OUTER JOIN [shuffle] rqg_t8_3bucket t2 ON t1.pk = t2.pk + ORDER BY t1.pk + """ + def bug15_left_be = sql """ + SELECT /*+SET_VAR(use_serial_exchange=true, + parallel_pipeline_task_num=6, + enable_local_shuffle_planner=false, + ignore_storage_data_distribution=true, + enable_share_hash_table_for_broadcast_join=false, + enable_sql_cache=false, + disable_join_reorder=true)*/ + t1.pk, t2.col_int + FROM rqg_t7_3bucket t1 + LEFT OUTER JOIN [shuffle] rqg_t8_3bucket t2 ON t1.pk = t2.pk + ORDER BY t1.pk + """ + assertEquals(10, bug15_left_fe.size(), "Bug 15 LEFT JOIN: expected 10 rows from FE planner") + assertEquals(bug15_left_be, bug15_left_fe, "Bug 15 LEFT JOIN: FE/BE result mismatch") + + logger.info("Bug 15: PASSED (no wrong results, all pptn values correct)") + } catch (Throwable t) { + logger.error("Bug 15 FAILED: ${t.message}") + assertTrue(false, "Bug 15: BUCKET_SHUFFLE wrong results with serial PASS_TO_ONE: ${t.message}") + } + + // ============================================================ + // Bug 16 & 17: Serial AnalyticEval crash and DataStreamSink hang + // with LocalShuffleAssignedJob (multiple instances on one BE) + // + // Bug 16 (crash): Exchange wraps itself with PASSTHROUGH LocalExchange. + // This restores AnalyticSink pipeline to _num_instances tasks while + // serial AnalyticSource stays at 1 task. For instance_idx > 0, + // source_deps is empty → DCHECK crash. + // + // Bug 17 (hang): After fixing the crash, serial AnalyticSource reduces + // all downstream pipeline tasks to 1 via add_pipeline() inheritance. + // Only instance 0 runs DataStreamSink → receiver expects _num_instances + // EOSes → hang. + // + // Both triggered by: OVER() with no PARTITION BY + GROUPING SETS + + // pptn=0 (auto-parallel) + disable_streaming_preaggregations=true + // RQG build 186195, query IDs: 7f3178a77c2c4b6b, 71887f7bf804c0c, 5dd9fcad234c4484 + // ============================================================ + sql "DROP TABLE IF EXISTS rqg_analytic_t1" + sql """ + CREATE TABLE rqg_analytic_t1 ( + pk INT NOT NULL, + col_int_undef_signed INT + ) ENGINE=OLAP + DUPLICATE KEY(pk) + DISTRIBUTED BY HASH(pk) BUCKETS 10 + PROPERTIES ("replication_num" = "1") + """ + sql """ + INSERT INTO rqg_analytic_t1 VALUES + (1, 10), (2, 20), (3, 30), (4, 40), (5, 50), + (6, 60), (7, 70), (8, 80), (9, 90), (10, 100), + (11, 10), (12, 20), (13, 30), (14, 40), (15, 50), + (16, 60), (17, 70), (18, 80), (19, 90), (20, 100) + """ + + try { + logger.info("Bug 16+17: Testing serial AnalyticEval with GROUPING SETS") + + // Baseline: pptn=1 (no multi-instance, no local shuffle) + def bug16_baseline = sql """ + SELECT /*+SET_VAR(parallel_pipeline_task_num=1, + enable_sql_cache=false, + disable_streaming_preaggregations=true)*/ + COUNT(MIN(col_int_undef_signed) OVER()) + FROM rqg_analytic_t1 + GROUP BY GROUPING SETS ((col_int_undef_signed, pk), (), (pk)) + ORDER BY 1 + """ + assertEquals(41, bug16_baseline.size(), "Bug 16 baseline: expected 41 rows") + + // Test with pptn=0 (auto-parallel, triggers LocalShuffleAssignedJob) + for (int ppt : [0, 2, 4, 8]) { + def bug16_result = sql """ + SELECT /*+SET_VAR(parallel_pipeline_task_num=${ppt}, + enable_local_shuffle_planner=true, + ignore_storage_data_distribution=true, + enable_sql_cache=false, + disable_streaming_preaggregations=true)*/ + COUNT(MIN(col_int_undef_signed) OVER()) + FROM rqg_analytic_t1 + GROUP BY GROUPING SETS ((col_int_undef_signed, pk), (), (pk)) + ORDER BY 1 + """ + assertEquals(bug16_baseline, bug16_result, + "Bug 16+17 pptn=${ppt}: result mismatch with serial AnalyticEval") + } + + // Also test with use_serial_exchange=true (makes ALL exchanges serial) + def bug16_serial = sql """ + SELECT /*+SET_VAR(use_serial_exchange=true, + parallel_pipeline_task_num=0, + enable_local_shuffle_planner=true, + ignore_storage_data_distribution=true, + enable_sql_cache=false, + disable_streaming_preaggregations=true)*/ + COUNT(MIN(col_int_undef_signed) OVER()) + FROM rqg_analytic_t1 + GROUP BY GROUPING SETS ((col_int_undef_signed, pk), (), (pk)) + ORDER BY 1 + """ + assertEquals(bug16_baseline, bug16_serial, + "Bug 16+17 serial_exchange: result mismatch") + + logger.info("Bug 16+17: PASSED (no crash, no hang, correct results)") + } catch (Throwable t) { + logger.error("Bug 16+17 FAILED: ${t.message}") + assertTrue(false, "Bug 16+17: Serial AnalyticEval crash/hang: ${t.message}") + } + + // Bug 18: DCHECK crash in Pipeline::set_num_tasks when PASSTHROUGH LE is inserted + // between serial NLJ and its child Exchange. + // Root cause: ExchangeNode.enforceAndDeriveLocalExchange wraps UNPARTITIONED serial + // Exchange with PASSTHROUGH LE. On BE, NLJ_PROBE (serial) sets pipeline num_tasks=1, + // then the LE handler's set_num_tasks(_num_instances) overrides it to N, triggering + // DCHECK (serial operator in pipeline with num_tasks > 1). + // Fix: skip PASSTHROUGH wrapping when hasSerialAncestorInPipeline is true. + // Query: LEFT JOIN with always-true self-ref condition (table.pk = table.pk) creates + // RIGHT_OUTER NLJ (serial). With pptn>1 and ignore_data_distribution, the fragment + // gets N instances but NLJ forces 1 task. + try { + logger.info("Bug 18: Testing serial NLJ with PASSTHROUGH LE crash") + // Use existing rqg_t1 table (10 rows, 10 buckets) + def bug18_baseline = sql """ + SELECT /*+SET_VAR(enable_local_shuffle_planner=false, + enable_sql_cache=false)*/ + table1.col_int_undef_signed AS field1 + FROM rqg_t1 AS table1 + LEFT JOIN rqg_t1 AS table2 + ON table2.pk = table2.pk + WHERE table1.pk BETWEEN 2 AND 11 + GROUP BY field1 + ORDER BY 1 + """ + + // Test with various pptn values — crash requires pptn > 1 + for (int ppt : [4, 7]) { + def bug18_result = sql """ + SELECT /*+SET_VAR(use_serial_exchange=false, + parallel_pipeline_task_num=${ppt}, + enable_local_shuffle_planner=true, + ignore_storage_data_distribution=true, + enable_sql_cache=false, + enable_share_hash_table_for_broadcast_join=false, + enable_broadcast_join_force_passthrough=true, + enable_parallel_scan=false)*/ + table1.col_int_undef_signed AS field1 + FROM rqg_t1 AS table1 + LEFT JOIN rqg_t1 AS table2 + ON table2.pk = table2.pk + WHERE table1.pk BETWEEN 2 AND 11 + GROUP BY field1 + ORDER BY 1 + """ + assertEquals(bug18_baseline, bug18_result, + "Bug 18 pptn=${ppt}: result mismatch with serial NLJ + local exchange") + } + logger.info("Bug 18: PASSED (no crash, correct results)") + } catch (Throwable t) { + logger.error("Bug 18 FAILED: ${t.message}") + assertTrue(false, "Bug 18: Serial NLJ PASSTHROUGH LE crash: ${t.message}") + } + + // Bug 19: source_deps.size()=0 crash in NLJ build sink. + // Root cause: serial NLJ (RIGHT_OUTER) resets serial ancestor flag for build side. + // Exchange(UNPARTITIONED) on build side sees hasSerialAncestorInPipeline=false and + // inserts PASSTHROUGH LE. This restores build pipeline num_tasks to _num_instances + // while probe pipeline stays at 1. The extra build tasks have NLJ shared state with + // empty source_deps → crash in set_ready_to_read(). + // Fix: shouldResetSerialFlagForChild(1) returns false when NLJ is serial. + // Differs from Bug 18 in fuzzy vars: enable_share_hash_table=true, broadcast_passthrough=false. + try { + logger.info("Bug 19: Testing serial NLJ build-side source_deps crash") + def bug19_baseline = sql """ + SELECT /*+SET_VAR(enable_local_shuffle_planner=false, + enable_sql_cache=false)*/ + table1.col_int_undef_signed AS field1 + FROM rqg_t1 AS table1 + LEFT JOIN rqg_t1 AS table2 + ON table2.pk = table2.pk + WHERE table1.pk BETWEEN 2 AND 11 + GROUP BY field1 + ORDER BY 1 + """ + + for (int ppt : [2, 4]) { + def bug19_result = sql """ + SELECT /*+SET_VAR(use_serial_exchange=false, + parallel_pipeline_task_num=${ppt}, + enable_local_shuffle_planner=true, + ignore_storage_data_distribution=true, + enable_sql_cache=false, + enable_share_hash_table_for_broadcast_join=true, + enable_broadcast_join_force_passthrough=false, + enable_parallel_scan=true, + disable_streaming_preaggregations=true)*/ + table1.col_int_undef_signed AS field1 + FROM rqg_t1 AS table1 + LEFT JOIN rqg_t1 AS table2 + ON table2.pk = table2.pk + WHERE table1.pk BETWEEN 2 AND 11 + GROUP BY field1 + ORDER BY 1 + """ + assertEquals(bug19_baseline, bug19_result, + "Bug 19 pptn=${ppt}: result mismatch with serial NLJ build side crash") + } + logger.info("Bug 19: PASSED (no crash, correct results)") + } catch (Throwable t) { + logger.error("Bug 19 FAILED: ${t.message}") + assertTrue(false, "Bug 19: Serial NLJ build-side source_deps crash: ${t.message}") + } + + // Bug 20: Hang (ASAN: COREDUMP source_deps.size()=0 in AggSinkOperatorX) when + // use_serial_exchange=true + RIGHT JOIN + GROUP BY in non-pooling fragment. + // Root cause: serial HASH Exchange in non-pooling fragment returned NOOP, causing FE + // to insert LOCAL_EXECUTION_HASH_SHUFFLE LE. On BE, serial Exchange pipeline has 1 task + // but LE downstream has _num_instances tasks. AggSink on instances 1+ has empty source_deps. + // Fix: ExchangeNode.enforceAndDeriveLocalExchange() returns actual distribution type + // (GLOBAL_EXECUTION_HASH_SHUFFLE/BUCKET_HASH_SHUFFLE) for serial Exchange in non-pooling + // fragments, preventing LE insertion. + // Requires 3+ BEs to reproduce (single BE has _num_instances=1, no hang). + try { + logger.info("Bug 20: Testing serial exchange + agg hang in non-pooling fragment") + // Baseline uses same fuzzy vars but with planner=false (BE-native). + // This way we compare FE-planned vs BE-native under identical conditions, + // not against "correct" results — use_serial_exchange=true itself may have + // pre-existing BE bugs with certain pptn values. + def bug20_baseline_sql = { int ppt -> """ + SELECT /*+SET_VAR(use_serial_exchange=true, + parallel_pipeline_task_num=${ppt}, + enable_local_shuffle_planner=false, + ignore_storage_data_distribution=true, + enable_sql_cache=false, + enable_share_hash_table_for_broadcast_join=false, + disable_streaming_preaggregations=true)*/ + table1.pk AS field1 + FROM rqg_t1 AS table1 + RIGHT OUTER JOIN rqg_t1 AS table2 ON table1.pk = table2.pk + LEFT JOIN rqg_t1 AS table3 ON table3.pk = table1.pk + WHERE table1.col_int_undef_signed IS NOT NULL OR table1.pk <> 10 + GROUP BY field1 + ORDER BY 1 + """ } + for (int ppt : [3, 4, 7]) { + def bug20_baseline = sql bug20_baseline_sql(ppt) + def bug20_result = sql """ + SELECT /*+SET_VAR(use_serial_exchange=true, + parallel_pipeline_task_num=${ppt}, + enable_local_shuffle_planner=true, + ignore_storage_data_distribution=true, + enable_sql_cache=false, + enable_share_hash_table_for_broadcast_join=false, + disable_streaming_preaggregations=true)*/ + table1.pk AS field1 + FROM rqg_t1 AS table1 + RIGHT OUTER JOIN rqg_t1 AS table2 ON table1.pk = table2.pk + LEFT JOIN rqg_t1 AS table3 ON table3.pk = table1.pk + WHERE table1.col_int_undef_signed IS NOT NULL OR table1.pk <> 10 + GROUP BY field1 + ORDER BY 1 + """ + assertEquals(bug20_baseline, bug20_result, + "Bug 20 pptn=${ppt}: result mismatch with serial exchange + agg hang") + } + logger.info("Bug 20: PASSED (no hang, correct results)") + } catch (Throwable t) { + logger.error("Bug 20 FAILED: ${t.message}") + assertTrue(false, "Bug 20: Serial exchange + agg hang: ${t.message}") + } + + // ============================================================ + // Bug 21: Multi-distinct COUNT on many-bucket table → COREDUMP + // RQG build 186737/186929/186952: AggSinkOperatorX::sink → set_ready_to_read + // with empty source_deps. + // + // Root cause: AGG operators (streaming, distinct-streaming, serialize) requested + // PASSTHROUGH from non-ScanNode serial children (Exchange, AGG), inserting a + // PASSTHROUGH LE that created a pipeline split disconnecting AggSink↔AggSource + // shared state. + // + // Fix: restrict AGG PASSTHROUGH requests to ScanNode children only. + // Triggered by: multi-distinct COUNT/MIN with MultiCastDataSinks feeding + // serial UNPARTITIONED Exchanges into streaming AGG fragments. + // ============================================================ + sql "DROP TABLE IF EXISTS rqg_t5_many_buckets" + sql """ + CREATE TABLE rqg_t5_many_buckets ( + pk INT NOT NULL, + col_int_undef_signed INT, + col_date_undef_signed DATE, + col_date_undef_signed2 DATE, + col_varchar_1024__undef_signed VARCHAR(1024) + ) ENGINE=OLAP + DUPLICATE KEY(pk) + DISTRIBUTED BY HASH(pk) BUCKETS 56 + PROPERTIES ("replication_num" = "1") + """ + sql """INSERT INTO rqg_t5_many_buckets VALUES + (1,1,'2023-12-09','2024-06-01','s1'),(2,2,'2023-03-15','2024-01-20','s2'), + (3,3,'2023-07-22','2024-03-10',NULL),(4,4,'2023-12-09','2024-06-01','s4'), + (5,5,'2023-01-05','2024-09-15','s5'),(6,6,'2023-08-11','2024-02-28','s6'), + (7,7,'2023-04-18','2024-07-04',NULL),(8,8,'2023-11-25','2024-05-12','s8'), + (9,9,'2023-06-30','2024-11-19','s9'),(10,10,'2023-02-14','2024-08-07','s10') + """ + + try { + logger.info("Bug 21: Testing multi-distinct COUNT on many-bucket table (COREDUMP fix)") + for (int ppt : [4, 6]) { + // Test without use_serial_exchange + def bug21_baseline = sql """ + SELECT /*+SET_VAR(enable_local_shuffle_planner=false, + parallel_pipeline_task_num=${ppt}, + ignore_storage_data_distribution=true, + enable_sql_cache=false, + query_timeout=60)*/ + MIN(distinct col_date_undef_signed), + COUNT(distinct col_date_undef_signed2), + COUNT(distinct col_int_undef_signed) + FROM rqg_t5_many_buckets + WHERE col_int_undef_signed = col_int_undef_signed + LIMIT 1000 + """ + def bug21_result = sql """ + SELECT /*+SET_VAR(enable_local_shuffle_planner=true, + parallel_pipeline_task_num=${ppt}, + ignore_storage_data_distribution=true, + enable_sql_cache=false, + query_timeout=60)*/ + MIN(distinct col_date_undef_signed), + COUNT(distinct col_date_undef_signed2), + COUNT(distinct col_int_undef_signed) + FROM rqg_t5_many_buckets + WHERE col_int_undef_signed = col_int_undef_signed + LIMIT 1000 + """ + assertEquals(bug21_baseline, bug21_result, + "Bug 21 pptn=${ppt}: multi-distinct COUNT result mismatch (was COREDUMP)") + } + logger.info("Bug 21: PASSED (no crash, correct results)") + } catch (Throwable t) { + if (t.message != null && t.message.contains("query timeout")) { + logger.warn("Bug 21: SKIPPED (query timeout - likely BE crash from previous test, not our bug)") + } else { + logger.error("Bug 21 FAILED: ${t.message}") + assertTrue(false, "Bug 21: Multi-distinct COUNT COREDUMP: ${t.message}") + } + } + + // ============================================================ + // Bug 22: AGG/SORT above FE-planned LOCAL_EXCHANGE → COREDUMP + // (set_ready_to_read DCHECK failure with empty source_deps) + // + // Root cause: when FE inserts LOCAL_EXCHANGE_NODE below a pipeline- + // splitting operator (AGG, SORT), LOCAL_EXCHANGE restores its immediate + // pipeline to _num_instances tasks, but ancestor pipelines (e.g., + // AggSource) still carry the reduced num_tasks from the serial operator. + // This causes instance 1+ to create AggSink tasks but not AggSource + // tasks, leaving source_deps uninitialized → DCHECK in set_ready_to_read. + // + // Fix: _propagate_local_exchange_num_tasks() walks the DAG upward from + // LOCAL_EXCHANGE and raises ancestor pipeline num_tasks to _num_instances. + // ============================================================ + try { + logger.info("Bug 22: Testing AGG/SORT above LOCAL_EXCHANGE num_tasks propagation") + for (int ppt : [4, 6]) { + // 22a: Simple AGG with GROUP BY over pooling scan + def bug22a_baseline = sql """ + SELECT /*+SET_VAR(enable_local_shuffle_planner=false, + parallel_pipeline_task_num=${ppt}, + ignore_storage_data_distribution=true, + enable_sql_cache=false)*/ + col_int_undef_signed, COUNT(*), SUM(col_int_undef_signed2) + FROM rqg_t1 + GROUP BY col_int_undef_signed + ORDER BY 1 + """ + def bug22a_result = sql """ + SELECT /*+SET_VAR(enable_local_shuffle_planner=true, + parallel_pipeline_task_num=${ppt}, + ignore_storage_data_distribution=true, + enable_sql_cache=false)*/ + col_int_undef_signed, COUNT(*), SUM(col_int_undef_signed2) + FROM rqg_t1 + GROUP BY col_int_undef_signed + ORDER BY 1 + """ + assertEquals(bug22a_baseline, bug22a_result, + "Bug 22a pptn=${ppt}: AGG GROUP BY result mismatch (was COREDUMP)") + + // 22b: SORT + AGG (two pipeline splits above LOCAL_EXCHANGE) + def bug22b_baseline = sql """ + SELECT /*+SET_VAR(enable_local_shuffle_planner=false, + parallel_pipeline_task_num=${ppt}, + ignore_storage_data_distribution=true, + enable_sql_cache=false)*/ + col_int_undef_signed, COUNT(*) AS cnt + FROM rqg_t1 + GROUP BY col_int_undef_signed + ORDER BY cnt DESC, col_int_undef_signed + """ + def bug22b_result = sql """ + SELECT /*+SET_VAR(enable_local_shuffle_planner=true, + parallel_pipeline_task_num=${ppt}, + ignore_storage_data_distribution=true, + enable_sql_cache=false)*/ + col_int_undef_signed, COUNT(*) AS cnt + FROM rqg_t1 + GROUP BY col_int_undef_signed + ORDER BY cnt DESC, col_int_undef_signed + """ + assertEquals(bug22b_baseline, bug22b_result, + "Bug 22b pptn=${ppt}: SORT+AGG result mismatch (was COREDUMP)") + + // 22c: JOIN + AGG (join probe pipeline also needs num_tasks propagation) + def bug22c_baseline = sql """ + SELECT /*+SET_VAR(enable_local_shuffle_planner=false, + parallel_pipeline_task_num=${ppt}, + ignore_storage_data_distribution=true, + enable_sql_cache=false)*/ + t1.col_int_undef_signed, COUNT(*) + FROM rqg_t1 t1 JOIN rqg_t2 t2 ON t1.pk = t2.pk + GROUP BY t1.col_int_undef_signed + ORDER BY 1 + """ + def bug22c_result = sql """ + SELECT /*+SET_VAR(enable_local_shuffle_planner=true, + parallel_pipeline_task_num=${ppt}, + ignore_storage_data_distribution=true, + enable_sql_cache=false)*/ + t1.col_int_undef_signed, COUNT(*) + FROM rqg_t1 t1 JOIN rqg_t2 t2 ON t1.pk = t2.pk + GROUP BY t1.col_int_undef_signed + ORDER BY 1 + """ + assertEquals(bug22c_baseline, bug22c_result, + "Bug 22c pptn=${ppt}: JOIN+AGG result mismatch (was COREDUMP)") + + // 22d: AGG without GROUP BY (scalar agg, PASS_TO_ONE exchange) + def bug22d_baseline = sql """ + SELECT /*+SET_VAR(enable_local_shuffle_planner=false, + parallel_pipeline_task_num=${ppt}, + ignore_storage_data_distribution=true, + enable_sql_cache=false)*/ + COUNT(*), SUM(col_int_undef_signed), AVG(col_int_undef_signed2) + FROM rqg_t1 + """ + def bug22d_result = sql """ + SELECT /*+SET_VAR(enable_local_shuffle_planner=true, + parallel_pipeline_task_num=${ppt}, + ignore_storage_data_distribution=true, + enable_sql_cache=false)*/ + COUNT(*), SUM(col_int_undef_signed), AVG(col_int_undef_signed2) + FROM rqg_t1 + """ + assertEquals(bug22d_baseline, bug22d_result, + "Bug 22d pptn=${ppt}: scalar AGG result mismatch") + } + logger.info("Bug 22: PASSED (no crash, correct results)") + } catch (Throwable t) { + logger.error("Bug 22 FAILED: ${t.message}") + assertTrue(false, "Bug 22: AGG/SORT num_tasks propagation: ${t.message}") + } + + // ==================== Bug 23 ==================== + // canUseDistinctStreamingAgg + GROUPING SETS + serial scan → missing LE + // When enable_distinct_streaming_aggregation=true, AggregationNode's + // canUseDistinctStreamingAgg path set requireChild=noRequire() without + // checking child serial status → serial RepeatNode feeds directly into + // non-serial AggregationNode → shared_state mismatch on multi-BE. + // Fix: add isSerialOperatorOnBe check in the noRequire branch. + try { + for (def pptn : [2, 4]) { + def bug23_baseline = sql """ + SELECT /*+SET_VAR(enable_local_shuffle_planner=false, + parallel_pipeline_task_num=${pptn}, + ignore_storage_data_distribution=true, + enable_sql_cache=false, + disable_streaming_preaggregations=false, + enable_distinct_streaming_aggregation=true)*/ + col_int_undef_signed, count(*) + FROM rqg_t1 + GROUP BY GROUPING SETS ((col_int_undef_signed), (pk), ()) + ORDER BY 1, 2 + """ + def bug23_result = sql """ + SELECT /*+SET_VAR(enable_local_shuffle_planner=true, + parallel_pipeline_task_num=${pptn}, + ignore_storage_data_distribution=true, + enable_sql_cache=false, + disable_streaming_preaggregations=false, + enable_distinct_streaming_aggregation=true)*/ + col_int_undef_signed, count(*) + FROM rqg_t1 + GROUP BY GROUPING SETS ((col_int_undef_signed), (pk), ()) + ORDER BY 1, 2 + """ + assertEquals(bug23_baseline, bug23_result, + "Bug 23 pptn=${pptn}: GROUPING SETS + distinct streaming agg result mismatch") + } + logger.info("Bug 23: PASSED (no crash, correct results)") + } catch (Throwable t) { + logger.error("Bug 23 FAILED: ${t.message}") + assertTrue(false, "Bug 23: canUseDistinctStreamingAgg + GROUPING SETS: ${t.message}") + } + + // ============================================================ + // Bug 24: BUCKET_SHUFFLE join + pooling scan + local shuffle + // causes data loss when destination routing uses all + // instances instead of firstInstancePerWorker. + // + // Root cause: filterInstancesWhichCanReceiveDataFromRemote() had + // a special branch for BUCKET_SHUFFLE that returned all instances + // (40) as destinations, but BE native local exchange creates only + // 4 receiver tasks (one per BE). Destination mismatch causes rows + // sent to non-existent receivers to be lost. + // + // Trigger conditions: + // - BUCKET_SHUFFLE join plan (requires multi-BE + specific pptn) + // - ignore_storage_data_distribution=true (pooling scan) + // - enable_local_shuffle=true + // - pptn that makes scan serial (scanRanges < pptn * numBE) + // ============================================================ + try { + logger.info("Bug 24: BUCKET_SHUFFLE + pooling scan destination routing") + sql "DROP TABLE IF EXISTS bug24_t1" + sql "DROP TABLE IF EXISTS bug24_t2" + + sql """ + CREATE TABLE bug24_t1 ( + pk INT NOT NULL, + val VARCHAR(64), + INDEX idx_val (val) USING INVERTED + ) ENGINE=OLAP + DUPLICATE KEY(pk) + DISTRIBUTED BY HASH(pk) BUCKETS 10 + PROPERTIES ("replication_num" = "1") + """ + + sql """ + CREATE TABLE bug24_t2 ( + pk INT NOT NULL, + val VARCHAR(64), + INDEX idx_val (val) USING INVERTED + ) ENGINE=OLAP + DUPLICATE KEY(pk) + DISTRIBUTED BY HASH(pk) BUCKETS 10 + PROPERTIES ("replication_num" = "1") + """ + + // Insert 20 rows into t1, 50 into t2 + for (int i = 1; i <= 20; i++) { + sql "INSERT INTO bug24_t1 VALUES (${i}, 'row_${i}')" + } + for (int i = 1; i <= 50; i++) { + sql "INSERT INTO bug24_t2 VALUES (${i}, 'row_${i}')" + } + + // Baseline: no local shuffle + def bug24_baseline = sql """ + SELECT /*+SET_VAR(enable_local_shuffle=false,enable_sql_cache=false)*/ + count(*) FROM ( + SELECT * FROM bug24_t1 AS t1 + LEFT JOIN (SELECT * FROM bug24_t2) AS t2 ON t1.pk = t2.pk + ORDER BY t2.pk DESC, t1.pk DESC LIMIT 21 + ) t + """ + + // Test with multiple pptn values to catch the specific trigger + for (int pptn : [1, 2, 3, 4, 5, 8, 10]) { + def result = sql """ + SELECT /*+SET_VAR( + parallel_pipeline_task_num=${pptn}, + ignore_storage_data_distribution=true, + enable_local_shuffle=true, + enable_local_shuffle_planner=false, + enable_sql_cache=false + )*/ count(*) FROM ( + SELECT * FROM bug24_t1 AS t1 + LEFT JOIN (SELECT * FROM bug24_t2) AS t2 ON t1.pk = t2.pk + ORDER BY t2.pk DESC, t1.pk DESC LIMIT 21 + ) t + """ + assertEquals(bug24_baseline, result, + "Bug 24 pptn=${pptn} planner=false: BUCKET_SHUFFLE+pooling result mismatch") + + def result2 = sql """ + SELECT /*+SET_VAR( + parallel_pipeline_task_num=${pptn}, + ignore_storage_data_distribution=true, + enable_local_shuffle=true, + enable_local_shuffle_planner=true, + enable_sql_cache=false + )*/ count(*) FROM ( + SELECT * FROM bug24_t1 AS t1 + LEFT JOIN (SELECT * FROM bug24_t2) AS t2 ON t1.pk = t2.pk + ORDER BY t2.pk DESC, t1.pk DESC LIMIT 21 + ) t + """ + assertEquals(bug24_baseline, result2, + "Bug 24 pptn=${pptn} planner=true: BUCKET_SHUFFLE+pooling result mismatch") + } + logger.info("Bug 24: PASSED") + } catch (Throwable t) { + logger.error("Bug 24 FAILED: ${t.message}") + assertTrue(false, "Bug 24: BUCKET_SHUFFLE+pooling destination routing: ${t.message}") + } + + // Bug 25: COLOCATE JOIN + NLJ CROSS JOIN probe side → wrong BUCKET_HASH_SHUFFLE + // isColocated() traverses subtree and returns false when NLJ is in the probe side, + // causing COLOCATE JOIN to fall into generic requireHash() → LOCAL_EXECUTION_HASH_SHUFFLE + // which breaks bucket distribution → result mismatch. + // Fix: use isColocate() directly on the HashJoinNode instead of subtree check. + logger.info("Bug 25: COLOCATE JOIN with NLJ CROSS JOIN probe side") + try { + sql """ + CREATE TABLE IF NOT EXISTS bug25_t20 ( + pk INT, col1 INT + ) DISTRIBUTED BY HASH(pk) BUCKETS 10 + PROPERTIES ("replication_num" = "1") + """ + sql """ + CREATE TABLE IF NOT EXISTS bug25_t24 ( + pk INT, col1 INT + ) DISTRIBUTED BY HASH(pk) BUCKETS 10 + PROPERTIES ("replication_num" = "1") + """ + sql """ + CREATE TABLE IF NOT EXISTS bug25_t7 ( + pk INT, col1 INT + ) DISTRIBUTED BY HASH(pk) BUCKETS 10 + PROPERTIES ("replication_num" = "1") + """ + sql "TRUNCATE TABLE bug25_t20" + sql "TRUNCATE TABLE bug25_t24" + sql "TRUNCATE TABLE bug25_t7" + (1..20).each { i -> sql "INSERT INTO bug25_t20 VALUES ($i, $i)" } + (1..24).each { i -> sql "INSERT INTO bug25_t24 VALUES ($i, $i)" } + (1..7).each { i -> sql "INSERT INTO bug25_t7 VALUES ($i, $i)" } + + def query = """ + WITH cte1 AS ( + SELECT t1.pk FROM bug25_t20 AS t1 CROSS JOIN bug25_t24 AS alias1 + ), + cte2 AS ( + SELECT t1.pk FROM bug25_t20 AS t1 + INNER JOIN bug25_t7 AS alias2 ON t1.pk = alias2.pk + ) + SELECT cte1.pk AS pk1 FROM cte1 + RIGHT OUTER JOIN cte2 AS alias3 ON cte1.pk = alias3.pk + LIMIT 66666666 + """ + + for (int pptn : [0, 1, 2, 4]) { + def feResult = sql """ + /*+SET_VAR(enable_local_shuffle_planner=true, + ignore_storage_data_distribution=true, + parallel_pipeline_task_num=${pptn}, + enable_sql_cache=false)*/ ${query} + """ + def beResult = sql """ + /*+SET_VAR(enable_local_shuffle_planner=false, + ignore_storage_data_distribution=true, + parallel_pipeline_task_num=${pptn}, + enable_sql_cache=false)*/ ${query} + """ + assertEquals(beResult.size(), feResult.size(), + "Bug 25 pptn=${pptn}: FE rows=${feResult.size()}, BE rows=${beResult.size()}") + } + logger.info("Bug 25: PASSED") + } catch (Throwable t) { + logger.error("Bug 25 FAILED: ${t.message}") + assertTrue(false, "Bug 25: COLOCATE+NLJ CROSS probe: ${t.message}") + } + + logger.info("=== All RQG bug reproduction tests completed ===") +} diff --git a/regression-test/suites/nereids_p0/local_shuffle/test_old_coordinator_local_shuffle.groovy b/regression-test/suites/nereids_p0/local_shuffle/test_old_coordinator_local_shuffle.groovy new file mode 100644 index 00000000000000..79270a58c36cf5 --- /dev/null +++ b/regression-test/suites/nereids_p0/local_shuffle/test_old_coordinator_local_shuffle.groovy @@ -0,0 +1,99 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/** + * Regression test: old Coordinator + enable_local_shuffle_planner must not hang. + * + * When canUseNereidsDistributePlanner=false (e.g. proxyExecute forwarding), + * FE uses old Coordinator which does not plan local exchange. If + * enable_local_shuffle_planner=true was passed to BE, BE would skip its own + * _plan_local_exchange, leaving no LE at all — causing pooling fragments to + * hang on SHUFFLE_DATA_DEPENDENCY. + * + * Fix: old Coordinator forces enableLocalShufflePlanner=false in query options + * when distributedPlans is null, so BE falls back to native LE planning. + */ +suite("test_old_coordinator_local_shuffle") { + + sql "DROP TABLE IF EXISTS oc_t0" + sql "DROP TABLE IF EXISTS oc_t1" + + sql """ + CREATE TABLE oc_t0 ( + pk INT NULL, + k1 INT NULL, + v1 INT NULL + ) ENGINE=OLAP + UNIQUE KEY(pk, k1) + DISTRIBUTED BY HASH(pk) BUCKETS 10 + PROPERTIES ( + 'replication_allocation' = 'tag.location.default: 1', + 'enable_unique_key_merge_on_write' = 'true' + ) + """ + + sql """ + CREATE TABLE oc_t1 ( + pk INT NULL, + k1 INT NULL, + v1 INT NULL + ) ENGINE=OLAP + UNIQUE KEY(pk, k1) + DISTRIBUTED BY HASH(pk) BUCKETS 10 + PROPERTIES ( + 'replication_allocation' = 'tag.location.default: 1', + 'enable_unique_key_merge_on_write' = 'true' + ) + """ + + sql "INSERT INTO oc_t0 VALUES (1,1,10),(2,2,20),(3,3,30),(4,4,40),(5,5,50)" + sql "INSERT INTO oc_t1 VALUES (1,1,100),(2,2,200),(3,3,300),(6,6,600),(7,7,700)" + + // Test 1: old Coordinator + default enable_local_shuffle_planner=true + // This simulates the proxyExecute forwarding scenario where + // canUseNereidsDistributePlanner=false. + sql "SET enable_nereids_distribute_planner = false" + sql "SET enable_local_shuffle = true" + sql "SET ignore_storage_data_distribution = true" + sql "SET query_timeout = 30" + + sql """ + MERGE INTO oc_t0 t USING oc_t1 s + ON t.pk = s.pk AND t.k1 = s.k1 + WHEN MATCHED THEN UPDATE SET v1 = s.v1 + WHEN NOT MATCHED THEN INSERT (pk, k1, v1) VALUES (s.pk, s.k1, s.v1) + """ + + def result = sql "SELECT * FROM oc_t0 ORDER BY pk" + assertEquals(7, result.size()) + + // Test 2: INSERT INTO SELECT with FE-planned LE should not double-insert LE. + // NereidsCoordinator has distributedPlans != null, so enableLocalShufflePlanner + // stays true and BE does not add its own LE on top of FE's. + sql "SET enable_nereids_distribute_planner = true" + sql "SET enable_local_shuffle_planner = true" + + sql "TRUNCATE TABLE oc_t0" + sql """ + INSERT INTO oc_t0 + SELECT number, number % 10, number * 100 + FROM numbers('number' = '50') + """ + + def cnt = sql "SELECT COUNT(*) FROM oc_t0" + assertEquals(50, cnt[0][0] as int) +} diff --git a/regression-test/suites/pythonudf_complex_p0/test_python_udaf_complex.groovy b/regression-test/suites/pythonudf_complex_p0/test_python_udaf_complex.groovy index 96ca48f13defbc..344ec21e77d48c 100644 --- a/regression-test/suites/pythonudf_complex_p0/test_python_udaf_complex.groovy +++ b/regression-test/suites/pythonudf_complex_p0/test_python_udaf_complex.groovy @@ -476,7 +476,7 @@ suite("test_python_udaf_complex") { ); """ - qt_json_array_agg """ + order_qt_json_array_agg """ SELECT category, py_json_array_agg(CAST(id AS STRING)) AS id_array FROM udaf_test_data GROUP BY category diff --git a/regression-test/suites/query_p0/join/test_multilevel_join_agg_local_shuffle.groovy b/regression-test/suites/query_p0/join/test_multilevel_join_agg_local_shuffle.groovy new file mode 100644 index 00000000000000..12917553d354fb --- /dev/null +++ b/regression-test/suites/query_p0/join/test_multilevel_join_agg_local_shuffle.groovy @@ -0,0 +1,884 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_multilevel_join_agg_local_shuffle", "nereids_p0") { + sql "DROP TABLE IF EXISTS test_multilevel_join_agg_local_shuffle_a" + sql "DROP TABLE IF EXISTS test_multilevel_join_agg_local_shuffle_b" + sql "DROP TABLE IF EXISTS test_multilevel_join_agg_local_shuffle_c" + sql "DROP TABLE IF EXISTS test_multilevel_join_agg_local_shuffle_d" + + sql "SET enable_nereids_planner=true" + sql "SET enable_fallback_to_original_planner=false" + sql "SET enable_local_shuffle=true" + sql "SET runtime_filter_mode=off" + + sql """ + CREATE TABLE test_multilevel_join_agg_local_shuffle_a ( + k1 INT, + k2 INT, + v1 INT + ) ENGINE=OLAP + DUPLICATE KEY(k1, k2) + DISTRIBUTED BY HASH(k1) BUCKETS 8 + PROPERTIES ( + "replication_num" = "1" + ) + """ + + sql """ + CREATE TABLE test_multilevel_join_agg_local_shuffle_b ( + k1 INT, + k3 INT, + v2 INT + ) ENGINE=OLAP + DUPLICATE KEY(k1, k3) + DISTRIBUTED BY HASH(k1) BUCKETS 8 + PROPERTIES ( + "replication_num" = "1" + ) + """ + + sql """ + CREATE TABLE test_multilevel_join_agg_local_shuffle_c ( + k1 INT, + k4 INT, + v3 INT + ) ENGINE=OLAP + DUPLICATE KEY(k1, k4) + DISTRIBUTED BY HASH(k4) BUCKETS 5 + PROPERTIES ( + "replication_num" = "1" + ) + """ + + sql """ + CREATE TABLE test_multilevel_join_agg_local_shuffle_d ( + k1 INT, + flag INT + ) ENGINE=OLAP + DUPLICATE KEY(k1) + DISTRIBUTED BY HASH(k1) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1" + ) + """ + + sql """ + INSERT INTO test_multilevel_join_agg_local_shuffle_a VALUES + (1, 10, 2), + (1, 11, 3), + (2, 20, 4), + (2, 21, 1), + (3, 30, 5), + (4, 40, 6) + """ + + sql """ + INSERT INTO test_multilevel_join_agg_local_shuffle_b VALUES + (1, 100, 7), + (1, 101, 1), + (2, 200, 2), + (3, 300, 3), + (4, 400, 4) + """ + + sql """ + INSERT INTO test_multilevel_join_agg_local_shuffle_c VALUES + (1, 1001, 5), + (1, 1001, 6), + (2, 1002, 7), + (3, 1003, 8), + (4, 1004, 9) + """ + + sql """ + INSERT INTO test_multilevel_join_agg_local_shuffle_d VALUES + (1, 10), + (2, 20), + (3, 30), + (4, 40) + """ + + def checkCase = { String tag, String sqlBody -> + def sqlOn = sqlBody.replace("/*+SET_VAR(", "/*+SET_VAR(enable_local_shuffle_planner=true,") + def sqlOff = sqlBody.replace("/*+SET_VAR(", "/*+SET_VAR(enable_local_shuffle_planner=false,") + + // Plan-shape assertions removed: the FE local-shuffle planner emits its LocalExchange + // nodes *after* the Nereids physical plan, so `explain shape plan` shape is independent + // of enable_local_shuffle_planner. When the shape *does* differ, it's usually a stats- + // dependent rewrite (e.g. cost-based InferSetOperatorDistinct) and the shape check + // becomes flaky across environments. Result-equality + cross-mode equality below give + // us the actual coverage we need. + sql "SET enable_local_shuffle_planner=true" + "order_qt_${tag}_result_on" "${sqlBody}" + + sql "SET enable_local_shuffle_planner=false" + "order_qt_${tag}_result_off" "${sqlBody}" + + check_sql_equal(sqlOn, sqlOff) + } + + def buildAggLayers = { String rawSql, int aggStages -> + String currentSql = rawSql + for (int stage = 1; stage <= aggStages; stage++) { + String alias = "agg_stage_${stage}" + currentSql = """ + SELECT ${alias}.k1, + SUM(${alias}.metric_a) AS metric_a, + SUM(${alias}.metric_b) AS metric_b, + MAX(${alias}.flag_metric) AS flag_metric + FROM ( + ${currentSql} + ) ${alias} + GROUP BY ${alias}.k1 + """ + } + return """ + SELECT final_q.k1, + SUM(final_q.metric_a) AS total_metric_a, + SUM(final_q.metric_b) AS total_metric_b, + MAX(final_q.flag_metric) AS max_flag_metric + FROM ( + ${currentSql} + ) final_q + GROUP BY final_q.k1 + ORDER BY final_q.k1 + """ + } + + def buildAlternatingCase = { String join1, String join2, String join3, String tag -> + String stage1Sql = """ + SELECT /*+SET_VAR(disable_join_reorder=true,disable_colocate_plan=true,ignore_storage_data_distribution=false,parallel_pipeline_task_num=4,auto_broadcast_join_threshold=-1,broadcast_row_count_limit=0) */ + a.k1, + CAST(SUM(a.v1 + b.v2) AS BIGINT) AS metric_a, + CAST(MAX(a.v1) AS BIGINT) AS metric_b, + CAST(MAX(a.k1) AS BIGINT) AS flag_metric + FROM test_multilevel_join_agg_local_shuffle_a a + JOIN ${join1} test_multilevel_join_agg_local_shuffle_b b + ON a.k1 = b.k1 + GROUP BY a.k1 + """ + + String stage2Sql = """ + SELECT s1.k1, + SUM(s1.metric_a + c.v3) AS metric_a, + SUM(s1.metric_b) AS metric_b, + MAX(s1.flag_metric) AS flag_metric + FROM ( + ${stage1Sql} + ) s1 + JOIN ${join2} ( + SELECT k4 - 1000 AS k1, v3 + FROM test_multilevel_join_agg_local_shuffle_c + ) c + ON s1.k1 = c.k1 + GROUP BY s1.k1 + """ + + String stage3Sql = """ + SELECT s2.k1, + SUM(s2.metric_a) AS metric_a, + SUM(s2.metric_b + d.flag) AS metric_b, + MAX(s2.flag_metric + d.flag) AS flag_metric + FROM ( + ${stage2Sql} + ) s2 + JOIN ${join3} test_multilevel_join_agg_local_shuffle_d d + ON s2.k1 = d.k1 + GROUP BY s2.k1 + """ + + return """ + SELECT final_q.k1, + SUM(final_q.metric_a) AS total_metric_a, + SUM(final_q.metric_b) AS total_metric_b, + MAX(final_q.flag_metric) AS max_flag_metric + FROM ( + ${stage3Sql} + ) final_q + GROUP BY final_q.k1 + ORDER BY final_q.k1 + """ + } + + def joinModeConfigs = [ + [tag: "bucket", hint: ""], + [tag: "shuffle", hint: "[shuffle]"], + [tag: "broadcast", hint: "[broadcast]"] + ] + + def setOpConfigs = [ + [tag: "union_all", body: "SELECT k1, v1 AS mix_value FROM test_multilevel_join_agg_local_shuffle_a UNION ALL SELECT k1, v2 AS mix_value FROM test_multilevel_join_agg_local_shuffle_b"], + [tag: "except", body: "SELECT k1, v1 AS mix_value FROM test_multilevel_join_agg_local_shuffle_a EXCEPT SELECT k1, flag AS mix_value FROM test_multilevel_join_agg_local_shuffle_d WHERE k1 = 4"], + [tag: "intersect", body: "SELECT k1, v1 AS mix_value FROM test_multilevel_join_agg_local_shuffle_a INTERSECT SELECT k1, v2 AS mix_value FROM test_multilevel_join_agg_local_shuffle_b"], + ] + + def buildWindowSetOpCase = { Map setOpCfg, Map joinCfg, String windowTag -> + String windowExpr = windowTag == "row_number" + ? "ROW_NUMBER() OVER (PARTITION BY mid_q.k1 ORDER BY mid_q.mix_value DESC) AS metric_b" + : "SUM(mid_q.mix_value) OVER (PARTITION BY mid_q.k1) AS metric_b" + String metricAExpr = windowTag == "row_number" + ? "SUM(mid_q.mix_value) OVER (PARTITION BY mid_q.k1) AS metric_a" + : "ROW_NUMBER() OVER (PARTITION BY mid_q.k1 ORDER BY mid_q.mix_value DESC) AS metric_a" + String joinRhs = joinCfg.tag == "shuffle" + ? "(SELECT k4 - 1000 AS k1, v3, CAST(v3 AS BIGINT) AS flag_value FROM test_multilevel_join_agg_local_shuffle_c) rhs" + : "test_multilevel_join_agg_local_shuffle_d rhs" + String joinCond = joinCfg.tag == "shuffle" ? "mid_q.k1 = rhs.k1" : "mid_q.k1 = rhs.k1" + String metricCExpr = joinCfg.tag == "shuffle" ? "CAST(rhs.flag_value AS BIGINT)" : "CAST(rhs.flag AS BIGINT)" + return """ + SELECT /*+SET_VAR(disable_join_reorder=true,disable_colocate_plan=true,ignore_storage_data_distribution=false,parallel_pipeline_task_num=4,auto_broadcast_join_threshold=-1,broadcast_row_count_limit=0) */ + final_q.k1, + SUM(final_q.metric_a) AS total_metric_a, + MAX(final_q.metric_b) AS max_metric_b, + SUM(final_q.metric_c) AS total_metric_c + FROM ( + SELECT mid_q.k1, + ${metricAExpr}, + ${windowExpr}, + ${metricCExpr} AS metric_c + FROM ( + SELECT base_q.k1, base_q.mix_value + FROM ( + ${setOpCfg.body} + ) base_q + ) mid_q + JOIN ${joinCfg.hint} ${joinRhs} + ON ${joinCond} + ) final_q + GROUP BY final_q.k1 + ORDER BY final_q.k1 + """ + } + + def layeredJoinCases = [ + [ + tag: "bucket_shuffle_broadcast", + rawSql: """ + SELECT /*+SET_VAR(disable_join_reorder=true,disable_colocate_plan=true,ignore_storage_data_distribution=false,parallel_pipeline_task_num=4,auto_broadcast_join_threshold=-1,broadcast_row_count_limit=0) */ + ab.k1, + CAST(ab.bucket_metric AS BIGINT) AS metric_a, + CAST(c1.v3 AS BIGINT) AS metric_b, + d.flag AS flag_metric + FROM ( + SELECT a.k1, SUM(a.v1 + b.v2) AS bucket_metric + FROM test_multilevel_join_agg_local_shuffle_a a + JOIN test_multilevel_join_agg_local_shuffle_b b + ON a.k1 = b.k1 + GROUP BY a.k1 + ) ab + JOIN [shuffle] ( + SELECT k4 - 1000 AS k1, v3 + FROM test_multilevel_join_agg_local_shuffle_c + ) c1 + ON ab.k1 = c1.k1 + JOIN [broadcast] test_multilevel_join_agg_local_shuffle_d d + ON ab.k1 = d.k1 + """ + ], + [ + tag: "shuffle_broadcast_broadcast", + rawSql: """ + SELECT /*+SET_VAR(disable_join_reorder=true,disable_colocate_plan=true,ignore_storage_data_distribution=false,parallel_pipeline_task_num=4,auto_broadcast_join_threshold=-1,broadcast_row_count_limit=0) */ + a.k1, + CAST(a.v1 AS BIGINT) AS metric_a, + CAST(c1.v3 + d1.flag AS BIGINT) AS metric_b, + d2.flag AS flag_metric + FROM test_multilevel_join_agg_local_shuffle_a a + JOIN [shuffle] ( + SELECT k4 - 1000 AS k1, v3 + FROM test_multilevel_join_agg_local_shuffle_c + ) c1 + ON a.k1 = c1.k1 + JOIN [broadcast] test_multilevel_join_agg_local_shuffle_d d1 + ON a.k1 = d1.k1 + JOIN [broadcast] test_multilevel_join_agg_local_shuffle_d d2 + ON a.k1 = d2.k1 + """ + ], + [ + tag: "bucket_broadcast_shuffle", + rawSql: """ + SELECT /*+SET_VAR(disable_join_reorder=true,disable_colocate_plan=true,ignore_storage_data_distribution=false,parallel_pipeline_task_num=4,auto_broadcast_join_threshold=-1,broadcast_row_count_limit=0) */ + abd.k1, + CAST(abd.bucket_metric AS BIGINT) AS metric_a, + CAST(c1.shuffle_metric AS BIGINT) AS metric_b, + abd.flag_metric AS flag_metric + FROM ( + SELECT ab.k1, + SUM(ab.bucket_metric) AS bucket_metric, + MAX(d.flag) AS flag_metric + FROM ( + SELECT a.k1, SUM(a.v1 + b.v2) AS bucket_metric + FROM test_multilevel_join_agg_local_shuffle_a a + JOIN test_multilevel_join_agg_local_shuffle_b b + ON a.k1 = b.k1 + GROUP BY a.k1 + ) ab + JOIN [broadcast] test_multilevel_join_agg_local_shuffle_d d + ON ab.k1 = d.k1 + GROUP BY ab.k1 + ) abd + JOIN [shuffle] ( + SELECT k4 - 1000 AS k1, SUM(v3) AS shuffle_metric + FROM test_multilevel_join_agg_local_shuffle_c + GROUP BY k4 - 1000 + ) c1 + ON abd.k1 = c1.k1 + """ + ] + ] + + layeredJoinCases.each { cfg -> + (1..3).each { aggStage -> + checkCase("${cfg.tag}_agg_stage_${aggStage}", buildAggLayers(cfg.rawSql, aggStage)) + } + } + + joinModeConfigs.each { firstJoin -> + joinModeConfigs.each { secondJoin -> + joinModeConfigs.each { thirdJoin -> + checkCase( + "alternating_${firstJoin.tag}_${secondJoin.tag}_${thirdJoin.tag}", + buildAlternatingCase(firstJoin.hint, secondJoin.hint, thirdJoin.hint, + "alternating_${firstJoin.tag}_${secondJoin.tag}_${thirdJoin.tag}")) + } + } + } + + setOpConfigs.each { setOpCfg -> + joinModeConfigs.each { joinCfg -> + ["row_number", "window_sum"].each { windowTag -> + checkCase( + "window_${setOpCfg.tag}_${joinCfg.tag}_${windowTag}", + buildWindowSetOpCase(setOpCfg, joinCfg, windowTag)) + } + } + } + + checkCase("bucket_broadcast_agg", """ + SELECT /*+SET_VAR(disable_join_reorder=true,disable_colocate_plan=true,ignore_storage_data_distribution=false,parallel_pipeline_task_num=4,auto_broadcast_join_threshold=-1,broadcast_row_count_limit=0) */ + x.k1, SUM(x.bucket_sum) AS total_sum, MAX(d.flag) AS max_flag + FROM ( + SELECT a.k1, SUM(a.v1 + b.v2) AS bucket_sum + FROM test_multilevel_join_agg_local_shuffle_a a + JOIN test_multilevel_join_agg_local_shuffle_b b + ON a.k1 = b.k1 + GROUP BY a.k1 + ) x + JOIN [broadcast] test_multilevel_join_agg_local_shuffle_d d + ON x.k1 = d.k1 + GROUP BY x.k1 + ORDER BY x.k1 + """) + + checkCase("partitioned_broadcast_agg", """ + SELECT /*+SET_VAR(disable_join_reorder=true,disable_colocate_plan=true,ignore_storage_data_distribution=false,parallel_pipeline_task_num=4,auto_broadcast_join_threshold=-1,broadcast_row_count_limit=0) */ + x.k1, SUM(x.shuffle_sum) AS total_sum, MAX(d.flag) AS max_flag + FROM ( + SELECT a.k1, SUM(a.v1 + c.v3) AS shuffle_sum + FROM test_multilevel_join_agg_local_shuffle_a a + JOIN [shuffle] test_multilevel_join_agg_local_shuffle_c c + ON a.k1 + 1000 = c.k4 + GROUP BY a.k1 + ) x + JOIN [broadcast] test_multilevel_join_agg_local_shuffle_d d + ON x.k1 = d.k1 + GROUP BY x.k1 + ORDER BY x.k1 + """) + + checkCase("bucket_partitioned_agg", """ + SELECT /*+SET_VAR(disable_join_reorder=true,disable_colocate_plan=true,ignore_storage_data_distribution=false,parallel_pipeline_task_num=4,auto_broadcast_join_threshold=-1,broadcast_row_count_limit=0) */ + t.k1, SUM(t.metric1) AS total_metric1, MAX(t.metric2) AS max_metric2 + FROM ( + SELECT a.k1, + SUM(a.v1 + b.v2) AS metric1, + SUM(c.v3) AS metric2 + FROM test_multilevel_join_agg_local_shuffle_a a + JOIN test_multilevel_join_agg_local_shuffle_b b + ON a.k1 = b.k1 + JOIN [shuffle] test_multilevel_join_agg_local_shuffle_c c + ON b.k1 + 1000 = c.k4 + GROUP BY a.k1 + ) t + GROUP BY t.k1 + ORDER BY t.k1 + """) + + checkCase("all_three_multilevel_agg", """ + SELECT /*+SET_VAR(disable_join_reorder=true,disable_colocate_plan=true,ignore_storage_data_distribution=false,parallel_pipeline_task_num=4,auto_broadcast_join_threshold=-1,broadcast_row_count_limit=0) */ + z.k1, SUM(z.metric) AS total_metric, MAX(z.flag) AS max_flag + FROM ( + SELECT y.k1, + SUM(y.metric) AS metric, + MAX(d.flag) AS flag + FROM ( + SELECT a.k1, + SUM(a.v1 + b.v2 + c.v3) AS metric + FROM test_multilevel_join_agg_local_shuffle_a a + JOIN test_multilevel_join_agg_local_shuffle_b b + ON a.k1 = b.k1 + JOIN [shuffle] test_multilevel_join_agg_local_shuffle_c c + ON b.k1 + 1000 = c.k4 + GROUP BY a.k1 + ) y + JOIN [broadcast] test_multilevel_join_agg_local_shuffle_d d + ON y.k1 = d.k1 + GROUP BY y.k1 + ) z + GROUP BY z.k1 + ORDER BY z.k1 + """) + + checkCase("agg_join_agg_mix", """ + SELECT /*+SET_VAR(disable_join_reorder=true,disable_colocate_plan=true,ignore_storage_data_distribution=false,parallel_pipeline_task_num=4,auto_broadcast_join_threshold=-1,broadcast_row_count_limit=0) */ + l.k1, l.sa, r.sb, MAX(d.flag) AS max_flag + FROM ( + SELECT a.k1, SUM(a.v1) AS sa + FROM test_multilevel_join_agg_local_shuffle_a a + JOIN test_multilevel_join_agg_local_shuffle_b b + ON a.k1 = b.k1 + GROUP BY a.k1 + ) l + JOIN [shuffle] ( + SELECT c.k4 - 1000 AS k1, SUM(c.v3) AS sb + FROM test_multilevel_join_agg_local_shuffle_c c + GROUP BY c.k4 - 1000 + ) r + ON l.k1 = r.k1 + JOIN [broadcast] test_multilevel_join_agg_local_shuffle_d d + ON l.k1 = d.k1 + GROUP BY l.k1, l.sa, r.sb + ORDER BY l.k1 + """) + + checkCase("double_broadcast_after_bucket", """ + SELECT /*+SET_VAR(disable_join_reorder=true,disable_colocate_plan=true,ignore_storage_data_distribution=false,parallel_pipeline_task_num=4,auto_broadcast_join_threshold=-1,broadcast_row_count_limit=0) */ + z.k1, SUM(z.metric) AS total_metric, MAX(z.flag_sum) AS max_flag_sum + FROM ( + SELECT x.k1, + SUM(x.bucket_sum) AS metric, + MAX(d1.flag + d2.flag) AS flag_sum + FROM ( + SELECT a.k1, SUM(a.v1 + b.v2) AS bucket_sum + FROM test_multilevel_join_agg_local_shuffle_a a + JOIN test_multilevel_join_agg_local_shuffle_b b + ON a.k1 = b.k1 + GROUP BY a.k1 + ) x + JOIN [broadcast] test_multilevel_join_agg_local_shuffle_d d1 + ON x.k1 = d1.k1 + JOIN [broadcast] test_multilevel_join_agg_local_shuffle_d d2 + ON x.k1 = d2.k1 + GROUP BY x.k1 + ) z + GROUP BY z.k1 + ORDER BY z.k1 + """) + + checkCase("partitioned_join_between_two_aggs_then_broadcast", """ + SELECT /*+SET_VAR(disable_join_reorder=true,disable_colocate_plan=true,ignore_storage_data_distribution=false,parallel_pipeline_task_num=4,auto_broadcast_join_threshold=-1,broadcast_row_count_limit=0) */ + m.k1, SUM(m.left_sum + m.right_sum) AS total_metric, MAX(d.flag) AS max_flag + FROM ( + SELECT l.k1, l.left_sum, r.right_sum + FROM ( + SELECT a.k1, SUM(a.v1) AS left_sum + FROM test_multilevel_join_agg_local_shuffle_a a + GROUP BY a.k1 + ) l + JOIN [shuffle] ( + SELECT c.k4 - 1000 AS k1, SUM(c.v3) AS right_sum + FROM test_multilevel_join_agg_local_shuffle_c c + GROUP BY c.k4 - 1000 + ) r + ON l.k1 = r.k1 + ) m + JOIN [broadcast] test_multilevel_join_agg_local_shuffle_d d + ON m.k1 = d.k1 + GROUP BY m.k1 + ORDER BY m.k1 + """) + + checkCase("bucket_shuffle_broadcast_two_stage_agg", """ + SELECT /*+SET_VAR(disable_join_reorder=true,disable_colocate_plan=true,ignore_storage_data_distribution=false,parallel_pipeline_task_num=4,auto_broadcast_join_threshold=-1,broadcast_row_count_limit=0) */ + q.k1, SUM(q.metric_a) AS total_a, SUM(q.metric_b) AS total_b, MAX(q.flag) AS max_flag + FROM ( + SELECT y.k1, + SUM(y.metric_a) AS metric_a, + SUM(y.metric_b) AS metric_b, + MAX(d.flag) AS flag + FROM ( + SELECT a.k1, + SUM(a.v1 + b.v2) AS metric_a, + SUM(c.v3) AS metric_b + FROM test_multilevel_join_agg_local_shuffle_a a + JOIN test_multilevel_join_agg_local_shuffle_b b + ON a.k1 = b.k1 + JOIN [shuffle] test_multilevel_join_agg_local_shuffle_c c + ON a.k1 + 1000 = c.k4 + GROUP BY a.k1 + ) y + JOIN [broadcast] test_multilevel_join_agg_local_shuffle_d d + ON y.k1 = d.k1 + GROUP BY y.k1 + ) q + GROUP BY q.k1 + ORDER BY q.k1 + """) + + checkCase("left_join_null_preserving_with_multilevel_agg", """ + SELECT /*+SET_VAR(disable_join_reorder=true,disable_colocate_plan=true,ignore_storage_data_distribution=false,parallel_pipeline_task_num=4,auto_broadcast_join_threshold=-1,broadcast_row_count_limit=0) */ + s.k1, SUM(s.left_metric) AS total_left_metric, SUM(s.right_metric) AS total_right_metric + FROM ( + SELECT a.k1, + SUM(a.v1) AS left_metric, + SUM(IFNULL(t.right_metric, 0)) AS right_metric + FROM test_multilevel_join_agg_local_shuffle_a a + LEFT JOIN ( + SELECT b.k1, SUM(c.v3) AS right_metric + FROM test_multilevel_join_agg_local_shuffle_b b + JOIN [shuffle] test_multilevel_join_agg_local_shuffle_c c + ON b.k1 + 1000 = c.k4 + GROUP BY b.k1 + ) t + ON a.k1 = t.k1 + GROUP BY a.k1 + ) s + GROUP BY s.k1 + ORDER BY s.k1 + """) + + checkCase("seven_layer_bucket_shuffle_broadcast", """ + SELECT /*+SET_VAR(disable_join_reorder=true,disable_colocate_plan=true,ignore_storage_data_distribution=false,parallel_pipeline_task_num=4,auto_broadcast_join_threshold=-1,broadcast_row_count_limit=0) */ + o.k1, SUM(o.final_metric) AS total_metric, MAX(o.final_flag) AS max_flag + FROM ( + SELECT n.k1, + SUM(n.stage5_metric + n.stage6_metric) AS final_metric, + MAX(n.stage7_flag) AS final_flag + FROM ( + SELECT m.k1, + SUM(m.stage3_metric) AS stage5_metric, + SUM(m.stage4_metric) AS stage6_metric, + MAX(d2.flag) AS stage7_flag + FROM ( + SELECT l.k1, + SUM(l.stage1_metric) AS stage3_metric, + SUM(l.stage2_metric) AS stage4_metric + FROM ( + SELECT x.k1, + SUM(x.bucket_metric) AS stage1_metric, + SUM(y.shuffle_metric) AS stage2_metric + FROM ( + SELECT a.k1, SUM(a.v1 + b.v2) AS bucket_metric + FROM test_multilevel_join_agg_local_shuffle_a a + JOIN test_multilevel_join_agg_local_shuffle_b b + ON a.k1 = b.k1 + GROUP BY a.k1 + ) x + JOIN [shuffle] ( + SELECT c.k4 - 1000 AS k1, SUM(c.v3) AS shuffle_metric + FROM test_multilevel_join_agg_local_shuffle_c c + GROUP BY c.k4 - 1000 + ) y + ON x.k1 = y.k1 + GROUP BY x.k1 + ) l + JOIN [broadcast] test_multilevel_join_agg_local_shuffle_d d1 + ON l.k1 = d1.k1 + GROUP BY l.k1 + ) m + JOIN [broadcast] test_multilevel_join_agg_local_shuffle_d d2 + ON m.k1 = d2.k1 + GROUP BY m.k1 + ) n + GROUP BY n.k1 + ) o + GROUP BY o.k1 + ORDER BY o.k1 + """) + + checkCase("eight_layer_mixed_join_agg_chain", """ + SELECT /*+SET_VAR(disable_join_reorder=true,disable_colocate_plan=true,ignore_storage_data_distribution=false,parallel_pipeline_task_num=4,auto_broadcast_join_threshold=-1,broadcast_row_count_limit=0) */ + q.k1, SUM(q.metric_a) AS total_a, SUM(q.metric_b) AS total_b, MAX(q.metric_c) AS max_c + FROM ( + SELECT p.k1, + SUM(p.stage6_a) AS metric_a, + SUM(p.stage7_b) AS metric_b, + MAX(p.stage8_c) AS metric_c + FROM ( + SELECT n.k1, + SUM(n.stage4_a) AS stage6_a, + SUM(n.stage5_b) AS stage7_b, + MAX(d2.flag + n.stage5_c) AS stage8_c + FROM ( + SELECT m.k1, + SUM(m.stage2_a) AS stage4_a, + SUM(m.stage3_b) AS stage5_b, + MAX(d1.flag) AS stage5_c + FROM ( + SELECT l.k1, + SUM(l.bucket_metric) AS stage2_a, + SUM(l.shuffle_metric) AS stage3_b + FROM ( + SELECT a.k1, + SUM(a.v1 + b.v2) AS bucket_metric, + SUM(c.v3) AS shuffle_metric + FROM test_multilevel_join_agg_local_shuffle_a a + JOIN test_multilevel_join_agg_local_shuffle_b b + ON a.k1 = b.k1 + JOIN [shuffle] test_multilevel_join_agg_local_shuffle_c c + ON b.k1 + 1000 = c.k4 + GROUP BY a.k1 + ) l + GROUP BY l.k1 + ) m + JOIN [broadcast] test_multilevel_join_agg_local_shuffle_d d1 + ON m.k1 = d1.k1 + GROUP BY m.k1 + ) n + JOIN [broadcast] test_multilevel_join_agg_local_shuffle_d d2 + ON n.k1 = d2.k1 + GROUP BY n.k1 + ) p + GROUP BY p.k1 + ) q + GROUP BY q.k1 + ORDER BY q.k1 + """) + + checkCase("seven_layer_left_join_mix", """ + SELECT /*+SET_VAR(disable_join_reorder=true,disable_colocate_plan=true,ignore_storage_data_distribution=false,parallel_pipeline_task_num=4,auto_broadcast_join_threshold=-1,broadcast_row_count_limit=0) */ + z.k1, SUM(z.left_total) AS total_left, SUM(z.right_total) AS total_right, MAX(z.flag_metric) AS max_flag_metric + FROM ( + SELECT y.k1, + SUM(y.stage5_left) AS left_total, + SUM(y.stage6_right) AS right_total, + MAX(y.stage7_flag) AS flag_metric + FROM ( + SELECT x.k1, + SUM(x.stage3_left) AS stage5_left, + SUM(IFNULL(x.stage4_right, 0)) AS stage6_right, + MAX(d.flag) AS stage7_flag + FROM ( + SELECT l.k1, + SUM(l.stage1_left) AS stage3_left, + SUM(IFNULL(r.stage2_right, 0)) AS stage4_right + FROM ( + SELECT a.k1, SUM(a.v1) AS stage1_left + FROM test_multilevel_join_agg_local_shuffle_a a + JOIN test_multilevel_join_agg_local_shuffle_b b + ON a.k1 = b.k1 + GROUP BY a.k1 + ) l + LEFT JOIN ( + SELECT c.k4 - 1000 AS k1, SUM(c.v3) AS stage2_right + FROM test_multilevel_join_agg_local_shuffle_c c + GROUP BY c.k4 - 1000 + ) r + ON l.k1 = r.k1 + GROUP BY l.k1 + ) x + JOIN [broadcast] test_multilevel_join_agg_local_shuffle_d d + ON x.k1 = d.k1 + GROUP BY x.k1 + ) y + GROUP BY y.k1 + ) z + GROUP BY z.k1 + ORDER BY z.k1 + """) + + checkCase("broadcast_shuffle_broadcast_nested_agg", """ + SELECT /*+SET_VAR(disable_join_reorder=true,disable_colocate_plan=true,ignore_storage_data_distribution=false,parallel_pipeline_task_num=4,auto_broadcast_join_threshold=-1,broadcast_row_count_limit=0) */ + z.k1, + SUM(z.m1) AS total_m1, + SUM(z.m2) AS total_m2, + MAX(z.m3) AS max_m3 + FROM ( + SELECT y.k1, + SUM(y.s3) AS m1, + SUM(y.s4) AS m2, + MAX(y.s5) AS m3 + FROM ( + SELECT x.k1, + SUM(x.s1) AS s3, + SUM(x.s2) AS s4, + MAX(d1.flag) AS s5 + FROM ( + SELECT l.k1, + SUM(l.left_metric) AS s1, + SUM(r.right_metric) AS s2 + FROM ( + SELECT a.k1, + SUM(a.v1 + b.v2) AS left_metric + FROM test_multilevel_join_agg_local_shuffle_a a + JOIN test_multilevel_join_agg_local_shuffle_b b + ON a.k1 = b.k1 + GROUP BY a.k1 + ) l + JOIN [shuffle] ( + SELECT c.k4 - 1000 AS k1, + SUM(c.v3) AS right_metric + FROM test_multilevel_join_agg_local_shuffle_c c + GROUP BY c.k4 - 1000 + ) r + ON l.k1 = r.k1 + GROUP BY l.k1 + ) x + JOIN [broadcast] test_multilevel_join_agg_local_shuffle_d d1 + ON x.k1 = d1.k1 + GROUP BY x.k1 + ) y + JOIN [broadcast] test_multilevel_join_agg_local_shuffle_d d2 + ON y.k1 = d2.k1 + GROUP BY y.k1 + ) z + GROUP BY z.k1 + ORDER BY z.k1 + """) + + checkCase("window_union_join_agg", """ + SELECT /*+SET_VAR(disable_join_reorder=true,disable_colocate_plan=true,ignore_storage_data_distribution=false,parallel_pipeline_task_num=4,auto_broadcast_join_threshold=-1,broadcast_row_count_limit=0) */ + q.k1, + SUM(q.metric_a) AS total_metric_a, + MAX(q.metric_b) AS max_metric_b + FROM ( + SELECT w.k1, + SUM(w.mix_value) AS metric_a, + MAX(d.flag + w.rn) AS metric_b + FROM ( + SELECT u.k1, + u.mix_value, + ROW_NUMBER() OVER (PARTITION BY u.k1 ORDER BY u.mix_value DESC) AS rn + FROM ( + SELECT a.k1, a.v1 AS mix_value FROM test_multilevel_join_agg_local_shuffle_a a + UNION ALL + SELECT b.k1, b.v2 AS mix_value FROM test_multilevel_join_agg_local_shuffle_b b + ) u + ) w + JOIN [broadcast] test_multilevel_join_agg_local_shuffle_d d + ON w.k1 = d.k1 + GROUP BY w.k1 + ) q + GROUP BY q.k1 + ORDER BY q.k1 + """) + + checkCase("window_except_join_agg", """ + SELECT /*+SET_VAR(disable_join_reorder=true,disable_colocate_plan=true,ignore_storage_data_distribution=false,parallel_pipeline_task_num=4,auto_broadcast_join_threshold=-1,broadcast_row_count_limit=0) */ + t.k1, + SUM(t.rn) AS total_rn, + MAX(t.flag_metric) AS max_flag_metric + FROM ( + SELECT e.k1, + ROW_NUMBER() OVER (PARTITION BY e.k1 ORDER BY e.k1) AS rn, + MAX(d.flag) OVER (PARTITION BY e.k1) AS flag_metric + FROM ( + SELECT k1 FROM test_multilevel_join_agg_local_shuffle_a + EXCEPT + SELECT k1 FROM test_multilevel_join_agg_local_shuffle_c WHERE k4 = 1004 + ) e + JOIN [broadcast] test_multilevel_join_agg_local_shuffle_d d + ON e.k1 = d.k1 + ) t + GROUP BY t.k1 + ORDER BY t.k1 + """) + + checkCase("window_intersect_shuffle_agg", """ + SELECT /*+SET_VAR(disable_join_reorder=true,disable_colocate_plan=true,ignore_storage_data_distribution=false,parallel_pipeline_task_num=4,auto_broadcast_join_threshold=-1,broadcast_row_count_limit=0) */ + x.k1, + SUM(x.metric_a) AS total_metric_a, + MAX(x.metric_b) AS max_metric_b + FROM ( + SELECT i.k1, + SUM(c.v3) OVER (PARTITION BY i.k1) AS metric_a, + ROW_NUMBER() OVER (PARTITION BY i.k1 ORDER BY c.v3 DESC) AS metric_b + FROM ( + SELECT k1 FROM test_multilevel_join_agg_local_shuffle_a + INTERSECT + SELECT k1 FROM test_multilevel_join_agg_local_shuffle_b + ) i + JOIN [shuffle] test_multilevel_join_agg_local_shuffle_c c + ON i.k1 + 1000 = c.k4 + ) x + GROUP BY x.k1 + ORDER BY x.k1 + """) + + checkCase("window_union_except_broadcast_agg", """ + SELECT /*+SET_VAR(disable_join_reorder=true,disable_colocate_plan=true,ignore_storage_data_distribution=false,parallel_pipeline_task_num=4,auto_broadcast_join_threshold=-1,broadcast_row_count_limit=0) */ + final_q.k1, + SUM(final_q.metric_a) AS total_metric_a, + SUM(final_q.metric_b) AS total_metric_b, + MAX(final_q.metric_c) AS max_metric_c + FROM ( + SELECT s.k1, + SUM(s.mix_value) AS metric_a, + MAX(s.rn) AS metric_b, + MAX(d.flag) AS metric_c + FROM ( + SELECT u.k1, + u.mix_value, + ROW_NUMBER() OVER (PARTITION BY u.k1 ORDER BY u.mix_value DESC) AS rn + FROM ( + SELECT k1, v1 AS mix_value FROM test_multilevel_join_agg_local_shuffle_a + UNION ALL + SELECT k1, v2 AS mix_value FROM test_multilevel_join_agg_local_shuffle_b + EXCEPT + SELECT k1, flag AS mix_value FROM test_multilevel_join_agg_local_shuffle_d WHERE k1 = 4 + ) u + ) s + JOIN [broadcast] test_multilevel_join_agg_local_shuffle_d d + ON s.k1 = d.k1 + GROUP BY s.k1 + ) final_q + GROUP BY final_q.k1 + ORDER BY final_q.k1 + """) + + checkCase("window_setop_join_agg_chain", """ + SELECT /*+SET_VAR(disable_join_reorder=true,disable_colocate_plan=true,ignore_storage_data_distribution=false,parallel_pipeline_task_num=4,auto_broadcast_join_threshold=-1,broadcast_row_count_limit=0) */ + outer_q.k1, + SUM(outer_q.metric_a) AS total_metric_a, + MAX(outer_q.metric_b) AS max_metric_b, + SUM(outer_q.metric_c) AS total_metric_c + FROM ( + SELECT mid_q.k1, + SUM(mid_q.window_metric) AS metric_a, + MAX(mid_q.window_rank) AS metric_b, + SUM(c.v3) AS metric_c + FROM ( + SELECT base_q.k1, + SUM(base_q.mix_value) OVER (PARTITION BY base_q.k1) AS window_metric, + ROW_NUMBER() OVER (PARTITION BY base_q.k1 ORDER BY base_q.mix_value DESC) AS window_rank + FROM ( + SELECT k1, v1 AS mix_value FROM test_multilevel_join_agg_local_shuffle_a + UNION ALL + SELECT k1, v2 AS mix_value FROM test_multilevel_join_agg_local_shuffle_b + INTERSECT + SELECT k1, v3 AS mix_value FROM test_multilevel_join_agg_local_shuffle_c WHERE k4 >= 1001 + ) base_q + ) mid_q + JOIN [shuffle] test_multilevel_join_agg_local_shuffle_c c + ON mid_q.k1 + 1000 = c.k4 + GROUP BY mid_q.k1 + ) outer_q + GROUP BY outer_q.k1 + ORDER BY outer_q.k1 + """) +}