diff --git a/be/src/exec/operator/olap_scan_operator.cpp b/be/src/exec/operator/olap_scan_operator.cpp index 4ce564241aabcc..db20ef96868a76 100644 --- a/be/src/exec/operator/olap_scan_operator.cpp +++ b/be/src/exec/operator/olap_scan_operator.cpp @@ -600,6 +600,18 @@ Status OlapScanLocalState::_init_scanners(std::list* scanners) { } bool enable_parallel_scan = state()->enable_parallel_scan(); + auto resolve_binlog_scan_type = [](const TPaloScanRange& scan_range) { + if (scan_range.__isset.binlog_scan_type) { + return scan_range.binlog_scan_type; + } + return TBinlogScanType::NONE; + }; + auto resolve_binlog_read_source = [](const TPaloScanRange& scan_range) { + if (scan_range.__isset.binlog_read_source) { + return scan_range.binlog_read_source; + } + return TBinlogReadSource::NONE; + }; bool read_row_binlog = p._olap_scan_node.__isset.read_row_binlog && p._olap_scan_node.read_row_binlog; @@ -671,11 +683,10 @@ Status OlapScanLocalState::_init_scanners(std::list* scanners) { int scanners_per_tablet = std::max(1, 64 / (int)_scan_ranges.size()); for (size_t scan_range_idx = 0; scan_range_idx < _scan_ranges.size(); scan_range_idx++) { + const auto& palo_scan_range = *_scan_ranges[scan_range_idx]; int64_t version = 0; - std::from_chars(_scan_ranges[scan_range_idx]->version.data(), - _scan_ranges[scan_range_idx]->version.data() + - _scan_ranges[scan_range_idx]->version.size(), - version); + std::from_chars(palo_scan_range.version.data(), + palo_scan_range.version.data() + palo_scan_range.version.size(), version); std::vector>* ranges = &_cond_ranges; int size_based_scanners_per_tablet = 1; @@ -703,18 +714,27 @@ Status OlapScanLocalState::_init_scanners(std::list* scanners) { for (auto& split : _read_sources[scan_range_idx].rs_splits) { split.rs_reader = split.rs_reader->clone(); } - auto scanner = - OlapScanner::create_shared(this, OlapScanner::Params { - state(), - _scanner_profile.get(), - scanner_ranges, - _tablets[scan_range_idx].tablet, - version, - 
_read_sources[scan_range_idx], - p._limit, - p._olap_scan_node.is_preaggregation, - read_row_binlog, - }); + + auto scanner = OlapScanner::create_shared( + this, OlapScanner::Params { + state(), + _scanner_profile.get(), + scanner_ranges, + _tablets[scan_range_idx].tablet, + version, + _read_sources[scan_range_idx], + p._limit, + p._olap_scan_node.is_preaggregation, + read_row_binlog, + resolve_binlog_scan_type(palo_scan_range), + resolve_binlog_read_source(palo_scan_range), + palo_scan_range.__isset.start_tso + ? std::make_optional(palo_scan_range.start_tso) + : std::nullopt, + palo_scan_range.__isset.end_tso + ? std::make_optional(palo_scan_range.end_tso) + : std::nullopt, + }); RETURN_IF_ERROR(scanner->init(state(), _conjuncts)); scanners->push_back(std::move(scanner)); } diff --git a/be/src/exec/scan/olap_scanner.cpp b/be/src/exec/scan/olap_scanner.cpp index 8656576dbeb4cd..f8154b33f3e4e0 100644 --- a/be/src/exec/scan/olap_scanner.cpp +++ b/be/src/exec/scan/olap_scanner.cpp @@ -38,6 +38,7 @@ #include "common/logging.h" #include "common/metrics/doris_metrics.h" #include "core/block/block.h" +#include "core/data_type/data_type_number.h" #include "exec/common/variant_util.h" #include "exec/operator/olap_scan_operator.h" #include "exec/scan/scan_node.h" @@ -51,12 +52,14 @@ #include "runtime/runtime_profile.h" #include "runtime/runtime_state.h" #include "service/backend_options.h" +#include "storage/binlog.h" #include "storage/id_manager.h" #include "storage/index/inverted/inverted_index_profile.h" #include "storage/iterator/block_reader.h" #include "storage/olap_common.h" #include "storage/olap_tuple.h" #include "storage/olap_utils.h" +#include "storage/predicate/predicate_creator.h" #include "storage/storage_engine.h" #include "storage/tablet/tablet_schema.h" #ifndef NDEBUG @@ -98,7 +101,11 @@ OlapScanner::OlapScanner(ScanLocalStateBase* parent, OlapScanner::Params&& param .score_runtime {}, .collection_statistics {}, .ann_topn_runtime {}, - 
.condition_cache_digest = parent->get_condition_cache_digest()}) { + .condition_cache_digest = parent->get_condition_cache_digest(), + .binlog_scan_type = params.binlog_scan_type, + .binlog_read_source = params.binlog_read_source}), + _start_tso(params.start_tso), + _end_tso(params.end_tso) { _tablet_reader_params.set_read_source(std::move(params.read_source), _state->skip_delete_bitmap()); _has_prepared = false; @@ -287,6 +294,49 @@ Status OlapScanner::_open_impl(RuntimeState* state) { return Status::OK(); } +Status OlapScanner::_init_row_binlog_tso_predicates() { + if (_tablet_reader_params.reader_type != ReaderType::READER_BINLOG) { + return Status::OK(); + } + + if (!_start_tso.has_value() && !_end_tso.has_value()) { + return Status::OK(); + } + + auto& tablet_schema = _tablet_reader_params.tablet_schema; + int32_t tso_index = tablet_schema->field_index(std::string(kRowBinlogTimestampColName)); + if (tso_index < 0) { + auto source_tablet_schema = _tablet_reader_params.tablet->row_binlog_tablet_schema(); + const int32_t source_tso_index = + source_tablet_schema->field_index(std::string(kRowBinlogTimestampColName)); + if (source_tso_index < 0) { + return Status::InternalError("Column {} not found in tablet schema", + std::string(kRowBinlogTimestampColName)); + } + tablet_schema->append_column(TabletColumn(source_tablet_schema->column(source_tso_index))); + tso_index = tablet_schema->field_index(std::string(kRowBinlogTimestampColName)); + } + if (tso_index < 0) { + return Status::InternalError("Column {} not found in tablet schema after append", + std::string(kRowBinlogTimestampColName)); + } + + auto data_type = std::make_shared(); + if (_start_tso.has_value()) { + Field start_value = + Field::create_field(extract_tso_physical_time(*_start_tso)); + _tablet_reader_params.predicates.push_back(create_comparison_predicate( + tso_index, std::string(kRowBinlogTimestampColName), data_type, start_value, false)); + } + if (_end_tso.has_value()) { + Field end_value = 
Field::create_field(extract_tso_physical_time(*_end_tso)); + _tablet_reader_params.predicates.push_back(create_comparison_predicate( + tso_index, std::string(kRowBinlogTimestampColName), data_type, end_value, false)); + } + + return Status::OK(); +} + // it will be called under tablet read lock because capture rs readers need Status OlapScanner::_init_tablet_reader_params( const phmap::flat_hash_map& slot_id_to_slot_desc, @@ -371,7 +421,49 @@ Status OlapScanner::_init_tablet_reader_params( _tablet_reader_params.origin_return_columns = &_return_columns; _tablet_reader_params.tablet_columns_convert_to_null_set = &_tablet_columns_convert_to_null_set; - if (_tablet_reader_params.direct_mode) { + auto add_return_column_if_absent = [&](uint32_t cid) { + if (std::find(_tablet_reader_params.return_columns.begin(), + _tablet_reader_params.return_columns.end(), + cid) == _tablet_reader_params.return_columns.end()) { + _tablet_reader_params.return_columns.push_back(cid); + } + }; + + const bool need_before_columns = + _tablet_reader_params.binlog_scan_type == TBinlogScanType::MIN_DELTA || + (_tablet_reader_params.binlog_scan_type == TBinlogScanType::DETAIL && + _tablet_reader_params.binlog_read_source == TBinlogReadSource::CHANGES); + if (need_before_columns) { + for (size_t i = 0; i < tablet_schema->num_key_columns(); ++i) { + add_return_column_if_absent(static_cast(i)); + } + for (auto cid : _return_columns) { + add_return_column_if_absent(cid); + } + + if (int32_t op_idx = tablet_schema->field_index(std::string(kRowBinlogOpColName)); + op_idx >= 0) { + add_return_column_if_absent(static_cast(op_idx)); + } + if (int32_t lsn_idx = tablet_schema->field_index(std::string(kRowBinlogLsnColName)); + lsn_idx >= 0) { + add_return_column_if_absent(static_cast(lsn_idx)); + } + + for (auto cid : _return_columns) { + if (cid >= tablet_schema->num_key_columns()) { + const auto& col_name = tablet_schema->column(cid).name(); + std::string before_col_name; + 
before_col_name.append("__BEFORE__"); + before_col_name.append(col_name); + before_col_name.append("__"); + if (int32_t before_idx = tablet_schema->field_index(before_col_name); + before_idx >= 0) { + add_return_column_if_absent(static_cast(before_idx)); + } + } + } + } else if (_tablet_reader_params.direct_mode) { _tablet_reader_params.return_columns = _return_columns; } else { // we need to fetch all key columns to do the right aggregation on storage engine side. @@ -424,6 +516,17 @@ Status OlapScanner::_init_tablet_reader_params( } } + RETURN_IF_ERROR(_init_row_binlog_tso_predicates()); + + if (_tablet_reader_params.binlog_scan_type != TBinlogScanType::NONE) { + _tablet_reader_params.read_orderby_key = true; + _tablet_reader_params.read_orderby_key_reverse = false; + _tablet_reader_params.read_orderby_key_num_prefix_columns = 0; + _tablet_reader_params.read_orderby_key_limit = 0; + _tablet_reader_params.force_key_ordered_read = true; + _tablet_reader_params.topn_filter_source_node_ids.clear(); + } + _tablet_reader_params.use_page_cache = _state->enable_page_cache(); DBUG_EXECUTE_IF("NewOlapScanner::_init_tablet_reader_params.block", DBUG_BLOCK); diff --git a/be/src/exec/scan/olap_scanner.h b/be/src/exec/scan/olap_scanner.h index ec19e38d7413c7..c0c24277b38721 100644 --- a/be/src/exec/scan/olap_scanner.h +++ b/be/src/exec/scan/olap_scanner.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -69,6 +70,10 @@ class OlapScanner : public Scanner { int64_t limit; bool aggregation; bool read_row_binlog = false; + TBinlogScanType::type binlog_scan_type = TBinlogScanType::NONE; + TBinlogReadSource::type binlog_read_source = TBinlogReadSource::NONE; + std::optional start_tso; + std::optional end_tso; }; OlapScanner(ScanLocalStateBase* parent, Params&& params); @@ -95,6 +100,8 @@ class OlapScanner : public Scanner { predicates, const std::vector& function_filters); + [[nodiscard]] Status _init_row_binlog_tso_predicates(); + [[nodiscard]] 
Status _init_return_columns(); [[nodiscard]] Status _init_variant_columns(); #ifndef NDEBUG @@ -105,6 +112,8 @@ class OlapScanner : public Scanner { TabletReader::ReaderParams _tablet_reader_params; std::unique_ptr _tablet_reader; + std::optional _start_tso; + std::optional _end_tso; int64_t _bytes_read_from_local = 0; int64_t _bytes_read_from_remote = 0; diff --git a/be/src/exec/scan/parallel_scanner_builder.cpp b/be/src/exec/scan/parallel_scanner_builder.cpp index 80cda08531f8e6..4c58bf4f4c8275 100644 --- a/be/src/exec/scan/parallel_scanner_builder.cpp +++ b/be/src/exec/scan/parallel_scanner_builder.cpp @@ -255,8 +255,19 @@ std::shared_ptr ParallelScannerBuilder::_build_scanner( BaseTabletSPtr tablet, int64_t version, const std::vector& key_ranges, TabletReadSource&& read_source) { OlapScanner::Params params { - _state, _scanner_profile.get(), key_ranges, std::move(tablet), - version, std::move(read_source), _limit, _is_preaggregation, + _state, + _scanner_profile.get(), + key_ranges, + std::move(tablet), + version, + std::move(read_source), + _limit, + _is_preaggregation, + false, + TBinlogScanType::NONE, + TBinlogReadSource::NONE, + std::nullopt, + std::nullopt, }; return OlapScanner::create_shared(_parent, std::move(params)); } diff --git a/be/src/storage/binlog.h b/be/src/storage/binlog.h index 1a302ea26453d5..161a2b3182e415 100644 --- a/be/src/storage/binlog.h +++ b/be/src/storage/binlog.h @@ -53,6 +53,8 @@ constexpr std::string_view kBinlogDataPrefix = "binlog_data_"; constexpr std::string_view kRowBinlogPrefix = "binlog_row_"; constexpr std::string_view kRowBinlogLsnColName = "__DORIS_BINLOG_LSN__"; constexpr std::string_view kRowBinlogTimestampColName = "__DORIS_BINLOG_TIMESTAMP__"; +constexpr std::string_view kRowBinlogOpColName = "__DORIS_BINLOG_OP__"; + constexpr int64_t kBinlogLsnAutoIncId = -1; // used in file directory constexpr std::string_view FDRowBinlogSuffix = "_row_binlog"; diff --git a/be/src/storage/iterator/block_reader.cpp 
b/be/src/storage/iterator/block_reader.cpp index c9af7fed63bf2d..6205f463d7df49 100644 --- a/be/src/storage/iterator/block_reader.cpp +++ b/be/src/storage/iterator/block_reader.cpp @@ -34,11 +34,14 @@ #include "common/status.h" #include "core/block/column_with_type_and_name.h" #include "core/column/column_nullable.h" +#include "core/column/column_string.h" #include "core/column/column_vector.h" #include "core/data_type/data_type_number.h" #include "exprs/aggregate/aggregate_function_reader.h" #include "exprs/function_filter.h" #include "runtime/runtime_state.h" +#include "storage/binlog.h" +#include "storage/iterator/block_reader_utils.h" #include "storage/iterator/vcollect_iterator.h" #include "storage/olap_common.h" #include "storage/olap_define.h" @@ -57,6 +60,40 @@ using namespace ErrorCode; static constexpr int32_t BLOCK_SIZE_CHECK_INTERVAL_ROWS = 64; +namespace { + +Status insert_cell_with_wrapper_adaptation(IColumn& dst, const IColumn& src, size_t row_pos) { + auto* dst_nullable = typeid_cast(&dst); + auto* src_nullable = check_and_get_column(src); + + if (dst_nullable == nullptr && src_nullable == nullptr) { + dst.insert_from(src, row_pos); + return Status::OK(); + } + + if (dst_nullable != nullptr && src_nullable != nullptr) { + dst.insert_from(src, row_pos); + return Status::OK(); + } + + if (dst_nullable != nullptr) { + dst_nullable->get_nested_column().insert_from(src, row_pos); + dst_nullable->get_null_map_data().push_back(0); + return Status::OK(); + } + + DCHECK(src_nullable != nullptr); + if (src_nullable->is_null_at(row_pos)) { + return Status::InternalError( + "cannot append null value from source column {} to non-nullable target column {}", + src.get_name(), dst.get_name()); + } + dst.insert_from(src_nullable->get_nested_column(), row_pos); + return Status::OK(); +} + +} // namespace + BlockReader::~BlockReader() { for (int i = 0; i < _agg_functions.size(); ++i) { _agg_functions[i]->destroy(_agg_places[i]); @@ -74,6 +111,426 @@ Status 
BlockReader::next_block_with_aggregation(Block* block, bool* eof) { return res; } +Status BlockReader::_ensure_binlog_column_pos(const Block& src_block) { + if (_binlog_column_pos_inited) { + if (_binlog_op_pos >= 0 && _binlog_op_pos < src_block.columns() && + src_block.get_by_position(_binlog_op_pos).name == kRowBinlogOpColName) { + return Status::OK(); + } + _binlog_op_pos = -1; + _binlog_lsn_pos = -1; + _binlog_timestamp_pos = -1; + _binlog_column_pos_inited = false; + } + + const size_t col_num = src_block.columns(); + size_t col_names_total_length = 0; + + for (size_t i = 0; i < col_num; ++i) { + const auto& name = src_block.get_by_position(i).name; + col_names_total_length += name.size(); + if (name == kRowBinlogOpColName) { + _binlog_op_pos = static_cast(i); + } else if (name == kRowBinlogLsnColName) { + _binlog_lsn_pos = static_cast(i); + } else if (name == kRowBinlogTimestampColName) { + _binlog_timestamp_pos = static_cast(i); + } + } + + if (_binlog_op_pos < 0) { + std::string col_names; + col_names.reserve(col_names_total_length + (col_num > 0 ? 
(col_num - 1) * 2 : 0)); + if (col_num > 0) { + col_names.append(src_block.get_by_position(0).name); + for (size_t i = 1; i < col_num; ++i) { + col_names.append(", "); + col_names.append(src_block.get_by_position(i).name); + } + } + return Status::InternalError("row binlog op column not found, block columns: {}", + col_names); + } + + _binlog_column_pos_inited = true; + return Status::OK(); +} + +int64_t BlockReader::_read_binlog_op(const IColumn& col, size_t row) const { + const IColumn* cur = &col; + if (const auto* nullable = check_and_get_column(*cur)) { + if (nullable->is_null_at(row)) { + return ROW_BINLOG_UNKNOWN; + } + cur = &nullable->get_nested_column(); + } + + if (const auto* int64_col = check_and_get_column(*cur)) { + return int64_col->get_element(row); + } + + return ROW_BINLOG_UNKNOWN; +} + +Status BlockReader::_write_binlog_op(IColumn& col, int64_t op) const { + IColumn* cur = &col; + ColumnNullable* nullable = nullptr; + if (auto* n = typeid_cast(cur)) { + nullable = n; + cur = &nullable->get_nested_column(); + } + + if (auto* int64_col = typeid_cast(cur)) { + int64_col->insert_value(op); + } else { + return Status::InternalError("invalid column type"); + } + + if (nullable != nullptr) { + nullable->get_null_map_data().push_back(0); + } + return Status::OK(); +} + +bool BlockReader::_is_binlog_meta_column(int idx) const { + return idx == _binlog_op_pos || idx == _binlog_lsn_pos || idx == _binlog_timestamp_pos; +} + +int BlockReader::_resolve_source_column_index(const Block& src_block, int idx, + bool use_before) const { + if (!use_before || _is_binlog_meta_column(idx)) { + return idx; + } + + return resolve_before_column_index(src_block, idx, _binlog_op_pos); +} + +void BlockReader::_init_pending_row_columns(const Block& block) { + if (!_pending_row_columns.empty()) { + return; + } + _pending_row_columns = block.clone_empty_columns(); +} + +bool BlockReader::_emit_pending_row(Block* block, MutableColumns& target_columns, + size_t& output_row_count, 
bool* eof) { + if (!_has_pending_row) { + return false; + } + + for (size_t i = 0; i < _pending_row_columns.size(); ++i) { + target_columns[i]->insert_from(*_pending_row_columns[i], 0); + _pending_row_columns[i]->clear(); + } + _has_pending_row = false; + output_row_count++; + + if (_eof) { + block->set_columns(std::move(target_columns)); + *eof = false; + return true; + } + + return false; +} + +Status BlockReader::_append_change_row(MutableColumns& target_columns, const Block& src_block, + size_t row_pos, int64_t output_op, bool use_before) { + RETURN_IF_ERROR(_ensure_binlog_column_pos(src_block)); + for (auto idx : _normal_columns_idx) { + int target_col_idx = _return_columns_loc[idx]; + if (target_col_idx < 0) { + continue; + } + if (idx == _binlog_op_pos) { + RETURN_IF_ERROR(_write_binlog_op(*target_columns[target_col_idx], output_op)); + continue; + } + int source_idx = _resolve_source_column_index(src_block, idx, use_before); + RETURN_IF_ERROR(insert_cell_with_wrapper_adaptation( + *target_columns[target_col_idx], *src_block.get_by_position(source_idx).column, + row_pos)); + } + return Status::OK(); +} + +Status BlockReader::_min_delta_next_block(Block* block, bool* eof) { + if (UNLIKELY(_eof && !_has_pending_row)) { + *eof = true; + return Status::OK(); + } + + if (_stored_data_columns.empty()) { + _stored_data_columns = _next_row.block->clone_empty_columns(); + } + + auto target_columns_guard = block->mutate_columns_scoped(); + auto& target_columns = target_columns_guard.mutable_columns(); + size_t output_row_count = 0; + + _init_pending_row_columns(*block); + if (_emit_pending_row(block, target_columns, output_row_count, eof)) { + return Status::OK(); + } + + while (output_row_count < _reader_context.batch_size && !_eof) { + if (_stored_data_columns[0]->empty()) { + for (size_t i = 0; i < _stored_data_columns.size(); ++i) { + _stored_data_columns[i]->insert_from(*_next_row.block->get_by_position(i).column, + _next_row.row_pos); + } + } + + IteratorRowRef 
last_row_ref = _next_row; + + auto res = _vcollect_iter.next(&_next_row); + if (UNLIKELY(res.is())) { + _eof = true; + *eof = true; + } else if (UNLIKELY(!res.ok())) { + return res; + } + + if (!_eof && _min_delta_next_row_has_same_key()) { + continue; + } + + if (UNLIKELY(last_row_ref.block == nullptr)) { + return Status::InternalError("invalid row reference in min-delta stream reader"); + } + RETURN_IF_ERROR(_ensure_binlog_column_pos(*last_row_ref.block)); + auto first_op = _read_binlog_op(*_stored_data_columns[_binlog_op_pos], 0); + + auto& last_op_col = last_row_ref.block->get_by_position(_binlog_op_pos).column; + auto last_op = _read_binlog_op(*last_op_col, last_row_ref.row_pos); + + auto result = AggregateFunctionMinDelta::calculate_result(first_op, last_op); + switch (result) { + case AggregateFunctionMinDelta::ResultType::SKIP: + break; + case AggregateFunctionMinDelta::ResultType::INSERT: + for (auto idx : _normal_columns_idx) { + int target_col_idx = _return_columns_loc[idx]; + if (idx == _binlog_op_pos) { + RETURN_IF_ERROR(_write_binlog_op(*target_columns[target_col_idx], + STREAM_CHANGE_INSERT)); + } else { + target_columns[target_col_idx]->insert_from( + *last_row_ref.block->get_by_position(idx).column, last_row_ref.row_pos); + } + } + output_row_count++; + break; + case AggregateFunctionMinDelta::ResultType::DELETE: + for (auto idx : _normal_columns_idx) { + int target_col_idx = _return_columns_loc[idx]; + if (idx == _binlog_op_pos) { + RETURN_IF_ERROR(_write_binlog_op(*target_columns[target_col_idx], + STREAM_CHANGE_DELETE)); + } else { + target_columns[target_col_idx]->insert_from( + *last_row_ref.block->get_by_position(idx).column, last_row_ref.row_pos); + } + } + output_row_count++; + break; + case AggregateFunctionMinDelta::ResultType::UPDATE_BEFORE_AFTER: + for (auto idx : _normal_columns_idx) { + int target_col_idx = _return_columns_loc[idx]; + if (idx == _binlog_op_pos) { + RETURN_IF_ERROR(_write_binlog_op(*target_columns[target_col_idx], + 
STREAM_CHANGE_UPDATE_BEFORE)); + } else if (idx == _binlog_lsn_pos) { + target_columns[target_col_idx]->insert_from( + *last_row_ref.block->get_by_position(idx).column, last_row_ref.row_pos); + } else { + int source_idx = _resolve_source_column_index(*last_row_ref.block, idx, true); + target_columns[target_col_idx]->insert_from(*_stored_data_columns[source_idx], + 0); + } + } + output_row_count++; + + if (output_row_count >= _reader_context.batch_size) { + for (auto& col : _pending_row_columns) { + col->clear(); + } + for (auto idx : _normal_columns_idx) { + int target_col_idx = _return_columns_loc[idx]; + if (idx == _binlog_op_pos) { + RETURN_IF_ERROR(_write_binlog_op(*_pending_row_columns[target_col_idx], + STREAM_CHANGE_UPDATE_AFTER)); + } else { + _pending_row_columns[target_col_idx]->insert_from( + *last_row_ref.block->get_by_position(idx).column, + last_row_ref.row_pos); + } + } + _has_pending_row = true; + } else { + for (auto idx : _normal_columns_idx) { + int target_col_idx = _return_columns_loc[idx]; + if (idx == _binlog_op_pos) { + RETURN_IF_ERROR(_write_binlog_op(*target_columns[target_col_idx], + STREAM_CHANGE_UPDATE_AFTER)); + } else { + target_columns[target_col_idx]->insert_from( + *last_row_ref.block->get_by_position(idx).column, + last_row_ref.row_pos); + } + } + output_row_count++; + } + break; + } + + for (auto& col : _stored_data_columns) { + col->clear(); + } + } + + block->set_columns(std::move(target_columns)); + *eof = _eof && !_has_pending_row; + return Status::OK(); +} + +Status BlockReader::_detail_change_next_block(Block* block, bool* eof) { + auto output_template_block = block->clone_empty(); + auto target_columns_guard = block->mutate_columns_scoped(); + auto& target_columns = target_columns_guard.mutable_columns(); + size_t output_row_count = 0; + _init_pending_row_columns(*block); + if (_emit_pending_row(block, target_columns, output_row_count, eof)) { + return Status::OK(); + } + + while (output_row_count < 
_reader_context.batch_size) { + if (_vcollect_iter.is_merge()) { + if (_eof) { + break; + } + if (UNLIKELY(_next_row.block == nullptr)) { + return Status::InternalError("invalid row reference in detail stream reader"); + } + const Block& source_block = *_next_row.block; + const size_t row = _next_row.row_pos; + RETURN_IF_ERROR(_ensure_binlog_column_pos(source_block)); + int64_t op = _read_binlog_op(*source_block.get_by_position(_binlog_op_pos).column, row); + if (op == ROW_BINLOG_UPDATE) { + RETURN_IF_ERROR(_append_change_row(target_columns, source_block, row, + STREAM_CHANGE_UPDATE_BEFORE, true)); + output_row_count++; + if (output_row_count >= _reader_context.batch_size) { + for (auto& col : _pending_row_columns) { + col->clear(); + } + RETURN_IF_ERROR(_append_change_row(_pending_row_columns, source_block, row, + STREAM_CHANGE_UPDATE_AFTER, false)); + _has_pending_row = true; + } else { + RETURN_IF_ERROR(_append_change_row(target_columns, source_block, row, + STREAM_CHANGE_UPDATE_AFTER, false)); + output_row_count++; + } + } else if (op == ROW_BINLOG_APPEND) { + RETURN_IF_ERROR(_append_change_row(target_columns, source_block, row, + STREAM_CHANGE_INSERT, false)); + output_row_count++; + } else if (op == ROW_BINLOG_DELETE) { + RETURN_IF_ERROR(_append_change_row(target_columns, source_block, row, + STREAM_CHANGE_DELETE, false)); + output_row_count++; + } + + auto res = _vcollect_iter.next(&_next_row); + if (UNLIKELY(res.is())) { + _eof = true; + *eof = true; + } else if (UNLIKELY(!res.ok())) { + return res; + } + continue; + } + + DCHECK(_next_row.block != nullptr); + auto source_template_block = _next_row.block->clone_empty(); + Block source_block_storage; + source_block_storage = source_template_block.clone_empty(); + Block* read_block = &source_block_storage; + Status res = _vcollect_iter.next(read_block); + if (UNLIKELY(!res.ok() && !res.is())) { + return res; + } + *eof = res.is(); + _eof = *eof; + const Block& source_block = *read_block; + if 
(source_block.rows() == 0) { + break; + } + RETURN_IF_ERROR(_ensure_binlog_column_pos(source_block)); + auto result_columns = output_template_block.clone_empty_columns(); + + for (size_t row = 0; + row < source_block.rows() && output_row_count < _reader_context.batch_size; ++row) { + int64_t op = _read_binlog_op(*source_block.get_by_position(_binlog_op_pos).column, row); + if (op == ROW_BINLOG_UPDATE) { + RETURN_IF_ERROR(_append_change_row(result_columns, source_block, row, + STREAM_CHANGE_UPDATE_BEFORE, true)); + output_row_count++; + if (output_row_count >= _reader_context.batch_size) { + for (auto& col : _pending_row_columns) { + col->clear(); + } + RETURN_IF_ERROR(_append_change_row(_pending_row_columns, source_block, row, + STREAM_CHANGE_UPDATE_AFTER, false)); + _has_pending_row = true; + break; + } + RETURN_IF_ERROR(_append_change_row(result_columns, source_block, row, + STREAM_CHANGE_UPDATE_AFTER, false)); + output_row_count++; + } else if (op == ROW_BINLOG_APPEND) { + RETURN_IF_ERROR(_append_change_row(result_columns, source_block, row, + STREAM_CHANGE_INSERT, false)); + output_row_count++; + } else if (op == ROW_BINLOG_DELETE) { + RETURN_IF_ERROR(_append_change_row(result_columns, source_block, row, + STREAM_CHANGE_DELETE, false)); + output_row_count++; + } + } + block->set_columns(std::move(result_columns)); + *eof = _eof && !_has_pending_row; + return Status::OK(); + } + + block->set_columns(std::move(target_columns)); + *eof = _eof && !_has_pending_row; + return Status::OK(); +} + +bool BlockReader::_min_delta_next_row_has_same_key() const { + if (_next_row.row_pos < 0 || _next_row.block == nullptr) { + return false; + } + if (_stored_data_columns.empty() || _stored_data_columns[0]->empty()) { + return false; + } + + const size_t num_key_columns = _tablet_schema->num_key_columns(); + if (num_key_columns > _stored_data_columns.size() || + num_key_columns > _next_row.block->columns()) { + return false; + } + for (size_t idx = 0; idx < num_key_columns; 
++idx) { + const IColumn& stored_key_col = *_stored_data_columns[idx]; + const IColumn& next_key_col = *_next_row.block->get_by_position(idx).column; + if (stored_key_col.compare_at(0, _next_row.row_pos, next_key_col, -1) != 0) { + return false; + } + } + return true; +} bool BlockReader::_rowsets_not_mono_asc_disjoint(const ReaderParams& read_params) { std::string pre_rs_last_key; bool pre_rs_key_bounds_truncated {false}; @@ -118,8 +575,10 @@ Status BlockReader::_init_collect_iter(const ReaderParams& read_params) { { SCOPED_RAW_TIMER(&_stats.block_reader_vcollect_iter_init_timer_ns); _is_rowsets_overlapping = _rowsets_not_mono_asc_disjoint(read_params); - _vcollect_iter.init(this, _is_rowsets_overlapping, read_params.read_orderby_key, - read_params.read_orderby_key_reverse); + const bool is_min_delta_stream = read_params.binlog_scan_type == TBinlogScanType::MIN_DELTA; + const bool force_merge = read_params.read_orderby_key || is_min_delta_stream; + const bool is_reverse = !is_min_delta_stream && read_params.read_orderby_key_reverse; + _vcollect_iter.init(this, _is_rowsets_overlapping, force_merge, is_reverse); } std::vector valid_rs_readers; @@ -274,6 +733,16 @@ Status BlockReader::init(const ReaderParams& read_params) { return status; } + if (read_params.binlog_scan_type == TBinlogScanType::MIN_DELTA) { + _next_block_func = &BlockReader::_min_delta_next_block; + return Status::OK(); + } + if (read_params.binlog_scan_type == TBinlogScanType::DETAIL && + read_params.binlog_read_source == TBinlogReadSource::CHANGES) { + _next_block_func = &BlockReader::_detail_change_next_block; + return Status::OK(); + } + if (_direct_mode) { _next_block_func = &BlockReader::_direct_next_block; return Status::OK(); diff --git a/be/src/storage/iterator/block_reader.h b/be/src/storage/iterator/block_reader.h index cc4eced6113b7c..4d9644b32ea5f4 100644 --- a/be/src/storage/iterator/block_reader.h +++ b/be/src/storage/iterator/block_reader.h @@ -75,6 +75,25 @@ class BlockReader final : 
public TabletReader { // to minimize the comparison time in merge heap. Status _unique_key_next_block(Block* block, bool* eof); + Status _min_delta_next_block(Block* block, bool* eof); + + Status _detail_change_next_block(Block* block, bool* eof); + + Status _ensure_binlog_column_pos(const Block& src_block); + + int64_t _read_binlog_op(const IColumn& col, size_t row) const; + + Status _write_binlog_op(IColumn& col, int64_t op) const; + + bool _is_binlog_meta_column(int idx) const; + + int _resolve_source_column_index(const Block& src_block, int idx, bool use_before) const; + + void _init_pending_row_columns(const Block& block); + + bool _emit_pending_row(Block* block, MutableColumns& target_columns, size_t& output_row_count, + bool* eof); + Status _replace_key_next_block(Block* block, bool* eof); Status _init_collect_iter(const ReaderParams& read_params); @@ -99,6 +118,13 @@ class BlockReader final : public TabletReader { void _update_agg_value(MutableColumns& columns, int begin, int end, bool is_close = true); + bool _min_delta_next_row_has_same_key() const; + + Status _append_change_row(MutableColumns& target_columns, const Block& src_block, + size_t row_pos, int64_t output_op, bool use_before); + + bool _get_next_row_same(); + // return false if keys of rowsets are mono ascending and disjoint bool _rowsets_not_mono_asc_disjoint(const ReaderParams& read_params); @@ -121,6 +147,9 @@ class BlockReader final : public TabletReader { std::vector _stored_has_null_tag; std::vector _stored_has_variable_length_tag; + MutableColumns _pending_row_columns; + bool _has_pending_row = false; + phmap::flat_hash_map>> _temp_ref_map; bool _eof = false; @@ -133,6 +162,11 @@ class BlockReader final : public TabletReader { bool _is_rowsets_overlapping = true; + int _binlog_op_pos = -1; + int _binlog_lsn_pos = -1; + int _binlog_timestamp_pos = -1; + bool _binlog_column_pos_inited = false; + bool _has_seq_map = false; // for check multi seq std::unordered_map _seq_columns; diff --git 
a/be/src/storage/iterator/block_reader_utils.h b/be/src/storage/iterator/block_reader_utils.h new file mode 100644 index 00000000000000..d4be6236a6c7a6 --- /dev/null +++ b/be/src/storage/iterator/block_reader_utils.h @@ -0,0 +1,111 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "core/block/block.h" +#include "storage/binlog.h" + +namespace doris { + +constexpr int64_t ROW_BINLOG_UNKNOWN = 3; + +constexpr int64_t STREAM_CHANGE_INSERT = 0; +constexpr int64_t STREAM_CHANGE_DELETE = 1; +constexpr int64_t STREAM_CHANGE_UPDATE_BEFORE = 2; +constexpr int64_t STREAM_CHANGE_UPDATE_AFTER = 3; + +// Build the __BEFORE__ column name for a base column. +inline std::string build_before_column_name(std::string_view name) { + std::string before_name = "__BEFORE__"; + before_name.append(name.data(), name.size()); + before_name.append("__"); + return before_name; +} + +// Resolve __BEFORE__ column index for a base column when present. 
+inline int resolve_before_column_index(const Block& block, int idx, int binlog_op_pos) { + if (idx == binlog_op_pos) { + return idx; + } + + const auto& col_with_name = block.get_by_position(idx); + std::string before_name = build_before_column_name(col_with_name.name); + int tmp_idx = block.get_position_by_name(before_name); + return tmp_idx < 0 ? idx : tmp_idx; +} + +enum class MinDeltaResultType { SKIP, INSERT, DELETE, UPDATE_BEFORE_AFTER }; + +// MIN_DELTA uses row binlog op codes as indices into a 2D lookup table, so we guard the op layout here. +static_assert(ROW_BINLOG_APPEND == 0 && ROW_BINLOG_UPDATE == 1 && ROW_BINLOG_DELETE == 2, + "row binlog op layout changed; update min-delta transition matrix"); + +inline bool is_valid_row_binlog_op(int64_t op) { + return op >= ROW_BINLOG_APPEND && op <= ROW_BINLOG_DELETE; +} + +inline MinDeltaResultType calculate_min_delta_result(int64_t first_op, int64_t last_op) { + using ResultType = MinDeltaResultType; + + // Transition matrix: row=first_op, col=last_op, value=min-delta result type. + // Column order is fixed as [APPEND, UPDATE, DELETE]. + // + // Semantic examples: + // 1) APPEND -> DELETE = SKIP: + // Insert then delete within the same window yields no visible change. + // 2) UPDATE -> DELETE = DELETE: + // Update then delete; downstream only needs the pre-delete snapshot. + // 3) DELETE -> APPEND = INSERT: + // Delete then append (rebuild) is equivalent to inserting a new value. + static constexpr std::array, 3> kTransitionMatrix = {{ + // first_op = APPEND + {ResultType::INSERT, ResultType::INSERT, ResultType::SKIP}, + // first_op = UPDATE + {ResultType::UPDATE_BEFORE_AFTER, ResultType::UPDATE_BEFORE_AFTER, ResultType::DELETE}, + // first_op = DELETE + {ResultType::INSERT, ResultType::INSERT, ResultType::DELETE}, + }}; + + // Fallback for unknown/invalid op codes: avoid out-of-bounds and keep changes conservatively. 
+ if (!is_valid_row_binlog_op(first_op) || !is_valid_row_binlog_op(last_op)) { + return ResultType::UPDATE_BEFORE_AFTER; + } + + return kTransitionMatrix[static_cast(first_op)][static_cast(last_op)]; +} + +/** + * MIN_DELTA result helper: + * Given the first/last row binlog op within the same key group, returns the min-delta change type. + */ +class AggregateFunctionMinDelta { +public: + using ResultType = MinDeltaResultType; + + static ResultType calculate_result(int64_t first_op, int64_t last_op) { + return calculate_min_delta_result(first_op, last_op); + } +}; + +} // namespace doris \ No newline at end of file diff --git a/be/src/storage/iterator/vcollect_iterator.cpp b/be/src/storage/iterator/vcollect_iterator.cpp index 32df10c933d32d..2aa7363919d21e 100644 --- a/be/src/storage/iterator/vcollect_iterator.cpp +++ b/be/src/storage/iterator/vcollect_iterator.cpp @@ -42,6 +42,7 @@ #include "runtime/runtime_predicate.h" #include "runtime/runtime_profile.h" #include "runtime/runtime_state.h" +#include "storage/binlog.h" #include "storage/olap_common.h" #include "storage/olap_define.h" #include "storage/rowset/rowset.h" @@ -228,7 +229,11 @@ bool VCollectIterator::LevelIteratorComparator::operator()(LevelIterator* lhs, L // read data from higher version to lower version. // for UNIQUE_KEYS just read the highest version and no need agg_update. // for AGG_KEYS if a version is deleted, the lower version no need to agg_update - bool lower = (cmp_res != 0) ? (cmp_res < 0) : (lhs->version() < rhs->version()); + // Tie-break direction depends on which column was used: + // - sequence column (UNIQUE_KEYS): larger value wins (cmp_res < 0 => lhs lower). + // - binlog LSN column (row binlog reads): smaller value wins (cmp_res > 0 => lhs lower). + bool lower = (cmp_res != 0) ? (_lsn_mode ? (cmp_res > 0) : (cmp_res < 0)) + : (lhs->version() < rhs->version()); lower ? 
lhs->set_same(true) : rhs->set_same(true); return lower; @@ -690,7 +695,21 @@ Status VCollectIterator::Level1Iterator::init(bool get_data_by_ref) { break; } } - _heap = std::make_unique(LevelIteratorComparator(sequence_loc, _is_reverse)); + + int32_t lsn_col_id = + _reader->_tablet_schema->field_index(std::string(kRowBinlogLsnColName)); + if (lsn_col_id >= 0) { + DCHECK(sequence_loc == -1); + for (int loc = 0; loc < _reader->_return_columns.size(); ++loc) { + if (_reader->_return_columns[loc] == static_cast(lsn_col_id)) { + sequence_loc = loc; + break; + } + } + } + + _heap = std::make_unique( + LevelIteratorComparator(sequence_loc, _is_reverse, lsn_col_id >= 0)); for (auto&& child : _children) { DCHECK(child != nullptr); //DCHECK(child->current_row().ok()); diff --git a/be/src/storage/iterator/vcollect_iterator.h b/be/src/storage/iterator/vcollect_iterator.h index c297f9f3750d03..998241c2dc7cdb 100644 --- a/be/src/storage/iterator/vcollect_iterator.h +++ b/be/src/storage/iterator/vcollect_iterator.h @@ -165,8 +165,8 @@ class VCollectIterator { // if row cursors equal, compare data version. class LevelIteratorComparator { public: - LevelIteratorComparator(int sequence, bool is_reverse) - : _sequence(sequence), _is_reverse(is_reverse) {} + LevelIteratorComparator(int sequence, bool is_reverse, bool lsn_mode) + : _sequence(sequence), _is_reverse(is_reverse), _lsn_mode(lsn_mode) {} bool operator()(LevelIterator* lhs, LevelIterator* rhs); @@ -174,6 +174,11 @@ class VCollectIterator { int _sequence; // reverse the compare order bool _is_reverse = false; + // Kind of tie-break column `_sequence` refers to when keys are equal: + // - false : sequence column (Unique Key tables) larger value wins. + // - true : binlog LSN column (row binlog reads) smaller value wins. + // The two cases are mutually exclusive (DCHECK at construction site).
+ bool _lsn_mode = false; }; #ifdef USE_LIBCPP diff --git a/be/src/storage/iterator/vgeneric_iterators.cpp b/be/src/storage/iterator/vgeneric_iterators.cpp index 19efea82d5af3d..27d40d95f35b34 100644 --- a/be/src/storage/iterator/vgeneric_iterators.cpp +++ b/be/src/storage/iterator/vgeneric_iterators.cpp @@ -133,11 +133,18 @@ bool VMergeIteratorContext::compare(const VMergeIteratorContext& rhs) const { } auto col_cmp_res = 0; - if (_sequence_id_idx != -1) { + bool compared_by_lsn = false; + if (col_cmp_res == 0 && _binlog_lsn_idx != -1) { + col_cmp_res = _block->compare_column_at(_index_in_block, rhs._index_in_block, + _binlog_lsn_idx, *rhs._block, -1); + compared_by_lsn = (col_cmp_res != 0); + } + if (col_cmp_res == 0 && _sequence_id_idx != -1) { col_cmp_res = _block->compare_column_at(_index_in_block, rhs._index_in_block, _sequence_id_idx, *rhs._block, -1); } - auto result = col_cmp_res == 0 ? data_id() < rhs.data_id() : col_cmp_res < 0; + auto result = col_cmp_res == 0 ? data_id() < rhs.data_id() + : (compared_by_lsn ? (col_cmp_res > 0) : (col_cmp_res < 0)); if (_is_unique) { result ? 
set_skip(true) : rhs.set_skip(true); @@ -358,7 +365,7 @@ Status VMergeIterator::init(const StorageReadOptions& opts) { for (auto& iter : _origin_iters) { auto ctx = std::make_shared( std::move(iter), _sequence_id_idx, _is_unique, _is_reverse, - opts.read_orderby_key_columns, _output_schema); + opts.read_orderby_key_columns, _output_schema, _binlog_lsn_idx); RETURN_IF_ERROR(ctx->init(opts)); if (!ctx->valid()) { continue; @@ -453,12 +460,14 @@ Status VUnionIterator::current_block_row_locations(std::vector* loc RowwiseIteratorUPtr new_merge_iterator(std::vector&& inputs, int sequence_id_idx, bool is_unique, bool is_reverse, - uint64_t* merged_rows, SchemaSPtr output_schema) { + uint64_t* merged_rows, SchemaSPtr output_schema, + int binlog_lsn_idx) { // when the size of inputs is 1, we also need to use VMergeIterator, because the // next_block_view function only be implemented in VMergeIterator. The reason why // the size of inputs is 1 is that the segment was filtered out by zone map or others. 
return std::make_unique(std::move(inputs), sequence_id_idx, is_unique, - is_reverse, merged_rows, std::move(output_schema)); + is_reverse, merged_rows, std::move(output_schema), + binlog_lsn_idx); } RowwiseIteratorUPtr new_union_iterator(std::vector&& inputs, diff --git a/be/src/storage/iterator/vgeneric_iterators.h b/be/src/storage/iterator/vgeneric_iterators.h index e33850de16dc78..8f4abe853336e6 100644 --- a/be/src/storage/iterator/vgeneric_iterators.h +++ b/be/src/storage/iterator/vgeneric_iterators.h @@ -85,11 +85,12 @@ class VMergeIteratorContext { public: VMergeIteratorContext(RowwiseIteratorUPtr&& iter, int sequence_id_idx, bool is_unique, bool is_reverse, std::vector* read_orderby_key_columns, - SchemaSPtr output_schema) + SchemaSPtr output_schema, int binlog_lsn_idx = -1) : _iter(std::move(iter)), _sequence_id_idx(sequence_id_idx), _is_unique(is_unique), _is_reverse(is_reverse), + _binlog_lsn_idx(binlog_lsn_idx), _output_schema(std::move(output_schema)), _num_key_columns(cast_set(_output_schema->num_key_columns())), _compare_columns(read_orderby_key_columns) {} @@ -187,6 +188,7 @@ class VMergeIteratorContext { int _sequence_id_idx = -1; bool _is_unique = false; bool _is_reverse = false; + int _binlog_lsn_idx = -1; bool _valid = false; mutable bool _skip = false; mutable bool _same = false; @@ -218,12 +220,14 @@ class VMergeIterator : public RowwiseIterator { public: // VMergeIterator takes the ownership of input iterators VMergeIterator(std::vector&& iters, int sequence_id_idx, bool is_unique, - bool is_reverse, uint64_t* merged_rows, SchemaSPtr output_schema) + bool is_reverse, uint64_t* merged_rows, SchemaSPtr output_schema, + int binlog_lsn_idx = -1) : _origin_iters(std::move(iters)), _output_schema(std::move(output_schema)), _sequence_id_idx(sequence_id_idx), _is_unique(is_unique), _is_reverse(is_reverse), + _binlog_lsn_idx(binlog_lsn_idx), _merged_rows(merged_rows) {} ~VMergeIterator() override = default; @@ -341,6 +345,7 @@ class VMergeIterator : 
public RowwiseIterator { int _sequence_id_idx = -1; bool _is_unique = false; bool _is_reverse = false; + int _binlog_lsn_idx = -1; uint64_t* _merged_rows = nullptr; bool _record_rowids = false; std::vector _block_row_locations; @@ -354,7 +359,8 @@ class VMergeIterator : public RowwiseIterator { // should delete returned iterator after usage. RowwiseIteratorUPtr new_merge_iterator(std::vector&& inputs, int sequence_id_idx, bool is_unique, bool is_reverse, - uint64_t* merged_rows, SchemaSPtr output_schema); + uint64_t* merged_rows, SchemaSPtr output_schema, + int binlog_lsn_idx = -1); // Create a union iterator for input iterators. Union iterator will read // input iterators one by one. diff --git a/be/src/storage/iterators.h b/be/src/storage/iterators.h index ff2d48360155a7..c2525383a7ae85 100644 --- a/be/src/storage/iterators.h +++ b/be/src/storage/iterators.h @@ -121,6 +121,7 @@ class StorageReadOptions { int topn_filter_target_node_id = -1; // used for special optimization for query : ORDER BY key DESC LIMIT n bool read_orderby_key_reverse = false; + int binlog_lsn_idx = -1; // columns for orderby keys std::vector* read_orderby_key_columns = nullptr; io::IOContext io_ctx; diff --git a/be/src/storage/rowset/beta_rowset_reader.cpp b/be/src/storage/rowset/beta_rowset_reader.cpp index 9aa51eb47239d1..98ee259a5d210d 100644 --- a/be/src/storage/rowset/beta_rowset_reader.cpp +++ b/be/src/storage/rowset/beta_rowset_reader.cpp @@ -34,6 +34,7 @@ #include "io/io_common.h" #include "runtime/descriptors.h" #include "runtime/runtime_profile.h" +#include "storage/binlog.h" #include "storage/delete/delete_handler.h" #include "storage/iterator/vgeneric_iterators.h" #include "storage/olap_define.h" @@ -135,14 +136,31 @@ Status BetaRowsetReader::get_segment_iterators(RowsetReaderContext* read_context std::vector read_columns; std::set read_columns_set; std::set delete_columns_set; + auto add_read_column_if_absent = [&](uint32_t cid) { + if (read_columns_set.insert(cid).second) { + 
read_columns.push_back(cid); + } + }; for (int i = 0; i < _read_context->return_columns->size(); ++i) { - read_columns.push_back(_read_context->return_columns->at(i)); - read_columns_set.insert(_read_context->return_columns->at(i)); + add_read_column_if_absent(_read_context->return_columns->at(i)); } _read_options.delete_condition_predicates->get_all_column_ids(delete_columns_set); for (auto cid : delete_columns_set) { - if (read_columns_set.find(cid) == read_columns_set.end()) { - read_columns.push_back(cid); + add_read_column_if_absent(cid); + } + if (_read_context->predicates != nullptr) { + for (auto pred : *(_read_context->predicates)) { + add_read_column_if_absent(pred->column_id()); + } + } + if (_should_push_down_value_predicates()) { + // sequence mapping currently only supports merge-on-read, so value predicates cannot + // be pushed down + if (_read_context->value_predicates != nullptr && + !read_context->tablet_schema->has_seq_map()) { + for (auto pred : *(_read_context->value_predicates)) { + add_read_column_if_absent(pred->column_id()); + } } } // disable condition cache if you have delete condition @@ -213,6 +231,17 @@ Status BetaRowsetReader::get_segment_iterators(RowsetReaderContext* read_context _read_options.topn_filter_source_node_ids = _read_context->topn_filter_source_node_ids; _read_options.topn_filter_target_node_id = _read_context->topn_filter_target_node_id; _read_options.read_orderby_key_reverse = _read_context->read_orderby_key_reverse; + _read_options.binlog_lsn_idx = -1; + int32_t lsn_col_id = + _read_context->tablet_schema->field_index(std::string(kRowBinlogLsnColName)); + if (lsn_col_id >= 0) { + for (size_t i = 0; i < _read_context->return_columns->size(); ++i) { + if (_read_context->return_columns->at(i) == static_cast(lsn_col_id)) { + _read_options.binlog_lsn_idx = static_cast(i); + break; + } + } + } _read_options.read_orderby_key_columns = _read_context->read_orderby_key_columns; _read_options.io_ctx.reader_type = 
_read_context->reader_type; _read_options.io_ctx.file_cache_stats = &_stats->file_cache_stats; @@ -333,7 +362,8 @@ Status BetaRowsetReader::_init_iterator() { } _iterator = new_merge_iterator(std::move(iterators), sequence_loc, _read_context->is_unique, _read_context->read_orderby_key_reverse, - _read_context->merged_rows, _output_schema); + _read_context->merged_rows, _output_schema, + _read_options.binlog_lsn_idx); } else { if (_read_context->read_orderby_key_reverse) { // reverse iterators to read backward for ORDER BY key DESC diff --git a/be/src/storage/rowset/beta_rowset_reader.h b/be/src/storage/rowset/beta_rowset_reader.h index 9faefad21a8500..d06ac71c934f4a 100644 --- a/be/src/storage/rowset/beta_rowset_reader.h +++ b/be/src/storage/rowset/beta_rowset_reader.h @@ -60,8 +60,9 @@ class BetaRowsetReader : public RowsetReader { } bool is_merge_iterator() const override { - return _read_context->need_ordered_result && - _rowset->rowset_meta()->is_segments_overlapping() && _get_segment_num() > 1; + return _read_context->need_ordered_result && _get_segment_num() > 1 && + (_rowset->rowset_meta()->is_segments_overlapping() || + _read_context->force_key_ordered_read); } bool delete_flag() override { return _rowset->delete_flag(); } diff --git a/be/src/storage/rowset/rowset_reader_context.h b/be/src/storage/rowset/rowset_reader_context.h index dc25c3105d0624..6eb6a8411d1cf5 100644 --- a/be/src/storage/rowset/rowset_reader_context.h +++ b/be/src/storage/rowset/rowset_reader_context.h @@ -47,7 +47,7 @@ struct RowsetReaderContext { bool need_ordered_result = true; // used for special optimization for query : ORDER BY key DESC LIMIT n bool read_orderby_key_reverse = false; - // columns for orderby keys + bool force_key_ordered_read = false; std::vector* read_orderby_key_columns = nullptr; // limit of rows for read_orderby_key size_t read_orderby_key_limit = 0; diff --git a/be/src/storage/segment/historical_row_retriever.h 
b/be/src/storage/segment/historical_row_retriever.h index 5949896a4765a1..ceb07be40c42c1 100644 --- a/be/src/storage/segment/historical_row_retriever.h +++ b/be/src/storage/segment/historical_row_retriever.h @@ -38,6 +38,7 @@ struct HistoricalRowRetrieverContext { std::shared_ptr partial_update_info; bool is_transient_rowset_writer = false; DataWriteType write_type = DataWriteType::TYPE_DEFAULT; + bool need_before_image = false; }; class HistoricalRowRetriever { diff --git a/be/src/storage/segment/row_binlog_segment_writer.cpp b/be/src/storage/segment/row_binlog_segment_writer.cpp index ccfc53d9fe9223..928799cdcf4307 100644 --- a/be/src/storage/segment/row_binlog_segment_writer.cpp +++ b/be/src/storage/segment/row_binlog_segment_writer.cpp @@ -85,7 +85,8 @@ Status RowBinlogSegmentWriter::init() { .rowset_writer_ctx = _opts.rowset_ctx, .partial_update_info = _binlog_opts.source.partial_update_info, .is_transient_rowset_writer = _binlog_opts.source.is_transient_rowset_writer, - .write_type = _binlog_opts.source.source_write_type}; + .write_type = _binlog_opts.source.source_write_type, + .need_before_image = _write_before}; if (_tablet->enable_unique_key_merge_on_write()) { _historical_data_writer = std::make_unique(); RETURN_IF_ERROR(_historical_data_writer->init(historical_row_retriever_context)); diff --git a/be/src/storage/tablet/tablet_reader.cpp b/be/src/storage/tablet/tablet_reader.cpp index ffb98c469816a0..51607360c01bab 100644 --- a/be/src/storage/tablet/tablet_reader.cpp +++ b/be/src/storage/tablet/tablet_reader.cpp @@ -156,6 +156,7 @@ Status TabletReader::_capture_rs_readers(const ReaderParams& read_params) { _reader_context.topn_filter_source_node_ids = read_params.topn_filter_source_node_ids; _reader_context.topn_filter_target_node_id = read_params.topn_filter_target_node_id; _reader_context.read_orderby_key_reverse = read_params.read_orderby_key_reverse; + _reader_context.force_key_ordered_read = read_params.force_key_ordered_read; 
_reader_context.read_orderby_key_limit = read_params.read_orderby_key_limit; _reader_context.return_columns = &_return_columns; _reader_context.read_orderby_key_columns = diff --git a/be/src/storage/tablet/tablet_reader.h b/be/src/storage/tablet/tablet_reader.h index 6bf57dbe441868..29cb88115c90db 100644 --- a/be/src/storage/tablet/tablet_reader.h +++ b/be/src/storage/tablet/tablet_reader.h @@ -177,6 +177,7 @@ class TabletReader { bool read_orderby_key = false; // used for special optimization for query : ORDER BY key DESC LIMIT n bool read_orderby_key_reverse = false; + bool force_key_ordered_read = false; // num of columns for orderby key size_t read_orderby_key_num_prefix_columns = 0; // limit of rows for read_orderby_key @@ -212,6 +213,8 @@ class TabletReader { // General LIMIT budget forwarded to SegmentIterator. -1 means no limit. int64_t general_read_limit = -1; + TBinlogScanType::type binlog_scan_type = TBinlogScanType::NONE; + TBinlogReadSource::type binlog_read_source = TBinlogReadSource::NONE; }; TabletReader() = default; diff --git a/be/test/storage/iterator/block_reader_utils_min_delta_test.cpp b/be/test/storage/iterator/block_reader_utils_min_delta_test.cpp new file mode 100644 index 00000000000000..8d436e76200ee7 --- /dev/null +++ b/be/test/storage/iterator/block_reader_utils_min_delta_test.cpp @@ -0,0 +1,134 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include +#include + +#include "storage/iterator/block_reader_utils.h" + +namespace doris { + +using ResultType = AggregateFunctionMinDelta::ResultType; + +TEST(AggregateFunctionMinDeltaTest, ValidOperationPairs) { + // Cover the 3x3 valid row binlog op pairs to keep the min-delta mapping stable. + struct Case { + int64_t first_op; + int64_t last_op; + ResultType expected; + }; + + const Case cases[] = { + {ROW_BINLOG_APPEND, ROW_BINLOG_APPEND, ResultType::INSERT}, + {ROW_BINLOG_APPEND, ROW_BINLOG_UPDATE, ResultType::INSERT}, + {ROW_BINLOG_APPEND, ROW_BINLOG_DELETE, ResultType::SKIP}, + {ROW_BINLOG_UPDATE, ROW_BINLOG_APPEND, ResultType::UPDATE_BEFORE_AFTER}, + {ROW_BINLOG_UPDATE, ROW_BINLOG_UPDATE, ResultType::UPDATE_BEFORE_AFTER}, + {ROW_BINLOG_UPDATE, ROW_BINLOG_DELETE, ResultType::DELETE}, + {ROW_BINLOG_DELETE, ROW_BINLOG_APPEND, ResultType::INSERT}, + {ROW_BINLOG_DELETE, ROW_BINLOG_UPDATE, ResultType::INSERT}, + {ROW_BINLOG_DELETE, ROW_BINLOG_DELETE, ResultType::DELETE}, + }; + + for (const auto& c : cases) { + EXPECT_EQ(c.expected, AggregateFunctionMinDelta::calculate_result(c.first_op, c.last_op)) + << "first_op=" << c.first_op << ", last_op=" << c.last_op; + } +} + +TEST(AggregateFunctionMinDeltaTest, InvalidOperationFallback) { + // Invalid op codes (negative/out-of-range) should fall back to avoid OOB and keep changes conservatively. 
+ const int64_t invalid_values[] = {-1, + 3, + 4, + 100, + std::numeric_limits::min(), + std::numeric_limits::max()}; + + for (int64_t invalid_op : invalid_values) { + EXPECT_EQ(ResultType::UPDATE_BEFORE_AFTER, + AggregateFunctionMinDelta::calculate_result(invalid_op, ROW_BINLOG_APPEND)) + << "invalid first_op=" << invalid_op; + EXPECT_EQ(ResultType::UPDATE_BEFORE_AFTER, + AggregateFunctionMinDelta::calculate_result(ROW_BINLOG_DELETE, invalid_op)) + << "invalid last_op=" << invalid_op; + } +} + +TEST(AggregateFunctionMinDeltaTest, SemanticScenarios) { + // Scenario 1: insert then delete yields no net change. + EXPECT_EQ(ResultType::SKIP, + AggregateFunctionMinDelta::calculate_result(ROW_BINLOG_APPEND, ROW_BINLOG_DELETE)); + + // Scenario 2: update then delete emits DELETE (with pre-delete snapshot values). + EXPECT_EQ(ResultType::DELETE, + AggregateFunctionMinDelta::calculate_result(ROW_BINLOG_UPDATE, ROW_BINLOG_DELETE)); + + // Scenario 3: delete then insert (rebuild) is treated as INSERT. + EXPECT_EQ(ResultType::INSERT, + AggregateFunctionMinDelta::calculate_result(ROW_BINLOG_DELETE, ROW_BINLOG_APPEND)); +} + +TEST(AggregateFunctionMinDeltaTest, CrossRowsetSameKeyScenarios) { + // Model same-key row binlog ops read from multiple rowsets in commit order. + // The min-delta result depends on the first and last op for that key, regardless of rowset boundaries. + auto calc_from_rowsets = [](const std::vector>& rowsets) -> ResultType { + bool found = false; + int64_t first_op = 0; + int64_t last_op = 0; + for (const auto& rowset_ops : rowsets) { + for (int64_t op : rowset_ops) { + if (!found) { + first_op = op; + found = true; + } + last_op = op; + } + } + return found ? AggregateFunctionMinDelta::calculate_result(first_op, last_op) + : ResultType::SKIP; + }; + + // Scenario 1: key1 updated in rowset-1 and updated again in rowset-2 -> UPDATE_BEFORE_AFTER. 
+ EXPECT_EQ(ResultType::UPDATE_BEFORE_AFTER, + calc_from_rowsets({{ROW_BINLOG_UPDATE}, {ROW_BINLOG_UPDATE}})); + + // Scenario 2: key1 appended/updated in rowset-1, then updated in rowset-2 -> INSERT. + EXPECT_EQ(ResultType::INSERT, + calc_from_rowsets({{ROW_BINLOG_APPEND, ROW_BINLOG_UPDATE}, {ROW_BINLOG_UPDATE}})); + + // Scenario 3: key1 appended in one rowset and deleted in a later rowset -> SKIP. + EXPECT_EQ(ResultType::SKIP, calc_from_rowsets({{ROW_BINLOG_APPEND}, {ROW_BINLOG_DELETE}})); + + // Scenario 4: key1 deleted first, then appended in later rowset -> INSERT. + EXPECT_EQ(ResultType::INSERT, calc_from_rowsets({{ROW_BINLOG_DELETE}, {ROW_BINLOG_APPEND}})); + + // Scenario 5: empty rowsets around the same key should not affect folding. + EXPECT_EQ(ResultType::DELETE, + calc_from_rowsets({{}, {ROW_BINLOG_UPDATE}, {}, {ROW_BINLOG_DELETE}, {}})); +} + +TEST(AggregateFunctionMinDeltaTest, RowBinlogOperationCodeLayoutGuard) { + // The implementation uses op codes as 2D lookup indices, so guard the op layout to prevent implicit OOB. + EXPECT_EQ(0, ROW_BINLOG_APPEND); + EXPECT_EQ(1, ROW_BINLOG_UPDATE); + EXPECT_EQ(2, ROW_BINLOG_DELETE); +} + +} // namespace doris diff --git a/be/test/storage/iterator/block_reader_utils_test.cpp b/be/test/storage/iterator/block_reader_utils_test.cpp new file mode 100644 index 00000000000000..2cc00b0daa766c --- /dev/null +++ b/be/test/storage/iterator/block_reader_utils_test.cpp @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "storage/iterator/block_reader_utils.h" + +#include + +#include "vec/columns/columns_number.h" +#include "vec/core/column_with_type_and_name.h" +#include "vec/data_types/data_type_number.h" + +namespace doris::vectorized::detail { + +TEST(BlockReaderUtilsTest, BuildBeforeColumnName) { + EXPECT_EQ(build_before_column_name("v1"), "__BEFORE__v1__"); +} + +TEST(BlockReaderUtilsTest, ResolveBeforeColumnIndex) { + auto int_type = std::make_shared(); + auto col_key = ColumnInt64::create(); + auto col_val = ColumnInt64::create(); + auto col_before_val = ColumnInt64::create(); + auto col_op = ColumnInt64::create(); + + Block block; + block.insert({std::move(col_key), int_type, "k1"}); + block.insert({std::move(col_val), int_type, "v1"}); + block.insert({std::move(col_before_val), int_type, "__BEFORE__v1__"}); + block.insert({std::move(col_op), int_type, "__DORIS_BINLOG_OP__"}); + + EXPECT_EQ(resolve_before_column_index(block, 1, 3), 2); + EXPECT_EQ(resolve_before_column_index(block, 3, 3), 3); +} + +TEST(BlockReaderUtilsTest, ResolveBeforeColumnIndexFallbackWhenMissing) { + auto int_type = std::make_shared(); + auto col_key = ColumnInt64::create(); + auto col_val = ColumnInt64::create(); + auto col_op = ColumnInt64::create(); + + Block block; + block.insert({std::move(col_key), int_type, "k1"}); + block.insert({std::move(col_val), int_type, "v1"}); + block.insert({std::move(col_op), int_type, "__DORIS_BINLOG_OP__"}); + + // If __BEFORE__v1__ is missing, fall back to the current column to avoid out-of-bounds. 
+ EXPECT_EQ(resolve_before_column_index(block, 1, 2), 1); + // Key column k1 has no __BEFORE__ counterpart either, so it resolves to itself. + EXPECT_EQ(resolve_before_column_index(block, 0, 2), 0); +} + +} // namespace doris::vectorized::detail diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java index bdd64284c9af27..69e4708eb79eb4 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java +++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java @@ -2513,6 +2513,10 @@ public class Config extends ConfigBase { "Whether to enable the binlog feature"}) public static boolean enable_feature_binlog = false; + @ConfField(mutable = true, masterOnly = false, varType = VariableAnnotation.EXPERIMENTAL, description = { + "Whether to split update_before and update_after of binlog into two records"}) + public static boolean enable_split_binlog_before = true; + @ConfField(mutable = false, description = {"Whether to enable the binlog feature for databases/tables by default"}) public static boolean force_enable_feature_binlog = false; @@ -3003,6 +3007,11 @@ public class Config extends ConfigBase { varType = VariableAnnotation.EXPERIMENTAL) public static boolean enable_table_stream = false; + @ConfField(mutable = true, masterOnly = true, description = { + "The interval at which FE cleans stale partition offset state from table streams, in seconds."}, + varType = VariableAnnotation.EXPERIMENTAL) + public static int table_stream_partition_offset_cleanup_interval_second = 3600; + //========================================================================== // begin of cloud config //========================================================================== @@ -3451,10 +3460,6 @@ public static int metaServiceRpcRetryTimes() { @ConfField(mutable = true, masterOnly = true) public static long mow_get_ms_lock_retry_backoff_interval = 80; - @ConfField(mutable = true, masterOnly = true, description = { - "Whether 
to enable TSO."}, varType = VariableAnnotation.EXPERIMENTAL) - public static boolean enable_tso_feature = false; - @ConfField(mutable = false, masterOnly = true, description = { "TSO service update interval in milliseconds. Default is 50, which means the TSO service " + "will perform timestamp update checks every 50 milliseconds."}) @@ -3485,16 +3490,6 @@ public static int metaServiceRpcRetryTimes() { + "timestamp offset is 0 milliseconds."}) public static int tso_time_offset_debug_mode = 0; - @ConfField(mutable = true, masterOnly = true, description = { - "Whether to enable persisting TSO window end into edit log. Enabling emits new op code, " - + "which may break rollback to older versions."}) - public static boolean enable_tso_persist_journal = false; - - @ConfField(mutable = true, masterOnly = true, description = { - "Whether to include TSO info as an image module in checkpoint. Older versions may need to ignore " - + "unknown modules when reading new images."}) - public static boolean enable_tso_checkpoint_module = false; - @ConfField(mutable = true, masterOnly = true, description = { "Whether to forward TSO 1ms when logical counter is nearly full. 
Default is true."}) public static boolean enable_tso_forward_when_counter_full = true; diff --git a/fe/fe-core/src/main/java/org/apache/doris/alter/SchemaChangeHandler.java b/fe/fe-core/src/main/java/org/apache/doris/alter/SchemaChangeHandler.java index 4f56b4aa94ad87..4d73864a28173e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/alter/SchemaChangeHandler.java +++ b/fe/fe-core/src/main/java/org/apache/doris/alter/SchemaChangeHandler.java @@ -2825,7 +2825,6 @@ public void updateTableProperties(Database db, String tableName, Map> configtoThreads = ImmutableMap - .of("dynamic_partition_check_interval_seconds", this::getDynamicPartitionScheduler); + .>builder() + .put("dynamic_partition_check_interval_seconds", this::getDynamicPartitionScheduler) + .put("table_stream_partition_offset_cleanup_interval_second", + this::getTableStreamManager) + .build(); private TSOService tsoService; @@ -2029,6 +2034,7 @@ protected void startMasterOnlyDaemonThreads() { cooldownConfHandler.start(); } streamLoadRecordMgr.start(); + tableStreamManager.start(); tabletLoadIndexRecorderMgr.start(); new InternalSchemaInitializer().start(); getRefreshManager().start(); @@ -2738,6 +2744,10 @@ public long saveTableStreamManager(CountingDataOutputStream out, long checksum) return checksum; } + public void replayPruneTableStreamPartitionOffsets(PruneTableStreamPartitionOffsetInfo info) { + tableStreamManager.replayPruneTableStreamPartitionOffsets(info); + } + // Only called by checkpoint thread // return the latest image file's absolute path public String saveImage() throws IOException { @@ -6341,7 +6351,6 @@ public void modifyTableProperties(Database db, OlapTable table, Map()); - } - tableProperty.modifyTableProperties(PropertyAnalyzer.PROPERTIES_ENABLE_TSO, - Boolean.valueOf(enableTso).toString()); - tableProperty.buildEnableTso(); - } - + /** + * Returns whether table-level TSO is enabled by row binlog format. 
+ */ public Boolean enableTso() { if (tableProperty != null) { - return tableProperty.enableTso(); + return getBinlogConfig().isRowFormat(); } return false; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTableWrapper.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTableWrapper.java index c8fa7e0b0a5484..a39a15468d67f4 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTableWrapper.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTableWrapper.java @@ -139,4 +139,19 @@ public List selectNonEmptyPartitionIds(Collection partitionIds) { public Set getDistributionColumnNames() { return originTable.getDistributionColumnNames(); } + + @Override + public boolean needRowBinlog() { + return originTable.needRowBinlog(); + } + + @Override + public MaterializedIndexMeta getBaseIndexMeta() { + return originTable.getBaseIndexMeta(); + } + + @Override + public MaterializedIndex getBaseIndex() { + return originTable.getBaseIndex(); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/RowBinlogTableWrapper.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/RowBinlogTableWrapper.java index 39626231d97098..230b114822369c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/RowBinlogTableWrapper.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/RowBinlogTableWrapper.java @@ -17,6 +17,8 @@ package org.apache.doris.catalog; +import org.apache.doris.catalog.stream.OlapTableStreamWrapper; + import com.google.common.base.Preconditions; /** @@ -25,6 +27,7 @@ public class RowBinlogTableWrapper extends OlapTableWrapper { private final MaterializedIndexMeta rowBinlogMeta; + private OlapTableStreamWrapper parent; public RowBinlogTableWrapper(OlapTable originTable) { super(originTable, originTable.getName(), originTable.getRowBinlogMeta().getSchema(), KeysType.DUP_KEYS); @@ -33,6 +36,14 @@ public RowBinlogTableWrapper(OlapTable originTable) { 
this.setBaseIndexId(rowBinlogMeta.getIndexId()); } + public RowBinlogTableWrapper(OlapTable originTable, OlapTableStreamWrapper parent) { + super(originTable, originTable.getName(), originTable.getRowBinlogMeta().getSchema(), KeysType.DUP_KEYS); + this.rowBinlogMeta = originTable.getRowBinlogMeta(); + Preconditions.checkNotNull(rowBinlogMeta, "row binlog meta is null, table=%s", originTable.getName()); + this.setBaseIndexId(rowBinlogMeta.getIndexId()); + this.parent = parent; + } + @Override public long getBaseIndexId() { return rowBinlogMeta.getIndexId(); @@ -51,4 +62,8 @@ public MaterializedIndex getPartitionIndex(Partition partition, long indexId) { } return null; } + + public OlapTableStreamWrapper getParent() { + return parent; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/TableProperty.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/TableProperty.java index 394ecc7a1b87df..a07b9336f8f5b7 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/TableProperty.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/TableProperty.java @@ -98,8 +98,6 @@ public class TableProperty implements GsonPostProcessable { private boolean enableSingleReplicaCompaction = false; - private boolean enableTso = false; - private int verticalCompactionNumColumnsPerGroup = 5; private boolean storeRowColumn = false; @@ -140,6 +138,7 @@ public class TableProperty implements GsonPostProcessable { public TableProperty(Map properties) { this.properties = properties; + removeLegacyEnableTsoProperty(); } public static boolean isSamePrefixProperties(Map properties, String prefix) { @@ -171,7 +170,6 @@ public TableProperty buildProperty(short opCode) { buildTimeSeriesCompactionTimeThresholdSeconds(); buildSkipWriteIndexOnLoad(); buildEnableSingleReplicaCompaction(); - buildEnableTso(); buildVerticalCompactionNumColumnsPerGroup(); buildDisableAutoCompaction(); buildTimeSeriesCompactionEmptyRowsetsThreshold(); @@ -360,15 +358,6 @@ public boolean 
enableSingleReplicaCompaction() { return enableSingleReplicaCompaction; } - public TableProperty buildEnableTso() { - enableTso = Boolean.parseBoolean(properties.getOrDefault(PropertyAnalyzer.PROPERTIES_ENABLE_TSO, "false")); - return this; - } - - public boolean enableTso() { - return enableTso; - } - public TableProperty buildVerticalCompactionNumColumnsPerGroup() { verticalCompactionNumColumnsPerGroup = Integer.parseInt( properties.getOrDefault(PropertyAnalyzer.PROPERTIES_VERTICAL_COMPACTION_NUM_COLUMNS_PER_GROUP, "5")); @@ -664,6 +653,7 @@ public TableProperty buildInvertedIndexFileStorageFormat() { public void modifyTableProperties(Map modifyProperties) { properties.putAll(modifyProperties); + removeLegacyEnableTsoProperty(); removeDuplicateReplicaNumProperty(); } @@ -685,6 +675,7 @@ public ReplicaAllocation getReplicaAllocation() { public void modifyTableProperties(String key, String value) { properties.put(key, value); + removeLegacyEnableTsoProperty(); } public Map getProperties() { @@ -941,7 +932,6 @@ public void gsonPostProcess() throws IOException { buildTimeSeriesCompactionTimeThresholdSeconds(); buildDisableAutoCompaction(); buildEnableSingleReplicaCompaction(); - buildEnableTso(); buildVerticalCompactionNumColumnsPerGroup(); buildTimeSeriesCompactionEmptyRowsetsThreshold(); buildTimeSeriesCompactionLevelThreshold(); @@ -955,6 +945,13 @@ public void gsonPostProcess() throws IOException { buildColumnSeqMapping(); } + /** + * Drops the legacy table-level TSO property because binlog.format now owns that switch. + */ + private void removeLegacyEnableTsoProperty() { + properties.remove(PropertyAnalyzer.LEGACY_PROPERTIES_ENABLE_TSO); + } + // For some historical reason, // both "dynamic_partition.replication_num" and "dynamic_partition.replication_allocation" // may be exist in "properties". 
we need remove the "dynamic_partition.replication_num", or it will always replace diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/stream/BaseTableStream.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/stream/BaseTableStream.java index 099e0c3e0b8cec..6d6104bf7623ae 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/stream/BaseTableStream.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/stream/BaseTableStream.java @@ -25,6 +25,7 @@ import org.apache.doris.common.util.PropertyAnalyzer; import org.apache.doris.nereids.exceptions.AnalysisException; import org.apache.doris.persist.gson.GsonUtils; +import org.apache.doris.thrift.TBinlogScanType; import org.apache.doris.thrift.TRow; import com.google.common.collect.ImmutableList; @@ -33,15 +34,17 @@ import java.io.DataOutput; import java.io.IOException; import java.util.List; +import java.util.Locale; import java.util.Map; public abstract class BaseTableStream extends Table { - public enum StreamConsumeType { + public enum StreamScanType { DEFAULT, APPEND_ONLY, MIN_DELTA, + DETAIL, UNKNOWN; - public static StreamConsumeType getType(String typeName) { + public static StreamScanType getType(String typeName) { if (typeName == null) { return UNKNOWN; } @@ -57,12 +60,26 @@ public static StreamConsumeType getType(String typeName) { return UNKNOWN; } } + + public static TBinlogScanType toThrift(StreamScanType streamScanType) { + switch (streamScanType) { + case DEFAULT: + case MIN_DELTA: + return TBinlogScanType.MIN_DELTA; + case APPEND_ONLY: + return TBinlogScanType.APPEND_ONLY; + case DETAIL: + return TBinlogScanType.DETAIL; + default: + return TBinlogScanType.UNKNOWN; + } + } } private static ImmutableList supportedTableTypeList = ImmutableList.of(TableType.OLAP); @SerializedName("sct") - protected StreamConsumeType streamConsumeType = StreamConsumeType.DEFAULT; + protected StreamScanType streamScanType = StreamScanType.DEFAULT; @SerializedName("sir") protected boolean 
showInitialRows; @@ -109,15 +126,19 @@ public void setProperties(Map properties) throws org.apache.dori showInitialRows = PropertyAnalyzer.analyzeBooleanProp(properties, PropertyAnalyzer.PROPERTIES_STREAM_SHOW_INITIAL_ROWS, false); - streamConsumeType = PropertyAnalyzer.analyzeStreamType(properties); + streamScanType = PropertyAnalyzer.analyzeStreamType(properties); } public String getTableStreamType() { return "BASE_STREAM"; } - public String getConsumeType() { - return streamConsumeType.name(); + public String getConsumeTypeString() { + return streamScanType.name(); + } + + public StreamScanType getConsumeType() { + return streamScanType; } public boolean isDisabled() { @@ -150,7 +171,7 @@ public static boolean isTableTypeSupported(TableIf tableIf) { public void appendProperties(StringBuilder sb) { sb.append("\"").append(PropertyAnalyzer.PROPERTIES_STREAM_TYPE) - .append("\" = \"").append(streamConsumeType).append("\""); + .append("\" = \"").append(streamScanType).append("\""); sb.append(",\n\"").append(PropertyAnalyzer.PROPERTIES_STREAM_SHOW_INITIAL_ROWS) .append("\" = \"").append(showInitialRows).append("\"\n"); } @@ -187,4 +208,13 @@ public abstract void unprotectedCheckStreamUpdate(AbstractTableStreamUpdate upda throws UserException; public abstract void unprotectedUpdateStreamUpdate(AbstractTableStreamUpdate update, Long ts); + + /** + * Returns whether the column is a historical snapshot column in the form __BEFORE__xxx__. 
+ */ + public static boolean isBeforeImageColumn(String columnName) { + String upper = columnName.toUpperCase(Locale.ROOT); + return upper.startsWith(Column.BINLOG_BEFORE_PREFIX) && upper.endsWith("__"); + } + } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/stream/OlapTableStream.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/stream/OlapTableStream.java index 8633e5d893fef2..890525a6031e96 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/stream/OlapTableStream.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/stream/OlapTableStream.java @@ -18,6 +18,7 @@ package org.apache.doris.catalog.stream; import org.apache.doris.catalog.Column; +import org.apache.doris.catalog.KeysType; import org.apache.doris.catalog.OlapTable; import org.apache.doris.catalog.Partition; import org.apache.doris.catalog.TableIf; @@ -36,8 +37,10 @@ import java.io.DataOutput; import java.io.IOException; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.stream.Collectors; public class OlapTableStream extends BaseTableStream { @@ -48,9 +51,13 @@ public class OlapTableStream extends BaseTableStream { @SerializedName("pct") private Map partitionConsumptionTime; + // temporary var, would be removed after full implementation @SerializedName("hpo") private Map historicalPartitionOffset; + @SerializedName("hpt") + private Map historicalPartitionTSO; + // for persist public OlapTableStream() { super(); @@ -62,6 +69,7 @@ public OlapTableStream(long id, String streamName, List fullSchema, Tabl this.partitionOffset = new HashMap<>(); this.partitionConsumptionTime = new HashMap<>(); this.historicalPartitionOffset = new HashMap<>(); + this.historicalPartitionTSO = new HashMap<>(); this.baseTable = baseTable; } @@ -91,13 +99,24 @@ public void setProperties(Map properties) throws AnalysisExcepti // set offset according to baseTable if (!showInitialRows) { // set partition 
offset - // todo(TsukiokaKogane): change offset from partition version to commit tso ((OlapTable) baseTable).getPartitions() - .forEach(p -> partitionOffset.put(p.getId(), p.getVisibleVersion())); + .forEach(p -> partitionOffset.put(p.getId(), p.getVisibleVersionTime())); } else { ((OlapTable) baseTable).getPartitions() - .forEach(p -> historicalPartitionOffset.put(p.getId(), p.getVisibleVersion())); + .stream() + .filter(p -> p.getVisibleVersion() > Partition.PARTITION_INIT_VERSION) + .forEach(p -> { + historicalPartitionOffset.put(p.getId(), p.getVisibleVersion()); + historicalPartitionTSO.put(p.getId(), p.getVisibleVersionTime()); + } + ); + } + // default scan type for dup key is append-only + if (((OlapTable) baseTable).getKeysType().equals(KeysType.DUP_KEYS) + && streamScanType.equals(StreamScanType.DEFAULT)) { + streamScanType = StreamScanType.APPEND_ONLY; } + } public static OlapTableStream read(DataInput in) throws IOException { @@ -140,7 +159,8 @@ void fillTableStreamConsumptionInfo(List dataBatch) { // LAG trow.addToColumnValue(new TCell() .setStringVal(String.valueOf( - entry.getValue().getVisibleVersion() - partitionOffset.get(entry.getKey())))); + entry.getValue().getVisibleVersionTime() + - partitionOffset.get(entry.getKey())))); // LAST_CONSUMPTION_TIME if (partitionConsumptionTime.containsKey(entry.getKey())) { trow.addToColumnValue(new TCell() @@ -152,8 +172,7 @@ void fillTableStreamConsumptionInfo(List dataBatch) { // CONSUMPTION_STATUS trow.addToColumnValue(new TCell().setStringVal("N/A")); // LAG - trow.addToColumnValue(new TCell().setStringVal((String.valueOf( - entry.getValue().getVisibleVersion())))); + trow.addToColumnValue(new TCell().setStringVal("N/A")); // LAST_CONSUMPTION_TIME trow.addToColumnValue(new TCell().setLongVal(-1)); } @@ -169,21 +188,16 @@ public boolean hasData(Partition partition) { // if all available visible data has been consumed, return false // todo(TsukiokaKogane): change offset from partition version to commit tso 
return (!partitionOffset.containsKey(partition.getId()) - || !partitionOffset.get(partition.getId()).equals(partition.getVisibleVersion())) + || !partitionOffset.get(partition.getId()).equals(partition.getVisibleVersionTime())) && partition.hasData(); } + public boolean hasHistoricalData(long partitionId) { + return historicalPartitionOffset.containsKey(partitionId); + } + public Pair getStreamUpdate(Long partitionId) { - Long next = null; - Long prev = null; - if (historicalPartitionOffset.containsKey(partitionId)) { - next = historicalPartitionOffset.get(partitionId); - } else { - // todo(TsukiokaKogane): update next version with stepping - next = ((OlapTable) baseTable).getPartition(partitionId).getVisibleVersion(); - } - prev = partitionOffset.get(partitionId); - return Pair.of(prev, next); + return Pair.of(partitionOffset.get(partitionId), historicalPartitionOffset.get(partitionId)); } @Override @@ -201,12 +215,57 @@ public void unprotectedUpdateStreamUpdate(AbstractTableStreamUpdate update, Long for (Map.Entry entry : next.entrySet()) { if (historicalPartitionOffset.containsKey(entry.getKey())) { historicalPartitionOffset.remove(entry.getKey()); - partitionOffset.put(entry.getKey(), entry.getValue()); + if (historicalPartitionTSO == null) { + partitionOffset.put(entry.getKey(), entry.getValue()); + } else { + partitionOffset.put(entry.getKey(), historicalPartitionTSO.get(entry.getKey())); + historicalPartitionTSO.remove(entry.getKey()); + } } else { - // todo(TsukiokaKogane): update partition offset with tso partitionOffset.put(entry.getKey(), entry.getValue()); } partitionConsumptionTime.put(entry.getKey(), ts); } } + + Set unprotectedCollectStalePartitionOffsetIds(Set validPartitionIds) { + Preconditions.checkState(isWriteLockHeldByCurrentThread()); + Set stalePartitionIds = new HashSet<>(); + for (Long partitionId : partitionOffset.keySet()) { + if (!validPartitionIds.contains(partitionId)) { + stalePartitionIds.add(partitionId); + } + } + for (Long 
partitionId : partitionConsumptionTime.keySet()) { + if (!validPartitionIds.contains(partitionId)) { + stalePartitionIds.add(partitionId); + } + } + for (Long partitionId : historicalPartitionOffset.keySet()) { + if (!validPartitionIds.contains(partitionId)) { + stalePartitionIds.add(partitionId); + } + } + if (historicalPartitionTSO != null) { + for (Long partitionId : historicalPartitionTSO.keySet()) { + if (!validPartitionIds.contains(partitionId)) { + stalePartitionIds.add(partitionId); + } + } + } + return stalePartitionIds; + } + + int unprotectedPrunePartitionOffsets(Set partitionIds) { + Preconditions.checkState(isWriteLockHeldByCurrentThread()); + for (Long partitionId : partitionIds) { + partitionOffset.remove(partitionId); + partitionConsumptionTime.remove(partitionId); + historicalPartitionOffset.remove(partitionId); + if (historicalPartitionTSO != null) { + historicalPartitionTSO.remove(partitionId); + } + } + return partitionIds.size(); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/stream/OlapTableStreamWrapper.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/stream/OlapTableStreamWrapper.java index ddd8d55ac20d15..d284e087d56390 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/stream/OlapTableStreamWrapper.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/stream/OlapTableStreamWrapper.java @@ -32,6 +32,7 @@ import java.util.Collection; import java.util.List; import java.util.Map; +import java.util.stream.Collectors; // runtime-only class for unified query/insert experience, created when bind relation with OlapTableStream public class OlapTableStreamWrapper extends OlapTable { @@ -42,6 +43,12 @@ public class OlapTableStreamWrapper extends OlapTable { public OlapTableStreamWrapper(OlapTableStream stream, OlapTable baseTable) { super(stream.getId(), stream.getName(), stream.getFullSchema(), baseTable.getKeysType(), baseTable.getPartitionInfo(), baseTable.getDefaultDistributionInfo()); + // Inherit 
base table's qualifiedDbName so that wrapper.getDatabase() can resolve the + // owning Database via Env.getCurrentInternalCatalog().getDbNullable(qualifiedDbName). + // Otherwise downstream consumers (e.g. QueryPartitionCollector, partition routing, + // MV partition compensation) treat the wrapper as having no database and silently + // fall back to empty results when scanning the stream. + setQualifiedDbName(baseTable.getQualifiedDbName()); this.stream = stream; this.baseTable = baseTable; this.getOrCreatTableProperty().setEnableUniqueKeyMergeOnWrite(baseTable.getEnableUniqueKeyMergeOnWrite()); @@ -163,4 +170,29 @@ public List selectNonEmptyPartitionIds(Collection partitionIds) { } return nonEmptyIds; } + + public List getRowBinlogSchema() { + return baseTable.getRowBinlogMeta().getSchema(); + } + + public List filterHistoryPartitionIds(List partitionIds) { + return partitionIds.stream() + .filter(partitionId -> stream.hasHistoricalData(partitionId)) + .collect(Collectors.toList()); + } + + public List filterIncrementalPartitionIds(List partitionIds) { + return partitionIds.stream() + .filter(partitionId -> !stream.hasHistoricalData(partitionId) + && stream.hasData(getPartition(partitionId))) + .collect(Collectors.toList()); + } + + public OlapTable getBaseTable() { + return baseTable; + } + + public BaseTableStream.StreamScanType getConsumeType() { + return stream.getConsumeType(); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/stream/TableStreamManager.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/stream/TableStreamManager.java index a9608261d72ea6..d22f701083ce46 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/stream/TableStreamManager.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/stream/TableStreamManager.java @@ -20,11 +20,16 @@ import org.apache.doris.catalog.Database; import org.apache.doris.catalog.DatabaseIf; import org.apache.doris.catalog.Env; +import org.apache.doris.catalog.OlapTable; 
import org.apache.doris.catalog.Table; import org.apache.doris.catalog.TableIf; +import org.apache.doris.common.Config; +import org.apache.doris.common.Pair; import org.apache.doris.common.io.Text; import org.apache.doris.common.io.Writable; import org.apache.doris.common.lock.MonitoredReentrantReadWriteLock; +import org.apache.doris.common.util.MasterDaemon; +import org.apache.doris.persist.PruneTableStreamPartitionOffsetInfo; import org.apache.doris.persist.gson.GsonPostProcessable; import org.apache.doris.persist.gson.GsonUtils; import org.apache.doris.thrift.TCell; @@ -38,24 +43,34 @@ import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; +import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Optional; import java.util.Set; +import java.util.concurrent.TimeUnit; -public class TableStreamManager implements Writable, GsonPostProcessable { +public class TableStreamManager extends MasterDaemon implements Writable, GsonPostProcessable { private static final Logger LOG = LogManager.getLogger(TableStreamManager.class); @SerializedName(value = "dbStreamMap") private Map> dbStreamMap; protected MonitoredReentrantReadWriteLock rwLock; public TableStreamManager() { + super("table-stream-cleanup", Config.table_stream_partition_offset_cleanup_interval_second * 1000L); this.rwLock = new MonitoredReentrantReadWriteLock(true); this.dbStreamMap = new HashMap<>(); } + @Override + protected void runAfterCatalogReady() { + if (Config.enable_table_stream) { + cleanupStalePartitionOffsets(); + } + } + public void addTableStream(BaseTableStream stream) { rwLock.writeLock().lock(); try { @@ -65,7 +80,7 @@ public void addTableStream(BaseTableStream stream) { } } - public void removeTableStream(BaseTableStream stream) { + public void removeStaleDbAndStream(BaseTableStream stream) { rwLock.writeLock().lock(); try { 
Optional.ofNullable(dbStreamMap.get(stream.getDatabase().getId())) @@ -75,6 +90,22 @@ public void removeTableStream(BaseTableStream stream) { } } + public void removeStaleDbAndStream(List staleDbIds, List> staleStreamIds) { + if (staleStreamIds.isEmpty() && staleDbIds.isEmpty()) { + return; + } + rwLock.writeLock().lock(); + try { + staleDbIds.forEach(dbId -> dbStreamMap.remove(dbId)); + staleStreamIds.forEach( + pair -> Optional.ofNullable(dbStreamMap.get(pair.first)) + .ifPresent(set -> set.remove(pair.second)) + ); + } finally { + rwLock.writeLock().unlock(); + } + } + @Override public void write(DataOutput out) throws IOException { String json = GsonUtils.GSON.toJson(this); @@ -97,7 +128,129 @@ public Set getTableStreamIds(DatabaseIf db) { return result; } - public void fillTableStreamValuesMetadataResult(List dataBatch) { + public void cleanupStalePartitionOffsets() { + List staleDbIds = new ArrayList<>(); + List> staleStreamIds = new ArrayList<>(); + List pruneEntries = new ArrayList<>(); + for (Map.Entry> entry : copyDbStreamMap().entrySet()) { + Optional db = Env.getCurrentInternalCatalog().getDb(entry.getKey()); + if (!db.isPresent()) { + staleDbIds.add(entry.getKey()); + continue; + } + for (Long tableId : entry.getValue()) { + Optional table = db.get().getTable(tableId); + if (!table.isPresent()) { + staleStreamIds.add(Pair.of(db.get().getId(), tableId)); + continue; + } + if (!(table.get() instanceof OlapTableStream)) { + staleStreamIds.add(Pair.of(db.get().getId(), tableId)); + continue; + } + cleanupStalePartitionOffsets((OlapTableStream) table.get()).ifPresent(pruneEntries::add); + } + } + removeStaleDbAndStream(staleDbIds, staleStreamIds); + if (!pruneEntries.isEmpty()) { + Env.getCurrentEnv().getEditLog().logPruneTableStreamPartitionOffsets( + new PruneTableStreamPartitionOffsetInfo(pruneEntries)); + } + } + + private Optional cleanupStalePartitionOffsets(OlapTableStream stream) { + if (stream.isDisabled() || stream.isStale()) { + return 
Optional.empty(); + } + OlapTable baseTable = stream.getBaseTableNullable(); + if (baseTable == null) { + return Optional.empty(); + } + Set validPartitionIds; + if (!baseTable.tryReadLock(Table.TRY_LOCK_TIMEOUT_MS, TimeUnit.MILLISECONDS)) { + if (LOG.isDebugEnabled()) { + LOG.debug("skip cleaning stream {} because base table {} read lock is busy", + stream.getName(), baseTable.getName()); + } + return Optional.empty(); + } + try { + if (baseTable.isDropped) { + return Optional.empty(); + } + validPartitionIds = new HashSet<>(baseTable.getPartitionIds()); + } finally { + baseTable.readUnlock(); + } + if (!stream.tryWriteLockIfExist(Table.TRY_LOCK_TIMEOUT_MS, TimeUnit.MILLISECONDS)) { + if (LOG.isDebugEnabled()) { + LOG.debug("skip cleaning stream {} because stream write lock is busy", stream.getName()); + } + return Optional.empty(); + } + try { + if (stream.isDisabled() || stream.isStale()) { + return Optional.empty(); + } + Set stalePartitionIds = stream.unprotectedCollectStalePartitionOffsetIds(validPartitionIds); + if (stalePartitionIds.isEmpty()) { + return Optional.empty(); + } + int removedPartitionCount = stream.unprotectedPrunePartitionOffsets(stalePartitionIds); + if (removedPartitionCount > 0) { + LOG.info("cleaned {} stale partition offset entries from stream {}.{} ({})", + removedPartitionCount, stream.getDatabase().getFullName(), stream.getName(), stream.getId()); + } + return Optional.of(new PruneTableStreamPartitionOffsetInfo.Entry( + stream.getDatabase().getId(), stream.getId(), stalePartitionIds)); + } finally { + stream.writeUnlock(); + } + } + + public void replayPruneTableStreamPartitionOffsets(PruneTableStreamPartitionOffsetInfo info) { + if (info == null || info.getEntries() == null || info.getEntries().isEmpty()) { + return; + } + for (PruneTableStreamPartitionOffsetInfo.Entry entry : info.getEntries()) { + replayPruneTableStreamPartitionOffsets(entry); + } + } + + private void 
replayPruneTableStreamPartitionOffsets(PruneTableStreamPartitionOffsetInfo.Entry entry) { + if (entry == null || entry.getPartitionIds() == null || entry.getPartitionIds().isEmpty()) { + return; + } + Optional db = Env.getCurrentInternalCatalog().getDb(entry.getDbId()); + if (!db.isPresent()) { + LOG.info("skip replay pruning partition offsets because db {} does not exist", entry.getDbId()); + return; + } + Optional
table = db.get().getTable(entry.getStreamId()); + if (!table.isPresent()) { + LOG.info("skip replay pruning partition offsets because stream {}.{} does not exist", + entry.getDbId(), entry.getStreamId()); + return; + } + if (!(table.get() instanceof OlapTableStream)) { + LOG.info("skip replay pruning partition offsets because table {}.{} is not an olap table stream", + entry.getDbId(), entry.getStreamId()); + return; + } + OlapTableStream stream = (OlapTableStream) table.get(); + if (!stream.tryWriteLockIfExist(Table.TRY_LOCK_TIMEOUT_MS, TimeUnit.MILLISECONDS)) { + LOG.warn("skip replay pruning partition offsets because stream {}.{} write lock is busy", + db.get().getFullName(), stream.getName()); + return; + } + try { + stream.unprotectedPrunePartitionOffsets(entry.getPartitionIds()); + } finally { + stream.writeUnlock(); + } + } + + private Map> copyDbStreamMap() { Map> copiedMap = new HashMap<>(); rwLock.readLock().lock(); try { @@ -107,7 +260,11 @@ public void fillTableStreamValuesMetadataResult(List dataBatch) { } finally { rwLock.readLock().unlock(); } - for (Map.Entry> entry : copiedMap.entrySet()) { + return copiedMap; + } + + public void fillTableStreamValuesMetadataResult(List dataBatch) { + for (Map.Entry> entry : copyDbStreamMap().entrySet()) { Optional db = Env.getCurrentInternalCatalog().getDb(entry.getKey()); if (db.isPresent()) { for (Long tableId : entry.getValue()) { @@ -135,7 +292,7 @@ public void fillTableStreamValuesMetadataResult(List dataBatch) { // STREAM_TYPE trow.addToColumnValue(new TCell().setStringVal(stream.getTableStreamType())); // CONSUME_TYPE - trow.addToColumnValue(new TCell().setStringVal(stream.getConsumeType())); + trow.addToColumnValue(new TCell().setStringVal(stream.getConsumeTypeString())); // STREAM_COMMENT trow.addToColumnValue(new TCell().setStringVal(stream.getComment())); TableIf baseTable = stream.getBaseTableNullable(); @@ -176,16 +333,7 @@ public void fillTableStreamValuesMetadataResult(List dataBatch) { } public 
void fillStreamConsumptionValuesMetadataResult(List dataBatch) { - Map> copiedMap = new HashMap<>(); - rwLock.readLock().lock(); - try { - for (Map.Entry> e : dbStreamMap.entrySet()) { - copiedMap.put(e.getKey(), new HashSet<>(e.getValue())); - } - } finally { - rwLock.readLock().unlock(); - } - for (Map.Entry> entry : copiedMap.entrySet()) { + for (Map.Entry> entry : copyDbStreamMap().entrySet()) { Optional db = Env.getCurrentInternalCatalog().getDb(entry.getKey()); if (db.isPresent()) { for (Long tableId : entry.getValue()) { @@ -213,5 +361,6 @@ public void fillStreamConsumptionValuesMetadataResult(List dataBatch) { @Override public void gsonPostProcess() throws IOException { this.rwLock = new MonitoredReentrantReadWriteLock(true); + this.intervalMs = Config.table_stream_partition_offset_cleanup_interval_second * 1000L; } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/cloud/alter/CloudSchemaChangeHandler.java b/fe/fe-core/src/main/java/org/apache/doris/cloud/alter/CloudSchemaChangeHandler.java index c4b4c45ab2b476..c2155963273e09 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/cloud/alter/CloudSchemaChangeHandler.java +++ b/fe/fe-core/src/main/java/org/apache/doris/cloud/alter/CloudSchemaChangeHandler.java @@ -112,7 +112,6 @@ public void updateTableProperties(Database db, String tableName, Map getReadyToPublishTransactions() { return new ArrayList(); } + @Override + public List getCommittedTransactions(long dbId) throws AnalysisException { + return new ArrayList(); + } + @Override public boolean existCommittedTxns(Long dbId, Long tableId, Long partitionId) { //do nothing for CloudGlobalTransactionMgr diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java b/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java index fc950373e88324..224f5e9bfe9572 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java +++ 
b/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java @@ -170,7 +170,9 @@ public class PropertyAnalyzer { public static final String PROPERTIES_ENABLE_SINGLE_REPLICA_COMPACTION = "enable_single_replica_compaction"; - public static final String PROPERTIES_ENABLE_TSO = "enable_tso"; + // Legacy persisted switch before table-level TSO was derived from binlog.format. + @Deprecated + public static final String LEGACY_PROPERTIES_ENABLE_TSO = "enable_tso"; public static final String PROPERTIES_VERTICAL_COMPACTION_NUM_COLUMNS_PER_GROUP = "vertical_compaction_num_columns_per_group"; @@ -873,27 +875,6 @@ public static Boolean analyzeEnableSingleReplicaCompaction(Map p + " must be `true` or `false`"); } - public static Boolean analyzeEnableTso(Map properties) throws AnalysisException { - if (properties == null || properties.isEmpty()) { - return false; - } - String value = properties.get(PROPERTIES_ENABLE_TSO); - if (null == value) { - return false; - } - properties.remove(PROPERTIES_ENABLE_TSO); - if (value.equalsIgnoreCase("true")) { - if (!Config.enable_tso_feature) { - throw new AnalysisException(PROPERTIES_ENABLE_TSO - + " can not be enabled when experimental_enable_tso_feature is disabled"); - } - return true; - } else if (value.equalsIgnoreCase("false")) { - return false; - } - throw new AnalysisException(PROPERTIES_ENABLE_TSO + " must be `true` or `false`"); - } - public static Boolean analyzeEnableDuplicateWithoutKeysByDefault(Map properties) throws AnalysisException { if (properties == null || properties.isEmpty()) { @@ -2339,18 +2320,18 @@ public static Integer analyzeVerticalCompactionNumColumnsPerGroup(Map properties) + public static BaseTableStream.StreamScanType analyzeStreamType(Map properties) throws AnalysisException { if (properties != null && properties.containsKey(PROPERTIES_STREAM_TYPE)) { String value = properties.get(PROPERTIES_STREAM_TYPE); - BaseTableStream.StreamConsumeType type = 
BaseTableStream.StreamConsumeType.getType(value); - if (type.equals(BaseTableStream.StreamConsumeType.UNKNOWN)) { + BaseTableStream.StreamScanType type = BaseTableStream.StreamScanType.getType(value); + if (type.equals(BaseTableStream.StreamScanType.UNKNOWN)) { throw new AnalysisException("not supported " + PropertyAnalyzer.PROPERTIES_STREAM_TYPE + ": " + value); } properties.remove(PROPERTIES_STREAM_TYPE); return type; } - return BaseTableStream.StreamConsumeType.DEFAULT; + return BaseTableStream.StreamScanType.DEFAULT; } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/InternalCatalog.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/InternalCatalog.java index 15f78c23ff4fef..e0362fba2a051d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/InternalCatalog.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/InternalCatalog.java @@ -1017,7 +1017,7 @@ private void dropTableInternal(Database db, Table table, boolean isView, boolean Env.getCurrentEnv().getMtmvService().dropTable(table); } if (table instanceof BaseTableStream) { - Env.getCurrentEnv().getTableStreamManager().removeTableStream((BaseTableStream) table); + Env.getCurrentEnv().getTableStreamManager().removeStaleDbAndStream((BaseTableStream) table); } if (Config.isCloudMode()) { ((CloudGlobalTransactionMgr) Env.getCurrentGlobalTransactionMgr()).afterDropTable(db.getId(), @@ -1048,7 +1048,7 @@ public boolean unprotectDropTable(Database db, Table table, boolean isForceDrop, Env.getCurrentEnv().getMtmvService().dropView(new BaseTableInfo(table)); } if (table instanceof BaseTableStream) { - Env.getCurrentEnv().getTableStreamManager().removeTableStream((BaseTableStream) table); + Env.getCurrentEnv().getTableStreamManager().removeStaleDbAndStream((BaseTableStream) table); } Env.getCurrentEnv().getAnalysisManager().removeTableStats(table.getId()); Env.getCurrentEnv().getDictionaryManager().dropTableDictionaries(db.getName(), table.getName()); @@ -1568,10 
+1568,6 @@ public void addPartition(Database db, String tableName, AddPartitionOp addPartit properties.put(PropertyAnalyzer.PROPERTIES_ENABLE_SINGLE_REPLICA_COMPACTION, olapTable.enableSingleReplicaCompaction().toString()); } - if (!properties.containsKey(PropertyAnalyzer.PROPERTIES_ENABLE_TSO)) { - properties.put(PropertyAnalyzer.PROPERTIES_ENABLE_TSO, - olapTable.enableTso().toString()); - } if (!properties.containsKey(PropertyAnalyzer.PROPERTIES_STORE_ROW_COLUMN)) { properties.put(PropertyAnalyzer.PROPERTIES_STORE_ROW_COLUMN, olapTable.storeRowColumn().toString()); @@ -2693,14 +2689,6 @@ private boolean createOlapTable(Database db, CreateTableInfo createTableInfo) th } olapTable.setEnableSingleReplicaCompaction(enableSingleReplicaCompaction); - boolean enableTso = false; - try { - enableTso = PropertyAnalyzer.analyzeEnableTso(properties); - } catch (AnalysisException e) { - throw new DdlException(e.getMessage()); - } - olapTable.setEnableTso(enableTso); - if (Config.isCloudMode() && ((CloudEnv) env).getEnableStorageVault()) { // Pair storageVaultInfoPair = PropertyAnalyzer.analyzeStorageVault(properties, db); @@ -4011,6 +3999,8 @@ public void createTableStream(CreateStreamCommand command) throws DdlException { BaseTableStream newStream; TableIf baseTable = baseCatalog.getDbOrDdlException(createStreamInfo.getBaseTableName().getDb()) .getTableOrDdlException(createStreamInfo.getBaseTableName().getTbl()); + // check base table type is supported for stream + checkBaseTableAvailable(baseTable); // lock base table for stream init baseTable.readLock(); try { @@ -4026,6 +4016,9 @@ public void createTableStream(CreateStreamCommand command) throws DdlException { } catch (AnalysisException e) { throw new DdlException(e.getMessage(), e); } + if (baseTable instanceof OlapTable) { + checkBaseTableAvailableForStreamType((OlapTable) baseTable, newStream.getConsumeType()); + } if (properties != null && !properties.isEmpty()) { // before here, all properties should be checked 
throw new DdlException("Unknown properties: " + properties); @@ -4041,4 +4034,36 @@ public void createTableStream(CreateStreamCommand command) throws DdlException { LOG.info("successfully create stream[{}]", streamName); } } + + void checkBaseTableAvailable(TableIf tableIf) throws DdlException { + if (!BaseTableStream.isTableTypeSupported(tableIf)) { + throw new DdlException("Base table type " + tableIf.getType() + + " is not supported for create table stream"); + } + if (tableIf instanceof OlapTable) { + OlapTable olapTable = (OlapTable) tableIf; + if (!olapTable.needRowBinlog()) { + throw new DdlException("Base Olap table " + olapTable.getQualifiedName() + + " need to enable row binlog for table stream"); + } + } + } + + void checkBaseTableAvailableForStreamType(OlapTable olapTable, BaseTableStream.StreamScanType streamScanType) + throws DdlException { + if (streamScanType != BaseTableStream.StreamScanType.DEFAULT + && streamScanType != BaseTableStream.StreamScanType.MIN_DELTA) { + return; + } + if (!olapTable.isUniqKeyMergeOnWrite()) { + throw new DdlException("MIN_DELTA table stream requires base table to be UNIQUE KEY with " + + "enable_unique_key_merge_on_write=true. Table " + olapTable.getQualifiedName() + + " is " + olapTable.getKeysType() + "."); + } + if (!olapTable.getBinlogConfig().getNeedHistoricalValue()) { + throw new DdlException("MIN_DELTA table stream requires base table to enable " + + "binlog.need_historical_value=true. 
Table " + olapTable.getQualifiedName() + + " doesn't enable historical value in row binlog."); + } + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/journal/JournalEntity.java b/fe/fe-core/src/main/java/org/apache/doris/journal/JournalEntity.java index 7b1945106fc1a1..0b1b67871783f2 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/journal/JournalEntity.java +++ b/fe/fe-core/src/main/java/org/apache/doris/journal/JournalEntity.java @@ -113,6 +113,7 @@ import org.apache.doris.persist.OperationType; import org.apache.doris.persist.PartitionPersistInfo; import org.apache.doris.persist.PrivInfo; +import org.apache.doris.persist.PruneTableStreamPartitionOffsetInfo; import org.apache.doris.persist.RecoverInfo; import org.apache.doris.persist.RefreshExternalTableInfo; import org.apache.doris.persist.RemoveAlterJobV2OperationLog; @@ -966,6 +967,11 @@ public void readFields(DataInput in) throws IOException { isRead = true; break; } + case OperationType.OP_PRUNE_TABLE_STREAM_PARTITION_OFFSETS: { + data = PruneTableStreamPartitionOffsetInfo.read(in); + isRead = true; + break; + } // FIXME: support cloud related operation types. 
case OperationType.OP_UPDATE_CLOUD_REPLICA: { data = UpdateCloudReplicaInfo.read(in); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/analyzer/UnboundRelation.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/analyzer/UnboundRelation.java index 093af6ca6b300e..1c33256791f86c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/analyzer/UnboundRelation.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/analyzer/UnboundRelation.java @@ -24,6 +24,7 @@ import org.apache.doris.nereids.memo.GroupExpression; import org.apache.doris.nereids.properties.LogicalProperties; import org.apache.doris.nereids.properties.UnboundLogicalProperties; +import org.apache.doris.nereids.trees.ChangeScanInfo; import org.apache.doris.nereids.trees.TableSample; import org.apache.doris.nereids.trees.expressions.Slot; import org.apache.doris.nereids.trees.plans.BlockFuncDepsPropagation; @@ -62,17 +63,21 @@ public class UnboundRelation extends LogicalRelation implements Unbound, BlockFu private final Optional tableSnapshot; + // Change scan metadata derived from table@incr(...), only used by Nereids path. 
+ private final Optional changeScanInfo; + public UnboundRelation(RelationId id, List nameParts) { this(id, nameParts, Optional.empty(), Optional.empty(), ImmutableList.of(), false, ImmutableList.of(), ImmutableList.of(), Optional.empty(), Optional.empty(), null, - Optional.empty(), Optional.empty()); + Optional.empty(), Optional.empty(), Optional.empty()); } public UnboundRelation(RelationId id, List nameParts, List partNames, boolean isTempPart) { this(id, nameParts, Optional.empty(), Optional.empty(), partNames, isTempPart, ImmutableList.of(), - ImmutableList.of(), Optional.empty(), Optional.empty(), null, Optional.empty(), Optional.empty()); + ImmutableList.of(), Optional.empty(), Optional.empty(), null, Optional.empty(), Optional.empty(), + Optional.empty()); } public UnboundRelation(RelationId id, List nameParts, List partNames, @@ -80,15 +85,23 @@ public UnboundRelation(RelationId id, List nameParts, List partN Optional indexName) { this(id, nameParts, Optional.empty(), Optional.empty(), partNames, isTempPart, tabletIds, hints, tableSample, indexName, null, Optional.empty(), - Optional.empty()); + Optional.empty(), Optional.empty()); } public UnboundRelation(RelationId id, List nameParts, List partNames, boolean isTempPart, List tabletIds, List hints, Optional tableSample, Optional indexName, TableScanParams scanParams, Optional tableSnapshot) { + this(id, nameParts, isTempPart, partNames, tabletIds, hints, tableSample, indexName, + scanParams, tableSnapshot, Optional.empty()); + } + + public UnboundRelation(RelationId id, List nameParts, boolean isTempPart, List partNames, + List tabletIds, List hints, Optional tableSample, Optional indexName, + TableScanParams scanParams, + Optional tableSnapshot, Optional changeScanInfo) { this(id, nameParts, Optional.empty(), Optional.empty(), - partNames, isTempPart, tabletIds, hints, tableSample, indexName, scanParams, Optional.empty(), - tableSnapshot); + partNames, isTempPart, tabletIds, hints, tableSample, 
indexName, scanParams, + Optional.empty(), tableSnapshot, changeScanInfo); } public UnboundRelation(RelationId id, List nameParts, @@ -96,7 +109,8 @@ public UnboundRelation(RelationId id, List nameParts, List partNames, boolean isTempPart, List tabletIds, List hints, Optional tableSample, Optional indexName) { this(id, nameParts, groupExpression, logicalProperties, partNames, - isTempPart, tabletIds, hints, tableSample, indexName, null, Optional.empty(), Optional.empty()); + isTempPart, tabletIds, hints, tableSample, indexName, null, Optional.empty(), Optional.empty(), + Optional.empty()); } public UnboundRelation(RelationId id, List nameParts, List partNames, @@ -105,7 +119,7 @@ public UnboundRelation(RelationId id, List nameParts, List partN Optional tableSnapshot) { this(id, nameParts, Optional.empty(), Optional.empty(), partNames, isTempPart, tabletIds, hints, tableSample, indexName, scanParams, indexInSqlString, - tableSnapshot); + tableSnapshot, Optional.empty()); } /** @@ -116,7 +130,8 @@ public UnboundRelation(RelationId id, List nameParts, List partNames, boolean isTempPart, List tabletIds, List hints, Optional tableSample, Optional indexName, TableScanParams scanParams, Optional> indexInSqlString, - Optional tableSnapshot) { + Optional tableSnapshot, + Optional changeScanInfo) { super(id, PlanType.LOGICAL_UNBOUND_RELATION, groupExpression, logicalProperties); this.nameParts = ImmutableList.copyOf(Objects.requireNonNull(nameParts, "nameParts should not null")); this.partNames = ImmutableList.copyOf(Objects.requireNonNull(partNames, "partNames should not null")); @@ -128,6 +143,7 @@ public UnboundRelation(RelationId id, List nameParts, this.scanParams = scanParams; this.indexInSqlString = indexInSqlString; this.tableSnapshot = tableSnapshot; + this.changeScanInfo = changeScanInfo; } public List getNameParts() { @@ -148,21 +164,28 @@ public LogicalProperties computeLogicalProperties() { public Plan withGroupExpression(Optional groupExpression) { return new 
UnboundRelation(relationId, nameParts, groupExpression, Optional.of(getLogicalProperties()), - partNames, isTempPart, tabletIds, hints, tableSample, indexName, null, indexInSqlString, tableSnapshot); + partNames, isTempPart, tabletIds, hints, tableSample, indexName, null, indexInSqlString, + tableSnapshot, changeScanInfo); } @Override public Plan withGroupExprLogicalPropChildren(Optional groupExpression, Optional logicalProperties, List children) { - return new UnboundRelation(relationId, nameParts, groupExpression, - logicalProperties, partNames, isTempPart, tabletIds, hints, tableSample, indexName, null, - indexInSqlString, tableSnapshot); + return new UnboundRelation(relationId, nameParts, groupExpression, logicalProperties, partNames, + isTempPart, tabletIds, hints, tableSample, indexName, null, indexInSqlString, tableSnapshot, + changeScanInfo); } public UnboundRelation withIndexInSql(Pair index) { - return new UnboundRelation(relationId, nameParts, groupExpression, - Optional.of(getLogicalProperties()), partNames, isTempPart, tabletIds, hints, tableSample, indexName, - null, Optional.of(index), tableSnapshot); + return new UnboundRelation(relationId, nameParts, groupExpression, Optional.of(getLogicalProperties()), + partNames, isTempPart, tabletIds, hints, tableSample, indexName, null, + Optional.of(index), tableSnapshot, changeScanInfo); + } + + public UnboundRelation withChangeScanInfo(ChangeScanInfo changeInfo) { + return new UnboundRelation(relationId, nameParts, groupExpression, Optional.of(getLogicalProperties()), + partNames, isTempPart, tabletIds, hints, tableSample, indexName, null, + indexInSqlString, tableSnapshot, Optional.of(changeInfo)); } @Override @@ -207,6 +230,9 @@ public String toDigest() { if (tableSnapshot.isPresent()) { sb.append(tableSnapshot.get().toDigest()).append(" "); } + if (changeScanInfo.isPresent()) { + sb.append(changeScanInfo.get().toDigest()).append(" "); + } return sb.substring(0, sb.length() - 1); } @@ -224,7 +250,8 @@ 
public boolean equals(Object o) { that.tabletIds) && Objects.equals(hints, that.hints) && Objects.equals(tableSample, that.tableSample) && Objects.equals(indexName, that.indexName) && Objects.equals( scanParams, that.scanParams) && Objects.equals(indexInSqlString, that.indexInSqlString) - && Objects.equals(tableSnapshot, that.tableSnapshot); + && Objects.equals(tableSnapshot, that.tableSnapshot) + && Objects.equals(changeScanInfo, that.changeScanInfo); } @Override @@ -272,4 +299,8 @@ public Optional> getIndexInSqlString() { public Optional getTableSnapshot() { return tableSnapshot; } + + public Optional getChangeScanInfo() { + return changeScanInfo; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java index 7b01ce6c6b143c..6bc307cac5cea6 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java @@ -38,7 +38,9 @@ import org.apache.doris.catalog.Column; import org.apache.doris.catalog.Env; import org.apache.doris.catalog.OlapTable; +import org.apache.doris.catalog.RowBinlogTableWrapper; import org.apache.doris.catalog.TableIf; +import org.apache.doris.catalog.stream.OlapTableStreamWrapper; import org.apache.doris.common.Config; import org.apache.doris.common.Pair; import org.apache.doris.common.util.Util; @@ -93,6 +95,7 @@ import org.apache.doris.nereids.rules.implementation.LogicalWindowToPhysicalWindow.WindowFrameGroup; import org.apache.doris.nereids.rules.rewrite.MergeLimits; import org.apache.doris.nereids.stats.StatsErrorEstimator; +import org.apache.doris.nereids.trees.ChangeScanInfo; import org.apache.doris.nereids.trees.expressions.AggregateExpression; import org.apache.doris.nereids.trees.expressions.CTEId; import 
org.apache.doris.nereids.trees.expressions.EqualPredicate; @@ -238,6 +241,7 @@ import org.apache.doris.thrift.TPushAggOp; import org.apache.doris.thrift.TResultSinkType; import org.apache.doris.thrift.TRuntimeFilterType; +import org.apache.doris.tso.TSOTimestamp; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; @@ -872,6 +876,10 @@ public PlanFragment visitPhysicalOlapScan(PhysicalOlapScan olapScan, PlanTransla private PlanFragment computePhysicalOlapScan(PhysicalOlapScan olapScan, PlanTranslatorContext context) { List slots = olapScan.getOutput(); OlapTable olapTable = olapScan.getTable(); + if (olapScan.isIncrementalScan() && !olapScan.getChangeScanInfo().isPresent()) { + olapTable = new RowBinlogTableWrapper(((OlapTableStreamWrapper) olapTable).getBaseTable(), + (OlapTableStreamWrapper) olapTable); + } // generate real output tuple TupleDescriptor tupleDescriptor = generateTupleDesc(slots, olapTable, context); List slotDescriptors = tupleDescriptor.getSlots(); @@ -888,12 +896,12 @@ private PlanFragment computePhysicalOlapScan(PhysicalOlapScan olapScan, PlanTran } // generate base index tuple because this fragment partitioned expr relay on slots of based index - if (olapScan.getSelectedIndexId() != olapScan.getTable().getBaseIndexId()) { + if (!olapScan.isIncrementalScan() && olapScan.getSelectedIndexId() != olapScan.getTable().getBaseIndexId()) { generateTupleDesc(olapScan.getBaseOutputs(), olapTable, context); } OlapScanNode olapScanNode = new OlapScanNode(context.nextPlanNodeId(), tupleDescriptor, "OlapScanNode", - context.getScanContext()); + context.getScanContext(), olapScan.isIncrementalScan()); olapScanNode.setNereidsId(olapScan.getId()); context.getNereidsIdToPlanNodeIdMap().put(olapScan.getId(), olapScanNode.getId()); @@ -950,15 +958,25 @@ private PlanFragment computePhysicalOlapScan(PhysicalOlapScan olapScan, PlanTran } // TODO: remove this switch? 
- switch (olapScan.getTable().getKeysType()) { - case AGG_KEYS: - case UNIQUE_KEYS: - case DUP_KEYS: - PreAggStatus preAgg = olapScan.getPreAggStatus(); - olapScanNode.setSelectedIndexInfo(olapScan.getSelectedIndexId(), preAgg.isOn(), preAgg.getOffReason()); - break; - default: - throw new RuntimeException("Not supported key type: " + olapScan.getTable().getKeysType()); + if (olapScan.isIncrementalScan()) { + olapScanNode.setSelectedIndexInfo(olapTable.getBaseIndexId(), false, "binlog read"); + } else { + switch (olapScan.getTable().getKeysType()) { + case AGG_KEYS: + case UNIQUE_KEYS: + case DUP_KEYS: + PreAggStatus preAgg = olapScan.getPreAggStatus(); + olapScanNode.setSelectedIndexInfo(olapScan.getSelectedIndexId(), preAgg.isOn(), + preAgg.getOffReason()); + break; + default: + throw new RuntimeException("Not supported key type: " + olapScan.getTable().getKeysType()); + } + } + + // apply change scan info if present + if (olapScan.getChangeScanInfo().isPresent()) { + applyChangeScanInfo(olapScanNode, olapScan.getChangeScanInfo().get()); } // create scan range @@ -987,6 +1005,26 @@ private PlanFragment computePhysicalOlapScan(PhysicalOlapScan olapScan, PlanTran return planFragment; } + private void applyChangeScanInfo(OlapScanNode olapScanNode, ChangeScanInfo changeScanInfo) { + ChangeScanInfo.Position at = changeScanInfo.getAt(); + if (at.kind != ChangeScanInfo.PositionKind.TIMESTAMP) { + throw new AnalysisException("only incr startTimestamp/endTimestamp is supported now"); + } + long startTimestamp = at.value; + + long endTimestamp = 0; + if (changeScanInfo.getEnd().isPresent()) { + ChangeScanInfo.Position end = changeScanInfo.getEnd().get(); + if (end.kind != ChangeScanInfo.PositionKind.TIMESTAMP) { + throw new AnalysisException("only incr startTimestamp/endTimestamp is supported now"); + } + endTimestamp = end.value; + } else { + endTimestamp = TSOTimestamp.extractPhysicalTime(Env.getCurrentEnv().getTSOService().getCurrentTSO()); + } + 
olapScanNode.enableTimestampChangeScan(startTimestamp, endTimestamp, changeScanInfo.getInformationKind()); + } + private void translateRuntimeFilter(PhysicalRelation physicalRelation, ScanNode scanNode, PlanTranslatorContext context) { if (context.getRuntimeTranslator().isPresent()) { @@ -2937,7 +2975,6 @@ private void updateScanSlotsMaterialization(ScanNode scanNode, .collect(Collectors.toSet()); requiredWithVirtualColumns.addAll(virtualColumnInputSlotIds); } - // Find the smallest column, for count(*) or other situation that slot is empty after prune SlotDescriptor smallest = getSmallestSlot(scanNode.getTupleDesc().getSlots()); scanNode.getTupleDesc().getSlots().removeIf(s -> !requiredWithVirtualColumns.contains(s.getId())); if (scanNode.getTupleDesc().getSlots().isEmpty()) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/jobs/executor/Rewriter.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/jobs/executor/Rewriter.java index e118b21b7e969a..5d7c045918cd52 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/jobs/executor/Rewriter.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/jobs/executor/Rewriter.java @@ -17,6 +17,7 @@ package org.apache.doris.nereids.jobs.executor; +import org.apache.doris.common.Config; import org.apache.doris.nereids.CascadesContext; import org.apache.doris.nereids.jobs.JobContext; import org.apache.doris.nereids.jobs.rewrite.CostBasedRewriteJob; @@ -112,6 +113,7 @@ import org.apache.doris.nereids.rules.rewrite.MergeSetOperationsExcept; import org.apache.doris.nereids.rules.rewrite.MergeTopNs; import org.apache.doris.nereids.rules.rewrite.NestedColumnPruning; +import org.apache.doris.nereids.rules.rewrite.NormalizeOlapTableBinlogScan; import org.apache.doris.nereids.rules.rewrite.NormalizeOlapTableStreamScan; import org.apache.doris.nereids.rules.rewrite.NormalizeSort; import org.apache.doris.nereids.rules.rewrite.OperativeColumnDerive; @@ -892,12 +894,21 @@ private static List 
getWholeTreeRewriteJobs( ImmutableSet.of(LogicalCTEAnchor.class), () -> { List rewriteJobs = Lists.newArrayListWithExpectedSize(300); - rewriteJobs.add( - topic("normalize olap table stream scan", - custom(RuleType.NORMALIZE_OlAP_TABLE_STREAM_SCAN, - NormalizeOlapTableStreamScan::new) - ) - ); + if (Config.enable_table_stream) { + // todo(TuskiokaKogane): add rule to split increment scan and base scan + // normalize olap table stream scan after partition prune + rewriteJobs.addAll(jobs( + topic("normalize olap table stream scan", + custom(RuleType.NORMALIZE_OlAP_TABLE_STREAM_SCAN, + NormalizeOlapTableStreamScan::new) + ), + topic("normalize olap table binlog scan", + custom(RuleType.NORMALIZE_OLAP_TABLE_BINLOG_SCAN, + NormalizeOlapTableBinlogScan::new) + ) + ) + ); + } rewriteJobs.addAll(jobs( topic("cte inline and pull up all cte anchor", custom(RuleType.PULL_UP_CTE_ANCHOR, PullUpCteAnchor::new), diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java index 0bb14eff373d39..48dcd928e5fe08 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java @@ -2775,10 +2775,10 @@ public LogicalPlan visitTableName(TableNameContext ctx) { } TableSample tableSample = ctx.sample() == null ? 
null : (TableSample) visit(ctx.sample()); - UnboundRelation relation = new UnboundRelation( - StatementScopeIdGenerator.newRelationId(), - nameParts, partitionNames, isTempPart, tabletIdLists, relationHints, - Optional.ofNullable(tableSample), indexName, scanParams, Optional.ofNullable(tableSnapshot)); + UnboundRelation relation = new UnboundRelation(StatementScopeIdGenerator.newRelationId(), + nameParts, isTempPart, partitionNames, tabletIdLists, relationHints, + Optional.ofNullable(tableSample), indexName, scanParams, Optional.ofNullable(tableSnapshot), + Optional.empty()); LogicalPlan checkedRelation = LogicalPlanBuilderAssistant.withCheckPolicy(relation); LogicalPlan plan = withTableAlias(checkedRelation, ctx.tableAlias()); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/RuleType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/RuleType.java index 71e7514025e7db..c9f8d9a1d3c262 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/RuleType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/RuleType.java @@ -426,6 +426,7 @@ public enum RuleType { PROCESS_SCALAR_AGG_MUST_USE_MULTI_DISTINCT(RuleTypeClass.REWRITE), // table stream scan rewrite NORMALIZE_OlAP_TABLE_STREAM_SCAN(RuleTypeClass.REWRITE), + NORMALIZE_OLAP_TABLE_BINLOG_SCAN(RuleTypeClass.REWRITE), // exploration rules REORDER_INTERSECT(RuleTypeClass.EXPLORATION), diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/BindRelation.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/BindRelation.java index e87a1e449a8b2d..5a55813973a7f0 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/BindRelation.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/BindRelation.java @@ -17,6 +17,7 @@ package org.apache.doris.nereids.rules.analysis; +import org.apache.doris.analysis.TableScanParams; import org.apache.doris.analysis.TableSnapshot; import 
org.apache.doris.catalog.AggStateType; import org.apache.doris.catalog.AggregateType; @@ -27,16 +28,19 @@ import org.apache.doris.catalog.KeysType; import org.apache.doris.catalog.OlapTable; import org.apache.doris.catalog.Partition; +import org.apache.doris.catalog.RowBinlogTableWrapper; import org.apache.doris.catalog.SchemaTable; import org.apache.doris.catalog.SchemaTable.SchemaColumn; import org.apache.doris.catalog.TableIf; import org.apache.doris.catalog.Type; import org.apache.doris.catalog.View; +import org.apache.doris.catalog.stream.BaseTableStream; import org.apache.doris.catalog.stream.OlapTableStream; import org.apache.doris.catalog.stream.OlapTableStreamWrapper; import org.apache.doris.common.Config; import org.apache.doris.common.IdGenerator; import org.apache.doris.common.Pair; +import org.apache.doris.common.util.TimeUtils; import org.apache.doris.common.util.Util; import org.apache.doris.datasource.ExternalTable; import org.apache.doris.datasource.ExternalView; @@ -54,6 +58,7 @@ import org.apache.doris.nereids.analyzer.UnboundRelation; import org.apache.doris.nereids.analyzer.UnboundResultSink; import org.apache.doris.nereids.exceptions.AnalysisException; +import org.apache.doris.nereids.exceptions.ParseException; import org.apache.doris.nereids.hint.LeadingHint; import org.apache.doris.nereids.parser.NereidsParser; import org.apache.doris.nereids.parser.SqlDialectHelper; @@ -62,10 +67,13 @@ import org.apache.doris.nereids.properties.PhysicalProperties; import org.apache.doris.nereids.rules.Rule; import org.apache.doris.nereids.rules.RuleType; +import org.apache.doris.nereids.trees.ChangeScanInfo; import org.apache.doris.nereids.trees.expressions.Alias; import org.apache.doris.nereids.trees.expressions.EqualTo; import org.apache.doris.nereids.trees.expressions.ExprId; import org.apache.doris.nereids.trees.expressions.Expression; +//import org.apache.doris.nereids.trees.expressions.GreaterThanEqual; +//import 
org.apache.doris.nereids.trees.expressions.LessThan; import org.apache.doris.nereids.trees.expressions.NamedExpression; import org.apache.doris.nereids.trees.expressions.Slot; import org.apache.doris.nereids.trees.expressions.SlotReference; @@ -79,7 +87,10 @@ import org.apache.doris.nereids.trees.expressions.functions.agg.QuantileUnion; import org.apache.doris.nereids.trees.expressions.functions.agg.Sum; import org.apache.doris.nereids.trees.expressions.functions.table.TableValuedFunction; +//import org.apache.doris.nereids.trees.expressions.literal.BigIntLiteral; +import org.apache.doris.nereids.trees.expressions.literal.BigIntLiteral; import org.apache.doris.nereids.trees.expressions.literal.TinyIntLiteral; +import org.apache.doris.nereids.trees.expressions.literal.VarcharLiteral; import org.apache.doris.nereids.trees.plans.Plan; import org.apache.doris.nereids.trees.plans.PreAggStatus; import org.apache.doris.nereids.trees.plans.algebra.Relation; @@ -100,6 +111,7 @@ import org.apache.doris.nereids.trees.plans.logical.LogicalWorkTableReference; import org.apache.doris.nereids.util.RelationUtil; import org.apache.doris.nereids.util.Utils; +import org.apache.doris.planner.ScanNode; import org.apache.doris.qe.AutoCloseSessionVariable; import org.apache.doris.qe.ConnectContext; @@ -107,6 +119,7 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Lists; +import com.google.common.collect.Sets; import org.apache.commons.collections4.CollectionUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -115,6 +128,7 @@ import java.util.List; import java.util.Map; import java.util.Optional; +import java.util.Set; /** * Rule to bind relations in query plan. @@ -243,6 +257,28 @@ private LogicalPlan makeOlapScan(TableIf table, UnboundRelation unboundRelation, CollectionUtils.isEmpty(partIds) ? 
((OlapTable) table).getPartitionIds() : partIds, indexId, preAggStatus, CollectionUtils.isEmpty(partIds) ? ImmutableList.of() : partIds, unboundRelation.getHints(), unboundRelation.getTableSample(), ImmutableList.of()); + } else if (isChangeRead(unboundRelation)) { + if (unboundRelation.getScanParams() != null && unboundRelation.getScanParams().incrementalRead()) { + OlapTable olapTable = (OlapTable) table; + ChangeScanInfo changeScanInfo = buildChangeScanInfo(unboundRelation.getScanParams()); + validateChangeReadRequirements(olapTable, changeScanInfo); + unboundRelation = unboundRelation.withChangeScanInfo(changeScanInfo); + } + OlapTable olapTable = (OlapTable) table; + RowBinlogTableWrapper wrapper = new RowBinlogTableWrapper(olapTable); + ChangeScanInfo changeScanInfo = unboundRelation.getChangeScanInfo().get(); + Preconditions.checkState(changeScanInfo != null); + if (changeScanInfo.getInformationKind() == ChangeScanInfo.InformationKind.DETAIL + && !Config.enable_split_binlog_before) { + scan = new LogicalOlapScan(unboundRelation.getRelationId(), + wrapper, qualifier, tabletIds, unboundRelation.getHints(), + unboundRelation.getTableSample(), ImmutableList.of()); + } else { + scan = makeOlapTableStreamScan(wrapper, unboundRelation, qualifier); + // Preconditions.checkState(scan instanceof LogicalOlapTableStreamScan); + // LogicalOlapTableStreamScan streamScan = (LogicalOlapTableStreamScan) scan; + //scan = streamScan.withIncrementalScan(true); + } } else { scan = new LogicalOlapScan(unboundRelation.getRelationId(), (OlapTable) table, qualifier, tabletIds, unboundRelation.getHints(), @@ -256,7 +292,13 @@ private LogicalPlan makeOlapScan(TableIf table, UnboundRelation unboundRelation, if (cascadesContext.getStatementContext().isHintForcePreAggOn()) { return scan.withPreAggStatus(PreAggStatus.on()); } - if (needGenerateLogicalAggForRandomDistAggTable(scan)) { + if (unboundRelation.getChangeScanInfo().isPresent()) { + OlapTable olapTable = (OlapTable) table; + 
validateChangeReadRequirements(olapTable, unboundRelation.getChangeScanInfo().get()); + scan.setChangeScanInfo(unboundRelation.getChangeScanInfo()); + return scan; + } + if (needGenerateLogicalAggForRandomDistAggTable(scan) && !unboundRelation.getChangeScanInfo().isPresent()) { // it's a random distribution agg table // add agg on olap scan return preAggForRandomDistribution(scan); @@ -267,8 +309,8 @@ private LogicalPlan makeOlapScan(TableIf table, UnboundRelation unboundRelation, } } - private LogicalPlan makeOlapTableStreamScan(TableIf table, UnboundRelation unboundRelation, - List qualifier, CascadesContext cascadesContext) { + private LogicalOlapScan makeOlapTableStreamScan(TableIf table, UnboundRelation unboundRelation, + List qualifier) { LogicalOlapTableStreamScan scan; List partIds = getPartitionIds(table, unboundRelation, qualifier); List tabletIds = unboundRelation.getTabletIds(); @@ -447,6 +489,145 @@ private Optional handleMetaTable(TableIf table, UnboundRelation unb return Optional.of(new LogicalTVFRelation(unboundRelation.getRelationId(), tvf, ImmutableList.of())); } + private ChangeScanInfo buildChangeScanInfo(TableScanParams scanParams) { + ChangeScanInfo.InformationKind informationKind + = ChangeScanInfo.InformationKind.DETAIL; + Map params = scanParams.getMapParams(); + String type = params.get(ScanNode.DORIS_INCREMENT_TYPE); + if (type != null) { + String info = type.toUpperCase(); + if ("APPEND_ONLY".equals(info)) { + informationKind = ChangeScanInfo.InformationKind.APPEND_ONLY; + } else if ("MIN_DELTA".equals(info)) { + informationKind = ChangeScanInfo.InformationKind.MIN_DELTA; + } else if ("DETAIL".equals(info)) { + informationKind = ChangeScanInfo.InformationKind.DETAIL; + } else if (!"DEFAULT".equals(info)) { + throw new ParseException("Unsupported increment type in incr query: " + info); + } + } + + String startTimestamp = params.getOrDefault(ScanNode.DORIS_START_TIMESTAMP, "0"); + ChangeScanInfo.Position at = 
buildChangePosition(startTimestamp); + Optional end = Optional.empty(); + if (params.containsKey(ScanNode.DORIS_END_TIMESTAMP)) { + end = Optional.of(buildChangePosition(params.get(ScanNode.DORIS_END_TIMESTAMP))); + } + return new ChangeScanInfo(informationKind, at, end); + } + + private ChangeScanInfo.Position buildChangePosition(String ts) { + if (ts != null) { + long changeTimestamp; + if (ts.equals("0")) { + changeTimestamp = 0; + } else { + changeTimestamp = TimeUtils.timeStringToLong(ts); + } + if (changeTimestamp < 0) { + throw new ParseException("Invalid TIMESTAMP format in incr clause: " + ts); + } + return ChangeScanInfo.Position.forTimestamp(changeTimestamp); + } + throw new ParseException("Invalid timestamp:" + ts); + } + + private void validateChangeReadRequirements(OlapTable olapTable, ChangeScanInfo changeScanInfo) { + if (!olapTable.needRowBinlog()) { + throw new AnalysisException("INCR query requires ROW binlog enabled on base table " + + "(PROPERTIES('binlog.enable'='true','binlog.format'='ROW')). " + + "Table " + olapTable.getQualifiedName() + " doesn't enable row binlog."); + } + if (changeScanInfo.getInformationKind() != ChangeScanInfo.InformationKind.MIN_DELTA) { + return; + } + if (!olapTable.isUniqKeyMergeOnWrite()) { + throw new AnalysisException("MIN_DELTA INCR query requires base table to be UNIQUE KEY with " + + "enable_unique_key_merge_on_write=true. Table " + olapTable.getQualifiedName() + + " is " + olapTable.getKeysType() + "."); + } + if (!olapTable.getBinlogConfig().getNeedHistoricalValue()) { + throw new AnalysisException("MIN_DELTA INCR query requires base table to enable " + + "binlog.need_historical_value=true. 
Table " + olapTable.getQualifiedName() + + " doesn't enable historical value in row binlog."); + } + } + + private boolean isChangeRead(UnboundRelation unboundRelation) { + return unboundRelation.getScanParams() != null && unboundRelation.getScanParams().incrementalRead(); + } + + /* + public static LogicalPlan addChangeScanTimestampFilter(LogicalOlapScan scan) { + Set conjuncts = Sets.newHashSet(); + // add timestamp filter + Slot tsSlot = null; + for (Slot slot : scan.getOutput()) { + if (slot.getName().equals(Column.BINLOG_TIMESTAMP_COL)) { + tsSlot = slot; + break; + } + } + Preconditions.checkArgument(tsSlot != null); + ChangeScanInfo changeScanInfo = scan.getChangeScanInfo().get(); + long startTimestamp = changeScanInfo.getAt().value; + Expression gePred = new GreaterThanEqual(tsSlot, new BigIntLiteral(startTimestamp)); + conjuncts.add(gePred); + if (changeScanInfo.getEnd().isPresent()) { + long endTimestamp = changeScanInfo.getEnd().get().value; + Expression ltPred = new LessThan(tsSlot, new BigIntLiteral(endTimestamp)); + conjuncts.add(ltPred); + } + return new LogicalFilter<>(conjuncts, scan); + } + */ + + /** + * Add op filter on append only binlog change scan if need. + */ + public static LogicalPlan addAppendOnlyFilterForBinlog(LogicalPlan scan) { + Set conjuncts = Sets.newHashSet(); + // add append only op filter + Slot opSlot = null; + for (Slot slot : scan.getOutput()) { + if (slot.getName().equals(Column.BINLOG_OPERATION_COL)) { + opSlot = slot; + break; + } + } + Preconditions.checkArgument(opSlot != null); + Expression conjunct = new EqualTo(new BigIntLiteral(0), opSlot); + conjuncts.add(conjunct); + return new LogicalFilter<>(conjuncts, scan); + } + + /** + * Add op filter on append only table stream olap scan if need. 
+ */ + public static LogicalPlan addAppendOnlyFilter(LogicalPlan scan) { + Set conjuncts = Sets.newHashSet(); + // add append only op filter + Slot opSlot = null; + for (Slot slot : scan.getOutput()) { + if (slot.getName().equals(Column.STREAM_CHANGE_TYPE_COL)) { + opSlot = slot; + break; + } + } + Preconditions.checkArgument(opSlot != null); + Expression conjunct = new EqualTo(new VarcharLiteral("APPEND"), opSlot); + conjuncts.add(conjunct); + return new LogicalFilter<>(conjuncts, scan); + } + + private boolean isScanAppendOnlyTableStream(OlapTableStreamWrapper stream) { + return stream.getConsumeType().equals(BaseTableStream.StreamScanType.APPEND_ONLY); + } + + private boolean isScanMinDeltaTableStream(OlapTableStreamWrapper stream) { + return stream.getConsumeType().equals(BaseTableStream.StreamScanType.MIN_DELTA); + } + private LogicalPlan getLogicalPlan(TableIf table, UnboundRelation unboundRelation, List qualifiedTableName, CascadesContext cascadesContext) { // for create view stmt replace tableName to ctl.db.tableName @@ -601,7 +782,7 @@ private LogicalPlan getLogicalPlan(TableIf table, UnboundRelation unboundRelatio case TEST_EXTERNAL_TABLE: return new LogicalTestScan(unboundRelation.getRelationId(), table, qualifierWithoutTableName); case STREAM: - return makeTableStreamScan(table, unboundRelation, qualifierWithoutTableName, cascadesContext); + return makeTableStreamScan(table, unboundRelation, qualifierWithoutTableName); default: throw new AnalysisException("Unsupported tableType " + table.getType()); } @@ -727,14 +908,19 @@ private List getPartitionIds(TableIf t, UnboundRelation unboundRelation, L }).collect(ImmutableList.toImmutableList()); } - private LogicalPlan makeTableStreamScan(TableIf table, UnboundRelation unboundRelation, List qualifier, - CascadesContext cascadesContext) throws AnalysisException { + private LogicalPlan makeTableStreamScan(TableIf table, UnboundRelation unboundRelation, List qualifier) + throws AnalysisException { if (table 
instanceof OlapTableStream) { // create OlapTableStreamWrapper OlapTableStream olapTableStream = (OlapTableStream) table; OlapTableStreamWrapper olapTableStreamWrapper = new OlapTableStreamWrapper(olapTableStream, (OlapTable) olapTableStream.getBaseTableOrNereidsAnalysisException()); - return makeOlapTableStreamScan(olapTableStreamWrapper, unboundRelation, qualifier, cascadesContext); + LogicalOlapScan scan = makeOlapTableStreamScan(olapTableStreamWrapper, unboundRelation, qualifier); + if (isScanAppendOnlyTableStream(olapTableStreamWrapper)) { + LOG.debug("Add append only filter on olap scan if need."); + return addAppendOnlyFilter(scan); + } + return scan; } throw new AnalysisException("Unsupported stream Type: " + table.getClass().getName()); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/implementation/LogicalOlapScanToPhysicalOlapScan.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/implementation/LogicalOlapScanToPhysicalOlapScan.java index 48ff1674709245..33cb6917cfeb77 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/implementation/LogicalOlapScanToPhysicalOlapScan.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/implementation/LogicalOlapScanToPhysicalOlapScan.java @@ -71,7 +71,9 @@ public Rule build() { olapScan.getAnnOrderKeys(), olapScan.getAnnLimit(), olapScan.getTableAlias(), - olapScan.getPartitionPrunablePredicates()) + olapScan.getPartitionPrunablePredicates(), + olapScan.isIncrementalScan(), + olapScan.getChangeScanInfo()) ).toRule(RuleType.LOGICAL_OLAP_SCAN_TO_PHYSICAL_OLAP_SCAN_RULE); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/NormalizeOlapTableBinlogScan.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/NormalizeOlapTableBinlogScan.java new file mode 100644 index 00000000000000..6be7c7edfa395f --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/NormalizeOlapTableBinlogScan.java @@ 
-0,0 +1,129 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.rules.rewrite; + +import org.apache.doris.catalog.Column; +import org.apache.doris.nereids.jobs.JobContext; +import org.apache.doris.nereids.trees.ChangeScanInfo; +import org.apache.doris.nereids.trees.expressions.EqualTo; +import org.apache.doris.nereids.trees.expressions.NamedExpression; +import org.apache.doris.nereids.trees.expressions.Slot; +import org.apache.doris.nereids.trees.expressions.SlotReference; +import org.apache.doris.nereids.trees.expressions.StatementScopeIdGenerator; +import org.apache.doris.nereids.trees.expressions.literal.BigIntLiteral; +import org.apache.doris.nereids.trees.plans.Plan; +import org.apache.doris.nereids.trees.plans.logical.LogicalFilter; +import org.apache.doris.nereids.trees.plans.logical.LogicalOlapTableStreamScan; +import org.apache.doris.nereids.trees.plans.logical.LogicalProject; +import org.apache.doris.nereids.trees.plans.visitor.CustomRewriter; +import org.apache.doris.nereids.trees.plans.visitor.DefaultPlanRewriter; + +import com.google.common.collect.ImmutableSet; + +import java.util.ArrayList; +import java.util.LinkedHashSet; +import java.util.List; +import 
java.util.Set; +import java.util.stream.Collectors; + +/** + * Normalize CHANGES semantic binlog scans without touching real stream scan behavior. + */ +public class NormalizeOlapTableBinlogScan implements CustomRewriter { + private static final long ROW_BINLOG_APPEND = 0L; + + @Override + public Plan rewriteRoot(Plan plan, JobContext jobContext) { + return plan.accept(BinlogScanReplacer.INSTANCE, null); + } + + private static class BinlogScanReplacer extends DefaultPlanRewriter { + private static final BinlogScanReplacer INSTANCE = new BinlogScanReplacer(); + + @Override + public Plan visitLogicalOlapTableStreamScan(LogicalOlapTableStreamScan scan, Void context) { + if (scan.isNormalized() || !scan.getChangeScanInfo().isPresent()) { + return scan; + } + + ChangeScanInfo.InformationKind informationKind = scan.getChangeScanInfo().get().getInformationKind(); + List originSlots = scan.getOutput(); + List visibleSlots = originSlots; + List scanSlots = new ArrayList<>(visibleSlots); + Set visibleNames = visibleSlots.stream().map(Slot::getName).collect(Collectors.toCollection( + LinkedHashSet::new)); + boolean needBeforeColumns = informationKind == ChangeScanInfo.InformationKind.MIN_DELTA + || informationKind == ChangeScanInfo.InformationKind.DETAIL; + boolean addedHiddenColumn = false; + + for (Column column : scan.getTable().getBaseSchema(true)) { + String columnName = column.getName(); + if (informationKind == ChangeScanInfo.InformationKind.APPEND_ONLY + && columnName.equals(Column.BINLOG_OPERATION_COL) + && !visibleNames.contains(columnName)) { + scanSlots.add(SlotReference.fromColumn(StatementScopeIdGenerator.newExprId(), scan.getTable(), + column, scan.qualified())); + addedHiddenColumn = true; + continue; + } + if (!needBeforeColumns || !Column.BINLOG_OPERATION_COL.equals(columnName) + && !Column.BINLOG_LSN_COL.equals(columnName) + && !Column.BINLOG_TIMESTAMP_COL.equals(columnName) + && !isRequiredBeforeImageColumn(columnName, visibleNames)) { + continue; + } + if 
(!visibleNames.contains(columnName)) { + scanSlots.add(SlotReference.fromColumn(StatementScopeIdGenerator.newExprId(), scan.getTable(), + column, scan.qualified())); + addedHiddenColumn = true; + } + } + + Plan plan = scan.withCachedOutput(scanSlots) + .withIncrementalScan(true) + .withNormalized(true); + if (informationKind == ChangeScanInfo.InformationKind.APPEND_ONLY) { + Slot opSlot = findSlotByName(plan.getOutput(), Column.BINLOG_OPERATION_COL); + plan = new LogicalFilter<>(ImmutableSet.of(new EqualTo(opSlot, new BigIntLiteral(ROW_BINLOG_APPEND))), + plan); + } + if (addedHiddenColumn || visibleSlots.size() != originSlots.size()) { + plan = new LogicalProject<>(visibleSlots.stream().map(NamedExpression.class::cast) + .collect(Collectors.toList()), plan); + } + return plan; + } + + private boolean isRequiredBeforeImageColumn(String columnName, Set visibleNames) { + if (!columnName.startsWith(Column.BINLOG_BEFORE_PREFIX) || !columnName.endsWith("__")) { + return false; + } + String baseColumnName = columnName.substring(Column.BINLOG_BEFORE_PREFIX.length(), columnName.length() - 2); + return visibleNames.contains(baseColumnName); + } + + private Slot findSlotByName(List slots, String slotName) { + for (Slot slot : slots) { + if (slot.getName().equals(slotName)) { + return slot; + } + } + throw new IllegalStateException("Missing binlog slot " + slotName); + } + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/NormalizeOlapTableStreamScan.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/NormalizeOlapTableStreamScan.java index 6cf7a8caa225ba..6bf91e23b16395 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/NormalizeOlapTableStreamScan.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/NormalizeOlapTableStreamScan.java @@ -18,27 +18,38 @@ package org.apache.doris.nereids.rules.rewrite; import org.apache.doris.catalog.Column; +import 
org.apache.doris.catalog.stream.OlapTableStreamWrapper; import org.apache.doris.nereids.jobs.JobContext; import org.apache.doris.nereids.trees.expressions.Alias; +import org.apache.doris.nereids.trees.expressions.CaseWhen; import org.apache.doris.nereids.trees.expressions.EqualTo; import org.apache.doris.nereids.trees.expressions.Expression; import org.apache.doris.nereids.trees.expressions.NamedExpression; import org.apache.doris.nereids.trees.expressions.Slot; import org.apache.doris.nereids.trees.expressions.SlotReference; import org.apache.doris.nereids.trees.expressions.StatementScopeIdGenerator; +import org.apache.doris.nereids.trees.expressions.WhenClause; import org.apache.doris.nereids.trees.expressions.literal.BigIntLiteral; import org.apache.doris.nereids.trees.expressions.literal.TinyIntLiteral; import org.apache.doris.nereids.trees.expressions.literal.VarcharLiteral; import org.apache.doris.nereids.trees.plans.Plan; import org.apache.doris.nereids.trees.plans.PreAggStatus; +import org.apache.doris.nereids.trees.plans.algebra.SetOperation.Qualifier; +import org.apache.doris.nereids.trees.plans.logical.LogicalEmptyRelation; import org.apache.doris.nereids.trees.plans.logical.LogicalFilter; import org.apache.doris.nereids.trees.plans.logical.LogicalOlapTableStreamScan; import org.apache.doris.nereids.trees.plans.logical.LogicalProject; +import org.apache.doris.nereids.trees.plans.logical.LogicalUnion; import org.apache.doris.nereids.trees.plans.visitor.CustomRewriter; import org.apache.doris.nereids.trees.plans.visitor.DefaultPlanRewriter; +import org.apache.doris.qe.ConnectContext; +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Lists; +import java.util.ArrayList; import java.util.List; import java.util.stream.Collectors; @@ -48,22 +59,52 @@ * 2. 
add delete sign column if unique base table */ public class NormalizeOlapTableStreamScan implements CustomRewriter { + private static final long ROW_BINLOG_APPEND = 0; + private static final long ROW_BINLOG_DELETE = 1; + private static final long ROW_BINLOG_UPDATE_BEFORE = 2; + private static final long ROW_BINLOG_UPDATE_AFTER = 3; @Override public Plan rewriteRoot(Plan plan, JobContext jobContext) { return plan.accept(OlapTableStreamScanReplacer.INSTANCE, null); } + private static Expression buildChangeTypeExpr(Slot opSlot) { + return new CaseWhen(ImmutableList.of( + new WhenClause(new EqualTo(opSlot, new BigIntLiteral(ROW_BINLOG_APPEND)), + new VarcharLiteral("APPEND")), + new WhenClause(new EqualTo(opSlot, new BigIntLiteral(ROW_BINLOG_DELETE)), + new VarcharLiteral("DELETE")), + new WhenClause(new EqualTo(opSlot, new BigIntLiteral(ROW_BINLOG_UPDATE_BEFORE)), + new VarcharLiteral("UPDATE_BEFORE")), + new WhenClause(new EqualTo(opSlot, new BigIntLiteral(ROW_BINLOG_UPDATE_AFTER)), + new VarcharLiteral("UPDATE_AFTER"))), new VarcharLiteral("UNKNOWN")); + } + private static class OlapTableStreamScanReplacer extends DefaultPlanRewriter { protected static final OlapTableStreamScanReplacer INSTANCE = new OlapTableStreamScanReplacer(); @Override public Plan visitLogicalOlapTableStreamScan(LogicalOlapTableStreamScan scan, Void context) { - if (scan.isIncrementalScan()) { + if (scan.isNormalized()) { + return scan; + } + if (scan.getChangeScanInfo().isPresent()) { + // change scan, do not use stream scan rules to rewrite. 
+ return scan; + } + List selectedPartitionIds = scan.getSelectedPartitionIds(); + if (selectedPartitionIds.isEmpty()) { return scan; } + List historicalPartitionIds = ImmutableList.copyOf(((OlapTableStreamWrapper) scan.getTable()) + .filterHistoryPartitionIds(selectedPartitionIds)); + List incrementalPartitionIds = ImmutableList.copyOf(((OlapTableStreamWrapper) scan.getTable()) + .filterIncrementalPartitionIds(selectedPartitionIds)); + Plan historyPlan = null; + Plan incrementalPlan = null; List originSlots = scan.getLogicalProperties().getOutput(); - List newSlots = originSlots.stream() + List newSlots = ImmutableList.copyOf(originSlots.stream() .filter(slot -> !(slot instanceof SlotReference && ((SlotReference) slot).getOriginalColumn().isPresent() && ((SlotReference) slot).getOriginalColumn().get() @@ -72,49 +113,125 @@ public Plan visitLogicalOlapTableStreamScan(LogicalOlapTableStreamScan scan, Voi && ((SlotReference) slot).getOriginalColumn().isPresent() && ((SlotReference) slot).getOriginalColumn().get() .equals(Column.STREAM_SEQ_VIRTUAL_COLUMN))) - .collect(Collectors.toList()); + .collect(Collectors.toList())); - if (originSlots.equals(newSlots)) { - return scan; - } - - // add delete sign column if unique base table - Slot deleteSlot = null; - for (Column column : scan.getTable().getBaseSchema(true)) { - if (column.getName().equals(Column.DELETE_SIGN)) { - deleteSlot = SlotReference.fromColumn(StatementScopeIdGenerator.newExprId(), scan.getTable(), - column, scan.qualified()); - newSlots.add(deleteSlot); - break; + // history plan + if (!historicalPartitionIds.isEmpty()) { + List scanSlots = new ArrayList<>(newSlots); + // add delete sign column if unique base table + Slot deleteSlot = null; + for (Column column : scan.getTable().getBaseSchema(true)) { + if (column.getName().equals(Column.DELETE_SIGN)) { + deleteSlot = SlotReference.fromColumn(StatementScopeIdGenerator.newExprId(), scan.getTable(), + column, scan.qualified()); + scanSlots.add(deleteSlot); 
+ break; + } } - } - Plan plan = scan.withCachedOutput(newSlots); - if (deleteSlot != null) { - Expression conjunct = new EqualTo(deleteSlot, new TinyIntLiteral((byte) 0)); - if (!scan.getTable().getEnableUniqueKeyMergeOnWrite()) { - plan = scan.withPreAggStatus(PreAggStatus.off( - Column.DELETE_SIGN + " is used as conjuncts.")); + Plan plan = scan.withSelectedPartitionIds(historicalPartitionIds, true) + .withCachedOutput(new ArrayList<>(scanSlots)) + .withNormalized(true); + if (deleteSlot != null) { + Expression conjunct = new EqualTo(deleteSlot, new TinyIntLiteral((byte) 0)); + if (!scan.getTable().getEnableUniqueKeyMergeOnWrite()) { + plan = ((LogicalOlapTableStreamScan) plan).withPreAggStatus(PreAggStatus.off( + Column.DELETE_SIGN + " is used as conjuncts.")); + } + plan = new LogicalFilter<>(ImmutableSet.of(conjunct), plan); + } + // replace virtual column with constant projection + List project = newSlots.stream() + .map(NamedExpression.class::cast).collect(Collectors.toList()); + for (Slot slot : originSlots) { + if (slot instanceof SlotReference + && ((SlotReference) slot).getOriginalColumn().isPresent() + && ((SlotReference) slot).getOriginalColumn().get() + .equals(Column.STREAM_CHANGE_TYPE_VIRTUAL_COLUMN)) { + project.add(new Alias(slot.getExprId(), new VarcharLiteral("APPEND"), + Column.STREAM_CHANGE_TYPE_COL)); + } + if (slot instanceof SlotReference + && ((SlotReference) slot).getOriginalColumn().isPresent() + && ((SlotReference) slot).getOriginalColumn().get() + .equals(Column.STREAM_SEQ_VIRTUAL_COLUMN)) { + project.add(new Alias(slot.getExprId(), new BigIntLiteral(-1), Column.STREAM_SEQ_COL)); + } } - plan = new LogicalFilter<>(ImmutableSet.of(conjunct), plan); + historyPlan = new LogicalProject<>(project, plan); } - // replace virtual column with constant projection - List project = newSlots.stream() - .map(NamedExpression.class::cast).collect(Collectors.toList()); - for (Slot slot : originSlots) { - if (slot instanceof SlotReference - && ((SlotReference)
slot).getOriginalColumn().isPresent() - && ((SlotReference) slot).getOriginalColumn().get() - .equals(Column.STREAM_CHANGE_TYPE_VIRTUAL_COLUMN)) { - project.add(new Alias(slot.getExprId(), new VarcharLiteral("APPEND"), - Column.STREAM_CHANGE_TYPE_COL)); + // incremental plan + if (!incrementalPartitionIds.isEmpty()) { + List scanSlots = new ArrayList<>(newSlots); + // add slot from binlog + Slot opSlot = null; + Slot seqSlot = null; + for (Column column : ((OlapTableStreamWrapper) scan.getTable()).getRowBinlogSchema()) { + if (column.getName().equals(Column.BINLOG_TIMESTAMP_COL)) { + seqSlot = SlotReference.fromColumn(StatementScopeIdGenerator.newExprId(), scan.getTable(), + column, scan.qualified()); + scanSlots.add(seqSlot); + } else if (column.getName().equals(Column.BINLOG_OPERATION_COL)) { + opSlot = SlotReference.fromColumn(StatementScopeIdGenerator.newExprId(), scan.getTable(), + column, scan.qualified()); + scanSlots.add(opSlot); + } } - if (slot instanceof SlotReference - && ((SlotReference) slot).getOriginalColumn().isPresent() - && ((SlotReference) slot).getOriginalColumn().get() - .equals(Column.STREAM_SEQ_VIRTUAL_COLUMN)) { - project.add(new Alias(slot.getExprId(), new BigIntLiteral(-1), Column.STREAM_SEQ_COL)); + Plan plan = scan.withSelectedPartitionIds(incrementalPartitionIds, true) + .withCachedOutput(new ArrayList<>(scanSlots)) + .withIncrementalScan(true) + .withNormalized(true); + // replace virtual column with alias slot reference + List project = newSlots.stream() + .map(NamedExpression.class::cast).collect(Collectors.toList()); + for (Slot slot : originSlots) { + if (slot instanceof SlotReference + && ((SlotReference) slot).getOriginalColumn().isPresent() + && ((SlotReference) slot).getOriginalColumn().get() + .equals(Column.STREAM_CHANGE_TYPE_VIRTUAL_COLUMN)) { + project.add(new Alias(slot.getExprId(), buildChangeTypeExpr(opSlot), + Column.STREAM_CHANGE_TYPE_COL)); + } else if (slot instanceof SlotReference + && ((SlotReference) 
slot).getOriginalColumn().isPresent() + && ((SlotReference) slot).getOriginalColumn().get() + .equals(Column.STREAM_SEQ_VIRTUAL_COLUMN)) { + project.add(new Alias(slot.getExprId(), seqSlot, Column.STREAM_SEQ_COL)); + } } + incrementalPlan = new LogicalProject<>(project, plan); + } + + if (historyPlan == null && incrementalPlan == null) { + return new LogicalEmptyRelation(ConnectContext.get().getStatementContext().getNextRelationId(), + scan.getOutput()); + } else if (historyPlan == null) { + return incrementalPlan; + } else if (incrementalPlan == null) { + return historyPlan; + } + historyPlan = refreshUnionChildOutputExprIds(historyPlan, originSlots); + incrementalPlan = refreshUnionChildOutputExprIds(incrementalPlan, originSlots); + // return union plan + List children = Lists.newArrayList(historyPlan, incrementalPlan); + return new LogicalUnion(Qualifier.ALL, + originSlots.stream().map(NamedExpression.class::cast).collect(Collectors.toList()), + children.stream() + .map(plan -> plan.getOutput().stream() + .map(slot -> (SlotReference) slot.toSlot()) + .collect(Collectors.toList())) + .collect(Collectors.toList()), + ImmutableList.of(), + false, + children); + } + + private Plan refreshUnionChildOutputExprIds(Plan plan, List unionOutputs) { + Preconditions.checkState(plan.getOutput().size() == unionOutputs.size(), + "Union child output size %s does not match union output size %s", + plan.getOutput().size(), unionOutputs.size()); + List project = new ArrayList<>(plan.getOutput().size()); + for (int i = 0; i < plan.getOutput().size(); i++) { + project.add(new Alias(plan.getOutput().get(i), unionOutputs.get(i).getName())); } return new LogicalProject<>(project, plan); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/PruneOlapScanTablet.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/PruneOlapScanTablet.java index 5e73916024469f..043198262ee7cc 100644 --- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/PruneOlapScanTablet.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/PruneOlapScanTablet.java @@ -79,6 +79,11 @@ public Rule build() { for (Long id : olapScan.getSelectedPartitionIds()) { Partition partition = table.getPartition(id); MaterializedIndex index = partition.getIndex(olapScan.getSelectedIndexId()); + if (index == null && table.needRowBinlog() + && olapScan.getSelectedIndexId() == table.getBaseIndexMeta().getRowBinlogIndexId()) { + // if row binlog index is selected, then use base index + index = table.getBaseIndex(); + } boolean isBaseIndexSelected = olapScan.getSelectedIndexId() == olapScan.getTable().getBaseIndexId(); Collection prunedTabletIds = getSelectedTabletIds( olapScan.getTable().getSchemaByIndexId(olapScan.getSelectedIndexId()), diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/ChangeScanInfo.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/ChangeScanInfo.java new file mode 100644 index 00000000000000..6ec863377e87af --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/ChangeScanInfo.java @@ -0,0 +1,155 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.trees; + +import java.util.Objects; +import java.util.Optional; + +/** + * Options for single-table incremental binlog query on base table. + * + * This is a pure metadata holder used by Nereids planner and FE planner to + * carry information from parsed table@incr(...) down to scan node. + */ +public class ChangeScanInfo { + + /** + * What kind of change information the query expects. + * DEFAULT in SQL is mapped to MIN_DELTA here. + */ + public enum InformationKind { + MIN_DELTA, + APPEND_ONLY, + DETAIL + } + + /** + * The way user specifies the incremental position. + */ + public enum PositionKind { + VERSION, + TIMESTAMP, + OFFSET + } + + /** + * A single position in incremental query (used for both start and end). + * + */ + public static class Position { + public final PositionKind kind; + public final long value; + + public Position(PositionKind kind, long value) { + this.kind = Objects.requireNonNull(kind, "kind should not be null"); + this.value = value; + } + + public static Position forTimestamp(long timestampLiteral) { + return new Position(PositionKind.TIMESTAMP, timestampLiteral); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (!(o instanceof Position)) { + return false; + } + Position that = (Position) o; + return value == that.value && kind == that.kind; + } + + @Override + public int hashCode() { + return Objects.hash(kind, value); + } + } + + private final InformationKind informationKind; + private final Position at; + private final Optional end; + + public ChangeScanInfo(InformationKind informationKind, Position at, Optional end) { + this.informationKind = Objects.requireNonNull(informationKind, "informationKind should not be null"); + this.at = Objects.requireNonNull(at, "at should not be null"); + this.end = Objects.requireNonNull(end, "end should not be 
null"); + } + + public InformationKind getInformationKind() { + return informationKind; + } + + public Position getAt() { + return at; + } + + public Optional getEnd() { + return end; + } + + /** + * Generate digest string used in normalized sql cache. + */ + public String toDigest() { + StringBuilder sb = new StringBuilder(); + sb.append("@incr(\"incrementType\" = \""); + if (informationKind == InformationKind.MIN_DELTA) { + sb.append("DEFAULT"); + } else if (informationKind == InformationKind.DETAIL) { + sb.append("DETAIL"); + } else { + sb.append("APPEND_ONLY"); + } + sb.append("\", \"startTimestamp\" = "); + sb.append(positionToDigest(at)); + if (end.isPresent()) { + sb.append(", \"endTimestamp\" = "); + sb.append(positionToDigest(end.get())); + } + sb.append(")"); + return sb.toString(); + } + + private String positionToDigest(Position pos) { + switch (pos.kind) { + case TIMESTAMP: + return "?"; + default: + return "?"; + } + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (!(o instanceof ChangeScanInfo)) { + return false; + } + ChangeScanInfo that = (ChangeScanInfo) o; + return informationKind == that.informationKind && at.equals(that.at) && end.equals(that.end); + } + + @Override + public int hashCode() { + return Objects.hash(informationKind, at, end); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/ModifyTablePropertiesOp.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/ModifyTablePropertiesOp.java index c8c0d560b99c8b..92d2da077cb0e4 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/ModifyTablePropertiesOp.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/ModifyTablePropertiesOp.java @@ -26,7 +26,6 @@ import org.apache.doris.catalog.Table; import org.apache.doris.catalog.TableProperty; import org.apache.doris.common.AnalysisException; -import 
org.apache.doris.common.Config; import org.apache.doris.common.UserException; import org.apache.doris.common.util.DatasourcePrintableMap; import org.apache.doris.common.util.DynamicPartitionUtil; @@ -296,21 +295,6 @@ public void validate(ConnectContext ctx) throws UserException { + " should be set to true or false"); } this.opType = AlterOpType.MODIFY_TABLE_PROPERTY_SYNC; - } else if (properties.containsKey(PropertyAnalyzer.PROPERTIES_ENABLE_TSO)) { - if (!properties.get(PropertyAnalyzer.PROPERTIES_ENABLE_TSO).equalsIgnoreCase("true") - && !properties.get(PropertyAnalyzer.PROPERTIES_ENABLE_TSO).equalsIgnoreCase("false")) { - throw new AnalysisException( - "Property " - + PropertyAnalyzer.PROPERTIES_ENABLE_TSO - + " should be set to true or false"); - } - if (properties.get(PropertyAnalyzer.PROPERTIES_ENABLE_TSO).equalsIgnoreCase("true") - && !Config.enable_tso_feature) { - throw new AnalysisException( - "Property " + PropertyAnalyzer.PROPERTIES_ENABLE_TSO - + " can not be enabled when experimental_enable_tso_feature is disabled"); - } - this.opType = AlterOpType.MODIFY_TABLE_PROPERTY_SYNC; } else if (properties.containsKey(PropertyAnalyzer.PROPERTIES_ENABLE_MOW_LIGHT_DELETE)) { if (!properties.get(PropertyAnalyzer.PROPERTIES_ENABLE_MOW_LIGHT_DELETE) .equalsIgnoreCase("true") diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/insert/InsertIntoTableCommand.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/insert/InsertIntoTableCommand.java index 89e4579c45837a..0d6731ff2ebd7f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/insert/InsertIntoTableCommand.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/insert/InsertIntoTableCommand.java @@ -21,11 +21,13 @@ import org.apache.doris.analysis.StmtType; import org.apache.doris.catalog.Env; import org.apache.doris.catalog.OlapTable; +import org.apache.doris.catalog.RowBinlogTableWrapper; import 
org.apache.doris.catalog.TableIf; import org.apache.doris.catalog.stream.AbstractTableStreamUpdate; import org.apache.doris.catalog.stream.OlapTableStreamUpdate; import org.apache.doris.catalog.stream.OlapTableStreamWrapper; import org.apache.doris.catalog.stream.TableStreamUpdateInfo; +import org.apache.doris.common.Config; import org.apache.doris.common.ErrorCode; import org.apache.doris.common.ErrorReport; import org.apache.doris.common.Pair; @@ -295,9 +297,15 @@ public AbstractInsertExecutor initPlan(ConnectContext ctx, StmtExecutor stmtExec List tableStreamScanNodes = buildResult.planner.getScanNodes().stream() - .filter(s -> s.getTableIf() instanceof OlapTableStreamWrapper).collect(Collectors.toList()); + .filter((s -> (s.getTableIf() instanceof OlapTableStreamWrapper + || s instanceof OlapScanNode && ((OlapScanNode) s).isIncrementalScan()))) + .collect(Collectors.toList()); if (!tableStreamScanNodes.isEmpty()) { + if (!Config.enable_feature_binlog) { + throw new AnalysisException("Insert plan with Table stream failed." 
+ + " should enable binlog feature in FE config."); + } // stream id -> Map, AbstractTableStreamUpdate> distinctUpdate = new HashMap<>(tableStreamScanNodes.size()); @@ -305,7 +313,12 @@ public AbstractInsertExecutor initPlan(ConnectContext ctx, StmtExecutor stmtExec // only support OlapScanNode currently Preconditions.checkArgument(scanNode instanceof OlapScanNode); OlapScanNode olapScanNode = (OlapScanNode) scanNode; - OlapTableStreamWrapper wrapper = (OlapTableStreamWrapper) scanNode.getTableIf(); + OlapTableStreamWrapper wrapper; + if (scanNode.getTableIf() instanceof OlapTableStreamWrapper) { + wrapper = (OlapTableStreamWrapper) scanNode.getTableIf(); + } else { + wrapper = ((RowBinlogTableWrapper) scanNode.getTableIf()).getParent(); + } if (!distinctUpdate.containsKey( Pair.of(wrapper.getStreamDbId(), wrapper.getStreamId()))) { distinctUpdate.put(Pair.of(wrapper.getStreamDbId(), wrapper.getStreamId()), diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/logical/LogicalOlapScan.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/logical/LogicalOlapScan.java index 833e198d886873..e05b52d17a6b70 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/logical/LogicalOlapScan.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/logical/LogicalOlapScan.java @@ -28,6 +28,7 @@ import org.apache.doris.nereids.properties.DataTrait; import org.apache.doris.nereids.properties.LogicalProperties; import org.apache.doris.nereids.properties.OrderKey; +import org.apache.doris.nereids.trees.ChangeScanInfo; import org.apache.doris.nereids.trees.TableSample; import org.apache.doris.nereids.trees.expressions.ExprId; import org.apache.doris.nereids.trees.expressions.NamedExpression; @@ -170,7 +171,9 @@ public class LogicalOlapScan extends LogicalCatalogRelation implements OlapScan, * original predicates. The set is preserved through {@code with*} rewrites * and copied onto MV rewrite outputs. 
*/ - private final Optional partitionPrunablePredicates; + protected final Optional partitionPrunablePredicates; + + protected Optional changeScanInfo = Optional.empty(); public LogicalOlapScan(RelationId id, OlapTable table) { this(id, table, ImmutableList.of()); @@ -272,7 +275,7 @@ public LogicalOlapScan(RelationId id, Table table, List qualifier, specifiedPartitions, hints, cacheSlotWithSlotName, cachedOutput, tableSample, directMvScan, colToSubPathsMap, specifiedTabletIds, operativeSlots, virtualColumns, scoreOrderKeys, scoreLimit, scoreRangeInfo, annOrderKeys, annLimit, tableAlias, - Optional.empty()); + Optional.empty(), Optional.empty()); } /** @@ -289,7 +292,8 @@ public LogicalOlapScan(RelationId id, Table table, List qualifier, Collection operativeSlots, List virtualColumns, List scoreOrderKeys, Optional scoreLimit, Optional scoreRangeInfo, List annOrderKeys, Optional annLimit, String tableAlias, - Optional partitionPrunablePredicates) { + Optional partitionPrunablePredicates, + Optional changeScanInfo) { super(id, PlanType.LOGICAL_OLAP_SCAN, table, qualifier, operativeSlots, virtualColumns, groupExpression, logicalProperties, tableAlias); Preconditions.checkArgument(selectedPartitionIds != null, @@ -331,6 +335,45 @@ public LogicalOlapScan(RelationId id, Table table, List qualifier, this.partitionPrunablePredicates = partitionPrunablePredicates == null ? Optional.empty() : partitionPrunablePredicates; + this.changeScanInfo = changeScanInfo; + } + + /** + * Constructor for LogicalOlapScan with changeScanInfo. 
+ */ + public LogicalOlapScan(RelationId id, Table table, List qualifier, + Optional groupExpression, Optional logicalProperties, + List selectedPartitionIds, boolean partitionPruned, List selectedTabletIds, + long selectedIndexId, boolean indexSelected, PreAggStatus preAggStatus, + List specifiedPartitions, List hints, + Map, Slot> cacheSlotWithSlotName, + Optional tableSample, boolean directMvScan, + Map>> colToSubPathsMap, List specifiedTabletIds, + Optional changeScanInfo) { + this(id, table, qualifier, groupExpression, logicalProperties, selectedPartitionIds, partitionPruned, false, + selectedTabletIds, selectedIndexId, indexSelected, preAggStatus, specifiedPartitions, hints, + cacheSlotWithSlotName, Optional.empty(), tableSample, directMvScan, colToSubPathsMap, + specifiedTabletIds, ImmutableList.of(), ImmutableList.of(), ImmutableList.of(), + Optional.empty(), Optional.empty(), ImmutableList.of(), Optional.empty(), "", + Optional.empty(), changeScanInfo); + } + + /** + * Factory method to create a new LogicalOlapScan. Can be overridden by subclasses. 
+ */ + protected LogicalOlapScan newLogicalOlapScan(RelationId id, Table table, List qualifier, + Optional groupExpression, Optional logicalProperties, + List selectedPartitionIds, boolean partitionPruned, List selectedTabletIds, + long selectedIndexId, boolean indexSelected, PreAggStatus preAggStatus, + List specifiedPartitions, List hints, + Map, Slot> cacheSlotWithSlotName, + Optional tableSample, boolean directMvScan, + Map>> colToSubPathsMap, List specifiedTabletIds, + Optional changeScanInfo) { + return new LogicalOlapScan(id, table, qualifier, groupExpression, logicalProperties, + selectedPartitionIds, partitionPruned, selectedTabletIds, selectedIndexId, indexSelected, + preAggStatus, specifiedPartitions, hints, cacheSlotWithSlotName, tableSample, directMvScan, + colToSubPathsMap, specifiedTabletIds, changeScanInfo); } public List getSelectedPartitionIds() { @@ -361,7 +404,7 @@ public LogicalOlapScan withPartitionPrunablePredicates( hints, cacheSlotWithSlotName, cachedOutput, tableSample, directMvScan, colToSubPathsMap, manuallySpecifiedTabletIds, operativeSlots, virtualColumns, scoreOrderKeys, scoreLimit, scoreRangeInfo, annOrderKeys, annLimit, tableAlias, - partitionPrunablePredicates)); + partitionPrunablePredicates, changeScanInfo)); } @Override @@ -432,7 +475,8 @@ public boolean equals(Object o) { && Objects.equals(scoreRangeInfo, that.scoreRangeInfo) && Objects.equals(annOrderKeys, that.annOrderKeys) && Objects.equals(annLimit, that.annLimit) - && Objects.equals(partitionPrunablePredicates, that.partitionPrunablePredicates); + && Objects.equals(partitionPrunablePredicates, that.partitionPrunablePredicates) + && Objects.equals(changeScanInfo, that.changeScanInfo); } @Override @@ -450,7 +494,7 @@ public LogicalOlapScan withGroupExpression(Optional groupExpres hints, cacheSlotWithSlotName, cachedOutput, tableSample, directMvScan, colToSubPathsMap, manuallySpecifiedTabletIds, operativeSlots, virtualColumns, scoreOrderKeys, scoreLimit, scoreRangeInfo, 
annOrderKeys, annLimit, tableAlias, - partitionPrunablePredicates)); + partitionPrunablePredicates, changeScanInfo)); } @Override @@ -463,7 +507,7 @@ public Plan withGroupExprLogicalPropChildren(Optional groupExpr hints, cacheSlotWithSlotName, cachedOutput, tableSample, directMvScan, colToSubPathsMap, manuallySpecifiedTabletIds, operativeSlots, virtualColumns, scoreOrderKeys, scoreLimit, scoreRangeInfo, annOrderKeys, annLimit, tableAlias, - partitionPrunablePredicates)); + partitionPrunablePredicates, changeScanInfo)); } /** @@ -486,7 +530,7 @@ public LogicalOlapScan withSelectedPartitionIds(List selectedPartitionIds, hints, cacheSlotWithSlotName, cachedOutput, tableSample, directMvScan, colToSubPathsMap, manuallySpecifiedTabletIds, operativeSlots, virtualColumns, scoreOrderKeys, scoreLimit, scoreRangeInfo, annOrderKeys, annLimit, tableAlias, - partitionPrunablePredicates)); + partitionPrunablePredicates, changeScanInfo)); } /** @@ -502,7 +546,7 @@ public LogicalOlapScan withMaterializedIndexSelected(long indexId) { indexId, true, PreAggStatus.unset(), manuallySpecifiedPartitions, hints, cacheSlotWithSlotName, cachedOutput, tableSample, directMvScan, colToSubPathsMap, manuallySpecifiedTabletIds, operativeSlots, virtualColumns, scoreOrderKeys, scoreLimit, scoreRangeInfo, - annOrderKeys, annLimit, tableAlias, partitionPrunablePredicates)); + annOrderKeys, annLimit, tableAlias, partitionPrunablePredicates, changeScanInfo)); } /** @@ -516,7 +560,8 @@ public LogicalOlapScan withSelectedTabletIds(List selectedTabletIds) { selectedIndexId, indexSelected, preAggStatus, manuallySpecifiedPartitions, hints, cacheSlotWithSlotName, cachedOutput, tableSample, directMvScan, colToSubPathsMap, manuallySpecifiedTabletIds, operativeSlots, virtualColumns, scoreOrderKeys, - scoreLimit, scoreRangeInfo, annOrderKeys, annLimit, tableAlias, partitionPrunablePredicates)); + scoreLimit, scoreRangeInfo, annOrderKeys, annLimit, tableAlias, partitionPrunablePredicates, + changeScanInfo)); } /** 
@@ -531,7 +576,7 @@ public LogicalOlapScan withPreAggStatus(PreAggStatus preAggStatus) { hints, cacheSlotWithSlotName, cachedOutput, tableSample, directMvScan, colToSubPathsMap, manuallySpecifiedTabletIds, operativeSlots, virtualColumns, scoreOrderKeys, scoreLimit, scoreRangeInfo, annOrderKeys, annLimit, tableAlias, - partitionPrunablePredicates)); + partitionPrunablePredicates, changeScanInfo)); } /** @@ -546,7 +591,7 @@ public LogicalOlapScan withColToSubPathsMap(Map>> colTo hints, cacheSlotWithSlotName, cachedOutput, tableSample, directMvScan, colToSubPathsMap, manuallySpecifiedTabletIds, operativeSlots, virtualColumns, scoreOrderKeys, scoreLimit, scoreRangeInfo, annOrderKeys, annLimit, tableAlias, - partitionPrunablePredicates)); + partitionPrunablePredicates, changeScanInfo)); } /** @@ -561,7 +606,7 @@ public LogicalOlapScan withManuallySpecifiedTabletIds(List manuallySpecifi hints, cacheSlotWithSlotName, cachedOutput, tableSample, directMvScan, colToSubPathsMap, manuallySpecifiedTabletIds, operativeSlots, virtualColumns, scoreOrderKeys, scoreLimit, scoreRangeInfo, annOrderKeys, annLimit, tableAlias, - partitionPrunablePredicates)); + partitionPrunablePredicates, changeScanInfo)); } @Override @@ -574,7 +619,8 @@ public LogicalOlapScan withRelationId(RelationId relationId) { selectedIndexId, indexSelected, preAggStatus, manuallySpecifiedPartitions, hints, Maps.newHashMap(), Optional.empty(), tableSample, directMvScan, colToSubPathsMap, selectedTabletIds, operativeSlots, virtualColumns, scoreOrderKeys, - scoreLimit, scoreRangeInfo, annOrderKeys, annLimit, tableAlias, partitionPrunablePredicates)); + scoreLimit, scoreRangeInfo, annOrderKeys, annLimit, tableAlias, partitionPrunablePredicates, + changeScanInfo)); } @Override @@ -587,7 +633,7 @@ public LogicalOlapScan withTableAlias(String tableAlias) { hints, cacheSlotWithSlotName, cachedOutput, tableSample, directMvScan, colToSubPathsMap, manuallySpecifiedTabletIds, operativeSlots, virtualColumns, scoreOrderKeys, 
scoreLimit, scoreRangeInfo, annOrderKeys, annLimit, tableAlias, - partitionPrunablePredicates)); + partitionPrunablePredicates, changeScanInfo)); } /** @@ -608,7 +654,7 @@ public LogicalOlapScan withVirtualColumns(List virtualColumns) selectedIndexId, indexSelected, preAggStatus, manuallySpecifiedPartitions, hints, cacheSlotWithSlotName, cachedOutput, tableSample, directMvScan, colToSubPathsMap, manuallySpecifiedTabletIds, operativeSlots, virtualColumns, scoreOrderKeys, scoreLimit, - scoreRangeInfo, annOrderKeys, annLimit, tableAlias, partitionPrunablePredicates)); + scoreRangeInfo, annOrderKeys, annLimit, tableAlias, partitionPrunablePredicates, changeScanInfo)); } /** @@ -631,7 +677,7 @@ public LogicalOlapScan appendVirtualColumns(List additionalVirt selectedIndexId, indexSelected, preAggStatus, manuallySpecifiedPartitions, hints, cacheSlotWithSlotName, cachedOutput, tableSample, directMvScan, colToSubPathsMap, manuallySpecifiedTabletIds, operativeSlots, mergedVirtualColumns, scoreOrderKeys, scoreLimit, - scoreRangeInfo, annOrderKeys, annLimit, tableAlias, partitionPrunablePredicates); + scoreRangeInfo, annOrderKeys, annLimit, tableAlias, partitionPrunablePredicates, changeScanInfo); } /** @@ -660,7 +706,7 @@ public LogicalOlapScan appendVirtualColumnsAndTopN( selectedIndexId, indexSelected, preAggStatus, manuallySpecifiedPartitions, hints, cacheSlotWithSlotName, cachedOutput, tableSample, directMvScan, colToSubPathsMap, manuallySpecifiedTabletIds, operativeSlots, mergedVirtualColumns, scoreOrderKeys, scoreLimit, - scoreRangeInfo, annOrderKeys, annLimit, tableAlias, partitionPrunablePredicates); + scoreRangeInfo, annOrderKeys, annLimit, tableAlias, partitionPrunablePredicates, changeScanInfo); } @Override @@ -712,14 +758,14 @@ public List computeOutput() { if (selectedIndexId != ((OlapTable) table).getBaseIndexId()) { return getOutputByIndex(selectedIndexId); } - List baseSchema = table.getBaseSchema(true); - List slotFromColumn = 
createSlotsVectorized(baseSchema); + List outputSchema = filterOutputColumns(table.getBaseSchema(true)); + List slotFromColumn = createSlotsVectorized(outputSchema); Builder slots = ImmutableList.builder(); IdGenerator exprIdGenerator = StatementScopeIdGenerator.getExprIdGenerator(); - for (int i = 0; i < baseSchema.size(); i++) { + for (int i = 0; i < outputSchema.size(); i++) { final int index = i; - Column col = baseSchema.get(i); + Column col = outputSchema.get(i); Pair key = Pair.of(selectedIndexId, col.getName()); Slot slot = cacheSlotWithSlotName.computeIfAbsent(key, k -> slotFromColumn.get(index)); slots.add(slot); @@ -727,7 +773,7 @@ public List computeOutput() { for (List subPath : colToSubPathsMap.get(key.getValue())) { if (!subPath.isEmpty()) { SlotReference slotReference = SlotReference.fromColumn( - exprIdGenerator.getNextId(), table, baseSchema.get(i), qualified() + exprIdGenerator.getNextId(), table, outputSchema.get(i), qualified() ).withSubPath(subPath); slots.add(slotReference); subPathToSlotMap.computeIfAbsent(slot, k -> Maps.newHashMap()) @@ -837,6 +883,18 @@ public Optional getScoreRangeInfo() { return scoreRangeInfo; } + public Optional getChangeScanInfo() { + return changeScanInfo; + } + + public void setChangeScanInfo(Optional changeScanInfo) { + this.changeScanInfo = changeScanInfo; + } + + protected List filterOutputColumns(List columns) { + return columns; + } + protected List createSlotsVectorized(List columns) { List qualified = qualified(); SlotReference[] slots = new SlotReference[columns.size()]; @@ -1007,7 +1065,7 @@ public CatalogRelation withOperativeSlots(Collection operativeSlots) { selectedIndexId, indexSelected, preAggStatus, manuallySpecifiedPartitions, hints, cacheSlotWithSlotName, cachedOutput, tableSample, directMvScan, colToSubPathsMap, manuallySpecifiedTabletIds, operativeSlots, virtualColumns, scoreOrderKeys, scoreLimit, - scoreRangeInfo, annOrderKeys, annLimit, tableAlias, partitionPrunablePredicates)); + 
scoreRangeInfo, annOrderKeys, annLimit, tableAlias, partitionPrunablePredicates, changeScanInfo)); } @VisibleForTesting @@ -1089,11 +1147,15 @@ public LogicalOlapScan withCachedOutput(List outputSlots) { selectedIndexId, indexSelected, preAggStatus, manuallySpecifiedPartitions, hints, cacheSlotWithSlotName, Optional.of(outputSlots), tableSample, directMvScan, colToSubPathsMap, manuallySpecifiedTabletIds, operativeSlots, virtualColumns, scoreOrderKeys, scoreLimit, - scoreRangeInfo, annOrderKeys, annLimit, tableAlias, partitionPrunablePredicates)); + scoreRangeInfo, annOrderKeys, annLimit, tableAlias, partitionPrunablePredicates, changeScanInfo)); } @Override public boolean supportPruneNestedColumn() { return true; } + + public boolean isIncrementalScan() { + return false; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/logical/LogicalOlapTableStreamScan.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/logical/LogicalOlapTableStreamScan.java index 3b5298bf42cd75..12f5cc0562ff86 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/logical/LogicalOlapTableStreamScan.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/logical/LogicalOlapTableStreamScan.java @@ -20,10 +20,12 @@ import org.apache.doris.catalog.Column; import org.apache.doris.catalog.OlapTable; import org.apache.doris.catalog.Table; +import org.apache.doris.catalog.stream.BaseTableStream; import org.apache.doris.common.IdGenerator; import org.apache.doris.nereids.memo.GroupExpression; import org.apache.doris.nereids.properties.LogicalProperties; import org.apache.doris.nereids.properties.OrderKey; +import org.apache.doris.nereids.trees.ChangeScanInfo; import org.apache.doris.nereids.trees.TableSample; import org.apache.doris.nereids.trees.expressions.Alias; import org.apache.doris.nereids.trees.expressions.ExprId; @@ -34,6 +36,7 @@ import org.apache.doris.nereids.trees.expressions.literal.BigIntLiteral; import 
org.apache.doris.nereids.trees.expressions.literal.VarcharLiteral; import org.apache.doris.nereids.trees.plans.AbstractPlan; +import org.apache.doris.nereids.trees.plans.PartitionPrunablePredicate; import org.apache.doris.nereids.trees.plans.PreAggStatus; import org.apache.doris.nereids.trees.plans.RelationId; import org.apache.doris.nereids.trees.plans.ScoreRangeInfo; @@ -44,6 +47,7 @@ import com.google.common.collect.Maps; import org.apache.commons.lang3.tuple.Pair; +import java.util.ArrayList; import java.util.Collection; import java.util.List; import java.util.Map; @@ -55,7 +59,8 @@ * Logical OlapTableStreamScan */ public class LogicalOlapTableStreamScan extends LogicalOlapScan { - private boolean isIncrementalScan = false; + private final boolean isNormalized; + private final boolean isIncrementalScan; /** * LogicalOlapTableStreamScan construct method @@ -63,6 +68,8 @@ public class LogicalOlapTableStreamScan extends LogicalOlapScan { public LogicalOlapTableStreamScan(RelationId id, OlapTable table, List qualifier, List tabletIds, List hints, Optional tableSample, Collection operativeSlots) { super(id, table, qualifier, tabletIds, hints, tableSample, operativeSlots); + this.isNormalized = false; + this.isIncrementalScan = false; } /** @@ -72,6 +79,8 @@ public LogicalOlapTableStreamScan(RelationId id, OlapTable table, List q List specifiedPartitions, List tabletIds, List hints, Optional tableSample, List operativeSlots) { super(id, table, qualifier, specifiedPartitions, tabletIds, hints, tableSample, operativeSlots); + this.isNormalized = false; + this.isIncrementalScan = false; } /** @@ -81,6 +90,7 @@ public LogicalOlapTableStreamScan(RelationId id, Table table, List quali Optional groupExpression, Optional logicalProperties, List selectedPartitionIds, boolean partitionPruned, + boolean hasPartitionPredicate, List selectedTabletIds, long selectedIndexId, boolean indexSelected, PreAggStatus preAggStatus, List specifiedPartitions, List hints, Map, Slot> 
cacheSlotWithSlotName, @@ -91,12 +101,16 @@ public LogicalOlapTableStreamScan(RelationId id, Table table, List quali List scoreOrderKeys, Optional scoreLimit, Optional scoreRangeInfo, List annOrderKeys, Optional annLimit, String tableAlias, - Boolean isIncrementalScan) { + Optional partitionPrunablePredicates, + Optional changeScanInfo, + Boolean isNormalized, boolean isIncrementalScan) { super(id, table, qualifier, groupExpression, logicalProperties, - selectedPartitionIds, partitionPruned, selectedTabletIds, selectedIndexId, indexSelected, - preAggStatus, specifiedPartitions, hints, cacheSlotWithSlotName, cachedOutput, tableSample, - directMvScan, colToSubPathsMap, specifiedTabletIds, operativeSlots, virtualColumns, - scoreOrderKeys, scoreLimit, scoreRangeInfo, annOrderKeys, annLimit, tableAlias); + selectedPartitionIds, partitionPruned, hasPartitionPredicate, selectedTabletIds, selectedIndexId, + indexSelected, preAggStatus, specifiedPartitions, hints, cacheSlotWithSlotName, cachedOutput, + tableSample, directMvScan, colToSubPathsMap, specifiedTabletIds, operativeSlots, virtualColumns, + scoreOrderKeys, scoreLimit, scoreRangeInfo, annOrderKeys, annLimit, tableAlias, + partitionPrunablePredicates, changeScanInfo); + this.isNormalized = isNormalized; this.isIncrementalScan = isIncrementalScan; } @@ -105,12 +119,12 @@ public LogicalOlapTableStreamScan withManuallySpecifiedTabletIds(List manu return AbstractPlan.copyWithSameId(this, () -> new LogicalOlapTableStreamScan(relationId, (Table) table, qualifier, Optional.empty(), Optional.of(getLogicalProperties()), - selectedPartitionIds, partitionPruned, selectedTabletIds, + selectedPartitionIds, partitionPruned, hasPartitionPredicate, selectedTabletIds, selectedIndexId, indexSelected, preAggStatus, manuallySpecifiedPartitions, hints, cacheSlotWithSlotName, cachedOutput, tableSample, directMvScan, colToSubPathsMap, manuallySpecifiedTabletIds, operativeSlots, virtualColumns, scoreOrderKeys, scoreLimit, scoreRangeInfo, 
annOrderKeys, annLimit, tableAlias, - isIncrementalScan)); + partitionPrunablePredicates, changeScanInfo, isNormalized, isIncrementalScan)); } @Override @@ -118,9 +132,9 @@ public List computeOutput() { if (cachedOutput.isPresent()) { return cachedOutput.get(); } - // we need to create slots vectorized for stream scan, no need for invisible column - // todo(TsukiokaKogane): support compute binlog-based schema - List baseSchema = table.getBaseSchema(false); + List baseSchema = changeScanInfo.isPresent() + ? filterOutputColumns(table.getBaseSchema(true)) + : table.getBaseSchema(false); List slotFromColumn = createSlotsVectorized(baseSchema); ImmutableList.Builder slots = ImmutableList.builder(); @@ -144,14 +158,16 @@ public List computeOutput() { } } } - if (!isIncrementalScan) { - // inject virtual stream hidden columns - SlotReference seqColRef = (SlotReference) new Alias(new BigIntLiteral(-1L), Column.STREAM_SEQ_COL).toSlot(); + if (!changeScanInfo.isPresent()) { + // Only stream queries expose stream virtual columns. 
+ SlotReference seqColRef = (SlotReference) new Alias(new BigIntLiteral(-1L), Column.STREAM_SEQ_COL) + .toSlot(); slots.add(seqColRef.withColumn(Column.STREAM_SEQ_VIRTUAL_COLUMN)); SlotReference changeTypeColRef = (SlotReference) new Alias(new VarcharLiteral("APPEND"), Column.STREAM_CHANGE_TYPE_COL).toSlot(); slots.add(changeTypeColRef.withColumn(Column.STREAM_CHANGE_TYPE_VIRTUAL_COLUMN)); } + for (NamedExpression virtualColumn : virtualColumns) { slots.add(virtualColumn.toSlot()); } @@ -166,11 +182,12 @@ public LogicalOlapTableStreamScan withSelectedTabletIds(List selectedTable return AbstractPlan.copyWithSameId(this, () -> new LogicalOlapTableStreamScan(relationId, (Table) table, qualifier, Optional.empty(), Optional.of(getLogicalProperties()), - selectedPartitionIds, partitionPruned, selectedTabletIds, + selectedPartitionIds, partitionPruned, hasPartitionPredicate, selectedTabletIds, selectedIndexId, indexSelected, preAggStatus, manuallySpecifiedPartitions, - hints, cacheSlotWithSlotName, cachedOutput, tableSample, directMvScan, - colToSubPathsMap, manuallySpecifiedTabletIds, operativeSlots, virtualColumns, scoreOrderKeys, - scoreLimit, scoreRangeInfo, annOrderKeys, annLimit, tableAlias, isIncrementalScan)); + hints, cacheSlotWithSlotName, cachedOutput, tableSample, directMvScan, colToSubPathsMap, + manuallySpecifiedTabletIds, operativeSlots, virtualColumns, scoreOrderKeys, scoreLimit, + scoreRangeInfo, annOrderKeys, annLimit, tableAlias, partitionPrunablePredicates, + changeScanInfo, isNormalized, isIncrementalScan)); } /** withCachedOutput */ @@ -179,11 +196,12 @@ public LogicalOlapTableStreamScan withCachedOutput(List outputSlots) { return AbstractPlan.copyWithSameId(this, () -> new LogicalOlapTableStreamScan(relationId, (Table) table, qualifier, groupExpression, Optional.empty(), - selectedPartitionIds, partitionPruned, selectedTabletIds, + selectedPartitionIds, partitionPruned, hasPartitionPredicate, selectedTabletIds, selectedIndexId, indexSelected, 
preAggStatus, manuallySpecifiedPartitions, hints, cacheSlotWithSlotName, Optional.of(outputSlots), tableSample, directMvScan, colToSubPathsMap, manuallySpecifiedTabletIds, operativeSlots, virtualColumns, scoreOrderKeys, scoreLimit, - scoreRangeInfo, annOrderKeys, annLimit, tableAlias, isIncrementalScan)); + scoreRangeInfo, annOrderKeys, annLimit, tableAlias, partitionPrunablePredicates, + changeScanInfo, isNormalized, isIncrementalScan)); } @Override @@ -191,21 +209,23 @@ public LogicalOlapTableStreamScan withOperativeSlots(Collection operativeS return AbstractPlan.copyWithSameId(this, () -> new LogicalOlapTableStreamScan(relationId, (Table) table, qualifier, groupExpression, Optional.of(getLogicalProperties()), - selectedPartitionIds, partitionPruned, selectedTabletIds, + selectedPartitionIds, partitionPruned, hasPartitionPredicate, selectedTabletIds, selectedIndexId, indexSelected, preAggStatus, manuallySpecifiedPartitions, hints, cacheSlotWithSlotName, cachedOutput, tableSample, directMvScan, colToSubPathsMap, manuallySpecifiedTabletIds, operativeSlots, virtualColumns, scoreOrderKeys, scoreLimit, - scoreRangeInfo, annOrderKeys, annLimit, tableAlias, isIncrementalScan)); + scoreRangeInfo, annOrderKeys, annLimit, tableAlias, partitionPrunablePredicates, + changeScanInfo, isNormalized, isIncrementalScan)); } @Override public List getOutputByIndex(long indexId) { - // we need to create slots vectorized for stream scan, no need for invisible column OlapTable olapTable = (OlapTable) table; // PhysicalStorageLayerAggregateTest has no visible index // when we have a partitioned table without any partition, visible index is // empty - List schema = olapTable.getIndexMetaByIndexId(indexId).getSchema(); + List schema = changeScanInfo.isPresent() + ? 
filterOutputColumns(olapTable.getIndexMetaByIndexId(indexId).getSchema()) + : olapTable.getIndexMetaByIndexId(indexId).getSchema(); List slots = Lists.newArrayListWithCapacity(schema.size()); IdGenerator exprIdGenerator = StatementScopeIdGenerator.getExprIdGenerator(); for (Column c : schema) { @@ -213,9 +233,11 @@ public List getOutputByIndex(long indexId) { olapTable, c, indexId == ((OlapTable) table).getBaseIndexId(), indexId, exprIdGenerator )); } - // add virtual slots, TODO: maybe wrong, should test virtual column + sync mv - for (NamedExpression virtualColumn : virtualColumns) { - slots.add(virtualColumn.toSlot()); + if (!changeScanInfo.isPresent()) { + // add virtual slots, TODO: maybe wrong, should test virtual column + sync mv + for (NamedExpression virtualColumn : virtualColumns) { + slots.add(virtualColumn.toSlot()); + } } return slots; } @@ -228,12 +250,12 @@ public LogicalOlapTableStreamScan withPreAggStatus(PreAggStatus preAggStatus) { return AbstractPlan.copyWithSameId(this, () -> new LogicalOlapTableStreamScan(relationId, (Table) table, qualifier, Optional.empty(), Optional.of(getLogicalProperties()), - selectedPartitionIds, partitionPruned, selectedTabletIds, + selectedPartitionIds, partitionPruned, hasPartitionPredicate, selectedTabletIds, selectedIndexId, indexSelected, preAggStatus, manuallySpecifiedPartitions, hints, cacheSlotWithSlotName, cachedOutput, tableSample, directMvScan, colToSubPathsMap, manuallySpecifiedTabletIds, operativeSlots, virtualColumns, scoreOrderKeys, scoreLimit, scoreRangeInfo, annOrderKeys, annLimit, tableAlias, - isIncrementalScan)); + partitionPrunablePredicates, changeScanInfo, isNormalized, isIncrementalScan)); } /** @@ -244,18 +266,98 @@ public LogicalOlapTableStreamScan withGroupExpression(Optional return AbstractPlan.copyWithSameId(this, () -> new LogicalOlapTableStreamScan(relationId, (Table) table, qualifier, groupExpression, Optional.of(getLogicalProperties()), - selectedPartitionIds, partitionPruned, 
selectedTabletIds, + selectedPartitionIds, partitionPruned, hasPartitionPredicate, selectedTabletIds, selectedIndexId, indexSelected, preAggStatus, manuallySpecifiedPartitions, hints, cacheSlotWithSlotName, cachedOutput, tableSample, directMvScan, colToSubPathsMap, manuallySpecifiedTabletIds, operativeSlots, virtualColumns, scoreOrderKeys, scoreLimit, scoreRangeInfo, annOrderKeys, annLimit, tableAlias, - isIncrementalScan)); + partitionPrunablePredicates, changeScanInfo, isNormalized, isIncrementalScan)); } + /** + * withNormalized + */ + public LogicalOlapTableStreamScan withNormalized(boolean isNormalized) { + return AbstractPlan.copyWithSameId(this, () -> + new LogicalOlapTableStreamScan(relationId, (Table) table, qualifier, + groupExpression, Optional.of(getLogicalProperties()), + selectedPartitionIds, partitionPruned, hasPartitionPredicate, selectedTabletIds, + selectedIndexId, indexSelected, preAggStatus, manuallySpecifiedPartitions, + hints, cacheSlotWithSlotName, cachedOutput, tableSample, directMvScan, + colToSubPathsMap, manuallySpecifiedTabletIds, operativeSlots, virtualColumns, + scoreOrderKeys, scoreLimit, scoreRangeInfo, annOrderKeys, annLimit, tableAlias, + partitionPrunablePredicates, changeScanInfo, isNormalized, isIncrementalScan)); + } + + /** + * withIncrementalScan + */ + public LogicalOlapTableStreamScan withIncrementalScan(boolean isIncrementalScan) { + return AbstractPlan.copyWithSameId(this, () -> + new LogicalOlapTableStreamScan(relationId, (Table) table, qualifier, + groupExpression, Optional.of(getLogicalProperties()), + selectedPartitionIds, partitionPruned, hasPartitionPredicate, selectedTabletIds, + selectedIndexId, indexSelected, preAggStatus, manuallySpecifiedPartitions, + hints, cacheSlotWithSlotName, cachedOutput, tableSample, directMvScan, + colToSubPathsMap, manuallySpecifiedTabletIds, operativeSlots, virtualColumns, + scoreOrderKeys, scoreLimit, scoreRangeInfo, annOrderKeys, annLimit, tableAlias, + partitionPrunablePredicates, 
changeScanInfo, isNormalized, isIncrementalScan)); + } + + @Override public boolean isIncrementalScan() { return isIncrementalScan; } + public boolean isNormalized() { + return isNormalized; + } + + /** + * withSelectedPartitionIds + */ + @Override + public LogicalOlapTableStreamScan withSelectedPartitionIds(List selectedPartitionIdsd) { + return withSelectedPartitionIds(selectedPartitionIdsd, false); + } + + /** + * withSelectedPartitionIds + */ + @Override + public LogicalOlapTableStreamScan withSelectedPartitionIds(List selectedPartitionIds, + boolean isPartitionPruned) { + return AbstractPlan.copyWithSameId(this, () -> + new LogicalOlapTableStreamScan(relationId, (Table) table, qualifier, + groupExpression, Optional.of(getLogicalProperties()), + selectedPartitionIds, isPartitionPruned, hasPartitionPredicate, selectedTabletIds, + selectedIndexId, indexSelected, preAggStatus, manuallySpecifiedPartitions, + hints, cacheSlotWithSlotName, cachedOutput, tableSample, directMvScan, + colToSubPathsMap, manuallySpecifiedTabletIds, operativeSlots, virtualColumns, + scoreOrderKeys, scoreLimit, scoreRangeInfo, annOrderKeys, annLimit, tableAlias, + partitionPrunablePredicates, changeScanInfo, isNormalized, isIncrementalScan)); + } + + /** + * Returns a new {@code LogicalOlapScan} carrying the supplied + * {@link PartitionPrunablePredicate}. It is preserved across all other + * {@code with*} builders so partition-derived conjuncts can be removed + * safely after MV rewrite has had a chance to match the plan. 
+ */ + @Override + public LogicalOlapTableStreamScan withPartitionPrunablePredicates( + Optional partitionPrunablePredicates) { + return AbstractPlan.copyWithSameId(this, () -> + new LogicalOlapTableStreamScan(relationId, (Table) table, qualifier, + groupExpression, Optional.of(getLogicalProperties()), + selectedPartitionIds, partitionPruned, hasPartitionPredicate, selectedTabletIds, + selectedIndexId, indexSelected, preAggStatus, manuallySpecifiedPartitions, + hints, cacheSlotWithSlotName, cachedOutput, tableSample, directMvScan, + colToSubPathsMap, manuallySpecifiedTabletIds, operativeSlots, virtualColumns, + scoreOrderKeys, scoreLimit, scoreRangeInfo, annOrderKeys, annLimit, tableAlias, + partitionPrunablePredicates, changeScanInfo, isNormalized, isIncrementalScan)); + } + @Override public R accept(PlanVisitor visitor, C context) { return visitor.visitLogicalOlapTableStreamScan(this, context); @@ -273,6 +375,19 @@ public boolean equals(Object o) { return false; } LogicalOlapTableStreamScan that = (LogicalOlapTableStreamScan) o; - return Objects.equals(isIncrementalScan, that.isIncrementalScan); + return Objects.equals(isNormalized, that.isNormalized) + && Objects.equals(isIncrementalScan, that.isIncrementalScan); + } + + @Override + protected List filterOutputColumns(List columns) { + List filtered = new ArrayList<>(columns.size()); + for (Column column : columns) { + // The stream does not expose __BEFORE__ snapshot columns; they are only used as BE internal inputs. 
+ if (!BaseTableStream.isBeforeImageColumn(column.getName())) { + filtered.add(column); + } + } + return filtered; } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalOlapScan.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalOlapScan.java index 9205f008cf4b1d..6d2ebfbaee829b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalOlapScan.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalOlapScan.java @@ -23,6 +23,7 @@ import org.apache.doris.nereids.properties.LogicalProperties; import org.apache.doris.nereids.properties.OrderKey; import org.apache.doris.nereids.properties.PhysicalProperties; +import org.apache.doris.nereids.trees.ChangeScanInfo; import org.apache.doris.nereids.trees.TableSample; import org.apache.doris.nereids.trees.expressions.ExprId; import org.apache.doris.nereids.trees.expressions.Expression; @@ -78,6 +79,9 @@ public class PhysicalOlapScan extends PhysicalCatalogRelation implements OlapSca // use for ann push down private final List annOrderKeys; private final Optional annLimit; + // used for binlog scan + private final boolean incrementalScan; + private final Optional changeScanInfo; /** * Predicates known to be TRUE on this scan thanks to partition pruning.
@@ -158,7 +162,7 @@ public PhysicalOlapScan(RelationId id, OlapTable olapTable, List qualifi hasPartitionPredicate, distributionSpec, preAggStatus, baseOutputs, groupExpression, logicalProperties, physicalProperties, statistics, tableSample, operativeSlots, virtualColumns, scoreOrderKeys, scoreLimit, scoreRangeInfo, annOrderKeys, annLimit, tableAlias, - Optional.empty()); + Optional.empty(), false, Optional.empty()); } /** @@ -173,7 +177,8 @@ public PhysicalOlapScan(RelationId id, OlapTable olapTable, List qualifi Collection operativeSlots, List virtualColumns, List scoreOrderKeys, Optional scoreLimit, Optional scoreRangeInfo, List annOrderKeys, Optional annLimit, String tableAlias, - Optional partitionPrunablePredicates) { + Optional partitionPrunablePredicates, boolean incrementalScan, + Optional changeScanInfo) { super(id, PlanType.PHYSICAL_OLAP_SCAN, olapTable, qualifier, groupExpression, logicalProperties, physicalProperties, statistics, operativeSlots, tableAlias); this.selectedIndexId = selectedIndexId; @@ -194,6 +199,8 @@ public PhysicalOlapScan(RelationId id, OlapTable olapTable, List qualifi this.partitionPrunablePredicates = partitionPrunablePredicates == null ? Optional.empty() : partitionPrunablePredicates; + this.incrementalScan = incrementalScan; + this.changeScanInfo = changeScanInfo == null ? 
Optional.empty() : changeScanInfo; } @Override @@ -230,7 +237,8 @@ public PhysicalOlapScan withPartitionPrunablePredicates( selectedIndexId, selectedTabletIds, selectedPartitionIds, hasPartitionPredicate, distributionSpec, preAggStatus, baseOutputs, groupExpression, getLogicalProperties(), getPhysicalProperties(), statistics, tableSample, operativeSlots, virtualColumns, scoreOrderKeys, - scoreLimit, scoreRangeInfo, annOrderKeys, annLimit, tableAlias, partitionPrunablePredicates)); + scoreLimit, scoreRangeInfo, annOrderKeys, annLimit, tableAlias, partitionPrunablePredicates, false, + changeScanInfo)); } @Override @@ -359,7 +367,9 @@ public boolean equals(Object o) { && Objects.equals(scoreRangeInfo, olapScan.scoreRangeInfo) && Objects.equals(annOrderKeys, olapScan.annOrderKeys) && Objects.equals(annLimit, olapScan.annLimit) - && Objects.equals(partitionPrunablePredicates, olapScan.partitionPrunablePredicates); + && Objects.equals(partitionPrunablePredicates, olapScan.partitionPrunablePredicates) + && Objects.equals(incrementalScan, olapScan.incrementalScan) + && Objects.equals(changeScanInfo, olapScan.changeScanInfo); } @Override @@ -378,7 +388,7 @@ public PhysicalOlapScan withGroupExpression(Optional groupExpre selectedIndexId, selectedTabletIds, selectedPartitionIds, hasPartitionPredicate, distributionSpec, preAggStatus, baseOutputs, groupExpression, getLogicalProperties(), null, null, tableSample, operativeSlots, virtualColumns, scoreOrderKeys, scoreLimit, scoreRangeInfo, - annOrderKeys, annLimit, tableAlias, partitionPrunablePredicates)); + annOrderKeys, annLimit, tableAlias, partitionPrunablePredicates, incrementalScan, changeScanInfo)); } @Override @@ -388,7 +398,7 @@ public Plan withGroupExprLogicalPropChildren(Optional groupExpr selectedIndexId, selectedTabletIds, selectedPartitionIds, hasPartitionPredicate, distributionSpec, preAggStatus, baseOutputs, groupExpression, logicalProperties.get(), null, null, tableSample, operativeSlots, virtualColumns, 
scoreOrderKeys, scoreLimit, scoreRangeInfo, - annOrderKeys, annLimit, tableAlias, partitionPrunablePredicates)); + annOrderKeys, annLimit, tableAlias, partitionPrunablePredicates, incrementalScan, changeScanInfo)); } @Override @@ -398,7 +408,8 @@ public PhysicalOlapScan withPhysicalPropertiesAndStats( selectedIndexId, selectedTabletIds, selectedPartitionIds, hasPartitionPredicate, distributionSpec, preAggStatus, baseOutputs, groupExpression, getLogicalProperties(), physicalProperties, statistics, tableSample, operativeSlots, virtualColumns, scoreOrderKeys, - scoreLimit, scoreRangeInfo, annOrderKeys, annLimit, tableAlias, partitionPrunablePredicates)); + scoreLimit, scoreRangeInfo, annOrderKeys, annLimit, tableAlias, partitionPrunablePredicates, + incrementalScan, changeScanInfo)); } @Override @@ -426,11 +437,20 @@ public CatalogRelation withOperativeSlots(Collection operativeSlots) { distributionSpec, preAggStatus, baseOutputs, groupExpression, getLogicalProperties(), getPhysicalProperties(), statistics, tableSample, operativeSlots, virtualColumns, scoreOrderKeys, scoreLimit, - scoreRangeInfo, annOrderKeys, annLimit, tableAlias, partitionPrunablePredicates)); + scoreRangeInfo, annOrderKeys, annLimit, tableAlias, partitionPrunablePredicates, incrementalScan, + changeScanInfo)); } @Override public List getOperativeSlots() { return operativeSlots; } + + public boolean isIncrementalScan() { + return incrementalScan; + } + + public Optional getChangeScanInfo() { + return changeScanInfo; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/persist/EditLog.java b/fe/fe-core/src/main/java/org/apache/doris/persist/EditLog.java index 5ae2af5419f3c9..30f7873e116515 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/persist/EditLog.java +++ b/fe/fe-core/src/main/java/org/apache/doris/persist/EditLog.java @@ -1000,6 +1000,12 @@ public static void loadJournal(Env env, Long logId, JournalEntity journal) { env.getBinlogManager().addModifyTableProperty(log, logId); 
break; } + case OperationType.OP_PRUNE_TABLE_STREAM_PARTITION_OFFSETS: { + PruneTableStreamPartitionOffsetInfo info = + (PruneTableStreamPartitionOffsetInfo) journal.getData(); + env.replayPruneTableStreamPartitionOffsets(info); + break; + } case OperationType.OP_MODIFY_DISTRIBUTION_BUCKET_NUM: { ModifyTableDefaultDistributionBucketNumOperationLog log = (ModifyTableDefaultDistributionBucketNumOperationLog) journal.getData(); @@ -2283,6 +2289,10 @@ public void logDynamicPartition(ModifyTablePropertyOperationLog info) { logModifyTableProperty(OperationType.OP_DYNAMIC_PARTITION, info); } + public void logPruneTableStreamPartitionOffsets(PruneTableStreamPartitionOffsetInfo info) { + logEdit(OperationType.OP_PRUNE_TABLE_STREAM_PARTITION_OFFSETS, info); + } + public long logModifyReplicationNum(ModifyTablePropertyOperationLog info) { return logModifyTableProperty(OperationType.OP_MODIFY_REPLICATION_NUM, info); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/persist/OperationType.java b/fe/fe-core/src/main/java/org/apache/doris/persist/OperationType.java index 7c2a24014e1f64..91a387483c2fcc 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/persist/OperationType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/persist/OperationType.java @@ -424,6 +424,8 @@ public class OperationType { public static final short OP_CREATE_ROLE_MAPPING = 496; public static final short OP_DROP_ROLE_MAPPING = 497; + public static final short OP_PRUNE_TABLE_STREAM_PARTITION_OFFSETS = 498; + // For cloud. 
public static final short OP_UPDATE_CLOUD_REPLICA = 1000; @Deprecated diff --git a/fe/fe-core/src/main/java/org/apache/doris/persist/PruneTableStreamPartitionOffsetInfo.java b/fe/fe-core/src/main/java/org/apache/doris/persist/PruneTableStreamPartitionOffsetInfo.java new file mode 100644 index 00000000000000..30f6fd4181ca9d --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/persist/PruneTableStreamPartitionOffsetInfo.java @@ -0,0 +1,81 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.persist; + +import org.apache.doris.common.io.Text; +import org.apache.doris.common.io.Writable; +import org.apache.doris.persist.gson.GsonUtils; + +import com.google.gson.annotations.SerializedName; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.util.List; +import java.util.Set; + +public class PruneTableStreamPartitionOffsetInfo implements Writable { + @SerializedName(value = "entries") + private List entries; + + public PruneTableStreamPartitionOffsetInfo(List entries) { + this.entries = entries; + } + + public List getEntries() { + return entries; + } + + @Override + public void write(DataOutput out) throws IOException { + Text.writeString(out, GsonUtils.GSON.toJson(this)); + } + + public static PruneTableStreamPartitionOffsetInfo read(DataInput in) throws IOException { + return GsonUtils.GSON.fromJson(Text.readString(in), PruneTableStreamPartitionOffsetInfo.class); + } + + public static class Entry { + @SerializedName(value = "dbId") + private long dbId; + + @SerializedName(value = "streamId") + private long streamId; + + @SerializedName(value = "partitionIds") + private Set partitionIds; + + public Entry(long dbId, long streamId, Set partitionIds) { + this.dbId = dbId; + this.streamId = streamId; + this.partitionIds = partitionIds; + } + + public long getDbId() { + return dbId; + } + + public long getStreamId() { + return streamId; + } + + public Set getPartitionIds() { + return partitionIds; + } + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/OlapScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/OlapScanNode.java index f1d89b63d51a25..0df69ecb0f2258 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/OlapScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/OlapScanNode.java @@ -47,6 +47,7 @@ import org.apache.doris.catalog.RowBinlogTableWrapper; import org.apache.doris.catalog.ScalarType; import 
org.apache.doris.catalog.Tablet; +import org.apache.doris.catalog.stream.BaseTableStream; import org.apache.doris.catalog.stream.OlapTableStreamUpdate; import org.apache.doris.catalog.stream.OlapTableStreamWrapper; import org.apache.doris.cloud.qe.ComputeGroupException; @@ -59,6 +60,7 @@ import org.apache.doris.common.UserException; import org.apache.doris.common.util.DebugUtil; import org.apache.doris.nereids.glue.translator.PlanTranslatorContext; +import org.apache.doris.nereids.trees.ChangeScanInfo; import org.apache.doris.nereids.trees.plans.ScoreRangeInfo; import org.apache.doris.planner.normalize.Normalizer; import org.apache.doris.planner.normalize.PartitionRangePredicateNormalizer; @@ -66,6 +68,8 @@ import org.apache.doris.resource.computegroup.ComputeGroup; import org.apache.doris.system.Backend; import org.apache.doris.thrift.TAggregationType; +import org.apache.doris.thrift.TBinlogReadSource; +import org.apache.doris.thrift.TBinlogScanType; import org.apache.doris.thrift.TColumn; import org.apache.doris.thrift.TExplainLevel; import org.apache.doris.thrift.TExpr; @@ -82,6 +86,7 @@ import org.apache.doris.thrift.TScanRangeLocation; import org.apache.doris.thrift.TScanRangeLocations; import org.apache.doris.thrift.TSortInfo; +import org.apache.doris.tso.TSOTimestamp; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Joiner; @@ -210,8 +215,27 @@ public class OlapScanNode extends ScanNode { private Column globalRowIdColumn; + private final boolean incrementalScan; + + // single-table CHANGES scan on base table + private boolean hasChangeScan = false; + private long changeStartTimestamp = -1L; + private boolean hasChangeEndTimestamp = false; + private long changeEndTimestamp = -1L; + private ChangeScanInfo.InformationKind informationKind = ChangeScanInfo.InformationKind.DETAIL; + + private static long encodePhysicalTimestampToTso(long physicalTimestamp) { + return physicalTimestamp <= 0 ? 
0 : TSOTimestamp.composeTimestamp(physicalTimestamp, 0); + } + // Constructs node to scan given data files of table 'tbl'. public OlapScanNode(PlanNodeId id, TupleDescriptor desc, String planNodeName, ScanContext scanContext) { + this (id, desc, planNodeName, scanContext, false); + } + + // Constructs node to scan given data files of table 'tbl'. + public OlapScanNode(PlanNodeId id, TupleDescriptor desc, String planNodeName, ScanContext scanContext, + boolean incrementalScan) { super(id, desc, planNodeName, scanContext); olapTable = (OlapTable) desc.getTable(); distributionColumnIds = Sets.newTreeSet(); @@ -229,6 +253,7 @@ public OlapScanNode(PlanNodeId id, TupleDescriptor desc, String planNodeName, Sc columnId++; } } + this.incrementalScan = incrementalScan; } @@ -444,11 +469,9 @@ private void addScanRangeLocations(Partition partition, if (!(Config.isCloudMode() && Config.enable_cloud_snapshot_version)) { visibleVersion = partition.getVisibleVersion(); } - // if partition offset is set, use the next offset to set the visible version if (olapTable instanceof OlapTableStreamWrapper) { visibleVersion = ((OlapTableStreamWrapper) olapTable).getStreamUpdate(partition.getId()).second; } - // for non-cloud mode. 
for cloud mode see `updateScanRangeVersions` maxVersion = Math.max(maxVersion, visibleVersion); int useFixReplica = -1; @@ -503,6 +526,36 @@ private void addScanRangeLocations(Partition partition, ); paloRange.setVersionHash(""); paloRange.setTabletId(tabletId); + if (incrementalScan && hasChangeScan) { + Preconditions.checkState(olapTable instanceof RowBinlogTableWrapper); + paloRange.setStartTso(encodePhysicalTimestampToTso(changeStartTimestamp)); + if (hasChangeEndTimestamp) { + paloRange.setEndTso(encodePhysicalTimestampToTso(changeEndTimestamp)); + } + paloRange.setBinlogScanType(informationKindToSchemaScanType(informationKind)); + paloRange.setBinlogReadSource(TBinlogReadSource.CHANGES); + } else if (incrementalScan) { + Preconditions.checkState(olapTable instanceof RowBinlogTableWrapper); + RowBinlogTableWrapper binlogWrapper = ((RowBinlogTableWrapper) olapTable); + Pair update = getStreamUpdate(partition.getId()); + if (update.first != null) { + paloRange.setStartTso(update.first); + } + if (update.second != null) { + paloRange.setEndTso(update.second); + } + TBinlogScanType streamScanType = + BaseTableStream.StreamScanType.toThrift(binlogWrapper.getParent().getConsumeType()); + paloRange.setBinlogScanType(streamScanType); + paloRange.setBinlogReadSource(TBinlogReadSource.STREAM); + } else if (hasChangeScan) { + Preconditions.checkState(olapTable instanceof RowBinlogTableWrapper); + paloRange.setStartTso(encodePhysicalTimestampToTso(changeStartTimestamp)); + if (hasChangeEndTimestamp) { + paloRange.setEndTso(encodePhysicalTimestampToTso(changeEndTimestamp)); + } + paloRange.setBinlogReadSource(TBinlogReadSource.CHANGES); + } // random shuffle List && only collect one copy // @@ -1239,7 +1292,8 @@ protected void toThrift(TPlanNode msg) { msg.olap_scan_node.setDistributeColumnIds(new ArrayList<>(distributionColumnIds)); - if (selectedIndexId != -1 && olapTable.getIndexMetaByIndexId(selectedIndexId).isRowBinlogIndex()) { + if (hasChangeScan + || 
(selectedIndexId != -1 && olapTable.getIndexMetaByIndexId(selectedIndexId).isRowBinlogIndex())) { msg.olap_scan_node.setReadRowBinlog(true); } @@ -1444,13 +1498,71 @@ public OlapTableStreamUpdate getStreamUpdate() { Map prev = Maps.newHashMap(); Map next = Maps.newHashMap(); for (Long partitionId : getSelectedPartitionIds()) { - Pair streamUpdate = ((OlapTableStreamWrapper) olapTable).getStreamUpdate(partitionId); + Pair streamUpdate = getStreamUpdate(partitionId); if (streamUpdate.first != null) { - // prev could be null, ignore + // prev could be null, in case of historical scan prev.put(partitionId, streamUpdate.first); } - next.put(partitionId, streamUpdate.second); + if (streamUpdate.second != null) { + next.put(partitionId, streamUpdate.second); + } else { + // next could be null, in case of incremental scan use most recent visible time + next.put(partitionId, olapTable.getPartition(partitionId).getVisibleVersionTime()); + } } return new OlapTableStreamUpdate(prev, next); } + + private Pair getStreamUpdate(Long partitionId) { + // unprotected assume partitionId is in SelectedPartitionIds + Pair streamUpdate; + if (olapTable instanceof RowBinlogTableWrapper) { + streamUpdate = ((RowBinlogTableWrapper) olapTable).getParent().getStreamUpdate(partitionId); + } else { + streamUpdate = ((OlapTableStreamWrapper) olapTable).getStreamUpdate(partitionId); + } + return streamUpdate; + } + + public boolean isIncrementalScan() { + return incrementalScan; + } + + public void enableTimestampChangeScan( + long startTimestamp, Long endTimestamp, ChangeScanInfo.InformationKind informationKind) { + this.hasChangeScan = true; + this.changeStartTimestamp = startTimestamp; + this.hasChangeEndTimestamp = endTimestamp != null; + this.changeEndTimestamp = endTimestamp != null ? 
endTimestamp : -1L; + this.informationKind = informationKind; + } + + public boolean hasChangeScan() { + return hasChangeScan; + } + + public long getChangeStartTimestamp() { + return changeStartTimestamp; + } + + public boolean hasChangeEndTimestamp() { + return hasChangeEndTimestamp; + } + + public long getChangeEndTimestamp() { + return changeEndTimestamp; + } + + TBinlogScanType informationKindToSchemaScanType(ChangeScanInfo.InformationKind informationKind) { + switch (informationKind) { + case MIN_DELTA: + return TBinlogScanType.MIN_DELTA; + case APPEND_ONLY: + return TBinlogScanType.APPEND_ONLY; + case DETAIL: + return TBinlogScanType.DETAIL; + default: + return TBinlogScanType.NONE; + } + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/ScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/ScanNode.java index 02a40761db9c19..2132010fc72e66 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/ScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/ScanNode.java @@ -83,6 +83,9 @@ public abstract class ScanNode extends PlanNode implements SplitGenerator { private static final Logger LOG = LogManager.getLogger(ScanNode.class); protected static final int NUM_SPLITS_PER_PARTITION = 10; protected static final int NUM_SPLITTERS_ON_FLIGHT = Config.max_external_cache_loader_thread_pool_size; + public static final String DORIS_START_TIMESTAMP = "startTimestamp"; + public static final String DORIS_END_TIMESTAMP = "endTimestamp"; + public static final String DORIS_INCREMENT_TYPE = "incrementType"; protected TupleDescriptor desc; // for distribution prunner protected Map columnFilters = new CaseInsensitiveMap(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/Coordinator.java b/fe/fe-core/src/main/java/org/apache/doris/qe/Coordinator.java index 5181a77b042f57..817d447739e8ee 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/Coordinator.java +++ 
b/fe/fe-core/src/main/java/org/apache/doris/qe/Coordinator.java @@ -127,6 +127,9 @@ import org.apache.doris.thrift.TTabletCommitInfo; import org.apache.doris.thrift.TTopnFilterDesc; import org.apache.doris.thrift.TUniqueId; +import org.apache.doris.transaction.TransactionState; +import org.apache.doris.transaction.TransactionStatus; +import org.apache.doris.tso.TSOTimestamp; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; @@ -620,6 +623,105 @@ private void traceInstance() { } } + private void waitForTimeBasedReadTransactionsVisible() throws Exception { + if (context == null) { + return; + } + SessionVariable sessionVariable = context.getSessionVariable(); + if (sessionVariable == null || sessionVariable.isEnableEventualConsistentChange()) { + return; + } + + // Collect (dbId, tableId) -> max(endTimestampMs) + Map, Long> tableEndTimestampMs = new HashMap<>(); + for (ScanNode scanNode : scanNodes) { + if (scanNode instanceof OlapScanNode) { + OlapScanNode olapScanNode = (OlapScanNode) scanNode; + if (olapScanNode.hasChangeScan()) { + long endTs = olapScanNode.hasChangeEndTimestamp() + ? olapScanNode.getChangeEndTimestamp() + : queryGlobals.getTimestampMs(); + addTableEndTimestamp(tableEndTimestampMs, olapScanNode.getOlapTable(), endTs); + } + } + } + if (tableEndTimestampMs.isEmpty()) { + return; + } + + long deadlineMs = System.currentTimeMillis() + sessionVariable.getChangeVisibleTimeoutMs(); + for (Map.Entry, Long> entry : tableEndTimestampMs.entrySet()) { + long dbId = entry.getKey().first; + long tableId = entry.getKey().second; + long endTimestampMs = entry.getValue(); + + List committedTxns; + try { + committedTxns = Env.getCurrentGlobalTransactionMgr().getCommittedTransactions(dbId); + } catch (Exception e) { + throw new UserException("get committed transactions failed. 
dbId=" + dbId, e); + } + + for (TransactionState txn : committedTxns) { + if (txn == null + || txn.getTransactionStatus() != TransactionStatus.COMMITTED + || txn.getTableIdList() == null + || !txn.getTableIdList().contains(tableId)) { + continue; + } + + long txnCommitTimeMs = extractTransactionCommitTimeMs(txn); + if (txnCommitTimeMs < 0 || txnCommitTimeMs > endTimestampMs) { + continue; + } + + long remainingMs = deadlineMs - System.currentTimeMillis(); + if (remainingMs <= 0) { + throw new UserException(String.format( + "timeout waiting committed transactions become visible for time-based read, " + + "dbId=%d tableId=%d endTimestampMs=%d", + dbId, tableId, endTimestampMs)); + } + + while (txn.getTransactionStatus() == TransactionStatus.COMMITTED + && remainingMs > 0) { + try { + txn.waitTransactionVisible(remainingMs); + } catch (InterruptedException ignored) { + // ignore + } + remainingMs = deadlineMs - System.currentTimeMillis(); + } + + if (txn.getTransactionStatus() == TransactionStatus.COMMITTED) { + throw new UserException(String.format( + "timeout waiting transaction become visible for time-based read, " + + "txnId=%d dbId=%d tableId=%d endTimestampMs=%d", + txn.getTransactionId(), dbId, tableId, endTimestampMs)); + } + } + } + } + + private static void addTableEndTimestamp(Map, Long> tableEndTimestampMs, + org.apache.doris.catalog.OlapTable table, long endTimestampMs) { + if (table == null || table.getDatabase() == null) { + return; + } + long dbId = table.getDatabase().getId(); + long tableId = table.getId(); + Pair key = Pair.of(dbId, tableId); + Long oldEnd = tableEndTimestampMs.get(key); + if (oldEnd == null || oldEnd < endTimestampMs) { + tableEndTimestampMs.put(key, endTimestampMs); + } + } + + private static long extractTransactionCommitTimeMs(TransactionState txn) { + long tso = txn.getCommitTSO(); + return TSOTimestamp.extractPhysicalTime(tso); + } + protected void processFragmentAssignmentAndParams() throws Exception { // prepare information 
prepare(); @@ -729,6 +831,7 @@ protected void execInternal() throws Exception { DebugUtil.printId(queryId), fragments.get(0).toThrift()); } + waitForTimeBasedReadTransactionsVisible(); processFragmentAssignmentAndParams(); traceInstance(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index 54095c9e08a08a..2531171050f7f4 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -270,6 +270,10 @@ public class SessionVariable implements Serializable, Writable { // max ms to wait transaction publish finish when exec insert stmt. public static final String INSERT_VISIBLE_TIMEOUT_MS = "insert_visible_timeout_ms"; + // change scan consistency and wait options. + public static final String ENABLE_EVENTUAL_CONSISTENT_CHANGE = "enable_eventual_consistent_change"; + public static final String CHANGE_VISIBLE_TIMEOUT_MS = "change_visible_timeout_ms"; + public static final String DELETE_WITHOUT_PARTITION = "delete_without_partition"; public static final String ENABLE_VARIANT_ACCESS_IN_ORIGINAL_PLANNER = "enable_variant_access_in_original_planner"; @@ -300,6 +304,7 @@ public class SessionVariable implements Serializable, Writable { public static final String ENABLE_INFER_PREDICATE = "enable_infer_predicate"; public static final long DEFAULT_INSERT_VISIBLE_TIMEOUT_MS = 60_000; + public static final long DEFAULT_CHANGE_VISIBLE_TIMEOUT_MS = 10_000; public static final String ENABLE_VECTORIZED_ENGINE = "enable_vectorized_engine"; @@ -307,6 +312,7 @@ public class SessionVariable implements Serializable, Writable { // If user set a very small value, use this value instead. 
public static final long MIN_INSERT_VISIBLE_TIMEOUT_MS = 1000; + public static final long MIN_CHANGE_VISIBLE_TIMEOUT_MS = 1000; public static final String ENABLE_PIPELINE_ENGINE = "enable_pipeline_engine"; @@ -1072,6 +1078,17 @@ public static double getHotValueThreshold() { @VarAttrDef.VarAttr(name = INSERT_VISIBLE_TIMEOUT_MS, needForward = true) public long insertVisibleTimeoutMs = DEFAULT_INSERT_VISIBLE_TIMEOUT_MS; + @VarAttrDef.VarAttr(name = ENABLE_EVENTUAL_CONSISTENT_CHANGE, needForward = true, + description = {"是否允许在 CHANGES/快照类时间查询中使用最终一致语义(不等待事务发布)。开启后可能返回不包含最新 commit 的结果。", + "Whether to allow eventual consistent semantics for time-based CHANGES/snapshot queries. " + + "If true, query may return results without waiting committed txns to be visible."}) + public boolean enableEventualConsistentChange = false; + + @VarAttrDef.VarAttr(name = CHANGE_VISIBLE_TIMEOUT_MS, needForward = true, + description = {"时间范围 CHANGES/快照查询等待 COMMITTED 事务发布为 VISIBLE 的最长时间(毫秒)。", + "Max time in ms to wait committed txns become visible for time-based CHANGES/snapshot queries."}) + public long changeVisibleTimeoutMs = DEFAULT_CHANGE_VISIBLE_TIMEOUT_MS; + // max memory used on every backend. Default value to 100G. 
@VarAttrDef.VarAttr(name = EXEC_MEM_LIMIT, needForward = true) public long maxExecMemByte = 100147483648L; @@ -4910,6 +4927,30 @@ public void setInsertVisibleTimeoutMs(long insertVisibleTimeoutMs) { } } + public boolean isEnableEventualConsistentChange() { + return enableEventualConsistentChange; + } + + public void setEnableEventualConsistentChange(boolean enableEventualConsistentChange) { + this.enableEventualConsistentChange = enableEventualConsistentChange; + } + + public long getChangeVisibleTimeoutMs() { + if (changeVisibleTimeoutMs < MIN_CHANGE_VISIBLE_TIMEOUT_MS) { + return MIN_CHANGE_VISIBLE_TIMEOUT_MS; + } else { + return changeVisibleTimeoutMs; + } + } + + public void setChangeVisibleTimeoutMs(long changeVisibleTimeoutMs) { + if (changeVisibleTimeoutMs < MIN_CHANGE_VISIBLE_TIMEOUT_MS) { + this.changeVisibleTimeoutMs = MIN_CHANGE_VISIBLE_TIMEOUT_MS; + } else { + this.changeVisibleTimeoutMs = changeVisibleTimeoutMs; + } + } + public boolean getIsSingleSetVar() { return isSingleSetVar; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java b/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java index 27a61243d81a01..72fee2c902fd6c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java +++ b/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java @@ -1583,6 +1583,14 @@ private PartitionCommitInfo generatePartitionCommitInfo(OlapTable table, long pa table.isTemporaryPartition(partitionId)); } + private PartitionCommitInfo generatePartitionCommitInfo(OlapTable table, long partitionId, long partitionVersion, + long commitTSO) { + PartitionInfo tblPartitionInfo = table.getPartitionInfo(); + String partitionRange = tblPartitionInfo.getPartitionRangeString(partitionId); + return new PartitionCommitInfo(partitionId, partitionRange, + partitionVersion, commitTSO, table.isTemporaryPartition(partitionId)); + } + protected void 
unprotectedCommitTransaction(TransactionState transactionState, Set errorReplicaIds, Map> tableToPartition, Set totalInvolvedBackends, Database db) throws TransactionCommitFailedException { @@ -1603,14 +1611,19 @@ protected void unprotectedCommitTransaction(TransactionState transactionState, S for (long tableId : tableToPartition.keySet()) { OlapTable table = (OlapTable) db.getTableNullable(tableId); TableCommitInfo tableCommitInfo = new TableCommitInfo(tableId); - if (Config.enable_tso_feature && table.enableTso()) { + if (Config.enable_feature_binlog && table.enableTso()) { tableCommitInfo.setCommitTSO(commitTSO); } for (long partitionId : tableToPartition.get(tableId)) { Partition partition = table.getPartition(partitionId); - tableCommitInfo.addPartitionCommitInfo( - generatePartitionCommitInfo(table, partitionId, partition.getNextVersion())); + if (Config.enable_feature_binlog && table.enableTso()) { + tableCommitInfo.addPartitionCommitInfo( + generatePartitionCommitInfo(table, partitionId, partition.getNextVersion(), commitTSO)); + } else { + tableCommitInfo.addPartitionCommitInfo( + generatePartitionCommitInfo(table, partitionId, partition.getNextVersion())); + } } transactionState.putIdToTableCommitInfo(tableId, tableCommitInfo); } @@ -1668,7 +1681,7 @@ protected void unprotectedCommitTransaction(TransactionState transactionState, S TableCommitInfo tableCommitInfo = new TableCommitInfo(tableId); tableCommitInfo.setVersion(tableNextVersion); tableCommitInfo.setVersionTime(System.currentTimeMillis()); - if (Config.enable_tso_feature && table.enableTso()) { + if (Config.enable_feature_binlog && table.enableTso()) { tableCommitInfo.setCommitTSO(commitTSO); } @@ -1679,8 +1692,14 @@ protected void unprotectedCommitTransaction(TransactionState transactionState, S } partitionToVersion.put(partitionId, partitionNextVersion); - PartitionCommitInfo partitionCommitInfo = generatePartitionCommitInfo(table, partitionId, - partitionNextVersion); + PartitionCommitInfo 
partitionCommitInfo; + if (Config.enable_feature_binlog && table.enableTso()) { + partitionCommitInfo = generatePartitionCommitInfo(table, partitionId, + partitionNextVersion, commitTSO); + } else { + partitionCommitInfo = generatePartitionCommitInfo(table, partitionId, + partitionNextVersion); + } tableCommitInfo.addPartitionCommitInfo(partitionCommitInfo); LOG.info("commit txn_id={}, sub_txn_id={}, partition_id={}, version={}", transactionState.getTransactionId(), subTransactionState.getSubTransactionId(), @@ -1727,7 +1746,7 @@ protected void unprotectedCommitTransaction2PC(TransactionState transactionState transactionState); continue; } - if (Config.enable_tso_feature && table.enableTso()) { + if (Config.enable_feature_binlog && table.enableTso()) { tableCommitInfo.setCommitTSO(commitTSO); } Iterator partitionCommitInfoIterator @@ -1745,7 +1764,11 @@ protected void unprotectedCommitTransaction2PC(TransactionState transactionState continue; } partitionCommitInfo.setVersion(partition.getNextVersion()); - partitionCommitInfo.setVersionTime(System.currentTimeMillis()); + if (Config.enable_feature_binlog && table.enableTso()) { + partitionCommitInfo.setVersionTime(commitTSO); + } else { + partitionCommitInfo.setVersionTime(System.currentTimeMillis()); + } } } // Update in-memory state only; caller handles edit log persistence @@ -3091,7 +3114,7 @@ private void cleanSubTransactions(long transactionId) { private long getCommitTSO(TransactionState transactionState, Database db, Set tableIds) throws TransactionCommitFailedException { long tso = -1L; - if (!Config.enable_tso_feature) { + if (!Config.enable_feature_binlog) { return tso; } if (tableIds == null || tableIds.isEmpty()) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/transaction/GlobalTransactionMgr.java b/fe/fe-core/src/main/java/org/apache/doris/transaction/GlobalTransactionMgr.java index c231f4d875d72c..d0fae8d1d8c2a9 100644 --- 
a/fe/fe-core/src/main/java/org/apache/doris/transaction/GlobalTransactionMgr.java +++ b/fe/fe-core/src/main/java/org/apache/doris/transaction/GlobalTransactionMgr.java @@ -448,6 +448,11 @@ public List getReadyToPublishTransactions() { return transactionStateList; } + @Override + public List getCommittedTransactions(long dbId) throws AnalysisException { + return getDatabaseTransactionMgr(dbId).getCommittedTxnList(); + } + public boolean existCommittedTxns(Long dbId, Long tableId, Long partitionId) { DatabaseTransactionMgr dbTransactionMgr = dbIdToDatabaseTransactionMgrs.get(dbId); if (tableId == null && partitionId == null) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/transaction/GlobalTransactionMgrIface.java b/fe/fe-core/src/main/java/org/apache/doris/transaction/GlobalTransactionMgrIface.java index d05291e93e2e2a..ff7cf751cece4e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/transaction/GlobalTransactionMgrIface.java +++ b/fe/fe-core/src/main/java/org/apache/doris/transaction/GlobalTransactionMgrIface.java @@ -129,6 +129,8 @@ public void abortTransaction(Long dbId, Long txnId, String reason, public List getReadyToPublishTransactions(); + public List getCommittedTransactions(long dbId) throws AnalysisException; + public boolean existCommittedTxns(Long dbId, Long tableId, Long partitionId); public void finishTransaction(long dbId, long transactionId, Map partitionVisibleVersions, diff --git a/fe/fe-core/src/main/java/org/apache/doris/tso/TSOService.java b/fe/fe-core/src/main/java/org/apache/doris/tso/TSOService.java index ff95f3c632fcd8..ffd9bb536d85a1 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/tso/TSOService.java +++ b/fe/fe-core/src/main/java/org/apache/doris/tso/TSOService.java @@ -76,7 +76,7 @@ public synchronized void start() { */ @Override protected void runAfterCatalogReady() { - if (!Config.enable_tso_feature) { + if (!isTsoEnabled()) { lock.lock(); try { isInitialized.set(false); @@ -160,8 +160,8 @@ protected void 
runAfterCatalogReady() { * @throws RuntimeException if TSO is not calibrated or other errors occur */ public long getTSO() { - if (!Config.enable_tso_feature) { - throw new RuntimeException("TSO feature is disabled, please check enable_tso_feature"); + if (!isTsoEnabled()) { + throw new RuntimeException("TSO feature is disabled, please check enable_feature_binlog"); } if (!isInitialized.get()) { throw new RuntimeException("TSO timestamp is not calibrated, please check"); @@ -247,11 +247,6 @@ private void calibrateTimestamp() { if (isInitialized.get()) { return; } - // Fail fast: calibration must persist the window end before the service can be considered initialized. - // Otherwise, a restart may lose the boundary and break TSO monotonicity guarantees. - if (!Config.enable_tso_persist_journal) { - throw new RuntimeException("TSO calibration requires enable_tso_persist_journal=true"); - } // Check if Env is ready before calibration Env env = Env.getCurrentEnv(); if (env == null || !env.isReady() || !env.isMaster()) { @@ -383,9 +378,9 @@ private void updateTimestamp() { * @param timestamp The timestamp to write */ private void writeTimestampToBDBJE(long timestamp) { - if (!Config.enable_tso_persist_journal) { + if (!isTsoEnabled()) { LOG.debug("TSO timestamp {} is not persisted to journal, " - + "please check if enable_tso_persist_journal is set to true", + + "please check if enable_feature_binlog is set to true", new TSOTimestamp(timestamp, 0)); return; } @@ -445,7 +440,7 @@ private void writeTimestampToBDBJE(long timestamp) { private Pair generateTSO() { lock.lock(); try { - if (!Config.enable_tso_feature || !isInitialized.get()) { + if (!isTsoEnabled() || !isInitialized.get()) { return Pair.of(0L, 0L); } long physicalTime = globalTimestamp.getPhysicalTimestamp(); @@ -502,7 +497,7 @@ public long getWindowEndTSO() { } public long saveTSO(CountingDataOutputStream dos, long checksum) throws IOException { - if (!Config.enable_tso_checkpoint_module) { + if 
(!isTsoEnabled()) { return checksum; } long currentWindowEnd = windowEndTSO.get(); @@ -523,4 +518,11 @@ public long loadTSO(DataInputStream dis, long checksum) throws IOException { LOG.info("Finished replay TSO windowEndTSO {} from image", windowEndTSO.get()); return newChecksum; } + + /** + * Returns whether TSO is globally enabled by the binlog feature switch. + */ + private boolean isTsoEnabled() { + return Config.enable_feature_binlog; + } } diff --git a/fe/fe-core/src/test/java/org/apache/doris/catalog/CreateTableStreamTest.java b/fe/fe-core/src/test/java/org/apache/doris/catalog/CreateTableStreamTest.java index f10cdd46d479f0..90d4becb161353 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/catalog/CreateTableStreamTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/catalog/CreateTableStreamTest.java @@ -45,7 +45,9 @@ public void testCreateStreamNormalOLAP() throws Exception { createDatabase("test_stream"); // create base sql String sql = "create table if not exists test_stream.tbl1\n" + "(k1 int, k2 int)\n" + "unique key(k1)\n" - + "distributed by hash(k1) buckets 1\n" + "properties('replication_num' = '1'); "; + + "distributed by hash(k1) buckets 1\n" + + "properties('replication_num' = '1', 'binlog.enable' = 'true', 'binlog.format' = 'ROW', " + + "'binlog.need_historical_value' = 'true'); "; createTable(sql); // create default stream ExceptionChecker @@ -76,7 +78,9 @@ public void testCreateStreamAbnormalOLAP() throws Exception { createDatabase("test_stream"); // create base sql String sql = "create table if not exists test_stream.tbl1\n" + "(k1 int, k2 int)\n" + "unique key(k1)\n" - + "distributed by hash(k1) buckets 1\n" + "properties('replication_num' = '1'); "; + + "distributed by hash(k1) buckets 1\n" + + "properties('replication_num' = '1', 'binlog.enable' = 'true', 'binlog.format' = 'ROW', " + + "'binlog.need_historical_value' = 'true'); "; createTable(sql); // create default stream ExceptionChecker diff --git 
a/fe/fe-core/src/test/java/org/apache/doris/catalog/DropTableStreamTest.java b/fe/fe-core/src/test/java/org/apache/doris/catalog/DropTableStreamTest.java index b28ad1ef6d8c26..43f9d5a63c902f 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/catalog/DropTableStreamTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/catalog/DropTableStreamTest.java @@ -44,7 +44,9 @@ protected void runBeforeAll() throws Exception { createDatabase("test_stream"); String createTableStr1 = "create table if not exists test_stream.tbl1\n" + "(k1 int, k2 int)\n" + "unique key(k1)\n" - + "distributed by hash(k1) buckets 1\n" + "properties('replication_num' = '1'); "; + + "distributed by hash(k1) buckets 1\n" + + "properties('replication_num' = '1', 'binlog.enable' = 'true', 'binlog.format' = 'ROW', " + + "'binlog.need_historical_value' = 'true'); "; createTable(createTableStr1); String createStreamStr1 = "create stream test_stream.s1 on table test_stream.tbl1\n" diff --git a/fe/fe-core/src/test/java/org/apache/doris/catalog/stream/TableStreamManagerCleanupTest.java b/fe/fe-core/src/test/java/org/apache/doris/catalog/stream/TableStreamManagerCleanupTest.java new file mode 100644 index 00000000000000..918a6046b7cb8c --- /dev/null +++ b/fe/fe-core/src/test/java/org/apache/doris/catalog/stream/TableStreamManagerCleanupTest.java @@ -0,0 +1,188 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.catalog.stream; + +import org.apache.doris.catalog.Database; +import org.apache.doris.catalog.Env; +import org.apache.doris.catalog.OlapTable; +import org.apache.doris.common.Config; +import org.apache.doris.common.FeConstants; +import org.apache.doris.common.jmockit.Deencapsulation; +import org.apache.doris.persist.PruneTableStreamPartitionOffsetInfo; +import org.apache.doris.utframe.TestWithFeService; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +public class TableStreamManagerCleanupTest extends TestWithFeService { + + @Override + protected void runBeforeAll() throws Exception { + FeConstants.runningUnitTest = true; + Config.allow_replica_on_same_host = true; + Config.enable_table_stream = true; + createDatabase("test_stream_cleanup"); + connectContext.setDatabase("test_stream_cleanup"); + } + + @Test + public void testCleanupRemovedPartitionOffsets() throws Exception { + StreamContext context = createStreamContext("cleanup_normal"); + long keptPartitionId = context.baseTable.getPartition("p1").getId(); + long removedPartitionId = context.baseTable.getPartition("p2").getId(); + setPartitionState(context.stream, keptPartitionId, removedPartitionId); + + alterTableSync("alter table test_stream_cleanup." 
+ context.baseTable.getName() + " drop partition p2"); + Env.getCurrentEnv().getTableStreamManager().cleanupStalePartitionOffsets(); + + assertPartitionState(context.stream, keptPartitionId, removedPartitionId, true); + } + + @Test + public void testCleanupSkipsDisabledStream() throws Exception { + StreamContext context = createStreamContext("cleanup_disabled"); + long keptPartitionId = context.baseTable.getPartition("p1").getId(); + long removedPartitionId = context.baseTable.getPartition("p2").getId(); + setPartitionState(context.stream, keptPartitionId, removedPartitionId); + + alterTableSync("alter table test_stream_cleanup." + context.baseTable.getName() + " drop partition p2"); + context.stream.writeLock(); + try { + context.stream.setDisabled(true); + } finally { + context.stream.writeUnlock(); + } + Env.getCurrentEnv().getTableStreamManager().cleanupStalePartitionOffsets(); + + assertPartitionState(context.stream, keptPartitionId, removedPartitionId, false); + } + + @Test + public void testCleanupSkipsStaleStream() throws Exception { + StreamContext context = createStreamContext("cleanup_stale"); + long keptPartitionId = context.baseTable.getPartition("p1").getId(); + long removedPartitionId = context.baseTable.getPartition("p2").getId(); + setPartitionState(context.stream, keptPartitionId, removedPartitionId); + + alterTableSync("alter table test_stream_cleanup." 
+ context.baseTable.getName() + " drop partition p2"); + context.stream.writeLock(); + try { + context.stream.setStale(true); + context.stream.setStaleReason("ut"); + } finally { + context.stream.writeUnlock(); + } + Env.getCurrentEnv().getTableStreamManager().cleanupStalePartitionOffsets(); + + assertPartitionState(context.stream, keptPartitionId, removedPartitionId, false); + } + + @Test + public void testReplayPrunePartitionOffsetsDirectly() throws Exception { + StreamContext context = createStreamContext("replay_prune"); + long keptPartitionId = context.baseTable.getPartition("p1").getId(); + long removedPartitionId = context.baseTable.getPartition("p2").getId(); + setPartitionState(context.stream, keptPartitionId, removedPartitionId); + + context.stream.writeLock(); + try { + context.stream.setDisabled(true); + } finally { + context.stream.writeUnlock(); + } + Env.getCurrentEnv().getTableStreamManager().replayPruneTableStreamPartitionOffsets( + new PruneTableStreamPartitionOffsetInfo(Collections.singletonList( + new PruneTableStreamPartitionOffsetInfo.Entry( + context.stream.getDatabase().getId(), context.stream.getId(), + Collections.singleton(removedPartitionId))))); + + assertPartitionState(context.stream, keptPartitionId, removedPartitionId, true); + } + + private StreamContext createStreamContext(String suffix) throws Exception { + String tableName = "tbl_" + suffix; + String streamName = "s_" + suffix; + createTable("create table test_stream_cleanup." + tableName + " (\n" + + " k1 int,\n" + + " k2 int\n" + + ")\n" + + "unique key(k1)\n" + + "partition by range(k1)\n" + + "(partition p1 values less than (\"100\"),\n" + + " partition p2 values less than (\"200\"))\n" + + "distributed by hash(k1) buckets 1\n" + + "properties(\"replication_num\"=\"1\",\"binlog.enable\"=\"true\"," + + "\"binlog.format\"=\"ROW\")"); + createTable("create stream test_stream_cleanup." + streamName + " on table test_stream_cleanup." 
+ tableName + + " properties('type' = 'append_only', 'show_initial_rows' = 'true')"); + + Database db = (Database) Env.getCurrentInternalCatalog().getDbOrMetaException("test_stream_cleanup"); + return new StreamContext((OlapTable) db.getTableOrMetaException(tableName), + (OlapTableStream) db.getTableOrMetaException(streamName)); + } + + private void setPartitionState(OlapTableStream stream, long keptPartitionId, long removedPartitionId) { + Map partitionOffset = new HashMap<>(); + partitionOffset.put(keptPartitionId, 11L); + partitionOffset.put(removedPartitionId, 22L); + Map partitionConsumptionTime = new HashMap<>(); + partitionConsumptionTime.put(keptPartitionId, 111L); + partitionConsumptionTime.put(removedPartitionId, 222L); + Map historicalPartitionOffset = new HashMap<>(); + historicalPartitionOffset.put(keptPartitionId, 101L); + historicalPartitionOffset.put(removedPartitionId, 202L); + Map historicalPartitionTSO = new HashMap<>(); + historicalPartitionTSO.put(keptPartitionId, 1001L); + historicalPartitionTSO.put(removedPartitionId, 2002L); + Deencapsulation.setField(stream, "partitionOffset", partitionOffset); + Deencapsulation.setField(stream, "partitionConsumptionTime", partitionConsumptionTime); + Deencapsulation.setField(stream, "historicalPartitionOffset", historicalPartitionOffset); + Deencapsulation.setField(stream, "historicalPartitionTSO", historicalPartitionTSO); + } + + private void assertPartitionState(OlapTableStream stream, long keptPartitionId, long removedPartitionId, + boolean removedExpected) { + Map partitionOffset = Deencapsulation.getField(stream, "partitionOffset"); + Map partitionConsumptionTime = Deencapsulation.getField(stream, "partitionConsumptionTime"); + Map historicalPartitionOffset = Deencapsulation.getField(stream, "historicalPartitionOffset"); + Map historicalPartitionTSO = Deencapsulation.getField(stream, "historicalPartitionTSO"); + + Assertions.assertTrue(partitionOffset.containsKey(keptPartitionId)); + 
Assertions.assertTrue(partitionConsumptionTime.containsKey(keptPartitionId)); + Assertions.assertTrue(historicalPartitionOffset.containsKey(keptPartitionId)); + Assertions.assertTrue(historicalPartitionTSO.containsKey(keptPartitionId)); + Assertions.assertEquals(!removedExpected, partitionOffset.containsKey(removedPartitionId)); + Assertions.assertEquals(!removedExpected, partitionConsumptionTime.containsKey(removedPartitionId)); + Assertions.assertEquals(!removedExpected, historicalPartitionOffset.containsKey(removedPartitionId)); + Assertions.assertEquals(!removedExpected, historicalPartitionTSO.containsKey(removedPartitionId)); + } + + private static class StreamContext { + private final OlapTable baseTable; + private final OlapTableStream stream; + + private StreamContext(OlapTable baseTable, OlapTableStream stream) { + this.baseTable = baseTable; + this.stream = stream; + } + } +} diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/parser/NereidsParserTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/parser/NereidsParserTest.java index 247facf6b64b8a..a64415480db2ab 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/parser/NereidsParserTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/parser/NereidsParserTest.java @@ -19,11 +19,13 @@ import org.apache.doris.analysis.StatementBase; import org.apache.doris.analysis.StmtType; +import org.apache.doris.analysis.TableScanParams; import org.apache.doris.common.Config; import org.apache.doris.common.Pair; import org.apache.doris.nereids.StatementContext; import org.apache.doris.nereids.analyzer.UnboundFunction; import org.apache.doris.nereids.analyzer.UnboundOneRowRelation; +import org.apache.doris.nereids.analyzer.UnboundRelation; import org.apache.doris.nereids.exceptions.AnalysisException; import org.apache.doris.nereids.exceptions.NotSupportedException; import org.apache.doris.nereids.exceptions.ParseException; @@ -118,6 +120,68 @@ public void testSingle() { 
Assertions.assertNull(exceptionOccurred); } + @Test + public void testParseTableIncrScanParams() { + NereidsParser nereidsParser = new NereidsParser(); + + UnboundRelation minDeltaRelation = findFirstUnboundRelation(nereidsParser.parseSingle( + "select * from tbl5@incr('startTimestamp' = '2026-05-25 20:51:28', " + + "\"endTimestamp\" = \"2026-05-25 20:54:00\", " + + "\"incrementType\" = \"MIN_DELTA\")")); + Assertions.assertNotNull(minDeltaRelation); + TableScanParams minDeltaScanParams = minDeltaRelation.getScanParams(); + Assertions.assertEquals("incr", minDeltaScanParams.getParamType()); + Assertions.assertEquals("2026-05-25 20:51:28", + minDeltaScanParams.getMapParams().get("startTimestamp")); + Assertions.assertEquals("2026-05-25 20:54:00", + minDeltaScanParams.getMapParams().get("endTimestamp")); + Assertions.assertEquals("MIN_DELTA", + minDeltaScanParams.getMapParams().get("incrementType")); + + UnboundRelation appendOnlyRelation = findFirstUnboundRelation(nereidsParser.parseSingle( + "select * from tbl5@incr('startTimestamp' = '2026-05-25 20:51:28', " + + "\"endTimestamp\" = \"2026-05-25 20:54:00\", " + + "\"incrementType\" = \"APPEND_ONLY\")")); + Assertions.assertNotNull(appendOnlyRelation); + Assertions.assertEquals("APPEND_ONLY", + appendOnlyRelation.getScanParams().getMapParams().get("incrementType")); + + UnboundRelation detailWithRangeRelation = findFirstUnboundRelation(nereidsParser.parseSingle( + "select * from tbl5@incr('startTimestamp' = '2026-05-25 20:51:28', " + + "\"endTimestamp\" = \"2026-05-25 20:54:00\", " + + "\"incrementType\" = \"DETAIL\")")); + Assertions.assertNotNull(detailWithRangeRelation); + Assertions.assertEquals("2026-05-25 20:51:28", + detailWithRangeRelation.getScanParams().getMapParams().get("startTimestamp")); + Assertions.assertEquals("2026-05-25 20:54:00", + detailWithRangeRelation.getScanParams().getMapParams().get("endTimestamp")); + Assertions.assertEquals("DETAIL", + 
detailWithRangeRelation.getScanParams().getMapParams().get("incrementType")); + + UnboundRelation detailWithStartRelation = findFirstUnboundRelation(nereidsParser.parseSingle( + "select * from tbl5@incr('startTimestamp' = '2026-05-25 20:51:28', " + + "\"incrementType\" = \"DETAIL\")")); + Assertions.assertNotNull(detailWithStartRelation); + Assertions.assertEquals("2026-05-25 20:51:28", + detailWithStartRelation.getScanParams().getMapParams().get("startTimestamp")); + Assertions.assertEquals("DETAIL", + detailWithStartRelation.getScanParams().getMapParams().get("incrementType")); + Assertions.assertFalse(detailWithStartRelation.getScanParams().getMapParams().containsKey("endTimestamp")); + + UnboundRelation detailOnlyRelation = findFirstUnboundRelation( + nereidsParser.parseSingle("select * from tbl5@incr(\"incrementType\" = \"DETAIL\")")); + Assertions.assertNotNull(detailOnlyRelation); + Assertions.assertEquals("DETAIL", + detailOnlyRelation.getScanParams().getMapParams().get("incrementType")); + Assertions.assertFalse(detailOnlyRelation.getScanParams().getMapParams().containsKey("startTimestamp")); + + UnboundRelation emptyIncrRelation = findFirstUnboundRelation( + nereidsParser.parseSingle("select * from tbl5@incr()")); + Assertions.assertNotNull(emptyIncrRelation); + Assertions.assertEquals("incr", emptyIncrRelation.getScanParams().getParamType()); + Assertions.assertTrue(emptyIncrRelation.getScanParams().getMapParams().isEmpty()); + } + @Test public void testErrorListener() { parsePlan("select * from t1 where a = 1 illegal_symbol") @@ -931,6 +995,19 @@ private void checkQueryTopPlanClass(String sql, NereidsParser parser, Class c } } + private UnboundRelation findFirstUnboundRelation(Plan plan) { + if (plan instanceof UnboundRelation) { + return (UnboundRelation) plan; + } + for (Plan child : plan.children()) { + UnboundRelation relation = findFirstUnboundRelation(child); + if (relation != null) { + return relation; + } + } + return null; + } + @Test public 
void testBlockSqlAst() { String sql = "plan replayer dump select `AD``D` from t1 where a = 1"; diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/rules/analysis/BindRelationTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/rules/analysis/BindRelationTest.java index e877989f41e8a8..9a31f3d12d716c 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/rules/analysis/BindRelationTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/rules/analysis/BindRelationTest.java @@ -17,9 +17,12 @@ package org.apache.doris.nereids.rules.analysis; +import org.apache.doris.analysis.TableScanParams; +import org.apache.doris.common.util.TimeUtils; import org.apache.doris.nereids.analyzer.UnboundRelation; import org.apache.doris.nereids.pattern.GeneratedPlanPatterns; import org.apache.doris.nereids.rules.RulePromise; +import org.apache.doris.nereids.trees.ChangeScanInfo; import org.apache.doris.nereids.trees.expressions.Alias; import org.apache.doris.nereids.trees.expressions.ExprId; import org.apache.doris.nereids.trees.expressions.Expression; @@ -36,11 +39,14 @@ import org.apache.doris.utframe.TestWithFeService; import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; +import java.lang.reflect.Method; import java.util.ArrayList; import java.util.List; +import java.util.Optional; import java.util.Set; import java.util.stream.Collectors; @@ -168,6 +174,69 @@ public Void visitLogicalAggregate(LogicalAggregate aggregate, }); } + @Test + void testIncrParamsMapToInformationKindAndDefaultPosition() throws Exception { + BindRelation bindRelation = new BindRelation(); + Method method = BindRelation.class.getDeclaredMethod("buildChangeScanInfo", TableScanParams.class); + method.setAccessible(true); + + String startTimestamp = "2026-05-25 20:51:28"; + String endTimestamp = "2026-05-25 20:54:00"; + long startTimestampMillis = 
TimeUtils.timeStringToLong(startTimestamp); + long endTimestampMillis = TimeUtils.timeStringToLong(endTimestamp); + + ChangeScanInfo changeScanInfo = (ChangeScanInfo) method.invoke(bindRelation, + new TableScanParams("incr", + ImmutableMap.of("incrementType", "MIN_DELTA", + "startTimestamp", startTimestamp, + "endTimestamp", endTimestamp), null)); + Assertions.assertEquals(ChangeScanInfo.InformationKind.MIN_DELTA, changeScanInfo.getInformationKind()); + Assertions.assertEquals(ChangeScanInfo.Position.forTimestamp(startTimestampMillis), changeScanInfo.getAt()); + Assertions.assertEquals(Optional.of(ChangeScanInfo.Position.forTimestamp(endTimestampMillis)), + changeScanInfo.getEnd()); + + changeScanInfo = (ChangeScanInfo) method.invoke(bindRelation, + new TableScanParams("incr", + ImmutableMap.of("incrementType", "APPEND_ONLY", + "startTimestamp", startTimestamp, + "endTimestamp", endTimestamp), null)); + Assertions.assertEquals(ChangeScanInfo.InformationKind.APPEND_ONLY, changeScanInfo.getInformationKind()); + Assertions.assertEquals(ChangeScanInfo.Position.forTimestamp(startTimestampMillis), changeScanInfo.getAt()); + Assertions.assertEquals(Optional.of(ChangeScanInfo.Position.forTimestamp(endTimestampMillis)), + changeScanInfo.getEnd()); + + changeScanInfo = (ChangeScanInfo) method.invoke(bindRelation, + new TableScanParams("incr", + ImmutableMap.of("incrementType", "DETAIL", + "startTimestamp", startTimestamp, + "endTimestamp", endTimestamp), null)); + Assertions.assertEquals(ChangeScanInfo.InformationKind.DETAIL, changeScanInfo.getInformationKind()); + Assertions.assertEquals(ChangeScanInfo.Position.forTimestamp(startTimestampMillis), changeScanInfo.getAt()); + Assertions.assertEquals(Optional.of(ChangeScanInfo.Position.forTimestamp(endTimestampMillis)), + changeScanInfo.getEnd()); + + changeScanInfo = (ChangeScanInfo) method.invoke(bindRelation, + new TableScanParams("incr", + ImmutableMap.of("incrementType", "DETAIL", + "startTimestamp", startTimestamp), 
null)); + Assertions.assertEquals(ChangeScanInfo.InformationKind.DETAIL, changeScanInfo.getInformationKind()); + Assertions.assertEquals(ChangeScanInfo.Position.forTimestamp(startTimestampMillis), changeScanInfo.getAt()); + Assertions.assertEquals(Optional.empty(), changeScanInfo.getEnd()); + + changeScanInfo = (ChangeScanInfo) method.invoke(bindRelation, + new TableScanParams("incr", + ImmutableMap.of("incrementType", "DETAIL"), null)); + Assertions.assertEquals(ChangeScanInfo.InformationKind.DETAIL, changeScanInfo.getInformationKind()); + Assertions.assertEquals(ChangeScanInfo.Position.forTimestamp(0L), changeScanInfo.getAt()); + Assertions.assertEquals(Optional.empty(), changeScanInfo.getEnd()); + + changeScanInfo = (ChangeScanInfo) method.invoke(bindRelation, + new TableScanParams("incr", ImmutableMap.of(), null)); + Assertions.assertEquals(ChangeScanInfo.InformationKind.DETAIL, changeScanInfo.getInformationKind()); + Assertions.assertEquals(ChangeScanInfo.Position.forTimestamp(0L), changeScanInfo.getAt()); + Assertions.assertEquals(Optional.empty(), changeScanInfo.getEnd()); + } + @Override public RulePromise defaultPromise() { return RulePromise.REWRITE; diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/plans/ExplainTableStreamPlanTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/plans/ExplainTableStreamPlanTest.java index fa8bed591da1b2..c311121e4fd570 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/plans/ExplainTableStreamPlanTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/plans/ExplainTableStreamPlanTest.java @@ -17,14 +17,20 @@ package org.apache.doris.nereids.trees.plans; +import org.apache.doris.analysis.CaseExpr; +import org.apache.doris.analysis.Expr; import org.apache.doris.catalog.Column; import org.apache.doris.catalog.Database; import org.apache.doris.catalog.Env; import org.apache.doris.catalog.MaterializedIndex; +import 
org.apache.doris.catalog.MaterializedIndex.IndexExtState; import org.apache.doris.catalog.OlapTable; import org.apache.doris.catalog.Partition; +import org.apache.doris.catalog.Replica; import org.apache.doris.catalog.Tablet; +import org.apache.doris.catalog.stream.BaseTableStream; import org.apache.doris.catalog.stream.OlapTableStream; +import org.apache.doris.catalog.stream.OlapTableStreamUpdate; import org.apache.doris.common.Config; import org.apache.doris.common.FeConstants; import org.apache.doris.nereids.NereidsPlanner; @@ -36,11 +42,13 @@ import org.apache.doris.nereids.trees.expressions.Alias; import org.apache.doris.nereids.trees.expressions.NamedExpression; import org.apache.doris.nereids.trees.expressions.StatementScopeIdGenerator; +import org.apache.doris.nereids.trees.expressions.WhenClause; import org.apache.doris.nereids.trees.expressions.literal.BigIntLiteral; import org.apache.doris.nereids.trees.expressions.literal.VarcharLiteral; import org.apache.doris.nereids.trees.plans.commands.ExplainCommand; import org.apache.doris.nereids.trees.plans.logical.LogicalPlan; import org.apache.doris.nereids.trees.plans.logical.LogicalProject; +import org.apache.doris.nereids.trees.plans.logical.LogicalUnion; import org.apache.doris.nereids.trees.plans.physical.PhysicalPlan; import org.apache.doris.nereids.util.MemoTestUtils; import org.apache.doris.nereids.util.PlanChecker; @@ -48,6 +56,9 @@ import org.apache.doris.planner.PlanFragment; import org.apache.doris.planner.PlanNode; import org.apache.doris.qe.ConnectContext; +import org.apache.doris.thrift.TBinlogReadSource; +import org.apache.doris.thrift.TBinlogScanType; +import org.apache.doris.thrift.TPaloScanRange; import org.apache.doris.thrift.TScanRangeLocations; import org.apache.doris.utframe.TestWithFeService; @@ -71,6 +82,7 @@ public void runBeforeAll() throws Exception { FeConstants.runningUnitTest = true; Config.allow_replica_on_same_host = true; Config.enable_table_stream = true; + 
Config.enable_feature_binlog = true; createDatabase("test_stream"); connectContext.setDatabase("test_stream"); @@ -79,25 +91,47 @@ public void runBeforeAll() throws Exception { + " k1 int,\n" + " k2 int\n" + ")\n" - + "duplicate key(k1)\n" + + "unique key(k1)\n" + "partition by range(k1)\n" + "(partition p1 values less than (\"100\"),\n" + " partition p2 values less than (\"200\"))\n" + "distributed by hash(k1) buckets 1\n" - + "properties(\"replication_num\"=\"1\")"; + + "properties(\"replication_num\"=\"1\"," + + "\"enable_unique_key_merge_on_write\"=\"true\"," + + "\"binlog.enable\"=\"true\",\"binlog.format\"=\"ROW\"," + + "\"binlog.need_historical_value\"=\"true\")"; createTable(createBaseTable); + Database db = (Database) Env.getCurrentInternalCatalog().getDbOrMetaException("test_stream"); + OlapTable baseTable = (OlapTable) db.getTableOrMetaException("tbl_stream_base"); + // Bump base table partition + replica versions to a value larger than the initial version + // before creating s1 so that s1 populates historicalPartitionOffset (history path). + bumpPartitionsAndReplicas(baseTable, 1001L); + String createStream = "create stream if not exists test_stream.s1 on table test_stream.tbl_stream_base\n" + "properties('type' = 'default', 'show_initial_rows' = 'true')"; createTable(createStream); - // Make base table visible versions differ from stream offsets, so we can verify - // scan range version uses stream partitionOffset rather than partition visibleVersion. - Database db = (Database) Env.getCurrentInternalCatalog().getDbOrMetaException("test_stream"); - OlapTable baseTable = (OlapTable) db.getTableOrMetaException("tbl_stream_base"); - for (Partition partition : baseTable.getPartitions()) { - partition.setVisibleVersionAndTime(partition.getVisibleVersion() + 1000, - System.currentTimeMillis()); + // Create another stream s2 without showing initial rows, then bump versions again so + // s2 has incremental data (no historicalPartitionOffset). 
+ String createIncrementalStream = + "create stream if not exists test_stream.s2 on table test_stream.tbl_stream_base\n" + + "properties('type' = 'default', 'show_initial_rows' = 'false')"; + createTable(createIncrementalStream); + bumpPartitionsAndReplicas(baseTable, 2002L); + } + + private static void bumpPartitionsAndReplicas(OlapTable table, long newVersion) { + for (Partition partition : table.getPartitions()) { + partition.setVisibleVersionAndTime(newVersion, System.currentTimeMillis()); + partition.setNextVersion(newVersion + 1); + for (MaterializedIndex index : partition.getMaterializedIndices(IndexExtState.VISIBLE)) { + for (Tablet tablet : index.getTablets()) { + for (Replica replica : tablet.getReplicas()) { + replica.updateVersion(newVersion); + } + } + } } } @@ -241,6 +275,165 @@ public void testPartitionOffsetAndScanRangeVersion() throws Exception { } } + @Test + public void testIncrementalScanChangeTypeProjectionIsCaseExpr() throws Exception { + ConnectContext ctx = createDefaultCtx(); + ctx.setDatabase("test_stream"); + ctx.getSessionVariable().showHiddenColumns = true; + + StatementScopeIdGenerator.clear(); + PlanFragment fragment = getFragment(ctx, "explain select * from test_stream.s2"); + PlanNode root = fragment.getPlanRoot(); + + List scanNodes = new ArrayList<>(); + collectOlapScanNodes(root, scanNodes); + Assertions.assertFalse(scanNodes.isEmpty()); + + boolean foundIncrementalCaseExpr = false; + for (OlapScanNode scanNode : scanNodes) { + List projectList = scanNode.getProjectList(); + if (projectList == null || projectList.size() < 4) { + continue; + } + Expr changeTypeExpr = projectList.get(3); + if (changeTypeExpr instanceof CaseExpr) { + foundIncrementalCaseExpr = true; + break; + } + } + Assertions.assertTrue(foundIncrementalCaseExpr, + "incremental stream scan should keep change type projection as CaseExpr"); + } + + @Test + public void testIncrementalScanRangeUsesPartitionOffsetAsStartTSO() throws Exception { + // s2 was created with 
show_initial_rows=false, so partitionOffset is seeded to the + // visibleVersionTime captured at creation time. After we bumped versions in + // runBeforeAll, the current partition.visibleVersionTime is strictly greater. The + // scan range should advertise a STREAM read source bounded by [partitionOffset, + // partition.visibleVersionTime). + Database db = (Database) Env.getCurrentInternalCatalog().getDbOrMetaException("test_stream"); + OlapTable baseTable = (OlapTable) db.getTableOrMetaException("tbl_stream_base"); + OlapTableStream stream = (OlapTableStream) db.getTableOrMetaException("s2"); + + ConnectContext ctx = createDefaultCtx(); + ctx.setDatabase("test_stream"); + ctx.getSessionVariable().showHiddenColumns = true; + StatementScopeIdGenerator.clear(); + PlanFragment fragment = getFragment(ctx, "explain select * from test_stream.s2"); + + List scanNodes = new ArrayList<>(); + collectOlapScanNodes(fragment.getPlanRoot(), scanNodes); + Assertions.assertFalse(scanNodes.isEmpty()); + + TBinlogScanType expectedScanType = BaseTableStream.StreamScanType.toThrift(stream.getConsumeType()); + Map tabletIdToPartitionId = new java.util.HashMap<>(); + for (Partition partition : baseTable.getPartitions()) { + MaterializedIndex baseIndex = partition.getIndex(baseTable.getBaseIndexId()); + for (Tablet tablet : baseIndex.getTablets()) { + tabletIdToPartitionId.put(tablet.getId(), partition.getId()); + } + } + + boolean assertedAtLeastOne = false; + for (OlapScanNode scanNode : scanNodes) { + if (!scanNode.isIncrementalScan()) { + continue; + } + List locations = scanNode.getScanRangeLocations(Long.MAX_VALUE); + Assertions.assertFalse(locations.isEmpty()); + for (TScanRangeLocations loc : locations) { + TPaloScanRange range = loc.getScanRange().getPaloScanRange(); + long tabletId = range.getTabletId(); + long pid = tabletIdToPartitionId.get(tabletId); + long expectedStart = stream.getStreamUpdate(pid).first; + Assertions.assertEquals(TBinlogReadSource.STREAM, 
range.getBinlogReadSource(), + "incremental scan should use STREAM as binlog read source"); + Assertions.assertEquals(expectedScanType, range.getBinlogScanType(), + "binlog scan type should match stream consume type"); + Assertions.assertEquals(expectedStart, range.getStartTso(), + "startTSO should equal stream partitionOffset (last committed binlog TSO)"); + assertedAtLeastOne = true; + } + } + Assertions.assertTrue(assertedAtLeastOne, + "expected at least one incremental scan range to assert binlog TSO bounds against"); + } + + @Test + public void testIncrementalScanStartTsoAdvancesAfterOffsetCommit() throws Exception { + // Closed-loop validation: after the offset is advanced via unprotectedUpdateStreamUpdate, + // a subsequent explain plan must use the new offset as the startTSO. + Database db = (Database) Env.getCurrentInternalCatalog().getDbOrMetaException("test_stream"); + OlapTable baseTable = (OlapTable) db.getTableOrMetaException("tbl_stream_base"); + OlapTableStream stream = (OlapTableStream) db.getTableOrMetaException("s2"); + + ConnectContext ctx = createDefaultCtx(); + ctx.setDatabase("test_stream"); + ctx.getSessionVariable().showHiddenColumns = true; + + // 1st explain: capture current scan node's stream update (prev/next per partition). + StatementScopeIdGenerator.clear(); + PlanFragment fragment1 = getFragment(ctx, "explain select * from test_stream.s2"); + List scanNodes1 = new ArrayList<>(); + collectOlapScanNodes(fragment1.getPlanRoot(), scanNodes1); + OlapScanNode incrementalScan1 = null; + for (OlapScanNode scanNode : scanNodes1) { + if (scanNode.isIncrementalScan()) { + incrementalScan1 = scanNode; + break; + } + } + Assertions.assertNotNull(incrementalScan1); + OlapTableStreamUpdate update = incrementalScan1.getStreamUpdate(); + Map nextOffsets = new java.util.HashMap<>(update.getNext()); + Assertions.assertFalse(nextOffsets.isEmpty()); + + // Commit the offsets. 
+ baseTable.writeLock(); + try { + stream.unprotectedUpdateStreamUpdate(update, System.currentTimeMillis()); + } finally { + baseTable.writeUnlock(); + } + for (Map.Entry entry : nextOffsets.entrySet()) { + Assertions.assertEquals(entry.getValue(), stream.getStreamUpdate(entry.getKey()).first, + "partitionOffset should be advanced to committed next TSO"); + } + + // Simulate another BE-side commit by bumping partition versions further. + bumpPartitionsAndReplicas(baseTable, 3003L); + + // 2nd explain: new startTSO should equal the previous explain's endTSO. + StatementScopeIdGenerator.clear(); + PlanFragment fragment2 = getFragment(ctx, "explain select * from test_stream.s2"); + List scanNodes2 = new ArrayList<>(); + collectOlapScanNodes(fragment2.getPlanRoot(), scanNodes2); + + Map tabletIdToPartitionId = new java.util.HashMap<>(); + for (Partition partition : baseTable.getPartitions()) { + MaterializedIndex baseIndex = partition.getIndex(baseTable.getBaseIndexId()); + for (Tablet tablet : baseIndex.getTablets()) { + tabletIdToPartitionId.put(tablet.getId(), partition.getId()); + } + } + boolean assertedAtLeastOne = false; + for (OlapScanNode scanNode : scanNodes2) { + if (!scanNode.isIncrementalScan()) { + continue; + } + List locations = scanNode.getScanRangeLocations(Long.MAX_VALUE); + for (TScanRangeLocations loc : locations) { + TPaloScanRange range = loc.getScanRange().getPaloScanRange(); + long pid = tabletIdToPartitionId.get(range.getTabletId()); + Assertions.assertEquals(nextOffsets.get(pid), range.getStartTso(), + "after offset commit, new startTSO must equal the previously committed next TSO"); + assertedAtLeastOne = true; + } + } + Assertions.assertTrue(assertedAtLeastOne); + } + private void collectOlapScanNodes(PlanNode node, List result) { if (node instanceof OlapScanNode) { result.add((OlapScanNode) node); @@ -263,12 +456,45 @@ private LogicalProject findFirstLogicalProject(Plan plan) { return null; } + private LogicalUnion 
findFirstLogicalUnion(Plan plan) { + if (plan instanceof LogicalUnion) { + return (LogicalUnion) plan; + } + for (Plan child : plan.children()) { + LogicalUnion found = findFirstLogicalUnion(child); + if (found != null) { + return found; + } + } + return null; + } + + private void collectLogicalProjects(Plan plan, List> result) { + if (plan instanceof LogicalProject) { + result.add((LogicalProject) plan); + } + for (Plan child : plan.children()) { + collectLogicalProjects(child, result); + } + } + + private void assertWhenClause(WhenClause whenClause, long expectedOpCode, String expectedResult) { + Assertions.assertTrue(whenClause.getOperand() instanceof BigIntLiteral); + Assertions.assertEquals(expectedOpCode, ((BigIntLiteral) whenClause.getOperand()).getValue()); + Assertions.assertTrue(whenClause.getResult() instanceof VarcharLiteral); + Assertions.assertEquals(expectedResult, ((VarcharLiteral) whenClause.getResult()).getValue()); + } + private PlanFragment getFragment(String sql) throws Exception { + return getFragment(connectContext, sql); + } + + private PlanFragment getFragment(ConnectContext ctx, String sql) throws Exception { StatementScopeIdGenerator.clear(); - StatementContext statementContext = MemoTestUtils.createStatementContext(connectContext, sql); + StatementContext statementContext = MemoTestUtils.createStatementContext(ctx, sql); NereidsPlanner planner = new NereidsPlanner(statementContext); LogicalPlan logicalPlan = (LogicalPlan) ((Explainable) (((ExplainCommand) parser.parseSingle(sql)) - .getLogicalPlan())).getExplainPlan(connectContext); + .getLogicalPlan())).getExplainPlan(ctx); PhysicalPlan plan = planner.planWithLock(logicalPlan, PhysicalProperties.ANY); return new PhysicalPlanTranslator(new PlanTranslatorContext(planner.getCascadesContext())) .translatePlan(plan); diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/plans/commands/insert/InsertIntoTableCommandTableStreamTest.java 
b/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/plans/commands/insert/InsertIntoTableCommandTableStreamTest.java index 6a6bc7e3e81f11..40affba0e2d0d1 100755 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/plans/commands/insert/InsertIntoTableCommandTableStreamTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/plans/commands/insert/InsertIntoTableCommandTableStreamTest.java @@ -19,6 +19,12 @@ import org.apache.doris.catalog.Database; import org.apache.doris.catalog.Env; +import org.apache.doris.catalog.MaterializedIndex; +import org.apache.doris.catalog.MaterializedIndex.IndexExtState; +import org.apache.doris.catalog.OlapTable; +import org.apache.doris.catalog.Partition; +import org.apache.doris.catalog.Replica; +import org.apache.doris.catalog.Tablet; import org.apache.doris.catalog.stream.AbstractTableStreamUpdate; import org.apache.doris.catalog.stream.OlapTableStream; import org.apache.doris.catalog.stream.OlapTableStreamUpdate; @@ -36,7 +42,9 @@ import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.UUID; public class InsertIntoTableCommandTableStreamTest extends TestWithFeService { @@ -48,6 +56,7 @@ public void runBeforeAll() throws Exception { FeConstants.runningUnitTest = true; Config.allow_replica_on_same_host = true; Config.enable_table_stream = true; + Config.enable_feature_binlog = true; createDatabase("test_stream"); connectContext.setDatabase("test_stream"); @@ -56,12 +65,15 @@ public void runBeforeAll() throws Exception { + " k1 int,\n" + " k2 int\n" + ")\n" - + "duplicate key(k1)\n" + + "unique key(k1)\n" + "partition by range(k1)\n" + "(partition p1 values less than (\"100\"),\n" + " partition p2 values less than (\"200\"))\n" + "distributed by hash(k1) buckets 1\n" - + "properties(\"replication_num\"=\"1\")"; + + "properties(\"replication_num\"=\"1\"," + + 
"\"enable_unique_key_merge_on_write\"=\"true\"," + + "\"binlog.enable\"=\"true\",\"binlog.format\"=\"ROW\"," + + "\"binlog.need_historical_value\"=\"true\")"; createTable(createBaseTable); String createTargetTable = "create table test_stream.tbl_target (\n" @@ -127,4 +139,194 @@ public void testInitPlanCollectsStreamUpdateInfosForHistoricalConsume() throws E AbstractTableStreamUpdate txnUpdate = txnInfo.getUpdate(); Assertions.assertEquals(update.getNext(), ((OlapTableStreamUpdate) txnUpdate).getNext()); } + + @Test + public void testUnprotectedUpdateAdvancesPartitionOffsetAndConsumptionTime() throws Exception { + // (B1) When no historicalPartitionOffset is present, unprotectedUpdateStreamUpdate should + // advance partitionOffset to the committed `next` TSO, record partitionConsumptionTime, + // and leave hasHistoricalData() false. + Database db = (Database) Env.getCurrentInternalCatalog().getDbOrMetaException("test_stream"); + OlapTable baseTable = (OlapTable) db.getTableOrMetaException("tbl_stream_base"); + // Create a fresh stream without showing initial rows so historicalPartitionOffset is empty. + createTable("create stream if not exists test_stream.s_no_init on table test_stream.tbl_stream_base\n" + + "properties('type' = 'default', 'show_initial_rows' = 'false')"); + OlapTableStream stream = (OlapTableStream) db.getTableOrMetaException("s_no_init"); + + Map prev = new HashMap<>(); + Map next = new HashMap<>(); + long offset = 0; + for (Partition partition : baseTable.getPartitions()) { + long pid = partition.getId(); + long previousOffset = stream.getStreamUpdate(pid).first; + Assertions.assertNotNull(previousOffset); + Assertions.assertFalse(stream.hasHistoricalData(pid)); + prev.put(pid, previousOffset); + // Construct a strictly larger TSO so we can detect the advance. 
+ next.put(pid, previousOffset + 4242 + offset); + offset++; + } + OlapTableStreamUpdate update = new OlapTableStreamUpdate(prev, next); + + long ts = 12345L; + baseTable.writeLock(); + try { + stream.unprotectedUpdateStreamUpdate(update, ts); + } finally { + baseTable.writeUnlock(); + } + + for (Map.Entry entry : next.entrySet()) { + long pid = entry.getKey(); + Assertions.assertEquals(entry.getValue(), stream.getStreamUpdate(pid).first, + "partitionOffset must be advanced to next TSO"); + Assertions.assertFalse(stream.hasHistoricalData(pid), + "no historical offset should be present after advance without prior history"); + } + // partitionConsumptionTime is private; verify via reflection on the same field name. + @SuppressWarnings("unchecked") + Map consumptionTime = (Map) Deencapsulation.getField(stream, + "partitionConsumptionTime"); + for (Long pid : next.keySet()) { + Assertions.assertEquals(Long.valueOf(ts), consumptionTime.get(pid), + "partitionConsumptionTime must be recorded with the commit ts"); + } + } + + @Test + public void testUnprotectedUpdateClearsAndPromotesHistoryOffset() throws Exception { + // (B2) When historicalPartitionOffset is present, the commit must: + // - remove the entry from historicalPartitionOffset + // - promote historicalPartitionTSO into partitionOffset (NOT use update.next) + // - clear the historicalPartitionTSO entry + // - still set partitionConsumptionTime = ts. 
+ Database db = (Database) Env.getCurrentInternalCatalog().getDbOrMetaException("test_stream"); + OlapTable baseTable = (OlapTable) db.getTableOrMetaException("tbl_stream_base"); + createTable("create stream if not exists test_stream.s_history on table test_stream.tbl_stream_base\n" + + "properties('type' = 'default', 'show_initial_rows' = 'false')"); + OlapTableStream stream = (OlapTableStream) db.getTableOrMetaException("s_history"); + + @SuppressWarnings("unchecked") + Map historicalPartitionOffset = (Map) Deencapsulation.getField(stream, + "historicalPartitionOffset"); + @SuppressWarnings("unchecked") + Map historicalPartitionTSO = (Map) Deencapsulation.getField(stream, + "historicalPartitionTSO"); + + Map prev = new HashMap<>(); + Map next = new HashMap<>(); + Map expectedPromotedTso = new HashMap<>(); + long seed = 0; + for (Partition partition : baseTable.getPartitions()) { + long pid = partition.getId(); + long histVer = 7000L + seed; + long histTso = 8000L + seed; + long ignoredNextTso = 9999L + seed; + historicalPartitionOffset.put(pid, histVer); + historicalPartitionTSO.put(pid, histTso); + prev.put(pid, histVer); + next.put(pid, ignoredNextTso); + expectedPromotedTso.put(pid, histTso); + seed++; + } + OlapTableStreamUpdate update = new OlapTableStreamUpdate(prev, next); + + long ts = 67890L; + baseTable.writeLock(); + try { + stream.unprotectedUpdateStreamUpdate(update, ts); + } finally { + baseTable.writeUnlock(); + } + + @SuppressWarnings("unchecked") + Map consumptionTime = (Map) Deencapsulation.getField(stream, + "partitionConsumptionTime"); + for (Map.Entry entry : expectedPromotedTso.entrySet()) { + long pid = entry.getKey(); + Assertions.assertFalse(stream.hasHistoricalData(pid), + "historicalPartitionOffset must be cleared after commit"); + Assertions.assertFalse(historicalPartitionTSO.containsKey(pid), + "historicalPartitionTSO must be cleared after commit"); + Assertions.assertEquals(entry.getValue(), stream.getStreamUpdate(pid).first, + 
"partitionOffset must be promoted from historicalPartitionTSO, not update.next"); + Assertions.assertEquals(Long.valueOf(ts), consumptionTime.get(pid), + "partitionConsumptionTime must be recorded with the commit ts"); + } + } + + @Test + public void testInsertProducedStreamUpdateNextMatchesHistoryAndVisibleTime() throws Exception { + // (C) End-to-end FE-side contract: the OlapTableStreamUpdate produced by the insert + // path's planner must carry, for each selected partition, + // next == historicalPartitionOffset[pid] when present, + // else partition.getVisibleVersionTime(). + Database db = (Database) Env.getCurrentInternalCatalog().getDbOrMetaException("test_stream"); + OlapTable baseTable = (OlapTable) db.getTableOrMetaException("tbl_stream_base"); + // s1 was created with show_initial_rows=true. At creation time partitions had + // visibleVersion=1 (init), so historicalPartitionOffset is empty. Manually seed + // it for one partition to also cover the history branch. + OlapTableStream stream = (OlapTableStream) db.getTableOrMetaException("s1"); + @SuppressWarnings("unchecked") + Map historicalPartitionOffset = (Map) Deencapsulation.getField(stream, + "historicalPartitionOffset"); + @SuppressWarnings("unchecked") + Map historicalPartitionTSO = (Map) Deencapsulation.getField(stream, + "historicalPartitionTSO"); + + // Bump partition versions so partition.getVisibleVersionTime() differs from any + // initial seed and is a meaningful upper bound for non-history partitions. + for (Partition partition : baseTable.getPartitions()) { + long newVer = 5000L + partition.getId() % 1000; + partition.setVisibleVersionAndTime(newVer, newVer); + partition.setNextVersion(newVer + 1); + for (MaterializedIndex index : partition.getMaterializedIndices(IndexExtState.VISIBLE)) { + for (Tablet tablet : index.getTablets()) { + for (Replica replica : tablet.getReplicas()) { + replica.updateVersion(newVer); + } + } + } + } + + // Pick the first partition to fall on the history path. 
+ Partition historyPartition = baseTable.getPartitions().iterator().next(); + long historyPid = historyPartition.getId(); + long historyOffset = 4242L; + long historyTso = 6789L; + historicalPartitionOffset.put(historyPid, historyOffset); + historicalPartitionTSO.put(historyPid, historyTso); + + String sql = "insert into test_stream.tbl_target select * from test_stream.s1"; + LogicalPlan logicalPlan = parser.parseSingle(sql); + Assertions.assertTrue(logicalPlan instanceof InsertIntoTableCommand); + + connectContext.setStartTime(); + UUID uuid = UUID.randomUUID(); + connectContext.setQueryId(new TUniqueId(uuid.getMostSignificantBits(), uuid.getLeastSignificantBits())); + + StmtExecutor executor = new StmtExecutor(connectContext, sql); + InsertIntoTableCommand command = (InsertIntoTableCommand) logicalPlan; + AbstractInsertExecutor insertExecutor = command.initPlan(connectContext, executor, true); + + List streamUpdateInfos = insertExecutor.getStreamUpdateInfos(); + Assertions.assertEquals(1, streamUpdateInfos.size()); + OlapTableStreamUpdate update = (OlapTableStreamUpdate) streamUpdateInfos.get(0).getUpdate(); + Map producedNext = update.getNext(); + Assertions.assertFalse(producedNext.isEmpty()); + + for (Map.Entry entry : producedNext.entrySet()) { + long pid = entry.getKey(); + long actualNext = entry.getValue(); + if (pid == historyPid) { + // History path: next equals historicalPartitionOffset[pid]. + Assertions.assertEquals(historyOffset, actualNext, + "next of history partition must equal historicalPartitionOffset"); + } else { + // Incremental path: next equals partition.visibleVersionTime. 
+ long expected = baseTable.getPartition(pid).getVisibleVersionTime(); + Assertions.assertEquals(expected, actualNext, + "next of incremental partition must equal partition.visibleVersionTime"); + } + } + } } diff --git a/fe/fe-core/src/test/java/org/apache/doris/transaction/DatabaseTransactionMgrTest.java b/fe/fe-core/src/test/java/org/apache/doris/transaction/DatabaseTransactionMgrTest.java index c1703fc4136d02..126f3492f801c8 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/transaction/DatabaseTransactionMgrTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/transaction/DatabaseTransactionMgrTest.java @@ -17,6 +17,7 @@ package org.apache.doris.transaction; +import org.apache.doris.catalog.BinlogConfig; import org.apache.doris.catalog.CatalogTestUtil; import org.apache.doris.catalog.Env; import org.apache.doris.catalog.FakeEditLog; @@ -611,17 +612,26 @@ private void addSubTransaction() throws UserException { LabelToTxnId.put(CatalogTestUtil.testTxnLabel8, transactionState8.getTransactionId()); } + /** + * Sets table binlog format so tests can control table-level TSO without the legacy enable_tso property. 
+ */ + private void setTableBinlogFormat(OlapTable table, BinlogConfig.BinlogFormat binlogFormat) { + BinlogConfig binlogConfig = new BinlogConfig(table.getBinlogConfig()); + binlogConfig.setBinlogFormat(binlogFormat); + table.setBinlogConfig(binlogConfig); + } + @Test public void testCommitTransactionSetsCommitTSOWhenEnableTso() throws Exception { - boolean originalEnableTsoFeature = Config.enable_tso_feature; + boolean originalEnableFeatureBinlog = Config.enable_feature_binlog; try { - Config.enable_tso_feature = true; + Config.enable_feature_binlog = true; FakeEnv.setEnv(masterEnv); OlapTable table = (OlapTable) masterEnv.getInternalCatalog() .getDbOrMetaException(CatalogTestUtil.testDbId1) .getTableOrMetaException(CatalogTestUtil.testTableId1); - table.setEnableTso(true); + setTableBinlogFormat(table, BinlogConfig.BinlogFormat.ROW); long expectedCommitTSO = 12345L; TSOService tsoService = Mockito.mock(TSOService.class); @@ -645,21 +655,21 @@ public void testCommitTransactionSetsCommitTSOWhenEnableTso() throws Exception { Assert.assertNotNull(tableCommitInfo); Assert.assertEquals(expectedCommitTSO, tableCommitInfo.getCommitTSO()); } finally { - Config.enable_tso_feature = originalEnableTsoFeature; + Config.enable_feature_binlog = originalEnableFeatureBinlog; } } @Test public void testCommitTransactionCommitTSORemainsMinusOneWhenTableDisableTso() throws Exception { - boolean originalEnableTsoFeature = Config.enable_tso_feature; + boolean originalEnableFeatureBinlog = Config.enable_feature_binlog; try { - Config.enable_tso_feature = true; + Config.enable_feature_binlog = true; FakeEnv.setEnv(masterEnv); OlapTable table = (OlapTable) masterEnv.getInternalCatalog() .getDbOrMetaException(CatalogTestUtil.testDbId1) .getTableOrMetaException(CatalogTestUtil.testTableId1); - table.setEnableTso(false); + setTableBinlogFormat(table, BinlogConfig.BinlogFormat.STATEMENT_AND_SNAPSHOT); TSOService tsoService = Mockito.mock(TSOService.class); 
Mockito.when(tsoService.getTSO()).thenReturn(12345L); @@ -682,21 +692,21 @@ public void testCommitTransactionCommitTSORemainsMinusOneWhenTableDisableTso() t Assert.assertNotNull(tableCommitInfo); Assert.assertEquals(-1L, tableCommitInfo.getCommitTSO()); } finally { - Config.enable_tso_feature = originalEnableTsoFeature; + Config.enable_feature_binlog = originalEnableFeatureBinlog; } } @Test public void testCommitTransactionFailsWhenGetTSOInvalid() throws Exception { - boolean originalEnableTsoFeature = Config.enable_tso_feature; + boolean originalEnableFeatureBinlog = Config.enable_feature_binlog; try { - Config.enable_tso_feature = true; + Config.enable_feature_binlog = true; FakeEnv.setEnv(masterEnv); OlapTable table = (OlapTable) masterEnv.getInternalCatalog() .getDbOrMetaException(CatalogTestUtil.testDbId1) .getTableOrMetaException(CatalogTestUtil.testTableId1); - table.setEnableTso(true); + setTableBinlogFormat(table, BinlogConfig.BinlogFormat.ROW); TSOService tsoService = Mockito.mock(TSOService.class); Mockito.when(tsoService.getTSO()).thenReturn(-1L); @@ -717,7 +727,7 @@ public void testCommitTransactionFailsWhenGetTSOInvalid() throws Exception { Assert.assertTrue(e.getMessage().contains("failed to get TSO")); } } finally { - Config.enable_tso_feature = originalEnableTsoFeature; + Config.enable_feature_binlog = originalEnableFeatureBinlog; } } diff --git a/fe/fe-core/src/test/java/org/apache/doris/transaction/TableStreamOffsetTransactionTest.java b/fe/fe-core/src/test/java/org/apache/doris/transaction/TableStreamOffsetTransactionTest.java index a75b780786528a..b4962f1af05042 100755 --- a/fe/fe-core/src/test/java/org/apache/doris/transaction/TableStreamOffsetTransactionTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/transaction/TableStreamOffsetTransactionTest.java @@ -54,12 +54,13 @@ public void runBeforeAll() throws Exception { + " k1 int,\n" + " k2 int\n" + ")\n" - + "duplicate key(k1)\n" + + "unique key(k1)\n" + "partition by range(k1)\n" + 
"(partition p1 values less than (\"100\"),\n" + " partition p2 values less than (\"200\"))\n" + "distributed by hash(k1) buckets 1\n" - + "properties(\"replication_num\"=\"1\")"; + + "properties(\"replication_num\"=\"1\",\"binlog.enable\"=\"true\"," + + "\"binlog.format\"=\"ROW\",\"binlog.need_historical_value\"=\"true\")"; createTable(createBaseTable); String createTargetTable = "create table test_stream.tbl_target (\n" @@ -85,12 +86,16 @@ public void testHistoricalConsumeOffsetCheckAndUpdate() throws Exception { List partitionIds = new ArrayList<>(baseTable.getPartitionIds()); Map historicalPartitionOffset = new HashMap<>(); + Map historicalPartitionTSO = new HashMap<>(); Map partitionOffset = new HashMap<>(); for (Long partitionId : partitionIds) { historicalPartitionOffset.put(partitionId, 100L); + historicalPartitionTSO.put(partitionId, 1000L + partitionId); partitionOffset.put(partitionId, 0L); } + Map expectedPartitionOffset = new HashMap<>(historicalPartitionTSO); Deencapsulation.setField(stream, "historicalPartitionOffset", historicalPartitionOffset); + Deencapsulation.setField(stream, "historicalPartitionTSO", historicalPartitionTSO); Deencapsulation.setField(stream, "partitionOffset", partitionOffset); OlapTableStreamUpdate update = new OlapTableStreamUpdate(new HashMap<>(), @@ -115,12 +120,14 @@ public void testHistoricalConsumeOffsetCheckAndUpdate() throws Exception { Deencapsulation.invoke(dbTxnMgr, "updateStreamOffset", transactionState, commitTime); Map updatedHistoricalPartitionOffset = Deencapsulation.getField(stream, "historicalPartitionOffset"); + Map updatedHistoricalPartitionTSO = Deencapsulation.getField(stream, "historicalPartitionTSO"); Map updatedPartitionOffset = Deencapsulation.getField(stream, "partitionOffset"); Map partitionConsumptionTime = Deencapsulation.getField(stream, "partitionConsumptionTime"); for (Long pid : partitionIds) { Assertions.assertFalse(updatedHistoricalPartitionOffset.containsKey(pid)); - 
Assertions.assertEquals(update.getNext().get(pid), updatedPartitionOffset.get(pid)); + Assertions.assertFalse(updatedHistoricalPartitionTSO.containsKey(pid)); + Assertions.assertEquals(expectedPartitionOffset.get(pid), updatedPartitionOffset.get(pid)); Assertions.assertEquals(commitTime, partitionConsumptionTime.get(pid)); } } diff --git a/fe/fe-core/src/test/java/org/apache/doris/tso/TSOServiceTest.java b/fe/fe-core/src/test/java/org/apache/doris/tso/TSOServiceTest.java index a4a0e233598a61..59740952e07298 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/tso/TSOServiceTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/tso/TSOServiceTest.java @@ -53,7 +53,7 @@ public class TSOServiceTest { private int originalMaxGetTSORetryCount; private int originalMaxUpdateRetryCount; private int originalUpdateIntervalMs; - private boolean originalEnableTsoPersistJournal; + private boolean originalEnableFeatureBinlog; private long originalClockBackwardThresholdMs; @Before @@ -63,13 +63,13 @@ public void setUp() { originalMaxGetTSORetryCount = Config.tso_max_get_retry_count; originalMaxUpdateRetryCount = Config.tso_max_update_retry_count; originalUpdateIntervalMs = Config.tso_service_update_interval_ms; - originalEnableTsoPersistJournal = Config.enable_tso_persist_journal; + originalEnableFeatureBinlog = Config.enable_feature_binlog; originalClockBackwardThresholdMs = Config.tso_clock_backward_startup_threshold_ms; Config.tso_max_get_retry_count = 1; Config.tso_max_update_retry_count = 1; Config.tso_service_update_interval_ms = 1; - Config.enable_tso_persist_journal = true; + Config.enable_feature_binlog = true; Config.tso_clock_backward_startup_threshold_ms = 30L * 60 * 1000; env = Mockito.mock(Env.class); @@ -84,7 +84,7 @@ public void tearDown() { Config.tso_max_get_retry_count = originalMaxGetTSORetryCount; Config.tso_max_update_retry_count = originalMaxUpdateRetryCount; Config.tso_service_update_interval_ms = originalUpdateIntervalMs; - 
Config.enable_tso_persist_journal = originalEnableTsoPersistJournal; + Config.enable_feature_binlog = originalEnableFeatureBinlog; Config.tso_clock_backward_startup_threshold_ms = originalClockBackwardThresholdMs; } @@ -104,9 +104,9 @@ public void testGetCurrentTSO() { @Test public void testGetTSOThrowsWhenEnvNotReady() { - boolean originalEnableTsoFeature = Config.enable_tso_feature; + boolean originalEnableFeatureBinlog = Config.enable_feature_binlog; try { - Config.enable_tso_feature = true; + Config.enable_feature_binlog = true; setInitializedFlag(tsoService, true); Mockito.when(env.isReady()).thenReturn(false); try { @@ -116,15 +116,15 @@ public void testGetTSOThrowsWhenEnvNotReady() { Assert.assertTrue(e.getMessage().contains("Failed to get TSO")); } } finally { - Config.enable_tso_feature = originalEnableTsoFeature; + Config.enable_feature_binlog = originalEnableFeatureBinlog; } } @Test public void testGetTSOThrowsWhenNotCalibrated() throws Exception { - boolean originalEnableTsoFeature = Config.enable_tso_feature; + boolean originalEnableFeatureBinlog = Config.enable_feature_binlog; try { - Config.enable_tso_feature = true; + Config.enable_feature_binlog = true; Mockito.when(env.isReady()).thenReturn(true); Mockito.when(env.isMaster()).thenReturn(true); try { @@ -134,15 +134,15 @@ public void testGetTSOThrowsWhenNotCalibrated() throws Exception { Assert.assertTrue(e.getMessage().contains("not calibrated")); } } finally { - Config.enable_tso_feature = originalEnableTsoFeature; + Config.enable_feature_binlog = originalEnableFeatureBinlog; } } @Test public void testGetTSOThrowsOnLogicalOverflow() throws Exception { - boolean originalEnableTsoFeature = Config.enable_tso_feature; + boolean originalEnableFeatureBinlog = Config.enable_feature_binlog; try { - Config.enable_tso_feature = true; + Config.enable_feature_binlog = true; setInitializedFlag(tsoService, true); Mockito.when(env.isReady()).thenReturn(true); Mockito.when(env.isMaster()).thenReturn(true); @@ 
-157,15 +157,15 @@ public void testGetTSOThrowsOnLogicalOverflow() throws Exception { Assert.assertEquals(TSOTimestamp.MAX_LOGICAL_COUNTER, getGlobalLogicalCounter(tsoService)); } } finally { - Config.enable_tso_feature = originalEnableTsoFeature; + Config.enable_feature_binlog = originalEnableFeatureBinlog; } } @Test public void testGetTSOAcceptsLogicalCounterUpperBound() throws Exception { - boolean originalEnableTsoFeature = Config.enable_tso_feature; + boolean originalEnableFeatureBinlog = Config.enable_feature_binlog; try { - Config.enable_tso_feature = true; + Config.enable_feature_binlog = true; setInitializedFlag(tsoService, true); Mockito.when(env.isReady()).thenReturn(true); Mockito.when(env.isMaster()).thenReturn(true); @@ -173,16 +173,16 @@ public void testGetTSOAcceptsLogicalCounterUpperBound() throws Exception { long tso = tsoService.getTSO(); Assert.assertEquals(TSOTimestamp.composeTimestamp(100L, TSOTimestamp.MAX_LOGICAL_COUNTER), tso); } finally { - Config.enable_tso_feature = originalEnableTsoFeature; + Config.enable_feature_binlog = originalEnableFeatureBinlog; } } @Test public void testRunAfterCatalogReadySetsIntervalTo50WhenDisabled() { - boolean originalEnableTsoFeature = Config.enable_tso_feature; + boolean originalEnableFeatureBinlog = Config.enable_feature_binlog; try { setInitializedFlag(tsoService, true); - Config.enable_tso_feature = false; + Config.enable_feature_binlog = false; tsoService.runAfterCatalogReady(); Assert.assertEquals(1L, tsoService.getInterval()); try { @@ -192,31 +192,30 @@ public void testRunAfterCatalogReadySetsIntervalTo50WhenDisabled() { Assert.assertTrue(e.getMessage().contains("feature is disabled")); } } finally { - Config.enable_tso_feature = originalEnableTsoFeature; + Config.enable_feature_binlog = originalEnableFeatureBinlog; } } @Test public void testRunAfterCatalogReadyDoesNotResetFatalClockBackwardFlagWhenDisabled() { - boolean originalEnableTsoFeature = Config.enable_tso_feature; + boolean 
originalEnableFeatureBinlog = Config.enable_feature_binlog; try { - Config.enable_tso_feature = false; + Config.enable_feature_binlog = false; setFatalClockBackwardReportedFlag(tsoService, true); tsoService.runAfterCatalogReady(); Assert.assertTrue(getFatalClockBackwardReportedFlag(tsoService)); } finally { - Config.enable_tso_feature = originalEnableTsoFeature; + Config.enable_feature_binlog = originalEnableFeatureBinlog; } } @Test public void testRunAfterCatalogReadyUsesAtLeastOneRetryWhenConfigNonPositive() { - boolean originalEnableTsoFeature = Config.enable_tso_feature; + boolean originalEnableFeatureBinlog = Config.enable_feature_binlog; try { - Config.enable_tso_feature = true; - Config.enable_tso_persist_journal = true; + Config.enable_feature_binlog = true; Config.tso_max_update_retry_count = 0; Mockito.when(env.isReady()).thenReturn(true); Mockito.when(env.isMaster()).thenReturn(true); @@ -224,17 +223,17 @@ public void testRunAfterCatalogReadyUsesAtLeastOneRetryWhenConfigNonPositive() { tsoService.runAfterCatalogReady(); Assert.assertTrue(tsoService.getTSO() > 0); } finally { - Config.enable_tso_feature = originalEnableTsoFeature; + Config.enable_feature_binlog = originalEnableFeatureBinlog; } } @Test public void testRunAfterCatalogReadyUpdateFailureDoesNotTouchMetricWhenNotInit() throws Exception { - boolean originalEnableTsoFeature = Config.enable_tso_feature; + boolean originalEnableFeatureBinlog = Config.enable_feature_binlog; boolean originalMetricInit = MetricRepo.isInit; LongCounterMetric originalUpdateFailedMetric = MetricRepo.COUNTER_TSO_CLOCK_UPDATE_FAILED; try { - Config.enable_tso_feature = true; + Config.enable_feature_binlog = true; setInitializedFlag(tsoService, true); setGlobalTimestamp(tsoService, 100L, 1L); MetricRepo.isInit = false; @@ -243,7 +242,7 @@ public void testRunAfterCatalogReadyUpdateFailureDoesNotTouchMetricWhenNotInit() Mockito.when(env.isMaster()).thenThrow(new RuntimeException("injected update failure")); 
tsoService.runAfterCatalogReady(); } finally { - Config.enable_tso_feature = originalEnableTsoFeature; + Config.enable_feature_binlog = originalEnableFeatureBinlog; MetricRepo.isInit = originalMetricInit; MetricRepo.COUNTER_TSO_CLOCK_UPDATE_FAILED = originalUpdateFailedMetric; } @@ -257,12 +256,10 @@ public void testReplayWindowEndTSOUpdatesServiceState() { } @Test - public void testSaveTSOPersistsWindowEndWhenFeatureDisabled() throws IOException { - boolean originalEnableTsoFeature = Config.enable_tso_feature; - boolean originalEnableTsoCheckpointModule = Config.enable_tso_checkpoint_module; + public void testSaveTSOPersistsWindowEndWhenBinlogEnabled() throws IOException { + boolean originalEnableFeatureBinlog = Config.enable_feature_binlog; try { - Config.enable_tso_feature = false; - Config.enable_tso_checkpoint_module = true; + Config.enable_feature_binlog = true; long windowEnd = 12345L; tsoService.replayWindowEndTSO(new TSOTimestamp(windowEnd, 0L)); @@ -274,16 +271,15 @@ public void testSaveTSOPersistsWindowEndWhenFeatureDisabled() throws IOException Assert.assertEquals(windowEnd, checksum); Assert.assertEquals(windowEnd, recoveredService.getWindowEndTSO()); } finally { - Config.enable_tso_feature = originalEnableTsoFeature; - Config.enable_tso_checkpoint_module = originalEnableTsoCheckpointModule; + Config.enable_feature_binlog = originalEnableFeatureBinlog; } } @Test public void testSaveTSOSkipsWhenWindowEndIsZero() throws IOException { - boolean originalEnableTsoCheckpointModule = Config.enable_tso_checkpoint_module; + boolean originalEnableFeatureBinlog = Config.enable_feature_binlog; try { - Config.enable_tso_checkpoint_module = true; + Config.enable_feature_binlog = true; ByteArrayOutputStream out = new ByteArrayOutputStream(); long checksum; @@ -294,15 +290,15 @@ public void testSaveTSOSkipsWhenWindowEndIsZero() throws IOException { Assert.assertEquals(7L, checksum); Assert.assertEquals(0, out.size()); } finally { - Config.enable_tso_checkpoint_module 
= originalEnableTsoCheckpointModule; + Config.enable_feature_binlog = originalEnableFeatureBinlog; } } @Test - public void testWriteTimestampToBdbJeSkipsWhenEnvNotReady() throws Exception { - boolean originalEnableTsoPersistJournal = Config.enable_tso_persist_journal; + public void testWriteTimestampToBdbJeSkipsWhenBinlogDisabled() throws Exception { + boolean originalEnableFeatureBinlog = Config.enable_feature_binlog; try { - Config.enable_tso_persist_journal = false; + Config.enable_feature_binlog = false; EditLog editLog = Mockito.mock(EditLog.class); Mockito.when(env.isReady()).thenReturn(false); Mockito.when(env.getEditLog()).thenReturn(editLog); @@ -310,7 +306,7 @@ public void testWriteTimestampToBdbJeSkipsWhenEnvNotReady() throws Exception { invokeWriteTimestampToBdbJe(tsoService, 123L); Mockito.verifyNoInteractions(editLog); } finally { - Config.enable_tso_persist_journal = originalEnableTsoPersistJournal; + Config.enable_feature_binlog = originalEnableFeatureBinlog; } } @@ -323,14 +319,12 @@ public void testWriteTimestampToBdbJeWritesWhenEnabledAndJournalReady() throws E Mockito.when(env.getEditLog()).thenReturn(editLog); Mockito.when(editLog.getJournal()).thenReturn(journal); - Config.enable_tso_persist_journal = true; invokeWriteTimestampToBdbJe(tsoService, 123L); Mockito.verify(editLog).logTSOTimestampWindowEnd(Mockito.any(TSOTimestamp.class)); } @Test public void testWriteTimestampToBdbJeThrowsWhenEnabledAndEnvNotReady() throws Exception { - Config.enable_tso_persist_journal = true; Mockito.when(env.isReady()).thenReturn(false); try { invokeWriteTimestampToBdbJe(tsoService, 123L); @@ -342,10 +336,9 @@ public void testWriteTimestampToBdbJeThrowsWhenEnabledAndEnvNotReady() throws Ex @Test public void testCalibrateTimestampThrowsWhenPersistWriteFailsAndKeepNotInitialized() throws Exception { - boolean originalEnableTsoFeature = Config.enable_tso_feature; + boolean originalEnableFeatureBinlog = Config.enable_feature_binlog; try { - 
Config.enable_tso_feature = true; - Config.enable_tso_persist_journal = true; + Config.enable_feature_binlog = true; Mockito.when(env.isReady()).thenReturn(true); Mockito.when(env.isMaster()).thenReturn(true); Mockito.when(env.getEditLog()).thenReturn(null); @@ -366,13 +359,12 @@ public void testCalibrateTimestampThrowsWhenPersistWriteFailsAndKeepNotInitializ Assert.assertTrue(e.getMessage().contains("not calibrated")); } } finally { - Config.enable_tso_feature = originalEnableTsoFeature; + Config.enable_feature_binlog = originalEnableFeatureBinlog; } } @Test public void testCalibrateTimestampThrowsWhenClockBackwardExceedsThreshold() throws Exception { - Config.enable_tso_persist_journal = true; Mockito.when(env.isReady()).thenReturn(true); Mockito.when(env.isMaster()).thenReturn(true); long now = System.currentTimeMillis() + Config.tso_time_offset_debug_mode; @@ -388,7 +380,6 @@ public void testCalibrateTimestampThrowsWhenClockBackwardExceedsThreshold() thro @Test public void testCalibrateTimestampResetsFatalClockBackwardReportedOnSuccess() throws Exception { - Config.enable_tso_persist_journal = true; setFatalClockBackwardReportedFlag(tsoService, true); Mockito.when(env.isReady()).thenReturn(true); Mockito.when(env.isMaster()).thenReturn(true); @@ -400,16 +391,11 @@ public void testCalibrateTimestampResetsFatalClockBackwardReportedOnSuccess() th } @Test - public void testCalibrateTimestampThrowsWhenPersistJournalDisabled() throws Exception { - Config.enable_tso_persist_journal = false; - Mockito.when(env.isReady()).thenReturn(true); - Mockito.when(env.isMaster()).thenReturn(true); - try { - invokeCalibrateTimestamp(tsoService); - Assert.fail(); - } catch (RuntimeException e) { - Assert.assertTrue(e.getMessage().contains("enable_tso_persist_journal=true")); - } + public void testRunAfterCatalogReadySkipsWhenBinlogDisabled() throws Exception { + Config.enable_feature_binlog = false; + setInitializedFlag(tsoService, true); + tsoService.runAfterCatalogReady(); + 
Assert.assertEquals(0L, tsoService.getCurrentTSO()); } @Test @@ -427,23 +413,23 @@ public void testUpdateTimestampReturnsEarlyWhenNotCalibrated() throws Exception @Test public void testGenerateTSOReturnsZeroWhenDisabledOrNotInitialized() throws Exception { - boolean originalEnableTsoFeature = Config.enable_tso_feature; + boolean originalEnableFeatureBinlog = Config.enable_feature_binlog; try { setGlobalTimestamp(tsoService, 100L, 1L); - Config.enable_tso_feature = true; + Config.enable_feature_binlog = true; setInitializedFlag(tsoService, false); Pair pairWhenNotInitialized = invokeGenerateTSO(tsoService); Assert.assertEquals(0L, (long) pairWhenNotInitialized.first); Assert.assertEquals(0L, (long) pairWhenNotInitialized.second); - Config.enable_tso_feature = false; + Config.enable_feature_binlog = false; setInitializedFlag(tsoService, true); Pair pairWhenDisabled = invokeGenerateTSO(tsoService); Assert.assertEquals(0L, (long) pairWhenDisabled.first); Assert.assertEquals(0L, (long) pairWhenDisabled.second); } finally { - Config.enable_tso_feature = originalEnableTsoFeature; + Config.enable_feature_binlog = originalEnableFeatureBinlog; } } diff --git a/fe/fe-sql-parser/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 b/fe/fe-sql-parser/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 index efd539d5e7e360..8960ed36a9a750 100644 --- a/fe/fe-sql-parser/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 +++ b/fe/fe-sql-parser/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 @@ -81,6 +81,7 @@ ANALYZER: 'ANALYZER'; AND: 'AND'; ANTI: 'ANTI'; APPEND: 'APPEND'; +APPEND_ONLY: 'APPEND_ONLY'; ARRAY: 'ARRAY'; AS: 'AS'; ASC: 'ASC'; diff --git a/gensrc/thrift/PlanNodes.thrift b/gensrc/thrift/PlanNodes.thrift index 757751d3a46724..0565b2ba09735c 100644 --- a/gensrc/thrift/PlanNodes.thrift +++ b/gensrc/thrift/PlanNodes.thrift @@ -80,6 +80,20 @@ struct TKeyRange { // - T: all other operational parameters that are the same across // all plan fragments +enum 
TBinlogScanType { + NONE = 0, + APPEND_ONLY = 1, + MIN_DELTA = 2, + DETAIL = 3, + UNKNOWN = 4 +} + +enum TBinlogReadSource { + NONE = 0, + STREAM = 1, + CHANGES = 2 +} + struct TPaloScanRange { 1: required list hosts 2: required string schema_hash @@ -90,6 +104,10 @@ struct TPaloScanRange { 7: optional list partition_column_ranges 8: optional string index_name 9: optional string table_name + 10: optional i64 start_tso + 11: optional i64 end_tso + 13: optional TBinlogScanType binlog_scan_type + 14: optional TBinlogReadSource binlog_read_source } enum TFileFormatType { diff --git a/regression-test/data/query_p0/schema_table/test_stream_info_schema.out b/regression-test/data/query_p0/schema_table/test_stream_info_schema.out index 4c6753c7215e42..9997faefb76244 100644 --- a/regression-test/data/query_p0/schema_table/test_stream_info_schema.out +++ b/regression-test/data/query_p0/schema_table/test_stream_info_schema.out @@ -1,6 +1,4 @@ -- This file is automatically generated. You should know what you did if you want to edit this -- !sql -- -test_stream_info_db s1 OLAP_TABLE_STREAM MIN_DELTA test stream 1 tbl1 test_stream_info_db internal OLAP true false N/A -test_stream_info_db s2 OLAP_TABLE_STREAM DEFAULT test stream 2 tbl1 test_stream_info_db internal OLAP true false N/A +test_stream_info_db s2 OLAP_TABLE_STREAM DEFAULT test stream 2 tbl2 test_stream_info_db internal OLAP true false N/A test_stream_info_db s3 OLAP_TABLE_STREAM APPEND_ONLY test stream 3 tbl1 test_stream_info_db internal OLAP true false N/A - diff --git a/regression-test/data/query_p0/system/test_table_properties.out b/regression-test/data/query_p0/system/test_table_properties.out index 7561214f2e6cd1..4d496b2514a892 100644 --- a/regression-test/data/query_p0/system/test_table_properties.out +++ b/regression-test/data/query_p0/system/test_table_properties.out @@ -18,7 +18,6 @@ internal test_table_properties_db duplicate_table deprecated_variant_enable_flat internal test_table_properties_db 
duplicate_table disable_auto_compaction false internal test_table_properties_db duplicate_table enable_mow_light_delete false internal test_table_properties_db duplicate_table enable_single_replica_compaction false -internal test_table_properties_db duplicate_table enable_tso false internal test_table_properties_db duplicate_table enable_unique_key_merge_on_write false internal test_table_properties_db duplicate_table file_cache_ttl_seconds 0 internal test_table_properties_db duplicate_table group_commit_data_bytes 134217728 @@ -58,7 +57,6 @@ internal test_table_properties_db listtable deprecated_variant_enable_flatten_ne internal test_table_properties_db listtable disable_auto_compaction false internal test_table_properties_db listtable enable_mow_light_delete false internal test_table_properties_db listtable enable_single_replica_compaction false -internal test_table_properties_db listtable enable_tso false internal test_table_properties_db listtable enable_unique_key_merge_on_write false internal test_table_properties_db listtable file_cache_ttl_seconds 0 internal test_table_properties_db listtable group_commit_data_bytes 134217728 @@ -98,7 +96,6 @@ internal test_table_properties_db unique_table deprecated_variant_enable_flatten internal test_table_properties_db unique_table disable_auto_compaction false internal test_table_properties_db unique_table enable_mow_light_delete false internal test_table_properties_db unique_table enable_single_replica_compaction false -internal test_table_properties_db unique_table enable_tso false internal test_table_properties_db unique_table enable_unique_key_merge_on_write true internal test_table_properties_db unique_table file_cache_ttl_seconds 0 internal test_table_properties_db unique_table group_commit_data_bytes 134217728 @@ -140,7 +137,6 @@ internal test_table_properties_db duplicate_table deprecated_variant_enable_flat internal test_table_properties_db duplicate_table disable_auto_compaction false internal 
test_table_properties_db duplicate_table enable_mow_light_delete false internal test_table_properties_db duplicate_table enable_single_replica_compaction false -internal test_table_properties_db duplicate_table enable_tso false internal test_table_properties_db duplicate_table enable_unique_key_merge_on_write false internal test_table_properties_db duplicate_table file_cache_ttl_seconds 0 internal test_table_properties_db duplicate_table group_commit_data_bytes 134217728 @@ -180,7 +176,6 @@ internal test_table_properties_db unique_table deprecated_variant_enable_flatten internal test_table_properties_db unique_table disable_auto_compaction false internal test_table_properties_db unique_table enable_mow_light_delete false internal test_table_properties_db unique_table enable_single_replica_compaction false -internal test_table_properties_db unique_table enable_tso false internal test_table_properties_db unique_table enable_unique_key_merge_on_write true internal test_table_properties_db unique_table file_cache_ttl_seconds 0 internal test_table_properties_db unique_table group_commit_data_bytes 134217728 @@ -224,7 +219,6 @@ internal test_table_properties_db duplicate_table deprecated_variant_enable_flat internal test_table_properties_db duplicate_table disable_auto_compaction false internal test_table_properties_db duplicate_table enable_mow_light_delete false internal test_table_properties_db duplicate_table enable_single_replica_compaction false -internal test_table_properties_db duplicate_table enable_tso false internal test_table_properties_db duplicate_table enable_unique_key_merge_on_write false internal test_table_properties_db duplicate_table file_cache_ttl_seconds 0 internal test_table_properties_db duplicate_table group_commit_data_bytes 134217728 diff --git a/regression-test/data/row_binlog_p0/test_row_binlog_basic.out b/regression-test/data/row_binlog_p0/test_row_binlog_basic.out index c6ba50174aa10e..e0c2eba98239cd 100644 --- 
a/regression-test/data/row_binlog_p0/test_row_binlog_basic.out +++ b/regression-test/data/row_binlog_p0/test_row_binlog_basic.out @@ -11,22 +11,6 @@ 0 1 1 1 11 11 0 3 3 3 30 30 --- !mow_raw -- -2 2 2 2000 201 - --- !mow_binlog -- -0 1 1 1 10 10 -0 2 2 2 20 20 -1 2 2 2 20 200 -1 2 2 2 2000 200 -1 2 2 2 2000 201 -2 1 1 1 \N \N - --- !mow_before_raw -- -2 2 2 2000 201 - --- !mow_before_binlog -- -0 1 1 1 10 10 \N \N 0 2 2 2 20 20 \N \N 1 2 2 2 20 200 20 20 1 2 2 2 2000 200 20 200 @@ -51,3 +35,4 @@ 1 1 1 1 400 360 400 350 2 1 1 1 400 350 400 350 + diff --git a/regression-test/pipeline/nonConcurrent/conf/fe.conf b/regression-test/pipeline/nonConcurrent/conf/fe.conf index e21077357420f3..e89a00d209ccf7 100644 --- a/regression-test/pipeline/nonConcurrent/conf/fe.conf +++ b/regression-test/pipeline/nonConcurrent/conf/fe.conf @@ -47,8 +47,6 @@ fuzzy_test_type=p0 use_fuzzy_session_variable=true enable_feature_binlog=true -experimental_enable_tso_feature=true -enable_tso_persist_journal=true enable_debug_points=true diff --git a/regression-test/suites/query_p0/schema_table/test_stream_info_schema.groovy b/regression-test/suites/query_p0/schema_table/test_stream_info_schema.groovy index b00bd920d76a9c..58c9b937e08aba 100644 --- a/regression-test/suites/query_p0/schema_table/test_stream_info_schema.groovy +++ b/regression-test/suites/query_p0/schema_table/test_stream_info_schema.groovy @@ -28,18 +28,39 @@ suite("test_stream_info_schema") { DUPLICATE KEY(`sid`) DISTRIBUTED BY HASH(`sid`) BUCKETS 1 PROPERTIES ( - "replication_allocation" = "tag.location.default: 1" + "replication_allocation" = "tag.location.default: 1", + "binlog.enable" = "true", + "binlog.format" = "ROW" ); """ sql """ - CREATE STREAM `s1` ON TABLE tbl1 - COMMENT 'test stream 1' - PROPERTIES('type' = 'min_delta'); + CREATE TABLE `tbl2` ( + `sid` int NULL, + `sname` varchar(32) NULL + ) ENGINE=OLAP + UNIQUE KEY(`sid`) + DISTRIBUTED BY HASH(`sid`) BUCKETS 1 + PROPERTIES ( + "replication_allocation" = 
"tag.location.default: 1", + "enable_unique_key_merge_on_write" = "true", + "binlog.enable" = "true", + "binlog.format" = "ROW", + "binlog.need_historical_value" = "true" + ); """ + test { + sql """ + CREATE STREAM `s1` ON TABLE tbl1 + COMMENT 'test stream 1' + PROPERTIES('type' = 'min_delta'); + """ + exception "MIN_DELTA table stream requires base table to be UNIQUE KEY with enable_unique_key_merge_on_write=true" + } + sql """ - CREATE STREAM `s2` ON TABLE tbl1 + CREATE STREAM `s2` ON TABLE tbl2 COMMENT 'test stream 2' PROPERTIES('type' = 'default'); """ @@ -52,4 +73,4 @@ suite("test_stream_info_schema") { qt_sql "select DB_NAME,STREAM_NAME,STREAM_TYPE,CONSUME_TYPE,STREAM_COMMENT,BASE_TABLE_NAME,BASE_TABLE_DB,BASE_TABLE_CTL,BASE_TABLE_TYPE,ENABLED,IS_STALE,STALE_REASON from information_schema.table_streams where DB_NAME = 'test_stream_info_db' order by STREAM_NAME;" sql "DROP DATABASE IF EXISTS test_stream_info_db" -} \ No newline at end of file +} diff --git a/regression-test/suites/row_binlog_p0/test_row_binlog_basic.groovy b/regression-test/suites/row_binlog_p0/test_row_binlog_basic.groovy index 4f67af693678da..d904c67e0baf62 100644 --- a/regression-test/suites/row_binlog_p0/test_row_binlog_basic.groovy +++ b/regression-test/suites/row_binlog_p0/test_row_binlog_basic.groovy @@ -20,10 +20,6 @@ suite("test_row_binlog_basic", "nonConcurrent") { return } - sql "DROP TABLE IF EXISTS test_dup_with_binlog FORCE" - sql "DROP TABLE IF EXISTS test_mow_with_binlog FORCE" - sql "DROP TABLE IF EXISTS test_mow_with_before_binlog FORCE" - sql "DROP TABLE IF EXISTS test_mow_seq_with_binlog FORCE" sql """ CREATE TABLE test_dup_with_binlog ( @@ -38,8 +34,7 @@ suite("test_row_binlog_basic", "nonConcurrent") { PROPERTIES ( "replication_num" = "1", "binlog.enable" = "true", - "binlog.format" = "ROW", - "enable_tso" = "true" + "binlog.format" = "ROW" ) """ @@ -58,8 +53,7 @@ suite("test_row_binlog_basic", "nonConcurrent") { "enable_unique_key_merge_on_write" = "true", 
"light_schema_change" = "true", "binlog.enable" = "true", - "binlog.format" = "ROW", - "enable_tso" = "true" + "binlog.format" = "ROW" ) """ @@ -79,8 +73,7 @@ suite("test_row_binlog_basic", "nonConcurrent") { "light_schema_change" = "true", "binlog.enable" = "true", "binlog.format" = "ROW", - "binlog.need_historical_value" = "true", - "enable_tso" = "true" + "binlog.need_historical_value" = "true" ) """ @@ -100,8 +93,7 @@ suite("test_row_binlog_basic", "nonConcurrent") { "light_schema_change" = "true", "binlog.enable" = "true", "binlog.format" = "ROW", - "binlog.need_historical_value" = "true", - "enable_tso" = "true" + "binlog.need_historical_value" = "true" ) """ @@ -118,7 +110,6 @@ suite("test_row_binlog_basic", "nonConcurrent") { INSERT INTO test_dup_with_binlog VALUES (1, 1, 1, 11, '11'), (3, 3, 3, 30, '30') - """ order_qt_dup_raw """ SELECT k1, k2, k3, v1, v2 @@ -139,50 +130,6 @@ suite("test_row_binlog_basic", "nonConcurrent") { sql """ INSERT INTO test_mow_with_binlog VALUES (1, 1, 1, 10, '10'), - (2, 2, 2, 20, '20') - """ - sql "SET enable_unique_key_partial_update = true" - sql "INSERT INTO test_mow_with_binlog(k1, k2, k3, v2) VALUES (2, 2, 2, '200')" - sql "INSERT INTO test_mow_with_binlog(k1, k2, k3, v1) VALUES (2, 2, 2, 2000)" - sql "INSERT INTO test_mow_with_binlog(k1, k2, k3, v2) VALUES (2, 2, 2, '201')" - sql "SET enable_unique_key_partial_update = false" - sql "DELETE FROM test_mow_with_binlog WHERE k1 = 1 AND k2 = 1 AND k3 = 1" - - order_qt_mow_raw """ - SELECT k1, k2, k3, v1, v2 - FROM test_mow_with_binlog - """ - - qt_mow_binlog """ - SELECT __DORIS_BINLOG_OP__ AS op, - k1, - k2, - k3, - v1, - v2 - FROM binlog("table" = "test_mow_with_binlog") - ORDER BY __DORIS_BINLOG_LSN__ - """ - - sql """ - INSERT INTO test_mow_with_before_binlog VALUES - (1, 1, 1, 10, '10'), - (2, 2, 2, 20, '20') - """ - sql "SET enable_unique_key_partial_update = true" - sql "INSERT INTO test_mow_with_before_binlog(k1, k2, k3, v2) VALUES (2, 2, 2, '200')" - sql "INSERT INTO 
test_mow_with_before_binlog(k1, k2, k3, v1) VALUES (2, 2, 2, 2000)" - sql "INSERT INTO test_mow_with_before_binlog(k1, k2, k3, v2) VALUES (2, 2, 2, '201')" - sql "SET enable_unique_key_partial_update = false" - sql "DELETE FROM test_mow_with_before_binlog WHERE k1 = 1 AND k2 = 1 AND k3 = 1" - - order_qt_mow_before_raw """ - SELECT k1, k2, k3, v1, v2 - FROM test_mow_with_before_binlog - """ - - qt_mow_before_binlog """ - SELECT __DORIS_BINLOG_OP__ AS op, k1, k2, k3, diff --git a/regression-test/suites/row_binlog_p0/test_row_binlog_multi_segment.groovy b/regression-test/suites/row_binlog_p0/test_row_binlog_multi_segment.groovy index 541a867e9cf249..e213874eb40b4c 100644 --- a/regression-test/suites/row_binlog_p0/test_row_binlog_multi_segment.groovy +++ b/regression-test/suites/row_binlog_p0/test_row_binlog_multi_segment.groovy @@ -39,8 +39,7 @@ suite("test_row_binlog_multi_segment", "nonConcurrent") { "disable_auto_compaction" = "true", "binlog.enable" = "true", "binlog.format" = "ROW", - "binlog.need_historical_value" = "true", - "enable_tso" = "true" + "binlog.need_historical_value" = "true" ) """ diff --git a/regression-test/suites/row_binlog_p0/test_row_binlog_publish_conflict.groovy b/regression-test/suites/row_binlog_p0/test_row_binlog_publish_conflict.groovy index c1cc92989ba28c..04112956dd4136 100644 --- a/regression-test/suites/row_binlog_p0/test_row_binlog_publish_conflict.groovy +++ b/regression-test/suites/row_binlog_p0/test_row_binlog_publish_conflict.groovy @@ -41,8 +41,7 @@ suite("test_row_binlog_publish_conflict", "nonConcurrent") { "disable_auto_compaction" = "true", "binlog.enable" = "true", "binlog.format" = "ROW", - "binlog.need_historical_value" = "true", - "enable_tso" = "true" + "binlog.need_historical_value" = "true" ) """ diff --git a/regression-test/suites/row_binlog_p0/test_row_binlog_schema_change.groovy b/regression-test/suites/row_binlog_p0/test_row_binlog_schema_change.groovy index c737be8fe12d02..731ced47b6112f 100644 --- 
a/regression-test/suites/row_binlog_p0/test_row_binlog_schema_change.groovy +++ b/regression-test/suites/row_binlog_p0/test_row_binlog_schema_change.groovy @@ -39,8 +39,7 @@ suite("test_row_binlog_schema_change", "nonConcurrent") { "disable_auto_compaction" = "true", "binlog.enable" = "true", "binlog.format" = "ROW", - "binlog.need_historical_value" = "true", - "enable_tso" = "true" + "binlog.need_historical_value" = "true" ) """ diff --git a/regression-test/suites/table_stream_p0/test_min_delta_stream.groovy b/regression-test/suites/table_stream_p0/test_min_delta_stream.groovy new file mode 100644 index 00000000000000..351349acf237be --- /dev/null +++ b/regression-test/suites/table_stream_p0/test_min_delta_stream.groovy @@ -0,0 +1,826 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("test_min_delta_stream", "p0,nonConcurrent") { + sql "DROP DATABASE IF EXISTS test_min_delta_stream_db" + sql "CREATE DATABASE test_min_delta_stream_db" + sql "USE test_min_delta_stream_db" + sql "set enable_nereids_planner=true" + sql "set enable_fallback_to_original_planner=false" + + def ukBase = "md_uk_base" + def ukStream = "md_uk_stream" + def ukSkipBase = "md_uk_skip_base" + def ukSkipStream = "md_uk_skip_stream" + def ukMultiBase = "md_uk_multi_base" + def ukMultiStream = "md_uk_multi_stream" + def ukDeleteBase = "md_uk_delete_base" + def ukDeleteStream = "md_uk_delete_stream" + def ukCrossRowsetBase = "md_uk_cross_rowset_base" + def ukCrossRowsetStream = "md_uk_cross_rowset_stream" + def ukCrossBlockBase = "md_uk_cross_block_base" + def ukCrossBlockStream = "md_uk_cross_block_stream" + def ukPendingBase = "md_uk_pending_base" + def ukPendingStream = "md_uk_pending_stream" + def ukDeleteBeforeBase = "md_uk_delete_before_base" + def ukDeleteBeforeStream = "md_uk_delete_before_stream" + def ukShowInitialBase = "md_uk_show_initial_base" + def ukShowInitialStream = "md_uk_show_initial_stream" + def ukShowInitialTarget = "md_uk_show_initial_target" + def paperBase = "md_paper_base" + def paperStream = "md_paper_stream" + def incrBase = "md_incr_base" + def incrDupBase = "md_incr_dup_base" + def incrMowNoHistoryBase = "md_incr_mow_no_history_base" + + try { + sql "DROP STREAM IF EXISTS ${ukStream}" + sql "DROP TABLE IF EXISTS ${ukBase}" + sql "DROP STREAM IF EXISTS ${ukSkipStream}" + sql "DROP TABLE IF EXISTS ${ukSkipBase}" + sql "DROP STREAM IF EXISTS ${ukMultiStream}" + sql "DROP TABLE IF EXISTS ${ukMultiBase}" + sql "DROP STREAM IF EXISTS ${ukDeleteStream}" + sql "DROP TABLE IF EXISTS ${ukDeleteBase}" + sql "DROP STREAM IF EXISTS ${ukCrossRowsetStream}" + sql "DROP TABLE IF EXISTS ${ukCrossRowsetBase}" + sql "DROP STREAM IF EXISTS ${ukCrossBlockStream}" + sql "DROP TABLE IF EXISTS ${ukCrossBlockBase}" + sql "DROP STREAM IF EXISTS ${ukPendingStream}" + 
sql "DROP TABLE IF EXISTS ${ukPendingBase}" + sql "DROP STREAM IF EXISTS ${ukDeleteBeforeStream}" + sql "DROP TABLE IF EXISTS ${ukDeleteBeforeBase}" + sql "DROP STREAM IF EXISTS ${ukShowInitialStream}" + sql "DROP TABLE IF EXISTS ${ukShowInitialBase}" + sql "DROP TABLE IF EXISTS ${ukShowInitialTarget}" + sql "DROP STREAM IF EXISTS ${paperStream}" + sql "DROP TABLE IF EXISTS ${paperBase}" + sql "DROP TABLE IF EXISTS ${incrBase}" + sql "DROP TABLE IF EXISTS ${incrDupBase}" + sql "DROP TABLE IF EXISTS ${incrMowNoHistoryBase}" + + // 1) UNIQUE KEY + MIN_DELTA: verify UPDATE_BEFORE/UPDATE_AFTER are emitted as a pair. + sql """ + CREATE TABLE ${ukBase} ( + id BIGINT, + v1 INT + ) ENGINE=OLAP + UNIQUE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1", + "enable_unique_key_merge_on_write" = "true", + "binlog.enable" = "true", + "binlog.format" = "ROW", + "binlog.need_historical_value" = "true" + ) + """ + sql "INSERT INTO ${ukBase} VALUES (1, 10)" + sql """ + CREATE STREAM ${ukStream} + ON TABLE ${ukBase} + PROPERTIES ( + "type" = "min_delta", + "show_initial_rows" = "false" + ) + """ + sql "INSERT INTO ${ukBase} VALUES (1, 11)" + sql "sync" + + def ukRows = sql """ + SELECT id, v1, __DORIS_STREAM_CHANGE_TYPE_COL__ + FROM ${ukStream} + ORDER BY id, v1, __DORIS_STREAM_CHANGE_TYPE_COL__ + """ + assertEquals(2, ukRows.size()) + assertEquals("1", ukRows[0][0].toString()) + assertEquals("10", ukRows[0][1].toString()) + assertEquals("UPDATE_BEFORE", ukRows[0][2].toString()) + assertEquals("1", ukRows[1][0].toString()) + assertEquals("11", ukRows[1][1].toString()) + assertEquals("UPDATE_AFTER", ukRows[1][2].toString()) + + // 1.1) Stream hidden columns: + // - __DORIS_STREAM_CHANGE_TYPE_COL__/__DORIS_STREAM_SEQUENCE_COL__ must be queryable; + // - SELECT * should not expose hidden columns or __BEFORE__ columns by default. 
+ def ukMetaRows = sql """ + SELECT id, v1, __DORIS_STREAM_CHANGE_TYPE_COL__, __DORIS_STREAM_SEQUENCE_COL__ + FROM ${ukStream} + ORDER BY id, v1, __DORIS_STREAM_CHANGE_TYPE_COL__ + """ + assertEquals(2, ukMetaRows.size()) + assertEquals(false, ukMetaRows[0][3] == null) + assertEquals(false, ukMetaRows[1][3] == null) + + def ukAllRows = sql """ + SELECT * + FROM ${ukStream} + ORDER BY id, v1 + """ + assertEquals(2, ukAllRows.size()) + // Base table has 2 visible columns; hidden columns and __BEFORE__v1__ must not be included by default. + assertEquals(2, ukAllRows[0].size()) + assertEquals(2, ukAllRows[1].size()) + + sql "SET show_hidden_columns=true" + def ukDescRows = sql "DESC ${ukStream}" + def hasBeforeCol = ukDescRows.any { it[0].toString().startsWith("__BEFORE__") } + def hasChangeTypeCol = ukDescRows.any { it[0].toString().equalsIgnoreCase("__DORIS_STREAM_CHANGE_TYPE_COL__") } + def hasSequenceCol = ukDescRows.any { it[0].toString().equalsIgnoreCase("__DORIS_STREAM_SEQUENCE_COL__") } + assertEquals(false, hasBeforeCol) + assertEquals(true, hasChangeTypeCol) + assertEquals(true, hasSequenceCol) + sql "SET show_hidden_columns=false" + + // 2) UNIQUE KEY + MIN_DELTA: verify INSERT then DELETE collapses to SKIP (empty result). 
+ sql """ + CREATE TABLE ${ukSkipBase} ( + id BIGINT, + v1 INT + ) ENGINE=OLAP + UNIQUE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1", + "enable_unique_key_merge_on_write" = "true", + "binlog.enable" = "true", + "binlog.format" = "ROW", + "binlog.need_historical_value" = "true" + ) + """ + sql """ + CREATE STREAM ${ukSkipStream} + ON TABLE ${ukSkipBase} + PROPERTIES ( + "type" = "min_delta", + "show_initial_rows" = "false" + ) + """ + sql "INSERT INTO ${ukSkipBase} VALUES (2, 20)" + sql "DELETE FROM ${ukSkipBase} WHERE id = 2" + sql "sync" + + def ukSkipRows = sql "SELECT id, v1, __DORIS_STREAM_CHANGE_TYPE_COL__ FROM ${ukSkipStream}" + assertEquals(0, ukSkipRows.size()) + + // 3) UNIQUE KEY + MIN_DELTA: verify multiple UPDATEs on the same key keep only first/last as BEFORE/AFTER. + sql """ + CREATE TABLE ${ukMultiBase} ( + id BIGINT, + v1 INT + ) ENGINE=OLAP + UNIQUE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1", + "enable_unique_key_merge_on_write" = "true", + "binlog.enable" = "true", + "binlog.format" = "ROW", + "binlog.need_historical_value" = "true" + ) + """ + sql "INSERT INTO ${ukMultiBase} VALUES (3, 30)" + sql """ + CREATE STREAM ${ukMultiStream} + ON TABLE ${ukMultiBase} + PROPERTIES ( + "type" = "min_delta", + "show_initial_rows" = "false" + ) + """ + sql "INSERT INTO ${ukMultiBase} VALUES (3, 31)" + sql "INSERT INTO ${ukMultiBase} VALUES (3, 32)" + sql "sync" + + def ukMultiRows = sql """ + SELECT id, v1, __DORIS_STREAM_CHANGE_TYPE_COL__ + FROM ${ukMultiStream} + ORDER BY id, v1, __DORIS_STREAM_CHANGE_TYPE_COL__ + """ + assertEquals(2, ukMultiRows.size()) + assertEquals("3", ukMultiRows[0][0].toString()) + assertEquals("30", ukMultiRows[0][1].toString()) + assertEquals("UPDATE_BEFORE", ukMultiRows[0][2].toString()) + assertEquals("3", ukMultiRows[1][0].toString()) + assertEquals("32", ukMultiRows[1][1].toString()) + assertEquals("UPDATE_AFTER", ukMultiRows[1][2].toString()) + 
+ // 4) UNIQUE KEY + MIN_DELTA: verify a pure DELETE emits a single DELETE row. + sql """ + CREATE TABLE ${ukDeleteBase} ( + id BIGINT, + v1 INT + ) ENGINE=OLAP + UNIQUE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1", + "enable_unique_key_merge_on_write" = "true", + "binlog.enable" = "true", + "binlog.format" = "ROW", + "binlog.need_historical_value" = "true" + ) + """ + sql "INSERT INTO ${ukDeleteBase} VALUES (4, 40)" + sql """ + CREATE STREAM ${ukDeleteStream} + ON TABLE ${ukDeleteBase} + PROPERTIES ( + "type" = "min_delta", + "show_initial_rows" = "false" + ) + """ + sql "DELETE FROM ${ukDeleteBase} WHERE id = 4" + sql "sync" + + def ukDeleteRows = sql """ + SELECT id, v1, __DORIS_STREAM_CHANGE_TYPE_COL__ + FROM ${ukDeleteStream} + ORDER BY id, v1, __DORIS_STREAM_CHANGE_TYPE_COL__ + """ + assertEquals(1, ukDeleteRows.size()) + assertEquals("4", ukDeleteRows[0][0].toString()) + assertEquals("40", ukDeleteRows[0][1].toString()) + assertEquals("DELETE", ukDeleteRows[0][2].toString()) + + // 5) Same key across rowsets: + // rowset-1 writes key=1(update) and key=2(insert), rowset-2 updates key=1 again. + // Expect key=1 to fold into one UPDATE_BEFORE/UPDATE_AFTER pair; key=2 remains INSERT. + sql """ + CREATE TABLE ${ukCrossRowsetBase} ( + id BIGINT, + v1 INT + ) ENGINE=OLAP + UNIQUE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1", + "enable_unique_key_merge_on_write" = "true", + "binlog.enable" = "true", + "binlog.format" = "ROW", + "binlog.need_historical_value" = "true" + ) + """ + // Seed old value for key=1 before stream starts. 
+ sql "INSERT INTO ${ukCrossRowsetBase} VALUES (1, 10)" + sql """ + CREATE STREAM ${ukCrossRowsetStream} + ON TABLE ${ukCrossRowsetBase} + PROPERTIES ( + "type" = "min_delta", + "show_initial_rows" = "false" + ) + """ + // rowset-1 + sql "INSERT INTO ${ukCrossRowsetBase} VALUES (1, 11), (2, 30)" + // rowset-2 + sql "INSERT INTO ${ukCrossRowsetBase} VALUES (1, 12)" + sql "sync" + + def ukCrossRowsetRows = sql """ + SELECT id, v1, __DORIS_STREAM_CHANGE_TYPE_COL__ + FROM ${ukCrossRowsetStream} + ORDER BY id, + CASE __DORIS_STREAM_CHANGE_TYPE_COL__ + WHEN 'UPDATE_BEFORE' THEN 0 + WHEN 'UPDATE_AFTER' THEN 1 + WHEN 'DELETE' THEN 2 + WHEN 'APPEND' THEN 3 + ELSE 9 + END, v1 + """ + assertEquals(3, ukCrossRowsetRows.size()) + assertEquals("1", ukCrossRowsetRows[0][0].toString()) + assertEquals("10", ukCrossRowsetRows[0][1].toString()) + assertEquals("UPDATE_BEFORE", ukCrossRowsetRows[0][2].toString()) + assertEquals("1", ukCrossRowsetRows[1][0].toString()) + assertEquals("12", ukCrossRowsetRows[1][1].toString()) + assertEquals("UPDATE_AFTER", ukCrossRowsetRows[1][2].toString()) + assertEquals("2", ukCrossRowsetRows[2][0].toString()) + assertEquals("30", ukCrossRowsetRows[2][1].toString()) + assertEquals("APPEND", ukCrossRowsetRows[2][2].toString()) + + def ukCrossRowsetSequenceRows = sql """ + SELECT id, v1, __DORIS_STREAM_CHANGE_TYPE_COL__, __DORIS_STREAM_SEQUENCE_COL__ + FROM ${ukCrossRowsetStream} + ORDER BY id, + CASE __DORIS_STREAM_CHANGE_TYPE_COL__ + WHEN 'UPDATE_BEFORE' THEN 0 + WHEN 'UPDATE_AFTER' THEN 1 + WHEN 'DELETE' THEN 2 + WHEN 'APPEND' THEN 3 + ELSE 9 + END, v1 + """ + assertTrue(ukCrossRowsetSequenceRows.every { it[3] != null }) + + // 6) Same key across blocks: set batch_size=1 to force binlog rows of the same key into multiple blocks. + // Expect only "first BEFORE image + last AFTER image" to be emitted. 
+ sql """ + CREATE TABLE ${ukCrossBlockBase} ( + id BIGINT, + v1 INT + ) ENGINE=OLAP + UNIQUE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1", + "enable_unique_key_merge_on_write" = "true", + "binlog.enable" = "true", + "binlog.format" = "ROW", + "binlog.need_historical_value" = "true" + ) + """ + sql "INSERT INTO ${ukCrossBlockBase} VALUES (7, 70)" + sql """ + CREATE STREAM ${ukCrossBlockStream} + ON TABLE ${ukCrossBlockBase} + PROPERTIES ( + "type" = "min_delta", + "show_initial_rows" = "false" + ) + """ + sql "INSERT INTO ${ukCrossBlockBase} VALUES (7, 71)" + sql "INSERT INTO ${ukCrossBlockBase} VALUES (7, 72)" + sql "INSERT INTO ${ukCrossBlockBase} VALUES (7, 73)" + sql "sync" + + def ukCrossBlockRows = sql """ + SELECT /*+ SET_VAR(batch_size=1) */ id, v1, __DORIS_STREAM_CHANGE_TYPE_COL__ + FROM ${ukCrossBlockStream} + ORDER BY id, v1, __DORIS_STREAM_CHANGE_TYPE_COL__ + """ + assertEquals(2, ukCrossBlockRows.size()) + assertEquals("7", ukCrossBlockRows[0][0].toString()) + assertEquals("70", ukCrossBlockRows[0][1].toString()) + assertEquals("UPDATE_BEFORE", ukCrossBlockRows[0][2].toString()) + assertEquals("7", ukCrossBlockRows[1][0].toString()) + assertEquals("73", ukCrossBlockRows[1][1].toString()) + assertEquals("UPDATE_AFTER", ukCrossBlockRows[1][2].toString()) + + // 7) Emission across batch_size: UPDATE_BEFORE/UPDATE_AFTER may be split across two returns, + // with the second row buffered in pending. Use batch_size=1 to verify no loss or reordering. 
+ sql """ + CREATE TABLE ${ukPendingBase} ( + id BIGINT, + v1 INT + ) ENGINE=OLAP + UNIQUE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1", + "enable_unique_key_merge_on_write" = "true", + "binlog.enable" = "true", + "binlog.format" = "ROW", + "binlog.need_historical_value" = "true" + ) + """ + sql "INSERT INTO ${ukPendingBase} VALUES (8, 80), (9, 90)" + sql """ + CREATE STREAM ${ukPendingStream} + ON TABLE ${ukPendingBase} + PROPERTIES ( + "type" = "min_delta", + "show_initial_rows" = "false" + ) + """ + sql "INSERT INTO ${ukPendingBase} VALUES (8, 81)" + sql "INSERT INTO ${ukPendingBase} VALUES (9, 91)" + sql "sync" + + def ukPendingRows = sql """ + SELECT /*+ SET_VAR(batch_size=1) */ id, v1, __DORIS_STREAM_CHANGE_TYPE_COL__ + FROM ${ukPendingStream} + ORDER BY id, v1, __DORIS_STREAM_CHANGE_TYPE_COL__ + """ + assertEquals(4, ukPendingRows.size()) + assertEquals("8", ukPendingRows[0][0].toString()) + assertEquals("80", ukPendingRows[0][1].toString()) + assertEquals("UPDATE_BEFORE", ukPendingRows[0][2].toString()) + assertEquals("8", ukPendingRows[1][0].toString()) + assertEquals("81", ukPendingRows[1][1].toString()) + assertEquals("UPDATE_AFTER", ukPendingRows[1][2].toString()) + assertEquals("9", ukPendingRows[2][0].toString()) + assertEquals("90", ukPendingRows[2][1].toString()) + assertEquals("UPDATE_BEFORE", ukPendingRows[2][2].toString()) + assertEquals("9", ukPendingRows[3][0].toString()) + assertEquals("91", ukPendingRows[3][1].toString()) + assertEquals("UPDATE_AFTER", ukPendingRows[3][2].toString()) + + // 8) UPDATE then DELETE: min_delta result is DELETE, and value columns should use __BEFORE__ (pre-delete snapshot). 
+ sql """ + CREATE TABLE ${ukDeleteBeforeBase} ( + id BIGINT, + v1 INT + ) ENGINE=OLAP + UNIQUE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1", + "enable_unique_key_merge_on_write" = "true", + "binlog.enable" = "true", + "binlog.format" = "ROW", + "binlog.need_historical_value" = "true" + ) + """ + sql "INSERT INTO ${ukDeleteBeforeBase} VALUES (10, 100)" + sql """ + CREATE STREAM ${ukDeleteBeforeStream} + ON TABLE ${ukDeleteBeforeBase} + PROPERTIES ( + "type" = "min_delta", + "show_initial_rows" = "false" + ) + """ + sql "INSERT INTO ${ukDeleteBeforeBase} VALUES (10, 101)" + sql "DELETE FROM ${ukDeleteBeforeBase} WHERE id = 10" + sql "sync" + + def ukDeleteBeforeRows = sql """ + SELECT id, v1, __DORIS_STREAM_CHANGE_TYPE_COL__ + FROM ${ukDeleteBeforeStream} + ORDER BY id, v1, __DORIS_STREAM_CHANGE_TYPE_COL__ + """ + assertEquals(1, ukDeleteBeforeRows.size()) + assertEquals("10", ukDeleteBeforeRows[0][0].toString()) + assertEquals("101", ukDeleteBeforeRows[0][1].toString()) + assertEquals("DELETE", ukDeleteBeforeRows[0][2].toString()) + + // 9) show_initial_rows=true: + // direct stream query should first return historical rows only; + // after consuming history, the stream should switch to incremental mode. 
+ sql """ + CREATE TABLE ${ukShowInitialBase} ( + id BIGINT, + v1 INT, + v2 VARCHAR(16) + ) ENGINE=OLAP + UNIQUE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1", + "enable_unique_key_merge_on_write" = "true", + "binlog.enable" = "true", + "binlog.format" = "ROW", + "binlog.need_historical_value" = "true" + ) + """ + sql """ + CREATE TABLE ${ukShowInitialTarget} ( + id BIGINT, + v1 INT, + v2 VARCHAR(16) + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1" + ) + """ + sql "INSERT INTO ${ukShowInitialBase} VALUES (1, 11, 'a2'), (2, 11, 'a2'), (3, 11, 'a2')" + sql """ + CREATE STREAM ${ukShowInitialStream} + ON TABLE ${ukShowInitialBase} + PROPERTIES ( + "type" = "min_delta", + "show_initial_rows" = "true" + ) + """ + sql "INSERT INTO ${ukShowInitialBase} VALUES (4, 11, 'a2')" + sql "DELETE FROM ${ukShowInitialBase} WHERE id = 2" + sql "sync" + + def ukShowInitialRows = sql """ + SELECT id, __DORIS_STREAM_CHANGE_TYPE_COL__, __DORIS_STREAM_SEQUENCE_COL__ + FROM ${ukShowInitialStream} + ORDER BY id, __DORIS_STREAM_CHANGE_TYPE_COL__ + """ + assertEquals(3, ukShowInitialRows.size()) + assertEquals("1", ukShowInitialRows[0][0].toString()) + assertEquals("APPEND", ukShowInitialRows[0][1].toString()) + assertEquals("2", ukShowInitialRows[1][0].toString()) + assertEquals("APPEND", ukShowInitialRows[1][1].toString()) + assertEquals("3", ukShowInitialRows[2][0].toString()) + assertEquals("APPEND", ukShowInitialRows[2][1].toString()) + assertTrue(ukShowInitialRows.every { it[2] != null }) + assertEquals(["-1"] as Set, ukShowInitialRows.collect { it[2].toString() }.toSet()) + + sql "INSERT INTO ${ukShowInitialTarget} SELECT * FROM ${ukShowInitialStream}" + + def ukShowInitialIncrementalRows = sql """ + SELECT id, __DORIS_STREAM_CHANGE_TYPE_COL__, __DORIS_STREAM_SEQUENCE_COL__ + FROM ${ukShowInitialStream} + ORDER BY id, __DORIS_STREAM_CHANGE_TYPE_COL__ + """ + assertEquals(2, 
ukShowInitialIncrementalRows.size()) + assertEquals("2", ukShowInitialIncrementalRows[0][0].toString()) + assertEquals("DELETE", ukShowInitialIncrementalRows[0][1].toString()) + assertEquals("4", ukShowInitialIncrementalRows[1][0].toString()) + assertEquals("APPEND", ukShowInitialIncrementalRows[1][1].toString()) + assertTrue(ukShowInitialIncrementalRows.every { it[2] != null }) + assertFalse(ukShowInitialIncrementalRows.collect { it[2].toString() }.toSet().contains("-1")) + + def ukShowInitialTargetRows = sql """ + SELECT id, v1, v2 + FROM ${ukShowInitialTarget} + ORDER BY id + """ + assertEquals(3, ukShowInitialTargetRows.size()) + assertEquals("1", ukShowInitialTargetRows[0][0].toString()) + assertEquals("2", ukShowInitialTargetRows[1][0].toString()) + assertEquals("3", ukShowInitialTargetRows[2][0].toString()) + + // 10) Reproduce the paper's minimum-delta scenario (mixed INSERT/UPDATE/DELETE folding semantics): + // - Walter: keep INSERT; + // - Jeff -> Jeffrey: emit UPDATE_BEFORE/UPDATE_AFTER as a pair; + // - Maud -> Maude (INSERT then UPDATE): fold into a single INSERT (latest value); + // - Donny: keep DELETE; + // - Uli (INSERT then DELETE): net change is empty and should be SKIP. 
+ sql """ + CREATE TABLE ${paperBase} ( + id BIGINT, + name VARCHAR(32) + ) ENGINE=OLAP + UNIQUE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1", + "enable_unique_key_merge_on_write" = "true", + "binlog.enable" = "true", + "binlog.format" = "ROW", + "binlog.need_historical_value" = "true" + ) + """ + sql "INSERT INTO ${paperBase} VALUES (1, 'Jeff'), (2, 'Donny')" + sql """ + CREATE STREAM ${paperStream} + ON TABLE ${paperBase} + PROPERTIES ( + "type" = "min_delta", + "show_initial_rows" = "false" + ) + """ + + sql "INSERT INTO ${paperBase} VALUES (3, 'Walter')" + sql "INSERT INTO ${paperBase} VALUES (1, 'Jeffrey')" + sql "INSERT INTO ${paperBase} VALUES (4, 'Maud')" + sql "INSERT INTO ${paperBase} VALUES (4, 'Maude')" + sql "DELETE FROM ${paperBase} WHERE id = 2" + sql "INSERT INTO ${paperBase} VALUES (5, 'Uli')" + sql "DELETE FROM ${paperBase} WHERE id = 5" + sql "sync" + + def paperRows = sql """ + SELECT id, name, __DORIS_STREAM_CHANGE_TYPE_COL__ + FROM ${paperStream} + ORDER BY id, + CASE __DORIS_STREAM_CHANGE_TYPE_COL__ + WHEN 'UPDATE_BEFORE' THEN 0 + WHEN 'UPDATE_AFTER' THEN 1 + WHEN 'DELETE' THEN 2 + WHEN 'APPEND' THEN 3 + ELSE 9 + END + """ + assertEquals(5, paperRows.size()) + assertEquals("1", paperRows[0][0].toString()) + assertEquals("Jeff", paperRows[0][1].toString()) + assertEquals("UPDATE_BEFORE", paperRows[0][2].toString()) + assertEquals("1", paperRows[1][0].toString()) + assertEquals("Jeffrey", paperRows[1][1].toString()) + assertEquals("UPDATE_AFTER", paperRows[1][2].toString()) + assertEquals("2", paperRows[2][0].toString()) + assertEquals("Donny", paperRows[2][1].toString()) + assertEquals("DELETE", paperRows[2][2].toString()) + assertEquals("3", paperRows[3][0].toString()) + assertEquals("Walter", paperRows[3][1].toString()) + assertEquals("APPEND", paperRows[3][2].toString()) + assertEquals("4", paperRows[4][0].toString()) + assertEquals("Maude", paperRows[4][1].toString()) + assertEquals("APPEND", 
paperRows[4][2].toString()) + + // 11) Base table @incr(timestamp-based) queries should support MIN_DELTA / APPEND_ONLY / DETAIL, + // and DETAIL without startTimestamp should behave the same as default @incr(). + sql """ + CREATE TABLE ${incrBase} ( + id BIGINT, + v1 INT + ) ENGINE=OLAP + UNIQUE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1", + "enable_unique_key_merge_on_write" = "true", + "binlog.enable" = "true", + "binlog.format" = "ROW", + "binlog.need_historical_value" = "true" + ) + """ + sql "INSERT INTO ${incrBase} VALUES (1, 10), (3, 30)" + sql "sync" + + sleep(1200) + def incrTimeFormat = new java.text.SimpleDateFormat("yyyy-MM-dd HH:mm:ss") + def startTimestamp = incrTimeFormat.format(new Date()) + sleep(1200) + + sql "INSERT INTO ${incrBase} VALUES (2, 20)" + sql "INSERT INTO ${incrBase} VALUES (1, 11)" + sql "INSERT INTO ${incrBase} VALUES (1, 12)" + sql "DELETE FROM ${incrBase} WHERE id = 3" + sql "sync" + + sleep(1200) + def endTimestamp = incrTimeFormat.format(new Date()) + sleep(1200) + + def minDeltaRows = sql """ + SELECT id, v1, __DORIS_BINLOG_OP__ + FROM ${incrBase}@incr('startTimestamp' = '${startTimestamp}', + "endTimestamp" = "${endTimestamp}", + "incrementType" = "MIN_DELTA") + ORDER BY id, __DORIS_BINLOG_OP__, v1 + """ + assertEquals(4, minDeltaRows.size()) + assertEquals("1", minDeltaRows[0][0].toString()) + assertEquals("10", minDeltaRows[0][1].toString()) + assertEquals("2", minDeltaRows[0][2].toString()) + assertEquals("1", minDeltaRows[1][0].toString()) + assertEquals("12", minDeltaRows[1][1].toString()) + assertEquals("3", minDeltaRows[1][2].toString()) + assertEquals("2", minDeltaRows[2][0].toString()) + assertEquals("20", minDeltaRows[2][1].toString()) + assertEquals("0", minDeltaRows[2][2].toString()) + assertEquals("3", minDeltaRows[3][0].toString()) + assertEquals("30", minDeltaRows[3][1].toString()) + assertEquals("1", minDeltaRows[3][2].toString()) + + def appendOnlyRows = sql """ + 
SELECT id, v1, __DORIS_BINLOG_OP__ + FROM ${incrBase}@incr('startTimestamp' = '${startTimestamp}', + "endTimestamp" = "${endTimestamp}", + "incrementType" = "APPEND_ONLY") + ORDER BY id, __DORIS_BINLOG_OP__, v1 + """ + assertEquals(1, appendOnlyRows.size()) + assertEquals("2", appendOnlyRows[0][0].toString()) + assertEquals("20", appendOnlyRows[0][1].toString()) + assertEquals("0", appendOnlyRows[0][2].toString()) + + def detailWithRangeRows = sql """ + SELECT id, v1, __DORIS_BINLOG_OP__ + FROM ${incrBase}@incr('startTimestamp' = '${startTimestamp}', + "endTimestamp" = "${endTimestamp}", + "incrementType" = "DETAIL") + ORDER BY __DORIS_BINLOG_LSN__ + """ + assertEquals(6, detailWithRangeRows.size()) + assertEquals("2", detailWithRangeRows[0][0].toString()) + assertEquals("20", detailWithRangeRows[0][1].toString()) + assertEquals("0", detailWithRangeRows[0][2].toString()) + assertEquals("1", detailWithRangeRows[1][0].toString()) + assertEquals("10", detailWithRangeRows[1][1].toString()) + assertEquals("2", detailWithRangeRows[1][2].toString()) + assertEquals("1", detailWithRangeRows[2][0].toString()) + assertEquals("11", detailWithRangeRows[2][1].toString()) + assertEquals("3", detailWithRangeRows[2][2].toString()) + assertEquals("1", detailWithRangeRows[3][0].toString()) + assertEquals("11", detailWithRangeRows[3][1].toString()) + assertEquals("2", detailWithRangeRows[3][2].toString()) + assertEquals("1", detailWithRangeRows[4][0].toString()) + assertEquals("12", detailWithRangeRows[4][1].toString()) + assertEquals("3", detailWithRangeRows[4][2].toString()) + assertEquals("3", detailWithRangeRows[5][0].toString()) + assertEquals("30", detailWithRangeRows[5][1].toString()) + assertEquals("1", detailWithRangeRows[5][2].toString()) + + def detailWithStartRows = sql """ + SELECT id, v1, __DORIS_BINLOG_OP__ + FROM ${incrBase}@incr('startTimestamp' = '${startTimestamp}', + "incrementType" = "DETAIL") + ORDER BY __DORIS_BINLOG_LSN__ + """ + 
assertEquals(detailWithRangeRows, detailWithStartRows) + + def detailDefaultRows = sql """ + SELECT id, v1, __DORIS_BINLOG_OP__ + FROM ${incrBase}@incr("incrementType" = "DETAIL") + ORDER BY __DORIS_BINLOG_LSN__ + """ + assertEquals(8, detailDefaultRows.size()) + assertEquals("1", detailDefaultRows[0][0].toString()) + assertEquals("10", detailDefaultRows[0][1].toString()) + assertEquals("0", detailDefaultRows[0][2].toString()) + assertEquals("3", detailDefaultRows[1][0].toString()) + assertEquals("30", detailDefaultRows[1][1].toString()) + assertEquals("0", detailDefaultRows[1][2].toString()) + + def emptyIncrRows = sql """ + SELECT id, v1, __DORIS_BINLOG_OP__ + FROM ${incrBase}@incr() + ORDER BY __DORIS_BINLOG_LSN__ + """ + assertEquals(detailDefaultRows, emptyIncrRows) + + // 12) DETAIL incr should support duplicate table and MOW table without historical value. + sql """ + CREATE TABLE ${incrDupBase} ( + id BIGINT, + v1 INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1", + "binlog.enable" = "true", + "binlog.format" = "ROW" + ) + """ + sql "INSERT INTO ${incrDupBase} VALUES (1, 10), (2, 20)" + sql "sync" + sleep(1200) + def dupStartTimestamp = incrTimeFormat.format(new Date()) + sleep(1200) + sql "INSERT INTO ${incrDupBase} VALUES (3, 30)" + sql "sync" + + def dupDetailRows = sql """ + SELECT id, v1, __DORIS_BINLOG_OP__ + FROM ${incrDupBase}@incr('startTimestamp' = '${dupStartTimestamp}', + "incrementType" = "DETAIL") + ORDER BY __DORIS_BINLOG_LSN__ + """ + assertEquals(1, dupDetailRows.size()) + assertEquals("3", dupDetailRows[0][0].toString()) + assertEquals("30", dupDetailRows[0][1].toString()) + assertEquals("0", dupDetailRows[0][2].toString()) + + test { + sql """ + SELECT id, v1, __DORIS_BINLOG_OP__ + FROM ${incrDupBase}@incr('startTimestamp' = '${dupStartTimestamp}', + "incrementType" = "MIN_DELTA") + """ + exception "MIN_DELTA INCR query requires base table to be UNIQUE KEY with 
enable_unique_key_merge_on_write=true" + } + + sql """ + CREATE TABLE ${incrMowNoHistoryBase} ( + id BIGINT, + v1 INT + ) ENGINE=OLAP + UNIQUE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1", + "enable_unique_key_merge_on_write" = "true", + "binlog.enable" = "true", + "binlog.format" = "ROW", + "binlog.need_historical_value" = "false" + ) + """ + sql "INSERT INTO ${incrMowNoHistoryBase} VALUES (1, 10)" + sql "sync" + sleep(1200) + def mowNoHistoryStartTimestamp = incrTimeFormat.format(new Date()) + sleep(1200) + sql "INSERT INTO ${incrMowNoHistoryBase} VALUES (1, 11)" + sql "sync" + + def mowNoHistoryDetailRows = sql """ + SELECT id, v1, __DORIS_BINLOG_OP__ + FROM ${incrMowNoHistoryBase}@incr('startTimestamp' = '${mowNoHistoryStartTimestamp}', + "incrementType" = "DETAIL") + ORDER BY __DORIS_BINLOG_LSN__ + """ + assertEquals(1, mowNoHistoryDetailRows.size()) + assertEquals("1", mowNoHistoryDetailRows[0][0].toString()) + assertEquals("11", mowNoHistoryDetailRows[0][1].toString()) + assertEquals("0", mowNoHistoryDetailRows[0][2].toString()) + + test { + sql """ + SELECT id, v1, __DORIS_BINLOG_OP__ + FROM ${incrMowNoHistoryBase}@incr('startTimestamp' = '${mowNoHistoryStartTimestamp}', + "incrementType" = "MIN_DELTA") + """ + exception "MIN_DELTA INCR query requires base table to enable binlog.need_historical_value=true" + } + } finally { + sql "DROP DATABASE IF EXISTS test_min_delta_stream_db" + } +} diff --git a/regression-test/suites/table_stream_p0/test_olap_table_stream_history_consumption.groovy b/regression-test/suites/table_stream_p0/test_olap_table_stream_history_consumption.groovy index c2d0a647936629..2653178150c273 100644 --- a/regression-test/suites/table_stream_p0/test_olap_table_stream_history_consumption.groovy +++ b/regression-test/suites/table_stream_p0/test_olap_table_stream_history_consumption.groovy @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-suite("test_olap_table_stream_history_consumption") { +suite("test_olap_table_stream_history_consumption", "nonConcurrent") { sql "DROP DATABASE IF EXISTS test_olap_table_stream_history_consumption_db" sql "CREATE DATABASE test_olap_table_stream_history_consumption_db" sql "USE test_olap_table_stream_history_consumption_db" @@ -29,7 +29,9 @@ suite("test_olap_table_stream_history_consumption") { DUPLICATE KEY(`sid`) DISTRIBUTED BY HASH(`sid`) BUCKETS 1 PROPERTIES ( - "replication_allocation" = "tag.location.default: 1" + "replication_allocation" = "tag.location.default: 1", + "binlog.enable" = "true", + "binlog.format" = "ROW" ); """ sql """ @@ -60,7 +62,9 @@ suite("test_olap_table_stream_history_consumption") { ) DISTRIBUTED BY HASH(`sname`) BUCKETS 1 PROPERTIES ( - "replication_allocation" = "tag.location.default: 1" + "replication_allocation" = "tag.location.default: 1", + "binlog.enable" = "true", + "binlog.format" = "ROW" ); """ @@ -93,7 +97,9 @@ suite("test_olap_table_stream_history_consumption") { ) DISTRIBUTED BY HASH(`sname`) BUCKETS 1 PROPERTIES ( - "replication_allocation" = "tag.location.default: 1" + "replication_allocation" = "tag.location.default: 1", + "binlog.enable" = "true", + "binlog.format" = "ROW" ); """ @@ -143,6 +149,47 @@ suite("test_olap_table_stream_history_consumption") { qt_sql "select * from s2" qt_sql "select * from s3" qt_sql "select * from target order by sid" - qt_sql "select DB_NAME,STREAM_NAME,UNIT,CONSUMPTION_STATUS,LAG from information_schema.table_stream_consumption where DB_NAME = 'test_olap_table_stream_history_consumption_db' order by STREAM_NAME, UNIT;" + def consumptionRows = sql """ + select DB_NAME, STREAM_NAME, UNIT, CONSUMPTION_STATUS, LAG + from information_schema.table_stream_consumption + where DB_NAME = 'test_olap_table_stream_history_consumption_db' + order by STREAM_NAME, UNIT + """ + assertEquals(6, consumptionRows.size()) + + assertEquals("test_olap_table_stream_history_consumption_db", 
consumptionRows[0][0].toString()) + assertEquals("s1", consumptionRows[0][1].toString()) + assertEquals("tbl1", consumptionRows[0][2].toString()) + assertTrue(consumptionRows[0][3] != null) + assertTrue(consumptionRows[0][3].toString() != "N/A") + assertEquals("0", consumptionRows[0][4].toString()) + + assertEquals("s2", consumptionRows[1][1].toString()) + assertEquals("p1", consumptionRows[1][2].toString()) + assertEquals("N/A", consumptionRows[1][3].toString()) + assertEquals("1", consumptionRows[1][4].toString()) + + assertEquals("s2", consumptionRows[2][1].toString()) + assertEquals("p2", consumptionRows[2][2].toString()) + assertTrue(consumptionRows[2][3] != null) + assertTrue(consumptionRows[2][3].toString() != "N/A") + assertEquals("0", consumptionRows[2][4].toString()) + + assertEquals("s3", consumptionRows[3][1].toString()) + assertEquals("p1", consumptionRows[3][2].toString()) + assertEquals("N/A", consumptionRows[3][3].toString()) + assertEquals("1", consumptionRows[3][4].toString()) + + assertEquals("s3", consumptionRows[4][1].toString()) + assertEquals("p2", consumptionRows[4][2].toString()) + assertEquals("N/A", consumptionRows[4][3].toString()) + assertEquals("1", consumptionRows[4][4].toString()) + + assertEquals("s3", consumptionRows[5][1].toString()) + assertEquals("p3", consumptionRows[5][2].toString()) + assertTrue(consumptionRows[5][3] != null) + assertTrue(consumptionRows[5][3].toString() != "N/A") + assertEquals("0", consumptionRows[5][4].toString()) + sql "DROP DATABASE IF EXISTS test_olap_table_stream_history_consumption_db" -} \ No newline at end of file +} diff --git a/regression-test/suites/table_stream_p0/test_olap_table_stream_history_query.groovy b/regression-test/suites/table_stream_p0/test_olap_table_stream_history_query.groovy index 46dfc5f691f77e..79fd31348fceda 100644 --- a/regression-test/suites/table_stream_p0/test_olap_table_stream_history_query.groovy +++ 
b/regression-test/suites/table_stream_p0/test_olap_table_stream_history_query.groovy @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -suite("test_olap_table_stream_history_query") { +suite("test_olap_table_stream_history_query", "nonConcurrent") { sql "DROP DATABASE IF EXISTS test_olap_table_stream_history_query_db" sql "CREATE DATABASE test_olap_table_stream_history_query_db" sql "USE test_olap_table_stream_history_query_db" @@ -29,7 +29,9 @@ suite("test_olap_table_stream_history_query") { UNIQUE KEY(`sid`) DISTRIBUTED BY HASH(`sid`) BUCKETS 1 PROPERTIES ( - "replication_allocation" = "tag.location.default: 1" + "replication_allocation" = "tag.location.default: 1", + "binlog.enable" = "true", + "binlog.format" = "ROW" ); """ sql """ @@ -54,7 +56,9 @@ suite("test_olap_table_stream_history_query") { DUPLICATE KEY(`sid`) DISTRIBUTED BY HASH(`sid`) BUCKETS 1 PROPERTIES ( - "replication_allocation" = "tag.location.default: 1" + "replication_allocation" = "tag.location.default: 1", + "binlog.enable" = "true", + "binlog.format" = "ROW" ); """ @@ -129,4 +133,4 @@ suite("test_olap_table_stream_history_query") { qt_sql "select __DORIS_STREAM_CHANGE_TYPE_COL__, count(*) from s2 group by 1" sql "DROP DATABASE IF EXISTS test_olap_table_stream_history_query_db" -} \ No newline at end of file +} diff --git a/regression-test/suites/tso_p0/test_tso_rowset_commit_tso.groovy b/regression-test/suites/tso_p0/test_tso_rowset_commit_tso.groovy index 1ff98b996be2aa..bf06b8b09ab7f9 100644 --- a/regression-test/suites/tso_p0/test_tso_rowset_commit_tso.groovy +++ b/regression-test/suites/tso_p0/test_tso_rowset_commit_tso.groovy @@ -33,7 +33,7 @@ suite("test_tso_rowset_commit_tso", "nonConcurrent") { id INT ) DISTRIBUTED BY HASH(id) BUCKETS 1 - PROPERTIES ("replication_num" = "1", "enable_tso" = "true", "disable_auto_compaction" = "true") + PROPERTIES ("replication_num" = "1", "binlog.format" = "ROW", "disable_auto_compaction" = "true") """ 
sql """INSERT INTO ${tableName} VALUES (1), (2), (3)"""