diff --git a/be/src/olap/partial_update_info.cpp b/be/src/olap/partial_update_info.cpp index 518b29dcaa4867..928ae80b38fc8f 100644 --- a/be/src/olap/partial_update_info.cpp +++ b/be/src/olap/partial_update_info.cpp @@ -433,9 +433,12 @@ Status FixedReadPlan::fill_missing_columns( DCHECK(column.type() == FieldType::OLAP_FIELD_TYPE_BIGINT); auto* auto_inc_column = assert_cast(missing_col.get()); - auto_inc_column->insert_from( - *block->get_by_name(BeConsts::PARTIAL_UPDATE_AUTO_INC_COL).column.get(), - idx); + int pos = block->get_position_by_name(BeConsts::PARTIAL_UPDATE_AUTO_INC_COL); + if (pos == -1) { + return Status::InternalError("auto increment column not found in block {}", + block->dump_structure()); + } + auto_inc_column->insert_from(*block->get_by_position(pos).column.get(), idx); } else { // If the control flow reaches this branch, the column neither has default value // nor is nullable. It means that the row's delete sign is marked, and the value diff --git a/be/src/vec/core/block.cpp b/be/src/vec/core/block.cpp index 6a513d9517beeb..7da1967916a5a6 100644 --- a/be/src/vec/core/block.cpp +++ b/be/src/vec/core/block.cpp @@ -241,24 +241,6 @@ const ColumnWithTypeAndName& Block::safe_get_by_position(size_t position) const return data[position]; } -ColumnWithTypeAndName& Block::get_by_name(const std::string& name) { - int pos = get_position_by_name(name); - if (pos == -1) { - throw Exception(ErrorCode::INTERNAL_ERROR, "No such name in Block, name={}, block_names={}", - name, dump_names()); - } - return data[pos]; -} - -const ColumnWithTypeAndName& Block::get_by_name(const std::string& name) const { - int pos = get_position_by_name(name); - if (pos == -1) { - throw Exception(ErrorCode::INTERNAL_ERROR, "No such name in Block, name={}, block_names={}", - name, dump_names()); - } - return data[pos]; -} - int Block::get_position_by_name(const std::string& name) const { for (int i = 0; i < data.size(); i++) { if (data[i].name == name) { diff --git 
a/be/src/vec/core/block.h b/be/src/vec/core/block.h index d2128b964e4703..b99f2ec09ada41 100644 --- a/be/src/vec/core/block.h +++ b/be/src/vec/core/block.h @@ -116,9 +116,13 @@ class Block { std::swap(data, new_data); } - // Use this method only when you are certain index_by_name will not be used - // This is a temporary compromise; index_by_name may be removed in the future - void simple_insert(const ColumnWithTypeAndName& elem) { data.emplace_back(elem); } + std::unordered_map get_name_to_pos_map() const { + std::unordered_map name_to_index_map; + for (uint32_t i = 0; i < data.size(); ++i) { + name_to_index_map[data[i].name] = i; + } + return name_to_index_map; + } /// References are invalidated after calling functions above. ColumnWithTypeAndName& get_by_position(size_t position) { @@ -144,11 +148,6 @@ class Block { ColumnWithTypeAndName& safe_get_by_position(size_t position); const ColumnWithTypeAndName& safe_get_by_position(size_t position) const; - // Get column by name. Throws an exception if there is no column with that name. - // ATTN: this method is O(N). better maintain name -> position map in caller if you need to call it frequently. 
- ColumnWithTypeAndName& get_by_name(const std::string& name); - const ColumnWithTypeAndName& get_by_name(const std::string& name) const; - Container::iterator begin() { return data.begin(); } Container::iterator end() { return data.end(); } Container::const_iterator begin() const { return data.begin(); } diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp index 59d3ece9356066..93c62deaa1d78a 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.cpp +++ b/be/src/vec/exec/format/orc/vorc_reader.cpp @@ -1283,11 +1283,10 @@ Status OrcReader::_fill_partition_columns( const std::unordered_map>& partition_columns) { DataTypeSerDe::FormatOptions _text_formatOptions; + // todo: maybe do not need to build name to index map every time + auto name_to_pos_map = block->get_name_to_pos_map(); for (const auto& kv : partition_columns) { - auto doris_column = block->get_by_name(kv.first).column; - // block is a Block*, and get_by_name returns a ColumnPtr, - // which is a const pointer. Therefore, using const_cast is permissible. 
- auto* col_ptr = const_cast(doris_column.get()); + auto col_ptr = block->get_by_position(name_to_pos_map[kv.first]).column->assume_mutable(); const auto& [value, slot_desc] = kv.second; auto text_serde = slot_desc->get_data_type_ptr()->get_serde(); Slice slice(value.data(), value.size()); @@ -1312,10 +1311,18 @@ Status OrcReader::_fill_partition_columns( Status OrcReader::_fill_missing_columns( Block* block, uint64_t rows, const std::unordered_map& missing_columns) { + // todo: maybe do not need to build name to index map every time + auto name_to_pos_map = block->get_name_to_pos_map(); + std::set positions_to_erase; for (const auto& kv : missing_columns) { + if (!name_to_pos_map.contains(kv.first)) { + return Status::InternalError("Failed to find missing column: {}, block: {}", kv.first, + block->dump_structure()); + } if (kv.second == nullptr) { // no default column, fill with null - auto mutable_column = block->get_by_name(kv.first).column->assume_mutable(); + auto mutable_column = + block->get_by_position(name_to_pos_map[kv.first]).column->assume_mutable(); auto* nullable_column = static_cast(mutable_column.get()); nullable_column->insert_many_defaults(rows); } else { @@ -1335,19 +1342,16 @@ Status OrcReader::_fill_missing_columns( mutable_column->resize(rows); // result_column_ptr maybe a ColumnConst, convert it to a normal column result_column_ptr = result_column_ptr->convert_to_full_column_if_const(); - auto origin_column_type = block->get_by_name(kv.first).type; + auto origin_column_type = block->get_by_position(name_to_pos_map[kv.first]).type; bool is_nullable = origin_column_type->is_nullable(); - int pos = block->get_position_by_name(kv.first); - if (pos == -1) { - return Status::InternalError("Failed to find column: {}, block: {}", kv.first, - block->dump_structure()); - } block->replace_by_position( - pos, is_nullable ? make_nullable(result_column_ptr) : result_column_ptr); - block->erase(result_column_id); + name_to_pos_map[kv.first], + is_nullable ? 
make_nullable(result_column_ptr) : result_column_ptr); + positions_to_erase.insert(result_column_id); } } } + block->erase(positions_to_erase); return Status::OK(); } @@ -1988,8 +1992,10 @@ Status OrcReader::_get_next_block_impl(Block* block, size_t* read_rows, bool* eo std::vector batch_vec; _fill_batch_vec(batch_vec, _batch.get(), 0); + // todo: maybe do not need to build name to index map every time + auto name_to_pos_map = block->get_name_to_pos_map(); for (auto& col_name : _lazy_read_ctx.lazy_read_columns) { - auto& column_with_type_and_name = block->get_by_name(col_name); + auto& column_with_type_and_name = block->get_by_position(name_to_pos_map[col_name]); auto& column_ptr = column_with_type_and_name.column; auto& column_type = column_with_type_and_name.type; auto file_column_name = _table_info_node_ptr->children_file_column_name(col_name); @@ -2055,15 +2061,17 @@ Status OrcReader::_get_next_block_impl(Block* block, size_t* read_rows, bool* eo } } + // todo: maybe do not need to build name to index map every time + auto name_to_pos_map = block->get_name_to_pos_map(); if (!_dict_cols_has_converted && !_dict_filter_cols.empty()) { for (auto& dict_filter_cols : _dict_filter_cols) { MutableColumnPtr dict_col_ptr = ColumnInt32::create(); - int pos = block->get_position_by_name(dict_filter_cols.first); - if (pos == -1) { + if (!name_to_pos_map.contains(dict_filter_cols.first)) { return Status::InternalError( "Failed to find dict filter column '{}' in block {}", dict_filter_cols.first, block->dump_structure()); } + auto pos = name_to_pos_map[dict_filter_cols.first]; auto& column_with_type_and_name = block->get_by_position(pos); auto& column_type = column_with_type_and_name.type; if (column_type->is_nullable()) { @@ -2085,7 +2093,7 @@ Status OrcReader::_get_next_block_impl(Block* block, size_t* read_rows, bool* eo _fill_batch_vec(batch_vec, _batch.get(), 0); for (auto& col_name : _lazy_read_ctx.all_read_columns) { - auto& column_with_type_and_name = 
block->get_by_name(col_name); + auto& column_with_type_and_name = block->get_by_position(name_to_pos_map[col_name]); auto& column_ptr = column_with_type_and_name.column; auto& column_type = column_with_type_and_name.type; auto file_column_name = _table_info_node_ptr->children_file_column_name(col_name); @@ -2196,19 +2204,27 @@ void OrcReader::_build_delete_row_filter(const Block* block, size_t rows) { if (_delete_rows != nullptr) { _delete_rows_filter_ptr = std::make_unique(rows, 1); auto* __restrict _pos_delete_filter_data = _delete_rows_filter_ptr->data(); + // todo: maybe do not need to build name to index map every time + auto name_to_pos_map = block->get_name_to_pos_map(); const auto& original_transaction_column = assert_cast(*remove_nullable( - block->get_by_name(TransactionalHive::ORIGINAL_TRANSACTION_LOWER_CASE).column)); - const auto& bucket_id_column = assert_cast( - *remove_nullable(block->get_by_name(TransactionalHive::BUCKET_LOWER_CASE).column)); - const auto& row_id_column = assert_cast( - *remove_nullable(block->get_by_name(TransactionalHive::ROW_ID_LOWER_CASE).column)); + block->get_by_position( + name_to_pos_map[TransactionalHive::ORIGINAL_TRANSACTION_LOWER_CASE]) + .column)); + const auto& bucket_id_column = assert_cast(*remove_nullable( + block->get_by_position(name_to_pos_map[TransactionalHive::BUCKET_LOWER_CASE]) + .column)); + const auto& row_id_column = assert_cast(*remove_nullable( + block->get_by_position(name_to_pos_map[TransactionalHive::ROW_ID_LOWER_CASE]) + .column)); for (int i = 0; i < rows; ++i) { auto original_transaction = original_transaction_column.get_int(i); auto bucket_id = bucket_id_column.get_int(i); auto row_id = row_id_column.get_int(i); - TransactionalHiveReader::AcidRowID transactional_row_id = {original_transaction, - bucket_id, row_id}; + TransactionalHiveReader::AcidRowID transactional_row_id = { + .original_transaction = original_transaction, + .bucket = bucket_id, + .row_id = row_id}; if 
(_delete_rows->contains(transactional_row_id)) { _pos_delete_filter_data[i] = 0; } @@ -2222,13 +2238,15 @@ Status OrcReader::filter(orc::ColumnVectorBatch& data, uint16_t* sel, uint16_t s size_t origin_column_num = block->columns(); if (!_dict_cols_has_converted && !_dict_filter_cols.empty()) { + // todo: maybe do not need to build name to index map every time + auto name_to_pos_map = block->get_name_to_pos_map(); for (auto& dict_filter_cols : _dict_filter_cols) { - MutableColumnPtr dict_col_ptr = ColumnInt32::create(); - int pos = block->get_position_by_name(dict_filter_cols.first); - if (pos == -1) { - return Status::InternalError("Wrong read column '{}' in orc file, block: {}", + if (!name_to_pos_map.contains(dict_filter_cols.first)) { + return Status::InternalError("Failed to find dict filter column '{}' in block {}", dict_filter_cols.first, block->dump_structure()); } + MutableColumnPtr dict_col_ptr = ColumnInt32::create(); + auto pos = name_to_pos_map[dict_filter_cols.first]; auto& column_with_type_and_name = block->get_by_position(pos); auto& column_type = column_with_type_and_name.type; if (column_type->is_nullable()) { @@ -2254,8 +2272,10 @@ Status OrcReader::filter(orc::ColumnVectorBatch& data, uint16_t* sel, uint16_t s TransactionalHive::READ_ROW_COLUMN_NAMES_LOWER_CASE.begin(), TransactionalHive::READ_ROW_COLUMN_NAMES_LOWER_CASE.end()); } + // todo: maybe do not need to build name to index map every time + auto name_to_pos_map = block->get_name_to_pos_map(); for (auto& table_col_name : table_col_names) { - auto& column_with_type_and_name = block->get_by_name(table_col_name); + auto& column_with_type_and_name = block->get_by_position(name_to_pos_map[table_col_name]); auto& column_ptr = column_with_type_and_name.column; auto& column_type = column_with_type_and_name.type; auto file_column_name = _table_info_node_ptr->children_file_column_name(table_col_name); @@ -2307,13 +2327,13 @@ Status OrcReader::filter(orc::ColumnVectorBatch& data, uint16_t* sel, 
uint16_t s if (can_filter_all) { for (auto& col : table_col_names) { // clean block to read predicate columns and acid columns - block->get_by_name(col).column->assume_mutable()->clear(); + block->get_by_position(name_to_pos_map[col]).column->assume_mutable()->clear(); } for (auto& col : _lazy_read_ctx.predicate_partition_columns) { - block->get_by_name(col.first).column->assume_mutable()->clear(); + block->get_by_position(name_to_pos_map[col.first]).column->assume_mutable()->clear(); } for (auto& col : _lazy_read_ctx.predicate_missing_columns) { - block->get_by_name(col.first).column->assume_mutable()->clear(); + block->get_by_position(name_to_pos_map[col.first]).column->assume_mutable()->clear(); } Block::erase_useless_column(block, origin_column_num); RETURN_IF_ERROR(_convert_dict_cols_to_string_cols(block, nullptr)); @@ -2627,12 +2647,14 @@ Status OrcReader::_convert_dict_cols_to_string_cols( return Status::OK(); } if (!_dict_filter_cols.empty()) { + // todo: maybe do not need to build name to index map every time + auto name_to_pos_map = block->get_name_to_pos_map(); for (auto& dict_filter_cols : _dict_filter_cols) { - int pos = block->get_position_by_name(dict_filter_cols.first); - if (pos == -1) { - return Status::InternalError("Wrong read column '{}' in orc file, block: {}", + if (!name_to_pos_map.contains(dict_filter_cols.first)) { + return Status::InternalError("Failed to find dict filter column '{}' in block {}", dict_filter_cols.first, block->dump_structure()); } + auto pos = name_to_pos_map[dict_filter_cols.first]; ColumnWithTypeAndName& column_with_type_and_name = block->get_by_position(pos); const ColumnPtr& column = column_with_type_and_name.column; diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp index f9de3648d07901..82e829f6199339 100644 --- a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp @@ 
-25,6 +25,7 @@
 #include
 #include
+#include
 #include
 #include "common/config.h"
@@ -392,30 +393,32 @@ Status RowGroupReader::_read_column_data(Block* block,
                                          FilterMap& filter_map) {
     size_t batch_read_rows = 0;
     bool has_eof = false;
+    // todo: maybe do not need to build name to index map every time
+    auto name_to_idx = block->get_name_to_pos_map();
     for (auto& read_col_name : table_columns) {
-        auto& column_with_type_and_name = block->get_by_name(read_col_name);
+        // Validate first: operator[] below would insert a default 0 for an unknown name.
+        if (!name_to_idx.contains(read_col_name)) {
+            return Status::InternalError("Wrong read column '{}' in parquet file, block: {}",
+                                         read_col_name, block->dump_structure());
+        }
+        auto& column_with_type_and_name = block->safe_get_by_position(name_to_idx[read_col_name]);
         auto& column_ptr = column_with_type_and_name.column;
         auto& column_type = column_with_type_and_name.type;
         bool is_dict_filter = false;
         for (auto& _dict_filter_col : _dict_filter_cols) {
             if (_dict_filter_col.first == read_col_name) {
                 MutableColumnPtr dict_column = ColumnInt32::create();
-                int pos = block->get_position_by_name(read_col_name);
-                if (pos == -1) {
-                    return Status::InternalError(
-                            "Wrong read column '{}' in parquet file, block: {}",
-                            read_col_name, block->dump_structure());
-                }
                 if (column_type->is_nullable()) {
-                    block->get_by_position(pos).type =
+                    block->get_by_position(name_to_idx[read_col_name]).type =
                             std::make_shared(std::make_shared());
                     block->replace_by_position(
-                            pos,
+                            name_to_idx[read_col_name],
                             ColumnNullable::create(std::move(dict_column),
                                                    ColumnUInt8::create(dict_column->size(), 0)));
                 } else {
-                    block->get_by_position(pos).type = std::make_shared();
-                    block->replace_by_position(pos, std::move(dict_column));
+                    block->get_by_position(name_to_idx[read_col_name]).type =
+                            std::make_shared();
+                    block->replace_by_position(name_to_idx[read_col_name], std::move(dict_column));
                 }
                 is_dict_filter = true;
                 break;
@@ -516,20 +519,25 @@ Status RowGroupReader::_do_lazy_read(Block* block, size_t batch_size, size_t* re
         }
         const uint8_t* __restrict filter_map_data = result_filter.data();
-        filter_map_ptr.reset(new FilterMap());
+        filter_map_ptr = std::make_unique();
RETURN_IF_ERROR(filter_map_ptr->init(filter_map_data, pre_read_rows, can_filter_all)); if (filter_map_ptr->filter_all()) { { SCOPED_RAW_TIMER(&_predicate_filter_time); - for (auto& col : _lazy_read_ctx.predicate_columns.first) { + auto name_to_idx = block->get_name_to_pos_map(); + for (const auto& col : _lazy_read_ctx.predicate_columns.first) { // clean block to read predicate columns - block->get_by_name(col).column->assume_mutable()->clear(); + block->get_by_position(name_to_idx[col]).column->assume_mutable()->clear(); } - for (auto& col : _lazy_read_ctx.predicate_partition_columns) { - block->get_by_name(col.first).column->assume_mutable()->clear(); + for (const auto& col : _lazy_read_ctx.predicate_partition_columns) { + block->get_by_position(name_to_idx[col.first]) + .column->assume_mutable() + ->clear(); } - for (auto& col : _lazy_read_ctx.predicate_missing_columns) { - block->get_by_name(col.first).column->assume_mutable()->clear(); + for (const auto& col : _lazy_read_ctx.predicate_missing_columns) { + block->get_by_position(name_to_idx[col.first]) + .column->assume_mutable() + ->clear(); } if (_row_id_column_iterator_pair.first != nullptr) { block->get_by_position(_row_id_column_iterator_pair.second) @@ -665,11 +673,12 @@ Status RowGroupReader::_fill_partition_columns( const std::unordered_map>& partition_columns) { DataTypeSerDe::FormatOptions _text_formatOptions; - for (auto& kv : partition_columns) { - auto doris_column = block->get_by_name(kv.first).column; + auto name_to_idx = block->get_name_to_pos_map(); + for (const auto& kv : partition_columns) { + auto doris_column = block->get_by_position(name_to_idx[kv.first]).column; // obtained from block*, it is a mutable object. 
- IColumn* col_ptr = const_cast(doris_column.get()); - auto& [value, slot_desc] = kv.second; + auto* col_ptr = const_cast(doris_column.get()); + const auto& [value, slot_desc] = kv.second; auto _text_serde = slot_desc->get_data_type_ptr()->get_serde(); Slice slice(value.data(), value.size()); uint64_t num_deserialized = 0; @@ -694,15 +703,23 @@ Status RowGroupReader::_fill_partition_columns( Status RowGroupReader::_fill_missing_columns( Block* block, size_t rows, const std::unordered_map& missing_columns) { - for (auto& kv : missing_columns) { + // todo: maybe do not need to build name to index map every time + auto name_to_idx = block->get_name_to_pos_map(); + std::set positions_to_erase; + for (const auto& kv : missing_columns) { + if (!name_to_idx.contains(kv.first)) { + return Status::InternalError("Missing column: {} not found in block {}", kv.first, + block->dump_structure()); + } if (kv.second == nullptr) { // no default column, fill with null - auto mutable_column = block->get_by_name(kv.first).column->assume_mutable(); + auto mutable_column = + block->get_by_position(name_to_idx[kv.first]).column->assume_mutable(); auto* nullable_column = assert_cast(mutable_column.get()); nullable_column->insert_many_defaults(rows); } else { // fill with default value - auto& ctx = kv.second; + const auto& ctx = kv.second; auto origin_column_num = block->columns(); int result_column_id = -1; // PT1 => dest primitive type @@ -717,20 +734,16 @@ Status RowGroupReader::_fill_missing_columns( mutable_column->resize(rows); // result_column_ptr maybe a ColumnConst, convert it to a normal column result_column_ptr = result_column_ptr->convert_to_full_column_if_const(); - auto origin_column_type = block->get_by_name(kv.first).type; + auto origin_column_type = block->get_by_position(name_to_idx[kv.first]).type; bool is_nullable = origin_column_type->is_nullable(); - int pos = block->get_position_by_name(kv.first); - if (pos == -1) { - return Status::InternalError( - "Wrong missing 
column '{}' in parquet file, block: {}", kv.first, - block->dump_structure()); - } block->replace_by_position( - pos, is_nullable ? make_nullable(result_column_ptr) : result_column_ptr); - block->erase(result_column_id); + name_to_idx[kv.first], + is_nullable ? make_nullable(result_column_ptr) : result_column_ptr); + positions_to_erase.insert(result_column_id); } } } + block->erase(positions_to_erase); return Status::OK(); } @@ -1082,18 +1095,20 @@ Status RowGroupReader::_rewrite_dict_conjuncts(std::vector& dict_codes, } void RowGroupReader::_convert_dict_cols_to_string_cols(Block* block) { + // todo: maybe do not need to build name to index map every time + auto name_to_idx = block->get_name_to_pos_map(); for (auto& dict_filter_cols : _dict_filter_cols) { - int pos = block->get_position_by_name(dict_filter_cols.first); - if (pos == -1) { + if (!name_to_idx.contains(dict_filter_cols.first)) { throw Exception(ErrorCode::INTERNAL_ERROR, "Wrong read column '{}' in parquet file, block: {}", dict_filter_cols.first, block->dump_structure()); } - ColumnWithTypeAndName& column_with_type_and_name = block->get_by_position(pos); + ColumnWithTypeAndName& column_with_type_and_name = + block->get_by_position(name_to_idx[dict_filter_cols.first]); const ColumnPtr& column = column_with_type_and_name.column; - if (auto* nullable_column = check_and_get_column(*column)) { + if (const auto* nullable_column = check_and_get_column(*column)) { const ColumnPtr& nested_column = nullable_column->get_nested_column_ptr(); - const ColumnInt32* dict_column = assert_cast(nested_column.get()); + const auto* dict_column = assert_cast(nested_column.get()); DCHECK(dict_column); MutableColumnPtr string_column = @@ -1103,16 +1118,18 @@ void RowGroupReader::_convert_dict_cols_to_string_cols(Block* block) { column_with_type_and_name.type = std::make_shared(std::make_shared()); block->replace_by_position( - pos, ColumnNullable::create(std::move(string_column), - 
nullable_column->get_null_map_column_ptr())); + name_to_idx[dict_filter_cols.first], + ColumnNullable::create(std::move(string_column), + nullable_column->get_null_map_column_ptr())); } else { - const ColumnInt32* dict_column = assert_cast(column.get()); + const auto* dict_column = assert_cast(column.get()); MutableColumnPtr string_column = _column_readers[dict_filter_cols.first]->convert_dict_column_to_string_column( dict_column); column_with_type_and_name.type = std::make_shared(); - block->replace_by_position(pos, std::move(string_column)); + block->replace_by_position(name_to_idx[dict_filter_cols.first], + std::move(string_column)); } } } diff --git a/be/src/vec/exec/format/table/equality_delete.cpp b/be/src/vec/exec/format/table/equality_delete.cpp index b2bc1408fc53df..48914a021441d8 100644 --- a/be/src/vec/exec/format/table/equality_delete.cpp +++ b/be/src/vec/exec/format/table/equality_delete.cpp @@ -45,7 +45,12 @@ Status SimpleEqualityDelete::_build_set() { Status SimpleEqualityDelete::filter_data_block(Block* data_block) { SCOPED_TIMER(equality_delete_time); - auto column_and_type = data_block->get_by_name(_delete_column_name); + int pos = data_block->get_position_by_name(_delete_column_name); + if (pos == -1) { + return Status::InternalError("Column '{}' not found in data block: {}", _delete_column_name, + data_block->dump_structure()); + } + auto column_and_type = data_block->get_by_position(pos); if (column_and_type.type->get_primitive_type() != _delete_column_type) { return Status::InternalError( "Not support type change in column '{}', src type: {}, target type: {}", @@ -104,20 +109,22 @@ Status MultiEqualityDelete::_build_set() { Status MultiEqualityDelete::filter_data_block(Block* data_block) { SCOPED_TIMER(equality_delete_time); size_t column_index = 0; - for (std::string column_name : _delete_block->get_names()) { - auto column_and_type = data_block->get_by_name(column_name); - if 
(!_delete_block->get_by_name(column_name).type->equals(*column_and_type.type)) {
-            return Status::InternalError(
-                    "Not support type change in column '{}', src type: {}, target type: {}",
-                    column_name, _delete_block->get_by_name(column_name).type->get_name(),
-                    column_and_type.type->get_name());
-        }
-        int pos = data_block->get_position_by_name(column_name);
-        if (pos == -1) {
+    // todo: maybe do not need to build name to index map every time
+    auto name_to_pos_map = data_block->get_name_to_pos_map();
+    for (const auto& delete_col : _delete_block->get_columns_with_type_and_name()) {
+        const std::string& column_name = delete_col.name;
+        const auto pos_it = name_to_pos_map.find(column_name); // find() never inserts, unlike []
+        if (pos_it == name_to_pos_map.end()) {
             return Status::InternalError("Column '{}' not found in data block: {}", column_name,
                                          data_block->dump_structure());
         }
-        _data_column_index[column_index++] = pos;
+        const auto& column_and_type = data_block->safe_get_by_position(pos_it->second);
+        if (!delete_col.type->equals(*column_and_type.type)) {
+            return Status::InternalError(
+                    "Not support type change in column '{}', src type: {}, target type: {}",
+                    column_name, delete_col.type->get_name(), column_and_type.type->get_name());
+        }
+        _data_column_index[column_index++] = pos_it->second;
     }
     size_t rows = data_block->rows();
     _data_hashes.clear();
diff --git a/be/src/vec/exec/format/table/iceberg_reader.cpp b/be/src/vec/exec/format/table/iceberg_reader.cpp
index f26cb2d7c441f6..a5b5de40c37a51 100644
--- a/be/src/vec/exec/format/table/iceberg_reader.cpp
+++ b/be/src/vec/exec/format/table/iceberg_reader.cpp
@@ -223,25 +223,32 @@ void IcebergTableReader::_generate_equality_delete_block(
 }
 Status IcebergTableReader::_expand_block_if_need(Block* block) {
+    std::set names;
+    auto block_names = block->get_names();
+    names.insert(block_names.begin(), block_names.end());
     for (auto& col : _expand_columns) {
         col.column->assume_mutable()->clear();
-        if (block->get_position_by_name(col.name) != -1) {
+        if
(names.contains(col.name)) { return Status::InternalError("Wrong expand column '{}'", col.name); } + names.insert(col.name); block->insert(col); } return Status::OK(); } Status IcebergTableReader::_shrink_block_if_need(Block* block) { + // todo: maybe do not need to build name to index map every time + auto name_to_pos_map = block->get_name_to_pos_map(); + std::set positions_to_erase; for (const std::string& expand_col : _expand_col_names) { - int pos = block->get_position_by_name(expand_col); - if (pos == -1) { + if (!name_to_pos_map.contains(expand_col)) { return Status::InternalError("Wrong erase column '{}', block: {}", expand_col, block->dump_names()); } - block->erase(pos); + positions_to_erase.emplace(name_to_pos_map[expand_col]); } + block->erase(positions_to_erase); return Status::OK(); } @@ -388,9 +395,11 @@ void IcebergTableReader::_sort_delete_rows(std::vector*>& d void IcebergTableReader::_gen_position_delete_file_range(Block& block, DeleteFile* position_delete, size_t read_rows, bool file_path_column_dictionary_coded) { - ColumnPtr path_column = block.get_by_name(ICEBERG_FILE_PATH).column; + // todo: maybe do not need to build name to index map every time + auto name_to_pos_map = block.get_name_to_pos_map(); + ColumnPtr path_column = block.get_by_position(name_to_pos_map[ICEBERG_FILE_PATH]).column; DCHECK_EQ(path_column->size(), read_rows); - ColumnPtr pos_column = block.get_by_name(ICEBERG_ROW_POS).column; + ColumnPtr pos_column = block.get_by_position(name_to_pos_map[ICEBERG_ROW_POS]).column; using ColumnType = typename PrimitiveTypeTraits::ColumnType; const int64_t* src_data = assert_cast(*pos_column).get_data().data(); IcebergTableReader::PositionDeleteRange range; diff --git a/be/src/vec/exec/format/table/remote_doris_reader.cpp b/be/src/vec/exec/format/table/remote_doris_reader.cpp index fa0f8566c0f49f..ca56de0d48f6e5 100644 --- a/be/src/vec/exec/format/table/remote_doris_reader.cpp +++ b/be/src/vec/exec/format/table/remote_doris_reader.cpp @@ 
-76,14 +76,20 @@ Status RemoteDorisReader::get_next_block(Block* block, size_t* read_rows, bool* auto batch = chunk.data; auto num_rows = batch->num_rows(); auto num_columns = batch->num_columns(); + // todo: maybe do not need to build name to index map every time + auto name_to_pos_map = block->get_name_to_pos_map(); for (int c = 0; c < num_columns; ++c) { arrow::Array* column = batch->column(c).get(); std::string column_name = batch->schema()->field(c)->name(); + if (!name_to_pos_map.contains(column_name)) { + return Status::InternalError("column {} not found in block {}", column_name, + block->dump_structure()); + } try { const vectorized::ColumnWithTypeAndName& column_with_name = - block->get_by_name(column_name); + block->get_by_position(name_to_pos_map[column_name]); RETURN_IF_ERROR(column_with_name.type->get_serde()->read_column_from_arrow( column_with_name.column->assume_mutable_ref(), column, 0, num_rows, _ctzz)); } catch (Exception& e) { diff --git a/be/src/vec/exec/jni_connector.cpp b/be/src/vec/exec/jni_connector.cpp index 87a6ec8c224156..b03081dfb10b67 100644 --- a/be/src/vec/exec/jni_connector.cpp +++ b/be/src/vec/exec/jni_connector.cpp @@ -324,8 +324,10 @@ Status JniConnector::_fill_block(Block* block, size_t num_rows) { SCOPED_RAW_TIMER(&_fill_block_watcher); JNIEnv* env = nullptr; RETURN_IF_ERROR(JniUtil::GetJNIEnv(&env)); + // todo: maybe do not need to build name to index map every time + auto name_to_pos_map = block->get_name_to_pos_map(); for (int i = 0; i < _column_names.size(); ++i) { - auto& column_with_type_and_name = block->get_by_name(_column_names[i]); + auto& column_with_type_and_name = block->get_by_position(name_to_pos_map[_column_names[i]]); auto& column_ptr = column_with_type_and_name.column; auto& column_type = column_with_type_and_name.type; RETURN_IF_ERROR(_fill_column(_table_meta, column_ptr, column_type, num_rows)); diff --git a/be/src/vec/exec/scan/file_scanner.cpp b/be/src/vec/exec/scan/file_scanner.cpp index 
db06796d78f670..2aec305170376b 100644 --- a/be/src/vec/exec/scan/file_scanner.cpp +++ b/be/src/vec/exec/scan/file_scanner.cpp @@ -544,6 +544,9 @@ Status FileScanner::_check_output_block_types() { Status FileScanner::_init_src_block(Block* block) { if (!_is_load) { _src_block_ptr = block; + + // todo: maybe do not need to build name to index map every time + _src_block_name_to_idx = block->get_name_to_pos_map(); return Status::OK(); } RETURN_IF_ERROR(_check_output_block_types()); @@ -610,7 +613,7 @@ Status FileScanner::_cast_to_input_block(Block* block) { // skip variant type continue; } - auto& arg = _src_block_ptr->get_by_name(slot_desc->col_name()); + auto& arg = _src_block_ptr->get_by_position(_src_block_name_to_idx[slot_desc->col_name()]); auto return_type = slot_desc->get_data_type_ptr(); // remove nullable here, let the get_function decide whether nullable auto data_type = get_data_type_with_default_argument(remove_nullable(return_type)); @@ -638,7 +641,8 @@ Status FileScanner::_fill_columns_from_path(size_t rows) { } DataTypeSerDe::FormatOptions _text_formatOptions; for (auto& kv : _partition_col_descs) { - auto doris_column = _src_block_ptr->get_by_name(kv.first).column; + auto doris_column = + _src_block_ptr->get_by_position(_src_block_name_to_idx[kv.first]).column; // _src_block_ptr points to a mutable block created by this class itself, so const_cast can be used here. 
IColumn* col_ptr = const_cast(doris_column.get());
         auto& [value, slot_desc] = kv.second;
@@ -671,7 +675,14 @@ Status FileScanner::_fill_missing_columns(size_t rows) {
     for (auto& kv : _missing_col_descs) {
+        // Validate once, up front: operator[] on the name map inserts position 0 for an
+        // unknown name, which would silently redirect both branches below to column 0.
+        if (!_src_block_name_to_idx.contains(kv.first)) {
+            return Status::InternalError("Column {} not found in src block {}", kv.first,
+                                         _src_block_ptr->dump_structure());
+        }
         if (kv.second == nullptr) {
             // no default column, fill with null
-            auto mutable_column = _src_block_ptr->get_by_name(kv.first).column->assume_mutable();
+            auto mutable_column = _src_block_ptr->get_by_position(_src_block_name_to_idx[kv.first])
+                                          .column->assume_mutable();
             auto* nullable_column = static_cast(mutable_column.get());
             nullable_column->insert_many_defaults(rows);
         } else {
@@ -691,15 +702,12 @@ Status FileScanner::_fill_missing_columns(size_t rows) {
                 mutable_column->resize(rows);
                 // result_column_ptr maybe a ColumnConst, convert it to a normal column
                 result_column_ptr = result_column_ptr->convert_to_full_column_if_const();
-                auto origin_column_type = _src_block_ptr->get_by_name(kv.first).type;
+                auto origin_column_type =
+                        _src_block_ptr->get_by_position(_src_block_name_to_idx[kv.first]).type;
                 bool is_nullable = origin_column_type->is_nullable();
-                int pos = _src_block_ptr->get_position_by_name(kv.first);
-                if (pos == -1) {
-                    return Status::InternalError("Column {} not found in src block {}", kv.first,
-                                                 _src_block_ptr->dump_structure());
-                }
                 _src_block_ptr->replace_by_position(
-                        pos, is_nullable ? make_nullable(result_column_ptr) : result_column_ptr);
+                        _src_block_name_to_idx[kv.first],
+                        is_nullable ? make_nullable(result_column_ptr) : result_column_ptr);
                 _src_block_ptr->erase(result_column_id);
             }
         }
diff --git a/be/src/vec/functions/function.cpp b/be/src/vec/functions/function.cpp
index dea9eab86b278c..af927fac913ac4 100644
--- a/be/src/vec/functions/function.cpp
+++ b/be/src/vec/functions/function.cpp
@@ -150,15 +150,15 @@ Status PreparedFunctionImpl::default_implementation_for_constant_arguments(
             // If we unpack it, there will be unnecessary cost of virtual judge.
if (args_expect_const.end() != std::find(args_expect_const.begin(), args_expect_const.end(), arg_num)) { - temporary_block.simple_insert({column.column, column.type, column.name}); + temporary_block.insert({column.column, column.type, column.name}); } else { - temporary_block.simple_insert( + temporary_block.insert( {assert_cast(column.column.get())->get_data_column_ptr(), column.type, column.name}); } } - temporary_block.simple_insert(block.get_by_position(result)); + temporary_block.insert(block.get_by_position(result)); ColumnNumbers temporary_argument_numbers(arguments_size); for (int i = 0; i < arguments_size; ++i) { @@ -209,9 +209,9 @@ Status PreparedFunctionImpl::default_implementation_for_nulls( for (int i = 0; i < args.size(); ++i) { uint32_t arg = args[i]; new_args.push_back(i); - new_block.simple_insert(block.get_by_position(arg).unnest_nullable(need_to_default)); + new_block.insert(block.get_by_position(arg).unnest_nullable(need_to_default)); } - new_block.simple_insert(block.get_by_position(result)); + new_block.insert(block.get_by_position(result)); int new_result = new_block.columns() - 1; RETURN_IF_ERROR(default_execute(context, new_block, new_args, new_result, block.rows())); diff --git a/be/test/vec/core/block_test.cpp b/be/test/vec/core/block_test.cpp index ba5e99c9d2bd88..9b316baca605c3 100644 --- a/be/test/vec/core/block_test.cpp +++ b/be/test/vec/core/block_test.cpp @@ -1162,16 +1162,13 @@ TEST(BlockTest, insert_erase) { block.insert(0, column_with_name); - EXPECT_NO_THROW(auto item = block.get_by_name("column")); EXPECT_EQ(block.get_position_by_name("column"), 0); { // test const block const auto const_block = block; - EXPECT_ANY_THROW(const_block.get_by_name("column2")); EXPECT_EQ(const_block.get_position_by_name("column2"), -1); - EXPECT_NO_THROW(auto item = const_block.get_by_name("column")); EXPECT_EQ(const_block.get_position_by_name("column"), 0); } diff --git a/be/test/vec/exec/format/parquet/parquet_read_lines.cpp 
b/be/test/vec/exec/format/parquet/parquet_read_lines.cpp index 19416fe84e82e9..08864a86fef400 100644 --- a/be/test/vec/exec/format/parquet/parquet_read_lines.cpp +++ b/be/test/vec/exec/format/parquet/parquet_read_lines.cpp @@ -172,8 +172,8 @@ static void read_parquet_lines(std::vector numeric_types, bool eof = false; size_t read_row = 0; static_cast(p_reader->get_next_block(block.get(), &read_row, &eof)); - auto row_id_string_column = - static_cast(*block->get_by_name("row_id").column.get()); + auto row_id_string_column = static_cast( + *block->get_by_position(block->get_position_by_name("row_id")).column.get()); auto read_lines_tmp = read_lines; for (auto i = 0; i < row_id_string_column.size(); i++) { GlobalRowLoacationV2 info = diff --git a/be/test/vec/exec/orc/orc_read_lines.cpp b/be/test/vec/exec/orc/orc_read_lines.cpp index f81fdc076bc1e7..c73d6604b06073 100644 --- a/be/test/vec/exec/orc/orc_read_lines.cpp +++ b/be/test/vec/exec/orc/orc_read_lines.cpp @@ -157,8 +157,8 @@ static void read_orc_line(int64_t line, std::string block_dump) { bool eof = false; size_t read_row = 0; static_cast(reader->get_next_block(block.get(), &read_row, &eof)); - auto row_id_string_column = - static_cast(*block->get_by_name("row_id").column.get()); + auto row_id_string_column = static_cast( + *block->get_by_position(block->get_position_by_name("row_id")).column.get()); for (auto i = 0; i < row_id_string_column.size(); i++) { GlobalRowLoacationV2 info = *((GlobalRowLoacationV2*)row_id_string_column.get_data_at(i).data);