Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions be/src/olap/rowset/segment_v2/index_file_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ class IndexFileReader {
Result<InvertedIndexDirectoryMap> get_all_directories();
// open file v2, init _stream
// Returns _stream->length(), or 0 when _stream is null (no stream has been set).
int64_t get_inverted_file_size() const {
    if (_stream == nullptr) {
        return 0;
    }
    return _stream->length();
}
// Read-only accessor: hands back a const reference to _index_path_prefix
// (no copy is made).
const std::string& get_index_path_prefix() const {
    return _index_path_prefix;
}
friend IndexFileWriter;

protected:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@

#include <algorithm>
#include <memory>
#include <string>

#include "olap/rowset/segment_v2/inverted_index/query_v2/nullable_scorer.h"
#include "olap/rowset/segment_v2/inverted_index/query_v2/query.h"
#include "olap/rowset/segment_v2/inverted_index/query_v2/scorer.h"
#include "olap/rowset/segment_v2/inverted_index/query_v2/weight.h"
Expand All @@ -34,14 +36,14 @@ using AllScorerPtr = std::shared_ptr<AllScorer>;
using AllWeightPtr = std::shared_ptr<AllWeight>;
using AllQueryPtr = std::shared_ptr<AllQuery>;

/// Scorer that matches all documents [0, max_doc).
/// Mirrors Lucene's MatchAllDocsQuery scorer with ConstantScoreWeight:
/// returns a constant score of 1.0 when scoring is enabled, 0.0 otherwise.
class AllScorer : public Scorer {
public:
explicit AllScorer(uint32_t max_doc) : _max_doc(max_doc) {
if (_max_doc == 0) {
_doc = TERMINATED;
} else {
_doc = 0;
}
AllScorer(uint32_t max_doc, bool enable_scoring)
: _max_doc(max_doc), _score(enable_scoring ? 1.0F : 0.0F) {
_doc = (_max_doc == 0) ? TERMINATED : 0;
}

~AllScorer() override = default;
Expand Down Expand Up @@ -72,41 +74,60 @@ class AllScorer : public Scorer {
return _doc;
}

float score() override { return 1.0F; }
float score() override { return _score; }

uint32_t size_hint() const override { return _max_doc; }

private:
uint32_t _max_doc = 0;
uint32_t _doc = TERMINATED;
float _score;
};

/// Weight for AllQuery. Analogous to Lucene's ConstantScoreWeight used by MatchAllDocsQuery.
class AllWeight : public Weight {
public:
explicit AllWeight(uint32_t max_doc) : _max_doc(max_doc) {}
explicit AllWeight(bool enable_scoring) : _enable_scoring(enable_scoring) {}

AllWeight(std::wstring field, bool nullable, bool enable_scoring)
: _field(std::move(field)), _nullable(nullable), _enable_scoring(enable_scoring) {}

~AllWeight() override = default;

ScorerPtr scorer(const QueryExecutionContext& context) override {
return std::make_shared<AllScorer>(_max_doc);
auto inner = std::make_shared<AllScorer>(context.segment_num_rows, _enable_scoring);
if (_nullable && context.null_resolver != nullptr) {
std::string logical = logical_field_or_fallback(context, "", _field);
return make_nullable_scorer(std::move(inner), logical, context.null_resolver);
}
return inner;
}

private:
uint32_t _max_doc = 0;
std::wstring _field;
bool _nullable = false;
bool _enable_scoring = false;
};

/// Query that matches all documents, analogous to Lucene's MatchAllDocsQuery.
/// Uses constant scoring (score = 1.0) like Lucene's ConstantScoreWeight.
class AllQuery : public Query {
public:
explicit AllQuery(uint32_t max_doc) : _max_doc(max_doc) {}
AllQuery() = default;
AllQuery(std::wstring field, bool nullable) : _field(std::move(field)), _nullable(nullable) {}

~AllQuery() override = default;

WeightPtr weight(bool /*enable_scoring*/) override {
return std::make_shared<AllWeight>(_max_doc);
WeightPtr weight(bool enable_scoring) override {
if (!_field.empty()) {
return std::make_shared<AllWeight>(_field, _nullable, enable_scoring);
}
return std::make_shared<AllWeight>(enable_scoring);
}

private:
uint32_t _max_doc = 0;
std::wstring _field;
bool _nullable = false;
};

} // namespace doris::segment_v2::inverted_index::query_v2
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ std::optional<CombinationMethod> OccurBooleanWeight<ScoreCombinerPtrT>::build_sh
} else if (adjusted_minimum == 1) {
return Required {scorer_union(std::move(should_scorers), combiner)};
} else if (adjusted_minimum == num_of_should_scorers) {
// All SHOULD clauses must match - move them to must_scorers (append, not swap)
for (auto& scorer : should_scorers) {
must_scorers.push_back(std::move(scorer));
}
Expand All @@ -137,7 +138,7 @@ ScorerPtr OccurBooleanWeight<ScoreCombinerPtrT>::effective_must_scorer(
std::vector<ScorerPtr> must_scorers, size_t must_num_all_scorers) {
if (must_scorers.empty()) {
if (must_num_all_scorers > 0) {
return std::make_shared<AllScorer>(_max_doc);
return std::make_shared<AllScorer>(_max_doc, _enable_scoring);
}
return nullptr;
}
Expand All @@ -152,10 +153,10 @@ SpecializedScorer OccurBooleanWeight<ScoreCombinerPtrT>::effective_should_scorer
if (_enable_scoring) {
std::vector<ScorerPtr> scorers;
scorers.push_back(into_box_scorer(std::move(should_scorer), combiner));
scorers.push_back(std::make_shared<AllScorer>(_max_doc));
scorers.push_back(std::make_shared<AllScorer>(_max_doc, _enable_scoring));
return make_buffered_union(std::move(scorers), combiner);
} else {
return std::make_shared<AllScorer>(_max_doc);
return std::make_shared<AllScorer>(_max_doc, _enable_scoring);
}
}
return should_scorer;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,9 @@ class RegexpWeight : public Weight {
std::string _pattern;
bool _enable_scoring = false;
bool _nullable = true;
int32_t _max_expansions = 50;
// Set to 0 to disable limit (ES has no default limit for prefix queries)
// The limit prevents collecting too many terms, but can cause incorrect results
int32_t _max_expansions = 0;
};

} // namespace doris::segment_v2::inverted_index::query_v2
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ Result<InvertedIndexReaderPtr> InvertedIndexIterator::select_for_text(
}
}

// EQUAL/WILDCARD/REGEXP queries prefer STRING_TYPE
// EQUAL queries prefer STRING_TYPE for exact match
if (is_equal_query(query_type)) {
for (const auto* entry : match.candidates) {
if (entry->type == InvertedIndexReaderType::STRING_TYPE) {
Expand Down
4 changes: 4 additions & 0 deletions be/src/olap/rowset/segment_v2/inverted_index_query_type.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ enum class InvertedIndexQueryType {
WILDCARD_QUERY = 12,
RANGE_QUERY = 13,
LIST_QUERY = 14,
SEARCH_DSL_QUERY = 15,
};

inline bool is_equal_query(InvertedIndexQueryType query_type) {
Expand Down Expand Up @@ -154,6 +155,9 @@ inline std::string query_type_to_string(InvertedIndexQueryType query_type) {
case InvertedIndexQueryType::LIST_QUERY: {
return "LIST";
}
case InvertedIndexQueryType::SEARCH_DSL_QUERY: {
return "SEARCH_DSL";
}
default:
return "";
}
Expand Down
62 changes: 46 additions & 16 deletions be/src/olap/rowset/segment_v2/inverted_index_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,12 @@ class InvertedIndexResultBitmap {

// Copy constructor
InvertedIndexResultBitmap(const InvertedIndexResultBitmap& other)
: _data_bitmap(std::make_shared<roaring::Roaring>(*other._data_bitmap)),
_null_bitmap(std::make_shared<roaring::Roaring>(*other._null_bitmap)) {}
: _data_bitmap(other._data_bitmap
? std::make_shared<roaring::Roaring>(*other._data_bitmap)
: nullptr),
_null_bitmap(other._null_bitmap
? std::make_shared<roaring::Roaring>(*other._null_bitmap)
: nullptr) {}

// Move constructor
InvertedIndexResultBitmap(InvertedIndexResultBitmap&& other) noexcept
Expand All @@ -105,8 +109,12 @@ class InvertedIndexResultBitmap {
// Copy assignment operator
InvertedIndexResultBitmap& operator=(const InvertedIndexResultBitmap& other) {
if (this != &other) { // Prevent self-assignment
_data_bitmap = std::make_shared<roaring::Roaring>(*other._data_bitmap);
_null_bitmap = std::make_shared<roaring::Roaring>(*other._null_bitmap);
_data_bitmap = other._data_bitmap
? std::make_shared<roaring::Roaring>(*other._data_bitmap)
: nullptr;
_null_bitmap = other._null_bitmap
? std::make_shared<roaring::Roaring>(*other._null_bitmap)
: nullptr;
}
return *this;
}
Expand All @@ -122,49 +130,65 @@ class InvertedIndexResultBitmap {

// Operator &=
InvertedIndexResultBitmap& operator&=(const InvertedIndexResultBitmap& other) {
if (_data_bitmap && _null_bitmap && other._data_bitmap && other._null_bitmap) {
auto new_null_bitmap = (*_data_bitmap & *other._null_bitmap) |
(*_null_bitmap & *other._data_bitmap) |
(*_null_bitmap & *other._null_bitmap);
if (_data_bitmap && other._data_bitmap) {
const auto& my_null = _null_bitmap ? *_null_bitmap : _empty_bitmap();
const auto& ot_null = other._null_bitmap ? *other._null_bitmap : _empty_bitmap();
auto new_null_bitmap = (*_data_bitmap & ot_null) | (my_null & *other._data_bitmap) |
(my_null & ot_null);
*_data_bitmap &= *other._data_bitmap;
if (!_null_bitmap) {
_null_bitmap = std::make_shared<roaring::Roaring>();
}
*_null_bitmap = std::move(new_null_bitmap);
}
return *this;
}

// Operator |=
InvertedIndexResultBitmap& operator|=(const InvertedIndexResultBitmap& other) {
if (_data_bitmap && _null_bitmap && other._data_bitmap && other._null_bitmap) {
if (_data_bitmap && other._data_bitmap) {
const auto& my_null = _null_bitmap ? *_null_bitmap : _empty_bitmap();
const auto& ot_null = other._null_bitmap ? *other._null_bitmap : _empty_bitmap();
// SQL three-valued logic for OR:
// - TRUE OR anything = TRUE (not NULL)
// - FALSE OR NULL = NULL
// - NULL OR NULL = NULL
// Result is NULL when the row is NULL on either side while the other side
// is not TRUE. Rows that become TRUE must be removed from the NULL bitmap.
*_data_bitmap |= *other._data_bitmap;
auto new_null_bitmap =
(*_null_bitmap - *other._data_bitmap) | (*other._null_bitmap - *_data_bitmap);
auto new_null_bitmap = (my_null - *other._data_bitmap) | (ot_null - *_data_bitmap);
new_null_bitmap -= *_data_bitmap;
if (!_null_bitmap) {
_null_bitmap = std::make_shared<roaring::Roaring>();
}
*_null_bitmap = std::move(new_null_bitmap);
}
return *this;
}

// NOT operation
const InvertedIndexResultBitmap& op_not(const roaring::Roaring* universe) const {
if (_data_bitmap && _null_bitmap) {
*_data_bitmap = *universe - *_data_bitmap - *_null_bitmap;
if (_data_bitmap) {
if (_null_bitmap) {
*_data_bitmap = *universe - *_data_bitmap - *_null_bitmap;
} else {
*_data_bitmap = *universe - *_data_bitmap;
}
// The _null_bitmap remains unchanged.
}
return *this;
}

// Operator -=
InvertedIndexResultBitmap& operator-=(const InvertedIndexResultBitmap& other) {
if (_data_bitmap && _null_bitmap && other._data_bitmap && other._null_bitmap) {
if (_data_bitmap && other._data_bitmap) {
*_data_bitmap -= *other._data_bitmap;
*_data_bitmap -= *other._null_bitmap;
*_null_bitmap -= *other._null_bitmap;
if (other._null_bitmap) {
*_data_bitmap -= *other._null_bitmap;
}
if (_null_bitmap && other._null_bitmap) {
*_null_bitmap -= *other._null_bitmap;
}
}
return *this;
}
Expand All @@ -181,6 +205,12 @@ class InvertedIndexResultBitmap {

// True only when neither bitmap pointer is set (both are null).
// Note: bitmap contents are not inspected; an allocated bitmap that holds
// no bits still makes this return false.
bool is_empty() const { return !_data_bitmap && !_null_bitmap; }

private:
// Returns a reference to one shared, default-constructed bitmap.
// operator&= and operator|= use it as a stand-in when a null bitmap pointer
// is unset, so they can work with references instead of branching on null.
// The object is a function-local static, so the reference stays valid for
// the rest of the program.
static const roaring::Roaring& _empty_bitmap() {
    static const roaring::Roaring kEmpty;
    return kEmpty;
}
};

class InvertedIndexReader : public IndexReader {
Expand Down
14 changes: 11 additions & 3 deletions be/src/olap/tablet_schema.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
#include "olap/inverted_index_parser.h"
#include "olap/olap_common.h"
#include "olap/olap_define.h"
#include "olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h"
#include "olap/tablet_column_object_pool.h"
#include "olap/types.h"
#include "olap/utils.h"
Expand Down Expand Up @@ -943,9 +944,16 @@ void TabletIndex::to_schema_pb(TabletIndexPB* index) const {

DBUG_EXECUTE_IF("tablet_schema.to_schema_pb", { return; })

// lowercase by default
if (!_properties.empty()) {
if (!_properties.contains(INVERTED_INDEX_PARSER_LOWERCASE_KEY)) {
// Only add lower_case=true default for built-in analyzers/parsers, NOT for custom analyzers
// Custom analyzer: lower_case is determined by analyzer's internal token filter
if (!_properties.empty() && !_properties.contains(INVERTED_INDEX_PARSER_LOWERCASE_KEY)) {
bool has_parser = _properties.contains(INVERTED_INDEX_PARSER_KEY) ||
_properties.contains(INVERTED_INDEX_PARSER_KEY_ALIAS);
std::string analyzer_name = get_analyzer_name_from_properties(_properties);
bool is_builtin = analyzer_name.empty() ||
segment_v2::inverted_index::InvertedIndexAnalyzer::is_builtin_analyzer(
analyzer_name);
if (has_parser || is_builtin) {
(*index->mutable_properties())[INVERTED_INDEX_PARSER_LOWERCASE_KEY] =
INVERTED_INDEX_PARSER_TRUE;
}
Expand Down
2 changes: 1 addition & 1 deletion be/src/runtime/exec_env_init.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -655,7 +655,7 @@ Status ExecEnv::init_mem_env() {
_inverted_index_query_cache = InvertedIndexQueryCache::create_global_cache(
inverted_index_query_cache_limit, config::inverted_index_query_cache_shards);
LOG(INFO) << "Inverted index query match cache memory limit: "
<< PrettyPrinter::print(inverted_index_cache_limit, TUnit::BYTES)
<< PrettyPrinter::print(inverted_index_query_cache_limit, TUnit::BYTES)
<< ", origin config value: " << config::inverted_index_query_cache_limit;

// Initialize encoding info resolver
Expand Down
14 changes: 13 additions & 1 deletion be/src/vec/exprs/vsearch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include "common/status.h"
#include "glog/logging.h"
#include "olap/rowset/segment_v2/inverted_index_reader.h"
#include "runtime/runtime_state.h"
#include "vec/columns/column_const.h"
#include "vec/exprs/vexpr_context.h"
#include "vec/exprs/vliteral.h"
Expand Down Expand Up @@ -120,6 +121,16 @@ VSearchExpr::VSearchExpr(const TExprNode& node) : VExpr(node) {
}
}

// Runs the base VExpr::prepare and then picks up the per-query cache switch.
// _enable_cache is overwritten only when the query options explicitly carry
// enable_inverted_index_query_cache; otherwise its existing value is kept.
// Any error from the base prepare is returned unchanged.
Status VSearchExpr::prepare(RuntimeState* state, const RowDescriptor& row_desc,
                            VExprContext* context) {
    RETURN_IF_ERROR(VExpr::prepare(state, row_desc, context));
    if (const auto& opts = state->query_options();
        opts.__isset.enable_inverted_index_query_cache) {
        _enable_cache = opts.enable_inverted_index_query_cache;
    }
    return Status::OK();
}

const std::string& VSearchExpr::expr_name() const {
static const std::string name = "VSearchExpr";
return name;
Expand Down Expand Up @@ -164,7 +175,8 @@ Status VSearchExpr::evaluate_inverted_index(VExprContext* context, uint32_t segm
auto function = std::make_shared<FunctionSearch>();
auto result_bitmap = InvertedIndexResultBitmap();
auto status = function->evaluate_inverted_index_with_search_param(
_search_param, bundle.field_types, bundle.iterators, segment_num_rows, result_bitmap);
_search_param, bundle.field_types, bundle.iterators, segment_num_rows, result_bitmap,
_enable_cache);

if (!status.ok()) {
LOG(WARNING) << "VSearchExpr: Function evaluation failed: " << status.to_string();
Expand Down
5 changes: 5 additions & 0 deletions be/src/vec/exprs/vsearch.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,15 @@ class VSearchExpr : public VExpr {
bool can_push_down_to_index() const override { return true; }

const TSearchParam& get_search_param() const { return _search_param; }
bool enable_cache() const { return _enable_cache; }

Status prepare(RuntimeState* state, const RowDescriptor& row_desc,
VExprContext* context) override;

private:
TSearchParam _search_param;
std::string _original_dsl;
bool _enable_cache = true;
};

} // namespace doris::vectorized
Loading
Loading