diff --git a/be/src/olap/rowset/segment_v2/index_file_reader.h b/be/src/olap/rowset/segment_v2/index_file_reader.h index 42022224485a6e..2b5f2e183a1c3a 100644 --- a/be/src/olap/rowset/segment_v2/index_file_reader.h +++ b/be/src/olap/rowset/segment_v2/index_file_reader.h @@ -71,6 +71,7 @@ class IndexFileReader { Result get_all_directories(); // open file v2, init _stream int64_t get_inverted_file_size() const { return _stream == nullptr ? 0 : _stream->length(); } + const std::string& get_index_path_prefix() const { return _index_path_prefix; } friend IndexFileWriter; protected: diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/all_query/all_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/all_query/all_query.h index cd73860d46fd17..aa17338e2b13f6 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/all_query/all_query.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/all_query/all_query.h @@ -19,7 +19,9 @@ #include #include +#include +#include "olap/rowset/segment_v2/inverted_index/query_v2/nullable_scorer.h" #include "olap/rowset/segment_v2/inverted_index/query_v2/query.h" #include "olap/rowset/segment_v2/inverted_index/query_v2/scorer.h" #include "olap/rowset/segment_v2/inverted_index/query_v2/weight.h" @@ -34,14 +36,14 @@ using AllScorerPtr = std::shared_ptr; using AllWeightPtr = std::shared_ptr; using AllQueryPtr = std::shared_ptr; +/// Scorer that matches all documents [0, max_doc). +/// Mirrors Lucene's MatchAllDocsQuery scorer with ConstantScoreWeight: +/// returns a constant score of 1.0 when scoring is enabled, 0.0 otherwise. class AllScorer : public Scorer { public: - explicit AllScorer(uint32_t max_doc) : _max_doc(max_doc) { - if (_max_doc == 0) { - _doc = TERMINATED; - } else { - _doc = 0; - } + AllScorer(uint32_t max_doc, bool enable_scoring) + : _max_doc(max_doc), _score(enable_scoring ? 1.0F : 0.0F) { + _doc = (_max_doc == 0) ? 
TERMINATED : 0; } ~AllScorer() override = default; @@ -72,41 +74,60 @@ class AllScorer : public Scorer { return _doc; } - float score() override { return 1.0F; } + float score() override { return _score; } uint32_t size_hint() const override { return _max_doc; } private: uint32_t _max_doc = 0; uint32_t _doc = TERMINATED; + float _score; }; +/// Weight for AllQuery. Analogous to Lucene's ConstantScoreWeight used by MatchAllDocsQuery. class AllWeight : public Weight { public: - explicit AllWeight(uint32_t max_doc) : _max_doc(max_doc) {} + explicit AllWeight(bool enable_scoring) : _enable_scoring(enable_scoring) {} + + AllWeight(std::wstring field, bool nullable, bool enable_scoring) + : _field(std::move(field)), _nullable(nullable), _enable_scoring(enable_scoring) {} ~AllWeight() override = default; ScorerPtr scorer(const QueryExecutionContext& context) override { - return std::make_shared(_max_doc); + auto inner = std::make_shared(context.segment_num_rows, _enable_scoring); + if (_nullable && context.null_resolver != nullptr) { + std::string logical = logical_field_or_fallback(context, "", _field); + return make_nullable_scorer(std::move(inner), logical, context.null_resolver); + } + return inner; } private: - uint32_t _max_doc = 0; + std::wstring _field; + bool _nullable = false; + bool _enable_scoring = false; }; +/// Query that matches all documents, analogous to Lucene's MatchAllDocsQuery. +/// Uses constant scoring (score = 1.0) like Lucene's ConstantScoreWeight. 
class AllQuery : public Query { public: - explicit AllQuery(uint32_t max_doc) : _max_doc(max_doc) {} + AllQuery() = default; + AllQuery(std::wstring field, bool nullable) : _field(std::move(field)), _nullable(nullable) {} ~AllQuery() override = default; - WeightPtr weight(bool /*enable_scoring*/) override { - return std::make_shared(_max_doc); + WeightPtr weight(bool enable_scoring) override { + if (!_field.empty()) { + return std::make_shared(_field, _nullable, enable_scoring); + } + return std::make_shared(enable_scoring); } private: - uint32_t _max_doc = 0; + std::wstring _field; + bool _nullable = false; }; } // namespace doris::segment_v2::inverted_index::query_v2 diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/boolean_query/occur_boolean_weight.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/boolean_query/occur_boolean_weight.cpp index e92a32fbe94f33..9b828708798d45 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/boolean_query/occur_boolean_weight.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/boolean_query/occur_boolean_weight.cpp @@ -112,6 +112,7 @@ std::optional OccurBooleanWeight::build_sh } else if (adjusted_minimum == 1) { return Required {scorer_union(std::move(should_scorers), combiner)}; } else if (adjusted_minimum == num_of_should_scorers) { + // All SHOULD clauses must match - move them to must_scorers (append, not swap) for (auto& scorer : should_scorers) { must_scorers.push_back(std::move(scorer)); } @@ -137,7 +138,7 @@ ScorerPtr OccurBooleanWeight::effective_must_scorer( std::vector must_scorers, size_t must_num_all_scorers) { if (must_scorers.empty()) { if (must_num_all_scorers > 0) { - return std::make_shared(_max_doc); + return std::make_shared(_max_doc, _enable_scoring); } return nullptr; } @@ -152,10 +153,10 @@ SpecializedScorer OccurBooleanWeight::effective_should_scorer if (_enable_scoring) { std::vector scorers; scorers.push_back(into_box_scorer(std::move(should_scorer), 
combiner)); - scorers.push_back(std::make_shared(_max_doc)); + scorers.push_back(std::make_shared(_max_doc, _enable_scoring)); return make_buffered_union(std::move(scorers), combiner); } else { - return std::make_shared(_max_doc); + return std::make_shared(_max_doc, _enable_scoring); } } return should_scorer; diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_weight.h b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_weight.h index b58d124ed112ae..f9959ff0d8ce3c 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_weight.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_weight.h @@ -48,7 +48,9 @@ class RegexpWeight : public Weight { std::string _pattern; bool _enable_scoring = false; bool _nullable = true; - int32_t _max_expansions = 50; + // Set to 0 to disable limit (ES has no default limit for prefix queries) + // The limit prevents collecting too many terms, but can cause incorrect results + int32_t _max_expansions = 0; }; } // namespace doris::segment_v2::inverted_index::query_v2 diff --git a/be/src/olap/rowset/segment_v2/inverted_index_iterator.cpp b/be/src/olap/rowset/segment_v2/inverted_index_iterator.cpp index 4df1560183a639..fa0a7488015ec1 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_iterator.cpp @@ -174,7 +174,7 @@ Result InvertedIndexIterator::select_for_text( } } - // EQUAL/WILDCARD/REGEXP queries prefer STRING_TYPE + // EQUAL queries prefer STRING_TYPE for exact match if (is_equal_query(query_type)) { for (const auto* entry : match.candidates) { if (entry->type == InvertedIndexReaderType::STRING_TYPE) { diff --git a/be/src/olap/rowset/segment_v2/inverted_index_query_type.h b/be/src/olap/rowset/segment_v2/inverted_index_query_type.h index c6b250bb5c2563..8fa1a0f9059656 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_query_type.h +++ 
b/be/src/olap/rowset/segment_v2/inverted_index_query_type.h @@ -82,6 +82,7 @@ enum class InvertedIndexQueryType { WILDCARD_QUERY = 12, RANGE_QUERY = 13, LIST_QUERY = 14, + SEARCH_DSL_QUERY = 15, }; inline bool is_equal_query(InvertedIndexQueryType query_type) { @@ -154,6 +155,9 @@ inline std::string query_type_to_string(InvertedIndexQueryType query_type) { case InvertedIndexQueryType::LIST_QUERY: { return "LIST"; } + case InvertedIndexQueryType::SEARCH_DSL_QUERY: { + return "SEARCH_DSL"; + } default: return ""; } diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.h b/be/src/olap/rowset/segment_v2/inverted_index_reader.h index 618cc71a94e1dd..14bd5189a19c01 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.h @@ -94,8 +94,12 @@ class InvertedIndexResultBitmap { // Copy constructor InvertedIndexResultBitmap(const InvertedIndexResultBitmap& other) - : _data_bitmap(std::make_shared(*other._data_bitmap)), - _null_bitmap(std::make_shared(*other._null_bitmap)) {} + : _data_bitmap(other._data_bitmap + ? std::make_shared(*other._data_bitmap) + : nullptr), + _null_bitmap(other._null_bitmap + ? std::make_shared(*other._null_bitmap) + : nullptr) {} // Move constructor InvertedIndexResultBitmap(InvertedIndexResultBitmap&& other) noexcept @@ -105,8 +109,12 @@ class InvertedIndexResultBitmap { // Copy assignment operator InvertedIndexResultBitmap& operator=(const InvertedIndexResultBitmap& other) { if (this != &other) { // Prevent self-assignment - _data_bitmap = std::make_shared(*other._data_bitmap); - _null_bitmap = std::make_shared(*other._null_bitmap); + _data_bitmap = other._data_bitmap + ? std::make_shared(*other._data_bitmap) + : nullptr; + _null_bitmap = other._null_bitmap + ? 
std::make_shared(*other._null_bitmap) + : nullptr; } return *this; } @@ -122,11 +130,15 @@ class InvertedIndexResultBitmap { // Operator &= InvertedIndexResultBitmap& operator&=(const InvertedIndexResultBitmap& other) { - if (_data_bitmap && _null_bitmap && other._data_bitmap && other._null_bitmap) { - auto new_null_bitmap = (*_data_bitmap & *other._null_bitmap) | - (*_null_bitmap & *other._data_bitmap) | - (*_null_bitmap & *other._null_bitmap); + if (_data_bitmap && other._data_bitmap) { + const auto& my_null = _null_bitmap ? *_null_bitmap : _empty_bitmap(); + const auto& ot_null = other._null_bitmap ? *other._null_bitmap : _empty_bitmap(); + auto new_null_bitmap = (*_data_bitmap & ot_null) | (my_null & *other._data_bitmap) | + (my_null & ot_null); *_data_bitmap &= *other._data_bitmap; + if (!_null_bitmap) { + _null_bitmap = std::make_shared(); + } *_null_bitmap = std::move(new_null_bitmap); } return *this; @@ -134,7 +146,9 @@ class InvertedIndexResultBitmap { // Operator |= InvertedIndexResultBitmap& operator|=(const InvertedIndexResultBitmap& other) { - if (_data_bitmap && _null_bitmap && other._data_bitmap && other._null_bitmap) { + if (_data_bitmap && other._data_bitmap) { + const auto& my_null = _null_bitmap ? *_null_bitmap : _empty_bitmap(); + const auto& ot_null = other._null_bitmap ? *other._null_bitmap : _empty_bitmap(); // SQL three-valued logic for OR: // - TRUE OR anything = TRUE (not NULL) // - FALSE OR NULL = NULL @@ -142,9 +156,11 @@ class InvertedIndexResultBitmap { // Result is NULL when the row is NULL on either side while the other side // is not TRUE. Rows that become TRUE must be removed from the NULL bitmap. 
*_data_bitmap |= *other._data_bitmap; - auto new_null_bitmap = - (*_null_bitmap - *other._data_bitmap) | (*other._null_bitmap - *_data_bitmap); + auto new_null_bitmap = (my_null - *other._data_bitmap) | (ot_null - *_data_bitmap); new_null_bitmap -= *_data_bitmap; + if (!_null_bitmap) { + _null_bitmap = std::make_shared(); + } *_null_bitmap = std::move(new_null_bitmap); } return *this; @@ -152,8 +168,12 @@ class InvertedIndexResultBitmap { // NOT operation const InvertedIndexResultBitmap& op_not(const roaring::Roaring* universe) const { - if (_data_bitmap && _null_bitmap) { - *_data_bitmap = *universe - *_data_bitmap - *_null_bitmap; + if (_data_bitmap) { + if (_null_bitmap) { + *_data_bitmap = *universe - *_data_bitmap - *_null_bitmap; + } else { + *_data_bitmap = *universe - *_data_bitmap; + } // The _null_bitmap remains unchanged. } return *this; @@ -161,10 +181,14 @@ class InvertedIndexResultBitmap { // Operator -= InvertedIndexResultBitmap& operator-=(const InvertedIndexResultBitmap& other) { - if (_data_bitmap && _null_bitmap && other._data_bitmap && other._null_bitmap) { + if (_data_bitmap && other._data_bitmap) { *_data_bitmap -= *other._data_bitmap; - *_data_bitmap -= *other._null_bitmap; - *_null_bitmap -= *other._null_bitmap; + if (other._null_bitmap) { + *_data_bitmap -= *other._null_bitmap; + } + if (_null_bitmap && other._null_bitmap) { + *_null_bitmap -= *other._null_bitmap; + } } return *this; } @@ -181,6 +205,12 @@ class InvertedIndexResultBitmap { // Check if both bitmaps are empty bool is_empty() const { return (_data_bitmap == nullptr && _null_bitmap == nullptr); } + +private: + static const roaring::Roaring& _empty_bitmap() { + static const roaring::Roaring empty; + return empty; + } }; class InvertedIndexReader : public IndexReader { diff --git a/be/src/olap/tablet_schema.cpp b/be/src/olap/tablet_schema.cpp index e56c73e34a56cb..9f42cb254eacab 100644 --- a/be/src/olap/tablet_schema.cpp +++ b/be/src/olap/tablet_schema.cpp @@ -39,6 +39,7 @@ 
#include "olap/inverted_index_parser.h" #include "olap/olap_common.h" #include "olap/olap_define.h" +#include "olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h" #include "olap/tablet_column_object_pool.h" #include "olap/types.h" #include "olap/utils.h" @@ -943,9 +944,16 @@ void TabletIndex::to_schema_pb(TabletIndexPB* index) const { DBUG_EXECUTE_IF("tablet_schema.to_schema_pb", { return; }) - // lowercase by default - if (!_properties.empty()) { - if (!_properties.contains(INVERTED_INDEX_PARSER_LOWERCASE_KEY)) { + // Only add lower_case=true default for built-in analyzers/parsers, NOT for custom analyzers + // Custom analyzer: lower_case is determined by analyzer's internal token filter + if (!_properties.empty() && !_properties.contains(INVERTED_INDEX_PARSER_LOWERCASE_KEY)) { + bool has_parser = _properties.contains(INVERTED_INDEX_PARSER_KEY) || + _properties.contains(INVERTED_INDEX_PARSER_KEY_ALIAS); + std::string analyzer_name = get_analyzer_name_from_properties(_properties); + bool is_builtin = analyzer_name.empty() || + segment_v2::inverted_index::InvertedIndexAnalyzer::is_builtin_analyzer( + analyzer_name); + if (has_parser || is_builtin) { (*index->mutable_properties())[INVERTED_INDEX_PARSER_LOWERCASE_KEY] = INVERTED_INDEX_PARSER_TRUE; } diff --git a/be/src/runtime/exec_env_init.cpp b/be/src/runtime/exec_env_init.cpp index 14f8aac3153d1d..9a2fd612e866b8 100644 --- a/be/src/runtime/exec_env_init.cpp +++ b/be/src/runtime/exec_env_init.cpp @@ -655,7 +655,7 @@ Status ExecEnv::init_mem_env() { _inverted_index_query_cache = InvertedIndexQueryCache::create_global_cache( inverted_index_query_cache_limit, config::inverted_index_query_cache_shards); LOG(INFO) << "Inverted index query match cache memory limit: " - << PrettyPrinter::print(inverted_index_cache_limit, TUnit::BYTES) + << PrettyPrinter::print(inverted_index_query_cache_limit, TUnit::BYTES) << ", origin config value: " << config::inverted_index_query_cache_limit; // Initialize encoding info resolver 
diff --git a/be/src/vec/exprs/vsearch.cpp b/be/src/vec/exprs/vsearch.cpp index f2d9894f2a563e..5043990172e9cb 100644 --- a/be/src/vec/exprs/vsearch.cpp +++ b/be/src/vec/exprs/vsearch.cpp @@ -24,6 +24,7 @@ #include "common/status.h" #include "glog/logging.h" #include "olap/rowset/segment_v2/inverted_index_reader.h" +#include "runtime/runtime_state.h" #include "vec/columns/column_const.h" #include "vec/exprs/vexpr_context.h" #include "vec/exprs/vliteral.h" @@ -120,6 +121,16 @@ VSearchExpr::VSearchExpr(const TExprNode& node) : VExpr(node) { } } +Status VSearchExpr::prepare(RuntimeState* state, const RowDescriptor& row_desc, + VExprContext* context) { + RETURN_IF_ERROR(VExpr::prepare(state, row_desc, context)); + const auto& query_options = state->query_options(); + if (query_options.__isset.enable_inverted_index_query_cache) { + _enable_cache = query_options.enable_inverted_index_query_cache; + } + return Status::OK(); +} + const std::string& VSearchExpr::expr_name() const { static const std::string name = "VSearchExpr"; return name; @@ -164,7 +175,8 @@ Status VSearchExpr::evaluate_inverted_index(VExprContext* context, uint32_t segm auto function = std::make_shared(); auto result_bitmap = InvertedIndexResultBitmap(); auto status = function->evaluate_inverted_index_with_search_param( - _search_param, bundle.field_types, bundle.iterators, segment_num_rows, result_bitmap); + _search_param, bundle.field_types, bundle.iterators, segment_num_rows, result_bitmap, + _enable_cache); if (!status.ok()) { LOG(WARNING) << "VSearchExpr: Function evaluation failed: " << status.to_string(); diff --git a/be/src/vec/exprs/vsearch.h b/be/src/vec/exprs/vsearch.h index 9f818e50ee472f..a7cfaa84ef074e 100644 --- a/be/src/vec/exprs/vsearch.h +++ b/be/src/vec/exprs/vsearch.h @@ -41,10 +41,15 @@ class VSearchExpr : public VExpr { bool can_push_down_to_index() const override { return true; } const TSearchParam& get_search_param() const { return _search_param; } + bool enable_cache() const { 
return _enable_cache; } + + Status prepare(RuntimeState* state, const RowDescriptor& row_desc, + VExprContext* context) override; private: TSearchParam _search_param; std::string _original_dsl; + bool _enable_cache = true; }; } // namespace doris::vectorized diff --git a/be/src/vec/functions/function_search.cpp b/be/src/vec/functions/function_search.cpp index 6f737146915dbd..79c697ef558871 100644 --- a/be/src/vec/functions/function_search.cpp +++ b/be/src/vec/functions/function_search.cpp @@ -37,6 +37,7 @@ #include "olap/rowset/segment_v2/index_query_context.h" #include "olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h" #include "olap/rowset/segment_v2/inverted_index/query/query_helper.h" +#include "olap/rowset/segment_v2/inverted_index/query_v2/all_query/all_query.h" #include "olap/rowset/segment_v2/inverted_index/query_v2/bit_set_query/bit_set_query.h" #include "olap/rowset/segment_v2/inverted_index/query_v2/boolean_query/boolean_query_builder.h" #include "olap/rowset/segment_v2/inverted_index/query_v2/boolean_query/operator.h" @@ -48,6 +49,9 @@ #include "olap/rowset/segment_v2/inverted_index/util/string_helper.h" #include "olap/rowset/segment_v2/inverted_index_iterator.h" #include "olap/rowset/segment_v2/inverted_index_reader.h" +#include "olap/rowset/segment_v2/inverted_index_searcher.h" +#include "util/string_util.h" +#include "util/thrift_util.h" #include "vec/columns/column_const.h" #include "vec/core/columns_with_type_and_name.h" #include "vec/data_types/data_type_string.h" @@ -55,6 +59,48 @@ namespace doris::vectorized { +// Build canonical DSL signature for cache key. +// Serializes the entire TSearchParam via Thrift binary protocol so that +// every field (DSL, AST root, field bindings, default_operator, +// minimum_should_match, etc.) is included automatically. 
+static std::string build_dsl_signature(const TSearchParam& param) { + ThriftSerializer ser(false, 1024); + TSearchParam copy = param; + std::string sig; + auto st = ser.serialize(&copy, &sig); + if (UNLIKELY(!st.ok())) { + LOG(WARNING) << "build_dsl_signature: Thrift serialization failed: " << st.to_string() + << ", caching disabled for this query"; + return ""; + } + return sig; +} + +// Extract segment path prefix from the first available inverted index iterator. +// All fields in the same segment share the same path prefix. +static std::string extract_segment_prefix( + const std::unordered_map& iterators) { + for (const auto& [field_name, iter] : iterators) { + auto* inv_iter = dynamic_cast(iter); + if (!inv_iter) continue; + // Try fulltext reader first, then string type + for (auto type : + {InvertedIndexReaderType::FULLTEXT, InvertedIndexReaderType::STRING_TYPE}) { + IndexReaderType reader_type = type; + auto reader = inv_iter->get_reader(reader_type); + if (!reader) continue; + auto inv_reader = std::dynamic_pointer_cast(reader); + if (!inv_reader) continue; + auto file_reader = inv_reader->get_index_file_reader(); + if (!file_reader) continue; + return file_reader->get_index_path_prefix(); + } + } + VLOG_DEBUG << "extract_segment_prefix: no suitable inverted index reader found across " + << iterators.size() << " iterators, caching disabled for this query"; + return ""; +} + Status FieldReaderResolver::resolve(const std::string& field_name, InvertedIndexQueryType query_type, FieldReaderBinding* binding) { @@ -106,10 +152,29 @@ Status FieldReaderResolver::resolve(const std::string& field_name, "iterator for field '{}' is not InvertedIndexIterator", field_name); } + // For variant subcolumns, FE resolves the field pattern to a specific index and sends + // its index_properties via TSearchFieldBinding. 
When FE picks an analyzer-based index, + // upgrade certain query types to MATCH_ANY_QUERY so select_best_reader picks the FULLTEXT + // reader instead of STRING_TYPE. Without this upgrade: + // - TERM (EQUAL_QUERY) clauses would open the wrong (untokenized) index directory + // - WILDCARD clauses would enumerate terms from the wrong index, returning empty results + InvertedIndexQueryType effective_query_type = query_type; + auto fb_it = _field_binding_map.find(field_name); + if (is_variant_sub && fb_it != _field_binding_map.end() && + fb_it->second->__isset.index_properties && !fb_it->second->index_properties.empty()) { + if (inverted_index::InvertedIndexAnalyzer::should_analyzer( + fb_it->second->index_properties) && + (effective_query_type == InvertedIndexQueryType::EQUAL_QUERY || + effective_query_type == InvertedIndexQueryType::WILDCARD_QUERY)) { + effective_query_type = InvertedIndexQueryType::MATCH_ANY_QUERY; + } + } + Result reader_result; const auto& column_type = data_it->second.second; if (column_type) { - reader_result = inverted_iterator->select_best_reader(column_type, query_type, ""); + reader_result = + inverted_iterator->select_best_reader(column_type, effective_query_type, ""); } else { reader_result = inverted_iterator->select_best_reader(""); } @@ -130,43 +195,75 @@ Status FieldReaderResolver::resolve(const std::string& field_name, "index file reader is null for field '{}'", field_name); } - RETURN_IF_ERROR( - index_file_reader->init(config::inverted_index_read_buffer_size, _context->io_ctx)); + // Use InvertedIndexSearcherCache to avoid re-opening index files repeatedly + auto index_file_key = + index_file_reader->get_index_file_cache_key(&inverted_reader->get_index_meta()); + InvertedIndexSearcherCache::CacheKey searcher_cache_key(index_file_key); + InvertedIndexCacheHandle searcher_cache_handle; + bool cache_hit = InvertedIndexSearcherCache::instance()->lookup(searcher_cache_key, + &searcher_cache_handle); + + std::shared_ptr reader_holder; + 
if (cache_hit) { + auto searcher_variant = searcher_cache_handle.get_index_searcher(); + auto* searcher_ptr = std::get_if(&searcher_variant); + if (searcher_ptr != nullptr && *searcher_ptr != nullptr) { + reader_holder = std::shared_ptr( + (*searcher_ptr)->getReader(), + [](lucene::index::IndexReader*) { /* lifetime managed by searcher cache */ }); + } + } - auto directory = DORIS_TRY( - index_file_reader->open(&inverted_reader->get_index_meta(), _context->io_ctx)); + if (!reader_holder) { + // Cache miss: open directory, build IndexSearcher, insert into cache + RETURN_IF_ERROR( + index_file_reader->init(config::inverted_index_read_buffer_size, _context->io_ctx)); + auto directory = DORIS_TRY( + index_file_reader->open(&inverted_reader->get_index_meta(), _context->io_ctx)); + + auto index_searcher_builder = DORIS_TRY( + IndexSearcherBuilder::create_index_searcher_builder(inverted_reader->type())); + auto searcher_result = + DORIS_TRY(index_searcher_builder->get_index_searcher(directory.get())); + auto reader_size = index_searcher_builder->get_reader_size(); + + auto* cache_value = new InvertedIndexSearcherCache::CacheValue(std::move(searcher_result), + reader_size, UnixMillis()); + InvertedIndexSearcherCache::instance()->insert(searcher_cache_key, cache_value, + &searcher_cache_handle); + + auto new_variant = searcher_cache_handle.get_index_searcher(); + auto* new_ptr = std::get_if(&new_variant); + if (new_ptr != nullptr && *new_ptr != nullptr) { + reader_holder = std::shared_ptr( + (*new_ptr)->getReader(), + [](lucene::index::IndexReader*) { /* lifetime managed by searcher cache */ }); + } - lucene::index::IndexReader* raw_reader = nullptr; - try { - raw_reader = lucene::index::IndexReader::open( - directory.get(), config::inverted_index_read_buffer_size, false); - } catch (const CLuceneError& e) { - return Status::Error( - "failed to open IndexReader for field '{}': {}", field_name, e.what()); + if (!reader_holder) { + return Status::Error( + "failed to build 
IndexSearcher for field '{}'", field_name); + } } - if (raw_reader == nullptr) { - return Status::Error( - "IndexReader is null for field '{}'", field_name); - } - - auto reader_holder = std::shared_ptr( - raw_reader, [](lucene::index::IndexReader* reader) { - if (reader != nullptr) { - reader->close(); - _CLDELETE(reader); - } - }); + _searcher_cache_handles.push_back(std::move(searcher_cache_handle)); FieldReaderBinding resolved; resolved.logical_field_name = field_name; resolved.stored_field_name = stored_field_name; resolved.stored_field_wstr = StringHelper::to_wstring(resolved.stored_field_name); resolved.column_type = column_type; - resolved.query_type = query_type; + resolved.query_type = effective_query_type; resolved.inverted_reader = inverted_reader; resolved.lucene_reader = reader_holder; - resolved.index_properties = inverted_reader->get_index_properties(); + // Prefer FE-provided index_properties (needed for variant subcolumn field_pattern matching) + // Reuse fb_it from earlier lookup above. 
+ if (fb_it != _field_binding_map.end() && fb_it->second->__isset.index_properties && + !fb_it->second->index_properties.empty()) { + resolved.index_properties = fb_it->second->index_properties; + } else { + resolved.index_properties = inverted_reader->get_index_properties(); + } resolved.binding_key = binding_key; resolved.analyzer_key = normalize_analyzer_key(build_analyzer_key_from_properties(resolved.index_properties)); @@ -200,7 +297,7 @@ Status FunctionSearch::evaluate_inverted_index_with_search_param( const std::unordered_map& data_type_with_names, std::unordered_map iterators, uint32_t num_rows, - InvertedIndexResultBitmap& bitmap_result) const { + InvertedIndexResultBitmap& bitmap_result, bool enable_cache) const { if (iterators.empty() || data_type_with_names.empty()) { LOG(INFO) << "No indexed columns or iterators available, returning empty result, dsl:" << search_param.original_dsl; @@ -209,6 +306,45 @@ Status FunctionSearch::evaluate_inverted_index_with_search_param( return Status::OK(); } + // DSL result cache: reuse InvertedIndexQueryCache with SEARCH_DSL_QUERY type + auto* dsl_cache = enable_cache ? 
InvertedIndexQueryCache::instance() : nullptr; + std::string seg_prefix; + std::string dsl_sig; + InvertedIndexQueryCache::CacheKey dsl_cache_key; + bool cache_usable = false; + if (dsl_cache) { + seg_prefix = extract_segment_prefix(iterators); + dsl_sig = build_dsl_signature(search_param); + if (!seg_prefix.empty() && !dsl_sig.empty()) { + dsl_cache_key = InvertedIndexQueryCache::CacheKey { + seg_prefix, "__search_dsl__", InvertedIndexQueryType::SEARCH_DSL_QUERY, + dsl_sig}; + cache_usable = true; + InvertedIndexQueryCacheHandle dsl_cache_handle; + if (dsl_cache->lookup(dsl_cache_key, &dsl_cache_handle)) { + auto cached_bitmap = dsl_cache_handle.get_bitmap(); + if (cached_bitmap) { + // Also retrieve cached null bitmap for three-valued SQL logic + // (needed by compound operators NOT, OR, AND in VCompoundPred) + auto null_cache_key = InvertedIndexQueryCache::CacheKey { + seg_prefix, "__search_dsl__", InvertedIndexQueryType::SEARCH_DSL_QUERY, + dsl_sig + "__null"}; + InvertedIndexQueryCacheHandle null_cache_handle; + std::shared_ptr null_bitmap; + if (dsl_cache->lookup(null_cache_key, &null_cache_handle)) { + null_bitmap = null_cache_handle.get_bitmap(); + } + if (!null_bitmap) { + null_bitmap = std::make_shared(); + } + bitmap_result = + InvertedIndexResultBitmap(cached_bitmap, std::move(null_bitmap)); + return Status::OK(); + } + } + } + } + auto context = std::make_shared(); context->collection_statistics = std::make_shared(); context->collection_similarity = std::make_shared(); @@ -217,10 +353,22 @@ Status FunctionSearch::evaluate_inverted_index_with_search_param( FieldReaderResolver resolver(data_type_with_names, iterators, context, search_param.field_bindings); + // Extract default_operator from TSearchParam (default: "or") + std::string default_operator = "or"; + if (search_param.__isset.default_operator && !search_param.default_operator.empty()) { + default_operator = search_param.default_operator; + } + // Extract minimum_should_match from TSearchParam (-1 
means not set) + int32_t minimum_should_match = -1; + if (search_param.__isset.minimum_should_match) { + minimum_should_match = search_param.minimum_should_match; + } + query_v2::QueryPtr root_query; std::string root_binding_key; RETURN_IF_ERROR(build_query_recursive(search_param.root, context, resolver, &root_query, - &root_binding_key)); + &root_binding_key, default_operator, + minimum_should_match)); if (root_query == nullptr) { LOG(INFO) << "search: Query tree resolved to empty query, dsl:" << search_param.original_dsl; @@ -314,6 +462,21 @@ Status FunctionSearch::evaluate_inverted_index_with_search_param( VLOG_TRACE << "search: After mask - result_bitmap=" << bitmap_result.get_data_bitmap()->cardinality(); + // Insert post-mask_out_null result into DSL cache for future reuse + // Cache both data bitmap and null bitmap so compound operators (NOT, OR, AND) + // can apply correct three-valued SQL logic on cache hit + if (dsl_cache && cache_usable) { + InvertedIndexQueryCacheHandle insert_handle; + dsl_cache->insert(dsl_cache_key, bitmap_result.get_data_bitmap(), &insert_handle); + if (bitmap_result.get_null_bitmap()) { + auto null_cache_key = InvertedIndexQueryCache::CacheKey { + seg_prefix, "__search_dsl__", InvertedIndexQueryType::SEARCH_DSL_QUERY, + dsl_sig + "__null"}; + InvertedIndexQueryCacheHandle null_insert_handle; + dsl_cache->insert(null_cache_key, bitmap_result.get_null_bitmap(), &null_insert_handle); + } + } + return Status::OK(); } @@ -429,7 +592,9 @@ Status FunctionSearch::build_query_recursive(const TSearchClause& clause, const std::shared_ptr& context, FieldReaderResolver& resolver, inverted_index::query_v2::QueryPtr* out, - std::string* binding_key) const { + std::string* binding_key, + const std::string& default_operator, + int32_t minimum_should_match) const { DCHECK(out != nullptr); *out = nullptr; if (binding_key) { @@ -438,6 +603,12 @@ Status FunctionSearch::build_query_recursive(const TSearchClause& clause, const std::string& clause_type = 
clause.clause_type; + // Handle MATCH_ALL_DOCS - matches all documents in the segment + if (clause_type == "MATCH_ALL_DOCS") { + *out = std::make_shared(); + return Status::OK(); + } + // Handle OCCUR_BOOLEAN - Lucene-style boolean query with MUST/SHOULD/MUST_NOT if (clause_type == "OCCUR_BOOLEAN") { auto builder = segment_v2::inverted_index::query_v2::create_occur_boolean_query_builder(); @@ -452,7 +623,8 @@ Status FunctionSearch::build_query_recursive(const TSearchClause& clause, query_v2::QueryPtr child_query; std::string child_binding_key; RETURN_IF_ERROR(build_query_recursive(child_clause, context, resolver, &child_query, - &child_binding_key)); + &child_binding_key, default_operator, + minimum_should_match)); // Determine occur type from child clause query_v2::Occur occur = query_v2::Occur::MUST; // default @@ -483,7 +655,8 @@ Status FunctionSearch::build_query_recursive(const TSearchClause& clause, query_v2::QueryPtr child_query; std::string child_binding_key; RETURN_IF_ERROR(build_query_recursive(child_clause, context, resolver, &child_query, - &child_binding_key)); + &child_binding_key, default_operator, + minimum_should_match)); // Add all children including empty BitSetQuery // BooleanQuery will handle the logic: // - AND with empty bitmap → result is empty @@ -497,14 +670,17 @@ Status FunctionSearch::build_query_recursive(const TSearchClause& clause, return Status::OK(); } - return build_leaf_query(clause, context, resolver, out, binding_key); + return build_leaf_query(clause, context, resolver, out, binding_key, default_operator, + minimum_should_match); } Status FunctionSearch::build_leaf_query(const TSearchClause& clause, const std::shared_ptr& context, FieldReaderResolver& resolver, inverted_index::query_v2::QueryPtr* out, - std::string* binding_key) const { + std::string* binding_key, + const std::string& default_operator, + int32_t minimum_should_match) const { DCHECK(out != nullptr); *out = nullptr; if (binding_key) { @@ -576,7 +752,27 @@ Status 
FunctionSearch::build_leaf_query(const TSearchClause& clause, return Status::OK(); } - auto builder = create_operator_boolean_query_builder(query_v2::OperatorType::OP_OR); + // When minimum_should_match is specified, use OccurBooleanQuery + // ES behavior: msm only applies to SHOULD clauses + if (minimum_should_match > 0) { + auto builder = + segment_v2::inverted_index::query_v2::create_occur_boolean_query_builder(); + builder->set_minimum_number_should_match(minimum_should_match); + query_v2::Occur occur = (default_operator == "and") ? query_v2::Occur::MUST + : query_v2::Occur::SHOULD; + for (const auto& term_info : term_infos) { + std::wstring term_wstr = StringHelper::to_wstring(term_info.get_single_term()); + builder->add(make_term_query(term_wstr), occur); + } + *out = builder->build(); + return Status::OK(); + } + + // Use default_operator to determine how to combine tokenized terms + query_v2::OperatorType op_type = (default_operator == "and") + ? query_v2::OperatorType::OP_AND + : query_v2::OperatorType::OP_OR; + auto builder = create_operator_boolean_query_builder(op_type); for (const auto& term_info : term_infos) { std::wstring term_wstr = StringHelper::to_wstring(term_info.get_single_term()); builder->add(make_term_query(term_wstr), binding.binding_key); @@ -716,20 +912,50 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, return Status::OK(); } if (clause_type == "PREFIX") { - *out = std::make_shared(context, field_wstr, value); + // Apply lowercase only if: + // 1. There's a parser/analyzer (otherwise lower_case has no effect on indexing) + // 2. lower_case is explicitly set to "true" + bool has_parser = inverted_index::InvertedIndexAnalyzer::should_analyzer( + binding.index_properties); + std::string lowercase_setting = + get_parser_lowercase_from_properties(binding.index_properties); + bool should_lowercase = has_parser && (lowercase_setting == INVERTED_INDEX_PARSER_TRUE); + std::string pattern = should_lowercase ? 
to_lower(value) : value; + *out = std::make_shared(context, field_wstr, pattern); VLOG_DEBUG << "search: PREFIX clause processed, field=" << field_name << ", pattern='" - << value << "'"; + << pattern << "' (original='" << value << "', has_parser=" << has_parser + << ", lower_case=" << lowercase_setting << ")"; return Status::OK(); } if (clause_type == "WILDCARD") { - *out = std::make_shared(context, field_wstr, value); + // Standalone wildcard "*" matches all non-null values for this field + // Consistent with ES query_string behavior where field:* becomes FieldExistsQuery + if (value == "*") { + *out = std::make_shared(field_wstr, true); + VLOG_DEBUG << "search: WILDCARD '*' converted to AllQuery(nullable=true), field=" + << field_name; + return Status::OK(); + } + // Apply lowercase only if: + // 1. There's a parser/analyzer (otherwise lower_case has no effect on indexing) + // 2. lower_case is explicitly set to "true" + bool has_parser = inverted_index::InvertedIndexAnalyzer::should_analyzer( + binding.index_properties); + std::string lowercase_setting = + get_parser_lowercase_from_properties(binding.index_properties); + bool should_lowercase = has_parser && (lowercase_setting == INVERTED_INDEX_PARSER_TRUE); + std::string pattern = should_lowercase ? 
to_lower(value) : value; + *out = std::make_shared(context, field_wstr, pattern); VLOG_DEBUG << "search: WILDCARD clause processed, field=" << field_name << ", pattern='" - << value << "'"; + << pattern << "' (original='" << value << "', has_parser=" << has_parser + << ", lower_case=" << lowercase_setting << ")"; return Status::OK(); } if (clause_type == "REGEXP") { + // ES-compatible: regex patterns are NOT lowercased (case-sensitive matching) + // This matches ES query_string behavior where regex patterns bypass analysis *out = std::make_shared(context, field_wstr, value); VLOG_DEBUG << "search: REGEXP clause processed, field=" << field_name << ", pattern='" << value << "'"; @@ -749,37 +975,6 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, return Status::OK(); } -Status FunctionSearch::collect_all_field_nulls( - const TSearchClause& clause, - const std::unordered_map& iterators, - std::shared_ptr& null_bitmap) const { - // Recursively collect NULL bitmaps from all fields referenced in the query - if (clause.__isset.field_name) { - const std::string& field_name = clause.field_name; - auto it = iterators.find(field_name); - if (it != iterators.end() && it->second) { - auto has_null_result = it->second->has_null(); - if (has_null_result.has_value() && has_null_result.value()) { - segment_v2::InvertedIndexQueryCacheHandle null_bitmap_cache_handle; - RETURN_IF_ERROR(it->second->read_null_bitmap(&null_bitmap_cache_handle)); - auto field_null_bitmap = null_bitmap_cache_handle.get_bitmap(); - if (field_null_bitmap) { - *null_bitmap |= *field_null_bitmap; - } - } - } - } - - // Recurse into child clauses - if (clause.__isset.children) { - for (const auto& child_clause : clause.children) { - RETURN_IF_ERROR(collect_all_field_nulls(child_clause, iterators, null_bitmap)); - } - } - - return Status::OK(); -} - void register_function_search(SimpleFunctionFactory& factory) { factory.register_function(); } diff --git 
a/be/src/vec/functions/function_search.h b/be/src/vec/functions/function_search.h index 944f07dd1b65a4..d86f23605b24a6 100644 --- a/be/src/vec/functions/function_search.h +++ b/be/src/vec/functions/function_search.h @@ -28,6 +28,7 @@ #include "gen_cpp/Exprs_types.h" #include "olap/rowset/segment_v2/index_query_context.h" #include "olap/rowset/segment_v2/inverted_index/query_v2/boolean_query/operator_boolean_query.h" +#include "olap/rowset/segment_v2/inverted_index_cache.h" #include "vec/core/block.h" #include "vec/core/types.h" #include "vec/data_types/data_type.h" @@ -64,11 +65,12 @@ class FieldReaderResolver { _iterators(iterators), _context(std::move(context)), _field_bindings(field_bindings) { - // Build a lookup map for quick variant subcolumn checks + // Build lookup maps for quick access for (const auto& binding : _field_bindings) { if (binding.__isset.is_variant_subcolumn && binding.is_variant_subcolumn) { _variant_subcolumn_fields.insert(binding.field_name); } + _field_binding_map[binding.field_name] = &binding; } } @@ -114,11 +116,15 @@ class FieldReaderResolver { const std::unordered_map& _iterators; std::shared_ptr _context; std::vector _field_bindings; + std::unordered_map _field_binding_map; std::unordered_set _variant_subcolumn_fields; std::unordered_map _cache; std::vector> _readers; std::unordered_map> _binding_readers; std::unordered_map> _field_readers; + // Keep searcher cache handles alive for the resolver's lifetime. + // This pins cached IndexSearcher entries so extracted IndexReaders remain valid. 
+ std::vector _searcher_cache_handles; }; class FunctionSearch : public IFunction { @@ -161,7 +167,7 @@ class FunctionSearch : public IFunction { const std::unordered_map& data_type_with_names, std::unordered_map iterators, uint32_t num_rows, - InvertedIndexResultBitmap& bitmap_result) const; + InvertedIndexResultBitmap& bitmap_result, bool enable_cache = true) const; // Public methods for testing enum class ClauseTypeCategory { @@ -182,17 +188,15 @@ class FunctionSearch : public IFunction { Status build_query_recursive(const TSearchClause& clause, const std::shared_ptr& context, FieldReaderResolver& resolver, - inverted_index::query_v2::QueryPtr* out, - std::string* binding_key) const; + inverted_index::query_v2::QueryPtr* out, std::string* binding_key, + const std::string& default_operator, + int32_t minimum_should_match) const; Status build_leaf_query(const TSearchClause& clause, const std::shared_ptr& context, FieldReaderResolver& resolver, inverted_index::query_v2::QueryPtr* out, - std::string* binding_key) const; - - Status collect_all_field_nulls(const TSearchClause& clause, - const std::unordered_map& iterators, - std::shared_ptr& null_bitmap) const; + std::string* binding_key, const std::string& default_operator, + int32_t minimum_should_match) const; }; } // namespace doris::vectorized diff --git a/be/test/olap/rowset/segment_v2/inverted_index/query_v2/occur_boolean_query_real_index_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/query_v2/occur_boolean_query_real_index_test.cpp index 4fc01f43e1d9b4..0088b88dd4d97d 100644 --- a/be/test/olap/rowset/segment_v2/inverted_index/query_v2/occur_boolean_query_real_index_test.cpp +++ b/be/test/olap/rowset/segment_v2/inverted_index/query_v2/occur_boolean_query_real_index_test.cpp @@ -136,8 +136,7 @@ TEST_F(OccurBooleanQueryRealIndexTest, NotPhraseQuery) { auto phrase_query = std::make_shared(context, field, term_infos); - uint32_t max_doc = reader_holder->maxDoc(); - auto all_query = 
std::make_shared(max_doc); + auto all_query = std::make_shared(); std::vector> clauses; clauses.emplace_back(Occur::SHOULD, all_query); @@ -255,8 +254,7 @@ TEST_F(OccurBooleanQueryRealIndexTest, NotPhraseQueryNonExistent) { auto phrase_query = std::make_shared(context, field, term_infos); - uint32_t max_doc = reader_holder->maxDoc(); - auto all_query = std::make_shared(max_doc); + auto all_query = std::make_shared(); std::vector> clauses; clauses.emplace_back(Occur::SHOULD, all_query); @@ -309,8 +307,7 @@ TEST_F(OccurBooleanQueryRealIndexTest, NotPhraseQueryExcludesPartial) { auto phrase_query = std::make_shared(context, field, term_infos); - uint32_t max_doc = reader_holder->maxDoc(); - auto all_query = std::make_shared(max_doc); + auto all_query = std::make_shared(); std::vector> clauses; clauses.emplace_back(Occur::SHOULD, all_query); diff --git a/be/test/olap/rowset/segment_v2/inverted_index/query_v2/occur_boolean_query_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/query_v2/occur_boolean_query_test.cpp index 244ddfb8dcc0c0..7d885ecce19117 100644 --- a/be/test/olap/rowset/segment_v2/inverted_index/query_v2/occur_boolean_query_test.cpp +++ b/be/test/olap/rowset/segment_v2/inverted_index/query_v2/occur_boolean_query_test.cpp @@ -874,7 +874,7 @@ TEST_F(OccurBooleanQueryTest, AllQueryWithMustClause) { std::vector> clauses; clauses.emplace_back(Occur::MUST, std::make_shared(must_docs)); - clauses.emplace_back(Occur::MUST, std::make_shared(100)); + clauses.emplace_back(Occur::MUST, std::make_shared()); OccurBooleanQuery query(std::move(clauses)); auto weight = query.weight(false); @@ -891,7 +891,7 @@ TEST_F(OccurBooleanQueryTest, AllQueryWithShouldClause) { std::vector> clauses; clauses.emplace_back(Occur::SHOULD, std::make_shared(should_docs)); - clauses.emplace_back(Occur::SHOULD, std::make_shared(50)); + clauses.emplace_back(Occur::SHOULD, std::make_shared()); OccurBooleanQuery query(std::move(clauses)); auto weight = query.weight(false); @@ -909,7 
+909,7 @@ TEST_F(OccurBooleanQueryTest, AllQueryWithMustNotClause) { auto must_not_docs = std::vector {10, 20, 30, 40, 50}; std::vector> clauses; - clauses.emplace_back(Occur::MUST, std::make_shared(100)); + clauses.emplace_back(Occur::MUST, std::make_shared()); clauses.emplace_back(Occur::MUST_NOT, std::make_shared(must_not_docs)); OccurBooleanQuery query(std::move(clauses)); @@ -930,8 +930,8 @@ TEST_F(OccurBooleanQueryTest, MultipleAllQueriesWithMust) { std::vector> clauses; clauses.emplace_back(Occur::MUST, std::make_shared(must_docs)); - clauses.emplace_back(Occur::MUST, std::make_shared(100)); - clauses.emplace_back(Occur::MUST, std::make_shared(100)); + clauses.emplace_back(Occur::MUST, std::make_shared()); + clauses.emplace_back(Occur::MUST, std::make_shared()); OccurBooleanQuery query(std::move(clauses)); auto weight = query.weight(false); @@ -945,7 +945,7 @@ TEST_F(OccurBooleanQueryTest, AllQueryOnlyMust) { _ctx.segment_num_rows = 50; std::vector> clauses; - clauses.emplace_back(Occur::MUST, std::make_shared(50)); + clauses.emplace_back(Occur::MUST, std::make_shared()); OccurBooleanQuery query(std::move(clauses)); auto weight = query.weight(false); @@ -967,7 +967,7 @@ TEST_F(OccurBooleanQueryTest, AllQueryWithMustAndShouldMinMatch) { std::vector> clauses; clauses.emplace_back(Occur::MUST, std::make_shared(must_docs)); - clauses.emplace_back(Occur::MUST, std::make_shared(100)); + clauses.emplace_back(Occur::MUST, std::make_shared()); clauses.emplace_back(Occur::SHOULD, std::make_shared(should1_docs)); clauses.emplace_back(Occur::SHOULD, std::make_shared(should2_docs)); @@ -1014,7 +1014,7 @@ TEST_F(OccurBooleanQueryTest, ShouldOnlyWithAllQueryMinShouldMatch) { std::vector> clauses; clauses.emplace_back(Occur::SHOULD, std::make_shared(should_docs)); - clauses.emplace_back(Occur::SHOULD, std::make_shared(50)); + clauses.emplace_back(Occur::SHOULD, std::make_shared()); OccurBooleanQuery query(std::move(clauses), 2); auto weight = query.weight(false); @@ -1031,7 
+1031,7 @@ TEST_F(OccurBooleanQueryTest, ShouldOnlyAllQueryScoring) { std::vector> clauses; clauses.emplace_back(Occur::SHOULD, std::make_shared(std::vector {1, 2}, 2.0F)); - clauses.emplace_back(Occur::SHOULD, std::make_shared(10)); + clauses.emplace_back(Occur::SHOULD, std::make_shared()); OccurBooleanQuery query(std::move(clauses)); auto weight = query.weight(true); diff --git a/be/test/olap/rowset/segment_v2/inverted_index/query_v2/regexp_wildcard_lowercase_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/query_v2/regexp_wildcard_lowercase_test.cpp new file mode 100644 index 00000000000000..f25ed8db8f04de --- /dev/null +++ b/be/test/olap/rowset/segment_v2/inverted_index/query_v2/regexp_wildcard_lowercase_test.cpp @@ -0,0 +1,228 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include +#include +#include +#include + +#include "io/fs/local_file_system.h" +#include "olap/rowset/segment_v2/index_query_context.h" +#include "olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer.h" +#include "olap/rowset/segment_v2/inverted_index/query_v2/regexp_query/regexp_query.h" +#include "olap/rowset/segment_v2/inverted_index/query_v2/wildcard_query/wildcard_query.h" +#include "olap/rowset/segment_v2/inverted_index/util/string_helper.h" + +CL_NS_USE(search) +CL_NS_USE(store) +CL_NS_USE(index) + +namespace doris::segment_v2 { + +using namespace inverted_index; +using namespace inverted_index::query_v2; + +// Test that REGEXP queries match directly against the term dictionary (no lowercasing), +// while WILDCARD queries are expected to receive already-lowercased patterns from function_search.cpp. +// +// This test creates an index with lowercased terms (simulating parser=english, lower_case=true) +// and verifies: +// 1. REGEXP with uppercase pattern does NOT match lowercased terms (ES-compatible) +// 2. REGEXP with lowercase pattern DOES match lowercased terms +// 3. 
WILDCARD with lowercase pattern DOES match lowercased terms +class RegexpWildcardLowercaseTest : public testing::Test { +public: + const std::string kTestDir = "./ut_dir/regexp_wildcard_lowercase_test"; + + void SetUp() override { + auto st = io::global_local_filesystem()->delete_directory(kTestDir); + ASSERT_TRUE(st.ok()) << st; + st = io::global_local_filesystem()->create_directory(kTestDir); + ASSERT_TRUE(st.ok()) << st; + // Create index with lowercased terms (simulating lower_case=true analyzer) + create_test_index("title", kTestDir); + } + + void TearDown() override { + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(kTestDir).ok()); + } + +private: + void create_test_index(const std::string& field_name, const std::string& dir) { + // Simulate data that was indexed with lower_case=true: + // Original data: "ABC DEF", "abc def", "Apple Banana", "cherry date" + // After english analyzer with lower_case=true, terms are all lowercase + std::vector test_data = {"abc def", "abc def", "apple banana", "cherry date"}; + + // Use standard tokenizer (which lowercases by default) + CustomAnalyzerConfig::Builder builder; + builder.with_tokenizer_config("standard", {}); + auto custom_analyzer_config = builder.build(); + auto custom_analyzer = CustomAnalyzer::build_custom_analyzer(custom_analyzer_config); + + auto* indexwriter = + _CLNEW lucene::index::IndexWriter(dir.c_str(), custom_analyzer.get(), true); + indexwriter->setMaxBufferedDocs(100); + indexwriter->setRAMBufferSizeMB(-1); + indexwriter->setMaxFieldLength(0x7FFFFFFFL); + indexwriter->setMergeFactor(1000000000); + indexwriter->setUseCompoundFile(false); + + auto char_string_reader = std::make_shared>(); + + auto* doc = _CLNEW lucene::document::Document(); + int32_t field_config = lucene::document::Field::STORE_NO; + field_config |= lucene::document::Field::INDEX_NONORMS; + field_config |= lucene::document::Field::INDEX_TOKENIZED; + auto field_name_w = std::wstring(field_name.begin(), field_name.end()); 
+ auto* field = _CLNEW lucene::document::Field(field_name_w.c_str(), field_config); + field->setOmitTermFreqAndPositions(false); + doc->add(*field); + + for (const auto& data : test_data) { + char_string_reader->init(data.data(), data.size(), false); + auto* stream = custom_analyzer->reusableTokenStream(field->name(), char_string_reader); + field->setValue(stream); + indexwriter->addDocument(doc); + } + + indexwriter->close(); + _CLLDELETE(indexwriter); + _CLLDELETE(doc); + } +}; + +static std::shared_ptr make_shared_reader( + lucene::index::IndexReader* raw_reader) { + return {raw_reader, [](lucene::index::IndexReader* reader) { + if (reader != nullptr) { + reader->close(); + _CLDELETE(reader); + } + }}; +} + +static std::vector execute_query(const std::string& test_dir, const std::wstring& field, + const std::shared_ptr& query) { + auto* dir = FSDirectory::getDirectory(test_dir.c_str()); + auto reader_holder = make_shared_reader(lucene::index::IndexReader::open(dir, true)); + + auto weight = query->weight(false); + + QueryExecutionContext exec_ctx; + exec_ctx.segment_num_rows = reader_holder->maxDoc(); + exec_ctx.readers = {reader_holder}; + exec_ctx.field_reader_bindings.emplace(field, reader_holder); + + auto scorer = weight->scorer(exec_ctx); + std::vector matched_docs; + if (scorer) { + uint32_t doc = scorer->doc(); + while (doc != TERMINATED) { + matched_docs.push_back(doc); + doc = scorer->advance(); + } + } + + _CLDECDELETE(dir); + return matched_docs; +} + +// REGEXP with uppercase pattern should NOT match lowercased index terms. +// This is consistent with ES query_string regex behavior. 
+TEST_F(RegexpWildcardLowercaseTest, RegexpUppercasePatternNoMatch) { + auto context = std::make_shared(); + std::wstring field = StringHelper::to_wstring("title"); + + // Pattern "AB.*" should NOT match "abc" (uppercase vs lowercase) + auto query = std::make_shared(context, field, "AB.*"); + auto matched = execute_query(kTestDir, field, query); + + EXPECT_EQ(matched.size(), 0) + << "Uppercase regex 'AB.*' should not match lowercased terms 'abc'"; +} + +// REGEXP with lowercase pattern SHOULD match lowercased index terms. +TEST_F(RegexpWildcardLowercaseTest, RegexpLowercasePatternMatches) { + auto context = std::make_shared(); + std::wstring field = StringHelper::to_wstring("title"); + + // Pattern "ab.*" should match "abc" (both lowercase) + auto query = std::make_shared(context, field, "ab.*"); + auto matched = execute_query(kTestDir, field, query); + + // Docs 0 and 1 contain "abc", docs 2 and 3 don't + EXPECT_EQ(matched.size(), 2) << "Lowercase regex 'ab.*' should match lowercased terms 'abc'"; +} + +// WILDCARD with lowercase pattern SHOULD match. +// In function_search.cpp, WILDCARD patterns are lowercased before being passed here. +TEST_F(RegexpWildcardLowercaseTest, WildcardLowercasePatternMatches) { + auto context = std::make_shared(); + std::wstring field = StringHelper::to_wstring("title"); + + // Pattern "ab*" (already lowercased by function_search.cpp) should match "abc" + auto query = std::make_shared(context, field, "ab*"); + auto matched = execute_query(kTestDir, field, query); + + EXPECT_EQ(matched.size(), 2) << "Lowercase wildcard 'ab*' should match lowercased terms 'abc'"; +} + +// WILDCARD with uppercase pattern should NOT match lowercased index terms +// (but in practice, function_search.cpp lowercases before passing to WildcardQuery). 
+TEST_F(RegexpWildcardLowercaseTest, WildcardUppercasePatternNoMatch) { + auto context = std::make_shared(); + std::wstring field = StringHelper::to_wstring("title"); + + // Pattern "AB*" should NOT match "abc" at the WildcardQuery level + auto query = std::make_shared(context, field, "AB*"); + auto matched = execute_query(kTestDir, field, query); + + EXPECT_EQ(matched.size(), 0) << "Uppercase wildcard 'AB*' should not match lowercased terms"; +} + +// REGEXP with a more complex pattern +TEST_F(RegexpWildcardLowercaseTest, RegexpComplexPatternMatches) { + auto context = std::make_shared(); + std::wstring field = StringHelper::to_wstring("title"); + + // Pattern "ch.*y" should match "cherry" (lowercased) + auto query = std::make_shared(context, field, "ch.*y"); + auto matched = execute_query(kTestDir, field, query); + + EXPECT_EQ(matched.size(), 1) << "Regex 'ch.*y' should match 'cherry' in doc 3"; + if (!matched.empty()) { + EXPECT_EQ(matched[0], 3); + } +} + +// WILDCARD matching all terms with '*' +TEST_F(RegexpWildcardLowercaseTest, WildcardStarMatchesAll) { + auto context = std::make_shared(); + std::wstring field = StringHelper::to_wstring("title"); + + // Pattern "a*" should match "abc" and "apple" + auto query = std::make_shared(context, field, "a*"); + auto matched = execute_query(kTestDir, field, query); + + // Docs 0,1 have "abc", doc 2 has "apple", doc 3 has no "a*" terms + EXPECT_EQ(matched.size(), 3) << "Wildcard 'a*' should match docs with 'abc' and 'apple'"; +} + +} // namespace doris::segment_v2 diff --git a/be/test/olap/rowset/segment_v2/search_function_query_cache_test.cpp b/be/test/olap/rowset/segment_v2/search_function_query_cache_test.cpp new file mode 100644 index 00000000000000..fc257f02fb1002 --- /dev/null +++ b/be/test/olap/rowset/segment_v2/search_function_query_cache_test.cpp @@ -0,0 +1,207 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include +#include +#include + +#include "olap/rowset/segment_v2/inverted_index_cache.h" +#include "olap/rowset/segment_v2/inverted_index_query_type.h" + +namespace doris::segment_v2 { + +class SearchDslQueryCacheTest : public testing::Test { +public: + static const int kCacheSize = 4096; + + void SetUp() override { _cache = new InvertedIndexQueryCache(kCacheSize, 1); } + + void TearDown() override { delete _cache; } + + InvertedIndexQueryCache::CacheKey make_key(const std::string& seg_prefix, + const std::string& dsl_sig) { + return InvertedIndexQueryCache::CacheKey { + seg_prefix, "__search_dsl__", InvertedIndexQueryType::SEARCH_DSL_QUERY, dsl_sig}; + } + +protected: + InvertedIndexQueryCache* _cache = nullptr; +}; + +TEST_F(SearchDslQueryCacheTest, insert_and_lookup) { + auto bm = std::make_shared(); + bm->add(1); + bm->add(3); + bm->add(5); + + auto key = make_key("segment_0001", "thrift_sig_abc"); + + InvertedIndexQueryCacheHandle handle; + _cache->insert(key, bm, &handle); + + // Lookup should succeed + InvertedIndexQueryCacheHandle lookup_handle; + EXPECT_TRUE(_cache->lookup(key, &lookup_handle)); + + auto cached_bm = lookup_handle.get_bitmap(); + ASSERT_NE(cached_bm, nullptr); + EXPECT_TRUE(cached_bm->contains(1)); + 
EXPECT_TRUE(cached_bm->contains(3)); + EXPECT_TRUE(cached_bm->contains(5)); + EXPECT_FALSE(cached_bm->contains(2)); + EXPECT_EQ(cached_bm->cardinality(), 3); +} + +TEST_F(SearchDslQueryCacheTest, lookup_miss) { + auto key = make_key("segment_0001", "thrift_sig_abc"); + + InvertedIndexQueryCacheHandle handle; + EXPECT_FALSE(_cache->lookup(key, &handle)); +} + +TEST_F(SearchDslQueryCacheTest, different_keys_independent) { + auto bm1 = std::make_shared(); + bm1->add(10); + auto bm2 = std::make_shared(); + bm2->add(20); + + auto key1 = make_key("seg_a", "dsl_1"); + auto key2 = make_key("seg_a", "dsl_2"); + + { + InvertedIndexQueryCacheHandle h; + _cache->insert(key1, bm1, &h); + } + { + InvertedIndexQueryCacheHandle h; + _cache->insert(key2, bm2, &h); + } + + // Lookup key1 + { + InvertedIndexQueryCacheHandle h; + EXPECT_TRUE(_cache->lookup(key1, &h)); + auto cached = h.get_bitmap(); + ASSERT_NE(cached, nullptr); + EXPECT_TRUE(cached->contains(10)); + EXPECT_FALSE(cached->contains(20)); + } + + // Lookup key2 + { + InvertedIndexQueryCacheHandle h; + EXPECT_TRUE(_cache->lookup(key2, &h)); + auto cached = h.get_bitmap(); + ASSERT_NE(cached, nullptr); + EXPECT_TRUE(cached->contains(20)); + EXPECT_FALSE(cached->contains(10)); + } +} + +TEST_F(SearchDslQueryCacheTest, different_segments_independent) { + auto bm1 = std::make_shared(); + bm1->add(1); + auto bm2 = std::make_shared(); + bm2->add(2); + + auto key1 = make_key("seg_a", "same_dsl"); + auto key2 = make_key("seg_b", "same_dsl"); + + { + InvertedIndexQueryCacheHandle h; + _cache->insert(key1, bm1, &h); + } + { + InvertedIndexQueryCacheHandle h; + _cache->insert(key2, bm2, &h); + } + + { + InvertedIndexQueryCacheHandle h; + EXPECT_TRUE(_cache->lookup(key1, &h)); + EXPECT_TRUE(h.get_bitmap()->contains(1)); + EXPECT_FALSE(h.get_bitmap()->contains(2)); + } + { + InvertedIndexQueryCacheHandle h; + EXPECT_TRUE(_cache->lookup(key2, &h)); + EXPECT_TRUE(h.get_bitmap()->contains(2)); + EXPECT_FALSE(h.get_bitmap()->contains(1)); 
+ } +} + +TEST_F(SearchDslQueryCacheTest, no_collision_with_regular_query_cache) { + // SEARCH_DSL_QUERY key should not collide with a regular EQUAL_QUERY key + // even with same index_path and value + auto bm_dsl = std::make_shared(); + bm_dsl->add(100); + auto bm_eq = std::make_shared(); + bm_eq->add(200); + + InvertedIndexQueryCache::CacheKey dsl_key { + "seg_a", "__search_dsl__", InvertedIndexQueryType::SEARCH_DSL_QUERY, "some_value"}; + InvertedIndexQueryCache::CacheKey eq_key {"seg_a", "__search_dsl__", + InvertedIndexQueryType::EQUAL_QUERY, "some_value"}; + + { + InvertedIndexQueryCacheHandle h; + _cache->insert(dsl_key, bm_dsl, &h); + } + { + InvertedIndexQueryCacheHandle h; + _cache->insert(eq_key, bm_eq, &h); + } + + { + InvertedIndexQueryCacheHandle h; + EXPECT_TRUE(_cache->lookup(dsl_key, &h)); + EXPECT_TRUE(h.get_bitmap()->contains(100)); + } + { + InvertedIndexQueryCacheHandle h; + EXPECT_TRUE(_cache->lookup(eq_key, &h)); + EXPECT_TRUE(h.get_bitmap()->contains(200)); + } +} + +TEST_F(SearchDslQueryCacheTest, overwrite_same_key) { + auto bm1 = std::make_shared(); + bm1->add(1); + auto bm2 = std::make_shared(); + bm2->add(99); + + auto key = make_key("seg", "dsl"); + + { + InvertedIndexQueryCacheHandle h; + _cache->insert(key, bm1, &h); + } + { + InvertedIndexQueryCacheHandle h; + _cache->insert(key, bm2, &h); + } + + InvertedIndexQueryCacheHandle h; + EXPECT_TRUE(_cache->lookup(key, &h)); + auto cached = h.get_bitmap(); + ASSERT_NE(cached, nullptr); + EXPECT_TRUE(cached->contains(99)); +} + +} // namespace doris::segment_v2 diff --git a/be/test/vec/function/function_search_test.cpp b/be/test/vec/function/function_search_test.cpp index 64b64b0d667b19..4e5b7cb2e84e4c 100644 --- a/be/test/vec/function/function_search_test.cpp +++ b/be/test/vec/function/function_search_test.cpp @@ -59,40 +59,6 @@ class DummyIndexIterator : public segment_v2::IndexIterator { Result has_null() override { return false; } }; -class TrackingIndexIterator : public 
segment_v2::IndexIterator { -public: - explicit TrackingIndexIterator(bool has_null) : _has_null(has_null) {} - - segment_v2::IndexReaderPtr get_reader( - segment_v2::IndexReaderType /*reader_type*/) const override { - return nullptr; - } - - Status read_from_index(const segment_v2::IndexParam& /*param*/) override { - return Status::OK(); - } - - Status read_null_bitmap(segment_v2::InvertedIndexQueryCacheHandle* /*cache_handle*/) override { - ++_read_null_bitmap_calls; - return Status::OK(); - } - - Result has_null() override { - ++_has_null_checks; - return _has_null; - } - - int read_null_bitmap_calls() const { return _read_null_bitmap_calls; } - int has_null_checks() const { return _has_null_checks; } - - void set_has_null(bool value) { _has_null = value; } - -private: - bool _has_null = false; - int _read_null_bitmap_calls = 0; - int _has_null_checks = 0; -}; - TEST_F(FunctionSearchTest, TestGetName) { EXPECT_EQ("search", function_search->get_name()); } @@ -1561,40 +1527,6 @@ TEST_F(FunctionSearchTest, TestEvaluateInvertedIndexWithSearchParamComplexQuery) } TEST_F(FunctionSearchTest, TestOrCrossFieldMatchesMatchAnyRows) { - TSearchClause left_clause; - left_clause.clause_type = "TERM"; - left_clause.field_name = "title"; - left_clause.value = "foo"; - left_clause.__isset.field_name = true; - left_clause.__isset.value = true; - - TSearchClause right_clause; - right_clause.clause_type = "TERM"; - right_clause.field_name = "content"; - right_clause.value = "bar"; - right_clause.__isset.field_name = true; - right_clause.__isset.value = true; - - TSearchClause root_clause; - root_clause.clause_type = "OR"; - root_clause.children = {left_clause, right_clause}; - root_clause.__isset.children = true; - - auto left_iterator = std::make_unique(true); - auto right_iterator = std::make_unique(true); - - std::unordered_map iterators_map = { - {"title", left_iterator.get()}, {"content", right_iterator.get()}}; - - auto null_bitmap = std::make_shared(); - auto status = 
function_search->collect_all_field_nulls(root_clause, iterators_map, null_bitmap); - EXPECT_TRUE(status.ok()); - EXPECT_GE(left_iterator->has_null_checks(), 1); - EXPECT_GE(right_iterator->has_null_checks(), 1); - EXPECT_GE(left_iterator->read_null_bitmap_calls(), 1); - EXPECT_GE(right_iterator->read_null_bitmap_calls(), 1); - EXPECT_TRUE(null_bitmap->isEmpty()); - auto data_bitmap = std::make_shared(); data_bitmap->add(1); data_bitmap->add(3); @@ -1622,38 +1554,6 @@ TEST_F(FunctionSearchTest, TestOrCrossFieldMatchesMatchAnyRows) { } TEST_F(FunctionSearchTest, TestOrWithNotSameFieldMatchesMatchAllRows) { - TSearchClause include_clause; - include_clause.clause_type = "TERM"; - include_clause.field_name = "title"; - include_clause.value = "foo"; - include_clause.__isset.field_name = true; - include_clause.__isset.value = true; - - TSearchClause exclude_child; - exclude_child.clause_type = "TERM"; - exclude_child.field_name = "title"; - exclude_child.value = "bar"; - exclude_child.__isset.field_name = true; - exclude_child.__isset.value = true; - - TSearchClause exclude_clause; - exclude_clause.clause_type = "NOT"; - exclude_clause.children = {exclude_child}; - - TSearchClause root_clause; - root_clause.clause_type = "OR"; - root_clause.children = {include_clause, exclude_clause}; - root_clause.__isset.children = true; - - auto iterator = std::make_unique(true); - std::unordered_map iterators_map = {{"title", iterator.get()}}; - - auto null_bitmap = std::make_shared(); - auto status = function_search->collect_all_field_nulls(root_clause, iterators_map, null_bitmap); - EXPECT_TRUE(status.ok()); - EXPECT_GE(iterator->has_null_checks(), 1); - EXPECT_GE(iterator->read_null_bitmap_calls(), 1); - auto data_bitmap = std::make_shared(); data_bitmap->add(1); data_bitmap->add(2); @@ -1716,8 +1616,8 @@ TEST_F(FunctionSearchTest, TestBuildLeafQueryPhrase) { inverted_index::query_v2::QueryPtr out; std::string out_binding_key; - Status st = - 
function_search->build_leaf_query(clause, context, resolver, &out, &out_binding_key); + Status st = function_search->build_leaf_query(clause, context, resolver, &out, &out_binding_key, + "OR", 0); EXPECT_TRUE(st.ok()); auto phrase_query = std::dynamic_pointer_cast(out); @@ -2201,4 +2101,19 @@ TEST_F(FunctionSearchTest, TestEvaluateInvertedIndexWithOccurBoolean) { EXPECT_TRUE(status.is()); } +TEST_F(FunctionSearchTest, TestSearcherCacheHandlesLifetime) { + // Verify FieldReaderResolver keeps _searcher_cache_handles alive + std::unordered_map data_types; + std::unordered_map iterators; + auto context = std::make_shared(); + + FieldReaderResolver resolver(data_types, iterators, context); + + // The resolver should have an empty cache handles vector initially + // (We can't directly access _searcher_cache_handles, but we can verify + // that binding_cache is empty) + EXPECT_TRUE(resolver.binding_cache().empty()); + EXPECT_TRUE(resolver.readers().empty()); +} + } // namespace doris::vectorized diff --git a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/search/SearchLexer.g4 b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/search/SearchLexer.g4 index 4dab0af2ed2ce9..7767e66ac7dec7 100644 --- a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/search/SearchLexer.g4 +++ b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/search/SearchLexer.g4 @@ -41,9 +41,9 @@ fragment QUOTED_CHAR // ============== Default lexer rules ============== -AND : 'AND' | 'and' ; -OR : 'OR' | 'or' ; -NOT : 'NOT' | 'not' | '!' ; +AND : 'AND' ; +OR : 'OR' ; +NOT : 'NOT' | '!' 
; LPAREN : '(' ; RPAREN : ')' ; diff --git a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/search/SearchParser.g4 b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/search/SearchParser.g4 index cc5f6082cd6a7e..3ff445ea1d6e8b 100644 --- a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/search/SearchParser.g4 +++ b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/search/SearchParser.g4 @@ -25,9 +25,14 @@ orClause : andClause (OR andClause)* ; // AND is optional - space-separated terms use default_operator andClause : notClause (AND? notClause)* ; notClause : NOT atomClause | atomClause ; -// Note: fieldQuery is listed before bareQuery so ANTLR prioritizes field:value over bare value. +// Note: fieldGroupQuery is listed before fieldQuery so ANTLR prioritizes field:(group) over field:value. +// fieldQuery is listed before bareQuery so ANTLR prioritizes field:value over bare value. // This ensures "field:term" is parsed as fieldQuery, not bareQuery with "field" as term. -atomClause : LPAREN clause RPAREN | fieldQuery | bareQuery ; +atomClause : LPAREN clause RPAREN | fieldGroupQuery | fieldQuery | bareQuery ; + +// Support for field:(grouped query) syntax, e.g., title:(rock OR jazz) +// All terms inside the parentheses inherit the field prefix. 
+fieldGroupQuery : fieldPath COLON LPAREN clause RPAREN ; // Support for variant subcolumn paths (e.g., field.subcolumn, field.sub1.sub2) fieldQuery : fieldPath COLON searchValue ; diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/SearchPredicate.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/SearchPredicate.java index 082a83f13545da..7bf711428c34c3 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/SearchPredicate.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/SearchPredicate.java @@ -17,6 +17,7 @@ package org.apache.doris.analysis; +import org.apache.doris.catalog.Index; import org.apache.doris.catalog.Type; import org.apache.doris.nereids.trees.expressions.functions.scalar.SearchDslParser; import org.apache.doris.nereids.trees.expressions.functions.scalar.SearchDslParser.QsPlan; @@ -33,7 +34,9 @@ import org.apache.logging.log4j.Logger; import java.util.ArrayList; +import java.util.Collections; import java.util.List; +import java.util.Map; import java.util.stream.IntStream; /** @@ -45,11 +48,18 @@ public class SearchPredicate extends Predicate { private final String dslString; private final QsPlan qsPlan; + private final List fieldIndexes; public SearchPredicate(String dslString, QsPlan qsPlan, List children) { + this(dslString, qsPlan, children, Collections.emptyList()); + } + + public SearchPredicate(String dslString, QsPlan qsPlan, List children, + List fieldIndexes) { super(); this.dslString = dslString; this.qsPlan = qsPlan; + this.fieldIndexes = fieldIndexes != null ? 
fieldIndexes : Collections.emptyList(); this.type = Type.BOOLEAN; // Add children (SlotReferences) @@ -62,6 +72,7 @@ protected SearchPredicate(SearchPredicate other) { super(other); this.dslString = other.dslString; this.qsPlan = other.qsPlan; + this.fieldIndexes = other.fieldIndexes; } @Override @@ -182,10 +193,30 @@ private TSearchParam buildThriftParam() { thriftBinding.setSlotIndex(i); // fallback to position } + // Set index properties from FE Index lookup (needed for variant subcolumn analyzer) + if (i < fieldIndexes.size() && fieldIndexes.get(i) != null) { + Map properties = fieldIndexes.get(i).getProperties(); + if (properties != null && !properties.isEmpty()) { + thriftBinding.setIndexProperties(properties); + LOG.debug("buildThriftParam: field='{}' index_properties={}", + fieldPath, properties); + } + } + bindings.add(thriftBinding); } param.setFieldBindings(bindings); + // Set default_operator for BE to use when tokenizing TERM queries + if (qsPlan.getDefaultOperator() != null) { + param.setDefaultOperator(qsPlan.getDefaultOperator()); + } + + // Set minimum_should_match for BE to use when tokenizing TERM queries in Lucene mode + if (qsPlan.getMinimumShouldMatch() != null) { + param.setMinimumShouldMatch(qsPlan.getMinimumShouldMatch()); + } + return param; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/ExpressionTranslator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/ExpressionTranslator.java index 80accc33c33105..e835392187ab04 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/ExpressionTranslator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/ExpressionTranslator.java @@ -642,19 +642,31 @@ public Expr visitDictGetMany(DictGetMany dictGetMany, PlanTranslatorContext cont public Expr visitSearchExpression(SearchExpression searchExpression, PlanTranslatorContext context) { List slotChildren = new ArrayList<>(); + List fieldIndexes = new 
ArrayList<>(); // Convert slot reference children from Nereids to Analysis for (Expression slotExpr : searchExpression.getSlotChildren()) { Expr translatedSlot = slotExpr.accept(this, context); slotChildren.add(translatedSlot); + + // Look up the inverted index for each field (needed for variant subcolumn analyzer) + Index invertedIndex = null; + if (slotExpr instanceof SlotReference) { + SlotReference slot = (SlotReference) slotExpr; + OlapTable olapTbl = getOlapTableDirectly(slot); + if (olapTbl != null) { + Column column = slot.getOriginalColumn().orElse(null); + if (column != null) { + invertedIndex = olapTbl.getInvertedIndex(column, slot.getSubPath()); + } + } + } + fieldIndexes.add(invertedIndex); } // Create SearchPredicate with proper slot children for BE "action on slot" detection - SearchPredicate searchPredicate = - new SearchPredicate( - searchExpression.getDslString(), - searchExpression.getQsPlan(), - slotChildren); + SearchPredicate searchPredicate = new SearchPredicate(searchExpression.getDslString(), + searchExpression.getQsPlan(), slotChildren, fieldIndexes); searchPredicate.setNullableFromNereids(searchExpression.nullable()); return searchPredicate; diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParser.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParser.java index 40d10d30986345..bf0bc8f6168947 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParser.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParser.java @@ -22,6 +22,7 @@ import org.apache.doris.nereids.search.SearchParserBaseVisitor; import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.annotation.JsonSetter; import 
com.fasterxml.jackson.core.JsonProcessingException; @@ -61,6 +62,7 @@ public class SearchDslParser { private static final Logger LOG = LogManager.getLogger(SearchDslParser.class); private static final ObjectMapper JSON_MAPPER = new ObjectMapper(); + private static final int MAX_FIELD_GROUP_DEPTH = 32; /** * Exception for search DSL syntax errors. @@ -110,23 +112,30 @@ public static QsPlan parseDsl(String dsl, @Nullable String optionsJson) { String defaultField = searchOptions.getDefaultField(); String defaultOperator = searchOptions.getDefaultOperator(); + QsPlan plan; // Use Lucene mode parser if specified if (searchOptions.isLuceneMode()) { // Multi-field + Lucene mode: first expand DSL, then parse with Lucene semantics if (searchOptions.isMultiFieldMode()) { - return parseDslMultiFieldLuceneMode(dsl, searchOptions.getFields(), + plan = parseDslMultiFieldLuceneMode(dsl, searchOptions.getFields(), defaultOperator, searchOptions); - } - return parseDslLuceneMode(dsl, defaultField, defaultOperator, searchOptions); - } - - // Multi-field mode parsing (standard mode) - if (searchOptions.isMultiFieldMode()) { - return parseDslMultiFieldMode(dsl, searchOptions.getFields(), defaultOperator, searchOptions); - } - - // Standard mode parsing - return parseDslStandardMode(dsl, defaultField, defaultOperator); + } else { + plan = parseDslLuceneMode(dsl, defaultField, defaultOperator, searchOptions); + } + } else if (searchOptions.isMultiFieldMode()) { + // Multi-field mode parsing (standard mode) + plan = parseDslMultiFieldMode(dsl, searchOptions.getFields(), defaultOperator, searchOptions); + } else { + // Standard mode parsing + plan = parseDslStandardMode(dsl, defaultField, defaultOperator); + } + + // Wrap plan with options for BE serialization + // NOTE: Must use normalizeDefaultOperator() here because BE compares + // default_operator case-sensitively against lowercase "and"/"or" + return new QsPlan(plan.getRoot(), plan.getFieldBindings(), + 
normalizeDefaultOperator(searchOptions.getDefaultOperator()), + searchOptions.getMinimumShouldMatch()); } /** @@ -298,6 +307,43 @@ private static void collectFieldNamesRecursive(QsNode node, Set fieldNam } } + /** + * Recursively mark leaf nodes with the given field name and set explicitField=true. + * Used for field-grouped queries like title:(rock OR jazz) to ensure inner leaf nodes + * are bound to the group's field and are not re-expanded by MultiFieldExpander. + * + * Skips nodes already marked as explicitField to preserve inner explicit bindings, + * e.g., title:(content:foo OR bar) keeps content:foo intact and only sets title on bar. + * + * @param depth current recursion depth to prevent StackOverflow from malicious input + */ + private static void markExplicitFieldRecursive(QsNode node, String field) { + markExplicitFieldRecursive(node, field, 0); + } + + private static void markExplicitFieldRecursive(QsNode node, String field, int depth) { + if (node == null) { + return; + } + if (depth > MAX_FIELD_GROUP_DEPTH) { + throw new SearchDslSyntaxException( + "Field group query nesting too deep (max " + MAX_FIELD_GROUP_DEPTH + ")"); + } + // Skip nodes already explicitly bound to a field (e.g., inner field:term inside a group) + if (node.isExplicitField()) { + return; + } + if (node.getChildren() != null && !node.getChildren().isEmpty()) { + for (QsNode child : node.getChildren()) { + markExplicitFieldRecursive(child, field, depth + 1); + } + } else { + // Leaf node - set field and mark as explicit + node.setField(field); + node.setExplicitField(true); + } + } + /** * Common ANTLR parsing helper with visitor pattern. * Reduces code duplication across parsing methods. 
@@ -445,6 +491,11 @@ public void syntaxError(org.antlr.v4.runtime.Recognizer recognizer, // Extract field bindings from expanded AST Set fieldNames = collectFieldNames(expandedRoot); + // If no fields were collected (e.g., MATCH_ALL_DOCS query that matches all docs + // regardless of field), use the original fields list to ensure proper push-down + if (fieldNames.isEmpty()) { + fieldNames = new LinkedHashSet<>(fields); + } List bindings = new ArrayList<>(); int slotIndex = 0; for (String fieldName : fieldNames) { @@ -480,6 +531,12 @@ private static QsPlan parseDslMultiFieldLuceneMode(String dsl, List fiel } validateFieldsList(fields); + // For multi-field mode (fields.size() > 1), ignore minimum_should_match. + // The expanded DSL creates complex nested boolean structures where msm + // semantics become ambiguous. This is a deliberate design decision. + final SearchOptions effectiveOptions = fields.size() > 1 + ? options.withMinimumShouldMatch(null) : options; + String trimmedDsl = dsl.trim(); try { @@ -507,32 +564,33 @@ public void syntaxError(org.antlr.v4.runtime.Recognizer recognizer, // Build AST using Lucene-mode visitor with first field as placeholder for bare queries // Use constructor with override to avoid mutating shared options object (thread-safety) - QsLuceneModeAstBuilder visitor = new QsLuceneModeAstBuilder(options, fields.get(0)); + QsLuceneModeAstBuilder visitor = new QsLuceneModeAstBuilder(effectiveOptions, fields.get(0)); QsNode root = visitor.visit(tree); - // Apply multi-field expansion based on type - // Pass luceneMode=true since this is Lucene mode parsing - QsNode expandedRoot; - if (options.isCrossFieldsMode()) { - // cross_fields: each term expands to OCCUR_BOOLEAN(field1:term, field2:term) - expandedRoot = MultiFieldExpander.expandCrossFields(root, fields, true); - } else if (options.isBestFieldsMode()) { - // best_fields: entire query copied per field, joined with OCCUR_BOOLEAN - expandedRoot = MultiFieldExpander.expandBestFields(root, 
fields, true); - } else { - throw new IllegalStateException( - "Invalid type value: '" + options.getType() + "'. Expected 'best_fields' or 'cross_fields'"); - } + // In ES query_string, both best_fields and cross_fields use per-clause expansion + // (each clause is independently expanded across fields). The difference is only + // in scoring (dis_max vs blended analysis), which doesn't apply to Doris since + // search() is a boolean filter. So we always use expandCrossFields here. + // Type validation already happened in SearchOptions.setType(). + QsNode expandedRoot = MultiFieldExpander.expandCrossFields(root, fields, true); // Extract field bindings from expanded AST Set fieldNames = collectFieldNames(expandedRoot); + // If no fields were collected (e.g., MATCH_ALL_DOCS query that matches all docs + // regardless of field), use the original fields list to ensure proper push-down + if (fieldNames.isEmpty()) { + fieldNames = new LinkedHashSet<>(fields); + } List bindings = new ArrayList<>(); int slotIndex = 0; for (String fieldName : fieldNames) { bindings.add(new QsFieldBinding(fieldName, slotIndex++)); } - return new QsPlan(expandedRoot, bindings); + // Include default_operator and minimum_should_match for BE + return new QsPlan(expandedRoot, bindings, + normalizeDefaultOperator(effectiveOptions.getDefaultOperator()), + effectiveOptions.getMinimumShouldMatch()); } catch (SearchDslSyntaxException e) { LOG.error("Failed to parse search DSL in multi-field Lucene mode: '{}'", dsl, e); @@ -560,7 +618,8 @@ public enum QsClauseType { AND, // clause1 AND clause2 (standard boolean algebra) OR, // clause1 OR clause2 (standard boolean algebra) NOT, // NOT clause (standard boolean algebra) - OCCUR_BOOLEAN // Lucene-style boolean query with MUST/SHOULD/MUST_NOT + OCCUR_BOOLEAN, // Lucene-style boolean query with MUST/SHOULD/MUST_NOT + MATCH_ALL_DOCS // Matches all documents (used for pure NOT query rewriting) } /** @@ -732,6 +791,13 @@ public QsNode 
visitAtomClause(SearchParser.AtomClauseContext ctx) { } return result; } + if (ctx.fieldGroupQuery() != null) { + QsNode result = visit(ctx.fieldGroupQuery()); + if (result == null) { + throw new SearchDslSyntaxException("Invalid field group query"); + } + return result; + } if (ctx.fieldQuery() != null) { QsNode result = visit(ctx.fieldQuery()); if (result == null) { @@ -751,18 +817,21 @@ public QsNode visitAtomClause(SearchParser.AtomClauseContext ctx) { @Override public QsNode visitBareQuery(SearchParser.BareQueryContext ctx) { - // Bare query - uses default field - if (defaultField == null || defaultField.isEmpty()) { + // Use currentFieldName if inside a field group context (set by visitFieldGroupQuery), + // otherwise fall back to the configured defaultField. + String effectiveField = (currentFieldName != null && !currentFieldName.isEmpty()) + ? currentFieldName : defaultField; + if (effectiveField == null || effectiveField.isEmpty()) { throw new SearchDslSyntaxException( "No field specified and no default_field configured. 
" + "Either use field:value syntax or set default_field in options."); } - fieldNames.add(defaultField); + fieldNames.add(effectiveField); - // Set current field context to default field before visiting search value + // Set current field context before visiting search value String previousFieldName = currentFieldName; - currentFieldName = defaultField; + currentFieldName = effectiveField; try { if (ctx.searchValue() == null) { @@ -816,6 +885,8 @@ public QsNode visitFieldQuery(SearchParser.FieldQueryContext ctx) { if (result == null) { throw new RuntimeException("Invalid search value"); } + // Mark as explicit field - user wrote "field:term" syntax + result.setExplicitField(true); return result; } finally { // Restore previous context @@ -823,6 +894,50 @@ public QsNode visitFieldQuery(SearchParser.FieldQueryContext ctx) { } } + @Override + public QsNode visitFieldGroupQuery(SearchParser.FieldGroupQueryContext ctx) { + if (ctx.fieldPath() == null) { + throw new SearchDslSyntaxException("Invalid field group query: missing field path"); + } + + // Build complete field path from segments (support field.subcolumn syntax) + StringBuilder fullPath = new StringBuilder(); + List segments = ctx.fieldPath().fieldSegment(); + for (int i = 0; i < segments.size(); i++) { + if (i > 0) { + fullPath.append('.'); + } + String segment = segments.get(i).getText(); + if (segment.startsWith("\"") && segment.endsWith("\"")) { + segment = segment.substring(1, segment.length() - 1); + } + fullPath.append(segment); + } + + String fieldPath = fullPath.toString(); + fieldNames.add(fieldPath); + + // Set field group context so bare terms inside use this field + String previousFieldName = currentFieldName; + currentFieldName = fieldPath; + + try { + if (ctx.clause() == null) { + throw new SearchDslSyntaxException("Invalid field group query: missing inner clause"); + } + QsNode result = visit(ctx.clause()); + if (result == null) { + throw new SearchDslSyntaxException("Invalid field group query: 
inner clause returned null"); + } + // Mark all leaf nodes as explicitly bound to this field. + // This prevents MultiFieldExpander from re-expanding them across other fields. + markExplicitFieldRecursive(result, fieldPath); + return result; + } finally { + currentFieldName = previousFieldName; + } + } + @Override public QsNode visitSearchValue(SearchParser.SearchValueContext ctx) { String fieldName = getCurrentFieldName(); @@ -875,6 +990,10 @@ private QsNode createTermNode(String fieldName, String value) { } private QsNode createPrefixNode(String fieldName, String value) { + // Standalone * → MATCH_ALL_DOCS (matches ES behavior: field:* becomes ExistsQuery) + if ("*".equals(value)) { + return new QsNode(QsClauseType.MATCH_ALL_DOCS, (List) null); + } return new QsNode(QsClauseType.PREFIX, fieldName, unescapeTermValue(value)); } @@ -996,11 +1115,28 @@ public static class QsPlan { @JsonProperty("fieldBindings") private final List fieldBindings; + @JsonProperty("defaultOperator") + private final String defaultOperator; + + @JsonProperty("minimumShouldMatch") + private final Integer minimumShouldMatch; + @JsonCreator public QsPlan(@JsonProperty("root") QsNode root, @JsonProperty("fieldBindings") List fieldBindings) { + this(root, fieldBindings, null, null); + } + + public QsPlan(QsNode root, List fieldBindings, String defaultOperator) { + this(root, fieldBindings, defaultOperator, null); + } + + public QsPlan(QsNode root, List fieldBindings, String defaultOperator, + Integer minimumShouldMatch) { this.root = Objects.requireNonNull(root, "root cannot be null"); this.fieldBindings = fieldBindings != null ? 
new ArrayList<>(fieldBindings) : new ArrayList<>(); + this.defaultOperator = defaultOperator; + this.minimumShouldMatch = minimumShouldMatch; } public QsNode getRoot() { @@ -1011,6 +1147,14 @@ public List getFieldBindings() { return Collections.unmodifiableList(fieldBindings); } + public String getDefaultOperator() { + return defaultOperator; + } + + public Integer getMinimumShouldMatch() { + return minimumShouldMatch; + } + /** * Parse QsPlan from JSON string */ @@ -1036,7 +1180,7 @@ public String toJson() { @Override public int hashCode() { - return Objects.hash(root, fieldBindings); + return Objects.hash(root, fieldBindings, defaultOperator, minimumShouldMatch); } @Override @@ -1049,7 +1193,9 @@ public boolean equals(Object o) { } QsPlan qsPlan = (QsPlan) o; return Objects.equals(root, qsPlan.getRoot()) - && Objects.equals(fieldBindings, qsPlan.getFieldBindings()); + && Objects.equals(fieldBindings, qsPlan.getFieldBindings()) + && Objects.equals(defaultOperator, qsPlan.getDefaultOperator()) + && Objects.equals(minimumShouldMatch, qsPlan.getMinimumShouldMatch()); } } @@ -1081,6 +1227,15 @@ public static class QsNode { @JsonProperty("minimumShouldMatch") private final Integer minimumShouldMatch; + /** + * Whether the field was explicitly specified in the DSL syntax (e.g., title:music) + * vs assigned from default field for bare queries (e.g., music). + * Used internally by MultiFieldExpander to avoid expanding explicit field prefixes. + * Not serialized to JSON since it's only needed during FE-side AST expansion. + */ + @JsonIgnore + private boolean explicitField; + /** * Constructor for JSON deserialization * @@ -1185,6 +1340,23 @@ public Integer getMinimumShouldMatch() { return minimumShouldMatch; } + /** + * Returns whether the field was explicitly specified in the DSL syntax. + */ + public boolean isExplicitField() { + return explicitField; + } + + /** + * Sets whether the field was explicitly specified in the DSL syntax. 
+ * @param explicitField true if field was explicitly specified (e.g., title:music) + * @return this node for method chaining + */ + public QsNode setExplicitField(boolean explicitField) { + this.explicitField = explicitField; + return this; + } + /** * Sets the occur type for this node. * @param occur the occur type (MUST, SHOULD, MUST_NOT) @@ -1319,51 +1491,23 @@ public static QsNode expandCrossFields(QsNode root, List fields, boolean * @return Expanded AST */ public static QsNode expandBestFields(QsNode root, List fields) { - return expandBestFields(root, fields, false); - } - - /** - * Expand AST using best_fields strategy with optional Lucene mode. - * @param root The AST root node - * @param fields List of fields to expand across - * @param luceneMode If true, use Lucene-style OCCUR_BOOLEAN; if false, use standard OR - */ - public static QsNode expandBestFields(QsNode root, List fields, boolean luceneMode) { if (fields == null || fields.isEmpty()) { return root; } if (fields.size() == 1) { - // Single field - just set the field on all leaf nodes return setFieldOnLeaves(root, fields.get(0), fields); } - // Use the explicit luceneMode parameter only - don't infer from node properties - boolean isLuceneMode = luceneMode; - - // Create a copy of the entire AST for each field + // Non-lucene mode (used by parseDslMultiFieldMode for multi_match semantics): + // Copy entire AST per field, join with OR. 
+ // Example: "hello AND world" with fields=[title,content] becomes + // (title:hello AND title:world) OR (content:hello AND content:world) List fieldTrees = new ArrayList<>(); for (String field : fields) { QsNode copy = deepCopyWithField(root, field, fields); - // In Lucene mode, set SHOULD on each field tree - if (isLuceneMode) { - copy.setOccur(QsOccur.SHOULD); - } fieldTrees.add(copy); } - - // In Lucene mode, create OCCUR_BOOLEAN instead of OR - if (isLuceneMode) { - // Preserve minimum_should_match from root if it has one - Integer minShouldMatch = root.getMinimumShouldMatch(); - if (minShouldMatch == null) { - // Default: at least 1 field should match - minShouldMatch = 1; - } - return new QsNode(QsClauseType.OCCUR_BOOLEAN, fieldTrees, minShouldMatch); - } else { - // Standard mode: join with OR - return new QsNode(QsClauseType.OR, fieldTrees); - } + return new QsNode(QsClauseType.OR, fieldTrees); } /** @@ -1371,13 +1515,15 @@ public static QsNode expandBestFields(QsNode root, List fields, boolean * Always returns a new copy or new node structure, never the original node. 
*/ private static QsNode expandNodeCrossFields(QsNode node, List fields, boolean luceneMode) { + // MATCH_ALL_DOCS matches all documents regardless of field - don't expand + if (node.getType() == QsClauseType.MATCH_ALL_DOCS) { + return new QsNode(QsClauseType.MATCH_ALL_DOCS, (List) null); + } + // Check if this is a leaf node (no children) if (isLeafNode(node)) { - // Check if the node has an explicit field that's NOT in the fields list - // If so, don't expand but still return a copy - String nodeField = node.getField(); - if (nodeField != null && !nodeField.isEmpty() && !fields.contains(nodeField)) { - // Explicit field not in expansion list - return a copy preserving all fields + // If the user explicitly wrote "field:term" syntax, respect it - don't expand + if (node.isExplicitField()) { return new QsNode( node.getType(), node.getField(), @@ -1450,17 +1596,13 @@ private static boolean isLeafNode(QsNode node) { * Always returns a new copy, never the original node. */ private static QsNode deepCopyWithField(QsNode node, String field, List fields) { + // MATCH_ALL_DOCS matches all documents regardless of field - don't set field + if (node.getType() == QsClauseType.MATCH_ALL_DOCS) { + return new QsNode(QsClauseType.MATCH_ALL_DOCS, (List) null); + } if (isLeafNode(node)) { - // Check if the node has an explicit field that's NOT in the fields list - String nodeField = node.getField(); - String targetField; - if (nodeField != null && !nodeField.isEmpty() && !fields.contains(nodeField)) { - // Explicit field not in expansion list - preserve original field - targetField = nodeField; - } else { - // Use new field - targetField = field; - } + // If the user explicitly wrote "field:term" syntax, preserve original field + String targetField = node.isExplicitField() ? 
node.getField() : field; // Create a complete copy of the leaf node QsNode copy = new QsNode( @@ -1471,6 +1613,7 @@ private static QsNode deepCopyWithField(QsNode node, String field, List node.getOccur(), node.getMinimumShouldMatch() ); + copy.setExplicitField(node.isExplicitField()); return copy; } @@ -1500,16 +1643,13 @@ private static QsNode deepCopyWithField(QsNode node, String field, List * Always returns a new copy, never the original node. */ private static QsNode setFieldOnLeaves(QsNode node, String field, List fields) { + // MATCH_ALL_DOCS matches all documents regardless of field - don't set field + if (node.getType() == QsClauseType.MATCH_ALL_DOCS) { + return new QsNode(QsClauseType.MATCH_ALL_DOCS, (List) null); + } if (isLeafNode(node)) { - // Check if the node has an explicit field that's NOT in the fields list - String nodeField = node.getField(); - String targetField; - if (nodeField != null && !nodeField.isEmpty() && !fields.contains(nodeField)) { - // Explicit field not in expansion list - preserve original field - targetField = nodeField; - } else { - targetField = field; - } + // If the user explicitly wrote "field:term" syntax, preserve original field + String targetField = node.isExplicitField() ? node.getField() : field; // Create complete copy return new QsNode( @@ -1676,6 +1816,21 @@ public boolean isCrossFieldsMode() { return "cross_fields".equals(type); } + /** + * Create a copy of this SearchOptions with a different minimum_should_match value. + * Used for ES compatibility in multi-field mode where msm is ignored. + */ + public SearchOptions withMinimumShouldMatch(Integer newMsm) { + SearchOptions copy = new SearchOptions(); + copy.defaultField = this.defaultField; + copy.defaultOperator = this.defaultOperator; + copy.mode = this.mode; + copy.minimumShouldMatch = newMsm; + copy.fields = this.fields != null ? new ArrayList<>(this.fields) : null; + copy.type = this.type; + return copy; + } + /** * Validate the options after deserialization. 
* Checks for: @@ -1803,7 +1958,10 @@ public void syntaxError(org.antlr.v4.runtime.Recognizer recognizer, bindings.add(new QsFieldBinding(fieldName, slotIndex++)); } - return new QsPlan(root, bindings); + // Include default_operator and minimum_should_match for BE + return new QsPlan(root, bindings, + normalizeDefaultOperator(defaultOperator), + options.getMinimumShouldMatch()); } catch (SearchDslSyntaxException e) { // Syntax error in DSL - user input issue @@ -1841,6 +1999,7 @@ private static class QsLuceneModeAstBuilder extends SearchParserBaseVisitor) null); + matchAllNode.setOccur(QsOccur.SHOULD); + List children = new ArrayList<>(); + children.add(matchAllNode); children.add(singleTerm.node); - return new QsNode(QsClauseType.OCCUR_BOOLEAN, children, 0); + return new QsNode(QsClauseType.OCCUR_BOOLEAN, children, 1); } // Single non-negated term - return directly without wrapper return singleTerm.node; @@ -1918,37 +2083,32 @@ private QsNode processLuceneBooleanChain(SearchParser.OrClauseContext ctx) { applyLuceneBooleanLogic(terms); // Determine minimum_should_match - Integer minShouldMatch = options.getMinimumShouldMatch(); + // Only use explicit option at top level; nested clauses use default logic + Integer minShouldMatch = (nestingLevel == 0) ? options.getMinimumShouldMatch() : null; if (minShouldMatch == null) { // Default: 0 if there are MUST clauses, 1 if only SHOULD + // This matches Lucene BooleanQuery default behavior boolean hasMust = terms.stream().anyMatch(t -> t.occur == QsOccur.MUST); boolean hasMustNot = terms.stream().anyMatch(t -> t.occur == QsOccur.MUST_NOT); minShouldMatch = (hasMust || hasMustNot) ? 
0 : 1; } - // Filter out SHOULD clauses if minimum_should_match=0 and there are MUST clauses final int finalMinShouldMatch = minShouldMatch; - if (minShouldMatch == 0) { - boolean hasMust = terms.stream().anyMatch(t -> t.occur == QsOccur.MUST); - if (hasMust) { - terms = terms.stream() - .filter(t -> t.occur != QsOccur.SHOULD) - .collect(Collectors.toList()); - } - } - - if (terms.isEmpty()) { - throw new RuntimeException("All terms filtered out in Lucene boolean logic"); - } if (terms.size() == 1) { TermWithOccur remainingTerm = terms.get(0); if (remainingTerm.occur == QsOccur.MUST_NOT) { - // Single MUST_NOT term - must wrap in OCCUR_BOOLEAN for BE to handle + // Single MUST_NOT term - rewrite to: SHOULD(MATCH_ALL_DOCS) + MUST_NOT(term) + // This ensures proper Lucene semantics: match all docs EXCEPT those matching the term remainingTerm.node.setOccur(QsOccur.MUST_NOT); + + QsNode matchAllNode = new QsNode(QsClauseType.MATCH_ALL_DOCS, (List) null); + matchAllNode.setOccur(QsOccur.SHOULD); + List children = new ArrayList<>(); + children.add(matchAllNode); children.add(remainingTerm.node); - return new QsNode(QsClauseType.OCCUR_BOOLEAN, children, 0); + return new QsNode(QsClauseType.OCCUR_BOOLEAN, children, 1); } return remainingTerm.node; } @@ -2036,8 +2196,17 @@ private void collectTermsFromNotClause(SearchParser.NotClauseContext ctx, List - * Rules (processed left-to-right): - * 1. First term: MUST (due to default_operator=AND) - * 2. AND introduces: marks preceding and current as MUST - * 3. OR introduces: marks preceding and current as SHOULD - * 4. NOT modifier: marks current as MUST_NOT - * 5. AND after MUST_NOT: the MUST_NOT term is not affected, current becomes MUST + * Faithfully replicates Lucene QueryParserBase.addClause() semantics: + * - Processes terms left-to-right with NO operator precedence (AND/OR are equal) + * - Each conjunction affects at most the immediately preceding term + *

+ * With OR_OPERATOR (default_operator=OR): + * - First term / no conjunction: SHOULD + * - AND: preceding becomes MUST, current MUST + * - OR: current SHOULD (preceding unchanged) + *

+ * With AND_OPERATOR (default_operator=AND): + * - First term / no conjunction: MUST + * - AND: preceding becomes MUST, current MUST + * - OR: preceding becomes SHOULD, current SHOULD */ private void applyLuceneBooleanLogic(List terms) { + boolean useAnd = "AND".equalsIgnoreCase(options.getDefaultOperator()); + for (int i = 0; i < terms.size(); i++) { TermWithOccur current = terms.get(i); @@ -2073,36 +2251,44 @@ private void applyLuceneBooleanLogic(List terms) { // NOT modifier - mark as MUST_NOT current.occur = QsOccur.MUST_NOT; - // OR + NOT: preceding becomes SHOULD (if not already MUST_NOT) - if (current.introducedByOr && i > 0) { + if (current.introducedByAnd && i > 0) { + // AND + NOT: AND still makes preceding MUST + TermWithOccur prev = terms.get(i - 1); + if (prev.occur != QsOccur.MUST_NOT) { + prev.occur = QsOccur.MUST; + } + } else if (current.introducedByOr && i > 0 && useAnd) { + // OR + NOT with AND_OPERATOR: preceding becomes SHOULD TermWithOccur prev = terms.get(i - 1); if (prev.occur != QsOccur.MUST_NOT) { prev.occur = QsOccur.SHOULD; } } + // OR + NOT with OR_OPERATOR: no change to preceding } else if (current.introducedByAnd) { - // AND introduces: both preceding and current are MUST + // AND: preceding becomes MUST, current MUST current.occur = QsOccur.MUST; if (i > 0) { TermWithOccur prev = terms.get(i - 1); - // Don't change MUST_NOT to MUST if (prev.occur != QsOccur.MUST_NOT) { prev.occur = QsOccur.MUST; } } } else if (current.introducedByOr) { - // OR introduces: both preceding and current are SHOULD + // OR: current is SHOULD current.occur = QsOccur.SHOULD; - if (i > 0) { + // Only change preceding to SHOULD if default_operator=AND + // (Lucene: OR_OPERATOR + CONJ_OR does NOT modify preceding) + if (useAnd && i > 0) { TermWithOccur prev = terms.get(i - 1); - // Don't change MUST_NOT to SHOULD if (prev.occur != QsOccur.MUST_NOT) { prev.occur = QsOccur.SHOULD; } } } else { - // First term: MUST (default_operator=AND) - current.occur = 
QsOccur.MUST; + // First term or implicit conjunction (no explicit AND/OR) + // Lucene: SHOULD for OR_OPERATOR, MUST for AND_OPERATOR + current.occur = useAnd ? QsOccur.MUST : QsOccur.SHOULD; } } } @@ -2164,6 +2350,9 @@ public QsNode visitAtomClause(SearchParser.AtomClauseContext ctx) { if (ctx.clause() != null) { return visit(ctx.clause()); } + if (ctx.fieldGroupQuery() != null) { + return visit(ctx.fieldGroupQuery()); + } if (ctx.fieldQuery() != null) { return visit(ctx.fieldQuery()); } @@ -2175,19 +2364,22 @@ public QsNode visitAtomClause(SearchParser.AtomClauseContext ctx) { @Override public QsNode visitBareQuery(SearchParser.BareQueryContext ctx) { - // Bare query - uses effective default field (considering override) + // Use currentFieldName if inside a field group context (set by visitFieldGroupQuery), + // otherwise fall back to the effective default field. String defaultField = getEffectiveDefaultField(); - if (defaultField == null || defaultField.isEmpty()) { + String effectiveField = (currentFieldName != null && !currentFieldName.isEmpty()) + ? currentFieldName : defaultField; + if (effectiveField == null || effectiveField.isEmpty()) { throw new SearchDslSyntaxException( "No field specified and no default_field configured. 
" + "Either use field:value syntax or set default_field in options."); } - fieldNames.add(defaultField); + fieldNames.add(effectiveField); - // Set current field context to default field before visiting search value + // Set current field context before visiting search value String previousFieldName = currentFieldName; - currentFieldName = defaultField; + currentFieldName = effectiveField; try { if (ctx.searchValue() == null) { @@ -2228,12 +2420,61 @@ public QsNode visitFieldQuery(SearchParser.FieldQueryContext ctx) { currentFieldName = fieldPath; try { - return visit(ctx.searchValue()); + QsNode result = visit(ctx.searchValue()); + // Mark as explicit field - user wrote "field:term" syntax + result.setExplicitField(true); + return result; } finally { currentFieldName = previousFieldName; } } + @Override + public QsNode visitFieldGroupQuery(SearchParser.FieldGroupQueryContext ctx) { + if (ctx.fieldPath() == null) { + throw new SearchDslSyntaxException("Invalid field group query: missing field path"); + } + + // Build complete field path from segments (support field.subcolumn syntax) + StringBuilder fullPath = new StringBuilder(); + List segments = ctx.fieldPath().fieldSegment(); + for (int i = 0; i < segments.size(); i++) { + if (i > 0) { + fullPath.append('.'); + } + String segment = segments.get(i).getText(); + if (segment.startsWith("\"") && segment.endsWith("\"")) { + segment = segment.substring(1, segment.length() - 1); + } + fullPath.append(segment); + } + + String fieldPath = fullPath.toString(); + fieldNames.add(fieldPath); + + // Set field group context so bare terms inside use this field + String previousFieldName = currentFieldName; + currentFieldName = fieldPath; + nestingLevel++; + + try { + if (ctx.clause() == null) { + throw new SearchDslSyntaxException("Invalid field group query: missing inner clause"); + } + QsNode result = visit(ctx.clause()); + if (result == null) { + throw new SearchDslSyntaxException("Invalid field group query: inner clause 
returned null"); + } + // Mark all leaf nodes as explicitly bound to this field. + // This prevents MultiFieldExpander from re-expanding them across other fields. + markExplicitFieldRecursive(result, fieldPath); + return result; + } finally { + nestingLevel--; + currentFieldName = previousFieldName; + } + } + @Override public QsNode visitSearchValue(SearchParser.SearchValueContext ctx) { String fieldName = currentFieldName; @@ -2252,7 +2493,12 @@ public QsNode visitSearchValue(SearchParser.SearchValueContext ctx) { return new QsNode(QsClauseType.TERM, fieldName, unescapeTermValue(ctx.TERM().getText())); } if (ctx.PREFIX() != null) { - return new QsNode(QsClauseType.PREFIX, fieldName, unescapeTermValue(ctx.PREFIX().getText())); + String prefixText = ctx.PREFIX().getText(); + // Standalone * → MATCH_ALL_DOCS (matches ES behavior: field:* becomes ExistsQuery) + if ("*".equals(prefixText)) { + return new QsNode(QsClauseType.MATCH_ALL_DOCS, (List) null); + } + return new QsNode(QsClauseType.PREFIX, fieldName, unescapeTermValue(prefixText)); } if (ctx.WILDCARD() != null) { return new QsNode(QsClauseType.WILDCARD, fieldName, unescapeTermValue(ctx.WILDCARD().getText())); diff --git a/fe/fe-core/src/test/java/org/apache/doris/analysis/SearchPredicateTest.java b/fe/fe-core/src/test/java/org/apache/doris/analysis/SearchPredicateTest.java index c5d0e7d934d73a..c8a686188d298b 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/analysis/SearchPredicateTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/analysis/SearchPredicateTest.java @@ -17,10 +17,12 @@ package org.apache.doris.analysis; +import org.apache.doris.catalog.Index; import org.apache.doris.catalog.Type; import org.apache.doris.nereids.trees.expressions.functions.scalar.SearchDslParser; import org.apache.doris.thrift.TExprNode; import org.apache.doris.thrift.TExprNodeType; +import org.apache.doris.thrift.TSearchFieldBinding; import org.apache.doris.thrift.TSearchParam; import 
org.junit.jupiter.api.Assertions; @@ -28,7 +30,9 @@ import java.util.Arrays; import java.util.Collections; +import java.util.HashMap; import java.util.List; +import java.util.Map; /** @@ -257,4 +261,136 @@ public void testEmptyChildren() { Assertions.assertNotNull(thriftNode.search_param); Assertions.assertEquals(dsl, thriftNode.search_param.original_dsl); } + + @Test + public void testFieldIndexesPassedToThrift() { + // Simulate a variant subcolumn search where FE passes index properties + String dsl = "data.string_8:admin"; + + SearchDslParser.QsNode root = new SearchDslParser.QsNode( + SearchDslParser.QsClauseType.TERM, "data.string_8", "admin"); + List bindings = Arrays.asList( + new SearchDslParser.QsFieldBinding("data.string_8", 0)); + SearchDslParser.QsPlan plan = new SearchDslParser.QsPlan(root, bindings); + + SlotRef dataSlot = createTestSlotRef("data"); + List children = Arrays.asList(dataSlot); + + // Create an Index with analyzer properties (simulates field_pattern matched index) + Map indexProps = new HashMap<>(); + indexProps.put("parser", "unicode"); + indexProps.put("lower_case", "true"); + Index invertedIndex = new Index(1L, "idx_text", Arrays.asList("data"), + IndexDef.IndexType.INVERTED, indexProps, ""); + + List fieldIndexes = Arrays.asList(invertedIndex); + + SearchPredicate predicate = new SearchPredicate(dsl, plan, children, fieldIndexes); + + TExprNode thriftNode = new TExprNode(); + predicate.toThrift(thriftNode); + + TSearchParam param = thriftNode.search_param; + Assertions.assertNotNull(param); + Assertions.assertEquals(1, param.field_bindings.size()); + + TSearchFieldBinding binding = param.field_bindings.get(0); + Assertions.assertEquals("data.string_8", binding.field_name); + Assertions.assertTrue(binding.is_variant_subcolumn); + Assertions.assertEquals("data", binding.parent_field_name); + Assertions.assertEquals("string_8", binding.subcolumn_path); + + // Verify index_properties are set + 
Assertions.assertNotNull(binding.index_properties); + Assertions.assertEquals("unicode", binding.index_properties.get("parser")); + Assertions.assertEquals("true", binding.index_properties.get("lower_case")); + } + + @Test + public void testFieldIndexesNullDoesNotSetProperties() { + String dsl = "title:hello"; + SearchDslParser.QsPlan plan = createTestPlan(); + SlotRef titleSlot = createTestSlotRef("title"); + List children = Arrays.asList(titleSlot); + + // Pass null Index in the fieldIndexes list + List fieldIndexes = Arrays.asList((Index) null); + + SearchPredicate predicate = new SearchPredicate(dsl, plan, children, fieldIndexes); + + TExprNode thriftNode = new TExprNode(); + predicate.toThrift(thriftNode); + + TSearchParam param = thriftNode.search_param; + TSearchFieldBinding binding = param.field_bindings.get(0); + + // index_properties should not be set when Index is null + Assertions.assertFalse(binding.isSetIndexProperties()); + } + + @Test + public void testFieldIndexesEmptyListBackwardCompatible() { + // Verify that using the old constructor (without fieldIndexes) still works + String dsl = "title:hello"; + SearchDslParser.QsPlan plan = createTestPlan(); + SlotRef titleSlot = createTestSlotRef("title"); + List children = Arrays.asList(titleSlot); + + // Constructor without fieldIndexes + SearchPredicate predicate = new SearchPredicate(dsl, plan, children); + + TExprNode thriftNode = new TExprNode(); + predicate.toThrift(thriftNode); + + TSearchParam param = thriftNode.search_param; + TSearchFieldBinding binding = param.field_bindings.get(0); + + // index_properties should not be set + Assertions.assertFalse(binding.isSetIndexProperties()); + } + + @Test + public void testMultipleFieldsWithMixedIndexes() { + String dsl = "title:hello AND data.string_8:admin"; + + SearchDslParser.QsNode leftChild = new SearchDslParser.QsNode( + SearchDslParser.QsClauseType.TERM, "title", "hello"); + SearchDslParser.QsNode rightChild = new SearchDslParser.QsNode( + 
SearchDslParser.QsClauseType.TERM, "data.string_8", "admin"); + SearchDslParser.QsNode root = new SearchDslParser.QsNode( + SearchDslParser.QsClauseType.AND, Arrays.asList(leftChild, rightChild)); + + List fieldBindings = Arrays.asList( + new SearchDslParser.QsFieldBinding("title", 0), + new SearchDslParser.QsFieldBinding("data.string_8", 1)); + SearchDslParser.QsPlan plan = new SearchDslParser.QsPlan(root, fieldBindings); + + List children = Arrays.asList( + createTestSlotRef("title"), + createTestSlotRef("data")); + + // First field has no index, second has index with analyzer + Map indexProps = new HashMap<>(); + indexProps.put("parser", "unicode"); + indexProps.put("lower_case", "true"); + Index variantIndex = new Index(1L, "idx_text", Arrays.asList("data"), + IndexDef.IndexType.INVERTED, indexProps, ""); + + List fieldIndexes = Arrays.asList(null, variantIndex); + + SearchPredicate predicate = new SearchPredicate(dsl, plan, children, fieldIndexes); + + TExprNode thriftNode = new TExprNode(); + predicate.toThrift(thriftNode); + + TSearchParam param = thriftNode.search_param; + Assertions.assertEquals(2, param.field_bindings.size()); + + // First field: no index_properties + Assertions.assertFalse(param.field_bindings.get(0).isSetIndexProperties()); + + // Second field: has index_properties + Assertions.assertTrue(param.field_bindings.get(1).isSetIndexProperties()); + Assertions.assertEquals("unicode", param.field_bindings.get(1).index_properties.get("parser")); + } } diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParserTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParserTest.java index d33a9987771aeb..01bbcf8d925f23 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParserTest.java +++ 
b/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SearchDslParserTest.java @@ -664,29 +664,39 @@ public void testLuceneModeSimpleOrQuery() { @Test public void testLuceneModeAndOrMixed() { // Test: "a AND b OR c" in Lucene mode with minimum_should_match=0 - // Expected: +a (SHOULD terms discarded because MUST exists) + // Lucene addClause semantics (left-to-right, no precedence, default_operator=OR): + // a(CONJ_NONE)→SHOULD, b(CONJ_AND)→prev MUST, b MUST, c(CONJ_OR)→SHOULD (prev unchanged) + // Result: [MUST(a), MUST(b), SHOULD(c)] with msm=0 + // ES: +a +b c (SHOULD(c) kept, not filtered — msm=0 means optional, not removed) String dsl = "field:a AND field:b OR field:c"; String options = "{\"mode\":\"lucene\",\"minimum_should_match\":0}"; QsPlan plan = SearchDslParser.parseDsl(dsl, options); Assertions.assertNotNull(plan); - // With minimum_should_match=0 and MUST clauses present, SHOULD is discarded - // Only "a" remains with MUST - Assertions.assertEquals(QsClauseType.TERM, plan.getRoot().getType()); - Assertions.assertEquals("field", plan.getRoot().getField()); - Assertions.assertEquals("a", plan.getRoot().getValue()); + Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN, plan.getRoot().getType()); + Assertions.assertEquals(3, plan.getRoot().getChildren().size()); + + QsNode nodeA = plan.getRoot().getChildren().get(0); + Assertions.assertEquals("a", nodeA.getValue()); + Assertions.assertEquals(SearchDslParser.QsOccur.MUST, nodeA.getOccur()); + + QsNode nodeB = plan.getRoot().getChildren().get(1); + Assertions.assertEquals("b", nodeB.getValue()); + Assertions.assertEquals(SearchDslParser.QsOccur.MUST, nodeB.getOccur()); + + QsNode nodeC = plan.getRoot().getChildren().get(2); + Assertions.assertEquals("c", nodeC.getValue()); + Assertions.assertEquals(SearchDslParser.QsOccur.SHOULD, nodeC.getOccur()); } @Test public void testLuceneModeAndOrNotMixed() { // Test: "a AND b OR NOT c AND d" in Lucene mode - // Expected processing: - 
// - a: MUST (first term, default_operator=AND) - // - b: MUST (AND introduces) - // - c: MUST_NOT (OR + NOT, but OR makes preceding SHOULD, NOT makes current MUST_NOT) - // - d: MUST (AND introduces) - // With minimum_should_match=0: b becomes SHOULD and is discarded - // Result: +a -c +d + // Lucene addClause semantics (left-to-right, no precedence): + // a(CONJ_NONE)→SHOULD, b(CONJ_AND)→prev MUST, b MUST, + // NOT c(CONJ_OR, MOD_NOT)→MUST_NOT (prev unchanged with OR_OPERATOR), + // d(CONJ_AND)→prev(c) skip (MUST_NOT), d MUST + // Result: [MUST(a), MUST(b), MUST_NOT(c), MUST(d)] = +a +b -c +d String dsl = "field:a AND field:b OR NOT field:c AND field:d"; String options = "{\"mode\":\"lucene\",\"minimum_should_match\":0}"; QsPlan plan = SearchDslParser.parseDsl(dsl, options); @@ -694,19 +704,22 @@ public void testLuceneModeAndOrNotMixed() { Assertions.assertNotNull(plan); Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN, plan.getRoot().getType()); - // Should have 3 children: a(MUST), c(MUST_NOT), d(MUST) - // b is filtered out because it becomes SHOULD - Assertions.assertEquals(3, plan.getRoot().getChildren().size()); + // Should have 4 children: a(MUST), b(MUST), c(MUST_NOT), d(MUST) + Assertions.assertEquals(4, plan.getRoot().getChildren().size()); QsNode nodeA = plan.getRoot().getChildren().get(0); Assertions.assertEquals("a", nodeA.getValue()); Assertions.assertEquals(SearchDslParser.QsOccur.MUST, nodeA.getOccur()); - QsNode nodeC = plan.getRoot().getChildren().get(1); + QsNode nodeB = plan.getRoot().getChildren().get(1); + Assertions.assertEquals("b", nodeB.getValue()); + Assertions.assertEquals(SearchDslParser.QsOccur.MUST, nodeB.getOccur()); + + QsNode nodeC = plan.getRoot().getChildren().get(2); Assertions.assertEquals("c", nodeC.getValue()); Assertions.assertEquals(SearchDslParser.QsOccur.MUST_NOT, nodeC.getOccur()); - QsNode nodeD = plan.getRoot().getChildren().get(2); + QsNode nodeD = plan.getRoot().getChildren().get(3); Assertions.assertEquals("d", 
nodeD.getValue()); Assertions.assertEquals(SearchDslParser.QsOccur.MUST, nodeD.getOccur()); } @@ -714,33 +727,58 @@ public void testLuceneModeAndOrNotMixed() { @Test public void testLuceneModeWithDefaultField() { // Test: Lucene mode with default field expansion + // Lucene addClause semantics with default_operator=AND (AND_OPERATOR): + // aterm(CONJ_NONE)→MUST, bterm(CONJ_AND)→prev MUST, bterm MUST, + // cterm(CONJ_OR)→SHOULD + prev(bterm) becomes SHOULD (AND_OPERATOR + CONJ_OR) + // Result: [MUST(aterm), SHOULD(bterm), SHOULD(cterm)] with msm=0 + // ES: +aterm bterm cterm String dsl = "aterm AND bterm OR cterm"; - // Now default_field and default_operator are inside the options JSON String options = "{\"default_field\":\"firstname\",\"default_operator\":\"and\"," + "\"mode\":\"lucene\",\"minimum_should_match\":0}"; QsPlan plan = SearchDslParser.parseDsl(dsl, options); Assertions.assertNotNull(plan); - // With minimum_should_match=0, only aterm (MUST) remains - Assertions.assertEquals(QsClauseType.TERM, plan.getRoot().getType()); - Assertions.assertEquals("firstname", plan.getRoot().getField()); - Assertions.assertEquals("aterm", plan.getRoot().getValue()); + Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN, plan.getRoot().getType()); + Assertions.assertEquals(3, plan.getRoot().getChildren().size()); + + QsNode nodeA = plan.getRoot().getChildren().get(0); + Assertions.assertEquals("firstname", nodeA.getField()); + Assertions.assertEquals("aterm", nodeA.getValue()); + Assertions.assertEquals(SearchDslParser.QsOccur.MUST, nodeA.getOccur()); + + QsNode nodeB = plan.getRoot().getChildren().get(1); + Assertions.assertEquals("bterm", nodeB.getValue()); + Assertions.assertEquals(SearchDslParser.QsOccur.SHOULD, nodeB.getOccur()); + + QsNode nodeC = plan.getRoot().getChildren().get(2); + Assertions.assertEquals("cterm", nodeC.getValue()); + Assertions.assertEquals(SearchDslParser.QsOccur.SHOULD, nodeC.getOccur()); } @Test public void testLuceneModeNotOperator() { // 
Test: "NOT a" in Lucene mode - // In Lucene mode, single NOT produces OCCUR_BOOLEAN with a MUST_NOT child - // (wrapped for BE to handle the negation properly) + // Pure NOT queries are rewritten to: SHOULD(MATCH_ALL_DOCS) + MUST_NOT(term) + // with minimum_should_match=1, following ES/Lucene semantics where pure NOT + // should return all documents EXCEPT those matching the NOT clause String dsl = "NOT field:a"; String options = "{\"mode\":\"lucene\"}"; QsPlan plan = SearchDslParser.parseDsl(dsl, options); Assertions.assertNotNull(plan); Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN, plan.getRoot().getType()); - Assertions.assertEquals(1, plan.getRoot().getChildren().size()); - Assertions.assertEquals(QsClauseType.TERM, plan.getRoot().getChildren().get(0).getType()); - Assertions.assertEquals(QsOccur.MUST_NOT, plan.getRoot().getChildren().get(0).getOccur()); + Assertions.assertEquals(2, plan.getRoot().getChildren().size()); + Assertions.assertEquals(Integer.valueOf(1), plan.getRoot().getMinimumShouldMatch()); + + // First child: MATCH_ALL_DOCS with SHOULD + QsNode matchAllNode = plan.getRoot().getChildren().get(0); + Assertions.assertEquals(QsClauseType.MATCH_ALL_DOCS, matchAllNode.getType()); + Assertions.assertEquals(QsOccur.SHOULD, matchAllNode.getOccur()); + + // Second child: TERM with MUST_NOT + QsNode termNode = plan.getRoot().getChildren().get(1); + Assertions.assertEquals(QsClauseType.TERM, termNode.getType()); + Assertions.assertEquals(QsOccur.MUST_NOT, termNode.getOccur()); } @Test @@ -817,6 +855,40 @@ public void testEscapedSpaceInTerm() { Assertions.assertEquals("First Value", plan.getRoot().getValue()); } + @Test + public void testEscapedSpaceInBareQueryLuceneMode() { + // Test: "Josh\ Brolin" (bare query, no field prefix) in lucene mode + // Should be treated as a single term "Josh Brolin", not split into two terms + String dsl = "Josh\\ Brolin"; + String optionsJson = "{\"default_field\":\"title\",\"default_operator\":\"AND\"," + + 
"\"mode\":\"lucene\",\"minimum_should_match\":0}"; + QsPlan plan = SearchDslParser.parseDsl(dsl, optionsJson); + + Assertions.assertNotNull(plan); + Assertions.assertEquals(QsClauseType.TERM, plan.getRoot().getType()); + Assertions.assertEquals("title", plan.getRoot().getField()); + Assertions.assertEquals("Josh Brolin", plan.getRoot().getValue()); + // defaultOperator must be lowercase for BE case-sensitive comparison + Assertions.assertEquals("and", plan.getDefaultOperator()); + } + + @Test + public void testDefaultOperatorNormalization() { + // Verify defaultOperator is always normalized to lowercase in the plan, + // regardless of the case used in the options JSON. + // BE compares case-sensitively: (default_operator == "and") + String dsl = "foo bar"; + String optionsJson = "{\"default_field\":\"title\",\"default_operator\":\"AND\"," + + "\"mode\":\"lucene\"}"; + QsPlan plan = SearchDslParser.parseDsl(dsl, optionsJson); + Assertions.assertEquals("and", plan.getDefaultOperator()); + + optionsJson = "{\"default_field\":\"title\",\"default_operator\":\"OR\"," + + "\"mode\":\"lucene\"}"; + plan = SearchDslParser.parseDsl(dsl, optionsJson); + Assertions.assertEquals("or", plan.getDefaultOperator()); + } + @Test public void testEscapedParentheses() { // Test: \( and \) should be treated as literal characters, not grouping @@ -872,16 +944,13 @@ public void testUppercaseAndOperator() { @Test public void testLowercaseAndOperator() { - // Test: Currently lowercase 'and' is also treated as operator - // According to PDF requirement, only uppercase should be operators - // This test documents current behavior - may need to change + // Lowercase 'and' is NOT an operator in ANTLR grammar (case-sensitive). + // With bareQuery rule, it's parsed as a bare term without field. + // Without default_field, bare term throws exception. 
String dsl = "field:a and field:b"; - QsPlan plan = SearchDslParser.parseDsl(dsl); - - Assertions.assertNotNull(plan); - // Current behavior: lowercase 'and' IS an operator - Assertions.assertEquals(QsClauseType.AND, plan.getRoot().getType()); - // TODO: If PDF requires only uppercase, this should fail and return OR or different structure + Assertions.assertThrows(RuntimeException.class, () -> { + SearchDslParser.parseDsl(dsl); + }); } @Test @@ -897,15 +966,13 @@ public void testUppercaseOrOperator() { @Test public void testLowercaseOrOperator() { - // Test: Currently lowercase 'or' is also treated as operator - // According to PDF requirement, only uppercase should be operators + // Lowercase 'or' is NOT an operator in ANTLR grammar (case-sensitive). + // With bareQuery rule, it's parsed as a bare term without field. + // Without default_field, bare term throws exception. String dsl = "field:a or field:b"; - QsPlan plan = SearchDslParser.parseDsl(dsl); - - Assertions.assertNotNull(plan); - // Current behavior: lowercase 'or' IS an operator - Assertions.assertEquals(QsClauseType.OR, plan.getRoot().getType()); - // TODO: If PDF requires only uppercase, this should fail + Assertions.assertThrows(RuntimeException.class, () -> { + SearchDslParser.parseDsl(dsl); + }); } @Test @@ -920,15 +987,13 @@ public void testUppercaseNotOperator() { @Test public void testLowercaseNotOperator() { - // Test: Currently lowercase 'not' is also treated as operator - // According to PDF requirement, only uppercase should be operators + // Lowercase 'not' is NOT an operator in ANTLR grammar (case-sensitive). + // With bareQuery rule, it's parsed as a bare term without field. + // Without default_field, bare term throws exception. 
String dsl = "not field:spam"; - QsPlan plan = SearchDslParser.parseDsl(dsl); - - Assertions.assertNotNull(plan); - // Current behavior: lowercase 'not' IS an operator - Assertions.assertEquals(QsClauseType.NOT, plan.getRoot().getType()); - // TODO: If PDF requires only uppercase, this should fail + Assertions.assertThrows(RuntimeException.class, () -> { + SearchDslParser.parseDsl(dsl); + }); } @Test @@ -1047,6 +1112,112 @@ public void testMultiFieldMixedWithExplicitField() { .anyMatch(b -> "category".equals(b.getFieldName()))); } + @Test + public void testMultiFieldExplicitFieldInFieldsList() { + // Bug fix: explicit field prefix should NOT be expanded even when the field IS in the fields list + // ES query_string always respects explicit "field:term" syntax regardless of the fields parameter. + // "title:music AND content:history" with fields=["title","content"] + // → title:music AND content:history (NOT expanded to multi-field OR) + String dsl = "title:music AND content:history"; + String options = "{\"fields\":[\"title\",\"content\"],\"type\":\"cross_fields\"}"; + QsPlan plan = SearchDslParser.parseDsl(dsl, options); + + Assertions.assertNotNull(plan); + Assertions.assertEquals(QsClauseType.AND, plan.getRoot().getType()); + Assertions.assertEquals(2, plan.getRoot().getChildren().size()); + + // First child: title:music - NOT expanded + QsNode first = plan.getRoot().getChildren().get(0); + Assertions.assertEquals(QsClauseType.TERM, first.getType()); + Assertions.assertEquals("title", first.getField()); + Assertions.assertEquals("music", first.getValue()); + + // Second child: content:history - NOT expanded + QsNode second = plan.getRoot().getChildren().get(1); + Assertions.assertEquals(QsClauseType.TERM, second.getType()); + Assertions.assertEquals("content", second.getField()); + Assertions.assertEquals("history", second.getValue()); + } + + @Test + public void testMultiFieldExplicitFieldInFieldsListBestFields() { + // Same test as above but with best_fields 
type + String dsl = "title:music AND content:history"; + String options = "{\"fields\":[\"title\",\"content\"],\"type\":\"best_fields\"}"; + QsPlan plan = SearchDslParser.parseDsl(dsl, options); + + Assertions.assertNotNull(plan); + // best_fields wraps in OR for multi-field, but explicit fields should be preserved in each copy + QsNode root = plan.getRoot(); + Assertions.assertEquals(QsClauseType.OR, root.getType()); + Assertions.assertEquals(2, root.getChildren().size()); + + // Each OR branch should have AND(title:music, content:history) - both explicit fields preserved + for (QsNode branch : root.getChildren()) { + Assertions.assertEquals(QsClauseType.AND, branch.getType()); + Assertions.assertEquals(2, branch.getChildren().size()); + + QsNode titleNode = branch.getChildren().get(0); + Assertions.assertEquals("title", titleNode.getField()); + Assertions.assertEquals("music", titleNode.getValue()); + + QsNode contentNode = branch.getChildren().get(1); + Assertions.assertEquals("content", contentNode.getField()); + Assertions.assertEquals("history", contentNode.getValue()); + } + } + + @Test + public void testMultiFieldMixedExplicitAndBareQuery() { + // "title:football AND american" with fields=["title","content"] + // → title:football AND (title:american OR content:american) + // title:football should NOT be expanded; "american" (bare) should be expanded + String dsl = "title:football AND american"; + String options = "{\"fields\":[\"title\",\"content\"],\"type\":\"cross_fields\"}"; + QsPlan plan = SearchDslParser.parseDsl(dsl, options); + + Assertions.assertNotNull(plan); + Assertions.assertEquals(QsClauseType.AND, plan.getRoot().getType()); + Assertions.assertEquals(2, plan.getRoot().getChildren().size()); + + // First child: title:football - NOT expanded (explicit field) + QsNode first = plan.getRoot().getChildren().get(0); + Assertions.assertEquals(QsClauseType.TERM, first.getType()); + Assertions.assertEquals("title", first.getField()); + 
Assertions.assertEquals("football", first.getValue()); + + // Second child: (title:american OR content:american) - expanded (bare term) + QsNode second = plan.getRoot().getChildren().get(1); + Assertions.assertEquals(QsClauseType.OR, second.getType()); + Assertions.assertEquals(2, second.getChildren().size()); + } + + @Test + public void testMultiFieldLuceneModeExplicitFieldInFieldsList() { + // Lucene mode: "title:music AND content:history" with fields=["title","content"] + // Explicit fields should be preserved, not expanded + String dsl = "title:music AND content:history"; + String options = "{\"fields\":[\"title\",\"content\"],\"default_operator\":\"and\"," + + "\"mode\":\"lucene\",\"type\":\"cross_fields\"}"; + QsPlan plan = SearchDslParser.parseDsl(dsl, options); + + Assertions.assertNotNull(plan); + QsNode root = plan.getRoot(); + Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN, root.getType()); + Assertions.assertEquals(2, root.getChildren().size()); + + // Both children should be leaf TERM nodes (not expanded to OCCUR_BOOLEAN wrappers) + QsNode first = root.getChildren().get(0); + Assertions.assertEquals(QsClauseType.TERM, first.getType()); + Assertions.assertEquals("title", first.getField()); + Assertions.assertEquals("music", first.getValue()); + + QsNode second = root.getChildren().get(1); + Assertions.assertEquals(QsClauseType.TERM, second.getType()); + Assertions.assertEquals("content", second.getField()); + Assertions.assertEquals("history", second.getValue()); + } + @Test public void testMultiFieldWithWildcard() { // Test: "hello*" + fields=["title","content"] @@ -1182,16 +1353,22 @@ public void testMultiFieldLuceneModeSimpleOr() { @Test public void testMultiFieldLuceneModeAndOrMixed() { // Test: "a AND b OR c" + fields=["title","content"] + lucene mode + minimum_should_match=0 + cross_fields - // With Lucene semantics and minimum_should_match=0: SHOULD groups are discarded - // Only "a" (MUST) remains - wrapped in OCCUR_BOOLEAN + // With no 
default_operator (default is OR_OPERATOR in Lucene): + // a=MUST (promoted by AND), b=MUST (from AND), c=SHOULD (from OR) + // With OR_OPERATOR, OR does NOT change preceding term's occur + // msm is ignored for multi-field mode, node-level msm defaults to 0 (since MUST exists) String dsl = "a AND b OR c"; String options = "{\"fields\":[\"title\",\"content\"],\"mode\":\"lucene\",\"minimum_should_match\":0,\"type\":\"cross_fields\"}"; QsPlan plan = SearchDslParser.parseDsl(dsl, options); Assertions.assertNotNull(plan); - // With minimum_should_match=0, only (title:a OR content:a) remains - // In Lucene mode, this is wrapped as OCCUR_BOOLEAN + // Root is OCCUR_BOOLEAN with 3 children: MUST(a), MUST(b), SHOULD(c) Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN, plan.getRoot().getType()); + Assertions.assertEquals(3, plan.getRoot().getChildren().size()); + // a and b are MUST, c is SHOULD + Assertions.assertEquals(QsOccur.MUST, plan.getRoot().getChildren().get(0).getOccur()); + Assertions.assertEquals(QsOccur.MUST, plan.getRoot().getChildren().get(1).getOccur()); + Assertions.assertEquals(QsOccur.SHOULD, plan.getRoot().getChildren().get(2).getOccur()); } @Test @@ -1243,16 +1420,18 @@ public void testMultiFieldLuceneModeComplexQuery() { @Test public void testMultiFieldLuceneModeMinimumShouldMatchOne() { - // Test: "a AND b OR c" with minimum_should_match=1 keeps all clauses + cross_fields + // Test: "a AND b OR c" with minimum_should_match=1 + cross_fields + multi-field + // For multi-field mode (fields.size() > 1), minimum_should_match is nullified. 
+ // Lucene addClause with default_operator=OR: [MUST(a), MUST(b), SHOULD(c)] msm=0 + // No SHOULD filtering — all 3 terms kept, each expanded to 2 fields via cross_fields String dsl = "a AND b OR c"; String options = "{\"fields\":[\"title\",\"content\"],\"mode\":\"lucene\",\"minimum_should_match\":1,\"type\":\"cross_fields\"}"; QsPlan plan = SearchDslParser.parseDsl(dsl, options); Assertions.assertNotNull(plan); Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN, plan.getRoot().getType()); - // All 3 groups should be present + // 3 terms (a, b, c), each expanded to cross_fields OCCUR_BOOLEAN Assertions.assertEquals(3, plan.getRoot().getChildren().size()); - Assertions.assertEquals(Integer.valueOf(1), plan.getRoot().getMinimumShouldMatch()); } // ============ Tests for type parameter (best_fields vs cross_fields) ============ @@ -1313,13 +1492,107 @@ public void testMultiFieldCrossFields() { @Test public void testMultiFieldBestFieldsLuceneMode() { - // Test: best_fields with Lucene mode + // Test: best_fields with Lucene mode uses per-clause expansion (matching ES query_string) + // "hello world" with AND → each term independently expanded across fields: + // MUST(SHOULD(title:hello, content:hello)) AND MUST(SHOULD(title:world, content:world)) String dsl = "hello world"; String options = "{\"fields\":[\"title\",\"content\"],\"default_operator\":\"and\",\"mode\":\"lucene\",\"type\":\"best_fields\"}"; QsPlan plan = SearchDslParser.parseDsl(dsl, options); Assertions.assertNotNull(plan); Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN, plan.getRoot().getType()); + // Per-clause expansion: 2 children (one per term), each expanded across fields + Assertions.assertEquals(2, plan.getRoot().getChildren().size()); + for (QsNode child : plan.getRoot().getChildren()) { + // Each child is an OCCUR_BOOLEAN wrapping the per-field expansion + Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN, child.getType()); + Assertions.assertEquals(2, child.getChildren().size()); // one 
per field + } + } + + @Test + public void testMultiFieldBestFieldsLuceneModePerClauseExpansion() { + // Test: best_fields with phrase + regex uses per-clause expansion (not per-field) + // ES query_string expands each clause independently across fields: + // ("Costner" AND /Li../) → MUST(title:"Costner" | content:"Costner") AND MUST(title:/Li../ | content:/Li../) + // NOT: (title:"Costner" AND title:/Li../) OR (content:"Costner" AND content:/Li../) + String dsl = "\"Costner\" /Li../"; + String options = "{\"fields\":[\"title\",\"content\"],\"default_operator\":\"and\",\"mode\":\"lucene\",\"type\":\"best_fields\"}"; + QsPlan plan = SearchDslParser.parseDsl(dsl, options); + + Assertions.assertNotNull(plan); + QsNode root = plan.getRoot(); + Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN, root.getType()); + // 2 children: one for phrase "Costner", one for regex /Li../ + Assertions.assertEquals(2, root.getChildren().size()); + + // First child: phrase "Costner" expanded across fields + QsNode phraseGroup = root.getChildren().get(0); + Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN, phraseGroup.getType()); + Assertions.assertEquals(2, phraseGroup.getChildren().size()); + Assertions.assertEquals(QsClauseType.PHRASE, phraseGroup.getChildren().get(0).getType()); + Assertions.assertEquals(QsClauseType.PHRASE, phraseGroup.getChildren().get(1).getType()); + + // Second child: regex /Li../ expanded across fields + QsNode regexpGroup = root.getChildren().get(1); + Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN, regexpGroup.getType()); + Assertions.assertEquals(2, regexpGroup.getChildren().size()); + Assertions.assertEquals(QsClauseType.REGEXP, regexpGroup.getChildren().get(0).getType()); + Assertions.assertEquals(QsClauseType.REGEXP, regexpGroup.getChildren().get(1).getType()); + } + + @Test + public void testMultiFieldExplicitFieldNotExpanded() { + // Bug #1: explicit field prefix (field:term) should NOT be expanded across fields, + // even when the field is in the 
fields list. Matches ES query_string behavior. + // "title:music AND content:history" → +title:music +content:history (no expansion) + String dsl = "title:music AND content:history"; + String options = "{\"fields\":[\"title\",\"content\"],\"default_operator\":\"and\",\"mode\":\"lucene\",\"type\":\"best_fields\"}"; + QsPlan plan = SearchDslParser.parseDsl(dsl, options); + + Assertions.assertNotNull(plan); + QsNode root = plan.getRoot(); + Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN, root.getType()); + Assertions.assertEquals(2, root.getChildren().size()); + + // First child: title:music - should be a TERM pinned to "title", NOT expanded + QsNode musicNode = root.getChildren().get(0); + Assertions.assertEquals(QsClauseType.TERM, musicNode.getType()); + Assertions.assertEquals("title", musicNode.getField()); + Assertions.assertEquals("music", musicNode.getValue()); + + // Second child: content:history - should be a TERM pinned to "content", NOT expanded + QsNode historyNode = root.getChildren().get(1); + Assertions.assertEquals(QsClauseType.TERM, historyNode.getType()); + Assertions.assertEquals("content", historyNode.getField()); + Assertions.assertEquals("history", historyNode.getValue()); + } + + @Test + public void testMultiFieldMixedExplicitAndBareTerms() { + // "title:football AND american" → +title:football +(title:american | content:american) + // Explicit field pinned, bare term expanded + String dsl = "title:football AND american"; + String options = "{\"fields\":[\"title\",\"content\"],\"default_operator\":\"and\",\"mode\":\"lucene\",\"type\":\"best_fields\"}"; + QsPlan plan = SearchDslParser.parseDsl(dsl, options); + + Assertions.assertNotNull(plan); + QsNode root = plan.getRoot(); + Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN, root.getType()); + Assertions.assertEquals(2, root.getChildren().size()); + + // First child: title:football - pinned to "title" + QsNode footballNode = root.getChildren().get(0); + 
Assertions.assertEquals(QsClauseType.TERM, footballNode.getType()); + Assertions.assertEquals("title", footballNode.getField()); + Assertions.assertEquals("football", footballNode.getValue()); + + // Second child: american - expanded across [title, content] + QsNode americanGroup = root.getChildren().get(1); + Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN, americanGroup.getType()); + Assertions.assertEquals(2, americanGroup.getChildren().size()); + Assertions.assertEquals("title", americanGroup.getChildren().get(0).getField()); + Assertions.assertEquals("content", americanGroup.getChildren().get(1).getField()); } @Test @@ -1601,4 +1874,495 @@ public void testParenthesizedBareQuery() { Assertions.assertEquals(QsClauseType.TERM, termNode.getType()); Assertions.assertEquals("title", termNode.getField()); } + + // ===================================================================== + // Hubspot-specific tests + // ===================================================================== + + @Test + public void testPhraseWithImplicitOrOperator() { + // Test: '"2003 NBA draft" Darrell' with default_operator=OR should produce same result as + // '"2003 NBA draft" OR Darrell' + String dsl1 = "\"2003 NBA draft\" Darrell"; + String dsl2 = "\"2003 NBA draft\" OR Darrell"; + String options = "{\"default_field\":\"title\",\"default_operator\":\"OR\"," + + "\"mode\":\"lucene\",\"minimum_should_match\":0}"; + + QsPlan plan1 = SearchDslParser.parseDsl(dsl1, options); + QsPlan plan2 = SearchDslParser.parseDsl(dsl2, options); + + Assertions.assertNotNull(plan1); + Assertions.assertNotNull(plan2); + + // Both should have the same structure - OCCUR_BOOLEAN with 2 SHOULD children + Assertions.assertEquals(plan2.getRoot().getType(), plan1.getRoot().getType()); + Assertions.assertEquals(plan2.getRoot().getChildren().size(), plan1.getRoot().getChildren().size()); + + // Verify the phrase is preserved as PHRASE type, not broken into terms + boolean hasPhrase1 = 
plan1.getRoot().getChildren().stream() + .anyMatch(n -> n.getType() == QsClauseType.PHRASE); + boolean hasPhrase2 = plan2.getRoot().getChildren().stream() + .anyMatch(n -> n.getType() == QsClauseType.PHRASE); + Assertions.assertTrue(hasPhrase1, "Plan 1 should contain a PHRASE node"); + Assertions.assertTrue(hasPhrase2, "Plan 2 should contain a PHRASE node"); + } + + @Test + public void testPhraseWithImplicitAndOperator() { + // Test: '"hello world" foo' with default_operator=AND + String dsl = "\"hello world\" foo"; + String options = "{\"default_field\":\"title\",\"default_operator\":\"AND\"}"; + + QsPlan plan = SearchDslParser.parseDsl(dsl, options); + + Assertions.assertNotNull(plan); + // Should create AND query: title:"hello world" AND title:foo + Assertions.assertEquals(QsClauseType.AND, plan.getRoot().getType()); + Assertions.assertEquals(2, plan.getRoot().getChildren().size()); + + // Verify the phrase is preserved + boolean hasPhrase = plan.getRoot().getChildren().stream() + .anyMatch(n -> n.getType() == QsClauseType.PHRASE); + Assertions.assertTrue(hasPhrase, "Should contain a PHRASE node"); + } + + @Test + public void testMultiplePhrases() { + // Test: '"hello world" "foo bar"' with default_operator=OR + String dsl = "\"hello world\" \"foo bar\""; + String options = "{\"default_field\":\"title\",\"default_operator\":\"OR\"}"; + + QsPlan plan = SearchDslParser.parseDsl(dsl, options); + + Assertions.assertNotNull(plan); + Assertions.assertEquals(QsClauseType.OR, plan.getRoot().getType()); + Assertions.assertEquals(2, plan.getRoot().getChildren().size()); + + // Both children should be PHRASE type + for (QsNode child : plan.getRoot().getChildren()) { + Assertions.assertEquals(QsClauseType.PHRASE, child.getType()); + } + } + + // ============ Tests for Standalone Wildcard * ============ + + @Test + public void testStandaloneWildcardWithAnd() { + // Test: "Dollar AND *" should produce: MUST(title:Dollar) AND MUST(MATCH_ALL_DOCS) + // Standalone "*" becomes 
MATCH_ALL_DOCS (matches ES behavior: field:* → ExistsQuery) + String dsl = "Dollar AND *"; + String options = "{\"default_field\":\"title\",\"default_operator\":\"OR\"," + + "\"mode\":\"lucene\",\"minimum_should_match\":0}"; + + QsPlan plan = SearchDslParser.parseDsl(dsl, options); + + Assertions.assertNotNull(plan); + Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN, plan.getRoot().getType()); + Assertions.assertEquals(2, plan.getRoot().getChildren().size()); + + // Both children should have MUST occur (AND) + for (QsNode child : plan.getRoot().getChildren()) { + Assertions.assertEquals(QsOccur.MUST, child.getOccur()); + } + + // One should be TERM (Dollar), one should be MATCH_ALL_DOCS + boolean hasTerm = plan.getRoot().getChildren().stream() + .anyMatch(n -> n.getType() == QsClauseType.TERM && "Dollar".equals(n.getValue())); + boolean hasMatchAll = plan.getRoot().getChildren().stream() + .anyMatch(n -> n.getType() == QsClauseType.MATCH_ALL_DOCS); + + Assertions.assertTrue(hasTerm, "Should contain TERM node for 'Dollar'"); + Assertions.assertTrue(hasMatchAll, "Should contain MATCH_ALL_DOCS node for '*'"); + } + + @Test + public void testStandaloneWildcardAlone() { + // Test: "*" alone becomes MATCH_ALL_DOCS (matches ES behavior: field:* → ExistsQuery) + String dsl = "*"; + String options = "{\"default_field\":\"title\",\"default_operator\":\"OR\"}"; + + QsPlan plan = SearchDslParser.parseDsl(dsl, options); + + Assertions.assertNotNull(plan); + Assertions.assertEquals(QsClauseType.MATCH_ALL_DOCS, plan.getRoot().getType()); + } + + @Test + public void testStandaloneWildcardWithOr() { + // Test: "Dollar OR *" should produce: SHOULD(title:Dollar) OR SHOULD(MATCH_ALL_DOCS) + // Standalone "*" becomes MATCH_ALL_DOCS (matches ES behavior: field:* → ExistsQuery) + String dsl = "Dollar OR *"; + String options = "{\"default_field\":\"title\",\"default_operator\":\"OR\"," + + "\"mode\":\"lucene\",\"minimum_should_match\":0}"; + + QsPlan plan = 
SearchDslParser.parseDsl(dsl, options); + + Assertions.assertNotNull(plan); + Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN, plan.getRoot().getType()); + Assertions.assertEquals(2, plan.getRoot().getChildren().size()); + + // Both children should have SHOULD occur (OR) + for (QsNode child : plan.getRoot().getChildren()) { + Assertions.assertEquals(QsOccur.SHOULD, child.getOccur()); + } + + // One should be TERM (Dollar), one should be MATCH_ALL_DOCS + boolean hasTerm = plan.getRoot().getChildren().stream() + .anyMatch(n -> n.getType() == QsClauseType.TERM && "Dollar".equals(n.getValue())); + boolean hasMatchAll = plan.getRoot().getChildren().stream() + .anyMatch(n -> n.getType() == QsClauseType.MATCH_ALL_DOCS); + + Assertions.assertTrue(hasTerm, "Should contain TERM node for 'Dollar'"); + Assertions.assertTrue(hasMatchAll, "Should contain MATCH_ALL_DOCS node for '*'"); + } + + // ===== Field-Grouped Query Tests ===== + @Test + public void testFieldGroupQuerySimpleOr() { + // title:(rock OR jazz) → OR(TERM(title,rock), TERM(title,jazz)) + // ES semantics: field prefix applies to all terms inside parentheses + String dsl = "title:(rock OR jazz)"; + QsPlan plan = SearchDslParser.parseDsl(dsl); + + Assertions.assertNotNull(plan); + QsNode root = plan.getRoot(); + Assertions.assertEquals(QsClauseType.OR, root.getType()); + Assertions.assertEquals(2, root.getChildren().size()); + + QsNode child0 = root.getChildren().get(0); + QsNode child1 = root.getChildren().get(1); + + Assertions.assertEquals(QsClauseType.TERM, child0.getType()); + Assertions.assertEquals("title", child0.getField()); + Assertions.assertEquals("rock", child0.getValue()); + Assertions.assertTrue(child0.isExplicitField(), "term should be marked explicit"); + + Assertions.assertEquals(QsClauseType.TERM, child1.getType()); + Assertions.assertEquals("title", child1.getField()); + Assertions.assertEquals("jazz", child1.getValue()); + Assertions.assertTrue(child1.isExplicitField(), "term should be marked 
explicit"); + + // Field bindings should include title + Assertions.assertEquals(1, plan.getFieldBindings().size()); + Assertions.assertEquals("title", plan.getFieldBindings().get(0).getFieldName()); + } + + @Test + public void testFieldGroupQueryWithAndOperator() { + // title:(rock jazz) with default_operator:AND → AND(TERM(title,rock), TERM(title,jazz)) + String dsl = "title:(rock jazz)"; + String options = "{\"default_operator\":\"and\"}"; + QsPlan plan = SearchDslParser.parseDsl(dsl, options); + + Assertions.assertNotNull(plan); + QsNode root = plan.getRoot(); + Assertions.assertEquals(QsClauseType.AND, root.getType()); + Assertions.assertEquals(2, root.getChildren().size()); + + for (QsNode child : root.getChildren()) { + Assertions.assertEquals(QsClauseType.TERM, child.getType()); + Assertions.assertEquals("title", child.getField()); + Assertions.assertTrue(child.isExplicitField(), "child should be marked explicit"); + } + Assertions.assertEquals("rock", root.getChildren().get(0).getValue()); + Assertions.assertEquals("jazz", root.getChildren().get(1).getValue()); + } + + @Test + public void testFieldGroupQueryWithPhrase() { + // title:("rock and roll" OR jazz) → OR(PHRASE(title,"rock and roll"), TERM(title,jazz)) + String dsl = "title:(\"rock and roll\" OR jazz)"; + QsPlan plan = SearchDslParser.parseDsl(dsl); + + Assertions.assertNotNull(plan); + QsNode root = plan.getRoot(); + Assertions.assertEquals(QsClauseType.OR, root.getType()); + Assertions.assertEquals(2, root.getChildren().size()); + + QsNode phrase = root.getChildren().get(0); + QsNode term = root.getChildren().get(1); + + Assertions.assertEquals(QsClauseType.PHRASE, phrase.getType()); + Assertions.assertEquals("title", phrase.getField()); + Assertions.assertEquals("rock and roll", phrase.getValue()); + Assertions.assertTrue(phrase.isExplicitField()); + + Assertions.assertEquals(QsClauseType.TERM, term.getType()); + Assertions.assertEquals("title", term.getField()); + 
Assertions.assertEquals("jazz", term.getValue()); + Assertions.assertTrue(term.isExplicitField()); + } + + @Test + public void testFieldGroupQueryWithWildcardAndRegexp() { + // title:(roc* OR /ja../) → OR(PREFIX(title,roc*), REGEXP(title,ja..)) + String dsl = "title:(roc* OR /ja../)"; + QsPlan plan = SearchDslParser.parseDsl(dsl); + + Assertions.assertNotNull(plan); + QsNode root = plan.getRoot(); + Assertions.assertEquals(QsClauseType.OR, root.getType()); + Assertions.assertEquals(2, root.getChildren().size()); + + QsNode prefix = root.getChildren().get(0); + Assertions.assertEquals(QsClauseType.PREFIX, prefix.getType()); + Assertions.assertEquals("title", prefix.getField()); + Assertions.assertTrue(prefix.isExplicitField()); + + QsNode regexp = root.getChildren().get(1); + Assertions.assertEquals(QsClauseType.REGEXP, regexp.getType()); + Assertions.assertEquals("title", regexp.getField()); + Assertions.assertEquals("ja..", regexp.getValue()); + Assertions.assertTrue(regexp.isExplicitField()); + } + + @Test + public void testFieldGroupQueryCombinedWithBareQuery() { + // title:(rock OR jazz) AND music → combined query + // In standard mode with default_field=content: explicit title terms + expanded music + String dsl = "title:(rock OR jazz) AND music"; + String options = "{\"default_field\":\"content\"}"; + QsPlan plan = SearchDslParser.parseDsl(dsl, options); + + Assertions.assertNotNull(plan); + QsNode root = plan.getRoot(); + // Root is AND + Assertions.assertEquals(QsClauseType.AND, root.getType()); + Assertions.assertEquals(2, root.getChildren().size()); + + // First child is the OR group from title:(rock OR jazz) + QsNode orGroup = root.getChildren().get(0); + Assertions.assertEquals(QsClauseType.OR, orGroup.getType()); + Assertions.assertEquals(2, orGroup.getChildren().size()); + for (QsNode child : orGroup.getChildren()) { + Assertions.assertEquals("title", child.getField()); + Assertions.assertTrue(child.isExplicitField()); + } + + // Second child is bare 
"music" → uses default_field "content" + QsNode musicNode = root.getChildren().get(1); + Assertions.assertEquals(QsClauseType.TERM, musicNode.getType()); + Assertions.assertEquals("content", musicNode.getField()); + Assertions.assertFalse(musicNode.isExplicitField()); + } + + @Test + public void testFieldGroupQueryMultiFieldExplicitNotExpanded() { + // title:(rock OR jazz) with fields=[title,content] in cross_fields mode + // Explicit title:(rock OR jazz) should NOT be expanded to content + String dsl = "title:(rock OR jazz)"; + String options = "{\"fields\":[\"title\",\"content\"],\"type\":\"cross_fields\"}"; + QsPlan plan = SearchDslParser.parseDsl(dsl, options); + + Assertions.assertNotNull(plan); + // Result should preserve title field for rock and jazz (not expand to content) + // We verify that "content" is not a field used in the plan + boolean hasContentBinding = plan.getFieldBindings().stream() + .anyMatch(b -> "content".equals(b.getFieldName())); + Assertions.assertFalse(hasContentBinding, + "Explicit title:(rock OR jazz) should not expand to content field"); + } + + @Test + public void testFieldGroupQueryLuceneMode() { + // title:(rock OR jazz) in lucene mode → OR(SHOULD(title:rock), SHOULD(title:jazz)) + String dsl = "title:(rock OR jazz)"; + String options = "{\"mode\":\"lucene\"}"; + QsPlan plan = SearchDslParser.parseDsl(dsl, options); + + Assertions.assertNotNull(plan); + QsNode root = plan.getRoot(); + Assertions.assertNotNull(root); + + // In lucene mode, the inner clause should be an OCCUR_BOOLEAN with SHOULD children + Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN, root.getType()); + Assertions.assertEquals(2, root.getChildren().size()); + + for (QsNode child : root.getChildren()) { + Assertions.assertEquals("title", child.getField()); + Assertions.assertEquals(QsOccur.SHOULD, child.getOccur()); + } + } + + @Test + public void testFieldGroupQueryLuceneModeAndOperator() { + // title:(rock AND jazz) in lucene mode → 
OCCUR_BOOLEAN(MUST(title:rock), MUST(title:jazz)) + String dsl = "title:(rock AND jazz)"; + String options = "{\"mode\":\"lucene\"}"; + QsPlan plan = SearchDslParser.parseDsl(dsl, options); + + Assertions.assertNotNull(plan); + QsNode root = plan.getRoot(); + Assertions.assertNotNull(root); + + Assertions.assertEquals(QsClauseType.OCCUR_BOOLEAN, root.getType()); + Assertions.assertEquals(2, root.getChildren().size()); + + for (QsNode child : root.getChildren()) { + Assertions.assertEquals("title", child.getField()); + Assertions.assertEquals(QsOccur.MUST, child.getOccur()); + } + } + + @Test + public void testFieldGroupQueryLuceneModeMultiField() { + // title:(rock OR jazz) AND music with fields=[title,content], mode=lucene + // title terms are explicit, music expands to both fields + String dsl = "title:(rock OR jazz) AND music"; + String options = "{\"fields\":[\"title\",\"content\"],\"mode\":\"lucene\"}"; + QsPlan plan = SearchDslParser.parseDsl(dsl, options); + + Assertions.assertNotNull(plan); + Assertions.assertNotNull(plan.getRoot()); + // Should parse without error and produce a plan + Assertions.assertFalse(plan.getFieldBindings().isEmpty()); + } + + @Test + public void testFieldGroupQuerySubcolumnPath() { + // attrs.color:(red OR blue) - field group with dot-notation path + String dsl = "attrs.color:(red OR blue)"; + QsPlan plan = SearchDslParser.parseDsl(dsl); + + Assertions.assertNotNull(plan); + QsNode root = plan.getRoot(); + Assertions.assertEquals(QsClauseType.OR, root.getType()); + Assertions.assertEquals(2, root.getChildren().size()); + + for (QsNode child : root.getChildren()) { + Assertions.assertEquals("attrs.color", child.getField()); + Assertions.assertTrue(child.isExplicitField()); + } + } + + @Test + public void testFieldGroupQueryInnerExplicitFieldPreserved() { + // title:(content:foo OR bar) → content:foo stays pinned to "content", bar gets "title" + // Inner explicit field binding must NOT be overridden by outer group field + String dsl 
= "title:(content:foo OR bar)"; + QsPlan plan = SearchDslParser.parseDsl(dsl); + + Assertions.assertNotNull(plan); + QsNode root = plan.getRoot(); + Assertions.assertEquals(QsClauseType.OR, root.getType()); + Assertions.assertEquals(2, root.getChildren().size()); + + // First child: content:foo - should keep "content" (inner explicit binding) + QsNode fooNode = root.getChildren().get(0); + Assertions.assertEquals(QsClauseType.TERM, fooNode.getType()); + Assertions.assertEquals("content", fooNode.getField()); + Assertions.assertEquals("foo", fooNode.getValue()); + Assertions.assertTrue(fooNode.isExplicitField()); + + // Second child: bar - should get "title" (outer group field) + QsNode barNode = root.getChildren().get(1); + Assertions.assertEquals(QsClauseType.TERM, barNode.getType()); + Assertions.assertEquals("title", barNode.getField()); + Assertions.assertEquals("bar", barNode.getValue()); + Assertions.assertTrue(barNode.isExplicitField()); + } + + @Test + public void testFieldGroupQueryNotOperatorInside() { + // title:(rock OR NOT jazz) → OR(title:rock, NOT(title:jazz)) + String dsl = "title:(rock OR NOT jazz)"; + QsPlan plan = SearchDslParser.parseDsl(dsl); + + Assertions.assertNotNull(plan); + QsNode root = plan.getRoot(); + Assertions.assertEquals(QsClauseType.OR, root.getType()); + Assertions.assertEquals(2, root.getChildren().size()); + + QsNode rockNode = root.getChildren().get(0); + Assertions.assertEquals("title", rockNode.getField()); + Assertions.assertEquals("rock", rockNode.getValue()); + Assertions.assertTrue(rockNode.isExplicitField()); + + QsNode notNode = root.getChildren().get(1); + Assertions.assertEquals(QsClauseType.NOT, notNode.getType()); + } + + // ============ Tests for MATCH_ALL_DOCS in multi-field mode ============ + @Test + public void testMultiFieldMatchAllDocsBestFieldsLuceneMode() { + // Test: "*" with best_fields + lucene mode should produce MATCH_ALL_DOCS + // with field bindings for all specified fields (needed for push-down) + 
String dsl = "*"; + String options = "{\"fields\":[\"title\",\"content\"],\"type\":\"best_fields\"," + + "\"default_operator\":\"AND\",\"mode\":\"lucene\",\"minimum_should_match\":0}"; + + QsPlan plan = SearchDslParser.parseDsl(dsl, options); + + Assertions.assertNotNull(plan); + Assertions.assertEquals(QsClauseType.MATCH_ALL_DOCS, plan.getRoot().getType()); + + // Must have field bindings for push-down to work + Assertions.assertNotNull(plan.getFieldBindings()); + Assertions.assertFalse(plan.getFieldBindings().isEmpty(), + "MATCH_ALL_DOCS in multi-field mode must have field bindings for push-down"); + Assertions.assertEquals(2, plan.getFieldBindings().size()); + + // Verify field names + java.util.List bindingNames = plan.getFieldBindings().stream() + .map(QsFieldBinding::getFieldName).collect(java.util.stream.Collectors.toList()); + Assertions.assertTrue(bindingNames.contains("title")); + Assertions.assertTrue(bindingNames.contains("content")); + } + + @Test + public void testMultiFieldMatchAllDocsCrossFieldsLuceneMode() { + // Test: "*" with cross_fields + lucene mode + String dsl = "*"; + String options = "{\"fields\":[\"title\",\"content\"],\"type\":\"cross_fields\"," + + "\"default_operator\":\"AND\",\"mode\":\"lucene\",\"minimum_should_match\":0}"; + + QsPlan plan = SearchDslParser.parseDsl(dsl, options); + + Assertions.assertNotNull(plan); + Assertions.assertEquals(QsClauseType.MATCH_ALL_DOCS, plan.getRoot().getType()); + + // Must have field bindings for push-down + Assertions.assertNotNull(plan.getFieldBindings()); + Assertions.assertFalse(plan.getFieldBindings().isEmpty(), + "MATCH_ALL_DOCS in multi-field mode must have field bindings for push-down"); + Assertions.assertEquals(2, plan.getFieldBindings().size()); + } + + @Test + public void testMultiFieldMatchAllDocsStandardMode() { + // Test: "*" with multi-field standard mode (no lucene) + String dsl = "*"; + String options = "{\"fields\":[\"title\",\"content\"],\"type\":\"best_fields\"," + + 
"\"default_operator\":\"AND\"}"; + + QsPlan plan = SearchDslParser.parseDsl(dsl, options); + + Assertions.assertNotNull(plan); + + // Must have field bindings for push-down + Assertions.assertNotNull(plan.getFieldBindings()); + Assertions.assertFalse(plan.getFieldBindings().isEmpty(), + "MATCH_ALL_DOCS in multi-field standard mode must have field bindings for push-down"); + Assertions.assertEquals(2, plan.getFieldBindings().size()); + } + + @Test + public void testSingleFieldMatchAllDocsLuceneMode() { + // Test: "*" with single default_field + lucene mode should have field binding + String dsl = "*"; + String options = "{\"default_field\":\"title\",\"default_operator\":\"AND\"," + + "\"mode\":\"lucene\",\"minimum_should_match\":0}"; + + QsPlan plan = SearchDslParser.parseDsl(dsl, options); + + Assertions.assertNotNull(plan); + Assertions.assertEquals(QsClauseType.MATCH_ALL_DOCS, plan.getRoot().getType()); + + // Must have field bindings for push-down + Assertions.assertNotNull(plan.getFieldBindings()); + Assertions.assertFalse(plan.getFieldBindings().isEmpty(), + "MATCH_ALL_DOCS with default_field must have field bindings for push-down"); + Assertions.assertEquals(1, plan.getFieldBindings().size()); + Assertions.assertEquals("title", plan.getFieldBindings().get(0).getFieldName()); + } } diff --git a/gensrc/thrift/Exprs.thrift b/gensrc/thrift/Exprs.thrift index 6eaa5ff8e7947c..e74e1083243b02 100644 --- a/gensrc/thrift/Exprs.thrift +++ b/gensrc/thrift/Exprs.thrift @@ -261,12 +261,15 @@ struct TSearchFieldBinding { 3: optional string parent_field_name // Parent field name for variant subcolumns 4: optional string subcolumn_path // Subcolumn path for variant fields (e.g., "subcolumn" or "sub1.sub2") 5: optional bool is_variant_subcolumn // True if this is a variant subcolumn access + 6: optional map index_properties // Index properties (parser, lower_case, etc.) 
from FE Index lookup } struct TSearchParam { 1: required string original_dsl // Original DSL string for debugging 2: required TSearchClause root // Parsed AST root 3: required list field_bindings // Field to slot mappings + 4: optional string default_operator // "and" or "or" for TERM tokenization (default: "or") + 5: optional i32 minimum_should_match // Minimum number of SHOULD clauses that must match (for Lucene mode TERM tokenization) } // This is essentially a union over the subclasses of Expr. diff --git a/regression-test/data/search/test_search_dsl_operators.out b/regression-test/data/search/test_search_dsl_operators.out new file mode 100644 index 00000000000000..3d4e890e091022 --- /dev/null +++ b/regression-test/data/search/test_search_dsl_operators.out @@ -0,0 +1,45 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !dsl_or_chain -- +1 aterm bterm +2 bterm cterm +3 cterm dterm +4 dterm eterm aterm + +-- !dsl_and_chain -- +4 dterm eterm aterm + +-- !dsl_and_or_mixed -- +1 aterm bterm +4 dterm eterm aterm + +-- !dsl_and_not_or -- +4 dterm eterm aterm + +-- !dsl_implicit_and -- +3 cterm dterm + +-- !dsl_phrase_wrong_order -- + +-- !dsl_phrase_correct_order -- +4 dterm eterm aterm + +-- !dsl_escaped_space_and -- +4 dterm eterm aterm + +-- !dsl_phrase_and_term -- +4 dterm eterm aterm + +-- !dsl_phrase_wrong_and_term -- + +-- !dsl_phrase_or_term_1 -- +2 bterm cterm +3 cterm dterm + +-- !dsl_phrase_or_term_2 -- +2 bterm cterm +3 cterm dterm +4 dterm eterm aterm + +-- !dsl_and_or_min_should_1 -- +1 aterm bterm + diff --git a/regression-test/data/search/test_search_escape.out b/regression-test/data/search/test_search_escape.out index 09bd9f80b2b40d..fbe1e731f610ae 100644 --- a/regression-test/data/search/test_search_escape.out +++ b/regression-test/data/search/test_search_escape.out @@ -26,15 +26,6 @@ -- !uppercase_not -- 8 second fruit --- !lowercase_and -- -7 first fruit - --- !lowercase_or -- -1 first content -2 
second content -7 first fruit -8 second fruit - -- !exclamation_not -- 8 second fruit diff --git a/regression-test/data/search/test_search_lucene_mode.out b/regression-test/data/search/test_search_lucene_mode.out index 68d8e6c1279012..5eb4346b50c3ba 100644 --- a/regression-test/data/search/test_search_lucene_mode.out +++ b/regression-test/data/search/test_search_lucene_mode.out @@ -34,6 +34,10 @@ 2 apple banana -- !lucene_not -- +4 banana cherry +5 cherry date +6 date elderberry +7 fig grape -- !lucene_and_not -- 3 apple diff --git a/regression-test/data/search/test_search_multi_field.out b/regression-test/data/search/test_search_multi_field.out index 4a4923a4c3b50e..59a901665cc4cd 100644 --- a/regression-test/data/search/test_search_multi_field.out +++ b/regression-test/data/search/test_search_multi_field.out @@ -75,13 +75,10 @@ -- !multi_field_lucene_and_or -- 1 machine learning basics -4 machine maintenance -8 cooking machine reviews 9 machine guide -- !multi_field_lucene_min_should_1 -- 1 machine learning basics -8 cooking machine reviews 9 machine guide -- !multi_field_lucene_and_not -- @@ -119,8 +116,21 @@ -- !multi_field_best_fields_lucene -- 1 machine learning basics +9 machine guide -- !multi_field_cross_fields_lucene -- 1 machine learning basics 9 machine guide +-- !multi_field_match_all_best_fields -- +9 + +-- !multi_field_match_all_cross_fields -- +9 + +-- !match_all_single_field -- +9 + +-- !multi_field_match_all_standard -- +9 + diff --git a/regression-test/data/search/test_search_regexp_lowercase.out b/regression-test/data/search/test_search_regexp_lowercase.out new file mode 100644 index 00000000000000..0ae25fc613f8d4 --- /dev/null +++ b/regression-test/data/search/test_search_regexp_lowercase.out @@ -0,0 +1,39 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !regexp_uppercase_no_match -- + +-- !match_regexp_uppercase_no_match -- + +-- !regexp_lowercase_match -- +1 ABC DEF +2 abc def + +-- !match_regexp_lowercase_match -- +1 ABC DEF +2 abc def + +-- !wildcard_uppercase_match -- +1 ABC DEF +2 abc def + +-- !wildcard_lowercase_match -- +1 ABC DEF +2 abc def + +-- !regexp_apple_lowercase -- +3 Apple Banana Cherry +4 apple banana cherry + +-- !regexp_apple_uppercase_no_match -- + +-- !consistency_regexp_cherry -- +3 +4 + +-- !consistency_match_regexp_cherry -- +3 +4 + +-- !consistency_regexp_cherry_upper -- + +-- !consistency_match_regexp_cherry_upper -- + diff --git a/regression-test/data/search/test_search_variant_dual_index_reader.out b/regression-test/data/search/test_search_variant_dual_index_reader.out new file mode 100644 index 00000000000000..d4f038ba490557 --- /dev/null +++ b/regression-test/data/search/test_search_variant_dual_index_reader.out @@ -0,0 +1,23 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !dual_index_basic -- +1 +3 + +-- !dual_index_and -- +3 + +-- !dual_index_other_field -- +4 + +-- !dual_index_field_syntax -- +2 +3 + +-- !dual_index_case_insensitive -- +1 +3 + +-- !dual_index_match_baseline -- +1 +3 + diff --git a/regression-test/data/search/test_search_variant_subcolumn_analyzer.out b/regression-test/data/search/test_search_variant_subcolumn_analyzer.out new file mode 100644 index 00000000000000..d1eff343b82fc7 --- /dev/null +++ b/regression-test/data/search/test_search_variant_subcolumn_analyzer.out @@ -0,0 +1,30 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !search_variant_analyzer_basic -- +1 +3 + +-- !match_variant_baseline -- +1 +3 + +-- !search_variant_analyzer_multi -- +3 + +-- !search_variant_analyzer_other_field -- +4 + +-- !search_variant_analyzer_field_syntax -- +2 +5 + +-- !search_variant_analyzer_lowercase -- +1 +3 + +-- !search_variant_analyzer_phrase -- +1 + +-- !search_variant_direct_index -- +1 +3 + diff --git a/regression-test/data/search/test_search_variant_wildcard.out b/regression-test/data/search/test_search_variant_wildcard.out new file mode 100644 index 00000000000000..d2f86c869ca558 --- /dev/null +++ b/regression-test/data/search/test_search_variant_wildcard.out @@ -0,0 +1,42 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !term_smith -- +73095521135 + +-- !term_smithson -- +73095446198 + +-- !term_johnson -- +73095754047 + +-- !wildcard_star_ith -- +73095521135 + +-- !wildcard_sm_star_th -- +73095521135 + +-- !wildcard_sm_q_th -- +73095521135 + +-- !wildcard_smith_star -- +73095446198 +73095521135 + +-- !wildcard_sm_star -- +73095446198 +73095521135 + +-- !wildcard_star_son -- +73095446198 +73095754047 + +-- !wildcard_firstname -- +73095521135 + +-- !wildcard_and_term -- +73095521135 + +-- !wildcard_star_all -- +73095446198 +73095521135 +73095754047 + diff --git a/regression-test/suites/search/test_search_boundary_cases.groovy b/regression-test/suites/search/test_search_boundary_cases.groovy index c5ed5ffd55fda6..5ab2a9386aa0d2 100644 --- a/regression-test/suites/search/test_search_boundary_cases.groovy +++ b/regression-test/suites/search/test_search_boundary_cases.groovy @@ -15,9 +15,12 @@ // specific language governing permissions and limitations // under the License. -suite("test_search_boundary_cases") { +suite("test_search_boundary_cases", "p0") { def tableName = "search_boundary_test" + // Pin enable_common_expr_pushdown to prevent CI flakiness from fuzzy testing. 
+ sql """ set enable_common_expr_pushdown = true """ + sql "DROP TABLE IF EXISTS ${tableName}" // Create test table for boundary and edge cases @@ -86,31 +89,31 @@ suite("test_search_boundary_cases") { // Boundary Test 1: All NULL fields qt_boundary_1_all_null_or """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ count(*) FROM ${tableName} - WHERE search('field1:anything or field2:anything or field3:anything or field4:anything or field5:anything') + WHERE search('field1:anything OR field2:anything OR field3:anything OR field4:anything OR field5:anything') """ qt_boundary_1_all_null_and """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ count(*) FROM ${tableName} - WHERE search('field1:anything and field2:anything and field3:anything and field4:anything and field5:anything') + WHERE search('field1:anything AND field2:anything AND field3:anything AND field4:anything AND field5:anything') """ // Boundary Test 2: Single field NULL vs multiple fields NULL in OR qt_boundary_2_single_null_or """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id FROM ${tableName} - WHERE search('field1:nonexistent or field2:test') + WHERE search('field1:nonexistent OR field2:test') ORDER BY id """ qt_boundary_2_multiple_null_or """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id FROM ${tableName} - WHERE search('field1:nonexistent or field2:test or field3:nonexistent') + WHERE search('field1:nonexistent OR field2:test OR field3:nonexistent') ORDER BY id """ // Boundary Test 3: NOT with various NULL combinations qt_boundary_3_not_null_field """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ count(*) FROM ${tableName} - WHERE search('not field1:test') + WHERE search('NOT field1:test') """ qt_boundary_3_external_not_null """ @@ -138,59 +141,59 @@ suite("test_search_boundary_cases") { // Boundary Test 5: Complex nested boolean with NULLs qt_boundary_5_complex_nested """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ count(*) FROM ${tableName} - 
WHERE search('((field1:test or field2:test) and (field3:test or field4:test)) or field5:test') + WHERE search('((field1:test OR field2:test) AND (field3:test OR field4:test)) OR field5:test') """ qt_boundary_5_detailed_result """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, field1, field2, field3, field4, field5 FROM ${tableName} - WHERE search('((field1:test or field2:test) and (field3:test or field4:test)) or field5:test') + WHERE search('((field1:test OR field2:test) AND (field3:test OR field4:test)) OR field5:test') ORDER BY id """ // Boundary Test 6: Large OR query with many NULL fields qt_boundary_6_large_or """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ count(*) FROM ${tableName} - WHERE search('field1:"target" or field1:"keyword" or field1:"apple" or field1:"unique1" or - field2:"target" or field2:"keyword" or field2:"apple" or field2:"unique2" or - field3:"target" or field3:"keyword" or field3:"banana" or field3:"unique3" or - field4:"target" or field4:"keyword" or field4:"banana" or field4:"unique4" or - field5:"target" or field5:"keyword" or field5:"cherry" or field5:"unique5"') + WHERE search('field1:"target" OR field1:"keyword" OR field1:"apple" OR field1:"unique1" OR + field2:"target" OR field2:"keyword" OR field2:"apple" OR field2:"unique2" OR + field3:"target" OR field3:"keyword" OR field3:"banana" OR field3:"unique3" OR + field4:"target" OR field4:"keyword" OR field4:"banana" OR field4:"unique4" OR + field5:"target" OR field5:"keyword" OR field5:"cherry" OR field5:"unique5"') """ // Boundary Test 7: Special characters and NULL interaction qt_boundary_7_special_chars_or """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ count(*) FROM ${tableName} - WHERE search('field1:special123 or field2:nonexistent') + WHERE search('field1:special123 OR field2:nonexistent') """ qt_boundary_7_special_chars_and """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ count(*) FROM ${tableName} - WHERE search('field1:special123 
and field2:chars456') + WHERE search('field1:special123 AND field2:chars456') """ // Boundary Test 8: Case sensitivity with NULL fields qt_boundary_8_case_variations """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id FROM ${tableName} - WHERE search('field1:Target or field2:TARGET or field3:target or field4:TaRgEt') + WHERE search('field1:Target OR field2:TARGET OR field3:target OR field4:TaRgEt') ORDER BY id """ // Boundary Test 9: Multiple NOT operations qt_boundary_9_multiple_not """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ count(*) FROM ${tableName} - WHERE search('not (field1:nonexistent or field2:nonexistent or field3:nonexistent)') + WHERE search('NOT (field1:nonexistent OR field2:nonexistent OR field3:nonexistent)') """ qt_boundary_9_external_multiple_not """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ count(*) FROM ${tableName} - WHERE not search('field1:nonexistent or field2:nonexistent or field3:nonexistent') + WHERE not search('field1:nonexistent OR field2:nonexistent OR field3:nonexistent') """ // Boundary Test 10: Performance with NULL-heavy dataset qt_boundary_10_performance """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ count(*) FROM ${tableName} - WHERE search('(field1:test or field1:target or field1:keyword) and - (field2:test or field2:target or field2:keyword) and - not (field3:nonexistent or field4:nonexistent or field5:nonexistent)') + WHERE search('(field1:test OR field1:target OR field1:keyword) AND + (field2:test OR field2:target OR field2:keyword) AND + NOT (field3:nonexistent OR field4:nonexistent OR field5:nonexistent)') """ } \ No newline at end of file diff --git a/regression-test/suites/search/test_search_cache.groovy b/regression-test/suites/search/test_search_cache.groovy new file mode 100644 index 00000000000000..39af2184bb7a6c --- /dev/null +++ b/regression-test/suites/search/test_search_cache.groovy @@ -0,0 +1,138 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// 
or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_search_cache", "p0") { + def tableName = "search_cache_test" + + sql "DROP TABLE IF EXISTS ${tableName}" + sql """ + CREATE TABLE ${tableName} ( + id INT, + title VARCHAR(200), + content VARCHAR(500), + INDEX idx_title(title) USING INVERTED PROPERTIES("parser" = "english"), + INDEX idx_content(content) USING INVERTED PROPERTIES("parser" = "english") + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("replication_num" = "1") + """ + + sql """INSERT INTO ${tableName} VALUES + (1, 'apple banana cherry', 'red fruit sweet'), + (2, 'banana grape mango', 'yellow fruit tropical'), + (3, 'cherry plum peach', 'stone fruit summer'), + (4, 'apple grape kiwi', 'green fruit fresh'), + (5, 'mango pineapple coconut', 'tropical fruit exotic'), + (6, 'apple cherry plum', 'mixed fruit salad'), + (7, 'banana coconut papaya', 'smoothie blend tropical'), + (8, 'grape cherry apple', 'wine fruit tart') + """ + sql """sync""" + + // sync ensures data is flushed. Sleep is a best-effort wait for + // background index availability; may need to increase under load. 
+ Thread.sleep(2000) + + // Test 1: Cache consistency - same query returns same results with cache enabled + def result1 = sql """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true,enable_inverted_index_query_cache=true) */ + id FROM ${tableName} + WHERE search('title:apple') + ORDER BY id + """ + + // Run same query again (should hit cache) + def result2 = sql """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true,enable_inverted_index_query_cache=true) */ + id FROM ${tableName} + WHERE search('title:apple') + ORDER BY id + """ + + // Results must be identical (cache hit returns same data) + assertEquals(result1, result2) + + // Test 2: Cache disabled returns same results as cache enabled + def result_no_cache = sql """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true,enable_inverted_index_query_cache=false) */ + id FROM ${tableName} + WHERE search('title:apple') + ORDER BY id + """ + assertEquals(result1, result_no_cache) + + // Test 3: Multi-field query cache consistency + def mf_result1 = sql """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true,enable_inverted_index_query_cache=true) */ + id FROM ${tableName} + WHERE search('title:cherry OR content:tropical') + ORDER BY id + """ + + def mf_result2 = sql """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true,enable_inverted_index_query_cache=true) */ + id FROM ${tableName} + WHERE search('title:cherry OR content:tropical') + ORDER BY id + """ + assertEquals(mf_result1, mf_result2) + + // Test 4: Different queries produce different cache entries + def diff_result = sql """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true,enable_inverted_index_query_cache=true) */ + id FROM ${tableName} + WHERE search('title:banana') + ORDER BY id + """ + // banana result should differ from apple result + assertNotEquals(result1, diff_result) + + // Test 5: AND query - cache vs no-cache consistency + def and_cached = sql """ + SELECT 
/*+SET_VAR(enable_common_expr_pushdown=true,enable_inverted_index_query_cache=true) */ + id, title FROM ${tableName} + WHERE search('title:apple AND title:cherry') + ORDER BY id + """ + + def and_uncached = sql """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true,enable_inverted_index_query_cache=false) */ + id, title FROM ${tableName} + WHERE search('title:apple AND title:cherry') + ORDER BY id + """ + assertEquals(and_cached, and_uncached) + + // Test 6: Complex boolean query - cache vs no-cache consistency + def complex_cached = sql """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true,enable_inverted_index_query_cache=true) */ + id, title FROM ${tableName} + WHERE search('(title:apple OR title:banana) AND content:fruit') + ORDER BY id + """ + + def complex_uncached = sql """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true,enable_inverted_index_query_cache=false) */ + id, title FROM ${tableName} + WHERE search('(title:apple OR title:banana) AND content:fruit') + ORDER BY id + """ + assertEquals(complex_cached, complex_uncached) + + sql "DROP TABLE IF EXISTS ${tableName}" +} diff --git a/regression-test/suites/search/test_search_default_field_operator.groovy b/regression-test/suites/search/test_search_default_field_operator.groovy index 230825862357ef..654a9ad2abf6ef 100644 --- a/regression-test/suites/search/test_search_default_field_operator.groovy +++ b/regression-test/suites/search/test_search_default_field_operator.groovy @@ -15,9 +15,12 @@ // specific language governing permissions and limitations // under the License. -suite("test_search_default_field_operator") { +suite("test_search_default_field_operator", "p0") { def tableName = "search_enhanced_test" + // Pin enable_common_expr_pushdown to prevent CI flakiness from fuzzy testing. 
+ sql """ set enable_common_expr_pushdown = true """ + sql "DROP TABLE IF EXISTS ${tableName}" // Create table with inverted indexes diff --git a/regression-test/suites/search/test_search_dsl_operators.groovy b/regression-test/suites/search/test_search_dsl_operators.groovy new file mode 100644 index 00000000000000..08106c65243b46 --- /dev/null +++ b/regression-test/suites/search/test_search_dsl_operators.groovy @@ -0,0 +1,234 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/** + * Tests for search DSL operator scenarios + * + * This test suite validates Lucene mode parsing against the exact test cases + * documented in specification to ensure behavior matches Elasticsearch/Lucene semantics. 
+ * + * Test Data Setup: + * | Email | Firstname | + * | test+query+1@gmail.com | "aterm bterm" | + * | test+query+2@gmail.com | "bterm cterm" | + * | test+query+3@gmail.com | "cterm dterm" | + * | test+query+4@gmail.com | "dterm eterm aterm" | + * + * Key Lucene Semantics: + * - Operators are processed left-to-right as modifiers + * - AND marks preceding and current terms as MUST (+) + * - OR marks preceding and current terms as SHOULD + * - NOT marks current term as MUST_NOT (-) + * - With minimum_should_match=0 and MUST clauses present, SHOULD clauses are discarded + */ +suite("test_search_dsl_operators", "p0") { + def tableName = "search_dsl_operators_test" + + // Pin enable_common_expr_pushdown to prevent CI flakiness from fuzzy testing. + sql """ set enable_common_expr_pushdown = true """ + + sql "DROP TABLE IF EXISTS ${tableName}" + + // Create table with inverted indexes + // Using parser=english to tokenize firstname field + sql """ + CREATE TABLE ${tableName} ( + id INT, + email VARCHAR(100), + firstname VARCHAR(200), + INDEX idx_firstname(firstname) USING INVERTED PROPERTIES("parser" = "english") + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("replication_allocation" = "tag.location.default: 1") + """ + + // Insert test data + sql """INSERT INTO ${tableName} VALUES + (1, 'test+query+1@gmail.com', 'aterm bterm'), + (2, 'test+query+2@gmail.com', 'bterm cterm'), + (3, 'test+query+3@gmail.com', 'cterm dterm'), + (4, 'test+query+4@gmail.com', 'dterm eterm aterm') + """ + + // Wait for index building + Thread.sleep(3000) + + // ============ Test 1: aterm OR bterm OR cterm ============ + // All OR operators -> at least one must match (minimum_should_match=1) + // Expected: rows 1,2,3,4 (all match at least one term) + qt_dsl_or_chain """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname + FROM ${tableName} + WHERE search('aterm OR bterm OR cterm', 
'{"default_field":"firstname","default_operator":"and","mode":"lucene"}') + ORDER BY id + """ + + // ============ Test 2: dterm AND eterm AND aterm ============ + // All AND operators -> all must match + // Expected: row 4 only (the only one with all three terms) + qt_dsl_and_chain """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname + FROM ${tableName} + WHERE search('dterm AND eterm AND aterm', '{"default_field":"firstname","default_operator":"and","mode":"lucene"}') + ORDER BY id + """ + + // ============ Test 3: aterm AND bterm OR cterm ============ + // Lucene left-to-right parsing with minimum_should_match=0: + // - aterm: MUST (first term, default_operator=AND) + // - bterm: MUST (AND introduces) + // - cterm: SHOULD (OR introduces), bterm becomes SHOULD too + // Final: +aterm bterm cterm + // With minimum_should_match=0 and MUST present, SHOULD discarded + // Result: effectively +aterm only + // Expected: rows 1, 4 (rows containing "aterm") + qt_dsl_and_or_mixed """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname + FROM ${tableName} + WHERE search('aterm AND bterm OR cterm', '{"default_field":"firstname","default_operator":"and","mode":"lucene","minimum_should_match":0}') + ORDER BY id + """ + + // ============ Test 4: aterm AND NOT bterm OR cterm ============ + // Lucene left-to-right parsing: + // - aterm: MUST + // - bterm: MUST_NOT (NOT modifier) + // - cterm: SHOULD (OR introduces) + // Final: +aterm -bterm cterm + // With minimum_should_match=0 and MUST present, SHOULD discarded + // Result: +aterm -bterm + // Expected: row 4 only (has "aterm" but NOT "bterm") + qt_dsl_and_not_or """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname + FROM ${tableName} + WHERE search('aterm AND NOT bterm OR cterm', '{"default_field":"firstname","default_operator":"and","mode":"lucene","minimum_should_match":0}') + ORDER BY id + """ + + // ============ Test 5: cterm dterm (implicit AND) ============ + // 
No explicit operators, default_operator=AND + // Same as: cterm AND dterm + // Expected: row 3 only (has both "cterm" AND "dterm") + qt_dsl_implicit_and """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname + FROM ${tableName} + WHERE search('cterm dterm', '{"default_field":"firstname","default_operator":"and","mode":"lucene"}') + ORDER BY id + """ + + // ============ Test 6: "aterm eterm" (phrase query, wrong order) ============ + // Phrase query requires tokens in exact order + // Data has "dterm eterm aterm" - "aterm" comes AFTER "eterm", not before + // Expected: no match + qt_dsl_phrase_wrong_order """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname + FROM ${tableName} + WHERE search('"aterm eterm"', '{"default_field":"firstname","default_operator":"and","mode":"lucene"}') + ORDER BY id + """ + + // ============ Test 7: "eterm aterm" (phrase query, correct order) ============ + // Phrase query requires tokens in exact order + // Data has "dterm eterm aterm" - "eterm aterm" appears in this order + // Expected: row 4 + qt_dsl_phrase_correct_order """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname + FROM ${tableName} + WHERE search('"eterm aterm"', '{"default_field":"firstname","default_operator":"and","mode":"lucene"}') + ORDER BY id + """ + + // ============ Test 8: eterm\ dterm AND aterm (escaped space test) ============ + // Tests escape handling in Lucene mode + // In current implementation, the escaped space is processed such that + // the query effectively becomes a term query for individual tokens + // Row 4 contains all terms (dterm, eterm, aterm) + // Expected: row 4 + qt_dsl_escaped_space_and """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname + FROM ${tableName} + WHERE search('eterm\\\\ dterm AND aterm', '{"default_field":"firstname","default_operator":"and","mode":"lucene"}') + ORDER BY id + """ + + // ============ Test 9: "dterm eterm" AND aterm ============ 
+ // Phrase query + AND + // Row 4 has "dterm eterm aterm" - phrase "dterm eterm" matches, and "aterm" is also present + // Expected: row 4 + qt_dsl_phrase_and_term """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname + FROM ${tableName} + WHERE search('"dterm eterm" AND aterm', '{"default_field":"firstname","default_operator":"and","mode":"lucene"}') + ORDER BY id + """ + + // ============ Test 10: "eterm dterm" AND aterm (phrase wrong order) ============ + // Phrase "eterm dterm" is wrong order (data has "dterm eterm") + // Expected: no match + qt_dsl_phrase_wrong_and_term """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname + FROM ${tableName} + WHERE search('"eterm dterm" AND aterm', '{"default_field":"firstname","default_operator":"and","mode":"lucene"}') + ORDER BY id + """ + + // ============ Test 11: "eterm dterm" OR cterm ============ + // Phrase OR term + // Phrase "eterm dterm" won't match (wrong order) + // cterm matches rows 2, 3 + // Expected: rows 2, 3 + qt_dsl_phrase_or_term_1 """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname + FROM ${tableName} + WHERE search('"eterm dterm" OR cterm', '{"default_field":"firstname","default_operator":"and","mode":"lucene"}') + ORDER BY id + """ + + // ============ Test 12: "dterm eterm" OR cterm ============ + // Phrase OR term + // Phrase "dterm eterm" matches row 4 + // cterm matches rows 2, 3 + // Expected: rows 2, 3, 4 + qt_dsl_phrase_or_term_2 """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname + FROM ${tableName} + WHERE search('"dterm eterm" OR cterm', '{"default_field":"firstname","default_operator":"and","mode":"lucene"}') + ORDER BY id + """ + + // ============ Test 13: aterm AND bterm OR cterm with minimum_should_match=1 ============ + // Same as Test 3 but with minimum_should_match=1 + // Final state: +aterm bterm cterm (aterm is MUST, bterm and cterm are SHOULD) + // With minimum_should_match=1, at least 1 SHOULD 
must match + // Result: aterm AND (bterm OR cterm) + // Row 1: has aterm and bterm -> matches + // Row 2: has bterm and cterm but no aterm -> doesn't match (MUST not satisfied) + // Row 3: has cterm but no aterm -> doesn't match (MUST not satisfied) + // Row 4: has aterm, doesn't have bterm or cterm -> doesn't match (no SHOULD satisfied) + // Only row 1 satisfies both the MUST clause and the SHOULD minimum + // Expected: row 1 only + qt_dsl_and_or_min_should_1 """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, firstname + FROM ${tableName} + WHERE search('aterm AND bterm OR cterm', '{"default_field":"firstname","default_operator":"and","mode":"lucene","minimum_should_match":1}') + ORDER BY id + """ + + // Cleanup + sql "DROP TABLE IF EXISTS ${tableName}" +} diff --git a/regression-test/suites/search/test_search_dsl_syntax.groovy b/regression-test/suites/search/test_search_dsl_syntax.groovy index e0ec901092640e..b52a018fa030eb 100644 --- a/regression-test/suites/search/test_search_dsl_syntax.groovy +++ b/regression-test/suites/search/test_search_dsl_syntax.groovy @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -suite("test_search_dsl_syntax") { +suite("test_search_dsl_syntax", "p0") { def tableName = "search_dsl_test_table" sql "DROP TABLE IF EXISTS ${tableName}" diff --git a/regression-test/suites/search/test_search_escape.groovy b/regression-test/suites/search/test_search_escape.groovy index 629d3fcc1f5129..e7e09ca846c6f2 100644 --- a/regression-test/suites/search/test_search_escape.groovy +++ b/regression-test/suites/search/test_search_escape.groovy @@ -29,9 +29,12 @@ * - Groovy string: \\\\ -> SQL string: \\ -> DSL: \ (escape char) * - Groovy string: \\\\\\\\ -> SQL string: \\\\ -> DSL: \\ -> literal: \ */ -suite("test_search_escape") { +suite("test_search_escape", "p0") { def tableName = "search_escape_test" + // Pin enable_common_expr_pushdown to prevent CI flakiness from fuzzy testing. 
+ sql """ set enable_common_expr_pushdown = true """ + sql "DROP TABLE IF EXISTS ${tableName}" // Create table with inverted indexes @@ -143,21 +146,31 @@ suite("test_search_escape") { ORDER BY id """ - // ============ Test 9: Lowercase and operator ============ - qt_lowercase_and """ - SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, content - FROM ${tableName} - WHERE search('content:first and content:fruit') - ORDER BY id - """ - - // ============ Test 10: Lowercase or operator ============ - qt_lowercase_or """ - SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, content - FROM ${tableName} - WHERE search('content:first or content:second') - ORDER BY id - """ + // ============ Test 9: Lowercase 'and' should cause parse error ============ + // Per requirement: Only uppercase AND/OR/NOT are operators + // Lowercase 'and' is treated as a bare term (no field), causing error + test { + sql """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, content + FROM ${tableName} + WHERE search('content:first and content:fruit') + ORDER BY id + """ + exception "No field specified and no default_field configured" + } + + // ============ Test 10: Lowercase 'or' should cause parse error ============ + // Per requirement: Only uppercase AND/OR/NOT are operators + // Lowercase 'or' is treated as a bare term (no field), causing error + test { + sql """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, content + FROM ${tableName} + WHERE search('content:first or content:second') + ORDER BY id + """ + exception "No field specified and no default_field configured" + } // ============ Test 11: Exclamation NOT operator ============ qt_exclamation_not """ diff --git a/regression-test/suites/search/test_search_exact_basic.groovy b/regression-test/suites/search/test_search_exact_basic.groovy index cf5701ac98fb2b..ffad823e45189d 100644 --- a/regression-test/suites/search/test_search_exact_basic.groovy +++ 
b/regression-test/suites/search/test_search_exact_basic.groovy @@ -15,9 +15,12 @@ // specific language governing permissions and limitations // under the License. -suite("test_search_exact_basic") { +suite("test_search_exact_basic", "p0") { def tableName = "exact_basic_test" + // Pin enable_common_expr_pushdown to prevent CI flakiness from fuzzy testing. + sql """ set enable_common_expr_pushdown = true """ + sql "DROP TABLE IF EXISTS ${tableName}" // Simple table with basic index diff --git a/regression-test/suites/search/test_search_exact_lowercase.groovy b/regression-test/suites/search/test_search_exact_lowercase.groovy index 9d1b3756cbca99..f4b82d411b6980 100644 --- a/regression-test/suites/search/test_search_exact_lowercase.groovy +++ b/regression-test/suites/search/test_search_exact_lowercase.groovy @@ -15,9 +15,12 @@ // specific language governing permissions and limitations // under the License. -suite("test_search_exact_lowercase") { +suite("test_search_exact_lowercase", "p0") { def tableName = "exact_lowercase_test" + // Pin enable_common_expr_pushdown to prevent CI flakiness from fuzzy testing. + sql """ set enable_common_expr_pushdown = true """ + sql "DROP TABLE IF EXISTS ${tableName}" // EXACT on mixed indexes: prefers untokenized, but untokenized index doesn't support lowercase diff --git a/regression-test/suites/search/test_search_exact_match.groovy b/regression-test/suites/search/test_search_exact_match.groovy index 307963b19b1679..6f8439b0d2293a 100644 --- a/regression-test/suites/search/test_search_exact_match.groovy +++ b/regression-test/suites/search/test_search_exact_match.groovy @@ -15,9 +15,12 @@ // specific language governing permissions and limitations // under the License. -suite("test_search_exact_match") { +suite("test_search_exact_match", "p0") { def tableName = "search_exact_test_table" + // Pin enable_common_expr_pushdown to prevent CI flakiness from fuzzy testing. 
+ sql """ set enable_common_expr_pushdown = true """ + sql "DROP TABLE IF EXISTS ${tableName}" // Create test table with different index configurations diff --git a/regression-test/suites/search/test_search_exact_multi_index.groovy b/regression-test/suites/search/test_search_exact_multi_index.groovy index ab361dc45c0d55..8c11a56097032b 100644 --- a/regression-test/suites/search/test_search_exact_multi_index.groovy +++ b/regression-test/suites/search/test_search_exact_multi_index.groovy @@ -15,9 +15,12 @@ // specific language governing permissions and limitations // under the License. -suite("test_search_exact_multi_index") { +suite("test_search_exact_multi_index", "p0") { def tableName = "exact_multi_index_test" + // Pin enable_common_expr_pushdown to prevent CI flakiness from fuzzy testing. + sql """ set enable_common_expr_pushdown = true """ + sql "DROP TABLE IF EXISTS ${tableName}" // Table with multiple indexes on the same column diff --git a/regression-test/suites/search/test_search_field_group_query.groovy b/regression-test/suites/search/test_search_field_group_query.groovy new file mode 100644 index 00000000000000..b352d6d1cc7db0 --- /dev/null +++ b/regression-test/suites/search/test_search_field_group_query.groovy @@ -0,0 +1,205 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +/** + * Tests for field-grouped query syntax in search() function. + * + * Supports ES query_string field-grouped syntax: field:(term1 OR term2) + * All terms inside the parentheses inherit the field prefix. + * + * Equivalent transformations: + * title:(rock OR jazz) → (title:rock OR title:jazz) + * title:(rock jazz) with default_operator:and → (+title:rock +title:jazz) + * title:(rock OR jazz) AND music → (title:rock OR title:jazz) AND music + */ +suite("test_search_field_group_query", "p0") { + def tableName = "search_field_group_test" + + // Pin enable_common_expr_pushdown to prevent CI flakiness from fuzzy testing. + // When false, search() expressions are not pushed to the inverted index evaluation + // path, causing "SearchExpr should not be executed without inverted index" errors. + sql """ set enable_common_expr_pushdown = true """ + + sql "DROP TABLE IF EXISTS ${tableName}" + + sql """ + CREATE TABLE ${tableName} ( + id INT, + title VARCHAR(255), + content TEXT, + category VARCHAR(100), + INDEX idx_title (title) USING INVERTED PROPERTIES("parser" = "english"), + INDEX idx_content (content) USING INVERTED PROPERTIES("parser" = "english"), + INDEX idx_category (category) USING INVERTED + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("replication_allocation" = "tag.location.default: 1") + """ + + sql """INSERT INTO ${tableName} VALUES + (1, 'rock music history', 'The history of rock and roll music', 'Music'), + (2, 'jazz music theory', 'Jazz harmony and improvisation theory', 'Music'), + (3, 'classical music guide', 'Guide to classical music composers', 'Music'), + (4, 'python programming', 'Python language tutorial for beginners', 'Tech'), + (5, 'rock climbing tips', 'Tips and techniques for rock climbing', 'Sports'), + (6, 'jazz and blues fusion', 'The fusion of jazz and blues in modern music', 'Music'), + (7, 'machine learning', 
'Introduction to machine learning algorithms', 'Tech'), + (8, 'rock and jazz review', 'Review of rock and jazz music festivals', 'Music') + """ + + sql "sync" + + // === Basic field-grouped OR query === + + // title:(rock OR jazz) should match rows with "rock" or "jazz" in title + def res1 = sql """ + SELECT id FROM ${tableName} + WHERE search('title:(rock OR jazz)', '{"default_operator":"or"}') + ORDER BY id + """ + // rows 1 (rock music history), 2 (jazz music theory), 5 (rock climbing tips), + // 6 (jazz and blues fusion), 8 (rock and jazz review) + assertEquals([[1], [2], [5], [6], [8]], res1) + + // === Field-grouped query vs equivalent expanded query === + + // title:(rock OR jazz) should give same results as explicit (title:rock OR title:jazz) + def res2a = sql """ + SELECT id FROM ${tableName} + WHERE search('title:(rock OR jazz)', '{"default_operator":"or"}') + ORDER BY id + """ + def res2b = sql """ + SELECT id FROM ${tableName} + WHERE search('title:rock OR title:jazz', '{"default_operator":"or"}') + ORDER BY id + """ + assertEquals(res2a, res2b) + + // === Field-grouped AND (implicit) === + + // title:(rock jazz) with default_operator:AND → both "rock" AND "jazz" must be in title + def res3 = sql """ + SELECT id FROM ${tableName} + WHERE search('title:(rock jazz)', '{"default_operator":"and"}') + ORDER BY id + """ + // Only row 8 has both "rock" and "jazz" in title + assertEquals([[8]], res3) + + // === Field-grouped query combined with bare query === + + // title:(rock OR jazz) AND music - explicit title terms + bare "music" on default field + def res4 = sql """ + SELECT id FROM ${tableName} + WHERE search('title:(rock OR jazz) AND music', '{"default_field":"title","default_operator":"and"}') + ORDER BY id + """ + // Must have "rock" or "jazz" in title AND "music" in title + // Row 1: title="rock music history" → has "rock" and "music" ✓ + // Row 2: title="jazz music theory" → has "jazz" and "music" ✓ + // Row 8: title="rock and jazz review" → has 
"rock" and "jazz" but no "music" in title ✗ + // Row 5: title="rock climbing tips" → has "rock" but no "music" ✗ + assertEquals([[1], [2]], res4) + + // === Field-grouped query in multi-field mode === + + // title:(rock OR jazz) with fields=[title,content] + // Explicit title:(rock OR jazz) should NOT expand to content field + def res5a = sql """ + SELECT id FROM ${tableName} + WHERE search('title:(rock OR jazz)', '{"fields":["title","content"],"type":"cross_fields"}') + ORDER BY id + """ + // Only rows where title contains "rock" or "jazz" + // NOT rows where only content has those terms (note: this data set has no such content-only row) + def res5b = sql """ + SELECT id FROM ${tableName} + WHERE search('title:rock OR title:jazz', '{"fields":["title","content"],"type":"cross_fields"}') + ORDER BY id + """ + assertEquals(res5a, res5b) + + // Rows 1,2,5,6,8 have rock/jazz in title; content-only matches should not appear + assertEquals(true, res5a.size() >= 1) + // Row 4 has "python" in title, so it should NOT appear + assert !res5a.collect { it[0] }.contains(4) + // Row 7 has "machine learning", so it should NOT appear + assert !res5a.collect { it[0] }.contains(7) + + // === Phrase inside field group === + + // title:("rock and") - phrase query inside group + def res6 = sql """ + SELECT id FROM ${tableName} + WHERE search('title:("rock and")', '{"default_operator":"or"}') + ORDER BY id + """ + // Row 8: "rock and jazz review" → contains "rock and" ✓ + assert res6.collect { it[0] }.contains(8) + + // === Field-grouped query in Lucene mode === + + // title:(rock OR jazz) in lucene mode + def res7 = sql """ + SELECT id FROM ${tableName} + WHERE search('title:(rock OR jazz)', '{"mode":"lucene"}') + ORDER BY id + """ + // Should match rows with "rock" or "jazz" in title (SHOULD semantics) + assertEquals([[1], [2], [5], [6], [8]], res7) + + // title:(rock AND jazz) in lucene mode + def res8 = sql """ + SELECT id FROM ${tableName} + WHERE search('title:(rock AND jazz)', 
'{"mode":"lucene"}') + ORDER BY id + """ + // Must have both "rock" AND "jazz" in title + assertEquals([[8]], res8) + + // === Field-grouped combined with explicit field query === + + // title:(rock OR jazz) AND category:Music + def res9 = sql """ + SELECT id FROM ${tableName} + WHERE search('title:(rock OR jazz) AND category:Music', '{"default_operator":"and"}') + ORDER BY id + """ + // Must have (rock or jazz in title) AND category=Music + // Row 1: rock music history, Music ✓ + // Row 2: jazz music theory, Music ✓ + // Row 5: rock climbing tips, Sports ✗ + // Row 6: jazz and blues fusion, Music ✓ + // Row 8: rock and jazz review, Music ✓ + assertEquals([[1], [2], [6], [8]], res9) + + // === Regression guard: field-grouped syntax must parse and execute === + // The test header implies this syntax was previously rejected; any exception here fails the suite + try { + def resSyntax = sql """ + SELECT COUNT(*) FROM ${tableName} + WHERE search('title:(rock OR jazz)', '{"default_operator":"or"}') + """ + assert resSyntax[0][0] >= 0 : "Should parse and execute without error" + } catch (Exception e) { + throw new AssertionError("title:(rock OR jazz) syntax should be supported: " + e.message, e) + } + + sql "DROP TABLE IF EXISTS ${tableName}" +} diff --git a/regression-test/suites/search/test_search_function.groovy b/regression-test/suites/search/test_search_function.groovy index bf09ca237f644a..61ee8e4b026897 100644 --- a/regression-test/suites/search/test_search_function.groovy +++ b/regression-test/suites/search/test_search_function.groovy @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-suite("test_search_function") +suite("test_search_function", "p0") { def tableName = "search_test_table" def indexTableName = "search_test_index_table" diff --git a/regression-test/suites/search/test_search_function.groovy.backup b/regression-test/suites/search/test_search_function.groovy.backup new file mode 100644 index 00000000000000..47e5944e39351a --- /dev/null +++ b/regression-test/suites/search/test_search_function.groovy.backup @@ -0,0 +1,261 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("test_search_function") { + + def tableName = "search_test_table" + def indexTableName = "search_test_index_table" + + sql "DROP TABLE IF EXISTS ${tableName}" + sql "DROP TABLE IF EXISTS ${indexTableName}" + + // Create the baseline table without any inverted index (queried by the row-count check and Test 22) + sql """ + CREATE TABLE ${tableName} ( + id INT, + title VARCHAR(255), + content TEXT, + category VARCHAR(100), + tags VARCHAR(200), + publish_date DATE, + view_count INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 3 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ) + """ + + // Create the same schema with inverted indexes on title, content, category and tags + sql """ + CREATE TABLE ${indexTableName} ( + id INT, + title VARCHAR(255), + content TEXT, + category VARCHAR(100), + tags VARCHAR(200), + publish_date DATE, + view_count INT, + INDEX idx_title (title) USING INVERTED, + INDEX idx_content (content) USING INVERTED PROPERTIES("parser" = "english"), + INDEX idx_category (category) USING INVERTED, + INDEX idx_tags (tags) USING INVERTED PROPERTIES("parser" = "english") + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 3 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ) + """ + + // Test rows: [id, title, content, category, tags, publish_date, view_count]; loaded into both tables + def testData = [ + [1, "Machine Learning Basics", "Introduction to machine learning algorithms and concepts", "Technology", "machine learning, AI, algorithms", "2023-01-15", 1500], + [2, "Deep Learning Tutorial", "Advanced deep learning techniques and neural networks", "Technology", "deep learning, neural networks, AI", "2023-02-20", 2300], + [3, "Python Programming Guide", "Complete guide to Python programming language", "Programming", "python, programming, tutorial", "2023-03-10", 1800], + [4, "Data Science Methods", "Statistical methods for data science and analytics", "Science", "data 
science, statistics, analytics", "2023-04-05", 1200], + [5, "Web Development Tips", "Modern web development best practices", "Technology", "web development, javascript, 
HTML", "2023-05-12", 950], + [6, "Algorithm Design", "Fundamental algorithms and data structures", "Computer Science", "algorithms, data structures, programming", "2023-06-18", 1650], + [7, "Natural Language Processing", "NLP techniques and applications", "Technology", "NLP, natural language, processing", "2023-07-22", 1100], + [8, "Cloud Computing Overview", "Introduction to cloud computing platforms", "Technology", "cloud computing, AWS, Azure", "2023-08-14", 1350], + [9, "Database Systems", "Relational and NoSQL database concepts", "Technology", "database, SQL, NoSQL", "2023-09-09", 1450], + [10, "Software Engineering", "Best practices in software development", "Programming", "software engineering, development, practices", "2023-10-01", 1750] + ] + + for (def row : testData) { + sql """INSERT INTO ${tableName} VALUES (${row[0]}, '${row[1]}', '${row[2]}', '${row[3]}', '${row[4]}', '${row[5]}', ${row[6]})""" + sql """INSERT INTO ${indexTableName} VALUES (${row[0]}, '${row[1]}', '${row[2]}', '${row[3]}', '${row[4]}', '${row[5]}', ${row[6]})""" + } + + // Fixed 10-second pause meant to let index building and data settle. NOTE(review): a `sql "sync"` may be the intended mechanism — confirm + Thread.sleep(10000) + + // Verify data insertion + qt_sql "SELECT COUNT(*) FROM ${tableName}" + qt_sql "SELECT COUNT(*) FROM ${indexTableName}" + + // Test 1: Basic term search + qt_sql "SELECT id, title FROM ${indexTableName} WHERE search('title:Machine')" + + // Test 2: Phrase search + qt_sql "SELECT id, title FROM ${indexTableName} WHERE search('title:\"Machine Learning\"')" + + // Test 3: Multiple field search with AND + qt_sql "SELECT id, title FROM ${indexTableName} WHERE search('title:Learning AND category:Technology') ORDER BY id" + + // Test 4: Multiple field search with OR + qt_sql "SELECT id, title FROM ${indexTableName} WHERE search('title:Python OR title:Algorithm') ORDER BY id" + + // Test 5: NOT search + qt_sql "SELECT COUNT(*) FROM ${indexTableName} WHERE 
search('category:Technology AND NOT title:Machine')" + + // Test 6: Complex nested search + test { + 
sql "SELECT id, title FROM ${indexTableName} WHERE search('(title:Learning OR content:algorithms) AND category:Technology') ORDER BY id" + result([ + [1, "Machine Learning Basics"], + [2, "Deep Learning Tutorial"] + ]) + } + + // Test 7: Wildcard search + test { + sql "SELECT id, title FROM ${indexTableName} WHERE search('title:Learn*') ORDER BY id" + result([ + [1, "Machine Learning Basics"], + [2, "Deep Learning Tutorial"] + ]) + } + + // Test 8: Prefix search. NOTE(review): row 6's title is "Algorithm Design" (no "Data" prefix) and there is no ORDER BY — confirm the expected rows + test { + sql "SELECT id, title FROM ${indexTableName} WHERE search('title:Data*')" + result([ + [4, "Data Science Methods"], + [6, "Algorithm Design"] + ]) + } + + // Test 9: Search in content field + test { + sql "SELECT id, title FROM ${indexTableName} WHERE search('content:neural')" + result([ + [2, "Deep Learning Tutorial"] + ]) + } + + // Test 10: Search in tags field + test { + sql "SELECT id, title FROM ${indexTableName} WHERE search('tags:programming') ORDER BY id" + result([ + [3, "Python Programming Guide"], + [6, "Algorithm Design"] + ]) + } + + // Test 11: Case insensitive search + test { + sql "SELECT id, title FROM ${indexTableName} WHERE search('title:MACHINE')" + result([ + [1, "Machine Learning Basics"] + ]) + } + + // Test 12: Search with spaces in field values + test { + sql "SELECT id, title FROM ${indexTableName} WHERE search('content:\"machine learning\"')" + result([ + [1, "Machine Learning Basics"] + ]) + } + + // Test 13: Empty search result + test { + sql "SELECT COUNT(*) FROM ${indexTableName} WHERE search('title:nonexistent')" + result([ + [0] + ]) + } + + // Test 14: Search combined with other WHERE conditions + test { + sql "SELECT id, title FROM ${indexTableName} WHERE search('category:Technology') AND view_count > 1400 ORDER BY id" + result([ + [1, "Machine Learning Basics"], + [2, "Deep Learning Tutorial"] + ]) + } + + // Test 15: Search with GROUP BY + test { + 
sql "SELECT category, COUNT(*) as cnt FROM ${indexTableName} WHERE search('title:Learning OR title:Programming') 
GROUP BY category ORDER BY category" + result([ + ["Programming", 1], + ["Technology", 2] + ]) + } + + // Test 16: Search with ORDER BY + test { + sql "SELECT id, title, view_count FROM ${indexTableName} WHERE search('tags:AI OR tags:programming') ORDER BY view_count DESC" + result([ + [2, "Deep Learning Tutorial", 2300], + [3, "Python Programming Guide", 1800], + [6, "Algorithm Design", 1650], + [1, "Machine Learning Basics", 1500] + ]) + } + + // Test 17: Search with LIMIT + test { + sql "SELECT id, title FROM ${indexTableName} WHERE search('category:Technology') ORDER BY id LIMIT 3" + result([ + [1, "Machine Learning Basics"], + [2, "Deep Learning Tutorial"], + [5, "Web Development Tips"] + ]) + } + + // Test 18: Search function in SELECT clause (should not be allowed - search is a predicate) + test { + try { + sql "SELECT id, search('title:Machine') FROM ${indexTableName}" + assertTrue(false, "Expected exception for search in SELECT clause") + } catch (Exception e) { + assertTrue(e.getMessage().contains("search") || e.getMessage().contains("not found")) + } + } + + // Test 19: Invalid DSL syntax + test { + try { + sql "SELECT id FROM ${indexTableName} WHERE search('title:')" + assertTrue(false, "Expected exception for invalid DSL") + } catch (Exception e) { + assertTrue(e.getMessage().contains("Invalid") || e.getMessage().contains("syntax")) + } + } + + // Test 20: ANY query test + test { + sql "SELECT id, title FROM ${indexTableName} WHERE search('tags:ANY(AI programming)') ORDER BY id" + check { result -> + assertTrue(result.size() > 0, "Should find records with AI or programming in tags") + } + } + + // Test 21: ALL query test + test { + sql "SELECT id, title FROM ${indexTableName} WHERE search('tags:ALL(machine learning)') ORDER BY id" + result([ + [1, "Machine Learning Basics"] + ]) + } + + // Test 22: Search on non-indexed table (should still work but may be slower) + test { + sql "SELECT id, title FROM ${tableName} WHERE search('title:Machine')" + 
result([ + [1, "Machine Learning Basics"] + ]) + } + + // Cleanup + sql "DROP TABLE IF EXISTS ${tableName}" + sql "DROP TABLE IF EXISTS ${indexTableName}" +} diff --git a/regression-test/suites/search/test_search_inverted_index.groovy b/regression-test/suites/search/test_search_inverted_index.groovy index 6314a291bb31d4..d4d83384d76e9f 100644 --- a/regression-test/suites/search/test_search_inverted_index.groovy +++ b/regression-test/suites/search/test_search_inverted_index.groovy @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -suite("test_search_inverted_index") { +suite("test_search_inverted_index", "p0") { def tableWithIndex = "search_index_test_table" def tableWithoutIndex = "search_no_index_test_table" diff --git a/regression-test/suites/search/test_search_lucene_mode.groovy b/regression-test/suites/search/test_search_lucene_mode.groovy index 8e9d4edb7e37c6..4a2a5c7d94ccd7 100644 --- a/regression-test/suites/search/test_search_lucene_mode.groovy +++ b/regression-test/suites/search/test_search_lucene_mode.groovy @@ -30,9 +30,12 @@ * Enable Lucene mode with options parameter (JSON format): * search(dsl, '{"default_field":"title","default_operator":"and","mode":"lucene"}') */ -suite("test_search_lucene_mode") { +suite("test_search_lucene_mode", "p0") { def tableName = "search_lucene_mode_test" + // Pin enable_common_expr_pushdown to prevent CI flakiness from fuzzy testing. 
- // This is correct Lucene behavior - to get "all except X", you need: - // match_all AND NOT X (i.e., a positive clause combined with negation) - // Expected: empty result (correct Lucene semantics) + // 'NOT a' in Lucene mode is rewritten to: SHOULD(MATCH_ALL_DOCS) + MUST_NOT(a) + // This matches all documents EXCEPT those containing the negated term. + // Expected: all docs without "apple" in title (4, 5, 6, 7) qt_lucene_not """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title FROM ${tableName} diff --git a/regression-test/suites/search/test_search_mow_support.groovy b/regression-test/suites/search/test_search_mow_support.groovy index ce759ad99c9ba3..279cc45b1dfa84 100644 --- a/regression-test/suites/search/test_search_mow_support.groovy +++ b/regression-test/suites/search/test_search_mow_support.groovy @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -suite("test_search_mow_support") { +suite("test_search_mow_support", "p0") { def tableName = "search_mow_support_tbl" sql "DROP TABLE IF EXISTS ${tableName}" diff --git a/regression-test/suites/search/test_search_multi_field.groovy b/regression-test/suites/search/test_search_multi_field.groovy index f71db33f2b050f..44f9d9afad9e26 100644 --- a/regression-test/suites/search/test_search_multi_field.groovy +++ b/regression-test/suites/search/test_search_multi_field.groovy @@ -30,9 +30,12 @@ * * Multi-field search can also be combined with Lucene mode for MUST/SHOULD/MUST_NOT semantics. */ -suite("test_search_multi_field") { +suite("test_search_multi_field", "p0") { def tableName = "search_multi_field_test" + // Pin enable_common_expr_pushdown to prevent CI flakiness from fuzzy testing. 
+ sql """ set enable_common_expr_pushdown = true """ + sql "DROP TABLE IF EXISTS ${tableName}" // Create table with inverted indexes on multiple fields @@ -277,6 +280,8 @@ suite("test_search_multi_field") { """ // ============ Test 21: best_fields with Lucene mode ============ + // In lucene mode, best_fields uses per-clause expansion (matching ES query_string), + // so id=1 and id=9 both match (terms can be across different fields) qt_multi_field_best_fields_lucene """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title FROM ${tableName} @@ -292,6 +297,36 @@ suite("test_search_multi_field") { ORDER BY id """ + // ============ Test 23: MATCH_ALL_DOCS (*) with best_fields + lucene mode ============ + // Regression test for DORIS-24536: search('*', ...) with multi-field should not error + // "*" is a match-all query that should return all rows + qt_multi_field_match_all_best_fields """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ count(*) + FROM ${tableName} + WHERE search('*', '{"fields":["title","content"],"type":"best_fields","default_operator":"AND","mode":"lucene","minimum_should_match":0}') + """ + + // ============ Test 24: MATCH_ALL_DOCS (*) with cross_fields + lucene mode ============ + qt_multi_field_match_all_cross_fields """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ count(*) + FROM ${tableName} + WHERE search('*', '{"fields":["title","content"],"type":"cross_fields","default_operator":"AND","mode":"lucene","minimum_should_match":0}') + """ + + // ============ Test 25: MATCH_ALL_DOCS (*) with single default_field + lucene mode ============ + qt_match_all_single_field """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ count(*) + FROM ${tableName} + WHERE search('*', '{"default_field":"title","default_operator":"AND","mode":"lucene","minimum_should_match":0}') + """ + + // ============ Test 26: MATCH_ALL_DOCS (*) with best_fields standard mode (no lucene) ============ + qt_multi_field_match_all_standard """ + 
SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ count(*) + FROM ${tableName} + WHERE search('*', '{"fields":["title","content"],"type":"best_fields","default_operator":"AND"}') + """ + // Cleanup sql "DROP TABLE IF EXISTS ${tableName}" } diff --git a/regression-test/suites/search/test_search_null_regression.groovy b/regression-test/suites/search/test_search_null_regression.groovy index 3fe85461455301..c2acfeb51b2b4c 100644 --- a/regression-test/suites/search/test_search_null_regression.groovy +++ b/regression-test/suites/search/test_search_null_regression.groovy @@ -15,9 +15,12 @@ // specific language governing permissions and limitations // under the License. -suite("test_search_null_regression") { +suite("test_search_null_regression", "p0") { def tableName = "search_null_regression_test" + // Pin enable_common_expr_pushdown to prevent CI flakiness from fuzzy testing. + sql """ set enable_common_expr_pushdown = true """ + sql "DROP TABLE IF EXISTS ${tableName}" // Create test table that reproduces the original bug scenarios @@ -69,7 +72,7 @@ suite("test_search_null_regression") { // vs title match "Ronald" or (content match_all "Selma Blair") qt_regression_1_search_or """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ count(*) FROM ${tableName} - WHERE search('title:Ronald or (content:ALL(Selma Blair))') + WHERE search('title:Ronald OR (content:ALL(Selma Blair))') """ qt_regression_1_match_or """ @@ -80,7 +83,7 @@ suite("test_search_null_regression") { // Detailed verification - get actual matching rows for OR query qt_regression_1_search_or_rows """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title, content FROM ${tableName} - WHERE search('title:Ronald or (content:ALL(Selma Blair))') + WHERE search('title:Ronald OR (content:ALL(Selma Blair))') ORDER BY id """ @@ -94,7 +97,7 @@ suite("test_search_null_regression") { // This reproduces: search('not content:"Round"') vs not search('content:"Round"') qt_regression_2_internal_not """ 
SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ count(*) FROM ${tableName} - WHERE search('not content:Round') + WHERE search('NOT content:Round') """ qt_regression_2_external_not """ @@ -105,7 +108,7 @@ suite("test_search_null_regression") { // Detailed verification for NOT query qt_regression_2_internal_not_rows """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title, content FROM ${tableName} - WHERE search('not content:Round') + WHERE search('NOT content:Round') ORDER BY id """ @@ -119,7 +122,7 @@ suite("test_search_null_regression") { // Verify that OR queries properly handle NULL values according to SQL semantics qt_regression_3_null_or """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id FROM ${tableName} - WHERE search('title:NonExistent or content:Ronald') + WHERE search('title:NonExistent OR content:Ronald') ORDER BY id """ @@ -133,14 +136,14 @@ suite("test_search_null_regression") { // Regression Test 4: NULL Handling in AND Queries qt_regression_4_null_and """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id FROM ${tableName} - WHERE search('title:Ronald and content:biography') + WHERE search('title:Ronald AND content:biography') ORDER BY id """ // Regression Test 5: Complex Boolean Operations qt_regression_5_complex_search """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ count(*) FROM ${tableName} - WHERE search('(title:Ronald or content:Selma) and not content:Round') + WHERE search('(title:Ronald OR content:Selma) AND NOT content:Round') """ qt_regression_5_complex_match """ @@ -151,7 +154,7 @@ suite("test_search_null_regression") { // Regression Test 6: Edge Case - All NULL Query qt_regression_6_all_null """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ count(*) FROM ${tableName} - WHERE search('title:NonExistent and content:NonExistent') + WHERE search('title:NonExistent AND content:NonExistent') """ // Regression Test 7: Case Sensitivity and Variations @@ -168,23 +171,23 @@ 
suite("test_search_null_regression") { // Regression Test 8: Multiple NOT operations qt_regression_8_multiple_not """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ count(*) FROM ${tableName} - WHERE search('not (title:nonexistent or content:nonexistent)') + WHERE search('NOT (title:nonexistent OR content:nonexistent)') """ qt_regression_8_external_multiple_not """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ count(*) FROM ${tableName} - WHERE not search('title:nonexistent or content:nonexistent') + WHERE not search('title:nonexistent OR content:nonexistent') """ // Regression Test 9: Empty string handling qt_regression_9_empty_string """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ count(*) FROM ${tableName} - WHERE search('title:"" or content:Round') + WHERE search('title:"" OR content:Round') """ // Regression Test 10: Performance test with complex query qt_regression_10_performance """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ count(*) FROM ${tableName} - WHERE search('(title:Ronald or title:Selma or content:Round) and not (title:NonExistent and content:NonExistent)') + WHERE search('(title:Ronald OR title:Selma OR content:Round) AND NOT (title:NonExistent AND content:NonExistent)') """ } \ No newline at end of file diff --git a/regression-test/suites/search/test_search_null_semantics.groovy b/regression-test/suites/search/test_search_null_semantics.groovy index c7d97c18bdc31c..1a16adb03aaa3f 100644 --- a/regression-test/suites/search/test_search_null_semantics.groovy +++ b/regression-test/suites/search/test_search_null_semantics.groovy @@ -15,9 +15,12 @@ // specific language governing permissions and limitations // under the License. -suite("test_search_null_semantics") { +suite("test_search_null_semantics", "p0") { def tableName = "search_null_test" + // Pin enable_common_expr_pushdown to prevent CI flakiness from fuzzy testing. 
+ sql """ set enable_common_expr_pushdown = true """ + sql "DROP TABLE IF EXISTS ${tableName}" // Create test table with inverted index and NULL values @@ -61,7 +64,7 @@ suite("test_search_null_semantics") { // title match "Ronald" or (content match_all "Selma Blair") qt_test_case_1_search """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ count(*) FROM ${tableName} - WHERE search('title:Ronald or (content:ALL(Selma Blair))') + WHERE search('title:Ronald OR (content:ALL(Selma Blair))') """ qt_test_case_1_match """ @@ -73,7 +76,7 @@ suite("test_search_null_semantics") { // search('not content:"Round"') should match not search('content:"Round"') qt_test_case_2_internal_not """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ count(*) FROM ${tableName} - WHERE search('not content:Round') + WHERE search('NOT content:Round') """ qt_test_case_2_external_not """ @@ -92,7 +95,7 @@ suite("test_search_null_semantics") { // Verify that NULL OR TRUE = TRUE logic works qt_test_case_3_or_with_null """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title, content FROM ${tableName} - WHERE search('title:Ronald or content:biography') + WHERE search('title:Ronald OR content:biography') ORDER BY id """ @@ -100,14 +103,14 @@ suite("test_search_null_semantics") { // Verify that NULL AND TRUE = NULL logic works qt_test_case_4_and_with_null """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title, content FROM ${tableName} - WHERE search('title:Ronald and content:biography') + WHERE search('title:Ronald AND content:biography') ORDER BY id """ // Test Case 5: Complex OR query with multiple NULL scenarios qt_test_case_5_complex_or_search """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ count(*) FROM ${tableName} - WHERE search('title:Unknown or content:mascot or category:Test') + WHERE search('title:Unknown OR content:mascot OR category:Test') """ qt_test_case_5_complex_or_match """ @@ -118,7 +121,7 @@ suite("test_search_null_semantics") { // 
Test Case 6: NOT query with different field types qt_test_case_6_not_title_search """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ count(*) FROM ${tableName} - WHERE search('not title:Ronald') + WHERE search('NOT title:Ronald') """ qt_test_case_6_not_title_external """ @@ -129,13 +132,13 @@ suite("test_search_null_semantics") { // Test Case 7: Mixed boolean operations qt_test_case_7_mixed """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ count(*) FROM ${tableName} - WHERE search('(title:Ronald or content:Selma) and not category:Unknown') + WHERE search('(title:Ronald OR content:Selma) AND NOT category:Unknown') """ // Test Case 8: Edge case - all NULL fields qt_test_case_8_all_null """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ count(*) FROM ${tableName} - WHERE search('title:NonExistent or content:NonExistent or category:NonExistent') + WHERE search('title:NonExistent OR content:NonExistent OR category:NonExistent') """ // ------------------------------------------------------------------ diff --git a/regression-test/suites/search/test_search_regexp_lowercase.groovy b/regression-test/suites/search/test_search_regexp_lowercase.groovy new file mode 100644 index 00000000000000..81b93d0de7d162 --- /dev/null +++ b/regression-test/suites/search/test_search_regexp_lowercase.groovy @@ -0,0 +1,156 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// DORIS-24464: search() REGEXP with lower_case=true should be consistent with match_regexp +// Regex patterns are NOT lowercased (matching ES query_string behavior). +// Wildcard patterns ARE lowercased (matching ES query_string normalizer behavior). + +suite("test_search_regexp_lowercase", "p0") { + def tableName = "search_regexp_lowercase_test" + + // Pin enable_common_expr_pushdown to prevent CI flakiness from fuzzy testing. + sql """ set enable_common_expr_pushdown = true """ + + sql "DROP TABLE IF EXISTS ${tableName}" + + sql """ + CREATE TABLE ${tableName} ( + a INT, + title VARCHAR(512) NOT NULL, + INDEX idx_title (title) USING INVERTED PROPERTIES("lower_case" = "true", "parser" = "english", "support_phrase" = "true") + ) ENGINE=OLAP + DUPLICATE KEY(a) + DISTRIBUTED BY HASH(a) BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ) + """ + + sql "INSERT INTO ${tableName} VALUES(1, 'ABC DEF')" + sql "INSERT INTO ${tableName} VALUES(2, 'abc def')" + sql "INSERT INTO ${tableName} VALUES(3, 'Apple Banana Cherry')" + sql "INSERT INTO ${tableName} VALUES(4, 'apple banana cherry')" + + // Wait for data to be ready + Thread.sleep(5000) + + // ========================================================================= + // Test 1: REGEXP with uppercase pattern should NOT match lowercased terms + // (ES-compatible behavior: regex patterns are not analyzed/lowercased) + // ========================================================================= + + // search() REGEXP with uppercase pattern - should return 0 rows + // 
because indexed terms are lowercased (abc, def) but pattern AB.* is case-sensitive + qt_regexp_uppercase_no_match """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ * FROM ${tableName} + WHERE search('/AB.*/', '{"default_field":"title","default_operator":"AND","mode":"lucene", "minimum_should_match": 0}') + ORDER BY a + """ + + // match_regexp with uppercase pattern - should also return 0 rows + qt_match_regexp_uppercase_no_match """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ * FROM ${tableName} + WHERE title match_regexp 'AB.*' + ORDER BY a + """ + + // ========================================================================= + // Test 2: REGEXP with lowercase pattern SHOULD match lowercased terms + // ========================================================================= + + // search() REGEXP with lowercase pattern - should match both rows with "abc" + qt_regexp_lowercase_match """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ * FROM ${tableName} + WHERE search('/ab.*/', '{"default_field":"title","default_operator":"AND","mode":"lucene", "minimum_should_match": 0}') + ORDER BY a + """ + + // match_regexp with lowercase pattern - should also match + qt_match_regexp_lowercase_match """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ * FROM ${tableName} + WHERE title match_regexp 'ab.*' + ORDER BY a + """ + + // ========================================================================= + // Test 3: WILDCARD with uppercase pattern should match (wildcards ARE lowercased) + // ========================================================================= + + // search() WILDCARD with uppercase - should match because wildcard patterns are lowercased + qt_wildcard_uppercase_match """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ * FROM ${tableName} + WHERE search('AB*', '{"default_field":"title","default_operator":"AND","mode":"lucene", "minimum_should_match": 0}') + ORDER BY a + """ + + // search() WILDCARD with 
lowercase - should also match + qt_wildcard_lowercase_match """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ * FROM ${tableName} + WHERE search('ab*', '{"default_field":"title","default_operator":"AND","mode":"lucene", "minimum_should_match": 0}') + ORDER BY a + """ + + // ========================================================================= + // Test 4: More complex REGEXP patterns + // ========================================================================= + + // Lowercase regex that matches "apple" - should match rows 3 and 4 + qt_regexp_apple_lowercase """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ * FROM ${tableName} + WHERE search('/app.*/', '{"default_field":"title","default_operator":"AND","mode":"lucene", "minimum_should_match": 0}') + ORDER BY a + """ + + // Uppercase regex "App.*" should NOT match (terms are lowercased as "apple") + qt_regexp_apple_uppercase_no_match """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ * FROM ${tableName} + WHERE search('/App.*/', '{"default_field":"title","default_operator":"AND","mode":"lucene", "minimum_should_match": 0}') + ORDER BY a + """ + + // ========================================================================= + // Test 5: REGEXP consistency with match_regexp for various patterns + // ========================================================================= + + // Both should return same results for lowercase pattern + qt_consistency_regexp_cherry """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ a FROM ${tableName} + WHERE search('/cher.*/', '{"default_field":"title","default_operator":"AND","mode":"lucene", "minimum_should_match": 0}') + ORDER BY a + """ + + qt_consistency_match_regexp_cherry """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ a FROM ${tableName} + WHERE title match_regexp 'cher.*' + ORDER BY a + """ + + // Both should return 0 rows for uppercase pattern + qt_consistency_regexp_cherry_upper """ + SELECT 
/*+SET_VAR(enable_common_expr_pushdown=true) */ a FROM ${tableName} + WHERE search('/CHER.*/', '{"default_field":"title","default_operator":"AND","mode":"lucene", "minimum_should_match": 0}') + ORDER BY a + """ + + qt_consistency_match_regexp_cherry_upper """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ a FROM ${tableName} + WHERE title match_regexp 'CHER.*' + ORDER BY a + """ + + sql "DROP TABLE IF EXISTS ${tableName}" +} diff --git a/regression-test/suites/search/test_search_usage_restrictions.groovy b/regression-test/suites/search/test_search_usage_restrictions.groovy index ea31a4eb998ebc..842fa5454adb49 100644 --- a/regression-test/suites/search/test_search_usage_restrictions.groovy +++ b/regression-test/suites/search/test_search_usage_restrictions.groovy @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -suite("test_search_usage_restrictions") { +suite("test_search_usage_restrictions", "p0") { def tableName = "search_usage_test_table" def tableName2 = "search_usage_test_table2" diff --git a/regression-test/suites/search/test_search_variant_dual_index_reader.groovy b/regression-test/suites/search/test_search_variant_dual_index_reader.groovy new file mode 100644 index 00000000000000..2463d126ba6ebc --- /dev/null +++ b/regression-test/suites/search/test_search_variant_dual_index_reader.groovy @@ -0,0 +1,135 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/** + * Regression test for variant subcolumn with dual inverted indexes on the same field pattern. + * + * Bug scenario: When a variant column has two indexes on the same field_pattern (e.g. "string_*"): + * - idx_no_analyzer: no parser -> STRING_TYPE reader (untokenized) + * - idx_with_analyzer: parser=xxx -> FULLTEXT reader (tokenized) + * + * FE correctly resolves the field to the analyzer-based index and sends its index_properties + * via TSearchFieldBinding. However, BE's FieldReaderResolver::resolve() called + * select_best_reader(column_type, EQUAL_QUERY, "") which preferred STRING_TYPE for EQUAL_QUERY. + * This opened the untokenized index directory, so tokenized search terms never matched. + * + * Fix: For variant subcolumns, when FE provides index_properties indicating an analyzer, + * upgrade EQUAL_QUERY to MATCH_ANY_QUERY before reader selection so the FULLTEXT reader is chosen. + * + * Before fix: search() returns empty (wrong reader selected) + * After fix: search() returns matching rows (correct FULLTEXT reader selected) + */ +suite("test_search_variant_dual_index_reader", "p0") { + def tableName = "test_variant_dual_index_reader" + + sql """ set enable_match_without_inverted_index = false """ + sql """ set enable_common_expr_pushdown = true """ + sql """ set default_variant_enable_typed_paths_to_sparse = false """ + // Pin doc_mode to false to prevent CI flakiness from fuzzy testing. 
+ sql """ set default_variant_enable_doc_mode = false """ + + sql "DROP TABLE IF EXISTS ${tableName}" + + // Create table with variant column and TWO indexes on the same field_pattern: + // one without analyzer (STRING_TYPE) and one with analyzer (FULLTEXT). + // This is the exact scenario that triggers the bug. + sql """ + CREATE TABLE ${tableName} ( + `id` INT NOT NULL, + `props` variant< + MATCH_NAME_GLOB 'string_*' : string, + properties("variant_max_subcolumns_count" = "100") + > NULL, + INDEX idx_no_analyzer (props) USING INVERTED PROPERTIES( + "field_pattern" = "string_*" + ), + INDEX idx_with_analyzer (props) USING INVERTED PROPERTIES( + "parser" = "unicode", + "field_pattern" = "string_*", + "lower_case" = "true" + ) + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "disable_auto_compaction" = "true" + ) + """ + + sql """INSERT INTO ${tableName} VALUES + (1, '{"string_8": "admin user"}'), + (2, '{"string_8": "readonly access"}'), + (3, '{"string_8": "admin access granted"}'), + (4, '{"string_1": "hello world"}'), + (5, '{"string_8": "guest only"}') + """ + + sql "sync" + Thread.sleep(5000) + + // Test 1: Basic tokenized search on variant subcolumn with dual indexes. + // "admin" should match rows 1 and 3 via the FULLTEXT reader (tokenized). + // Before fix: returns empty because EQUAL_QUERY selects STRING_TYPE reader. + // After fix: returns rows 1, 3 because MATCH_ANY_QUERY selects FULLTEXT reader. + qt_dual_index_basic """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName} + WHERE search('admin', '{"default_field":"props.string_8","mode":"lucene"}') + ORDER BY id + """ + + // Test 2: Multi-term AND search. Both "admin" and "access" must match. + // Before fix: empty. After fix: row 3. 
+ qt_dual_index_and """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName} + WHERE search('admin access', '{"default_field":"props.string_8","mode":"lucene","default_operator":"AND"}') + ORDER BY id + """ + + // Test 3: Search on a different subcolumn matching the same field_pattern. + // Ensures the fix works across different subcolumns under the same pattern. + qt_dual_index_other_field """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName} + WHERE search('hello', '{"default_field":"props.string_1","mode":"lucene"}') + ORDER BY id + """ + + // Test 4: Field-qualified syntax with dual indexes. + qt_dual_index_field_syntax """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName} + WHERE search('props.string_8:access', '{"mode":"lucene"}') + ORDER BY id + """ + + // Test 5: Case-insensitive search (lowercase index). + // "ADMIN" should match "admin user" and "admin access granted". + qt_dual_index_case_insensitive """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName} + WHERE search('ADMIN', '{"default_field":"props.string_8","mode":"lucene"}') + ORDER BY id + """ + + // Test 6: Verify MATCH_ANY also works as baseline (uses MATCH query type directly, + // so it always picks FULLTEXT reader — this should work both before and after the fix). 
+ qt_dual_index_match_baseline """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName} + WHERE props['string_8'] MATCH_ANY 'admin' + ORDER BY id + """ + + sql "DROP TABLE IF EXISTS ${tableName}" +} diff --git a/regression-test/suites/search/test_search_variant_subcolumn_analyzer.groovy b/regression-test/suites/search/test_search_variant_subcolumn_analyzer.groovy new file mode 100644 index 00000000000000..69bc0f29157337 --- /dev/null +++ b/regression-test/suites/search/test_search_variant_subcolumn_analyzer.groovy @@ -0,0 +1,180 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/** + * Test search() function with variant subcolumn and field_pattern index. + * + * This test verifies that the analyzer (parser) from field_pattern matched indexes + * is correctly applied when using search() on variant subcolumns. + * + * Bug: When using search() on variant subcolumns with field_pattern indexes, + * the analyzer was not applied because FE did not pass index properties to BE. + * This caused exact-match-only behavior instead of tokenized matching. + * + * Fix: FE now looks up the Index for each field in SearchExpression and passes + * the index_properties via TSearchFieldBinding to BE. 
+ */ +suite("test_search_variant_subcolumn_analyzer", "p0") { + def tableName = "test_variant_subcolumn_analyzer" + + sql """ set enable_match_without_inverted_index = false """ + sql """ set enable_common_expr_pushdown = true """ + sql """ set default_variant_enable_typed_paths_to_sparse = false """ + // Pin doc_mode to false to prevent CI flakiness from fuzzy testing. + // When default_variant_enable_doc_mode=true (randomly set by fuzzy testing), + // variant subcolumns are stored in document mode, causing inverted index + // iterators to be unavailable in BE for search() evaluation. + sql """ set default_variant_enable_doc_mode = false """ + + sql "DROP TABLE IF EXISTS ${tableName}" + + // Create table with variant column using predefined field pattern and field_pattern index + sql """ + CREATE TABLE ${tableName} ( + `id` INT NOT NULL, + `data` variant< + MATCH_NAME_GLOB 'string_*' : string, + properties("variant_max_subcolumns_count" = "100") + > NULL, + INDEX idx_text (data) USING INVERTED PROPERTIES( + "parser" = "unicode", + "field_pattern" = "string_*", + "lower_case" = "true" + ) + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "disable_auto_compaction" = "true" + ) + """ + + // Insert test data + sql """INSERT INTO ${tableName} VALUES + (1, '{"string_8": "admin only"}'), + (2, '{"string_8": "user access"}'), + (3, '{"string_8": "admin access granted"}'), + (4, '{"string_1": "hello world"}'), + (5, '{"string_8": "readonly user"}'), + (6, '{"number_1": 42}') + """ + + // Wait for data to be flushed and indexes built + sql "sync" + Thread.sleep(5000) + + // Test 1: search() with default_field on variant subcolumn matching field_pattern + // "admin" should match "admin only" and "admin access granted" because the unicode + // parser tokenizes them into ["admin", "only"] and ["admin", "access", "granted"] + qt_search_variant_analyzer_basic """ + SELECT 
/*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName} + WHERE search('admin', '{"default_field":"data.string_8","mode":"lucene"}') + ORDER BY id + """ + + // Test 2: Verify MATCH also works (as a baseline) + qt_match_variant_baseline """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName} + WHERE data['string_8'] MATCH_ANY 'admin' + ORDER BY id + """ + + // Test 3: Multi-term search should also work with tokenization + qt_search_variant_analyzer_multi """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName} + WHERE search('admin access', '{"default_field":"data.string_8","mode":"lucene","default_operator":"AND"}') + ORDER BY id + """ + + // Test 4: Search on a different subcolumn matching the same field_pattern + qt_search_variant_analyzer_other_field """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName} + WHERE search('hello', '{"default_field":"data.string_1","mode":"lucene"}') + ORDER BY id + """ + + // Test 5: Search with field-qualified syntax on variant subcolumn + qt_search_variant_analyzer_field_syntax """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName} + WHERE search('data.string_8:user', '{"mode":"lucene"}') + ORDER BY id + """ + + // Test 6: Verify lowercase is applied (search for "ADMIN" should match "admin only") + qt_search_variant_analyzer_lowercase """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName} + WHERE search('ADMIN', '{"default_field":"data.string_8","mode":"lucene"}') + ORDER BY id + """ + + // Test 7: Phrase search on variant subcolumn with analyzer + qt_search_variant_analyzer_phrase """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName} + WHERE search('"admin only"', '{"default_field":"data.string_8","mode":"lucene"}') + ORDER BY id + """ + + // Clean up + sql "DROP TABLE IF EXISTS ${tableName}" + + // Test Case 2: Variant with direct named field and field_pattern 
index for comparison + def tableName2 = "test_variant_direct_index" + + sql "DROP TABLE IF EXISTS ${tableName2}" + + sql """ + CREATE TABLE ${tableName2} ( + `id` INT NOT NULL, + `data` variant< + 'name' : string, + properties("variant_max_subcolumns_count" = "10") + > NULL, + INDEX idx_text (data) USING INVERTED PROPERTIES( + "parser" = "unicode", + "field_pattern" = "name", + "lower_case" = "true" + ) + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "disable_auto_compaction" = "true" + ) + """ + + sql """INSERT INTO ${tableName2} VALUES + (1, '{"name": "admin only"}'), + (2, '{"name": "user access"}'), + (3, '{"name": "admin access granted"}') + """ + + sql "sync" + Thread.sleep(5000) + + // Test 8: search() on variant subcolumn with named field_pattern (direct match) + qt_search_variant_direct_index """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName2} + WHERE search('admin', '{"default_field":"data.name","mode":"lucene"}') + ORDER BY id + """ + + sql "DROP TABLE IF EXISTS ${tableName2}" + + logger.info("All variant subcolumn analyzer tests completed!") +} diff --git a/regression-test/suites/search/test_search_variant_wildcard.groovy b/regression-test/suites/search/test_search_variant_wildcard.groovy new file mode 100644 index 00000000000000..37fb79d35dac4b --- /dev/null +++ b/regression-test/suites/search/test_search_variant_wildcard.groovy @@ -0,0 +1,166 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/** + * Test wildcard search on variant subcolumns in Lucene mode. + * + * Bug: Wildcard queries (*, ?) on variant subcolumns return empty results + * even when the data exists and regular TERM search works correctly. + * + * Root cause: In FieldReaderResolver::resolve(), only EQUAL_QUERY is upgraded + * to MATCH_ANY_QUERY for variant subcolumns with analyzers. WILDCARD_QUERY is + * not upgraded, so it may select the wrong index reader (STRING_TYPE instead of + * FULLTEXT), causing term enumeration to fail. + * + * Scenario: Contacts with firstname/lastname stored in variant subcolumns. 
+ * - TERM search for 'smith' correctly returns John Smith + * - WILDCARD searches '*ith', 'sm*th', 'sm?th' should also match but returned empty + */ +suite("test_search_variant_wildcard", "p0") { + def tableName = "test_search_variant_wildcard" + + sql """ set enable_match_without_inverted_index = false """ + sql """ set enable_common_expr_pushdown = true """ + sql """ set default_variant_enable_typed_paths_to_sparse = false """ + sql """ set default_variant_enable_doc_mode = false """ + + sql "DROP TABLE IF EXISTS ${tableName}" + + // Create table with variant column using field_pattern index + sql """ + CREATE TABLE ${tableName} ( + `id` BIGINT NOT NULL, + `props` variant< + MATCH_NAME_GLOB 'string_*' : string, + properties("variant_max_subcolumns_count" = "100") + > NULL, + INDEX idx_props (props) USING INVERTED PROPERTIES( + "parser" = "unicode", + "field_pattern" = "string_*", + "lower_case" = "true" + ) + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "disable_auto_compaction" = "true" + ) + """ + + // Insert test data matching the reported scenario + // string_8 = firstname, string_17 = lastname + sql """INSERT INTO ${tableName} VALUES + (73095521135, '{"string_8": "John", "string_17": "Smith"}'), + (73095446198, '{"string_8": "Jane", "string_17": "Smithson"}'), + (73095754047, '{"string_8": "Michael David", "string_17": "Johnson"}') + """ + + sql "sync" + Thread.sleep(5000) + + // ============ Baseline: TERM search works ============ + + // Test 1: TERM search for 'smith' on lastname - should return John Smith + qt_term_smith """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName} + WHERE search('smith', '{"default_field":"props.string_17","default_operator":"and","mode":"lucene"}') + ORDER BY id + """ + + // Test 2: TERM search for 'smithson' on lastname - should return Jane Smithson + qt_term_smithson """ + SELECT 
/*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName} + WHERE search('smithson', '{"default_field":"props.string_17","default_operator":"and","mode":"lucene"}') + ORDER BY id + """ + + // Test 3: TERM search for 'johnson' on lastname - should return Michael David Johnson + qt_term_johnson """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName} + WHERE search('johnson', '{"default_field":"props.string_17","default_operator":"and","mode":"lucene"}') + ORDER BY id + """ + + // ============ Bug: Wildcard searches return empty ============ + + // Test 4: Leading wildcard '*ith' - should match "Smith" (ends with "ith") + qt_wildcard_star_ith """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName} + WHERE search('*ith', '{"default_field":"props.string_17","default_operator":"and","mode":"lucene"}') + ORDER BY id + """ + + // Test 5: Middle wildcard 'sm*th' - should match "Smith" + qt_wildcard_sm_star_th """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName} + WHERE search('sm*th', '{"default_field":"props.string_17","default_operator":"and","mode":"lucene"}') + ORDER BY id + """ + + // Test 6: Single char wildcard 'sm?th' - should match "Smith" + qt_wildcard_sm_q_th """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName} + WHERE search('sm?th', '{"default_field":"props.string_17","default_operator":"and","mode":"lucene"}') + ORDER BY id + """ + + // Test 7: Trailing wildcard 'smith*' - should match "Smith" and "Smithson" + qt_wildcard_smith_star """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName} + WHERE search('smith*', '{"default_field":"props.string_17","default_operator":"and","mode":"lucene"}') + ORDER BY id + """ + + // Test 8: Wildcard 'sm*' - should match "Smith" and "Smithson" + qt_wildcard_sm_star """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName} + WHERE search('sm*', 
'{"default_field":"props.string_17","default_operator":"and","mode":"lucene"}') + ORDER BY id + """ + + // Test 9: Wildcard '*son' - should match "Smithson" and "Johnson" + qt_wildcard_star_son """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName} + WHERE search('*son', '{"default_field":"props.string_17","default_operator":"and","mode":"lucene"}') + ORDER BY id + """ + + // Test 10: Wildcard on firstname field 'jo?n' - should match "John" + qt_wildcard_firstname """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName} + WHERE search('jo?n', '{"default_field":"props.string_8","default_operator":"and","mode":"lucene"}') + ORDER BY id + """ + + // Test 11: Wildcard combined with AND - 'sm*th AND props.string_8:john' + qt_wildcard_and_term """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName} + WHERE search('props.string_17:sm*th AND props.string_8:john', '{"default_operator":"and","mode":"lucene"}') + ORDER BY id + """ + + // Test 12: Standalone wildcard '*' matches all non-null values + qt_wildcard_star_all """ + SELECT /*+SET_VAR(enable_common_expr_pushdown=true)*/ id FROM ${tableName} + WHERE search('*', '{"default_field":"props.string_17","default_operator":"and","mode":"lucene"}') + ORDER BY id + """ + + // Clean up + sql "DROP TABLE IF EXISTS ${tableName}" +} diff --git a/regression-test/suites/search/test_search_vs_match_consistency.groovy b/regression-test/suites/search/test_search_vs_match_consistency.groovy index 05ba88344cfb6f..5b2c7b4b6646b9 100644 --- a/regression-test/suites/search/test_search_vs_match_consistency.groovy +++ b/regression-test/suites/search/test_search_vs_match_consistency.groovy @@ -15,9 +15,12 @@ // specific language governing permissions and limitations // under the License. 
-suite("test_search_vs_match_consistency") { +suite("test_search_vs_match_consistency", "p0") { def tableName = "search_match_consistency_test" + // Pin enable_common_expr_pushdown to prevent CI flakiness from fuzzy testing. + sql """ set enable_common_expr_pushdown = true """ + sql "DROP TABLE IF EXISTS ${tableName}" // Create test table similar to wikipedia structure @@ -117,7 +120,7 @@ suite("test_search_vs_match_consistency") { // Test Suite 1: Basic OR query consistency qt_test_1_1_search """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ count(*) FROM ${tableName} - WHERE search('title:Ronald or title:Selma') + WHERE search('title:Ronald OR title:Selma') """ qt_test_1_1_match """ @@ -128,7 +131,7 @@ suite("test_search_vs_match_consistency") { // Test 1.2: OR across different fields qt_test_1_2_search """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ count(*) FROM ${tableName} - WHERE search('title:Ronald or content:Selma') + WHERE search('title:Ronald OR content:Selma') """ qt_test_1_2_match """ @@ -139,7 +142,7 @@ suite("test_search_vs_match_consistency") { // Test 1.3: Complex OR with ALL operation qt_test_1_3_search """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ count(*) FROM ${tableName} - WHERE search('title:Ronald or (content:ALL(Selma Blair))') + WHERE search('title:Ronald OR (content:ALL(Selma Blair))') """ qt_test_1_3_match """ @@ -150,7 +153,7 @@ suite("test_search_vs_match_consistency") { // Test Suite 2: NOT query consistency qt_test_2_1_internal_not """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ count(*) FROM ${tableName} - WHERE search('not content:Round') + WHERE search('NOT content:Round') """ qt_test_2_1_external_not """ @@ -161,7 +164,7 @@ suite("test_search_vs_match_consistency") { // Test 2.2: NOT with different fields qt_test_2_2_internal_not """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ count(*) FROM ${tableName} - WHERE search('not title:Ronald') + WHERE search('NOT title:Ronald') 
""" qt_test_2_2_external_not """ @@ -172,49 +175,49 @@ suite("test_search_vs_match_consistency") { // Test 2.3: NOT with complex expression qt_test_2_3_internal_not """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ count(*) FROM ${tableName} - WHERE search('not (title:Ronald and content:biography)') + WHERE search('NOT (title:Ronald AND content:biography)') """ qt_test_2_3_external_not """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ count(*) FROM ${tableName} - WHERE not search('title:Ronald and content:biography') + WHERE not search('title:Ronald AND content:biography') """ // Test Suite 3: NULL value behavior in OR queries qt_test_3_1_or_with_null """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title, content FROM ${tableName} - WHERE search('title:NonExistent or content:Ronald') + WHERE search('title:NonExistent OR content:Ronald') ORDER BY id """ qt_test_3_2_or_multiple_null """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title, content FROM ${tableName} - WHERE search('title:Mystery or content:Round') + WHERE search('title:Mystery OR content:Round') ORDER BY id """ // Test Suite 4: AND query behavior with NULLs qt_test_4_1_and_with_null """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ id, title, content FROM ${tableName} - WHERE search('title:Ronald and content:biography') + WHERE search('title:Ronald AND content:biography') ORDER BY id """ // Test Suite 5: Edge cases and complex scenarios qt_test_5_1_empty_string """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ count(*) FROM ${tableName} - WHERE search('title:"" or content:Round') + WHERE search('title:"" OR content:Round') """ qt_test_5_2_complex_nested """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ count(*) FROM ${tableName} - WHERE search('(title:Ronald or title:Selma) and not (content:Round and author:NonExistent)') + WHERE search('(title:Ronald OR title:Selma) AND NOT (content:Round AND author:NonExistent)') """ // Test 
Suite 6: Performance and consistency verification qt_test_6_1_large_or_search """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ count(*) FROM ${tableName} - WHERE search('title:Ronald or title:Selma or content:Round or content:biography or author:Smith or tags:history') + WHERE search('title:Ronald OR title:Selma OR content:Round OR content:biography OR author:Smith OR tags:history') """ qt_test_6_1_large_or_match """ @@ -278,7 +281,7 @@ suite("test_search_vs_match_consistency") { // Mandy/Kesha consistency checks qt_man_pat_1_search """ SELECT /*+SET_VAR(enable_common_expr_pushdown=true) */ count(*) FROM ${mandyTable} - WHERE search('content:ALL("Mandy Patinkin") or not (content:ANY("Kesha"))') + WHERE search('content:ALL("Mandy Patinkin") OR NOT (content:ANY("Kesha"))') """ qt_man_pat_1_match """ @@ -291,7 +294,7 @@ suite("test_search_vs_match_consistency") { CASE WHEN title IS NULL THEN 'NULL' ELSE 'NOT_NULL' END AS title_status, CASE WHEN content IS NULL THEN 'NULL' ELSE 'NOT_NULL' END AS content_status FROM ${mandyTable} - WHERE search('content:ALL("Mandy Patinkin") or not (content:ANY("Kesha"))') + WHERE search('content:ALL("Mandy Patinkin") OR NOT (content:ANY("Kesha"))') ORDER BY id """