From 45700bd7b1a461d6112bde23a69355fcb0e6c89c Mon Sep 17 00:00:00 2001 From: airborne12 Date: Wed, 4 Feb 2026 18:23:40 +0800 Subject: [PATCH] [fix](inverted index) Fix empty string MATCH on keyword index returning wrong results The multi-analyzer feature commit (2c950e140a5) incorrectly added an empty string check that prevented MATCH '' from finding rows with empty string values in keyword indexes. For keyword index (no tokenization), empty string is a valid exact match value and should be matchable. The previous code incorrectly skipped empty strings with the comment "empty query should match nothing", which is wrong for keyword indexes. This fix removes the empty string check for keyword index paths in: - be/src/vec/functions/match.cpp (slow path) - be/src/olap/rowset/segment_v2/inverted_index_reader.cpp (index path) - be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp Added regression test test_empty_string_match.groovy to cover: - Empty string match on keyword index (both index and slow paths) - Empty string match on tokenized index (should return 0) - match_any and match_all with empty string --- .../inverted_index/analyzer/analyzer.cpp | 6 +- .../segment_v2/inverted_index_reader.cpp | 7 +- be/src/vec/functions/match.cpp | 13 ++-- .../test_empty_string_match.out | 26 +++++++ .../test_empty_string_match.groovy | 78 +++++++++++++++++++ 5 files changed, 115 insertions(+), 15 deletions(-) create mode 100644 regression-test/data/inverted_index_p0/test_empty_string_match.out create mode 100644 regression-test/suites/inverted_index_p0/test_empty_string_match.groovy diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp index ec9d81f1503b54..cad3837d0813cb 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp @@ -187,10 +187,10 @@ std::vector 
InvertedIndexAnalyzer::get_analyse_result( std::vector InvertedIndexAnalyzer::get_analyse_result( const std::string& search_str, const std::map& properties) { if (!should_analyzer(properties)) { + // Keyword index: all strings (including empty) are valid tokens for exact match. + // Empty string is a valid value in keyword index and should be matchable. std::vector result; - if (!search_str.empty()) { - result.emplace_back(search_str); - } + result.emplace_back(search_str); return result; } InvertedIndexAnalyzerConfig config; diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp index f34a546a105af1..fecf5b6446296f 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp @@ -315,10 +315,9 @@ Status FullTextIndexReader::query(const IndexQueryContextPtr& context, } else { SCOPED_RAW_TIMER(&context->stats->inverted_index_analyzer_timer); if (analyzer_ctx != nullptr && !analyzer_ctx->should_tokenize()) { - // Don't add empty string as token - empty query should match nothing - if (!search_str.empty()) { - query_info.term_infos.emplace_back(search_str); - } + // Keyword index: all strings (including empty) are valid tokens for exact match. + // Empty string is a valid value in keyword index and should be matchable. + query_info.term_infos.emplace_back(search_str); } else if (analyzer_ctx != nullptr && analyzer_ctx->analyzer != nullptr) { // Use analyzer from query context for consistent behavior across all segments. 
// This ensures that the query uses the same analyzer settings (e.g., lowercase) diff --git a/be/src/vec/functions/match.cpp b/be/src/vec/functions/match.cpp index c3a2ec9fce98f4..502a636b8c8339 100644 --- a/be/src/vec/functions/match.cpp +++ b/be/src/vec/functions/match.cpp @@ -205,20 +205,17 @@ std::vector FunctionMatchBase::analyse_query_str_token( // - PARSER_NONE: no tokenization (keyword/exact match) // - Other parsers: tokenize using the analyzer if (!analyzer_ctx->should_tokenize()) { - // Keyword index or no tokenization needed - // Don't add empty string as token - empty query should match nothing - if (!match_query_str.empty()) { - query_tokens.emplace_back(match_query_str); - } + // Keyword index: all strings (including empty) are valid tokens for exact match. + // Empty string is a valid value in keyword index and should be matchable. + query_tokens.emplace_back(match_query_str); return query_tokens; } // Safety check: if analyzer is nullptr but tokenization is expected, fall back to no tokenization if (analyzer_ctx->analyzer == nullptr) { VLOG_DEBUG << "Analyzer is nullptr, falling back to no tokenization"; - if (!match_query_str.empty()) { - query_tokens.emplace_back(match_query_str); - } + // For fallback case, also allow empty strings to be matched + query_tokens.emplace_back(match_query_str); return query_tokens; } diff --git a/regression-test/data/inverted_index_p0/test_empty_string_match.out b/regression-test/data/inverted_index_p0/test_empty_string_match.out new file mode 100644 index 00000000000000..c05432b680bcad --- /dev/null +++ b/regression-test/data/inverted_index_p0/test_empty_string_match.out @@ -0,0 +1,26 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !keyword_index_path -- +1 +3 + +-- !keyword_slow_path -- +1 +3 + +-- !english_index_path -- +0 + +-- !english_slow_path -- +0 + +-- !keyword_nonempty -- +2 + +-- !match_any_empty -- +1 +3 + +-- !match_all_empty -- +1 +3 + diff --git a/regression-test/suites/inverted_index_p0/test_empty_string_match.groovy b/regression-test/suites/inverted_index_p0/test_empty_string_match.groovy new file mode 100644 index 00000000000000..798e0100f1beb9 --- /dev/null +++ b/regression-test/suites/inverted_index_p0/test_empty_string_match.groovy @@ -0,0 +1,78 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("test_empty_string_match", "p0") { + def tableName = "test_empty_string_match" + + sql "DROP TABLE IF EXISTS ${tableName}" + sql """ + CREATE TABLE ${tableName} ( + id INT, + keyword_col TEXT DEFAULT '', + english_col TEXT DEFAULT '', + INDEX keyword_idx(keyword_col) USING INVERTED COMMENT 'keyword index', + INDEX english_idx(english_col) USING INVERTED PROPERTIES("parser" = "english") COMMENT 'english parser' + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES("replication_allocation" = "tag.location.default: 1"); + """ + + sql """ + INSERT INTO ${tableName} VALUES + (1, '', 'hello world'), + (2, 'test', ''), + (3, '', ''), + (4, 'data', 'some text'); + """ + + sql "SET enable_common_expr_pushdown = true" + + // Test 1: Empty string match on keyword index (index path) + // Should match rows where keyword_col is empty string (rows 1 and 3) + sql "SET enable_inverted_index_query = true" + qt_keyword_index_path """SELECT id FROM ${tableName} WHERE keyword_col match '' ORDER BY id""" + + // Test 2: Empty string match on keyword index (slow path) + // Should also match rows where keyword_col is empty string (rows 1 and 3) + sql "SET enable_inverted_index_query = false" + sql "SET enable_match_without_inverted_index = true" + qt_keyword_slow_path """SELECT id FROM ${tableName} WHERE keyword_col match '' ORDER BY id""" + + // Test 3: Empty string match on tokenized index (index path) + // count() should be 0 because empty string tokenizes to nothing + sql "SET enable_inverted_index_query = true" + qt_english_index_path """SELECT count() FROM ${tableName} WHERE english_col match ''""" + + // Test 4: Empty string match on tokenized index (slow path) + // count() should also be 0; enable_match_without_inverted_index is still true from Test 2 + sql "SET enable_inverted_index_query = false" + qt_english_slow_path """SELECT count() FROM ${tableName} WHERE english_col match ''""" + + // Test 5: Non-empty string match on keyword index (index path) should work as before (row 2) + sql "SET enable_inverted_index_query = true" +
qt_keyword_nonempty """SELECT id FROM ${tableName} WHERE keyword_col match 'test' ORDER BY id""" + + // Test 6: Verify match_any with empty string on keyword index (slow path only; expects rows 1 and 3) + sql "SET enable_inverted_index_query = false" + qt_match_any_empty """SELECT id FROM ${tableName} WHERE keyword_col match_any '' ORDER BY id""" + + // Test 7: Verify match_all with empty string on keyword index (slow path only: enable_inverted_index_query is still false from Test 6; expects rows 1 and 3) + qt_match_all_empty """SELECT id FROM ${tableName} WHERE keyword_col match_all '' ORDER BY id""" + + sql "DROP TABLE IF EXISTS ${tableName}" +}