Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -187,10 +187,10 @@ std::vector<TermInfo> InvertedIndexAnalyzer::get_analyse_result(
std::vector<TermInfo> InvertedIndexAnalyzer::get_analyse_result(
const std::string& search_str, const std::map<std::string, std::string>& properties) {
if (!should_analyzer(properties)) {
// Keyword index: all strings (including empty) are valid tokens for exact match.
// Empty string is a valid value in keyword index and should be matchable.
std::vector<TermInfo> result;
if (!search_str.empty()) {
result.emplace_back(search_str);
}
result.emplace_back(search_str);
return result;
}
InvertedIndexAnalyzerConfig config;
Expand Down
7 changes: 3 additions & 4 deletions be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -315,10 +315,9 @@ Status FullTextIndexReader::query(const IndexQueryContextPtr& context,
} else {
SCOPED_RAW_TIMER(&context->stats->inverted_index_analyzer_timer);
if (analyzer_ctx != nullptr && !analyzer_ctx->should_tokenize()) {
// Don't add empty string as token - empty query should match nothing
if (!search_str.empty()) {
query_info.term_infos.emplace_back(search_str);
}
// Keyword index: all strings (including empty) are valid tokens for exact match.
// Empty string is a valid value in keyword index and should be matchable.
query_info.term_infos.emplace_back(search_str);
} else if (analyzer_ctx != nullptr && analyzer_ctx->analyzer != nullptr) {
// Use analyzer from query context for consistent behavior across all segments.
// This ensures that the query uses the same analyzer settings (e.g., lowercase)
Expand Down
13 changes: 5 additions & 8 deletions be/src/vec/functions/match.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -205,20 +205,17 @@ std::vector<TermInfo> FunctionMatchBase::analyse_query_str_token(
// - PARSER_NONE: no tokenization (keyword/exact match)
// - Other parsers: tokenize using the analyzer
if (!analyzer_ctx->should_tokenize()) {
// Keyword index or no tokenization needed
// Don't add empty string as token - empty query should match nothing
if (!match_query_str.empty()) {
query_tokens.emplace_back(match_query_str);
}
// Keyword index: all strings (including empty) are valid tokens for exact match.
// Empty string is a valid value in keyword index and should be matchable.
query_tokens.emplace_back(match_query_str);
return query_tokens;
}

// Safety check: if analyzer is nullptr but tokenization is expected, fall back to no tokenization
if (analyzer_ctx->analyzer == nullptr) {
VLOG_DEBUG << "Analyzer is nullptr, falling back to no tokenization";
if (!match_query_str.empty()) {
query_tokens.emplace_back(match_query_str);
}
// For fallback case, also allow empty strings to be matched
query_tokens.emplace_back(match_query_str);
return query_tokens;
}

Expand Down
26 changes: 26 additions & 0 deletions regression-test/data/inverted_index_p0/test_empty_string_match.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !keyword_index_path --
1
3

-- !keyword_slow_path --
1
3

-- !english_index_path --
0

-- !english_slow_path --
0

-- !keyword_nonempty --
2

-- !match_any_empty --
1
3

-- !match_all_empty --
1
3

Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// Regression suite: behavior of MATCH / MATCH_ANY / MATCH_ALL when the query string is ''.
// It compares a column whose inverted index declares no parser (keyword_idx) with a column
// whose inverted index uses the "english" parser (english_idx).
//
// Each qt_<tag> query below corresponds to the `-- !<tag> --` section of the expected-output
// file test_empty_string_match.out, so tags must not be renamed or reordered without
// regenerating that file.
//
// NOTE: the SET statements are order-dependent. A setting stays in effect for the queries
// that follow it until it is set again (presumably session-scoped -- confirm against the
// test framework), so several tests rely on values set by an earlier test.
suite("test_empty_string_match", "p0") {
def tableName = "test_empty_string_match"

// Start from a clean table so reruns are not affected by leftovers.
sql "DROP TABLE IF EXISTS ${tableName}"
// keyword_idx: inverted index with no "parser" property.
// english_idx: inverted index with PROPERTIES("parser" = "english").
sql """
CREATE TABLE ${tableName} (
id INT,
keyword_col TEXT DEFAULT '',
english_col TEXT DEFAULT '',
INDEX keyword_idx(keyword_col) USING INVERTED COMMENT 'keyword index',
INDEX english_idx(english_col) USING INVERTED PROPERTIES("parser" = "english") COMMENT 'english parser'
) ENGINE=OLAP
DUPLICATE KEY(id)
DISTRIBUTED BY HASH(id) BUCKETS 1
PROPERTIES("replication_allocation" = "tag.location.default: 1");
"""

// Fixture: keyword_col is '' for ids 1 and 3; english_col is '' for ids 2 and 3.
sql """
INSERT INTO ${tableName} VALUES
(1, '', 'hello world'),
(2, 'test', ''),
(3, '', ''),
(4, 'data', 'some text');
"""

// Set once here and never changed again, so it applies to every query below.
sql "SET enable_common_expr_pushdown = true"

// Test 1: Empty string match on keyword index (index path)
// Should match rows where keyword_col is empty string (rows 1 and 3)
sql "SET enable_inverted_index_query = true"
qt_keyword_index_path """SELECT id FROM ${tableName} WHERE keyword_col match '' ORDER BY id"""

// Test 2: Empty string match on keyword index (slow path)
// Should also match rows where keyword_col is empty string
// Index queries are disabled and matching without the inverted index is enabled here;
// enable_match_without_inverted_index is not touched again in this suite.
sql "SET enable_inverted_index_query = false"
sql "SET enable_match_without_inverted_index = true"
qt_keyword_slow_path """SELECT id FROM ${tableName} WHERE keyword_col match '' ORDER BY id"""

// Test 3: Empty string match on tokenized index (index path)
// The query is a count(), so the expected result is a single row containing 0:
// no row should match because the empty string tokenizes to nothing.
sql "SET enable_inverted_index_query = true"
qt_english_index_path """SELECT count() FROM ${tableName} WHERE english_col match ''"""

// Test 4: Empty string match on tokenized index (slow path)
// Expected count() is also 0.
// Relies on enable_match_without_inverted_index = true still being in effect from Test 2.
sql "SET enable_inverted_index_query = false"
qt_english_slow_path """SELECT count() FROM ${tableName} WHERE english_col match ''"""

// Test 5: Non-empty string match on keyword index should work as before
// Only id 2 has keyword_col = 'test'.
sql "SET enable_inverted_index_query = true"
qt_keyword_nonempty """SELECT id FROM ${tableName} WHERE keyword_col match 'test' ORDER BY id"""

// Test 6: Verify match_any with empty string on the keyword-indexed column.
// Inverted index queries are disabled for this test, so it runs with the same
// settings as the slow-path tests above rather than through the index.
sql "SET enable_inverted_index_query = false"
qt_match_any_empty """SELECT id FROM ${tableName} WHERE keyword_col match_any '' ORDER BY id"""

// Test 7: Verify match_all with empty string on the keyword-indexed column.
// No SET precedes this query, so it runs with the same settings as Test 6.
qt_match_all_empty """SELECT id FROM ${tableName} WHERE keyword_col match_all '' ORDER BY id"""

// Clean up the fixture table.
sql "DROP TABLE IF EXISTS ${tableName}"
}
Loading