From 4db6a6c2fa4d9c6b7d13c134e95ce931bb13cda4 Mon Sep 17 00:00:00 2001 From: airborne12 Date: Wed, 22 Apr 2026 14:57:45 +0800 Subject: [PATCH 1/4] [fix](function) fix tokenize function incorrect result when first argument is const After unpack_if_const, the inner data column of a ColumnConst has only 1 row. The _do_tokenize and _do_tokenize_none functions iterate based on the source column's row count, so when the first argument is a constant (e.g., SELECT tokenize('hello world', 'parser=english') FROM table), only 1 output row was produced instead of input_rows_count rows. Fix by wrapping the result in ColumnConst when the source column was const. --- be/src/exprs/function/function_tokenize.cpp | 19 ++++++++++++++++--- be/src/exprs/function/function_tokenize.h | 2 +- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/be/src/exprs/function/function_tokenize.cpp b/be/src/exprs/function/function_tokenize.cpp index e8d923f981a485..0df133ba2791ee 100644 --- a/be/src/exprs/function/function_tokenize.cpp +++ b/be/src/exprs/function/function_tokenize.cpp @@ -30,6 +30,7 @@ #include "core/block/block.h" #include "core/block/column_with_type_and_name.h" #include "core/column/column.h" +#include "core/column/column_const.h" #include "core/data_type/data_type_nullable.h" #include "core/data_type/data_type_number.h" #include "core/string_ref.h" @@ -134,7 +135,7 @@ void FunctionTokenize::_do_tokenize(const ColumnString& src_column_string, Status FunctionTokenize::execute_impl(FunctionContext* /*context*/, Block& block, const ColumnNumbers& arguments, uint32_t result, - size_t /*input_rows_count*/) const { + size_t input_rows_count) const { DCHECK_EQ(arguments.size(), 2); const auto& [src_column, left_const] = unpack_if_const(block.get_by_position(arguments[0]).column); @@ -165,7 +166,13 @@ Status FunctionTokenize::execute_impl(FunctionContext* /*context*/, Block& block if (config.analyzer_name.empty() && config.parser_type == InvertedIndexParserType::PARSER_NONE) { _do_tokenize_none(*col_left, dest_column_ptr); - block.replace_by_position(result, std::move(dest_column_ptr)); + if (left_const) { + block.replace_by_position( + result, + ColumnConst::create(std::move(dest_column_ptr), input_rows_count)); + } else { + block.replace_by_position(result, std::move(dest_column_ptr)); + } return Status::OK(); } @@ -196,7 +203,13 @@ Status FunctionTokenize::execute_impl(FunctionContext* /*context*/, Block& block analyzer_ctx.analyzer = analyzer_holder; _do_tokenize(*col_left, analyzer_ctx, support_phrase, dest_column_ptr); - block.replace_by_position(result, std::move(dest_column_ptr)); + if (left_const) { + block.replace_by_position( + result, + ColumnConst::create(std::move(dest_column_ptr), input_rows_count)); + } else { + block.replace_by_position(result, std::move(dest_column_ptr)); + } return Status::OK(); } } diff --git a/be/src/exprs/function/function_tokenize.h b/be/src/exprs/function/function_tokenize.h index 533b411458bf3a..01828d120a55d4 100644 --- a/be/src/exprs/function/function_tokenize.h +++ b/be/src/exprs/function/function_tokenize.h @@ -70,6 +70,6 @@ class FunctionTokenize : public IFunction { void _do_tokenize_none(const ColumnString& src_column_string, const MutableColumnPtr& dest_column_ptr) const; Status execute_impl(FunctionContext* /*context*/, Block& block, const ColumnNumbers& arguments, - uint32_t result, size_t /*input_rows_count*/) const override; + uint32_t result, size_t input_rows_count) const override; }; } // namespace doris From 05e014fc037f9f3c2adefa784ad01b0e8f0faa7f Mon Sep 17 00:00:00 2001 From: airborne12 Date: Mon, 25 May 2026 22:34:59 +0800 Subject: [PATCH 2/4] [fix](be) Fix clang-format issue in function_tokenize ### What problem does this PR solve? Problem Summary: clang-format-16 reported a formatting deviation in `function_tokenize.cpp` where the `ColumnConst::create(...)` argument fit on the same line as `result`. Join the two arguments on a single line so the file passes `build-support/check-format.sh`. ### Release note None ### Check List (For Author) - Test: No need to test (formatting-only change) - Behavior changed: No - Does this need documentation: No --- be/src/exprs/function/function_tokenize.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/be/src/exprs/function/function_tokenize.cpp b/be/src/exprs/function/function_tokenize.cpp index 0df133ba2791ee..a3d616d635763c 100644 --- a/be/src/exprs/function/function_tokenize.cpp +++ b/be/src/exprs/function/function_tokenize.cpp @@ -205,8 +205,7 @@ Status FunctionTokenize::execute_impl(FunctionContext* /*context*/, Block& block if (left_const) { block.replace_by_position( - result, - ColumnConst::create(std::move(dest_column_ptr), input_rows_count)); + result, ColumnConst::create(std::move(dest_column_ptr), input_rows_count)); } else { block.replace_by_position(result, std::move(dest_column_ptr)); } From 029e1bc8d6a920f1c1ce86993a1e677a578767a8 Mon Sep 17 00:00:00 2001 From: airborne12 Date: Wed, 27 May 2026 11:55:49 +0800 Subject: [PATCH 3/4] [test](function) add regression for tokenize with const first arg Adds regression coverage for the const-first-argument fix in FunctionTokenize::execute_impl. The generic PreparedFunctionImpl::default_implementation_for_constant_arguments path only fires when all arguments are constant, so the original bug example `SELECT tokenize('hello', '"parser"="english"') FROM t` never actually reaches execute_impl. The new tests use a parser_config column to force a non-const second argument, which is the path where the left_const branch matters. Both the analyzer path (parser=english) and the PARSER_NONE early return are covered. Assertions check that row count equals input_rows_count (was 1 before the fix) and that every row carries the expected tokenized value. --- .../inverted_index_p0/test_tokenize.groovy | 80 +++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/regression-test/suites/inverted_index_p0/test_tokenize.groovy b/regression-test/suites/inverted_index_p0/test_tokenize.groovy index d0bdada2e31d7c..118b95397e5cad 100644 --- a/regression-test/suites/inverted_index_p0/test_tokenize.groovy +++ b/regression-test/suites/inverted_index_p0/test_tokenize.groovy @@ -128,4 +128,84 @@ suite("test_tokenize"){ qt_tokenize_sql """SELECT TOKENIZE('1・2', '"parser"="ik","parser_mode"="ik_max_word"');""" qt_tokenize_sql """SELECT TOKENIZE('abcşīabc', '"parser"="ik","parser_mode"="ik_max_word"');""" + // Regression for the case where the first argument is a constant literal + // while the second argument is a non-const column. The generic + // PreparedFunctionImpl::default_implementation_for_constant_arguments path + // only fires when all arguments are constant, so the example from the bug + // description (`SELECT tokenize('hello', '"parser"="english"') FROM t`) + // never actually reaches FunctionTokenize::execute_impl. To exercise the + // left_const branch added by this fix we need a non-const second argument; + // we get that by reading the properties string from a table column. + // + // Before the fix _do_tokenize / _do_tokenize_none would produce a + // dest_column with only 1 row (because unpack_if_const unwrapped the + // ColumnConst of the first arg to its inner 1-row data column), so the + // outer block ended up with a row count of 1 instead of input_rows_count. + def constArgTbl = "tokenize_const_first_arg" + sql "DROP TABLE IF EXISTS ${constArgTbl}" + sql """ + CREATE TABLE IF NOT EXISTS ${constArgTbl}( + `id` int NULL, + `parser_config` text NULL + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES("replication_allocation" = "tag.location.default: 1"); + """ + sql """INSERT INTO ${constArgTbl} VALUES + (1, '"parser"="english"'), + (2, '"parser"="english"'), + (3, '"parser"="english"'), + (4, '"parser"="english"'), + (5, '"parser"="english"');""" + + // Row count must match the source table's row count, not collapse to 1. + def cntEnglish = sql """SELECT COUNT(*) FROM + (SELECT tokenize('hello world', parser_config) AS t FROM ${constArgTbl}) sub;""" + assertEquals(5L, cntEnglish[0][0]) + + // Every row must carry the tokenized value (and they must all be equal, + // since the source string is constant across rows). + def rowsEnglish = sql """SELECT id, tokenize('hello world', parser_config) AS t + FROM ${constArgTbl} ORDER BY id;""" + assertEquals(5, rowsEnglish.size()) + def firstEnglishTok = rowsEnglish[0][1] + assertTrue(firstEnglishTok != null && firstEnglishTok.toString().contains("hello")) + assertTrue(firstEnglishTok.toString().contains("world")) + for (int i = 0; i < rowsEnglish.size(); i++) { + assertEquals(i + 1, rowsEnglish[i][0] as Integer) + assertEquals(firstEnglishTok, rowsEnglish[i][1]) + } + + // Same exercise for the PARSER_NONE branch, which is a separate code path + // inside execute_impl (the _do_tokenize_none early return). + def constArgNoneTbl = "tokenize_const_first_arg_none" + sql "DROP TABLE IF EXISTS ${constArgNoneTbl}" + sql """ + CREATE TABLE IF NOT EXISTS ${constArgNoneTbl}( + `id` int NULL, + `parser_config` text NULL + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES("replication_allocation" = "tag.location.default: 1"); + """ + sql """INSERT INTO ${constArgNoneTbl} VALUES + (1, '"parser"="none"'), + (2, '"parser"="none"'), + (3, '"parser"="none"');""" + + def cntNone = sql """SELECT COUNT(*) FROM + (SELECT tokenize('hello world', parser_config) AS t FROM ${constArgNoneTbl}) sub;""" + assertEquals(3L, cntNone[0][0]) + + def rowsNone = sql """SELECT id, tokenize('hello world', parser_config) AS t + FROM ${constArgNoneTbl} ORDER BY id;""" + assertEquals(3, rowsNone.size()) + def firstNoneTok = rowsNone[0][1] + assertTrue(firstNoneTok != null && firstNoneTok.toString().contains("hello world")) + for (int i = 0; i < rowsNone.size(); i++) { + assertEquals(i + 1, rowsNone[i][0] as Integer) + assertEquals(firstNoneTok, rowsNone[i][1]) + } } From de5e1d7eb791a883092b1dedd870def1247e73c4 Mon Sep 17 00:00:00 2001 From: airborne12 Date: Wed, 27 May 2026 13:16:35 +0800 Subject: [PATCH 4/4] [test](be) cover const-first-arg path via BE unit test The earlier regression-test addition was invalid: FE `Tokenize.checkLegalityBeforeTypeCoercion()` requires the second argument to be a `StringLikeLiteral`, so the test query that read the parser config from a table column failed analysis instead of reaching `FunctionTokenize::execute_impl`. And when both arguments are literals `PreparedFunctionImpl::default_implementation_for_constant_arguments` already short-circuits with the all-const fast path, so there is no SQL shape that exercises the new `left_const` branch. Drop the invalid regression test and add a BE unit test that builds the block directly: arg0 is wrapped in a `ColumnConst(size=N)` whose inner data column has 1 row, arg1 is a regular `ColumnString` with N rows. With that shape `all_arguments_are_constant` is false, the generic fast path skips, `execute_impl` is reached, and `unpack_if_const(arg0)` returns `left_const == true`. The assertions check that the result column has N rows (was 1 before the fix) and that every row carries the same tokenized value, for both the analyzer branch and the `PARSER_NONE` early-return branch. --- .../exprs/function/function_tokenize_test.cpp | 94 +++++++++++++++++++ .../inverted_index_p0/test_tokenize.groovy | 80 ---------------- 2 files changed, 94 insertions(+), 80 deletions(-) diff --git a/be/test/exprs/function/function_tokenize_test.cpp b/be/test/exprs/function/function_tokenize_test.cpp index 322ee56c893bc1..a17974d7c228fd 100644 --- a/be/test/exprs/function/function_tokenize_test.cpp +++ b/be/test/exprs/function/function_tokenize_test.cpp @@ -25,6 +25,7 @@ #include "core/block/block.h" #include "core/block/column_with_type_and_name.h" +#include "core/column/column_const.h" #include "core/column/column_string.h" #include "core/data_type/data_type_string.h" #include "exprs/function/simple_function_factory.h" @@ -203,6 +204,99 @@ TEST_F(FunctionTokenizeTest, ParserNonePropertyFormats) { } } +// Regression for const-first-argument row count. +// +// The fix in FunctionTokenize::execute_impl wraps the dest column in a +// ColumnConst(input_rows_count) when the first argument was already a +// ColumnConst. The generic PreparedFunctionImpl::default_implementation_for_ +// constant_arguments path only short-circuits when *all* arguments are const, +// so to reach the new left_const branch we need arg0 const but arg1 non-const. +// FE rejects non-literal second arguments for tokenize, so this path is only +// exercisable from a unit test that builds the block directly. +TEST_F(FunctionTokenizeTest, ConstFirstArgPreservesRowCount) { + const size_t input_rows_count = 5; + const std::string input_str = "Hello World Test"; + const std::string properties = "parser='english'"; + + auto input_inner = ColumnString::create(); + input_inner->insert_data(input_str.data(), input_str.size()); + auto input_const = ColumnConst::create(std::move(input_inner), input_rows_count); + + auto properties_column = ColumnString::create(); + for (size_t i = 0; i < input_rows_count; ++i) { + properties_column->insert_data(properties.data(), properties.size()); + } + + auto string_type = std::make_shared(); + + Block block; + block.insert(ColumnWithTypeAndName(std::move(input_const), string_type, "input")); + block.insert(ColumnWithTypeAndName(std::move(properties_column), string_type, "properties")); + block.insert(ColumnWithTypeAndName(string_type->create_column(), string_type, "result")); + + ColumnNumbers arguments = {0, 1}; + uint32_t result = 2; + + auto status = _function->execute(nullptr, block, arguments, result, input_rows_count); + ASSERT_TRUE(status.ok()) << status.to_string(); + + auto result_column = block.get_by_position(result).column; + ASSERT_EQ(result_column->size(), input_rows_count); + + // Source is constant, so every row must carry the same tokenized value. + StringRef first_token = result_column->get_data_at(0); + ASSERT_GT(first_token.size, 0); + EXPECT_NE(std::string(first_token.data, first_token.size).find("hello"), std::string::npos); + EXPECT_NE(std::string(first_token.data, first_token.size).find("world"), std::string::npos); + for (size_t i = 1; i < input_rows_count; ++i) { + StringRef row_i = result_column->get_data_at(i); + EXPECT_EQ(row_i.size, first_token.size); + EXPECT_EQ(memcmp(row_i.data, first_token.data, first_token.size), 0); + } +} + +// Same check for the PARSER_NONE early-return branch in execute_impl. +TEST_F(FunctionTokenizeTest, ConstFirstArgParserNonePreservesRowCount) { + const size_t input_rows_count = 4; + const std::string input_str = "Hello World"; + const std::string properties = "parser='none'"; + + auto input_inner = ColumnString::create(); + input_inner->insert_data(input_str.data(), input_str.size()); + auto input_const = ColumnConst::create(std::move(input_inner), input_rows_count); + + auto properties_column = ColumnString::create(); + for (size_t i = 0; i < input_rows_count; ++i) { + properties_column->insert_data(properties.data(), properties.size()); + } + + auto string_type = std::make_shared(); + + Block block; + block.insert(ColumnWithTypeAndName(std::move(input_const), string_type, "input")); + block.insert(ColumnWithTypeAndName(std::move(properties_column), string_type, "properties")); + block.insert(ColumnWithTypeAndName(string_type->create_column(), string_type, "result")); + + ColumnNumbers arguments = {0, 1}; + uint32_t result = 2; + + auto status = _function->execute(nullptr, block, arguments, result, input_rows_count); + ASSERT_TRUE(status.ok()) << status.to_string(); + + auto result_column = block.get_by_position(result).column; + ASSERT_EQ(result_column->size(), input_rows_count); + + StringRef first_token = result_column->get_data_at(0); + const std::string expected = R"([{ + "token": "Hello World" + }])"; + EXPECT_EQ(std::string(first_token.data, first_token.size), expected); + for (size_t i = 1; i < input_rows_count; ++i) { + StringRef row_i = result_column->get_data_at(i); + EXPECT_EQ(std::string(row_i.data, row_i.size), expected); + } +} + // Test error cases TEST_F(FunctionTokenizeTest, InvalidParser) { std::vector input_strings = {"Test String"}; diff --git a/regression-test/suites/inverted_index_p0/test_tokenize.groovy b/regression-test/suites/inverted_index_p0/test_tokenize.groovy index 118b95397e5cad..d0bdada2e31d7c 100644 --- a/regression-test/suites/inverted_index_p0/test_tokenize.groovy +++ b/regression-test/suites/inverted_index_p0/test_tokenize.groovy @@ -128,84 +128,4 @@ suite("test_tokenize"){ qt_tokenize_sql """SELECT TOKENIZE('1・2', '"parser"="ik","parser_mode"="ik_max_word"');""" qt_tokenize_sql """SELECT TOKENIZE('abcşīabc', '"parser"="ik","parser_mode"="ik_max_word"');""" - // Regression for the case where the first argument is a constant literal - // while the second argument is a non-const column. The generic - // PreparedFunctionImpl::default_implementation_for_constant_arguments path - // only fires when all arguments are constant, so the example from the bug - // description (`SELECT tokenize('hello', '"parser"="english"') FROM t`) - // never actually reaches FunctionTokenize::execute_impl. To exercise the - // left_const branch added by this fix we need a non-const second argument; - // we get that by reading the properties string from a table column. - // - // Before the fix _do_tokenize / _do_tokenize_none would produce a - // dest_column with only 1 row (because unpack_if_const unwrapped the - // ColumnConst of the first arg to its inner 1-row data column), so the - // outer block ended up with a row count of 1 instead of input_rows_count. - def constArgTbl = "tokenize_const_first_arg" - sql "DROP TABLE IF EXISTS ${constArgTbl}" - sql """ - CREATE TABLE IF NOT EXISTS ${constArgTbl}( - `id` int NULL, - `parser_config` text NULL - ) ENGINE=OLAP - DUPLICATE KEY(`id`) - DISTRIBUTED BY HASH(`id`) BUCKETS 1 - PROPERTIES("replication_allocation" = "tag.location.default: 1"); - """ - sql """INSERT INTO ${constArgTbl} VALUES - (1, '"parser"="english"'), - (2, '"parser"="english"'), - (3, '"parser"="english"'), - (4, '"parser"="english"'), - (5, '"parser"="english"');""" - - // Row count must match the source table's row count, not collapse to 1. - def cntEnglish = sql """SELECT COUNT(*) FROM - (SELECT tokenize('hello world', parser_config) AS t FROM ${constArgTbl}) sub;""" - assertEquals(5L, cntEnglish[0][0]) - - // Every row must carry the tokenized value (and they must all be equal, - // since the source string is constant across rows). - def rowsEnglish = sql """SELECT id, tokenize('hello world', parser_config) AS t - FROM ${constArgTbl} ORDER BY id;""" - assertEquals(5, rowsEnglish.size()) - def firstEnglishTok = rowsEnglish[0][1] - assertTrue(firstEnglishTok != null && firstEnglishTok.toString().contains("hello")) - assertTrue(firstEnglishTok.toString().contains("world")) - for (int i = 0; i < rowsEnglish.size(); i++) { - assertEquals(i + 1, rowsEnglish[i][0] as Integer) - assertEquals(firstEnglishTok, rowsEnglish[i][1]) - } - - // Same exercise for the PARSER_NONE branch, which is a separate code path - // inside execute_impl (the _do_tokenize_none early return). - def constArgNoneTbl = "tokenize_const_first_arg_none" - sql "DROP TABLE IF EXISTS ${constArgNoneTbl}" - sql """ - CREATE TABLE IF NOT EXISTS ${constArgNoneTbl}( - `id` int NULL, - `parser_config` text NULL - ) ENGINE=OLAP - DUPLICATE KEY(`id`) - DISTRIBUTED BY HASH(`id`) BUCKETS 1 - PROPERTIES("replication_allocation" = "tag.location.default: 1"); - """ - sql """INSERT INTO ${constArgNoneTbl} VALUES - (1, '"parser"="none"'), - (2, '"parser"="none"'), - (3, '"parser"="none"');""" - - def cntNone = sql """SELECT COUNT(*) FROM - (SELECT tokenize('hello world', parser_config) AS t FROM ${constArgNoneTbl}) sub;""" - assertEquals(3L, cntNone[0][0]) - - def rowsNone = sql """SELECT id, tokenize('hello world', parser_config) AS t - FROM ${constArgNoneTbl} ORDER BY id;""" - assertEquals(3, rowsNone.size()) - def firstNoneTok = rowsNone[0][1] - assertTrue(firstNoneTok != null && firstNoneTok.toString().contains("hello world")) - for (int i = 0; i < rowsNone.size(); i++) { - assertEquals(i + 1, rowsNone[i][0] as Integer) - assertEquals(firstNoneTok, rowsNone[i][1]) - } }