From 4db6a6c2fa4d9c6b7d13c134e95ce931bb13cda4 Mon Sep 17 00:00:00 2001
From: airborne12 <jiangkai@selectdb.com>
Date: Wed, 22 Apr 2026 14:57:45 +0800
Subject: [PATCH 1/4] [fix](function) fix tokenize function incorrect result
 when first argument is const

After unpack_if_const, the inner data column of a ColumnConst has only 1 row.
The _do_tokenize and _do_tokenize_none functions iterate based on the source
column's row count, so when the first argument is a constant (e.g.,
SELECT tokenize('hello world', 'parser=english') FROM table), only 1 output
row was produced instead of input_rows_count rows. Fix by wrapping the result
in ColumnConst when the source column was const.
---
 be/src/exprs/function/function_tokenize.cpp | 19 ++++++++++++++++---
 be/src/exprs/function/function_tokenize.h   |  2 +-
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/be/src/exprs/function/function_tokenize.cpp b/be/src/exprs/function/function_tokenize.cpp
index e8d923f981a485..0df133ba2791ee 100644
--- a/be/src/exprs/function/function_tokenize.cpp
+++ b/be/src/exprs/function/function_tokenize.cpp
@@ -30,6 +30,7 @@
 #include "core/block/block.h"
 #include "core/block/column_with_type_and_name.h"
 #include "core/column/column.h"
+#include "core/column/column_const.h"
 #include "core/data_type/data_type_nullable.h"
 #include "core/data_type/data_type_number.h"
 #include "core/string_ref.h"
@@ -134,7 +135,7 @@ void FunctionTokenize::_do_tokenize(const ColumnString& src_column_string,
 
 Status FunctionTokenize::execute_impl(FunctionContext* /*context*/, Block& block,
                                       const ColumnNumbers& arguments, uint32_t result,
-                                      size_t /*input_rows_count*/) const {
+                                      size_t input_rows_count) const {
     DCHECK_EQ(arguments.size(), 2);
     const auto& [src_column, left_const] =
             unpack_if_const(block.get_by_position(arguments[0]).column);
@@ -165,7 +166,13 @@ Status FunctionTokenize::execute_impl(FunctionContext* /*context*/, Block& block
             if (config.analyzer_name.empty() &&
                 config.parser_type == InvertedIndexParserType::PARSER_NONE) {
                 _do_tokenize_none(*col_left, dest_column_ptr);
-                block.replace_by_position(result, std::move(dest_column_ptr));
+                if (left_const) {
+                    block.replace_by_position(
+                            result,
+                            ColumnConst::create(std::move(dest_column_ptr), input_rows_count));
+                } else {
+                    block.replace_by_position(result, std::move(dest_column_ptr));
+                }
                 return Status::OK();
             }
 
@@ -196,7 +203,13 @@ Status FunctionTokenize::execute_impl(FunctionContext* /*context*/, Block& block
             analyzer_ctx.analyzer = analyzer_holder;
             _do_tokenize(*col_left, analyzer_ctx, support_phrase, dest_column_ptr);
 
-            block.replace_by_position(result, std::move(dest_column_ptr));
+            if (left_const) {
+                block.replace_by_position(
+                        result,
+                        ColumnConst::create(std::move(dest_column_ptr), input_rows_count));
+            } else {
+                block.replace_by_position(result, std::move(dest_column_ptr));
+            }
             return Status::OK();
         }
     }
diff --git a/be/src/exprs/function/function_tokenize.h b/be/src/exprs/function/function_tokenize.h
index 533b411458bf3a..01828d120a55d4 100644
--- a/be/src/exprs/function/function_tokenize.h
+++ b/be/src/exprs/function/function_tokenize.h
@@ -70,6 +70,6 @@ class FunctionTokenize : public IFunction {
     void _do_tokenize_none(const ColumnString& src_column_string,
                            const MutableColumnPtr& dest_column_ptr) const;
     Status execute_impl(FunctionContext* /*context*/, Block& block, const ColumnNumbers& arguments,
-                        uint32_t result, size_t /*input_rows_count*/) const override;
+                        uint32_t result, size_t input_rows_count) const override;
 };
 } // namespace doris

From 05e014fc037f9f3c2adefa784ad01b0e8f0faa7f Mon Sep 17 00:00:00 2001
From: airborne12 <jiangkai@selectdb.com>
Date: Mon, 25 May 2026 22:34:59 +0800
Subject: [PATCH 2/4] [fix](be) Fix clang-format issue in function_tokenize

### What problem does this PR solve?

Problem Summary: clang-format-16 reported a formatting deviation in
`function_tokenize.cpp` where the `ColumnConst::create(...)` argument fit
on the same line as `result`. Join the two arguments on a single line so
the file passes `build-support/check-format.sh`.

### Release note

None

### Check List (For Author)

- Test: No need to test (formatting-only change)
- Behavior changed: No
- Does this need documentation: No
---
 be/src/exprs/function/function_tokenize.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/be/src/exprs/function/function_tokenize.cpp b/be/src/exprs/function/function_tokenize.cpp
index 0df133ba2791ee..a3d616d635763c 100644
--- a/be/src/exprs/function/function_tokenize.cpp
+++ b/be/src/exprs/function/function_tokenize.cpp
@@ -205,8 +205,7 @@ Status FunctionTokenize::execute_impl(FunctionContext* /*context*/, Block& block
 
             if (left_const) {
                 block.replace_by_position(
-                        result,
-                        ColumnConst::create(std::move(dest_column_ptr), input_rows_count));
+                        result, ColumnConst::create(std::move(dest_column_ptr), input_rows_count));
             } else {
                 block.replace_by_position(result, std::move(dest_column_ptr));
             }

From 029e1bc8d6a920f1c1ce86993a1e677a578767a8 Mon Sep 17 00:00:00 2001
From: airborne12 <jiangkai@selectdb.com>
Date: Wed, 27 May 2026 11:55:49 +0800
Subject: [PATCH 3/4] [test](function) add regression for tokenize with const
 first arg

Adds regression coverage for the const-first-argument fix in
FunctionTokenize::execute_impl. The generic
PreparedFunctionImpl::default_implementation_for_constant_arguments
path only fires when all arguments are constant, so the original bug
example `SELECT tokenize('hello', '"parser"="english"') FROM t` never
actually reaches execute_impl. The new tests use a parser_config
column to force a non-const second argument, which is the path where
the left_const branch matters.

Both the analyzer path (parser=english) and the PARSER_NONE early
return are covered. Assertions check that row count equals
input_rows_count (was 1 before the fix) and that every row carries
the expected tokenized value.
---
 .../inverted_index_p0/test_tokenize.groovy    | 80 +++++++++++++++++++
 1 file changed, 80 insertions(+)

diff --git a/regression-test/suites/inverted_index_p0/test_tokenize.groovy b/regression-test/suites/inverted_index_p0/test_tokenize.groovy
index d0bdada2e31d7c..118b95397e5cad 100644
--- a/regression-test/suites/inverted_index_p0/test_tokenize.groovy
+++ b/regression-test/suites/inverted_index_p0/test_tokenize.groovy
@@ -128,4 +128,84 @@ suite("test_tokenize"){
     qt_tokenize_sql """SELECT TOKENIZE('1･2', '"parser"="ik","parser_mode"="ik_max_word"');"""
     qt_tokenize_sql """SELECT TOKENIZE('abcşīabc', '"parser"="ik","parser_mode"="ik_max_word"');"""
 
+    // Regression for the case where the first argument is a constant literal
+    // while the second argument is a non-const column. The generic
+    // PreparedFunctionImpl::default_implementation_for_constant_arguments path
+    // only fires when all arguments are constant, so the example from the bug
+    // description (`SELECT tokenize('hello', '"parser"="english"') FROM t`)
+    // never actually reaches FunctionTokenize::execute_impl. To exercise the
+    // left_const branch added by this fix we need a non-const second argument;
+    // we get that by reading the properties string from a table column.
+    //
+    // Before the fix _do_tokenize / _do_tokenize_none would produce a
+    // dest_column with only 1 row (because unpack_if_const unwrapped the
+    // ColumnConst of the first arg to its inner 1-row data column), so the
+    // outer block ended up with a row count of 1 instead of input_rows_count.
+    def constArgTbl = "tokenize_const_first_arg"
+    sql "DROP TABLE IF EXISTS ${constArgTbl}"
+    sql """
+    CREATE TABLE IF NOT EXISTS ${constArgTbl}(
+      `id` int NULL,
+      `parser_config` text NULL
+    ) ENGINE=OLAP
+    DUPLICATE KEY(`id`)
+    DISTRIBUTED BY HASH(`id`) BUCKETS 1
+    PROPERTIES("replication_allocation" = "tag.location.default: 1");
+    """
+    sql """INSERT INTO ${constArgTbl} VALUES
+           (1, '"parser"="english"'),
+           (2, '"parser"="english"'),
+           (3, '"parser"="english"'),
+           (4, '"parser"="english"'),
+           (5, '"parser"="english"');"""
+
+    // Row count must match the source table's row count, not collapse to 1.
+    def cntEnglish = sql """SELECT COUNT(*) FROM
+        (SELECT tokenize('hello world', parser_config) AS t FROM ${constArgTbl}) sub;"""
+    assertEquals(5L, cntEnglish[0][0])
+
+    // Every row must carry the tokenized value (and they must all be equal,
+    // since the source string is constant across rows).
+    def rowsEnglish = sql """SELECT id, tokenize('hello world', parser_config) AS t
+        FROM ${constArgTbl} ORDER BY id;"""
+    assertEquals(5, rowsEnglish.size())
+    def firstEnglishTok = rowsEnglish[0][1]
+    assertTrue(firstEnglishTok != null && firstEnglishTok.toString().contains("hello"))
+    assertTrue(firstEnglishTok.toString().contains("world"))
+    for (int i = 0; i < rowsEnglish.size(); i++) {
+        assertEquals(i + 1, rowsEnglish[i][0] as Integer)
+        assertEquals(firstEnglishTok, rowsEnglish[i][1])
+    }
+
+    // Same exercise for the PARSER_NONE branch, which is a separate code path
+    // inside execute_impl (the _do_tokenize_none early return).
+    def constArgNoneTbl = "tokenize_const_first_arg_none"
+    sql "DROP TABLE IF EXISTS ${constArgNoneTbl}"
+    sql """
+    CREATE TABLE IF NOT EXISTS ${constArgNoneTbl}(
+      `id` int NULL,
+      `parser_config` text NULL
+    ) ENGINE=OLAP
+    DUPLICATE KEY(`id`)
+    DISTRIBUTED BY HASH(`id`) BUCKETS 1
+    PROPERTIES("replication_allocation" = "tag.location.default: 1");
+    """
+    sql """INSERT INTO ${constArgNoneTbl} VALUES
+           (1, '"parser"="none"'),
+           (2, '"parser"="none"'),
+           (3, '"parser"="none"');"""
+
+    def cntNone = sql """SELECT COUNT(*) FROM
+        (SELECT tokenize('hello world', parser_config) AS t FROM ${constArgNoneTbl}) sub;"""
+    assertEquals(3L, cntNone[0][0])
+
+    def rowsNone = sql """SELECT id, tokenize('hello world', parser_config) AS t
+        FROM ${constArgNoneTbl} ORDER BY id;"""
+    assertEquals(3, rowsNone.size())
+    def firstNoneTok = rowsNone[0][1]
+    assertTrue(firstNoneTok != null && firstNoneTok.toString().contains("hello world"))
+    for (int i = 0; i < rowsNone.size(); i++) {
+        assertEquals(i + 1, rowsNone[i][0] as Integer)
+        assertEquals(firstNoneTok, rowsNone[i][1])
+    }
 }

From de5e1d7eb791a883092b1dedd870def1247e73c4 Mon Sep 17 00:00:00 2001
From: airborne12 <jiangkai@selectdb.com>
Date: Wed, 27 May 2026 13:16:35 +0800
Subject: [PATCH 4/4] [test](be) cover const-first-arg path via BE unit test

The earlier regression-test addition was invalid: FE
`Tokenize.checkLegalityBeforeTypeCoercion()` requires the second
argument to be a `StringLikeLiteral`, so the test query that read the
parser config from a table column failed analysis instead of reaching
`FunctionTokenize::execute_impl`. And when both arguments are literals
`PreparedFunctionImpl::default_implementation_for_constant_arguments`
already short-circuits with the all-const fast path, so there is no
SQL shape that exercises the new `left_const` branch.

Drop the invalid regression test and add a BE unit test that builds
the block directly: arg0 is wrapped in a `ColumnConst(size=N)` whose
inner data column has 1 row, arg1 is a regular `ColumnString` with N
rows. With that shape `all_arguments_are_constant` is false, the
generic fast path skips, `execute_impl` is reached, and
`unpack_if_const(arg0)` returns `left_const == true`. The assertions
check that the result column has N rows (was 1 before the fix) and
that every row carries the same tokenized value, for both the
analyzer branch and the `PARSER_NONE` early-return branch.
---
 .../exprs/function/function_tokenize_test.cpp | 94 +++++++++++++++++++
 .../inverted_index_p0/test_tokenize.groovy    | 80 ----------------
 2 files changed, 94 insertions(+), 80 deletions(-)

diff --git a/be/test/exprs/function/function_tokenize_test.cpp b/be/test/exprs/function/function_tokenize_test.cpp
index 322ee56c893bc1..a17974d7c228fd 100644
--- a/be/test/exprs/function/function_tokenize_test.cpp
+++ b/be/test/exprs/function/function_tokenize_test.cpp
@@ -25,6 +25,7 @@
 
 #include "core/block/block.h"
 #include "core/block/column_with_type_and_name.h"
+#include "core/column/column_const.h"
 #include "core/column/column_string.h"
 #include "core/data_type/data_type_string.h"
 #include "exprs/function/simple_function_factory.h"
@@ -203,6 +204,99 @@ TEST_F(FunctionTokenizeTest, ParserNonePropertyFormats) {
     }
 }
 
+// Regression for const-first-argument row count.
+//
+// The fix in FunctionTokenize::execute_impl wraps the dest column in a
+// ColumnConst(input_rows_count) when the first argument was already a
+// ColumnConst. The generic PreparedFunctionImpl::default_implementation_for_
+// constant_arguments path only short-circuits when *all* arguments are const,
+// so to reach the new left_const branch we need arg0 const but arg1 non-const.
+// FE rejects non-literal second arguments for tokenize, so this path is only
+// exercisable from a unit test that builds the block directly.
+TEST_F(FunctionTokenizeTest, ConstFirstArgPreservesRowCount) {
+    const size_t input_rows_count = 5;
+    const std::string input_str = "Hello World Test";
+    const std::string properties = "parser='english'";
+
+    auto input_inner = ColumnString::create();
+    input_inner->insert_data(input_str.data(), input_str.size());
+    auto input_const = ColumnConst::create(std::move(input_inner), input_rows_count);
+
+    auto properties_column = ColumnString::create();
+    for (size_t i = 0; i < input_rows_count; ++i) {
+        properties_column->insert_data(properties.data(), properties.size());
+    }
+
+    auto string_type = std::make_shared<DataTypeString>();
+
+    Block block;
+    block.insert(ColumnWithTypeAndName(std::move(input_const), string_type, "input"));
+    block.insert(ColumnWithTypeAndName(std::move(properties_column), string_type, "properties"));
+    block.insert(ColumnWithTypeAndName(string_type->create_column(), string_type, "result"));
+
+    ColumnNumbers arguments = {0, 1};
+    uint32_t result = 2;
+
+    auto status = _function->execute(nullptr, block, arguments, result, input_rows_count);
+    ASSERT_TRUE(status.ok()) << status.to_string();
+
+    auto result_column = block.get_by_position(result).column;
+    ASSERT_EQ(result_column->size(), input_rows_count);
+
+    // Source is constant, so every row must carry the same tokenized value.
+    StringRef first_token = result_column->get_data_at(0);
+    ASSERT_GT(first_token.size, 0);
+    EXPECT_NE(std::string(first_token.data, first_token.size).find("hello"), std::string::npos);
+    EXPECT_NE(std::string(first_token.data, first_token.size).find("world"), std::string::npos);
+    for (size_t i = 1; i < input_rows_count; ++i) {
+        StringRef row_i = result_column->get_data_at(i);
+        EXPECT_EQ(row_i.size, first_token.size);
+        EXPECT_EQ(memcmp(row_i.data, first_token.data, first_token.size), 0);
+    }
+}
+
+// Same check for the PARSER_NONE early-return branch in execute_impl.
+TEST_F(FunctionTokenizeTest, ConstFirstArgParserNonePreservesRowCount) {
+    const size_t input_rows_count = 4;
+    const std::string input_str = "Hello World";
+    const std::string properties = "parser='none'";
+
+    auto input_inner = ColumnString::create();
+    input_inner->insert_data(input_str.data(), input_str.size());
+    auto input_const = ColumnConst::create(std::move(input_inner), input_rows_count);
+
+    auto properties_column = ColumnString::create();
+    for (size_t i = 0; i < input_rows_count; ++i) {
+        properties_column->insert_data(properties.data(), properties.size());
+    }
+
+    auto string_type = std::make_shared<DataTypeString>();
+
+    Block block;
+    block.insert(ColumnWithTypeAndName(std::move(input_const), string_type, "input"));
+    block.insert(ColumnWithTypeAndName(std::move(properties_column), string_type, "properties"));
+    block.insert(ColumnWithTypeAndName(string_type->create_column(), string_type, "result"));
+
+    ColumnNumbers arguments = {0, 1};
+    uint32_t result = 2;
+
+    auto status = _function->execute(nullptr, block, arguments, result, input_rows_count);
+    ASSERT_TRUE(status.ok()) << status.to_string();
+
+    auto result_column = block.get_by_position(result).column;
+    ASSERT_EQ(result_column->size(), input_rows_count);
+
+    StringRef first_token = result_column->get_data_at(0);
+    const std::string expected = R"([{
+        "token": "Hello World"
+    }])";
+    EXPECT_EQ(std::string(first_token.data, first_token.size), expected);
+    for (size_t i = 1; i < input_rows_count; ++i) {
+        StringRef row_i = result_column->get_data_at(i);
+        EXPECT_EQ(std::string(row_i.data, row_i.size), expected);
+    }
+}
+
 // Test error cases
 TEST_F(FunctionTokenizeTest, InvalidParser) {
     std::vector<std::string> input_strings = {"Test String"};
diff --git a/regression-test/suites/inverted_index_p0/test_tokenize.groovy b/regression-test/suites/inverted_index_p0/test_tokenize.groovy
index 118b95397e5cad..d0bdada2e31d7c 100644
--- a/regression-test/suites/inverted_index_p0/test_tokenize.groovy
+++ b/regression-test/suites/inverted_index_p0/test_tokenize.groovy
@@ -128,84 +128,4 @@ suite("test_tokenize"){
     qt_tokenize_sql """SELECT TOKENIZE('1･2', '"parser"="ik","parser_mode"="ik_max_word"');"""
     qt_tokenize_sql """SELECT TOKENIZE('abcşīabc', '"parser"="ik","parser_mode"="ik_max_word"');"""
 
-    // Regression for the case where the first argument is a constant literal
-    // while the second argument is a non-const column. The generic
-    // PreparedFunctionImpl::default_implementation_for_constant_arguments path
-    // only fires when all arguments are constant, so the example from the bug
-    // description (`SELECT tokenize('hello', '"parser"="english"') FROM t`)
-    // never actually reaches FunctionTokenize::execute_impl. To exercise the
-    // left_const branch added by this fix we need a non-const second argument;
-    // we get that by reading the properties string from a table column.
-    //
-    // Before the fix _do_tokenize / _do_tokenize_none would produce a
-    // dest_column with only 1 row (because unpack_if_const unwrapped the
-    // ColumnConst of the first arg to its inner 1-row data column), so the
-    // outer block ended up with a row count of 1 instead of input_rows_count.
-    def constArgTbl = "tokenize_const_first_arg"
-    sql "DROP TABLE IF EXISTS ${constArgTbl}"
-    sql """
-    CREATE TABLE IF NOT EXISTS ${constArgTbl}(
-      `id` int NULL,
-      `parser_config` text NULL
-    ) ENGINE=OLAP
-    DUPLICATE KEY(`id`)
-    DISTRIBUTED BY HASH(`id`) BUCKETS 1
-    PROPERTIES("replication_allocation" = "tag.location.default: 1");
-    """
-    sql """INSERT INTO ${constArgTbl} VALUES
-           (1, '"parser"="english"'),
-           (2, '"parser"="english"'),
-           (3, '"parser"="english"'),
-           (4, '"parser"="english"'),
-           (5, '"parser"="english"');"""
-
-    // Row count must match the source table's row count, not collapse to 1.
-    def cntEnglish = sql """SELECT COUNT(*) FROM
-        (SELECT tokenize('hello world', parser_config) AS t FROM ${constArgTbl}) sub;"""
-    assertEquals(5L, cntEnglish[0][0])
-
-    // Every row must carry the tokenized value (and they must all be equal,
-    // since the source string is constant across rows).
-    def rowsEnglish = sql """SELECT id, tokenize('hello world', parser_config) AS t
-        FROM ${constArgTbl} ORDER BY id;"""
-    assertEquals(5, rowsEnglish.size())
-    def firstEnglishTok = rowsEnglish[0][1]
-    assertTrue(firstEnglishTok != null && firstEnglishTok.toString().contains("hello"))
-    assertTrue(firstEnglishTok.toString().contains("world"))
-    for (int i = 0; i < rowsEnglish.size(); i++) {
-        assertEquals(i + 1, rowsEnglish[i][0] as Integer)
-        assertEquals(firstEnglishTok, rowsEnglish[i][1])
-    }
-
-    // Same exercise for the PARSER_NONE branch, which is a separate code path
-    // inside execute_impl (the _do_tokenize_none early return).
-    def constArgNoneTbl = "tokenize_const_first_arg_none"
-    sql "DROP TABLE IF EXISTS ${constArgNoneTbl}"
-    sql """
-    CREATE TABLE IF NOT EXISTS ${constArgNoneTbl}(
-      `id` int NULL,
-      `parser_config` text NULL
-    ) ENGINE=OLAP
-    DUPLICATE KEY(`id`)
-    DISTRIBUTED BY HASH(`id`) BUCKETS 1
-    PROPERTIES("replication_allocation" = "tag.location.default: 1");
-    """
-    sql """INSERT INTO ${constArgNoneTbl} VALUES
-           (1, '"parser"="none"'),
-           (2, '"parser"="none"'),
-           (3, '"parser"="none"');"""
-
-    def cntNone = sql """SELECT COUNT(*) FROM
-        (SELECT tokenize('hello world', parser_config) AS t FROM ${constArgNoneTbl}) sub;"""
-    assertEquals(3L, cntNone[0][0])
-
-    def rowsNone = sql """SELECT id, tokenize('hello world', parser_config) AS t
-        FROM ${constArgNoneTbl} ORDER BY id;"""
-    assertEquals(3, rowsNone.size())
-    def firstNoneTok = rowsNone[0][1]
-    assertTrue(firstNoneTok != null && firstNoneTok.toString().contains("hello world"))
-    for (int i = 0; i < rowsNone.size(); i++) {
-        assertEquals(i + 1, rowsNone[i][0] as Integer)
-        assertEquals(firstNoneTok, rowsNone[i][1])
-    }
 }