Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 15 additions & 3 deletions be/src/exprs/function/function_tokenize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include "core/block/block.h"
#include "core/block/column_with_type_and_name.h"
#include "core/column/column.h"
#include "core/column/column_const.h"
#include "core/data_type/data_type_nullable.h"
#include "core/data_type/data_type_number.h"
#include "core/string_ref.h"
Expand Down Expand Up @@ -134,7 +135,7 @@ void FunctionTokenize::_do_tokenize(const ColumnString& src_column_string,

Status FunctionTokenize::execute_impl(FunctionContext* /*context*/, Block& block,
const ColumnNumbers& arguments, uint32_t result,
size_t /*input_rows_count*/) const {
size_t input_rows_count) const {
DCHECK_EQ(arguments.size(), 2);
const auto& [src_column, left_const] =
unpack_if_const(block.get_by_position(arguments[0]).column);
Expand Down Expand Up @@ -165,7 +166,13 @@ Status FunctionTokenize::execute_impl(FunctionContext* /*context*/, Block& block
if (config.analyzer_name.empty() &&
config.parser_type == InvertedIndexParserType::PARSER_NONE) {
_do_tokenize_none(*col_left, dest_column_ptr);
block.replace_by_position(result, std::move(dest_column_ptr));
if (left_const) {
block.replace_by_position(
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This behavior change needs regression coverage. The PR description uses SELECT tokenize('hello world', 'parser=english') FROM table_with_many_rows, but when both arguments are literals the generic PreparedFunctionImpl::default_implementation_for_constant_arguments should unwrap, execute one row, and wrap the result as a ColumnConst before this left_const branch is reached. Please add a regression test that fails before this patch and exercises the actual intended path, with multiple input rows and deterministic expected output/row count, so we can verify this branch fixes a real user-visible case rather than an untested lower-level edge case.

result,
ColumnConst::create(std::move(dest_column_ptr), input_rows_count));
} else {
block.replace_by_position(result, std::move(dest_column_ptr));
}
return Status::OK();
}

Expand Down Expand Up @@ -196,7 +203,12 @@ Status FunctionTokenize::execute_impl(FunctionContext* /*context*/, Block& block
analyzer_ctx.analyzer = analyzer_holder;
_do_tokenize(*col_left, analyzer_ctx, support_phrase, dest_column_ptr);

block.replace_by_position(result, std::move(dest_column_ptr));
if (left_const) {
block.replace_by_position(
result, ColumnConst::create(std::move(dest_column_ptr), input_rows_count));
} else {
block.replace_by_position(result, std::move(dest_column_ptr));
}
return Status::OK();
}
}
Expand Down
2 changes: 1 addition & 1 deletion be/src/exprs/function/function_tokenize.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,6 @@ class FunctionTokenize : public IFunction {
void _do_tokenize_none(const ColumnString& src_column_string,
const MutableColumnPtr& dest_column_ptr) const;
Status execute_impl(FunctionContext* /*context*/, Block& block, const ColumnNumbers& arguments,
uint32_t result, size_t /*input_rows_count*/) const override;
uint32_t result, size_t input_rows_count) const override;
};
} // namespace doris
94 changes: 94 additions & 0 deletions be/test/exprs/function/function_tokenize_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

#include "core/block/block.h"
#include "core/block/column_with_type_and_name.h"
#include "core/column/column_const.h"
#include "core/column/column_string.h"
#include "core/data_type/data_type_string.h"
#include "exprs/function/simple_function_factory.h"
Expand Down Expand Up @@ -203,6 +204,99 @@ TEST_F(FunctionTokenizeTest, ParserNonePropertyFormats) {
}
}

// Regression for const-first-argument row count.
//
// The fix in FunctionTokenize::execute_impl wraps the dest column in a
// ColumnConst(input_rows_count) when the first argument was already a
// ColumnConst. The generic PreparedFunctionImpl::default_implementation_for_
// constant_arguments path only short-circuits when *all* arguments are const,
// so to reach the new left_const branch we need arg0 const but arg1 non-const.
// FE rejects non-literal second arguments for tokenize, so this path is only
// exercisable from a unit test that builds the block directly.
TEST_F(FunctionTokenizeTest, ConstFirstArgPreservesRowCount) {
const size_t input_rows_count = 5;
const std::string input_str = "Hello World Test";
const std::string properties = "parser='english'";

auto input_inner = ColumnString::create();
input_inner->insert_data(input_str.data(), input_str.size());
auto input_const = ColumnConst::create(std::move(input_inner), input_rows_count);

auto properties_column = ColumnString::create();
for (size_t i = 0; i < input_rows_count; ++i) {
properties_column->insert_data(properties.data(), properties.size());
}

auto string_type = std::make_shared<DataTypeString>();

Block block;
block.insert(ColumnWithTypeAndName(std::move(input_const), string_type, "input"));
block.insert(ColumnWithTypeAndName(std::move(properties_column), string_type, "properties"));
block.insert(ColumnWithTypeAndName(string_type->create_column(), string_type, "result"));

ColumnNumbers arguments = {0, 1};
uint32_t result = 2;

auto status = _function->execute(nullptr, block, arguments, result, input_rows_count);
ASSERT_TRUE(status.ok()) << status.to_string();

auto result_column = block.get_by_position(result).column;
ASSERT_EQ(result_column->size(), input_rows_count);

// Source is constant, so every row must carry the same tokenized value.
StringRef first_token = result_column->get_data_at(0);
ASSERT_GT(first_token.size, 0);
EXPECT_NE(std::string(first_token.data, first_token.size).find("hello"), std::string::npos);
EXPECT_NE(std::string(first_token.data, first_token.size).find("world"), std::string::npos);
for (size_t i = 1; i < input_rows_count; ++i) {
StringRef row_i = result_column->get_data_at(i);
EXPECT_EQ(row_i.size, first_token.size);
EXPECT_EQ(memcmp(row_i.data, first_token.data, first_token.size), 0);
}
}

// Same check for the PARSER_NONE early-return branch in execute_impl.
TEST_F(FunctionTokenizeTest, ConstFirstArgParserNonePreservesRowCount) {
const size_t input_rows_count = 4;
const std::string input_str = "Hello World";
const std::string properties = "parser='none'";

auto input_inner = ColumnString::create();
input_inner->insert_data(input_str.data(), input_str.size());
auto input_const = ColumnConst::create(std::move(input_inner), input_rows_count);

auto properties_column = ColumnString::create();
for (size_t i = 0; i < input_rows_count; ++i) {
properties_column->insert_data(properties.data(), properties.size());
}

auto string_type = std::make_shared<DataTypeString>();

Block block;
block.insert(ColumnWithTypeAndName(std::move(input_const), string_type, "input"));
block.insert(ColumnWithTypeAndName(std::move(properties_column), string_type, "properties"));
block.insert(ColumnWithTypeAndName(string_type->create_column(), string_type, "result"));

ColumnNumbers arguments = {0, 1};
uint32_t result = 2;

auto status = _function->execute(nullptr, block, arguments, result, input_rows_count);
ASSERT_TRUE(status.ok()) << status.to_string();

auto result_column = block.get_by_position(result).column;
ASSERT_EQ(result_column->size(), input_rows_count);

StringRef first_token = result_column->get_data_at(0);
const std::string expected = R"([{
"token": "Hello World"
}])";
EXPECT_EQ(std::string(first_token.data, first_token.size), expected);
for (size_t i = 1; i < input_rows_count; ++i) {
StringRef row_i = result_column->get_data_at(i);
EXPECT_EQ(std::string(row_i.data, row_i.size), expected);
}
}

// Test error cases
TEST_F(FunctionTokenizeTest, InvalidParser) {
std::vector<std::string> input_strings = {"Test String"};
Expand Down
Loading