Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions be/src/exprs/function/function_variant_element.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,15 @@ class FunctionVariantElement : public IFunction {
}
break;
}
case simdjson::ondemand::json_type::string: {
    // Store the decoded string content instead of the JSON token.
    // simdjson::to_json_string would retain the enclosing double quotes
    // (e.g. "2026-05-20"); those would leak into the result and make a
    // scalar-string variant disagree with a structured one.
    const std::string_view unquoted = value.get_string().value();
    column->insert_data(unquoted.data(), unquoted.size());
    break;
}
default: {
auto value_str = simdjson::to_json_string(value).value();
column->insert_data(value_str.data(), value_str.length());
Expand Down
38 changes: 38 additions & 0 deletions be/test/exprs/function/function_variant_element_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,4 +61,42 @@ TEST(function_variant_element_test, extract_from_sparse_column) {
EXPECT_EQ(result_string, "{\"age\":\"John\",\"name\":\"John\"}");
}

// CIR-20498: extracting a string property from a scalar-string-root variant
// (the shape produced by `cast(text as variant)`) must return the raw string,
// not its JSON token with surrounding double quotes.
TEST(function_variant_element_test, extract_string_from_scalar_root) {
    // Build a one-row variant whose root column holds the JSON document as text.
    auto variant_column = ColumnVariant::create(0 /*max_subcolumns_count*/, false);
    auto root_column = ColumnString::create();
    std::string doc = R"({"wsn":"SRFSPXFDVY","uploadTimeValue":"2026-05-20 18:40:02","n":49.98})";
    root_column->insert_data(doc.data(), doc.size());
    variant_column->create_root(std::make_shared<DataTypeString>(), std::move(root_column));
    variant_column->set_num_rows(1);
    ASSERT_TRUE(variant_column->is_scalar_variant());

    DataTypeSerDe::FormatOptions options;
    auto tz = cctz::utc_time_zone();
    options.timezone = &tz;

    // Extracts `key` from the single-row variant and returns row 0 of the
    // result serialized to a string. Returns an empty string when the
    // extraction itself fails (the failure is already recorded by EXPECT_TRUE).
    auto extract = [&](const std::string& key) -> std::string {
        ColumnPtr index_inner = ColumnString::create();
        assert_cast<ColumnString*>(index_inner->assert_mutable().get())
                ->insert_data(key.data(), key.size());
        ColumnPtr index_column = ColumnConst::create(index_inner, 1);
        ColumnPtr result;
        auto status =
                FunctionVariantElement::get_element_column(*variant_column, index_column, &result);
        EXPECT_TRUE(status.ok());
        if (!status.ok()) {
            // ASSERT_* cannot be used in a lambda that returns a value, so bail
            // out by hand: `result` may not have been populated, and
            // dereferencing it would crash the test binary instead of
            // reporting a normal failure.
            return {};
        }
        std::string out;
        assert_cast<const ColumnVariant&>(*result.get())
                .serialize_one_row_to_string(0, &out, options);
        return out;
    };

    // string values: no surrounding quotes
    EXPECT_EQ(extract("wsn"), "SRFSPXFDVY");
    EXPECT_EQ(extract("uploadTimeValue"), "2026-05-20 18:40:02");
    // non-string scalars keep their JSON representation
    EXPECT_EQ(extract("n"), "49.98");
}

} // namespace doris
2 changes: 1 addition & 1 deletion regression-test/data/variant_p0/sql/select_from_value.out
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !select_from_value --
"b"
b

28 changes: 28 additions & 0 deletions regression-test/suites/variant_p0/element_function.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,32 @@ suite("regression_test_variant_element_at", "p0") {

sql """insert into element_fn_test values (1, '{"arr1" : [1, 2, 3]}', '{"arr2" : [4, 5, 6]}')"""
qt_sql """select array_first((x,y) -> (x - y) < 0, cast(v['arr1'] as array<int>), cast(v1['arr2'] as array<int>)) from element_fn_test"""

// CIR-20498: extracting a string property from a scalar-string variant
// (e.g. `cast(text as variant)['key']`) must not leak the surrounding JSON
// double quotes. The root of such a variant is a raw JSON string, so the
// extraction goes through the simdjson document path; a string value must be
// returned unescaped, consistently with the structured-subcolumn path.
def scalar = sql """select cast('{"wsn":"SRFSPXFDVY","uploadTimeValue":"2026-05-20 18:40:02"}' as variant)['wsn']"""
assertEquals("SRFSPXFDVY", scalar[0][0])

def sub = sql """select substring(cast('{"uploadTimeValue":"2026-05-20 18:40:02"}' as variant)['uploadTimeValue'], 1, 10)"""
assertEquals("2026-05-20", sub[0][0])

// values containing escaped characters must be unescaped, not kept as raw JSON tokens
def escaped = sql """select cast('{"k":"a\\\\"b"}' as variant)['k']"""
assertEquals("a\"b", escaped[0][0])

// non-string scalars keep their existing JSON representation
def num = sql """select cast('{"n":49.98}' as variant)['n']"""
assertEquals("49.98", num[0][0])

// array / object values must keep their JSON text representation (no unquoting):
// only the top-level string scalar is unquoted; quotes nested inside JSON are
// part of the value and must be preserved.
def arr = sql """select cast('{"a":[1,2,3]}' as variant)['a']"""
assertEquals("[1,2,3]", arr[0][0])

def obj = sql """select cast('{"o":{"name":"john"}}' as variant)['o']"""
assertEquals('{"name":"john"}', obj[0][0])
}
Loading