From feaa5f6a4251f53b0939b5df96afe5ecae5140f6 Mon Sep 17 00:00:00 2001 From: csun5285 Date: Thu, 4 Jun 2026 12:15:42 +0800 Subject: [PATCH] [fix](variant) return raw string for element_at on scalar-string variant When extracting a string property from a scalar-string variant (the shape produced by `cast(text as variant)`), `element_at` goes through the simdjson document path, which stored `simdjson::to_json_string(value)` for the extracted value. For a JSON string that representation keeps the surrounding double quotes (e.g. `"2026-05-20 18:40:02"`), which leaked into the result and made scalar-string variants inconsistent with the structured-subcolumn path, which returns the string unquoted. This also broke downstream string ops: for example, `substring(v['k'], 1, 10)` consumed the leading quote. Add a dedicated `string` branch in `_write_data_to_column` that extracts the raw, unescaped value via `value.get_string()`. Number/array/object values keep their JSON-text representation through `to_json_string` as before. Add BE unit test `extract_string_from_scalar_root` and regression assertions covering string/substring/escaped/number/array/object extraction. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../function/function_variant_element.cpp | 9 +++++ .../function_variant_element_test.cpp | 38 +++++++++++++++++++ .../data/variant_p0/sql/select_from_value.out | 2 +- .../suites/variant_p0/element_function.groovy | 28 ++++++++++++++ 4 files changed, 76 insertions(+), 1 deletion(-) diff --git a/be/src/exprs/function/function_variant_element.cpp b/be/src/exprs/function/function_variant_element.cpp index 4736342f4eeed3..c2d984885de00a 100644 --- a/be/src/exprs/function/function_variant_element.cpp +++ b/be/src/exprs/function/function_variant_element.cpp @@ -388,6 +388,15 @@ class FunctionVariantElement : public IFunction { } break; } + case simdjson::ondemand::json_type::string: { + // Extract the raw (unescaped) string value rather than its JSON + // representation. 
simdjson::to_json_string would keep the surrounding + // double quotes (e.g. "2026-05-20"), which leaks into the result and + // makes scalar-string variants inconsistent with structured ones. + std::string_view value_str = value.get_string().value(); + column->insert_data(value_str.data(), value_str.length()); + break; + } default: { auto value_str = simdjson::to_json_string(value).value(); column->insert_data(value_str.data(), value_str.length()); diff --git a/be/test/exprs/function/function_variant_element_test.cpp b/be/test/exprs/function/function_variant_element_test.cpp index 1a8d6985167f28..7db931af14b473 100644 --- a/be/test/exprs/function/function_variant_element_test.cpp +++ b/be/test/exprs/function/function_variant_element_test.cpp @@ -61,4 +61,42 @@ TEST(function_variant_element_test, extract_from_sparse_column) { EXPECT_EQ(result_string, "{\"age\":\"John\",\"name\":\"John\"}"); } +// CIR-20498: extracting a string property from a scalar-string-root variant +// (the shape produced by `cast(text as variant)`) must return the raw string, +// not its JSON token with surrounding double quotes. 
+TEST(function_variant_element_test, extract_string_from_scalar_root) { + auto variant_column = ColumnVariant::create(0 /*max_subcolumns_count*/, false); + auto root_column = ColumnString::create(); + std::string doc = R"({"wsn":"SRFSPXFDVY","uploadTimeValue":"2026-05-20 18:40:02","n":49.98})"; + root_column->insert_data(doc.data(), doc.size()); + variant_column->create_root(std::make_shared(), std::move(root_column)); + variant_column->set_num_rows(1); + ASSERT_TRUE(variant_column->is_scalar_variant()); + + DataTypeSerDe::FormatOptions options; + auto tz = cctz::utc_time_zone(); + options.timezone = &tz; + + auto extract = [&](const std::string& key) { + ColumnPtr index_inner = ColumnString::create(); + assert_cast(index_inner->assert_mutable().get()) + ->insert_data(key.data(), key.size()); + ColumnPtr index_column = ColumnConst::create(index_inner, 1); + ColumnPtr result; + auto status = + FunctionVariantElement::get_element_column(*variant_column, index_column, &result); + EXPECT_TRUE(status.ok()); + std::string out; + assert_cast(*result.get()) + .serialize_one_row_to_string(0, &out, options); + return out; + }; + + // string values: no surrounding quotes + EXPECT_EQ(extract("wsn"), "SRFSPXFDVY"); + EXPECT_EQ(extract("uploadTimeValue"), "2026-05-20 18:40:02"); + // non-string scalars keep their JSON representation + EXPECT_EQ(extract("n"), "49.98"); +} + } // namespace doris diff --git a/regression-test/data/variant_p0/sql/select_from_value.out b/regression-test/data/variant_p0/sql/select_from_value.out index ef562a658e91c5..1fe3c49651b05a 100644 --- a/regression-test/data/variant_p0/sql/select_from_value.out +++ b/regression-test/data/variant_p0/sql/select_from_value.out @@ -1,4 +1,4 @@ -- This file is automatically generated. 
You should know what you did if you want to edit this -- !select_from_value -- -"b" +b diff --git a/regression-test/suites/variant_p0/element_function.groovy b/regression-test/suites/variant_p0/element_function.groovy index 7b5e55ea53bdad..fb183db9690c82 100644 --- a/regression-test/suites/variant_p0/element_function.groovy +++ b/regression-test/suites/variant_p0/element_function.groovy @@ -29,4 +29,32 @@ suite("regression_test_variant_element_at", "p0") { sql """insert into element_fn_test values (1, '{"arr1" : [1, 2, 3]}', '{"arr2" : [4, 5, 6]}')""" qt_sql """select array_first((x,y) -> (x - y) < 0, cast(v['arr1'] as array), cast(v1['arr2'] as array)) from element_fn_test""" + + // CIR-20498: extracting a string property from a scalar-string variant + // (e.g. `cast(text as variant)['key']`) must not leak the surrounding JSON + // double quotes. The root of such a variant is a raw JSON string, so the + // extraction goes through the simdjson document path; a string value must be + // returned unescaped, consistently with the structured-subcolumn path. + def scalar = sql """select cast('{"wsn":"SRFSPXFDVY","uploadTimeValue":"2026-05-20 18:40:02"}' as variant)['wsn']""" + assertEquals("SRFSPXFDVY", scalar[0][0]) + + def sub = sql """select substring(cast('{"uploadTimeValue":"2026-05-20 18:40:02"}' as variant)['uploadTimeValue'], 1, 10)""" + assertEquals("2026-05-20", sub[0][0]) + + // values containing escaped characters must be unescaped, not kept as raw JSON tokens + def escaped = sql """select cast('{"k":"a\\\\"b"}' as variant)['k']""" + assertEquals("a\"b", escaped[0][0]) + + // non-string scalars keep their existing JSON representation + def num = sql """select cast('{"n":49.98}' as variant)['n']""" + assertEquals("49.98", num[0][0]) + + // array / object values must keep their JSON text representation (no unquoting): + // only the top-level string scalar is unquoted; quotes nested inside JSON are + // part of the value and must be preserved. 
+ def arr = sql """select cast('{"a":[1,2,3]}' as variant)['a']""" + assertEquals("[1,2,3]", arr[0][0]) + + def obj = sql """select cast('{"o":{"name":"john"}}' as variant)['o']""" + assertEquals('{"name":"john"}', obj[0][0]) } \ No newline at end of file