Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion be/src/core/column/column_string.h
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,10 @@ class ColumnStr final : public COWHelper<IColumn, ColumnStr<T>> {
running_offset += lengths[i];
offsets_ptr[i] = static_cast<T>(running_offset);
}
chars.resize(offsets[old_rows + num - 1]);
// OFFSET_ONLY columns carry valid offsets but no real string payload. Use non-zero
// placeholders so char-padding shrink logic cannot recompute these offsets as zero-length
// strings when this column is nested under a struct that also contains CHAR fields.
chars.resize_fill(offsets[old_rows + num - 1], 1);
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With this change, every OFFSET_ONLY string read now writes one synthetic byte for every logical byte in the column. The previous `resize()` only advanced the size of `chars` after allocating, whereas `resize_fill(..., 1)` touches the whole appended range. For a query such as `select length(big_string_col) ...` with nested pruning enabled, the BE still only needs offsets, but this now performs O(total string bytes) memory writes per block and can dominate the scan for large values, even though the CHAR/struct shrink issue only applies to the later `shrink_padding_chars()` path. Please keep the general OFFSET_ONLY path sparse and fix the shrink path more narrowly, e.g. by preventing shrink from recomputing offsets for offset-only string children, or by only materializing placeholders when that specific shrink path is actually required.

}

void insert_many_strings(const StringRef* strings, size_t num) override {
Expand Down
6 changes: 3 additions & 3 deletions be/src/exprs/function/function_struct_element.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,9 +101,9 @@ class FunctionStructElement : public IFunction {
size_t index;
RETURN_IF_ERROR(get_element_index(*struct_type, index_column, index_type, &index));
ColumnPtr res_column = struct_col->get_column_ptr(index);
ColumnPtr ele_column = res_column->clone_resized(res_column->size());
//This function must return a ColumnNullable column, so it is necessary to convert the result column into ColumnNullable.
block.replace_by_position(result, make_nullable(ele_column));
// This function must return a ColumnNullable column, so it is necessary to convert the
// result column into ColumnNullable.
block.replace_by_position(result, make_nullable(res_column));
return Status::OK();
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,22 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !struct_offset_group_min_by --
\N v6
1 v0
2 v1
3 v2
4 v3
5 v4
6 v5

-- !struct_offset_group_count --
\N 1
1 1
2 2
3 2
4 2
5 1
6 1

-- !array_full_access_strips_offset --
1 3 [1, 2, 3]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,53 @@ suite("string_length_column_pruning") {
notContains "type=bigint"
}
sql "select length(struct_element(struct_col, 'f3')) from slcp_str_tbl"

// Case: grouping by length() of a STRING field nested in a STRUCT that also contains a
// CHAR field. The table is recreated on every run so the case does not depend on leftovers.
sql """ DROP TABLE IF EXISTS slcp_struct_offset_group_tbl """
sql """
CREATE TABLE slcp_struct_offset_group_tbl (
id INT,
val STRING,
s STRUCT<c: CHAR(10), b: STRING>
) ENGINE = OLAP
DUPLICATE KEY(id)
DISTRIBUTED BY HASH(id) BUCKETS 3
PROPERTIES ("replication_allocation" = "tag.location.default: 1")
"""
// Ten rows: s.b has lengths 1..6, with lengths 2, 3 and 4 each appearing twice
// (ids 1/7, 2/9 and 3/8), plus one row (id 6) where s.b is NULL.
sql """
INSERT INTO slcp_struct_offset_group_tbl VALUES
(0, 'v0', named_struct('c', 'x', 'b', 'a')),
(1, 'v1', named_struct('c', 'x', 'b', 'bb')),
(2, 'v2', named_struct('c', 'x', 'b', 'ccc')),
(3, 'v3', named_struct('c', 'x', 'b', 'dddd')),
(4, 'v4', named_struct('c', 'x', 'b', 'eeeee')),
(5, 'v5', named_struct('c', 'x', 'b', 'ffffff')),
(6, 'v6', named_struct('c', 'x', 'b', NULL)),
(7, 'v7', named_struct('c', 'x', 'b', 'gg')),
(8, 'v8', named_struct('c', 'x', 'b', 'hhhh')),
(9, 'v9', named_struct('c', 'x', 'b', 'iii'))
"""
// The plan must report nested-column pruning and an offset-only access path for s.b.
explain {
sql """
select length(struct_element(s, 'b')), min_by(val, id)
from slcp_struct_offset_group_tbl
group by 1
"""
contains "nested columns"
contains "s.b.OFFSET"
}
// Result check: per-length group, the val belonging to the smallest id in that group.
order_qt_struct_offset_group_min_by """
select length(struct_element(s, 'b')), min_by(val, id)
from slcp_struct_offset_group_tbl
group by 1
order by 1, 2
"""
// Result check: row count per length group (the NULL row forms its own group).
order_qt_struct_offset_group_count """
select length(struct_element(s, 'b')), count(*)
from slcp_struct_offset_group_tbl
group by 1
order by 1
"""

// length() in both SELECT and WHERE: predicate must remain length(str_col) > 1,
// never be rewritten to CAST(str_col AS int) > 1. Slot type must stay varchar.
explain {
Expand Down
Loading