Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion be/src/core/data_type_serde/data_type_number_serde.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
#include "util/jsonb_writer.h"
#include "util/mysql_global.h"
#include "util/to_string.h"
#include "util/unaligned.h"

namespace doris {
// Type map的基本结构
Expand Down Expand Up @@ -708,7 +709,9 @@ void DataTypeNumberSerDe<T>::write_one_cell_to_jsonb(const IColumn& column,
int64_t val = *reinterpret_cast<const int64_t*>(data_ref.data);
result.writeInt64(val);
} else if constexpr (T == TYPE_LARGEINT) {
__int128_t val = *reinterpret_cast<const __int128_t*>(data_ref.data);
// data_ref.data may not be 16-byte aligned; dereferencing __int128*
// directly is UB and may SIGBUS on alignment-strict platforms.
__int128_t val = unaligned_load<__int128_t>(data_ref.data);
result.writeInt128(val);
} else if constexpr (T == TYPE_FLOAT) {
float val = *reinterpret_cast<const float*>(data_ref.data);
Expand Down
17 changes: 11 additions & 6 deletions be/src/exprs/vexpr.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
#include "storage/index/index_reader.h"
#include "storage/index/inverted/inverted_index_reader.h"
#include "util/date_func.h"
#include "util/unaligned.h"

namespace doris {
class BitmapFilterFuncBase;
Expand Down Expand Up @@ -491,10 +492,11 @@ Status create_texpr_literal_node(const void* data, TExprNode* node, int precisio
(*node).__set_int_literal(intLiteral);
(*node).__set_type(create_type_desc(PrimitiveType::TYPE_BIGINT));
} else if constexpr (T == TYPE_LARGEINT) {
const auto* origin_value = reinterpret_cast<const int128_t*>(data);
// data may not be 16-byte aligned; use unaligned_load to avoid UB.
int128_t origin_value = unaligned_load<int128_t>(data);
(*node).__set_node_type(TExprNodeType::LARGE_INT_LITERAL);
TLargeIntLiteral large_int_literal;
large_int_literal.__set_value(LargeIntValue::to_string(*origin_value));
large_int_literal.__set_value(LargeIntValue::to_string(origin_value));
(*node).__set_large_int_literal(large_int_literal);
(*node).__set_type(create_type_desc(PrimitiveType::TYPE_LARGEINT));
} else if constexpr ((T == TYPE_DATE) || (T == TYPE_DATETIME)) {
Expand Down Expand Up @@ -538,10 +540,12 @@ Status create_texpr_literal_node(const void* data, TExprNode* node, int precisio
(*node).__set_node_type(TExprNodeType::DATE_LITERAL);
(*node).__set_type(create_type_desc(PrimitiveType::TYPE_TIMESTAMPTZ, precision, scale));
} else if constexpr (T == TYPE_DECIMALV2) {
const auto* origin_value = reinterpret_cast<const DecimalV2Value*>(data);
// data may not be 16-byte aligned (DecimalV2Value stores int128_t);
// use unaligned_load to avoid UB.
DecimalV2Value origin_value = unaligned_load<DecimalV2Value>(data);
(*node).__set_node_type(TExprNodeType::DECIMAL_LITERAL);
TDecimalLiteral decimal_literal;
decimal_literal.__set_value(origin_value->to_string());
decimal_literal.__set_value(origin_value.to_string());
(*node).__set_decimal_literal(decimal_literal);
(*node).__set_type(create_type_desc(PrimitiveType::TYPE_DECIMALV2, precision, scale));
} else if constexpr (T == TYPE_DECIMAL32) {
Expand All @@ -559,7 +563,8 @@ Status create_texpr_literal_node(const void* data, TExprNode* node, int precisio
(*node).__set_decimal_literal(decimal_literal);
(*node).__set_type(create_type_desc(PrimitiveType::TYPE_DECIMAL64, precision, scale));
} else if constexpr (T == TYPE_DECIMAL128I) {
const auto* origin_value = reinterpret_cast<const Decimal<int128_t>*>(data);
// data may not be 16-byte aligned; use unaligned_load to avoid UB.
Decimal<int128_t> origin_value = unaligned_load<Decimal<int128_t>>(data);
(*node).__set_node_type(TExprNodeType::DECIMAL_LITERAL);
TDecimalLiteral decimal_literal;
// e.g. For a decimal(26,6) column, the initial value of the _min of the MinMax RF
Expand All @@ -569,7 +574,7 @@ Status create_texpr_literal_node(const void* data, TExprNode* node, int precisio
// error when casting string back to decimal later.
// TODO: this is a temporary solution, the best solution is to produce the
// right min max value at the producer side.
decimal_literal.__set_value(origin_value->to_string(precision, scale));
decimal_literal.__set_value(origin_value.to_string(precision, scale));
(*node).__set_decimal_literal(decimal_literal);
(*node).__set_type(create_type_desc(PrimitiveType::TYPE_DECIMAL128I, precision, scale));
} else if constexpr (T == TYPE_DECIMAL256) {
Expand Down
6 changes: 4 additions & 2 deletions be/src/format/orc/vorc_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@
#include "storage/utils.h"
#include "util/slice.h"
#include "util/timezone_utils.h"
#include "util/unaligned.h"

namespace doris {
class RuntimeState;
Expand Down Expand Up @@ -781,15 +782,16 @@ std::tuple<bool, orc::Literal> convert_to_orc_literal(const orc::Type* type,
case orc::TypeKind::DECIMAL: {
int128_t decimal_value;
if constexpr (primitive_type == TYPE_DECIMALV2) {
decimal_value = *reinterpret_cast<const int128_t*>(value);
// value may not be 16-byte aligned; use unaligned_load to avoid UB.
decimal_value = unaligned_load<int128_t>(value);
precision = DecimalV2Value::PRECISION;
scale = DecimalV2Value::SCALE;
} else if constexpr (primitive_type == TYPE_DECIMAL32) {
decimal_value = *((int32_t*)value);
} else if constexpr (primitive_type == TYPE_DECIMAL64) {
decimal_value = *((int64_t*)value);
} else if constexpr (primitive_type == TYPE_DECIMAL128I) {
decimal_value = *((int128_t*)value);
decimal_value = unaligned_load<int128_t>(value);
} else {
return std::make_tuple(false, orc::Literal(false));
}
Expand Down
7 changes: 5 additions & 2 deletions be/src/tools/meta_tool.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@
#include "storage/tablet/tablet_schema_cache.h"
#include "storage/types.h"
#include "util/coding.h"
#include "util/unaligned.h"

using doris::DataDir;
using doris::StorageEngine;
Expand Down Expand Up @@ -472,7 +473,8 @@ std::string format_column_value(const doris::IColumn& column, size_t row,
// LargeInt is stored as Int128
const StringRef& data = column.get_data_at(row);
if (data.size == sizeof(__int128)) {
__int128 val = *reinterpret_cast<const __int128*>(data.data);
// data.data may not be 16-byte aligned; use unaligned_load to avoid UB.
__int128 val = unaligned_load<__int128>(data.data);
return doris::LargeIntValue::to_string(val);
}
return "<invalid largeint>";
Expand Down Expand Up @@ -556,7 +558,8 @@ std::string format_column_value(const doris::IColumn& column, size_t row,
case FieldType::OLAP_FIELD_TYPE_DECIMAL128I: {
const StringRef& data = column.get_data_at(row);
if (data.size == sizeof(__int128)) {
__int128 val = *reinterpret_cast<const __int128*>(data.data);
// data.data may not be 16-byte aligned; use unaligned_load to avoid UB.
__int128 val = unaligned_load<__int128>(data.data);
return doris::LargeIntValue::to_string(val);
}
return "<invalid decimal>";
Expand Down
107 changes: 107 additions & 0 deletions be/test/exprs/vexpr_unaligned_int128_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// Regression test for unaligned __int128 dereference UB.
//
// Several call sites used to dereference a `__int128*` produced from a
// `StringRef::data` (or similar byte pointer) without any alignment
// guarantee. This file pins the contract that the helpers that build
// literal TExprNodes from a raw `const void* data` pointer must accept
// pointers that are *not* 16-byte aligned: such reads are undefined
// behavior, are reported by UBSan (-fsanitize=alignment), and may
// SIGBUS on alignment-strict platforms (e.g. some aarch64 / SPARC
// builds).

#include <gtest/gtest.h>

#include <cstring>
#include <string>
#include <vector>

#include "core/value/decimalv2_value.h"
#include "core/value/large_int_value.h"
#include "exprs/vexpr.h"

namespace doris {

// Re-initializes `buf` with `bytes + 32` zero bytes and returns a pointer into
// it whose address is congruent to 1 modulo 16, i.e. exactly one byte past a
// 16-byte boundary. The shift applied to the start of the buffer is at most 15
// bytes, so `bytes` bytes starting at the returned pointer stay inside `buf`.
static char* misaligned_slot(std::vector<char>& buf, std::size_t bytes) {
    buf.assign(bytes + 32, 0);
    char* const start = buf.data();
    // Low four address bits of the buffer start; the target is the nearest
    // address at or after `start` whose low four bits equal 0001.
    const std::uintptr_t low_bits = reinterpret_cast<std::uintptr_t>(start) & 0xF;
    // (1 - low_bits) mod 16, written as (17 - low_bits) to stay non-negative.
    const std::size_t shift = static_cast<std::size_t>((17 - low_bits) & 0xF);
    return start + shift;
}

// Builds a LARGEINT literal node from a 128-bit value stored at an address
// that is not a multiple of alignof(__int128), and checks that the node
// carries the same decimal string as LargeIntValue::to_string of the value.
TEST(UnalignedInt128Test, LargeIntLiteralFromUnalignedBuffer) {
    std::vector<char> storage;
    char* const slot = misaligned_slot(storage, sizeof(__int128));
    // Precondition of the test itself: the slot really is misaligned.
    ASSERT_NE(reinterpret_cast<std::uintptr_t>(slot) % alignof(__int128), 0u);

    // Distinct non-zero bit patterns in the upper and lower 64-bit halves, so
    // a read that drops or swaps one half would change the printed value.
    const __int128 upper_half = static_cast<__int128>(0x3FFFFFFFFFFFFFFFLL) << 64;
    const __int128 lower_half = static_cast<__int128>(0xFEEDFACECAFEBEEFULL);
    const __int128 expected = upper_half | lower_half;
    std::memcpy(slot, &expected, sizeof(expected));

    TExprNode literal;
    Status status = create_texpr_literal_node<TYPE_LARGEINT>(slot, &literal);
    ASSERT_TRUE(status.ok()) << status;
    ASSERT_TRUE(literal.__isset.large_int_literal);
    EXPECT_EQ(literal.large_int_literal.value, LargeIntValue::to_string(expected));
}

// Builds a DECIMAL128I literal node (precision 20, scale 4) from a 128-bit
// unscaled value stored at a misaligned address, and checks that the
// formatted literal contains the expected integer digits.
TEST(UnalignedInt128Test, Decimal128ILiteralFromUnalignedBuffer) {
    std::vector<char> storage;
    char* const slot = misaligned_slot(storage, sizeof(__int128));
    // Precondition of the test itself: the slot really is misaligned.
    ASSERT_NE(reinterpret_cast<std::uintptr_t>(slot) % alignof(__int128), 0u);

    // Unscaled value 12345678901234567891 (20 digits); read at scale 4 this
    // is 1234567890123456.7891.
    const __int128 unscaled = static_cast<__int128>(1234567890123456789LL) * 10 + 1;
    std::memcpy(slot, &unscaled, sizeof(unscaled));

    TExprNode literal;
    Status status = create_texpr_literal_node<TYPE_DECIMAL128I>(slot, &literal,
                                                                /*precision=*/20,
                                                                /*scale=*/4);
    ASSERT_TRUE(status.ok()) << status;
    ASSERT_TRUE(literal.__isset.decimal_literal);
    // Deliberately loose: only the integer-part digits are required to appear
    // somewhere in the formatted string.
    const auto& rendered = literal.decimal_literal.value;
    EXPECT_NE(rendered.find("1234567890123456"), std::string::npos) << rendered;
}

// Builds a DECIMALV2 literal node from a DecimalV2Value whose bytes were
// copied to a misaligned address, and checks that the node carries the same
// string as calling to_string() on the original, properly aligned object.
TEST(UnalignedInt128Test, DecimalV2LiteralFromUnalignedBuffer) {
    std::vector<char> storage;
    char* const slot = misaligned_slot(storage, sizeof(DecimalV2Value));
    // Precondition of the test itself: the slot really is misaligned.
    ASSERT_NE(reinterpret_cast<std::uintptr_t>(slot) % alignof(__int128), 0u);

    // 12345.6789 expressed with nine fractional digits: 12345.6789 * 1e9.
    DecimalV2Value source;
    source.set_value(static_cast<__int128>(12345678900000LL));
    std::memcpy(slot, &source, sizeof(source));

    TExprNode literal;
    Status status = create_texpr_literal_node<TYPE_DECIMALV2>(slot, &literal,
                                                              /*precision=*/27,
                                                              /*scale=*/9);
    ASSERT_TRUE(status.ok()) << status;
    ASSERT_TRUE(literal.__isset.decimal_literal);
    EXPECT_EQ(literal.decimal_literal.value, source.to_string());
}

} // namespace doris
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !largeint_select --
1 170141183460469231731687303715884105727
2 -170141183460469231731687303715884105728
3 0
4 1
5 \N

-- !decimal128_select --
1 12345678901234567890.1234567890
2 -12345678901234567890.1234567890
3 0E-10
4 1E-10
5 \N

-- !decimalv2_select --
1 1234567890.123456789
2 -1234567890.123456789
3 0E-9
4 1E-9
5 \N

-- !largeint_groupby --
\N 1
-170141183460469231731687303715884105728 1
0 1
1 1
170141183460469231731687303715884105727 1

-- !decimal128_groupby --
\N 1
-12345678901234567890.1234567890 1
0E-10 1
12345678901234567890.1234567890 1
1E-10 1

-- !largeint_to_json --
1 170141183460469231731687303715884105727
2 -170141183460469231731687303715884105728
3 0
4 1

Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// Regression for the unaligned __int128 dereference fix. Exercises the
// real-runtime paths that previously dereferenced a __int128* through a
// non 16-byte aligned StringRef::data / Slice::data pointer:
//
// * cast LARGEINT column -> JSON: DataTypeNumberSerDe<TYPE_LARGEINT>::
// write_one_cell_to_jsonb
// * LARGEINT / DECIMALV2 / DECIMAL128 round-trip through a table and
// group-by: column-data-to-string paths used by vexpr literal
// construction at runtime (e.g. runtime filter min/max push), and
// by meta_tool.
//
// Under UBSan -fsanitize=alignment or on strict-alignment platforms
// (e.g. aarch64) the previous code could SIGBUS / abort.
//
// Note: FoldConstantExecutor::_get_result is not exercised here because
// it is unreachable under the current Nereids planner with default
// be_exec_version (>= 4); the new BE-side fold path serializes the
// result via DataTypeSerDe::write_column_to_pb which does not contain an
// unaligned __int128 dereference. The fix to _get_result is retained as
// defensive code; the BE unit test (vexpr_unaligned_int128_test.cpp)
// covers the create_texpr_literal_node LARGEINT / DECIMAL128I /
// DECIMALV2 branches directly from a deliberately misaligned buffer.
// Regression suite: creates a small table with LARGEINT, DECIMALV3(38, 10)
// and DECIMALV2(27, 9) columns, inserts boundary values, and runs six tagged
// queries (plain selects, group-bys, and a LARGEINT -> JSON cast).
suite("test_int128_unaligned_access") {
// Disable the SQL cache for this session.
// NOTE(review): presumably so every query below is really executed instead
// of being answered from a cached result — confirm.
sql "set enable_sql_cache=false;"

// Storage path: round-trip largeint / decimal128 / decimalv2 through
// a table to exercise the column-data-to-string code paths used by
// vexpr literal construction and meta_tool at runtime.
sql "drop table if exists test_int128_unaligned"
sql """
CREATE TABLE test_int128_unaligned (
id INT NOT NULL,
v_largeint LARGEINT NULL,
v_decimal128 DECIMALV3(38, 10) NULL,
v_decimalv2 DECIMALV2(27, 9) NULL
) DISTRIBUTED BY HASH(id) BUCKETS 1
PROPERTIES("replication_num" = "1")
"""

// Five rows:
//   1: 2^127 - 1 for the LARGEINT column, plus large positive decimals
//   2: -2^127 for the LARGEINT column, plus the negated decimals
//   3: zero in every value column
//   4: 1 for LARGEINT, and the smallest positive step of each decimal
//      column (1e-10 at scale 10, 1e-9 at scale 9)
//   5: NULL in every value column
sql """
INSERT INTO test_int128_unaligned VALUES
(1, 170141183460469231731687303715884105727,
12345678901234567890.1234567890,
1234567890.123456789),
(2, -170141183460469231731687303715884105728,
-12345678901234567890.1234567890,
-1234567890.123456789),
(3, 0, 0, 0),
(4, 1, 0.0000000001, 0.000000001),
(5, NULL, NULL, NULL)
"""

// Plain per-column selects, one tagged query per 128-bit-backed type.
order_qt_largeint_select "select id, v_largeint from test_int128_unaligned"
order_qt_decimal128_select "select id, v_decimal128 from test_int128_unaligned"
order_qt_decimalv2_select "select id, v_decimalv2 from test_int128_unaligned"

// Aggregation + group by exercises hash-table key serialization where
// largeint values are packed into non-aligned byte buffers.
// Every value is distinct (including NULL), so each group has count 1.
order_qt_largeint_groupby """
select v_largeint, count(*) from test_int128_unaligned group by v_largeint
"""
order_qt_decimal128_groupby """
select v_decimal128, count(*) from test_int128_unaligned group by v_decimal128
"""

// JSON serialization path: DataTypeNumberSerDe<TYPE_LARGEINT>::
// write_one_cell_to_jsonb reads __int128 from StringRef::data.
// The NULL row (id 5) is filtered out by the WHERE clause, so only four
// rows are cast.
order_qt_largeint_to_json """
select id, cast(v_largeint as JSON) from test_int128_unaligned where v_largeint is not null
"""
}
Loading