Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 103 additions & 1 deletion be/src/exprs/function/function_hash.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,107 @@ using FunctionMurmurHash3_64_V2 =
using FunctionMurmurHash3U64V2 =
FunctionVariadicArgumentsBase<DataTypeInt128, MurmurHash3Impl<TYPE_LARGEINT, true>>;

struct MurmurHash3128Impl {
static constexpr auto name = "murmur_hash3_128";

// Fills the LARGEINT result column with `input_rows_count` copies of the packed "empty" hash,
// i.e. pack_hash() applied to `emtpy_value` for both 64-bit lanes.
// `emtpy_value` (spelling as declared elsewhere) is not defined in this struct.
static Status empty_apply(IColumn& icolumn, size_t input_rows_count) {
    const __int128_t empty_hash = pack_hash(emtpy_value, emtpy_value);
    auto& result_column = assert_cast<ColumnVector<TYPE_LARGEINT>&>(icolumn);
    result_column.get_data().assign(input_rows_count, empty_hash);
    return Status::OK();
}

// Hashes the first argument column. Forwards to execute<true>, which appends
// `input_rows_count` rows to `icolumn` and initializes each row's hash from `column`.
static Status first_apply(const IDataType* type, const IColumn* column, size_t input_rows_count,
IColumn& icolumn) {
return execute<true>(type, column, input_rows_count, icolumn);
}

// Hashes a subsequent argument column. Forwards to execute<false>, which updates the hash
// state already stored in each row of `icolumn` with the bytes from `column`.
static Status combine_apply(const IDataType* type, const IColumn* column,
size_t input_rows_count, IColumn& icolumn) {
return execute<false>(type, column, input_rows_count, icolumn);
}

// Hashes one argument column into the per-row 128-bit state stored in `col_to`.
//
// first == true : appends `input_rows_count` default rows to `col_to` and initializes each row
//                 with the hash of the corresponding string (init_hash).
// first == false: continues from the state already stored in each row (update_hash).
//
// `column` must be a ColumnString or a constant string column; anything else yields
// Status::NotSupported. `type` is not inspected here.
//
// init_hash()/update_hash() hand the byte length to the MurmurHash3 primitives as `int`.
// A length that does not fit in `int` would turn negative after that cast and make the
// primitive compute its block/tail pointers outside the buffer, so such values are rejected
// here, the only layer that can report an error.
template <bool first>
static Status execute(const IDataType* type, const IColumn* column, size_t input_rows_count,
                      IColumn& col_to) {
    auto& to_column = assert_cast<ColumnVector<TYPE_LARGEINT>&>(col_to);
    if constexpr (first) {
        // The first argument initializes one 128-bit hash state per row. Later arguments reuse
        // the same result column and update the saved state in place.
        to_column.insert_many_defaults(input_rows_count);
    }
    auto& col_to_data = to_column.get_data();

    // True when `length` survives the size_t -> int -> size_t round trip, i.e. it is
    // representable as a non-negative int. Written as a round trip so no extra header
    // (<limits>/<climits>) is required.
    const auto fits_in_int = [](size_t length) {
        return static_cast<size_t>(static_cast<int>(length)) == length;
    };
    const auto length_error = [](size_t length) {
        return Status::NotSupported(
                "String length {} exceeds the maximum length supported by function {}", length,
                name);
    };

    if (const auto* col_from = check_and_get_column<ColumnString>(column)) {
        const typename ColumnString::Chars& data = col_from->get_chars();
        const typename ColumnString::Offsets& offsets = col_from->get_offsets();
        const size_t size = offsets.size();
        ColumnString::Offset current_offset = 0;
        for (size_t i = 0; i < size; ++i) {
            // Row i occupies the bytes [current_offset, offsets[i]) of the shared buffer.
            const size_t length = offsets[i] - current_offset;
            if (!fits_in_int(length)) {
                return length_error(length);
            }
            const char* bytes = reinterpret_cast<const char*>(&data[current_offset]);
            if constexpr (first) {
                init_hash(col_to_data[i], bytes, length);
            } else {
                update_hash(col_to_data[i], bytes, length);
            }
            current_offset = offsets[i];
        }
    } else if (const ColumnConst* col_from_const =
                       check_and_get_column_const_string_or_fixedstring(column)) {
        auto value = col_from_const->get_value<TYPE_STRING>();
        // The constant is shared by every row, so its length is validated once up front.
        const size_t length = value.size();
        if (!fits_in_int(length)) {
            return length_error(length);
        }
        for (size_t i = 0; i < input_rows_count; ++i) {
            if constexpr (first) {
                init_hash(col_to_data[i], value.data(), length);
            } else {
                update_hash(col_to_data[i], value.data(), length);
            }
        }
    } else {
        DCHECK(false);
        return Status::NotSupported("Illegal column {} of argument of function {}",
                                    column->get_name(), name);
    }
    return Status::OK();
}

private:
// Combines the two 64-bit MurmurHash3 lanes into one LARGEINT-sized value.
// Layout: h1 occupies bits 0..63 and h2 occupies bits 64..127, mirroring the out[0]/out[1]
// order of murmur_hash3_x64_128. The result is reinterpreted as signed, so a set top bit in
// h2 produces a negative value.
static __int128_t pack_hash(uint64_t h1, uint64_t h2) {
    static_assert(sizeof(__int128_t) == sizeof(uint64_t) * 2);
    unsigned __int128 packed = h2;
    packed <<= 64;
    packed |= h1;
    return static_cast<__int128_t>(packed);
}

// Inverse of pack_hash(): splits a packed 128-bit value into its two 64-bit lanes.
// `h1` receives bits 0..63 and `h2` receives bits 64..127 of `value`.
static void unpack_hash(__int128_t value, uint64_t& h1, uint64_t& h2) {
    static_assert(sizeof(__int128_t) == sizeof(uint64_t) * 2);
    const auto bits = static_cast<unsigned __int128>(value);
    constexpr unsigned __int128 low_lane_mask = ~uint64_t {0};
    h2 = static_cast<uint64_t>(bits >> 64);
    h1 = static_cast<uint64_t>(bits & low_lane_mask);
}

// Computes the 128-bit MurmurHash3 of the `size` bytes at `data` with seed 0 and overwrites
// `value` with the two output words packed by pack_hash() (hash[0] low, hash[1] high).
// NOTE(review): `size` is narrowed to int before the call below; a length that does not fit
// in int would not survive the cast — confirm callers bound the length.
static void init_hash(__int128_t& value, const void* data, size_t size) {
uint64_t hash[2] = {0, 0};
// The first SQL argument starts from seed 0, so it can use the existing 128-bit primitive
// directly. Later arguments must use update_hash() to continue from the saved (h1, h2).
murmur_hash3_x64_128(data, static_cast<int>(size), 0, hash);
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This narrows a SQL string length from size_t to int before entering murmur_hash3_x64_128 / murmur_hash3_x64_process. Doris string offsets are not limited to INT_MAX (StringLengthType is uint32_t), so a large STRING value can wrap to a negative len; murmur_hash3_x64_process then computes a negative nblocks and derives tail = data + nblocks * 16, which can point before the buffer and read out of bounds. Please either make the 128-bit Murmur primitive accept a non-narrowing length type (as other hash wrappers do at their public boundary) or reject/guard values larger than INT_MAX before calling it.

value = pack_hash(hash[0], hash[1]);
}

// Folds one more argument into an existing hash state.
// The state saved in `value` is split back into its (h1, h2) lanes, fed together with the
// `size` bytes at `data` through murmur_hash3_x64_process, and the resulting lanes are packed
// back into `value`. This is how variadic calls chain arguments: each argument starts from
// the state left by the previous one.
static void update_hash(__int128_t& value, const void* data, size_t size) {
    uint64_t low_lane = 0;
    uint64_t high_lane = 0;
    unpack_hash(value, low_lane, high_lane);

    const int length = static_cast<int>(size);
    murmur_hash3_x64_process(data, length, low_lane, high_lane);

    value = pack_hash(low_lane, high_lane);
}
};

using FunctionMurmurHash3_128 = FunctionVariadicArgumentsBase<DataTypeInt128, MurmurHash3128Impl>;

#ifdef BE_TEST
const char* murmur_hash3_get_name_type_int_for_test() {
return MurmurHash3Impl<TYPE_INT>::get_name();
Expand Down Expand Up @@ -234,8 +335,9 @@ void register_function_hash(SimpleFunctionFactory& factory) {
factory.register_function<FunctionMurmurHash3_64>();
factory.register_function<FunctionMurmurHash3_64_V2>();
factory.register_function<FunctionMurmurHash3U64V2>();
factory.register_function<FunctionMurmurHash3_128>();
factory.register_function<FunctionXxHash_32>();
factory.register_function<FunctionXxHash_64>();
factory.register_alias("xxhash_64", "xxhash3_64");
}
} // namespace doris
} // namespace doris
58 changes: 58 additions & 0 deletions be/test/exprs/function/function_hash_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,26 @@
namespace doris {
using namespace ut_type;

namespace {

// Test-side packing of two 64-bit hash lanes into one 128-bit value:
// h1 goes into the low 64 bits, h2 into the high 64 bits, and the result is returned as signed.
__int128_t pack_murmur_hash3_128_for_test(uint64_t h1, uint64_t h2) {
    static_assert(sizeof(__int128_t) == sizeof(uint64_t) * 2);
    unsigned __int128 packed = h2;
    packed <<= 64;
    packed |= h1;
    return static_cast<__int128_t>(packed);
}

// Builds the expected multi-argument hash for the tests: starting from lanes (0, 0), every
// string in `values` is fed in order through murmur_hash3_x64_process, and the final lanes
// are packed with pack_murmur_hash3_128_for_test().
__int128_t murmur_hash3_128_for_test(const std::vector<std::string>& values) {
    uint64_t low_lane = 0;
    uint64_t high_lane = 0;
    for (size_t idx = 0; idx < values.size(); ++idx) {
        const std::string& current = values[idx];
        murmur_hash3_x64_process(current.data(), static_cast<int>(current.size()), low_lane,
                                 high_lane);
    }
    return pack_murmur_hash3_128_for_test(low_lane, high_lane);
}

} // namespace

TEST(HashFunctionTest, murmur_hash_3_test) {
std::string func_name = "murmur_hash3_32";

Expand Down Expand Up @@ -114,6 +134,44 @@ TEST(HashFunctionTest, murmur_hash_3_64_v2_test) {
};
}

// Exercises the SQL function "murmur_hash3_128" with one, two and three VARCHAR arguments.
// In every data set a NULL argument is expected to produce a NULL result.
TEST(HashFunctionTest, murmur_hash_3_128_test) {
std::string func_name = "murmur_hash3_128";

// One argument: the expected value for "hello world" is a fixed (h1, h2) pair packed into
// a single 128-bit value.
{
InputTypeSet input_types = {PrimitiveType::TYPE_VARCHAR};

DataSet data_set = {
{{Null()}, Null()},
{{std::string("hello world")},
pack_murmur_hash3_128_for_test(5998619086395760910ULL, 12364428806279881649ULL)}};

static_cast<void>(check_function<DataTypeInt128, true>(func_name, input_types, data_set));
check_function_all_arg_comb<DataTypeInt128, true>(func_name, input_types, data_set);
};

// Two arguments: the expected value is built by the chained reference helper
// murmur_hash3_128_for_test().
{
InputTypeSet input_types = {PrimitiveType::TYPE_VARCHAR, PrimitiveType::TYPE_VARCHAR};

DataSet data_set = {{{std::string("hello"), std::string("world")},
murmur_hash3_128_for_test({"hello", "world"})},
{{std::string("hello"), Null()}, Null()}};

static_cast<void>(check_function<DataTypeInt128, true>(func_name, input_types, data_set));
check_function_all_arg_comb<DataTypeInt128, true>(func_name, input_types, data_set);
};

// Three arguments: only check_function_all_arg_comb is run for this data set.
{
InputTypeSet input_types = {PrimitiveType::TYPE_VARCHAR, PrimitiveType::TYPE_VARCHAR,
PrimitiveType::TYPE_VARCHAR};

DataSet data_set = {{{std::string("hello"), std::string("world"), std::string("!")},
murmur_hash3_128_for_test({"hello", "world", "!"})},
{{std::string("hello"), std::string("world"), Null()}, Null()}};

check_function_all_arg_comb<DataTypeInt128, true>(func_name, input_types, data_set);
};
}

TEST(HashFunctionTest, murmur_hash_get_name_test) {
EXPECT_STREQ(murmur_hash3_get_name_type_int_for_test(), "murmur_hash3_32");
EXPECT_STREQ(murmur_hash3_get_name_type_bigint_for_test(), "murmur_hash3_64");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,7 @@
import org.apache.doris.nereids.trees.expressions.functions.scalar.MultiMatch;
import org.apache.doris.nereids.trees.expressions.functions.scalar.MultiMatchAny;
import org.apache.doris.nereids.trees.expressions.functions.scalar.MultiSearchAllPositions;
import org.apache.doris.nereids.trees.expressions.functions.scalar.MurmurHash3128;
import org.apache.doris.nereids.trees.expressions.functions.scalar.MurmurHash332;
import org.apache.doris.nereids.trees.expressions.functions.scalar.MurmurHash364;
import org.apache.doris.nereids.trees.expressions.functions.scalar.MurmurHash364V2;
Expand Down Expand Up @@ -946,6 +947,7 @@ public class BuiltinScalarFunctions implements FunctionHelper {
scalar(MultiMatch.class, "multi_match"),
scalar(MultiMatchAny.class, "multi_match_any"),
scalar(MultiSearchAllPositions.class, "multi_search_all_positions"),
scalar(MurmurHash3128.class, "murmur_hash3_128"),
scalar(MurmurHash332.class, "murmur_hash3_32"),
scalar(MurmurHash364.class, "murmur_hash3_64"),
scalar(MurmurHash364V2.class, "murmur_hash3_64_v2"),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

package org.apache.doris.nereids.trees.expressions.functions.scalar;

import org.apache.doris.catalog.FunctionSignature;
import org.apache.doris.nereids.trees.expressions.Expression;
import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable;
import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
import org.apache.doris.nereids.types.LargeIntType;
import org.apache.doris.nereids.types.StringType;
import org.apache.doris.nereids.types.VarcharType;
import org.apache.doris.nereids.util.ExpressionUtils;
import org.apache.doris.nereids.util.Utils;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;

import java.util.List;

/**
* ScalarFunction 'murmur_hash3_128'.
*/
/**
* ScalarFunction 'murmur_hash3_128'.
*
* <p>Variadic over VARCHAR or STRING arguments and typed as LARGEINT (see {@link #SIGNATURES}).
* Implements {@code PropagateNullable}; presumably the result is nullable whenever an argument
* is nullable (confirm against that interface).
*/
public class MurmurHash3128 extends ScalarFunction
implements ExplicitlyCastableSignature, PropagateNullable {

// Two variadic signatures, both returning LARGEINT: one over VARCHAR, one over STRING.
public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
FunctionSignature.ret(LargeIntType.INSTANCE).varArgs(VarcharType.SYSTEM_DEFAULT),
FunctionSignature.ret(LargeIntType.INSTANCE).varArgs(StringType.INSTANCE)
);

/**
* constructor with 1 or more arguments.
*
* @param arg the first argument
* @param varArgs any further arguments, merged after {@code arg}
*/
public MurmurHash3128(Expression arg, Expression... varArgs) {
this(ExpressionUtils.mergeArguments(arg, varArgs));
}

/** constructor with list arguments; the list is copied into an immutable list. */
public MurmurHash3128(List<Expression> args) {
super("murmur_hash3_128", Utils.fastToImmutableList(args));
}

/** constructor for withChildren and reuse signature */
private MurmurHash3128(ScalarFunctionParams functionParams) {
super(functionParams);
}

/**
* withChildren.
*
* @param children the replacement arguments; must not be empty
* @return a new {@code MurmurHash3128} built from {@code getFunctionParams(children)}
*/
@Override
public MurmurHash3128 withChildren(List<Expression> children) {
Preconditions.checkArgument(!children.isEmpty());
return new MurmurHash3128(getFunctionParams(children));
}

/** Returns {@link #SIGNATURES}. */
@Override
public List<FunctionSignature> getSignatures() {
return SIGNATURES;
}

/** Dispatches to {@code visitor.visitMurmurHash3128}. */
@Override
public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
return visitor.visitMurmurHash3128(this, context);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,7 @@
import org.apache.doris.nereids.trees.expressions.functions.scalar.MultiMatch;
import org.apache.doris.nereids.trees.expressions.functions.scalar.MultiMatchAny;
import org.apache.doris.nereids.trees.expressions.functions.scalar.MultiSearchAllPositions;
import org.apache.doris.nereids.trees.expressions.functions.scalar.MurmurHash3128;
import org.apache.doris.nereids.trees.expressions.functions.scalar.MurmurHash332;
import org.apache.doris.nereids.trees.expressions.functions.scalar.MurmurHash364;
import org.apache.doris.nereids.trees.expressions.functions.scalar.MurmurHash364V2;
Expand Down Expand Up @@ -1999,6 +2000,10 @@ default R visitMultiSearchAllPositions(MultiSearchAllPositions function, C conte
return visitScalarFunction(function, context);
}

/** Visits a {@code MurmurHash3128} call; the default delegates to {@code visitScalarFunction}. */
default R visitMurmurHash3128(MurmurHash3128 murmurHash3128, C context) {
return visitScalarFunction(murmurHash3128, context);
}

default R visitMurmurHash332(MurmurHash332 murmurHash332, C context) {
return visitScalarFunction(murmurHash332, context);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,60 @@ suite("test_hash_function", "arrow_flight_sql") {

qt_mmh3_64_v2_table "SELECT id, MURMUR_HASH3_64_V2(str_col) FROM test_hash_tbl ORDER BY id;"
qt_mmh3_u64_v2_table "SELECT id, MURMUR_HASH3_U64_V2(str_col) FROM test_hash_tbl ORDER BY id;"
def mmh3_128_table = sql "SELECT id, MURMUR_HASH3_128(str_col) FROM test_hash_tbl ORDER BY id;"
assertEquals([
[1, "160552765667853844864347215091851402511"],
[2, "-112198913113891391029130930996035755762"],
[3, null],
[4, "0"],
[5, "125233622341202073067337261280912046255"],
[6, "15723305950287370021067100420381546638"],
[7, "76022033372587150664028094316560832338"],
[8, "8282804273666544992604160676428939260"],
[9, "54500626245739954189896806014374040748"]
], mmh3_128_table.collect { [it[0] as int, it[1] == null ? null : it[1].toString()] });

def mmh3_128_multi_arg_table = sql """
SELECT id,
MURMUR_HASH3_128(str_col, 'world'),
MURMUR_HASH3_128('hello', str_col),
MURMUR_HASH3_128(str_col, str_col)
FROM test_hash_tbl
ORDER BY id;
"""
assertEquals([
[1, "1446959605745449161743580155912574278",
"-56339891497867408245721289945506420485",
"-168178611131198900113957651047418886169"],
[2, "-149549192126671924717567585844879967555",
"-60587747024077554617701559639572348746",
"65424866033221268138215169303830582651"],
[3, null, null, null],
[4, "-78565033930154308766756204499853146902",
"84714210717646297788662261898201230080", "0"],
[5, "-39544922153624419472698581057092341686",
"164681627131843042222834911849678645237",
"-461669328540671960194073097226353499"],
[6, "3855286205383178813738041956665736806",
"114705208091273276245241207548307136670",
"126143460685998379802738880954496866607"],
[7, "-66251693712752822782446614605307054187",
"145660169740749413061118106551153383205",
"167010716176867357320321384372081403147"],
[8, "-53522276451386554290637598501892217568",
"-49434810126805792985863702972638694950",
"-162781926366258959505488386157422138285"],
[9, "46144780234418243372325741019404497451",
"109924268220979943366442352659211725694",
"-68242619748682641450662548496031489232"]
], mmh3_128_multi_arg_table.collect {
[
it[0] as int,
it[1] == null ? null : it[1].toString(),
it[2] == null ? null : it[2].toString(),
it[3] == null ? null : it[3].toString()
]
});

sql "DROP TABLE IF EXISTS test_hash_tbl;"

Expand All @@ -100,6 +154,22 @@ suite("test_hash_function", "arrow_flight_sql") {
qt_mmh3_u64_v2_fold_1 "SELECT MURMUR_HASH3_U64_V2('test') + 1;"
qt_mmh3_u64_v2_fold_2 "SELECT MURMUR_HASH3_U64_V2('a', 'b') * 2;"

// Evaluates MURMUR_HASH3_128 over the SQL argument list in `expression` and compares the
// first column of the first row, rendered as a string (or null for SQL NULL), to `expected`.
def validate_mmh3_128 = { String expected, String expression ->
    def rows = sql "SELECT MURMUR_HASH3_128(${expression});"
    def actual = rows[0][0]?.toString()
    assertEquals(expected, actual)
}

validate_mmh3_128(null, "NULL");
validate_mmh3_128("0", "''");
validate_mmh3_128("121118445609844952839898260755277781762", "'hello'");
validate_mmh3_128("-112198913113891391029130930996035755762", "'hello world'");
validate_mmh3_128("125233622341202073067337261280912046255", "'apache doris'");
validate_mmh3_128("-17367660094379006912106945534038101931", "'hello', 'world'");
validate_mmh3_128("9994430460069927257443176797242139063", "'hello', 'world', '!'");
validate_mmh3_128(null, "'hello', NULL");
validate_mmh3_128("8282804273666544992604160676428939260", "'你好🤣'");
validate_mmh3_128("54500626245739954189896806014374040748", "'アパッチドリス'");

qt_sql "SELECT xxhash_32(null);"
qt_sql "SELECT xxhash_32(\"hello\");"
qt_sql "SELECT xxhash_32(\"hello\", \"world\");"
Expand Down
Loading