# Adds HashFunctions::murmur_hash3_64: each argument is hashed with murmur_hash3_x64_64,
# the first call seeded with 0 and every later call seeded with the previous hash.
# A NULL argument makes the whole result NULL.
# NOTE(review): with num_children == 0 the loop body never runs and the function returns 0.
diff --git a/be/src/exprs/hash_functions.cpp b/be/src/exprs/hash_functions.cpp
index 0407e62381b6db..9b9689f5f0cbb3 100644
--- a/be/src/exprs/hash_functions.cpp
+++ b/be/src/exprs/hash_functions.cpp
@@ -40,4 +40,18 @@ IntVal HashFunctions::murmur_hash3_32(FunctionContext* ctx, int num_children,
     return seed;
 }
 
+BigIntVal HashFunctions::murmur_hash3_64(FunctionContext* ctx, int num_children,
+                                         const StringVal* inputs) {
+    uint64_t seed = 0;
+    uint64_t hash = 0;
+    for (int i = 0; i < num_children; ++i) {
+        if (inputs[i].is_null) {
+            return BigIntVal::null();
+        }
+        murmur_hash3_x64_64(inputs[i].ptr, inputs[i].len, seed, &hash);
+        seed = hash;
+    }
+    return hash;
+}
+
 } // namespace doris
# Forward-declares doris_udf::BigIntVal and declares murmur_hash3_64 next to murmur_hash3_32.
diff --git a/be/src/exprs/hash_functions.h b/be/src/exprs/hash_functions.h
index 9fcfb9a7aa7363..288dfbc7fd366a 100644
--- a/be/src/exprs/hash_functions.h
+++ b/be/src/exprs/hash_functions.h
@@ -20,6 +20,7 @@
 namespace doris_udf {
 class FunctionContext;
 struct IntVal;
+struct BigIntVal;
 struct StringVal;
 } // namespace doris_udf
 
@@ -30,6 +31,8 @@ class HashFunctions {
     static void init();
     static doris_udf::IntVal murmur_hash3_32(doris_udf::FunctionContext* ctx, int num_children,
                                              const doris_udf::StringVal* inputs);
+    static doris_udf::BigIntVal murmur_hash3_64(doris_udf::FunctionContext* ctx, int num_children,
+                                                const doris_udf::StringVal* inputs);
 };
 
 } // namespace doris
# Removes the private rotl32/fmix32 helpers and the hand-written 32-bit body;
# HashUtil::murmur_hash3_32 now forwards to murmur_hash3_x86_32 and returns its output word.
# NOTE(review): this hunk adds no #include; presumably the header declaring
# murmur_hash3_x86_32 is already included by hash_util.hpp - confirm.
diff --git a/be/src/util/hash_util.hpp b/be/src/util/hash_util.hpp
index d03f466aff3194..f8e1076d585a5f 100644
--- a/be/src/util/hash_util.hpp
+++ b/be/src/util/hash_util.hpp
@@ -119,60 +119,11 @@ class HashUtil {
     // refer to https://github.com/apache/commons-codec/blob/master/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
     static const uint32_t MURMUR3_32_SEED = 104729;
 
-    ALWAYS_INLINE static uint32_t rotl32(uint32_t x, int8_t r) {
-        return (x << r) | (x >> (32 - r));
-    }
-
-    ALWAYS_INLINE static uint32_t fmix32(uint32_t h) {
-        h ^= h >> 16;
-        h *= 0x85ebca6b;
-        h ^= h >> 13;
-        h *= 0xc2b2ae35;
-        h ^= h >> 16;
-        return h;
-    }
-
     // modify from https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
     static uint32_t murmur_hash3_32(const void* key, int32_t len, uint32_t seed) {
-        const uint8_t* data = (const uint8_t*)key;
-        const int nblocks = len / 4;
-
-        uint32_t h1 = seed;
-
-        const uint32_t c1 = 0xcc9e2d51;
-        const uint32_t c2 = 0x1b873593;
-        const uint32_t* blocks = (const uint32_t*)(data + nblocks * 4);
-
-        for (int i = -nblocks; i; i++) {
-            uint32_t k1 = blocks[i];
-
-            k1 *= c1;
-            k1 = rotl32(k1, 15);
-            k1 *= c2;
-
-            h1 ^= k1;
-            h1 = rotl32(h1, 13);
-            h1 = h1 * 5 + 0xe6546b64;
-        }
-
-        const uint8_t* tail = (const uint8_t*)(data + nblocks * 4);
-        uint32_t k1 = 0;
-        switch (len & 3) {
-        case 3:
-            k1 ^= tail[2] << 16;
-        case 2:
-            k1 ^= tail[1] << 8;
-        case 1:
-            k1 ^= tail[0];
-            k1 *= c1;
-            k1 = rotl32(k1, 15);
-            k1 *= c2;
-            h1 ^= k1;
-        };
-
-        h1 ^= len;
-        h1 = fmix32(h1);
-        return h1;
+        uint32_t out = 0;
+        murmur_hash3_x86_32(key, len, seed, &out);
+        return out;
     }
 
     static const int MURMUR_R = 47;
# rotl32/rotl64 switch from plain `inline` to FORCE_INLINE, which the context line defines
# as `inline __attribute__((always_inline))`.
diff --git a/be/src/util/murmur_hash3.cpp b/be/src/util/murmur_hash3.cpp
index d2fadc5e17a4e4..5181558a81ef19 100644
--- a/be/src/util/murmur_hash3.cpp
+++ b/be/src/util/murmur_hash3.cpp
@@ -31,11 +31,11 @@
 
 #define FORCE_INLINE inline __attribute__((always_inline))
 
-inline uint32_t rotl32(uint32_t x, int8_t r) {
+FORCE_INLINE uint32_t rotl32(uint32_t x, int8_t r) {
     return (x << r) | (x >> (32 - r));
 }
 
-inline uint64_t rotl64(uint64_t x, int8_t r) {
+FORCE_INLINE uint64_t rotl64(uint64_t x, int8_t r) {
     return (x << r) | (x >> (64 - r));
 }
 
# Replaces struct MurmurHash3Impl32 with a template MurmurHash3Impl whose `name` comes from
# MurmurHash3ImplName specialisations ("murmur_hash3_32" / "murmur_hash3_64").
# In execute(), the first argument is hashed and appended with insert_data: the 32-bit path
# seeds with HashUtil::MURMUR3_32_SEED, the 64-bit path seeds murmur_hash3_x64_64 with 0.
# Later arguments overwrite row i through col_to_data, seeded with the existing col_to[i].
# Also adds the FunctionMurmurHash3_64 alias, one more register_function call, and the
# missing newline at end of file.
# NOTE(review): template argument lists (`<...>`) appear to have been stripped from this text
# during extraction (bare `template`, `assert_cast&>`, `std::is_same_v)`); tokens are left
# exactly as found - restore them from the real commit before applying.
diff --git a/be/src/vec/functions/function_hash.cpp b/be/src/vec/functions/function_hash.cpp
index 92e2a558273e87..7e9ccc94926f99 100644
--- a/be/src/vec/functions/function_hash.cpp
+++ b/be/src/vec/functions/function_hash.cpp
@@ -155,9 +155,22 @@ struct MurmurHash2Impl64 {
 };
 using FunctionMurmurHash2_64 = FunctionVariadicArgumentsBase;
 
-struct MurmurHash3Impl32 {
+template
+struct MurmurHash3ImplName {};
+
+template <>
+struct MurmurHash3ImplName {
     static constexpr auto name = "murmur_hash3_32";
-    using ReturnType = Int32;
+};
+
+template <>
+struct MurmurHash3ImplName {
+    static constexpr auto name = "murmur_hash3_64";
+};
+
+template
+struct MurmurHash3Impl {
+    static constexpr auto name = MurmurHash3ImplName::name;
 
     static Status empty_apply(IColumn& icolumn, size_t input_rows_count) {
         ColumnVector& vec_to = assert_cast&>(icolumn);
@@ -178,6 +191,7 @@ struct MurmurHash3Impl32 {
     template
     static Status execute(const IDataType* type, const IColumn* column, size_t input_rows_count,
                           IColumn& col_to) {
+        auto* col_to_data = assert_cast&>(col_to).get_data().data();
         if (const ColumnString* col_from = check_and_get_column(column)) {
             const typename ColumnString::Chars& data = col_from->get_chars();
             const typename ColumnString::Offsets& offsets = col_from->get_offsets();
@@ -186,16 +200,29 @@ struct MurmurHash3Impl32 {
             ColumnString::Offset current_offset = 0;
             for (size_t i = 0; i < size; ++i) {
                 if (first) {
-                    UInt32 val = HashUtil::murmur_hash3_32(
-                            reinterpret_cast(&data[current_offset]),
-                            offsets[i] - current_offset - 1, HashUtil::MURMUR3_32_SEED);
-                    col_to.insert_data(const_cast(reinterpret_cast(&val)), 0);
+                    if constexpr (std::is_same_v) {
+                        UInt32 val = HashUtil::murmur_hash3_32(
+                                reinterpret_cast(&data[current_offset]),
+                                offsets[i] - current_offset - 1, HashUtil::MURMUR3_32_SEED);
+                        col_to.insert_data(const_cast(reinterpret_cast(&val)),
+                                           0);
+                    } else {
+                        UInt64 val = 0;
+                        murmur_hash3_x64_64(reinterpret_cast(&data[current_offset]),
+                                            offsets[i] - current_offset - 1, 0, &val);
+                        col_to.insert_data(const_cast(reinterpret_cast(&val)),
+                                           0);
+                    }
                 } else {
-                    assert_cast&>(col_to).get_data()[i] =
-                            HashUtil::murmur_hash3_32(
-                                    reinterpret_cast(&data[current_offset]),
-                                    offsets[i] - current_offset - 1,
-                                    ext::bit_cast(col_to[i]));
+                    if constexpr (std::is_same_v) {
+                        col_to_data[i] = HashUtil::murmur_hash3_32(
+                                reinterpret_cast(&data[current_offset]),
+                                offsets[i] - current_offset - 1, ext::bit_cast(col_to[i]));
+                    } else {
+                        murmur_hash3_x64_64(reinterpret_cast(&data[current_offset]),
+                                            offsets[i] - current_offset - 1,
+                                            ext::bit_cast(col_to[i]), col_to_data + i);
+                    }
                 }
                 current_offset = offsets[i];
             }
@@ -204,13 +231,25 @@ struct MurmurHash3Impl32 {
             String value = col_from_const->get_value().data();
             for (size_t i = 0; i < input_rows_count; ++i) {
                 if (first) {
-                    UInt32 val = HashUtil::murmur_hash3_32(value.data(), value.size(),
-                                                           HashUtil::MURMUR3_32_SEED);
-                    col_to.insert_data(const_cast(reinterpret_cast(&val)), 0);
+                    if constexpr (std::is_same_v) {
+                        UInt32 val = HashUtil::murmur_hash3_32(value.data(), value.size(),
+                                                               HashUtil::MURMUR3_32_SEED);
+                        col_to.insert_data(const_cast(reinterpret_cast(&val)),
+                                           0);
+                    } else {
+                        UInt64 val = 0;
+                        murmur_hash3_x64_64(value.data(), value.size(), 0, &val);
+                        col_to.insert_data(const_cast(reinterpret_cast(&val)),
+                                           0);
+                    }
                 } else {
-                    assert_cast&>(col_to).get_data()[i] =
-                            HashUtil::murmur_hash3_32(value.data(), value.size(),
-                                                      ext::bit_cast(col_to[i]));
+                    if constexpr (std::is_same_v) {
+                        col_to_data[i] = HashUtil::murmur_hash3_32(
+                                value.data(), value.size(), ext::bit_cast(col_to[i]));
+                    } else {
+                        murmur_hash3_x64_64(value.data(), value.size(),
+                                            ext::bit_cast(col_to[i]), col_to_data + i);
+                    }
                 }
             }
         } else {
@@ -221,10 +260,12 @@ struct MurmurHash3Impl32 {
         return Status::OK();
     }
 };
-using FunctionMurmurHash3_32 = FunctionVariadicArgumentsBase;
+using FunctionMurmurHash3_32 = FunctionVariadicArgumentsBase>;
+using FunctionMurmurHash3_64 = FunctionVariadicArgumentsBase>;
 
 void register_function_function_hash(SimpleFunctionFactory& factory) {
     factory.register_function();
     factory.register_function();
+    factory.register_function();
 }
-} // namespace doris::vectorized
\ No newline at end of file
+} // namespace doris::vectorized
# Adds exprs/hash_function_test.cpp to EXPRS_TEST_FILES.
diff --git a/be/test/CMakeLists.txt b/be/test/CMakeLists.txt
index 93966d2a2083ee..437dd4e3549555 100644
--- a/be/test/CMakeLists.txt
+++ b/be/test/CMakeLists.txt
@@ -113,6 +113,7 @@ set(EXPRS_TEST_FILES
     exprs/bloom_filter_predicate_test.cpp
     exprs/array_functions_test.cpp
     exprs/window_funnel_test.cpp
+    exprs/hash_function_test.cpp
 )
 set(GEO_TEST_FILES
     geo/wkt_parse_test.cpp
# New unit-test file for HashFunctions::murmur_hash3_64; the hunk opens with the ASF license header.
diff --git a/be/test/exprs/hash_function_test.cpp b/be/test/exprs/hash_function_test.cpp
new file mode 100644
index 00000000000000..5fcccbb6570e0b
--- /dev/null
+++ b/be/test/exprs/hash_function_test.cpp
@@ -0,0 +1,52 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include
+
+#include
+#include
+
+#include "exprs/anyval_util.h"
+#include "exprs/hash_functions.h"
+#include "testutil/function_utils.h"
+#include "testutil/test_util.h"
+
+namespace doris {
+
+class HashFunctionsTest : public testing::Test {
+public:
+    HashFunctionsTest() = default;
+
+    void SetUp() override {
+        utils = new FunctionUtils();
+        ctx = utils->get_fn_ctx();
+    }
+    void TearDown() override { delete utils; }
+
+protected:
+    FunctionUtils* utils = nullptr;
+    FunctionContext* ctx = nullptr;
+};
+
+TEST_F(HashFunctionsTest, murmur_hash3_64) {
+    StringVal input = AnyValUtil::from_string_temp(ctx, std::string("hello"));
+    BigIntVal result = HashFunctions::murmur_hash3_64(ctx, 1, &input);
+    BigIntVal expected((int64_t)-3215607508166160593);
+
+    EXPECT_EQ(expected, result);
+}
+} // namespace doris
# NOTE(review) on the hash_function_test.cpp hunk that ends just above:
#  - fixture members are `protected` rather than `private`, because the TEST_F body runs in a
#    class derived from the fixture and reads `ctx`; both pointers start out as nullptr;
#  - SetUp/TearDown are marked `override`;
#  - the file now ends with a newline (the "\ No newline at end of file" marker is gone).
# The bare `#include` lines lost their `<...>` targets in extraction; left as found.
#
# Vectorized-engine test for murmur_hash3_64: one, two and three string arguments, plus rows
# where a NULL argument must yield NULL. The single-argument expectation for "hello" is the
# same value asserted in hash_function_test.cpp.
# NOTE(review): `check_function(...)` presumably lost its template arguments in extraction.
diff --git a/be/test/vec/function/function_hash_test.cpp b/be/test/vec/function/function_hash_test.cpp
index be22cea4bd5fd1..45781811810103 100644
--- a/be/test/vec/function/function_hash_test.cpp
+++ b/be/test/vec/function/function_hash_test.cpp
@@ -55,6 +55,39 @@ TEST(HashFunctionTest, murmur_hash_3_test) {
     };
 }
 
+TEST(HashFunctionTest, murmur_hash_3_64_test) {
+    std::string func_name = "murmur_hash3_64";
+
+    {
+        InputTypeSet input_types = {TypeIndex::String};
+
+        DataSet data_set = {{{Null()}, Null()},
+                            {{std::string("hello")}, (int64_t)-3215607508166160593}};
+
+        check_function(func_name, input_types, data_set);
+    };
+
+    {
+        InputTypeSet input_types = {TypeIndex::String, TypeIndex::String};
+
+        DataSet data_set = {
+                {{std::string("hello"), std::string("world")}, (int64_t)3583109472027628045},
+                {{std::string("hello"), Null()}, Null()}};
+
+        check_function(func_name, input_types, data_set);
+    };
+
+    {
+        InputTypeSet input_types = {TypeIndex::String, TypeIndex::String, TypeIndex::String};
+
+        DataSet data_set = {{{std::string("hello"), std::string("world"), std::string("!")},
+                             (int64_t)1887828212617890932},
+                            {{std::string("hello"), std::string("world"), Null()}, Null()}};
+
+        check_function(func_name, input_types, data_set);
+    };
+}
+
 TEST(HashFunctionTest, murmur_hash_2_test) {
     std::string func_name = "murmurHash2_64";
 
# English reference page for murmur_hash3_64. The description now also states the NULL
# behaviour that the implementation and tests in this patch show.
# NOTE(review): an HTML comment between the front matter and the heading (presumably the
# license header) seems to have been stripped, so fewer lines are visible than the hunk
# header declares.
diff --git a/docs/en/docs/sql-manual/sql-functions/hash-functions/murmur_hash3_64.md b/docs/en/docs/sql-manual/sql-functions/hash-functions/murmur_hash3_64.md
new file mode 100644
index 00000000000000..cd05f72b05f2fa
--- /dev/null
+++ b/docs/en/docs/sql-manual/sql-functions/hash-functions/murmur_hash3_64.md
@@ -0,0 +1,61 @@
+---
+{
+    "title": "murmur_hash3_64",
+    "language": "en"
+}
+---
+
+
+
+## murmur_hash3_64
+
+### description
+#### Syntax
+
+`BIGINT MURMUR_HASH3_64(VARCHAR input, ...)`
+
+Returns the 64-bit murmur3 hash of the input string(s). Returns NULL if any argument is NULL.
+
+### example
+
+```
+mysql> select murmur_hash3_64(null);
++-----------------------+
+| murmur_hash3_64(NULL) |
++-----------------------+
+|                  NULL |
++-----------------------+
+
+mysql> select murmur_hash3_64("hello");
++--------------------------+
+| murmur_hash3_64('hello') |
++--------------------------+
+|     -3215607508166160593 |
++--------------------------+
+
+mysql> select murmur_hash3_64("hello", "world");
++-----------------------------------+
+| murmur_hash3_64('hello', 'world') |
++-----------------------------------+
+|               3583109472027628045 |
++-----------------------------------+
+```
+
+### keywords
+
+    MURMUR_HASH3_64,HASH
# Chinese reference page; same structure and examples as the English page.
diff --git a/docs/zh-CN/docs/sql-manual/sql-functions/hash-functions/murmur_hash3_64.md b/docs/zh-CN/docs/sql-manual/sql-functions/hash-functions/murmur_hash3_64.md
new file mode 100644
index 00000000000000..c25861444c40fb
--- /dev/null
+++ b/docs/zh-CN/docs/sql-manual/sql-functions/hash-functions/murmur_hash3_64.md
@@ -0,0 +1,61 @@
+---
+{
+    "title": "murmur_hash3_64",
+    "language": "zh-CN"
+}
+---
+
+
+
+## murmur_hash3_64
+
+### description
+#### Syntax
+
+`BIGINT MURMUR_HASH3_64(VARCHAR input, ...)`
+
+返回输入字符串的 64 位 murmur3 hash 值；任一参数为 NULL 时返回 NULL。
+
+### example
+
+```
+mysql> select murmur_hash3_64(null);
++-----------------------+
+| murmur_hash3_64(NULL) |
++-----------------------+
+|                  NULL |
++-----------------------+
+
+mysql> select murmur_hash3_64("hello");
++--------------------------+
+| murmur_hash3_64('hello') |
++--------------------------+
+|     -3215607508166160593 |
++--------------------------+
+
+mysql> select murmur_hash3_64("hello", "world");
++-----------------------------------+
+| murmur_hash3_64('hello', 'world') |
++-----------------------------------+
+|               3583109472027628045 |
++-----------------------------------+
+```
+
+### keywords
+
+    MURMUR_HASH3_64,HASH
# Registers murmur_hash3_64 as a BIGINT builtin for variadic VARCHAR and variadic STRING
# arguments; both entries point at the same HashFunctions::murmur_hash3_64 symbol.
diff --git a/gensrc/script/doris_builtins_functions.py b/gensrc/script/doris_builtins_functions.py
index ab0353f826259e..8cce6f2adea3bd 100755
--- a/gensrc/script/doris_builtins_functions.py
+++ b/gensrc/script/doris_builtins_functions.py
@@ -1245,6 +1245,12 @@
     [['murmur_hash3_32'], 'INT', ['STRING', '...'],
         '_ZN5doris13HashFunctions15murmur_hash3_32EPN9doris_udf15FunctionContextEiPKNS1_9StringValE',
         '', '', 'vec', ''],
+    [['murmur_hash3_64'], 'BIGINT', ['VARCHAR', '...'],
+        '_ZN5doris13HashFunctions15murmur_hash3_64EPN9doris_udf15FunctionContextEiPKNS1_9StringValE',
+        '', '', 'vec', ''],
+    [['murmur_hash3_64'], 'BIGINT', ['STRING', '...'],
+        '_ZN5doris13HashFunctions15murmur_hash3_64EPN9doris_udf15FunctionContextEiPKNS1_9StringValE',
+        '', '', 'vec', ''],
 
     # aes and base64 function
     [['aes_encrypt'], 'VARCHAR', ['VARCHAR', 'VARCHAR'],
# Expected output for the six queries in test_hash_function.groovy, in order:
# three murmur_hash3_32 results followed by three murmur_hash3_64 results.
diff --git a/regression-test/data/query_p0/sql_functions/hash_functions/test_hash_function.out b/regression-test/data/query_p0/sql_functions/hash_functions/test_hash_function.out
new file mode 100644
index 00000000000000..c7b9485d454b65
--- /dev/null
+++ b/regression-test/data/query_p0/sql_functions/hash_functions/test_hash_function.out
@@ -0,0 +1,18 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !sql --
+\N
+
+-- !sql --
+1321743225
+
+-- !sql --
+984713481
+
+-- !sql --
+\N
+
+-- !sql --
+-3215607508166160593
+
+-- !sql --
+3583109472027628045
# Regression suite: enables the vectorized engine, then runs NULL, one-argument and
# two-argument queries against murmur_hash3_32 and murmur_hash3_64.
diff --git a/regression-test/suites/query_p0/sql_functions/hash_functions/test_hash_function.groovy b/regression-test/suites/query_p0/sql_functions/hash_functions/test_hash_function.groovy
new file mode 100644
index 00000000000000..3f2bccaee524b0
--- /dev/null
+++ b/regression-test/suites/query_p0/sql_functions/hash_functions/test_hash_function.groovy
@@ -0,0 +1,28 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+suite("test_hash_function") {
+    sql "set enable_vectorized_engine = true;"
+    sql "set batch_size = 4096;"
+
+    qt_sql "SELECT murmur_hash3_32(null);"
+    qt_sql "SELECT murmur_hash3_32(\"hello\");"
+    qt_sql "SELECT murmur_hash3_32(\"hello\", \"world\");"
+
+    qt_sql "SELECT murmur_hash3_64(null);"
+    qt_sql "SELECT murmur_hash3_64(\"hello\");"
+    qt_sql "SELECT murmur_hash3_64(\"hello\", \"world\");"
+}