Skip to content

Commit

Permalink
[function](hash) add support of murmur_hash3_64 (#12923) (#14636)
Browse files Browse the repository at this point in the history
  • Loading branch information
jacktengg committed Nov 28, 2022
1 parent 3e0a2d9 commit 1ec59f7
Show file tree
Hide file tree
Showing 13 changed files with 342 additions and 73 deletions.
14 changes: 14 additions & 0 deletions be/src/exprs/hash_functions.cpp
Expand Up @@ -40,4 +40,18 @@ IntVal HashFunctions::murmur_hash3_32(FunctionContext* ctx, int num_children,
return seed;
}

// Computes a 64-bit MurmurHash3 over `num_children` string arguments.
//
// The arguments are hashed left to right. The first argument is hashed with
// seed 0 and every following argument is hashed with the previous argument's
// hash as its seed, so the result depends on both content and order.
// Returns NULL as soon as a NULL argument is encountered, and 0 when
// `num_children` is not positive. `ctx` is not used.
BigIntVal HashFunctions::murmur_hash3_64(FunctionContext* ctx, int num_children,
                                         const StringVal* inputs) {
    uint64_t digest = 0;
    uint64_t chained_seed = 0;
    for (int idx = 0; idx < num_children; ++idx) {
        const StringVal& arg = inputs[idx];
        if (arg.is_null) {
            // A single NULL argument makes the whole result NULL.
            return BigIntVal::null();
        }
        murmur_hash3_x64_64(arg.ptr, arg.len, chained_seed, &digest);
        // Feed this argument's hash into the next round.
        chained_seed = digest;
    }
    return digest;
}

} // namespace doris
3 changes: 3 additions & 0 deletions be/src/exprs/hash_functions.h
Expand Up @@ -20,6 +20,7 @@
// Forward declarations of the doris_udf types that appear in the
// HashFunctions signatures below; only pointers and return types are named
// in this header, so the declarations are sufficient here.
namespace doris_udf {
class FunctionContext;
struct IntVal;
struct BigIntVal;
struct StringVal;
} // namespace doris_udf

Expand All @@ -30,6 +31,8 @@ class HashFunctions {
static void init();
static doris_udf::IntVal murmur_hash3_32(doris_udf::FunctionContext* ctx, int num_children,
const doris_udf::StringVal* inputs);
// 64-bit MurmurHash3 over `num_children` string arguments in `inputs`.
// The first argument is hashed with seed 0; each later argument is hashed
// using the previous argument's hash as its seed. The result is NULL if any
// argument is NULL.
static doris_udf::BigIntVal murmur_hash3_64(doris_udf::FunctionContext* ctx, int num_children,
const doris_udf::StringVal* inputs);
};

} // namespace doris
55 changes: 3 additions & 52 deletions be/src/util/hash_util.hpp
Expand Up @@ -119,60 +119,11 @@ class HashUtil {
// refer to https://github.com/apache/commons-codec/blob/master/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
static const uint32_t MURMUR3_32_SEED = 104729;

// Rotates the 32-bit value `x` left by `r` bit positions.
//
// The rotation count is reduced modulo 32 before shifting. Without the mask,
// `r == 0` would evaluate `x >> 32` and a negative `r` would evaluate a
// negative shift, both of which are undefined behavior for a 32-bit operand.
// Counts in 1..31 produce exactly the same result as the unmasked
// `(x << r) | (x >> (32 - r))`.
ALWAYS_INLINE static uint32_t rotl32(uint32_t x, int8_t r) {
    const uint32_t left = static_cast<uint32_t>(r) & 31u;
    // (32 - left) & 31 maps left == 0 to a right shift of 0 instead of 32.
    const uint32_t right = (32u - left) & 31u;
    return (x << left) | (x >> right);
}

// Scrambles the bits of `h` with three xor-shift steps (16, 13, 16)
// separated by two multiplications by fixed odd constants, and returns the
// mixed value. All arithmetic is unsigned 32-bit and wraps on overflow.
ALWAYS_INLINE static uint32_t fmix32(uint32_t h) {
    h = (h ^ (h >> 16)) * 0x85ebca6b;
    h = (h ^ (h >> 13)) * 0xc2b2ae35;
    return h ^ (h >> 16);
}

// Adapted from https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
//
// Returns a 32-bit hash of the `len` bytes starting at `key`, using `seed`
// as the initial state.
//
// NOTE(review): as written, the three statements after `return h1;` are
// unreachable. They look like a replacement body that delegates to
// murmur_hash3_x86_32 shown together with the older inline implementation;
// confirm which of the two bodies is intended to remain.
static uint32_t murmur_hash3_32(const void* key, int32_t len, uint32_t seed) {
const uint8_t* data = (const uint8_t*)key;
// Number of complete 4-byte words in the input.
const int nblocks = len / 4;

uint32_t h1 = seed;

const uint32_t c1 = 0xcc9e2d51;
const uint32_t c2 = 0x1b873593;
// `blocks` points just past the last complete word; the loop below indexes
// it with negative offsets.
// NOTE(review): words are read through a uint32_t pointer derived from the
// byte pointer -- presumably unaligned reads are acceptable on the target;
// confirm.
const uint32_t* blocks = (const uint32_t*)(data + nblocks * 4);

// i runs from -nblocks up to (but not including) 0, so blocks[i] visits the
// complete words from first to last.
for (int i = -nblocks; i; i++) {
uint32_t k1 = blocks[i];

k1 *= c1;
k1 = rotl32(k1, 15);
k1 *= c2;

h1 ^= k1;
h1 = rotl32(h1, 13);
h1 = h1 * 5 + 0xe6546b64;
}

// Remaining 0..3 bytes after the last complete word.
const uint8_t* tail = (const uint8_t*)(data + nblocks * 4);
uint32_t k1 = 0;
// The cases intentionally fall through: a 3-byte tail also folds in
// bytes 1 and 0, a 2-byte tail also folds in byte 0.
switch (len & 3) {
case 3:
k1 ^= tail[2] << 16;
// fall through
case 2:
k1 ^= tail[1] << 8;
// fall through
case 1:
k1 ^= tail[0];
k1 *= c1;
k1 = rotl32(k1, 15);
k1 *= c2;
h1 ^= k1;
};

// Mix in the input length, then run the final bit-mixing step.
h1 ^= len;
h1 = fmix32(h1);
return h1;
// NOTE(review): unreachable as written -- see the note at the top.
uint32_t out = 0;
murmur_hash3_x86_32(key, len, seed, &out);
return out;
}

static const int MURMUR_R = 47;
Expand Down
4 changes: 2 additions & 2 deletions be/src/util/murmur_hash3.cpp
Expand Up @@ -31,11 +31,11 @@

#define FORCE_INLINE inline __attribute__((always_inline))

inline uint32_t rotl32(uint32_t x, int8_t r) {
FORCE_INLINE uint32_t rotl32(uint32_t x, int8_t r) {
return (x << r) | (x >> (32 - r));
}

inline uint64_t rotl64(uint64_t x, int8_t r) {
FORCE_INLINE uint64_t rotl64(uint64_t x, int8_t r) {
return (x << r) | (x >> (64 - r));
}

Expand Down
79 changes: 60 additions & 19 deletions be/src/vec/functions/function_hash.cpp
Expand Up @@ -155,9 +155,22 @@ struct MurmurHash2Impl64 {
};
using FunctionMurmurHash2_64 = FunctionVariadicArgumentsBase<DataTypeUInt64, MurmurHash2Impl64>;

struct MurmurHash3Impl32 {
// Maps a hash result type to the SQL-visible function name.
// The primary template is intentionally empty: only the specializations
// below provide a `name` member, so using any other ReturnType fails to
// compile when `name` is looked up.
template <typename ReturnType>
struct MurmurHash3ImplName {};

// 32-bit variant.
template <>
struct MurmurHash3ImplName<Int32> {
static constexpr auto name = "murmur_hash3_32";
// NOTE(review): the Int64 specialization has no matching alias; confirm
// whether this one is still needed.
using ReturnType = Int32;
};

// 64-bit variant.
template <>
struct MurmurHash3ImplName<Int64> {
static constexpr auto name = "murmur_hash3_64";
};

template <typename ReturnType>
struct MurmurHash3Impl {
static constexpr auto name = MurmurHash3ImplName<ReturnType>::name;

static Status empty_apply(IColumn& icolumn, size_t input_rows_count) {
ColumnVector<ReturnType>& vec_to = assert_cast<ColumnVector<ReturnType>&>(icolumn);
Expand All @@ -178,6 +191,7 @@ struct MurmurHash3Impl32 {
template <bool first>
static Status execute(const IDataType* type, const IColumn* column, size_t input_rows_count,
IColumn& col_to) {
auto* col_to_data = assert_cast<ColumnVector<ReturnType>&>(col_to).get_data().data();
if (const ColumnString* col_from = check_and_get_column<ColumnString>(column)) {
const typename ColumnString::Chars& data = col_from->get_chars();
const typename ColumnString::Offsets& offsets = col_from->get_offsets();
Expand All @@ -186,16 +200,29 @@ struct MurmurHash3Impl32 {
ColumnString::Offset current_offset = 0;
for (size_t i = 0; i < size; ++i) {
if (first) {
UInt32 val = HashUtil::murmur_hash3_32(
reinterpret_cast<const char*>(&data[current_offset]),
offsets[i] - current_offset - 1, HashUtil::MURMUR3_32_SEED);
col_to.insert_data(const_cast<const char*>(reinterpret_cast<char*>(&val)), 0);
if constexpr (std::is_same_v<ReturnType, Int32>) {
UInt32 val = HashUtil::murmur_hash3_32(
reinterpret_cast<const char*>(&data[current_offset]),
offsets[i] - current_offset - 1, HashUtil::MURMUR3_32_SEED);
col_to.insert_data(const_cast<const char*>(reinterpret_cast<char*>(&val)),
0);
} else {
UInt64 val = 0;
murmur_hash3_x64_64(reinterpret_cast<const char*>(&data[current_offset]),
offsets[i] - current_offset - 1, 0, &val);
col_to.insert_data(const_cast<const char*>(reinterpret_cast<char*>(&val)),
0);
}
} else {
assert_cast<ColumnVector<ReturnType>&>(col_to).get_data()[i] =
HashUtil::murmur_hash3_32(
reinterpret_cast<const char*>(&data[current_offset]),
offsets[i] - current_offset - 1,
ext::bit_cast<UInt32>(col_to[i]));
if constexpr (std::is_same_v<ReturnType, Int32>) {
col_to_data[i] = HashUtil::murmur_hash3_32(
reinterpret_cast<const char*>(&data[current_offset]),
offsets[i] - current_offset - 1, ext::bit_cast<UInt32>(col_to[i]));
} else {
murmur_hash3_x64_64(reinterpret_cast<const char*>(&data[current_offset]),
offsets[i] - current_offset - 1,
ext::bit_cast<UInt64>(col_to[i]), col_to_data + i);
}
}
current_offset = offsets[i];
}
Expand All @@ -204,13 +231,25 @@ struct MurmurHash3Impl32 {
String value = col_from_const->get_value<String>().data();
for (size_t i = 0; i < input_rows_count; ++i) {
if (first) {
UInt32 val = HashUtil::murmur_hash3_32(value.data(), value.size(),
HashUtil::MURMUR3_32_SEED);
col_to.insert_data(const_cast<const char*>(reinterpret_cast<char*>(&val)), 0);
if constexpr (std::is_same_v<ReturnType, Int32>) {
UInt32 val = HashUtil::murmur_hash3_32(value.data(), value.size(),
HashUtil::MURMUR3_32_SEED);
col_to.insert_data(const_cast<const char*>(reinterpret_cast<char*>(&val)),
0);
} else {
UInt64 val = 0;
murmur_hash3_x64_64(value.data(), value.size(), 0, &val);
col_to.insert_data(const_cast<const char*>(reinterpret_cast<char*>(&val)),
0);
}
} else {
assert_cast<ColumnVector<ReturnType>&>(col_to).get_data()[i] =
HashUtil::murmur_hash3_32(value.data(), value.size(),
ext::bit_cast<UInt32>(col_to[i]));
if constexpr (std::is_same_v<ReturnType, Int32>) {
col_to_data[i] = HashUtil::murmur_hash3_32(
value.data(), value.size(), ext::bit_cast<UInt32>(col_to[i]));
} else {
murmur_hash3_x64_64(value.data(), value.size(),
ext::bit_cast<UInt64>(col_to[i]), col_to_data + i);
}
}
}
} else {
Expand All @@ -221,10 +260,12 @@ struct MurmurHash3Impl32 {
return Status::OK();
}
};
using FunctionMurmurHash3_32 = FunctionVariadicArgumentsBase<DataTypeInt32, MurmurHash3Impl32>;
using FunctionMurmurHash3_32 = FunctionVariadicArgumentsBase<DataTypeInt32, MurmurHash3Impl<Int32>>;
using FunctionMurmurHash3_64 = FunctionVariadicArgumentsBase<DataTypeInt64, MurmurHash3Impl<Int64>>;

// Registers the hash functions defined in this file with `factory`:
// the 64-bit MurmurHash2 function and the 32-bit and 64-bit MurmurHash3
// functions.
void register_function_function_hash(SimpleFunctionFactory& factory) {
factory.register_function<FunctionMurmurHash2_64>();
factory.register_function<FunctionMurmurHash3_32>();
factory.register_function<FunctionMurmurHash3_64>();
}
} // namespace doris::vectorized
} // namespace doris::vectorized
1 change: 1 addition & 0 deletions be/test/CMakeLists.txt
Expand Up @@ -113,6 +113,7 @@ set(EXPRS_TEST_FILES
exprs/bloom_filter_predicate_test.cpp
exprs/array_functions_test.cpp
exprs/window_funnel_test.cpp
exprs/hash_function_test.cpp
)
set(GEO_TEST_FILES
geo/wkt_parse_test.cpp
Expand Down
52 changes: 52 additions & 0 deletions be/test/exprs/hash_function_test.cpp
@@ -0,0 +1,52 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include <gtest/gtest.h>

#include <iostream>
#include <string>

#include "exprs/anyval_util.h"
#include "exprs/hash_functions.h"
#include "testutil/function_utils.h"
#include "testutil/test_util.h"

namespace doris {

class HashFunctionsTest : public testing::Test {
public:
HashFunctionsTest() = default;

void SetUp() {
utils = new FunctionUtils();
ctx = utils->get_fn_ctx();
}
void TearDown() { delete utils; }

private:
FunctionUtils* utils;
FunctionContext* ctx;
};

// Hashing the single string "hello" must yield the known 64-bit value.
TEST_F(HashFunctionsTest, murmur_hash3_64) {
    StringVal arg = AnyValUtil::from_string_temp(ctx, std::string("hello"));

    BigIntVal want(static_cast<int64_t>(-3215607508166160593));
    BigIntVal got = HashFunctions::murmur_hash3_64(ctx, 1, &arg);

    EXPECT_EQ(want, got);
}
} // namespace doris
33 changes: 33 additions & 0 deletions be/test/vec/function/function_hash_test.cpp
Expand Up @@ -55,6 +55,39 @@ TEST(HashFunctionTest, murmur_hash_3_test) {
};
}

// Checks murmur_hash3_64 through the vectorized function framework with one,
// two and three string arguments. Each group contains one row with a known
// hash value and one row where a NULL argument must give a NULL result.
TEST(HashFunctionTest, murmur_hash_3_64_test) {
    std::string func_name = "murmur_hash3_64";

    // One argument.
    {
        InputTypeSet arg_types = {TypeIndex::String};

        DataSet rows = {{{Null()}, Null()},
                        {{std::string("hello")}, static_cast<int64_t>(-3215607508166160593)}};

        check_function<DataTypeInt64, true>(func_name, arg_types, rows);
    }

    // Two arguments.
    {
        InputTypeSet arg_types = {TypeIndex::String, TypeIndex::String};

        DataSet rows = {{{std::string("hello"), std::string("world")},
                         static_cast<int64_t>(3583109472027628045)},
                        {{std::string("hello"), Null()}, Null()}};

        check_function<DataTypeInt64, true>(func_name, arg_types, rows);
    }

    // Three arguments.
    {
        InputTypeSet arg_types = {TypeIndex::String, TypeIndex::String, TypeIndex::String};

        DataSet rows = {{{std::string("hello"), std::string("world"), std::string("!")},
                         static_cast<int64_t>(1887828212617890932)},
                        {{std::string("hello"), std::string("world"), Null()}, Null()}};

        check_function<DataTypeInt64, true>(func_name, arg_types, rows);
    }
}

TEST(HashFunctionTest, murmur_hash_2_test) {
std::string func_name = "murmurHash2_64";

Expand Down
@@ -0,0 +1,61 @@
---
{
"title": "murmur_hash3_64",
"language": "en"
}
---

<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->

## murmur_hash3_64

### description
#### Syntax

`BIGINT MURMUR_HASH3_64(VARCHAR input, ...)`

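Returns the 64-bit murmur3 hash of the input string. When several strings are given, they are hashed in order, with each string's hash used as the seed for the next. Returns NULL if any argument is NULL.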

### example

```
mysql> select murmur_hash3_64(null);
+-----------------------+
| murmur_hash3_64(NULL) |
+-----------------------+
| NULL |
+-----------------------+
mysql> select murmur_hash3_64("hello");
+--------------------------+
| murmur_hash3_64('hello') |
+--------------------------+
| -3215607508166160593 |
+--------------------------+
mysql> select murmur_hash3_64("hello", "world");
+-----------------------------------+
| murmur_hash3_64('hello', 'world') |
+-----------------------------------+
| 3583109472027628045 |
+-----------------------------------+
```

### keywords

MURMUR_HASH3_64,HASH

0 comments on commit 1ec59f7

Please sign in to comment.