[arm64] implement with native NEON instructions #57

Closed · wants to merge 1 commit
4 changes: 2 additions & 2 deletions build-support/lint.sh
@@ -38,13 +38,13 @@ done

 if $ONLY_CHANGED; then
   FILES=$(git diff --name-only $($ROOT/build-support/get-upstream-commit.sh) \
-    | egrep '\.(cc|h)$' | grep -v "gutil\|trace_event\|x509_check_host\|sse2neon\.h")
+    | egrep '\.(cc|h)$' | grep -v "gutil\|trace_event\|x509_check_host\.h")
   if [ -z "$FILES" ]; then
     echo No source files changed
     exit 0
   fi
 else
-  FILES=$(find $ROOT/src -name '*.cc' -or -name '*.h' | grep -v "\.pb\.\|\.service\.\|\.proxy\.\|\.krpc\.\|gutil\|trace_event\|kudu_export\.h\|x509_check_host\|sse2neon\.h")
+  FILES=$(find $ROOT/src -name '*.cc' -or -name '*.h' | grep -v "\.pb\.\|\.service\.\|\.proxy\.\|\.krpc\.\|gutil\|trace_event\|kudu_export\.h\|x509_check_host\.h")
 fi
 
 cpplint_filter="+runtime/broken_libstdcpp_regex,-whitespace/comments,-readability/todo,-readability/inheritance,-build/header_guard,-build/include_order,-legal/copyright,-build/c++11,-readability/nolint"
6 changes: 3 additions & 3 deletions src/kudu/common/columnar_serialization.cc
@@ -17,9 +17,7 @@

#include "kudu/common/columnar_serialization.h"

#ifdef __aarch64__
#include "kudu/util/sse2neon.h" // IWYU pragma: keep
#else
#ifdef __x86_64__
#include <emmintrin.h>
#include <immintrin.h>
#endif
@@ -221,11 +219,13 @@ void CopyNonNullBitmapImpl(
   bw.Flush();
 }
 
+#ifdef __x86_64__
 struct PextZp7Clmul {
   inline static uint64_t call(uint64_t val, uint64_t mask) {
     return zp7_pext_64_clmul(val, mask);
   }
 };
+#endif
 struct PextZp7Simple {
   inline static uint64_t call(uint64_t val, uint64_t mask) {
     return zp7_pext_64_simple(val, mask);
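
An aside on semantics for readers who don't know PEXT: both structs dispatch to software implementations of the x86 BMI2 "parallel bits extract" instruction, which gathers the bits of a value selected by a mask into the low-order bits of the result. A minimal scalar sketch of that contract (illustrative only; pext_naive is a hypothetical name, not part of this patch):

    #include <cassert>
    #include <cstdint>

    // pext(val, mask): collect the bits of 'val' at the positions where
    // 'mask' has a 1, packing them into the low bits of the result.
    static uint64_t pext_naive(uint64_t val, uint64_t mask) {
      uint64_t out = 0;
      int k = 0;
      for (int i = 0; i < 64; ++i) {
        if ((mask >> i) & 1) {
          out |= ((val >> i) & 1ULL) << k++;
        }
      }
      return out;
    }

    int main() {
      // The bits of 0b101100 at mask positions 1, 2, 3 are 0, 1, 1,
      // so they pack into 0b110.
      assert(pext_naive(0b101100, 0b001110) == 0b110);
      return 0;
    }

zp7_pext_64_simple computes the same result using only shifts and XORs, while zp7_pext_64_clmul accelerates it with the x86 CLMUL instruction; guarding the CLMUL variant behind __x86_64__ is what lets this file compile on arm64 without sse2neon.h.
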
28 changes: 12 additions & 16 deletions src/kudu/common/zp7.cc
@@ -38,11 +38,6 @@

 #ifdef __x86_64__
 #include <emmintrin.h>
-#elif defined(__aarch64__)
-#include "kudu/util/sse2neon.h"
-#endif
-
-#ifndef __aarch64__
 #if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 5
 #define USE_INLINE_ASM_CLMUL
 #else
@@ -59,14 +54,7 @@ typedef struct {
   uint64_t ppp_bit[N_BITS];
 } zp7_masks_64_t;
 
-// If we don't have access to the CLMUL instruction, emulate it with
-// shifts and XORs
-static inline uint64_t prefix_sum(uint64_t x) {
-  for (int i = 0; i < N_BITS; i++)
-    x ^= x << (1 << i);
-  return x;
-}
-
+#ifdef __x86_64__
 // GCC <5 doesn't properly handle the _pext_u64 intrinsic inside
 // a function with a specified target attribute. So, use inline
 // assembly instead.
@@ -82,16 +70,13 @@ static inline __m128i asm_mm_clmulepi64_si128(__m128i a, __m128i b) {
 #define CLMUL(a, b) (_mm_clmulepi64_si128(a, b, 0))
 #endif
 
-
 // Parallel-prefix-popcount. This is used by both the PEXT/PDEP polyfills.
 // It can also be called separately and cached, if the mask values will be used
 // more than once (these can be shared across PEXT and PDEP calls if they use
 // the same masks).
 //
 // This variant depends on the CLMUL instruction.
-#ifndef __aarch64__
 __attribute__((target("pclmul")))
-#endif // __aarch64__
 ATTRIBUTE_NO_SANITIZE_INTEGER
 static zp7_masks_64_t zp7_ppp_64_clmul(uint64_t mask) {
   zp7_masks_64_t r;
@@ -124,6 +109,15 @@ static zp7_masks_64_t zp7_ppp_64_clmul(uint64_t mask) {

   return r;
 }
+#endif // __x86_64__
+
+// If we don't have access to the CLMUL instruction, emulate it with
+// shifts and XORs
+static inline uint64_t prefix_sum(uint64_t x) {
+  for (int i = 0; i < N_BITS; i++)
+    x ^= x << (1 << i);
+  return x;
+}
 
 // Implementation that doesn't depend on CLMUL
 ATTRIBUTE_NO_SANITIZE_INTEGER
@@ -171,9 +165,11 @@ uint64_t zp7_pext_64_simple(uint64_t a, uint64_t mask) {
return zp7_pext_pre_64(a, &masks);
}

#ifdef __x86_64__
uint64_t zp7_pext_64_clmul(uint64_t a, uint64_t mask) {
zp7_masks_64_t masks = zp7_ppp_64_clmul(mask);
return zp7_pext_pre_64(a, &masks);
}
#endif // __x86_64__

} // namespace kudu
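
Relocating prefix_sum next to the portable implementation also makes its role clearer: carry-less multiplication by an all-ones word is exactly a prefix XOR, and six doubling rounds of shift-and-XOR compute the same thing for a 64-bit word. A standalone check of that equivalence (an illustrative sketch, hard-coding N_BITS = 6 as in zp7.cc; prefix_xor_naive is a hypothetical name):

    #include <cassert>
    #include <cstdint>

    // Reference: bit i of the prefix XOR is the XOR of bits 0..i of x.
    static uint64_t prefix_xor_naive(uint64_t x) {
      uint64_t out = 0;
      uint64_t acc = 0;
      for (int i = 0; i < 64; ++i) {
        acc ^= (x >> i) & 1ULL;
        out |= acc << i;
      }
      return out;
    }

    // The emulation from zp7.cc: log2(64) = 6 shift-and-XOR rounds.
    static uint64_t prefix_sum(uint64_t x) {
      for (int i = 0; i < 6; ++i) {
        x ^= x << (1 << i);
      }
      return x;
    }

    int main() {
      assert(prefix_sum(0b1010) == 0b0110);
      assert(prefix_sum(0xdeadbeefcafef00dULL) ==
             prefix_xor_naive(0xdeadbeefcafef00dULL));
      return 0;
    }
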
82 changes: 74 additions & 8 deletions src/kudu/util/block_bloom_filter.cc
@@ -18,7 +18,7 @@
#include "kudu/util/block_bloom_filter.h"

#ifdef __aarch64__
#include "kudu/util/sse2neon.h"
#include <arm_neon.h>
#else //__aarch64__
#include <emmintrin.h>
#include <mm_malloc.h>
@@ -176,6 +176,55 @@ void BlockBloomFilter::Close() {
   }
 }
 
+#ifdef __aarch64__
+// A static helper function for the arm64 methods. Turns a 32-bit hash into a 256-bit
+// Bucket with 1 single 1-bit set in each 32-bit lane.
+static inline ATTRIBUTE_ALWAYS_INLINE uint32x4x2_t MakeMask(const uint32_t hash) {
+  const uint32x4_t ones = vdupq_n_u32(1);
+  constexpr uint32_t c[8] = {BLOOM_HASH_CONSTANTS};
+  const uint32x4x2_t rehash = vld1q_u32_x2(c);
+  // Load hash, repeated 4 times.
+  uint32x4_t hash_data = vdupq_n_u32(hash);
+
+  // Multiply-shift hashing ala Dietzfelbinger et al.: multiply 'hash' by eight different
+  // odd constants, then keep the 5 most significant bits from each product.
+  int32x4x2_t t;
+  t.val[0] = vreinterpretq_s32_u32(vshrq_n_u32(vmulq_u32(rehash.val[0], hash_data), 27));
+  t.val[1] = vreinterpretq_s32_u32(vshrq_n_u32(vmulq_u32(rehash.val[1], hash_data), 27));
+
+  // Use these 5 bits to shift a single bit to a location in each 32-bit lane
+  uint32x4x2_t res;
+  res.val[0] = vshlq_u32(ones, t.val[0]);
+  res.val[1] = vshlq_u32(ones, t.val[1]);
+  return res;
+}
+
+ATTRIBUTE_NO_SANITIZE_INTEGER
+void BlockBloomFilter::BucketInsert(const uint32_t bucket_idx, const uint32_t hash) noexcept {
+  const uint32x4x2_t mask = MakeMask(hash);
+  uint32x4x2_t* addr = &(reinterpret_cast<uint32x4x2_t*>(directory_)[bucket_idx]);
+  uint32_t* bucket = reinterpret_cast<uint32_t*>(addr);
+  uint32x4x2_t data = vld1q_u32_x2(bucket);
+  data.val[0] = vorrq_u32(data.val[0], mask.val[0]);
+  data.val[1] = vorrq_u32(data.val[1], mask.val[1]);
+  vst1q_u32_x2(bucket, data);
+}
+
+ATTRIBUTE_NO_SANITIZE_INTEGER
+bool BlockBloomFilter::BucketFind(
+    const uint32_t bucket_idx, const uint32_t hash) const noexcept {
+  const uint32x4x2_t mask = MakeMask(hash);
+  uint32x4x2_t* addr = &(reinterpret_cast<uint32x4x2_t*>(directory_)[bucket_idx]);
+  uint32_t* bucket = reinterpret_cast<uint32_t*>(addr);
+  uint32x4x2_t data = vld1q_u32_x2(bucket);
+  // We should return true if 'bucket' has a one wherever 'mask' does.
+  uint32x4_t t0 = vtstq_u32(data.val[0], mask.val[0]);
+  uint32x4_t t1 = vtstq_u32(data.val[1], mask.val[1]);
+  int64x2_t t = vreinterpretq_s64_u32(vandq_u32(t0, t1));
+  int64_t a = vgetq_lane_s64(t, 0) & vgetq_lane_s64(t, 1);
+  return a == -1;
+}
+#elif defined(__x86_64__)
 ATTRIBUTE_NO_SANITIZE_INTEGER
 void BlockBloomFilter::BucketInsert(const uint32_t bucket_idx, const uint32_t hash) noexcept {
   // new_bucket will be all zeros except for eight 1-bits, one in each 32-bit word. It is
@@ -188,17 +237,10 @@ void BlockBloomFilter::BucketInsert(const uint32_t bucket_idx, const uint32_t ha
     new_bucket[i] = 1U << new_bucket[i];
   }
   for (int i = 0; i < 2; ++i) {
-#ifdef __aarch64__
-    // IWYU pragma: no_include <arm_neon.h>
-    uint8x16_t new_bucket_neon = vreinterpretq_u8_u32(vld1q_u32(new_bucket + 4 * i));
-    uint8x16_t* existing_bucket = reinterpret_cast<uint8x16_t*>(&directory_[bucket_idx][4 * i]);
-    *existing_bucket = vorrq_u8(*existing_bucket, new_bucket_neon);
-#else
     __m128i new_bucket_sse = _mm_load_si128(reinterpret_cast<__m128i*>(new_bucket + 4 * i));
     __m128i* existing_bucket = reinterpret_cast<__m128i*>(
         &DCHECK_NOTNULL(directory_)[bucket_idx][4 * i]);
     *existing_bucket = _mm_or_si128(*existing_bucket, new_bucket_sse);
-#endif
   }
 }

@@ -214,6 +256,7 @@ bool BlockBloomFilter::BucketFind(
   }
   return true;
 }
+#endif
 
 // This implements the false positive probability in Putze et al.'s "Cache-, hash- and
 // space-efficient bloom filters", equation 3.
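
As a reading aid for the NEON bucket routines above, this is the same per-lane logic in scalar form (an illustrative sketch; LaneBit and ScalarBucketFind are hypothetical names, and the real odd constants come from BLOOM_HASH_CONSTANTS):

    #include <cstdint>

    // Multiply-shift hashing: keep the top 5 bits of odd_constant * hash,
    // which selects one of the 32 bit positions in a lane.
    static uint32_t LaneBit(uint32_t hash, uint32_t odd_constant) {
      return 1u << ((odd_constant * hash) >> 27);
    }

    // A bucket contains the hash iff, in every one of the eight 32-bit
    // lanes, the lane's mask bit is present in the bucket word.
    static bool ScalarBucketFind(const uint32_t bucket[8], uint32_t hash,
                                 const uint32_t constants[8]) {
      for (int i = 0; i < 8; ++i) {
        if ((bucket[i] & LaneBit(hash, constants[i])) == 0) {
          return false;
        }
      }
      return true;
    }

In the NEON version, vtstq_u32 performs the per-lane (bucket & mask) != 0 test, and the 64-bit AND of the extracted lanes reduces "all eight lanes passed" to a comparison with -1; BucketInsert is the same computation with a bitwise OR into the bucket instead of the membership test.
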
@@ -374,6 +417,28 @@ Status BlockBloomFilter::OrEqualArray(size_t n, const uint8_t* __restrict__ in,
   return Status::OK();
 }
 
+#ifdef __aarch64__
+void BlockBloomFilter::OrEqualArrayNoAVX2(size_t n, const uint8_t* __restrict__ in,
+                                          uint8_t* __restrict__ out) {
+  // The trivial loop out[i] |= in[i] should auto-vectorize with gcc at -O3, but it is not
+  // written in a way that is very friendly to auto-vectorization. Instead, we manually
+  // vectorize, increasing the speed by up to 56x.
+  const uint8_t* simd_in = in;
+  const uint8_t* const simd_in_end = in + n;
+  uint8_t* simd_out = out;
+  // in.directory has a size (in bytes) that is a multiple of 32. Since sizeof(uint8x16_t)
+  // == 16, we can do two vorrq's in each iteration without checking array bounds.
+  while (simd_in != simd_in_end) {
+    uint8x16x2_t a = vld1q_u8_x2(simd_in);
+    uint8x16x2_t b = vld1q_u8_x2(simd_out);
+    b.val[0] = vorrq_u8(b.val[0], a.val[0]);
+    b.val[1] = vorrq_u8(b.val[1], a.val[1]);
+    vst1q_u8_x2(simd_out, b);
+    simd_out += 32;
+    simd_in += 32;
+  }
+}
+#elif defined(__x86_64__)
 void BlockBloomFilter::OrEqualArrayNoAVX2(size_t n, const uint8_t* __restrict__ in,
                                           uint8_t* __restrict__ out) {
   // The trivial loop out[i] |= in[i] should auto-vectorize with gcc at -O3, but it is not
@@ -392,6 +457,7 @@ void BlockBloomFilter::OrEqualArrayNoAVX2(size_t n, const uint8_t* __restrict__
     }
   }
 }
+#endif
 
 Status BlockBloomFilter::Or(const BlockBloomFilter& other) {
   // AlwaysTrueFilter is a special case implemented with a nullptr.
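
The NEON OrEqualArrayNoAVX2 above is a hand-vectorized byte-wise OR: each vld1q_u8_x2 / vorrq_u8 / vst1q_u8_x2 round performs 32 of the scalar byte ORs below at once. The reference loop it replaces (illustrative, under the same precondition that n is a multiple of 32):

    #include <cstddef>
    #include <cstdint>

    // Reference semantics of OrEqualArrayNoAVX2: out[i] |= in[i].
    static void OrEqualArrayScalar(size_t n,
                                   const uint8_t* __restrict__ in,
                                   uint8_t* __restrict__ out) {
      for (size_t i = 0; i < n; ++i) {
        out[i] |= in[i];
      }
    }
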
21 changes: 18 additions & 3 deletions src/kudu/util/char_util.cc
@@ -18,7 +18,7 @@
#include "kudu/util/char_util.h"

#ifdef __aarch64__
#include "kudu/util/sse2neon.h"
#include <arm_neon.h>
#else
#include <emmintrin.h>
#include <smmintrin.h>
@@ -29,6 +29,18 @@

 namespace kudu {
 
+#ifdef __aarch64__
+static inline bool AllASCII16B(const uint8_t* str, const uint8x16_t mask) {
+  uint8x16_t a = vld1q_u8(str);
+  uint64x2_t res = vreinterpretq_u64_u8(vtstq_u8(a, mask));
+  return 0 == (vgetq_lane_u64(res, 0) | vgetq_lane_u64(res, 1));
+}
+#elif defined(__x86_64__)
+static inline bool AllASCII16B(const uint8_t* str, const __m128i mask) {
+  return _mm_test_all_zeros(_mm_loadu_si128(reinterpret_cast<const __m128i*>(str)), mask) == 1;
+}
+#endif
+
 Slice UTF8Truncate(Slice val, size_t max_utf8_length) {
   size_t num_utf8_chars = 0;
   const uint8_t* str;
@@ -39,14 +51,17 @@ Slice UTF8Truncate(Slice val, size_t max_utf8_length) {

   // Mask used to determine whether there are any non-ASCII characters in a
   // 128-bit chunk
+#ifdef __aarch64__
+  const uint8x16_t mask = vdupq_n_u8(0x80);
+#elif defined(__x86_64__)
   const __m128i mask = _mm_set1_epi32(0x80808080);
+#endif
 
   while (num_bytes < size) {
     // If the next chunk of bytes are all ASCII we can fast path them.
     if (size - num_bytes >= 16 &&
         max_utf8_length - num_utf8_chars >= 16 &&
-        _mm_test_all_zeros(_mm_loadu_si128(reinterpret_cast<const __m128i*>(str)),
-                           mask) == 1) {
+        AllASCII16B(str, mask)) {
       num_utf8_chars += 16;
       num_bytes += 16;
       str += 16;
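
Both AllASCII16B variants test the same invariant: a byte is ASCII iff its 0x80 bit is clear, so a 16-byte chunk is all ASCII iff masking every byte with 0x80 leaves nothing set. That is what vtstq_u8 (NEON) and _mm_test_all_zeros (SSE4.1) compute against the 0x80 mask; in scalar form (an illustrative sketch with a hypothetical name):

    #include <cstdint>

    // A set 0x80 bit marks the lead or continuation byte of a multi-byte
    // UTF-8 sequence; all-clear means the 16 bytes are pure ASCII.
    static bool AllASCII16BScalar(const uint8_t* str) {
      uint8_t acc = 0;
      for (int i = 0; i < 16; ++i) {
        acc |= str[i];
      }
      return (acc & 0x80) == 0;
    }
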
2 changes: 1 addition & 1 deletion src/kudu/util/group_varint-inl.h
@@ -22,7 +22,7 @@
 #endif
 
 #ifdef __aarch64__
-#include "kudu/util/sse2neon.h"
+#include <arm_neon.h>
 #else
 #include <emmintrin.h>
 #include <smmintrin.h>