[arm64] implement with native NEON instructions #57

Closed · wants to merge 1 commit
4 changes: 2 additions & 2 deletions build-support/lint.sh
@@ -38,13 +38,13 @@ done

 if $ONLY_CHANGED; then
   FILES=$(git diff --name-only $($ROOT/build-support/get-upstream-commit.sh) \
-    | egrep '\.(cc|h)$' | grep -v "gutil\|trace_event\|x509_check_host\|sse2neon\.h")
+    | egrep '\.(cc|h)$' | grep -v "gutil\|trace_event\|x509_check_host\.h")
   if [ -z "$FILES" ]; then
     echo No source files changed
     exit 0
   fi
 else
-  FILES=$(find $ROOT/src -name '*.cc' -or -name '*.h' | grep -v "\.pb\.\|\.service\.\|\.proxy\.\|\.krpc\.\|gutil\|trace_event\|kudu_export\.h\|x509_check_host\|sse2neon\.h")
+  FILES=$(find $ROOT/src -name '*.cc' -or -name '*.h' | grep -v "\.pb\.\|\.service\.\|\.proxy\.\|\.krpc\.\|gutil\|trace_event\|kudu_export\.h\|x509_check_host\.h")
 fi
 
 cpplint_filter="+runtime/broken_libstdcpp_regex,-whitespace/comments,-readability/todo,-readability/inheritance,-build/header_guard,-build/include_order,-legal/copyright,-build/c++11,-readability/nolint"
6 changes: 3 additions & 3 deletions src/kudu/common/columnar_serialization.cc
@@ -17,9 +17,7 @@

#include "kudu/common/columnar_serialization.h"

#ifdef __aarch64__
#include "kudu/util/sse2neon.h" // IWYU pragma: keep
#else
#ifdef __x86_64__
#include <emmintrin.h>
#include <immintrin.h>
#endif
@@ -221,11 +219,13 @@ void CopyNonNullBitmapImpl(
   bw.Flush();
 }
 
+#ifdef __x86_64__
 struct PextZp7Clmul {
   inline static uint64_t call(uint64_t val, uint64_t mask) {
     return zp7_pext_64_clmul(val, mask);
   }
 };
+#endif
 struct PextZp7Simple {
   inline static uint64_t call(uint64_t val, uint64_t mask) {
     return zp7_pext_64_simple(val, mask);
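
An aside on semantics for readers who don't know PEXT: both structs dispatch to software implementations of the x86 BMI2 "parallel bits extract" instruction, which gathers the bits of a value selected by a mask into the low-order bits of the result. A minimal scalar sketch of that contract (illustrative only; pext_naive is a hypothetical name, not part of this patch):

    #include <cassert>
    #include <cstdint>

    // pext(val, mask): collect the bits of 'val' at the positions where
    // 'mask' has a 1, packing them into the low bits of the result.
    static uint64_t pext_naive(uint64_t val, uint64_t mask) {
      uint64_t out = 0;
      int k = 0;
      for (int i = 0; i < 64; ++i) {
        if ((mask >> i) & 1) {
          out |= ((val >> i) & 1ULL) << k++;
        }
      }
      return out;
    }

    int main() {
      // The bits of 0b101100 at mask positions 1, 2, 3 are 0, 1, 1,
      // so they pack into 0b110.
      assert(pext_naive(0b101100, 0b001110) == 0b110);
      return 0;
    }

zp7_pext_64_simple computes the same result using only shifts and XORs, while zp7_pext_64_clmul accelerates it with the x86 CLMUL instruction; guarding the CLMUL variant behind __x86_64__ is what lets this file compile on arm64 without sse2neon.h.
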
28 changes: 12 additions & 16 deletions src/kudu/common/zp7.cc
@@ -38,11 +38,6 @@

 #ifdef __x86_64__
 #include <emmintrin.h>
-#elif defined(__aarch64__)
-#include "kudu/util/sse2neon.h"
-#endif
-
-#ifndef __aarch64__
 #if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 5
 #define USE_INLINE_ASM_CLMUL
 #else
@@ -59,14 +54,7 @@ typedef struct {
   uint64_t ppp_bit[N_BITS];
 } zp7_masks_64_t;
 
-// If we don't have access to the CLMUL instruction, emulate it with
-// shifts and XORs
-static inline uint64_t prefix_sum(uint64_t x) {
-  for (int i = 0; i < N_BITS; i++)
-    x ^= x << (1 << i);
-  return x;
-}
-
+#ifdef __x86_64__
 // GCC <5 doesn't properly handle the _pext_u64 intrinsic inside
 // a function with a specified target attribute. So, use inline
 // assembly instead.
@@ -82,16 +70,13 @@ static inline __m128i asm_mm_clmulepi64_si128(__m128i a, __m128i b) {
 #define CLMUL(a, b) (_mm_clmulepi64_si128(a, b, 0))
 #endif
 
-
 // Parallel-prefix-popcount. This is used by both the PEXT/PDEP polyfills.
 // It can also be called separately and cached, if the mask values will be used
 // more than once (these can be shared across PEXT and PDEP calls if they use
 // the same masks).
 //
 // This variant depends on the CLMUL instruction.
-#ifndef __aarch64__
 __attribute__((target("pclmul")))
-#endif // __aarch64__
 ATTRIBUTE_NO_SANITIZE_INTEGER
 static zp7_masks_64_t zp7_ppp_64_clmul(uint64_t mask) {
   zp7_masks_64_t r;
@@ -124,6 +109,15 @@ static zp7_masks_64_t zp7_ppp_64_clmul(uint64_t mask) {

   return r;
 }
+#endif // __x86_64__
+
+// If we don't have access to the CLMUL instruction, emulate it with
+// shifts and XORs
+static inline uint64_t prefix_sum(uint64_t x) {
+  for (int i = 0; i < N_BITS; i++)
+    x ^= x << (1 << i);
+  return x;
+}
 
 // Implementation that doesn't depend on CLMUL
 ATTRIBUTE_NO_SANITIZE_INTEGER
@@ -171,9 +165,11 @@ uint64_t zp7_pext_64_simple(uint64_t a, uint64_t mask) {
return zp7_pext_pre_64(a, &masks);
}

#ifdef __x86_64__
uint64_t zp7_pext_64_clmul(uint64_t a, uint64_t mask) {
zp7_masks_64_t masks = zp7_ppp_64_clmul(mask);
return zp7_pext_pre_64(a, &masks);
}
#endif // __x86_64__

} // namespace kudu
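
Relocating prefix_sum next to the portable implementation also makes its role clearer: carry-less multiplication by an all-ones word is exactly a prefix XOR, and six doubling rounds of shift-and-XOR compute the same thing for a 64-bit word. A standalone check of that equivalence (an illustrative sketch, hard-coding N_BITS = 6 as in zp7.cc; prefix_xor_naive is a hypothetical name):

    #include <cassert>
    #include <cstdint>

    // Reference: bit i of the prefix XOR is the XOR of bits 0..i of x.
    static uint64_t prefix_xor_naive(uint64_t x) {
      uint64_t out = 0;
      uint64_t acc = 0;
      for (int i = 0; i < 64; ++i) {
        acc ^= (x >> i) & 1ULL;
        out |= acc << i;
      }
      return out;
    }

    // The emulation from zp7.cc: log2(64) = 6 shift-and-XOR rounds.
    static uint64_t prefix_sum(uint64_t x) {
      for (int i = 0; i < 6; ++i) {
        x ^= x << (1 << i);
      }
      return x;
    }

    int main() {
      assert(prefix_sum(0b1010) == 0b0110);
      assert(prefix_sum(0xdeadbeefcafef00dULL) ==
             prefix_xor_naive(0xdeadbeefcafef00dULL));
      return 0;
    }
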
82 changes: 74 additions & 8 deletions src/kudu/util/block_bloom_filter.cc
@@ -18,7 +18,7 @@
#include "kudu/util/block_bloom_filter.h"

#ifdef __aarch64__
#include "kudu/util/sse2neon.h"
#include <arm_neon.h>
#else //__aarch64__
#include <emmintrin.h>
#include <mm_malloc.h>
@@ -176,6 +176,55 @@ void BlockBloomFilter::Close() {
   }
 }
 
+#ifdef __aarch64__
+// A static helper function for the arm64 methods. Turns a 32-bit hash into a 256-bit
+// Bucket with 1 single 1-bit set in each 32-bit lane.
+static inline ATTRIBUTE_ALWAYS_INLINE uint32x4x2_t MakeMask(const uint32_t hash) {
+  const uint32x4_t ones = vdupq_n_u32(1);
+  constexpr uint32_t c[8] = {BLOOM_HASH_CONSTANTS};
+  const uint32x4x2_t rehash = vld1q_u32_x2(c);
+  // Load hash, repeated 4 times.
+  uint32x4_t hash_data = vdupq_n_u32(hash);
+
+  // Multiply-shift hashing ala Dietzfelbinger et al.: multiply 'hash' by eight different
+  // odd constants, then keep the 5 most significant bits from each product.
+  int32x4x2_t t;
+  t.val[0] = vreinterpretq_s32_u32(vshrq_n_u32(vmulq_u32(rehash.val[0], hash_data), 27));
+  t.val[1] = vreinterpretq_s32_u32(vshrq_n_u32(vmulq_u32(rehash.val[1], hash_data), 27));
+
+  // Use these 5 bits to shift a single bit to a location in each 32-bit lane
+  uint32x4x2_t res;
+  res.val[0] = vshlq_u32(ones, t.val[0]);
+  res.val[1] = vshlq_u32(ones, t.val[1]);
+  return res;
+}
+
+ATTRIBUTE_NO_SANITIZE_INTEGER
+void BlockBloomFilter::BucketInsert(const uint32_t bucket_idx, const uint32_t hash) noexcept {
+  const uint32x4x2_t mask = MakeMask(hash);
+  uint32x4x2_t* addr = &(reinterpret_cast<uint32x4x2_t*>(directory_)[bucket_idx]);
+  uint32_t* bucket = reinterpret_cast<uint32_t*>(addr);
+  uint32x4x2_t data = vld1q_u32_x2(bucket);
+  data.val[0] = vorrq_u32(data.val[0], mask.val[0]);
+  data.val[1] = vorrq_u32(data.val[1], mask.val[1]);
+  vst1q_u32_x2(bucket, data);
+}
+
+ATTRIBUTE_NO_SANITIZE_INTEGER
+bool BlockBloomFilter::BucketFind(
+    const uint32_t bucket_idx, const uint32_t hash) const noexcept {
+  const uint32x4x2_t mask = MakeMask(hash);
+  uint32x4x2_t* addr = &(reinterpret_cast<uint32x4x2_t*>(directory_)[bucket_idx]);
+  uint32_t* bucket = reinterpret_cast<uint32_t*>(addr);
+  uint32x4x2_t data = vld1q_u32_x2(bucket);
+  // We should return true if 'bucket' has a one wherever 'mask' does.
+  uint32x4_t t0 = vtstq_u32(data.val[0], mask.val[0]);
+  uint32x4_t t1 = vtstq_u32(data.val[1], mask.val[1]);
+  int64x2_t t = vreinterpretq_s64_u32(vandq_u32(t0, t1));
+  int64_t a = vgetq_lane_s64(t, 0) & vgetq_lane_s64(t, 1);
+  return a == -1;
+}
+#elif defined(__x86_64__)
 ATTRIBUTE_NO_SANITIZE_INTEGER
 void BlockBloomFilter::BucketInsert(const uint32_t bucket_idx, const uint32_t hash) noexcept {
   // new_bucket will be all zeros except for eight 1-bits, one in each 32-bit word. It is
@@ -188,17 +237,10 @@ void BlockBloomFilter::BucketInsert(const uint32_t bucket_idx, const uint32_t ha
     new_bucket[i] = 1U << new_bucket[i];
   }
   for (int i = 0; i < 2; ++i) {
-#ifdef __aarch64__
-    // IWYU pragma: no_include <arm_neon.h>
-    uint8x16_t new_bucket_neon = vreinterpretq_u8_u32(vld1q_u32(new_bucket + 4 * i));
-    uint8x16_t* existing_bucket = reinterpret_cast<uint8x16_t*>(&directory_[bucket_idx][4 * i]);
-    *existing_bucket = vorrq_u8(*existing_bucket, new_bucket_neon);
-#else
     __m128i new_bucket_sse = _mm_load_si128(reinterpret_cast<__m128i*>(new_bucket + 4 * i));
     __m128i* existing_bucket = reinterpret_cast<__m128i*>(
         &DCHECK_NOTNULL(directory_)[bucket_idx][4 * i]);
     *existing_bucket = _mm_or_si128(*existing_bucket, new_bucket_sse);
-#endif
   }
 }

@@ -214,6 +256,7 @@ bool BlockBloomFilter::BucketFind(
   }
   return true;
 }
+#endif
 
 // This implements the false positive probability in Putze et al.'s "Cache-, hash- and
 // space-efficient bloom filters", equation 3.
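
As a reading aid for the NEON bucket routines above, this is the same per-lane logic in scalar form (an illustrative sketch; LaneBit and ScalarBucketFind are hypothetical names, and the real odd constants come from BLOOM_HASH_CONSTANTS):

    #include <cstdint>

    // Multiply-shift hashing: keep the top 5 bits of odd_constant * hash,
    // which selects one of the 32 bit positions in a lane.
    static uint32_t LaneBit(uint32_t hash, uint32_t odd_constant) {
      return 1u << ((odd_constant * hash) >> 27);
    }

    // A bucket contains the hash iff, in every one of the eight 32-bit
    // lanes, the lane's mask bit is present in the bucket word.
    static bool ScalarBucketFind(const uint32_t bucket[8], uint32_t hash,
                                 const uint32_t constants[8]) {
      for (int i = 0; i < 8; ++i) {
        if ((bucket[i] & LaneBit(hash, constants[i])) == 0) {
          return false;
        }
      }
      return true;
    }

In the NEON version, vtstq_u32 performs the per-lane (bucket & mask) != 0 test, and the 64-bit AND of the extracted lanes reduces "all eight lanes passed" to a comparison with -1; BucketInsert is the same computation with a bitwise OR into the bucket instead of the membership test.
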
@@ -374,6 +417,28 @@ Status BlockBloomFilter::OrEqualArray(size_t n, const uint8_t* __restrict__ in,
   return Status::OK();
 }
 
+#ifdef __aarch64__
+void BlockBloomFilter::OrEqualArrayNoAVX2(size_t n, const uint8_t* __restrict__ in,
+                                          uint8_t* __restrict__ out) {
+  // The trivial loop out[i] |= in[i] should auto-vectorize with gcc at -O3, but it is not
+  // written in a way that is very friendly to auto-vectorization. Instead, we manually
+  // vectorize, increasing the speed by up to 56x.
+  const uint8_t* simd_in = in;
+  const uint8_t* const simd_in_end = in + n;
+  uint8_t* simd_out = out;
+  // in.directory has a size (in bytes) that is a multiple of 32. Since sizeof(uint8x16_t)
+  // == 16, we can do two vorrq's in each iteration without checking array bounds.
+  while (simd_in != simd_in_end) {
+    uint8x16x2_t a = vld1q_u8_x2(simd_in);
+    uint8x16x2_t b = vld1q_u8_x2(simd_out);
+    b.val[0] = vorrq_u8(b.val[0], a.val[0]);
+    b.val[1] = vorrq_u8(b.val[1], a.val[1]);
+    vst1q_u8_x2(simd_out, b);
+    simd_out += 32;
+    simd_in += 32;
+  }
+}
+#elif defined(__x86_64__)
 void BlockBloomFilter::OrEqualArrayNoAVX2(size_t n, const uint8_t* __restrict__ in,
                                           uint8_t* __restrict__ out) {
   // The trivial loop out[i] |= in[i] should auto-vectorize with gcc at -O3, but it is not
@@ -392,6 +457,7 @@ void BlockBloomFilter::OrEqualArrayNoAVX2(size_t n, const uint8_t* __restrict__
     }
   }
 }
+#endif
 
 Status BlockBloomFilter::Or(const BlockBloomFilter& other) {
   // AlwaysTrueFilter is a special case implemented with a nullptr.
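
The NEON OrEqualArrayNoAVX2 above is a hand-vectorized byte-wise OR: each vld1q_u8_x2 / vorrq_u8 / vst1q_u8_x2 round performs 32 of the scalar byte ORs below at once. The reference loop it replaces (illustrative, under the same precondition that n is a multiple of 32):

    #include <cstddef>
    #include <cstdint>

    // Reference semantics of OrEqualArrayNoAVX2: out[i] |= in[i].
    static void OrEqualArrayScalar(size_t n,
                                   const uint8_t* __restrict__ in,
                                   uint8_t* __restrict__ out) {
      for (size_t i = 0; i < n; ++i) {
        out[i] |= in[i];
      }
    }
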
21 changes: 18 additions & 3 deletions src/kudu/util/char_util.cc
@@ -18,7 +18,7 @@
#include "kudu/util/char_util.h"

#ifdef __aarch64__
#include "kudu/util/sse2neon.h"
#include <arm_neon.h>
#else
#include <emmintrin.h>
#include <smmintrin.h>
@@ -29,6 +29,18 @@

 namespace kudu {
 
+#ifdef __aarch64__
+static inline bool AllASCII16B(const uint8_t* str, const uint8x16_t mask) {
+  uint8x16_t a = vld1q_u8(str);
+  uint64x2_t res = vreinterpretq_u64_u8(vtstq_u8(a, mask));
+  return 0 == (vgetq_lane_u64(res, 0) | vgetq_lane_u64(res, 1));
+}
+#elif defined(__x86_64__)
+static inline bool AllASCII16B(const uint8_t* str, const __m128i mask) {
+  return _mm_test_all_zeros(_mm_loadu_si128(reinterpret_cast<const __m128i*>(str)), mask) == 1;
+}
+#endif
+
 Slice UTF8Truncate(Slice val, size_t max_utf8_length) {
   size_t num_utf8_chars = 0;
   const uint8_t* str;
@@ -39,14 +51,17 @@ Slice UTF8Truncate(Slice val, size_t max_utf8_length) {

   // Mask used to determine whether there are any non-ASCII characters in a
   // 128-bit chunk
+#ifdef __aarch64__
+  const uint8x16_t mask = vdupq_n_u8(0x80);
+#elif defined(__x86_64__)
   const __m128i mask = _mm_set1_epi32(0x80808080);
+#endif
 
   while (num_bytes < size) {
     // If the next chunk of bytes are all ASCII we can fast path them.
     if (size - num_bytes >= 16 &&
         max_utf8_length - num_utf8_chars >= 16 &&
-        _mm_test_all_zeros(_mm_loadu_si128(reinterpret_cast<const __m128i*>(str)),
-                           mask) == 1) {
+        AllASCII16B(str, mask)) {
       num_utf8_chars += 16;
       num_bytes += 16;
       str += 16;
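
Both AllASCII16B variants test the same invariant: a byte is ASCII iff its 0x80 bit is clear, so a 16-byte chunk is all ASCII iff masking every byte with 0x80 leaves nothing set. That is what vtstq_u8 (NEON) and _mm_test_all_zeros (SSE4.1) compute against the 0x80 mask; in scalar form (an illustrative sketch with a hypothetical name):

    #include <cstdint>

    // A set 0x80 bit marks the lead or continuation byte of a multi-byte
    // UTF-8 sequence; all-clear means the 16 bytes are pure ASCII.
    static bool AllASCII16BScalar(const uint8_t* str) {
      uint8_t acc = 0;
      for (int i = 0; i < 16; ++i) {
        acc |= str[i];
      }
      return (acc & 0x80) == 0;
    }
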
2 changes: 1 addition & 1 deletion src/kudu/util/group_varint-inl.h
@@ -22,7 +22,7 @@
 #endif
 
 #ifdef __aarch64__
-#include "kudu/util/sse2neon.h"
+#include <arm_neon.h>
 #else
 #include <emmintrin.h>
 #include <smmintrin.h>