PARQUET-1786: [C++] Improve ByteStreamSplit decoder using SSE2
Since the data is likely already in the cache, memory accesses are
fast enough for SIMD intrinsics to speed up decoding.
This patch adds a decode path which is only taken when Arrow is
compiled with SSE2 (or higher) support.
The BYTE_STREAM_SPLIT round-trip test size is purposely changed to
a value not divisible by 4 or 8, to guarantee that the incomplete
SIMD suffix is correctly handled by the scalar decode loop.

Closes #6679 from martinradev/byte_stream_split_submit

Lead-authored-by: Martin Radev <martin.b.radev@gmail.com>
Co-authored-by: Antoine Pitrou <antoine@python.org>
Signed-off-by: Antoine Pitrou <antoine@python.org>
martinradev and pitrou committed Mar 24, 2020
1 parent 0449ea7 commit bc873dc
Showing 4 changed files with 215 additions and 16 deletions.
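
For context, the BYTE_STREAM_SPLIT encoding stores byte k of every value in its own contiguous stream of length num_values, so a decoder gathers one byte per stream for each value. Below is a minimal scalar encoder sketch (not part of this commit; the helper name ByteStreamSplitEncodeSketch is illustrative) showing the layout the new decoders consume:

#include <cstdint>
#include <cstring>
#include <vector>

// Illustration only: encode num_values values of type T into the
// BYTE_STREAM_SPLIT layout.  Byte k of value i ends up at offset
// k * num_values + i, i.e. each of the sizeof(T) byte streams is contiguous.
template <typename T>
std::vector<uint8_t> ByteStreamSplitEncodeSketch(const T* values, int64_t num_values) {
  constexpr int64_t kNumStreams = sizeof(T);
  std::vector<uint8_t> encoded(static_cast<size_t>(kNumStreams * num_values));
  for (int64_t i = 0; i < num_values; ++i) {
    uint8_t raw[kNumStreams];
    std::memcpy(raw, &values[i], sizeof(T));
    for (int64_t k = 0; k < kNumStreams; ++k) {
      encoded[k * num_values + i] = raw[k];  // stream k, element i
    }
  }
  return encoded;
}

The stride argument of the decoders in this patch is exactly this per-stream length: byte b of value i is read from data[b * stride + i], which is why Decode() below can advance the data pointer by the number of previously decoded values while keeping the stride of the full page.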
108 changes: 108 additions & 0 deletions cpp/src/arrow/util/byte_stream_split.h
@@ -0,0 +1,108 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#ifndef ARROW_UTIL_BYTE_STREAM_SPLIT_H
#define ARROW_UTIL_BYTE_STREAM_SPLIT_H

#include "arrow/util/sse_util.h"
#include "arrow/util/ubsan.h"

#include <stdint.h>
#include <algorithm>

namespace arrow {
namespace util {
namespace internal {

#if defined(ARROW_HAVE_SSE2)

template <typename T>
void ByteStreamSlitDecodeSSE2(const uint8_t* data, int64_t num_values, int64_t stride,
T* out) {
constexpr size_t kNumStreams = sizeof(T);
static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams.");
constexpr size_t kNumStreamsLog2 = (kNumStreams == 8U ? 3U : 2U);

const int64_t size = num_values * sizeof(T);
const int64_t block_size = sizeof(__m128i) * kNumStreams;
const int64_t num_blocks = size / block_size;
uint8_t* output_data = reinterpret_cast<uint8_t*>(out);

// Handle the suffix first.
// This helps catch cases where the SIMD-based processing overflows into
// the suffix, since a test would then almost surely fail.
const int64_t num_processed_elements = (num_blocks * block_size) / kNumStreams;
for (int64_t i = num_processed_elements; i < num_values; ++i) {
uint8_t gathered_byte_data[kNumStreams];
for (size_t b = 0; b < kNumStreams; ++b) {
const size_t byte_index = b * stride + i;
gathered_byte_data[b] = data[byte_index];
}
out[i] = arrow::util::SafeLoadAs<T>(&gathered_byte_data[0]);
}

// The blocks get processed hierarchically using the unpack intrinsics.
// Example with four streams:
// Stage 1: AAAA BBBB CCCC DDDD
// Stage 2: ACAC ACAC BDBD BDBD
// Stage 3: ABCD ABCD ABCD ABCD
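// Concretely, for four streams: if register A holds byte 0 of values
// v0..v15, B byte 1, C byte 2 and D byte 3, then _mm_unpacklo_epi8(A, C)
// yields A0 C0 A1 C1 ... A7 C7 (stage 2), and unpacking that against the
// corresponding B/D result yields A0 B0 C0 D0 A1 B1 C1 D1 ... (stage 3),
// i.e. each value's original little-endian byte order.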
__m128i stage[kNumStreamsLog2 + 1U][kNumStreams];
const size_t half = kNumStreams / 2U;

for (int64_t i = 0; i < num_blocks; ++i) {
for (size_t j = 0; j < kNumStreams; ++j) {
stage[0][j] = _mm_loadu_si128(
reinterpret_cast<const __m128i*>(&data[i * sizeof(__m128i) + j * stride]));
}
for (size_t step = 0; step < kNumStreamsLog2; ++step) {
for (size_t j = 0; j < half; ++j) {
stage[step + 1U][j * 2] =
_mm_unpacklo_epi8(stage[step][j], stage[step][half + j]);
stage[step + 1U][j * 2 + 1U] =
_mm_unpackhi_epi8(stage[step][j], stage[step][half + j]);
}
}
for (size_t j = 0; j < kNumStreams; ++j) {
_mm_storeu_si128(reinterpret_cast<__m128i*>(
&output_data[(i * kNumStreams + j) * sizeof(__m128i)]),
stage[kNumStreamsLog2][j]);
}
}
}

#endif

template <typename T>
void ByteStreamSlitDecodeScalar(const uint8_t* data, int64_t num_values, int64_t stride,
T* out) {
constexpr size_t kNumStreams = sizeof(T);

for (int64_t i = 0; i < num_values; ++i) {
uint8_t gathered_byte_data[kNumStreams];
for (size_t b = 0; b < kNumStreams; ++b) {
const size_t byte_index = b * stride + i;
gathered_byte_data[b] = data[byte_index];
}
out[i] = arrow::util::SafeLoadAs<T>(&gathered_byte_data[0]);
}
}

} // namespace internal
} // namespace util
} // namespace arrow

#endif // ARROW_UTIL_BYTE_STREAM_SPLIT_H
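
A minimal usage sketch of the new scalar decoder (the program below is illustrative and not part of the commit; it hand-builds the split layout for a few floats and round-trips them through the function added above):

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

#include "arrow/util/byte_stream_split.h"

int main() {
  const std::vector<float> values = {1.5f, -2.25f, 3.0f, 0.125f, 42.0f};
  const int64_t n = static_cast<int64_t>(values.size());

  // Build the BYTE_STREAM_SPLIT layout by hand: byte k of value i goes to
  // offset k * n + i.
  std::vector<uint8_t> encoded(sizeof(float) * values.size());
  for (int64_t i = 0; i < n; ++i) {
    uint8_t raw[sizeof(float)];
    std::memcpy(raw, &values[i], sizeof(float));
    for (size_t k = 0; k < sizeof(float); ++k) {
      encoded[k * n + i] = raw[k];
    }
  }

  // Decode with the scalar path; the stride is the per-stream length n.
  // (Function name spelled as introduced in this patch.)
  std::vector<float> decoded(n);
  arrow::util::internal::ByteStreamSlitDecodeScalar<float>(encoded.data(), n, n,
                                                           decoded.data());
  assert(decoded == values);  // exact byte-level round trip
  return 0;
}

The SSE2 variant takes the same arguments; note that with only five values it, too, would run entirely through the scalar suffix loop, since the vectorized blocks only kick in once at least 16 float values (one 64-byte block) remain.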
63 changes: 48 additions & 15 deletions cpp/src/parquet/encoding.cc
@@ -28,6 +28,7 @@
#include "arrow/array.h"
#include "arrow/stl_allocator.h"
#include "arrow/util/bit_stream_utils.h"
#include "arrow/util/byte_stream_split.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/hashing.h"
#include "arrow/util/logging.h"
@@ -2309,8 +2310,17 @@ class ByteStreamSplitDecoder : public DecoderImpl, virtual public TypedDecoder<D

void SetData(int num_values, const uint8_t* data, int len) override;

T* EnsureDecodeBuffer(int64_t min_values) {
const int64_t size = sizeof(T) * min_values;
if (!decode_buffer_ || decode_buffer_->size() < size) {
PARQUET_THROW_NOT_OK(AllocateBuffer(size, &decode_buffer_));
}
return reinterpret_cast<T*>(decode_buffer_->mutable_data());
}

private:
int num_values_in_buffer{0U};
int num_values_in_buffer_{0};
std::shared_ptr<Buffer> decode_buffer_;

static constexpr size_t kNumStreams = sizeof(T);
};
@@ -2323,21 +2333,22 @@ template <typename DType>
void ByteStreamSplitDecoder<DType>::SetData(int num_values, const uint8_t* data,
int len) {
DecoderImpl::SetData(num_values, data, len);
num_values_in_buffer = num_values;
num_values_in_buffer_ = num_values;
}

template <typename DType>
int ByteStreamSplitDecoder<DType>::Decode(T* buffer, int max_values) {
const int values_to_decode = std::min(num_values_, max_values);
const int num_decoded_previously = num_values_in_buffer - num_values_;
for (int i = 0; i < values_to_decode; ++i) {
uint8_t gathered_byte_data[kNumStreams];
for (size_t b = 0; b < kNumStreams; ++b) {
const size_t byte_index = b * num_values_in_buffer + num_decoded_previously + i;
gathered_byte_data[b] = data_[byte_index];
}
buffer[i] = arrow::util::SafeLoadAs<T>(&gathered_byte_data[0]);
}
const int num_decoded_previously = num_values_in_buffer_ - num_values_;
const uint8_t* data = data_ + num_decoded_previously;

#if defined(ARROW_HAVE_SSE2)
arrow::util::internal::ByteStreamSlitDecodeSSE2<T>(data, values_to_decode,
num_values_in_buffer_, buffer);
#else
arrow::util::internal::ByteStreamSlitDecodeScalar<T>(data, values_to_decode,
num_values_in_buffer_, buffer);
#endif
num_values_ -= values_to_decode;
len_ -= sizeof(T) * values_to_decode;
return values_to_decode;
@@ -2355,16 +2366,37 @@ int ByteStreamSplitDecoder<DType>::DecodeArrow(

PARQUET_THROW_NOT_OK(builder->Reserve(num_values));

const int num_decoded_previously = num_values_in_buffer - num_values_;
const int num_decoded_previously = num_values_in_buffer_ - num_values_;
const uint8_t* data = data_ + num_decoded_previously;
int offset = 0;

#if defined(ARROW_HAVE_SSE2)
// Use fast decoding into an intermediate buffer. This will also decode
// some null values, but it's fast enough that we don't care.
T* decode_out = EnsureDecodeBuffer(values_decoded);
arrow::util::internal::ByteStreamSlitDecodeSSE2<T>(data, values_decoded,
num_values_in_buffer_, decode_out);

// XXX If null_count is 0, we could even append in bulk or decode directly into
// builder
auto decode_value = [&](bool is_valid) {
if (is_valid) {
builder->UnsafeAppend(decode_out[offset]);
++offset;
} else {
builder->UnsafeAppendNull();
}
};

VisitNullBitmapInline(valid_bits, valid_bits_offset, num_values, null_count,
std::move(decode_value));
#else
auto decode_value = [&](bool is_valid) {
if (is_valid) {
uint8_t gathered_byte_data[kNumStreams];
for (size_t b = 0; b < kNumStreams; ++b) {
const size_t byte_index =
b * num_values_in_buffer + num_decoded_previously + offset;
gathered_byte_data[b] = data_[byte_index];
const size_t byte_index = b * num_values_in_buffer_ + offset;
gathered_byte_data[b] = data[byte_index];
}
builder->UnsafeAppend(arrow::util::SafeLoadAs<T>(&gathered_byte_data[0]));
++offset;
@@ -2375,6 +2407,7 @@ int ByteStreamSplitDecoder<DType>::DecodeArrow(

VisitNullBitmapInline(valid_bits, valid_bits_offset, num_values, null_count,
std::move(decode_value));
#endif

num_values_ -= values_decoded;
len_ -= sizeof(T) * values_decoded;
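
The DecodeArrow fast path above decodes the densely packed non-null values into a scratch buffer and then scatters them according to the validity bitmap. A simplified standalone illustration of that scatter step (Arrow's builder and VisitNullBitmapInline are replaced by plain standard-library types; the helper name is made up):

#include <cstdint>
#include <optional>
#include <vector>

// Expand densely decoded non-null values into a nullable output column,
// driven by an Arrow-style validity bitmap (bit i set => row i is non-null).
// This mirrors the structure of the DecodeArrow fast path above, with the
// builder replaced by a vector of optionals for illustration.
template <typename T>
std::vector<std::optional<T>> ScatterByValidity(const std::vector<T>& dense_values,
                                                const uint8_t* valid_bits,
                                                int64_t num_rows) {
  std::vector<std::optional<T>> out;
  out.reserve(static_cast<size_t>(num_rows));
  int64_t offset = 0;  // index into the densely decoded values
  for (int64_t i = 0; i < num_rows; ++i) {
    const bool is_valid = (valid_bits[i / 8] >> (i % 8)) & 1;
    if (is_valid) {
      out.push_back(dense_values[offset++]);
    } else {
      out.push_back(std::nullopt);
    }
  }
  return out;
}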
43 changes: 43 additions & 0 deletions cpp/src/parquet/encoding_benchmark.cc
@@ -24,6 +24,7 @@
#include "arrow/testing/random.h"
#include "arrow/testing/util.h"
#include "arrow/type.h"
#include "arrow/util/byte_stream_split.h"

#include "parquet/encoding.h"
#include "parquet/platform.h"
@@ -198,6 +199,48 @@ static void BM_PlainDecodingFloat(benchmark::State& state) {

BENCHMARK(BM_PlainDecodingFloat)->Range(MIN_RANGE, MAX_RANGE);

template <typename T, typename DecodeFunc>
static void BM_ByteStreamSplitDecode(benchmark::State& state, DecodeFunc&& decode_func) {
std::vector<T> values(state.range(0), 64.0);
const uint8_t* values_raw = reinterpret_cast<const uint8_t*>(values.data());
std::vector<T> output(state.range(0), 0);

for (auto _ : state) {
decode_func(values_raw, static_cast<int64_t>(values.size()),
static_cast<int64_t>(values.size()), output.data());
benchmark::ClobberMemory();
}
state.SetBytesProcessed(state.iterations() * values.size() * sizeof(T));
}

static void BM_ByteStreamSplitDecode_Float_Scalar(benchmark::State& state) {
BM_ByteStreamSplitDecode<float>(
state, arrow::util::internal::ByteStreamSlitDecodeScalar<float>);
}

static void BM_ByteStreamSplitDecode_Double_Scalar(benchmark::State& state) {
BM_ByteStreamSplitDecode<double>(
state, arrow::util::internal::ByteStreamSlitDecodeScalar<double>);
}

BENCHMARK(BM_ByteStreamSplitDecode_Float_Scalar)->Range(MIN_RANGE, MAX_RANGE);
BENCHMARK(BM_ByteStreamSplitDecode_Double_Scalar)->Range(MIN_RANGE, MAX_RANGE);

#if defined(ARROW_HAVE_SSE2)
static void BM_ByteStreamSplitDecode_Float_SSE2(benchmark::State& state) {
BM_ByteStreamSplitDecode<float>(state,
arrow::util::internal::ByteStreamSlitDecodeSSE2<float>);
}

static void BM_ByteStreamSplitDecode_Double_SSE2(benchmark::State& state) {
BM_ByteStreamSplitDecode<double>(
state, arrow::util::internal::ByteStreamSlitDecodeSSE2<double>);
}

BENCHMARK(BM_ByteStreamSplitDecode_Float_SSE2)->Range(MIN_RANGE, MAX_RANGE);
BENCHMARK(BM_ByteStreamSplitDecode_Double_SSE2)->Range(MIN_RANGE, MAX_RANGE);
#endif

template <typename Type>
static void DecodeDict(std::vector<typename Type::c_type>& values,
benchmark::State& state) {
17 changes: 16 additions & 1 deletion cpp/src/parquet/encoding_test.cc
@@ -1090,7 +1090,22 @@ typedef ::testing::Types<FloatType, DoubleType> ByteStreamSplitTypes;
TYPED_TEST_SUITE(TestByteStreamSplitEncoding, ByteStreamSplitTypes);

TYPED_TEST(TestByteStreamSplitEncoding, BasicRoundTrip) {
ASSERT_NO_FATAL_FAILURE(this->Execute(1000, 1));
// We need to test with different sizes to guarantee that the SIMD implementation
// can handle both inputs whose size is divisible by 4/8 and sizes that
// require a scalar loop for the suffix.

// Exercise only the scalar loop.
ASSERT_NO_FATAL_FAILURE(this->Execute(3, 1));

// Exercise only the SIMD loop.
ASSERT_NO_FATAL_FAILURE(this->Execute(256, 1));

// Exercise both.
ASSERT_NO_FATAL_FAILURE(this->Execute(1337, 1));

for (int values = 0; values < 32; ++values) {
ASSERT_NO_FATAL_FAILURE(this->Execute(values, 1));
}
}

TYPED_TEST(TestByteStreamSplitEncoding, RoundTripSingleElement) {
