Skip to content

Commit

Permalink
ARROW-8141: [C++] speed unpack1_32 using intrinsics API
Browse files Browse the repository at this point in the history
The BM_PlainDecodingBoolean benchmark shows about a 1.3x to 1.7x improvement, depending on the input size,
compared with the code the compiler generates when auto-optimizing for AVX512.

Signed-off-by: Frank Du <frank.du@intel.com>

Closes #6650 from jianxind/PlainDecodingBoolean-avx512-Intrinsics

Authored-by: Frank Du <frank.du@intel.com>
Signed-off-by: Wes McKinney <wesm+git@apache.org>
  • Loading branch information
frankdjx authored and wesm committed Mar 18, 2020
1 parent 6b87c6c commit f2f9d8f
Showing 1 changed file with 28 additions and 0 deletions.
28 changes: 28 additions & 0 deletions cpp/src/arrow/util/bpacking.h
Expand Up @@ -29,10 +29,37 @@

#include "arrow/util/logging.h"
#include "arrow/util/ubsan.h"
#if defined(__AVX512F__)
#include <immintrin.h>
#endif

namespace arrow {
namespace internal {

#if defined(__AVX512F__)
/// \brief Unpack 32 one-bit values from a single 32-bit word (AVX512 variant).
///
/// Loads one uint32_t from `in` and writes 32 uint32_t values to `out`, where
/// out[i] == (word >> i) & 1 for i in [0, 32).
///
/// \param[in] in pointer to the packed input word (read via util::SafeLoad)
/// \param[out] out destination for 32 unpacked values; written with unaligned
///   stores, so no particular alignment is required
/// \return `in` advanced by one word
inline const uint32_t* unpack1_32(const uint32_t* in, uint32_t* out) {
  const uint32_t inl = util::SafeLoad(in);

  // Broadcast the packed word and the single-bit mask to all 16 lanes.
  const __m512i inls = _mm512_set1_epi32(static_cast<int>(inl));
  const __m512i masks = _mm512_set1_epi32(1);

  // _mm512_set_epi32 takes its arguments from the highest lane down to lane 0,
  // so lane i receives shift count i (first half) or 16 + i (second half).
  const __m512i shifts_lo =
      _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
  const __m512i shifts_hi =
      _mm512_set_epi32(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16);

  // Use the documented unaligned-store intrinsic instead of casting to the
  // compiler-internal __m512i_u type, which not every toolchain provides.
  // Bits 0..15 -> out[0..15]
  _mm512_storeu_si512(out, _mm512_and_epi32(_mm512_srlv_epi32(inls, shifts_lo), masks));
  // Bits 16..31 -> out[16..31]
  _mm512_storeu_si512(out + 16,
                      _mm512_and_epi32(_mm512_srlv_epi32(inls, shifts_hi), masks));

  return in + 1;
}
#else
inline const uint32_t* unpack1_32(const uint32_t* in, uint32_t* out) {
uint32_t inl = util::SafeLoad(in);
*out = (inl >> 0) & 1;
Expand Down Expand Up @@ -103,6 +130,7 @@ inline const uint32_t* unpack1_32(const uint32_t* in, uint32_t* out) {

return in;
}
#endif

inline const uint32_t* unpack2_32(const uint32_t* in, uint32_t* out) {
uint32_t inl = util::SafeLoad(in);
Expand Down

0 comments on commit f2f9d8f

Please sign in to comment.