Skip to content

Commit

Permalink
ARROW-8227: [C++] Refine SIMD feature definitions
Browse files Browse the repository at this point in the history
This patch moves SIMD feature definitions from source code to cmake,
and supports more flexible Arm64 CPU feature settings.

Binary building is controlled by two factors: compiler capability and
build requirement. Compiler capability is detected in cmake by trying
flags like "-mavx2". Build requirement is passed by cmake command line
such as "-DARROW_SIMD_LEVEL=AVX2". Combining these two factors, we can
define SIMD feature macros ARROW_HAVE_AVX2, which controls conditional
compiling of related SIMD implementations in source code.

Currently we set compiler options(e.g. -msse4.2) in cmake but define
SIMD features by checking compiler macros in source code like below:
  #if defined(__SSE4_2__)
  #define ARROW_HAVE_SSE4_2 1
  #endif
Putting them together in cmake eases maintenance.

Closes #6794 from cyb70289/simd

Authored-by: Yibo Cai <yibo.cai@arm.com>
Signed-off-by: Antoine Pitrou <antoine@python.org>
  • Loading branch information
cyb70289 authored and pitrou committed Apr 6, 2020
1 parent 96a2522 commit b1d4c86
Show file tree
Hide file tree
Showing 11 changed files with 105 additions and 87 deletions.
4 changes: 4 additions & 0 deletions cpp/cmake_modules/DefineOptions.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,10 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
"AVX2"
"AVX512")

# Arm64 architectures and extensions can lead to exploding combinations.
# So set it directly through cmake command line.
define_option_string(ARROW_ARMV8_ARCH "Arm64 arch and extensions" "armv8-a+crc+crypto")

define_option(ARROW_ALTIVEC "Build with Altivec if compiler has support" ON)

define_option(ARROW_RPATH_ORIGIN "Build Arrow libraries with RATH set to \$ORIGIN" OFF)
Expand Down
108 changes: 80 additions & 28 deletions cpp/cmake_modules/SetupCxxFlags.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,39 @@
# Check if the target architecture and compiler supports some special
# instruction sets that would boost performance.
include(CheckCXXCompilerFlag)
# x86/amd64 compiler flags
check_cxx_compiler_flag("-msse4.2" CXX_SUPPORTS_SSE4_2)
check_cxx_compiler_flag("-mavx2" CXX_SUPPORTS_AVX2)
check_cxx_compiler_flag("-mavx512f" CXX_SUPPORTS_AVX512)
# power compiler flags
check_cxx_compiler_flag("-maltivec" CXX_SUPPORTS_ALTIVEC)
# Arm64 compiler flags
set(ARROW_ARMV8_CRC_FLAG "-march=armv8-a+crc")
check_cxx_compiler_flag(${ARROW_ARMV8_CRC_FLAG} CXX_SUPPORTS_ARMCRC)
set(ARROW_ARMV8_CRC_CRYPTO_FLAG "-march=armv8-a+crc+crypto")
check_cxx_compiler_flag(${ARROW_ARMV8_CRC_CRYPTO_FLAG} CXX_SUPPORTS_ARMV8_CRC_CRYPTO)
# Get cpu architecture
set(ARROW_CPU_FLAG "x86")
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|ARM64")
set(ARROW_CPU_FLAG "arm")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc")
set(ARROW_CPU_FLAG "ppc")
endif()
# Check architecture specific compiler flags
if(ARROW_CPU_FLAG STREQUAL "x86")
# x86/amd64 compiler flags, msvc/gcc/clang
if(MSVC)
set(ARROW_SSE4_2_FLAG "")
set(ARROW_AVX2_FLAG "/arch:AVX2")
set(ARROW_AVX512_FLAG "/arch:AVX512")
set(CXX_SUPPORTS_SSE4_2 TRUE)
else()
set(ARROW_SSE4_2_FLAG "-msse4.2")
set(ARROW_AVX2_FLAG "-mavx2")
# skylake-avx512 consists of AVX512F,AVX512BW,AVX512VL,AVX512CD,AVX512DQ
set(ARROW_AVX512_FLAG "-march=skylake-avx512")
check_cxx_compiler_flag(${ARROW_SSE4_2_FLAG} CXX_SUPPORTS_SSE4_2)
endif()
check_cxx_compiler_flag(${ARROW_AVX2_FLAG} CXX_SUPPORTS_AVX2)
check_cxx_compiler_flag(${ARROW_AVX512_FLAG} CXX_SUPPORTS_AVX512)
elseif(ARROW_CPU_FLAG STREQUAL "ppc")
# power compiler flags, gcc/clang only
set(ARROW_ALTIVEC_FLAG "-maltivec")
check_cxx_compiler_flag(${ARROW_ALTIVEC_FLAG} CXX_SUPPORTS_ALTIVEC)
elseif(ARROW_CPU_FLAG STREQUAL "arm")
# Arm64 compiler flags, gcc/clang only
set(ARROW_ARMV8_ARCH_FLAG "-march=${ARROW_ARMV8_ARCH}")
check_cxx_compiler_flag(${ARROW_ARMV8_ARCH_FLAG} CXX_SUPPORTS_ARMV8_ARCH)
endif()

# Support C11
set(CMAKE_C_STANDARD 11)
Expand Down Expand Up @@ -274,29 +296,59 @@ if(BUILD_WARNING_FLAGS)
endif(BUILD_WARNING_FLAGS)

# Only enable additional instruction sets if they are supported
if(CXX_SUPPORTS_AVX512 AND ARROW_SIMD_LEVEL STREQUAL "AVX512")
# skylake-avx512 consist of AVX512F,AVX512BW,AVX512VL,AVX512CD,AVX512DQ
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -march=skylake-avx512")
elseif(CXX_SUPPORTS_AVX2 AND ARROW_SIMD_LEVEL STREQUAL "AVX2")
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -mavx2")
elseif(CXX_SUPPORTS_SSE4_2 AND ARROW_SIMD_LEVEL STREQUAL "SSE4_2")
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -msse4.2")
if(ARROW_CPU_FLAG STREQUAL "x86" AND ARROW_USE_SIMD)
if(ARROW_SIMD_LEVEL STREQUAL "AVX512")
if(NOT CXX_SUPPORTS_AVX512)
message(FATAL_ERROR "AVX512 required but compiler doesn't support it.")
endif()
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} ${ARROW_AVX512_FLAG}")
add_definitions(-DARROW_HAVE_AVX512 -DARROW_HAVE_AVX2 -DARROW_HAVE_SSE4_2)
elseif(ARROW_SIMD_LEVEL STREQUAL "AVX2")
if(NOT CXX_SUPPORTS_AVX2)
message(FATAL_ERROR "AVX2 required but compiler doesn't support it.")
endif()
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} ${ARROW_AVX2_FLAG}")
add_definitions(-DARROW_HAVE_AVX2 -DARROW_HAVE_SSE4_2)
elseif(ARROW_SIMD_LEVEL STREQUAL "SSE4_2")
if(NOT CXX_SUPPORTS_SSE4_2)
message(FATAL_ERROR "SSE4.2 required but compiler doesn't support it.")
endif()
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} ${ARROW_SSE4_2_FLAG}")
add_definitions(-DARROW_HAVE_SSE4_2)
endif()
endif()

if(CXX_SUPPORTS_ALTIVEC AND ARROW_ALTIVEC)
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -maltivec")
if(ARROW_CPU_FLAG STREQUAL "ppc" AND ARROW_USE_SIMD)
if(CXX_SUPPORTS_ALTIVEC AND ARROW_ALTIVEC)
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} ${ARROW_ALTIVEC_FLAG}")
endif()
endif()

if(CXX_SUPPORTS_ARMCRC)
if(CXX_SUPPORTS_ARMV8_CRC_CRYPTO)
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} ${ARROW_ARMV8_CRC_CRYPTO_FLAG}")
else()
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} ${ARROW_ARMV8_CRC_FLAG}")
if(ARROW_CPU_FLAG STREQUAL "arm")
if(NOT CXX_SUPPORTS_ARMV8_ARCH)
message(FATAL_ERROR "Unsupported arch flag: ${ARROW_ARMV8_ARCH_FLAG}.")
endif()
if(ARROW_ARMV8_ARCH_FLAG MATCHES "native")
message(FATAL_ERROR "native arch not allowed, please specify arch explicitly.")
endif()
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} ${ARROW_ARMV8_ARCH_FLAG}")

if(ARROW_USE_SIMD)
add_definitions(-DARROW_HAVE_NEON)
endif()
endif()

if(ARROW_USE_SIMD)
add_definitions(-DARROW_USE_SIMD)
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU"
AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "5.4")
message(WARNING "Disable Armv8 CRC and Crypto as compiler doesn't support them well.")
else()
if(ARROW_ARMV8_ARCH_FLAG MATCHES "\\+crypto")
add_definitions(-DARROW_HAVE_ARMV8_CRYPTO)
endif()
# armv8.1+ implies crc support
if(ARROW_ARMV8_ARCH_FLAG MATCHES "armv8\\.[1-9]|\\+crc")
add_definitions(-DARROW_HAVE_ARMV8_CRC)
endif()
endif()
endif()

# ----------------------------------------------------------------------
Expand Down
5 changes: 1 addition & 4 deletions cpp/src/arrow/json/rapidjson_defs.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,8 @@
#include "arrow/util/sse_util.h"

// enable SIMD whitespace skipping, if available
#if defined(ARROW_HAVE_SSE2)
#define RAPIDJSON_SSE2 1
#endif

#if defined(ARROW_HAVE_SSE4_2)
#define RAPIDJSON_SSE2 1
#define RAPIDJSON_SSE42 1
#endif

Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/util/bpacking.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

#include "arrow/util/logging.h"
#include "arrow/util/ubsan.h"
#if defined(__AVX512F__)
#if defined(ARROW_HAVE_AVX512)
#include "arrow/util/bpacking_avx512_generated.h"
#else
#include "arrow/util/bpacking_default.h"
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/util/byte_stream_split.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ namespace arrow {
namespace util {
namespace internal {

#if defined(ARROW_HAVE_SSE2)
#if defined(ARROW_HAVE_SSE4_2)

template <typename T>
void ByteStreamSplitDecodeSSE2(const uint8_t* data, int64_t num_values, int64_t stride,
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/arrow/util/cpu_info.cc
Original file line number Diff line number Diff line change
Expand Up @@ -341,15 +341,15 @@ void CpuInfo::VerifyCpuRequirements() {
DCHECK(false) << "CPU does not support the Supplemental SSE3 instruction set";
}
#endif
#if defined(__aarch64__)
#if defined(ARROW_HAVE_NEON)
if (!IsSupported(CpuInfo::ASIMD)) {
DCHECK(false) << "CPU does not support the Armv8 Neon instruction set";
}
#endif
}

bool CpuInfo::CanUseSSE4_2() const {
#if defined(ARROW_HAVE_SSE4_2) && defined(ARROW_USE_SIMD)
#if defined(ARROW_HAVE_SSE4_2)
return IsSupported(CpuInfo::SSE4_2);
#else
return false;
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/arrow/util/hash_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ static inline uint32_t HW_crc32_u64(uint32_t crc, uint64_t v) {
#define HW_crc32_u16 SSE4_crc32_u16
#define HW_crc32_u32 SSE4_crc32_u32
#define HW_crc32_u64 SSE4_crc32_u64
#elif defined(ARROW_HAVE_ARM_CRC)
#elif defined(ARROW_HAVE_ARMV8_CRC)
#define HW_crc32_u8 ARMCE_crc32_u8
#define HW_crc32_u16 ARMCE_crc32_u16
#define HW_crc32_u32 ARMCE_crc32_u32
Expand All @@ -67,7 +67,7 @@ namespace arrow {
/// Utility class to compute hash values.
class HashUtil {
public:
#if defined(ARROW_HAVE_SSE4_2) || defined(ARROW_HAVE_ARM_CRC)
#if defined(ARROW_HAVE_SSE4_2) || defined(ARROW_HAVE_ARMV8_CRC)
static constexpr bool have_hardware_crc32 = true;
#else
static constexpr bool have_hardware_crc32 = false;
Expand Down
35 changes: 7 additions & 28 deletions cpp/src/arrow/util/neon_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,38 +17,17 @@

#pragma once

namespace arrow {

#if defined(__aarch64__) || defined(__AARCH64__)

#ifdef __ARM_NEON
#define ARROW_HAVE_NEON
#ifdef ARROW_HAVE_NEON
#include <arm_neon.h>
#endif

#ifdef __ARM_FEATURE_CRC32
#define ARROW_HAVE_ARM_CRC
#ifdef ARROW_HAVE_ARMV8_CRC
#include <arm_acle.h>
#endif

#ifdef __ARM_FEATURE_CRYPTO
#include <arm_neon.h>
#define ARROW_HAVE_ARMV8_CRYPTO
#endif // __ARM_FEATURE_CRYPTO

#endif // __ARM_FEATURE_CRC32

#endif // defined(__aarch64__) || defined(__AARCH64__)

#if defined(__GNUC__) && defined(__linux__) && defined(ARROW_HAVE_ARM_CRC)
namespace arrow {

#include <asm/hwcap.h>
#include <sys/auxv.h>
#ifndef HWCAP_CRC32
#define HWCAP_CRC32 (1 << 7)
#endif
static inline uint32_t crc32c_runtime_check(void) {
uint64_t auxv = getauxval(AT_HWCAP);
return (auxv & HWCAP_CRC32) != 0;
}
#ifdef ARROW_HAVE_ARMV8_CRC

static inline uint32_t ARMCE_crc32_u8(uint32_t crc, uint8_t v) {
return __crc32cb(crc, v);
Expand All @@ -66,6 +45,6 @@ static inline uint32_t ARMCE_crc32_u64(uint32_t crc, uint64_t v) {
return __crc32cd(crc, v);
}

#endif // defined(__GNUC__) && defined(__linux__) && defined(ARROW_HAVE_ARM_CRC)
#endif // ARROW_HAVE_ARMV8_CRC

} // namespace arrow
20 changes: 3 additions & 17 deletions cpp/src/arrow/util/sse_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,31 +22,17 @@

#include "arrow/util/macros.h"

#ifdef ARROW_USE_SIMD
#ifdef ARROW_HAVE_SSE4_2

// MSVC x86-64

#if (defined(_M_AMD64) || defined(_M_X64))
#define ARROW_HAVE_SSE2 1
#define ARROW_HAVE_SSE4_2 1
#include <intrin.h>
#endif

#else
// gcc/clang (possibly others)

#if defined(__SSE2__)
#define ARROW_HAVE_SSE2 1
#include <emmintrin.h>
#endif

#if defined(__SSE4_2__)
#define ARROW_HAVE_SSE4_2 1
#include <nmmintrin.h>
#endif

#endif // ARROW_USE_SIMD

// MSVC x86-64
#endif // ARROW_HAVE_SSE4_2

namespace arrow {

Expand Down
6 changes: 3 additions & 3 deletions cpp/src/parquet/encoding.cc
Original file line number Diff line number Diff line change
Expand Up @@ -863,7 +863,7 @@ std::shared_ptr<Buffer> ByteStreamSplitEncoder<DType>::FlushValues() {
uint8_t* output_buffer_raw = output_buffer->mutable_data();
const size_t num_values = values_.length();
const uint8_t* raw_values = reinterpret_cast<const uint8_t*>(values_.data());
#if defined(ARROW_HAVE_SSE2)
#if defined(ARROW_HAVE_SSE4_2)
arrow::util::internal::ByteStreamSplitEncodeSSE2<T>(raw_values, num_values,
output_buffer_raw);
#else
Expand Down Expand Up @@ -2347,7 +2347,7 @@ int ByteStreamSplitDecoder<DType>::Decode(T* buffer, int max_values) {
const int num_decoded_previously = num_values_in_buffer_ - num_values_;
const uint8_t* data = data_ + num_decoded_previously;

#if defined(ARROW_HAVE_SSE2)
#if defined(ARROW_HAVE_SSE4_2)
arrow::util::internal::ByteStreamSplitDecodeSSE2<T>(data, values_to_decode,
num_values_in_buffer_, buffer);
#else
Expand Down Expand Up @@ -2375,7 +2375,7 @@ int ByteStreamSplitDecoder<DType>::DecodeArrow(
const uint8_t* data = data_ + num_decoded_previously;
int offset = 0;

#if defined(ARROW_HAVE_SSE2)
#if defined(ARROW_HAVE_SSE4_2)
// Use fast decoding into intermediate buffer. This will also decode
// some null values, but it's fast enough that we don't care.
T* decode_out = EnsureDecodeBuffer(values_decoded);
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/parquet/encoding_benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@ BENCHMARK(BM_ByteStreamSplitDecode_Double_Scalar)->Range(MIN_RANGE, MAX_RANGE);
BENCHMARK(BM_ByteStreamSplitEncode_Float_Scalar)->Range(MIN_RANGE, MAX_RANGE);
BENCHMARK(BM_ByteStreamSplitEncode_Double_Scalar)->Range(MIN_RANGE, MAX_RANGE);

#if defined(ARROW_HAVE_SSE2)
#if defined(ARROW_HAVE_SSE4_2)
static void BM_ByteStreamSplitDecode_Float_SSE2(benchmark::State& state) {
BM_ByteStreamSplitDecode<float>(
state, arrow::util::internal::ByteStreamSplitDecodeSSE2<float>);
Expand Down

0 comments on commit b1d4c86

Please sign in to comment.