Skip to content

Commit

Permalink
Merge #13386: SHA256 implementations based on Intel SHA Extensions
Browse files Browse the repository at this point in the history
66b2cf1 Use immintrin.h everywhere for intrinsics (Pieter Wuille)
4c935e2 Add SHA256 implementation using using Intel SHA intrinsics (Pieter Wuille)
268400d [Refactor] CPU feature detection logic for SHA256 (Pieter Wuille)

Pull request description:

  Based on #13191.

  This adds SHA256 implementations that use Intel's SHA Extension instructions (using intrinsics). This needs GCC 4.9 or Clang 3.4.

  In addition to #13191, two extra implementations are provided:
  * (a) A variable-length SHA256 implementation using SHA extensions.
  * (b) A 2-way 64-byte input double-SHA256 implementation using SHA extensions.

  Benchmarks for 9001-element Merkle tree root computation on an AMD Ryzen 1800X system:
  * Using generic C++ code (pre-#10821): 6.1ms
  * Using SSE4 (master, #10821): 4.6ms
  * Using 4-way SSE4 specialized for 64-byte inputs (#13191): 2.8ms
  * Using 8-way AVX2 specialized for 64-byte inputs (#13191): 2.1ms
  * Using 2-way SHA-NI specialized for 64-byte inputs (this PR): 0.56ms

  Benchmarks for 32-byte SHA256 on the same system:
  * Using SSE4 (master, #10821): 190ns
  * Using SHA-NI (this PR): 53ns

  Benchmarks for 1000000-byte SHA256 on the same system:
  * Using SSE4 (master, #10821): 2.5ms
  * Using SHA-NI (this PR): 0.51ms

Tree-SHA512: 2b319e33b22579f815d91f9daf7994a5e1e799c4f73c13e15070dd54ba71f3f6438ccf77ae9cbd1ce76f972d9cbeb5f0edfea3d86f101bbc1055db70e42743b7
  • Loading branch information
laanwj committed Jul 9, 2018
2 parents 7e74c54 + 66b2cf1 commit 3a3eabe
Show file tree
Hide file tree
Showing 7 changed files with 464 additions and 32 deletions.
28 changes: 20 additions & 8 deletions configure.ac
Expand Up @@ -320,6 +320,7 @@ fi
AX_CHECK_COMPILE_FLAG([-msse4.2],[[SSE42_CXXFLAGS="-msse4.2"]],,[[$CXXFLAG_WERROR]])
AX_CHECK_COMPILE_FLAG([-msse4.1],[[SSE41_CXXFLAGS="-msse4.1"]],,[[$CXXFLAG_WERROR]])
AX_CHECK_COMPILE_FLAG([-mavx -mavx2],[[AVX2_CXXFLAGS="-mavx -mavx2"]],,[[$CXXFLAG_WERROR]])
AX_CHECK_COMPILE_FLAG([-msse4 -msha],[[SHANI_CXXFLAGS="-msse4 -msha"]],,[[$CXXFLAG_WERROR]])

TEMP_CXXFLAGS="$CXXFLAGS"
CXXFLAGS="$CXXFLAGS $SSE42_CXXFLAGS"
Expand Down Expand Up @@ -348,11 +349,7 @@ CXXFLAGS="$CXXFLAGS $SSE41_CXXFLAGS"
AC_MSG_CHECKING(for SSE4.1 intrinsics)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
#include <stdint.h>
#if defined(_MSC_VER)
#include <immintrin.h>
#elif defined(__GNUC__)
#include <x86intrin.h>
#endif
]],[[
__m128i l = _mm_set1_epi32(0);
return _mm_extract_epi32(l, 3);
Expand All @@ -367,11 +364,7 @@ CXXFLAGS="$CXXFLAGS $AVX2_CXXFLAGS"
AC_MSG_CHECKING(for AVX2 intrinsics)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
#include <stdint.h>
#if defined(_MSC_VER)
#include <immintrin.h>
#elif defined(__GNUC__) && defined(__AVX2__)
#include <x86intrin.h>
#endif
]],[[
__m256i l = _mm256_set1_epi32(0);
return _mm256_extract_epi32(l, 7);
Expand All @@ -381,6 +374,23 @@ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
)
CXXFLAGS="$TEMP_CXXFLAGS"

TEMP_CXXFLAGS="$CXXFLAGS"
CXXFLAGS="$CXXFLAGS $SHANI_CXXFLAGS"
AC_MSG_CHECKING(for SHA-NI intrinsics)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
#include <stdint.h>
#include <immintrin.h>
]],[[
__m128i i = _mm_set1_epi32(0);
__m128i j = _mm_set1_epi32(1);
__m128i k = _mm_set1_epi32(2);
return _mm_extract_epi32(_mm_sha256rnds2_epu32(i, i, k), 0);
]])],
[ AC_MSG_RESULT(yes); enable_shani=yes; AC_DEFINE(ENABLE_SHANI, 1, [Define this symbol to build code that uses SHA-NI intrinsics]) ],
[ AC_MSG_RESULT(no)]
)
CXXFLAGS="$TEMP_CXXFLAGS"

CPPFLAGS="$CPPFLAGS -DHAVE_BUILD_INFO -D__STDC_FORMAT_MACROS"

AC_ARG_WITH([utils],
Expand Down Expand Up @@ -1309,6 +1319,7 @@ AM_CONDITIONAL([HARDEN],[test x$use_hardening = xyes])
AM_CONDITIONAL([ENABLE_HWCRC32],[test x$enable_hwcrc32 = xyes])
AM_CONDITIONAL([ENABLE_SSE41],[test x$enable_sse41 = xyes])
AM_CONDITIONAL([ENABLE_AVX2],[test x$enable_avx2 = xyes])
AM_CONDITIONAL([ENABLE_SHANI],[test x$enable_shani = xyes])
AM_CONDITIONAL([USE_ASM],[test x$use_asm = xyes])

AC_DEFINE(CLIENT_VERSION_MAJOR, _CLIENT_VERSION_MAJOR, [Major version])
Expand Down Expand Up @@ -1353,6 +1364,7 @@ AC_SUBST(SANITIZER_LDFLAGS)
AC_SUBST(SSE42_CXXFLAGS)
AC_SUBST(SSE41_CXXFLAGS)
AC_SUBST(AVX2_CXXFLAGS)
AC_SUBST(SHANI_CXXFLAGS)
AC_SUBST(LIBTOOL_APP_LDFLAGS)
AC_SUBST(USE_UPNP)
AC_SUBST(USE_QRCODE)
Expand Down
10 changes: 10 additions & 0 deletions src/Makefile.am
Expand Up @@ -52,6 +52,10 @@ if ENABLE_AVX2
LIBBITCOIN_CRYPTO_AVX2 = crypto/libbitcoin_crypto_avx2.a
LIBBITCOIN_CRYPTO += $(LIBBITCOIN_CRYPTO_AVX2)
endif
if ENABLE_SHANI
LIBBITCOIN_CRYPTO_SHANI = crypto/libbitcoin_crypto_shani.a
LIBBITCOIN_CRYPTO += $(LIBBITCOIN_CRYPTO_SHANI)
endif

$(LIBSECP256K1): $(wildcard secp256k1/src/*.h) $(wildcard secp256k1/src/*.c) $(wildcard secp256k1/include/*)
$(AM_V_at)$(MAKE) $(AM_MAKEFLAGS) -C $(@D) $(@F)
Expand Down Expand Up @@ -318,6 +322,12 @@ crypto_libbitcoin_crypto_avx2_a_CXXFLAGS += $(AVX2_CXXFLAGS)
crypto_libbitcoin_crypto_avx2_a_CPPFLAGS += -DENABLE_AVX2
crypto_libbitcoin_crypto_avx2_a_SOURCES = crypto/sha256_avx2.cpp

crypto_libbitcoin_crypto_shani_a_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS)
crypto_libbitcoin_crypto_shani_a_CPPFLAGS = $(AM_CPPFLAGS)
crypto_libbitcoin_crypto_shani_a_CXXFLAGS += $(SHANI_CXXFLAGS)
crypto_libbitcoin_crypto_shani_a_CPPFLAGS += -DENABLE_SHANI
crypto_libbitcoin_crypto_shani_a_SOURCES = crypto/sha256_shani.cpp

# consensus: shared between all executables that validate any consensus rules.
libbitcoin_consensus_a_CPPFLAGS = $(AM_CPPFLAGS) $(BITCOIN_INCLUDES)
libbitcoin_consensus_a_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS)
Expand Down
1 change: 1 addition & 0 deletions src/Makefile.test.include
Expand Up @@ -137,6 +137,7 @@ test_test_bitcoin_fuzzy_LDADD = \
$(LIBBITCOIN_CRYPTO) \
$(LIBBITCOIN_CRYPTO_SSE41) \
$(LIBBITCOIN_CRYPTO_AVX2) \
$(LIBBITCOIN_CRYPTO_SHANI) \
$(LIBSECP256K1)

test_test_bitcoin_fuzzy_LDADD += $(BOOST_LIBS) $(CRYPTO_LIBS)
Expand Down
90 changes: 74 additions & 16 deletions src/crypto/sha256.cpp
Expand Up @@ -29,6 +29,16 @@ namespace sha256d64_avx2
void Transform_8way(unsigned char* out, const unsigned char* in);
}

namespace sha256d64_shani
{
void Transform_2way(unsigned char* out, const unsigned char* in);
}

namespace sha256_shani
{
void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks);
}

// Internal implementation code.
namespace
{
Expand Down Expand Up @@ -448,6 +458,7 @@ void TransformD64Wrapper(unsigned char* out, const unsigned char* in)

TransformType Transform = sha256::Transform;
TransformD64Type TransformD64 = sha256::TransformD64;
TransformD64Type TransformD64_2way = nullptr;
TransformD64Type TransformD64_4way = nullptr;
TransformD64Type TransformD64_8way = nullptr;

Expand Down Expand Up @@ -512,6 +523,13 @@ bool SelfTest() {
TransformD64(out, data + 1);
if (!std::equal(out, out + 32, result_d64)) return false;

// Test TransformD64_2way, if available.
if (TransformD64_2way) {
unsigned char out[64];
TransformD64_2way(out, data + 1);
if (!std::equal(out, out + 64, result_d64)) return false;
}

// Test TransformD64_4way, if available.
if (TransformD64_4way) {
unsigned char out[128];
Expand Down Expand Up @@ -556,32 +574,64 @@ std::string SHA256AutoDetect()
{
std::string ret = "standard";
#if defined(USE_ASM) && (defined(__x86_64__) || defined(__amd64__) || defined(__i386__))
(void)AVXEnabled; // Silence unused warning (in case ENABLE_AVX2 is not defined)
bool have_sse4 = false;
bool have_xsave = false;
bool have_avx = false;
bool have_avx2 = false;
bool have_shani = false;
bool enabled_avx = false;

(void)AVXEnabled;
(void)have_sse4;
(void)have_avx;
(void)have_xsave;
(void)have_avx2;
(void)have_shani;
(void)enabled_avx;

uint32_t eax, ebx, ecx, edx;
cpuid(1, 0, eax, ebx, ecx, edx);
if ((ecx >> 19) & 1) {
have_sse4 = (ecx >> 19) & 1;
have_xsave = (ecx >> 27) & 1;
have_avx = (ecx >> 28) & 1;
if (have_xsave && have_avx) {
enabled_avx = AVXEnabled();
}
if (have_sse4) {
cpuid(7, 0, eax, ebx, ecx, edx);
have_avx2 = (ebx >> 5) & 1;
have_shani = (ebx >> 29) & 1;
}

#if defined(ENABLE_SHANI) && !defined(BUILD_BITCOIN_INTERNAL)
if (have_shani) {
Transform = sha256_shani::Transform;
TransformD64 = TransformD64Wrapper<sha256_shani::Transform>;
TransformD64_2way = sha256d64_shani::Transform_2way;
ret = "shani(1way,2way)";
have_sse4 = false; // Disable SSE4/AVX2;
have_avx2 = false;
}
#endif

if (have_sse4) {
#if defined(__x86_64__) || defined(__amd64__)
Transform = sha256_sse4::Transform;
TransformD64 = TransformD64Wrapper<sha256_sse4::Transform>;
ret = "sse4(1way)";
#endif
#if defined(ENABLE_SSE41) && !defined(BUILD_BITCOIN_INTERNAL)
TransformD64_4way = sha256d64_sse41::Transform_4way;
ret = "sse4(1way+4way)";
#if defined(ENABLE_AVX2) && !defined(BUILD_BITCOIN_INTERNAL)
if (((ecx >> 27) & 1) && ((ecx >> 28) & 1)) { // XSAVE and AVX
cpuid(7, 0, eax, ebx, ecx, edx);
if ((ebx >> 5) & 1) { // AVX2 flag
if (AVXEnabled()) { // OS has enabled AVX registers
TransformD64_8way = sha256d64_avx2::Transform_8way;
ret += ",avx2(8way)";
}
}
}
#endif
#else
ret = "sse4";
ret += ",sse41(4way)";
#endif
}

#if defined(ENABLE_AVX2) && !defined(BUILD_BITCOIN_INTERNAL)
if (have_avx2 && have_avx && enabled_avx) {
TransformD64_8way = sha256d64_avx2::Transform_8way;
ret += ",avx2(8way)";
}
#endif
#endif

assert(SelfTest());
Expand Down Expand Up @@ -663,6 +713,14 @@ void SHA256D64(unsigned char* out, const unsigned char* in, size_t blocks)
blocks -= 4;
}
}
if (TransformD64_2way) {
while (blocks >= 2) {
TransformD64_2way(out, in);
out += 64;
in += 128;
blocks -= 2;
}
}
while (blocks) {
TransformD64(out, in);
out += 32;
Expand Down
4 changes: 0 additions & 4 deletions src/crypto/sha256_avx2.cpp
@@ -1,11 +1,7 @@
#ifdef ENABLE_AVX2

#include <stdint.h>
#if defined(_MSC_VER)
#include <immintrin.h>
#elif defined(__GNUC__)
#include <x86intrin.h>
#endif

#include <crypto/sha256.h>
#include <crypto/common.h>
Expand Down

0 comments on commit 3a3eabe

Please sign in to comment.