Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add avx512 instrinsic #13989

Closed
wants to merge 4 commits into from
Closed
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
@@ -358,6 +358,7 @@ if test "x$use_asm" = "xyes"; then
AX_CHECK_COMPILE_FLAG([-msse4.2],[[SSE42_CXXFLAGS="-msse4.2"]],,[[$CXXFLAG_WERROR]])
AX_CHECK_COMPILE_FLAG([-msse4.1],[[SSE41_CXXFLAGS="-msse4.1"]],,[[$CXXFLAG_WERROR]])
AX_CHECK_COMPILE_FLAG([-mavx -mavx2],[[AVX2_CXXFLAGS="-mavx -mavx2"]],,[[$CXXFLAG_WERROR]])
AX_CHECK_COMPILE_FLAG([-mavx -mavx2 -mavx512f],[[AVX512_CXXFLAGS="-mavx -mavx2 -mavx512f"]],,[[$CXXFLAG_WERROR]])
AX_CHECK_COMPILE_FLAG([-msse4 -msha],[[SHANI_CXXFLAGS="-msse4 -msha"]],,[[$CXXFLAG_WERROR]])

TEMP_CXXFLAGS="$CXXFLAGS"
@@ -412,6 +413,87 @@ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
)
CXXFLAGS="$TEMP_CXXFLAGS"

TEMP_CXXFLAGS="$CXXFLAGS"
CXXFLAGS="$CXXFLAGS $AVX512_CXXFLAGS"
AC_MSG_CHECKING(for AVX512 intrinsics)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
#include <stdint.h>

This comment has been minimized.

@theuni

theuni Jul 8, 2019
Member

No need to put the whole file here. Just check the intrinsics themselves.

This comment has been minimized.

@fingera

fingera Jul 9, 2019
Author Contributor

A big program to trigger MINGW-GCC BUG

#include <immintrin.h>
__m512i inline K(uint32_t x) { return _mm512_set1_epi32(x); }
__m512i inline Add(__m512i x, __m512i y) { return _mm512_add_epi32(x, y); }
__m512i inline Add(__m512i x, __m512i y, __m512i z) { return Add(Add(x, y), z); }
__m512i inline Add(__m512i x, __m512i y, __m512i z, __m512i w) { return Add(Add(x, y), Add(z, w)); }
__m512i inline Add(__m512i x, __m512i y, __m512i z, __m512i w, __m512i v) { return Add(Add(x, y, z), Add(w, v)); }
__m512i inline Inc(__m512i& x, __m512i y) { x = Add(x, y); return x; }
__m512i inline Inc(__m512i& x, __m512i y, __m512i z) { x = Add(x, y, z); return x; }
__m512i inline Inc(__m512i& x, __m512i y, __m512i z, __m512i w) { x = Add(x, y, z, w); return x; }
__m512i inline Xor(__m512i x, __m512i y) { return _mm512_xor_si512(x, y); }
__m512i inline Xor(__m512i x, __m512i y, __m512i z) { return Xor(Xor(x, y), z); }
__m512i inline Or(__m512i x, __m512i y) { return _mm512_or_si512(x, y); }
__m512i inline And(__m512i x, __m512i y) { return _mm512_and_si512(x, y); }
__m512i inline ShR(__m512i x, unsigned int n) { return _mm512_srli_epi32(x, n); }
template<int N> __m512i inline RoL(__m512i x) { return _mm512_rol_epi32(x, N); }
__m512i inline Ch(__m512i x, __m512i y, __m512i z) { return Xor(z, And(x, Xor(y, z))); }
__m512i inline Maj(__m512i x, __m512i y, __m512i z) { return Or(And(x, y), And(z, Or(x, y))); }
__m512i inline Sigma0(__m512i x) { return Xor(RoL<30>(x), RoL<19>(x), RoL<10>(x)); }
__m512i inline Sigma1(__m512i x) { return Xor(RoL<26>(x), RoL<21>(x), RoL<7>(x)); }
__m512i inline sigma0(__m512i x) { return Xor(RoL<25>(x), RoL<14>(x), ShR(x, 3)); }
__m512i inline sigma1(__m512i x) { return Xor(RoL<15>(x), RoL<13>(x), ShR(x, 10)); }
/** One round of SHA-256. */
void inline __attribute__((always_inline)) Round(__m512i a, __m512i b, __m512i c, __m512i& d, __m512i e, __m512i f, __m512i g, __m512i& h, __m512i k)
{
__m512i t1 = Add(h, Sigma1(e), Ch(e, f, g), k);
__m512i t2 = Add(Sigma0(a), Maj(a, b, c));
d = Add(d, t1);
h = Add(t1, t2);
}
__m512i inline Read16(int offset) {
return K(offset);
}
]],[[
__m512i a = K(0x6a09e667ul);
__m512i b = K(0xbb67ae85ul);
__m512i c = K(0x3c6ef372ul);
__m512i d = K(0xa54ff53aul);
__m512i e = K(0x510e527ful);
__m512i f = K(0x9b05688cul);
__m512i g = K(0x1f83d9abul);
__m512i h = K(0x5be0cd19ul);
__m512i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
Round(a, b, c, d, e, f, g, h, Add(K(0x428a2f98ul), w0 = Read16(0)));
Round(h, a, b, c, d, e, f, g, Add(K(0x71374491ul), w1 = Read16(4)));
Round(g, h, a, b, c, d, e, f, Add(K(0xb5c0fbcful), w2 = Read16(8)));
Round(f, g, h, a, b, c, d, e, Add(K(0xe9b5dba5ul), w3 = Read16(12)));
Round(e, f, g, h, a, b, c, d, Add(K(0x3956c25bul), w4 = Read16(16)));
Round(d, e, f, g, h, a, b, c, Add(K(0x59f111f1ul), w5 = Read16(20)));
Round(c, d, e, f, g, h, a, b, Add(K(0x923f82a4ul), w6 = Read16(24)));
Round(b, c, d, e, f, g, h, a, Add(K(0xab1c5ed5ul), w7 = Read16(28)));
Round(a, b, c, d, e, f, g, h, Add(K(0xd807aa98ul), w8 = Read16(32)));
Round(h, a, b, c, d, e, f, g, Add(K(0x12835b01ul), w9 = Read16(36)));
Round(g, h, a, b, c, d, e, f, Add(K(0x243185beul), w10 = Read16(40)));
Round(f, g, h, a, b, c, d, e, Add(K(0x550c7dc3ul), w11 = Read16(44)));
Round(e, f, g, h, a, b, c, d, Add(K(0x72be5d74ul), w12 = Read16(48)));
Round(d, e, f, g, h, a, b, c, Add(K(0x80deb1feul), w13 = Read16(52)));
Round(c, d, e, f, g, h, a, b, Add(K(0x9bdc06a7ul), w14 = Read16(56)));
Round(b, c, d, e, f, g, h, a, Add(K(0xc19bf174ul), w15 = Read16(60)));
Round(a, b, c, d, e, f, g, h, Add(K(0xe49b69c1ul), Inc(w0, sigma1(w14), w9, sigma0(w1))));
Round(h, a, b, c, d, e, f, g, Add(K(0xefbe4786ul), Inc(w1, sigma1(w15), w10, sigma0(w2))));
Round(g, h, a, b, c, d, e, f, Add(K(0x0fc19dc6ul), Inc(w2, sigma1(w0), w11, sigma0(w3))));
Round(f, g, h, a, b, c, d, e, Add(K(0x240ca1ccul), Inc(w3, sigma1(w1), w12, sigma0(w4))));
Round(e, f, g, h, a, b, c, d, Add(K(0x2de92c6ful), Inc(w4, sigma1(w2), w13, sigma0(w5))));
Round(d, e, f, g, h, a, b, c, Add(K(0x4a7484aaul), Inc(w5, sigma1(w3), w14, sigma0(w6))));
return _mm_extract_epi32(_mm512_extracti32x4_epi32(g, 3), 3);
]])],
[ AC_MSG_RESULT(yes); enable_avx512=yes; AC_DEFINE(ENABLE_AVX512, 1, [Define this symbol to build code that uses AVX512 intrinsics]) ],
[ AC_MSG_RESULT(no)]
)
CXXFLAGS="$TEMP_CXXFLAGS"

TEMP_CXXFLAGS="$CXXFLAGS"
CXXFLAGS="$CXXFLAGS $SHANI_CXXFLAGS"
AC_MSG_CHECKING(for SHA-NI intrinsics)
@@ -1462,6 +1544,7 @@ AM_CONDITIONAL([HARDEN],[test x$use_hardening = xyes])
AM_CONDITIONAL([ENABLE_HWCRC32],[test x$enable_hwcrc32 = xyes])
AM_CONDITIONAL([ENABLE_SSE41],[test x$enable_sse41 = xyes])
AM_CONDITIONAL([ENABLE_AVX2],[test x$enable_avx2 = xyes])
AM_CONDITIONAL([ENABLE_AVX512],[test x$enable_avx512 = xyes])
AM_CONDITIONAL([ENABLE_SHANI],[test x$enable_shani = xyes])
AM_CONDITIONAL([USE_ASM],[test x$use_asm = xyes])

@@ -1509,6 +1592,7 @@ AC_SUBST(SANITIZER_LDFLAGS)
AC_SUBST(SSE42_CXXFLAGS)
AC_SUBST(SSE41_CXXFLAGS)
AC_SUBST(AVX2_CXXFLAGS)
AC_SUBST(AVX512_CXXFLAGS)
AC_SUBST(SHANI_CXXFLAGS)
AC_SUBST(LIBTOOL_APP_LDFLAGS)
AC_SUBST(USE_UPNP)
@@ -53,6 +53,10 @@ if ENABLE_AVX2
LIBBITCOIN_CRYPTO_AVX2 = crypto/libbitcoin_crypto_avx2.a
LIBBITCOIN_CRYPTO += $(LIBBITCOIN_CRYPTO_AVX2)
endif
if ENABLE_AVX512
LIBBITCOIN_CRYPTO_AVX512 = crypto/libbitcoin_crypto_avx512.a
LIBBITCOIN_CRYPTO += $(LIBBITCOIN_CRYPTO_AVX512)
endif
if ENABLE_SHANI
LIBBITCOIN_CRYPTO_SHANI = crypto/libbitcoin_crypto_shani.a
LIBBITCOIN_CRYPTO += $(LIBBITCOIN_CRYPTO_SHANI)
@@ -388,6 +392,12 @@ crypto_libbitcoin_crypto_avx2_a_CXXFLAGS += $(AVX2_CXXFLAGS)
crypto_libbitcoin_crypto_avx2_a_CPPFLAGS += -DENABLE_AVX2
crypto_libbitcoin_crypto_avx2_a_SOURCES = crypto/sha256_avx2.cpp

crypto_libbitcoin_crypto_avx512_a_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS)
crypto_libbitcoin_crypto_avx512_a_CPPFLAGS = $(AM_CPPFLAGS)
crypto_libbitcoin_crypto_avx512_a_CXXFLAGS += $(AVX512_CXXFLAGS)
crypto_libbitcoin_crypto_avx512_a_CPPFLAGS += -DENABLE_AVX512
crypto_libbitcoin_crypto_avx512_a_SOURCES = crypto/sha256_avx512.cpp

crypto_libbitcoin_crypto_shani_a_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS)
crypto_libbitcoin_crypto_shani_a_CPPFLAGS = $(AM_CPPFLAGS)
crypto_libbitcoin_crypto_shani_a_CXXFLAGS += $(SHANI_CXXFLAGS)
@@ -29,6 +29,11 @@ namespace sha256d64_avx2
void Transform_8way(unsigned char* out, const unsigned char* in);
}

namespace sha256d64_avx512
{
void Transform_16way(unsigned char* out, const unsigned char* in);
}

namespace sha256d64_shani
{
void Transform_2way(unsigned char* out, const unsigned char* in);
@@ -461,6 +466,7 @@ TransformD64Type TransformD64 = sha256::TransformD64;
TransformD64Type TransformD64_2way = nullptr;
TransformD64Type TransformD64_4way = nullptr;
TransformD64Type TransformD64_8way = nullptr;
TransformD64Type TransformD64_16way = nullptr;

bool SelfTest() {
// Input state (equal to the initial SHA256 state)
@@ -560,11 +566,11 @@ void inline cpuid(uint32_t leaf, uint32_t subleaf, uint32_t& a, uint32_t& b, uin
}

/** Check whether the OS has enabled AVX registers. */
bool AVXEnabled()
uint32_t AVXEnabledFlags()
{
uint32_t a, d;
__asm__("xgetbv" : "=a"(a), "=d"(d) : "c"(0));
return (a & 6) == 6;
return a;
}
#endif
} // namespace
@@ -578,29 +584,36 @@ std::string SHA256AutoDetect()
bool have_xsave = false;
bool have_avx = false;
bool have_avx2 = false;
bool have_avx512 = false;
bool have_shani = false;
bool enabled_avx = false;
bool enabled_avx512 = false;

(void)AVXEnabled;
(void)AVXEnabledFlags;
(void)have_sse4;
(void)have_avx;
(void)have_xsave;
(void)have_avx2;
(void)have_avx512;
(void)have_shani;
(void)enabled_avx;
(void)enabled_avx512;

uint32_t eax, ebx, ecx, edx;
cpuid(1, 0, eax, ebx, ecx, edx);
have_sse4 = (ecx >> 19) & 1;
have_xsave = (ecx >> 27) & 1;
have_avx = (ecx >> 28) & 1;
if (have_xsave && have_avx) {
enabled_avx = AVXEnabled();
uint32_t flags = AVXEnabledFlags();
enabled_avx = (flags & 6) == 6;
enabled_avx512 = (flags & 0xe6) == 0xe6;
}
if (have_sse4) {
cpuid(7, 0, eax, ebx, ecx, edx);
have_avx2 = (ebx >> 5) & 1;
have_shani = (ebx >> 29) & 1;
have_avx512 = (ebx >> 16) & 1;
}

#if defined(ENABLE_SHANI) && !defined(BUILD_BITCOIN_INTERNAL)
@@ -609,8 +622,9 @@ std::string SHA256AutoDetect()
TransformD64 = TransformD64Wrapper<sha256_shani::Transform>;
TransformD64_2way = sha256d64_shani::Transform_2way;
ret = "shani(1way,2way)";
have_sse4 = false; // Disable SSE4/AVX2;
have_sse4 = false; // Disable SSE4/AVX2/AVX512;
have_avx2 = false;
have_avx512 = false;
}
#endif

@@ -632,6 +646,13 @@ std::string SHA256AutoDetect()
ret += ",avx2(8way)";
}
#endif

#if defined(ENABLE_AVX512) && !defined(BUILD_BITCOIN_INTERNAL)
if (have_avx512 && enabled_avx512) {
TransformD64_16way = sha256d64_avx512::Transform_16way;
ret += ",avx512(16way)";
}
#endif
#endif

assert(SelfTest());
@@ -697,6 +718,14 @@ CSHA256& CSHA256::Reset()

void SHA256D64(unsigned char* out, const unsigned char* in, size_t blocks)
{
if (TransformD64_16way) {
while (blocks >= 16) {
TransformD64_16way(out, in);
out += 512;
in += 1024;
blocks -= 16;
}
}
if (TransformD64_8way) {
while (blocks >= 8) {
TransformD64_8way(out, in);