aklomp · lucshi · Aug 18, 2022 · Sep 26, 2022 · Sep 29, 2022 · Sep 29, 2022
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -62,6 +62,8 @@ cmake_dependent_option(BASE64_WITH_AVX "add AVX codepath" ON ${_IS_X86} OFF)
 add_feature_info(AVX BASE64_WITH_AVX "add AVX codepath")
 cmake_dependent_option(BASE64_WITH_AVX2 "add AVX 2 codepath" ON ${_IS_X86} OFF)
 add_feature_info(AVX2 BASE64_WITH_AVX2 "add AVX2 codepath")
+cmake_dependent_option(BASE64_WITH_AVX512 "add AVX 512 codepath" ON ${_IS_X86} OFF)
+add_feature_info(AVX512 BASE64_WITH_AVX512 "add AVX512 codepath")
 
 cmake_dependent_option(BASE64_WITH_NEON32 "add NEON32 codepath" OFF _TARGET_ARCH_arm OFF)
 add_feature_info(NEON32 BASE64_WITH_NEON32 "add NEON32 codepath")
@@ -118,6 +120,7 @@ add_library(base64
     lib/arch/sse42/codec.c
     lib/arch/avx/codec.c
     lib/arch/avx2/codec.c
+    lib/arch/avx512/codec.c
 
     lib/arch/neon32/codec.c
     lib/arch/neon64/codec.c
@@ -206,6 +209,7 @@ if (_TARGET_ARCH STREQUAL "x86" OR _TARGET_ARCH STREQUAL "x64")
     configure_codec(SSE42 __SSSE4_2__)
     configure_codec(AVX)
     configure_codec(AVX2)
+    configure_codec(AVX512)
 
 elseif (_TARGET_ARCH STREQUAL "arm")
     set(BASE64_NEON32_CFLAGS "${COMPILE_FLAGS_NEON32}" CACHE STRING "the NEON32 compile flags (for 'lib/arch/neon32/codec.c')")

diff --git a/Makefile b/Makefile
@@ -4,6 +4,7 @@ CFLAGS += -std=c99 -O3 -Wall -Wextra -pedantic
 OBJCOPY ?= objcopy
 
 OBJS = \
+  lib/arch/avx512/codec.o \
   lib/arch/avx2/codec.o \
   lib/arch/generic/codec.o \
   lib/arch/neon32/codec.o \
@@ -16,6 +17,7 @@ OBJS = \
   lib/codec_choose.o \
   lib/tables/tables.o
 
+HAVE_AVX512 = 0
 HAVE_AVX2   = 0
 HAVE_NEON32 = 0
 HAVE_NEON64 = 0
@@ -26,6 +28,9 @@ HAVE_AVX    = 0
 
 # The user should supply compiler flags for the codecs they want to build.
 # Check which codecs we're going to include:
+ifdef AVX512_CFLAGS
+  HAVE_AVX512 = 1
+endif
 ifdef AVX2_CFLAGS
   HAVE_AVX2 = 1
 endif
@@ -64,7 +69,8 @@ lib/libbase64.o: $(OBJS)
 	$(OBJCOPY) --keep-global-symbols=lib/exports.txt $@
 
 lib/config.h:
-	@echo "#define HAVE_AVX2   $(HAVE_AVX2)"    > $@
+	@echo "#define HAVE_AVX512 $(HAVE_AVX512)" > $@
+	@echo "#define HAVE_AVX2   $(HAVE_AVX2)"   >> $@
 	@echo "#define HAVE_NEON32 $(HAVE_NEON32)" >> $@
 	@echo "#define HAVE_NEON64 $(HAVE_NEON64)" >> $@
 	@echo "#define HAVE_SSSE3  $(HAVE_SSSE3)"  >> $@
@@ -75,6 +81,7 @@ lib/config.h:
 $(OBJS): lib/config.h
 $(OBJS): CFLAGS += -Ilib
 
+lib/arch/avx512/codec.o: CFLAGS += $(AVX512_CFLAGS)
 lib/arch/avx2/codec.o:   CFLAGS += $(AVX2_CFLAGS)
 lib/arch/neon32/codec.o: CFLAGS += $(NEON32_CFLAGS)
 lib/arch/neon64/codec.o: CFLAGS += $(NEON64_CFLAGS)

diff --git a/README.md b/README.md
@@ -3,7 +3,7 @@
 [![Build Status](https://github.com/aklomp/base64/actions/workflows/test.yml/badge.svg)](https://github.com/aklomp/base64/actions/workflows/test.yml)
 
 This is an implementation of a base64 stream encoding/decoding library in C99
-with SIMD (AVX2, NEON, AArch64/NEON, SSSE3, SSE4.1, SSE4.2, AVX) and
+with SIMD (AVX2, AVX512, NEON, AArch64/NEON, SSSE3, SSE4.1, SSE4.2, AVX) and
 [OpenMP](http://www.openmp.org) acceleration. It also contains wrapper functions
 to encode/decode simple length-delimited strings. This library aims to be:
 
@@ -19,6 +19,10 @@ will pick an optimized codec that lets it encode/decode 12 or 24 bytes at a
 time, which gives a speedup of four or more times compared to the "plain"
 bytewise codec.
 
+AVX512 support is currently limited to encoding, which uses the AVX512 VL and
+VBMI instructions. Decoding reuses the AVX2 implementation. Intel CPUs starting
+with Cannonlake (manufactured in 2018) support these instructions.
+
 NEON support is hardcoded to on or off at compile time, because portable
 runtime feature detection is unavailable on ARM.
 
@@ -59,6 +63,9 @@ optimizations described by Wojciech Muła in a
 [articles](http://0x80.pl/notesen/2016-01-17-sse-base64-decoding.html).
 His own code is [here](https://github.com/WojciechMula/toys/tree/master/base64).
 
+The AVX512 encoder is also based on code from a project by Wojciech Muła, which
+can be found [here](https://github.com/WojciechMula/base64-avx512).
+
 The OpenMP implementation was added by Ferry Toth (@htot) from [Exalon Delft](http://www.exalondelft.nl).
 
 ## Building
@@ -76,8 +83,8 @@ To compile just the "plain" library without SIMD codecs, type:
 make lib/libbase64.o
 ```
 
-Optional SIMD codecs can be included by specifying the `AVX2_CFLAGS`, `NEON32_CFLAGS`, `NEON64_CFLAGS`,
-`SSSE3_CFLAGS`, `SSE41_CFLAGS`, `SSE42_CFLAGS` and/or `AVX_CFLAGS` environment variables.
+Optional SIMD codecs can be included by specifying the `AVX2_CFLAGS`, `AVX512_CFLAGS`, 
+`NEON32_CFLAGS`, `NEON64_CFLAGS`, `SSSE3_CFLAGS`, `SSE41_CFLAGS`, `SSE42_CFLAGS` and/or `AVX_CFLAGS` environment variables.
 A typical build invocation on x86 looks like this:
 
 ```sh
@@ -93,6 +100,15 @@ Example:
 AVX2_CFLAGS=-mavx2 make
 ```
 
+### AVX512
+
+To build and include the AVX512 codec, set the `AVX512_CFLAGS` environment variable to a value that will turn on AVX512 support in your compiler, typically `-mavx512vl -mavx512vbmi`.
+Example:
+
+```sh
+AVX512_CFLAGS="-mavx512vl -mavx512vbmi" make
+```
+
 The codec will only be used if runtime feature detection shows that the target machine supports AVX2.
 
 ### SSSE3
@@ -208,6 +224,7 @@ Mainly there for testing purposes, this is also useful on ARM where the only way
 The following constants can be used:
 
 - `BASE64_FORCE_AVX2`
+- `BASE64_FORCE_AVX512`
 - `BASE64_FORCE_NEON32`
 - `BASE64_FORCE_NEON64`
 - `BASE64_FORCE_PLAIN`

diff --git a/cmake/Modules/TargetSIMDInstructionSet.cmake b/cmake/Modules/TargetSIMDInstructionSet.cmake
@@ -21,6 +21,7 @@ macro(define_SIMD_compile_flags)
         set(COMPILE_FLAGS_SSE42 "-msse4.2")
         set(COMPILE_FLAGS_AVX "-mavx")
         set(COMPILE_FLAGS_AVX2 "-mavx2")
+        set(COMPILE_FLAGS_AVX512 "-mavx512vl -mavx512vbmi")
 
         #arm
         set(COMPILE_FLAGS_NEON32 "-mfpu=neon")
@@ -30,5 +31,6 @@ macro(define_SIMD_compile_flags)
         set(COMPILE_FLAGS_SSE42 " ")
         set(COMPILE_FLAGS_AVX "/arch:AVX")
         set(COMPILE_FLAGS_AVX2 "/arch:AVX2")
+        set(COMPILE_FLAGS_AVX512 "/arch:AVX512")
     endif()
 endmacro(define_SIMD_compile_flags)
diff --git a/cmake/config.h.in b/cmake/config.h.in
@@ -16,6 +16,9 @@
 #cmakedefine01 BASE64_WITH_AVX2
 #define HAVE_AVX2 BASE64_WITH_AVX2
 
+#cmakedefine01 BASE64_WITH_AVX512
+#define HAVE_AVX512 BASE64_WITH_AVX512
+
 #cmakedefine01 BASE64_WITH_NEON32
 #define HAVE_NEON32 BASE64_WITH_NEON32
 

diff --git a/include/libbase64.h b/include/libbase64.h
@@ -53,6 +53,7 @@ extern "C" {
 #define BASE64_FORCE_SSE41	(1 << 5)
 #define BASE64_FORCE_SSE42	(1 << 6)
 #define BASE64_FORCE_AVX	(1 << 7)
+#define BASE64_FORCE_AVX512	(1 << 8)
 
 struct base64_state {
 	int eof;

diff --git a/lib/arch/avx512/codec.c b/lib/arch/avx512/codec.c
@@ -0,0 +1,42 @@
+#include <stdint.h>
+#include <stddef.h>
+#include <stdlib.h>
+
+#include "../../../include/libbase64.h"
+#include "../../tables/tables.h"
+#include "../../codecs.h"
+#include "config.h"
+#include "../../env.h"
+
+#if HAVE_AVX512
+#include <immintrin.h>
+
+#include "dec_reshuffle.c"
+#include "dec_loop.c"
+#include "enc_reshuffle_translate.c"
+#include "enc_loop.c"
+
+#endif	// HAVE_AVX512
+
+BASE64_ENC_FUNCTION(avx512)
+{
+#if HAVE_AVX512	// enc_loop_avx512() is only compiled in when HAVE_AVX512 is set
+	#include "../generic/enc_head.c"
+	enc_loop_avx512(&s, &slen, &o, &olen);
+	#include "../generic/enc_tail.c"
+#else
+	BASE64_ENC_STUB
+#endif
+}
+
+// Decoding reuses the AVX2 algorithm (dec_loop_avx2 from this directory's dec_loop.c); there is no AVX512 decoder yet.
+BASE64_DEC_FUNCTION(avx512)
+{
+#if HAVE_AVX512	// dec_loop_avx2() here is only compiled in when HAVE_AVX512 is set
+	#include "../generic/dec_head.c"
+	dec_loop_avx2(&s, &slen, &o, &olen);
+	#include "../generic/dec_tail.c"
+#else
+	BASE64_DEC_STUB
+#endif
+}
diff --git a/lib/arch/avx512/dec_loop.c b/lib/arch/avx512/dec_loop.c
@@ -0,0 +1,110 @@
+static inline int
+dec_loop_avx2_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
+{
+	const __m256i lut_lo = _mm256_setr_epi8(
+		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
+		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A,
+		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
+		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);
+
+	const __m256i lut_hi = _mm256_setr_epi8(
+		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
+		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
+		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
+		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);
+
+	const __m256i lut_roll = _mm256_setr_epi8(
+		0,  16,  19,   4, -65, -65, -71, -71,
+		0,   0,   0,   0,   0,   0,   0,   0,
+		0,  16,  19,   4, -65, -65, -71, -71,
+		0,   0,   0,   0,   0,   0,   0,   0);
+
+	const __m256i mask_2F = _mm256_set1_epi8(0x2F);
+
+	// Load 32 bytes of input (unaligned load):
+	__m256i str = _mm256_loadu_si256((__m256i *) *s);
+
+	// See the SSSE3 decoder for an explanation of the algorithm.
+	const __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi32(str, 4), mask_2F);
+	const __m256i lo_nibbles = _mm256_and_si256(str, mask_2F);
+	const __m256i hi         = _mm256_shuffle_epi8(lut_hi, hi_nibbles);
+	const __m256i lo         = _mm256_shuffle_epi8(lut_lo, lo_nibbles);
+
+	if (!_mm256_testz_si256(lo, hi)) {	// some bit of (lo & hi) is set: bail out before consuming this round
+		return 0;
+	}
+
+	const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F);
+	const __m256i roll  = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles));
+
+	// Now simply add the delta values to the input:
+	str = _mm256_add_epi8(str, roll);
+
+	// Reshuffle the input to packed 12-byte output format:
+	str = dec_reshuffle(str);
+
+	// Store the output (a full 32-byte store; only 24 bytes are counted below):
+	_mm256_storeu_si256((__m256i *) *o, str);
+
+	*s += 32;	// 32 input bytes consumed
+	*o += 24;	// 24 output bytes produced
+	*rounds -= 1;	// one round completed
+
+	return 1;
+}
+
+static inline void
+dec_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+	if (*slen < 45) {	// 45 = one 32-byte round plus the 13-byte reserve described below
+		return;
+	}
+
+	// Process blocks of 32 bytes per round. Because 8 extra zero bytes are
+	// written after the output, ensure that there will be at least 13
+	// bytes of input data left to cover the gap. (11 data bytes and up to
+	// two end-of-string markers.)
+	size_t rounds = (*slen - 13) / 32;
+
+	*slen -= rounds * 32;	// 32 bytes consumed per round
+	*olen += rounds * 24;	// 24 bytes produced per round
+
+	do {	// run the rounds in unrolled batches of 8, 4, 2, then a single round
+		if (rounds >= 8) {
+			if (dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds)) {
+				continue;
+			}
+			break;	// an inner call returned 0: stop decoding here
+		}
+		if (rounds >= 4) {
+			if (dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds)) {
+				continue;
+			}
+			break;	// an inner call returned 0: stop decoding here
+		}
+		if (rounds >= 2) {
+			if (dec_loop_avx2_inner(s, o, &rounds) &&
+			    dec_loop_avx2_inner(s, o, &rounds)) {
+				continue;
+			}
+			break;	// an inner call returned 0: stop decoding here
+		}
+		dec_loop_avx2_inner(s, o, &rounds);	// exactly one round left at this point
+		break;	// leave the loop whether or not that last round succeeded
+
+	} while (rounds > 0);
+
+	// Adjust for any rounds that were skipped (undo the up-front accounting):
+	*slen += rounds * 32;
+	*olen -= rounds * 24;
+}
diff --git a/lib/arch/avx512/dec_reshuffle.c b/lib/arch/avx512/dec_reshuffle.c
@@ -0,0 +1,34 @@
+static inline __m256i
+dec_reshuffle (const __m256i in)
+{
+	// in, lower lane, bits, upper case are most significant bits, lower
+	// case are least significant bits:
+	// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
+	// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
+	// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
+	// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
+
+	const __m256i merge_ab_and_bc = _mm256_maddubs_epi16(in, _mm256_set1_epi32(0x01400140));
+	// 0000kkkk LLllllll 0000JJJJ JJjjKKKK
+	// 0000hhhh IIiiiiii 0000GGGG GGggHHHH
+	// 0000eeee FFffffff 0000DDDD DDddEEEE
+	// 0000bbbb CCcccccc 0000AAAA AAaaBBBB
+
+	__m256i out = _mm256_madd_epi16(merge_ab_and_bc, _mm256_set1_epi32(0x00011000));
+	// 00000000 JJJJJJjj KKKKkkkk LLllllll
+	// 00000000 GGGGGGgg HHHHhhhh IIiiiiii
+	// 00000000 DDDDDDdd EEEEeeee FFffffff
+	// 00000000 AAAAAAaa BBBBbbbb CCcccccc
+
+	// Pack bytes together in each lane (a shuffle index of -1 zeroes that output byte):
+	out = _mm256_shuffle_epi8(out, _mm256_setr_epi8(
+		2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1,
+		2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1));
+	// 00000000 00000000 00000000 00000000
+	// LLllllll KKKKkkkk JJJJJJjj IIiiiiii
+	// HHHHhhhh GGGGGGgg FFffffff EEEEeeee
+	// DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa
+
+	// Pack lanes: gather the three data dwords of each lane into the low 24 bytes of the result:
+	return _mm256_permutevar8x32_epi32(out, _mm256_setr_epi32(0, 1, 2, 4, 5, 6, -1, -1));
+}