Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

enable avx512 support for base64 encoding. Reuse WojciechMula/base64-… #102

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 4 additions & 0 deletions CMakeLists.txt
Expand Up @@ -62,6 +62,8 @@ cmake_dependent_option(BASE64_WITH_AVX "add AVX codepath" ON ${_IS_X86} OFF)
add_feature_info(AVX BASE64_WITH_AVX "add AVX codepath")
cmake_dependent_option(BASE64_WITH_AVX2 "add AVX 2 codepath" ON ${_IS_X86} OFF)
add_feature_info(AVX2 BASE64_WITH_AVX2 "add AVX2 codepath")
cmake_dependent_option(BASE64_WITH_AVX512 "add AVX 512 codepath" ON ${_IS_X86} OFF)
add_feature_info(AVX2 BASE64_WITH_AVX512 "add AVX512 codepath")

cmake_dependent_option(BASE64_WITH_NEON32 "add NEON32 codepath" OFF _TARGET_ARCH_arm OFF)
add_feature_info(NEON32 BASE64_WITH_NEON32 "add NEON32 codepath")
Expand Down Expand Up @@ -118,6 +120,7 @@ add_library(base64
lib/arch/sse42/codec.c
lib/arch/avx/codec.c
lib/arch/avx2/codec.c
lib/arch/avx512/codec.c

lib/arch/neon32/codec.c
lib/arch/neon64/codec.c
Expand Down Expand Up @@ -206,6 +209,7 @@ if (_TARGET_ARCH STREQUAL "x86" OR _TARGET_ARCH STREQUAL "x64")
configure_codec(SSE42 __SSSE4_2__)
configure_codec(AVX)
configure_codec(AVX2)
configure_codec(AVX512)

elseif (_TARGET_ARCH STREQUAL "arm")
set(BASE64_NEON32_CFLAGS "${COMPILE_FLAGS_NEON32}" CACHE STRING "the NEON32 compile flags (for 'lib/arch/neon32/codec.c')")
Expand Down
9 changes: 8 additions & 1 deletion Makefile
Expand Up @@ -4,6 +4,7 @@ CFLAGS += -std=c99 -O3 -Wall -Wextra -pedantic
OBJCOPY ?= objcopy

OBJS = \
lib/arch/avx512/codec.o \
lib/arch/avx2/codec.o \
lib/arch/generic/codec.o \
lib/arch/neon32/codec.o \
Expand All @@ -16,6 +17,7 @@ OBJS = \
lib/codec_choose.o \
lib/tables/tables.o

HAVE_AVX512 = 0
HAVE_AVX2 = 0
HAVE_NEON32 = 0
HAVE_NEON64 = 0
Expand All @@ -26,6 +28,9 @@ HAVE_AVX = 0

# The user should supply compiler flags for the codecs they want to build.
# Check which codecs we're going to include:
ifdef AVX512_CFLAGS
HAVE_AVX512 = 1
endif
ifdef AVX2_CFLAGS
HAVE_AVX2 = 1
endif
Expand Down Expand Up @@ -64,7 +69,8 @@ lib/libbase64.o: $(OBJS)
$(OBJCOPY) --keep-global-symbols=lib/exports.txt $@

lib/config.h:
@echo "#define HAVE_AVX2 $(HAVE_AVX2)" > $@
@echo "#define HAVE_AVX512 $(HAVE_AVX512)" > $@
@echo "#define HAVE_AVX2 $(HAVE_AVX2)" >> $@
@echo "#define HAVE_NEON32 $(HAVE_NEON32)" >> $@
@echo "#define HAVE_NEON64 $(HAVE_NEON64)" >> $@
@echo "#define HAVE_SSSE3 $(HAVE_SSSE3)" >> $@
Expand All @@ -75,6 +81,7 @@ lib/config.h:
$(OBJS): lib/config.h
$(OBJS): CFLAGS += -Ilib

lib/arch/avx512/codec.o: CFLAGS += $(AVX512_CFLAGS)
lib/arch/avx2/codec.o: CFLAGS += $(AVX2_CFLAGS)
lib/arch/neon32/codec.o: CFLAGS += $(NEON32_CFLAGS)
lib/arch/neon64/codec.o: CFLAGS += $(NEON64_CFLAGS)
Expand Down
23 changes: 20 additions & 3 deletions README.md
Expand Up @@ -3,7 +3,7 @@
[![Build Status](https://github.com/aklomp/base64/actions/workflows/test.yml/badge.svg)](https://github.com/aklomp/base64/actions/workflows/test.yml)

This is an implementation of a base64 stream encoding/decoding library in C99
with SIMD (AVX2, NEON, AArch64/NEON, SSSE3, SSE4.1, SSE4.2, AVX) and
with SIMD (AVX2, AVX512, NEON, AArch64/NEON, SSSE3, SSE4.1, SSE4.2, AVX) and
[OpenMP](http://www.openmp.org) acceleration. It also contains wrapper functions
to encode/decode simple length-delimited strings. This library aims to be:

Expand All @@ -19,6 +19,10 @@ will pick an optimized codec that lets it encode/decode 12 or 24 bytes at a
time, which gives a speedup of four or more times compared to the "plain"
bytewise codec.

AVX512 support is only for encoding at present, utilizing the AVX512 VL and VBMI
instructions. Decoding part reused AVX2 implementations. For CPUs later than
Cannonlake (manufactured in 2018) supports these instructions.

NEON support is hardcoded to on or off at compile time, because portable
runtime feature detection is unavailable on ARM.

Expand Down Expand Up @@ -59,6 +63,9 @@ optimizations described by Wojciech Muła in a
[articles](http://0x80.pl/notesen/2016-01-17-sse-base64-decoding.html).
His own code is [here](https://github.com/WojciechMula/toys/tree/master/base64).

The AVX512 encoder code is also referenced from the project of Wojciech Muła and
the project code is [here](https://github.com/WojciechMula/base64-avx512)

The OpenMP implementation was added by Ferry Toth (@htot) from [Exalon Delft](http://www.exalondelft.nl).

## Building
Expand All @@ -76,8 +83,8 @@ To compile just the "plain" library without SIMD codecs, type:
make lib/libbase64.o
```

Optional SIMD codecs can be included by specifying the `AVX2_CFLAGS`, `NEON32_CFLAGS`, `NEON64_CFLAGS`,
`SSSE3_CFLAGS`, `SSE41_CFLAGS`, `SSE42_CFLAGS` and/or `AVX_CFLAGS` environment variables.
Optional SIMD codecs can be included by specifying the `AVX2_CFLAGS`, `AVX512_CFLAGS`,
`NEON32_CFLAGS`, `NEON64_CFLAGS`, `SSSE3_CFLAGS`, `SSE41_CFLAGS`, `SSE42_CFLAGS` and/or `AVX_CFLAGS` environment variables.
A typical build invocation on x86 looks like this:

```sh
Expand All @@ -93,6 +100,15 @@ Example:
AVX2_CFLAGS=-mavx2 make
```

### AVX512

To build and include the AVX512 codec, set the `AVX512_CFLAGS` environment variable to a value that will turn on AVX512 support in your compiler, typically `-mavx512vl -mavx512vbmi`.
Example:

```sh
AVX512_CFLAGS="-mavx512vl -mavx512vbmi" make
```

The codec will only be used if runtime feature detection shows that the target machine supports AVX2.

### SSSE3
Expand Down Expand Up @@ -208,6 +224,7 @@ Mainly there for testing purposes, this is also useful on ARM where the only way
The following constants can be used:

- `BASE64_FORCE_AVX2`
- `BASE64_FORCE_AVX512`
- `BASE64_FORCE_NEON32`
- `BASE64_FORCE_NEON64`
- `BASE64_FORCE_PLAIN`
Expand Down
2 changes: 2 additions & 0 deletions cmake/Modules/TargetSIMDInstructionSet.cmake
Expand Up @@ -21,6 +21,7 @@ macro(define_SIMD_compile_flags)
set(COMPILE_FLAGS_SSE42 "-msse4.2")
set(COMPILE_FLAGS_AVX "-mavx")
set(COMPILE_FLAGS_AVX2 "-mavx2")
set(COMPILE_FLAGS_AVX512 "-mavx512vl -mavx512vbmi")

#arm
set(COMPILE_FLAGS_NEON32 "-mfpu=neon")
Expand All @@ -30,5 +31,6 @@ macro(define_SIMD_compile_flags)
set(COMPILE_FLAGS_SSE42 " ")
set(COMPILE_FLAGS_AVX "/arch:AVX")
set(COMPILE_FLAGS_AVX2 "/arch:AVX2")
set(COMPILE_FLAGS_AVX512 "/arch:AVX512")
endif()
endmacro(define_SIMD_compile_flags)
3 changes: 3 additions & 0 deletions cmake/config.h.in
Expand Up @@ -16,6 +16,9 @@
#cmakedefine01 BASE64_WITH_AVX2
#define HAVE_AVX2 BASE64_WITH_AVX2

#cmakedefine01 BASE64_WITH_AVX512
#define HAVE_AVX512 BASE64_WITH_AVX512

#cmakedefine01 BASE64_WITH_NEON32
#define HAVE_NEON32 BASE64_WITH_NEON32

Expand Down
1 change: 1 addition & 0 deletions include/libbase64.h
Expand Up @@ -53,6 +53,7 @@ extern "C" {
#define BASE64_FORCE_SSE41 (1 << 5)
#define BASE64_FORCE_SSE42 (1 << 6)
#define BASE64_FORCE_AVX (1 << 7)
#define BASE64_FORCE_AVX512 (1 << 8)

struct base64_state {
int eof;
Expand Down
42 changes: 42 additions & 0 deletions lib/arch/avx512/codec.c
@@ -0,0 +1,42 @@
#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>

#include "../../../include/libbase64.h"
#include "../../tables/tables.h"
#include "../../codecs.h"
#include "config.h"
#include "../../env.h"

#if HAVE_AVX512
#include <immintrin.h>

#include "dec_reshuffle.c"
#include "dec_loop.c"
#include "enc_reshuffle_translate.c"
#include "enc_loop.c"

#endif // HAVE_AVX512

BASE64_ENC_FUNCTION(avx512)
{
#if HAVE_AVX2
#include "../generic/enc_head.c"
enc_loop_avx512(&s, &slen, &o, &olen);
#include "../generic/enc_tail.c"
#else
BASE64_ENC_STUB
#endif
}

// Reuse AVX2 decoding. Not supporting AVX512 at present
BASE64_DEC_FUNCTION(avx512)
{
#if HAVE_AVX2
#include "../generic/dec_head.c"
dec_loop_avx2(&s, &slen, &o, &olen);
#include "../generic/dec_tail.c"
#else
BASE64_DEC_STUB
#endif
}
110 changes: 110 additions & 0 deletions lib/arch/avx512/dec_loop.c
@@ -0,0 +1,110 @@
static inline int
dec_loop_avx2_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
{
const __m256i lut_lo = _mm256_setr_epi8(
0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A,
0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);

const __m256i lut_hi = _mm256_setr_epi8(
0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);

const __m256i lut_roll = _mm256_setr_epi8(
0, 16, 19, 4, -65, -65, -71, -71,
0, 0, 0, 0, 0, 0, 0, 0,
0, 16, 19, 4, -65, -65, -71, -71,
0, 0, 0, 0, 0, 0, 0, 0);

const __m256i mask_2F = _mm256_set1_epi8(0x2F);

// Load input:
__m256i str = _mm256_loadu_si256((__m256i *) *s);

// See the SSSE3 decoder for an explanation of the algorithm.
const __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi32(str, 4), mask_2F);
const __m256i lo_nibbles = _mm256_and_si256(str, mask_2F);
const __m256i hi = _mm256_shuffle_epi8(lut_hi, hi_nibbles);
const __m256i lo = _mm256_shuffle_epi8(lut_lo, lo_nibbles);

if (!_mm256_testz_si256(lo, hi)) {
return 0;
}

const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F);
const __m256i roll = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles));

// Now simply add the delta values to the input:
str = _mm256_add_epi8(str, roll);

// Reshuffle the input to packed 12-byte output format:
str = dec_reshuffle(str);

// Store the output:
_mm256_storeu_si256((__m256i *) *o, str);

*s += 32;
*o += 24;
*rounds -= 1;

return 1;
}

static inline void
dec_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typo? Should be dec_loop_avx512

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No. This PR is only for the encoding part for AVX512 because Node.js only depends on base64 SIMD encoding. In general Base64 decoding cannot be vectorized when there are space chars in input. To not break your project in general, I reuse the AVX2 for decoding part in my PR.

{
if (*slen < 45) {
return;
}

// Process blocks of 32 bytes per round. Because 8 extra zero bytes are
// written after the output, ensure that there will be at least 13
// bytes of input data left to cover the gap. (11 data bytes and up to
// two end-of-string markers.)
size_t rounds = (*slen - 13) / 32;

*slen -= rounds * 32; // 32 bytes consumed per round
*olen += rounds * 24; // 24 bytes produced per round

do {
if (rounds >= 8) {
if (dec_loop_avx2_inner(s, o, &rounds) &&
dec_loop_avx2_inner(s, o, &rounds) &&
dec_loop_avx2_inner(s, o, &rounds) &&
dec_loop_avx2_inner(s, o, &rounds) &&
dec_loop_avx2_inner(s, o, &rounds) &&
dec_loop_avx2_inner(s, o, &rounds) &&
dec_loop_avx2_inner(s, o, &rounds) &&
dec_loop_avx2_inner(s, o, &rounds)) {
continue;
}
break;
}
if (rounds >= 4) {
if (dec_loop_avx2_inner(s, o, &rounds) &&
dec_loop_avx2_inner(s, o, &rounds) &&
dec_loop_avx2_inner(s, o, &rounds) &&
dec_loop_avx2_inner(s, o, &rounds)) {
continue;
}
break;
}
if (rounds >= 2) {
if (dec_loop_avx2_inner(s, o, &rounds) &&
dec_loop_avx2_inner(s, o, &rounds)) {
continue;
}
break;
}
dec_loop_avx2_inner(s, o, &rounds);
break;

} while (rounds > 0);

// Adjust for any rounds that were skipped:
*slen += rounds * 32;
*olen -= rounds * 24;
}
34 changes: 34 additions & 0 deletions lib/arch/avx512/dec_reshuffle.c
@@ -0,0 +1,34 @@
static inline __m256i
dec_reshuffle (const __m256i in)
{
// in, lower lane, bits, upper case are most significant bits, lower
// case are least significant bits:
// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA

const __m256i merge_ab_and_bc = _mm256_maddubs_epi16(in, _mm256_set1_epi32(0x01400140));
// 0000kkkk LLllllll 0000JJJJ JJjjKKKK
// 0000hhhh IIiiiiii 0000GGGG GGggHHHH
// 0000eeee FFffffff 0000DDDD DDddEEEE
// 0000bbbb CCcccccc 0000AAAA AAaaBBBB

__m256i out = _mm256_madd_epi16(merge_ab_and_bc, _mm256_set1_epi32(0x00011000));
// 00000000 JJJJJJjj KKKKkkkk LLllllll
// 00000000 GGGGGGgg HHHHhhhh IIiiiiii
// 00000000 DDDDDDdd EEEEeeee FFffffff
// 00000000 AAAAAAaa BBBBbbbb CCcccccc

// Pack bytes together in each lane:
out = _mm256_shuffle_epi8(out, _mm256_setr_epi8(
2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1,
2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1));
// 00000000 00000000 00000000 00000000
// LLllllll KKKKkkkk JJJJJJjj IIiiiiii
// HHHHhhhh GGGGGGgg FFffffff EEEEeeee
// DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa

// Pack lanes:
return _mm256_permutevar8x32_epi32(out, _mm256_setr_epi32(0, 1, 2, 4, 5, 6, -1, -1));
}