Skip to content

Commit

Permalink
AVX2: dec: factor decoding loop into inline function
Browse files Browse the repository at this point in the history
  • Loading branch information
aklomp committed Nov 27, 2019
1 parent f831f9d commit 3e4b780
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 30 deletions.
3 changes: 2 additions & 1 deletion lib/arch/avx2/codec.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <immintrin.h>

#include "dec_reshuffle.c"
#include "dec_loop.c"
#include "enc_translate.c"
#include "enc_reshuffle.c"
#include "enc_loop.c"
Expand All @@ -30,7 +31,7 @@ BASE64_DEC_FUNCTION(avx2)
{
#if HAVE_AVX2
#include "../generic/dec_head.c"
#include "dec_loop.c"
dec_loop_avx2(&c, &srclen, &o, &outl);
#include "../generic/dec_tail.c"
#else
BASE64_DEC_STUB
Expand Down
71 changes: 42 additions & 29 deletions lib/arch/avx2/dec_loop.c
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
// If we have AVX2 support, pick off 32 bytes at a time for as long as we can,
// but make sure that we quit before seeing any == markers at the end of the
// string. Also, because we write 8 zeroes at the end of the output, ensure
// that there are at least 11 valid bytes of input data remaining to close the
// gap. 32 + 2 + 11 = 45 bytes:
while (srclen >= 45)
static inline void
dec_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
// Load string:
__m256i str = _mm256_loadu_si256((__m256i *)c);
if (*slen < 45) {
return;
}

// Process blocks of 32 bytes per round. Because 8 extra zero bytes are
// written after the output, ensure that there will be at least 13
// bytes of input data left to cover the gap. (11 data bytes and up to
// two end-of-string markers.)
size_t rounds = (*slen - 13) / 32;

// See ssse3/dec_loop.c for an explanation of how the code works.
*slen -= rounds * 32; // 32 bytes consumed per round
*olen += rounds * 24; // 24 bytes produced per round

const __m256i lut_lo = _mm256_setr_epi8(
0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
Expand All @@ -30,29 +34,38 @@ while (srclen >= 45)

const __m256i mask_2F = _mm256_set1_epi8(0x2F);

// lookup
const __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi32(str, 4), mask_2F);
const __m256i lo_nibbles = _mm256_and_si256(str, mask_2F);
const __m256i hi = _mm256_shuffle_epi8(lut_hi, hi_nibbles);
const __m256i lo = _mm256_shuffle_epi8(lut_lo, lo_nibbles);
const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F);
const __m256i roll = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles));
do {
// Load input:
__m256i str = _mm256_loadu_si256((__m256i *) *s);

if (!_mm256_testz_si256(lo, hi)) {
break;
}
// See the SSSE3 decoder for an explanation of the algorithm.
const __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi32(str, 4), mask_2F);
const __m256i lo_nibbles = _mm256_and_si256(str, mask_2F);
const __m256i hi = _mm256_shuffle_epi8(lut_hi, hi_nibbles);
const __m256i lo = _mm256_shuffle_epi8(lut_lo, lo_nibbles);

if (!_mm256_testz_si256(lo, hi)) {
break;
}

const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F);
const __m256i roll = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles));

// Now simply add the delta values to the input:
str = _mm256_add_epi8(str, roll);

// Reshuffle the input to packed 12-byte output format:
str = dec_reshuffle(str);

// Now simply add the delta values to the input:
str = _mm256_add_epi8(str, roll);
// Store the output:
_mm256_storeu_si256((__m256i *) *o, str);

// Reshuffle the input to packed 12-byte output format:
str = dec_reshuffle(str);
*s += 32;
*o += 24;

// Store back:
_mm256_storeu_si256((__m256i *)o, str);
} while (--rounds > 0);

c += 32;
o += 24;
outl += 24;
srclen -= 32;
// Adjust for any rounds that were skipped:
*slen += rounds * 32;
*olen -= rounds * 24;
}

0 comments on commit 3e4b780

Please sign in to comment.