Skip to content

Commit

Permalink
Plain decoding optimization
Browse files Browse the repository at this point in the history
This now uses a modified implementation of Nick Galbreath's (@client9)
algorithm.
Modifications of the original algorithm include:
- Unaligned access is now optional (macro to enable this)
- Invalid characters are mapped to 0xffffffff instead of 0x01ffffff.
This removes the need to shift the value returned by the tables by 8 on
big-endian architectures with fast unaligned access, at the expense of an
invalid-character check that differs depending on endianness.

Speed-up using clang-800.0.42.1 on i7-4870HQ @ 2.5 GHz or iPhone SE:
- x86_64: +97%
- i386: +60%
- arm64: +0%
- armv7: +24%

As a side note, it seems that the iPhone SE processor (Apple A9) has
fast unaligned access, which gives an even higher speed-up with the
corresponding macro (HAVE_FAST_UNALIGNED_ACCESS) set to 1 (+23% / +56%).
This may not be the case for some other ARM processors.
  • Loading branch information
mayeut committed Feb 20, 2017
1 parent f700099 commit cfa8bf7
Show file tree
Hide file tree
Showing 13 changed files with 670 additions and 143 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
*.o
bin/base64
lib/config.h
lib/table_generator
test/benchmark
test/test_base64
4 changes: 3 additions & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
Copyright (c) 2013-2015, Alfred Klomp
Copyright (c) 2005-2007, Nick Galbreath
Copyright (c) 2013-2017, Alfred Klomp
Copyright (c) 2016-2017, Matthieu Darbois
All rights reserved.

Redistribution and use in source and binary forms, with or without
Expand Down
31 changes: 22 additions & 9 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,13 @@ HAVE_SSSE3 = 0
HAVE_SSE41 = 0
HAVE_SSE42 = 0
HAVE_AVX = 0
HAVE_FAST_UNALIGNED_ACCESS = 0

# The user should supply compiler flags for the codecs they want to build.
# Check which codecs we're going to include:
ifdef AVX2_CFLAGS
HAVE_AVX2 = 1
HAVE_FAST_UNALIGNED_ACCESS = 1
endif
ifdef NEON32_CFLAGS
HAVE_NEON32 = 1
Expand All @@ -36,15 +38,19 @@ ifdef NEON64_CFLAGS
endif
ifdef SSSE3_CFLAGS
HAVE_SSSE3 = 1
HAVE_FAST_UNALIGNED_ACCESS = 1
endif
ifdef SSE41_CFLAGS
HAVE_SSE41 = 1
HAVE_FAST_UNALIGNED_ACCESS = 1
endif
ifdef SSE42_CFLAGS
HAVE_SSE42 = 1
HAVE_FAST_UNALIGNED_ACCESS = 1
endif
ifdef AVX_CFLAGS
HAVE_AVX = 1
HAVE_FAST_UNALIGNED_ACCESS = 1
endif
ifdef OPENMP
CFLAGS += -fopenmp
Expand All @@ -63,15 +69,22 @@ lib/libbase64.o: $(OBJS)
$(OBJCOPY) --keep-global-symbols=lib/exports.txt $@

lib/config.h:
@echo "#define HAVE_AVX2 $(HAVE_AVX2)" > $@
@echo "#define HAVE_NEON32 $(HAVE_NEON32)" >> $@
@echo "#define HAVE_NEON64 $(HAVE_NEON64)" >> $@
@echo "#define HAVE_SSSE3 $(HAVE_SSSE3)" >> $@
@echo "#define HAVE_SSE41 $(HAVE_SSE41)" >> $@
@echo "#define HAVE_SSE42 $(HAVE_SSE42)" >> $@
@echo "#define HAVE_AVX $(HAVE_AVX)" >> $@
@echo "#define HAVE_AVX2 $(HAVE_AVX2)" > $@
@echo "#define HAVE_NEON32 $(HAVE_NEON32)" >> $@
@echo "#define HAVE_NEON64 $(HAVE_NEON64)" >> $@
@echo "#define HAVE_SSSE3 $(HAVE_SSSE3)" >> $@
@echo "#define HAVE_SSE41 $(HAVE_SSE41)" >> $@
@echo "#define HAVE_SSE42 $(HAVE_SSE42)" >> $@
@echo "#define HAVE_AVX $(HAVE_AVX)" >> $@
@echo "#define HAVE_FAST_UNALIGNED_ACCESS $(HAVE_FAST_UNALIGNED_ACCESS)" >> $@

lib/codec_choose.o: lib/config.h
lib/tables.h: lib/table_generator.c
$(CC) $(CFLAGS) -o lib/table_generator $^
./lib/table_generator > $@

$(OBJS): lib/config.h

lib/lib.o: lib/tables.h

lib/arch/avx2/codec.o: CFLAGS += $(AVX2_CFLAGS)
lib/arch/neon32/codec.o: CFLAGS += $(NEON32_CFLAGS)
Expand All @@ -88,4 +101,4 @@ analyze: clean
scan-build --use-analyzer=`which clang` --status-bugs make

clean:
rm -f bin/base64 bin/base64.o lib/libbase64.o lib/config.h $(OBJS)
rm -f bin/base64 bin/base64.o lib/libbase64.o lib/table_generator.o lib/table_generator lib/config.h $(OBJS)
38 changes: 19 additions & 19 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -425,35 +425,35 @@ x86 processors

| Processor | Plain enc | Plain dec | SSSE3 enc | SSSE3 dec | SSE4.1 enc | SSE4.1 dec| SSE4.2 enc | SSE4.2 dec| AVX enc | AVX dec | AVX2 enc | AVX2 dec |
|-------------------------------------------|----------:|----------:|----------:|----------:|-----------:|----------:|-----------:|----------:|--------:|--------:|---------:|---------:|
| i7-4771 @ 3.5 GHz | 833 | 1111 | 3333\* | 4444\* | TBD | TBD | TBD | TBD | TBD | TBD | 4999\* | 6666\* |
| i7-4770 @ 3.4 GHz DDR1600 | 1831 | 1748 | 3570\* | 3695\* | TBD | TBD | TBD | TBD | TBD | TBD | 6539\* | 6512\* |
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 1 thread | 1779 | 1727 | 3419\* | 3788\* | TBD | TBD | TBD | TBD | TBD | TBD | 4589\* | 5871\* |
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 2 thread | 3367 | 3374 | 4784\* | 6672\* | TBD | TBD | TBD | TBD | TBD | TBD | 5120\* | 7721\* |
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 4 thread | 4834 | 6075 | 4906\* | 8154\* | TBD | TBD | TBD | TBD | TBD | TBD | 4839\* | 6911\* |
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 8 thread | 4696 | 6361 | 5227\* | 7737\* | TBD | TBD | TBD | TBD | TBD | TBD | 4813\* | 7189\* |
| i7-4870HQ @ 2.5 GHz | 1471 | 1558 | 5599 | 3886 | 5882 | 3888 | 6202 | 5098 | 6524 | 5281 | 8113 | 7063 |
| i5-4590S @ 3.0 GHz | 1721 | 1643 | 3255\* | 3404\* | TBD | TBD | TBD | TBD | TBD | TBD | 4124\* | 5403\* |
| Xeon X5570 @ 2.93 GHz | 1097 | 1048 | 2077\* | 2215\* | TBD | TBD | TBD | TBD | - | - | - | - |
| Pentium4 @ 3.4 GHz | 528 | 448 | - | - | - | - | - | - | - | - | - | - |
| Atom N270 | 112 | 125 | 331\* | 368\* | - | - | - | - | - | - | - | - |
| AMD E-450 | 370 | 332 | 405\* | 366\* | - | - | - | - | - | - | - | - |
| Intel Edison @ 500 MHz | 79 | 92 | 152\* | 172\* | TBD | TBD | TBD | TBD | - | - | - | - |
| Intel Edison @ 500 MHz OPENMP 2 thread | 158 | 184 | 300\* | 343\* | TBD | TBD | TBD | TBD | - | - | - | - |
| i7-4771 @ 3.5 GHz | 833 | 1111\* | 3333\* | 4444\* | TBD | TBD | TBD | TBD | TBD | TBD | 4999\* | 6666\* |
| i7-4770 @ 3.4 GHz DDR1600 | 1831 | 1748\* | 3570\* | 3695\* | TBD | TBD | TBD | TBD | TBD | TBD | 6539\* | 6512\* |
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 1 thread | 1779 | 1727\* | 3419\* | 3788\* | TBD | TBD | TBD | TBD | TBD | TBD | 4589\* | 5871\* |
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 2 thread | 3367 | 3374\* | 4784\* | 6672\* | TBD | TBD | TBD | TBD | TBD | TBD | 5120\* | 7721\* |
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 4 thread | 4834 | 6075\* | 4906\* | 8154\* | TBD | TBD | TBD | TBD | TBD | TBD | 4839\* | 6911\* |
| i7-4770 @ 3.4 GHz DDR1600 OPENMP 8 thread | 4696 | 6361\* | 5227\* | 7737\* | TBD | TBD | TBD | TBD | TBD | TBD | 4813\* | 7189\* |
| i7-4870HQ @ 2.5 GHz | 1471 | 3066 | 5599 | 3886 | 5882 | 3888 | 6202 | 5098 | 6524 | 5281 | 8113 | 7063 |
| i5-4590S @ 3.0 GHz | 1721 | 1643\* | 3255\* | 3404\* | TBD | TBD | TBD | TBD | TBD | TBD | 4124\* | 5403\* |
| Xeon X5570 @ 2.93 GHz | 1097 | 1048\* | 2077\* | 2215\* | TBD | TBD | TBD | TBD | - | - | - | - |
| Pentium4 @ 3.4 GHz | 528 | 448\* | - | - | - | - | - | - | - | - | - | - |
| Atom N270 | 112 | 125\* | 331\* | 368\* | - | - | - | - | - | - | - | - |
| AMD E-450 | 370 | 332\* | 405\* | 366\* | - | - | - | - | - | - | - | - |
| Intel Edison @ 500 MHz | 79 | 92\* | 152\* | 172\* | TBD | TBD | TBD | TBD | - | - | - | - |
| Intel Edison @ 500 MHz OPENMP 2 thread | 158 | 184\* | 300\* | 343\* | TBD | TBD | TBD | TBD | - | - | - | - |

ARM processors

| Processor | Plain enc | Plain dec | NEON32 enc | NEON32 dec | NEON64 enc | NEON64 dec |
|-------------------------------------------|----------:|----------:|-----------:|-----------:|-----------:|-----------:|
| Raspberry PI B+ V1.2 | 46 | 40 | - | - | - | - |
| Raspberry PI 2 B V1.1 | 104 | 88 | 188 | 116\* | - | - |
| Apple iPhone SE armv7 | 1056 | 722 | 2943 | 1573 | - | - |
| Apple iPhone SE arm64 | 1061 | 1237 | - | - | 4098 | 3983 |
| Raspberry PI B+ V1.2 | 46 | 40\* | - | - | - | - |
| Raspberry PI 2 B V1.1 | 104 | 88\* | 188 | 116\* | - | - |
| Apple iPhone SE armv7 | 1056 | 895 | 2943 | 1573 | - | - |
| Apple iPhone SE arm64 | 1061 | 1239 | - | - | 4098 | 3983 |

PowerPC processors

| Processor | Plain enc | Plain dec |
|-------------------------------------------|----------:|----------:|
| PowerPC E6500 @ 1.8GHz | 270 | 265 |
| PowerPC E6500 @ 1.8GHz | 270 | 265\* |


Benchmarks on i7-4770 @ 3.4 GHz DDR1600 with varying buffer sizes:
Expand Down
80 changes: 39 additions & 41 deletions lib/arch/generic/32/dec_loop.c
Original file line number Diff line number Diff line change
@@ -1,45 +1,43 @@
// If we have native uint32's, pick off 4 bytes at a time for as long as we
// can, but make sure that we quit before seeing any == markers at the end of
// the string. Also, because we write a zero at the end of the output, ensure
// that there are at least 2 valid bytes of input data remaining to close the
// gap. 4 + 2 + 2 = 8 bytes:
while (srclen >= 8)
// Read source 4 bytes at a time
// Since we might be writing one byte more than needed,
// we need to make sure there will still be some room
// for one extra byte in o.
// This will be the case if srclen > 0 when the loop
// is exited
while (srclen > 4)
{
uint32_t str, res, dec;

// Load string:
str = *(uint32_t *)c;

// Shuffle bytes to 32-bit bigendian:
str = cpu_to_be32(str);

// Lookup each byte in the decoding table; if we encounter any
// "invalid" values, fall back on the bytewise code:
if ((dec = base64_table_dec[str >> 24]) > 63) {
break;
}
res = dec << 26;

if ((dec = base64_table_dec[(str >> 16) & 0xFF]) > 63) {
break;
}
res |= dec << 20;

if ((dec = base64_table_dec[(str >> 8) & 0xFF]) > 63) {
break;
}
res |= dec << 14;

if ((dec = base64_table_dec[str & 0xFF]) > 63) {
break;
}
res |= dec << 8;

// Reshuffle and repack into 3-byte output format:
res = be32_to_cpu(res);

// Store back:
*(uint32_t *)o = res;
union {
uint32_t asint;
uint8_t aschar[4];
} x;

x.asint = base64_table_dec_d0[c[0]]
| base64_table_dec_d1[c[1]]
| base64_table_dec_d2[c[2]]
| base64_table_dec_d3[c[3]];

#if BASE64_LITTLE_ENDIAN
// LUTs for little-endian set Most Significant Bit
// in case of invalid character
if (x.asint & 0x80000000U) break;
#else
// LUTs for big-endian set Least Significant Bit
// in case of invalid character
if (x.asint & 1U) break;
#endif

#if HAVE_FAST_UNALIGNED_ACCESS
// This might segfault or be too slow on
// some architectures, do this only if specified
// with HAVE_FAST_UNALIGNED_ACCESS macro
// We write one byte more than needed
*(uint32_t*)o = x.asint;
#else
// Fallback, write bytes one by one
o[0] = x.aschar[0];
o[1] = x.aschar[1];
o[2] = x.aschar[2];
#endif

c += 4;
o += 3;
Expand Down
68 changes: 0 additions & 68 deletions lib/arch/generic/64/dec_loop.c

This file was deleted.

4 changes: 1 addition & 3 deletions lib/arch/generic/codec.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,8 @@ BASE64_ENC_FUNCTION(plain)
BASE64_DEC_FUNCTION(plain)
{
#include "dec_head.c"
#if BASE64_WORDSIZE == 32
#if BASE64_WORDSIZE >= 32
#include "32/dec_loop.c"
#elif BASE64_WORDSIZE == 64
#include "64/dec_loop.c"
#endif
#include "dec_tail.c"
}
2 changes: 1 addition & 1 deletion lib/arch/neon64/codec.c
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ BASE64_DEC_FUNCTION(neon64)

#include "../generic/dec_head.c"
#include "dec_loop.c"
#include "../generic/64/dec_loop.c"
#include "../generic/32/dec_loop.c"
#include "../generic/dec_tail.c"
#else
BASE64_DEC_STUB
Expand Down
1 change: 0 additions & 1 deletion lib/codec_choose.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

#include "../include/libbase64.h"
#include "codecs.h"
#include "config.h"

#if __x86_64__ || __i386__ || _M_X86 || _M_X64
#ifdef _MSC_VER
Expand Down
7 changes: 7 additions & 0 deletions lib/codecs.h
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#include "config.h"

// Function parameters for encoding functions:
#define BASE64_ENC_PARAMS \
( struct base64_state *state \
Expand Down Expand Up @@ -115,3 +117,8 @@ void codec_choose (struct codec *, int flags);
// for fallback plain encoding/decoding:
extern const uint8_t base64_table_enc[];
extern const uint8_t base64_table_dec[];

extern const uint32_t base64_table_dec_d0[];
extern const uint32_t base64_table_dec_d1[];
extern const uint32_t base64_table_dec_d2[];
extern const uint32_t base64_table_dec_d3[];
1 change: 1 addition & 0 deletions lib/lib.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#include "../include/libbase64.h"
#include "codecs.h"
#include "tables.h"

// These static function pointers are initialized once when the library is
// first used, and remain in use for the remaining lifetime of the program.
Expand Down
Loading

0 comments on commit cfa8bf7

Please sign in to comment.