NEON64: enc: convert full encoding loop to inline assembly
Convert the full encoding loop to an inline assembly implementation for systems that can use inline assembly.

The motivation for this work is that when optimization is turned off on recent versions of clang, the encoding table would not be loaded into sequential registers (see issue #96). This happened despite taking pains to ensure that the compiler uses an explicit set of registers for the load (v8-v11). This leaves us with few options besides rewriting the full encoding loop in inline assembly: only that way can we be absolutely certain that the register usage is always correct. Thankfully, aarch64 assembly is not very difficult to write by hand.

In making this change, we also add some optimizations to the loop unrolling for rounds >= 8. The unrolled loop should improve pipeline efficiency by interleaving memory operations (loads and stores) with data operations (table lookups). The best way to achieve this is to blend the unrolled rounds so that each round prefetches the registers needed by the next one. To make that possible without duplicating massive amounts of code, we abstract the various assembly blocks into preprocessor macros and instantiate them as needed. Mixing the preprocessor with inline assembly is perhaps a bit gnarly, but I think the usage is simple enough that the advantages (code reuse) outweigh the disadvantages.

The code was tested on a Debian VM running under QEMU. Unfortunately, this does not let us see how the actual bare-metal performance increases or decreases.
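For reference, the new enc_loop_neon64() entry point (visible in the diff below) consumes 48 source bytes and produces 64 output bytes per round, updating the pointers and lengths in place and leaving any partial final block to the caller. The sketch below only illustrates that call pattern; the wrapper name and the scalar tail step are hypothetical and not part of this commit.

    // Hypothetical caller sketch (not part of this patch). Assumes the
    // surrounding codec provides a scalar fallback encoder for the final
    // partial block, and that this file's usual includes are present.
    static void
    encode_bulk_example (const uint8_t *src, size_t srclen, uint8_t *dst, size_t *dstlen)
    {
        const uint8_t *s = src;
        uint8_t *o = dst;
        size_t olen = 0;

        // Consume as many whole 48-byte blocks as possible; on return,
        // srclen holds the leftover byte count (< 48) and olen the number
        // of base64 characters written so far.
        enc_loop_neon64(&s, &srclen, &o, &olen);

        // ...encode the remaining srclen bytes with the scalar routine...

        *dstlen = olen;
    }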
Showing 1 changed file with 154 additions and 86 deletions.
@@ -1,103 +1,171 @@
// Apologies in advance for combining the preprocessor with inline assembly;
// two notoriously gnarly parts of C. This was necessary to avoid a lot of code
// repetition. The preprocessor is used to template large sections of inline
// assembly that differ only in the registers they operate on. If this code
// were written out by hand, it would be very large and hard to audit.

// Generate a block of inline assembly that loads three user-defined registers
// V0..V2 from memory and deinterleaves them, post-incrementing the src
// pointer. The register set should be sequential.
#define LOAD(V0, V1, V2) \
    "ld3 {"V0".16b, "V1".16b, "V2".16b}, [%[src]], #48 \n\t"

// Generate a block of inline assembly that takes three deinterleaved registers
// and shuffles the bytes. The output is in temporary registers t0..t3.
#define SHUF(V0, V1, V2) \
    "ushr %[t0].16b, "V0".16b, #2 \n\t" \
    "ushr %[t1].16b, "V1".16b, #4 \n\t" \
    "ushr %[t2].16b, "V2".16b, #6 \n\t" \
    "sli %[t1].16b, "V0".16b, #4 \n\t" \
    "sli %[t2].16b, "V1".16b, #2 \n\t" \
    "and %[t1].16b, %[t1].16b, %[n63].16b \n\t" \
    "and %[t2].16b, %[t2].16b, %[n63].16b \n\t" \
    "and %[t3].16b, "V2".16b, %[n63].16b \n\t"

// Generate a block of inline assembly that takes temporary registers t0..t3
// and translates them to the base64 alphabet, using a table loaded into
// v8..v11. The output is in user-defined registers V0..V3.
#define TRAN(V0, V1, V2, V3) \
    "tbl "V0".16b, {v8.16b-v11.16b}, %[t0].16b \n\t" \
    "tbl "V1".16b, {v8.16b-v11.16b}, %[t1].16b \n\t" \
    "tbl "V2".16b, {v8.16b-v11.16b}, %[t2].16b \n\t" \
    "tbl "V3".16b, {v8.16b-v11.16b}, %[t3].16b \n\t"

// Generate a block of inline assembly that interleaves four registers and
// stores them, post-incrementing the destination pointer.
#define STOR(V0, V1, V2, V3) \
    "st4 {"V0".16b, "V1".16b, "V2".16b, "V3".16b}, [%[dst]], #64 \n\t"

// Generate a block of inline assembly that generates a single generic encoder
// round: fetch the data, process it, and store the result.
#define ROUND() \
    LOAD("v12", "v13", "v14") \
    SHUF("v12", "v13", "v14") \
    TRAN("v12", "v13", "v14", "v15") \
    STOR("v12", "v13", "v14", "v15")

// Generate a block of assembly that generates a type A interleaved encoder
// round: it uses registers that were fetched by the previous type B round, and
// fetches registers for the next type B round.
#define ROUND_A() \
    SHUF("v2", "v3", "v4") \
    LOAD("v12", "v13", "v14") \
    TRAN("v2", "v3", "v4", "v5") \
    STOR("v2", "v3", "v4", "v5")

// Type B interleaved encoder round. Same as type A, but with the register sets
// swapped.
#define ROUND_B() \
    SHUF("v12", "v13", "v14") \
    LOAD("v2", "v3", "v4") \
    TRAN("v12", "v13", "v14", "v15") \
    STOR("v12", "v13", "v14", "v15")

// Suppress clang's warning that the literal string in the asm statement is
// overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99
// compilers). It might be true, but the goal here is not C99 portability.
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Woverlength-strings"

static inline void
enc_loop_neon64_inner (const uint8_t **s, uint8_t **o, const uint8x16x4_t tbl_enc)
enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
    // This function duplicates the functionality of enc_loop_neon64_inner,
    // but entirely with inline assembly. This gives a significant speedup
    // over using NEON intrinsics, which do not always generate very good
    // code. The logic of the assembly is directly lifted from the
    // intrinsics version, so it can be used as a guide to this code.
    size_t rounds = *slen / 48;

    if (rounds == 0) {
        return;
    }

    *slen -= rounds * 48; // 48 bytes consumed per round.
    *olen += rounds * 64; // 64 bytes produced per round.

    // Temporary registers, used as scratch space.
    uint8x16_t tmp0, tmp1, tmp2, tmp3;

    // Numeric constant.
    const uint8x16_t n63 = vdupq_n_u8(63);

    __asm__ (

        // Load 48 bytes and deinterleave. The bytes are loaded to
        // hard-coded registers v12, v13 and v14, to ensure that they
        // are contiguous. Increment the source pointer.
        "ld3 {v12.16b, v13.16b, v14.16b}, [%[src]], #48 \n\t"

        // Reshuffle the bytes using temporaries.
        "ushr %[t0].16b, v12.16b, #2 \n\t"
        "ushr %[t1].16b, v13.16b, #4 \n\t"
        "ushr %[t2].16b, v14.16b, #6 \n\t"
        "sli %[t1].16b, v12.16b, #4 \n\t"
        "sli %[t2].16b, v13.16b, #2 \n\t"
        "and %[t1].16b, %[t1].16b, %[n63].16b \n\t"
        "and %[t2].16b, %[t2].16b, %[n63].16b \n\t"
        "and %[t3].16b, v14.16b, %[n63].16b \n\t"

        // Translate the values to the Base64 alphabet.
        "tbl v12.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t0].16b \n\t"
        "tbl v13.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t1].16b \n\t"
        "tbl v14.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t2].16b \n\t"
        "tbl v15.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t3].16b \n\t"

        // Store 64 bytes and interleave. Increment the dest pointer.
        "st4 {v12.16b, v13.16b, v14.16b, v15.16b}, [%[dst]], #64 \n\t"
        // Load the encoding table into v8-v11.
        " ld1 {v8.16b-v11.16b}, [%[tbl]] \n\t"

        // If there are less than eight rounds, jump to the dispatch
        // table. Otherwise, enter an unrolled block of eight
        // interleaved encoding rounds. The rounds interleave memory
        // operations (load/store) with data operations (table lookups,
        // etc) to maximize pipeline throughput.
        " cmp %[rounds], #8 \n\t"
        " b.lt 99f \n\t"
        " \n\t"
        "8: \n\t"

        // Load the registers for the first type A round.
        LOAD("v2", "v3", "v4")

        ROUND_A() // Round 1.
        ROUND_B() // Round 2.
        ROUND_A() // Round 3.
        ROUND_B() // Round 4.
        ROUND_A() // Round 5.
        ROUND_B() // Round 6.
        ROUND_A() // Round 7.

        // Round 8 is type B, but omits the load for the next step.
        SHUF("v12", "v13", "v14")
        TRAN("v12", "v13", "v14", "v15")
        STOR("v12", "v13", "v14", "v15")

        // Decrement the round counter. Re-enter the loop while the
        // number of rounds is eight or more.
        " sub %[rounds], %[rounds], #8 \n\t"
        " cmp %[rounds], #8 \n\t"
        " b.ge 8b \n\t"

        // Dispatch the remaining rounds 0..7.
        "99: cmp %[rounds], #4 \n\t"
        " b.gt 57f \n\t"
        " b.eq 4f \n\t"

        // Handle rounds 0..3.
        " cmp %[rounds], #2 \n\t"
        " b.gt 3f \n\t"
        " b.eq 2f \n\t"
        " cmp %[rounds], #1 \n\t"
        " b.eq 1f \n\t"
        " b 0f \n\t"

        // Handle rounds 5..7.
        "57: cmp %[rounds], #6 \n\t"
        " b.lt 5f \n\t"
        " b.eq 6f \n\t"

        // Block of non-interleaved encoding rounds, so that each one
        // can individually be jumped to. Rounds "fall through".
        "7: " ROUND()
        "6: " ROUND()
        "5: " ROUND()
        "4: " ROUND()
        "3: " ROUND()
        "2: " ROUND()
        "1: " ROUND()
        "0: \n\t"

        // Outputs (modified).
        : [src] "+r" (*s),
          [dst] "+r" (*o),
          [t0] "=&w" (tmp0),
          [t1] "=&w" (tmp1),
          [t2] "=&w" (tmp2),
          [t3] "=&w" (tmp3)
        : [rounds] "+r" (rounds),
          [src] "+r" (*s),
          [dst] "+r" (*o),
          [t0] "=&w" (tmp0),
          [t1] "=&w" (tmp1),
          [t2] "=&w" (tmp2),
          [t3] "=&w" (tmp3)

        // Inputs (not modified).
        : [n63] "w" (n63),
          [l0] "w" (tbl_enc.val[0]),
          [l1] "w" (tbl_enc.val[1]),
          [l2] "w" (tbl_enc.val[2]),
          [l3] "w" (tbl_enc.val[3])
        : [tbl] "r" (base64_table_enc_6bit),
          [n63] "w" (vdupq_n_u8(63))

        // Clobbers.
        : "v12", "v13", "v14", "v15"
        : "v2", "v3", "v4", "v5",
          "v8", "v9", "v10", "v11",
          "v12", "v13", "v14", "v15"
    );
}

static inline void
enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
    size_t rounds = *slen / 48;

    *slen -= rounds * 48; // 48 bytes consumed per round
    *olen += rounds * 64; // 64 bytes produced per round

    // Load the encoding table:
    const uint8x16x4_t tbl_enc = load_64byte_table(base64_table_enc_6bit);

    while (rounds > 0) {
        if (rounds >= 8) {
            enc_loop_neon64_inner(s, o, tbl_enc);
            enc_loop_neon64_inner(s, o, tbl_enc);
            enc_loop_neon64_inner(s, o, tbl_enc);
            enc_loop_neon64_inner(s, o, tbl_enc);
            enc_loop_neon64_inner(s, o, tbl_enc);
            enc_loop_neon64_inner(s, o, tbl_enc);
            enc_loop_neon64_inner(s, o, tbl_enc);
            enc_loop_neon64_inner(s, o, tbl_enc);
            rounds -= 8;
            continue;
        }
        if (rounds >= 4) {
            enc_loop_neon64_inner(s, o, tbl_enc);
            enc_loop_neon64_inner(s, o, tbl_enc);
            enc_loop_neon64_inner(s, o, tbl_enc);
            enc_loop_neon64_inner(s, o, tbl_enc);
            rounds -= 4;
            continue;
        }
        if (rounds >= 2) {
            enc_loop_neon64_inner(s, o, tbl_enc);
            enc_loop_neon64_inner(s, o, tbl_enc);
            rounds -= 2;
            continue;
        }
        enc_loop_neon64_inner(s, o, tbl_enc);
        break;
    }
}
#pragma clang diagnostic pop
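As a side note for readers less used to assembly dispatch tables: the "99:" block above implements the same control flow as a C switch whose cases fall through, running one generic round per remaining count below eight. The sketch below is illustrative only; round_once() is a hypothetical stand-in for one LOAD/SHUF/TRAN/STOR round and is not part of this commit.

    #include <stddef.h> // for size_t

    // Hypothetical placeholder for one generic encoder round:
    // load, shuffle, translate, store.
    static void round_once (void)
    {
    }

    // C rendering of the fall-through dispatch for the leftover rounds.
    // Assumes rounds < 8; larger counts are handled by the unrolled loop.
    static void encode_leftover_rounds_example (size_t rounds)
    {
        switch (rounds) {
            case 7: round_once(); // fall through
            case 6: round_once(); // fall through
            case 5: round_once(); // fall through
            case 4: round_once(); // fall through
            case 3: round_once(); // fall through
            case 2: round_once(); // fall through
            case 1: round_once(); // fall through
            case 0: break;
        }
    }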