diff --git a/lib/arch/neon64/enc_loop_asm.c b/lib/arch/neon64/enc_loop_asm.c
index 1370faff..030ed38a 100644
--- a/lib/arch/neon64/enc_loop_asm.c
+++ b/lib/arch/neon64/enc_loop_asm.c
@@ -1,103 +1,171 @@
+// Apologies in advance for combining the preprocessor with inline assembly,
+// two notoriously gnarly parts of C, but it was necessary to avoid a lot of
+// code repetition. The preprocessor is used to template large sections of
+// inline assembly that differ only in the registers used. If the code were
+// written out by hand, it would become very large and hard to audit.
+
+// Generate a block of inline assembly that loads three user-defined registers
+// V0..V2 from memory and deinterleaves them, post-incrementing the src
+// pointer. The register set should be sequential.
+#define LOAD(V0, V1, V2) \
+	"ld3 {"V0".16b, "V1".16b, "V2".16b}, [%[src]], #48 \n\t"
+
+// Generate a block of inline assembly that takes three deinterleaved registers
+// and shuffles the bytes. The output is in temporary registers t0..t3.
+#define SHUF(V0, V1, V2) \
+	"ushr %[t0].16b, "V0".16b, #2          \n\t" \
+	"ushr %[t1].16b, "V1".16b, #4          \n\t" \
+	"ushr %[t2].16b, "V2".16b, #6          \n\t" \
+	"sli  %[t1].16b, "V0".16b, #4          \n\t" \
+	"sli  %[t2].16b, "V1".16b, #2          \n\t" \
+	"and  %[t1].16b, %[t1].16b, %[n63].16b \n\t" \
+	"and  %[t2].16b, %[t2].16b, %[n63].16b \n\t" \
+	"and  %[t3].16b, "V2".16b,  %[n63].16b \n\t"
+
+// Generate a block of inline assembly that takes temporary registers t0..t3
+// and translates them to the base64 alphabet, using a table loaded into
+// v8..v11. The output is in user-defined registers V0..V3.
+#define TRAN(V0, V1, V2, V3) \
+	"tbl "V0".16b, {v8.16b-v11.16b}, %[t0].16b \n\t" \
+	"tbl "V1".16b, {v8.16b-v11.16b}, %[t1].16b \n\t" \
+	"tbl "V2".16b, {v8.16b-v11.16b}, %[t2].16b \n\t" \
+	"tbl "V3".16b, {v8.16b-v11.16b}, %[t3].16b \n\t"
+
+// Generate a block of inline assembly that interleaves four registers and
+// stores them, post-incrementing the destination pointer.
+#define STOR(V0, V1, V2, V3) \
+	"st4 {"V0".16b, "V1".16b, "V2".16b, "V3".16b}, [%[dst]], #64 \n\t"
+
+// Generate a block of inline assembly for a single self-contained encoder
+// round: fetch the data, process it, and store the result.
+#define ROUND() \
+	LOAD("v12", "v13", "v14") \
+	SHUF("v12", "v13", "v14") \
+	TRAN("v12", "v13", "v14", "v15") \
+	STOR("v12", "v13", "v14", "v15")
+
+// Generate a block of assembly for a type A interleaved encoder round: it
+// uses registers that were fetched by the previous type B round, and fetches
+// registers for the next type B round.
+#define ROUND_A() \
+	SHUF("v2",  "v3",  "v4") \
+	LOAD("v12", "v13", "v14") \
+	TRAN("v2",  "v3",  "v4", "v5") \
+	STOR("v2",  "v3",  "v4", "v5")
+
+// Type B interleaved encoder round. Same as type A, but with the register
+// sets swapped.
+#define ROUND_B() \
+	SHUF("v12", "v13", "v14") \
+	LOAD("v2",  "v3",  "v4") \
+	TRAN("v12", "v13", "v14", "v15") \
+	STOR("v12", "v13", "v14", "v15")
+
+// Suppress clang's warning that the literal string in the asm statement is
+// overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99
+// compilers). It may be true, but the goal here is not C99 portability.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Woverlength-strings"
+
 static inline void
-enc_loop_neon64_inner (const uint8_t **s, uint8_t **o, const uint8x16x4_t tbl_enc)
+enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
 {
-	// This function duplicates the functionality of enc_loop_neon64_inner,
-	// but entirely with inline assembly. This gives a significant speedup
-	// over using NEON intrinsics, which do not always generate very good
-	// code. The logic of the assembly is directly lifted from the
-	// intrinsics version, so it can be used as a guide to this code.
+	size_t rounds = *slen / 48;
+
+	if (rounds == 0) {
+		return;
+	}
+
+	*slen -= rounds * 48;	// 48 bytes consumed per round.
+	*olen += rounds * 64;	// 64 bytes produced per round.
+
+	// Number of times to go through the 8x loop.
+	size_t loop = rounds / 8;
+
+	// Number of rounds remaining after the 8x loop.
+	rounds %= 8;
 
 	// Temporary registers, used as scratch space.
 	uint8x16_t tmp0, tmp1, tmp2, tmp3;
 
-	// Numeric constant.
-	const uint8x16_t n63 = vdupq_n_u8(63);
-
 	__asm__ (
-		// Load 48 bytes and deinterleave. The bytes are loaded to
-		// hard-coded registers v12, v13 and v14, to ensure that they
-		// are contiguous. Increment the source pointer.
-		"ld3 {v12.16b, v13.16b, v14.16b}, [%[src]], #48 \n\t"
-
-		// Reshuffle the bytes using temporaries.
-		"ushr %[t0].16b, v12.16b, #2          \n\t"
-		"ushr %[t1].16b, v13.16b, #4          \n\t"
-		"ushr %[t2].16b, v14.16b, #6          \n\t"
-		"sli  %[t1].16b, v12.16b, #4          \n\t"
-		"sli  %[t2].16b, v13.16b, #2          \n\t"
-		"and  %[t1].16b, %[t1].16b, %[n63].16b \n\t"
-		"and  %[t2].16b, %[t2].16b, %[n63].16b \n\t"
-		"and  %[t3].16b, v14.16b,  %[n63].16b \n\t"
-
-		// Translate the values to the Base64 alphabet.
-		"tbl v12.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t0].16b \n\t"
-		"tbl v13.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t1].16b \n\t"
-		"tbl v14.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t2].16b \n\t"
-		"tbl v15.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t3].16b \n\t"
-
-		// Store 64 bytes and interleave. Increment the dest pointer.
-		"st4 {v12.16b, v13.16b, v14.16b, v15.16b}, [%[dst]], #64 \n\t"
+		// Load the encoding table into v8..v11.
+		"     ld1 {v8.16b-v11.16b}, [%[tbl]] \n\t"
+
+		// If there are fewer than eight rounds, jump to the dispatch
+		// table. Otherwise, enter an 8x unrolled loop of interleaved
+		// encoding rounds. The rounds interleave memory operations
+		// (load/store) with data operations (table lookups, etc.) to
+		// maximize pipeline throughput.
+		"     cbz %[loop], 99f \n\t"
+
+		// Load the registers for the first type A round.
+		"8:   " LOAD("v2", "v3", "v4")
+
+		ROUND_A()	// Round 1.
+		ROUND_B()	// Round 2.
+		ROUND_A()	// Round 3.
+		ROUND_B()	// Round 4.
+		ROUND_A()	// Round 5.
+		ROUND_B()	// Round 6.
+		ROUND_A()	// Round 7.
+
+		// Round 8 is type B, but omits the load for the next round.
+		SHUF("v12", "v13", "v14")
+		TRAN("v12", "v13", "v14", "v15")
+		STOR("v12", "v13", "v14", "v15")
+
+		// Decrement the loop counter until it reaches zero.
+		"     subs %[loop], %[loop], #1 \n\t"
+		"     b.ne 8b \n\t"
+
+		// Dispatch the remaining rounds 0..7.
+		"99:  cmp  %[rounds], #4 \n\t"
+		"     b.gt 57f \n\t"
+		"     b.eq 4f \n\t"
+
+		// Handle rounds 0..3.
+		"     cmp  %[rounds], #2 \n\t"
+		"     b.gt 3f \n\t"
+		"     b.eq 2f \n\t"
+		"     cbnz %[rounds], 1f \n\t"
+		"     b    0f \n\t"
+
+		// Handle rounds 5..7.
+		"57:  cmp  %[rounds], #6 \n\t"
+		"     b.lt 5f \n\t"
+		"     b.eq 6f \n\t"
+
+		// Block of non-interleaved encoding rounds, each of which can
+		// be jumped to individually. Rounds fall through to the next.
+		"7:   " ROUND()
+		"6:   " ROUND()
+		"5:   " ROUND()
+		"4:   " ROUND()
+		"3:   " ROUND()
+		"2:   " ROUND()
+		"1:   " ROUND()
+		"0:   \n\t"
 
 		// Outputs (modified).
-		: [src] "+r" (*s),
-		  [dst] "+r" (*o),
-		  [t0]  "=&w" (tmp0),
-		  [t1]  "=&w" (tmp1),
-		  [t2]  "=&w" (tmp2),
-		  [t3]  "=&w" (tmp3)
+		: [loop] "+r"  (loop),
+		  [src]  "+r"  (*s),
+		  [dst]  "+r"  (*o),
+		  [t0]   "=&w" (tmp0),
+		  [t1]   "=&w" (tmp1),
+		  [t2]   "=&w" (tmp2),
+		  [t3]   "=&w" (tmp3)
 
 		// Inputs (not modified).
-		: [n63] "w" (n63),
-		  [l0]  "w" (tbl_enc.val[0]),
-		  [l1]  "w" (tbl_enc.val[1]),
-		  [l2]  "w" (tbl_enc.val[2]),
-		  [l3]  "w" (tbl_enc.val[3])
+		: [rounds] "r" (rounds),
+		  [tbl]    "r" (base64_table_enc_6bit),
+		  [n63]    "w" (vdupq_n_u8(63))
 
 		// Clobbers.
-		: "v12", "v13", "v14", "v15"
+		: "v2",  "v3",  "v4",  "v5",
+		  "v8",  "v9",  "v10", "v11",
+		  "v12", "v13", "v14", "v15"
 	);
 }
 
-static inline void
-enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
-{
-	size_t rounds = *slen / 48;
-
-	*slen -= rounds * 48;	// 48 bytes consumed per round
-	*olen += rounds * 64;	// 64 bytes produced per round
-
-	// Load the encoding table:
-	const uint8x16x4_t tbl_enc = load_64byte_table(base64_table_enc_6bit);
-
-	while (rounds > 0) {
-		if (rounds >= 8) {
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			rounds -= 8;
-			continue;
-		}
-		if (rounds >= 4) {
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			rounds -= 4;
-			continue;
-		}
-		if (rounds >= 2) {
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			rounds -= 2;
-			continue;
-		}
-		enc_loop_neon64_inner(s, o, tbl_enc);
-		break;
-	}
-}
+#pragma GCC diagnostic pop