diff --git a/lib/arch/neon64/enc_loop_asm.c b/lib/arch/neon64/enc_loop_asm.c
index 1370faff..030ed38a 100644
--- a/lib/arch/neon64/enc_loop_asm.c
+++ b/lib/arch/neon64/enc_loop_asm.c
@@ -1,103 +1,171 @@
+// Apologies in advance for combining the preprocessor with inline assembly,
+// two notoriously gnarly parts of C, but it was necessary to avoid a lot of
+// code repetition. The preprocessor is used to template large sections of
+// inline assembly that differ only in the registers used. If the code were
+// written out by hand, it would become very large and hard to audit.
+
+// Generate a block of inline assembly that loads three user-defined registers
+// V0..V2 from memory and deinterleaves them, post-incrementing the src
+// pointer. The register set should be sequential.
+#define LOAD(V0, V1, V2) \
+	"ld3 {"V0".16b, "V1".16b, "V2".16b}, [%[src]], #48 \n\t"
+
+// Generate a block of inline assembly that takes three deinterleaved registers
+// and shuffles the bytes. The output is in temporary registers t0..t3.
+#define SHUF(V0, V1, V2) \
+	"ushr %[t0].16b, "V0".16b, #2          \n\t" \
+	"ushr %[t1].16b, "V1".16b, #4          \n\t" \
+	"ushr %[t2].16b, "V2".16b, #6          \n\t" \
+	"sli  %[t1].16b, "V0".16b, #4          \n\t" \
+	"sli  %[t2].16b, "V1".16b, #2          \n\t" \
+	"and  %[t1].16b, %[t1].16b, %[n63].16b \n\t" \
+	"and  %[t2].16b, %[t2].16b, %[n63].16b \n\t" \
+	"and  %[t3].16b, "V2".16b,  %[n63].16b \n\t"
+
+// Generate a block of inline assembly that takes temporary registers t0..t3
+// and translates them to the base64 alphabet, using a table loaded into
+// v8..v11. The output is in user-defined registers V0..V3.
+#define TRAN(V0, V1, V2, V3) \
+	"tbl "V0".16b, {v8.16b-v11.16b}, %[t0].16b \n\t" \
+	"tbl "V1".16b, {v8.16b-v11.16b}, %[t1].16b \n\t" \
+	"tbl "V2".16b, {v8.16b-v11.16b}, %[t2].16b \n\t" \
+	"tbl "V3".16b, {v8.16b-v11.16b}, %[t3].16b \n\t"
+
+// Generate a block of inline assembly that interleaves four registers and
+// stores them, post-incrementing the destination pointer.
+#define STOR(V0, V1, V2, V3) \
+	"st4 {"V0".16b, "V1".16b, "V2".16b, "V3".16b}, [%[dst]], #64 \n\t"
+
+// Generate a block of inline assembly for a single self-contained encoder
+// round: fetch the data, process it, and store the result.
+#define ROUND() \
+	LOAD("v12", "v13", "v14") \
+	SHUF("v12", "v13", "v14") \
+	TRAN("v12", "v13", "v14", "v15") \
+	STOR("v12", "v13", "v14", "v15")
+
+// Generate a block of assembly for a type A interleaved encoder round: it
+// uses registers that were fetched by the previous type B round, and fetches
+// registers for the next type B round.
+#define ROUND_A() \
+	SHUF("v2",  "v3",  "v4") \
+	LOAD("v12", "v13", "v14") \
+	TRAN("v2",  "v3",  "v4", "v5") \
+	STOR("v2",  "v3",  "v4", "v5")
+
+// Type B interleaved encoder round. Same as type A, but with the register
+// sets swapped.
+#define ROUND_B() \
+	SHUF("v12", "v13", "v14") \
+	LOAD("v2",  "v3",  "v4") \
+	TRAN("v12", "v13", "v14", "v15") \
+	STOR("v12", "v13", "v14", "v15")
+
+// Suppress clang's warning that the literal string in the asm statement is
+// overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99
+// compilers). It may be true, but the goal here is not C99 portability.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Woverlength-strings"
+
 static inline void
-enc_loop_neon64_inner (const uint8_t **s, uint8_t **o, const uint8x16x4_t tbl_enc)
+enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
 {
-	// This function duplicates the functionality of enc_loop_neon64_inner,
-	// but entirely with inline assembly. This gives a significant speedup
-	// over using NEON intrinsics, which do not always generate very good
-	// code. The logic of the assembly is directly lifted from the
-	// intrinsics version, so it can be used as a guide to this code.
+	size_t rounds = *slen / 48;
+
+	if (rounds == 0) {
+		return;
+	}
+
+	*slen -= rounds * 48;	// 48 bytes consumed per round.
+	*olen += rounds * 64;	// 64 bytes produced per round.
+
+	// Number of times to go through the 8x loop.
+	size_t loop = rounds / 8;
+
+	// Number of rounds remaining after the 8x loop.
+	rounds %= 8;
 
 	// Temporary registers, used as scratch space.
 	uint8x16_t tmp0, tmp1, tmp2, tmp3;
 
-	// Numeric constant.
-	const uint8x16_t n63 = vdupq_n_u8(63);
-
 	__asm__ (
-		// Load 48 bytes and deinterleave. The bytes are loaded to
-		// hard-coded registers v12, v13 and v14, to ensure that they
-		// are contiguous. Increment the source pointer.
-		"ld3 {v12.16b, v13.16b, v14.16b}, [%[src]], #48 \n\t"
-
-		// Reshuffle the bytes using temporaries.
-		"ushr %[t0].16b, v12.16b, #2          \n\t"
-		"ushr %[t1].16b, v13.16b, #4          \n\t"
-		"ushr %[t2].16b, v14.16b, #6          \n\t"
-		"sli  %[t1].16b, v12.16b, #4          \n\t"
-		"sli  %[t2].16b, v13.16b, #2          \n\t"
-		"and  %[t1].16b, %[t1].16b, %[n63].16b \n\t"
-		"and  %[t2].16b, %[t2].16b, %[n63].16b \n\t"
-		"and  %[t3].16b, v14.16b,  %[n63].16b \n\t"
-
-		// Translate the values to the Base64 alphabet.
-		"tbl v12.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t0].16b \n\t"
-		"tbl v13.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t1].16b \n\t"
-		"tbl v14.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t2].16b \n\t"
-		"tbl v15.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t3].16b \n\t"
-
-		// Store 64 bytes and interleave. Increment the dest pointer.
-		"st4 {v12.16b, v13.16b, v14.16b, v15.16b}, [%[dst]], #64 \n\t"
+		// Load the encoding table into v8..v11.
+		"     ld1 {v8.16b-v11.16b}, [%[tbl]] \n\t"
+
+		// If there are fewer than eight rounds, jump to the dispatch
+		// table. Otherwise, enter an 8x unrolled loop of interleaved
+		// encoding rounds. The rounds interleave memory operations
+		// (load/store) with data operations (table lookups, etc.) to
+		// maximize pipeline throughput.
+		"     cbz %[loop], 99f \n\t"
+
+		// Load the registers for the first type A round.
+		"8:   " LOAD("v2", "v3", "v4")
+
+		ROUND_A()	// Round 1.
+		ROUND_B()	// Round 2.
+		ROUND_A()	// Round 3.
+		ROUND_B()	// Round 4.
+		ROUND_A()	// Round 5.
+		ROUND_B()	// Round 6.
+		ROUND_A()	// Round 7.
+
+		// Round 8 is type B, but omits the load for the next round.
+		SHUF("v12", "v13", "v14")
+		TRAN("v12", "v13", "v14", "v15")
+		STOR("v12", "v13", "v14", "v15")
+
+		// Decrement the loop counter until it reaches zero.
+		"     subs %[loop], %[loop], #1 \n\t"
+		"     b.ne 8b \n\t"
+
+		// Dispatch the remaining rounds 0..7.
+		"99:  cmp  %[rounds], #4 \n\t"
+		"     b.gt 57f \n\t"
+		"     b.eq 4f \n\t"
+
+		// Handle rounds 0..3.
+		"     cmp  %[rounds], #2 \n\t"
+		"     b.gt 3f \n\t"
+		"     b.eq 2f \n\t"
+		"     cbnz %[rounds], 1f \n\t"
+		"     b    0f \n\t"
+
+		// Handle rounds 5..7.
+		"57:  cmp  %[rounds], #6 \n\t"
+		"     b.lt 5f \n\t"
+		"     b.eq 6f \n\t"
+
+		// Block of non-interleaved encoding rounds, each of which can
+		// be jumped to individually. Rounds fall through to the next.
+		"7:   " ROUND()
+		"6:   " ROUND()
+		"5:   " ROUND()
+		"4:   " ROUND()
+		"3:   " ROUND()
+		"2:   " ROUND()
+		"1:   " ROUND()
+		"0:   \n\t"
 
 		// Outputs (modified).
-		: [src] "+r" (*s),
-		  [dst] "+r" (*o),
-		  [t0]  "=&w" (tmp0),
-		  [t1]  "=&w" (tmp1),
-		  [t2]  "=&w" (tmp2),
-		  [t3]  "=&w" (tmp3)
+		: [loop] "+r"  (loop),
+		  [src]  "+r"  (*s),
+		  [dst]  "+r"  (*o),
+		  [t0]   "=&w" (tmp0),
+		  [t1]   "=&w" (tmp1),
+		  [t2]   "=&w" (tmp2),
+		  [t3]   "=&w" (tmp3)
 
 		// Inputs (not modified).
-		: [n63] "w" (n63),
-		  [l0]  "w" (tbl_enc.val[0]),
-		  [l1]  "w" (tbl_enc.val[1]),
-		  [l2]  "w" (tbl_enc.val[2]),
-		  [l3]  "w" (tbl_enc.val[3])
+		: [rounds] "r" (rounds),
+		  [tbl]    "r" (base64_table_enc_6bit),
+		  [n63]    "w" (vdupq_n_u8(63))
 
 		// Clobbers.
-		: "v12", "v13", "v14", "v15"
+		: "v2",  "v3",  "v4",  "v5",
+		  "v8",  "v9",  "v10", "v11",
+		  "v12", "v13", "v14", "v15"
 	);
 }
 
-static inline void
-enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
-{
-	size_t rounds = *slen / 48;
-
-	*slen -= rounds * 48;	// 48 bytes consumed per round
-	*olen += rounds * 64;	// 64 bytes produced per round
-
-	// Load the encoding table:
-	const uint8x16x4_t tbl_enc = load_64byte_table(base64_table_enc_6bit);
-
-	while (rounds > 0) {
-		if (rounds >= 8) {
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			rounds -= 8;
-			continue;
-		}
-		if (rounds >= 4) {
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			rounds -= 4;
-			continue;
-		}
-		if (rounds >= 2) {
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			rounds -= 2;
-			continue;
-		}
-		enc_loop_neon64_inner(s, o, tbl_enc);
-		break;
-	}
-}
+#pragma GCC diagnostic pop