diff --git a/src/asm/field_5x64_aarch64.s b/src/asm/field_5x64_aarch64.s index 13fa1a33d0..d89f6cf545 100644 --- a/src/asm/field_5x64_aarch64.s +++ b/src/asm/field_5x64_aarch64.s @@ -3,358 +3,891 @@ .text +/* Aarch64 assembly modules, created by disassembling the + output of high level c function written by Kaushik */ + .p2align 4 - .global secp256k1_fe_mul_inner - .type secp256k1_fe_mul_inner, %function -secp256k1_fe_mul_inner: - stp x29, x30, [sp, #-48]! - mov x5, #0x3d1 // #977 - movk x5, #0x1, lsl #32 + .global secp256k1_fe_mul_45to5 + .type secp256k1_fe_mul_45to5, %function +secp256k1_fe_mul_45to5: + stp x29, x30, [sp, #-16]! + mov x8, #0x3d1 // #977 + movk x8, #0x1, lsl #32 mov x29, sp - ldp x12, x3, [x1] - stp x19, x20, [sp, #16] - ldr x4, [x1, #32] - ldp x10, x14, [x1, #16] - mul x6, x4, x5 - umulh x4, x4, x5 - adds x12, x12, x6 - cset x6, cs // cs = hs, nlast - adds x4, x4, x3 - cset x3, cs // cs = hs, nlast + ldp x7, x3, [x2, #24] + ldp x6, x5, [x2] + ldp x10, x9, [x1] + mul x4, x3, x8 + umulh x3, x3, x8 adds x4, x4, x6 cinc x3, x3, cs // cs = hs, nlast - adds x10, x10, x3 - cset x1, cs // cs = hs, nlast - adds x14, x14, x1 - csetm x1, cs // cs = hs, nlast - and x1, x1, x5 - adds x12, x1, x12 - cset x1, cs // cs = hs, nlast - adds x4, x4, x1 - ldp x15, x9, [x2, #24] - cset x1, cs // cs = hs, nlast - adds x10, x10, x1 - ldp x11, x1, [x2] + ldr x6, [x2, #16] + adds x5, x5, x3 + cset x3, cs // cs = hs, nlast + adds x6, x6, x3 + cset x2, cs // cs = hs, nlast + adds x7, x7, x2 + cset x2, cs // cs = hs, nlast + mul x18, x9, x6 + ldp x12, x15, [x1, #16] + mul x14, x9, x7 + mul x11, x2, x8 + umulh x2, x2, x8 + adds x11, x11, x4 + umulh x4, x10, x7 + adc x5, x5, x2 + umulh x2, x9, x6 + mul x3, x12, x6 + mul x17, x10, x7 + adds x2, x2, x3 + mul x13, x15, x5 + umulh x1, x12, x5 + cset x3, cs // cs = hs, nlast + adds x4, x4, x14 + cset x14, cs // cs = hs, nlast + adds x2, x2, x4 + adc x3, x3, x14 + umulh x4, x15, x11 + adds x1, x1, x13 + mul x14, x10, x11 cset x13, cs // cs = hs, nlast - mul x3, x9, x5 - umulh x9, x9, x5 - adds x11, x11, x3 + adds x2, x2, x1 + adc x3, x3, x13 + adds x2, x2, x4 + cinc x3, x3, cs // cs = hs, nlast + mul x13, x15, x6 + umulh x1, x12, x6 + mul x4, x2, x8 + umulh x2, x2, x8 + madd x2, x3, x8, x2 + adds x4, x4, x14 + umulh x3, x9, x7 + str x4, [x0] + mul x14, x12, x7 + cinc x2, x2, cs // cs = hs, nlast + adds x1, x1, x13 + umulh x13, x15, x5 + cset x4, cs // cs = hs, nlast + adds x3, x3, x14 + cset x14, cs // cs = hs, nlast + adds x1, x1, x3 + adc x4, x4, x14 + adds x1, x1, x13 + cinc x4, x4, cs // cs = hs, nlast + umulh x3, x15, x7 + mul x13, x5, x10 + umulh x14, x1, x8 + mul x1, x1, x8 + madd x14, x4, x8, x14 + adds x1, x1, x13 + umulh x4, x10, x11 + mul x13, x9, x11 + cinc x14, x14, cs // cs = hs, nlast + adds x16, x4, x13 + mul x4, x3, x8 + umulh x13, x3, x8 + cset x30, cs // cs = hs, nlast + adds x3, x1, x16 + umulh x16, x10, x6 + adc x14, x14, x30 + adds x4, x4, x17 + umulh x1, x12, x11 + cinc x13, x13, cs // cs = hs, nlast + mul x17, x15, x11 + adds x16, x16, x18 + cset x30, cs // cs = hs, nlast + adds x4, x4, x16 + mul x18, x12, x5 + adc x13, x13, x30 + umulh x16, x9, x5 + adds x17, x1, x17 + cset x30, cs // cs = hs, nlast + adds x4, x4, x17 + umulh x1, x12, x7 + adc x13, x13, x30 + mul x7, x15, x7 + adds x16, x16, x18 + umulh x15, x15, x6 + cset x17, cs // cs = hs, nlast + adds x4, x4, x16 + umulh x16, x9, x11 + adc x13, x13, x17 + adds x1, x1, x7 + cset x7, cs // cs = hs, nlast + adds x1, x1, x15 + cinc x15, x7, cs // cs = hs, nlast + mul x12, x12, x11 + 
mul x6, x10, x6 + umulh x7, x1, x8 + mul x1, x1, x8 + madd x7, x15, x8, x7 + adds x1, x1, x16 + mul x9, x9, x5 + cinc x7, x7, cs // cs = hs, nlast + umulh x5, x5, x10 + adds x6, x6, x12 + cset x8, cs // cs = hs, nlast + adds x1, x1, x6 + adc x7, x7, x8 + adds x5, x5, x9 cset x6, cs // cs = hs, nlast - adds x9, x9, x1 - ldr x3, [x2, #16] + adds x1, x1, x5 + adc x7, x7, x6 + adds x2, x2, x3 + cinc x14, x14, cs // cs = hs, nlast + adds x1, x1, x14 + stp x2, x1, [x0, #8] + cinc x7, x7, cs // cs = hs, nlast + adds x4, x4, x7 + cinc x13, x13, cs // cs = hs, nlast + stp x4, x13, [x0, #24] + ldp x29, x30, [sp], #16 + ret + .size secp256k1_fe_mul_45to5, .-secp256k1_fe_mul_45to5 + + .p2align 4 + .global secp256k1_fe_mul_55to5 + .type secp256k1_fe_mul_55to5, %function +secp256k1_fe_mul_55to5: + stp x29, x30, [sp, #-32]! + mov x9, #0x3d1 // #977 + movk x9, #0x1, lsl #32 + mov x29, sp + ldp x5, x10, [x1] + stp x19, x20, [sp, #16] + ldr x3, [x1, #32] + ldp x12, x8, [x1, #16] + mul x4, x3, x9 + umulh x3, x3, x9 + adds x4, x4, x5 + cinc x3, x3, cs // cs = hs, nlast + adds x10, x10, x3 + cset x3, cs // cs = hs, nlast + adds x12, x12, x3 cset x1, cs // cs = hs, nlast - adds x9, x9, x6 - str x21, [sp, #32] + adds x8, x8, x1 + cset x3, cs // cs = hs, nlast + ldp x7, x11, [x2] + ldr x1, [x2, #32] + mul x6, x3, x9 + umulh x5, x3, x9 + adds x6, x6, x4 + mul x3, x1, x9 + adc x10, x10, x5 + umulh x1, x1, x9 + adds x3, x3, x7 cinc x1, x1, cs // cs = hs, nlast - adds x3, x3, x1 - cset x8, cs // cs = hs, nlast - adds x8, x15, x8 - csetm x1, cs // cs = hs, nlast - and x1, x1, x5 - adds x11, x1, x11 + ldr x13, [x2, #16] + adds x11, x11, x1 + cset x1, cs // cs = hs, nlast + ldr x7, [x2, #24] + adds x13, x13, x1 cset x1, cs // cs = hs, nlast - adds x9, x9, x1 - cset x2, cs // cs = hs, nlast - umulh x1, x12, x11 - adds x3, x3, x2 - mul x6, x4, x11 - mul x2, x9, x12 - cset x15, cs // cs = hs, nlast - umulh x7, x9, x12 - adds x1, x1, x2 - umulh x2, x4, x11 - cinc x7, x7, cs // cs = hs, nlast - adds x6, x1, x6 - mul x16, x12, x3 - cinc x1, x2, cs // cs = hs, nlast adds x7, x7, x1 - umulh x1, x12, x3 - cset x2, cs // cs = hs, nlast - adds x7, x7, x16 - mul x17, x9, x4 - cinc x1, x1, cs // cs = hs, nlast - adds x2, x2, x1 - umulh x1, x9, x4 - cset x16, cs // cs = hs, nlast - adds x7, x7, x17 - cinc x1, x1, cs // cs = hs, nlast - mul x17, x10, x11 - adds x2, x2, x1 - umulh x1, x10, x11 - cinc x19, x16, cs // cs = hs, nlast - adds x7, x7, x17 - cinc x1, x1, cs // cs = hs, nlast - mul x21, x4, x3 - adds x2, x2, x1 - umulh x1, x4, x3 - cset x16, cs // cs = hs, nlast - adds x15, x15, x8 - mul x20, x9, x10 - umulh x17, x9, x10 - mul x8, x12, x15 - umulh x18, x12, x15 - adds x2, x2, x8 - mul x30, x4, x15 - cinc x18, x18, cs // cs = hs, nlast - adds x2, x2, x21 - add x16, x16, x18 - cinc x1, x1, cs // cs = hs, nlast - add x16, x16, x19 - umulh x19, x4, x15 - adds x1, x16, x1 - mul x12, x12, x11 - cset x4, cs // cs = hs, nlast - cmp x18, x16 - cinc x8, x4, hi // hi = pmore - adds x2, x2, x20 - cinc x4, x17, cs // cs = hs, nlast - mul x18, x10, x3 - adds x1, x1, x4 - umulh x17, x10, x3 - cset x4, cs // cs = hs, nlast - adds x13, x13, x14 - mul x16, x10, x15 - umulh x10, x10, x15 - mul x14, x11, x13 - umulh x11, x11, x13 + cset x1, cs // cs = hs, nlast + mul x19, x12, x13 + umulh x14, x10, x13 + mul x18, x10, x7 + mul x5, x1, x9 + umulh x1, x1, x9 + adds x5, x5, x3 + umulh x17, x7, x6 + adc x11, x11, x1 + mul x16, x7, x12 + umulh x1, x12, x13 + umulh x2, x8, x5 + umulh x4, x11, x12 + mul x15, x11, x8 + adds x2, x2, x19 + cset x3, cs // 
cs = hs, nlast + mul x19, x6, x5 + adds x15, x4, x15 + umulh x4, x11, x8 + cset x20, cs // cs = hs, nlast + adds x2, x2, x15 + adc x3, x3, x20 + adds x14, x14, x18 + cset x15, cs // cs = hs, nlast adds x2, x2, x14 - mul x20, x9, x13 - cinc x11, x11, cs // cs = hs, nlast - umulh x21, x9, x13 - adds x1, x1, x11 - mul x11, x3, x13 - adc x4, x8, x4 - adds x1, x1, x30 - cinc x19, x19, cs // cs = hs, nlast - umulh x3, x3, x13 - adds x4, x4, x19 - umulh x9, x15, x13 - cset x8, cs // cs = hs, nlast - adds x1, x1, x18 - cinc x17, x17, cs // cs = hs, nlast - mul x13, x15, x13 - adds x4, x4, x17 - cinc x14, x8, cs // cs = hs, nlast - adds x1, x1, x20 - cinc x21, x21, cs // cs = hs, nlast - adds x4, x4, x21 - cset x8, cs // cs = hs, nlast - adds x4, x4, x16 - cinc x10, x10, cs // cs = hs, nlast - adds x4, x4, x11 - add x8, x8, x10 + adc x3, x3, x15 + adds x2, x2, x17 cinc x3, x3, cs // cs = hs, nlast - add x8, x8, x14 - mul x11, x1, x5 - adds x3, x8, x3 - umulh x1, x1, x5 + mul x18, x13, x8 + umulh x17, x10, x7 + mul x14, x2, x9 + umulh x2, x2, x9 + madd x2, x3, x9, x2 + adds x14, x14, x19 + str x14, [x0] + mul x15, x10, x5 + cinc x2, x2, cs // cs = hs, nlast + adds x1, x1, x16 + cset x3, cs // cs = hs, nlast + adds x4, x4, x18 cset x14, cs // cs = hs, nlast - cmp x10, x8 - cinc x9, x9, hi // hi = pmore - adds x3, x3, x13 - adc x9, x9, x14 - mul x8, x4, x5 - adds x11, x11, x12 - umulh x4, x4, x5 - cinc x1, x1, cs // cs = hs, nlast - mul x10, x3, x5 + adds x1, x1, x4 + adc x4, x3, x14 + adds x1, x1, x17 + cinc x4, x4, cs // cs = hs, nlast + mul x17, x11, x6 + umulh x3, x6, x5 + umulh x14, x1, x9 + mul x1, x1, x9 + madd x14, x4, x9, x14 + umulh x16, x7, x8 + adds x1, x1, x15 + umulh x4, x12, x5 + cinc x14, x14, cs // cs = hs, nlast + mul x15, x8, x5 + adds x3, x3, x17 + cset x18, cs // cs = hs, nlast + mul x17, x7, x6 + adds x3, x1, x3 + umulh x1, x13, x6 + adc x14, x14, x18 + adds x4, x4, x15 + mul x19, x11, x12 + cset x15, cs // cs = hs, nlast + adds x18, x1, x17 + mul x1, x16, x9 + umulh x17, x16, x9 + cset x20, cs // cs = hs, nlast + adds x4, x4, x18 + mul x30, x10, x13 + umulh x16, x10, x11 + adc x15, x15, x20 + adds x18, x1, x19 + umulh x1, x13, x8 + cinc x17, x17, cs // cs = hs, nlast + adds x4, x4, x18 + mul x8, x7, x8 + adc x15, x15, x17 + adds x16, x16, x30 + umulh x7, x7, x12 + cset x17, cs // cs = hs, nlast + adds x4, x4, x16 + adc x15, x15, x17 adds x1, x1, x8 - umulh x3, x3, x5 + cset x8, cs // cs = hs, nlast + adds x1, x1, x7 + cinc x8, x8, cs // cs = hs, nlast + mul x13, x13, x6 + mul x12, x12, x5 + umulh x7, x1, x9 + mul x1, x1, x9 + madd x7, x8, x9, x7 + umulh x6, x11, x6 + adds x1, x1, x13 + umulh x5, x10, x5 + cinc x7, x7, cs // cs = hs, nlast + mul x10, x10, x11 + adds x6, x6, x12 cset x8, cs // cs = hs, nlast adds x1, x1, x6 - adc x4, x8, x4 - mul x8, x9, x5 - adds x4, x4, x10 - umulh x5, x9, x5 + adc x7, x7, x8 + adds x5, x5, x10 cset x6, cs // cs = hs, nlast + adds x1, x1, x5 + adc x7, x7, x6 + adds x2, x2, x3 + cinc x14, x14, cs // cs = hs, nlast + adds x1, x1, x14 + stp x2, x1, [x0, #8] + cinc x7, x7, cs // cs = hs, nlast adds x4, x4, x7 - adc x3, x6, x3 - stp x11, x1, [x0] - adds x3, x3, x8 - cset x9, cs // cs = hs, nlast - adds x2, x3, x2 - adc x5, x9, x5 - stp x4, x2, [x0, #16] - str x5, [x0, #32] + cinc x15, x15, cs // cs = hs, nlast + stp x4, x15, [x0, #24] ldp x19, x20, [sp, #16] - ldr x21, [sp, #32] - ldp x29, x30, [sp], #48 + ldp x29, x30, [sp], #32 ret - .size secp256k1_fe_mul_inner, .-secp256k1_fe_mul_inner + .size secp256k1_fe_mul_55to5, .-secp256k1_fe_mul_55to5 
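The suffix in these function names appears to encode the limb counts of the inputs and the output (for example, secp256k1_fe_mul_55to5 multiplies two 5-limb inputs and returns a 5-limb result, per the comments in the x86_64 files). The step all of the *5to5 variants share is folding the 64-bit fifth limb back into the lower four limbs using 2^256 == 0x1000003D1 (mod p). A minimal C sketch of that fold, assuming a compiler with unsigned __int128 and using names of my own choosing (this is not the code the assembly was generated from):

#include <stdint.h>

/* Fold a 5-limb value (with a 64-bit fifth limb) into 4 limbs, using
 * 2^256 == 0x1000003D1 (mod p).  The result is congruent to the input
 * modulo p and fits in four 64-bit limbs. */
static void fe_fold_5to4(uint64_t r[4], const uint64_t a[5]) {
    const uint64_t C = 0x1000003D1ULL;          /* 2^256 mod p */
    unsigned __int128 t = (unsigned __int128)a[4] * C;
    int i;

    for (i = 0; i < 4; i++) {                   /* add a[4]*C into a[0..3] */
        t += a[i];
        r[i] = (uint64_t)t;
        t >>= 64;
    }
    if ((uint64_t)t) {                          /* carry out of limb 3 is another 2^256, i.e. another C */
        t = (unsigned __int128)r[0] + C;
        for (i = 0; i < 3; i++) {
            r[i] = (uint64_t)t;
            t = (t >> 64) + r[i + 1];
        }
        /* When the first pass carried, the folded value is below 2^97,
         * so this addition cannot carry out of limb 3 again. */
        r[3] = (uint64_t)t;
    }
}

The assembly performs the same fold with mul/umulh (or mulx) and explicit carry chains instead of 128-bit arithmetic, and interleaves it with the multiplication.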
.p2align 4 - .global secp256k1_fe_sqr_inner - .type secp256k1_fe_sqr_inner, %function -secp256k1_fe_sqr_inner: - stp x29, x30, [sp, #-32]! - mov x5, #0x3d1 // #977 - movk x5, #0x1, lsl #32 + .global secp256k1_fe_sqr_5to5 + .type secp256k1_fe_sqr_5to5, %function +secp256k1_fe_sqr_5to5: + stp x29, x30, [sp, #-16]! + mov x6, #0x3d1 // #977 + movk x6, #0x1, lsl #32 mov x29, sp - ldp x3, x4, [x1] + ldp x5, x4, [x1] ldr x2, [x1, #32] - ldp x10, x9, [x1, #16] - str x19, [sp, #16] - mul x6, x2, x5 - umulh x2, x2, x5 - adds x3, x3, x6 - cset x6, cs // cs = hs, nlast - adds x2, x2, x4 - cset x4, cs // cs = hs, nlast - adds x2, x2, x6 - cinc x4, x4, cs // cs = hs, nlast - adds x10, x10, x4 - cset x11, cs // cs = hs, nlast - adds x9, x9, x11 - csetm x1, cs // cs = hs, nlast - and x1, x1, x5 - adds x3, x1, x3 + mul x3, x2, x6 + umulh x2, x2, x6 + adds x5, x3, x5 + cinc x2, x2, cs // cs = hs, nlast + adds x4, x4, x2 + ldp x3, x2, [x1, #16] + cset x7, cs // cs = hs, nlast + adds x3, x3, x7 cset x1, cs // cs = hs, nlast adds x2, x2, x1 - cset x4, cs // cs = hs, nlast - umulh x1, x3, x3 - adds x10, x10, x4 - mul x8, x3, x3 - mul x4, x3, x2 + cset x7, cs // cs = hs, nlast + mul x13, x3, x3 + umulh x14, x3, x3 + umulh x15, x2, x3 + mul x1, x7, x6 + umulh x7, x7, x6 + adds x1, x1, x5 + mul x17, x2, x2 + adc x4, x4, x7 + lsl x5, x15, #1 + lsr x15, x15, #63 + mul x16, x2, x3 + umulh x8, x2, x1 + mul x10, x4, x2 + umulh x7, x4, x3 + mul x9, x4, x1 + adds x7, x7, x10 + mul x12, x1, x1 cset x11, cs // cs = hs, nlast - umulh x16, x3, x2 - mul x12, x3, x10 - lsl x6, x4, #1 - umulh x14, x3, x10 - lsl x13, x16, #1 - cmp x4, x6 - cinc x7, x13, hi // hi = pmore - adds x6, x1, x6 + adds x8, x7, x8 + cinc x11, x11, cs // cs = hs, nlast + umulh x30, x1, x1 + lsl x7, x8, #1 + lsl x10, x9, #1 + adds x7, x7, x13 + extr x8, x11, x8, #63 + cinc x8, x8, cs // cs = hs, nlast + lsr x9, x9, #63 + mul x18, x4, x4 + mul x11, x7, x6 + umulh x7, x7, x6 + madd x7, x8, x6, x7 + adds x11, x11, x12 + str x11, [x0] + umulh x13, x4, x2 cinc x7, x7, cs // cs = hs, nlast - cset w4, cs // cs = hs, nlast - cmp x7, #0x0 - lsl x1, x12, #1 - ccmp w4, #0x0, #0x4, eq // eq = none - lsl x15, x14, #1 - cset x4, ne // ne = any - cmp x16, x13 - cinc x4, x4, hi // hi = pmore - cmp x12, x1 - cinc x12, x15, hi // hi = pmore - adds x7, x7, x1 + adds x10, x10, x30 + cinc x9, x9, cs // cs = hs, nlast + adds x5, x5, x17 + cinc x8, x15, cs // cs = hs, nlast + mul x17, x3, x1 + mul x15, x2, x1 + umulh x11, x5, x6 + mul x5, x5, x6 + madd x11, x8, x6, x11 + adds x5, x5, x18 + umulh x12, x4, x1 + cinc x11, x11, cs // cs = hs, nlast + adds x13, x13, x16 + cset x8, cs // cs = hs, nlast + mul x16, x4, x3 + umulh x1, x3, x1 + lsl x3, x13, #1 + adds x3, x3, x14 + extr x8, x8, x13, #63 + cinc x14, x8, cs // cs = hs, nlast + umulh x2, x2, x2 + umulh x4, x4, x4 + mul x13, x3, x6 + umulh x3, x3, x6 + madd x3, x14, x6, x3 + adds x13, x13, x7 + mul x8, x2, x6 + cinc x3, x3, cs // cs = hs, nlast + adds x10, x10, x13 + adc x9, x9, x3 + adds x12, x12, x17 + cset x3, cs // cs = hs, nlast + umulh x2, x2, x6 + lsl x6, x12, #1 + adds x6, x6, x9 + extr x3, x3, x12, #63 + cinc x3, x3, cs // cs = hs, nlast + adds x5, x5, x6 + adc x11, x11, x3 + adds x1, x1, x15 + cset x3, cs // cs = hs, nlast + adds x1, x1, x16 + cinc x3, x3, cs // cs = hs, nlast + stp x10, x5, [x0, #8] + lsl x5, x1, #1 + adds x5, x5, x11 + extr x1, x3, x1, #63 + cinc x1, x1, cs // cs = hs, nlast + adds x4, x8, x4 + cinc x2, x2, cs // cs = hs, nlast + adds x4, x4, x5 + adc x2, x1, x2 + stp x4, x2, [x0, #24] + ldp x29, 
x30, [sp], #16 + ret + nop + .size secp256k1_fe_sqr_5to5, .-secp256k1_fe_sqr_5to5 + + .p2align 4 + .global secp256k1_fe_mul_44to5 + .type secp256k1_fe_mul_44to5, %function +secp256k1_fe_mul_44to5: + stp x29, x30, [sp, #-32]! + mov x9, #0x3d1 // #977 + movk x9, #0x1, lsl #32 + mov x29, sp + ldp x10, x6, [x1] + ldp x11, x15, [x1, #16] + ldp x5, x12, [x2, #16] + ldp x7, x8, [x2] + str x19, [sp, #16] + mul x13, x11, x5 + mul x3, x6, x12 + umulh x4, x12, x10 + adds x1, x3, x13 + mul x13, x15, x8 + umulh x2, x6, x5 + cset x3, cs // cs = hs, nlast + adds x4, x4, x13 + umulh x13, x11, x8 + cset x14, cs // cs = hs, nlast + adds x1, x1, x4 + adc x3, x3, x14 + umulh x4, x15, x7 + adds x2, x2, x13 + mul x14, x10, x7 + cset x13, cs // cs = hs, nlast + adds x1, x1, x2 + adc x3, x3, x13 + adds x1, x1, x4 + cinc x3, x3, cs // cs = hs, nlast + umulh x13, x11, x5 + umulh x2, x6, x12 + mul x4, x1, x9 + umulh x1, x1, x9 + madd x1, x3, x9, x1 + adds x4, x4, x14 + umulh x3, x15, x8 + str x4, [x0] + mul x14, x12, x11 + cinc x1, x1, cs // cs = hs, nlast + adds x2, x2, x13 + mul x13, x5, x15 + cset x4, cs // cs = hs, nlast + adds x3, x3, x14 + cset x14, cs // cs = hs, nlast + adds x2, x2, x3 + adc x4, x4, x14 + adds x2, x2, x13 + cinc x4, x4, cs // cs = hs, nlast + mul x16, x6, x7 + mul x14, x8, x10 + umulh x13, x2, x9 + mul x2, x2, x9 + umulh x3, x10, x7 + madd x13, x4, x9, x13 + adds x2, x2, x16 + mul x4, x12, x10 + cinc x13, x13, cs // cs = hs, nlast + adds x3, x3, x14 + mul x14, x6, x5 + cset x16, cs // cs = hs, nlast + mul x30, x15, x7 + adds x3, x2, x3 + mul x17, x11, x8 + adc x13, x13, x16 + umulh x2, x12, x15 + adds x4, x4, x14 + umulh x16, x5, x10 + cset x14, cs // cs = hs, nlast + umulh x18, x6, x8 + adds x17, x17, x30 + cset x19, cs // cs = hs, nlast + adds x4, x4, x17 + umulh x30, x11, x7 + adc x14, x14, x19 + mul x17, x2, x9 + adds x18, x16, x18 + umulh x16, x2, x9 + cset x2, cs // cs = hs, nlast + adds x4, x4, x18 + umulh x18, x5, x15 + adc x14, x14, x2 + adds x17, x17, x30 + umulh x2, x12, x11 + cinc x16, x16, cs // cs = hs, nlast + mul x12, x12, x15 + adds x4, x4, x17 + adc x14, x14, x16 + adds x2, x2, x18 + cset x16, cs // cs = hs, nlast + adds x2, x2, x12 + cinc x16, x16, cs // cs = hs, nlast + mul x11, x11, x7 + umulh x15, x8, x10 + umulh x12, x2, x9 + mul x2, x2, x9 + umulh x7, x6, x7 + madd x12, x16, x9, x12 + adds x2, x2, x11 + mul x6, x6, x8 + mul x5, x5, x10 cinc x12, x12, cs // cs = hs, nlast - cset w13, cs // cs = hs, nlast - adds x4, x4, x12 - mul x16, x2, x2 - cset x1, cs // cs = hs, nlast - cmp x12, #0x0 - ccmp w13, #0x0, #0x4, eq // eq = none - umulh x12, x2, x2 - cinc x13, x1, ne // ne = any - adds x7, x7, x16 + adds x7, x15, x7 + cset x8, cs // cs = hs, nlast + adds x5, x5, x6 + cset x6, cs // cs = hs, nlast + adds x7, x7, x5 + adc x5, x8, x6 + adds x2, x2, x7 + adc x12, x12, x5 + adds x1, x1, x3 + cinc x13, x13, cs // cs = hs, nlast + adds x2, x2, x13 + stp x1, x2, [x0, #8] cinc x12, x12, cs // cs = hs, nlast - umulh x17, x2, x10 adds x4, x4, x12 - mul x18, x2, x10 + cinc x14, x14, cs // cs = hs, nlast + stp x4, x14, [x0, #24] + ldr x19, [sp, #16] + ldp x29, x30, [sp], #32 + ret + .size secp256k1_fe_mul_44to5, .-secp256k1_fe_mul_44to5 + + .p2align 4 + .global secp256k1_fe_sqr_4to5 + .type secp256k1_fe_sqr_4to5, %function +secp256k1_fe_sqr_4to5: + ldp x9, x3, [x1] + mov x7, #0x3d1 // #977 + ldr x2, [x1, #24] + movk x7, #0x1, lsl #32 + ldr x1, [x1, #16] + mul x6, x3, x2 + umulh x4, x2, x9 + umulh x5, x3, x1 + adds x4, x4, x6 + mul x12, x1, x1 + cset x6, cs // cs = hs, nlast + adds 
x5, x4, x5 + cinc x6, x6, cs // cs = hs, nlast + mul x10, x3, x9 + lsl x4, x5, #1 + mul x15, x9, x9 + adds x4, x4, x12 + extr x5, x6, x5, #63 + cinc x5, x5, cs // cs = hs, nlast + umulh x13, x9, x9 + mul x12, x1, x9 + lsl x11, x10, #1 + mul x6, x4, x7 + lsr x10, x10, #63 + umulh x4, x4, x7 + madd x4, x5, x7, x4 + adds x6, x6, x15 + umulh x8, x3, x9 + str x6, [x0] + cinc x6, x4, cs // cs = hs, nlast + adds x11, x11, x13 + cinc x10, x10, cs // cs = hs, nlast + adds x8, x8, x12 + mul x14, x3, x3 + cset x12, cs // cs = hs, nlast + umulh x5, x3, x2 + lsl x13, x8, #1 + mul x4, x2, x1 + adds x13, x13, x14 + umulh x15, x1, x1 + extr x12, x12, x8, #63 + cinc x12, x12, cs // cs = hs, nlast + adds x14, x5, x4 + cset x17, cs // cs = hs, nlast + umulh x8, x2, x1 + lsl x5, x14, #1 + mul x18, x2, x2 + adds x5, x5, x15 + extr x17, x17, x14, #63 + cinc x17, x17, cs // cs = hs, nlast + lsl x4, x8, #1 + lsr x14, x8, #63 + mul x16, x3, x1 + mul x15, x5, x7 + umulh x5, x5, x7 + madd x5, x17, x7, x5 + adds x15, x15, x6 + mul x8, x2, x9 + cinc x5, x5, cs // cs = hs, nlast + adds x11, x11, x15 + adc x10, x10, x5 + adds x4, x4, x18 + cinc x5, x14, cs // cs = hs, nlast + umulh x9, x1, x9 + umulh x2, x2, x2 + mul x6, x4, x7 + umulh x4, x4, x7 + madd x4, x5, x7, x4 + adds x6, x6, x10 + mul x5, x2, x7 + cinc x4, x4, cs // cs = hs, nlast + adds x6, x6, x13 + adc x12, x12, x4 + adds x4, x8, x16 cset x1, cs // cs = hs, nlast - cmp x14, x15 - cinc x1, x1, hi // hi = pmore - adds x11, x11, x9 - add x1, x1, x13 - lsl x12, x17, #1 - lsl x30, x18, #1 - mul x13, x10, x10 - mul x15, x3, x11 - umulh x3, x3, x11 - mul x16, x2, x11 - lsl x14, x15, #1 - umulh x9, x2, x11 - cmp x15, x14 - lsl x19, x3, #1 - cinc x2, x19, hi // hi = pmore - adds x4, x4, x14 - cinc x2, x2, cs // cs = hs, nlast - cset w15, cs // cs = hs, nlast - adds x1, x2, x1 - cset x14, cs // cs = hs, nlast - cmp x2, #0x0 - ccmp w15, #0x0, #0x4, eq // eq = none - lsl x15, x16, #1 - cinc x14, x14, ne // ne = any - cmp x3, x19 - cset x2, hi // hi = pmore - cmp x17, x12 - cinc x3, x2, hi // hi = pmore - cmp x18, x30 - cinc x2, x12, hi // hi = pmore - adds x4, x4, x30 + adds x4, x4, x9 + cinc x1, x1, cs // cs = hs, nlast + umulh x2, x2, x7 + umulh x3, x3, x3 + lsl x7, x4, #1 + adds x7, x7, x12 + extr x1, x1, x4, #63 + cinc x1, x1, cs // cs = hs, nlast + adds x3, x5, x3 cinc x2, x2, cs // cs = hs, nlast - cset w18, cs // cs = hs, nlast - adds x1, x1, x2 - lsl x17, x9, #1 + adds x3, x3, x7 + adc x1, x1, x2 + stp x11, x6, [x0, #8] + stp x3, x1, [x0, #24] + ret + nop + nop + .size secp256k1_fe_sqr_4to5, .-secp256k1_fe_sqr_4to5 + + .p2align 4 + .global secp256k1_fe_mul_44to4 + .type secp256k1_fe_mul_44to4, %function +secp256k1_fe_mul_44to4: + stp x29, x30, [sp, #-32]! 
+ mov x6, #0x3d1 // #977 + movk x6, #0x1, lsl #32 + mov x29, sp + ldp x10, x8, [x1] + stp x19, x20, [sp, #16] + ldp x13, x15, [x1, #16] + ldp x5, x14, [x2, #16] + ldp x11, x9, [x2] + mul x7, x13, x5 + mul x3, x8, x14 + umulh x4, x14, x10 + adds x1, x3, x7 + mul x7, x15, x9 + umulh x2, x8, x5 + cset x3, cs // cs = hs, nlast + adds x4, x4, x7 + umulh x7, x13, x9 cset x12, cs // cs = hs, nlast - cmp x2, #0x0 - ccmp w18, #0x0, #0x4, eq // eq = none - add x2, x14, x3 - cinc x12, x12, ne // ne = any - cmp x16, x15 - cinc x3, x17, hi // hi = pmore - adds x1, x1, x15 + adds x1, x1, x4 + adc x3, x3, x12 + umulh x4, x15, x11 + adds x2, x2, x7 + mul x7, x10, x11 + cset x12, cs // cs = hs, nlast + adds x1, x1, x2 + adc x3, x3, x12 + adds x1, x1, x4 cinc x3, x3, cs // cs = hs, nlast - cset w14, cs // cs = hs, nlast - cmp x3, #0x0 - add x12, x12, x2 - ccmp w14, #0x0, #0x4, eq // eq = none - mul x16, x10, x11 - cset x2, ne // ne = any - cmp x9, x17 - umulh x14, x10, x10 - cinc x2, x2, hi // hi = pmore - adds x3, x3, x12 - umulh x15, x10, x11 - cset x10, cs // cs = hs, nlast - adds x1, x1, x13 - cinc x9, x14, cs // cs = hs, nlast - lsl x12, x16, #1 - adds x3, x3, x9 - lsl x14, x15, #1 - adc x2, x2, x10 - cmp x16, x12 - cinc x9, x14, hi // hi = pmore - adds x3, x3, x12 - cinc x9, x9, cs // cs = hs, nlast - cset w12, cs // cs = hs, nlast - adds x2, x9, x2 - umulh x13, x11, x11 + umulh x4, x13, x5 + umulh x2, x8, x14 + umulh x12, x1, x6 + mul x1, x1, x6 + madd x12, x3, x6, x12 + mul x16, x14, x13 + adds x1, x1, x7 + umulh x3, x15, x9 + cinc x12, x12, cs // cs = hs, nlast + adds x2, x2, x4 + mul x7, x5, x15 + cset x4, cs // cs = hs, nlast + adds x3, x3, x16 + cset x16, cs // cs = hs, nlast + adds x2, x2, x3 + adc x3, x4, x16 + adds x2, x2, x7 + cinc x3, x3, cs // cs = hs, nlast + mul x17, x8, x11 + mul x7, x9, x10 + umulh x16, x2, x6 + mul x2, x2, x6 + umulh x4, x10, x11 + madd x16, x3, x6, x16 + adds x2, x2, x17 + mul x3, x14, x10 + cinc x16, x16, cs // cs = hs, nlast + adds x4, x4, x7 + mul x7, x8, x5 + cset x17, cs // cs = hs, nlast + mul x19, x15, x11 + adds x4, x2, x4 + mul x18, x13, x9 + adc x16, x16, x17 + umulh x2, x14, x15 + adds x3, x3, x7 + umulh x17, x5, x10 + cset x7, cs // cs = hs, nlast + umulh x30, x8, x9 + adds x18, x18, x19 + cset x20, cs // cs = hs, nlast + adds x3, x3, x18 + umulh x19, x13, x11 + adc x7, x7, x20 + mul x18, x2, x6 + adds x30, x17, x30 + cset x20, cs // cs = hs, nlast + umulh x17, x2, x6 + adds x3, x3, x30 + umulh x2, x14, x13 + umulh x30, x5, x15 + adc x7, x7, x20 + adds x18, x18, x19 + mul x14, x14, x15 + cinc x17, x17, cs // cs = hs, nlast + adds x3, x3, x18 + adc x7, x7, x17 + adds x2, x2, x30 + cset x17, cs // cs = hs, nlast + adds x2, x2, x14 + cinc x17, x17, cs // cs = hs, nlast + mul x13, x13, x11 + umulh x15, x9, x10 + umulh x14, x2, x6 + mul x2, x2, x6 + umulh x11, x8, x11 + madd x14, x17, x6, x14 + adds x2, x2, x13 + mul x8, x8, x9 + mul x5, x5, x10 + cinc x14, x14, cs // cs = hs, nlast + adds x9, x15, x11 cset x10, cs // cs = hs, nlast - cmp x9, #0x0 - ccmp w12, #0x0, #0x4, eq // eq = none - mul x9, x11, x11 - mul x12, x1, x5 - cinc x11, x10, ne // ne = any - cmp x15, x14 - umulh x1, x1, x5 - cinc x10, x13, hi // hi = pmore + adds x5, x5, x8 + cset x8, cs // cs = hs, nlast + adds x9, x9, x5 + adc x5, x10, x8 adds x2, x2, x9 - adc x10, x11, x10 - mul x9, x3, x5 - adds x8, x12, x8 - umulh x3, x3, x5 + adc x14, x14, x5 + adds x12, x12, x4 + cinc x16, x16, cs // cs = hs, nlast + adds x2, x2, x16 + cinc x14, x14, cs // cs = hs, nlast + adds x3, x3, x14 + cinc x4, x7, 
cs // cs = hs, nlast + ldp x19, x20, [sp, #16] + mul x5, x4, x6 + umulh x4, x4, x6 + adds x1, x1, x5 + cinc x4, x4, cs // cs = hs, nlast + adds x4, x4, x12 + cset x5, cs // cs = hs, nlast + adds x5, x5, x2 + cset x2, cs // cs = hs, nlast + adds x2, x2, x3 + stp x5, x2, [x0, #16] + cset x3, cs // cs = hs, nlast + ldp x29, x30, [sp], #32 + mul x2, x3, x6 + umulh x3, x3, x6 + adds x2, x2, x1 + adc x4, x4, x3 + stp x2, x4, [x0] + ret + .size secp256k1_fe_mul_44to4, .-secp256k1_fe_mul_44to4 + + .p2align 4 + .global secp256k1_fe_sqr_4to4 + .type secp256k1_fe_sqr_4to4, %function +secp256k1_fe_sqr_4to4: + stp x29, x30, [sp, #-16]! + mov x4, #0x3d1 // #977 + movk x4, #0x1, lsl #32 + mov x29, sp + ldp x8, x5, [x1] + ldr x3, [x1, #24] + ldr x1, [x1, #16] + mul x7, x5, x3 + umulh x2, x3, x8 + umulh x6, x5, x1 + adds x2, x2, x7 + mul x14, x1, x1 + cset x7, cs // cs = hs, nlast + adds x6, x2, x6 + cinc x7, x7, cs // cs = hs, nlast + mul x11, x5, x8 + lsl x2, x6, #1 + mul x13, x8, x8 + adds x2, x2, x14 + extr x6, x7, x6, #63 + cinc x6, x6, cs // cs = hs, nlast + umulh x12, x8, x8 + mul x7, x1, x8 + lsl x9, x11, #1 + umulh x17, x2, x4 + lsr x11, x11, #63 + mul x2, x2, x4 + madd x17, x6, x4, x17 + adds x2, x2, x13 + umulh x10, x5, x8 + cinc x17, x17, cs // cs = hs, nlast + adds x9, x9, x12 + cinc x11, x11, cs // cs = hs, nlast + adds x10, x10, x7 + mul x14, x5, x5 + cset x12, cs // cs = hs, nlast + umulh x6, x5, x3 + lsl x13, x10, #1 + mul x7, x3, x1 + adds x13, x13, x14 + umulh x15, x1, x1 + extr x12, x12, x10, #63 + cinc x12, x12, cs // cs = hs, nlast + adds x14, x6, x7 + cset x18, cs // cs = hs, nlast + umulh x10, x3, x1 + lsl x7, x14, #1 + mul x30, x3, x3 + adds x7, x7, x15 + extr x18, x18, x14, #63 + cinc x18, x18, cs // cs = hs, nlast + lsl x6, x10, #1 + lsr x14, x10, #63 + mul x10, x3, x8 + mul x15, x7, x4 + umulh x7, x7, x4 + madd x7, x18, x4, x7 + adds x15, x15, x17 + umulh x17, x1, x8 + cinc x7, x7, cs // cs = hs, nlast + adds x9, x9, x15 + adc x11, x11, x7 + adds x6, x6, x30 + cinc x7, x14, cs // cs = hs, nlast + mul x16, x5, x1 + umulh x3, x3, x3 + mul x8, x6, x4 + umulh x6, x6, x4 + madd x6, x7, x4, x6 + adds x8, x8, x11 + mul x7, x3, x4 + cinc x6, x6, cs // cs = hs, nlast + adds x8, x8, x13 + adc x12, x12, x6 + adds x6, x10, x16 + cset x1, cs // cs = hs, nlast + adds x6, x6, x17 + cinc x1, x1, cs // cs = hs, nlast + umulh x5, x5, x5 + lsl x10, x6, #1 + umulh x3, x3, x4 + adds x10, x10, x12 + extr x1, x1, x6, #63 + cinc x1, x1, cs // cs = hs, nlast + adds x5, x7, x5 + cinc x3, x3, cs // cs = hs, nlast + adds x5, x5, x10 + adc x1, x1, x3 + ldp x29, x30, [sp], #16 + mul x3, x1, x4 + umulh x1, x1, x4 + adds x2, x2, x3 cinc x1, x1, cs // cs = hs, nlast - mul x11, x2, x5 adds x1, x1, x9 - umulh x2, x2, x5 - cset x9, cs // cs = hs, nlast - adds x1, x1, x6 - adc x3, x9, x3 - mul x9, x10, x5 - adds x3, x3, x11 - umulh x5, x10, x5 - cset x6, cs // cs = hs, nlast - adds x3, x3, x7 - adc x2, x6, x2 - stp x8, x1, [x0] - adds x2, x2, x9 - cset x10, cs // cs = hs, nlast - adds x4, x2, x4 - adc x5, x10, x5 - stp x3, x4, [x0, #16] - str x5, [x0, #32] - ldr x19, [sp, #16] - ldp x29, x30, [sp], #32 + cset x3, cs // cs = hs, nlast + adds x3, x3, x8 + str x3, [x0, #16] + cset x3, cs // cs = hs, nlast + adds x3, x3, x5 + str x3, [x0, #24] + cset x5, cs // cs = hs, nlast + mul x3, x5, x4 + umulh x5, x5, x4 + adds x3, x3, x2 + adc x1, x1, x5 + stp x3, x1, [x0] ret - .size secp256k1_fe_sqr_inner, .-secp256k1_fe_sqr_inner + .size secp256k1_fe_sqr_4to4, .-secp256k1_fe_sqr_4to4 diff --git 
a/src/asm/field_5x64_x86_64_gen.s b/src/asm/field_5x64_x86_64_gen.s index 596e7e9596..ee29792512 100644 --- a/src/asm/field_5x64_x86_64_gen.s +++ b/src/asm/field_5x64_x86_64_gen.s @@ -1,23 +1,28 @@ -/*********************************************************************** - * Copyright (c) 2021 Kaushik Nath * - * Distributed under the MIT software license, see the accompanying * - * file COPYING or https://www.opensource.org/licenses/mit-license.php.* +/************************************************************************ + * Field multiplication and squaring assemblies using representation of * + * field elements in base 2^{64}. * + * Major instructions used in the assemblies are mul/add/adc. * + * * + * Copyright (c) 2021 Kaushik Nath * + * Distributed under the MIT software license, see the accompanying * + * file COPYING or https://www.opensource.org/licenses/mit-license.php. * ***********************************************************************/ -/* 4-limb field multiplication and squaring using the bottom 4-limbs of - * a 5-limb representation. First reduce the 5-limb inputs to fully - * reduced 4-limb forms, then multiply and finally output a half reduced - * output in 5-limb form. The leading limb is of atmost 33 bits. - * - * Major instructions used in the assemblies: mul/add/adc. +/* + * 64-bit field multiplication and squaring using the bottom 4-limbs of + * two field elements having 5-limb representation such that the fifth + * limb is of at most 64 bits. The 5-limb inputs are fully reduced first + * to 4-limb forms, then multiplied, after which a field element in 5-limb + * form is reported as output. The fifth limb of the output has at most + * 33 bits. */ .att_syntax .text .p2align 4 -.global secp256k1_fe_mul_inner -secp256k1_fe_mul_inner: +.global secp256k1_fe_mul_55to5 +secp256k1_fe_mul_55to5: movq %rsp,%r11 subq $96,%rsp @@ -228,8 +233,8 @@ movq %r11,%rsp ret .p2align 4 -.global secp256k1_fe_sqr_inner -secp256k1_fe_sqr_inner: +.global secp256k1_fe_sqr_5to5 +secp256k1_fe_sqr_5to5: movq %rsp,%r11 subq $64,%rsp @@ -401,3 +406,555 @@ movq 48(%rsp),%rbp movq %r11,%rsp ret + +/* + * 64-bit field multiplication and squaring using the bottom 4-limbs of + * two field elements having 5-limb representation such that the fifth + * limb is zero. A field element in 5-limb form is reported as output + * such that the fifth limb is of at most 33 bits. 
+ */ + +.p2align 4 +.global secp256k1_fe_mul_44to5 +secp256k1_fe_mul_44to5: +movq %rsp,%r11 +subq $48,%rsp + +movq %r11,0(%rsp) +movq %r12,8(%rsp) +movq %r13,16(%rsp) +movq %r14,24(%rsp) +movq %r15,32(%rsp) +movq %rbx,40(%rsp) + +movq %rdx,%rcx +movq $0x1000003D1,%rbx + +movq 8(%rsi),%rax +mulq 24(%rcx) +movq %rax,%r8 +xorq %r9,%r9 +movq %rdx,%r10 +xorq %r11,%r11 + +movq 16(%rsi),%rax +mulq 16(%rcx) +addq %rax,%r8 +adcq $0,%r9 +addq %rdx,%r10 +adcq $0,%r11 + +movq 24(%rsi),%rax +mulq 8(%rcx) +addq %rax,%r8 +adcq $0,%r9 +addq %rdx,%r10 +adcq $0,%r11 + +movq 16(%rsi),%rax +mulq 24(%rcx) +addq %rax,%r10 +adcq $0,%r11 +movq %rdx,%r12 +xorq %r13,%r13 + +movq 24(%rsi),%rax +mulq 16(%rcx) +addq %rax,%r10 +adcq $0,%r11 +addq %rdx,%r12 +adcq $0,%r13 + +movq %rbx,%rax +mulq %r10 +imul %rbx,%r11 +movq %rax,%r10 +addq %rdx,%r11 + +movq 24(%rsi),%rax +mulq 24(%rcx) +addq %rax,%r12 +adcq $0,%r13 + +movq %rbx,%rax +mulq %rdx +movq %rax,%r14 +movq %rdx,%r15 + +movq %rbx,%rax +mulq %r12 +imul %rbx,%r13 +movq %rax,%r12 +addq %rdx,%r13 + +movq 0(%rsi),%rax +mulq 24(%rcx) +addq %rax,%r14 +adcq $0,%r15 +addq %rdx,%r8 +adcq $0,%r9 + +movq 8(%rsi),%rax +mulq 16(%rcx) +addq %rax,%r14 +adcq $0,%r15 +addq %rdx,%r8 +adcq $0,%r9 + +movq 16(%rsi),%rax +mulq 8(%rcx) +addq %rax,%r14 +adcq $0,%r15 +addq %rdx,%r8 +adcq $0,%r9 + +movq 24(%rsi),%rax +mulq 0(%rcx) +addq %rax,%r14 +adcq $0,%r15 +addq %rdx,%r8 +adcq $0,%r9 + +movq %rbx,%rax +mulq %r8 +imul %rbx,%r9 +movq %rax,%r8 +addq %rdx,%r9 + +movq 0(%rsi),%rax +mulq 0(%rcx) +addq %rax,%r8 +adcq $0,%r9 +addq %rdx,%r10 +adcq $0,%r11 + +movq 0(%rsi),%rax +mulq 8(%rcx) +addq %rax,%r10 +adcq $0,%r11 +addq %rdx,%r12 +adcq $0,%r13 + +movq 8(%rsi),%rax +mulq 0(%rcx) +addq %rax,%r10 +adcq $0,%r11 +addq %rdx,%r12 +adcq $0,%r13 + +movq 0(%rsi),%rax +mulq 16(%rcx) +addq %rax,%r12 +adcq $0,%r13 +addq %rdx,%r14 +adcq $0,%r15 + +movq 8(%rsi),%rax +mulq 8(%rcx) +addq %rax,%r12 +adcq $0,%r13 +addq %rdx,%r14 +adcq $0,%r15 + +movq 16(%rsi),%rax +mulq 0(%rcx) +addq %rax,%r12 +adcq $0,%r13 +addq %rdx,%r14 +adcq $0,%r15 + +addq %r9,%r10 +adcq $0,%r11 + +addq %r11,%r12 +adcq $0,%r13 + +addq %r13,%r14 +adcq $0,%r15 + +movq %r8,0(%rdi) +movq %r10,8(%rdi) +movq %r12,16(%rdi) +movq %r14,24(%rdi) +movq %r15,32(%rdi) + +movq 0(%rsp),%r11 +movq 8(%rsp),%r12 +movq 16(%rsp),%r13 +movq 24(%rsp),%r14 +movq 32(%rsp),%r15 +movq 40(%rsp),%rbx + +movq %r11,%rsp + +ret + +.p2align 4 +.global secp256k1_fe_sqr_4to5 +secp256k1_fe_sqr_4to5: +movq %rsp,%r11 +subq $64,%rsp + +movq %r11,0(%rsp) +movq %r12,8(%rsp) +movq %r13,16(%rsp) +movq %r14,24(%rsp) +movq %r15,32(%rsp) +movq %rbx,40(%rsp) +movq %rbp,48(%rsp) +movq %rdi,56(%rsp) + +movq 0(%rsi),%rbx +movq 8(%rsi),%rbp +movq 16(%rsi),%rcx +movq 24(%rsi),%rdi + +movq $0x1000003D1,%rsi + +movq %rbp,%rax +mulq %rdi +movq %rax,%r8 +xorq %r9,%r9 +movq %rdx,%r10 +xorq %r11,%r11 +addq %rax,%r8 +adcq $0,%r9 +addq %rdx,%r10 +adcq $0,%r11 + +movq %rcx,%rax +mulq %rcx +addq %rax,%r8 +adcq $0,%r9 +addq %rdx,%r10 +adcq $0,%r11 + +movq %rcx,%rax +mulq %rdi +addq %rax,%r10 +adcq $0,%r11 +movq %rdx,%r12 +xorq %r13,%r13 +addq %rax,%r10 +adcq $0,%r11 +addq %rdx,%r12 +adcq $0,%r13 + +movq %rsi,%rax +mulq %r10 +imul %rsi,%r11 +movq %rax,%r10 +addq %rdx,%r11 + +movq %rdi,%rax +mulq %rdi +addq %rax,%r12 +adcq $0,%r13 + +movq %rsi,%rax +mulq %rdx +movq %rax,%r14 +movq %rdx,%r15 + +movq %rsi,%rax +mulq %r12 +imul %rsi,%r13 +movq %rax,%r12 +addq %rdx,%r13 + +movq %rbx,%rax +mulq %rdi +addq %rax,%r14 +adcq $0,%r15 +addq %rdx,%r8 +adcq $0,%r9 +addq %rax,%r14 +adcq $0,%r15 +addq %rdx,%r8 +adcq 
$0,%r9 + +movq %rbp,%rax +mulq %rcx +addq %rax,%r14 +adcq $0,%r15 +addq %rdx,%r8 +adcq $0,%r9 +addq %rax,%r14 +adcq $0,%r15 +addq %rdx,%r8 +adcq $0,%r9 + +movq %rsi,%rax +mulq %r8 +imul %rsi,%r9 +movq %rax,%r8 +addq %rdx,%r9 + +movq %rbx,%rax +mulq %rbx +addq %rax,%r8 +adcq $0,%r9 +addq %rdx,%r10 +adcq $0,%r11 + +movq %rbx,%rax +mulq %rbp +addq %rax,%r10 +adcq $0,%r11 +addq %rdx,%r12 +adcq $0,%r13 +addq %rax,%r10 +adcq $0,%r11 +addq %rdx,%r12 +adcq $0,%r13 + +movq %rbx,%rax +mulq %rcx +addq %rax,%r12 +adcq $0,%r13 +addq %rdx,%r14 +adcq $0,%r15 +addq %rax,%r12 +adcq $0,%r13 +addq %rdx,%r14 +adcq $0,%r15 + +movq %rbp,%rax +mulq %rbp +addq %rax,%r12 +adcq $0,%r13 +addq %rdx,%r14 +adcq $0,%r15 + +movq %r10,%rbp +addq %r9,%rbp +adcq $0,%r11 + +movq %r12,%rcx +addq %r11,%rcx +adcq $0,%r13 + +addq %r13,%r14 +adcq $0,%r15 + +movq 56(%rsp),%rdi + +movq %rbx,0(%rdi) +movq %rbp,8(%rdi) +movq %rcx,16(%rdi) +movq %r14,24(%rdi) +movq %r15,32(%rdi) + +movq 0(%rsp),%r11 +movq 8(%rsp),%r12 +movq 16(%rsp),%r13 +movq 24(%rsp),%r14 +movq 32(%rsp),%r15 +movq 40(%rsp),%rbx +movq 48(%rsp),%rbp + +movq %r11,%rsp + +ret + +/* 64-bit field multiplication in which the first argument has 4-limb + * and the second argument has 5-limb representations such that the + * fifth limb is of at most 64 bits. The second argument is fully + * reduced to 4-limb form and then field multiplication is performed. + * A field element in 5-limb form is reported as output such that the + * fifth limb is of at most 33 bits. + */ + +.p2align 4 +.global secp256k1_fe_mul_45to5 +secp256k1_fe_mul_45to5: +movq %rsp,%r11 +subq $72,%rsp + +movq %r11,0(%rsp) +movq %r12,8(%rsp) +movq %r13,16(%rsp) +movq %r14,24(%rsp) +movq %r15,32(%rsp) +movq %rbx,40(%rsp) +movq %rbp,48(%rsp) +movq %rdi,56(%rsp) + +movq $0x1000003d1,%rcx + +movq 0(%rdx),%r8 +movq 8(%rdx),%r9 +movq 16(%rdx),%rbx +movq 24(%rdx),%rbp +movq 32(%rdx),%rax + +mulq %rcx +xorq %rdi,%rdi +addq %r8,%rax +adcq %r9,%rdx +adcq $0,%rbx +adcq $0,%rbp +cmovc %rcx,%rdi +addq %rax,%rdi +adcq $0,%rdx +movq %rdx,64(%rsp) + +movq 8(%rsi),%rax +mulq %rbp +movq %rax,%r8 +xorq %r9,%r9 +movq %rdx,%r10 +xorq %r11,%r11 + +movq 16(%rsi),%rax +mulq %rbx +addq %rax,%r8 +adcq $0,%r9 +addq %rdx,%r10 +adcq $0,%r11 + +movq 24(%rsi),%rax +mulq 64(%rsp) +addq %rax,%r8 +adcq $0,%r9 +addq %rdx,%r10 +adcq $0,%r11 + +movq 16(%rsi),%rax +mulq %rbp +addq %rax,%r10 +adcq $0,%r11 +movq %rdx,%r12 +xorq %r13,%r13 + +movq 24(%rsi),%rax +mulq %rbx +addq %rax,%r10 +adcq $0,%r11 +addq %rdx,%r12 +adcq $0,%r13 + +movq %rcx,%rax +mulq %r10 +imul %rcx,%r11 +movq %rax,%r10 +addq %rdx,%r11 + +movq 24(%rsi),%rax +mulq %rbp +addq %rax,%r12 +adcq $0,%r13 + +movq %rcx,%rax +mulq %rdx +movq %rax,%r14 +movq %rdx,%r15 + +movq %rcx,%rax +mulq %r12 +imul %rcx,%r13 +movq %rax,%r12 +addq %rdx,%r13 + +movq 0(%rsi),%rax +mulq %rbp +addq %rax,%r14 +adcq $0,%r15 +addq %rdx,%r8 +adcq $0,%r9 + +movq 8(%rsi),%rax +mulq %rbx +addq %rax,%r14 +adcq $0,%r15 +addq %rdx,%r8 +adcq $0,%r9 + +movq 16(%rsi),%rax +mulq 64(%rsp) +addq %rax,%r14 +adcq $0,%r15 +addq %rdx,%r8 +adcq $0,%r9 + +movq 24(%rsi),%rax +mulq %rdi +addq %rax,%r14 +adcq $0,%r15 +addq %rdx,%r8 +adcq $0,%r9 + +movq %rcx,%rax +mulq %r8 +imul %rcx,%r9 +movq %rax,%r8 +addq %rdx,%r9 + +movq 0(%rsi),%rax +mulq %rdi +addq %rax,%r8 +adcq $0,%r9 +addq %rdx,%r10 +adcq $0,%r11 + +movq 0(%rsi),%rax +mulq 64(%rsp) +addq %rax,%r10 +adcq $0,%r11 +addq %rdx,%r12 +adcq $0,%r13 + +movq 8(%rsi),%rax +mulq %rdi +addq %rax,%r10 +adcq $0,%r11 +addq %rdx,%r12 +adcq $0,%r13 + +movq 0(%rsi),%rax +mulq %rbx +addq 
%rax,%r12 +adcq $0,%r13 +addq %rdx,%r14 +adcq $0,%r15 + +movq 8(%rsi),%rax +mulq 64(%rsp) +addq %rax,%r12 +adcq $0,%r13 +addq %rdx,%r14 +adcq $0,%r15 + +movq 16(%rsi),%rax +mulq %rdi +addq %rax,%r12 +adcq $0,%r13 +addq %rdx,%r14 +adcq $0,%r15 + +addq %r9,%r10 +adcq $0,%r11 +addq %r11,%r12 +adcq $0,%r13 +addq %r13,%r14 +adcq $0,%r15 + +movq 56(%rsp),%rdi + +movq %r8,0(%rdi) +movq %r10,8(%rdi) +movq %r12,16(%rdi) +movq %r14,24(%rdi) +movq %r15,32(%rdi) + +movq 0(%rsp),%r11 +movq 8(%rsp),%r12 +movq 16(%rsp),%r13 +movq 24(%rsp),%r14 +movq 32(%rsp),%r15 +movq 40(%rsp),%rbx +movq 48(%rsp),%rbp + +movq %r11,%rsp + +ret diff --git a/src/asm/field_5x64_x86_64_maax.s b/src/asm/field_5x64_x86_64_maax.s index 79b7afb816..b06ec13af3 100644 --- a/src/asm/field_5x64_x86_64_maax.s +++ b/src/asm/field_5x64_x86_64_maax.s @@ -1,23 +1,28 @@ -/*********************************************************************** - * Copyright (c) 2021 Kaushik Nath * - * Distributed under the MIT software license, see the accompanying * - * file COPYING or https://www.opensource.org/licenses/mit-license.php.* +/************************************************************************ + * Field multiplication and squaring assemblies using representation of * + * field elements in base 2^{64}. * + * Major instructions used in the assemblies are mulx/adcx/adox. * + * * + * Copyright (c) 2021 Kaushik Nath * + * Distributed under the MIT software license, see the accompanying * + * file COPYING or https://www.opensource.org/licenses/mit-license.php. * ***********************************************************************/ -/* 4-limb field multiplication and squaring using the bottom 4-limbs of - * a 5-limb representation. First reduce the 5-limb inputs to fully - * reduced 4-limb forms, then multiply and finally output a half reduced - * output in 5-limb form. The leading limb is of atmost 33 bits. - * - * Major instructions used in the assemblies: mulx/adcx/adox. +/* + * 64-bit field multiplication and squaring using the bottom 4-limbs of + * two field elements having 5-limb representation such that the fifth + * limb is of at most 64 bits. The 5-limb inputs are fully reduced first + * to 4-limb forms, then multiplied, after which a field element in 5-limb + * form is reported as output. The fifth limb of the output has at most + * 33 bits. */ .att_syntax .text .p2align 4 -.global secp256k1_fe_mul_inner -secp256k1_fe_mul_inner: +.global secp256k1_fe_mul_55to5 +secp256k1_fe_mul_55to5: movq %rsp,%r11 subq $96,%rsp @@ -163,8 +168,8 @@ movq %r11,%rsp ret .p2align 4 -.global secp256k1_fe_sqr_inner -secp256k1_fe_sqr_inner: +.global secp256k1_fe_sqr_5to5 +secp256k1_fe_sqr_5to5: movq %rsp,%r11 subq $56,%rsp @@ -279,3 +284,348 @@ movq 48(%rsp),%rbx movq %r11,%rsp ret + +/* + * 64-bit field multiplication and squaring using the bottom 4-limbs of + * two field elements having 5-limb representation such that the fifth + * limb is zero. A field element in 5-limb form is reported as output + * such that the fifth limb is of at most 33 bits. 
+ */ + +.p2align 4 +.global secp256k1_fe_mul_44to5 +secp256k1_fe_mul_44to5: +push %rbp +push %rbx +push %r12 +push %r13 +push %r14 +push %r15 + +movq %rdx,%rbx + +xorq %r13,%r13 +movq 0(%rbx),%rdx +mulx 0(%rsi),%r8,%r9 +mulx 8(%rsi),%rcx,%r10 +adcx %rcx,%r9 +mulx 16(%rsi),%rcx,%r11 +adcx %rcx,%r10 +mulx 24(%rsi),%rcx,%r12 +adcx %rcx,%r11 +adcx %r13,%r12 + +xorq %r14,%r14 +movq 8(%rbx),%rdx +mulx 0(%rsi),%rcx,%rbp +adcx %rcx,%r9 +adox %rbp,%r10 +mulx 8(%rsi),%rcx,%rbp +adcx %rcx,%r10 +adox %rbp,%r11 +mulx 16(%rsi),%rcx,%rbp +adcx %rcx,%r11 +adox %rbp,%r12 +mulx 24(%rsi),%rcx,%rbp +adcx %rcx,%r12 +adox %rbp,%r13 +adcx %r14,%r13 + +xorq %r15,%r15 +movq 16(%rbx),%rdx +mulx 0(%rsi),%rcx,%rbp +adcx %rcx,%r10 +adox %rbp,%r11 +mulx 8(%rsi),%rcx,%rbp +adcx %rcx,%r11 +adox %rbp,%r12 +mulx 16(%rsi),%rcx,%rbp +adcx %rcx,%r12 +adox %rbp,%r13 +mulx 24(%rsi),%rcx,%rbp +adcx %rcx,%r13 +adox %rbp,%r14 +adcx %r15,%r14 + +xorq %rax,%rax +movq 24(%rbx),%rdx +mulx 0(%rsi),%rcx,%rbp +adcx %rcx,%r11 +adox %rbp,%r12 +mulx 8(%rsi),%rcx,%rbp +adcx %rcx,%r12 +adox %rbp,%r13 +mulx 16(%rsi),%rcx,%rbp +adcx %rcx,%r13 +adox %rbp,%r14 +mulx 24(%rsi),%rcx,%rbp +adcx %rcx,%r14 +adox %rbp,%r15 +adcx %rax,%r15 + +xorq %rbp,%rbp +movq $0x1000003D1,%rdx +mulx %r12,%rax,%r12 +adcx %rax,%r8 +adox %r12,%r9 +mulx %r13,%rcx,%r13 +adcx %rcx,%r9 +adox %r13,%r10 +mulx %r14,%rcx,%r14 +adcx %rcx,%r10 +adox %r14,%r11 +mulx %r15,%rcx,%r15 +adcx %rcx,%r11 +adox %rbp,%r15 +adcx %rbp,%r15 + +movq %r8,0(%rdi) +movq %r9,8(%rdi) +movq %r10,16(%rdi) +movq %r11,24(%rdi) +movq %r15,32(%rdi) + +pop %r15 +pop %r14 +pop %r13 +pop %r12 +pop %rbx +pop %rbp + +ret + +.p2align 4 +.global secp256k1_fe_sqr_4to5 +secp256k1_fe_sqr_4to5: +push %rbp +push %rbx +push %r12 +push %r13 +push %r14 +push %r15 +push %rdi + +movq 0(%rsi),%rbx +movq 8(%rsi),%rbp +movq 16(%rsi),%rax +movq 24(%rsi),%rsi + +xorq %r13,%r13 +movq %rbx,%rdx +mulx %rbp,%r9,%r10 +mulx %rax,%rcx,%r11 +adcx %rcx,%r10 +mulx %rsi,%rcx,%r12 +adcx %rcx,%r11 +adcx %r13,%r12 + +xorq %r14,%r14 +movq %rbp,%rdx +mulx %rax,%rcx,%rdx +adcx %rcx,%r11 +adox %rdx,%r12 +movq %rbp,%rdx +mulx %rsi,%rcx,%rdx +adcx %rcx,%r12 +adox %rdx,%r13 +adcx %r14,%r13 + +xorq %r15,%r15 +movq %rax,%rdx +mulx %rsi,%rcx,%r14 +adcx %rcx,%r13 +adcx %r15,%r14 + +shld $1,%r14,%r15 +shld $1,%r13,%r14 +shld $1,%r12,%r13 +shld $1,%r11,%r12 +shld $1,%r10,%r11 +shld $1,%r9,%r10 +addq %r9,%r9 + +xorq %rdx,%rdx +movq %rbx,%rdx +mulx %rdx,%r8,%rdx +adcx %rdx,%r9 + +movq %rbp,%rdx +mulx %rdx,%rcx,%rdx +adcx %rcx,%r10 +adcx %rdx,%r11 + +movq %rax,%rdx +mulx %rdx,%rcx,%rdx +adcx %rcx,%r12 +adcx %rdx,%r13 + +movq %rsi,%rdx +mulx %rdx,%rcx,%rdx +adcx %rcx,%r14 +adcx %rdx,%r15 + +xorq %rbp,%rbp +movq $0x1000003D1,%rdx +mulx %r12,%rbx,%r12 +adcx %r8,%rbx +adox %r9,%r12 +mulx %r13,%rcx,%rax +adcx %rcx,%r12 +adox %r10,%rax +mulx %r14,%rcx,%rsi +adcx %rcx,%rax +adox %r11,%rsi +mulx %r15,%rcx,%r15 +adcx %rcx,%rsi +adox %rbp,%r15 +adcx %rbp,%r15 + +movq %rbx,0(%rdi) +movq %r12,8(%rdi) +movq %rax,16(%rdi) +movq %rsi,24(%rdi) +movq %r15,32(%rdi) + +pop %r15 +pop %r14 +pop %r13 +pop %r12 +pop %rbx +pop %rbp + +ret + +/* 64-bit field multiplication in which the first argument has 4-limb + * and the second argument has 5-limb representations such that the + * fifth limb is of at most 64 bits. The second argument is fully + * reduced to 4-limb form and then field multiplication is performed. + * A field element in 5-limb form is reported as output such that the + * fifth limb is of at most 33 bits. 
+ */ + +.p2align 4 +.global secp256k1_fe_mul_45to5 +secp256k1_fe_mul_45to5: +movq %rsp,%r11 +subq $72,%rsp + +movq %r11,0(%rsp) +movq %r12,8(%rsp) +movq %r13,16(%rsp) +movq %r14,24(%rsp) +movq %r15,32(%rsp) +movq %rbp,40(%rsp) +movq %rbx,48(%rsp) +movq %rdi,56(%rsp) + +movq 0(%rdx),%rax +movq 8(%rdx),%rbx +movq 16(%rdx),%r8 +movq 24(%rdx),%r9 + +movq $0x1000003D1,%r15 +xorq %rcx,%rcx +mulx 32(%rdx),%r13,%r14 +adcx %r13,%rax +adcx %r14,%rbx +adcx %rcx,%r8 +adcx %rcx,%r9 +cmovc %r15,%rcx +addq %rcx,%rax +adcq $0,%rbx + +movq %r8,56(%rsp) +movq %r9,64(%rsp) + +xorq %r13,%r13 +movq 0(%rsi),%rdx +mulx %rax,%r8,%r9 +mulx %rbx,%rcx,%r10 +adcx %rcx,%r9 +mulx 56(%rsp),%rcx,%r11 +adcx %rcx,%r10 +mulx 64(%rsp),%rcx,%r12 +adcx %rcx,%r11 +adcx %r13,%r12 + +xorq %r14,%r14 +movq 8(%rsi),%rdx +mulx %rax,%rcx,%rbp +adcx %rcx,%r9 +adox %rbp,%r10 +mulx %rbx,%rcx,%rbp +adcx %rcx,%r10 +adox %rbp,%r11 +mulx 56(%rsp),%rcx,%rbp +adcx %rcx,%r11 +adox %rbp,%r12 +mulx 64(%rsp),%rcx,%rbp +adcx %rcx,%r12 +adox %rbp,%r13 +adcx %r14,%r13 + +xorq %r15,%r15 +movq 16(%rsi),%rdx +mulx %rax,%rcx,%rbp +adcx %rcx,%r10 +adox %rbp,%r11 +mulx %rbx,%rcx,%rbp +adcx %rcx,%r11 +adox %rbp,%r12 +mulx 56(%rsp),%rcx,%rbp +adcx %rcx,%r12 +adox %rbp,%r13 +mulx 64(%rsp),%rcx,%rbp +adcx %rcx,%r13 +adox %rbp,%r14 +adcx %r15,%r14 + +xorq %rdx,%rdx +movq 24(%rsi),%rdx +mulx %rax,%rcx,%rbp +adcx %rcx,%r11 +adox %rbp,%r12 +mulx %rbx,%rcx,%rbp +adcx %rcx,%r12 +adox %rbp,%r13 +mulx 56(%rsp),%rcx,%rbp +adcx %rcx,%r13 +adox %rbp,%r14 +mulx 64(%rsp),%rcx,%rbp +adcx %rcx,%r14 +adox %rbp,%r15 +adcq $0,%r15 + +xorq %rbp,%rbp +movq $0x1000003D1,%rdx +mulx %r12,%rax,%r12 +adcx %rax,%r8 +adox %r12,%r9 +mulx %r13,%rcx,%r13 +adcx %rcx,%r9 +adox %r13,%r10 +mulx %r14,%rcx,%r14 +adcx %rcx,%r10 +adox %r14,%r11 +mulx %r15,%rcx,%r15 +adcx %rcx,%r11 +adox %rbp,%r15 +adcx %rbp,%r15 + +movq %r8,0(%rdi) +movq %r9,8(%rdi) +movq %r10,16(%rdi) +movq %r11,24(%rdi) +movq %r15,32(%rdi) + +movq 0(%rsp),%r11 +movq 8(%rsp),%r12 +movq 16(%rsp),%r13 +movq 24(%rsp),%r14 +movq 32(%rsp),%r15 +movq 40(%rsp),%rbp +movq 48(%rsp),%rbx + +movq %r11,%rsp + +ret diff --git a/src/asm/field_5x64_x86_64_mxaa.s b/src/asm/field_5x64_x86_64_mxaa.s index dec40d76ed..f9b99e3848 100644 --- a/src/asm/field_5x64_x86_64_mxaa.s +++ b/src/asm/field_5x64_x86_64_mxaa.s @@ -1,23 +1,28 @@ -/*********************************************************************** - * Copyright (c) 2021 Kaushik Nath * - * Distributed under the MIT software license, see the accompanying * - * file COPYING or https://www.opensource.org/licenses/mit-license.php.* +/************************************************************************ + * Field multiplication and squaring assemblies using representation of * + * field elements in base 2^{64}. * + * Major instructions used in the assemblies are mulx/add/adc. * + * * + * Copyright (c) 2021 Kaushik Nath * + * Distributed under the MIT software license, see the accompanying * + * file COPYING or https://www.opensource.org/licenses/mit-license.php. * ***********************************************************************/ -/* 4-limb field multiplication and squaring using the bottom 4-limbs of - * a 5-limb representation. First reduce the 5-limb inputs to fully - * reduced 4-limb forms, then multiply and finally output a half reduced - * output in 5-limb form. The leading limb is of atmost 33 bits. - * - * Major instructions used in the assemblies: mulx/add/adc. 
+/* + * 64-bit field multiplication and squaring using the bottom 4-limbs of + * two field elements having 5-limb representation such that the fifth + * limb is of at most 64 bits. The 5-limb inputs are fully reduced first + * to 4-limb forms, then multiplied, after which a field element in 5-limb + * form is reported as output. The fifth limb of the output has at most + * 33 bits. */ .att_syntax .text .p2align 4 -.global secp256k1_fe_mul_inner -secp256k1_fe_mul_inner: +.global secp256k1_fe_mul_55to5 +secp256k1_fe_mul_55to5: movq %rsp,%r11 subq $112,%rsp @@ -160,8 +165,8 @@ movq %r11,%rsp ret .p2align 4 -.global secp256k1_fe_sqr_inner -secp256k1_fe_sqr_inner: +.global secp256k1_fe_sqr_5to5 +secp256k1_fe_sqr_5to5: movq %rsp,%r11 subq $64,%rsp @@ -219,7 +224,7 @@ shld $1,%r12,%r13 shld $1,%r11,%r12 shld $1,%r10,%r11 shld $1,%r9,%r10 -shlq $1,%r9 +addq %r9,%r9 movq %rbp,%rdx mulx %rdx,%r8,%rax @@ -278,3 +283,345 @@ movq 48(%rsp),%rbx movq %r11,%rsp ret + +/* + * 64-bit field multiplication and squaring using the bottom 4-limbs of + * two field elements having 5-limb representation such that the fifth + * limb is zero. A field element in 5-limb form is reported as output + * such that the fifth limb is of at most 33 bits. + */ + +.p2align 4 +.global secp256k1_fe_mul_44to5 +secp256k1_fe_mul_44to5: +movq %rsp,%r11 +subq $64,%rsp + +movq %r11,0(%rsp) +movq %r12,8(%rsp) +movq %r13,16(%rsp) +movq %r14,24(%rsp) +movq %r15,32(%rsp) +movq %rbp,40(%rsp) +movq %rbx,48(%rsp) +movq %rdi,56(%rsp) + +movq %rdx,%rdi + +movq 0(%rdi),%rdx +mulx 0(%rsi),%r8,%r9 +mulx 8(%rsi),%rcx,%r10 +addq %rcx,%r9 +mulx 16(%rsi),%rcx,%r11 +adcq %rcx,%r10 +mulx 24(%rsi),%rcx,%r12 +adcq %rcx,%r11 +adcq $0,%r12 + +movq 8(%rdi),%rdx +mulx 0(%rsi),%rax,%rbx +mulx 8(%rsi),%rcx,%rbp +addq %rcx,%rbx +mulx 16(%rsi),%rcx,%r15 +adcq %rcx,%rbp +mulx 24(%rsi),%rcx,%r13 +adcq %rcx,%r15 +adcq $0,%r13 +addq %rax,%r9 +adcq %rbx,%r10 +adcq %rbp,%r11 +adcq %r15,%r12 +adcq $0,%r13 + +movq 16(%rdi),%rdx +mulx 0(%rsi),%rax,%rbx +mulx 8(%rsi),%rcx,%rbp +addq %rcx,%rbx +mulx 16(%rsi),%rcx,%r15 +adcq %rcx,%rbp +mulx 24(%rsi),%rcx,%r14 +adcq %rcx,%r15 +adcq $0,%r14 +addq %rax,%r10 +adcq %rbx,%r11 +adcq %rbp,%r12 +adcq %r15,%r13 +adcq $0,%r14 + +movq 24(%rdi),%rdx +mulx 0(%rsi),%rax,%rbx +mulx 8(%rsi),%rcx,%rbp +addq %rcx,%rbx +mulx 16(%rsi),%rcx,%r15 +adcq %rcx,%rbp +mulx 24(%rsi),%rcx,%rsi +adcq %rcx,%r15 +adcq $0,%rsi +addq %rax,%r11 +adcq %rbx,%r12 +adcq %rbp,%r13 +adcq %r15,%r14 +adcq $0,%rsi + +movq $0x1000003D1,%rdx +mulx %r12,%r12,%rbx +mulx %r13,%r13,%rcx +addq %rbx,%r13 +mulx %r14,%r14,%rbx +adcq %rcx,%r14 +mulx %rsi,%r15,%rcx +adcq %rbx,%r15 +adcq $0,%rcx +addq %r12,%r8 +adcq %r13,%r9 +adcq %r14,%r10 +adcq %r15,%r11 +adcq $0,%rcx + +movq 56(%rsp),%rdi +movq %r8,0(%rdi) +movq %r9,8(%rdi) +movq %r10,16(%rdi) +movq %r11,24(%rdi) +movq %rcx,32(%rdi) + +movq 0(%rsp),%r11 +movq 8(%rsp),%r12 +movq 16(%rsp),%r13 +movq 24(%rsp),%r14 +movq 32(%rsp),%r15 +movq 40(%rsp),%rbp +movq 48(%rsp),%rbx + +movq %r11,%rsp + +ret + +.p2align 4 +.global secp256k1_fe_sqr_4to5 +secp256k1_fe_sqr_4to5: +movq %rsp,%r11 +subq $56,%rsp + +movq %r11,0(%rsp) +movq %r12,8(%rsp) +movq %r13,16(%rsp) +movq %r14,24(%rsp) +movq %r15,32(%rsp) +movq %rbp,40(%rsp) +movq %rbx,48(%rsp) + +movq 0(%rsi),%rdx +mulx 8(%rsi),%r9,%r10 +mulx 16(%rsi),%rcx,%r11 +addq %rcx,%r10 +mulx 24(%rsi),%rcx,%r12 +adcq %rcx,%r11 +adcq $0,%r12 + +movq 8(%rsi),%rdx +mulx 16(%rsi),%rax,%rbx +mulx 24(%rsi),%rcx,%r13 +addq %rcx,%rbx +adcq $0,%r13 +addq %rax,%r11 +adcq %rbx,%r12 +adcq $0,%r13 + +movq 
16(%rsi),%rdx +mulx 24(%rsi),%rax,%r14 +addq %rax,%r13 +adcq $0,%r14 + +movq $0,%r15 +shld $1,%r14,%r15 +shld $1,%r13,%r14 +shld $1,%r12,%r13 +shld $1,%r11,%r12 +shld $1,%r10,%r11 +shld $1,%r9,%r10 +addq %r9,%r9 + +movq 0(%rsi),%rdx +mulx %rdx,%r8,%rax +addq %rax,%r9 + +movq 8(%rsi),%rdx +mulx %rdx,%rax,%rbx +adcq %rax,%r10 +adcq %rbx,%r11 + +movq 16(%rsi),%rdx +mulx %rdx,%rax,%rbx +adcq %rax,%r12 +adcq %rbx,%r13 + +movq 24(%rsi),%rdx +mulx %rdx,%rax,%rbx +adcq %rax,%r14 +adcq %rbx,%r15 + +movq $0x1000003D1,%rdx +mulx %r12,%r12,%rbx +mulx %r13,%r13,%rcx +addq %rbx,%r13 +mulx %r14,%r14,%rbx +adcq %rcx,%r14 +mulx %r15,%r15,%rcx +adcq %rbx,%r15 +adcq $0,%rcx +addq %r12,%r8 +adcq %r13,%r9 +adcq %r14,%r10 +adcq %r15,%r11 +adcq $0,%rcx + +movq %r8,0(%rsi) +movq %r9,8(%rsi) +movq %r10,16(%rsi) +movq %r11,24(%rsi) +movq %rcx,32(%rsi) + +movq 0(%rsp),%r11 +movq 8(%rsp),%r12 +movq 16(%rsp),%r13 +movq 24(%rsp),%r14 +movq 32(%rsp),%r15 +movq 40(%rsp),%rbp +movq 48(%rsp),%rbx + +movq %r11,%rsp + +ret + +/* 64-bit field multiplication in which the first argument has 4-limb + * and the second argument has 5-limb representations such that the + * fifth limb is of at most 64 bits. The second argument is fully + * reduced to 4-limb form and then field multiplication is performed. + * A field element in 5-limb form is reported as output such that the + * fifth limb is of at most 33 bits. + */ + +.p2align 4 +.global secp256k1_fe_mul_45to5 +secp256k1_fe_mul_45to5: +movq %rsp,%r11 +subq $88,%rsp + +movq %r11,0(%rsp) +movq %r12,8(%rsp) +movq %r13,16(%rsp) +movq %r14,24(%rsp) +movq %r15,32(%rsp) +movq %rbp,40(%rsp) +movq %rbx,48(%rsp) + +movq 0(%rdx),%r12 +movq 8(%rdx),%r13 +movq 16(%rdx),%r14 +movq 24(%rdx),%r15 +movq 32(%rdx),%rax + +movq $0x1000003D1,%rdx +xorq %rcx,%rcx +mulx %rax,%rax,%rbx +addq %rax,%r12 +adcq %rbx,%r13 +adcq $0,%r14 +adcq $0,%r15 +cmovc %rdx,%rcx +addq %rcx,%r12 +adcq $0,%r13 + +movq %r12,56(%rsp) +movq %r13,64(%rsp) +movq %r14,72(%rsp) +movq %r15,80(%rsp) + +movq 0(%rsi),%rdx +mulx 56(%rsp),%r8,%r9 +mulx 64(%rsp),%rcx,%r10 +addq %rcx,%r9 +mulx 72(%rsp),%rcx,%r11 +adcq %rcx,%r10 +mulx 80(%rsp),%rcx,%r12 +adcq %rcx,%r11 +adcq $0,%r12 + +movq 8(%rsi),%rdx +mulx 56(%rsp),%rax,%rbx +mulx 64(%rsp),%rcx,%rbp +addq %rcx,%rbx +mulx 72(%rsp),%rcx,%r15 +adcq %rcx,%rbp +mulx 80(%rsp),%rcx,%r13 +adcq %rcx,%r15 +adcq $0,%r13 +addq %rax,%r9 +adcq %rbx,%r10 +adcq %rbp,%r11 +adcq %r15,%r12 +adcq $0,%r13 + +movq 16(%rsi),%rdx +mulx 56(%rsp),%rax,%rbx +mulx 64(%rsp),%rcx,%rbp +addq %rcx,%rbx +mulx 72(%rsp),%rcx,%r15 +adcq %rcx,%rbp +mulx 80(%rsp),%rcx,%r14 +adcq %rcx,%r15 +adcq $0,%r14 +addq %rax,%r10 +adcq %rbx,%r11 +adcq %rbp,%r12 +adcq %r15,%r13 +adcq $0,%r14 + +movq 24(%rsi),%rdx +mulx 56(%rsp),%rax,%rbx +mulx 64(%rsp),%rcx,%rbp +addq %rcx,%rbx +mulx 72(%rsp),%rcx,%r15 +adcq %rcx,%rbp +mulx 80(%rsp),%rcx,%rsi +adcq %rcx,%r15 +adcq $0,%rsi +addq %rax,%r11 +adcq %rbx,%r12 +adcq %rbp,%r13 +adcq %r15,%r14 +adcq $0,%rsi + +movq $0x1000003D1,%rdx +mulx %r12,%r12,%rbx +mulx %r13,%r13,%rcx +addq %rbx,%r13 +mulx %r14,%r14,%rbx +adcq %rcx,%r14 +mulx %rsi,%r15,%rcx +adcq %rbx,%r15 +adcq $0,%rcx +addq %r12,%r8 +adcq %r13,%r9 +adcq %r14,%r10 +adcq %r15,%r11 +adcq $0,%rcx + +movq %r8,0(%rdi) +movq %r9,8(%rdi) +movq %r10,16(%rdi) +movq %r11,24(%rdi) +movq %rcx,32(%rdi) + +movq 0(%rsp),%r11 +movq 8(%rsp),%r12 +movq 16(%rsp),%r13 +movq 24(%rsp),%r14 +movq 32(%rsp),%r15 +movq 40(%rsp),%rbp +movq 48(%rsp),%rbx + +movq %r11,%rsp + +ret diff --git a/src/field_5x64_impl.h b/src/field_5x64_impl.h index 1c587b0332..0325d65e60 
100644 --- a/src/field_5x64_impl.h +++ b/src/field_5x64_impl.h @@ -17,8 +17,10 @@ #if defined(USE_EXTERNAL_ASM) /* External assembler implementation */ -void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b); -void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t *a); +void secp256k1_fe_mul_55to5(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b); +void secp256k1_fe_mul_45to5(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b); +void secp256k1_fe_sqr_5to5(uint64_t *r, const uint64_t *a); +void secp256k1_fe_sqr_4to5(uint64_t *r, const uint64_t *a); #endif #ifdef VERIFY @@ -733,7 +735,7 @@ static void secp256k1_fe_mul(secp256k1_fe *r, const secp256k1_fe *a, const secp2 #endif #if defined(USE_EXTERNAL_ASM) - secp256k1_fe_mul_inner(r->n, a->n, b->n); + secp256k1_fe_mul_55to5(r->n, a->n, b->n); #else mul2(c0,c1,a4,0x1000003D1ULL); a4 = 0; @@ -803,10 +805,12 @@ static void secp256k1_fe_mul(secp256k1_fe *r, const secp256k1_fe *a, const secp2 } static void secp256k1_fe_mul_prec(secp256k1_fe *r, const secp256k1_fe *a, const secp256k1_fe * SECP256K1_RESTRICT b_prec) { +#ifndef USE_EXTERNAL_ASM uint64_t a0 = a->n[0], a1 = a->n[1], a2 = a->n[2], a3 = a->n[3], a4 = a->n[4]; uint64_t b0 = b_prec->n[0], b1 = b_prec->n[1], b2 = b_prec->n[2], b3 = b_prec->n[3]; uint64_t c0 = 0, c1 = 0, c2 = 0, c3 = 0, c4 = 0, c5 = 0, c6 = 0, c7 = 0; uint64_t d0 = 0, d1 = 0, d2 = 0, d3 = 0, d4 = 0; +#endif #ifdef VERIFY VERIFY_CHECK(a->magnitude <= 2048); @@ -818,6 +822,9 @@ static void secp256k1_fe_mul_prec(secp256k1_fe *r, const secp256k1_fe *a, const VERIFY_CHECK(a != b_prec); #endif +#if defined(USE_EXTERNAL_ASM) + secp256k1_fe_mul_45to5(r->n, b_prec->n, a->n); +#else mul2(c0,c1,a4,0x1000003D1ULL); a4 = 0; add2(c0,c1,a0); @@ -862,6 +869,7 @@ static void secp256k1_fe_mul_prec(secp256k1_fe *r, const secp256k1_fe *a, const add2(d3,d4,c3); r->n[3] = d3; r->n[4] = d4; +#endif #ifdef VERIFY r->magnitude = 1; @@ -884,7 +892,7 @@ static void secp256k1_fe_sqr(secp256k1_fe *r, const secp256k1_fe *a) { #endif #if defined(USE_EXTERNAL_ASM) - secp256k1_fe_sqr_inner(r->n, a->n); + secp256k1_fe_sqr_5to5(r->n, a->n); #else /* Bring a to [0,2**256). */ mul2(c0,c1,a4,0x1000003D1ULL); @@ -935,9 +943,11 @@ static void secp256k1_fe_sqr(secp256k1_fe *r, const secp256k1_fe *a) { } static void secp256k1_fe_sqr_prec(secp256k1_fe *r, const secp256k1_fe *a_prec) { +#ifndef USE_EXTERNAL_ASM uint64_t a0 = a_prec->n[0], a1 = a_prec->n[1], a2 = a_prec->n[2], a3 = a_prec->n[3]; uint64_t c0 = 0, c1 = 0, c2 = 0, c3 = 0, c4 = 0, c5 = 0, c6 = 0, c7 = 0; uint64_t d0 = 0, d1 = 0, d2 = 0, d3 = 0, d4 = 0; +#endif #ifdef VERIFY VERIFY_CHECK(a_prec->precomputed); @@ -945,6 +955,9 @@ static void secp256k1_fe_sqr_prec(secp256k1_fe *r, const secp256k1_fe *a_prec) { secp256k1_fe_verify(a_prec); #endif +#if defined(USE_EXTERNAL_ASM) + secp256k1_fe_sqr_4to5(r->n, a_prec->n); +#else /* Compute 512-bit product. */ c0 = 0; c1 = 0; @@ -973,6 +986,7 @@ static void secp256k1_fe_sqr_prec(secp256k1_fe *r, const secp256k1_fe *a_prec) { add2(d3,d4,c3); r->n[3] = d3; r->n[4] = d4; +#endif #ifdef VERIFY r->magnitude = 1;
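For the *_44to5 variants the comments state that the inputs already fit in four limbs, so the work reduces to a 4x4 schoolbook multiplication followed by one fold of the upper 256 bits of the product. A rough C equivalent, again with my own names and assuming unsigned __int128, rather than the exact code behind the assembly:

#include <stdint.h>

/* 4x4 schoolbook multiplication, then fold the upper 256 bits of the
 * 512-bit product once:  hi*2^256 == hi*0x1000003D1 (mod p). */
static void fe_mul_4x4_to5(uint64_t r[5], const uint64_t a[4], const uint64_t b[4]) {
    const uint64_t C = 0x1000003D1ULL;          /* 2^256 mod p */
    uint64_t t[8] = {0};
    unsigned __int128 acc;
    int i, j;

    for (i = 0; i < 4; i++) {                   /* 512-bit product into t[0..7] */
        uint64_t carry = 0;
        for (j = 0; j < 4; j++) {
            unsigned __int128 m = (unsigned __int128)a[i] * b[j] + t[i + j] + carry;
            t[i + j] = (uint64_t)m;
            carry    = (uint64_t)(m >> 64);
        }
        t[i + 4] = carry;
    }

    acc = 0;
    for (i = 0; i < 4; i++) {                   /* r = t[0..3] + t[4..7]*C */
        acc += (unsigned __int128)t[i + 4] * C + t[i];
        r[i] = (uint64_t)acc;
        acc >>= 64;
    }
    r[4] = (uint64_t)acc;                       /* about 2^32, i.e. at most 33 bits */
}

Since C is just over 2^32, the folded fifth limb stays within the 33-bit bound stated in the comments; the *_to4 variants appear to keep folding until the result fits back into four limbs.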