diff --git a/src/asm/field_5x64_x86_64_gen.s b/src/asm/field_5x64_x86_64_gen.s index 596e7e9596..ee29792512 100644 --- a/src/asm/field_5x64_x86_64_gen.s +++ b/src/asm/field_5x64_x86_64_gen.s @@ -1,23 +1,28 @@ -/*********************************************************************** - * Copyright (c) 2021 Kaushik Nath * - * Distributed under the MIT software license, see the accompanying * - * file COPYING or https://www.opensource.org/licenses/mit-license.php.* +/************************************************************************ + * Field multiplication and squaring assemblies using representation of * + * field elements in base 2^{64}. * + * Major instructions used in the assemblies are mul/add/adc. * + * * + * Copyright (c) 2021 Kaushik Nath * + * Distributed under the MIT software license, see the accompanying * + * file COPYING or https://www.opensource.org/licenses/mit-license.php. * ***********************************************************************/ -/* 4-limb field multiplication and squaring using the bottom 4-limbs of - * a 5-limb representation. First reduce the 5-limb inputs to fully - * reduced 4-limb forms, then multiply and finally output a half reduced - * output in 5-limb form. The leading limb is of atmost 33 bits. - * - * Major instructions used in the assemblies: mul/add/adc. +/* + * 64-bit field multiplication and squaring using the bottom 4-limbs of + * two field elements having 5-limb representation such that the fifth + * limb is of at most 64 bits. The 5-limb inputs are fully reduced first + * to 4-limb forms, then multiplied, after which a field element in 5-limb + * form is reported as output. The fifth limb of the output has at most + * 33 bits. */ .att_syntax .text .p2align 4 -.global secp256k1_fe_mul_inner -secp256k1_fe_mul_inner: +.global secp256k1_fe_mul_55to5 +secp256k1_fe_mul_55to5: movq %rsp,%r11 subq $96,%rsp @@ -228,8 +233,8 @@ movq %r11,%rsp ret .p2align 4 -.global secp256k1_fe_sqr_inner -secp256k1_fe_sqr_inner: +.global secp256k1_fe_sqr_5to5 +secp256k1_fe_sqr_5to5: movq %rsp,%r11 subq $64,%rsp @@ -401,3 +406,555 @@ movq 48(%rsp),%rbp movq %r11,%rsp ret + +/* + * 64-bit field multiplication and squaring using the bottom 4-limbs of + * two field elements having 5-limb representation such that the fifth + * limb is zero. A field element in 5-limb form is reported as output + * such that the fifth limb is of at most 33 bits. 
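+ *
+ * For reference: the folding used throughout relies on the identity
+ * 2^256 == 0x1000003D1 (mod p) for the secp256k1 prime
+ * p = 2^256 - 2^32 - 977, so any bits at or above 2^256 can be
+ * multiplied by 0x1000003D1 and added back into the bottom limbs.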
+ */ + +.p2align 4 +.global secp256k1_fe_mul_44to5 +secp256k1_fe_mul_44to5: +movq %rsp,%r11 +subq $48,%rsp + +movq %r11,0(%rsp) +movq %r12,8(%rsp) +movq %r13,16(%rsp) +movq %r14,24(%rsp) +movq %r15,32(%rsp) +movq %rbx,40(%rsp) + +movq %rdx,%rcx +movq $0x1000003D1,%rbx + +movq 8(%rsi),%rax +mulq 24(%rcx) +movq %rax,%r8 +xorq %r9,%r9 +movq %rdx,%r10 +xorq %r11,%r11 + +movq 16(%rsi),%rax +mulq 16(%rcx) +addq %rax,%r8 +adcq $0,%r9 +addq %rdx,%r10 +adcq $0,%r11 + +movq 24(%rsi),%rax +mulq 8(%rcx) +addq %rax,%r8 +adcq $0,%r9 +addq %rdx,%r10 +adcq $0,%r11 + +movq 16(%rsi),%rax +mulq 24(%rcx) +addq %rax,%r10 +adcq $0,%r11 +movq %rdx,%r12 +xorq %r13,%r13 + +movq 24(%rsi),%rax +mulq 16(%rcx) +addq %rax,%r10 +adcq $0,%r11 +addq %rdx,%r12 +adcq $0,%r13 + +movq %rbx,%rax +mulq %r10 +imul %rbx,%r11 +movq %rax,%r10 +addq %rdx,%r11 + +movq 24(%rsi),%rax +mulq 24(%rcx) +addq %rax,%r12 +adcq $0,%r13 + +movq %rbx,%rax +mulq %rdx +movq %rax,%r14 +movq %rdx,%r15 + +movq %rbx,%rax +mulq %r12 +imul %rbx,%r13 +movq %rax,%r12 +addq %rdx,%r13 + +movq 0(%rsi),%rax +mulq 24(%rcx) +addq %rax,%r14 +adcq $0,%r15 +addq %rdx,%r8 +adcq $0,%r9 + +movq 8(%rsi),%rax +mulq 16(%rcx) +addq %rax,%r14 +adcq $0,%r15 +addq %rdx,%r8 +adcq $0,%r9 + +movq 16(%rsi),%rax +mulq 8(%rcx) +addq %rax,%r14 +adcq $0,%r15 +addq %rdx,%r8 +adcq $0,%r9 + +movq 24(%rsi),%rax +mulq 0(%rcx) +addq %rax,%r14 +adcq $0,%r15 +addq %rdx,%r8 +adcq $0,%r9 + +movq %rbx,%rax +mulq %r8 +imul %rbx,%r9 +movq %rax,%r8 +addq %rdx,%r9 + +movq 0(%rsi),%rax +mulq 0(%rcx) +addq %rax,%r8 +adcq $0,%r9 +addq %rdx,%r10 +adcq $0,%r11 + +movq 0(%rsi),%rax +mulq 8(%rcx) +addq %rax,%r10 +adcq $0,%r11 +addq %rdx,%r12 +adcq $0,%r13 + +movq 8(%rsi),%rax +mulq 0(%rcx) +addq %rax,%r10 +adcq $0,%r11 +addq %rdx,%r12 +adcq $0,%r13 + +movq 0(%rsi),%rax +mulq 16(%rcx) +addq %rax,%r12 +adcq $0,%r13 +addq %rdx,%r14 +adcq $0,%r15 + +movq 8(%rsi),%rax +mulq 8(%rcx) +addq %rax,%r12 +adcq $0,%r13 +addq %rdx,%r14 +adcq $0,%r15 + +movq 16(%rsi),%rax +mulq 0(%rcx) +addq %rax,%r12 +adcq $0,%r13 +addq %rdx,%r14 +adcq $0,%r15 + +addq %r9,%r10 +adcq $0,%r11 + +addq %r11,%r12 +adcq $0,%r13 + +addq %r13,%r14 +adcq $0,%r15 + +movq %r8,0(%rdi) +movq %r10,8(%rdi) +movq %r12,16(%rdi) +movq %r14,24(%rdi) +movq %r15,32(%rdi) + +movq 0(%rsp),%r11 +movq 8(%rsp),%r12 +movq 16(%rsp),%r13 +movq 24(%rsp),%r14 +movq 32(%rsp),%r15 +movq 40(%rsp),%rbx + +movq %r11,%rsp + +ret + +.p2align 4 +.global secp256k1_fe_sqr_4to5 +secp256k1_fe_sqr_4to5: +movq %rsp,%r11 +subq $64,%rsp + +movq %r11,0(%rsp) +movq %r12,8(%rsp) +movq %r13,16(%rsp) +movq %r14,24(%rsp) +movq %r15,32(%rsp) +movq %rbx,40(%rsp) +movq %rbp,48(%rsp) +movq %rdi,56(%rsp) + +movq 0(%rsi),%rbx +movq 8(%rsi),%rbp +movq 16(%rsi),%rcx +movq 24(%rsi),%rdi + +movq $0x1000003D1,%rsi + +movq %rbp,%rax +mulq %rdi +movq %rax,%r8 +xorq %r9,%r9 +movq %rdx,%r10 +xorq %r11,%r11 +addq %rax,%r8 +adcq $0,%r9 +addq %rdx,%r10 +adcq $0,%r11 + +movq %rcx,%rax +mulq %rcx +addq %rax,%r8 +adcq $0,%r9 +addq %rdx,%r10 +adcq $0,%r11 + +movq %rcx,%rax +mulq %rdi +addq %rax,%r10 +adcq $0,%r11 +movq %rdx,%r12 +xorq %r13,%r13 +addq %rax,%r10 +adcq $0,%r11 +addq %rdx,%r12 +adcq $0,%r13 + +movq %rsi,%rax +mulq %r10 +imul %rsi,%r11 +movq %rax,%r10 +addq %rdx,%r11 + +movq %rdi,%rax +mulq %rdi +addq %rax,%r12 +adcq $0,%r13 + +movq %rsi,%rax +mulq %rdx +movq %rax,%r14 +movq %rdx,%r15 + +movq %rsi,%rax +mulq %r12 +imul %rsi,%r13 +movq %rax,%r12 +addq %rdx,%r13 + +movq %rbx,%rax +mulq %rdi +addq %rax,%r14 +adcq $0,%r15 +addq %rdx,%r8 +adcq $0,%r9 +addq %rax,%r14 +adcq $0,%r15 +addq %rdx,%r8 +adcq 
$0,%r9
+
+movq %rbp,%rax
+mulq %rcx
+addq %rax,%r14
+adcq $0,%r15
+addq %rdx,%r8
+adcq $0,%r9
+addq %rax,%r14
+adcq $0,%r15
+addq %rdx,%r8
+adcq $0,%r9
+
+movq %rsi,%rax
+mulq %r8
+imul %rsi,%r9
+movq %rax,%r8
+addq %rdx,%r9
+
+movq %rbx,%rax
+mulq %rbx
+addq %rax,%r8
+adcq $0,%r9
+addq %rdx,%r10
+adcq $0,%r11
+
+movq %rbx,%rax
+mulq %rbp
+addq %rax,%r10
+adcq $0,%r11
+addq %rdx,%r12
+adcq $0,%r13
+addq %rax,%r10
+adcq $0,%r11
+addq %rdx,%r12
+adcq $0,%r13
+
+movq %rbx,%rax
+mulq %rcx
+addq %rax,%r12
+adcq $0,%r13
+addq %rdx,%r14
+adcq $0,%r15
+addq %rax,%r12
+adcq $0,%r13
+addq %rdx,%r14
+adcq $0,%r15
+
+movq %rbp,%rax
+mulq %rbp
+addq %rax,%r12
+adcq $0,%r13
+addq %rdx,%r14
+adcq $0,%r15
+
+movq %r10,%rbp
+addq %r9,%rbp
+adcq $0,%r11
+
+movq %r12,%rcx
+addq %r11,%rcx
+adcq $0,%r13
+
+addq %r13,%r14
+adcq $0,%r15
+
+movq 56(%rsp),%rdi
+
+movq %r8,0(%rdi)
+movq %rbp,8(%rdi)
+movq %rcx,16(%rdi)
+movq %r14,24(%rdi)
+movq %r15,32(%rdi)
+
+movq 0(%rsp),%r11
+movq 8(%rsp),%r12
+movq 16(%rsp),%r13
+movq 24(%rsp),%r14
+movq 32(%rsp),%r15
+movq 40(%rsp),%rbx
+movq 48(%rsp),%rbp
+
+movq %r11,%rsp
+
+ret
+
+/* 64-bit field multiplication in which the first argument has 4-limb
+ * and the second argument has 5-limb representations such that the
+ * fifth limb is of at most 64 bits. The second argument is fully
+ * reduced to 4-limb form and then field multiplication is performed.
+ * A field element in 5-limb form is reported as output such that the
+ * fifth limb is of at most 33 bits.
+ */
+
+.p2align 4
+.global secp256k1_fe_mul_45to5
+secp256k1_fe_mul_45to5:
+movq %rsp,%r11
+subq $72,%rsp
+
+movq %r11,0(%rsp)
+movq %r12,8(%rsp)
+movq %r13,16(%rsp)
+movq %r14,24(%rsp)
+movq %r15,32(%rsp)
+movq %rbx,40(%rsp)
+movq %rbp,48(%rsp)
+movq %rdi,56(%rsp)
+
+movq $0x1000003d1,%rcx
+
+movq 0(%rdx),%r8
+movq 8(%rdx),%r9
+movq 16(%rdx),%rbx
+movq 24(%rdx),%rbp
+movq 32(%rdx),%rax
+
+mulq %rcx
+xorq %rdi,%rdi
+addq %r8,%rax
+adcq %r9,%rdx
+adcq $0,%rbx
+adcq $0,%rbp
+cmovc %rcx,%rdi
+addq %rax,%rdi
+adcq $0,%rdx
+movq %rdx,64(%rsp)
+
+movq 8(%rsi),%rax
+mulq %rbp
+movq %rax,%r8
+xorq %r9,%r9
+movq %rdx,%r10
+xorq %r11,%r11
+
+movq 16(%rsi),%rax
+mulq %rbx
+addq %rax,%r8
+adcq $0,%r9
+addq %rdx,%r10
+adcq $0,%r11
+
+movq 24(%rsi),%rax
+mulq 64(%rsp)
+addq %rax,%r8
+adcq $0,%r9
+addq %rdx,%r10
+adcq $0,%r11
+
+movq 16(%rsi),%rax
+mulq %rbp
+addq %rax,%r10
+adcq $0,%r11
+movq %rdx,%r12
+xorq %r13,%r13
+
+movq 24(%rsi),%rax
+mulq %rbx
+addq %rax,%r10
+adcq $0,%r11
+addq %rdx,%r12
+adcq $0,%r13
+
+movq %rcx,%rax
+mulq %r10
+imul %rcx,%r11
+movq %rax,%r10
+addq %rdx,%r11
+
+movq 24(%rsi),%rax
+mulq %rbp
+addq %rax,%r12
+adcq $0,%r13
+
+movq %rcx,%rax
+mulq %rdx
+movq %rax,%r14
+movq %rdx,%r15
+
+movq %rcx,%rax
+mulq %r12
+imul %rcx,%r13
+movq %rax,%r12
+addq %rdx,%r13
+
+movq 0(%rsi),%rax
+mulq %rbp
+addq %rax,%r14
+adcq $0,%r15
+addq %rdx,%r8
+adcq $0,%r9
+
+movq 8(%rsi),%rax
+mulq %rbx
+addq %rax,%r14
+adcq $0,%r15
+addq %rdx,%r8
+adcq $0,%r9
+
+movq 16(%rsi),%rax
+mulq 64(%rsp)
+addq %rax,%r14
+adcq $0,%r15
+addq %rdx,%r8
+adcq $0,%r9
+
+movq 24(%rsi),%rax
+mulq %rdi
+addq %rax,%r14
+adcq $0,%r15
+addq %rdx,%r8
+adcq $0,%r9
+
+movq %rcx,%rax
+mulq %r8
+imul %rcx,%r9
+movq %rax,%r8
+addq %rdx,%r9
+
+movq 0(%rsi),%rax
+mulq %rdi
+addq %rax,%r8
+adcq $0,%r9
+addq %rdx,%r10
+adcq $0,%r11
+
+movq 0(%rsi),%rax
+mulq 64(%rsp)
+addq %rax,%r10
+adcq $0,%r11
+addq %rdx,%r12
+adcq $0,%r13
+
+movq 8(%rsi),%rax
+mulq %rdi
+addq %rax,%r10
+adcq $0,%r11
+addq %rdx,%r12
+adcq $0,%r13
+
+movq 0(%rsi),%rax
+mulq %rbx
+addq 
%rax,%r12 +adcq $0,%r13 +addq %rdx,%r14 +adcq $0,%r15 + +movq 8(%rsi),%rax +mulq 64(%rsp) +addq %rax,%r12 +adcq $0,%r13 +addq %rdx,%r14 +adcq $0,%r15 + +movq 16(%rsi),%rax +mulq %rdi +addq %rax,%r12 +adcq $0,%r13 +addq %rdx,%r14 +adcq $0,%r15 + +addq %r9,%r10 +adcq $0,%r11 +addq %r11,%r12 +adcq $0,%r13 +addq %r13,%r14 +adcq $0,%r15 + +movq 56(%rsp),%rdi + +movq %r8,0(%rdi) +movq %r10,8(%rdi) +movq %r12,16(%rdi) +movq %r14,24(%rdi) +movq %r15,32(%rdi) + +movq 0(%rsp),%r11 +movq 8(%rsp),%r12 +movq 16(%rsp),%r13 +movq 24(%rsp),%r14 +movq 32(%rsp),%r15 +movq 40(%rsp),%rbx +movq 48(%rsp),%rbp + +movq %r11,%rsp + +ret diff --git a/src/asm/field_5x64_x86_64_maax.s b/src/asm/field_5x64_x86_64_maax.s index 79b7afb816..b06ec13af3 100644 --- a/src/asm/field_5x64_x86_64_maax.s +++ b/src/asm/field_5x64_x86_64_maax.s @@ -1,23 +1,28 @@ -/*********************************************************************** - * Copyright (c) 2021 Kaushik Nath * - * Distributed under the MIT software license, see the accompanying * - * file COPYING or https://www.opensource.org/licenses/mit-license.php.* +/************************************************************************ + * Field multiplication and squaring assemblies using representation of * + * field elements in base 2^{64}. * + * Major instructions used in the assemblies are mulx/adcx/adox. * + * * + * Copyright (c) 2021 Kaushik Nath * + * Distributed under the MIT software license, see the accompanying * + * file COPYING or https://www.opensource.org/licenses/mit-license.php. * ***********************************************************************/ -/* 4-limb field multiplication and squaring using the bottom 4-limbs of - * a 5-limb representation. First reduce the 5-limb inputs to fully - * reduced 4-limb forms, then multiply and finally output a half reduced - * output in 5-limb form. The leading limb is of atmost 33 bits. - * - * Major instructions used in the assemblies: mulx/adcx/adox. +/* + * 64-bit field multiplication and squaring using the bottom 4-limbs of + * two field elements having 5-limb representation such that the fifth + * limb is of at most 64 bits. The 5-limb inputs are fully reduced first + * to 4-limb forms, then multiplied, after which a field element in 5-limb + * form is reported as output. The fifth limb of the output has at most + * 33 bits. */ .att_syntax .text .p2align 4 -.global secp256k1_fe_mul_inner -secp256k1_fe_mul_inner: +.global secp256k1_fe_mul_55to5 +secp256k1_fe_mul_55to5: movq %rsp,%r11 subq $96,%rsp @@ -163,8 +168,8 @@ movq %r11,%rsp ret .p2align 4 -.global secp256k1_fe_sqr_inner -secp256k1_fe_sqr_inner: +.global secp256k1_fe_sqr_5to5 +secp256k1_fe_sqr_5to5: movq %rsp,%r11 subq $56,%rsp @@ -279,3 +284,348 @@ movq 48(%rsp),%rbx movq %r11,%rsp ret + +/* + * 64-bit field multiplication and squaring using the bottom 4-limbs of + * two field elements having 5-limb representation such that the fifth + * limb is zero. A field element in 5-limb form is reported as output + * such that the fifth limb is of at most 33 bits. 
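+ *
+ * The 33-bit bound on the fifth limb follows from the folding constant:
+ * the upper 256 bits of the 512-bit product are multiplied by the 33-bit
+ * value 0x1000003D1 (== 2^256 mod p), so the carry that remains in the
+ * fifth limb is the high word of a 64x33-bit product plus a few carries.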
+ */
+
+.p2align 4
+.global secp256k1_fe_mul_44to5
+secp256k1_fe_mul_44to5:
+push %rbp
+push %rbx
+push %r12
+push %r13
+push %r14
+push %r15
+
+movq %rdx,%rbx
+
+xorq %r13,%r13
+movq 0(%rbx),%rdx
+mulx 0(%rsi),%r8,%r9
+mulx 8(%rsi),%rcx,%r10
+adcx %rcx,%r9
+mulx 16(%rsi),%rcx,%r11
+adcx %rcx,%r10
+mulx 24(%rsi),%rcx,%r12
+adcx %rcx,%r11
+adcx %r13,%r12
+
+xorq %r14,%r14
+movq 8(%rbx),%rdx
+mulx 0(%rsi),%rcx,%rbp
+adcx %rcx,%r9
+adox %rbp,%r10
+mulx 8(%rsi),%rcx,%rbp
+adcx %rcx,%r10
+adox %rbp,%r11
+mulx 16(%rsi),%rcx,%rbp
+adcx %rcx,%r11
+adox %rbp,%r12
+mulx 24(%rsi),%rcx,%rbp
+adcx %rcx,%r12
+adox %rbp,%r13
+adcx %r14,%r13
+
+xorq %r15,%r15
+movq 16(%rbx),%rdx
+mulx 0(%rsi),%rcx,%rbp
+adcx %rcx,%r10
+adox %rbp,%r11
+mulx 8(%rsi),%rcx,%rbp
+adcx %rcx,%r11
+adox %rbp,%r12
+mulx 16(%rsi),%rcx,%rbp
+adcx %rcx,%r12
+adox %rbp,%r13
+mulx 24(%rsi),%rcx,%rbp
+adcx %rcx,%r13
+adox %rbp,%r14
+adcx %r15,%r14
+
+xorq %rax,%rax
+movq 24(%rbx),%rdx
+mulx 0(%rsi),%rcx,%rbp
+adcx %rcx,%r11
+adox %rbp,%r12
+mulx 8(%rsi),%rcx,%rbp
+adcx %rcx,%r12
+adox %rbp,%r13
+mulx 16(%rsi),%rcx,%rbp
+adcx %rcx,%r13
+adox %rbp,%r14
+mulx 24(%rsi),%rcx,%rbp
+adcx %rcx,%r14
+adox %rbp,%r15
+adcx %rax,%r15
+
+xorq %rbp,%rbp
+movq $0x1000003D1,%rdx
+mulx %r12,%rax,%r12
+adcx %rax,%r8
+adox %r12,%r9
+mulx %r13,%rcx,%r13
+adcx %rcx,%r9
+adox %r13,%r10
+mulx %r14,%rcx,%r14
+adcx %rcx,%r10
+adox %r14,%r11
+mulx %r15,%rcx,%r15
+adcx %rcx,%r11
+adox %rbp,%r15
+adcx %rbp,%r15
+
+movq %r8,0(%rdi)
+movq %r9,8(%rdi)
+movq %r10,16(%rdi)
+movq %r11,24(%rdi)
+movq %r15,32(%rdi)
+
+pop %r15
+pop %r14
+pop %r13
+pop %r12
+pop %rbx
+pop %rbp
+
+ret
+
+.p2align 4
+.global secp256k1_fe_sqr_4to5
+secp256k1_fe_sqr_4to5:
+push %rbp
+push %rbx
+push %r12
+push %r13
+push %r14
+push %r15
+
+movq 0(%rsi),%rbx
+movq 8(%rsi),%rbp
+movq 16(%rsi),%rax
+movq 24(%rsi),%rsi
+
+xorq %r13,%r13
+movq %rbx,%rdx
+mulx %rbp,%r9,%r10
+mulx %rax,%rcx,%r11
+adcx %rcx,%r10
+mulx %rsi,%rcx,%r12
+adcx %rcx,%r11
+adcx %r13,%r12
+
+xorq %r14,%r14
+movq %rbp,%rdx
+mulx %rax,%rcx,%rdx
+adcx %rcx,%r11
+adox %rdx,%r12
+movq %rbp,%rdx
+mulx %rsi,%rcx,%rdx
+adcx %rcx,%r12
+adox %rdx,%r13
+adcx %r14,%r13
+
+xorq %r15,%r15
+movq %rax,%rdx
+mulx %rsi,%rcx,%r14
+adcx %rcx,%r13
+adcx %r15,%r14
+
+shld $1,%r14,%r15
+shld $1,%r13,%r14
+shld $1,%r12,%r13
+shld $1,%r11,%r12
+shld $1,%r10,%r11
+shld $1,%r9,%r10
+addq %r9,%r9
+
+xorq %rdx,%rdx
+movq %rbx,%rdx
+mulx %rdx,%r8,%rdx
+adcx %rdx,%r9
+
+movq %rbp,%rdx
+mulx %rdx,%rcx,%rdx
+adcx %rcx,%r10
+adcx %rdx,%r11
+
+movq %rax,%rdx
+mulx %rdx,%rcx,%rdx
+adcx %rcx,%r12
+adcx %rdx,%r13
+
+movq %rsi,%rdx
+mulx %rdx,%rcx,%rdx
+adcx %rcx,%r14
+adcx %rdx,%r15
+
+xorq %rbp,%rbp
+movq $0x1000003D1,%rdx
+mulx %r12,%rbx,%r12
+adcx %r8,%rbx
+adox %r9,%r12
+mulx %r13,%rcx,%rax
+adcx %rcx,%r12
+adox %r10,%rax
+mulx %r14,%rcx,%rsi
+adcx %rcx,%rax
+adox %r11,%rsi
+mulx %r15,%rcx,%r15
+adcx %rcx,%rsi
+adox %rbp,%r15
+adcx %rbp,%r15
+
+movq %rbx,0(%rdi)
+movq %r12,8(%rdi)
+movq %rax,16(%rdi)
+movq %rsi,24(%rdi)
+movq %r15,32(%rdi)
+
+pop %r15
+pop %r14
+pop %r13
+pop %r12
+pop %rbx
+pop %rbp
+
+ret
+
+/* 64-bit field multiplication in which the first argument has 4-limb
+ * and the second argument has 5-limb representations such that the
+ * fifth limb is of at most 64 bits. The second argument is fully
+ * reduced to 4-limb form and then field multiplication is performed.
+ * A field element in 5-limb form is reported as output such that the
+ * fifth limb is of at most 33 bits.
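+ *
+ * Concretely, writing b = b4*2^256 + b3*2^192 + b2*2^128 + b1*2^64 + b0,
+ * the prologue adds b4*0x1000003D1 into b0/b1 (letting the carry ripple
+ * through b2 and b3) and, if that addition overflows b3, adds 0x1000003D1
+ * once more, leaving a 4-limb value congruent to b modulo p.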
+ */
+
+.p2align 4
+.global secp256k1_fe_mul_45to5
+secp256k1_fe_mul_45to5:
+movq %rsp,%r11
+subq $72,%rsp
+
+movq %r11,0(%rsp)
+movq %r12,8(%rsp)
+movq %r13,16(%rsp)
+movq %r14,24(%rsp)
+movq %r15,32(%rsp)
+movq %rbp,40(%rsp)
+movq %rbx,48(%rsp)
+movq %rdi,56(%rsp)
+
+movq 0(%rdx),%rax
+movq 8(%rdx),%rbx
+movq 16(%rdx),%r8
+movq 24(%rdx),%r9
+
+movq $0x1000003D1,%r15
+xorq %rcx,%rcx
+movq 32(%rdx),%rdx
+mulx %r15,%r13,%r14
+adcx %r13,%rax
+adcx %r14,%rbx
+adcx %rcx,%r8
+adcx %rcx,%r9
+cmovc %r15,%rcx
+addq %rcx,%rax
+adcq $0,%rbx
+
+movq %r8,56(%rsp)
+movq %r9,64(%rsp)
+
+xorq %r13,%r13
+movq 0(%rsi),%rdx
+mulx %rax,%r8,%r9
+mulx %rbx,%rcx,%r10
+adcx %rcx,%r9
+mulx 56(%rsp),%rcx,%r11
+adcx %rcx,%r10
+mulx 64(%rsp),%rcx,%r12
+adcx %rcx,%r11
+adcx %r13,%r12
+
+xorq %r14,%r14
+movq 8(%rsi),%rdx
+mulx %rax,%rcx,%rbp
+adcx %rcx,%r9
+adox %rbp,%r10
+mulx %rbx,%rcx,%rbp
+adcx %rcx,%r10
+adox %rbp,%r11
+mulx 56(%rsp),%rcx,%rbp
+adcx %rcx,%r11
+adox %rbp,%r12
+mulx 64(%rsp),%rcx,%rbp
+adcx %rcx,%r12
+adox %rbp,%r13
+adcx %r14,%r13
+
+xorq %r15,%r15
+movq 16(%rsi),%rdx
+mulx %rax,%rcx,%rbp
+adcx %rcx,%r10
+adox %rbp,%r11
+mulx %rbx,%rcx,%rbp
+adcx %rcx,%r11
+adox %rbp,%r12
+mulx 56(%rsp),%rcx,%rbp
+adcx %rcx,%r12
+adox %rbp,%r13
+mulx 64(%rsp),%rcx,%rbp
+adcx %rcx,%r13
+adox %rbp,%r14
+adcx %r15,%r14
+
+xorq %rdx,%rdx
+movq 24(%rsi),%rdx
+mulx %rax,%rcx,%rbp
+adcx %rcx,%r11
+adox %rbp,%r12
+mulx %rbx,%rcx,%rbp
+adcx %rcx,%r12
+adox %rbp,%r13
+mulx 56(%rsp),%rcx,%rbp
+adcx %rcx,%r13
+adox %rbp,%r14
+mulx 64(%rsp),%rcx,%rbp
+adcx %rcx,%r14
+adox %rbp,%r15
+adcq $0,%r15
+
+xorq %rbp,%rbp
+movq $0x1000003D1,%rdx
+mulx %r12,%rax,%r12
+adcx %rax,%r8
+adox %r12,%r9
+mulx %r13,%rcx,%r13
+adcx %rcx,%r9
+adox %r13,%r10
+mulx %r14,%rcx,%r14
+adcx %rcx,%r10
+adox %r14,%r11
+mulx %r15,%rcx,%r15
+adcx %rcx,%r11
+adox %rbp,%r15
+adcx %rbp,%r15
+
+movq %r8,0(%rdi)
+movq %r9,8(%rdi)
+movq %r10,16(%rdi)
+movq %r11,24(%rdi)
+movq %r15,32(%rdi)
+
+movq 0(%rsp),%r11
+movq 8(%rsp),%r12
+movq 16(%rsp),%r13
+movq 24(%rsp),%r14
+movq 32(%rsp),%r15
+movq 40(%rsp),%rbp
+movq 48(%rsp),%rbx
+
+movq %r11,%rsp
+
+ret
diff --git a/src/asm/field_5x64_x86_64_mxaa.s b/src/asm/field_5x64_x86_64_mxaa.s
index dec40d76ed..f9b99e3848 100644
--- a/src/asm/field_5x64_x86_64_mxaa.s
+++ b/src/asm/field_5x64_x86_64_mxaa.s
@@ -1,23 +1,28 @@
-/***********************************************************************
- * Copyright (c) 2021 Kaushik Nath *
- * Distributed under the MIT software license, see the accompanying *
- * file COPYING or https://www.opensource.org/licenses/mit-license.php.*
+/************************************************************************
+ * Field multiplication and squaring assemblies using representation of *
+ * field elements in base 2^{64}. *
+ * Major instructions used in the assemblies are mulx/add/adc. *
+ * *
+ * Copyright (c) 2021 Kaushik Nath *
+ * Distributed under the MIT software license, see the accompanying *
+ * file COPYING or https://www.opensource.org/licenses/mit-license.php. *
 ***********************************************************************/
-/* 4-limb field multiplication and squaring using the bottom 4-limbs of
- * a 5-limb representation. First reduce the 5-limb inputs to fully
- * reduced 4-limb forms, then multiply and finally output a half reduced
- * output in 5-limb form. The leading limb is of atmost 33 bits.
- *
- * Major instructions used in the assemblies: mulx/add/adc.
+/* + * 64-bit field multiplication and squaring using the bottom 4-limbs of + * two field elements having 5-limb representation such that the fifth + * limb is of at most 64 bits. The 5-limb inputs are fully reduced first + * to 4-limb forms, then multiplied, after which a field element in 5-limb + * form is reported as output. The fifth limb of the output has at most + * 33 bits. */ .att_syntax .text .p2align 4 -.global secp256k1_fe_mul_inner -secp256k1_fe_mul_inner: +.global secp256k1_fe_mul_55to5 +secp256k1_fe_mul_55to5: movq %rsp,%r11 subq $112,%rsp @@ -160,8 +165,8 @@ movq %r11,%rsp ret .p2align 4 -.global secp256k1_fe_sqr_inner -secp256k1_fe_sqr_inner: +.global secp256k1_fe_sqr_5to5 +secp256k1_fe_sqr_5to5: movq %rsp,%r11 subq $64,%rsp @@ -219,7 +224,7 @@ shld $1,%r12,%r13 shld $1,%r11,%r12 shld $1,%r10,%r11 shld $1,%r9,%r10 -shlq $1,%r9 +addq %r9,%r9 movq %rbp,%rdx mulx %rdx,%r8,%rax @@ -278,3 +283,345 @@ movq 48(%rsp),%rbx movq %r11,%rsp ret + +/* + * 64-bit field multiplication and squaring using the bottom 4-limbs of + * two field elements having 5-limb representation such that the fifth + * limb is zero. A field element in 5-limb form is reported as output + * such that the fifth limb is of at most 33 bits. + */ + +.p2align 4 +.global secp256k1_fe_mul_44to5 +secp256k1_fe_mul_44to5: +movq %rsp,%r11 +subq $64,%rsp + +movq %r11,0(%rsp) +movq %r12,8(%rsp) +movq %r13,16(%rsp) +movq %r14,24(%rsp) +movq %r15,32(%rsp) +movq %rbp,40(%rsp) +movq %rbx,48(%rsp) +movq %rdi,56(%rsp) + +movq %rdx,%rdi + +movq 0(%rdi),%rdx +mulx 0(%rsi),%r8,%r9 +mulx 8(%rsi),%rcx,%r10 +addq %rcx,%r9 +mulx 16(%rsi),%rcx,%r11 +adcq %rcx,%r10 +mulx 24(%rsi),%rcx,%r12 +adcq %rcx,%r11 +adcq $0,%r12 + +movq 8(%rdi),%rdx +mulx 0(%rsi),%rax,%rbx +mulx 8(%rsi),%rcx,%rbp +addq %rcx,%rbx +mulx 16(%rsi),%rcx,%r15 +adcq %rcx,%rbp +mulx 24(%rsi),%rcx,%r13 +adcq %rcx,%r15 +adcq $0,%r13 +addq %rax,%r9 +adcq %rbx,%r10 +adcq %rbp,%r11 +adcq %r15,%r12 +adcq $0,%r13 + +movq 16(%rdi),%rdx +mulx 0(%rsi),%rax,%rbx +mulx 8(%rsi),%rcx,%rbp +addq %rcx,%rbx +mulx 16(%rsi),%rcx,%r15 +adcq %rcx,%rbp +mulx 24(%rsi),%rcx,%r14 +adcq %rcx,%r15 +adcq $0,%r14 +addq %rax,%r10 +adcq %rbx,%r11 +adcq %rbp,%r12 +adcq %r15,%r13 +adcq $0,%r14 + +movq 24(%rdi),%rdx +mulx 0(%rsi),%rax,%rbx +mulx 8(%rsi),%rcx,%rbp +addq %rcx,%rbx +mulx 16(%rsi),%rcx,%r15 +adcq %rcx,%rbp +mulx 24(%rsi),%rcx,%rsi +adcq %rcx,%r15 +adcq $0,%rsi +addq %rax,%r11 +adcq %rbx,%r12 +adcq %rbp,%r13 +adcq %r15,%r14 +adcq $0,%rsi + +movq $0x1000003D1,%rdx +mulx %r12,%r12,%rbx +mulx %r13,%r13,%rcx +addq %rbx,%r13 +mulx %r14,%r14,%rbx +adcq %rcx,%r14 +mulx %rsi,%r15,%rcx +adcq %rbx,%r15 +adcq $0,%rcx +addq %r12,%r8 +adcq %r13,%r9 +adcq %r14,%r10 +adcq %r15,%r11 +adcq $0,%rcx + +movq 56(%rsp),%rdi +movq %r8,0(%rdi) +movq %r9,8(%rdi) +movq %r10,16(%rdi) +movq %r11,24(%rdi) +movq %rcx,32(%rdi) + +movq 0(%rsp),%r11 +movq 8(%rsp),%r12 +movq 16(%rsp),%r13 +movq 24(%rsp),%r14 +movq 32(%rsp),%r15 +movq 40(%rsp),%rbp +movq 48(%rsp),%rbx + +movq %r11,%rsp + +ret + +.p2align 4 +.global secp256k1_fe_sqr_4to5 +secp256k1_fe_sqr_4to5: +movq %rsp,%r11 +subq $56,%rsp + +movq %r11,0(%rsp) +movq %r12,8(%rsp) +movq %r13,16(%rsp) +movq %r14,24(%rsp) +movq %r15,32(%rsp) +movq %rbp,40(%rsp) +movq %rbx,48(%rsp) + +movq 0(%rsi),%rdx +mulx 8(%rsi),%r9,%r10 +mulx 16(%rsi),%rcx,%r11 +addq %rcx,%r10 +mulx 24(%rsi),%rcx,%r12 +adcq %rcx,%r11 +adcq $0,%r12 + +movq 8(%rsi),%rdx +mulx 16(%rsi),%rax,%rbx +mulx 24(%rsi),%rcx,%r13 +addq %rcx,%rbx +adcq $0,%r13 +addq %rax,%r11 +adcq %rbx,%r12 +adcq $0,%r13 + +movq 
16(%rsi),%rdx
+mulx 24(%rsi),%rax,%r14
+addq %rax,%r13
+adcq $0,%r14
+
+movq $0,%r15
+shld $1,%r14,%r15
+shld $1,%r13,%r14
+shld $1,%r12,%r13
+shld $1,%r11,%r12
+shld $1,%r10,%r11
+shld $1,%r9,%r10
+addq %r9,%r9
+
+movq 0(%rsi),%rdx
+mulx %rdx,%r8,%rax
+addq %rax,%r9
+
+movq 8(%rsi),%rdx
+mulx %rdx,%rax,%rbx
+adcq %rax,%r10
+adcq %rbx,%r11
+
+movq 16(%rsi),%rdx
+mulx %rdx,%rax,%rbx
+adcq %rax,%r12
+adcq %rbx,%r13
+
+movq 24(%rsi),%rdx
+mulx %rdx,%rax,%rbx
+adcq %rax,%r14
+adcq %rbx,%r15
+
+movq $0x1000003D1,%rdx
+mulx %r12,%r12,%rbx
+mulx %r13,%r13,%rcx
+addq %rbx,%r13
+mulx %r14,%r14,%rbx
+adcq %rcx,%r14
+mulx %r15,%r15,%rcx
+adcq %rbx,%r15
+adcq $0,%rcx
+addq %r12,%r8
+adcq %r13,%r9
+adcq %r14,%r10
+adcq %r15,%r11
+adcq $0,%rcx
+
+movq %r8,0(%rdi)
+movq %r9,8(%rdi)
+movq %r10,16(%rdi)
+movq %r11,24(%rdi)
+movq %rcx,32(%rdi)
+
+movq 0(%rsp),%r11
+movq 8(%rsp),%r12
+movq 16(%rsp),%r13
+movq 24(%rsp),%r14
+movq 32(%rsp),%r15
+movq 40(%rsp),%rbp
+movq 48(%rsp),%rbx
+
+movq %r11,%rsp
+
+ret
+
+/* 64-bit field multiplication in which the first argument has 4-limb
+ * and the second argument has 5-limb representations such that the
+ * fifth limb is of at most 64 bits. The second argument is fully
+ * reduced to 4-limb form and then field multiplication is performed.
+ * A field element in 5-limb form is reported as output such that the
+ * fifth limb is of at most 33 bits.
+ */
+
+.p2align 4
+.global secp256k1_fe_mul_45to5
+secp256k1_fe_mul_45to5:
+movq %rsp,%r11
+subq $88,%rsp
+
+movq %r11,0(%rsp)
+movq %r12,8(%rsp)
+movq %r13,16(%rsp)
+movq %r14,24(%rsp)
+movq %r15,32(%rsp)
+movq %rbp,40(%rsp)
+movq %rbx,48(%rsp)
+
+movq 0(%rdx),%r12
+movq 8(%rdx),%r13
+movq 16(%rdx),%r14
+movq 24(%rdx),%r15
+movq 32(%rdx),%rax
+
+movq $0x1000003D1,%rdx
+xorq %rcx,%rcx
+mulx %rax,%rax,%rbx
+addq %rax,%r12
+adcq %rbx,%r13
+adcq $0,%r14
+adcq $0,%r15
+cmovc %rdx,%rcx
+addq %rcx,%r12
+adcq $0,%r13
+
+movq %r12,56(%rsp)
+movq %r13,64(%rsp)
+movq %r14,72(%rsp)
+movq %r15,80(%rsp)
+
+movq 0(%rsi),%rdx
+mulx 56(%rsp),%r8,%r9
+mulx 64(%rsp),%rcx,%r10
+addq %rcx,%r9
+mulx 72(%rsp),%rcx,%r11
+adcq %rcx,%r10
+mulx 80(%rsp),%rcx,%r12
+adcq %rcx,%r11
+adcq $0,%r12
+
+movq 8(%rsi),%rdx
+mulx 56(%rsp),%rax,%rbx
+mulx 64(%rsp),%rcx,%rbp
+addq %rcx,%rbx
+mulx 72(%rsp),%rcx,%r15
+adcq %rcx,%rbp
+mulx 80(%rsp),%rcx,%r13
+adcq %rcx,%r15
+adcq $0,%r13
+addq %rax,%r9
+adcq %rbx,%r10
+adcq %rbp,%r11
+adcq %r15,%r12
+adcq $0,%r13
+
+movq 16(%rsi),%rdx
+mulx 56(%rsp),%rax,%rbx
+mulx 64(%rsp),%rcx,%rbp
+addq %rcx,%rbx
+mulx 72(%rsp),%rcx,%r15
+adcq %rcx,%rbp
+mulx 80(%rsp),%rcx,%r14
+adcq %rcx,%r15
+adcq $0,%r14
+addq %rax,%r10
+adcq %rbx,%r11
+adcq %rbp,%r12
+adcq %r15,%r13
+adcq $0,%r14
+
+movq 24(%rsi),%rdx
+mulx 56(%rsp),%rax,%rbx
+mulx 64(%rsp),%rcx,%rbp
+addq %rcx,%rbx
+mulx 72(%rsp),%rcx,%r15
+adcq %rcx,%rbp
+mulx 80(%rsp),%rcx,%rsi
+adcq %rcx,%r15
+adcq $0,%rsi
+addq %rax,%r11
+adcq %rbx,%r12
+adcq %rbp,%r13
+adcq %r15,%r14
+adcq $0,%rsi
+
+movq $0x1000003D1,%rdx
+mulx %r12,%r12,%rbx
+mulx %r13,%r13,%rcx
+addq %rbx,%r13
+mulx %r14,%r14,%rbx
+adcq %rcx,%r14
+mulx %rsi,%r15,%rcx
+adcq %rbx,%r15
+adcq $0,%rcx
+addq %r12,%r8
+adcq %r13,%r9
+adcq %r14,%r10
+adcq %r15,%r11
+adcq $0,%rcx
+
+movq %r8,0(%rdi)
+movq %r9,8(%rdi)
+movq %r10,16(%rdi)
+movq %r11,24(%rdi)
+movq %rcx,32(%rdi)
+
+movq 0(%rsp),%r11
+movq 8(%rsp),%r12
+movq 16(%rsp),%r13
+movq 24(%rsp),%r14
+movq 32(%rsp),%r15
+movq 40(%rsp),%rbp
+movq 48(%rsp),%rbx
+
+movq %r11,%rsp
+
+ret
diff --git a/src/field_5x64_impl.h b/src/field_5x64_impl.h
index 1c587b0332..0325d65e60 
100644 --- a/src/field_5x64_impl.h +++ b/src/field_5x64_impl.h @@ -17,8 +17,10 @@ #if defined(USE_EXTERNAL_ASM) /* External assembler implementation */ -void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b); -void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t *a); +void secp256k1_fe_mul_55to5(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b); +void secp256k1_fe_mul_45to5(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b); +void secp256k1_fe_sqr_5to5(uint64_t *r, const uint64_t *a); +void secp256k1_fe_sqr_4to5(uint64_t *r, const uint64_t *a); #endif #ifdef VERIFY @@ -733,7 +735,7 @@ static void secp256k1_fe_mul(secp256k1_fe *r, const secp256k1_fe *a, const secp2 #endif #if defined(USE_EXTERNAL_ASM) - secp256k1_fe_mul_inner(r->n, a->n, b->n); + secp256k1_fe_mul_55to5(r->n, a->n, b->n); #else mul2(c0,c1,a4,0x1000003D1ULL); a4 = 0; @@ -803,10 +805,12 @@ static void secp256k1_fe_mul(secp256k1_fe *r, const secp256k1_fe *a, const secp2 } static void secp256k1_fe_mul_prec(secp256k1_fe *r, const secp256k1_fe *a, const secp256k1_fe * SECP256K1_RESTRICT b_prec) { +#ifndef USE_EXTERNAL_ASM uint64_t a0 = a->n[0], a1 = a->n[1], a2 = a->n[2], a3 = a->n[3], a4 = a->n[4]; uint64_t b0 = b_prec->n[0], b1 = b_prec->n[1], b2 = b_prec->n[2], b3 = b_prec->n[3]; uint64_t c0 = 0, c1 = 0, c2 = 0, c3 = 0, c4 = 0, c5 = 0, c6 = 0, c7 = 0; uint64_t d0 = 0, d1 = 0, d2 = 0, d3 = 0, d4 = 0; +#endif #ifdef VERIFY VERIFY_CHECK(a->magnitude <= 2048); @@ -818,6 +822,9 @@ static void secp256k1_fe_mul_prec(secp256k1_fe *r, const secp256k1_fe *a, const VERIFY_CHECK(a != b_prec); #endif +#if defined(USE_EXTERNAL_ASM) + secp256k1_fe_mul_45to5(r->n, b_prec->n, a->n); +#else mul2(c0,c1,a4,0x1000003D1ULL); a4 = 0; add2(c0,c1,a0); @@ -862,6 +869,7 @@ static void secp256k1_fe_mul_prec(secp256k1_fe *r, const secp256k1_fe *a, const add2(d3,d4,c3); r->n[3] = d3; r->n[4] = d4; +#endif #ifdef VERIFY r->magnitude = 1; @@ -884,7 +892,7 @@ static void secp256k1_fe_sqr(secp256k1_fe *r, const secp256k1_fe *a) { #endif #if defined(USE_EXTERNAL_ASM) - secp256k1_fe_sqr_inner(r->n, a->n); + secp256k1_fe_sqr_5to5(r->n, a->n); #else /* Bring a to [0,2**256). */ mul2(c0,c1,a4,0x1000003D1ULL); @@ -935,9 +943,11 @@ static void secp256k1_fe_sqr(secp256k1_fe *r, const secp256k1_fe *a) { } static void secp256k1_fe_sqr_prec(secp256k1_fe *r, const secp256k1_fe *a_prec) { +#ifndef USE_EXTERNAL_ASM uint64_t a0 = a_prec->n[0], a1 = a_prec->n[1], a2 = a_prec->n[2], a3 = a_prec->n[3]; uint64_t c0 = 0, c1 = 0, c2 = 0, c3 = 0, c4 = 0, c5 = 0, c6 = 0, c7 = 0; uint64_t d0 = 0, d1 = 0, d2 = 0, d3 = 0, d4 = 0; +#endif #ifdef VERIFY VERIFY_CHECK(a_prec->precomputed); @@ -945,6 +955,9 @@ static void secp256k1_fe_sqr_prec(secp256k1_fe *r, const secp256k1_fe *a_prec) { secp256k1_fe_verify(a_prec); #endif +#if defined(USE_EXTERNAL_ASM) + secp256k1_fe_sqr_4to5(r->n, a_prec->n); +#else /* Compute 512-bit product. */ c0 = 0; c1 = 0; @@ -973,6 +986,7 @@ static void secp256k1_fe_sqr_prec(secp256k1_fe *r, const secp256k1_fe *a_prec) { add2(d3,d4,c3); r->n[3] = d3; r->n[4] = d4; +#endif #ifdef VERIFY r->magnitude = 1;
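For review purposes, the net effect of the 44to5 routines above can be modelled in a few lines of C. The sketch below is not part of the patch: the helper names (mul_4x4, fe_mul_44to5_model) are made up for illustration, and it assumes a compiler that provides unsigned __int128. It forms the full 512-bit product and then folds the upper 256 bits back in using 2^256 == 0x1000003D1 (mod p); the assembly interleaves that fold with the multiplication, but the resulting 5-limb value, with its fifth limb below 2^33, is the same. secp256k1_fe_sqr_4to5 is the same computation with b = a, and secp256k1_fe_mul_45to5 first folds the fifth limb of its 5-limb operand (b4*2^256 == b4*0x1000003D1 mod p) down to 4 limbs before performing the same multiplication.

#include <stdint.h>

typedef unsigned __int128 uint128_t;   /* GCC/Clang extension on 64-bit targets */

/* Hypothetical helper: schoolbook 4x4-limb multiply, t = a*b as a 512-bit value. */
static void mul_4x4(uint64_t t[8], const uint64_t a[4], const uint64_t b[4]) {
    int i, j;
    for (i = 0; i < 8; i++) t[i] = 0;
    for (i = 0; i < 4; i++) {
        uint64_t carry = 0;
        for (j = 0; j < 4; j++) {
            uint128_t m = (uint128_t)a[i] * b[j] + t[i + j] + carry;
            t[i + j] = (uint64_t)m;
            carry = (uint64_t)(m >> 64);
        }
        t[i + 4] = carry;
    }
}

/* Hypothetical model of secp256k1_fe_mul_44to5: r is a 5-limb value congruent
 * to a*b modulo p = 2^256 - 0x1000003D1, with r[4] < 2^33. */
static void fe_mul_44to5_model(uint64_t r[5], const uint64_t a[4], const uint64_t b[4]) {
    const uint64_t C = 0x1000003D1ULL;  /* 2^256 mod p = 2^32 + 977 */
    uint64_t t[8];
    uint64_t carry = 0;
    uint128_t acc;
    int i;

    mul_4x4(t, a, b);

    /* hi*2^256 + lo == hi*C + lo (mod p): add t[4..7]*C into t[0..3]. */
    for (i = 0; i < 4; i++) {
        acc = (uint128_t)t[4 + i] * C + t[i] + carry;
        r[i] = (uint64_t)acc;
        carry = (uint64_t)(acc >> 64);
    }
    r[4] = carry;  /* high word of a 64x33-bit product plus carries: below 2^33 */
}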