diff --git a/src/asm/field_5x64_x86_64_gen.s b/src/asm/field_5x64_x86_64_gen.s index 596e7e9596..ee29792512 100644 --- a/src/asm/field_5x64_x86_64_gen.s +++ b/src/asm/field_5x64_x86_64_gen.s @@ -1,23 +1,28 @@ -/*********************************************************************** - * Copyright (c) 2021 Kaushik Nath * - * Distributed under the MIT software license, see the accompanying * - * file COPYING or https://www.opensource.org/licenses/mit-license.php.* +/************************************************************************ + * Field multiplication and squaring assemblies using representation of * + * field elements in base 2^{64}. * + * Major instructions used in the assemblies are mul/add/adc. * + * * + * Copyright (c) 2021 Kaushik Nath * + * Distributed under the MIT software license, see the accompanying * + * file COPYING or https://www.opensource.org/licenses/mit-license.php. * ***********************************************************************/ -/* 4-limb field multiplication and squaring using the bottom 4-limbs of - * a 5-limb representation. First reduce the 5-limb inputs to fully - * reduced 4-limb forms, then multiply and finally output a half reduced - * output in 5-limb form. The leading limb is of atmost 33 bits. - * - * Major instructions used in the assemblies: mul/add/adc. +/* + * 64-bit field multiplication and squaring using the bottom 4-limbs of + * two field elements having 5-limb representation such that the fifth + * limb is of at most 64 bits. The 5-limb inputs are fully reduced first + * to 4-limb forms, then multiplied, after which a field element in 5-limb + * form is reported as output. The fifth limb of the output has at most + * 33 bits. */ .att_syntax .text .p2align 4 -.global secp256k1_fe_mul_inner -secp256k1_fe_mul_inner: +.global secp256k1_fe_mul_55to5 +secp256k1_fe_mul_55to5: movq %rsp,%r11 subq $96,%rsp @@ -228,8 +233,8 @@ movq %r11,%rsp ret .p2align 4 -.global secp256k1_fe_sqr_inner -secp256k1_fe_sqr_inner: +.global secp256k1_fe_sqr_5to5 +secp256k1_fe_sqr_5to5: movq %rsp,%r11 subq $64,%rsp @@ -401,3 +406,555 @@ movq 48(%rsp),%rbp movq %r11,%rsp ret + +/* + * 64-bit field multiplication and squaring using the bottom 4-limbs of + * two field elements having 5-limb representation such that the fifth + * limb is zero. A field element in 5-limb form is reported as output + * such that the fifth limb is of at most 33 bits. 
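+ *
+ * For reference: the folding used throughout relies on the identity
+ * 2^256 == 0x1000003D1 (mod p) for the secp256k1 prime
+ * p = 2^256 - 2^32 - 977, so any bits at or above 2^256 can be
+ * multiplied by 0x1000003D1 and added back into the bottom limbs.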
+ */ + +.p2align 4 +.global secp256k1_fe_mul_44to5 +secp256k1_fe_mul_44to5: +movq %rsp,%r11 +subq $48,%rsp + +movq %r11,0(%rsp) +movq %r12,8(%rsp) +movq %r13,16(%rsp) +movq %r14,24(%rsp) +movq %r15,32(%rsp) +movq %rbx,40(%rsp) + +movq %rdx,%rcx +movq $0x1000003D1,%rbx + +movq 8(%rsi),%rax +mulq 24(%rcx) +movq %rax,%r8 +xorq %r9,%r9 +movq %rdx,%r10 +xorq %r11,%r11 + +movq 16(%rsi),%rax +mulq 16(%rcx) +addq %rax,%r8 +adcq $0,%r9 +addq %rdx,%r10 +adcq $0,%r11 + +movq 24(%rsi),%rax +mulq 8(%rcx) +addq %rax,%r8 +adcq $0,%r9 +addq %rdx,%r10 +adcq $0,%r11 + +movq 16(%rsi),%rax +mulq 24(%rcx) +addq %rax,%r10 +adcq $0,%r11 +movq %rdx,%r12 +xorq %r13,%r13 + +movq 24(%rsi),%rax +mulq 16(%rcx) +addq %rax,%r10 +adcq $0,%r11 +addq %rdx,%r12 +adcq $0,%r13 + +movq %rbx,%rax +mulq %r10 +imul %rbx,%r11 +movq %rax,%r10 +addq %rdx,%r11 + +movq 24(%rsi),%rax +mulq 24(%rcx) +addq %rax,%r12 +adcq $0,%r13 + +movq %rbx,%rax +mulq %rdx +movq %rax,%r14 +movq %rdx,%r15 + +movq %rbx,%rax +mulq %r12 +imul %rbx,%r13 +movq %rax,%r12 +addq %rdx,%r13 + +movq 0(%rsi),%rax +mulq 24(%rcx) +addq %rax,%r14 +adcq $0,%r15 +addq %rdx,%r8 +adcq $0,%r9 + +movq 8(%rsi),%rax +mulq 16(%rcx) +addq %rax,%r14 +adcq $0,%r15 +addq %rdx,%r8 +adcq $0,%r9 + +movq 16(%rsi),%rax +mulq 8(%rcx) +addq %rax,%r14 +adcq $0,%r15 +addq %rdx,%r8 +adcq $0,%r9 + +movq 24(%rsi),%rax +mulq 0(%rcx) +addq %rax,%r14 +adcq $0,%r15 +addq %rdx,%r8 +adcq $0,%r9 + +movq %rbx,%rax +mulq %r8 +imul %rbx,%r9 +movq %rax,%r8 +addq %rdx,%r9 + +movq 0(%rsi),%rax +mulq 0(%rcx) +addq %rax,%r8 +adcq $0,%r9 +addq %rdx,%r10 +adcq $0,%r11 + +movq 0(%rsi),%rax +mulq 8(%rcx) +addq %rax,%r10 +adcq $0,%r11 +addq %rdx,%r12 +adcq $0,%r13 + +movq 8(%rsi),%rax +mulq 0(%rcx) +addq %rax,%r10 +adcq $0,%r11 +addq %rdx,%r12 +adcq $0,%r13 + +movq 0(%rsi),%rax +mulq 16(%rcx) +addq %rax,%r12 +adcq $0,%r13 +addq %rdx,%r14 +adcq $0,%r15 + +movq 8(%rsi),%rax +mulq 8(%rcx) +addq %rax,%r12 +adcq $0,%r13 +addq %rdx,%r14 +adcq $0,%r15 + +movq 16(%rsi),%rax +mulq 0(%rcx) +addq %rax,%r12 +adcq $0,%r13 +addq %rdx,%r14 +adcq $0,%r15 + +addq %r9,%r10 +adcq $0,%r11 + +addq %r11,%r12 +adcq $0,%r13 + +addq %r13,%r14 +adcq $0,%r15 + +movq %r8,0(%rdi) +movq %r10,8(%rdi) +movq %r12,16(%rdi) +movq %r14,24(%rdi) +movq %r15,32(%rdi) + +movq 0(%rsp),%r11 +movq 8(%rsp),%r12 +movq 16(%rsp),%r13 +movq 24(%rsp),%r14 +movq 32(%rsp),%r15 +movq 40(%rsp),%rbx + +movq %r11,%rsp + +ret + +.p2align 4 +.global secp256k1_fe_sqr_4to5 +secp256k1_fe_sqr_4to5: +movq %rsp,%r11 +subq $64,%rsp + +movq %r11,0(%rsp) +movq %r12,8(%rsp) +movq %r13,16(%rsp) +movq %r14,24(%rsp) +movq %r15,32(%rsp) +movq %rbx,40(%rsp) +movq %rbp,48(%rsp) +movq %rdi,56(%rsp) + +movq 0(%rsi),%rbx +movq 8(%rsi),%rbp +movq 16(%rsi),%rcx +movq 24(%rsi),%rdi + +movq $0x1000003D1,%rsi + +movq %rbp,%rax +mulq %rdi +movq %rax,%r8 +xorq %r9,%r9 +movq %rdx,%r10 +xorq %r11,%r11 +addq %rax,%r8 +adcq $0,%r9 +addq %rdx,%r10 +adcq $0,%r11 + +movq %rcx,%rax +mulq %rcx +addq %rax,%r8 +adcq $0,%r9 +addq %rdx,%r10 +adcq $0,%r11 + +movq %rcx,%rax +mulq %rdi +addq %rax,%r10 +adcq $0,%r11 +movq %rdx,%r12 +xorq %r13,%r13 +addq %rax,%r10 +adcq $0,%r11 +addq %rdx,%r12 +adcq $0,%r13 + +movq %rsi,%rax +mulq %r10 +imul %rsi,%r11 +movq %rax,%r10 +addq %rdx,%r11 + +movq %rdi,%rax +mulq %rdi +addq %rax,%r12 +adcq $0,%r13 + +movq %rsi,%rax +mulq %rdx +movq %rax,%r14 +movq %rdx,%r15 + +movq %rsi,%rax +mulq %r12 +imul %rsi,%r13 +movq %rax,%r12 +addq %rdx,%r13 + +movq %rbx,%rax +mulq %rdi +addq %rax,%r14 +adcq $0,%r15 +addq %rdx,%r8 +adcq $0,%r9 +addq %rax,%r14 +adcq $0,%r15 +addq %rdx,%r8 +adcq 
$0,%r9
+
+movq %rbp,%rax
+mulq %rcx
+addq %rax,%r14
+adcq $0,%r15
+addq %rdx,%r8
+adcq $0,%r9
+addq %rax,%r14
+adcq $0,%r15
+addq %rdx,%r8
+adcq $0,%r9
+
+movq %rsi,%rax
+mulq %r8
+imul %rsi,%r9
+movq %rax,%r8
+addq %rdx,%r9
+
+movq %rbx,%rax
+mulq %rbx
+addq %rax,%r8
+adcq $0,%r9
+addq %rdx,%r10
+adcq $0,%r11
+
+movq %rbx,%rax
+mulq %rbp
+addq %rax,%r10
+adcq $0,%r11
+addq %rdx,%r12
+adcq $0,%r13
+addq %rax,%r10
+adcq $0,%r11
+addq %rdx,%r12
+adcq $0,%r13
+
+movq %rbx,%rax
+mulq %rcx
+addq %rax,%r12
+adcq $0,%r13
+addq %rdx,%r14
+adcq $0,%r15
+addq %rax,%r12
+adcq $0,%r13
+addq %rdx,%r14
+adcq $0,%r15
+
+movq %rbp,%rax
+mulq %rbp
+addq %rax,%r12
+adcq $0,%r13
+addq %rdx,%r14
+adcq $0,%r15
+
+movq %r10,%rbp
+addq %r9,%rbp
+adcq $0,%r11
+
+movq %r12,%rcx
+addq %r11,%rcx
+adcq $0,%r13
+
+addq %r13,%r14
+adcq $0,%r15
+
+movq 56(%rsp),%rdi
+
+movq %r8,0(%rdi)
+movq %rbp,8(%rdi)
+movq %rcx,16(%rdi)
+movq %r14,24(%rdi)
+movq %r15,32(%rdi)
+
+movq 0(%rsp),%r11
+movq 8(%rsp),%r12
+movq 16(%rsp),%r13
+movq 24(%rsp),%r14
+movq 32(%rsp),%r15
+movq 40(%rsp),%rbx
+movq 48(%rsp),%rbp
+
+movq %r11,%rsp
+
+ret
+
+/* 64-bit field multiplication in which the first argument has 4-limb
+ * and the second argument has 5-limb representations such that the
+ * fifth limb is of at most 64 bits. The second argument is fully
+ * reduced to 4-limb form and then field multiplication is performed.
+ * A field element in 5-limb form is reported as output such that the
+ * fifth limb is of at most 33 bits.
+ */
+
+.p2align 4
+.global secp256k1_fe_mul_45to5
+secp256k1_fe_mul_45to5:
+movq %rsp,%r11
+subq $72,%rsp
+
+movq %r11,0(%rsp)
+movq %r12,8(%rsp)
+movq %r13,16(%rsp)
+movq %r14,24(%rsp)
+movq %r15,32(%rsp)
+movq %rbx,40(%rsp)
+movq %rbp,48(%rsp)
+movq %rdi,56(%rsp)
+
+movq $0x1000003d1,%rcx
+
+movq 0(%rdx),%r8
+movq 8(%rdx),%r9
+movq 16(%rdx),%rbx
+movq 24(%rdx),%rbp
+movq 32(%rdx),%rax
+
+mulq %rcx
+xorq %rdi,%rdi
+addq %r8,%rax
+adcq %r9,%rdx
+adcq $0,%rbx
+adcq $0,%rbp
+cmovc %rcx,%rdi
+addq %rax,%rdi
+adcq $0,%rdx
+movq %rdx,64(%rsp)
+
+movq 8(%rsi),%rax
+mulq %rbp
+movq %rax,%r8
+xorq %r9,%r9
+movq %rdx,%r10
+xorq %r11,%r11
+
+movq 16(%rsi),%rax
+mulq %rbx
+addq %rax,%r8
+adcq $0,%r9
+addq %rdx,%r10
+adcq $0,%r11
+
+movq 24(%rsi),%rax
+mulq 64(%rsp)
+addq %rax,%r8
+adcq $0,%r9
+addq %rdx,%r10
+adcq $0,%r11
+
+movq 16(%rsi),%rax
+mulq %rbp
+addq %rax,%r10
+adcq $0,%r11
+movq %rdx,%r12
+xorq %r13,%r13
+
+movq 24(%rsi),%rax
+mulq %rbx
+addq %rax,%r10
+adcq $0,%r11
+addq %rdx,%r12
+adcq $0,%r13
+
+movq %rcx,%rax
+mulq %r10
+imul %rcx,%r11
+movq %rax,%r10
+addq %rdx,%r11
+
+movq 24(%rsi),%rax
+mulq %rbp
+addq %rax,%r12
+adcq $0,%r13
+
+movq %rcx,%rax
+mulq %rdx
+movq %rax,%r14
+movq %rdx,%r15
+
+movq %rcx,%rax
+mulq %r12
+imul %rcx,%r13
+movq %rax,%r12
+addq %rdx,%r13
+
+movq 0(%rsi),%rax
+mulq %rbp
+addq %rax,%r14
+adcq $0,%r15
+addq %rdx,%r8
+adcq $0,%r9
+
+movq 8(%rsi),%rax
+mulq %rbx
+addq %rax,%r14
+adcq $0,%r15
+addq %rdx,%r8
+adcq $0,%r9
+
+movq 16(%rsi),%rax
+mulq 64(%rsp)
+addq %rax,%r14
+adcq $0,%r15
+addq %rdx,%r8
+adcq $0,%r9
+
+movq 24(%rsi),%rax
+mulq %rdi
+addq %rax,%r14
+adcq $0,%r15
+addq %rdx,%r8
+adcq $0,%r9
+
+movq %rcx,%rax
+mulq %r8
+imul %rcx,%r9
+movq %rax,%r8
+addq %rdx,%r9
+
+movq 0(%rsi),%rax
+mulq %rdi
+addq %rax,%r8
+adcq $0,%r9
+addq %rdx,%r10
+adcq $0,%r11
+
+movq 0(%rsi),%rax
+mulq 64(%rsp)
+addq %rax,%r10
+adcq $0,%r11
+addq %rdx,%r12
+adcq $0,%r13
+
+movq 8(%rsi),%rax
+mulq %rdi
+addq %rax,%r10
+adcq $0,%r11
+addq %rdx,%r12
+adcq $0,%r13
+
+movq 0(%rsi),%rax
+mulq %rbx
+addq 
%rax,%r12 +adcq $0,%r13 +addq %rdx,%r14 +adcq $0,%r15 + +movq 8(%rsi),%rax +mulq 64(%rsp) +addq %rax,%r12 +adcq $0,%r13 +addq %rdx,%r14 +adcq $0,%r15 + +movq 16(%rsi),%rax +mulq %rdi +addq %rax,%r12 +adcq $0,%r13 +addq %rdx,%r14 +adcq $0,%r15 + +addq %r9,%r10 +adcq $0,%r11 +addq %r11,%r12 +adcq $0,%r13 +addq %r13,%r14 +adcq $0,%r15 + +movq 56(%rsp),%rdi + +movq %r8,0(%rdi) +movq %r10,8(%rdi) +movq %r12,16(%rdi) +movq %r14,24(%rdi) +movq %r15,32(%rdi) + +movq 0(%rsp),%r11 +movq 8(%rsp),%r12 +movq 16(%rsp),%r13 +movq 24(%rsp),%r14 +movq 32(%rsp),%r15 +movq 40(%rsp),%rbx +movq 48(%rsp),%rbp + +movq %r11,%rsp + +ret diff --git a/src/asm/field_5x64_x86_64_maax.s b/src/asm/field_5x64_x86_64_maax.s index 79b7afb816..b06ec13af3 100644 --- a/src/asm/field_5x64_x86_64_maax.s +++ b/src/asm/field_5x64_x86_64_maax.s @@ -1,23 +1,28 @@ -/*********************************************************************** - * Copyright (c) 2021 Kaushik Nath * - * Distributed under the MIT software license, see the accompanying * - * file COPYING or https://www.opensource.org/licenses/mit-license.php.* +/************************************************************************ + * Field multiplication and squaring assemblies using representation of * + * field elements in base 2^{64}. * + * Major instructions used in the assemblies are mulx/adcx/adox. * + * * + * Copyright (c) 2021 Kaushik Nath * + * Distributed under the MIT software license, see the accompanying * + * file COPYING or https://www.opensource.org/licenses/mit-license.php. * ***********************************************************************/ -/* 4-limb field multiplication and squaring using the bottom 4-limbs of - * a 5-limb representation. First reduce the 5-limb inputs to fully - * reduced 4-limb forms, then multiply and finally output a half reduced - * output in 5-limb form. The leading limb is of atmost 33 bits. - * - * Major instructions used in the assemblies: mulx/adcx/adox. +/* + * 64-bit field multiplication and squaring using the bottom 4-limbs of + * two field elements having 5-limb representation such that the fifth + * limb is of at most 64 bits. The 5-limb inputs are fully reduced first + * to 4-limb forms, then multiplied, after which a field element in 5-limb + * form is reported as output. The fifth limb of the output has at most + * 33 bits. */ .att_syntax .text .p2align 4 -.global secp256k1_fe_mul_inner -secp256k1_fe_mul_inner: +.global secp256k1_fe_mul_55to5 +secp256k1_fe_mul_55to5: movq %rsp,%r11 subq $96,%rsp @@ -163,8 +168,8 @@ movq %r11,%rsp ret .p2align 4 -.global secp256k1_fe_sqr_inner -secp256k1_fe_sqr_inner: +.global secp256k1_fe_sqr_5to5 +secp256k1_fe_sqr_5to5: movq %rsp,%r11 subq $56,%rsp @@ -279,3 +284,348 @@ movq 48(%rsp),%rbx movq %r11,%rsp ret + +/* + * 64-bit field multiplication and squaring using the bottom 4-limbs of + * two field elements having 5-limb representation such that the fifth + * limb is zero. A field element in 5-limb form is reported as output + * such that the fifth limb is of at most 33 bits. 
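+ *
+ * The 33-bit bound on the fifth limb follows from the folding constant:
+ * the upper 256 bits of the 512-bit product are multiplied by the 33-bit
+ * value 0x1000003D1 (== 2^256 mod p), so the carry that remains in the
+ * fifth limb is the high word of a 64x33-bit product plus a few carries.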
+ */
+
+.p2align 4
+.global secp256k1_fe_mul_44to5
+secp256k1_fe_mul_44to5:
+push %rbp
+push %rbx
+push %r12
+push %r13
+push %r14
+push %r15
+
+movq %rdx,%rbx
+
+xorq %r13,%r13
+movq 0(%rbx),%rdx
+mulx 0(%rsi),%r8,%r9
+mulx 8(%rsi),%rcx,%r10
+adcx %rcx,%r9
+mulx 16(%rsi),%rcx,%r11
+adcx %rcx,%r10
+mulx 24(%rsi),%rcx,%r12
+adcx %rcx,%r11
+adcx %r13,%r12
+
+xorq %r14,%r14
+movq 8(%rbx),%rdx
+mulx 0(%rsi),%rcx,%rbp
+adcx %rcx,%r9
+adox %rbp,%r10
+mulx 8(%rsi),%rcx,%rbp
+adcx %rcx,%r10
+adox %rbp,%r11
+mulx 16(%rsi),%rcx,%rbp
+adcx %rcx,%r11
+adox %rbp,%r12
+mulx 24(%rsi),%rcx,%rbp
+adcx %rcx,%r12
+adox %rbp,%r13
+adcx %r14,%r13
+
+xorq %r15,%r15
+movq 16(%rbx),%rdx
+mulx 0(%rsi),%rcx,%rbp
+adcx %rcx,%r10
+adox %rbp,%r11
+mulx 8(%rsi),%rcx,%rbp
+adcx %rcx,%r11
+adox %rbp,%r12
+mulx 16(%rsi),%rcx,%rbp
+adcx %rcx,%r12
+adox %rbp,%r13
+mulx 24(%rsi),%rcx,%rbp
+adcx %rcx,%r13
+adox %rbp,%r14
+adcx %r15,%r14
+
+xorq %rax,%rax
+movq 24(%rbx),%rdx
+mulx 0(%rsi),%rcx,%rbp
+adcx %rcx,%r11
+adox %rbp,%r12
+mulx 8(%rsi),%rcx,%rbp
+adcx %rcx,%r12
+adox %rbp,%r13
+mulx 16(%rsi),%rcx,%rbp
+adcx %rcx,%r13
+adox %rbp,%r14
+mulx 24(%rsi),%rcx,%rbp
+adcx %rcx,%r14
+adox %rbp,%r15
+adcx %rax,%r15
+
+xorq %rbp,%rbp
+movq $0x1000003D1,%rdx
+mulx %r12,%rax,%r12
+adcx %rax,%r8
+adox %r12,%r9
+mulx %r13,%rcx,%r13
+adcx %rcx,%r9
+adox %r13,%r10
+mulx %r14,%rcx,%r14
+adcx %rcx,%r10
+adox %r14,%r11
+mulx %r15,%rcx,%r15
+adcx %rcx,%r11
+adox %rbp,%r15
+adcx %rbp,%r15
+
+movq %r8,0(%rdi)
+movq %r9,8(%rdi)
+movq %r10,16(%rdi)
+movq %r11,24(%rdi)
+movq %r15,32(%rdi)
+
+pop %r15
+pop %r14
+pop %r13
+pop %r12
+pop %rbx
+pop %rbp
+
+ret
+
+.p2align 4
+.global secp256k1_fe_sqr_4to5
+secp256k1_fe_sqr_4to5:
+push %rbp
+push %rbx
+push %r12
+push %r13
+push %r14
+push %r15
+
+movq 0(%rsi),%rbx
+movq 8(%rsi),%rbp
+movq 16(%rsi),%rax
+movq 24(%rsi),%rsi
+
+xorq %r13,%r13
+movq %rbx,%rdx
+mulx %rbp,%r9,%r10
+mulx %rax,%rcx,%r11
+adcx %rcx,%r10
+mulx %rsi,%rcx,%r12
+adcx %rcx,%r11
+adcx %r13,%r12
+
+xorq %r14,%r14
+movq %rbp,%rdx
+mulx %rax,%rcx,%rdx
+adcx %rcx,%r11
+adox %rdx,%r12
+movq %rbp,%rdx
+mulx %rsi,%rcx,%rdx
+adcx %rcx,%r12
+adox %rdx,%r13
+adcx %r14,%r13
+
+xorq %r15,%r15
+movq %rax,%rdx
+mulx %rsi,%rcx,%r14
+adcx %rcx,%r13
+adcx %r15,%r14
+
+shld $1,%r14,%r15
+shld $1,%r13,%r14
+shld $1,%r12,%r13
+shld $1,%r11,%r12
+shld $1,%r10,%r11
+shld $1,%r9,%r10
+addq %r9,%r9
+
+xorq %rdx,%rdx
+movq %rbx,%rdx
+mulx %rdx,%r8,%rdx
+adcx %rdx,%r9
+
+movq %rbp,%rdx
+mulx %rdx,%rcx,%rdx
+adcx %rcx,%r10
+adcx %rdx,%r11
+
+movq %rax,%rdx
+mulx %rdx,%rcx,%rdx
+adcx %rcx,%r12
+adcx %rdx,%r13
+
+movq %rsi,%rdx
+mulx %rdx,%rcx,%rdx
+adcx %rcx,%r14
+adcx %rdx,%r15
+
+xorq %rbp,%rbp
+movq $0x1000003D1,%rdx
+mulx %r12,%rbx,%r12
+adcx %r8,%rbx
+adox %r9,%r12
+mulx %r13,%rcx,%rax
+adcx %rcx,%r12
+adox %r10,%rax
+mulx %r14,%rcx,%rsi
+adcx %rcx,%rax
+adox %r11,%rsi
+mulx %r15,%rcx,%r15
+adcx %rcx,%rsi
+adox %rbp,%r15
+adcx %rbp,%r15
+
+movq %rbx,0(%rdi)
+movq %r12,8(%rdi)
+movq %rax,16(%rdi)
+movq %rsi,24(%rdi)
+movq %r15,32(%rdi)
+
+pop %r15
+pop %r14
+pop %r13
+pop %r12
+pop %rbx
+pop %rbp
+
+ret
+
+/* 64-bit field multiplication in which the first argument has 4-limb
+ * and the second argument has 5-limb representations such that the
+ * fifth limb is of at most 64 bits. The second argument is fully
+ * reduced to 4-limb form and then field multiplication is performed.
+ * A field element in 5-limb form is reported as output such that the
+ * fifth limb is of at most 33 bits.
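+ *
+ * Concretely, writing b = b4*2^256 + b3*2^192 + b2*2^128 + b1*2^64 + b0,
+ * the prologue adds b4*0x1000003D1 into b0/b1 (letting the carry ripple
+ * through b2 and b3) and, if that addition overflows b3, adds 0x1000003D1
+ * once more, leaving a 4-limb value congruent to b modulo p.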
+ */
+
+.p2align 4
+.global secp256k1_fe_mul_45to5
+secp256k1_fe_mul_45to5:
+movq %rsp,%r11
+subq $72,%rsp
+
+movq %r11,0(%rsp)
+movq %r12,8(%rsp)
+movq %r13,16(%rsp)
+movq %r14,24(%rsp)
+movq %r15,32(%rsp)
+movq %rbp,40(%rsp)
+movq %rbx,48(%rsp)
+movq %rdi,56(%rsp)
+
+movq 0(%rdx),%rax
+movq 8(%rdx),%rbx
+movq 16(%rdx),%r8
+movq 24(%rdx),%r9
+
+movq $0x1000003D1,%r15
+xorq %rcx,%rcx
+movq 32(%rdx),%rdx
+mulx %r15,%r13,%r14
+adcx %r13,%rax
+adcx %r14,%rbx
+adcx %rcx,%r8
+adcx %rcx,%r9
+cmovc %r15,%rcx
+addq %rcx,%rax
+adcq $0,%rbx
+
+movq %r8,56(%rsp)
+movq %r9,64(%rsp)
+
+xorq %r13,%r13
+movq 0(%rsi),%rdx
+mulx %rax,%r8,%r9
+mulx %rbx,%rcx,%r10
+adcx %rcx,%r9
+mulx 56(%rsp),%rcx,%r11
+adcx %rcx,%r10
+mulx 64(%rsp),%rcx,%r12
+adcx %rcx,%r11
+adcx %r13,%r12
+
+xorq %r14,%r14
+movq 8(%rsi),%rdx
+mulx %rax,%rcx,%rbp
+adcx %rcx,%r9
+adox %rbp,%r10
+mulx %rbx,%rcx,%rbp
+adcx %rcx,%r10
+adox %rbp,%r11
+mulx 56(%rsp),%rcx,%rbp
+adcx %rcx,%r11
+adox %rbp,%r12
+mulx 64(%rsp),%rcx,%rbp
+adcx %rcx,%r12
+adox %rbp,%r13
+adcx %r14,%r13
+
+xorq %r15,%r15
+movq 16(%rsi),%rdx
+mulx %rax,%rcx,%rbp
+adcx %rcx,%r10
+adox %rbp,%r11
+mulx %rbx,%rcx,%rbp
+adcx %rcx,%r11
+adox %rbp,%r12
+mulx 56(%rsp),%rcx,%rbp
+adcx %rcx,%r12
+adox %rbp,%r13
+mulx 64(%rsp),%rcx,%rbp
+adcx %rcx,%r13
+adox %rbp,%r14
+adcx %r15,%r14
+
+xorq %rdx,%rdx
+movq 24(%rsi),%rdx
+mulx %rax,%rcx,%rbp
+adcx %rcx,%r11
+adox %rbp,%r12
+mulx %rbx,%rcx,%rbp
+adcx %rcx,%r12
+adox %rbp,%r13
+mulx 56(%rsp),%rcx,%rbp
+adcx %rcx,%r13
+adox %rbp,%r14
+mulx 64(%rsp),%rcx,%rbp
+adcx %rcx,%r14
+adox %rbp,%r15
+adcq $0,%r15
+
+xorq %rbp,%rbp
+movq $0x1000003D1,%rdx
+mulx %r12,%rax,%r12
+adcx %rax,%r8
+adox %r12,%r9
+mulx %r13,%rcx,%r13
+adcx %rcx,%r9
+adox %r13,%r10
+mulx %r14,%rcx,%r14
+adcx %rcx,%r10
+adox %r14,%r11
+mulx %r15,%rcx,%r15
+adcx %rcx,%r11
+adox %rbp,%r15
+adcx %rbp,%r15
+
+movq %r8,0(%rdi)
+movq %r9,8(%rdi)
+movq %r10,16(%rdi)
+movq %r11,24(%rdi)
+movq %r15,32(%rdi)
+
+movq 0(%rsp),%r11
+movq 8(%rsp),%r12
+movq 16(%rsp),%r13
+movq 24(%rsp),%r14
+movq 32(%rsp),%r15
+movq 40(%rsp),%rbp
+movq 48(%rsp),%rbx
+
+movq %r11,%rsp
+
+ret
diff --git a/src/asm/field_5x64_x86_64_mxaa.s b/src/asm/field_5x64_x86_64_mxaa.s
index dec40d76ed..f9b99e3848 100644
--- a/src/asm/field_5x64_x86_64_mxaa.s
+++ b/src/asm/field_5x64_x86_64_mxaa.s
@@ -1,23 +1,28 @@
-/***********************************************************************
- * Copyright (c) 2021 Kaushik Nath *
- * Distributed under the MIT software license, see the accompanying *
- * file COPYING or https://www.opensource.org/licenses/mit-license.php.*
+/************************************************************************
+ * Field multiplication and squaring assemblies using representation of *
+ * field elements in base 2^{64}. *
+ * Major instructions used in the assemblies are mulx/add/adc. *
+ * *
+ * Copyright (c) 2021 Kaushik Nath *
+ * Distributed under the MIT software license, see the accompanying *
+ * file COPYING or https://www.opensource.org/licenses/mit-license.php. *
 ***********************************************************************/
-/* 4-limb field multiplication and squaring using the bottom 4-limbs of
- * a 5-limb representation. First reduce the 5-limb inputs to fully
- * reduced 4-limb forms, then multiply and finally output a half reduced
- * output in 5-limb form. The leading limb is of atmost 33 bits.
- *
- * Major instructions used in the assemblies: mulx/add/adc.
+/* + * 64-bit field multiplication and squaring using the bottom 4-limbs of + * two field elements having 5-limb representation such that the fifth + * limb is of at most 64 bits. The 5-limb inputs are fully reduced first + * to 4-limb forms, then multiplied, after which a field element in 5-limb + * form is reported as output. The fifth limb of the output has at most + * 33 bits. */ .att_syntax .text .p2align 4 -.global secp256k1_fe_mul_inner -secp256k1_fe_mul_inner: +.global secp256k1_fe_mul_55to5 +secp256k1_fe_mul_55to5: movq %rsp,%r11 subq $112,%rsp @@ -160,8 +165,8 @@ movq %r11,%rsp ret .p2align 4 -.global secp256k1_fe_sqr_inner -secp256k1_fe_sqr_inner: +.global secp256k1_fe_sqr_5to5 +secp256k1_fe_sqr_5to5: movq %rsp,%r11 subq $64,%rsp @@ -219,7 +224,7 @@ shld $1,%r12,%r13 shld $1,%r11,%r12 shld $1,%r10,%r11 shld $1,%r9,%r10 -shlq $1,%r9 +addq %r9,%r9 movq %rbp,%rdx mulx %rdx,%r8,%rax @@ -278,3 +283,345 @@ movq 48(%rsp),%rbx movq %r11,%rsp ret + +/* + * 64-bit field multiplication and squaring using the bottom 4-limbs of + * two field elements having 5-limb representation such that the fifth + * limb is zero. A field element in 5-limb form is reported as output + * such that the fifth limb is of at most 33 bits. + */ + +.p2align 4 +.global secp256k1_fe_mul_44to5 +secp256k1_fe_mul_44to5: +movq %rsp,%r11 +subq $64,%rsp + +movq %r11,0(%rsp) +movq %r12,8(%rsp) +movq %r13,16(%rsp) +movq %r14,24(%rsp) +movq %r15,32(%rsp) +movq %rbp,40(%rsp) +movq %rbx,48(%rsp) +movq %rdi,56(%rsp) + +movq %rdx,%rdi + +movq 0(%rdi),%rdx +mulx 0(%rsi),%r8,%r9 +mulx 8(%rsi),%rcx,%r10 +addq %rcx,%r9 +mulx 16(%rsi),%rcx,%r11 +adcq %rcx,%r10 +mulx 24(%rsi),%rcx,%r12 +adcq %rcx,%r11 +adcq $0,%r12 + +movq 8(%rdi),%rdx +mulx 0(%rsi),%rax,%rbx +mulx 8(%rsi),%rcx,%rbp +addq %rcx,%rbx +mulx 16(%rsi),%rcx,%r15 +adcq %rcx,%rbp +mulx 24(%rsi),%rcx,%r13 +adcq %rcx,%r15 +adcq $0,%r13 +addq %rax,%r9 +adcq %rbx,%r10 +adcq %rbp,%r11 +adcq %r15,%r12 +adcq $0,%r13 + +movq 16(%rdi),%rdx +mulx 0(%rsi),%rax,%rbx +mulx 8(%rsi),%rcx,%rbp +addq %rcx,%rbx +mulx 16(%rsi),%rcx,%r15 +adcq %rcx,%rbp +mulx 24(%rsi),%rcx,%r14 +adcq %rcx,%r15 +adcq $0,%r14 +addq %rax,%r10 +adcq %rbx,%r11 +adcq %rbp,%r12 +adcq %r15,%r13 +adcq $0,%r14 + +movq 24(%rdi),%rdx +mulx 0(%rsi),%rax,%rbx +mulx 8(%rsi),%rcx,%rbp +addq %rcx,%rbx +mulx 16(%rsi),%rcx,%r15 +adcq %rcx,%rbp +mulx 24(%rsi),%rcx,%rsi +adcq %rcx,%r15 +adcq $0,%rsi +addq %rax,%r11 +adcq %rbx,%r12 +adcq %rbp,%r13 +adcq %r15,%r14 +adcq $0,%rsi + +movq $0x1000003D1,%rdx +mulx %r12,%r12,%rbx +mulx %r13,%r13,%rcx +addq %rbx,%r13 +mulx %r14,%r14,%rbx +adcq %rcx,%r14 +mulx %rsi,%r15,%rcx +adcq %rbx,%r15 +adcq $0,%rcx +addq %r12,%r8 +adcq %r13,%r9 +adcq %r14,%r10 +adcq %r15,%r11 +adcq $0,%rcx + +movq 56(%rsp),%rdi +movq %r8,0(%rdi) +movq %r9,8(%rdi) +movq %r10,16(%rdi) +movq %r11,24(%rdi) +movq %rcx,32(%rdi) + +movq 0(%rsp),%r11 +movq 8(%rsp),%r12 +movq 16(%rsp),%r13 +movq 24(%rsp),%r14 +movq 32(%rsp),%r15 +movq 40(%rsp),%rbp +movq 48(%rsp),%rbx + +movq %r11,%rsp + +ret + +.p2align 4 +.global secp256k1_fe_sqr_4to5 +secp256k1_fe_sqr_4to5: +movq %rsp,%r11 +subq $56,%rsp + +movq %r11,0(%rsp) +movq %r12,8(%rsp) +movq %r13,16(%rsp) +movq %r14,24(%rsp) +movq %r15,32(%rsp) +movq %rbp,40(%rsp) +movq %rbx,48(%rsp) + +movq 0(%rsi),%rdx +mulx 8(%rsi),%r9,%r10 +mulx 16(%rsi),%rcx,%r11 +addq %rcx,%r10 +mulx 24(%rsi),%rcx,%r12 +adcq %rcx,%r11 +adcq $0,%r12 + +movq 8(%rsi),%rdx +mulx 16(%rsi),%rax,%rbx +mulx 24(%rsi),%rcx,%r13 +addq %rcx,%rbx +adcq $0,%r13 +addq %rax,%r11 +adcq %rbx,%r12 +adcq $0,%r13 + +movq 
16(%rsi),%rdx
+mulx 24(%rsi),%rax,%r14
+addq %rax,%r13
+adcq $0,%r14
+
+movq $0,%r15
+shld $1,%r14,%r15
+shld $1,%r13,%r14
+shld $1,%r12,%r13
+shld $1,%r11,%r12
+shld $1,%r10,%r11
+shld $1,%r9,%r10
+addq %r9,%r9
+
+movq 0(%rsi),%rdx
+mulx %rdx,%r8,%rax
+addq %rax,%r9
+
+movq 8(%rsi),%rdx
+mulx %rdx,%rax,%rbx
+adcq %rax,%r10
+adcq %rbx,%r11
+
+movq 16(%rsi),%rdx
+mulx %rdx,%rax,%rbx
+adcq %rax,%r12
+adcq %rbx,%r13
+
+movq 24(%rsi),%rdx
+mulx %rdx,%rax,%rbx
+adcq %rax,%r14
+adcq %rbx,%r15
+
+movq $0x1000003D1,%rdx
+mulx %r12,%r12,%rbx
+mulx %r13,%r13,%rcx
+addq %rbx,%r13
+mulx %r14,%r14,%rbx
+adcq %rcx,%r14
+mulx %r15,%r15,%rcx
+adcq %rbx,%r15
+adcq $0,%rcx
+addq %r12,%r8
+adcq %r13,%r9
+adcq %r14,%r10
+adcq %r15,%r11
+adcq $0,%rcx
+
+movq %r8,0(%rdi)
+movq %r9,8(%rdi)
+movq %r10,16(%rdi)
+movq %r11,24(%rdi)
+movq %rcx,32(%rdi)
+
+movq 0(%rsp),%r11
+movq 8(%rsp),%r12
+movq 16(%rsp),%r13
+movq 24(%rsp),%r14
+movq 32(%rsp),%r15
+movq 40(%rsp),%rbp
+movq 48(%rsp),%rbx
+
+movq %r11,%rsp
+
+ret
+
+/* 64-bit field multiplication in which the first argument has 4-limb
+ * and the second argument has 5-limb representations such that the
+ * fifth limb is of at most 64 bits. The second argument is fully
+ * reduced to 4-limb form and then field multiplication is performed.
+ * A field element in 5-limb form is reported as output such that the
+ * fifth limb is of at most 33 bits.
+ */
+
+.p2align 4
+.global secp256k1_fe_mul_45to5
+secp256k1_fe_mul_45to5:
+movq %rsp,%r11
+subq $88,%rsp
+
+movq %r11,0(%rsp)
+movq %r12,8(%rsp)
+movq %r13,16(%rsp)
+movq %r14,24(%rsp)
+movq %r15,32(%rsp)
+movq %rbp,40(%rsp)
+movq %rbx,48(%rsp)
+
+movq 0(%rdx),%r12
+movq 8(%rdx),%r13
+movq 16(%rdx),%r14
+movq 24(%rdx),%r15
+movq 32(%rdx),%rax
+
+movq $0x1000003D1,%rdx
+xorq %rcx,%rcx
+mulx %rax,%rax,%rbx
+addq %rax,%r12
+adcq %rbx,%r13
+adcq $0,%r14
+adcq $0,%r15
+cmovc %rdx,%rcx
+addq %rcx,%r12
+adcq $0,%r13
+
+movq %r12,56(%rsp)
+movq %r13,64(%rsp)
+movq %r14,72(%rsp)
+movq %r15,80(%rsp)
+
+movq 0(%rsi),%rdx
+mulx 56(%rsp),%r8,%r9
+mulx 64(%rsp),%rcx,%r10
+addq %rcx,%r9
+mulx 72(%rsp),%rcx,%r11
+adcq %rcx,%r10
+mulx 80(%rsp),%rcx,%r12
+adcq %rcx,%r11
+adcq $0,%r12
+
+movq 8(%rsi),%rdx
+mulx 56(%rsp),%rax,%rbx
+mulx 64(%rsp),%rcx,%rbp
+addq %rcx,%rbx
+mulx 72(%rsp),%rcx,%r15
+adcq %rcx,%rbp
+mulx 80(%rsp),%rcx,%r13
+adcq %rcx,%r15
+adcq $0,%r13
+addq %rax,%r9
+adcq %rbx,%r10
+adcq %rbp,%r11
+adcq %r15,%r12
+adcq $0,%r13
+
+movq 16(%rsi),%rdx
+mulx 56(%rsp),%rax,%rbx
+mulx 64(%rsp),%rcx,%rbp
+addq %rcx,%rbx
+mulx 72(%rsp),%rcx,%r15
+adcq %rcx,%rbp
+mulx 80(%rsp),%rcx,%r14
+adcq %rcx,%r15
+adcq $0,%r14
+addq %rax,%r10
+adcq %rbx,%r11
+adcq %rbp,%r12
+adcq %r15,%r13
+adcq $0,%r14
+
+movq 24(%rsi),%rdx
+mulx 56(%rsp),%rax,%rbx
+mulx 64(%rsp),%rcx,%rbp
+addq %rcx,%rbx
+mulx 72(%rsp),%rcx,%r15
+adcq %rcx,%rbp
+mulx 80(%rsp),%rcx,%rsi
+adcq %rcx,%r15
+adcq $0,%rsi
+addq %rax,%r11
+adcq %rbx,%r12
+adcq %rbp,%r13
+adcq %r15,%r14
+adcq $0,%rsi
+
+movq $0x1000003D1,%rdx
+mulx %r12,%r12,%rbx
+mulx %r13,%r13,%rcx
+addq %rbx,%r13
+mulx %r14,%r14,%rbx
+adcq %rcx,%r14
+mulx %rsi,%r15,%rcx
+adcq %rbx,%r15
+adcq $0,%rcx
+addq %r12,%r8
+adcq %r13,%r9
+adcq %r14,%r10
+adcq %r15,%r11
+adcq $0,%rcx
+
+movq %r8,0(%rdi)
+movq %r9,8(%rdi)
+movq %r10,16(%rdi)
+movq %r11,24(%rdi)
+movq %rcx,32(%rdi)
+
+movq 0(%rsp),%r11
+movq 8(%rsp),%r12
+movq 16(%rsp),%r13
+movq 24(%rsp),%r14
+movq 32(%rsp),%r15
+movq 40(%rsp),%rbp
+movq 48(%rsp),%rbx
+
+movq %r11,%rsp
+
+ret
diff --git a/src/field_5x64_impl.h b/src/field_5x64_impl.h
index 1c587b0332..0325d65e60 
100644 --- a/src/field_5x64_impl.h +++ b/src/field_5x64_impl.h @@ -17,8 +17,10 @@ #if defined(USE_EXTERNAL_ASM) /* External assembler implementation */ -void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b); -void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t *a); +void secp256k1_fe_mul_55to5(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b); +void secp256k1_fe_mul_45to5(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b); +void secp256k1_fe_sqr_5to5(uint64_t *r, const uint64_t *a); +void secp256k1_fe_sqr_4to5(uint64_t *r, const uint64_t *a); #endif #ifdef VERIFY @@ -733,7 +735,7 @@ static void secp256k1_fe_mul(secp256k1_fe *r, const secp256k1_fe *a, const secp2 #endif #if defined(USE_EXTERNAL_ASM) - secp256k1_fe_mul_inner(r->n, a->n, b->n); + secp256k1_fe_mul_55to5(r->n, a->n, b->n); #else mul2(c0,c1,a4,0x1000003D1ULL); a4 = 0; @@ -803,10 +805,12 @@ static void secp256k1_fe_mul(secp256k1_fe *r, const secp256k1_fe *a, const secp2 } static void secp256k1_fe_mul_prec(secp256k1_fe *r, const secp256k1_fe *a, const secp256k1_fe * SECP256K1_RESTRICT b_prec) { +#ifndef USE_EXTERNAL_ASM uint64_t a0 = a->n[0], a1 = a->n[1], a2 = a->n[2], a3 = a->n[3], a4 = a->n[4]; uint64_t b0 = b_prec->n[0], b1 = b_prec->n[1], b2 = b_prec->n[2], b3 = b_prec->n[3]; uint64_t c0 = 0, c1 = 0, c2 = 0, c3 = 0, c4 = 0, c5 = 0, c6 = 0, c7 = 0; uint64_t d0 = 0, d1 = 0, d2 = 0, d3 = 0, d4 = 0; +#endif #ifdef VERIFY VERIFY_CHECK(a->magnitude <= 2048); @@ -818,6 +822,9 @@ static void secp256k1_fe_mul_prec(secp256k1_fe *r, const secp256k1_fe *a, const VERIFY_CHECK(a != b_prec); #endif +#if defined(USE_EXTERNAL_ASM) + secp256k1_fe_mul_45to5(r->n, b_prec->n, a->n); +#else mul2(c0,c1,a4,0x1000003D1ULL); a4 = 0; add2(c0,c1,a0); @@ -862,6 +869,7 @@ static void secp256k1_fe_mul_prec(secp256k1_fe *r, const secp256k1_fe *a, const add2(d3,d4,c3); r->n[3] = d3; r->n[4] = d4; +#endif #ifdef VERIFY r->magnitude = 1; @@ -884,7 +892,7 @@ static void secp256k1_fe_sqr(secp256k1_fe *r, const secp256k1_fe *a) { #endif #if defined(USE_EXTERNAL_ASM) - secp256k1_fe_sqr_inner(r->n, a->n); + secp256k1_fe_sqr_5to5(r->n, a->n); #else /* Bring a to [0,2**256). */ mul2(c0,c1,a4,0x1000003D1ULL); @@ -935,9 +943,11 @@ static void secp256k1_fe_sqr(secp256k1_fe *r, const secp256k1_fe *a) { } static void secp256k1_fe_sqr_prec(secp256k1_fe *r, const secp256k1_fe *a_prec) { +#ifndef USE_EXTERNAL_ASM uint64_t a0 = a_prec->n[0], a1 = a_prec->n[1], a2 = a_prec->n[2], a3 = a_prec->n[3]; uint64_t c0 = 0, c1 = 0, c2 = 0, c3 = 0, c4 = 0, c5 = 0, c6 = 0, c7 = 0; uint64_t d0 = 0, d1 = 0, d2 = 0, d3 = 0, d4 = 0; +#endif #ifdef VERIFY VERIFY_CHECK(a_prec->precomputed); @@ -945,6 +955,9 @@ static void secp256k1_fe_sqr_prec(secp256k1_fe *r, const secp256k1_fe *a_prec) { secp256k1_fe_verify(a_prec); #endif +#if defined(USE_EXTERNAL_ASM) + secp256k1_fe_sqr_4to5(r->n, a_prec->n); +#else /* Compute 512-bit product. */ c0 = 0; c1 = 0; @@ -973,6 +986,7 @@ static void secp256k1_fe_sqr_prec(secp256k1_fe *r, const secp256k1_fe *a_prec) { add2(d3,d4,c3); r->n[3] = d3; r->n[4] = d4; +#endif #ifdef VERIFY r->magnitude = 1;
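For review purposes, the net effect of the 44to5 routines above can be modelled in a few lines of C. The sketch below is not part of the patch: the helper names (mul_4x4, fe_mul_44to5_model) are made up for illustration, and it assumes a compiler that provides unsigned __int128. It forms the full 512-bit product and then folds the upper 256 bits back in using 2^256 == 0x1000003D1 (mod p); the assembly interleaves that fold with the multiplication, but the resulting 5-limb value, with its fifth limb below 2^33, is the same. secp256k1_fe_sqr_4to5 is the same computation with b = a, and secp256k1_fe_mul_45to5 first folds the fifth limb of its 5-limb operand (b4*2^256 == b4*0x1000003D1 mod p) down to 4 limbs before performing the same multiplication.

#include <stdint.h>

typedef unsigned __int128 uint128_t;   /* GCC/Clang extension on 64-bit targets */

/* Hypothetical helper: schoolbook 4x4-limb multiply, t = a*b as a 512-bit value. */
static void mul_4x4(uint64_t t[8], const uint64_t a[4], const uint64_t b[4]) {
    int i, j;
    for (i = 0; i < 8; i++) t[i] = 0;
    for (i = 0; i < 4; i++) {
        uint64_t carry = 0;
        for (j = 0; j < 4; j++) {
            uint128_t m = (uint128_t)a[i] * b[j] + t[i + j] + carry;
            t[i + j] = (uint64_t)m;
            carry = (uint64_t)(m >> 64);
        }
        t[i + 4] = carry;
    }
}

/* Hypothetical model of secp256k1_fe_mul_44to5: r is a 5-limb value congruent
 * to a*b modulo p = 2^256 - 0x1000003D1, with r[4] < 2^33. */
static void fe_mul_44to5_model(uint64_t r[5], const uint64_t a[4], const uint64_t b[4]) {
    const uint64_t C = 0x1000003D1ULL;  /* 2^256 mod p = 2^32 + 977 */
    uint64_t t[8];
    uint64_t carry = 0;
    uint128_t acc;
    int i;

    mul_4x4(t, a, b);

    /* hi*2^256 + lo == hi*C + lo (mod p): add t[4..7]*C into t[0..3]. */
    for (i = 0; i < 4; i++) {
        acc = (uint128_t)t[4 + i] * C + t[i] + carry;
        r[i] = (uint64_t)acc;
        carry = (uint64_t)(acc >> 64);
    }
    r[4] = carry;  /* high word of a 64x33-bit product plus carries: below 2^33 */
}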