From 237baa9cbfe889bc48a61c5bc56004f14173e7cb Mon Sep 17 00:00:00 2001 From: Pieter Wuille Date: Thu, 5 Aug 2021 11:41:43 -0700 Subject: [PATCH] WIP: integrate x86_64_gen.s 4-limb multiplication --- src/asm/field_5x64_aarch64.s | 1163 ++++++++++++++++++++++-------- src/asm/field_5x64_x86_64_gen.s | 585 ++++++++++++++- src/asm/field_5x64_x86_64_maax.s | 378 +++++++++- src/asm/field_5x64_x86_64_mxaa.s | 377 +++++++++- src/field_5x64_impl.h | 22 +- 5 files changed, 2163 insertions(+), 362 deletions(-) diff --git a/src/asm/field_5x64_aarch64.s b/src/asm/field_5x64_aarch64.s index 13fa1a33d0..d89f6cf545 100644 --- a/src/asm/field_5x64_aarch64.s +++ b/src/asm/field_5x64_aarch64.s @@ -3,358 +3,891 @@ .text +/* Aarch64 assembly modules, created by disassembling the + output of high level c function written by Kaushik */ + .p2align 4 - .global secp256k1_fe_mul_inner - .type secp256k1_fe_mul_inner, %function -secp256k1_fe_mul_inner: - stp x29, x30, [sp, #-48]! - mov x5, #0x3d1 // #977 - movk x5, #0x1, lsl #32 + .global secp256k1_fe_mul_45to5 + .type secp256k1_fe_mul_45to5, %function +secp256k1_fe_mul_45to5: + stp x29, x30, [sp, #-16]! + mov x8, #0x3d1 // #977 + movk x8, #0x1, lsl #32 mov x29, sp - ldp x12, x3, [x1] - stp x19, x20, [sp, #16] - ldr x4, [x1, #32] - ldp x10, x14, [x1, #16] - mul x6, x4, x5 - umulh x4, x4, x5 - adds x12, x12, x6 - cset x6, cs // cs = hs, nlast - adds x4, x4, x3 - cset x3, cs // cs = hs, nlast + ldp x7, x3, [x2, #24] + ldp x6, x5, [x2] + ldp x10, x9, [x1] + mul x4, x3, x8 + umulh x3, x3, x8 adds x4, x4, x6 cinc x3, x3, cs // cs = hs, nlast - adds x10, x10, x3 - cset x1, cs // cs = hs, nlast - adds x14, x14, x1 - csetm x1, cs // cs = hs, nlast - and x1, x1, x5 - adds x12, x1, x12 - cset x1, cs // cs = hs, nlast - adds x4, x4, x1 - ldp x15, x9, [x2, #24] - cset x1, cs // cs = hs, nlast - adds x10, x10, x1 - ldp x11, x1, [x2] + ldr x6, [x2, #16] + adds x5, x5, x3 + cset x3, cs // cs = hs, nlast + adds x6, x6, x3 + cset x2, cs // cs = hs, nlast + adds x7, x7, x2 + cset x2, cs // cs = hs, nlast + mul x18, x9, x6 + ldp x12, x15, [x1, #16] + mul x14, x9, x7 + mul x11, x2, x8 + umulh x2, x2, x8 + adds x11, x11, x4 + umulh x4, x10, x7 + adc x5, x5, x2 + umulh x2, x9, x6 + mul x3, x12, x6 + mul x17, x10, x7 + adds x2, x2, x3 + mul x13, x15, x5 + umulh x1, x12, x5 + cset x3, cs // cs = hs, nlast + adds x4, x4, x14 + cset x14, cs // cs = hs, nlast + adds x2, x2, x4 + adc x3, x3, x14 + umulh x4, x15, x11 + adds x1, x1, x13 + mul x14, x10, x11 cset x13, cs // cs = hs, nlast - mul x3, x9, x5 - umulh x9, x9, x5 - adds x11, x11, x3 + adds x2, x2, x1 + adc x3, x3, x13 + adds x2, x2, x4 + cinc x3, x3, cs // cs = hs, nlast + mul x13, x15, x6 + umulh x1, x12, x6 + mul x4, x2, x8 + umulh x2, x2, x8 + madd x2, x3, x8, x2 + adds x4, x4, x14 + umulh x3, x9, x7 + str x4, [x0] + mul x14, x12, x7 + cinc x2, x2, cs // cs = hs, nlast + adds x1, x1, x13 + umulh x13, x15, x5 + cset x4, cs // cs = hs, nlast + adds x3, x3, x14 + cset x14, cs // cs = hs, nlast + adds x1, x1, x3 + adc x4, x4, x14 + adds x1, x1, x13 + cinc x4, x4, cs // cs = hs, nlast + umulh x3, x15, x7 + mul x13, x5, x10 + umulh x14, x1, x8 + mul x1, x1, x8 + madd x14, x4, x8, x14 + adds x1, x1, x13 + umulh x4, x10, x11 + mul x13, x9, x11 + cinc x14, x14, cs // cs = hs, nlast + adds x16, x4, x13 + mul x4, x3, x8 + umulh x13, x3, x8 + cset x30, cs // cs = hs, nlast + adds x3, x1, x16 + umulh x16, x10, x6 + adc x14, x14, x30 + adds x4, x4, x17 + umulh x1, x12, x11 + cinc x13, x13, cs // cs = hs, nlast + mul x17, x15, x11 + adds x16, x16, x18 
+ cset x30, cs // cs = hs, nlast + adds x4, x4, x16 + mul x18, x12, x5 + adc x13, x13, x30 + umulh x16, x9, x5 + adds x17, x1, x17 + cset x30, cs // cs = hs, nlast + adds x4, x4, x17 + umulh x1, x12, x7 + adc x13, x13, x30 + mul x7, x15, x7 + adds x16, x16, x18 + umulh x15, x15, x6 + cset x17, cs // cs = hs, nlast + adds x4, x4, x16 + umulh x16, x9, x11 + adc x13, x13, x17 + adds x1, x1, x7 + cset x7, cs // cs = hs, nlast + adds x1, x1, x15 + cinc x15, x7, cs // cs = hs, nlast + mul x12, x12, x11 + mul x6, x10, x6 + umulh x7, x1, x8 + mul x1, x1, x8 + madd x7, x15, x8, x7 + adds x1, x1, x16 + mul x9, x9, x5 + cinc x7, x7, cs // cs = hs, nlast + umulh x5, x5, x10 + adds x6, x6, x12 + cset x8, cs // cs = hs, nlast + adds x1, x1, x6 + adc x7, x7, x8 + adds x5, x5, x9 cset x6, cs // cs = hs, nlast - adds x9, x9, x1 - ldr x3, [x2, #16] + adds x1, x1, x5 + adc x7, x7, x6 + adds x2, x2, x3 + cinc x14, x14, cs // cs = hs, nlast + adds x1, x1, x14 + stp x2, x1, [x0, #8] + cinc x7, x7, cs // cs = hs, nlast + adds x4, x4, x7 + cinc x13, x13, cs // cs = hs, nlast + stp x4, x13, [x0, #24] + ldp x29, x30, [sp], #16 + ret + .size secp256k1_fe_mul_45to5, .-secp256k1_fe_mul_45to5 + + .p2align 4 + .global secp256k1_fe_mul_55to5 + .type secp256k1_fe_mul_55to5, %function +secp256k1_fe_mul_55to5: + stp x29, x30, [sp, #-32]! + mov x9, #0x3d1 // #977 + movk x9, #0x1, lsl #32 + mov x29, sp + ldp x5, x10, [x1] + stp x19, x20, [sp, #16] + ldr x3, [x1, #32] + ldp x12, x8, [x1, #16] + mul x4, x3, x9 + umulh x3, x3, x9 + adds x4, x4, x5 + cinc x3, x3, cs // cs = hs, nlast + adds x10, x10, x3 + cset x3, cs // cs = hs, nlast + adds x12, x12, x3 cset x1, cs // cs = hs, nlast - adds x9, x9, x6 - str x21, [sp, #32] + adds x8, x8, x1 + cset x3, cs // cs = hs, nlast + ldp x7, x11, [x2] + ldr x1, [x2, #32] + mul x6, x3, x9 + umulh x5, x3, x9 + adds x6, x6, x4 + mul x3, x1, x9 + adc x10, x10, x5 + umulh x1, x1, x9 + adds x3, x3, x7 cinc x1, x1, cs // cs = hs, nlast - adds x3, x3, x1 - cset x8, cs // cs = hs, nlast - adds x8, x15, x8 - csetm x1, cs // cs = hs, nlast - and x1, x1, x5 - adds x11, x1, x11 + ldr x13, [x2, #16] + adds x11, x11, x1 + cset x1, cs // cs = hs, nlast + ldr x7, [x2, #24] + adds x13, x13, x1 cset x1, cs // cs = hs, nlast - adds x9, x9, x1 - cset x2, cs // cs = hs, nlast - umulh x1, x12, x11 - adds x3, x3, x2 - mul x6, x4, x11 - mul x2, x9, x12 - cset x15, cs // cs = hs, nlast - umulh x7, x9, x12 - adds x1, x1, x2 - umulh x2, x4, x11 - cinc x7, x7, cs // cs = hs, nlast - adds x6, x1, x6 - mul x16, x12, x3 - cinc x1, x2, cs // cs = hs, nlast adds x7, x7, x1 - umulh x1, x12, x3 - cset x2, cs // cs = hs, nlast - adds x7, x7, x16 - mul x17, x9, x4 - cinc x1, x1, cs // cs = hs, nlast - adds x2, x2, x1 - umulh x1, x9, x4 - cset x16, cs // cs = hs, nlast - adds x7, x7, x17 - cinc x1, x1, cs // cs = hs, nlast - mul x17, x10, x11 - adds x2, x2, x1 - umulh x1, x10, x11 - cinc x19, x16, cs // cs = hs, nlast - adds x7, x7, x17 - cinc x1, x1, cs // cs = hs, nlast - mul x21, x4, x3 - adds x2, x2, x1 - umulh x1, x4, x3 - cset x16, cs // cs = hs, nlast - adds x15, x15, x8 - mul x20, x9, x10 - umulh x17, x9, x10 - mul x8, x12, x15 - umulh x18, x12, x15 - adds x2, x2, x8 - mul x30, x4, x15 - cinc x18, x18, cs // cs = hs, nlast - adds x2, x2, x21 - add x16, x16, x18 - cinc x1, x1, cs // cs = hs, nlast - add x16, x16, x19 - umulh x19, x4, x15 - adds x1, x16, x1 - mul x12, x12, x11 - cset x4, cs // cs = hs, nlast - cmp x18, x16 - cinc x8, x4, hi // hi = pmore - adds x2, x2, x20 - cinc x4, x17, cs // cs = hs, nlast - mul x18, 
x10, x3 - adds x1, x1, x4 - umulh x17, x10, x3 - cset x4, cs // cs = hs, nlast - adds x13, x13, x14 - mul x16, x10, x15 - umulh x10, x10, x15 - mul x14, x11, x13 - umulh x11, x11, x13 + cset x1, cs // cs = hs, nlast + mul x19, x12, x13 + umulh x14, x10, x13 + mul x18, x10, x7 + mul x5, x1, x9 + umulh x1, x1, x9 + adds x5, x5, x3 + umulh x17, x7, x6 + adc x11, x11, x1 + mul x16, x7, x12 + umulh x1, x12, x13 + umulh x2, x8, x5 + umulh x4, x11, x12 + mul x15, x11, x8 + adds x2, x2, x19 + cset x3, cs // cs = hs, nlast + mul x19, x6, x5 + adds x15, x4, x15 + umulh x4, x11, x8 + cset x20, cs // cs = hs, nlast + adds x2, x2, x15 + adc x3, x3, x20 + adds x14, x14, x18 + cset x15, cs // cs = hs, nlast adds x2, x2, x14 - mul x20, x9, x13 - cinc x11, x11, cs // cs = hs, nlast - umulh x21, x9, x13 - adds x1, x1, x11 - mul x11, x3, x13 - adc x4, x8, x4 - adds x1, x1, x30 - cinc x19, x19, cs // cs = hs, nlast - umulh x3, x3, x13 - adds x4, x4, x19 - umulh x9, x15, x13 - cset x8, cs // cs = hs, nlast - adds x1, x1, x18 - cinc x17, x17, cs // cs = hs, nlast - mul x13, x15, x13 - adds x4, x4, x17 - cinc x14, x8, cs // cs = hs, nlast - adds x1, x1, x20 - cinc x21, x21, cs // cs = hs, nlast - adds x4, x4, x21 - cset x8, cs // cs = hs, nlast - adds x4, x4, x16 - cinc x10, x10, cs // cs = hs, nlast - adds x4, x4, x11 - add x8, x8, x10 + adc x3, x3, x15 + adds x2, x2, x17 cinc x3, x3, cs // cs = hs, nlast - add x8, x8, x14 - mul x11, x1, x5 - adds x3, x8, x3 - umulh x1, x1, x5 + mul x18, x13, x8 + umulh x17, x10, x7 + mul x14, x2, x9 + umulh x2, x2, x9 + madd x2, x3, x9, x2 + adds x14, x14, x19 + str x14, [x0] + mul x15, x10, x5 + cinc x2, x2, cs // cs = hs, nlast + adds x1, x1, x16 + cset x3, cs // cs = hs, nlast + adds x4, x4, x18 cset x14, cs // cs = hs, nlast - cmp x10, x8 - cinc x9, x9, hi // hi = pmore - adds x3, x3, x13 - adc x9, x9, x14 - mul x8, x4, x5 - adds x11, x11, x12 - umulh x4, x4, x5 - cinc x1, x1, cs // cs = hs, nlast - mul x10, x3, x5 + adds x1, x1, x4 + adc x4, x3, x14 + adds x1, x1, x17 + cinc x4, x4, cs // cs = hs, nlast + mul x17, x11, x6 + umulh x3, x6, x5 + umulh x14, x1, x9 + mul x1, x1, x9 + madd x14, x4, x9, x14 + umulh x16, x7, x8 + adds x1, x1, x15 + umulh x4, x12, x5 + cinc x14, x14, cs // cs = hs, nlast + mul x15, x8, x5 + adds x3, x3, x17 + cset x18, cs // cs = hs, nlast + mul x17, x7, x6 + adds x3, x1, x3 + umulh x1, x13, x6 + adc x14, x14, x18 + adds x4, x4, x15 + mul x19, x11, x12 + cset x15, cs // cs = hs, nlast + adds x18, x1, x17 + mul x1, x16, x9 + umulh x17, x16, x9 + cset x20, cs // cs = hs, nlast + adds x4, x4, x18 + mul x30, x10, x13 + umulh x16, x10, x11 + adc x15, x15, x20 + adds x18, x1, x19 + umulh x1, x13, x8 + cinc x17, x17, cs // cs = hs, nlast + adds x4, x4, x18 + mul x8, x7, x8 + adc x15, x15, x17 + adds x16, x16, x30 + umulh x7, x7, x12 + cset x17, cs // cs = hs, nlast + adds x4, x4, x16 + adc x15, x15, x17 adds x1, x1, x8 - umulh x3, x3, x5 + cset x8, cs // cs = hs, nlast + adds x1, x1, x7 + cinc x8, x8, cs // cs = hs, nlast + mul x13, x13, x6 + mul x12, x12, x5 + umulh x7, x1, x9 + mul x1, x1, x9 + madd x7, x8, x9, x7 + umulh x6, x11, x6 + adds x1, x1, x13 + umulh x5, x10, x5 + cinc x7, x7, cs // cs = hs, nlast + mul x10, x10, x11 + adds x6, x6, x12 cset x8, cs // cs = hs, nlast adds x1, x1, x6 - adc x4, x8, x4 - mul x8, x9, x5 - adds x4, x4, x10 - umulh x5, x9, x5 + adc x7, x7, x8 + adds x5, x5, x10 cset x6, cs // cs = hs, nlast + adds x1, x1, x5 + adc x7, x7, x6 + adds x2, x2, x3 + cinc x14, x14, cs // cs = hs, nlast + adds x1, x1, x14 + stp x2, x1, [x0, 
#8] + cinc x7, x7, cs // cs = hs, nlast adds x4, x4, x7 - adc x3, x6, x3 - stp x11, x1, [x0] - adds x3, x3, x8 - cset x9, cs // cs = hs, nlast - adds x2, x3, x2 - adc x5, x9, x5 - stp x4, x2, [x0, #16] - str x5, [x0, #32] + cinc x15, x15, cs // cs = hs, nlast + stp x4, x15, [x0, #24] ldp x19, x20, [sp, #16] - ldr x21, [sp, #32] - ldp x29, x30, [sp], #48 + ldp x29, x30, [sp], #32 ret - .size secp256k1_fe_mul_inner, .-secp256k1_fe_mul_inner + .size secp256k1_fe_mul_55to5, .-secp256k1_fe_mul_55to5 .p2align 4 - .global secp256k1_fe_sqr_inner - .type secp256k1_fe_sqr_inner, %function -secp256k1_fe_sqr_inner: - stp x29, x30, [sp, #-32]! - mov x5, #0x3d1 // #977 - movk x5, #0x1, lsl #32 + .global secp256k1_fe_sqr_5to5 + .type secp256k1_fe_sqr_5to5, %function +secp256k1_fe_sqr_5to5: + stp x29, x30, [sp, #-16]! + mov x6, #0x3d1 // #977 + movk x6, #0x1, lsl #32 mov x29, sp - ldp x3, x4, [x1] + ldp x5, x4, [x1] ldr x2, [x1, #32] - ldp x10, x9, [x1, #16] - str x19, [sp, #16] - mul x6, x2, x5 - umulh x2, x2, x5 - adds x3, x3, x6 - cset x6, cs // cs = hs, nlast - adds x2, x2, x4 - cset x4, cs // cs = hs, nlast - adds x2, x2, x6 - cinc x4, x4, cs // cs = hs, nlast - adds x10, x10, x4 - cset x11, cs // cs = hs, nlast - adds x9, x9, x11 - csetm x1, cs // cs = hs, nlast - and x1, x1, x5 - adds x3, x1, x3 + mul x3, x2, x6 + umulh x2, x2, x6 + adds x5, x3, x5 + cinc x2, x2, cs // cs = hs, nlast + adds x4, x4, x2 + ldp x3, x2, [x1, #16] + cset x7, cs // cs = hs, nlast + adds x3, x3, x7 cset x1, cs // cs = hs, nlast adds x2, x2, x1 - cset x4, cs // cs = hs, nlast - umulh x1, x3, x3 - adds x10, x10, x4 - mul x8, x3, x3 - mul x4, x3, x2 + cset x7, cs // cs = hs, nlast + mul x13, x3, x3 + umulh x14, x3, x3 + umulh x15, x2, x3 + mul x1, x7, x6 + umulh x7, x7, x6 + adds x1, x1, x5 + mul x17, x2, x2 + adc x4, x4, x7 + lsl x5, x15, #1 + lsr x15, x15, #63 + mul x16, x2, x3 + umulh x8, x2, x1 + mul x10, x4, x2 + umulh x7, x4, x3 + mul x9, x4, x1 + adds x7, x7, x10 + mul x12, x1, x1 cset x11, cs // cs = hs, nlast - umulh x16, x3, x2 - mul x12, x3, x10 - lsl x6, x4, #1 - umulh x14, x3, x10 - lsl x13, x16, #1 - cmp x4, x6 - cinc x7, x13, hi // hi = pmore - adds x6, x1, x6 + adds x8, x7, x8 + cinc x11, x11, cs // cs = hs, nlast + umulh x30, x1, x1 + lsl x7, x8, #1 + lsl x10, x9, #1 + adds x7, x7, x13 + extr x8, x11, x8, #63 + cinc x8, x8, cs // cs = hs, nlast + lsr x9, x9, #63 + mul x18, x4, x4 + mul x11, x7, x6 + umulh x7, x7, x6 + madd x7, x8, x6, x7 + adds x11, x11, x12 + str x11, [x0] + umulh x13, x4, x2 cinc x7, x7, cs // cs = hs, nlast - cset w4, cs // cs = hs, nlast - cmp x7, #0x0 - lsl x1, x12, #1 - ccmp w4, #0x0, #0x4, eq // eq = none - lsl x15, x14, #1 - cset x4, ne // ne = any - cmp x16, x13 - cinc x4, x4, hi // hi = pmore - cmp x12, x1 - cinc x12, x15, hi // hi = pmore - adds x7, x7, x1 + adds x10, x10, x30 + cinc x9, x9, cs // cs = hs, nlast + adds x5, x5, x17 + cinc x8, x15, cs // cs = hs, nlast + mul x17, x3, x1 + mul x15, x2, x1 + umulh x11, x5, x6 + mul x5, x5, x6 + madd x11, x8, x6, x11 + adds x5, x5, x18 + umulh x12, x4, x1 + cinc x11, x11, cs // cs = hs, nlast + adds x13, x13, x16 + cset x8, cs // cs = hs, nlast + mul x16, x4, x3 + umulh x1, x3, x1 + lsl x3, x13, #1 + adds x3, x3, x14 + extr x8, x8, x13, #63 + cinc x14, x8, cs // cs = hs, nlast + umulh x2, x2, x2 + umulh x4, x4, x4 + mul x13, x3, x6 + umulh x3, x3, x6 + madd x3, x14, x6, x3 + adds x13, x13, x7 + mul x8, x2, x6 + cinc x3, x3, cs // cs = hs, nlast + adds x10, x10, x13 + adc x9, x9, x3 + adds x12, x12, x17 + cset x3, cs // cs = hs, nlast + 
umulh x2, x2, x6 + lsl x6, x12, #1 + adds x6, x6, x9 + extr x3, x3, x12, #63 + cinc x3, x3, cs // cs = hs, nlast + adds x5, x5, x6 + adc x11, x11, x3 + adds x1, x1, x15 + cset x3, cs // cs = hs, nlast + adds x1, x1, x16 + cinc x3, x3, cs // cs = hs, nlast + stp x10, x5, [x0, #8] + lsl x5, x1, #1 + adds x5, x5, x11 + extr x1, x3, x1, #63 + cinc x1, x1, cs // cs = hs, nlast + adds x4, x8, x4 + cinc x2, x2, cs // cs = hs, nlast + adds x4, x4, x5 + adc x2, x1, x2 + stp x4, x2, [x0, #24] + ldp x29, x30, [sp], #16 + ret + nop + .size secp256k1_fe_sqr_5to5, .-secp256k1_fe_sqr_5to5 + + .p2align 4 + .global secp256k1_fe_mul_44to5 + .type secp256k1_fe_mul_44to5, %function +secp256k1_fe_mul_44to5: + stp x29, x30, [sp, #-32]! + mov x9, #0x3d1 // #977 + movk x9, #0x1, lsl #32 + mov x29, sp + ldp x10, x6, [x1] + ldp x11, x15, [x1, #16] + ldp x5, x12, [x2, #16] + ldp x7, x8, [x2] + str x19, [sp, #16] + mul x13, x11, x5 + mul x3, x6, x12 + umulh x4, x12, x10 + adds x1, x3, x13 + mul x13, x15, x8 + umulh x2, x6, x5 + cset x3, cs // cs = hs, nlast + adds x4, x4, x13 + umulh x13, x11, x8 + cset x14, cs // cs = hs, nlast + adds x1, x1, x4 + adc x3, x3, x14 + umulh x4, x15, x7 + adds x2, x2, x13 + mul x14, x10, x7 + cset x13, cs // cs = hs, nlast + adds x1, x1, x2 + adc x3, x3, x13 + adds x1, x1, x4 + cinc x3, x3, cs // cs = hs, nlast + umulh x13, x11, x5 + umulh x2, x6, x12 + mul x4, x1, x9 + umulh x1, x1, x9 + madd x1, x3, x9, x1 + adds x4, x4, x14 + umulh x3, x15, x8 + str x4, [x0] + mul x14, x12, x11 + cinc x1, x1, cs // cs = hs, nlast + adds x2, x2, x13 + mul x13, x5, x15 + cset x4, cs // cs = hs, nlast + adds x3, x3, x14 + cset x14, cs // cs = hs, nlast + adds x2, x2, x3 + adc x4, x4, x14 + adds x2, x2, x13 + cinc x4, x4, cs // cs = hs, nlast + mul x16, x6, x7 + mul x14, x8, x10 + umulh x13, x2, x9 + mul x2, x2, x9 + umulh x3, x10, x7 + madd x13, x4, x9, x13 + adds x2, x2, x16 + mul x4, x12, x10 + cinc x13, x13, cs // cs = hs, nlast + adds x3, x3, x14 + mul x14, x6, x5 + cset x16, cs // cs = hs, nlast + mul x30, x15, x7 + adds x3, x2, x3 + mul x17, x11, x8 + adc x13, x13, x16 + umulh x2, x12, x15 + adds x4, x4, x14 + umulh x16, x5, x10 + cset x14, cs // cs = hs, nlast + umulh x18, x6, x8 + adds x17, x17, x30 + cset x19, cs // cs = hs, nlast + adds x4, x4, x17 + umulh x30, x11, x7 + adc x14, x14, x19 + mul x17, x2, x9 + adds x18, x16, x18 + umulh x16, x2, x9 + cset x2, cs // cs = hs, nlast + adds x4, x4, x18 + umulh x18, x5, x15 + adc x14, x14, x2 + adds x17, x17, x30 + umulh x2, x12, x11 + cinc x16, x16, cs // cs = hs, nlast + mul x12, x12, x15 + adds x4, x4, x17 + adc x14, x14, x16 + adds x2, x2, x18 + cset x16, cs // cs = hs, nlast + adds x2, x2, x12 + cinc x16, x16, cs // cs = hs, nlast + mul x11, x11, x7 + umulh x15, x8, x10 + umulh x12, x2, x9 + mul x2, x2, x9 + umulh x7, x6, x7 + madd x12, x16, x9, x12 + adds x2, x2, x11 + mul x6, x6, x8 + mul x5, x5, x10 cinc x12, x12, cs // cs = hs, nlast - cset w13, cs // cs = hs, nlast - adds x4, x4, x12 - mul x16, x2, x2 - cset x1, cs // cs = hs, nlast - cmp x12, #0x0 - ccmp w13, #0x0, #0x4, eq // eq = none - umulh x12, x2, x2 - cinc x13, x1, ne // ne = any - adds x7, x7, x16 + adds x7, x15, x7 + cset x8, cs // cs = hs, nlast + adds x5, x5, x6 + cset x6, cs // cs = hs, nlast + adds x7, x7, x5 + adc x5, x8, x6 + adds x2, x2, x7 + adc x12, x12, x5 + adds x1, x1, x3 + cinc x13, x13, cs // cs = hs, nlast + adds x2, x2, x13 + stp x1, x2, [x0, #8] cinc x12, x12, cs // cs = hs, nlast - umulh x17, x2, x10 adds x4, x4, x12 - mul x18, x2, x10 + cinc x14, x14, cs // cs = 
hs, nlast + stp x4, x14, [x0, #24] + ldr x19, [sp, #16] + ldp x29, x30, [sp], #32 + ret + .size secp256k1_fe_mul_44to5, .-secp256k1_fe_mul_44to5 + + .p2align 4 + .global secp256k1_fe_sqr_4to5 + .type secp256k1_fe_sqr_4to5, %function +secp256k1_fe_sqr_4to5: + ldp x9, x3, [x1] + mov x7, #0x3d1 // #977 + ldr x2, [x1, #24] + movk x7, #0x1, lsl #32 + ldr x1, [x1, #16] + mul x6, x3, x2 + umulh x4, x2, x9 + umulh x5, x3, x1 + adds x4, x4, x6 + mul x12, x1, x1 + cset x6, cs // cs = hs, nlast + adds x5, x4, x5 + cinc x6, x6, cs // cs = hs, nlast + mul x10, x3, x9 + lsl x4, x5, #1 + mul x15, x9, x9 + adds x4, x4, x12 + extr x5, x6, x5, #63 + cinc x5, x5, cs // cs = hs, nlast + umulh x13, x9, x9 + mul x12, x1, x9 + lsl x11, x10, #1 + mul x6, x4, x7 + lsr x10, x10, #63 + umulh x4, x4, x7 + madd x4, x5, x7, x4 + adds x6, x6, x15 + umulh x8, x3, x9 + str x6, [x0] + cinc x6, x4, cs // cs = hs, nlast + adds x11, x11, x13 + cinc x10, x10, cs // cs = hs, nlast + adds x8, x8, x12 + mul x14, x3, x3 + cset x12, cs // cs = hs, nlast + umulh x5, x3, x2 + lsl x13, x8, #1 + mul x4, x2, x1 + adds x13, x13, x14 + umulh x15, x1, x1 + extr x12, x12, x8, #63 + cinc x12, x12, cs // cs = hs, nlast + adds x14, x5, x4 + cset x17, cs // cs = hs, nlast + umulh x8, x2, x1 + lsl x5, x14, #1 + mul x18, x2, x2 + adds x5, x5, x15 + extr x17, x17, x14, #63 + cinc x17, x17, cs // cs = hs, nlast + lsl x4, x8, #1 + lsr x14, x8, #63 + mul x16, x3, x1 + mul x15, x5, x7 + umulh x5, x5, x7 + madd x5, x17, x7, x5 + adds x15, x15, x6 + mul x8, x2, x9 + cinc x5, x5, cs // cs = hs, nlast + adds x11, x11, x15 + adc x10, x10, x5 + adds x4, x4, x18 + cinc x5, x14, cs // cs = hs, nlast + umulh x9, x1, x9 + umulh x2, x2, x2 + mul x6, x4, x7 + umulh x4, x4, x7 + madd x4, x5, x7, x4 + adds x6, x6, x10 + mul x5, x2, x7 + cinc x4, x4, cs // cs = hs, nlast + adds x6, x6, x13 + adc x12, x12, x4 + adds x4, x8, x16 cset x1, cs // cs = hs, nlast - cmp x14, x15 - cinc x1, x1, hi // hi = pmore - adds x11, x11, x9 - add x1, x1, x13 - lsl x12, x17, #1 - lsl x30, x18, #1 - mul x13, x10, x10 - mul x15, x3, x11 - umulh x3, x3, x11 - mul x16, x2, x11 - lsl x14, x15, #1 - umulh x9, x2, x11 - cmp x15, x14 - lsl x19, x3, #1 - cinc x2, x19, hi // hi = pmore - adds x4, x4, x14 - cinc x2, x2, cs // cs = hs, nlast - cset w15, cs // cs = hs, nlast - adds x1, x2, x1 - cset x14, cs // cs = hs, nlast - cmp x2, #0x0 - ccmp w15, #0x0, #0x4, eq // eq = none - lsl x15, x16, #1 - cinc x14, x14, ne // ne = any - cmp x3, x19 - cset x2, hi // hi = pmore - cmp x17, x12 - cinc x3, x2, hi // hi = pmore - cmp x18, x30 - cinc x2, x12, hi // hi = pmore - adds x4, x4, x30 + adds x4, x4, x9 + cinc x1, x1, cs // cs = hs, nlast + umulh x2, x2, x7 + umulh x3, x3, x3 + lsl x7, x4, #1 + adds x7, x7, x12 + extr x1, x1, x4, #63 + cinc x1, x1, cs // cs = hs, nlast + adds x3, x5, x3 cinc x2, x2, cs // cs = hs, nlast - cset w18, cs // cs = hs, nlast - adds x1, x1, x2 - lsl x17, x9, #1 + adds x3, x3, x7 + adc x1, x1, x2 + stp x11, x6, [x0, #8] + stp x3, x1, [x0, #24] + ret + nop + nop + .size secp256k1_fe_sqr_4to5, .-secp256k1_fe_sqr_4to5 + + .p2align 4 + .global secp256k1_fe_mul_44to4 + .type secp256k1_fe_mul_44to4, %function +secp256k1_fe_mul_44to4: + stp x29, x30, [sp, #-32]! 
+ mov x6, #0x3d1 // #977 + movk x6, #0x1, lsl #32 + mov x29, sp + ldp x10, x8, [x1] + stp x19, x20, [sp, #16] + ldp x13, x15, [x1, #16] + ldp x5, x14, [x2, #16] + ldp x11, x9, [x2] + mul x7, x13, x5 + mul x3, x8, x14 + umulh x4, x14, x10 + adds x1, x3, x7 + mul x7, x15, x9 + umulh x2, x8, x5 + cset x3, cs // cs = hs, nlast + adds x4, x4, x7 + umulh x7, x13, x9 cset x12, cs // cs = hs, nlast - cmp x2, #0x0 - ccmp w18, #0x0, #0x4, eq // eq = none - add x2, x14, x3 - cinc x12, x12, ne // ne = any - cmp x16, x15 - cinc x3, x17, hi // hi = pmore - adds x1, x1, x15 + adds x1, x1, x4 + adc x3, x3, x12 + umulh x4, x15, x11 + adds x2, x2, x7 + mul x7, x10, x11 + cset x12, cs // cs = hs, nlast + adds x1, x1, x2 + adc x3, x3, x12 + adds x1, x1, x4 cinc x3, x3, cs // cs = hs, nlast - cset w14, cs // cs = hs, nlast - cmp x3, #0x0 - add x12, x12, x2 - ccmp w14, #0x0, #0x4, eq // eq = none - mul x16, x10, x11 - cset x2, ne // ne = any - cmp x9, x17 - umulh x14, x10, x10 - cinc x2, x2, hi // hi = pmore - adds x3, x3, x12 - umulh x15, x10, x11 - cset x10, cs // cs = hs, nlast - adds x1, x1, x13 - cinc x9, x14, cs // cs = hs, nlast - lsl x12, x16, #1 - adds x3, x3, x9 - lsl x14, x15, #1 - adc x2, x2, x10 - cmp x16, x12 - cinc x9, x14, hi // hi = pmore - adds x3, x3, x12 - cinc x9, x9, cs // cs = hs, nlast - cset w12, cs // cs = hs, nlast - adds x2, x9, x2 - umulh x13, x11, x11 + umulh x4, x13, x5 + umulh x2, x8, x14 + umulh x12, x1, x6 + mul x1, x1, x6 + madd x12, x3, x6, x12 + mul x16, x14, x13 + adds x1, x1, x7 + umulh x3, x15, x9 + cinc x12, x12, cs // cs = hs, nlast + adds x2, x2, x4 + mul x7, x5, x15 + cset x4, cs // cs = hs, nlast + adds x3, x3, x16 + cset x16, cs // cs = hs, nlast + adds x2, x2, x3 + adc x3, x4, x16 + adds x2, x2, x7 + cinc x3, x3, cs // cs = hs, nlast + mul x17, x8, x11 + mul x7, x9, x10 + umulh x16, x2, x6 + mul x2, x2, x6 + umulh x4, x10, x11 + madd x16, x3, x6, x16 + adds x2, x2, x17 + mul x3, x14, x10 + cinc x16, x16, cs // cs = hs, nlast + adds x4, x4, x7 + mul x7, x8, x5 + cset x17, cs // cs = hs, nlast + mul x19, x15, x11 + adds x4, x2, x4 + mul x18, x13, x9 + adc x16, x16, x17 + umulh x2, x14, x15 + adds x3, x3, x7 + umulh x17, x5, x10 + cset x7, cs // cs = hs, nlast + umulh x30, x8, x9 + adds x18, x18, x19 + cset x20, cs // cs = hs, nlast + adds x3, x3, x18 + umulh x19, x13, x11 + adc x7, x7, x20 + mul x18, x2, x6 + adds x30, x17, x30 + cset x20, cs // cs = hs, nlast + umulh x17, x2, x6 + adds x3, x3, x30 + umulh x2, x14, x13 + umulh x30, x5, x15 + adc x7, x7, x20 + adds x18, x18, x19 + mul x14, x14, x15 + cinc x17, x17, cs // cs = hs, nlast + adds x3, x3, x18 + adc x7, x7, x17 + adds x2, x2, x30 + cset x17, cs // cs = hs, nlast + adds x2, x2, x14 + cinc x17, x17, cs // cs = hs, nlast + mul x13, x13, x11 + umulh x15, x9, x10 + umulh x14, x2, x6 + mul x2, x2, x6 + umulh x11, x8, x11 + madd x14, x17, x6, x14 + adds x2, x2, x13 + mul x8, x8, x9 + mul x5, x5, x10 + cinc x14, x14, cs // cs = hs, nlast + adds x9, x15, x11 cset x10, cs // cs = hs, nlast - cmp x9, #0x0 - ccmp w12, #0x0, #0x4, eq // eq = none - mul x9, x11, x11 - mul x12, x1, x5 - cinc x11, x10, ne // ne = any - cmp x15, x14 - umulh x1, x1, x5 - cinc x10, x13, hi // hi = pmore + adds x5, x5, x8 + cset x8, cs // cs = hs, nlast + adds x9, x9, x5 + adc x5, x10, x8 adds x2, x2, x9 - adc x10, x11, x10 - mul x9, x3, x5 - adds x8, x12, x8 - umulh x3, x3, x5 + adc x14, x14, x5 + adds x12, x12, x4 + cinc x16, x16, cs // cs = hs, nlast + adds x2, x2, x16 + cinc x14, x14, cs // cs = hs, nlast + adds x3, x3, x14 + cinc x4, x7, 
cs // cs = hs, nlast + ldp x19, x20, [sp, #16] + mul x5, x4, x6 + umulh x4, x4, x6 + adds x1, x1, x5 + cinc x4, x4, cs // cs = hs, nlast + adds x4, x4, x12 + cset x5, cs // cs = hs, nlast + adds x5, x5, x2 + cset x2, cs // cs = hs, nlast + adds x2, x2, x3 + stp x5, x2, [x0, #16] + cset x3, cs // cs = hs, nlast + ldp x29, x30, [sp], #32 + mul x2, x3, x6 + umulh x3, x3, x6 + adds x2, x2, x1 + adc x4, x4, x3 + stp x2, x4, [x0] + ret + .size secp256k1_fe_mul_44to4, .-secp256k1_fe_mul_44to4 + + .p2align 4 + .global secp256k1_fe_sqr_4to4 + .type secp256k1_fe_sqr_4to4, %function +secp256k1_fe_sqr_4to4: + stp x29, x30, [sp, #-16]! + mov x4, #0x3d1 // #977 + movk x4, #0x1, lsl #32 + mov x29, sp + ldp x8, x5, [x1] + ldr x3, [x1, #24] + ldr x1, [x1, #16] + mul x7, x5, x3 + umulh x2, x3, x8 + umulh x6, x5, x1 + adds x2, x2, x7 + mul x14, x1, x1 + cset x7, cs // cs = hs, nlast + adds x6, x2, x6 + cinc x7, x7, cs // cs = hs, nlast + mul x11, x5, x8 + lsl x2, x6, #1 + mul x13, x8, x8 + adds x2, x2, x14 + extr x6, x7, x6, #63 + cinc x6, x6, cs // cs = hs, nlast + umulh x12, x8, x8 + mul x7, x1, x8 + lsl x9, x11, #1 + umulh x17, x2, x4 + lsr x11, x11, #63 + mul x2, x2, x4 + madd x17, x6, x4, x17 + adds x2, x2, x13 + umulh x10, x5, x8 + cinc x17, x17, cs // cs = hs, nlast + adds x9, x9, x12 + cinc x11, x11, cs // cs = hs, nlast + adds x10, x10, x7 + mul x14, x5, x5 + cset x12, cs // cs = hs, nlast + umulh x6, x5, x3 + lsl x13, x10, #1 + mul x7, x3, x1 + adds x13, x13, x14 + umulh x15, x1, x1 + extr x12, x12, x10, #63 + cinc x12, x12, cs // cs = hs, nlast + adds x14, x6, x7 + cset x18, cs // cs = hs, nlast + umulh x10, x3, x1 + lsl x7, x14, #1 + mul x30, x3, x3 + adds x7, x7, x15 + extr x18, x18, x14, #63 + cinc x18, x18, cs // cs = hs, nlast + lsl x6, x10, #1 + lsr x14, x10, #63 + mul x10, x3, x8 + mul x15, x7, x4 + umulh x7, x7, x4 + madd x7, x18, x4, x7 + adds x15, x15, x17 + umulh x17, x1, x8 + cinc x7, x7, cs // cs = hs, nlast + adds x9, x9, x15 + adc x11, x11, x7 + adds x6, x6, x30 + cinc x7, x14, cs // cs = hs, nlast + mul x16, x5, x1 + umulh x3, x3, x3 + mul x8, x6, x4 + umulh x6, x6, x4 + madd x6, x7, x4, x6 + adds x8, x8, x11 + mul x7, x3, x4 + cinc x6, x6, cs // cs = hs, nlast + adds x8, x8, x13 + adc x12, x12, x6 + adds x6, x10, x16 + cset x1, cs // cs = hs, nlast + adds x6, x6, x17 + cinc x1, x1, cs // cs = hs, nlast + umulh x5, x5, x5 + lsl x10, x6, #1 + umulh x3, x3, x4 + adds x10, x10, x12 + extr x1, x1, x6, #63 + cinc x1, x1, cs // cs = hs, nlast + adds x5, x7, x5 + cinc x3, x3, cs // cs = hs, nlast + adds x5, x5, x10 + adc x1, x1, x3 + ldp x29, x30, [sp], #16 + mul x3, x1, x4 + umulh x1, x1, x4 + adds x2, x2, x3 cinc x1, x1, cs // cs = hs, nlast - mul x11, x2, x5 adds x1, x1, x9 - umulh x2, x2, x5 - cset x9, cs // cs = hs, nlast - adds x1, x1, x6 - adc x3, x9, x3 - mul x9, x10, x5 - adds x3, x3, x11 - umulh x5, x10, x5 - cset x6, cs // cs = hs, nlast - adds x3, x3, x7 - adc x2, x6, x2 - stp x8, x1, [x0] - adds x2, x2, x9 - cset x10, cs // cs = hs, nlast - adds x4, x2, x4 - adc x5, x10, x5 - stp x3, x4, [x0, #16] - str x5, [x0, #32] - ldr x19, [sp, #16] - ldp x29, x30, [sp], #32 + cset x3, cs // cs = hs, nlast + adds x3, x3, x8 + str x3, [x0, #16] + cset x3, cs // cs = hs, nlast + adds x3, x3, x5 + str x3, [x0, #24] + cset x5, cs // cs = hs, nlast + mul x3, x5, x4 + umulh x5, x5, x4 + adds x3, x3, x2 + adc x1, x1, x5 + stp x3, x1, [x0] ret - .size secp256k1_fe_sqr_inner, .-secp256k1_fe_sqr_inner + .size secp256k1_fe_sqr_4to4, .-secp256k1_fe_sqr_4to4 diff --git 
a/src/asm/field_5x64_x86_64_gen.s b/src/asm/field_5x64_x86_64_gen.s index 596e7e9596..ee29792512 100644 --- a/src/asm/field_5x64_x86_64_gen.s +++ b/src/asm/field_5x64_x86_64_gen.s @@ -1,23 +1,28 @@ -/*********************************************************************** - * Copyright (c) 2021 Kaushik Nath * - * Distributed under the MIT software license, see the accompanying * - * file COPYING or https://www.opensource.org/licenses/mit-license.php.* +/************************************************************************ + * Field multiplication and squaring assemblies using representation of * + * field elements in base 2^{64}. * + * Major instructions used in the assemblies are mul/add/adc. * + * * + * Copyright (c) 2021 Kaushik Nath * + * Distributed under the MIT software license, see the accompanying * + * file COPYING or https://www.opensource.org/licenses/mit-license.php. * ***********************************************************************/ -/* 4-limb field multiplication and squaring using the bottom 4-limbs of - * a 5-limb representation. First reduce the 5-limb inputs to fully - * reduced 4-limb forms, then multiply and finally output a half reduced - * output in 5-limb form. The leading limb is of atmost 33 bits. - * - * Major instructions used in the assemblies: mul/add/adc. +/* + * 64-bit field multiplication and squaring using the bottom 4-limbs of + * two field elements having 5-limb representation such that the fifth + * limb is of at most 64 bits. The 5-limb inputs are fully reduced first + * to 4-limb forms, then multiplied, after which a field element in 5-limb + * form is reported as output. The fifth limb of the output has at most + * 33 bits. */ .att_syntax .text .p2align 4 -.global secp256k1_fe_mul_inner -secp256k1_fe_mul_inner: +.global secp256k1_fe_mul_55to5 +secp256k1_fe_mul_55to5: movq %rsp,%r11 subq $96,%rsp @@ -228,8 +233,8 @@ movq %r11,%rsp ret .p2align 4 -.global secp256k1_fe_sqr_inner -secp256k1_fe_sqr_inner: +.global secp256k1_fe_sqr_5to5 +secp256k1_fe_sqr_5to5: movq %rsp,%r11 subq $64,%rsp @@ -401,3 +406,555 @@ movq 48(%rsp),%rbp movq %r11,%rsp ret + +/* + * 64-bit field multiplication and squaring using the bottom 4-limbs of + * two field elements having 5-limb representation such that the fifth + * limb is zero. A field element in 5-limb form is reported as output + * such that the fifth limb is of at most 33 bits. 
+ */ + +.p2align 4 +.global secp256k1_fe_mul_44to5 +secp256k1_fe_mul_44to5: +movq %rsp,%r11 +subq $48,%rsp + +movq %r11,0(%rsp) +movq %r12,8(%rsp) +movq %r13,16(%rsp) +movq %r14,24(%rsp) +movq %r15,32(%rsp) +movq %rbx,40(%rsp) + +movq %rdx,%rcx +movq $0x1000003D1,%rbx + +movq 8(%rsi),%rax +mulq 24(%rcx) +movq %rax,%r8 +xorq %r9,%r9 +movq %rdx,%r10 +xorq %r11,%r11 + +movq 16(%rsi),%rax +mulq 16(%rcx) +addq %rax,%r8 +adcq $0,%r9 +addq %rdx,%r10 +adcq $0,%r11 + +movq 24(%rsi),%rax +mulq 8(%rcx) +addq %rax,%r8 +adcq $0,%r9 +addq %rdx,%r10 +adcq $0,%r11 + +movq 16(%rsi),%rax +mulq 24(%rcx) +addq %rax,%r10 +adcq $0,%r11 +movq %rdx,%r12 +xorq %r13,%r13 + +movq 24(%rsi),%rax +mulq 16(%rcx) +addq %rax,%r10 +adcq $0,%r11 +addq %rdx,%r12 +adcq $0,%r13 + +movq %rbx,%rax +mulq %r10 +imul %rbx,%r11 +movq %rax,%r10 +addq %rdx,%r11 + +movq 24(%rsi),%rax +mulq 24(%rcx) +addq %rax,%r12 +adcq $0,%r13 + +movq %rbx,%rax +mulq %rdx +movq %rax,%r14 +movq %rdx,%r15 + +movq %rbx,%rax +mulq %r12 +imul %rbx,%r13 +movq %rax,%r12 +addq %rdx,%r13 + +movq 0(%rsi),%rax +mulq 24(%rcx) +addq %rax,%r14 +adcq $0,%r15 +addq %rdx,%r8 +adcq $0,%r9 + +movq 8(%rsi),%rax +mulq 16(%rcx) +addq %rax,%r14 +adcq $0,%r15 +addq %rdx,%r8 +adcq $0,%r9 + +movq 16(%rsi),%rax +mulq 8(%rcx) +addq %rax,%r14 +adcq $0,%r15 +addq %rdx,%r8 +adcq $0,%r9 + +movq 24(%rsi),%rax +mulq 0(%rcx) +addq %rax,%r14 +adcq $0,%r15 +addq %rdx,%r8 +adcq $0,%r9 + +movq %rbx,%rax +mulq %r8 +imul %rbx,%r9 +movq %rax,%r8 +addq %rdx,%r9 + +movq 0(%rsi),%rax +mulq 0(%rcx) +addq %rax,%r8 +adcq $0,%r9 +addq %rdx,%r10 +adcq $0,%r11 + +movq 0(%rsi),%rax +mulq 8(%rcx) +addq %rax,%r10 +adcq $0,%r11 +addq %rdx,%r12 +adcq $0,%r13 + +movq 8(%rsi),%rax +mulq 0(%rcx) +addq %rax,%r10 +adcq $0,%r11 +addq %rdx,%r12 +adcq $0,%r13 + +movq 0(%rsi),%rax +mulq 16(%rcx) +addq %rax,%r12 +adcq $0,%r13 +addq %rdx,%r14 +adcq $0,%r15 + +movq 8(%rsi),%rax +mulq 8(%rcx) +addq %rax,%r12 +adcq $0,%r13 +addq %rdx,%r14 +adcq $0,%r15 + +movq 16(%rsi),%rax +mulq 0(%rcx) +addq %rax,%r12 +adcq $0,%r13 +addq %rdx,%r14 +adcq $0,%r15 + +addq %r9,%r10 +adcq $0,%r11 + +addq %r11,%r12 +adcq $0,%r13 + +addq %r13,%r14 +adcq $0,%r15 + +movq %r8,0(%rdi) +movq %r10,8(%rdi) +movq %r12,16(%rdi) +movq %r14,24(%rdi) +movq %r15,32(%rdi) + +movq 0(%rsp),%r11 +movq 8(%rsp),%r12 +movq 16(%rsp),%r13 +movq 24(%rsp),%r14 +movq 32(%rsp),%r15 +movq 40(%rsp),%rbx + +movq %r11,%rsp + +ret + +.p2align 4 +.global secp256k1_fe_sqr_4to5 +secp256k1_fe_sqr_4to5: +movq %rsp,%r11 +subq $64,%rsp + +movq %r11,0(%rsp) +movq %r12,8(%rsp) +movq %r13,16(%rsp) +movq %r14,24(%rsp) +movq %r15,32(%rsp) +movq %rbx,40(%rsp) +movq %rbp,48(%rsp) +movq %rdi,56(%rsp) + +movq 0(%rsi),%rbx +movq 8(%rsi),%rbp +movq 16(%rsi),%rcx +movq 24(%rsi),%rdi + +movq $0x1000003D1,%rsi + +movq %rbp,%rax +mulq %rdi +movq %rax,%r8 +xorq %r9,%r9 +movq %rdx,%r10 +xorq %r11,%r11 +addq %rax,%r8 +adcq $0,%r9 +addq %rdx,%r10 +adcq $0,%r11 + +movq %rcx,%rax +mulq %rcx +addq %rax,%r8 +adcq $0,%r9 +addq %rdx,%r10 +adcq $0,%r11 + +movq %rcx,%rax +mulq %rdi +addq %rax,%r10 +adcq $0,%r11 +movq %rdx,%r12 +xorq %r13,%r13 +addq %rax,%r10 +adcq $0,%r11 +addq %rdx,%r12 +adcq $0,%r13 + +movq %rsi,%rax +mulq %r10 +imul %rsi,%r11 +movq %rax,%r10 +addq %rdx,%r11 + +movq %rdi,%rax +mulq %rdi +addq %rax,%r12 +adcq $0,%r13 + +movq %rsi,%rax +mulq %rdx +movq %rax,%r14 +movq %rdx,%r15 + +movq %rsi,%rax +mulq %r12 +imul %rsi,%r13 +movq %rax,%r12 +addq %rdx,%r13 + +movq %rbx,%rax +mulq %rdi +addq %rax,%r14 +adcq $0,%r15 +addq %rdx,%r8 +adcq $0,%r9 +addq %rax,%r14 +adcq $0,%r15 +addq %rdx,%r8 +adcq 
$0,%r9 + +movq %rbp,%rax +mulq %rcx +addq %rax,%r14 +adcq $0,%r15 +addq %rdx,%r8 +adcq $0,%r9 +addq %rax,%r14 +adcq $0,%r15 +addq %rdx,%r8 +adcq $0,%r9 + +movq %rsi,%rax +mulq %r8 +imul %rsi,%r9 +movq %rax,%r8 +addq %rdx,%r9 + +movq %rbx,%rax +mulq %rbx +addq %rax,%r8 +adcq $0,%r9 +addq %rdx,%r10 +adcq $0,%r11 + +movq %rbx,%rax +mulq %rbp +addq %rax,%r10 +adcq $0,%r11 +addq %rdx,%r12 +adcq $0,%r13 +addq %rax,%r10 +adcq $0,%r11 +addq %rdx,%r12 +adcq $0,%r13 + +movq %rbx,%rax +mulq %rcx +addq %rax,%r12 +adcq $0,%r13 +addq %rdx,%r14 +adcq $0,%r15 +addq %rax,%r12 +adcq $0,%r13 +addq %rdx,%r14 +adcq $0,%r15 + +movq %rbp,%rax +mulq %rbp +addq %rax,%r12 +adcq $0,%r13 +addq %rdx,%r14 +adcq $0,%r15 + +movq %r10,%rbp +addq %r9,%rbp +adcq $0,%r11 + +movq %r12,%rcx +addq %r11,%rcx +adcq $0,%r13 + +addq %r13,%r14 +adcq $0,%r15 + +movq 56(%rsp),%rdi + +movq %rbx,0(%rdi) +movq %rbp,8(%rdi) +movq %rcx,16(%rdi) +movq %r14,24(%rdi) +movq %r15,32(%rdi) + +movq 0(%rsp),%r11 +movq 8(%rsp),%r12 +movq 16(%rsp),%r13 +movq 24(%rsp),%r14 +movq 32(%rsp),%r15 +movq 40(%rsp),%rbx +movq 48(%rsp),%rbp + +movq %r11,%rsp + +ret + +/* 64-bit field multiplication in which the first argument has 4-limb + * and the second argument has 5-limb representations such that the + * fifth limb is of at most 64 bits. The second argument is fully + * reduced to 4-limb form and then field multiplication is performed. + * A field element in 5-limb form is reported as output such that the + * fifth limb is of at most 33 bits. + */ + +.p2align 4 +.global secp256k1_fe_mul_45to5 +secp256k1_fe_mul_45to5: +movq %rsp,%r11 +subq $72,%rsp + +movq %r11,0(%rsp) +movq %r12,8(%rsp) +movq %r13,16(%rsp) +movq %r14,24(%rsp) +movq %r15,32(%rsp) +movq %rbx,40(%rsp) +movq %rbp,48(%rsp) +movq %rdi,56(%rsp) + +movq $0x1000003d1,%rcx + +movq 0(%rdx),%r8 +movq 8(%rdx),%r9 +movq 16(%rdx),%rbx +movq 24(%rdx),%rbp +movq 32(%rdx),%rax + +mulq %rcx +xorq %rdi,%rdi +addq %r8,%rax +adcq %r9,%rdx +adcq $0,%rbx +adcq $0,%rbp +cmovc %rcx,%rdi +addq %rax,%rdi +adcq $0,%rdx +movq %rdx,64(%rsp) + +movq 8(%rsi),%rax +mulq %rbp +movq %rax,%r8 +xorq %r9,%r9 +movq %rdx,%r10 +xorq %r11,%r11 + +movq 16(%rsi),%rax +mulq %rbx +addq %rax,%r8 +adcq $0,%r9 +addq %rdx,%r10 +adcq $0,%r11 + +movq 24(%rsi),%rax +mulq 64(%rsp) +addq %rax,%r8 +adcq $0,%r9 +addq %rdx,%r10 +adcq $0,%r11 + +movq 16(%rsi),%rax +mulq %rbp +addq %rax,%r10 +adcq $0,%r11 +movq %rdx,%r12 +xorq %r13,%r13 + +movq 24(%rsi),%rax +mulq %rbx +addq %rax,%r10 +adcq $0,%r11 +addq %rdx,%r12 +adcq $0,%r13 + +movq %rcx,%rax +mulq %r10 +imul %rcx,%r11 +movq %rax,%r10 +addq %rdx,%r11 + +movq 24(%rsi),%rax +mulq %rbp +addq %rax,%r12 +adcq $0,%r13 + +movq %rcx,%rax +mulq %rdx +movq %rax,%r14 +movq %rdx,%r15 + +movq %rcx,%rax +mulq %r12 +imul %rcx,%r13 +movq %rax,%r12 +addq %rdx,%r13 + +movq 0(%rsi),%rax +mulq %rbp +addq %rax,%r14 +adcq $0,%r15 +addq %rdx,%r8 +adcq $0,%r9 + +movq 8(%rsi),%rax +mulq %rbx +addq %rax,%r14 +adcq $0,%r15 +addq %rdx,%r8 +adcq $0,%r9 + +movq 16(%rsi),%rax +mulq 64(%rsp) +addq %rax,%r14 +adcq $0,%r15 +addq %rdx,%r8 +adcq $0,%r9 + +movq 24(%rsi),%rax +mulq %rdi +addq %rax,%r14 +adcq $0,%r15 +addq %rdx,%r8 +adcq $0,%r9 + +movq %rcx,%rax +mulq %r8 +imul %rcx,%r9 +movq %rax,%r8 +addq %rdx,%r9 + +movq 0(%rsi),%rax +mulq %rdi +addq %rax,%r8 +adcq $0,%r9 +addq %rdx,%r10 +adcq $0,%r11 + +movq 0(%rsi),%rax +mulq 64(%rsp) +addq %rax,%r10 +adcq $0,%r11 +addq %rdx,%r12 +adcq $0,%r13 + +movq 8(%rsi),%rax +mulq %rdi +addq %rax,%r10 +adcq $0,%r11 +addq %rdx,%r12 +adcq $0,%r13 + +movq 0(%rsi),%rax +mulq %rbx +addq 
%rax,%r12 +adcq $0,%r13 +addq %rdx,%r14 +adcq $0,%r15 + +movq 8(%rsi),%rax +mulq 64(%rsp) +addq %rax,%r12 +adcq $0,%r13 +addq %rdx,%r14 +adcq $0,%r15 + +movq 16(%rsi),%rax +mulq %rdi +addq %rax,%r12 +adcq $0,%r13 +addq %rdx,%r14 +adcq $0,%r15 + +addq %r9,%r10 +adcq $0,%r11 +addq %r11,%r12 +adcq $0,%r13 +addq %r13,%r14 +adcq $0,%r15 + +movq 56(%rsp),%rdi + +movq %r8,0(%rdi) +movq %r10,8(%rdi) +movq %r12,16(%rdi) +movq %r14,24(%rdi) +movq %r15,32(%rdi) + +movq 0(%rsp),%r11 +movq 8(%rsp),%r12 +movq 16(%rsp),%r13 +movq 24(%rsp),%r14 +movq 32(%rsp),%r15 +movq 40(%rsp),%rbx +movq 48(%rsp),%rbp + +movq %r11,%rsp + +ret diff --git a/src/asm/field_5x64_x86_64_maax.s b/src/asm/field_5x64_x86_64_maax.s index 79b7afb816..b06ec13af3 100644 --- a/src/asm/field_5x64_x86_64_maax.s +++ b/src/asm/field_5x64_x86_64_maax.s @@ -1,23 +1,28 @@ -/*********************************************************************** - * Copyright (c) 2021 Kaushik Nath * - * Distributed under the MIT software license, see the accompanying * - * file COPYING or https://www.opensource.org/licenses/mit-license.php.* +/************************************************************************ + * Field multiplication and squaring assemblies using representation of * + * field elements in base 2^{64}. * + * Major instructions used in the assemblies are mulx/adcx/adox. * + * * + * Copyright (c) 2021 Kaushik Nath * + * Distributed under the MIT software license, see the accompanying * + * file COPYING or https://www.opensource.org/licenses/mit-license.php. * ***********************************************************************/ -/* 4-limb field multiplication and squaring using the bottom 4-limbs of - * a 5-limb representation. First reduce the 5-limb inputs to fully - * reduced 4-limb forms, then multiply and finally output a half reduced - * output in 5-limb form. The leading limb is of atmost 33 bits. - * - * Major instructions used in the assemblies: mulx/adcx/adox. +/* + * 64-bit field multiplication and squaring using the bottom 4-limbs of + * two field elements having 5-limb representation such that the fifth + * limb is of at most 64 bits. The 5-limb inputs are fully reduced first + * to 4-limb forms, then multiplied, after which a field element in 5-limb + * form is reported as output. The fifth limb of the output has at most + * 33 bits. */ .att_syntax .text .p2align 4 -.global secp256k1_fe_mul_inner -secp256k1_fe_mul_inner: +.global secp256k1_fe_mul_55to5 +secp256k1_fe_mul_55to5: movq %rsp,%r11 subq $96,%rsp @@ -163,8 +168,8 @@ movq %r11,%rsp ret .p2align 4 -.global secp256k1_fe_sqr_inner -secp256k1_fe_sqr_inner: +.global secp256k1_fe_sqr_5to5 +secp256k1_fe_sqr_5to5: movq %rsp,%r11 subq $56,%rsp @@ -279,3 +284,348 @@ movq 48(%rsp),%rbx movq %r11,%rsp ret + +/* + * 64-bit field multiplication and squaring using the bottom 4-limbs of + * two field elements having 5-limb representation such that the fifth + * limb is zero. A field element in 5-limb form is reported as output + * such that the fifth limb is of at most 33 bits. 
+ */ + +.p2align 4 +.global secp256k1_fe_mul_44to5 +secp256k1_fe_mul_44to5: +push %rbp +push %rbx +push %r12 +push %r13 +push %r14 +push %r15 + +movq %rdx,%rbx + +xorq %r13,%r13 +movq 0(%rbx),%rdx +mulx 0(%rsi),%r8,%r9 +mulx 8(%rsi),%rcx,%r10 +adcx %rcx,%r9 +mulx 16(%rsi),%rcx,%r11 +adcx %rcx,%r10 +mulx 24(%rsi),%rcx,%r12 +adcx %rcx,%r11 +adcx %r13,%r12 + +xorq %r14,%r14 +movq 8(%rbx),%rdx +mulx 0(%rsi),%rcx,%rbp +adcx %rcx,%r9 +adox %rbp,%r10 +mulx 8(%rsi),%rcx,%rbp +adcx %rcx,%r10 +adox %rbp,%r11 +mulx 16(%rsi),%rcx,%rbp +adcx %rcx,%r11 +adox %rbp,%r12 +mulx 24(%rsi),%rcx,%rbp +adcx %rcx,%r12 +adox %rbp,%r13 +adcx %r14,%r13 + +xorq %r15,%r15 +movq 16(%rbx),%rdx +mulx 0(%rsi),%rcx,%rbp +adcx %rcx,%r10 +adox %rbp,%r11 +mulx 8(%rsi),%rcx,%rbp +adcx %rcx,%r11 +adox %rbp,%r12 +mulx 16(%rsi),%rcx,%rbp +adcx %rcx,%r12 +adox %rbp,%r13 +mulx 24(%rsi),%rcx,%rbp +adcx %rcx,%r13 +adox %rbp,%r14 +adcx %r15,%r14 + +xorq %rax,%rax +movq 24(%rbx),%rdx +mulx 0(%rsi),%rcx,%rbp +adcx %rcx,%r11 +adox %rbp,%r12 +mulx 8(%rsi),%rcx,%rbp +adcx %rcx,%r12 +adox %rbp,%r13 +mulx 16(%rsi),%rcx,%rbp +adcx %rcx,%r13 +adox %rbp,%r14 +mulx 24(%rsi),%rcx,%rbp +adcx %rcx,%r14 +adox %rbp,%r15 +adcx %rax,%r15 + +xorq %rbp,%rbp +movq $0x1000003D1,%rdx +mulx %r12,%rax,%r12 +adcx %rax,%r8 +adox %r12,%r9 +mulx %r13,%rcx,%r13 +adcx %rcx,%r9 +adox %r13,%r10 +mulx %r14,%rcx,%r14 +adcx %rcx,%r10 +adox %r14,%r11 +mulx %r15,%rcx,%r15 +adcx %rcx,%r11 +adox %rbp,%r15 +adcx %rbp,%r15 + +movq %r8,0(%rdi) +movq %r9,8(%rdi) +movq %r10,16(%rdi) +movq %r11,24(%rdi) +movq %r15,32(%rdi) + +pop %r15 +pop %r14 +pop %r13 +pop %r12 +pop %rbx +pop %rbp + +ret + +.p2align 4 +.global secp256k1_fe_sqr_4to5 +secp256k1_fe_sqr_4to5: +push %rbp +push %rbx +push %r12 +push %r13 +push %r14 +push %r15 +push %rdi + +movq 0(%rsi),%rbx +movq 8(%rsi),%rbp +movq 16(%rsi),%rax +movq 24(%rsi),%rsi + +xorq %r13,%r13 +movq %rbx,%rdx +mulx %rbp,%r9,%r10 +mulx %rax,%rcx,%r11 +adcx %rcx,%r10 +mulx %rsi,%rcx,%r12 +adcx %rcx,%r11 +adcx %r13,%r12 + +xorq %r14,%r14 +movq %rbp,%rdx +mulx %rax,%rcx,%rdx +adcx %rcx,%r11 +adox %rdx,%r12 +movq %rbp,%rdx +mulx %rsi,%rcx,%rdx +adcx %rcx,%r12 +adox %rdx,%r13 +adcx %r14,%r13 + +xorq %r15,%r15 +movq %rax,%rdx +mulx %rsi,%rcx,%r14 +adcx %rcx,%r13 +adcx %r15,%r14 + +shld $1,%r14,%r15 +shld $1,%r13,%r14 +shld $1,%r12,%r13 +shld $1,%r11,%r12 +shld $1,%r10,%r11 +shld $1,%r9,%r10 +addq %r9,%r9 + +xorq %rdx,%rdx +movq %rbx,%rdx +mulx %rdx,%r8,%rdx +adcx %rdx,%r9 + +movq %rbp,%rdx +mulx %rdx,%rcx,%rdx +adcx %rcx,%r10 +adcx %rdx,%r11 + +movq %rax,%rdx +mulx %rdx,%rcx,%rdx +adcx %rcx,%r12 +adcx %rdx,%r13 + +movq %rsi,%rdx +mulx %rdx,%rcx,%rdx +adcx %rcx,%r14 +adcx %rdx,%r15 + +xorq %rbp,%rbp +movq $0x1000003D1,%rdx +mulx %r12,%rbx,%r12 +adcx %r8,%rbx +adox %r9,%r12 +mulx %r13,%rcx,%rax +adcx %rcx,%r12 +adox %r10,%rax +mulx %r14,%rcx,%rsi +adcx %rcx,%rax +adox %r11,%rsi +mulx %r15,%rcx,%r15 +adcx %rcx,%rsi +adox %rbp,%r15 +adcx %rbp,%r15 + +movq %rbx,0(%rdi) +movq %r12,8(%rdi) +movq %rax,16(%rdi) +movq %rsi,24(%rdi) +movq %r15,32(%rdi) + +pop %r15 +pop %r14 +pop %r13 +pop %r12 +pop %rbx +pop %rbp + +ret + +/* 64-bit field multiplication in which the first argument has 4-limb + * and the second argument has 5-limb representations such that the + * fifth limb is of at most 64 bits. The second argument is fully + * reduced to 4-limb form and then field multiplication is performed. + * A field element in 5-limb form is reported as output such that the + * fifth limb is of at most 33 bits. 
+ */ + +.p2align 4 +.global secp256k1_fe_mul_45to5 +secp256k1_fe_mul_45to5: +movq %rsp,%r11 +subq $72,%rsp + +movq %r11,0(%rsp) +movq %r12,8(%rsp) +movq %r13,16(%rsp) +movq %r14,24(%rsp) +movq %r15,32(%rsp) +movq %rbp,40(%rsp) +movq %rbx,48(%rsp) +movq %rdi,56(%rsp) + +movq 0(%rdx),%rax +movq 8(%rdx),%rbx +movq 16(%rdx),%r8 +movq 24(%rdx),%r9 + +movq $0x1000003D1,%r15 +xorq %rcx,%rcx +mulx 32(%rdx),%r13,%r14 +adcx %r13,%rax +adcx %r14,%rbx +adcx %rcx,%r8 +adcx %rcx,%r9 +cmovc %r15,%rcx +addq %rcx,%rax +adcq $0,%rbx + +movq %r8,56(%rsp) +movq %r9,64(%rsp) + +xorq %r13,%r13 +movq 0(%rsi),%rdx +mulx %rax,%r8,%r9 +mulx %rbx,%rcx,%r10 +adcx %rcx,%r9 +mulx 56(%rsp),%rcx,%r11 +adcx %rcx,%r10 +mulx 64(%rsp),%rcx,%r12 +adcx %rcx,%r11 +adcx %r13,%r12 + +xorq %r14,%r14 +movq 8(%rsi),%rdx +mulx %rax,%rcx,%rbp +adcx %rcx,%r9 +adox %rbp,%r10 +mulx %rbx,%rcx,%rbp +adcx %rcx,%r10 +adox %rbp,%r11 +mulx 56(%rsp),%rcx,%rbp +adcx %rcx,%r11 +adox %rbp,%r12 +mulx 64(%rsp),%rcx,%rbp +adcx %rcx,%r12 +adox %rbp,%r13 +adcx %r14,%r13 + +xorq %r15,%r15 +movq 16(%rsi),%rdx +mulx %rax,%rcx,%rbp +adcx %rcx,%r10 +adox %rbp,%r11 +mulx %rbx,%rcx,%rbp +adcx %rcx,%r11 +adox %rbp,%r12 +mulx 56(%rsp),%rcx,%rbp +adcx %rcx,%r12 +adox %rbp,%r13 +mulx 64(%rsp),%rcx,%rbp +adcx %rcx,%r13 +adox %rbp,%r14 +adcx %r15,%r14 + +xorq %rdx,%rdx +movq 24(%rsi),%rdx +mulx %rax,%rcx,%rbp +adcx %rcx,%r11 +adox %rbp,%r12 +mulx %rbx,%rcx,%rbp +adcx %rcx,%r12 +adox %rbp,%r13 +mulx 56(%rsp),%rcx,%rbp +adcx %rcx,%r13 +adox %rbp,%r14 +mulx 64(%rsp),%rcx,%rbp +adcx %rcx,%r14 +adox %rbp,%r15 +adcq $0,%r15 + +xorq %rbp,%rbp +movq $0x1000003D1,%rdx +mulx %r12,%rax,%r12 +adcx %rax,%r8 +adox %r12,%r9 +mulx %r13,%rcx,%r13 +adcx %rcx,%r9 +adox %r13,%r10 +mulx %r14,%rcx,%r14 +adcx %rcx,%r10 +adox %r14,%r11 +mulx %r15,%rcx,%r15 +adcx %rcx,%r11 +adox %rbp,%r15 +adcx %rbp,%r15 + +movq %r8,0(%rdi) +movq %r9,8(%rdi) +movq %r10,16(%rdi) +movq %r11,24(%rdi) +movq %r15,32(%rdi) + +movq 0(%rsp),%r11 +movq 8(%rsp),%r12 +movq 16(%rsp),%r13 +movq 24(%rsp),%r14 +movq 32(%rsp),%r15 +movq 40(%rsp),%rbp +movq 48(%rsp),%rbx + +movq %r11,%rsp + +ret diff --git a/src/asm/field_5x64_x86_64_mxaa.s b/src/asm/field_5x64_x86_64_mxaa.s index dec40d76ed..f9b99e3848 100644 --- a/src/asm/field_5x64_x86_64_mxaa.s +++ b/src/asm/field_5x64_x86_64_mxaa.s @@ -1,23 +1,28 @@ -/*********************************************************************** - * Copyright (c) 2021 Kaushik Nath * - * Distributed under the MIT software license, see the accompanying * - * file COPYING or https://www.opensource.org/licenses/mit-license.php.* +/************************************************************************ + * Field multiplication and squaring assemblies using representation of * + * field elements in base 2^{64}. * + * Major instructions used in the assemblies are mulx/add/adc. * + * * + * Copyright (c) 2021 Kaushik Nath * + * Distributed under the MIT software license, see the accompanying * + * file COPYING or https://www.opensource.org/licenses/mit-license.php. * ***********************************************************************/ -/* 4-limb field multiplication and squaring using the bottom 4-limbs of - * a 5-limb representation. First reduce the 5-limb inputs to fully - * reduced 4-limb forms, then multiply and finally output a half reduced - * output in 5-limb form. The leading limb is of atmost 33 bits. - * - * Major instructions used in the assemblies: mulx/add/adc. 
+/* + * 64-bit field multiplication and squaring using the bottom 4-limbs of + * two field elements having 5-limb representation such that the fifth + * limb is of at most 64 bits. The 5-limb inputs are fully reduced first + * to 4-limb forms, then multiplied, after which a field element in 5-limb + * form is reported as output. The fifth limb of the output has at most + * 33 bits. */ .att_syntax .text .p2align 4 -.global secp256k1_fe_mul_inner -secp256k1_fe_mul_inner: +.global secp256k1_fe_mul_55to5 +secp256k1_fe_mul_55to5: movq %rsp,%r11 subq $112,%rsp @@ -160,8 +165,8 @@ movq %r11,%rsp ret .p2align 4 -.global secp256k1_fe_sqr_inner -secp256k1_fe_sqr_inner: +.global secp256k1_fe_sqr_5to5 +secp256k1_fe_sqr_5to5: movq %rsp,%r11 subq $64,%rsp @@ -219,7 +224,7 @@ shld $1,%r12,%r13 shld $1,%r11,%r12 shld $1,%r10,%r11 shld $1,%r9,%r10 -shlq $1,%r9 +addq %r9,%r9 movq %rbp,%rdx mulx %rdx,%r8,%rax @@ -278,3 +283,345 @@ movq 48(%rsp),%rbx movq %r11,%rsp ret + +/* + * 64-bit field multiplication and squaring using the bottom 4-limbs of + * two field elements having 5-limb representation such that the fifth + * limb is zero. A field element in 5-limb form is reported as output + * such that the fifth limb is of at most 33 bits. + */ + +.p2align 4 +.global secp256k1_fe_mul_44to5 +secp256k1_fe_mul_44to5: +movq %rsp,%r11 +subq $64,%rsp + +movq %r11,0(%rsp) +movq %r12,8(%rsp) +movq %r13,16(%rsp) +movq %r14,24(%rsp) +movq %r15,32(%rsp) +movq %rbp,40(%rsp) +movq %rbx,48(%rsp) +movq %rdi,56(%rsp) + +movq %rdx,%rdi + +movq 0(%rdi),%rdx +mulx 0(%rsi),%r8,%r9 +mulx 8(%rsi),%rcx,%r10 +addq %rcx,%r9 +mulx 16(%rsi),%rcx,%r11 +adcq %rcx,%r10 +mulx 24(%rsi),%rcx,%r12 +adcq %rcx,%r11 +adcq $0,%r12 + +movq 8(%rdi),%rdx +mulx 0(%rsi),%rax,%rbx +mulx 8(%rsi),%rcx,%rbp +addq %rcx,%rbx +mulx 16(%rsi),%rcx,%r15 +adcq %rcx,%rbp +mulx 24(%rsi),%rcx,%r13 +adcq %rcx,%r15 +adcq $0,%r13 +addq %rax,%r9 +adcq %rbx,%r10 +adcq %rbp,%r11 +adcq %r15,%r12 +adcq $0,%r13 + +movq 16(%rdi),%rdx +mulx 0(%rsi),%rax,%rbx +mulx 8(%rsi),%rcx,%rbp +addq %rcx,%rbx +mulx 16(%rsi),%rcx,%r15 +adcq %rcx,%rbp +mulx 24(%rsi),%rcx,%r14 +adcq %rcx,%r15 +adcq $0,%r14 +addq %rax,%r10 +adcq %rbx,%r11 +adcq %rbp,%r12 +adcq %r15,%r13 +adcq $0,%r14 + +movq 24(%rdi),%rdx +mulx 0(%rsi),%rax,%rbx +mulx 8(%rsi),%rcx,%rbp +addq %rcx,%rbx +mulx 16(%rsi),%rcx,%r15 +adcq %rcx,%rbp +mulx 24(%rsi),%rcx,%rsi +adcq %rcx,%r15 +adcq $0,%rsi +addq %rax,%r11 +adcq %rbx,%r12 +adcq %rbp,%r13 +adcq %r15,%r14 +adcq $0,%rsi + +movq $0x1000003D1,%rdx +mulx %r12,%r12,%rbx +mulx %r13,%r13,%rcx +addq %rbx,%r13 +mulx %r14,%r14,%rbx +adcq %rcx,%r14 +mulx %rsi,%r15,%rcx +adcq %rbx,%r15 +adcq $0,%rcx +addq %r12,%r8 +adcq %r13,%r9 +adcq %r14,%r10 +adcq %r15,%r11 +adcq $0,%rcx + +movq 56(%rsp),%rdi +movq %r8,0(%rdi) +movq %r9,8(%rdi) +movq %r10,16(%rdi) +movq %r11,24(%rdi) +movq %rcx,32(%rdi) + +movq 0(%rsp),%r11 +movq 8(%rsp),%r12 +movq 16(%rsp),%r13 +movq 24(%rsp),%r14 +movq 32(%rsp),%r15 +movq 40(%rsp),%rbp +movq 48(%rsp),%rbx + +movq %r11,%rsp + +ret + +.p2align 4 +.global secp256k1_fe_sqr_4to5 +secp256k1_fe_sqr_4to5: +movq %rsp,%r11 +subq $56,%rsp + +movq %r11,0(%rsp) +movq %r12,8(%rsp) +movq %r13,16(%rsp) +movq %r14,24(%rsp) +movq %r15,32(%rsp) +movq %rbp,40(%rsp) +movq %rbx,48(%rsp) + +movq 0(%rsi),%rdx +mulx 8(%rsi),%r9,%r10 +mulx 16(%rsi),%rcx,%r11 +addq %rcx,%r10 +mulx 24(%rsi),%rcx,%r12 +adcq %rcx,%r11 +adcq $0,%r12 + +movq 8(%rsi),%rdx +mulx 16(%rsi),%rax,%rbx +mulx 24(%rsi),%rcx,%r13 +addq %rcx,%rbx +adcq $0,%r13 +addq %rax,%r11 +adcq %rbx,%r12 +adcq $0,%r13 + +movq 
16(%rsi),%rdx +mulx 24(%rsi),%rax,%r14 +addq %rax,%r13 +adcq $0,%r14 + +movq $0,%r15 +shld $1,%r14,%r15 +shld $1,%r13,%r14 +shld $1,%r12,%r13 +shld $1,%r11,%r12 +shld $1,%r10,%r11 +shld $1,%r9,%r10 +addq %r9,%r9 + +movq 0(%rsi),%rdx +mulx %rdx,%r8,%rax +addq %rax,%r9 + +movq 8(%rsi),%rdx +mulx %rdx,%rax,%rbx +adcq %rax,%r10 +adcq %rbx,%r11 + +movq 16(%rsi),%rdx +mulx %rdx,%rax,%rbx +adcq %rax,%r12 +adcq %rbx,%r13 + +movq 24(%rsi),%rdx +mulx %rdx,%rax,%rbx +adcq %rax,%r14 +adcq %rbx,%r15 + +movq $0x1000003D1,%rdx +mulx %r12,%r12,%rbx +mulx %r13,%r13,%rcx +addq %rbx,%r13 +mulx %r14,%r14,%rbx +adcq %rcx,%r14 +mulx %r15,%r15,%rcx +adcq %rbx,%r15 +adcq $0,%rcx +addq %r12,%r8 +adcq %r13,%r9 +adcq %r14,%r10 +adcq %r15,%r11 +adcq $0,%rcx + +movq %r8,0(%rsi) +movq %r9,8(%rsi) +movq %r10,16(%rsi) +movq %r11,24(%rsi) +movq %rcx,32(%rsi) + +movq 0(%rsp),%r11 +movq 8(%rsp),%r12 +movq 16(%rsp),%r13 +movq 24(%rsp),%r14 +movq 32(%rsp),%r15 +movq 40(%rsp),%rbp +movq 48(%rsp),%rbx + +movq %r11,%rsp + +ret + +/* 64-bit field multiplication in which the first argument has 4-limb + * and the second argument has 5-limb representations such that the + * fifth limb is of at most 64 bits. The second argument is fully + * reduced to 4-limb form and then field multiplication is performed. + * A field element in 5-limb form is reported as output such that the + * fifth limb is of at most 33 bits. + */ + +.p2align 4 +.global secp256k1_fe_mul_45to5 +secp256k1_fe_mul_45to5: +movq %rsp,%r11 +subq $88,%rsp + +movq %r11,0(%rsp) +movq %r12,8(%rsp) +movq %r13,16(%rsp) +movq %r14,24(%rsp) +movq %r15,32(%rsp) +movq %rbp,40(%rsp) +movq %rbx,48(%rsp) + +movq 0(%rdx),%r12 +movq 8(%rdx),%r13 +movq 16(%rdx),%r14 +movq 24(%rdx),%r15 +movq 32(%rdx),%rax + +movq $0x1000003D1,%rdx +xorq %rcx,%rcx +mulx %rax,%rax,%rbx +addq %rax,%r12 +adcq %rbx,%r13 +adcq $0,%r14 +adcq $0,%r15 +cmovc %rdx,%rcx +addq %rcx,%r12 +adcq $0,%r13 + +movq %r12,56(%rsp) +movq %r13,64(%rsp) +movq %r14,72(%rsp) +movq %r15,80(%rsp) + +movq 0(%rsi),%rdx +mulx 56(%rsp),%r8,%r9 +mulx 64(%rsp),%rcx,%r10 +addq %rcx,%r9 +mulx 72(%rsp),%rcx,%r11 +adcq %rcx,%r10 +mulx 80(%rsp),%rcx,%r12 +adcq %rcx,%r11 +adcq $0,%r12 + +movq 8(%rsi),%rdx +mulx 56(%rsp),%rax,%rbx +mulx 64(%rsp),%rcx,%rbp +addq %rcx,%rbx +mulx 72(%rsp),%rcx,%r15 +adcq %rcx,%rbp +mulx 80(%rsp),%rcx,%r13 +adcq %rcx,%r15 +adcq $0,%r13 +addq %rax,%r9 +adcq %rbx,%r10 +adcq %rbp,%r11 +adcq %r15,%r12 +adcq $0,%r13 + +movq 16(%rsi),%rdx +mulx 56(%rsp),%rax,%rbx +mulx 64(%rsp),%rcx,%rbp +addq %rcx,%rbx +mulx 72(%rsp),%rcx,%r15 +adcq %rcx,%rbp +mulx 80(%rsp),%rcx,%r14 +adcq %rcx,%r15 +adcq $0,%r14 +addq %rax,%r10 +adcq %rbx,%r11 +adcq %rbp,%r12 +adcq %r15,%r13 +adcq $0,%r14 + +movq 24(%rsi),%rdx +mulx 56(%rsp),%rax,%rbx +mulx 64(%rsp),%rcx,%rbp +addq %rcx,%rbx +mulx 72(%rsp),%rcx,%r15 +adcq %rcx,%rbp +mulx 80(%rsp),%rcx,%rsi +adcq %rcx,%r15 +adcq $0,%rsi +addq %rax,%r11 +adcq %rbx,%r12 +adcq %rbp,%r13 +adcq %r15,%r14 +adcq $0,%rsi + +movq $0x1000003D1,%rdx +mulx %r12,%r12,%rbx +mulx %r13,%r13,%rcx +addq %rbx,%r13 +mulx %r14,%r14,%rbx +adcq %rcx,%r14 +mulx %rsi,%r15,%rcx +adcq %rbx,%r15 +adcq $0,%rcx +addq %r12,%r8 +adcq %r13,%r9 +adcq %r14,%r10 +adcq %r15,%r11 +adcq $0,%rcx + +movq %r8,0(%rdi) +movq %r9,8(%rdi) +movq %r10,16(%rdi) +movq %r11,24(%rdi) +movq %rcx,32(%rdi) + +movq 0(%rsp),%r11 +movq 8(%rsp),%r12 +movq 16(%rsp),%r13 +movq 24(%rsp),%r14 +movq 32(%rsp),%r15 +movq 40(%rsp),%rbp +movq 48(%rsp),%rbx + +movq %r11,%rsp + +ret diff --git a/src/field_5x64_impl.h b/src/field_5x64_impl.h index 1c587b0332..0325d65e60 
100644
--- a/src/field_5x64_impl.h
+++ b/src/field_5x64_impl.h
@@ -17,8 +17,10 @@
 #if defined(USE_EXTERNAL_ASM)
 
 /* External assembler implementation */
-void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b);
-void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t *a);
+void secp256k1_fe_mul_55to5(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b);
+void secp256k1_fe_mul_45to5(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b);
+void secp256k1_fe_sqr_5to5(uint64_t *r, const uint64_t *a);
+void secp256k1_fe_sqr_4to5(uint64_t *r, const uint64_t *a);
 #endif
 
 #ifdef VERIFY
@@ -733,7 +735,7 @@ static void secp256k1_fe_mul(secp256k1_fe *r, const secp256k1_fe *a, const secp2
 #endif
 
 #if defined(USE_EXTERNAL_ASM)
-    secp256k1_fe_mul_inner(r->n, a->n, b->n);
+    secp256k1_fe_mul_55to5(r->n, a->n, b->n);
 #else
     mul2(c0,c1,a4,0x1000003D1ULL);
     a4 = 0;
@@ -803,10 +805,12 @@ static void secp256k1_fe_mul(secp256k1_fe *r, const secp256k1_fe *a, const secp2
 }
 
 static void secp256k1_fe_mul_prec(secp256k1_fe *r, const secp256k1_fe *a, const secp256k1_fe * SECP256K1_RESTRICT b_prec) {
+#ifndef USE_EXTERNAL_ASM
     uint64_t a0 = a->n[0], a1 = a->n[1], a2 = a->n[2], a3 = a->n[3], a4 = a->n[4];
     uint64_t b0 = b_prec->n[0], b1 = b_prec->n[1], b2 = b_prec->n[2], b3 = b_prec->n[3];
     uint64_t c0 = 0, c1 = 0, c2 = 0, c3 = 0, c4 = 0, c5 = 0, c6 = 0, c7 = 0;
     uint64_t d0 = 0, d1 = 0, d2 = 0, d3 = 0, d4 = 0;
+#endif
 
 #ifdef VERIFY
     VERIFY_CHECK(a->magnitude <= 2048);
@@ -818,6 +822,9 @@ static void secp256k1_fe_mul_prec(secp256k1_fe *r, const secp256k1_fe *a, const
     VERIFY_CHECK(a != b_prec);
 #endif
 
+#if defined(USE_EXTERNAL_ASM)
+    secp256k1_fe_mul_45to5(r->n, b_prec->n, a->n);
+#else
     mul2(c0,c1,a4,0x1000003D1ULL);
     a4 = 0;
     add2(c0,c1,a0);
@@ -862,6 +869,7 @@ static void secp256k1_fe_mul_prec(secp256k1_fe *r, const secp256k1_fe *a, const
     add2(d3,d4,c3);
     r->n[3] = d3;
     r->n[4] = d4;
+#endif
 
 #ifdef VERIFY
     r->magnitude = 1;
@@ -884,7 +892,7 @@ static void secp256k1_fe_sqr(secp256k1_fe *r, const secp256k1_fe *a) {
 #endif
 
 #if defined(USE_EXTERNAL_ASM)
-    secp256k1_fe_sqr_inner(r->n, a->n);
+    secp256k1_fe_sqr_5to5(r->n, a->n);
 #else
     /* Bring a to [0,2**256). */
     mul2(c0,c1,a4,0x1000003D1ULL);
@@ -935,9 +943,11 @@ static void secp256k1_fe_sqr(secp256k1_fe *r, const secp256k1_fe *a) {
 }
 
 static void secp256k1_fe_sqr_prec(secp256k1_fe *r, const secp256k1_fe *a_prec) {
+#ifndef USE_EXTERNAL_ASM
     uint64_t a0 = a_prec->n[0], a1 = a_prec->n[1], a2 = a_prec->n[2], a3 = a_prec->n[3];
     uint64_t c0 = 0, c1 = 0, c2 = 0, c3 = 0, c4 = 0, c5 = 0, c6 = 0, c7 = 0;
     uint64_t d0 = 0, d1 = 0, d2 = 0, d3 = 0, d4 = 0;
+#endif
 
 #ifdef VERIFY
     VERIFY_CHECK(a_prec->precomputed);
@@ -945,6 +955,9 @@ static void secp256k1_fe_sqr_prec(secp256k1_fe *r, const secp256k1_fe *a_prec) {
     secp256k1_fe_verify(a_prec);
 #endif
 
+#if defined(USE_EXTERNAL_ASM)
+    secp256k1_fe_sqr_4to5(r->n, a_prec->n);
+#else
     /* Compute 512-bit product. */
     c0 = 0;
     c1 = 0;
@@ -973,6 +986,7 @@ static void secp256k1_fe_sqr_prec(secp256k1_fe *r, const secp256k1_fe *a_prec) {
     add2(d3,d4,c3);
     r->n[3] = d3;
     r->n[4] = d4;
+#endif
 
 #ifdef VERIFY
     r->magnitude = 1;
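
For reference, the "fold the fifth limb" step that the 45to5/55to5 comments above describe can be sketched in portable C roughly as follows. This is an illustrative sketch only, not part of the patch: the helper name is hypothetical, unsigned __int128 (a GCC/Clang extension) is assumed, and the assembly routines implement the same idea with mul/adc or mulx/adcx/adox directly.

#include <stdint.h>

/* Hypothetical helper: fold the fifth limb of a 5x64 value (a[4] may use all
 * 64 bits) back into the bottom four limbs, producing a 4-limb value that is
 * congruent to the input modulo the secp256k1 prime p, using
 * 2^256 == 0x1000003D1 (mod p). */
static void fe_fold_5to4(uint64_t r[4], const uint64_t a[5]) {
    const uint64_t R = 0x1000003D1ULL;   /* 2^32 + 977 */
    unsigned __int128 t;
    uint64_t carry;

    /* r = a[0..3] + a[4]*R, tracking the carry out of bit 256. */
    t = (unsigned __int128)a[4] * R + a[0];
    r[0] = (uint64_t)t;
    t = (t >> 64) + a[1]; r[1] = (uint64_t)t;
    t = (t >> 64) + a[2]; r[2] = (uint64_t)t;
    t = (t >> 64) + a[3]; r[3] = (uint64_t)t;
    carry = (uint64_t)(t >> 64);          /* 0 or 1 */

    /* Fold the 1-bit carry back in. If it is set, the wrapped value is far
     * below 2^97, so adding R cannot carry past r[1]. */
    t = (unsigned __int128)carry * R + r[0];
    r[0] = (uint64_t)t;
    r[1] += (uint64_t)(t >> 64);
}

The multiplication and squaring routines then operate on the four folded limbs and, as the comments in the assembly files state, return a 5-limb result whose fifth limb fits in at most 33 bits.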