diff --git a/.cirrus.yml b/.cirrus.yml index bf71a70839..dda2d9a3fa 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -65,6 +65,9 @@ task: - env: {WIDEMUL: int128, RECOVERY: yes, EXPERIMENTAL: yes, SCHNORRSIG: yes} - env: {WIDEMUL: int128, ECDH: yes, EXPERIMENTAL: yes, SCHNORRSIG: yes} - env: {WIDEMUL: int128, ASM: x86_64} + - env: {WIDEMUL: int128, ASM: x86_64_gen} + - env: {WIDEMUL: int128, ASM: x86_64_maax} + - env: {WIDEMUL: int128, ASM: x86_64_mxaa} - env: { RECOVERY: yes, EXPERIMENTAL: yes, SCHNORRSIG: yes} - env: { STATICPRECOMPUTATION: no} - env: {BUILD: distcheck, WITH_VALGRIND: no, CTIMETEST: no, BENCH: no} @@ -226,6 +229,11 @@ task: EXPERIMENTAL: yes SCHNORRSIG: yes CTIMETEST: no + matrix: + - env: + ASM: no + - env: + ASM: aarch64 << : *MERGE_BASE test_script: - ./ci/cirrus.sh diff --git a/Makefile.am b/Makefile.am index 1e03560884..50091c5529 100644 --- a/Makefile.am +++ b/Makefile.am @@ -30,10 +30,8 @@ noinst_HEADERS += src/ecmult_gen.h noinst_HEADERS += src/ecmult_gen_impl.h noinst_HEADERS += src/field_10x26.h noinst_HEADERS += src/field_10x26_impl.h -noinst_HEADERS += src/field_5x52.h -noinst_HEADERS += src/field_5x52_impl.h -noinst_HEADERS += src/field_5x52_int128_impl.h -noinst_HEADERS += src/field_5x52_asm_impl.h +noinst_HEADERS += src/field_5x64.h +noinst_HEADERS += src/field_5x64_impl.h noinst_HEADERS += src/modinv32.h noinst_HEADERS += src/modinv32_impl.h noinst_HEADERS += src/modinv64.h @@ -69,6 +67,18 @@ if USE_EXTERNAL_ASM if USE_ASM_ARM libsecp256k1_common_la_SOURCES = src/asm/field_10x26_arm.s endif +if USE_ASM_AARCH64 +libsecp256k1_common_la_SOURCES = src/asm/field_5x64_aarch64.s +endif +if USE_ASM_X86_64_GEN +libsecp256k1_common_la_SOURCES = src/asm/field_5x64_x86_64_gen.s +endif +if USE_ASM_X86_64_MXAA +libsecp256k1_common_la_SOURCES = src/asm/field_5x64_x86_64_mxaa.s +endif +if USE_ASM_X86_64_MAAX +libsecp256k1_common_la_SOURCES = src/asm/field_5x64_x86_64_maax.s +endif endif libsecp256k1_la_SOURCES = src/secp256k1.c diff --git a/configure.ac b/configure.ac index 9969cfa343..a53bd3c63f 100644 --- a/configure.ac +++ b/configure.ac @@ -170,7 +170,7 @@ AC_ARG_ENABLE(external_default_callbacks, # Legal values are int64 (for [u]int64_t), int128 (for [unsigned] __int128), and auto (the default). 
AC_ARG_WITH([test-override-wide-multiply], [] ,[set_widemul=$withval], [set_widemul=auto]) -AC_ARG_WITH([asm], [AS_HELP_STRING([--with-asm=x86_64|arm|no|auto], +AC_ARG_WITH([asm], [AS_HELP_STRING([--with-asm=x86_64|x86_64_gen|arm|aarch64|no|auto], [assembly optimizations to use (experimental: arm) [default=auto]])],[req_asm=$withval], [req_asm=auto]) AC_ARG_WITH([ecmult-window], [AS_HELP_STRING([--with-ecmult-window=SIZE|auto], @@ -236,13 +236,13 @@ if test x"$req_asm" = x"auto"; then else set_asm=$req_asm case $set_asm in - x86_64) + x86_64|x86_64_gen|x86_64_mxaa|x86_64_maax) SECP_64BIT_ASM_CHECK if test x"$has_64bit_asm" != x"yes"; then AC_MSG_ERROR([x86_64 assembly optimization requested but not available]) fi ;; - arm) + arm|aarch64) ;; no) ;; @@ -254,12 +254,17 @@ fi # Select assembly optimization use_external_asm=no +use_asm_x86_64=no case $set_asm in x86_64) - AC_DEFINE(USE_ASM_X86_64, 1, [Define this symbol to enable x86_64 assembly optimizations]) + use_asm_x86_64=yes ;; -arm) +x86_64_gen|x86_64_mxaa|x86_64_maax) + use_asm_x86_64=yes + use_external_asm=yes + ;; +arm|aarch64) use_external_asm=yes ;; no) @@ -269,6 +274,10 @@ no) ;; esac +if test x"$use_asm_x86_64" = x"yes"; then + AC_DEFINE(USE_ASM_X86_64, 1, [Define this symbol if x86_64 inline assembly is used]) +fi + if test x"$use_external_asm" = x"yes"; then AC_DEFINE(USE_EXTERNAL_ASM, 1, [Define this symbol if an external (non-inline) assembly implementation is used]) fi @@ -503,6 +512,10 @@ AM_CONDITIONAL([ENABLE_MODULE_EXTRAKEYS], [test x"$enable_module_extrakeys" = x" AM_CONDITIONAL([ENABLE_MODULE_SCHNORRSIG], [test x"$enable_module_schnorrsig" = x"yes"]) AM_CONDITIONAL([USE_EXTERNAL_ASM], [test x"$use_external_asm" = x"yes"]) AM_CONDITIONAL([USE_ASM_ARM], [test x"$set_asm" = x"arm"]) +AM_CONDITIONAL([USE_ASM_AARCH64], [test x"$set_asm" = x"aarch64"]) +AM_CONDITIONAL([USE_ASM_X86_64_GEN], [test x"$set_asm" = x"x86_64_gen"]) +AM_CONDITIONAL([USE_ASM_X86_64_MXAA], [test x"$set_asm" = x"x86_64_mxaa"]) +AM_CONDITIONAL([USE_ASM_X86_64_MAAX], [test x"$set_asm" = x"x86_64_maax"]) # Make sure nothing new is exported so that we don't break the cache. PKGCONFIG_PATH_TEMP="$PKG_CONFIG_PATH" diff --git a/src/asm/field_5x64_aarch64.s b/src/asm/field_5x64_aarch64.s new file mode 100644 index 0000000000..d89f6cf545 --- /dev/null +++ b/src/asm/field_5x64_aarch64.s @@ -0,0 +1,893 @@ +/* AArch64 assembly, created by disassembling the output of GCC 10.3.0 from the C __int128 + * based implementation in src/field_5x64_impl.h. */ + + .text + +/* AArch64 assembly modules, created by disassembling the + output of high-level C functions written by Kaushik */ + + .p2align 4 + .global secp256k1_fe_mul_45to5 + .type secp256k1_fe_mul_45to5, %function +secp256k1_fe_mul_45to5: + stp x29, x30, [sp, #-16]!
+ mov x8, #0x3d1 // #977 + movk x8, #0x1, lsl #32 + mov x29, sp + ldp x7, x3, [x2, #24] + ldp x6, x5, [x2] + ldp x10, x9, [x1] + mul x4, x3, x8 + umulh x3, x3, x8 + adds x4, x4, x6 + cinc x3, x3, cs // cs = hs, nlast + ldr x6, [x2, #16] + adds x5, x5, x3 + cset x3, cs // cs = hs, nlast + adds x6, x6, x3 + cset x2, cs // cs = hs, nlast + adds x7, x7, x2 + cset x2, cs // cs = hs, nlast + mul x18, x9, x6 + ldp x12, x15, [x1, #16] + mul x14, x9, x7 + mul x11, x2, x8 + umulh x2, x2, x8 + adds x11, x11, x4 + umulh x4, x10, x7 + adc x5, x5, x2 + umulh x2, x9, x6 + mul x3, x12, x6 + mul x17, x10, x7 + adds x2, x2, x3 + mul x13, x15, x5 + umulh x1, x12, x5 + cset x3, cs // cs = hs, nlast + adds x4, x4, x14 + cset x14, cs // cs = hs, nlast + adds x2, x2, x4 + adc x3, x3, x14 + umulh x4, x15, x11 + adds x1, x1, x13 + mul x14, x10, x11 + cset x13, cs // cs = hs, nlast + adds x2, x2, x1 + adc x3, x3, x13 + adds x2, x2, x4 + cinc x3, x3, cs // cs = hs, nlast + mul x13, x15, x6 + umulh x1, x12, x6 + mul x4, x2, x8 + umulh x2, x2, x8 + madd x2, x3, x8, x2 + adds x4, x4, x14 + umulh x3, x9, x7 + str x4, [x0] + mul x14, x12, x7 + cinc x2, x2, cs // cs = hs, nlast + adds x1, x1, x13 + umulh x13, x15, x5 + cset x4, cs // cs = hs, nlast + adds x3, x3, x14 + cset x14, cs // cs = hs, nlast + adds x1, x1, x3 + adc x4, x4, x14 + adds x1, x1, x13 + cinc x4, x4, cs // cs = hs, nlast + umulh x3, x15, x7 + mul x13, x5, x10 + umulh x14, x1, x8 + mul x1, x1, x8 + madd x14, x4, x8, x14 + adds x1, x1, x13 + umulh x4, x10, x11 + mul x13, x9, x11 + cinc x14, x14, cs // cs = hs, nlast + adds x16, x4, x13 + mul x4, x3, x8 + umulh x13, x3, x8 + cset x30, cs // cs = hs, nlast + adds x3, x1, x16 + umulh x16, x10, x6 + adc x14, x14, x30 + adds x4, x4, x17 + umulh x1, x12, x11 + cinc x13, x13, cs // cs = hs, nlast + mul x17, x15, x11 + adds x16, x16, x18 + cset x30, cs // cs = hs, nlast + adds x4, x4, x16 + mul x18, x12, x5 + adc x13, x13, x30 + umulh x16, x9, x5 + adds x17, x1, x17 + cset x30, cs // cs = hs, nlast + adds x4, x4, x17 + umulh x1, x12, x7 + adc x13, x13, x30 + mul x7, x15, x7 + adds x16, x16, x18 + umulh x15, x15, x6 + cset x17, cs // cs = hs, nlast + adds x4, x4, x16 + umulh x16, x9, x11 + adc x13, x13, x17 + adds x1, x1, x7 + cset x7, cs // cs = hs, nlast + adds x1, x1, x15 + cinc x15, x7, cs // cs = hs, nlast + mul x12, x12, x11 + mul x6, x10, x6 + umulh x7, x1, x8 + mul x1, x1, x8 + madd x7, x15, x8, x7 + adds x1, x1, x16 + mul x9, x9, x5 + cinc x7, x7, cs // cs = hs, nlast + umulh x5, x5, x10 + adds x6, x6, x12 + cset x8, cs // cs = hs, nlast + adds x1, x1, x6 + adc x7, x7, x8 + adds x5, x5, x9 + cset x6, cs // cs = hs, nlast + adds x1, x1, x5 + adc x7, x7, x6 + adds x2, x2, x3 + cinc x14, x14, cs // cs = hs, nlast + adds x1, x1, x14 + stp x2, x1, [x0, #8] + cinc x7, x7, cs // cs = hs, nlast + adds x4, x4, x7 + cinc x13, x13, cs // cs = hs, nlast + stp x4, x13, [x0, #24] + ldp x29, x30, [sp], #16 + ret + .size secp256k1_fe_mul_45to5, .-secp256k1_fe_mul_45to5 + + .p2align 4 + .global secp256k1_fe_mul_55to5 + .type secp256k1_fe_mul_55to5, %function +secp256k1_fe_mul_55to5: + stp x29, x30, [sp, #-32]! 
+ mov x9, #0x3d1 // #977 + movk x9, #0x1, lsl #32 + mov x29, sp + ldp x5, x10, [x1] + stp x19, x20, [sp, #16] + ldr x3, [x1, #32] + ldp x12, x8, [x1, #16] + mul x4, x3, x9 + umulh x3, x3, x9 + adds x4, x4, x5 + cinc x3, x3, cs // cs = hs, nlast + adds x10, x10, x3 + cset x3, cs // cs = hs, nlast + adds x12, x12, x3 + cset x1, cs // cs = hs, nlast + adds x8, x8, x1 + cset x3, cs // cs = hs, nlast + ldp x7, x11, [x2] + ldr x1, [x2, #32] + mul x6, x3, x9 + umulh x5, x3, x9 + adds x6, x6, x4 + mul x3, x1, x9 + adc x10, x10, x5 + umulh x1, x1, x9 + adds x3, x3, x7 + cinc x1, x1, cs // cs = hs, nlast + ldr x13, [x2, #16] + adds x11, x11, x1 + cset x1, cs // cs = hs, nlast + ldr x7, [x2, #24] + adds x13, x13, x1 + cset x1, cs // cs = hs, nlast + adds x7, x7, x1 + cset x1, cs // cs = hs, nlast + mul x19, x12, x13 + umulh x14, x10, x13 + mul x18, x10, x7 + mul x5, x1, x9 + umulh x1, x1, x9 + adds x5, x5, x3 + umulh x17, x7, x6 + adc x11, x11, x1 + mul x16, x7, x12 + umulh x1, x12, x13 + umulh x2, x8, x5 + umulh x4, x11, x12 + mul x15, x11, x8 + adds x2, x2, x19 + cset x3, cs // cs = hs, nlast + mul x19, x6, x5 + adds x15, x4, x15 + umulh x4, x11, x8 + cset x20, cs // cs = hs, nlast + adds x2, x2, x15 + adc x3, x3, x20 + adds x14, x14, x18 + cset x15, cs // cs = hs, nlast + adds x2, x2, x14 + adc x3, x3, x15 + adds x2, x2, x17 + cinc x3, x3, cs // cs = hs, nlast + mul x18, x13, x8 + umulh x17, x10, x7 + mul x14, x2, x9 + umulh x2, x2, x9 + madd x2, x3, x9, x2 + adds x14, x14, x19 + str x14, [x0] + mul x15, x10, x5 + cinc x2, x2, cs // cs = hs, nlast + adds x1, x1, x16 + cset x3, cs // cs = hs, nlast + adds x4, x4, x18 + cset x14, cs // cs = hs, nlast + adds x1, x1, x4 + adc x4, x3, x14 + adds x1, x1, x17 + cinc x4, x4, cs // cs = hs, nlast + mul x17, x11, x6 + umulh x3, x6, x5 + umulh x14, x1, x9 + mul x1, x1, x9 + madd x14, x4, x9, x14 + umulh x16, x7, x8 + adds x1, x1, x15 + umulh x4, x12, x5 + cinc x14, x14, cs // cs = hs, nlast + mul x15, x8, x5 + adds x3, x3, x17 + cset x18, cs // cs = hs, nlast + mul x17, x7, x6 + adds x3, x1, x3 + umulh x1, x13, x6 + adc x14, x14, x18 + adds x4, x4, x15 + mul x19, x11, x12 + cset x15, cs // cs = hs, nlast + adds x18, x1, x17 + mul x1, x16, x9 + umulh x17, x16, x9 + cset x20, cs // cs = hs, nlast + adds x4, x4, x18 + mul x30, x10, x13 + umulh x16, x10, x11 + adc x15, x15, x20 + adds x18, x1, x19 + umulh x1, x13, x8 + cinc x17, x17, cs // cs = hs, nlast + adds x4, x4, x18 + mul x8, x7, x8 + adc x15, x15, x17 + adds x16, x16, x30 + umulh x7, x7, x12 + cset x17, cs // cs = hs, nlast + adds x4, x4, x16 + adc x15, x15, x17 + adds x1, x1, x8 + cset x8, cs // cs = hs, nlast + adds x1, x1, x7 + cinc x8, x8, cs // cs = hs, nlast + mul x13, x13, x6 + mul x12, x12, x5 + umulh x7, x1, x9 + mul x1, x1, x9 + madd x7, x8, x9, x7 + umulh x6, x11, x6 + adds x1, x1, x13 + umulh x5, x10, x5 + cinc x7, x7, cs // cs = hs, nlast + mul x10, x10, x11 + adds x6, x6, x12 + cset x8, cs // cs = hs, nlast + adds x1, x1, x6 + adc x7, x7, x8 + adds x5, x5, x10 + cset x6, cs // cs = hs, nlast + adds x1, x1, x5 + adc x7, x7, x6 + adds x2, x2, x3 + cinc x14, x14, cs // cs = hs, nlast + adds x1, x1, x14 + stp x2, x1, [x0, #8] + cinc x7, x7, cs // cs = hs, nlast + adds x4, x4, x7 + cinc x15, x15, cs // cs = hs, nlast + stp x4, x15, [x0, #24] + ldp x19, x20, [sp, #16] + ldp x29, x30, [sp], #32 + ret + .size secp256k1_fe_mul_55to5, .-secp256k1_fe_mul_55to5 + + .p2align 4 + .global secp256k1_fe_sqr_5to5 + .type secp256k1_fe_sqr_5to5, %function +secp256k1_fe_sqr_5to5: + stp x29, x30, [sp, #-16]! 
+ mov x6, #0x3d1 // #977 + movk x6, #0x1, lsl #32 + mov x29, sp + ldp x5, x4, [x1] + ldr x2, [x1, #32] + mul x3, x2, x6 + umulh x2, x2, x6 + adds x5, x3, x5 + cinc x2, x2, cs // cs = hs, nlast + adds x4, x4, x2 + ldp x3, x2, [x1, #16] + cset x7, cs // cs = hs, nlast + adds x3, x3, x7 + cset x1, cs // cs = hs, nlast + adds x2, x2, x1 + cset x7, cs // cs = hs, nlast + mul x13, x3, x3 + umulh x14, x3, x3 + umulh x15, x2, x3 + mul x1, x7, x6 + umulh x7, x7, x6 + adds x1, x1, x5 + mul x17, x2, x2 + adc x4, x4, x7 + lsl x5, x15, #1 + lsr x15, x15, #63 + mul x16, x2, x3 + umulh x8, x2, x1 + mul x10, x4, x2 + umulh x7, x4, x3 + mul x9, x4, x1 + adds x7, x7, x10 + mul x12, x1, x1 + cset x11, cs // cs = hs, nlast + adds x8, x7, x8 + cinc x11, x11, cs // cs = hs, nlast + umulh x30, x1, x1 + lsl x7, x8, #1 + lsl x10, x9, #1 + adds x7, x7, x13 + extr x8, x11, x8, #63 + cinc x8, x8, cs // cs = hs, nlast + lsr x9, x9, #63 + mul x18, x4, x4 + mul x11, x7, x6 + umulh x7, x7, x6 + madd x7, x8, x6, x7 + adds x11, x11, x12 + str x11, [x0] + umulh x13, x4, x2 + cinc x7, x7, cs // cs = hs, nlast + adds x10, x10, x30 + cinc x9, x9, cs // cs = hs, nlast + adds x5, x5, x17 + cinc x8, x15, cs // cs = hs, nlast + mul x17, x3, x1 + mul x15, x2, x1 + umulh x11, x5, x6 + mul x5, x5, x6 + madd x11, x8, x6, x11 + adds x5, x5, x18 + umulh x12, x4, x1 + cinc x11, x11, cs // cs = hs, nlast + adds x13, x13, x16 + cset x8, cs // cs = hs, nlast + mul x16, x4, x3 + umulh x1, x3, x1 + lsl x3, x13, #1 + adds x3, x3, x14 + extr x8, x8, x13, #63 + cinc x14, x8, cs // cs = hs, nlast + umulh x2, x2, x2 + umulh x4, x4, x4 + mul x13, x3, x6 + umulh x3, x3, x6 + madd x3, x14, x6, x3 + adds x13, x13, x7 + mul x8, x2, x6 + cinc x3, x3, cs // cs = hs, nlast + adds x10, x10, x13 + adc x9, x9, x3 + adds x12, x12, x17 + cset x3, cs // cs = hs, nlast + umulh x2, x2, x6 + lsl x6, x12, #1 + adds x6, x6, x9 + extr x3, x3, x12, #63 + cinc x3, x3, cs // cs = hs, nlast + adds x5, x5, x6 + adc x11, x11, x3 + adds x1, x1, x15 + cset x3, cs // cs = hs, nlast + adds x1, x1, x16 + cinc x3, x3, cs // cs = hs, nlast + stp x10, x5, [x0, #8] + lsl x5, x1, #1 + adds x5, x5, x11 + extr x1, x3, x1, #63 + cinc x1, x1, cs // cs = hs, nlast + adds x4, x8, x4 + cinc x2, x2, cs // cs = hs, nlast + adds x4, x4, x5 + adc x2, x1, x2 + stp x4, x2, [x0, #24] + ldp x29, x30, [sp], #16 + ret + nop + .size secp256k1_fe_sqr_5to5, .-secp256k1_fe_sqr_5to5 + + .p2align 4 + .global secp256k1_fe_mul_44to5 + .type secp256k1_fe_mul_44to5, %function +secp256k1_fe_mul_44to5: + stp x29, x30, [sp, #-32]! 
+ mov x9, #0x3d1 // #977 + movk x9, #0x1, lsl #32 + mov x29, sp + ldp x10, x6, [x1] + ldp x11, x15, [x1, #16] + ldp x5, x12, [x2, #16] + ldp x7, x8, [x2] + str x19, [sp, #16] + mul x13, x11, x5 + mul x3, x6, x12 + umulh x4, x12, x10 + adds x1, x3, x13 + mul x13, x15, x8 + umulh x2, x6, x5 + cset x3, cs // cs = hs, nlast + adds x4, x4, x13 + umulh x13, x11, x8 + cset x14, cs // cs = hs, nlast + adds x1, x1, x4 + adc x3, x3, x14 + umulh x4, x15, x7 + adds x2, x2, x13 + mul x14, x10, x7 + cset x13, cs // cs = hs, nlast + adds x1, x1, x2 + adc x3, x3, x13 + adds x1, x1, x4 + cinc x3, x3, cs // cs = hs, nlast + umulh x13, x11, x5 + umulh x2, x6, x12 + mul x4, x1, x9 + umulh x1, x1, x9 + madd x1, x3, x9, x1 + adds x4, x4, x14 + umulh x3, x15, x8 + str x4, [x0] + mul x14, x12, x11 + cinc x1, x1, cs // cs = hs, nlast + adds x2, x2, x13 + mul x13, x5, x15 + cset x4, cs // cs = hs, nlast + adds x3, x3, x14 + cset x14, cs // cs = hs, nlast + adds x2, x2, x3 + adc x4, x4, x14 + adds x2, x2, x13 + cinc x4, x4, cs // cs = hs, nlast + mul x16, x6, x7 + mul x14, x8, x10 + umulh x13, x2, x9 + mul x2, x2, x9 + umulh x3, x10, x7 + madd x13, x4, x9, x13 + adds x2, x2, x16 + mul x4, x12, x10 + cinc x13, x13, cs // cs = hs, nlast + adds x3, x3, x14 + mul x14, x6, x5 + cset x16, cs // cs = hs, nlast + mul x30, x15, x7 + adds x3, x2, x3 + mul x17, x11, x8 + adc x13, x13, x16 + umulh x2, x12, x15 + adds x4, x4, x14 + umulh x16, x5, x10 + cset x14, cs // cs = hs, nlast + umulh x18, x6, x8 + adds x17, x17, x30 + cset x19, cs // cs = hs, nlast + adds x4, x4, x17 + umulh x30, x11, x7 + adc x14, x14, x19 + mul x17, x2, x9 + adds x18, x16, x18 + umulh x16, x2, x9 + cset x2, cs // cs = hs, nlast + adds x4, x4, x18 + umulh x18, x5, x15 + adc x14, x14, x2 + adds x17, x17, x30 + umulh x2, x12, x11 + cinc x16, x16, cs // cs = hs, nlast + mul x12, x12, x15 + adds x4, x4, x17 + adc x14, x14, x16 + adds x2, x2, x18 + cset x16, cs // cs = hs, nlast + adds x2, x2, x12 + cinc x16, x16, cs // cs = hs, nlast + mul x11, x11, x7 + umulh x15, x8, x10 + umulh x12, x2, x9 + mul x2, x2, x9 + umulh x7, x6, x7 + madd x12, x16, x9, x12 + adds x2, x2, x11 + mul x6, x6, x8 + mul x5, x5, x10 + cinc x12, x12, cs // cs = hs, nlast + adds x7, x15, x7 + cset x8, cs // cs = hs, nlast + adds x5, x5, x6 + cset x6, cs // cs = hs, nlast + adds x7, x7, x5 + adc x5, x8, x6 + adds x2, x2, x7 + adc x12, x12, x5 + adds x1, x1, x3 + cinc x13, x13, cs // cs = hs, nlast + adds x2, x2, x13 + stp x1, x2, [x0, #8] + cinc x12, x12, cs // cs = hs, nlast + adds x4, x4, x12 + cinc x14, x14, cs // cs = hs, nlast + stp x4, x14, [x0, #24] + ldr x19, [sp, #16] + ldp x29, x30, [sp], #32 + ret + .size secp256k1_fe_mul_44to5, .-secp256k1_fe_mul_44to5 + + .p2align 4 + .global secp256k1_fe_sqr_4to5 + .type secp256k1_fe_sqr_4to5, %function +secp256k1_fe_sqr_4to5: + ldp x9, x3, [x1] + mov x7, #0x3d1 // #977 + ldr x2, [x1, #24] + movk x7, #0x1, lsl #32 + ldr x1, [x1, #16] + mul x6, x3, x2 + umulh x4, x2, x9 + umulh x5, x3, x1 + adds x4, x4, x6 + mul x12, x1, x1 + cset x6, cs // cs = hs, nlast + adds x5, x4, x5 + cinc x6, x6, cs // cs = hs, nlast + mul x10, x3, x9 + lsl x4, x5, #1 + mul x15, x9, x9 + adds x4, x4, x12 + extr x5, x6, x5, #63 + cinc x5, x5, cs // cs = hs, nlast + umulh x13, x9, x9 + mul x12, x1, x9 + lsl x11, x10, #1 + mul x6, x4, x7 + lsr x10, x10, #63 + umulh x4, x4, x7 + madd x4, x5, x7, x4 + adds x6, x6, x15 + umulh x8, x3, x9 + str x6, [x0] + cinc x6, x4, cs // cs = hs, nlast + adds x11, x11, x13 + cinc x10, x10, cs // cs = hs, nlast + adds x8, x8, x12 + mul 
x14, x3, x3 + cset x12, cs // cs = hs, nlast + umulh x5, x3, x2 + lsl x13, x8, #1 + mul x4, x2, x1 + adds x13, x13, x14 + umulh x15, x1, x1 + extr x12, x12, x8, #63 + cinc x12, x12, cs // cs = hs, nlast + adds x14, x5, x4 + cset x17, cs // cs = hs, nlast + umulh x8, x2, x1 + lsl x5, x14, #1 + mul x18, x2, x2 + adds x5, x5, x15 + extr x17, x17, x14, #63 + cinc x17, x17, cs // cs = hs, nlast + lsl x4, x8, #1 + lsr x14, x8, #63 + mul x16, x3, x1 + mul x15, x5, x7 + umulh x5, x5, x7 + madd x5, x17, x7, x5 + adds x15, x15, x6 + mul x8, x2, x9 + cinc x5, x5, cs // cs = hs, nlast + adds x11, x11, x15 + adc x10, x10, x5 + adds x4, x4, x18 + cinc x5, x14, cs // cs = hs, nlast + umulh x9, x1, x9 + umulh x2, x2, x2 + mul x6, x4, x7 + umulh x4, x4, x7 + madd x4, x5, x7, x4 + adds x6, x6, x10 + mul x5, x2, x7 + cinc x4, x4, cs // cs = hs, nlast + adds x6, x6, x13 + adc x12, x12, x4 + adds x4, x8, x16 + cset x1, cs // cs = hs, nlast + adds x4, x4, x9 + cinc x1, x1, cs // cs = hs, nlast + umulh x2, x2, x7 + umulh x3, x3, x3 + lsl x7, x4, #1 + adds x7, x7, x12 + extr x1, x1, x4, #63 + cinc x1, x1, cs // cs = hs, nlast + adds x3, x5, x3 + cinc x2, x2, cs // cs = hs, nlast + adds x3, x3, x7 + adc x1, x1, x2 + stp x11, x6, [x0, #8] + stp x3, x1, [x0, #24] + ret + nop + nop + .size secp256k1_fe_sqr_4to5, .-secp256k1_fe_sqr_4to5 + + .p2align 4 + .global secp256k1_fe_mul_44to4 + .type secp256k1_fe_mul_44to4, %function +secp256k1_fe_mul_44to4: + stp x29, x30, [sp, #-32]! + mov x6, #0x3d1 // #977 + movk x6, #0x1, lsl #32 + mov x29, sp + ldp x10, x8, [x1] + stp x19, x20, [sp, #16] + ldp x13, x15, [x1, #16] + ldp x5, x14, [x2, #16] + ldp x11, x9, [x2] + mul x7, x13, x5 + mul x3, x8, x14 + umulh x4, x14, x10 + adds x1, x3, x7 + mul x7, x15, x9 + umulh x2, x8, x5 + cset x3, cs // cs = hs, nlast + adds x4, x4, x7 + umulh x7, x13, x9 + cset x12, cs // cs = hs, nlast + adds x1, x1, x4 + adc x3, x3, x12 + umulh x4, x15, x11 + adds x2, x2, x7 + mul x7, x10, x11 + cset x12, cs // cs = hs, nlast + adds x1, x1, x2 + adc x3, x3, x12 + adds x1, x1, x4 + cinc x3, x3, cs // cs = hs, nlast + umulh x4, x13, x5 + umulh x2, x8, x14 + umulh x12, x1, x6 + mul x1, x1, x6 + madd x12, x3, x6, x12 + mul x16, x14, x13 + adds x1, x1, x7 + umulh x3, x15, x9 + cinc x12, x12, cs // cs = hs, nlast + adds x2, x2, x4 + mul x7, x5, x15 + cset x4, cs // cs = hs, nlast + adds x3, x3, x16 + cset x16, cs // cs = hs, nlast + adds x2, x2, x3 + adc x3, x4, x16 + adds x2, x2, x7 + cinc x3, x3, cs // cs = hs, nlast + mul x17, x8, x11 + mul x7, x9, x10 + umulh x16, x2, x6 + mul x2, x2, x6 + umulh x4, x10, x11 + madd x16, x3, x6, x16 + adds x2, x2, x17 + mul x3, x14, x10 + cinc x16, x16, cs // cs = hs, nlast + adds x4, x4, x7 + mul x7, x8, x5 + cset x17, cs // cs = hs, nlast + mul x19, x15, x11 + adds x4, x2, x4 + mul x18, x13, x9 + adc x16, x16, x17 + umulh x2, x14, x15 + adds x3, x3, x7 + umulh x17, x5, x10 + cset x7, cs // cs = hs, nlast + umulh x30, x8, x9 + adds x18, x18, x19 + cset x20, cs // cs = hs, nlast + adds x3, x3, x18 + umulh x19, x13, x11 + adc x7, x7, x20 + mul x18, x2, x6 + adds x30, x17, x30 + cset x20, cs // cs = hs, nlast + umulh x17, x2, x6 + adds x3, x3, x30 + umulh x2, x14, x13 + umulh x30, x5, x15 + adc x7, x7, x20 + adds x18, x18, x19 + mul x14, x14, x15 + cinc x17, x17, cs // cs = hs, nlast + adds x3, x3, x18 + adc x7, x7, x17 + adds x2, x2, x30 + cset x17, cs // cs = hs, nlast + adds x2, x2, x14 + cinc x17, x17, cs // cs = hs, nlast + mul x13, x13, x11 + umulh x15, x9, x10 + umulh x14, x2, x6 + mul x2, x2, x6 + umulh x11, x8, x11 + 
madd x14, x17, x6, x14 + adds x2, x2, x13 + mul x8, x8, x9 + mul x5, x5, x10 + cinc x14, x14, cs // cs = hs, nlast + adds x9, x15, x11 + cset x10, cs // cs = hs, nlast + adds x5, x5, x8 + cset x8, cs // cs = hs, nlast + adds x9, x9, x5 + adc x5, x10, x8 + adds x2, x2, x9 + adc x14, x14, x5 + adds x12, x12, x4 + cinc x16, x16, cs // cs = hs, nlast + adds x2, x2, x16 + cinc x14, x14, cs // cs = hs, nlast + adds x3, x3, x14 + cinc x4, x7, cs // cs = hs, nlast + ldp x19, x20, [sp, #16] + mul x5, x4, x6 + umulh x4, x4, x6 + adds x1, x1, x5 + cinc x4, x4, cs // cs = hs, nlast + adds x4, x4, x12 + cset x5, cs // cs = hs, nlast + adds x5, x5, x2 + cset x2, cs // cs = hs, nlast + adds x2, x2, x3 + stp x5, x2, [x0, #16] + cset x3, cs // cs = hs, nlast + ldp x29, x30, [sp], #32 + mul x2, x3, x6 + umulh x3, x3, x6 + adds x2, x2, x1 + adc x4, x4, x3 + stp x2, x4, [x0] + ret + .size secp256k1_fe_mul_44to4, .-secp256k1_fe_mul_44to4 + + .p2align 4 + .global secp256k1_fe_sqr_4to4 + .type secp256k1_fe_sqr_4to4, %function +secp256k1_fe_sqr_4to4: + stp x29, x30, [sp, #-16]! + mov x4, #0x3d1 // #977 + movk x4, #0x1, lsl #32 + mov x29, sp + ldp x8, x5, [x1] + ldr x3, [x1, #24] + ldr x1, [x1, #16] + mul x7, x5, x3 + umulh x2, x3, x8 + umulh x6, x5, x1 + adds x2, x2, x7 + mul x14, x1, x1 + cset x7, cs // cs = hs, nlast + adds x6, x2, x6 + cinc x7, x7, cs // cs = hs, nlast + mul x11, x5, x8 + lsl x2, x6, #1 + mul x13, x8, x8 + adds x2, x2, x14 + extr x6, x7, x6, #63 + cinc x6, x6, cs // cs = hs, nlast + umulh x12, x8, x8 + mul x7, x1, x8 + lsl x9, x11, #1 + umulh x17, x2, x4 + lsr x11, x11, #63 + mul x2, x2, x4 + madd x17, x6, x4, x17 + adds x2, x2, x13 + umulh x10, x5, x8 + cinc x17, x17, cs // cs = hs, nlast + adds x9, x9, x12 + cinc x11, x11, cs // cs = hs, nlast + adds x10, x10, x7 + mul x14, x5, x5 + cset x12, cs // cs = hs, nlast + umulh x6, x5, x3 + lsl x13, x10, #1 + mul x7, x3, x1 + adds x13, x13, x14 + umulh x15, x1, x1 + extr x12, x12, x10, #63 + cinc x12, x12, cs // cs = hs, nlast + adds x14, x6, x7 + cset x18, cs // cs = hs, nlast + umulh x10, x3, x1 + lsl x7, x14, #1 + mul x30, x3, x3 + adds x7, x7, x15 + extr x18, x18, x14, #63 + cinc x18, x18, cs // cs = hs, nlast + lsl x6, x10, #1 + lsr x14, x10, #63 + mul x10, x3, x8 + mul x15, x7, x4 + umulh x7, x7, x4 + madd x7, x18, x4, x7 + adds x15, x15, x17 + umulh x17, x1, x8 + cinc x7, x7, cs // cs = hs, nlast + adds x9, x9, x15 + adc x11, x11, x7 + adds x6, x6, x30 + cinc x7, x14, cs // cs = hs, nlast + mul x16, x5, x1 + umulh x3, x3, x3 + mul x8, x6, x4 + umulh x6, x6, x4 + madd x6, x7, x4, x6 + adds x8, x8, x11 + mul x7, x3, x4 + cinc x6, x6, cs // cs = hs, nlast + adds x8, x8, x13 + adc x12, x12, x6 + adds x6, x10, x16 + cset x1, cs // cs = hs, nlast + adds x6, x6, x17 + cinc x1, x1, cs // cs = hs, nlast + umulh x5, x5, x5 + lsl x10, x6, #1 + umulh x3, x3, x4 + adds x10, x10, x12 + extr x1, x1, x6, #63 + cinc x1, x1, cs // cs = hs, nlast + adds x5, x7, x5 + cinc x3, x3, cs // cs = hs, nlast + adds x5, x5, x10 + adc x1, x1, x3 + ldp x29, x30, [sp], #16 + mul x3, x1, x4 + umulh x1, x1, x4 + adds x2, x2, x3 + cinc x1, x1, cs // cs = hs, nlast + adds x1, x1, x9 + cset x3, cs // cs = hs, nlast + adds x3, x3, x8 + str x3, [x0, #16] + cset x3, cs // cs = hs, nlast + adds x3, x3, x5 + str x3, [x0, #24] + cset x5, cs // cs = hs, nlast + mul x3, x5, x4 + umulh x5, x5, x4 + adds x3, x3, x2 + adc x1, x1, x5 + stp x3, x1, [x0] + ret + .size secp256k1_fe_sqr_4to4, .-secp256k1_fe_sqr_4to4 diff --git a/src/asm/field_5x64_x86_64_gen.s b/src/asm/field_5x64_x86_64_gen.s 
new file mode 100644 index 0000000000..6953956190 --- /dev/null +++ b/src/asm/field_5x64_x86_64_gen.s @@ -0,0 +1,1338 @@ +/************************************************************************ + * Field multiplication and squaring assemblies using representation of * + * field elements in base 2^{64}. * + * Major instructions used in the assemblies are mul/add/adc. * + * * + * Copyright (c) 2021 Kaushik Nath * + * Distributed under the MIT software license, see the accompanying * + * file COPYING or https://www.opensource.org/licenses/mit-license.php. * + ***********************************************************************/ + + .att_syntax + .text +/* + * 64-bit field multiplication and squaring using the bottom 4-limbs of + * two field elements having 5-limb representation such that the fifth + * limb is of at most 64 bits. The 5-limb inputs are fully reduced first + * to 4-limb forms, then multiplied, after which a field element in 5-limb + * form is reported as output. The fifth limb of the output has at most + * 33 bits. + */ + .p2align 4 + .global secp256k1_fe_mul_55to5 + .type secp256k1_fe_mul_55to5, %function + +secp256k1_fe_mul_55to5: + + movq %rsp,%r11 + subq $96,%rsp + + movq %r11,0(%rsp) + movq %r12,8(%rsp) + movq %r13,16(%rsp) + movq %r14,24(%rsp) + movq %r15,32(%rsp) + movq %rbx,40(%rsp) + movq %rbp,48(%rsp) + movq %rdi,56(%rsp) + + movq $0x1000003d1,%rcx + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%rbx + movq 24(%rdx),%rbp + movq 32(%rdx),%r13 + + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%rax + + mulq %rcx + xorq %rdi,%rdi + addq 0(%rsi),%rax + adcq 8(%rsi),%rdx + adcq $0,%r10 + movq %r10,80(%rsp) + adcq $0,%r11 + movq %r11,88(%rsp) + cmovc %rcx,%rdi + addq %rax,%rdi + movq %rdi,64(%rsp) + adcq $0,%rdx + movq %rdx,72(%rsp) + + movq %r13,%rax + mulq %rcx + xorq %rdi,%rdi + addq %r8,%rax + adcq %r9,%rdx + adcq $0,%rbx + adcq $0,%rbp + cmovc %rcx,%rdi + addq %rax,%rdi + adcq $0,%rdx + movq %rdx,%rsi + + movq 72(%rsp),%rax + mulq %rbp + movq %rax,%r8 + xorq %r9,%r9 + movq %rdx,%r10 + xorq %r11,%r11 + + movq 80(%rsp),%rax + mulq %rbx + addq %rax,%r8 + adcq $0,%r9 + addq %rdx,%r10 + adcq $0,%r11 + + movq 88(%rsp),%rax + mulq %rsi + addq %rax,%r8 + adcq $0,%r9 + addq %rdx,%r10 + adcq $0,%r11 + + movq 80(%rsp),%rax + mulq %rbp + addq %rax,%r10 + adcq $0,%r11 + movq %rdx,%r12 + xorq %r13,%r13 + + movq 88(%rsp),%rax + mulq %rbx + addq %rax,%r10 + adcq $0,%r11 + addq %rdx,%r12 + adcq $0,%r13 + + movq %rcx,%rax + mulq %r10 + imul %rcx,%r11 + movq %rax,%r10 + addq %rdx,%r11 + + movq 88(%rsp),%rax + mulq %rbp + addq %rax,%r12 + adcq $0,%r13 + + movq %rcx,%rax + mulq %rdx + movq %rax,%r14 + movq %rdx,%r15 + + movq %rcx,%rax + mulq %r12 + imul %rcx,%r13 + movq %rax,%r12 + addq %rdx,%r13 + + movq 64(%rsp),%rax + mulq %rbp + addq %rax,%r14 + adcq $0,%r15 + addq %rdx,%r8 + adcq $0,%r9 + + movq 72(%rsp),%rax + mulq %rbx + addq %rax,%r14 + adcq $0,%r15 + addq %rdx,%r8 + adcq $0,%r9 + + movq 80(%rsp),%rax + mulq %rsi + addq %rax,%r14 + adcq $0,%r15 + addq %rdx,%r8 + adcq $0,%r9 + + movq 88(%rsp),%rax + mulq %rdi + addq %rax,%r14 + adcq $0,%r15 + addq %rdx,%r8 + adcq $0,%r9 + + movq %rcx,%rax + mulq %r8 + imul %rcx,%r9 + movq %rax,%r8 + addq %rdx,%r9 + + movq 64(%rsp),%rax + mulq %rdi + addq %rax,%r8 + adcq $0,%r9 + addq %rdx,%r10 + adcq $0,%r11 + + movq 64(%rsp),%rax + mulq %rsi + addq %rax,%r10 + adcq $0,%r11 + addq %rdx,%r12 + adcq $0,%r13 + + movq 72(%rsp),%rax + mulq %rdi + addq %rax,%r10 + adcq $0,%r11 + addq %rdx,%r12 + adcq $0,%r13 + + movq 64(%rsp),%rax + 
mulq %rbx + addq %rax,%r12 + adcq $0,%r13 + addq %rdx,%r14 + adcq $0,%r15 + + movq 72(%rsp),%rax + mulq %rsi + addq %rax,%r12 + adcq $0,%r13 + addq %rdx,%r14 + adcq $0,%r15 + + movq 80(%rsp),%rax + mulq %rdi + addq %rax,%r12 + adcq $0,%r13 + addq %rdx,%r14 + adcq $0,%r15 + + addq %r9,%r10 + adcq $0,%r11 + addq %r11,%r12 + adcq $0,%r13 + addq %r13,%r14 + adcq $0,%r15 + + movq 56(%rsp),%rdi + + movq %r8,0(%rdi) + movq %r10,8(%rdi) + movq %r12,16(%rdi) + movq %r14,24(%rdi) + movq %r15,32(%rdi) + + movq 0(%rsp),%r11 + movq 8(%rsp),%r12 + movq 16(%rsp),%r13 + movq 24(%rsp),%r14 + movq 32(%rsp),%r15 + movq 40(%rsp),%rbx + movq 48(%rsp),%rbp + + movq %r11,%rsp + + ret + + + .p2align 4 + .global secp256k1_fe_sqr_5to5 + .type secp256k1_fe_sqr_5to5, %function + +secp256k1_fe_sqr_5to5: + + movq %rsp,%r11 + subq $64,%rsp + + movq %r11,0(%rsp) + movq %r12,8(%rsp) + movq %r13,16(%rsp) + movq %r14,24(%rsp) + movq %r15,32(%rsp) + movq %rbx,40(%rsp) + movq %rbp,48(%rsp) + movq %rdi,56(%rsp) + + movq 0(%rsi),%rbx + movq 8(%rsi),%rbp + movq 16(%rsi),%rcx + movq 24(%rsi),%rdi + movq 32(%rsi),%rax + + movq $0x1000003d1,%rsi + + mulq %rsi + movq $0,%r8 + addq %rax,%rbx + adcq %rdx,%rbp + adcq $0,%rcx + adcq $0,%rdi + cmovc %rsi,%r8 + addq %r8,%rbx + adcq $0,%rbp + + movq %rbp,%rax + mulq %rdi + movq %rax,%r8 + xorq %r9,%r9 + movq %rdx,%r10 + xorq %r11,%r11 + addq %rax,%r8 + adcq $0,%r9 + addq %rdx,%r10 + adcq $0,%r11 + + movq %rcx,%rax + mulq %rcx + addq %rax,%r8 + adcq $0,%r9 + addq %rdx,%r10 + adcq $0,%r11 + + movq %rcx,%rax + mulq %rdi + addq %rax,%r10 + adcq $0,%r11 + movq %rdx,%r12 + xorq %r13,%r13 + addq %rax,%r10 + adcq $0,%r11 + addq %rdx,%r12 + adcq $0,%r13 + + movq %rsi,%rax + mulq %r10 + imul %rsi,%r11 + movq %rax,%r10 + addq %rdx,%r11 + + movq %rdi,%rax + mulq %rdi + addq %rax,%r12 + adcq $0,%r13 + + movq %rsi,%rax + mulq %rdx + movq %rax,%r14 + movq %rdx,%r15 + + movq %rsi,%rax + mulq %r12 + imul %rsi,%r13 + movq %rax,%r12 + addq %rdx,%r13 + + movq %rbx,%rax + mulq %rdi + addq %rax,%r14 + adcq $0,%r15 + addq %rdx,%r8 + adcq $0,%r9 + addq %rax,%r14 + adcq $0,%r15 + addq %rdx,%r8 + adcq $0,%r9 + + movq %rbp,%rax + mulq %rcx + addq %rax,%r14 + adcq $0,%r15 + addq %rdx,%r8 + adcq $0,%r9 + addq %rax,%r14 + adcq $0,%r15 + addq %rdx,%r8 + adcq $0,%r9 + + movq %rsi,%rax + mulq %r8 + imul %rsi,%r9 + movq %rax,%r8 + addq %rdx,%r9 + + movq %rbx,%rax + mulq %rbx + addq %rax,%r8 + adcq $0,%r9 + addq %rdx,%r10 + adcq $0,%r11 + + movq %rbx,%rax + mulq %rbp + addq %rax,%r10 + adcq $0,%r11 + addq %rdx,%r12 + adcq $0,%r13 + addq %rax,%r10 + adcq $0,%r11 + addq %rdx,%r12 + adcq $0,%r13 + + movq %rbx,%rax + mulq %rcx + addq %rax,%r12 + adcq $0,%r13 + addq %rdx,%r14 + adcq $0,%r15 + addq %rax,%r12 + adcq $0,%r13 + addq %rdx,%r14 + adcq $0,%r15 + + movq %rbp,%rax + mulq %rbp + addq %rax,%r12 + adcq $0,%r13 + addq %rdx,%r14 + adcq $0,%r15 + + addq %r9,%r10 + adcq $0,%r11 + addq %r11,%r12 + adcq $0,%r13 + addq %r13,%r14 + adcq $0,%r15 + + movq 56(%rsp),%rdi + + movq %r8,0(%rdi) + movq %r10,8(%rdi) + movq %r12,16(%rdi) + movq %r14,24(%rdi) + movq %r15,32(%rdi) + + movq 0(%rsp),%r11 + movq 8(%rsp),%r12 + movq 16(%rsp),%r13 + movq 24(%rsp),%r14 + movq 32(%rsp),%r15 + movq 40(%rsp),%rbx + movq 48(%rsp),%rbp + + movq %r11,%rsp + + ret +/* + * 64-bit field multiplication and squaring using the bottom 4-limbs of + * two field elements having 5-limb representation such that the fifth + * limb is zero. A field element in 5-limb form is reported as output + * such that the fifth limb is of at most 33 bits. 
+ */ + .p2align 4 + .global secp256k1_fe_mul_44to5 + .type secp256k1_fe_mul_44to5, %function + +secp256k1_fe_mul_44to5: + + movq %rsp,%r11 + subq $48,%rsp + + movq %r11,0(%rsp) + movq %r12,8(%rsp) + movq %r13,16(%rsp) + movq %r14,24(%rsp) + movq %r15,32(%rsp) + movq %rbx,40(%rsp) + + movq %rdx,%rcx + movq $0x1000003D1,%rbx + + movq 8(%rsi),%rax + mulq 24(%rcx) + movq %rax,%r8 + xorq %r9,%r9 + movq %rdx,%r10 + xorq %r11,%r11 + + movq 16(%rsi),%rax + mulq 16(%rcx) + addq %rax,%r8 + adcq $0,%r9 + addq %rdx,%r10 + adcq $0,%r11 + + movq 24(%rsi),%rax + mulq 8(%rcx) + addq %rax,%r8 + adcq $0,%r9 + addq %rdx,%r10 + adcq $0,%r11 + + movq 16(%rsi),%rax + mulq 24(%rcx) + addq %rax,%r10 + adcq $0,%r11 + movq %rdx,%r12 + xorq %r13,%r13 + + movq 24(%rsi),%rax + mulq 16(%rcx) + addq %rax,%r10 + adcq $0,%r11 + addq %rdx,%r12 + adcq $0,%r13 + + movq %rbx,%rax + mulq %r10 + imul %rbx,%r11 + movq %rax,%r10 + addq %rdx,%r11 + + movq 24(%rsi),%rax + mulq 24(%rcx) + addq %rax,%r12 + adcq $0,%r13 + + movq %rbx,%rax + mulq %rdx + movq %rax,%r14 + movq %rdx,%r15 + + movq %rbx,%rax + mulq %r12 + imul %rbx,%r13 + movq %rax,%r12 + addq %rdx,%r13 + + movq 0(%rsi),%rax + mulq 24(%rcx) + addq %rax,%r14 + adcq $0,%r15 + addq %rdx,%r8 + adcq $0,%r9 + + movq 8(%rsi),%rax + mulq 16(%rcx) + addq %rax,%r14 + adcq $0,%r15 + addq %rdx,%r8 + adcq $0,%r9 + + movq 16(%rsi),%rax + mulq 8(%rcx) + addq %rax,%r14 + adcq $0,%r15 + addq %rdx,%r8 + adcq $0,%r9 + + movq 24(%rsi),%rax + mulq 0(%rcx) + addq %rax,%r14 + adcq $0,%r15 + addq %rdx,%r8 + adcq $0,%r9 + + movq %rbx,%rax + mulq %r8 + imul %rbx,%r9 + movq %rax,%r8 + addq %rdx,%r9 + + movq 0(%rsi),%rax + mulq 0(%rcx) + addq %rax,%r8 + adcq $0,%r9 + addq %rdx,%r10 + adcq $0,%r11 + + movq 0(%rsi),%rax + mulq 8(%rcx) + addq %rax,%r10 + adcq $0,%r11 + addq %rdx,%r12 + adcq $0,%r13 + + movq 8(%rsi),%rax + mulq 0(%rcx) + addq %rax,%r10 + adcq $0,%r11 + addq %rdx,%r12 + adcq $0,%r13 + + movq 0(%rsi),%rax + mulq 16(%rcx) + addq %rax,%r12 + adcq $0,%r13 + addq %rdx,%r14 + adcq $0,%r15 + + movq 8(%rsi),%rax + mulq 8(%rcx) + addq %rax,%r12 + adcq $0,%r13 + addq %rdx,%r14 + adcq $0,%r15 + + movq 16(%rsi),%rax + mulq 0(%rcx) + addq %rax,%r12 + adcq $0,%r13 + addq %rdx,%r14 + adcq $0,%r15 + + addq %r9,%r10 + adcq $0,%r11 + addq %r11,%r12 + adcq $0,%r13 + addq %r13,%r14 + adcq $0,%r15 + + movq %r8,0(%rdi) + movq %r10,8(%rdi) + movq %r12,16(%rdi) + movq %r14,24(%rdi) + movq %r15,32(%rdi) + + movq 0(%rsp),%r11 + movq 8(%rsp),%r12 + movq 16(%rsp),%r13 + movq 24(%rsp),%r14 + movq 32(%rsp),%r15 + movq 40(%rsp),%rbx + + movq %r11,%rsp + + ret + + + .p2align 4 + .global secp256k1_fe_sqr_4to5 + .type secp256k1_fe_sqr_4to5, %function + +secp256k1_fe_sqr_4to5: + + movq %rsp,%r11 + subq $64,%rsp + + movq %r11,0(%rsp) + movq %r12,8(%rsp) + movq %r13,16(%rsp) + movq %r14,24(%rsp) + movq %r15,32(%rsp) + movq %rbx,40(%rsp) + movq %rbp,48(%rsp) + movq %rdi,56(%rsp) + + movq 0(%rsi),%rbx + movq 8(%rsi),%rbp + movq 16(%rsi),%rcx + movq 24(%rsi),%rdi + + movq $0x1000003D1,%rsi + + movq %rbp,%rax + mulq %rdi + movq %rax,%r8 + xorq %r9,%r9 + movq %rdx,%r10 + xorq %r11,%r11 + addq %rax,%r8 + adcq $0,%r9 + addq %rdx,%r10 + adcq $0,%r11 + + movq %rcx,%rax + mulq %rcx + addq %rax,%r8 + adcq $0,%r9 + addq %rdx,%r10 + adcq $0,%r11 + + movq %rcx,%rax + mulq %rdi + addq %rax,%r10 + adcq $0,%r11 + movq %rdx,%r12 + xorq %r13,%r13 + addq %rax,%r10 + adcq $0,%r11 + addq %rdx,%r12 + adcq $0,%r13 + + movq %rsi,%rax + mulq %r10 + imul %rsi,%r11 + movq %rax,%r10 + addq %rdx,%r11 + + movq %rdi,%rax + mulq %rdi + addq %rax,%r12 + adcq 
$0,%r13 + + movq %rsi,%rax + mulq %rdx + movq %rax,%r14 + movq %rdx,%r15 + + movq %rsi,%rax + mulq %r12 + imul %rsi,%r13 + movq %rax,%r12 + addq %rdx,%r13 + + movq %rbx,%rax + mulq %rdi + addq %rax,%r14 + adcq $0,%r15 + addq %rdx,%r8 + adcq $0,%r9 + addq %rax,%r14 + adcq $0,%r15 + addq %rdx,%r8 + adcq $0,%r9 + + movq %rbp,%rax + mulq %rcx + addq %rax,%r14 + adcq $0,%r15 + addq %rdx,%r8 + adcq $0,%r9 + addq %rax,%r14 + adcq $0,%r15 + addq %rdx,%r8 + adcq $0,%r9 + + movq %rsi,%rax + mulq %r8 + imul %rsi,%r9 + movq %rax,%r8 + addq %rdx,%r9 + + movq %rbx,%rax + mulq %rbx + addq %rax,%r8 + adcq $0,%r9 + addq %rdx,%r10 + adcq $0,%r11 + + movq %rbx,%rax + mulq %rbp + addq %rax,%r10 + adcq $0,%r11 + addq %rdx,%r12 + adcq $0,%r13 + addq %rax,%r10 + adcq $0,%r11 + addq %rdx,%r12 + adcq $0,%r13 + + movq %rbx,%rax + mulq %rcx + addq %rax,%r12 + adcq $0,%r13 + addq %rdx,%r14 + adcq $0,%r15 + addq %rax,%r12 + adcq $0,%r13 + addq %rdx,%r14 + adcq $0,%r15 + + movq %rbp,%rax + mulq %rbp + addq %rax,%r12 + adcq $0,%r13 + addq %rdx,%r14 + adcq $0,%r15 + + addq %r9,%r10 + adcq $0,%r11 + addq %r11,%r12 + adcq $0,%r13 + addq %r13,%r14 + adcq $0,%r15 + + movq 56(%rsp),%rdi + + movq %r8,0(%rdi) + movq %r10,8(%rdi) + movq %r12,16(%rdi) + movq %r14,24(%rdi) + movq %r15,32(%rdi) + + movq 0(%rsp),%r11 + movq 8(%rsp),%r12 + movq 16(%rsp),%r13 + movq 24(%rsp),%r14 + movq 32(%rsp),%r15 + movq 40(%rsp),%rbx + movq 48(%rsp),%rbp + + movq %r11,%rsp + + ret +/* + * 64-bit field multiplication and squaring using the bottom 4-limbs of + * two field elements having 5-limb representation such that the fifth + * limb is zero. A field element in 5-limb form is reported as output + * such that the fifth limb is zero. + */ + .p2align 4 + .global secp256k1_fe_mul_44to4 + .type secp256k1_fe_mul_44to4, %function + +secp256k1_fe_mul_44to4: + + movq %rsp,%r11 + subq $48,%rsp + + movq %r11,0(%rsp) + movq %r12,8(%rsp) + movq %r13,16(%rsp) + movq %r14,24(%rsp) + movq %r15,32(%rsp) + movq %rbx,40(%rsp) + + movq %rdx,%rcx + movq $0x1000003D1,%rbx + + movq 8(%rsi),%rax + mulq 24(%rcx) + movq %rax,%r8 + xorq %r9,%r9 + movq %rdx,%r10 + xorq %r11,%r11 + + movq 16(%rsi),%rax + mulq 16(%rcx) + addq %rax,%r8 + adcq $0,%r9 + addq %rdx,%r10 + adcq $0,%r11 + + movq 24(%rsi),%rax + mulq 8(%rcx) + addq %rax,%r8 + adcq $0,%r9 + addq %rdx,%r10 + adcq $0,%r11 + + movq 16(%rsi),%rax + mulq 24(%rcx) + addq %rax,%r10 + adcq $0,%r11 + movq %rdx,%r12 + xorq %r13,%r13 + + movq 24(%rsi),%rax + mulq 16(%rcx) + addq %rax,%r10 + adcq $0,%r11 + addq %rdx,%r12 + adcq $0,%r13 + + movq %rbx,%rax + mulq %r10 + imul %rbx,%r11 + movq %rax,%r10 + addq %rdx,%r11 + + movq 24(%rsi),%rax + mulq 24(%rcx) + addq %rax,%r12 + adcq $0,%r13 + + movq %rbx,%rax + mulq %rdx + movq %rax,%r14 + movq %rdx,%r15 + + movq %rbx,%rax + mulq %r12 + imul %rbx,%r13 + movq %rax,%r12 + addq %rdx,%r13 + + movq 0(%rsi),%rax + mulq 24(%rcx) + addq %rax,%r14 + adcq $0,%r15 + addq %rdx,%r8 + adcq $0,%r9 + + movq 8(%rsi),%rax + mulq 16(%rcx) + addq %rax,%r14 + adcq $0,%r15 + addq %rdx,%r8 + adcq $0,%r9 + + movq 16(%rsi),%rax + mulq 8(%rcx) + addq %rax,%r14 + adcq $0,%r15 + addq %rdx,%r8 + adcq $0,%r9 + + movq 24(%rsi),%rax + mulq 0(%rcx) + addq %rax,%r14 + adcq $0,%r15 + addq %rdx,%r8 + adcq $0,%r9 + + movq %rbx,%rax + mulq %r8 + imul %rbx,%r9 + movq %rax,%r8 + addq %rdx,%r9 + + movq 0(%rsi),%rax + mulq 0(%rcx) + addq %rax,%r8 + adcq $0,%r9 + addq %rdx,%r10 + adcq $0,%r11 + + movq 0(%rsi),%rax + mulq 8(%rcx) + addq %rax,%r10 + adcq $0,%r11 + addq %rdx,%r12 + adcq $0,%r13 + + movq 8(%rsi),%rax + mulq 
0(%rcx) + addq %rax,%r10 + adcq $0,%r11 + addq %rdx,%r12 + adcq $0,%r13 + + movq 0(%rsi),%rax + mulq 16(%rcx) + addq %rax,%r12 + adcq $0,%r13 + addq %rdx,%r14 + adcq $0,%r15 + + movq 8(%rsi),%rax + mulq 8(%rcx) + addq %rax,%r12 + adcq $0,%r13 + addq %rdx,%r14 + adcq $0,%r15 + + movq 16(%rsi),%rax + mulq 0(%rcx) + addq %rax,%r12 + adcq $0,%r13 + addq %rdx,%r14 + adcq $0,%r15 + + addq %r9,%r10 + adcq $0,%r11 + addq %r11,%r12 + adcq $0,%r13 + addq %r13,%r14 + adcq $0,%r15 + + movq %rbx,%rax + mulq %r15 + xorq %r11,%r11 + addq %rax,%r8 + adcq %rdx,%r10 + adcq $0,%r12 + adcq $0,%r14 + cmovc %rbx,%r11 + addq %r11,%r8 + adcq $0,%r10 + + movq %r8,0(%rdi) + movq %r10,8(%rdi) + movq %r12,16(%rdi) + movq %r14,24(%rdi) + movq $0,32(%rdi) + + movq 0(%rsp),%r11 + movq 8(%rsp),%r12 + movq 16(%rsp),%r13 + movq 24(%rsp),%r14 + movq 32(%rsp),%r15 + movq 40(%rsp),%rbx + + movq %r11,%rsp + + ret + + + .p2align 4 + .global secp256k1_fe_sqr_4to4 + .type secp256k1_fe_sqr_4to4, %function + +secp256k1_fe_sqr_4to4: + + movq %rsp,%r11 + subq $64,%rsp + + movq %r11,0(%rsp) + movq %r12,8(%rsp) + movq %r13,16(%rsp) + movq %r14,24(%rsp) + movq %r15,32(%rsp) + movq %rbx,40(%rsp) + movq %rbp,48(%rsp) + movq %rdi,56(%rsp) + + movq 0(%rsi),%rbx + movq 8(%rsi),%rbp + movq 16(%rsi),%rcx + movq 24(%rsi),%rdi + + movq $0x1000003D1,%rsi + + movq %rbp,%rax + mulq %rdi + movq %rax,%r8 + xorq %r9,%r9 + movq %rdx,%r10 + xorq %r11,%r11 + addq %rax,%r8 + adcq $0,%r9 + addq %rdx,%r10 + adcq $0,%r11 + + movq %rcx,%rax + mulq %rcx + addq %rax,%r8 + adcq $0,%r9 + addq %rdx,%r10 + adcq $0,%r11 + + movq %rcx,%rax + mulq %rdi + addq %rax,%r10 + adcq $0,%r11 + movq %rdx,%r12 + xorq %r13,%r13 + addq %rax,%r10 + adcq $0,%r11 + addq %rdx,%r12 + adcq $0,%r13 + + movq %rsi,%rax + mulq %r10 + imul %rsi,%r11 + movq %rax,%r10 + addq %rdx,%r11 + + movq %rdi,%rax + mulq %rdi + addq %rax,%r12 + adcq $0,%r13 + + movq %rsi,%rax + mulq %rdx + movq %rax,%r14 + movq %rdx,%r15 + + movq %rsi,%rax + mulq %r12 + imul %rsi,%r13 + movq %rax,%r12 + addq %rdx,%r13 + + movq %rbx,%rax + mulq %rdi + addq %rax,%r14 + adcq $0,%r15 + addq %rdx,%r8 + adcq $0,%r9 + addq %rax,%r14 + adcq $0,%r15 + addq %rdx,%r8 + adcq $0,%r9 + + movq %rbp,%rax + mulq %rcx + addq %rax,%r14 + adcq $0,%r15 + addq %rdx,%r8 + adcq $0,%r9 + addq %rax,%r14 + adcq $0,%r15 + addq %rdx,%r8 + adcq $0,%r9 + + movq %rsi,%rax + mulq %r8 + imul %rsi,%r9 + movq %rax,%r8 + addq %rdx,%r9 + + movq %rbx,%rax + mulq %rbx + addq %rax,%r8 + adcq $0,%r9 + addq %rdx,%r10 + adcq $0,%r11 + + movq %rbx,%rax + mulq %rbp + addq %rax,%r10 + adcq $0,%r11 + addq %rdx,%r12 + adcq $0,%r13 + addq %rax,%r10 + adcq $0,%r11 + addq %rdx,%r12 + adcq $0,%r13 + + movq %rbx,%rax + mulq %rcx + addq %rax,%r12 + adcq $0,%r13 + addq %rdx,%r14 + adcq $0,%r15 + addq %rax,%r12 + adcq $0,%r13 + addq %rdx,%r14 + adcq $0,%r15 + + movq %rbp,%rax + mulq %rbp + addq %rax,%r12 + adcq $0,%r13 + addq %rdx,%r14 + adcq $0,%r15 + + addq %r9,%r10 + adcq $0,%r11 + addq %r11,%r12 + adcq $0,%r13 + addq %r13,%r14 + adcq $0,%r15 + + movq %rsi,%rax + mulq %r15 + xorq %r11,%r11 + addq %rax,%r8 + adcq %rdx,%r10 + adcq $0,%r12 + adcq $0,%r14 + cmovc %rsi,%r11 + addq %r11,%r8 + adcq $0,%r10 + + movq 56(%rsp),%rdi + + movq %r8,0(%rdi) + movq %r10,8(%rdi) + movq %r12,16(%rdi) + movq %r14,24(%rdi) + movq $0,32(%rdi) + + movq 0(%rsp),%r11 + movq 8(%rsp),%r12 + movq 16(%rsp),%r13 + movq 24(%rsp),%r14 + movq 32(%rsp),%r15 + movq 40(%rsp),%rbx + movq 48(%rsp),%rbp + + movq %r11,%rsp + + ret +/* + * 64-bit field multiplication in which the first argument has 4-limb + * 
and the second argument has 5-limb representations such that the + * fifth limb is of at most 64 bits. The second argument is fully + * reduced to 4-limb form and then field multiplication is performed. + * A field element in 5-limb form is reported as output such that the + * fifth limb is of at most 33 bits. + */ + .p2align 4 + .global secp256k1_fe_mul_45to5 + .type secp256k1_fe_mul_45to5, %function + +secp256k1_fe_mul_45to5: + + movq %rsp,%r11 + subq $72,%rsp + + movq %r11,0(%rsp) + movq %r12,8(%rsp) + movq %r13,16(%rsp) + movq %r14,24(%rsp) + movq %r15,32(%rsp) + movq %rbx,40(%rsp) + movq %rbp,48(%rsp) + movq %rdi,56(%rsp) + + movq $0x1000003d1,%rcx + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%rbx + movq 24(%rdx),%rbp + movq 32(%rdx),%rax + + mulq %rcx + xorq %rdi,%rdi + addq %r8,%rax + adcq %r9,%rdx + adcq $0,%rbx + adcq $0,%rbp + cmovc %rcx,%rdi + addq %rax,%rdi + adcq $0,%rdx + movq %rdx,64(%rsp) + + movq 8(%rsi),%rax + mulq %rbp + movq %rax,%r8 + xorq %r9,%r9 + movq %rdx,%r10 + xorq %r11,%r11 + + movq 16(%rsi),%rax + mulq %rbx + addq %rax,%r8 + adcq $0,%r9 + addq %rdx,%r10 + adcq $0,%r11 + + movq 24(%rsi),%rax + mulq 64(%rsp) + addq %rax,%r8 + adcq $0,%r9 + addq %rdx,%r10 + adcq $0,%r11 + + movq 16(%rsi),%rax + mulq %rbp + addq %rax,%r10 + adcq $0,%r11 + movq %rdx,%r12 + xorq %r13,%r13 + + movq 24(%rsi),%rax + mulq %rbx + addq %rax,%r10 + adcq $0,%r11 + addq %rdx,%r12 + adcq $0,%r13 + + movq %rcx,%rax + mulq %r10 + imul %rcx,%r11 + movq %rax,%r10 + addq %rdx,%r11 + + movq 24(%rsi),%rax + mulq %rbp + addq %rax,%r12 + adcq $0,%r13 + + movq %rcx,%rax + mulq %rdx + movq %rax,%r14 + movq %rdx,%r15 + + movq %rcx,%rax + mulq %r12 + imul %rcx,%r13 + movq %rax,%r12 + addq %rdx,%r13 + + movq 0(%rsi),%rax + mulq %rbp + addq %rax,%r14 + adcq $0,%r15 + addq %rdx,%r8 + adcq $0,%r9 + + movq 8(%rsi),%rax + mulq %rbx + addq %rax,%r14 + adcq $0,%r15 + addq %rdx,%r8 + adcq $0,%r9 + + movq 16(%rsi),%rax + mulq 64(%rsp) + addq %rax,%r14 + adcq $0,%r15 + addq %rdx,%r8 + adcq $0,%r9 + + movq 24(%rsi),%rax + mulq %rdi + addq %rax,%r14 + adcq $0,%r15 + addq %rdx,%r8 + adcq $0,%r9 + + movq %rcx,%rax + mulq %r8 + imul %rcx,%r9 + movq %rax,%r8 + addq %rdx,%r9 + + movq 0(%rsi),%rax + mulq %rdi + addq %rax,%r8 + adcq $0,%r9 + addq %rdx,%r10 + adcq $0,%r11 + + movq 0(%rsi),%rax + mulq 64(%rsp) + addq %rax,%r10 + adcq $0,%r11 + addq %rdx,%r12 + adcq $0,%r13 + + movq 8(%rsi),%rax + mulq %rdi + addq %rax,%r10 + adcq $0,%r11 + addq %rdx,%r12 + adcq $0,%r13 + + movq 0(%rsi),%rax + mulq %rbx + addq %rax,%r12 + adcq $0,%r13 + addq %rdx,%r14 + adcq $0,%r15 + + movq 8(%rsi),%rax + mulq 64(%rsp) + addq %rax,%r12 + adcq $0,%r13 + addq %rdx,%r14 + adcq $0,%r15 + + movq 16(%rsi),%rax + mulq %rdi + addq %rax,%r12 + adcq $0,%r13 + addq %rdx,%r14 + adcq $0,%r15 + + addq %r9,%r10 + adcq $0,%r11 + addq %r11,%r12 + adcq $0,%r13 + addq %r13,%r14 + adcq $0,%r15 + + movq 56(%rsp),%rdi + + movq %r8,0(%rdi) + movq %r10,8(%rdi) + movq %r12,16(%rdi) + movq %r14,24(%rdi) + movq %r15,32(%rdi) + + movq 0(%rsp),%r11 + movq 8(%rsp),%r12 + movq 16(%rsp),%r13 + movq 24(%rsp),%r14 + movq 32(%rsp),%r15 + movq 40(%rsp),%rbx + movq 48(%rsp),%rbp + + movq %r11,%rsp + + ret diff --git a/src/asm/field_5x64_x86_64_maax.s b/src/asm/field_5x64_x86_64_maax.s new file mode 100644 index 0000000000..29a46abc35 --- /dev/null +++ b/src/asm/field_5x64_x86_64_maax.s @@ -0,0 +1,883 @@ +/************************************************************************ + * Field multiplication and squaring assemblies using representation of * + * field elements 
in base 2^{64}. * + * Major instructions used in the assemblies are mulx/adcx/adox. * + * * + * Copyright (c) 2021 Kaushik Nath * + * Distributed under the MIT software license, see the accompanying * + * file COPYING or https://www.opensource.org/licenses/mit-license.php. * + ***********************************************************************/ + + .att_syntax + .text +/* + * 64-bit field multiplication and squaring using the bottom 4-limbs of + * two field elements having 5-limb representation such that the fifth + * limb is of at most 64 bits. The 5-limb inputs are fully reduced first + * to 4-limb forms, then multiplied, after which a field element in 5-limb + * form is reported as output. The fifth limb of the output has at most + * 33 bits. + */ + .p2align 4 + .global secp256k1_fe_mul_55to5 + .type secp256k1_fe_mul_55to5, %function + +secp256k1_fe_mul_55to5: + + movq %rsp,%r11 + subq $96,%rsp + + movq %r11,0(%rsp) + movq %r12,8(%rsp) + movq %r13,16(%rsp) + movq %r14,24(%rsp) + movq %r15,32(%rsp) + movq %rbp,40(%rsp) + movq %rbx,48(%rsp) + movq %rdi,56(%rsp) + + movq 0(%rsi),%rax + movq 8(%rsi),%rbx + movq 16(%rsi),%rdi + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + + movq $0x1000003D1,%rdx + xorq %rcx,%rcx + mulx 32(%rsi),%r13,%r14 + movq 24(%rsi),%rsi + adcx %r13,%rax + adcx %r14,%rbx + adcx %rcx,%rdi + adcx %rcx,%rsi + cmovc %rdx,%rcx + xorq %r13,%r13 + adcx %rcx,%rax + adcx %r13,%rbx + + xorq %rcx,%rcx + mulx %r12,%r13,%r14 + adcx %r13,%r8 + adcx %r14,%r9 + adcx %rcx,%r10 + adcx %rcx,%r11 + cmovc %rdx,%rcx + xorq %r13,%r13 + adcx %rcx,%r8 + adcx %r13,%r9 + + movq %r8,64(%rsp) + movq %r9,72(%rsp) + movq %r10,80(%rsp) + movq %r11,88(%rsp) + + xorq %r13,%r13 + movq 64(%rsp),%rdx + mulx %rax,%r8,%r9 + mulx %rbx,%rcx,%r10 + adcx %rcx,%r9 + mulx %rdi,%rcx,%r11 + adcx %rcx,%r10 + mulx %rsi,%rcx,%r12 + adcx %rcx,%r11 + adcx %r13,%r12 + + xorq %r14,%r14 + movq 72(%rsp),%rdx + mulx %rax,%rcx,%rbp + adcx %rcx,%r9 + adox %rbp,%r10 + mulx %rbx,%rcx,%rbp + adcx %rcx,%r10 + adox %rbp,%r11 + mulx %rdi,%rcx,%rbp + adcx %rcx,%r11 + adox %rbp,%r12 + mulx %rsi,%rcx,%rbp + adcx %rcx,%r12 + adox %rbp,%r13 + adcx %r14,%r13 + + xorq %r15,%r15 + movq 80(%rsp),%rdx + mulx %rax,%rcx,%rbp + adcx %rcx,%r10 + adox %rbp,%r11 + mulx %rbx,%rcx,%rbp + adcx %rcx,%r11 + adox %rbp,%r12 + mulx %rdi,%rcx,%rbp + adcx %rcx,%r12 + adox %rbp,%r13 + mulx %rsi,%rcx,%rbp + adcx %rcx,%r13 + adox %rbp,%r14 + adcx %r15,%r14 + + xorq %rdx,%rdx + movq 88(%rsp),%rdx + mulx %rax,%rcx,%rbp + adcx %rcx,%r11 + adox %rbp,%r12 + mulx %rbx,%rcx,%rbp + adcx %rcx,%r12 + adox %rbp,%r13 + mulx %rdi,%rcx,%rbp + adcx %rcx,%r13 + adox %rbp,%r14 + mulx %rsi,%rcx,%rbp + adcx %rcx,%r14 + adox %rbp,%r15 + adcq $0,%r15 + + xorq %rbp,%rbp + movq $0x1000003D1,%rdx + mulx %r12,%rax,%r12 + adcx %rax,%r8 + adox %r12,%r9 + mulx %r13,%rcx,%r13 + adcx %rcx,%r9 + adox %r13,%r10 + mulx %r14,%rcx,%r14 + adcx %rcx,%r10 + adox %r14,%r11 + mulx %r15,%rcx,%r15 + adcx %rcx,%r11 + adox %rbp,%r15 + adcx %rbp,%r15 + + movq 56(%rsp),%rdi + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r15,32(%rdi) + + movq 0(%rsp),%r11 + movq 8(%rsp),%r12 + movq 16(%rsp),%r13 + movq 24(%rsp),%r14 + movq 32(%rsp),%r15 + movq 40(%rsp),%rbp + movq 48(%rsp),%rbx + + movq %r11,%rsp + + ret + + + .p2align 4 + .global secp256k1_fe_sqr_5to5 + .type secp256k1_fe_sqr_5to5, %function + +secp256k1_fe_sqr_5to5: + + movq %rsp,%r11 + subq $56,%rsp + + movq %r11,0(%rsp) + movq %r12,8(%rsp) + movq 
%r13,16(%rsp) + movq %r14,24(%rsp) + movq %r15,32(%rsp) + movq %rbp,40(%rsp) + movq %rbx,48(%rsp) + + movq 0(%rsi),%rbx + movq 8(%rsi),%rbp + movq 16(%rsi),%rax + + movq $0x1000003D1,%rdx + xorq %r15,%r15 + mulx 32(%rsi),%r13,%r14 + movq 24(%rsi),%rsi + adcx %r13,%rbx + adcx %r14,%rbp + adcx %r15,%rax + adcx %r15,%rsi + cmovc %rdx,%r15 + xorq %r13,%r13 + adcx %r15,%rbx + adcx %r13,%rbp + + xorq %r13,%r13 + movq %rbx,%rdx + mulx %rbp,%r9,%r10 + mulx %rax,%rcx,%r11 + adcx %rcx,%r10 + mulx %rsi,%rcx,%r12 + adcx %rcx,%r11 + adcx %r13,%r12 + + xorq %r14,%r14 + movq %rbp,%rdx + mulx %rax,%rcx,%rdx + adcx %rcx,%r11 + adox %rdx,%r12 + movq %rbp,%rdx + mulx %rsi,%rcx,%rdx + adcx %rcx,%r12 + adox %rdx,%r13 + adcx %r14,%r13 + + xorq %r15,%r15 + movq %rax,%rdx + mulx %rsi,%rcx,%r14 + adcx %rcx,%r13 + adcx %r15,%r14 + + shld $1,%r14,%r15 + shld $1,%r13,%r14 + shld $1,%r12,%r13 + shld $1,%r11,%r12 + shld $1,%r10,%r11 + shld $1,%r9,%r10 + addq %r9,%r9 + + xorq %rdx,%rdx + movq %rbx,%rdx + mulx %rdx,%r8,%rdx + adcx %rdx,%r9 + + movq %rbp,%rdx + mulx %rdx,%rcx,%rdx + adcx %rcx,%r10 + adcx %rdx,%r11 + + movq %rax,%rdx + mulx %rdx,%rcx,%rdx + adcx %rcx,%r12 + adcx %rdx,%r13 + + movq %rsi,%rdx + mulx %rdx,%rcx,%rdx + adcx %rcx,%r14 + adcx %rdx,%r15 + + xorq %rbp,%rbp + movq $0x1000003D1,%rdx + mulx %r12,%rax,%r12 + adcx %rax,%r8 + adox %r12,%r9 + mulx %r13,%rcx,%r13 + adcx %rcx,%r9 + adox %r13,%r10 + mulx %r14,%rcx,%r14 + adcx %rcx,%r10 + adox %r14,%r11 + mulx %r15,%rcx,%r15 + adcx %rcx,%r11 + adox %rbp,%r15 + adcx %rbp,%r15 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r15,32(%rdi) + + movq 0(%rsp),%r11 + movq 8(%rsp),%r12 + movq 16(%rsp),%r13 + movq 24(%rsp),%r14 + movq 32(%rsp),%r15 + movq 40(%rsp),%rbp + movq 48(%rsp),%rbx + + movq %r11,%rsp + + ret +/* + * 64-bit field multiplication and squaring using the bottom 4-limbs of + * two field elements having 5-limb representation such that the fifth + * limb is zero. A field element in 5-limb form is reported as output + * such that the fifth limb is of at most 33 bits. 
+ */ + .p2align 4 + .global secp256k1_fe_mul_44to5 + .type secp256k1_fe_mul_44to5, %function + +secp256k1_fe_mul_44to5: + + push %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + + movq %rdx,%rbx + + xorq %r13,%r13 + movq 0(%rbx),%rdx + mulx 0(%rsi),%r8,%r9 + mulx 8(%rsi),%rcx,%r10 + adcx %rcx,%r9 + mulx 16(%rsi),%rcx,%r11 + adcx %rcx,%r10 + mulx 24(%rsi),%rcx,%r12 + adcx %rcx,%r11 + adcx %r13,%r12 + + xorq %r14,%r14 + movq 8(%rbx),%rdx + mulx 0(%rsi),%rcx,%rbp + adcx %rcx,%r9 + adox %rbp,%r10 + mulx 8(%rsi),%rcx,%rbp + adcx %rcx,%r10 + adox %rbp,%r11 + mulx 16(%rsi),%rcx,%rbp + adcx %rcx,%r11 + adox %rbp,%r12 + mulx 24(%rsi),%rcx,%rbp + adcx %rcx,%r12 + adox %rbp,%r13 + adcx %r14,%r13 + + xorq %r15,%r15 + movq 16(%rbx),%rdx + mulx 0(%rsi),%rcx,%rbp + adcx %rcx,%r10 + adox %rbp,%r11 + mulx 8(%rsi),%rcx,%rbp + adcx %rcx,%r11 + adox %rbp,%r12 + mulx 16(%rsi),%rcx,%rbp + adcx %rcx,%r12 + adox %rbp,%r13 + mulx 24(%rsi),%rcx,%rbp + adcx %rcx,%r13 + adox %rbp,%r14 + adcx %r15,%r14 + + xorq %rax,%rax + movq 24(%rbx),%rdx + mulx 0(%rsi),%rcx,%rbp + adcx %rcx,%r11 + adox %rbp,%r12 + mulx 8(%rsi),%rcx,%rbp + adcx %rcx,%r12 + adox %rbp,%r13 + mulx 16(%rsi),%rcx,%rbp + adcx %rcx,%r13 + adox %rbp,%r14 + mulx 24(%rsi),%rcx,%rbp + adcx %rcx,%r14 + adox %rbp,%r15 + adcx %rax,%r15 + + xorq %rbp,%rbp + movq $0x1000003D1,%rdx + mulx %r12,%rax,%r12 + adcx %rax,%r8 + adox %r12,%r9 + mulx %r13,%rcx,%r13 + adcx %rcx,%r9 + adox %r13,%r10 + mulx %r14,%rcx,%r14 + adcx %rcx,%r10 + adox %r14,%r11 + mulx %r15,%rcx,%r15 + adcx %rcx,%r11 + adox %rbp,%r15 + adcx %rbp,%r15 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r15,32(%rdi) + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + pop %rbp + + ret + + + .p2align 4 + .global secp256k1_fe_sqr_4to5 + .type secp256k1_fe_sqr_4to5, %function + +secp256k1_fe_sqr_4to5: + + push %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + + movq 0(%rsi),%rbx + movq 8(%rsi),%rbp + movq 16(%rsi),%rax + movq 24(%rsi),%rsi + + xorq %r13,%r13 + movq %rbx,%rdx + mulx %rbp,%r9,%r10 + mulx %rax,%rcx,%r11 + adcx %rcx,%r10 + mulx %rsi,%rcx,%r12 + adcx %rcx,%r11 + adcx %r13,%r12 + + xorq %r14,%r14 + movq %rbp,%rdx + mulx %rax,%rcx,%rdx + adcx %rcx,%r11 + adox %rdx,%r12 + movq %rbp,%rdx + mulx %rsi,%rcx,%rdx + adcx %rcx,%r12 + adox %rdx,%r13 + adcx %r14,%r13 + + xorq %r15,%r15 + movq %rax,%rdx + mulx %rsi,%rcx,%r14 + adcx %rcx,%r13 + adcx %r15,%r14 + + shld $1,%r14,%r15 + shld $1,%r13,%r14 + shld $1,%r12,%r13 + shld $1,%r11,%r12 + shld $1,%r10,%r11 + shld $1,%r9,%r10 + addq %r9,%r9 + + xorq %rdx,%rdx + movq %rbx,%rdx + mulx %rdx,%r8,%rdx + adcx %rdx,%r9 + + movq %rbp,%rdx + mulx %rdx,%rcx,%rdx + adcx %rcx,%r10 + adcx %rdx,%r11 + + movq %rax,%rdx + mulx %rdx,%rcx,%rdx + adcx %rcx,%r12 + adcx %rdx,%r13 + + movq %rsi,%rdx + mulx %rdx,%rcx,%rdx + adcx %rcx,%r14 + adcx %rdx,%r15 + + xorq %rbp,%rbp + movq $0x1000003D1,%rdx + mulx %r12,%rax,%rbx + adcx %rax,%r8 + adox %rbx,%r9 + mulx %r13,%rax,%rbx + adcx %rax,%r9 + adox %rbx,%r10 + mulx %r14,%rax,%rbx + adcx %rax,%r10 + adox %rbx,%r11 + mulx %r15,%rax,%r15 + adcx %rax,%r11 + adox %rbp,%r15 + adcx %rbp,%r15 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r15,32(%rdi) + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + pop %rbp + + ret +/* + * 64-bit field multiplication and squaring using the bottom 4-limbs of + * two field elements having 5-limb representation such that the fifth + * limb is zero. 
A field element in 5-limb form is reported as output + * such that the fifth limb is zero. + */ + .p2align 4 + .global secp256k1_fe_mul_44to4 + .type secp256k1_fe_mul_44to4, %function + +secp256k1_fe_mul_44to4: + + push %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + + movq %rdx,%rbx + + xorq %r13,%r13 + movq 0(%rbx),%rdx + mulx 0(%rsi),%r8,%r9 + mulx 8(%rsi),%rcx,%r10 + adcx %rcx,%r9 + mulx 16(%rsi),%rcx,%r11 + adcx %rcx,%r10 + mulx 24(%rsi),%rcx,%r12 + adcx %rcx,%r11 + adcx %r13,%r12 + + xorq %r14,%r14 + movq 8(%rbx),%rdx + mulx 0(%rsi),%rcx,%rbp + adcx %rcx,%r9 + adox %rbp,%r10 + mulx 8(%rsi),%rcx,%rbp + adcx %rcx,%r10 + adox %rbp,%r11 + mulx 16(%rsi),%rcx,%rbp + adcx %rcx,%r11 + adox %rbp,%r12 + mulx 24(%rsi),%rcx,%rbp + adcx %rcx,%r12 + adox %rbp,%r13 + adcx %r14,%r13 + + xorq %r15,%r15 + movq 16(%rbx),%rdx + mulx 0(%rsi),%rcx,%rbp + adcx %rcx,%r10 + adox %rbp,%r11 + mulx 8(%rsi),%rcx,%rbp + adcx %rcx,%r11 + adox %rbp,%r12 + mulx 16(%rsi),%rcx,%rbp + adcx %rcx,%r12 + adox %rbp,%r13 + mulx 24(%rsi),%rcx,%rbp + adcx %rcx,%r13 + adox %rbp,%r14 + adcx %r15,%r14 + + xorq %rax,%rax + movq 24(%rbx),%rdx + mulx 0(%rsi),%rcx,%rbp + adcx %rcx,%r11 + adox %rbp,%r12 + mulx 8(%rsi),%rcx,%rbp + adcx %rcx,%r12 + adox %rbp,%r13 + mulx 16(%rsi),%rcx,%rbp + adcx %rcx,%r13 + adox %rbp,%r14 + mulx 24(%rsi),%rcx,%rbp + adcx %rcx,%r14 + adox %rbp,%r15 + adcx %rax,%r15 + + xorq %rbp,%rbp + movq $0x1000003D1,%rdx + mulx %r12,%rax,%r12 + adcx %rax,%r8 + adox %r12,%r9 + mulx %r13,%rcx,%r13 + adcx %rcx,%r9 + adox %r13,%r10 + mulx %r14,%rcx,%r14 + adcx %rcx,%r10 + adox %r14,%r11 + mulx %r15,%rcx,%r15 + adcx %rcx,%r11 + adox %rbp,%r15 + adcx %rbp,%r15 + + xorq %rbp,%rbp + mulx %r15,%r14,%r15 + adcx %r14,%r8 + adcx %r15,%r9 + adcx %rbp,%r10 + adcx %rbp,%r11 + cmovc %rdx,%rbp + xorq %rbx,%rbx + adcx %rbp,%r8 + adcx %rbx,%r9 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq $0,32(%rdi) + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + pop %rbp + + ret + + + .p2align 4 + .global secp256k1_fe_sqr_4to4 + .type secp256k1_fe_sqr_4to4, %function + +secp256k1_fe_sqr_4to4: + + push %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + + movq 0(%rsi),%rbx + movq 8(%rsi),%rbp + movq 16(%rsi),%rax + movq 24(%rsi),%rsi + + xorq %r13,%r13 + movq %rbx,%rdx + mulx %rbp,%r9,%r10 + mulx %rax,%rcx,%r11 + adcx %rcx,%r10 + mulx %rsi,%rcx,%r12 + adcx %rcx,%r11 + adcx %r13,%r12 + + xorq %r14,%r14 + movq %rbp,%rdx + mulx %rax,%rcx,%rdx + adcx %rcx,%r11 + adox %rdx,%r12 + movq %rbp,%rdx + mulx %rsi,%rcx,%rdx + adcx %rcx,%r12 + adox %rdx,%r13 + adcx %r14,%r13 + + xorq %r15,%r15 + movq %rax,%rdx + mulx %rsi,%rcx,%r14 + adcx %rcx,%r13 + adcx %r15,%r14 + + shld $1,%r14,%r15 + shld $1,%r13,%r14 + shld $1,%r12,%r13 + shld $1,%r11,%r12 + shld $1,%r10,%r11 + shld $1,%r9,%r10 + addq %r9,%r9 + + xorq %rdx,%rdx + movq %rbx,%rdx + mulx %rdx,%r8,%rdx + adcx %rdx,%r9 + + movq %rbp,%rdx + mulx %rdx,%rcx,%rdx + adcx %rcx,%r10 + adcx %rdx,%r11 + + movq %rax,%rdx + mulx %rdx,%rcx,%rdx + adcx %rcx,%r12 + adcx %rdx,%r13 + + movq %rsi,%rdx + mulx %rdx,%rcx,%rdx + adcx %rcx,%r14 + adcx %rdx,%r15 + + xorq %rbp,%rbp + movq $0x1000003D1,%rdx + mulx %r12,%rax,%rbx + adcx %rax,%r8 + adox %rbx,%r9 + mulx %r13,%rax,%rbx + adcx %rax,%r9 + adox %rbx,%r10 + mulx %r14,%rax,%rbx + adcx %rax,%r10 + adox %rbx,%r11 + mulx %r15,%rax,%r15 + adcx %rax,%r11 + adox %rbp,%r15 + adcx %rbp,%r15 + + xorq %rbp,%rbp + mulx %r15,%r14,%r15 + adcx %r14,%r8 + adcx %r15,%r9 + adcx %rbp,%r10 + adcx %rbp,%r11 + 
cmovc %rdx,%rbp + xorq %rbx,%rbx + adcx %rbp,%r8 + adcx %rbx,%r9 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq $0,32(%rdi) + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + pop %rbp + + ret +/* + * 64-bit field multiplication in which the first argument has 4-limb + * and the second argument has 5-limb representations such that the + * fifth limb is of at most 64 bits. The second argument is fully + * reduced to 4-limb form and then field multiplication is performed. + * A field element in 5-limb form is reported as output such that the + * fifth limb is of at most 33 bits. + */ + .p2align 4 + .global secp256k1_fe_mul_45to5 + .type secp256k1_fe_mul_45to5, %function + +secp256k1_fe_mul_45to5: + + movq %rsp,%r11 + subq $72,%rsp + + movq %r11,0(%rsp) + movq %r12,8(%rsp) + movq %r13,16(%rsp) + movq %r14,24(%rsp) + movq %r15,32(%rsp) + movq %rbp,40(%rsp) + movq %rbx,48(%rsp) + movq %rdi,56(%rsp) + + movq 0(%rdx),%rax + movq 8(%rdx),%rbx + movq 16(%rdx),%r8 + movq 24(%rdx),%rdi + + movq %rdx,%r15 + movq $0x1000003D1,%rdx + xorq %rcx,%rcx + mulx 32(%r15),%r13,%r14 + adcx %r13,%rax + adcx %r14,%rbx + adcx %rcx,%r8 + adcx %rcx,%rdi + cmovc %rdx,%rcx + xorq %r13,%r13 + adcx %rcx,%rax + adcx %r13,%rbx + movq %r8,64(%rsp) + + xorq %r13,%r13 + movq 0(%rsi),%rdx + mulx %rax,%r8,%r9 + mulx %rbx,%rcx,%r10 + adcx %rcx,%r9 + mulx 64(%rsp),%rcx,%r11 + adcx %rcx,%r10 + mulx %rdi,%rcx,%r12 + adcx %rcx,%r11 + adcx %r13,%r12 + + xorq %r14,%r14 + movq 8(%rsi),%rdx + mulx %rax,%rcx,%rbp + adcx %rcx,%r9 + adox %rbp,%r10 + mulx %rbx,%rcx,%rbp + adcx %rcx,%r10 + adox %rbp,%r11 + mulx 64(%rsp),%rcx,%rbp + adcx %rcx,%r11 + adox %rbp,%r12 + mulx %rdi,%rcx,%rbp + adcx %rcx,%r12 + adox %rbp,%r13 + adcx %r14,%r13 + + xorq %r15,%r15 + movq 16(%rsi),%rdx + mulx %rax,%rcx,%rbp + adcx %rcx,%r10 + adox %rbp,%r11 + mulx %rbx,%rcx,%rbp + adcx %rcx,%r11 + adox %rbp,%r12 + mulx 64(%rsp),%rcx,%rbp + adcx %rcx,%r12 + adox %rbp,%r13 + mulx %rdi,%rcx,%rbp + adcx %rcx,%r13 + adox %rbp,%r14 + adcx %r15,%r14 + + xorq %rdx,%rdx + movq 24(%rsi),%rdx + mulx %rax,%rcx,%rbp + adcx %rcx,%r11 + adox %rbp,%r12 + mulx %rbx,%rcx,%rbp + adcx %rcx,%r12 + adox %rbp,%r13 + mulx 64(%rsp),%rcx,%rbp + adcx %rcx,%r13 + adox %rbp,%r14 + mulx %rdi,%rcx,%rbp + adcx %rcx,%r14 + adox %rbp,%r15 + adcq $0,%r15 + + xorq %rbp,%rbp + movq $0x1000003D1,%rdx + mulx %r12,%rax,%r12 + adcx %rax,%r8 + adox %r12,%r9 + mulx %r13,%rcx,%r13 + adcx %rcx,%r9 + adox %r13,%r10 + mulx %r14,%rcx,%r14 + adcx %rcx,%r10 + adox %r14,%r11 + mulx %r15,%rcx,%r15 + adcx %rcx,%r11 + adox %rbp,%r15 + adcx %rbp,%r15 + + movq 56(%rsp),%rdi + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r15,32(%rdi) + + movq 0(%rsp),%r11 + movq 8(%rsp),%r12 + movq 16(%rsp),%r13 + movq 24(%rsp),%r14 + movq 32(%rsp),%r15 + movq 40(%rsp),%rbp + movq 48(%rsp),%rbx + + movq %r11,%rsp + + ret diff --git a/src/asm/field_5x64_x86_64_mxaa.s b/src/asm/field_5x64_x86_64_mxaa.s new file mode 100644 index 0000000000..49e04d05d9 --- /dev/null +++ b/src/asm/field_5x64_x86_64_mxaa.s @@ -0,0 +1,875 @@ +/************************************************************************ + * Field multiplication and squaring assemblies using representation of * + * field elements in base 2^{64}. * + * Major instructions used in the assemblies are mulx/add/adc. * + * * + * Copyright (c) 2021 Kaushik Nath * + * Distributed under the MIT software license, see the accompanying * + * file COPYING or https://www.opensource.org/licenses/mit-license.php. 
* + ***********************************************************************/ + + .att_syntax + .text +/* + * 64-bit field multiplication and squaring using the bottom 4-limbs of + * two field elements having 5-limb representation such that the fifth + * limb is of at most 64 bits. The 5-limb inputs are fully reduced first + * to 4-limb forms, then multiplied, after which a field element in 5-limb + * form is reported as output. The fifth limb of the output has at most + * 33 bits. + */ + .p2align 4 + .global secp256k1_fe_mul_55to5 + .type secp256k1_fe_mul_55to5, %function + +secp256k1_fe_mul_55to5: + + movq %rsp,%r11 + subq $112,%rsp + + movq %r11,0(%rsp) + movq %r12,8(%rsp) + movq %r13,16(%rsp) + movq %r14,24(%rsp) + movq %r15,32(%rsp) + movq %rbp,40(%rsp) + movq %rbx,48(%rsp) + movq %rdi,56(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 0(%rdx),%r12 + movq 8(%rdx),%r13 + movq 16(%rdx),%rdi + movq 24(%rdx),%r15 + movq 32(%rdx),%rax + + movq $0x1000003D1,%rdx + xorq %rcx,%rcx + mulx 32(%rsi),%rbx,%rbp + addq %rbx,%r8 + adcq %rbp,%r9 + adcq $0,%r10 + adcq $0,%r11 + cmovc %rdx,%rcx + addq %rcx,%r8 + adcq $0,%r9 + + xorq %rcx,%rcx + mulx %rax,%rax,%rbx + addq %rax,%r12 + adcq %rbx,%r13 + adcq $0,%rdi + adcq $0,%r15 + cmovc %rdx,%rcx + addq %rcx,%r12 + adcq $0,%r13 + movq %r15,%rsi + + movq %r8,64(%rsp) + movq %r9,72(%rsp) + movq %r10,80(%rsp) + movq %r11,88(%rsp) + movq %r12,96(%rsp) + movq %r13,104(%rsp) + + movq 64(%rsp),%rdx + mulx 96(%rsp),%r8,%r9 + mulx 104(%rsp),%rcx,%r10 + addq %rcx,%r9 + mulx %rdi,%rcx,%r11 + adcq %rcx,%r10 + mulx %rsi,%rcx,%r12 + adcq %rcx,%r11 + adcq $0,%r12 + + movq 72(%rsp),%rdx + mulx 96(%rsp),%rax,%rbx + mulx 104(%rsp),%rcx,%rbp + addq %rcx,%rbx + mulx %rdi,%rcx,%r15 + adcq %rcx,%rbp + mulx %rsi,%rcx,%r13 + adcq %rcx,%r15 + adcq $0,%r13 + addq %rax,%r9 + adcq %rbx,%r10 + adcq %rbp,%r11 + adcq %r15,%r12 + adcq $0,%r13 + + movq 80(%rsp),%rdx + mulx 96(%rsp),%rax,%rbx + mulx 104(%rsp),%rcx,%rbp + addq %rcx,%rbx + mulx %rdi,%rcx,%r15 + adcq %rcx,%rbp + mulx %rsi,%rcx,%r14 + adcq %rcx,%r15 + adcq $0,%r14 + addq %rax,%r10 + adcq %rbx,%r11 + adcq %rbp,%r12 + adcq %r15,%r13 + adcq $0,%r14 + + movq 88(%rsp),%rdx + mulx 96(%rsp),%rax,%rbx + mulx 104(%rsp),%rcx,%rbp + addq %rcx,%rbx + mulx %rdi,%rcx,%r15 + adcq %rcx,%rbp + mulx %rsi,%rcx,%rsi + adcq %rcx,%r15 + adcq $0,%rsi + addq %rax,%r11 + adcq %rbx,%r12 + adcq %rbp,%r13 + adcq %r15,%r14 + adcq $0,%rsi + + movq $0x1000003D1,%rdx + mulx %r12,%r12,%rbx + mulx %r13,%r13,%rcx + addq %rbx,%r13 + mulx %r14,%r14,%rbx + adcq %rcx,%r14 + mulx %rsi,%r15,%rcx + adcq %rbx,%r15 + adcq $0,%rcx + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq $0,%rcx + + movq 56(%rsp),%rdi + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %rcx,32(%rdi) + + movq 0(%rsp),%r11 + movq 8(%rsp),%r12 + movq 16(%rsp),%r13 + movq 24(%rsp),%r14 + movq 32(%rsp),%r15 + movq 40(%rsp),%rbp + movq 48(%rsp),%rbx + + movq %r11,%rsp + + ret + + + .p2align 4 + .global secp256k1_fe_sqr_5to5 + .type secp256k1_fe_sqr_5to5, %function + +secp256k1_fe_sqr_5to5: + + movq %rsp,%r11 + subq $64,%rsp + + movq %r11,0(%rsp) + movq %r12,8(%rsp) + movq %r13,16(%rsp) + movq %r14,24(%rsp) + movq %r15,32(%rsp) + movq %rbp,40(%rsp) + movq %rbx,48(%rsp) + movq %rdi,56(%rsp) + + movq 0(%rsi),%rbp + movq 8(%rsi),%rdi + movq 16(%rsi),%rcx + + movq $0x1000003D1,%rdx + xorq %r15,%r15 + mulx 32(%rsi),%r13,%r14 + movq 24(%rsi),%rsi + addq %r13,%rbp + adcq %r14,%rdi + adcq $0,%rcx + adcq 
$0,%rsi + cmovc %rdx,%r15 + addq %r15,%rbp + adcq $0,%rdi + + movq %rbp,%rdx + mulx %rdi,%r9,%r10 + mulx %rcx,%r8,%r11 + addq %r8,%r10 + mulx %rsi,%rdx,%r12 + adcq %rdx,%r11 + adcq $0,%r12 + + movq %rdi,%rdx + mulx %rcx,%rax,%rbx + mulx %rsi,%rdx,%r13 + addq %rdx,%rbx + adcq $0,%r13 + addq %rax,%r11 + adcq %rbx,%r12 + adcq $0,%r13 + + movq %rcx,%rdx + mulx %rsi,%rax,%r14 + addq %rax,%r13 + adcq $0,%r14 + + movq $0,%r15 + shld $1,%r14,%r15 + shld $1,%r13,%r14 + shld $1,%r12,%r13 + shld $1,%r11,%r12 + shld $1,%r10,%r11 + shld $1,%r9,%r10 + addq %r9,%r9 + + movq %rbp,%rdx + mulx %rdx,%r8,%rax + addq %rax,%r9 + + movq %rdi,%rdx + mulx %rdx,%rax,%rbx + adcq %rax,%r10 + adcq %rbx,%r11 + + movq %rcx,%rdx + mulx %rdx,%rax,%rbx + adcq %rax,%r12 + adcq %rbx,%r13 + + movq %rsi,%rdx + mulx %rdx,%rax,%rbx + adcq %rax,%r14 + adcq %rbx,%r15 + + movq $0x1000003D1,%rdx + + mulx %r12,%r12,%rbx + mulx %r13,%r13,%rcx + addq %rbx,%r13 + + mulx %r14,%r14,%rbx + adcq %rcx,%r14 + + mulx %r15,%r15,%rcx + adcq %rbx,%r15 + adcq $0,%rcx + + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq $0,%rcx + + movq 56(%rsp),%rdi + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %rcx,32(%rdi) + + movq 0(%rsp),%r11 + movq 8(%rsp),%r12 + movq 16(%rsp),%r13 + movq 24(%rsp),%r14 + movq 32(%rsp),%r15 + movq 40(%rsp),%rbp + movq 48(%rsp),%rbx + + movq %r11,%rsp + + ret +/* + * 64-bit field multiplication and squaring using the bottom 4-limbs of + * two field elements having 5-limb representation such that the fifth + * limb is zero. A field element in 5-limb form is reported as output + * such that the fifth limb is of at most 33 bits. + */ + .p2align 4 + .global secp256k1_fe_mul_44to5 + .type secp256k1_fe_mul_44to5, %function + +secp256k1_fe_mul_44to5: + + movq %rsp,%r11 + subq $64,%rsp + + movq %r11,0(%rsp) + movq %r12,8(%rsp) + movq %r13,16(%rsp) + movq %r14,24(%rsp) + movq %r15,32(%rsp) + movq %rbp,40(%rsp) + movq %rbx,48(%rsp) + movq %rdi,56(%rsp) + + movq %rdx,%rdi + + movq 0(%rdi),%rdx + mulx 0(%rsi),%r8,%r9 + mulx 8(%rsi),%rcx,%r10 + addq %rcx,%r9 + mulx 16(%rsi),%rcx,%r11 + adcq %rcx,%r10 + mulx 24(%rsi),%rcx,%r12 + adcq %rcx,%r11 + adcq $0,%r12 + + movq 8(%rdi),%rdx + mulx 0(%rsi),%rax,%rbx + mulx 8(%rsi),%rcx,%rbp + addq %rcx,%rbx + mulx 16(%rsi),%rcx,%r15 + adcq %rcx,%rbp + mulx 24(%rsi),%rcx,%r13 + adcq %rcx,%r15 + adcq $0,%r13 + addq %rax,%r9 + adcq %rbx,%r10 + adcq %rbp,%r11 + adcq %r15,%r12 + adcq $0,%r13 + + movq 16(%rdi),%rdx + mulx 0(%rsi),%rax,%rbx + mulx 8(%rsi),%rcx,%rbp + addq %rcx,%rbx + mulx 16(%rsi),%rcx,%r15 + adcq %rcx,%rbp + mulx 24(%rsi),%rcx,%r14 + adcq %rcx,%r15 + adcq $0,%r14 + addq %rax,%r10 + adcq %rbx,%r11 + adcq %rbp,%r12 + adcq %r15,%r13 + adcq $0,%r14 + + movq 24(%rdi),%rdx + mulx 0(%rsi),%rax,%rbx + mulx 8(%rsi),%rcx,%rbp + addq %rcx,%rbx + mulx 16(%rsi),%rcx,%r15 + adcq %rcx,%rbp + mulx 24(%rsi),%rcx,%rsi + adcq %rcx,%r15 + adcq $0,%rsi + addq %rax,%r11 + adcq %rbx,%r12 + adcq %rbp,%r13 + adcq %r15,%r14 + adcq $0,%rsi + + movq $0x1000003D1,%rdx + mulx %r12,%r12,%rbx + mulx %r13,%r13,%rcx + addq %rbx,%r13 + mulx %r14,%r14,%rbx + adcq %rcx,%r14 + mulx %rsi,%r15,%rcx + adcq %rbx,%r15 + adcq $0,%rcx + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq $0,%rcx + + movq 56(%rsp),%rdi + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %rcx,32(%rdi) + + movq 0(%rsp),%r11 + movq 8(%rsp),%r12 + movq 16(%rsp),%r13 + movq 24(%rsp),%r14 + movq 32(%rsp),%r15 + movq 40(%rsp),%rbp + movq 
48(%rsp),%rbx + + movq %r11,%rsp + + ret + + + .p2align 4 + .global secp256k1_fe_sqr_4to5 + .type secp256k1_fe_sqr_4to5, %function + +secp256k1_fe_sqr_4to5: + + movq %rsp,%r11 + subq $56,%rsp + + movq %r11,0(%rsp) + movq %r12,8(%rsp) + movq %r13,16(%rsp) + movq %r14,24(%rsp) + movq %r15,32(%rsp) + movq %rbp,40(%rsp) + movq %rbx,48(%rsp) + + movq 0(%rsi),%rdx + mulx 8(%rsi),%r9,%r10 + mulx 16(%rsi),%rcx,%r11 + addq %rcx,%r10 + mulx 24(%rsi),%rcx,%r12 + adcq %rcx,%r11 + adcq $0,%r12 + + movq 8(%rsi),%rdx + mulx 16(%rsi),%rax,%rbx + mulx 24(%rsi),%rcx,%r13 + addq %rcx,%rbx + adcq $0,%r13 + addq %rax,%r11 + adcq %rbx,%r12 + adcq $0,%r13 + + movq 16(%rsi),%rdx + mulx 24(%rsi),%rax,%r14 + addq %rax,%r13 + adcq $0,%r14 + + movq $0,%r15 + shld $1,%r14,%r15 + shld $1,%r13,%r14 + shld $1,%r12,%r13 + shld $1,%r11,%r12 + shld $1,%r10,%r11 + shld $1,%r9,%r10 + addq %r9,%r9 + + movq 0(%rsi),%rdx + mulx %rdx,%r8,%rax + addq %rax,%r9 + + movq 8(%rsi),%rdx + mulx %rdx,%rax,%rbx + adcq %rax,%r10 + adcq %rbx,%r11 + + movq 16(%rsi),%rdx + mulx %rdx,%rax,%rbx + adcq %rax,%r12 + adcq %rbx,%r13 + + movq 24(%rsi),%rdx + mulx %rdx,%rax,%rbx + adcq %rax,%r14 + adcq %rbx,%r15 + + movq $0x1000003D1,%rdx + mulx %r12,%r12,%rbx + mulx %r13,%r13,%rcx + addq %rbx,%r13 + mulx %r14,%r14,%rbx + adcq %rcx,%r14 + mulx %r15,%r15,%rcx + adcq %rbx,%r15 + adcq $0,%rcx + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq $0,%rcx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %rcx,32(%rdi) + + movq 0(%rsp),%r11 + movq 8(%rsp),%r12 + movq 16(%rsp),%r13 + movq 24(%rsp),%r14 + movq 32(%rsp),%r15 + movq 40(%rsp),%rbp + movq 48(%rsp),%rbx + + movq %r11,%rsp + + ret +/* + * 64-bit field multiplication and squaring using the bottom 4-limbs of + * two field elements having 5-limb representation such that the fifth + * limb is zero. A field element in 5-limb form is reported as output + * such that the fifth limb is zero. 
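The 33-bit carry limb left by the first reduction pass is multiplied by 2^256 mod p = 0x1000003D1 and added back in, followed by one conditional extra fold, so the stored fifth limb ends up zero.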
+ */ + .p2align 4 + .global secp256k1_fe_mul_44to4 + .type secp256k1_fe_mul_44to4, %function + +secp256k1_fe_mul_44to4: + + movq %rsp,%r11 + subq $64,%rsp + + movq %r11,0(%rsp) + movq %r12,8(%rsp) + movq %r13,16(%rsp) + movq %r14,24(%rsp) + movq %r15,32(%rsp) + movq %rbp,40(%rsp) + movq %rbx,48(%rsp) + movq %rdi,56(%rsp) + + movq %rdx,%rdi + + movq 0(%rdi),%rdx + mulx 0(%rsi),%r8,%r9 + mulx 8(%rsi),%rcx,%r10 + addq %rcx,%r9 + mulx 16(%rsi),%rcx,%r11 + adcq %rcx,%r10 + mulx 24(%rsi),%rcx,%r12 + adcq %rcx,%r11 + adcq $0,%r12 + + movq 8(%rdi),%rdx + mulx 0(%rsi),%rax,%rbx + mulx 8(%rsi),%rcx,%rbp + addq %rcx,%rbx + mulx 16(%rsi),%rcx,%r15 + adcq %rcx,%rbp + mulx 24(%rsi),%rcx,%r13 + adcq %rcx,%r15 + adcq $0,%r13 + addq %rax,%r9 + adcq %rbx,%r10 + adcq %rbp,%r11 + adcq %r15,%r12 + adcq $0,%r13 + + movq 16(%rdi),%rdx + mulx 0(%rsi),%rax,%rbx + mulx 8(%rsi),%rcx,%rbp + addq %rcx,%rbx + mulx 16(%rsi),%rcx,%r15 + adcq %rcx,%rbp + mulx 24(%rsi),%rcx,%r14 + adcq %rcx,%r15 + adcq $0,%r14 + addq %rax,%r10 + adcq %rbx,%r11 + adcq %rbp,%r12 + adcq %r15,%r13 + adcq $0,%r14 + + movq 24(%rdi),%rdx + mulx 0(%rsi),%rax,%rbx + mulx 8(%rsi),%rcx,%rbp + addq %rcx,%rbx + mulx 16(%rsi),%rcx,%r15 + adcq %rcx,%rbp + mulx 24(%rsi),%rcx,%rsi + adcq %rcx,%r15 + adcq $0,%rsi + addq %rax,%r11 + adcq %rbx,%r12 + adcq %rbp,%r13 + adcq %r15,%r14 + adcq $0,%rsi + + movq $0x1000003D1,%rdx + mulx %r12,%r12,%rbx + mulx %r13,%r13,%rcx + addq %rbx,%r13 + mulx %r14,%r14,%rbx + adcq %rcx,%r14 + mulx %rsi,%r15,%rcx + adcq %rbx,%r15 + adcq $0,%rcx + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq $0,%rcx + + xorq %r15,%r15 + mulx %rcx,%r13,%r14 + addq %r13,%r8 + adcq %r14,%r9 + adcq $0,%r10 + adcq $0,%r11 + cmovc %rdx,%r15 + addq %r15,%r8 + adcq $0,%r9 + + movq 56(%rsp),%rdi + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq $0,32(%rdi) + + movq 0(%rsp),%r11 + movq 8(%rsp),%r12 + movq 16(%rsp),%r13 + movq 24(%rsp),%r14 + movq 32(%rsp),%r15 + movq 40(%rsp),%rbp + movq 48(%rsp),%rbx + + movq %r11,%rsp + + ret + + + .p2align 4 + .global secp256k1_fe_sqr_4to4 + .type secp256k1_fe_sqr_4to4, %function + +secp256k1_fe_sqr_4to4: + + movq %rsp,%r11 + subq $56,%rsp + + movq %r11,0(%rsp) + movq %r12,8(%rsp) + movq %r13,16(%rsp) + movq %r14,24(%rsp) + movq %r15,32(%rsp) + movq %rbp,40(%rsp) + movq %rbx,48(%rsp) + + movq 0(%rsi),%rdx + mulx 8(%rsi),%r9,%r10 + mulx 16(%rsi),%rcx,%r11 + addq %rcx,%r10 + mulx 24(%rsi),%rcx,%r12 + adcq %rcx,%r11 + adcq $0,%r12 + + movq 8(%rsi),%rdx + mulx 16(%rsi),%rax,%rbx + mulx 24(%rsi),%rcx,%r13 + addq %rcx,%rbx + adcq $0,%r13 + addq %rax,%r11 + adcq %rbx,%r12 + adcq $0,%r13 + + movq 16(%rsi),%rdx + mulx 24(%rsi),%rax,%r14 + addq %rax,%r13 + adcq $0,%r14 + + movq $0,%r15 + shld $1,%r14,%r15 + shld $1,%r13,%r14 + shld $1,%r12,%r13 + shld $1,%r11,%r12 + shld $1,%r10,%r11 + shld $1,%r9,%r10 + addq %r9,%r9 + + movq 0(%rsi),%rdx + mulx %rdx,%r8,%rax + addq %rax,%r9 + + movq 8(%rsi),%rdx + mulx %rdx,%rax,%rbx + adcq %rax,%r10 + adcq %rbx,%r11 + + movq 16(%rsi),%rdx + mulx %rdx,%rax,%rbx + adcq %rax,%r12 + adcq %rbx,%r13 + + movq 24(%rsi),%rdx + mulx %rdx,%rax,%rbx + adcq %rax,%r14 + adcq %rbx,%r15 + + movq $0x1000003D1,%rdx + mulx %r12,%r12,%rbx + mulx %r13,%r13,%rcx + addq %rbx,%r13 + mulx %r14,%r14,%rbx + adcq %rcx,%r14 + mulx %r15,%r15,%rcx + adcq %rbx,%r15 + adcq $0,%rcx + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq $0,%rcx + + xorq %r15,%r15 + mulx %rcx,%r13,%r14 + addq %r13,%r8 + adcq %r14,%r9 + adcq $0,%r10 + adcq $0,%r11 + 
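/* A carry out of the adcq into limb 3 above means the value exceeded 2^256; cmovc then selects the reduction constant 0x1000003D1 (%r15 stays zero otherwise) and it is added back into the two lowest limbs. */ +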
cmovc %rdx,%r15 + addq %r15,%r8 + adcq $0,%r9 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq $0,32(%rdi) + + movq 0(%rsp),%r11 + movq 8(%rsp),%r12 + movq 16(%rsp),%r13 + movq 24(%rsp),%r14 + movq 32(%rsp),%r15 + movq 40(%rsp),%rbp + movq 48(%rsp),%rbx + + movq %r11,%rsp + + ret +/* + * 64-bit field multiplication in which the first argument has 4-limb + * and the second argument has 5-limb representations such that the + * fifth limb is of at most 64 bits. The second argument is fully + * reduced to 4-limb form and then field multiplication is performed. + * A field element in 5-limb form is reported as output such that the + * fifth limb is of at most 33 bits. + */ + .p2align 4 + .global secp256k1_fe_mul_45to5 + .type secp256k1_fe_mul_45to5, %function + +secp256k1_fe_mul_45to5: + + movq %rsp,%r11 + subq $88,%rsp + + movq %r11,0(%rsp) + movq %r12,8(%rsp) + movq %r13,16(%rsp) + movq %r14,24(%rsp) + movq %r15,32(%rsp) + movq %rbp,40(%rsp) + movq %rbx,48(%rsp) + + movq 0(%rdx),%r12 + movq 8(%rdx),%r13 + movq 16(%rdx),%r14 + movq 24(%rdx),%r15 + movq 32(%rdx),%rax + + movq $0x1000003D1,%rdx + xorq %rcx,%rcx + mulx %rax,%rax,%rbx + addq %rax,%r12 + adcq %rbx,%r13 + adcq $0,%r14 + adcq $0,%r15 + cmovc %rdx,%rcx + addq %rcx,%r12 + adcq $0,%r13 + + movq %r12,56(%rsp) + movq %r13,64(%rsp) + movq %r14,72(%rsp) + movq %r15,80(%rsp) + + movq 0(%rsi),%rdx + mulx 56(%rsp),%r8,%r9 + mulx 64(%rsp),%rcx,%r10 + addq %rcx,%r9 + mulx 72(%rsp),%rcx,%r11 + adcq %rcx,%r10 + mulx 80(%rsp),%rcx,%r12 + adcq %rcx,%r11 + adcq $0,%r12 + + movq 8(%rsi),%rdx + mulx 56(%rsp),%rax,%rbx + mulx 64(%rsp),%rcx,%rbp + addq %rcx,%rbx + mulx 72(%rsp),%rcx,%r15 + adcq %rcx,%rbp + mulx 80(%rsp),%rcx,%r13 + adcq %rcx,%r15 + adcq $0,%r13 + addq %rax,%r9 + adcq %rbx,%r10 + adcq %rbp,%r11 + adcq %r15,%r12 + adcq $0,%r13 + + movq 16(%rsi),%rdx + mulx 56(%rsp),%rax,%rbx + mulx 64(%rsp),%rcx,%rbp + addq %rcx,%rbx + mulx 72(%rsp),%rcx,%r15 + adcq %rcx,%rbp + mulx 80(%rsp),%rcx,%r14 + adcq %rcx,%r15 + adcq $0,%r14 + addq %rax,%r10 + adcq %rbx,%r11 + adcq %rbp,%r12 + adcq %r15,%r13 + adcq $0,%r14 + + movq 24(%rsi),%rdx + mulx 56(%rsp),%rax,%rbx + mulx 64(%rsp),%rcx,%rbp + addq %rcx,%rbx + mulx 72(%rsp),%rcx,%r15 + adcq %rcx,%rbp + mulx 80(%rsp),%rcx,%rsi + adcq %rcx,%r15 + adcq $0,%rsi + addq %rax,%r11 + adcq %rbx,%r12 + adcq %rbp,%r13 + adcq %r15,%r14 + adcq $0,%rsi + + movq $0x1000003D1,%rdx + mulx %r12,%r12,%rbx + mulx %r13,%r13,%rcx + addq %rbx,%r13 + mulx %r14,%r14,%rbx + adcq %rcx,%r14 + mulx %rsi,%r15,%rcx + adcq %rbx,%r15 + adcq $0,%rcx + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq $0,%rcx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %rcx,32(%rdi) + + movq 0(%rsp),%r11 + movq 8(%rsp),%r12 + movq 16(%rsp),%r13 + movq 24(%rsp),%r14 + movq 32(%rsp),%r15 + movq 40(%rsp),%rbp + movq 48(%rsp),%rbx + + movq %r11,%rsp + + ret diff --git a/src/field.h b/src/field.h index 854aaebabc..9fd94d4696 100644 --- a/src/field.h +++ b/src/field.h @@ -25,7 +25,7 @@ #include "util.h" #if defined(SECP256K1_WIDEMUL_INT128) -#include "field_5x52.h" +#include "field_5x64.h" #elif defined(SECP256K1_WIDEMUL_INT64) #include "field_10x26.h" #else @@ -43,6 +43,12 @@ static void secp256k1_fe_normalize_weak(secp256k1_fe *r); /** Normalize a field element, without constant-time guarantee. */ static void secp256k1_fe_normalize_var(secp256k1_fe *r); +/** Normalize a field element to be usable as input to _prec functions. 
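Depending on the field representation this may perform an actual limb reduction, or, as in the 10x26 code, only update VERIFY-mode bookkeeping.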
*/ +static void secp256k1_fe_normalize_prec(secp256k1_fe *r); + +/** Simultaneously normalize weakly and precomputedly. */ +static void secp256k1_fe_normalize_weak_prec(secp256k1_fe *r); + /** Verify whether a field element represents zero i.e. would normalize to a zero value. */ static int secp256k1_fe_normalizes_to_zero(const secp256k1_fe *r); @@ -96,6 +102,18 @@ static void secp256k1_fe_mul(secp256k1_fe *r, const secp256k1_fe *a, const secp2 * The output magnitude is 1 (but not guaranteed to be normalized). */ static void secp256k1_fe_sqr(secp256k1_fe *r, const secp256k1_fe *a); +/** Like secp256k1_fe_mul, but assumes b is prec-normalized. */ +static void secp256k1_fe_mul_prec(secp256k1_fe *r, const secp256k1_fe* a, const secp256k1_fe * SECP256K1_RESTRICT b_prec); + +/** Like secp256k1_fe_mul, but assumes both a and b are prec-normalized. */ +static void secp256k1_fe_mul_2prec(secp256k1_fe *r, const secp256k1_fe* a_prec, const secp256k1_fe * SECP256K1_RESTRICT b_prec); + +/** Like secp256k1_fe_sqr, but assumes a is prec-normalized. */ +static void secp256k1_fe_sqr_prec(secp256k1_fe *r, const secp256k1_fe* a_prec); + +/** Like secp256k1_fe_sqr, but assumes a is prec-normalized, and produces prec-normalized output r. */ +static void secp256k1_fe_sqr_prec_oprec(secp256k1_fe *r_prec, const secp256k1_fe* a_prec); + /** If a has a square root, it is computed in r and 1 is returned. If a does not * have a square root, the root of its negation is computed and 0 is returned. * The input's magnitude can be at most 8. The output magnitude is 1 (but not diff --git a/src/field_10x26.h b/src/field_10x26.h index 9eb65607f1..852b0b7e23 100644 --- a/src/field_10x26.h +++ b/src/field_10x26.h @@ -17,6 +17,7 @@ typedef struct { #ifdef VERIFY int magnitude; int normalized; + uint64_t precomputed; /* 64 bits to avoid padding bytes */ #endif } secp256k1_fe; @@ -35,7 +36,7 @@ typedef struct { } #ifdef VERIFY -#define SECP256K1_FE_CONST(d7, d6, d5, d4, d3, d2, d1, d0) {SECP256K1_FE_CONST_INNER((d7), (d6), (d5), (d4), (d3), (d2), (d1), (d0)), 1, 1} +#define SECP256K1_FE_CONST(d7, d6, d5, d4, d3, d2, d1, d0) {SECP256K1_FE_CONST_INNER((d7), (d6), (d5), (d4), (d3), (d2), (d1), (d0)), 1, 1, 1} #else #define SECP256K1_FE_CONST(d7, d6, d5, d4, d3, d2, d1, d0) {SECP256K1_FE_CONST_INNER((d7), (d6), (d5), (d4), (d3), (d2), (d1), (d0))} #endif diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index 7a38c117f1..f288315bc8 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -91,6 +91,7 @@ static void secp256k1_fe_normalize(secp256k1_fe *r) { #ifdef VERIFY r->magnitude = 1; r->normalized = 1; + r->precomputed = 1; secp256k1_fe_verify(r); #endif } @@ -122,6 +123,23 @@ static void secp256k1_fe_normalize_weak(secp256k1_fe *r) { #ifdef VERIFY r->magnitude = 1; + if (!r->normalized) r->precomputed = 0; + secp256k1_fe_verify(r); +#endif +} + +static void secp256k1_fe_normalize_prec(secp256k1_fe *r) { + (void)r; +#ifdef VERIFY + r->precomputed = 1; + secp256k1_fe_verify(r); +#endif +} + +static void secp256k1_fe_normalize_weak_prec(secp256k1_fe *r) { + secp256k1_fe_normalize_weak(r); +#ifdef VERIFY + r->precomputed = 1; secp256k1_fe_verify(r); #endif } @@ -178,6 +196,7 @@ static void secp256k1_fe_normalize_var(secp256k1_fe *r) { #ifdef VERIFY r->magnitude = 1; r->normalized = 1; + r->precomputed = 1; secp256k1_fe_verify(r); #endif } @@ -269,6 +288,7 @@ SECP256K1_INLINE static void secp256k1_fe_set_int(secp256k1_fe *r, int a) { #ifdef VERIFY r->magnitude = 1; r->normalized = 1; + r->precomputed = 1; 
secp256k1_fe_verify(r); #endif } @@ -295,6 +315,7 @@ SECP256K1_INLINE static void secp256k1_fe_clear(secp256k1_fe *a) { #ifdef VERIFY a->magnitude = 0; a->normalized = 1; + a->precomputed = 1; #endif for (i=0; i<10; i++) { a->n[i] = 0; @@ -336,12 +357,9 @@ static int secp256k1_fe_set_b32(secp256k1_fe *r, const unsigned char *a) { ret = !((r->n[9] == 0x3FFFFFUL) & ((r->n[8] & r->n[7] & r->n[6] & r->n[5] & r->n[4] & r->n[3] & r->n[2]) == 0x3FFFFFFUL) & ((r->n[1] + 0x40UL + ((r->n[0] + 0x3D1UL) >> 26)) > 0x3FFFFFFUL)); #ifdef VERIFY r->magnitude = 1; - if (ret) { - r->normalized = 1; - secp256k1_fe_verify(r); - } else { - r->normalized = 0; - } + r->normalized = ret; + r->precomputed = 1; + secp256k1_fe_verify(r); #endif return ret; } @@ -404,6 +422,7 @@ SECP256K1_INLINE static void secp256k1_fe_negate(secp256k1_fe *r, const secp256k #ifdef VERIFY r->magnitude = m + 1; r->normalized = 0; + r->precomputed = 0; secp256k1_fe_verify(r); #endif } @@ -422,6 +441,7 @@ SECP256K1_INLINE static void secp256k1_fe_mul_int(secp256k1_fe *r, int a) { #ifdef VERIFY r->magnitude *= a; r->normalized = 0; + r->precomputed = 0; secp256k1_fe_verify(r); #endif } @@ -443,6 +463,7 @@ SECP256K1_INLINE static void secp256k1_fe_add(secp256k1_fe *r, const secp256k1_f #ifdef VERIFY r->magnitude += a->magnitude; r->normalized = 0; + r->precomputed = 0; secp256k1_fe_verify(r); #endif } @@ -1079,6 +1100,46 @@ static void secp256k1_fe_mul(secp256k1_fe *r, const secp256k1_fe *a, const secp2 #ifdef VERIFY r->magnitude = 1; r->normalized = 0; + r->precomputed = 0; + secp256k1_fe_verify(r); +#endif +} + +static void secp256k1_fe_mul_prec(secp256k1_fe *r, const secp256k1_fe *a, const secp256k1_fe * SECP256K1_RESTRICT b_prec) { +#ifdef VERIFY + VERIFY_CHECK(b_prec->precomputed); + VERIFY_CHECK(a->magnitude <= 8); + VERIFY_CHECK(b_prec->magnitude <= 8); + secp256k1_fe_verify(a); + secp256k1_fe_verify(b_prec); + VERIFY_CHECK(r != b_prec); + VERIFY_CHECK(a != b_prec); +#endif + secp256k1_fe_mul_inner(r->n, a->n, b_prec->n); +#ifdef VERIFY + r->magnitude = 1; + r->normalized = 0; + r->precomputed = 0; + secp256k1_fe_verify(r); +#endif +} + +static void secp256k1_fe_mul_2prec(secp256k1_fe *r, const secp256k1_fe *a_prec, const secp256k1_fe * SECP256K1_RESTRICT b_prec) { +#ifdef VERIFY + VERIFY_CHECK(a_prec->precomputed); + VERIFY_CHECK(b_prec->precomputed); + VERIFY_CHECK(a_prec->magnitude <= 8); + VERIFY_CHECK(b_prec->magnitude <= 8); + secp256k1_fe_verify(a_prec); + secp256k1_fe_verify(b_prec); + VERIFY_CHECK(r != b_prec); + VERIFY_CHECK(a_prec != b_prec); +#endif + secp256k1_fe_mul_inner(r->n, a_prec->n, b_prec->n); +#ifdef VERIFY + r->magnitude = 1; + r->normalized = 0; + r->precomputed = 0; secp256k1_fe_verify(r); #endif } @@ -1092,10 +1153,41 @@ static void secp256k1_fe_sqr(secp256k1_fe *r, const secp256k1_fe *a) { #ifdef VERIFY r->magnitude = 1; r->normalized = 0; + r->precomputed = 0; secp256k1_fe_verify(r); #endif } +static void secp256k1_fe_sqr_prec(secp256k1_fe *r, const secp256k1_fe *a_prec) { +#ifdef VERIFY + VERIFY_CHECK(a_prec->precomputed); + VERIFY_CHECK(a_prec->magnitude <= 8); + secp256k1_fe_verify(a_prec); +#endif + secp256k1_fe_sqr_inner(r->n, a_prec->n); +#ifdef VERIFY + r->magnitude = 1; + r->normalized = 0; + r->precomputed = 0; + secp256k1_fe_verify(r); +#endif +} + +static void secp256k1_fe_sqr_prec_oprec(secp256k1_fe *r_prec, const secp256k1_fe *a_prec) { +#ifdef VERIFY + VERIFY_CHECK(a_prec->precomputed); + VERIFY_CHECK(a_prec->magnitude <= 8); + secp256k1_fe_verify(a_prec); +#endif + 
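/* For the 10x26 representation the _prec variants reuse the same inner multiply/square routines as secp256k1_fe_mul/sqr; only the VERIFY-mode tracking of the precomputed flag differs. */ +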
secp256k1_fe_sqr_inner(r_prec->n, a_prec->n); +#ifdef VERIFY + r_prec->magnitude = 1; + r_prec->normalized = 0; + r_prec->precomputed = 1; + secp256k1_fe_verify(r_prec); +#endif +} + static SECP256K1_INLINE void secp256k1_fe_cmov(secp256k1_fe *r, const secp256k1_fe *a, int flag) { uint32_t mask0, mask1; VG_CHECK_VERIFY(r->n, sizeof(r->n)); @@ -1115,6 +1207,7 @@ static SECP256K1_INLINE void secp256k1_fe_cmov(secp256k1_fe *r, const secp256k1_ if (flag) { r->magnitude = a->magnitude; r->normalized = a->normalized; + r->precomputed = a->precomputed; } #endif } @@ -1162,6 +1255,7 @@ static SECP256K1_INLINE void secp256k1_fe_from_storage(secp256k1_fe *r, const se #ifdef VERIFY r->magnitude = 1; r->normalized = 1; + r->precomputed = 1; #endif } @@ -1197,6 +1291,7 @@ static void secp256k1_fe_from_signed30(secp256k1_fe *r, const secp256k1_modinv32 #ifdef VERIFY r->magnitude = 1; r->normalized = 1; + r->precomputed = 1; secp256k1_fe_verify(r); #endif } diff --git a/src/field_5x52_asm_impl.h b/src/field_5x52_asm_impl.h deleted file mode 100644 index a2118044ab..0000000000 --- a/src/field_5x52_asm_impl.h +++ /dev/null @@ -1,502 +0,0 @@ -/*********************************************************************** - * Copyright (c) 2013-2014 Diederik Huys, Pieter Wuille * - * Distributed under the MIT software license, see the accompanying * - * file COPYING or https://www.opensource.org/licenses/mit-license.php.* - ***********************************************************************/ - -/** - * Changelog: - * - March 2013, Diederik Huys: original version - * - November 2014, Pieter Wuille: updated to use Peter Dettman's parallel multiplication algorithm - * - December 2014, Pieter Wuille: converted from YASM to GCC inline assembly - */ - -#ifndef SECP256K1_FIELD_INNER5X52_IMPL_H -#define SECP256K1_FIELD_INNER5X52_IMPL_H - -SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b) { -/** - * Registers: rdx:rax = multiplication accumulator - * r9:r8 = c - * r15:rcx = d - * r10-r14 = a0-a4 - * rbx = b - * rdi = r - * rsi = a / t? 
- */ - uint64_t tmp1, tmp2, tmp3; -__asm__ __volatile__( - "movq 0(%%rsi),%%r10\n" - "movq 8(%%rsi),%%r11\n" - "movq 16(%%rsi),%%r12\n" - "movq 24(%%rsi),%%r13\n" - "movq 32(%%rsi),%%r14\n" - - /* d += a3 * b0 */ - "movq 0(%%rbx),%%rax\n" - "mulq %%r13\n" - "movq %%rax,%%rcx\n" - "movq %%rdx,%%r15\n" - /* d += a2 * b1 */ - "movq 8(%%rbx),%%rax\n" - "mulq %%r12\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a1 * b2 */ - "movq 16(%%rbx),%%rax\n" - "mulq %%r11\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d = a0 * b3 */ - "movq 24(%%rbx),%%rax\n" - "mulq %%r10\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* c = a4 * b4 */ - "movq 32(%%rbx),%%rax\n" - "mulq %%r14\n" - "movq %%rax,%%r8\n" - "movq %%rdx,%%r9\n" - /* d += (c & M) * R */ - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* c >>= 52 (%%r8 only) */ - "shrdq $52,%%r9,%%r8\n" - /* t3 (tmp1) = d & M */ - "movq %%rcx,%%rsi\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rsi\n" - "movq %%rsi,%q1\n" - /* d >>= 52 */ - "shrdq $52,%%r15,%%rcx\n" - "xorq %%r15,%%r15\n" - /* d += a4 * b0 */ - "movq 0(%%rbx),%%rax\n" - "mulq %%r14\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a3 * b1 */ - "movq 8(%%rbx),%%rax\n" - "mulq %%r13\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a2 * b2 */ - "movq 16(%%rbx),%%rax\n" - "mulq %%r12\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a1 * b3 */ - "movq 24(%%rbx),%%rax\n" - "mulq %%r11\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a0 * b4 */ - "movq 32(%%rbx),%%rax\n" - "mulq %%r10\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += c * R */ - "movq %%r8,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* t4 = d & M (%%rsi) */ - "movq %%rcx,%%rsi\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rsi\n" - /* d >>= 52 */ - "shrdq $52,%%r15,%%rcx\n" - "xorq %%r15,%%r15\n" - /* tx = t4 >> 48 (tmp3) */ - "movq %%rsi,%%rax\n" - "shrq $48,%%rax\n" - "movq %%rax,%q3\n" - /* t4 &= (M >> 4) (tmp2) */ - "movq $0xffffffffffff,%%rax\n" - "andq %%rax,%%rsi\n" - "movq %%rsi,%q2\n" - /* c = a0 * b0 */ - "movq 0(%%rbx),%%rax\n" - "mulq %%r10\n" - "movq %%rax,%%r8\n" - "movq %%rdx,%%r9\n" - /* d += a4 * b1 */ - "movq 8(%%rbx),%%rax\n" - "mulq %%r14\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a3 * b2 */ - "movq 16(%%rbx),%%rax\n" - "mulq %%r13\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a2 * b3 */ - "movq 24(%%rbx),%%rax\n" - "mulq %%r12\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a1 * b4 */ - "movq 32(%%rbx),%%rax\n" - "mulq %%r11\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* u0 = d & M (%%rsi) */ - "movq %%rcx,%%rsi\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rsi\n" - /* d >>= 52 */ - "shrdq $52,%%r15,%%rcx\n" - "xorq %%r15,%%r15\n" - /* u0 = (u0 << 4) | tx (%%rsi) */ - "shlq $4,%%rsi\n" - "movq %q3,%%rax\n" - "orq %%rax,%%rsi\n" - /* c += u0 * (R >> 4) */ - "movq $0x1000003d1,%%rax\n" - "mulq %%rsi\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* r[0] = c & M */ - "movq %%r8,%%rax\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rax\n" - "movq %%rax,0(%%rdi)\n" - /* c >>= 52 */ - "shrdq $52,%%r9,%%r8\n" - "xorq %%r9,%%r9\n" - /* c += a1 * b0 */ - "movq 0(%%rbx),%%rax\n" - "mulq %%r11\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* c += a0 * b1 */ - "movq 8(%%rbx),%%rax\n" - "mulq %%r10\n" - "addq 
%%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* d += a4 * b2 */ - "movq 16(%%rbx),%%rax\n" - "mulq %%r14\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a3 * b3 */ - "movq 24(%%rbx),%%rax\n" - "mulq %%r13\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a2 * b4 */ - "movq 32(%%rbx),%%rax\n" - "mulq %%r12\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* c += (d & M) * R */ - "movq %%rcx,%%rax\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* d >>= 52 */ - "shrdq $52,%%r15,%%rcx\n" - "xorq %%r15,%%r15\n" - /* r[1] = c & M */ - "movq %%r8,%%rax\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rax\n" - "movq %%rax,8(%%rdi)\n" - /* c >>= 52 */ - "shrdq $52,%%r9,%%r8\n" - "xorq %%r9,%%r9\n" - /* c += a2 * b0 */ - "movq 0(%%rbx),%%rax\n" - "mulq %%r12\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* c += a1 * b1 */ - "movq 8(%%rbx),%%rax\n" - "mulq %%r11\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* c += a0 * b2 (last use of %%r10 = a0) */ - "movq 16(%%rbx),%%rax\n" - "mulq %%r10\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* fetch t3 (%%r10, overwrites a0), t4 (%%rsi) */ - "movq %q2,%%rsi\n" - "movq %q1,%%r10\n" - /* d += a4 * b3 */ - "movq 24(%%rbx),%%rax\n" - "mulq %%r14\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a3 * b4 */ - "movq 32(%%rbx),%%rax\n" - "mulq %%r13\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* c += (d & M) * R */ - "movq %%rcx,%%rax\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* d >>= 52 (%%rcx only) */ - "shrdq $52,%%r15,%%rcx\n" - /* r[2] = c & M */ - "movq %%r8,%%rax\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rax\n" - "movq %%rax,16(%%rdi)\n" - /* c >>= 52 */ - "shrdq $52,%%r9,%%r8\n" - "xorq %%r9,%%r9\n" - /* c += t3 */ - "addq %%r10,%%r8\n" - /* c += d * R */ - "movq %%rcx,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* r[3] = c & M */ - "movq %%r8,%%rax\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rax\n" - "movq %%rax,24(%%rdi)\n" - /* c >>= 52 (%%r8 only) */ - "shrdq $52,%%r9,%%r8\n" - /* c += t4 (%%r8 only) */ - "addq %%rsi,%%r8\n" - /* r[4] = c */ - "movq %%r8,32(%%rdi)\n" -: "+S"(a), "=m"(tmp1), "=m"(tmp2), "=m"(tmp3) -: "b"(b), "D"(r) -: "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory" -); -} - -SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t *a) { -/** - * Registers: rdx:rax = multiplication accumulator - * r9:r8 = c - * rcx:rbx = d - * r10-r14 = a0-a4 - * r15 = M (0xfffffffffffff) - * rdi = r - * rsi = a / t? 
- */ - uint64_t tmp1, tmp2, tmp3; -__asm__ __volatile__( - "movq 0(%%rsi),%%r10\n" - "movq 8(%%rsi),%%r11\n" - "movq 16(%%rsi),%%r12\n" - "movq 24(%%rsi),%%r13\n" - "movq 32(%%rsi),%%r14\n" - "movq $0xfffffffffffff,%%r15\n" - - /* d = (a0*2) * a3 */ - "leaq (%%r10,%%r10,1),%%rax\n" - "mulq %%r13\n" - "movq %%rax,%%rbx\n" - "movq %%rdx,%%rcx\n" - /* d += (a1*2) * a2 */ - "leaq (%%r11,%%r11,1),%%rax\n" - "mulq %%r12\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* c = a4 * a4 */ - "movq %%r14,%%rax\n" - "mulq %%r14\n" - "movq %%rax,%%r8\n" - "movq %%rdx,%%r9\n" - /* d += (c & M) * R */ - "andq %%r15,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* c >>= 52 (%%r8 only) */ - "shrdq $52,%%r9,%%r8\n" - /* t3 (tmp1) = d & M */ - "movq %%rbx,%%rsi\n" - "andq %%r15,%%rsi\n" - "movq %%rsi,%q1\n" - /* d >>= 52 */ - "shrdq $52,%%rcx,%%rbx\n" - "xorq %%rcx,%%rcx\n" - /* a4 *= 2 */ - "addq %%r14,%%r14\n" - /* d += a0 * a4 */ - "movq %%r10,%%rax\n" - "mulq %%r14\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* d+= (a1*2) * a3 */ - "leaq (%%r11,%%r11,1),%%rax\n" - "mulq %%r13\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* d += a2 * a2 */ - "movq %%r12,%%rax\n" - "mulq %%r12\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* d += c * R */ - "movq %%r8,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* t4 = d & M (%%rsi) */ - "movq %%rbx,%%rsi\n" - "andq %%r15,%%rsi\n" - /* d >>= 52 */ - "shrdq $52,%%rcx,%%rbx\n" - "xorq %%rcx,%%rcx\n" - /* tx = t4 >> 48 (tmp3) */ - "movq %%rsi,%%rax\n" - "shrq $48,%%rax\n" - "movq %%rax,%q3\n" - /* t4 &= (M >> 4) (tmp2) */ - "movq $0xffffffffffff,%%rax\n" - "andq %%rax,%%rsi\n" - "movq %%rsi,%q2\n" - /* c = a0 * a0 */ - "movq %%r10,%%rax\n" - "mulq %%r10\n" - "movq %%rax,%%r8\n" - "movq %%rdx,%%r9\n" - /* d += a1 * a4 */ - "movq %%r11,%%rax\n" - "mulq %%r14\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* d += (a2*2) * a3 */ - "leaq (%%r12,%%r12,1),%%rax\n" - "mulq %%r13\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* u0 = d & M (%%rsi) */ - "movq %%rbx,%%rsi\n" - "andq %%r15,%%rsi\n" - /* d >>= 52 */ - "shrdq $52,%%rcx,%%rbx\n" - "xorq %%rcx,%%rcx\n" - /* u0 = (u0 << 4) | tx (%%rsi) */ - "shlq $4,%%rsi\n" - "movq %q3,%%rax\n" - "orq %%rax,%%rsi\n" - /* c += u0 * (R >> 4) */ - "movq $0x1000003d1,%%rax\n" - "mulq %%rsi\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* r[0] = c & M */ - "movq %%r8,%%rax\n" - "andq %%r15,%%rax\n" - "movq %%rax,0(%%rdi)\n" - /* c >>= 52 */ - "shrdq $52,%%r9,%%r8\n" - "xorq %%r9,%%r9\n" - /* a0 *= 2 */ - "addq %%r10,%%r10\n" - /* c += a0 * a1 */ - "movq %%r10,%%rax\n" - "mulq %%r11\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* d += a2 * a4 */ - "movq %%r12,%%rax\n" - "mulq %%r14\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* d += a3 * a3 */ - "movq %%r13,%%rax\n" - "mulq %%r13\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* c += (d & M) * R */ - "movq %%rbx,%%rax\n" - "andq %%r15,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* d >>= 52 */ - "shrdq $52,%%rcx,%%rbx\n" - "xorq %%rcx,%%rcx\n" - /* r[1] = c & M */ - "movq %%r8,%%rax\n" - "andq %%r15,%%rax\n" - "movq %%rax,8(%%rdi)\n" - /* c >>= 52 */ - "shrdq $52,%%r9,%%r8\n" - "xorq %%r9,%%r9\n" - /* c += a0 * a2 (last use of %%r10) */ - "movq %%r10,%%rax\n" - "mulq %%r12\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* fetch t3 (%%r10, overwrites a0),t4 (%%rsi) */ - "movq 
%q2,%%rsi\n" - "movq %q1,%%r10\n" - /* c += a1 * a1 */ - "movq %%r11,%%rax\n" - "mulq %%r11\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* d += a3 * a4 */ - "movq %%r13,%%rax\n" - "mulq %%r14\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* c += (d & M) * R */ - "movq %%rbx,%%rax\n" - "andq %%r15,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* d >>= 52 (%%rbx only) */ - "shrdq $52,%%rcx,%%rbx\n" - /* r[2] = c & M */ - "movq %%r8,%%rax\n" - "andq %%r15,%%rax\n" - "movq %%rax,16(%%rdi)\n" - /* c >>= 52 */ - "shrdq $52,%%r9,%%r8\n" - "xorq %%r9,%%r9\n" - /* c += t3 */ - "addq %%r10,%%r8\n" - /* c += d * R */ - "movq %%rbx,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* r[3] = c & M */ - "movq %%r8,%%rax\n" - "andq %%r15,%%rax\n" - "movq %%rax,24(%%rdi)\n" - /* c >>= 52 (%%r8 only) */ - "shrdq $52,%%r9,%%r8\n" - /* c += t4 (%%r8 only) */ - "addq %%rsi,%%r8\n" - /* r[4] = c */ - "movq %%r8,32(%%rdi)\n" -: "+S"(a), "=m"(tmp1), "=m"(tmp2), "=m"(tmp3) -: "D"(r) -: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory" -); -} - -#endif /* SECP256K1_FIELD_INNER5X52_IMPL_H */ diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h deleted file mode 100644 index 60ded927f6..0000000000 --- a/src/field_5x52_impl.h +++ /dev/null @@ -1,578 +0,0 @@ -/*********************************************************************** - * Copyright (c) 2013, 2014 Pieter Wuille * - * Distributed under the MIT software license, see the accompanying * - * file COPYING or https://www.opensource.org/licenses/mit-license.php.* - ***********************************************************************/ - -#ifndef SECP256K1_FIELD_REPR_IMPL_H -#define SECP256K1_FIELD_REPR_IMPL_H - -#if defined HAVE_CONFIG_H -#include "libsecp256k1-config.h" -#endif - -#include "util.h" -#include "field.h" -#include "modinv64_impl.h" - -#if defined(USE_ASM_X86_64) -#include "field_5x52_asm_impl.h" -#else -#include "field_5x52_int128_impl.h" -#endif - -/** Implements arithmetic modulo FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2F, - * represented as 5 uint64_t's in base 2^52. The values are allowed to contain >52 each. In particular, - * each FieldElem has a 'magnitude' associated with it. Internally, a magnitude M means each element - * is at most M*(2^53-1), except the most significant one, which is limited to M*(2^49-1). All operations - * accept any input with magnitude at most M, and have different rules for propagating magnitude to their - * output. - */ - -#ifdef VERIFY -static void secp256k1_fe_verify(const secp256k1_fe *a) { - const uint64_t *d = a->n; - int m = a->normalized ? 1 : 2 * a->magnitude, r = 1; - /* secp256k1 'p' value defined in "Standards for Efficient Cryptography" (SEC2) 2.7.1. 
*/ - r &= (d[0] <= 0xFFFFFFFFFFFFFULL * m); - r &= (d[1] <= 0xFFFFFFFFFFFFFULL * m); - r &= (d[2] <= 0xFFFFFFFFFFFFFULL * m); - r &= (d[3] <= 0xFFFFFFFFFFFFFULL * m); - r &= (d[4] <= 0x0FFFFFFFFFFFFULL * m); - r &= (a->magnitude >= 0); - r &= (a->magnitude <= 2048); - if (a->normalized) { - r &= (a->magnitude <= 1); - if (r && (d[4] == 0x0FFFFFFFFFFFFULL) && ((d[3] & d[2] & d[1]) == 0xFFFFFFFFFFFFFULL)) { - r &= (d[0] < 0xFFFFEFFFFFC2FULL); - } - } - VERIFY_CHECK(r == 1); -} -#endif - -static void secp256k1_fe_normalize(secp256k1_fe *r) { - uint64_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4]; - - /* Reduce t4 at the start so there will be at most a single carry from the first pass */ - uint64_t m; - uint64_t x = t4 >> 48; t4 &= 0x0FFFFFFFFFFFFULL; - - /* The first pass ensures the magnitude is 1, ... */ - t0 += x * 0x1000003D1ULL; - t1 += (t0 >> 52); t0 &= 0xFFFFFFFFFFFFFULL; - t2 += (t1 >> 52); t1 &= 0xFFFFFFFFFFFFFULL; m = t1; - t3 += (t2 >> 52); t2 &= 0xFFFFFFFFFFFFFULL; m &= t2; - t4 += (t3 >> 52); t3 &= 0xFFFFFFFFFFFFFULL; m &= t3; - - /* ... except for a possible carry at bit 48 of t4 (i.e. bit 256 of the field element) */ - VERIFY_CHECK(t4 >> 49 == 0); - - /* At most a single final reduction is needed; check if the value is >= the field characteristic */ - x = (t4 >> 48) | ((t4 == 0x0FFFFFFFFFFFFULL) & (m == 0xFFFFFFFFFFFFFULL) - & (t0 >= 0xFFFFEFFFFFC2FULL)); - - /* Apply the final reduction (for constant-time behaviour, we do it always) */ - t0 += x * 0x1000003D1ULL; - t1 += (t0 >> 52); t0 &= 0xFFFFFFFFFFFFFULL; - t2 += (t1 >> 52); t1 &= 0xFFFFFFFFFFFFFULL; - t3 += (t2 >> 52); t2 &= 0xFFFFFFFFFFFFFULL; - t4 += (t3 >> 52); t3 &= 0xFFFFFFFFFFFFFULL; - - /* If t4 didn't carry to bit 48 already, then it should have after any final reduction */ - VERIFY_CHECK(t4 >> 48 == x); - - /* Mask off the possible multiple of 2^256 from the final reduction */ - t4 &= 0x0FFFFFFFFFFFFULL; - - r->n[0] = t0; r->n[1] = t1; r->n[2] = t2; r->n[3] = t3; r->n[4] = t4; - -#ifdef VERIFY - r->magnitude = 1; - r->normalized = 1; - secp256k1_fe_verify(r); -#endif -} - -static void secp256k1_fe_normalize_weak(secp256k1_fe *r) { - uint64_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4]; - - /* Reduce t4 at the start so there will be at most a single carry from the first pass */ - uint64_t x = t4 >> 48; t4 &= 0x0FFFFFFFFFFFFULL; - - /* The first pass ensures the magnitude is 1, ... */ - t0 += x * 0x1000003D1ULL; - t1 += (t0 >> 52); t0 &= 0xFFFFFFFFFFFFFULL; - t2 += (t1 >> 52); t1 &= 0xFFFFFFFFFFFFFULL; - t3 += (t2 >> 52); t2 &= 0xFFFFFFFFFFFFFULL; - t4 += (t3 >> 52); t3 &= 0xFFFFFFFFFFFFFULL; - - /* ... except for a possible carry at bit 48 of t4 (i.e. bit 256 of the field element) */ - VERIFY_CHECK(t4 >> 49 == 0); - - r->n[0] = t0; r->n[1] = t1; r->n[2] = t2; r->n[3] = t3; r->n[4] = t4; - -#ifdef VERIFY - r->magnitude = 1; - secp256k1_fe_verify(r); -#endif -} - -static void secp256k1_fe_normalize_var(secp256k1_fe *r) { - uint64_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4]; - - /* Reduce t4 at the start so there will be at most a single carry from the first pass */ - uint64_t m; - uint64_t x = t4 >> 48; t4 &= 0x0FFFFFFFFFFFFULL; - - /* The first pass ensures the magnitude is 1, ... */ - t0 += x * 0x1000003D1ULL; - t1 += (t0 >> 52); t0 &= 0xFFFFFFFFFFFFFULL; - t2 += (t1 >> 52); t1 &= 0xFFFFFFFFFFFFFULL; m = t1; - t3 += (t2 >> 52); t2 &= 0xFFFFFFFFFFFFFULL; m &= t2; - t4 += (t3 >> 52); t3 &= 0xFFFFFFFFFFFFFULL; m &= t3; - - /* ... 
except for a possible carry at bit 48 of t4 (i.e. bit 256 of the field element) */ - VERIFY_CHECK(t4 >> 49 == 0); - - /* At most a single final reduction is needed; check if the value is >= the field characteristic */ - x = (t4 >> 48) | ((t4 == 0x0FFFFFFFFFFFFULL) & (m == 0xFFFFFFFFFFFFFULL) - & (t0 >= 0xFFFFEFFFFFC2FULL)); - - if (x) { - t0 += 0x1000003D1ULL; - t1 += (t0 >> 52); t0 &= 0xFFFFFFFFFFFFFULL; - t2 += (t1 >> 52); t1 &= 0xFFFFFFFFFFFFFULL; - t3 += (t2 >> 52); t2 &= 0xFFFFFFFFFFFFFULL; - t4 += (t3 >> 52); t3 &= 0xFFFFFFFFFFFFFULL; - - /* If t4 didn't carry to bit 48 already, then it should have after any final reduction */ - VERIFY_CHECK(t4 >> 48 == x); - - /* Mask off the possible multiple of 2^256 from the final reduction */ - t4 &= 0x0FFFFFFFFFFFFULL; - } - - r->n[0] = t0; r->n[1] = t1; r->n[2] = t2; r->n[3] = t3; r->n[4] = t4; - -#ifdef VERIFY - r->magnitude = 1; - r->normalized = 1; - secp256k1_fe_verify(r); -#endif -} - -static int secp256k1_fe_normalizes_to_zero(const secp256k1_fe *r) { - uint64_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4]; - - /* z0 tracks a possible raw value of 0, z1 tracks a possible raw value of P */ - uint64_t z0, z1; - - /* Reduce t4 at the start so there will be at most a single carry from the first pass */ - uint64_t x = t4 >> 48; t4 &= 0x0FFFFFFFFFFFFULL; - - /* The first pass ensures the magnitude is 1, ... */ - t0 += x * 0x1000003D1ULL; - t1 += (t0 >> 52); t0 &= 0xFFFFFFFFFFFFFULL; z0 = t0; z1 = t0 ^ 0x1000003D0ULL; - t2 += (t1 >> 52); t1 &= 0xFFFFFFFFFFFFFULL; z0 |= t1; z1 &= t1; - t3 += (t2 >> 52); t2 &= 0xFFFFFFFFFFFFFULL; z0 |= t2; z1 &= t2; - t4 += (t3 >> 52); t3 &= 0xFFFFFFFFFFFFFULL; z0 |= t3; z1 &= t3; - z0 |= t4; z1 &= t4 ^ 0xF000000000000ULL; - - /* ... except for a possible carry at bit 48 of t4 (i.e. bit 256 of the field element) */ - VERIFY_CHECK(t4 >> 49 == 0); - - return (z0 == 0) | (z1 == 0xFFFFFFFFFFFFFULL); -} - -static int secp256k1_fe_normalizes_to_zero_var(const secp256k1_fe *r) { - uint64_t t0, t1, t2, t3, t4; - uint64_t z0, z1; - uint64_t x; - - t0 = r->n[0]; - t4 = r->n[4]; - - /* Reduce t4 at the start so there will be at most a single carry from the first pass */ - x = t4 >> 48; - - /* The first pass ensures the magnitude is 1, ... */ - t0 += x * 0x1000003D1ULL; - - /* z0 tracks a possible raw value of 0, z1 tracks a possible raw value of P */ - z0 = t0 & 0xFFFFFFFFFFFFFULL; - z1 = z0 ^ 0x1000003D0ULL; - - /* Fast return path should catch the majority of cases */ - if ((z0 != 0ULL) & (z1 != 0xFFFFFFFFFFFFFULL)) { - return 0; - } - - t1 = r->n[1]; - t2 = r->n[2]; - t3 = r->n[3]; - - t4 &= 0x0FFFFFFFFFFFFULL; - - t1 += (t0 >> 52); - t2 += (t1 >> 52); t1 &= 0xFFFFFFFFFFFFFULL; z0 |= t1; z1 &= t1; - t3 += (t2 >> 52); t2 &= 0xFFFFFFFFFFFFFULL; z0 |= t2; z1 &= t2; - t4 += (t3 >> 52); t3 &= 0xFFFFFFFFFFFFFULL; z0 |= t3; z1 &= t3; - z0 |= t4; z1 &= t4 ^ 0xF000000000000ULL; - - /* ... except for a possible carry at bit 48 of t4 (i.e. 
bit 256 of the field element) */ - VERIFY_CHECK(t4 >> 49 == 0); - - return (z0 == 0) | (z1 == 0xFFFFFFFFFFFFFULL); -} - -SECP256K1_INLINE static void secp256k1_fe_set_int(secp256k1_fe *r, int a) { - r->n[0] = a; - r->n[1] = r->n[2] = r->n[3] = r->n[4] = 0; -#ifdef VERIFY - r->magnitude = 1; - r->normalized = 1; - secp256k1_fe_verify(r); -#endif -} - -SECP256K1_INLINE static int secp256k1_fe_is_zero(const secp256k1_fe *a) { - const uint64_t *t = a->n; -#ifdef VERIFY - VERIFY_CHECK(a->normalized); - secp256k1_fe_verify(a); -#endif - return (t[0] | t[1] | t[2] | t[3] | t[4]) == 0; -} - -SECP256K1_INLINE static int secp256k1_fe_is_odd(const secp256k1_fe *a) { -#ifdef VERIFY - VERIFY_CHECK(a->normalized); - secp256k1_fe_verify(a); -#endif - return a->n[0] & 1; -} - -SECP256K1_INLINE static void secp256k1_fe_clear(secp256k1_fe *a) { - int i; -#ifdef VERIFY - a->magnitude = 0; - a->normalized = 1; -#endif - for (i=0; i<5; i++) { - a->n[i] = 0; - } -} - -static int secp256k1_fe_cmp_var(const secp256k1_fe *a, const secp256k1_fe *b) { - int i; -#ifdef VERIFY - VERIFY_CHECK(a->normalized); - VERIFY_CHECK(b->normalized); - secp256k1_fe_verify(a); - secp256k1_fe_verify(b); -#endif - for (i = 4; i >= 0; i--) { - if (a->n[i] > b->n[i]) { - return 1; - } - if (a->n[i] < b->n[i]) { - return -1; - } - } - return 0; -} - -static int secp256k1_fe_set_b32(secp256k1_fe *r, const unsigned char *a) { - int ret; - r->n[0] = (uint64_t)a[31] - | ((uint64_t)a[30] << 8) - | ((uint64_t)a[29] << 16) - | ((uint64_t)a[28] << 24) - | ((uint64_t)a[27] << 32) - | ((uint64_t)a[26] << 40) - | ((uint64_t)(a[25] & 0xF) << 48); - r->n[1] = (uint64_t)((a[25] >> 4) & 0xF) - | ((uint64_t)a[24] << 4) - | ((uint64_t)a[23] << 12) - | ((uint64_t)a[22] << 20) - | ((uint64_t)a[21] << 28) - | ((uint64_t)a[20] << 36) - | ((uint64_t)a[19] << 44); - r->n[2] = (uint64_t)a[18] - | ((uint64_t)a[17] << 8) - | ((uint64_t)a[16] << 16) - | ((uint64_t)a[15] << 24) - | ((uint64_t)a[14] << 32) - | ((uint64_t)a[13] << 40) - | ((uint64_t)(a[12] & 0xF) << 48); - r->n[3] = (uint64_t)((a[12] >> 4) & 0xF) - | ((uint64_t)a[11] << 4) - | ((uint64_t)a[10] << 12) - | ((uint64_t)a[9] << 20) - | ((uint64_t)a[8] << 28) - | ((uint64_t)a[7] << 36) - | ((uint64_t)a[6] << 44); - r->n[4] = (uint64_t)a[5] - | ((uint64_t)a[4] << 8) - | ((uint64_t)a[3] << 16) - | ((uint64_t)a[2] << 24) - | ((uint64_t)a[1] << 32) - | ((uint64_t)a[0] << 40); - ret = !((r->n[4] == 0x0FFFFFFFFFFFFULL) & ((r->n[3] & r->n[2] & r->n[1]) == 0xFFFFFFFFFFFFFULL) & (r->n[0] >= 0xFFFFEFFFFFC2FULL)); -#ifdef VERIFY - r->magnitude = 1; - if (ret) { - r->normalized = 1; - secp256k1_fe_verify(r); - } else { - r->normalized = 0; - } -#endif - return ret; -} - -/** Convert a field element to a 32-byte big endian value. 
Requires the input to be normalized */ -static void secp256k1_fe_get_b32(unsigned char *r, const secp256k1_fe *a) { -#ifdef VERIFY - VERIFY_CHECK(a->normalized); - secp256k1_fe_verify(a); -#endif - r[0] = (a->n[4] >> 40) & 0xFF; - r[1] = (a->n[4] >> 32) & 0xFF; - r[2] = (a->n[4] >> 24) & 0xFF; - r[3] = (a->n[4] >> 16) & 0xFF; - r[4] = (a->n[4] >> 8) & 0xFF; - r[5] = a->n[4] & 0xFF; - r[6] = (a->n[3] >> 44) & 0xFF; - r[7] = (a->n[3] >> 36) & 0xFF; - r[8] = (a->n[3] >> 28) & 0xFF; - r[9] = (a->n[3] >> 20) & 0xFF; - r[10] = (a->n[3] >> 12) & 0xFF; - r[11] = (a->n[3] >> 4) & 0xFF; - r[12] = ((a->n[2] >> 48) & 0xF) | ((a->n[3] & 0xF) << 4); - r[13] = (a->n[2] >> 40) & 0xFF; - r[14] = (a->n[2] >> 32) & 0xFF; - r[15] = (a->n[2] >> 24) & 0xFF; - r[16] = (a->n[2] >> 16) & 0xFF; - r[17] = (a->n[2] >> 8) & 0xFF; - r[18] = a->n[2] & 0xFF; - r[19] = (a->n[1] >> 44) & 0xFF; - r[20] = (a->n[1] >> 36) & 0xFF; - r[21] = (a->n[1] >> 28) & 0xFF; - r[22] = (a->n[1] >> 20) & 0xFF; - r[23] = (a->n[1] >> 12) & 0xFF; - r[24] = (a->n[1] >> 4) & 0xFF; - r[25] = ((a->n[0] >> 48) & 0xF) | ((a->n[1] & 0xF) << 4); - r[26] = (a->n[0] >> 40) & 0xFF; - r[27] = (a->n[0] >> 32) & 0xFF; - r[28] = (a->n[0] >> 24) & 0xFF; - r[29] = (a->n[0] >> 16) & 0xFF; - r[30] = (a->n[0] >> 8) & 0xFF; - r[31] = a->n[0] & 0xFF; -} - -SECP256K1_INLINE static void secp256k1_fe_negate(secp256k1_fe *r, const secp256k1_fe *a, int m) { -#ifdef VERIFY - VERIFY_CHECK(a->magnitude <= m); - secp256k1_fe_verify(a); -#endif - r->n[0] = 0xFFFFEFFFFFC2FULL * 2 * (m + 1) - a->n[0]; - r->n[1] = 0xFFFFFFFFFFFFFULL * 2 * (m + 1) - a->n[1]; - r->n[2] = 0xFFFFFFFFFFFFFULL * 2 * (m + 1) - a->n[2]; - r->n[3] = 0xFFFFFFFFFFFFFULL * 2 * (m + 1) - a->n[3]; - r->n[4] = 0x0FFFFFFFFFFFFULL * 2 * (m + 1) - a->n[4]; -#ifdef VERIFY - r->magnitude = m + 1; - r->normalized = 0; - secp256k1_fe_verify(r); -#endif -} - -SECP256K1_INLINE static void secp256k1_fe_mul_int(secp256k1_fe *r, int a) { - r->n[0] *= a; - r->n[1] *= a; - r->n[2] *= a; - r->n[3] *= a; - r->n[4] *= a; -#ifdef VERIFY - r->magnitude *= a; - r->normalized = 0; - secp256k1_fe_verify(r); -#endif -} - -SECP256K1_INLINE static void secp256k1_fe_add(secp256k1_fe *r, const secp256k1_fe *a) { -#ifdef VERIFY - secp256k1_fe_verify(a); -#endif - r->n[0] += a->n[0]; - r->n[1] += a->n[1]; - r->n[2] += a->n[2]; - r->n[3] += a->n[3]; - r->n[4] += a->n[4]; -#ifdef VERIFY - r->magnitude += a->magnitude; - r->normalized = 0; - secp256k1_fe_verify(r); -#endif -} - -static void secp256k1_fe_mul(secp256k1_fe *r, const secp256k1_fe *a, const secp256k1_fe * SECP256K1_RESTRICT b) { -#ifdef VERIFY - VERIFY_CHECK(a->magnitude <= 8); - VERIFY_CHECK(b->magnitude <= 8); - secp256k1_fe_verify(a); - secp256k1_fe_verify(b); - VERIFY_CHECK(r != b); - VERIFY_CHECK(a != b); -#endif - secp256k1_fe_mul_inner(r->n, a->n, b->n); -#ifdef VERIFY - r->magnitude = 1; - r->normalized = 0; - secp256k1_fe_verify(r); -#endif -} - -static void secp256k1_fe_sqr(secp256k1_fe *r, const secp256k1_fe *a) { -#ifdef VERIFY - VERIFY_CHECK(a->magnitude <= 8); - secp256k1_fe_verify(a); -#endif - secp256k1_fe_sqr_inner(r->n, a->n); -#ifdef VERIFY - r->magnitude = 1; - r->normalized = 0; - secp256k1_fe_verify(r); -#endif -} - -static SECP256K1_INLINE void secp256k1_fe_cmov(secp256k1_fe *r, const secp256k1_fe *a, int flag) { - uint64_t mask0, mask1; - VG_CHECK_VERIFY(r->n, sizeof(r->n)); - mask0 = flag + ~((uint64_t)0); - mask1 = ~mask0; - r->n[0] = (r->n[0] & mask0) | (a->n[0] & mask1); - r->n[1] = (r->n[1] & mask0) | (a->n[1] & mask1); - r->n[2] = (r->n[2] & mask0) 
| (a->n[2] & mask1); - r->n[3] = (r->n[3] & mask0) | (a->n[3] & mask1); - r->n[4] = (r->n[4] & mask0) | (a->n[4] & mask1); -#ifdef VERIFY - if (flag) { - r->magnitude = a->magnitude; - r->normalized = a->normalized; - } -#endif -} - -static SECP256K1_INLINE void secp256k1_fe_storage_cmov(secp256k1_fe_storage *r, const secp256k1_fe_storage *a, int flag) { - uint64_t mask0, mask1; - VG_CHECK_VERIFY(r->n, sizeof(r->n)); - mask0 = flag + ~((uint64_t)0); - mask1 = ~mask0; - r->n[0] = (r->n[0] & mask0) | (a->n[0] & mask1); - r->n[1] = (r->n[1] & mask0) | (a->n[1] & mask1); - r->n[2] = (r->n[2] & mask0) | (a->n[2] & mask1); - r->n[3] = (r->n[3] & mask0) | (a->n[3] & mask1); -} - -static void secp256k1_fe_to_storage(secp256k1_fe_storage *r, const secp256k1_fe *a) { -#ifdef VERIFY - VERIFY_CHECK(a->normalized); -#endif - r->n[0] = a->n[0] | a->n[1] << 52; - r->n[1] = a->n[1] >> 12 | a->n[2] << 40; - r->n[2] = a->n[2] >> 24 | a->n[3] << 28; - r->n[3] = a->n[3] >> 36 | a->n[4] << 16; -} - -static SECP256K1_INLINE void secp256k1_fe_from_storage(secp256k1_fe *r, const secp256k1_fe_storage *a) { - r->n[0] = a->n[0] & 0xFFFFFFFFFFFFFULL; - r->n[1] = a->n[0] >> 52 | ((a->n[1] << 12) & 0xFFFFFFFFFFFFFULL); - r->n[2] = a->n[1] >> 40 | ((a->n[2] << 24) & 0xFFFFFFFFFFFFFULL); - r->n[3] = a->n[2] >> 28 | ((a->n[3] << 36) & 0xFFFFFFFFFFFFFULL); - r->n[4] = a->n[3] >> 16; -#ifdef VERIFY - r->magnitude = 1; - r->normalized = 1; -#endif -} - -static void secp256k1_fe_from_signed62(secp256k1_fe *r, const secp256k1_modinv64_signed62 *a) { - const uint64_t M52 = UINT64_MAX >> 12; - const uint64_t a0 = a->v[0], a1 = a->v[1], a2 = a->v[2], a3 = a->v[3], a4 = a->v[4]; - - /* The output from secp256k1_modinv64{_var} should be normalized to range [0,modulus), and - * have limbs in [0,2^62). The modulus is < 2^256, so the top limb must be below 2^(256-62*4). 
- */ - VERIFY_CHECK(a0 >> 62 == 0); - VERIFY_CHECK(a1 >> 62 == 0); - VERIFY_CHECK(a2 >> 62 == 0); - VERIFY_CHECK(a3 >> 62 == 0); - VERIFY_CHECK(a4 >> 8 == 0); - - r->n[0] = a0 & M52; - r->n[1] = (a0 >> 52 | a1 << 10) & M52; - r->n[2] = (a1 >> 42 | a2 << 20) & M52; - r->n[3] = (a2 >> 32 | a3 << 30) & M52; - r->n[4] = (a3 >> 22 | a4 << 40); - -#ifdef VERIFY - r->magnitude = 1; - r->normalized = 1; - secp256k1_fe_verify(r); -#endif -} - -static void secp256k1_fe_to_signed62(secp256k1_modinv64_signed62 *r, const secp256k1_fe *a) { - const uint64_t M62 = UINT64_MAX >> 2; - const uint64_t a0 = a->n[0], a1 = a->n[1], a2 = a->n[2], a3 = a->n[3], a4 = a->n[4]; - -#ifdef VERIFY - VERIFY_CHECK(a->normalized); -#endif - - r->v[0] = (a0 | a1 << 52) & M62; - r->v[1] = (a1 >> 10 | a2 << 42) & M62; - r->v[2] = (a2 >> 20 | a3 << 32) & M62; - r->v[3] = (a3 >> 30 | a4 << 22) & M62; - r->v[4] = a4 >> 40; -} - -static const secp256k1_modinv64_modinfo secp256k1_const_modinfo_fe = { - {{-0x1000003D1LL, 0, 0, 0, 256}}, - 0x27C7F6E22DDACACFLL -}; - -static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *x) { - secp256k1_fe tmp; - secp256k1_modinv64_signed62 s; - - tmp = *x; - secp256k1_fe_normalize(&tmp); - secp256k1_fe_to_signed62(&s, &tmp); - secp256k1_modinv64(&s, &secp256k1_const_modinfo_fe); - secp256k1_fe_from_signed62(r, &s); - -#ifdef VERIFY - VERIFY_CHECK(secp256k1_fe_normalizes_to_zero(r) == secp256k1_fe_normalizes_to_zero(&tmp)); -#endif -} - -static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *x) { - secp256k1_fe tmp; - secp256k1_modinv64_signed62 s; - - tmp = *x; - secp256k1_fe_normalize_var(&tmp); - secp256k1_fe_to_signed62(&s, &tmp); - secp256k1_modinv64_var(&s, &secp256k1_const_modinfo_fe); - secp256k1_fe_from_signed62(r, &s); - -#ifdef VERIFY - VERIFY_CHECK(secp256k1_fe_normalizes_to_zero(r) == secp256k1_fe_normalizes_to_zero(&tmp)); -#endif -} - -#endif /* SECP256K1_FIELD_REPR_IMPL_H */ diff --git a/src/field_5x52_int128_impl.h b/src/field_5x52_int128_impl.h deleted file mode 100644 index 314002ee39..0000000000 --- a/src/field_5x52_int128_impl.h +++ /dev/null @@ -1,279 +0,0 @@ -/*********************************************************************** - * Copyright (c) 2013, 2014 Pieter Wuille * - * Distributed under the MIT software license, see the accompanying * - * file COPYING or https://www.opensource.org/licenses/mit-license.php.* - ***********************************************************************/ - -#ifndef SECP256K1_FIELD_INNER5X52_IMPL_H -#define SECP256K1_FIELD_INNER5X52_IMPL_H - -#include - -#ifdef VERIFY -#define VERIFY_BITS(x, n) VERIFY_CHECK(((x) >> (n)) == 0) -#else -#define VERIFY_BITS(x, n) do { } while(0) -#endif - -SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b) { - uint128_t c, d; - uint64_t t3, t4, tx, u0; - uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4]; - const uint64_t M = 0xFFFFFFFFFFFFFULL, R = 0x1000003D10ULL; - - VERIFY_BITS(a[0], 56); - VERIFY_BITS(a[1], 56); - VERIFY_BITS(a[2], 56); - VERIFY_BITS(a[3], 56); - VERIFY_BITS(a[4], 52); - VERIFY_BITS(b[0], 56); - VERIFY_BITS(b[1], 56); - VERIFY_BITS(b[2], 56); - VERIFY_BITS(b[3], 56); - VERIFY_BITS(b[4], 52); - VERIFY_CHECK(r != b); - VERIFY_CHECK(a != b); - - /* [... a b c] is a shorthand for ... + a<<104 + b<<52 + c<<0 mod n. - * for 0 <= x <= 4, px is a shorthand for sum(a[i]*b[x-i], i=0..x). 
- * for 4 <= x <= 8, px is a shorthand for sum(a[i]*b[x-i], i=(x-4)..4) - * Note that [x 0 0 0 0 0] = [x*R]. - */ - - d = (uint128_t)a0 * b[3] - + (uint128_t)a1 * b[2] - + (uint128_t)a2 * b[1] - + (uint128_t)a3 * b[0]; - VERIFY_BITS(d, 114); - /* [d 0 0 0] = [p3 0 0 0] */ - c = (uint128_t)a4 * b[4]; - VERIFY_BITS(c, 112); - /* [c 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */ - d += (c & M) * R; c >>= 52; - VERIFY_BITS(d, 115); - VERIFY_BITS(c, 60); - /* [c 0 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */ - t3 = d & M; d >>= 52; - VERIFY_BITS(t3, 52); - VERIFY_BITS(d, 63); - /* [c 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */ - - d += (uint128_t)a0 * b[4] - + (uint128_t)a1 * b[3] - + (uint128_t)a2 * b[2] - + (uint128_t)a3 * b[1] - + (uint128_t)a4 * b[0]; - VERIFY_BITS(d, 115); - /* [c 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */ - d += c * R; - VERIFY_BITS(d, 116); - /* [d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */ - t4 = d & M; d >>= 52; - VERIFY_BITS(t4, 52); - VERIFY_BITS(d, 64); - /* [d t4 t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */ - tx = (t4 >> 48); t4 &= (M >> 4); - VERIFY_BITS(tx, 4); - VERIFY_BITS(t4, 48); - /* [d t4+(tx<<48) t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */ - - c = (uint128_t)a0 * b[0]; - VERIFY_BITS(c, 112); - /* [d t4+(tx<<48) t3 0 0 c] = [p8 0 0 0 p4 p3 0 0 p0] */ - d += (uint128_t)a1 * b[4] - + (uint128_t)a2 * b[3] - + (uint128_t)a3 * b[2] - + (uint128_t)a4 * b[1]; - VERIFY_BITS(d, 115); - /* [d t4+(tx<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */ - u0 = d & M; d >>= 52; - VERIFY_BITS(u0, 52); - VERIFY_BITS(d, 63); - /* [d u0 t4+(tx<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */ - /* [d 0 t4+(tx<<48)+(u0<<52) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */ - u0 = (u0 << 4) | tx; - VERIFY_BITS(u0, 56); - /* [d 0 t4+(u0<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */ - c += (uint128_t)u0 * (R >> 4); - VERIFY_BITS(c, 115); - /* [d 0 t4 t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */ - r[0] = c & M; c >>= 52; - VERIFY_BITS(r[0], 52); - VERIFY_BITS(c, 61); - /* [d 0 t4 t3 0 c r0] = [p8 0 0 p5 p4 p3 0 0 p0] */ - - c += (uint128_t)a0 * b[1] - + (uint128_t)a1 * b[0]; - VERIFY_BITS(c, 114); - /* [d 0 t4 t3 0 c r0] = [p8 0 0 p5 p4 p3 0 p1 p0] */ - d += (uint128_t)a2 * b[4] - + (uint128_t)a3 * b[3] - + (uint128_t)a4 * b[2]; - VERIFY_BITS(d, 114); - /* [d 0 t4 t3 0 c r0] = [p8 0 p6 p5 p4 p3 0 p1 p0] */ - c += (d & M) * R; d >>= 52; - VERIFY_BITS(c, 115); - VERIFY_BITS(d, 62); - /* [d 0 0 t4 t3 0 c r0] = [p8 0 p6 p5 p4 p3 0 p1 p0] */ - r[1] = c & M; c >>= 52; - VERIFY_BITS(r[1], 52); - VERIFY_BITS(c, 63); - /* [d 0 0 t4 t3 c r1 r0] = [p8 0 p6 p5 p4 p3 0 p1 p0] */ - - c += (uint128_t)a0 * b[2] - + (uint128_t)a1 * b[1] - + (uint128_t)a2 * b[0]; - VERIFY_BITS(c, 114); - /* [d 0 0 t4 t3 c r1 r0] = [p8 0 p6 p5 p4 p3 p2 p1 p0] */ - d += (uint128_t)a3 * b[4] - + (uint128_t)a4 * b[3]; - VERIFY_BITS(d, 114); - /* [d 0 0 t4 t3 c t1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */ - c += (d & M) * R; d >>= 52; - VERIFY_BITS(c, 115); - VERIFY_BITS(d, 62); - /* [d 0 0 0 t4 t3 c r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */ - - /* [d 0 0 0 t4 t3 c r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */ - r[2] = c & M; c >>= 52; - VERIFY_BITS(r[2], 52); - VERIFY_BITS(c, 63); - /* [d 0 0 0 t4 t3+c r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */ - c += d * R + t3; - VERIFY_BITS(c, 100); - /* [t4 c r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */ - r[3] = c & M; c >>= 52; - VERIFY_BITS(r[3], 52); - VERIFY_BITS(c, 48); - /* [t4+c r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */ - c += t4; - VERIFY_BITS(c, 49); - /* [c r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */ - r[4] = c; - 
VERIFY_BITS(r[4], 49); - /* [r4 r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */ -} - -SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t *a) { - uint128_t c, d; - uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4]; - int64_t t3, t4, tx, u0; - const uint64_t M = 0xFFFFFFFFFFFFFULL, R = 0x1000003D10ULL; - - VERIFY_BITS(a[0], 56); - VERIFY_BITS(a[1], 56); - VERIFY_BITS(a[2], 56); - VERIFY_BITS(a[3], 56); - VERIFY_BITS(a[4], 52); - - /** [... a b c] is a shorthand for ... + a<<104 + b<<52 + c<<0 mod n. - * px is a shorthand for sum(a[i]*a[x-i], i=0..x). - * Note that [x 0 0 0 0 0] = [x*R]. - */ - - d = (uint128_t)(a0*2) * a3 - + (uint128_t)(a1*2) * a2; - VERIFY_BITS(d, 114); - /* [d 0 0 0] = [p3 0 0 0] */ - c = (uint128_t)a4 * a4; - VERIFY_BITS(c, 112); - /* [c 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */ - d += (c & M) * R; c >>= 52; - VERIFY_BITS(d, 115); - VERIFY_BITS(c, 60); - /* [c 0 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */ - t3 = d & M; d >>= 52; - VERIFY_BITS(t3, 52); - VERIFY_BITS(d, 63); - /* [c 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */ - - a4 *= 2; - d += (uint128_t)a0 * a4 - + (uint128_t)(a1*2) * a3 - + (uint128_t)a2 * a2; - VERIFY_BITS(d, 115); - /* [c 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */ - d += c * R; - VERIFY_BITS(d, 116); - /* [d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */ - t4 = d & M; d >>= 52; - VERIFY_BITS(t4, 52); - VERIFY_BITS(d, 64); - /* [d t4 t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */ - tx = (t4 >> 48); t4 &= (M >> 4); - VERIFY_BITS(tx, 4); - VERIFY_BITS(t4, 48); - /* [d t4+(tx<<48) t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */ - - c = (uint128_t)a0 * a0; - VERIFY_BITS(c, 112); - /* [d t4+(tx<<48) t3 0 0 c] = [p8 0 0 0 p4 p3 0 0 p0] */ - d += (uint128_t)a1 * a4 - + (uint128_t)(a2*2) * a3; - VERIFY_BITS(d, 114); - /* [d t4+(tx<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */ - u0 = d & M; d >>= 52; - VERIFY_BITS(u0, 52); - VERIFY_BITS(d, 62); - /* [d u0 t4+(tx<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */ - /* [d 0 t4+(tx<<48)+(u0<<52) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */ - u0 = (u0 << 4) | tx; - VERIFY_BITS(u0, 56); - /* [d 0 t4+(u0<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */ - c += (uint128_t)u0 * (R >> 4); - VERIFY_BITS(c, 113); - /* [d 0 t4 t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */ - r[0] = c & M; c >>= 52; - VERIFY_BITS(r[0], 52); - VERIFY_BITS(c, 61); - /* [d 0 t4 t3 0 c r0] = [p8 0 0 p5 p4 p3 0 0 p0] */ - - a0 *= 2; - c += (uint128_t)a0 * a1; - VERIFY_BITS(c, 114); - /* [d 0 t4 t3 0 c r0] = [p8 0 0 p5 p4 p3 0 p1 p0] */ - d += (uint128_t)a2 * a4 - + (uint128_t)a3 * a3; - VERIFY_BITS(d, 114); - /* [d 0 t4 t3 0 c r0] = [p8 0 p6 p5 p4 p3 0 p1 p0] */ - c += (d & M) * R; d >>= 52; - VERIFY_BITS(c, 115); - VERIFY_BITS(d, 62); - /* [d 0 0 t4 t3 0 c r0] = [p8 0 p6 p5 p4 p3 0 p1 p0] */ - r[1] = c & M; c >>= 52; - VERIFY_BITS(r[1], 52); - VERIFY_BITS(c, 63); - /* [d 0 0 t4 t3 c r1 r0] = [p8 0 p6 p5 p4 p3 0 p1 p0] */ - - c += (uint128_t)a0 * a2 - + (uint128_t)a1 * a1; - VERIFY_BITS(c, 114); - /* [d 0 0 t4 t3 c r1 r0] = [p8 0 p6 p5 p4 p3 p2 p1 p0] */ - d += (uint128_t)a3 * a4; - VERIFY_BITS(d, 114); - /* [d 0 0 t4 t3 c r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */ - c += (d & M) * R; d >>= 52; - VERIFY_BITS(c, 115); - VERIFY_BITS(d, 62); - /* [d 0 0 0 t4 t3 c r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */ - r[2] = c & M; c >>= 52; - VERIFY_BITS(r[2], 52); - VERIFY_BITS(c, 63); - /* [d 0 0 0 t4 t3+c r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */ - - c += d * R + t3; - VERIFY_BITS(c, 100); - /* [t4 c r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */ - r[3] = c & 
M; c >>= 52; - VERIFY_BITS(r[3], 52); - VERIFY_BITS(c, 48); - /* [t4+c r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */ - c += t4; - VERIFY_BITS(c, 49); - /* [c r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */ - r[4] = c; - VERIFY_BITS(r[4], 49); - /* [r4 r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */ -} - -#endif /* SECP256K1_FIELD_INNER5X52_IMPL_H */ diff --git a/src/field_5x52.h b/src/field_5x64.h similarity index 76% rename from src/field_5x52.h rename to src/field_5x64.h index 50ee3f9ec9..68897e81cb 100644 --- a/src/field_5x52.h +++ b/src/field_5x64.h @@ -1,5 +1,5 @@ /*********************************************************************** - * Copyright (c) 2013, 2014 Pieter Wuille * + * Copyright (c) 2021 Pieter Wuille * * Distributed under the MIT software license, see the accompanying * * file COPYING or https://www.opensource.org/licenses/mit-license.php.* ***********************************************************************/ @@ -10,27 +10,31 @@ #include typedef struct { - /* X = sum(i=0..4, n[i]*2^(i*52)) mod p + /* X = sum(i=0..4, n[i]*2^(i*64)) mod p * where p = 2^256 - 0x1000003D1 + * + * Magnitude m implies that n[4] < (magnitude << 34). + * Normalized implies n[4]==0 and X < p. */ uint64_t n[5]; #ifdef VERIFY int magnitude; int normalized; + uint64_t precomputed; /* 64-bit to avoid padding bytes */ #endif } secp256k1_fe; /* Unpacks a constant into a overlapping multi-limbed FE element. */ #define SECP256K1_FE_CONST_INNER(d7, d6, d5, d4, d3, d2, d1, d0) { \ - (d0) | (((uint64_t)(d1) & 0xFFFFFUL) << 32), \ - ((uint64_t)(d1) >> 20) | (((uint64_t)(d2)) << 12) | (((uint64_t)(d3) & 0xFFUL) << 44), \ - ((uint64_t)(d3) >> 8) | (((uint64_t)(d4) & 0xFFFFFFFUL) << 24), \ - ((uint64_t)(d4) >> 28) | (((uint64_t)(d5)) << 4) | (((uint64_t)(d6) & 0xFFFFUL) << 36), \ - ((uint64_t)(d6) >> 16) | (((uint64_t)(d7)) << 16) \ + (d0) | (((uint64_t)(d1)) << 32), \ + (d2) | (((uint64_t)(d3)) << 32), \ + (d4) | (((uint64_t)(d5)) << 32), \ + (d6) | (((uint64_t)(d7)) << 32), \ + 0 \ } #ifdef VERIFY -#define SECP256K1_FE_CONST(d7, d6, d5, d4, d3, d2, d1, d0) {SECP256K1_FE_CONST_INNER((d7), (d6), (d5), (d4), (d3), (d2), (d1), (d0)), 1, 1} +#define SECP256K1_FE_CONST(d7, d6, d5, d4, d3, d2, d1, d0) {SECP256K1_FE_CONST_INNER((d7), (d6), (d5), (d4), (d3), (d2), (d1), (d0)), 1, 1, 1} #else #define SECP256K1_FE_CONST(d7, d6, d5, d4, d3, d2, d1, d0) {SECP256K1_FE_CONST_INNER((d7), (d6), (d5), (d4), (d3), (d2), (d1), (d0))} #endif diff --git a/src/field_5x64_impl.h b/src/field_5x64_impl.h new file mode 100644 index 0000000000..86b8c0481e --- /dev/null +++ b/src/field_5x64_impl.h @@ -0,0 +1,1260 @@ +/*********************************************************************** + * Copyright (c) 2021 Pieter Wuille * + * Distributed under the MIT software license, see the accompanying * + * file COPYING or https://www.opensource.org/licenses/mit-license.php.* + ***********************************************************************/ + +#ifndef SECP256K1_FIELD_REPR_IMPL_H +#define SECP256K1_FIELD_REPR_IMPL_H + +#if defined HAVE_CONFIG_H +#include "libsecp256k1-config.h" +#endif + +#include "util.h" +#include "field.h" +#include "modinv64_impl.h" + +#if defined(USE_EXTERNAL_ASM) +/* External assembler implementation */ +void secp256k1_fe_mul_55to5(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b); +void secp256k1_fe_mul_45to5(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b); +void secp256k1_fe_mul_44to5(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b); +void 
secp256k1_fe_sqr_5to5(uint64_t *r, const uint64_t *a); +void secp256k1_fe_sqr_4to5(uint64_t *r, const uint64_t *a); +void secp256k1_fe_sqr_4to4(uint64_t *r, const uint64_t *a); +#endif + +#ifdef VERIFY +#define ON_VERIFY(x) x +#else +#define ON_VERIFY(x) +#endif + +#ifdef USE_ASM_X86_64 + +/* Add a*b to [c0,c1]. c0,c1 must all be 0 on input. */ +#define mul2(c0,c1,a,b) do {\ + VERIFY_CHECK(c0 == 0); \ + VERIFY_CHECK(c1 == 0); \ + __asm__ ( \ + "mulq %[vb]\n" \ + : [vc0]"=a"(c0), [vc1]"=d"(c1) \ + : [va]"[vc0]"(a), [vb]"rm"(b) \ + : "cc"); \ +} while(0) + +/* Add a**2 to [c0,c1]. c0,c1 must all be 0 on input. */ +#define sqr2(c0,c1,a) do {\ + VERIFY_CHECK(c0 == 0); \ + VERIFY_CHECK(c1 == 0); \ + __asm__ ( \ + "mulq %[va]\n" \ + : [vc0]"=a"(c0), [vc1]"=d"(c1) \ + : [va]"[vc0]"(a) \ + : "cc"); \ +} while(0) + +/* Add a*b to [c0,c1,c2]. c2 must never overflow. */ +#define muladd3(c0,c1,c2,a,b) do {\ + ON_VERIFY(uint64_t old_c2 = c2;) \ + uint64_t ac = (a); \ + __asm__ ( \ + "mulq %[vb]\n" \ + "addq %%rax, %[vc0]\n" \ + "adcq %%rdx, %[vc1]\n" \ + "adcq $0, %[vc2]\n" \ + : [vc0]"+r"(c0), [vc1]"+r"(c1), [vc2]"+r"(c2), [va]"+a"(ac) \ + : [vb]"rm"(b) \ + : "cc", "rdx"); \ + ON_VERIFY(VERIFY_CHECK(c2 >= old_c2);) \ +} while(0) + +/* Add a**2 to [c0,c1,c2]. c2 must never overflow. */ +#define sqradd3(c0,c1,c2,a) do {\ + ON_VERIFY(uint64_t old_c2 = c2;) \ + uint64_t ac = (a); \ + __asm__ ( \ + "mulq %[va]\n" \ + "addq %%rax, %[vc0]\n" \ + "adcq %%rdx, %[vc1]\n" \ + "adcq $0, %[vc2]\n" \ + : [vc0]"+r"(c0), [vc1]"+r"(c1), [vc2]"+r"(c2), [va]"+a"(ac) \ + : \ + : "cc", "rdx"); \ + ON_VERIFY(VERIFY_CHECK(c2 >= old_c2);) \ +} while(0) + +/* Add 2*a*b to [c0,c1,c2]. c2 must never overflow. */ +#define mul2add3(c0,c1,c2,a,b) do {\ + ON_VERIFY(uint64_t old_c2 = c2;) \ + uint64_t ac = (a); \ + __asm__ ( \ + "mulq %[vb]\n" \ + "addq %%rax, %[vc0]\n" \ + "adcq %%rdx, %[vc1]\n" \ + "adcq $0, %[vc2]\n" \ + "addq %%rax, %[vc0]\n" \ + "adcq %%rdx, %[vc1]\n" \ + "adcq $0, %[vc2]\n" \ + : [vc0]"+r"(c0), [vc1]"+r"(c1), [vc2]"+r"(c2), [va]"+a"(ac) \ + : [vb]"rm"(b) \ + : "cc", "rdx"); \ + ON_VERIFY(VERIFY_CHECK(c2 >= old_c2);) \ +} while(0) + +/* Add a*b to [c0,c1]. c1 must never overflow. */ +#define muladd2(c0,c1,a,b) do {\ + ON_VERIFY(uint64_t old_c1 = c1;) \ + uint64_t ac = (a); \ + __asm__ ( \ + "mulq %[vb]\n" \ + "addq %%rax, %[vc0]\n" \ + "adcq %%rdx, %[vc1]\n" \ + : [vc0]"+r"(c0), [vc1]"+r"(c1), [va]"+a"(ac) \ + : [vb]"rm"(b) \ + : "cc", "rdx"); \ + ON_VERIFY(VERIFY_CHECK(c1 >= old_c1);) \ +} while(0) + +/* Add a**2 to [c0,c1. c1 must never overflow. */ +#define sqradd2(c0,c1,a) do {\ + ON_VERIFY(uint64_t old_c1 = c1;) \ + uint64_t ac = (a); \ + __asm__ ( \ + "mulq %[va]\n" \ + "addq %%rax, %[vc0]\n" \ + "adcq %%rdx, %[vc1]\n" \ + : [vc0]"+r"(c0), [vc1]"+r"(c1), [va]"+a"(ac) \ + : \ + : "cc", "rdx"); \ + ON_VERIFY(VERIFY_CHECK(c1 >= old_c1);) \ +} while(0) + +/* Add [a0,a1,a2,a3,a4] t0 [c0,c1,c2,c3,c4]. C4 cannot overflow. */ +#define add5x5(c0,c1,c2,c3,c4,a0,a1,a2,a3,a4) do {\ + ON_VERIFY(uint64_t old_c4 = c4;) \ + __asm__ ( \ + "addq %[va0], %[vc0]\n" \ + "adcq %[va1], %[vc1]\n" \ + "adcq %[va2], %[vc2]\n" \ + "adcq %[va3], %[vc3]\n" \ + "adcq %[va4], %[vc4]\n" \ + : [vc0]"+r"(c0), [vc1]"+r"(c1), [vc2]"+r"(c2), [vc3]"+r"(c3), [vc4]"+r"(c4) \ + : [va0]"rm"(a0), [va1]"rm"(a1), [va2]"rm"(a2), [va3]"rm"(a3), [va4]"rm"(a4) \ + : "cc" ); \ + ON_VERIFY(VERIFY_CHECK(c4 >= old_c4);) \ +} while(0) + +/* Add a to [c0,c1,c2,c3]. c3 must never overflow. 
*/ +#define add4(c0,c1,c2,c3,a) do {\ + ON_VERIFY(uint64_t old_c3 = c3;) \ + __asm__ ( \ + "addq %[va], %[vc0]\n" \ + "adcq $0, %[vc1]\n" \ + "adcq $0, %[vc2]\n" \ + "adcq $0, %[vc3]\n" \ + : [vc0]"+r"(c0), [vc1]"+r"(c1), [vc2]"+r"(c2), [vc3]"+r"(c3) \ + : [va]"rm"(a) \ + : "cc" ); \ + ON_VERIFY(VERIFY_CHECK(c3 >= old_c3);) \ +} while(0) + +/* Add a to [c0,c1,c2,c3]. c3 may overflow. */ +#define add4o(c0,c1,c2,c3,a) do {\ + __asm__ ( \ + "addq %[va], %[vc0]\n" \ + "adcq $0, %[vc1]\n" \ + "adcq $0, %[vc2]\n" \ + "adcq $0, %[vc3]\n" \ + : [vc0]"+r"(c0), [vc1]"+r"(c1), [vc2]"+r"(c2), [vc3]"+r"(c3) \ + : [va]"rm"(a) \ + : "cc" ); \ +} while(0) + + +/* Add a to [c0,c1,c2]. c2 must never overflow. */ +#define add3(c0,c1,c2,a) do {\ + ON_VERIFY(uint64_t old_c2 = c2;) \ + __asm__ ( \ + "addq %[va], %[vc0]\n" \ + "adcq $0, %[vc1]\n" \ + "adcq $0, %[vc2]\n" \ + : [vc0]"+r"(c0), [vc1]"+r"(c1), [vc2]"+r"(c2) \ + : [va]"rm"(a) \ + : "cc" ); \ + ON_VERIFY(VERIFY_CHECK(c2 >= old_c2);) \ +} while(0) + +/* Add a to [c0,c1]. c1 must never overflow. */ +#define add2(c0,c1,a) do {\ + ON_VERIFY(uint64_t old_c1 = c1;) \ + __asm__ ( \ + "addq %[va], %[vc0]\n" \ + "adcq $0, %[vc1]\n" \ + : [vc0]"+r"(c0), [vc1]"+r"(c1) \ + : [va]"rm"(a) \ + : "cc" ); \ + ON_VERIFY(VERIFY_CHECK(c1 >= old_c1);) \ +} while(0) + +/* Subtract a from [c0,c1]. c1 must never underflow. */ +#define sub2(c0,c1,a) do {\ + ON_VERIFY(uint64_t old_c1 = c1;) \ + __asm__ ( \ + "subq %[va], %[vc0]\n" \ + "sbbq $0, %[vc1]\n" \ + : [vc0]"+r"(c0), [vc1]"+r"(c1) \ + : [va]"rm"(a) \ + : "cc" ); \ + ON_VERIFY(VERIFY_CHECK(c1 <= old_c1);) \ +} while(0) + +#else + +/* Fallback using uint128_t. */ + +/* Add a*b to [c0,c1]. c0,c1 must all be 0 on input. */ +#define mul2(c0,c1,a,b) do {\ + uint128_t t = (uint128_t)(a) * (b); \ + VERIFY_CHECK(c0 == 0); \ + VERIFY_CHECK(c1 == 0); \ + c0 = t; \ + c1 = t >> 64; \ +} while(0) + +/* Add a**2 to [c0,c1]. c0,c1 must all be 0 on input. */ +#define sqr2(c0,c1,a) do {\ + uint128_t t = (uint128_t)(a) * (a); \ + VERIFY_CHECK(c0 == 0); \ + VERIFY_CHECK(c1 == 0); \ + c0 = t; \ + c1 = t >> 64; \ +} while(0) + +/* Add a*b to [c0,c1,c2]. c2 must never overflow. */ +#define muladd3(c0,c1,c2,a,b) do {\ + uint64_t tl, th; \ + { \ + uint128_t t = (uint128_t)(a) * (b); \ + th = t >> 64; /* at most 0xFFFFFFFFFFFFFFFE */ \ + tl = t; \ + } \ + c0 += tl; /* overflow is handled on the next line */ \ + th += (c0 < tl); /* at most 0xFFFFFFFFFFFFFFFF */ \ + c1 += th; /* overflow is handled on the next line */ \ + c2 += (c1 < th); /* never overflows by contract (verified in the next line) */ \ + VERIFY_CHECK((c1 >= th) || (c2 != 0)); \ +} while(0) + +/* Add a**2 to [c0,c1,c2]. c2 must never overflow. */ +#define sqradd3(c0,c1,c2,a) do {\ + uint64_t tl, th; \ + { \ + uint128_t t = (uint128_t)(a) * (a); \ + th = t >> 64; /* at most 0xFFFFFFFFFFFFFFFE */ \ + tl = t; \ + } \ + c0 += tl; /* overflow is handled on the next line */ \ + th += (c0 < tl); /* at most 0xFFFFFFFFFFFFFFFF */ \ + c1 += th; /* overflow is handled on the next line */ \ + c2 += (c1 < th); /* never overflows by contract (verified in the next line) */ \ + VERIFY_CHECK((c1 >= th) || (c2 != 0)); \ +} while(0) + +/* Add 2*a*b to [c0,c1,c2]. c2 must never overflow. 
*/ +#define mul2add3(c0,c1,c2,a,b) do {\ + uint64_t tl, th, th2, tl2; \ + { \ + uint128_t t = (uint128_t)(a) * (b); \ + th = t >> 64; /* at most 0xFFFFFFFFFFFFFFFE */ \ + tl = t; \ + } \ + th2 = th + th; /* at most 0xFFFFFFFFFFFFFFFE (in case th was 0x7FFFFFFFFFFFFFFF) */ \ + c2 += (th2 < th); /* never overflows by contract (verified the next line) */ \ + VERIFY_CHECK((th2 >= th) || (c2 != 0)); \ + tl2 = tl + tl; /* at most 0xFFFFFFFFFFFFFFFE (in case the lowest 63 bits of tl were 0x7FFFFFFFFFFFFFFF) */ \ + th2 += (tl2 < tl); /* at most 0xFFFFFFFFFFFFFFFF */ \ + c0 += tl2; /* overflow is handled on the next line */ \ + th2 += (c0 < tl2); /* second overflow is handled on the next line */ \ + c2 += (c0 < tl2) & (th2 == 0); /* never overflows by contract (verified the next line) */ \ + VERIFY_CHECK((c0 >= tl2) || (th2 != 0) || (c2 != 0)); \ + c1 += th2; /* overflow is handled on the next line */ \ + c2 += (c1 < th2); /* never overflows by contract (verified the next line) */ \ + VERIFY_CHECK((c1 >= th2) || (c2 != 0)); \ +} while(0) + +/* Add a*b to [c0,c1]. c1 must never overflow. */ +#define muladd2(c0,c1,a,b) do {\ + uint64_t tl, th; \ + ON_VERIFY(uint64_t old_c1 = c1;) \ + { \ + uint128_t t = (uint128_t)(a) * (b); \ + th = t >> 64; /* at most 0xFFFFFFFFFFFFFFFE */ \ + tl = t; \ + } \ + c0 += tl; /* overflow is handled on the next line */ \ + th += (c0 < tl); /* at most 0xFFFFFFFFFFFFFFFF */ \ + c1 += th; /* overflow is handled on the next line */ \ + ON_VERIFY(VERIFY_CHECK(c1 >= old_c1);) \ +} while(0) + +/* Add a**2 to [c0,c1. c1 must never overflow. */ +#define sqradd2(c0,c1,a) do {\ + uint64_t tl, th; \ + ON_VERIFY(uint64_t old_c1 = c1;) \ + { \ + uint128_t t = (uint128_t)(a) * (a); \ + th = t >> 64; /* at most 0xFFFFFFFFFFFFFFFE */ \ + tl = t; \ + } \ + c0 += tl; /* overflow is handled on the next line */ \ + th += (c0 < tl); /* at most 0xFFFFFFFFFFFFFFFF */ \ + c1 += th; /* overflow is handled on the next line */ \ + ON_VERIFY(VERIFY_CHECK(c1 >= old_c1);) \ +} while(0) + +/* Add [a0,a1,a2,a3,a4] t0 [c0,c1,c2,c3,c4]. C4 cannot overflow. */ +#define add5x5(c0,c1,c2,c3,c4,a0,a1,a2,a3,a4) do {\ + uint128_t tmp = (uint128_t)c0 + (a0); \ + c0 = tmp; tmp >>= 64; \ + tmp += c1; tmp += (a1); \ + c1 = tmp; tmp >>= 64; \ + tmp += c2; tmp += (a2); \ + c2 = tmp; tmp >>= 64; \ + tmp += c3; tmp += (a3); \ + c3 = tmp; tmp >>= 64; \ + tmp += c4; tmp += (a4); \ + c4 = tmp; \ + VERIFY_CHECK((tmp >> 64) == 0); \ +} while(0) + +/* Add a to [c0,c1,c2,c3]. c3 must never overflow. */ +#define add4(c0,c1,c2,c3,a) do {\ + uint128_t tmp = (uint128_t)c0 + (a); \ + c0 = tmp; tmp >>= 64; \ + tmp += c1; \ + c1 = tmp; tmp >>= 64; \ + tmp += c2; \ + c2 = tmp; tmp >>= 64; \ + tmp += c3; \ + c3 = tmp; \ + VERIFY_CHECK((tmp >> 64) == 0); \ +} while(0) + +/* Add a to [c0,c1,c2,c3]. c3 may overflow. */ +#define add4o(c0,c1,c2,c3,a) do {\ + uint128_t tmp = (uint128_t)c0 + (a); \ + c0 = tmp; tmp >>= 64; \ + tmp += c1; \ + c1 = tmp; tmp >>= 64; \ + tmp += c2; \ + c2 = tmp; tmp >>= 64; \ + tmp += c3; \ + c3 = tmp; \ +} while(0) + + +/* Add a to [c0,c1,c2]. c2 must never overflow. */ +#define add3(c0,c1,c2,a) do {\ + uint128_t tmp = (uint128_t)c0 + (a); \ + c0 = tmp; tmp >>= 64; \ + tmp += c1; \ + c1 = tmp; tmp >>= 64; \ + tmp += c2; \ + c2 = tmp; \ + VERIFY_CHECK((tmp >> 64) == 0); \ +} while(0) + +/* Add a to [c0,c1]. c1 must never overflow. 
*/ +#define add2(c0,c1,a) do {\ + uint128_t tmp = (uint128_t)c0 + (a); \ + c0 = tmp; tmp >>= 64; \ + tmp += c1; \ + c1 = tmp; \ + VERIFY_CHECK((tmp >> 64) == 0); \ +} while(0) + +/* Subtract a from [c0,c1]. c1 must never underflow. */ +#define sub2(c0,c1,a) do {\ + int128_t tmp = (int128_t)c0 - (a); \ + c0 = tmp; tmp >>= 64; \ + tmp += c1; \ + c1 = tmp; \ + VERIFY_CHECK((tmp >> 64) == 0); \ +} while(0) + +#endif + +#ifdef VERIFY +static void secp256k1_fe_verify(const secp256k1_fe *a) { + VERIFY_CHECK(a->magnitude >= 0); + VERIFY_CHECK(a->magnitude <= 2048); + if (a->normalized) { + VERIFY_CHECK(a->n[4] == 0); + if (~(a->n[0] & a->n[1] & a->n[2] & a->n[3]) == 0) { + VERIFY_CHECK(a->n[0] <= 0xFFFFFFFEFFFFFC2FULL); + } + VERIFY_CHECK(a->magnitude <= 1); + } else { + VERIFY_CHECK(a->n[4] <= (((uint64_t)a->magnitude) << 34)); + } +} +#endif + +static void secp256k1_fe_normalize(secp256k1_fe *r) { + uint64_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4]; + uint64_t c0 = 0, c1 = 0; + + mul2(c0,c1,t4,0x1000003D1ULL); + t4 = 0; + add2(c0,c1,t0); + t0 = c0; + add4(t1,t2,t3,t4,c1); + VERIFY_CHECK(t4 == 0 || t4 == 1); + + c0 = (-(t4 | (((~(t1 & t2 & t3)) == 0) & (t0 >= 0xFFFFFFFEFFFFFC2F)))) & 0x1000003D1ULL; + add4o(t0,t1,t2,t3,c0); + t4 = 0; + + r->n[0] = t0; + r->n[1] = t1; + r->n[2] = t2; + r->n[3] = t3; + r->n[4] = t4; + +#ifdef VERIFY + r->magnitude = 1; + r->precomputed = 1; + r->normalized = 1; + secp256k1_fe_verify(r); +#endif +} + +static void secp256k1_fe_normalize_weak(secp256k1_fe *r) { + secp256k1_fe_normalize(r); +} + +static void secp256k1_fe_normalize_prec(secp256k1_fe *r) { + uint64_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4]; + uint64_t c0 = 0, c1 = 0; + + mul2(c0,c1,t4,0x1000003D1ULL); + t4 = 0; + add2(c0,c1,t0); + t0 = c0; + add4(t1,t2,t3,t4,c1); + VERIFY_CHECK(t4 == 0 || t4 == 1); + + c0 = (-t4) & 0x1000003D1ULL; + add4o(t0,t1,t2,t3,c0); + + r->n[0] = t0; + r->n[1] = t1; + r->n[2] = t2; + r->n[3] = t3; + r->n[4] = 0; + +#ifdef VERIFY + r->magnitude = 1; + r->precomputed = 1; + secp256k1_fe_verify(r); +#endif +} + +static void secp256k1_fe_normalize_weak_prec(secp256k1_fe *r) { + secp256k1_fe_normalize_prec(r); +} + +static void secp256k1_fe_normalize_var(secp256k1_fe *r) { + secp256k1_fe_normalize(r); +} + +static int secp256k1_fe_normalizes_to_zero(const secp256k1_fe *r) { + uint64_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4]; + uint64_t c0 = 0, c1 = 0; + + mul2(c0,c1,t4,0x1000003D1ULL); + t4 = 0; + add2(c0,c1,t0); + t0 = c0; + add4(t1,t2,t3,t4,c1); + VERIFY_CHECK(t4 == 0 || t4 == 1); + + return (t4 == 0) & (((t0 | t1 | t2 | t3) == 0) | ((t0 == 0xFFFFFFFEFFFFFC2F) & ((~(t1 & t2 & t3)) == 0))); +} + +static int secp256k1_fe_normalizes_to_zero_var(const secp256k1_fe *r) { + uint64_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4]; + uint64_t c0 = 0, c1 = 0; + + mul2(c0,c1,t4,0x1000003D1ULL); + t4 = 0; + add2(c0,c1,t0); + t0 = c0; + add4(t1,t2,t3,t4,c1); + VERIFY_CHECK(t4 == 0 || t4 == 1); + + return (t4 == 0) && (((t0 | t1 | t2 | t3) == 0) || ((t0 == 0xFFFFFFFEFFFFFC2F) && ((~(t1 & t2 & t3)) == 0))); +} + +SECP256K1_INLINE static void secp256k1_fe_set_int(secp256k1_fe *r, int a) { + r->n[0] = a; + r->n[1] = r->n[2] = r->n[3] = r->n[4] = 0; +#ifdef VERIFY + r->magnitude = 1; + r->precomputed = 1; + r->normalized = 1; + secp256k1_fe_verify(r); +#endif +} + +SECP256K1_INLINE static int secp256k1_fe_is_zero(const secp256k1_fe *a) { + const uint64_t *t = a->n; +#ifdef VERIFY + 
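/* Illustrative sketch, not code from this patch: the normalize routines above all rely on
 * the identity 2^256 == 0x1000003D1 (mod p). The helper below (hypothetical name, using
 * GCC/Clang's unsigned __int128 in place of the add2/add4/mul2 carry macros) restates the
 * same two steps: fold the small top limb into the low 256 bits, then conditionally add
 * 0x1000003D1 once more, which is a subtraction of p modulo 2^256. It assumes, as the
 * representation guarantees, that t[4] < (magnitude << 34). */
static void fe_normalize_sketch(uint64_t t[5]) {
    const uint64_t R = 0x1000003D1ULL;                    /* R = 2^256 - p */
    unsigned __int128 acc = (unsigned __int128)t[4] * R;  /* t[4]*2^256 == t[4]*R (mod p) */
    uint64_t carry, cond;
    int i;
    for (i = 0; i < 4; i++) {
        acc += t[i];
        t[i] = (uint64_t)acc;
        acc >>= 64;
    }
    carry = (uint64_t)acc; /* 0 or 1: whether the folded value reached 2^256 */
    /* If the folded value is >= 2^256, or its low 256 bits are >= p, add R once more;
     * the wrap modulo 2^256 then performs exactly the subtraction of p. */
    cond = carry | (((t[1] & t[2] & t[3]) == ~(uint64_t)0) & (t[0] >= 0xFFFFFFFEFFFFFC2FULL));
    acc = (0 - cond) & R;
    for (i = 0; i < 4; i++) {
        acc += t[i];
        t[i] = (uint64_t)acc;
        acc >>= 64;
    }
    t[4] = 0; /* fully normalized: value < p, top limb zero */
}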
VERIFY_CHECK(a->normalized); + secp256k1_fe_verify(a); +#endif + return (t[0] | t[1] | t[2] | t[3] | t[4]) == 0; +} + +SECP256K1_INLINE static int secp256k1_fe_is_odd(const secp256k1_fe *a) { +#ifdef VERIFY + VERIFY_CHECK(a->normalized); + secp256k1_fe_verify(a); +#endif + return a->n[0] & 1; +} + +SECP256K1_INLINE static void secp256k1_fe_clear(secp256k1_fe *a) { + int i; +#ifdef VERIFY + a->magnitude = 0; + a->normalized = 1; +#endif + for (i=0; i<5; i++) { + a->n[i] = 0; + } +} + +static int secp256k1_fe_cmp_var(const secp256k1_fe *a, const secp256k1_fe *b) { + int i; +#ifdef VERIFY + VERIFY_CHECK(a->normalized); + VERIFY_CHECK(b->normalized); + secp256k1_fe_verify(a); + secp256k1_fe_verify(b); +#endif + for (i = 3; i >= 0; i--) { + if (a->n[i] > b->n[i]) { + return 1; + } + if (a->n[i] < b->n[i]) { + return -1; + } + } + return 0; +} + +static int secp256k1_fe_set_b32(secp256k1_fe *r, const unsigned char *a) { + int ret; + r->n[0] = (uint64_t)a[31] + | ((uint64_t)a[30] << 8) + | ((uint64_t)a[29] << 16) + | ((uint64_t)a[28] << 24) + | ((uint64_t)a[27] << 32) + | ((uint64_t)a[26] << 40) + | ((uint64_t)a[25] << 48) + | ((uint64_t)a[24] << 56); + r->n[1] = (uint64_t)a[23] + | ((uint64_t)a[22] << 8) + | ((uint64_t)a[21] << 16) + | ((uint64_t)a[20] << 24) + | ((uint64_t)a[19] << 32) + | ((uint64_t)a[18] << 40) + | ((uint64_t)a[17] << 48) + | ((uint64_t)a[16] << 56); + r->n[2] = (uint64_t)a[15] + | ((uint64_t)a[14] << 8) + | ((uint64_t)a[13] << 16) + | ((uint64_t)a[12] << 24) + | ((uint64_t)a[11] << 32) + | ((uint64_t)a[10] << 40) + | ((uint64_t)a[9] << 48) + | ((uint64_t)a[8] << 56); + r->n[3] = (uint64_t)a[7] + | ((uint64_t)a[6] << 8) + | ((uint64_t)a[5] << 16) + | ((uint64_t)a[4] << 24) + | ((uint64_t)a[3] << 32) + | ((uint64_t)a[2] << 40) + | ((uint64_t)a[1] << 48) + | ((uint64_t)a[0] << 56); + r->n[4] = 0; + + ret = !(((r->n[3] & r->n[2] & r->n[1]) == 0xFFFFFFFFFFFFFFFFULL) & (r->n[0] >= 0xFFFFFFFEFFFFFC2FULL)); +#ifdef VERIFY + r->magnitude = 1; + r->precomputed = 1; + r->normalized = ret; + secp256k1_fe_verify(r); +#endif + return ret; +} + +/** Convert a field element to a 32-byte big endian value. 
Requires the input to be normalized */ +static void secp256k1_fe_get_b32(unsigned char *r, const secp256k1_fe *a) { +#ifdef VERIFY + VERIFY_CHECK(a->normalized); + secp256k1_fe_verify(a); +#endif + r[0] = (a->n[3] >> 56) & 0xFF; + r[1] = (a->n[3] >> 48) & 0xFF; + r[2] = (a->n[3] >> 40) & 0xFF; + r[3] = (a->n[3] >> 32) & 0xFF; + r[4] = (a->n[3] >> 24) & 0xFF; + r[5] = (a->n[3] >> 16) & 0xFF; + r[6] = (a->n[3] >> 8) & 0xFF; + r[7] = (a->n[3] >> 0) & 0xFF; + r[8] = (a->n[2] >> 56) & 0xFF; + r[9] = (a->n[2] >> 48) & 0xFF; + r[10] = (a->n[2] >> 40) & 0xFF; + r[11] = (a->n[2] >> 32) & 0xFF; + r[12] = (a->n[2] >> 24) & 0xFF; + r[13] = (a->n[2] >> 16) & 0xFF; + r[14] = (a->n[2] >> 8) & 0xFF; + r[15] = (a->n[2] >> 0) & 0xFF; + r[16] = (a->n[1] >> 56) & 0xFF; + r[17] = (a->n[1] >> 48) & 0xFF; + r[18] = (a->n[1] >> 40) & 0xFF; + r[19] = (a->n[1] >> 32) & 0xFF; + r[20] = (a->n[1] >> 24) & 0xFF; + r[21] = (a->n[1] >> 16) & 0xFF; + r[22] = (a->n[1] >> 8) & 0xFF; + r[23] = (a->n[1] >> 0) & 0xFF; + r[24] = (a->n[0] >> 56) & 0xFF; + r[25] = (a->n[0] >> 48) & 0xFF; + r[26] = (a->n[0] >> 40) & 0xFF; + r[27] = (a->n[0] >> 32) & 0xFF; + r[28] = (a->n[0] >> 24) & 0xFF; + r[29] = (a->n[0] >> 16) & 0xFF; + r[30] = (a->n[0] >> 8) & 0xFF; + r[31] = (a->n[0] >> 0) & 0xFF; +} + +SECP256K1_INLINE static void secp256k1_fe_negate(secp256k1_fe *r, const secp256k1_fe *a, int m) { + uint64_t f = ((uint64_t)(m + 1)) << 34; + uint64_t c0 = 0, c1 = 0, c2 = 0, c3 = 0, c4 = 0; + +#ifdef VERIFY + VERIFY_CHECK(a->magnitude <= m); + secp256k1_fe_verify(a); +#endif + + mul2(c0,c1,f,0xFFFFFFFEFFFFFC2F); + sub2(c0,c1,a->n[0]); + r->n[0] = c0; + c2 += f; + sub2(c1,c2,f); + sub2(c1,c2,a->n[1]); + r->n[1] = c1; + c3 += f; + sub2(c2,c3,f); + sub2(c2,c3,a->n[2]); + r->n[2] = c2; + c4 += f; + sub2(c3,c4,f); + sub2(c3,c4,a->n[3]); + r->n[3] = c3; + VERIFY_CHECK(c4 >= a->n[4]); + r->n[4] = c4 - a->n[4]; + +#ifdef VERIFY + r->magnitude = m + 1; + r->precomputed = 0; + r->normalized = 0; + secp256k1_fe_verify(r); +#endif +} + +SECP256K1_INLINE static void secp256k1_fe_mul_int(secp256k1_fe *r, int a) { + uint64_t c0 = 0, c1 = 0, c2 = 0, c3 = 0, c4 = 0; + uint64_t m = a; + mul2(c0,c1,r->n[0],m); + r->n[0] = c0; + muladd2(c1,c2,r->n[1],m); + r->n[1] = c1; + muladd2(c2,c3,r->n[2],m); + r->n[2] = c2; + muladd2(c3,c4,r->n[3],m); + r->n[3] = c3; + r->n[4] = c4 + (r->n[4] * m); +#ifdef VERIFY + r->magnitude *= a; + r->precomputed = 0; + r->normalized = 0; + secp256k1_fe_verify(r); +#endif +} + +SECP256K1_INLINE static void secp256k1_fe_add(secp256k1_fe *r, const secp256k1_fe *a) { + uint64_t c0 = r->n[0], c1 = r->n[1], c2 = r->n[2], c3 = r->n[3], c4 = r->n[4]; +#ifdef VERIFY + secp256k1_fe_verify(r); + secp256k1_fe_verify(a); +#endif + add5x5(c0,c1,c2,c3,c4,a->n[0],a->n[1],a->n[2],a->n[3],a->n[4]); + r->n[0] = c0; + r->n[1] = c1; + r->n[2] = c2; + r->n[3] = c3; + r->n[4] = c4; +#ifdef VERIFY + r->magnitude += a->magnitude; + r->precomputed = 0; + r->normalized = 0; + secp256k1_fe_verify(r); +#endif +} + +static void secp256k1_fe_mul(secp256k1_fe *r, const secp256k1_fe *a, const secp256k1_fe * SECP256K1_RESTRICT b) { +#ifndef USE_EXTERNAL_ASM + uint64_t a0 = a->n[0], a1 = a->n[1], a2 = a->n[2], a3 = a->n[3], a4 = a->n[4]; + uint64_t b0 = b->n[0], b1 = b->n[1], b2 = b->n[2], b3 = b->n[3], b4 = b->n[4]; + uint64_t c0 = 0, c1 = 0, c2 = 0, c3 = 0, c4 = 0, c5 = 0, c6 = 0, c7 = 0; + uint64_t d0 = 0, d1 = 0, d2 = 0, d3 = 0, d4 = 0; +#endif + +#ifdef VERIFY + VERIFY_CHECK(a->magnitude <= 2047); + VERIFY_CHECK(b->magnitude <= 2047); + secp256k1_fe_verify(a); + 
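/* Illustrative sketch, not code from this patch: the non-asm path of the multiply below first
 * brings each operand into [0,2^256) so both fit in four limbs, computes the 4x4 schoolbook
 * product into eight limbs c0..c7 with the muladd macros, and finally folds c4..c7 back into
 * the low half via 2^256 == 0x1000003D1 (mod p). The helper below (hypothetical name, using
 * unsigned __int128 rather than the carry macros) restates that final fold; r must not alias c. */
static void fe_fold_512_sketch(uint64_t r[5], const uint64_t c[8]) {
    const uint64_t R = 0x1000003D1ULL; /* 2^256 == R (mod p) */
    unsigned __int128 t;
    uint64_t carry = 0;
    int i;
    for (i = 0; i < 4; i++) {
        /* Add c[4+i]*R into limb i of the low half, carrying upward. */
        t = (unsigned __int128)c[4 + i] * R + c[i] + carry;
        r[i] = (uint64_t)t;
        carry = (uint64_t)(t >> 64);
    }
    r[4] = carry; /* small top limb (below 2^34): a magnitude-1, not yet normalized result */
}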
secp256k1_fe_verify(b); + VERIFY_CHECK(r != b); + VERIFY_CHECK(a != b); +#endif + +#if defined(USE_EXTERNAL_ASM) + secp256k1_fe_mul_55to5(r->n, a->n, b->n); +#else + mul2(c0,c1,a4,0x1000003D1ULL); + a4 = 0; + add2(c0,c1,a0); + a0 = c0; + add4(a1,a2,a3,a4,c1); + VERIFY_CHECK(a4 == 0 || a4 == 1); + c0 = (-(a4)) & 0x1000003D1; + add4(a0,a1,a2,a3,c0); + a4 = 0; + + /* Bring b to [0,2**256). */ + c0 = 0; + c1 = 0; + mul2(c0,c1,b4,0x1000003D1ULL); + b4 = 0; + add2(c0,c1,b0); + b0 = c0; + add4(b1,b2,b3,b4,c1); + VERIFY_CHECK(b4 == 0 || b4 == 1); + c0 = (-(b4)) & 0x1000003D1; + add4(b0,b1,b2,b3,c0); + b4 = 0; + + /* Compute 512-bit product. */ + c0 = 0; + c1 = 0; + mul2(c0,c1,a0,b0); + muladd3(c1,c2,c3,a0,b1); + muladd3(c1,c2,c3,a1,b0); + muladd3(c2,c3,c4,a0,b2); + muladd3(c2,c3,c4,a1,b1); + muladd3(c2,c3,c4,a2,b0); + muladd3(c3,c4,c5,a0,b3); + muladd3(c3,c4,c5,a1,b2); + muladd3(c3,c4,c5,a2,b1); + muladd3(c3,c4,c5,a3,b0); + muladd3(c4,c5,c6,a1,b3); + muladd3(c4,c5,c6,a2,b2); + muladd3(c4,c5,c6,a3,b1); + muladd3(c5,c6,c7,a2,b3); + muladd3(c5,c6,c7,a3,b2); + muladd2(c6,c7,a3,b3); + + /* Reduce */ + mul2(d0,d1,c4,0x1000003D1); + add2(d0,d1,c0); + r->n[0] = d0; + muladd2(d1,d2,c5,0x1000003D1); + add3(d1,d2,d3,c1); + r->n[1] = d1; + muladd3(d2,d3,d4,c6,0x1000003D1); + add3(d2,d3,d4,c2); + r->n[2] = d2; + muladd2(d3,d4,c7,0x1000003D1); + add2(d3,d4,c3); + r->n[3] = d3; + r->n[4] = d4; +#endif + +#ifdef VERIFY + r->magnitude = 1; + r->precomputed = 0; + r->normalized = 0; + secp256k1_fe_verify(r); +#endif +} + +static void secp256k1_fe_mul_prec(secp256k1_fe *r, const secp256k1_fe *a, const secp256k1_fe * SECP256K1_RESTRICT b_prec) { +#ifndef USE_EXTERNAL_ASM + uint64_t a0 = a->n[0], a1 = a->n[1], a2 = a->n[2], a3 = a->n[3], a4 = a->n[4]; + uint64_t b0 = b_prec->n[0], b1 = b_prec->n[1], b2 = b_prec->n[2], b3 = b_prec->n[3]; + uint64_t c0 = 0, c1 = 0, c2 = 0, c3 = 0, c4 = 0, c5 = 0, c6 = 0, c7 = 0; + uint64_t d0 = 0, d1 = 0, d2 = 0, d3 = 0, d4 = 0; +#endif + +#ifdef VERIFY + VERIFY_CHECK(a->magnitude <= 2048); + VERIFY_CHECK(b_prec->precomputed); + VERIFY_CHECK(b_prec->n[4] == 0); + secp256k1_fe_verify(a); + secp256k1_fe_verify(b_prec); + VERIFY_CHECK(r != b_prec); + VERIFY_CHECK(a != b_prec); +#endif + +#if defined(USE_EXTERNAL_ASM) + secp256k1_fe_mul_45to5(r->n, b_prec->n, a->n); +#else + mul2(c0,c1,a4,0x1000003D1ULL); + a4 = 0; + add2(c0,c1,a0); + a0 = c0; + add4(a1,a2,a3,a4,c1); + VERIFY_CHECK(a4 == 0 || a4 == 1); + c0 = (-(a4)) & 0x1000003D1; + add4(a0,a1,a2,a3,c0); + a4 = 0; + + /* Compute 512-bit product. 
*/ + c0 = 0; + c1 = 0; + mul2(c0,c1,a0,b0); + muladd3(c1,c2,c3,a0,b1); + muladd3(c1,c2,c3,a1,b0); + muladd3(c2,c3,c4,a0,b2); + muladd3(c2,c3,c4,a1,b1); + muladd3(c2,c3,c4,a2,b0); + muladd3(c3,c4,c5,a0,b3); + muladd3(c3,c4,c5,a1,b2); + muladd3(c3,c4,c5,a2,b1); + muladd3(c3,c4,c5,a3,b0); + muladd3(c4,c5,c6,a1,b3); + muladd3(c4,c5,c6,a2,b2); + muladd3(c4,c5,c6,a3,b1); + muladd3(c5,c6,c7,a2,b3); + muladd3(c5,c6,c7,a3,b2); + muladd2(c6,c7,a3,b3); + + /* Reduce */ + mul2(d0,d1,c4,0x1000003D1); + add2(d0,d1,c0); + r->n[0] = d0; + muladd2(d1,d2,c5,0x1000003D1); + add3(d1,d2,d3,c1); + r->n[1] = d1; + muladd3(d2,d3,d4,c6,0x1000003D1); + add3(d2,d3,d4,c2); + r->n[2] = d2; + muladd2(d3,d4,c7,0x1000003D1); + add2(d3,d4,c3); + r->n[3] = d3; + r->n[4] = d4; +#endif + +#ifdef VERIFY + r->magnitude = 1; + r->precomputed = 0; + r->normalized = 0; + secp256k1_fe_verify(r); +#endif +} + +static void secp256k1_fe_mul_2prec(secp256k1_fe *r, const secp256k1_fe *a_prec, const secp256k1_fe * SECP256K1_RESTRICT b_prec) { +#ifndef USE_EXTERNAL_ASM + uint64_t a0 = a_prec->n[0], a1 = a_prec->n[1], a2 = a_prec->n[2], a3 = a_prec->n[3]; + uint64_t b0 = b_prec->n[0], b1 = b_prec->n[1], b2 = b_prec->n[2], b3 = b_prec->n[3]; + uint64_t c0 = 0, c1 = 0, c2 = 0, c3 = 0, c4 = 0, c5 = 0, c6 = 0, c7 = 0; + uint64_t d0 = 0, d1 = 0, d2 = 0, d3 = 0, d4 = 0; +#endif + +#ifdef VERIFY + VERIFY_CHECK(a_prec->precomputed); + VERIFY_CHECK(a_prec->n[4] == 0); + VERIFY_CHECK(b_prec->precomputed); + VERIFY_CHECK(b_prec->n[4] == 0); + secp256k1_fe_verify(a_prec); + secp256k1_fe_verify(b_prec); + VERIFY_CHECK(r != b_prec); + VERIFY_CHECK(a_prec != b_prec); +#endif + +#if defined(USE_EXTERNAL_ASM) + secp256k1_fe_mul_44to5(r->n, b_prec->n, a_prec->n); +#else + /* Compute 512-bit product. */ + c0 = 0; + c1 = 0; + mul2(c0,c1,a0,b0); + muladd3(c1,c2,c3,a0,b1); + muladd3(c1,c2,c3,a1,b0); + muladd3(c2,c3,c4,a0,b2); + muladd3(c2,c3,c4,a1,b1); + muladd3(c2,c3,c4,a2,b0); + muladd3(c3,c4,c5,a0,b3); + muladd3(c3,c4,c5,a1,b2); + muladd3(c3,c4,c5,a2,b1); + muladd3(c3,c4,c5,a3,b0); + muladd3(c4,c5,c6,a1,b3); + muladd3(c4,c5,c6,a2,b2); + muladd3(c4,c5,c6,a3,b1); + muladd3(c5,c6,c7,a2,b3); + muladd3(c5,c6,c7,a3,b2); + muladd2(c6,c7,a3,b3); + + /* Reduce */ + mul2(d0,d1,c4,0x1000003D1); + add2(d0,d1,c0); + r->n[0] = d0; + muladd2(d1,d2,c5,0x1000003D1); + add3(d1,d2,d3,c1); + r->n[1] = d1; + muladd3(d2,d3,d4,c6,0x1000003D1); + add3(d2,d3,d4,c2); + r->n[2] = d2; + muladd2(d3,d4,c7,0x1000003D1); + add2(d3,d4,c3); + r->n[3] = d3; + r->n[4] = d4; +#endif + +#ifdef VERIFY + r->magnitude = 1; + r->precomputed = 0; + r->normalized = 0; + secp256k1_fe_verify(r); +#endif +} + +static void secp256k1_fe_sqr(secp256k1_fe *r, const secp256k1_fe *a) { +#ifndef USE_EXTERNAL_ASM + uint64_t a0 = a->n[0], a1 = a->n[1], a2 = a->n[2], a3 = a->n[3], a4 = a->n[4]; + uint64_t c0 = 0, c1 = 0, c2 = 0, c3 = 0, c4 = 0, c5 = 0, c6 = 0, c7 = 0; + uint64_t d0 = 0, d1 = 0, d2 = 0, d3 = 0, d4 = 0; +#endif + +#ifdef VERIFY + VERIFY_CHECK(a->magnitude <= 2048); + secp256k1_fe_verify(a); +#endif + +#if defined(USE_EXTERNAL_ASM) + secp256k1_fe_sqr_5to5(r->n, a->n); +#else + /* Bring a to [0,2**256). */ + mul2(c0,c1,a4,0x1000003D1ULL); + a4 = 0; + add2(c0,c1,a0); + a0 = c0; + add4(a1,a2,a3,a4,c1); + VERIFY_CHECK(a4 == 0 || a4 == 1); + c0 = (-(a4)) & 0x1000003D1; + add4(a0,a1,a2,a3,c0); + + /* Compute 512-bit product. 
*/ + c0 = 0; + c1 = 0; + sqr2(c0,c1,a0); + mul2add3(c1,c2,c3,a0,a1); + mul2add3(c2,c3,c4,a0,a2); + sqradd3(c2,c3,c4,a1); + mul2add3(c3,c4,c5,a0,a3); + mul2add3(c3,c4,c5,a1,a2); + mul2add3(c4,c5,c6,a1,a3); + sqradd3(c4,c5,c6,a2); + mul2add3(c5,c6,c7,a2,a3); + sqradd2(c6,c7,a3); + + /* Reduce */ + mul2(d0,d1,c4,0x1000003D1); + add2(d0,d1,c0); + r->n[0] = d0; + muladd2(d1,d2,c5,0x1000003D1); + add3(d1,d2,d3,c1); + r->n[1] = d1; + muladd3(d2,d3,d4,c6,0x1000003D1); + add3(d2,d3,d4,c2); + r->n[2] = d2; + muladd2(d3,d4,c7,0x1000003D1); + add2(d3,d4,c3); + r->n[3] = d3; + r->n[4] = d4; +#endif + +#ifdef VERIFY + r->magnitude = 1; + r->precomputed = 0; + r->normalized = 0; + secp256k1_fe_verify(r); +#endif +} + +static void secp256k1_fe_sqr_prec(secp256k1_fe *r, const secp256k1_fe *a_prec) { +#ifndef USE_EXTERNAL_ASM + uint64_t a0 = a_prec->n[0], a1 = a_prec->n[1], a2 = a_prec->n[2], a3 = a_prec->n[3]; + uint64_t c0 = 0, c1 = 0, c2 = 0, c3 = 0, c4 = 0, c5 = 0, c6 = 0, c7 = 0; + uint64_t d0 = 0, d1 = 0, d2 = 0, d3 = 0, d4 = 0; +#endif + +#ifdef VERIFY + VERIFY_CHECK(a_prec->precomputed); + VERIFY_CHECK(a_prec->n[4] == 0); + secp256k1_fe_verify(a_prec); +#endif + +#if defined(USE_EXTERNAL_ASM) + secp256k1_fe_sqr_4to5(r->n, a_prec->n); +#else + /* Compute 512-bit product. */ + c0 = 0; + c1 = 0; + sqr2(c0,c1,a0); + mul2add3(c1,c2,c3,a0,a1); + mul2add3(c2,c3,c4,a0,a2); + sqradd3(c2,c3,c4,a1); + mul2add3(c3,c4,c5,a0,a3); + mul2add3(c3,c4,c5,a1,a2); + mul2add3(c4,c5,c6,a1,a3); + sqradd3(c4,c5,c6,a2); + mul2add3(c5,c6,c7,a2,a3); + sqradd2(c6,c7,a3); + + /* Reduce */ + mul2(d0,d1,c4,0x1000003D1); + add2(d0,d1,c0); + r->n[0] = d0; + muladd2(d1,d2,c5,0x1000003D1); + add3(d1,d2,d3,c1); + r->n[1] = d1; + muladd3(d2,d3,d4,c6,0x1000003D1); + add3(d2,d3,d4,c2); + r->n[2] = d2; + muladd2(d3,d4,c7,0x1000003D1); + add2(d3,d4,c3); + r->n[3] = d3; + r->n[4] = d4; +#endif + +#ifdef VERIFY + r->magnitude = 1; + r->precomputed = 0; + r->normalized = 0; + secp256k1_fe_verify(r); +#endif +} + +static void secp256k1_fe_sqr_prec_oprec(secp256k1_fe *r_prec, const secp256k1_fe *a_prec) { +#ifndef USE_EXTERNAL_ASM + uint64_t a0 = a_prec->n[0], a1 = a_prec->n[1], a2 = a_prec->n[2], a3 = a_prec->n[3]; + uint64_t c0 = 0, c1 = 0, c2 = 0, c3 = 0, c4 = 0, c5 = 0, c6 = 0, c7 = 0; + uint64_t d0 = 0, d1 = 0, d2 = 0, d3 = 0, d4 = 0; +#endif + +#ifdef VERIFY + VERIFY_CHECK(a_prec->precomputed); + VERIFY_CHECK(a_prec->n[4] == 0); + secp256k1_fe_verify(a_prec); +#endif + +#if defined(USE_EXTERNAL_ASM) + secp256k1_fe_sqr_4to4(r_prec->n, a_prec->n); +#else + /* Compute 512-bit product. */ + c0 = 0; + c1 = 0; + sqr2(c0,c1,a0); + mul2add3(c1,c2,c3,a0,a1); + mul2add3(c2,c3,c4,a0,a2); + sqradd3(c2,c3,c4,a1); + mul2add3(c3,c4,c5,a0,a3); + mul2add3(c3,c4,c5,a1,a2); + mul2add3(c4,c5,c6,a1,a3); + sqradd3(c4,c5,c6,a2); + mul2add3(c5,c6,c7,a2,a3); + sqradd2(c6,c7,a3); + + /* Reduce */ + mul2(d0,d1,c4,0x1000003D1); + add2(d0,d1,c0); + muladd2(d1,d2,c5,0x1000003D1); + add3(d1,d2,d3,c1); + muladd3(d2,d3,d4,c6,0x1000003D1); + add3(d2,d3,d4,c2); + muladd2(d3,d4,c7,0x1000003D1); + add2(d3,d4,c3); + + /* Bring r to [0,2**256). 
*/ + c0 = 0; + c1 = 0; + mul2(c0,c1,d4,0x1000003D1ULL); + d4 = 0; + add2(c0,c1,d0); + d0 = c0; + add4(d1,d2,d3,d4,c1); + VERIFY_CHECK(d4 == 0 || d4 == 1); + c0 = (-(d4)) & 0x1000003D1; + add4(d0,d1,d2,d3,c0); + r_prec->n[0] = d0; + r_prec->n[1] = d1; + r_prec->n[2] = d2; + r_prec->n[3] = d3; + r_prec->n[4] = 0; +#endif + +#ifdef VERIFY + r_prec->magnitude = 1; + r_prec->precomputed = 1; + r_prec->normalized = 0; + secp256k1_fe_verify(r_prec); +#endif +} + +static SECP256K1_INLINE void secp256k1_fe_cmov(secp256k1_fe *r, const secp256k1_fe *a, int flag) { + uint64_t mask0, mask1; + VG_CHECK_VERIFY(r->n, sizeof(r->n)); + mask0 = flag + ~((uint64_t)0); + mask1 = ~mask0; + r->n[0] = (r->n[0] & mask0) | (a->n[0] & mask1); + r->n[1] = (r->n[1] & mask0) | (a->n[1] & mask1); + r->n[2] = (r->n[2] & mask0) | (a->n[2] & mask1); + r->n[3] = (r->n[3] & mask0) | (a->n[3] & mask1); + r->n[4] = (r->n[4] & mask0) | (a->n[4] & mask1); +#ifdef VERIFY + if (flag) { + r->magnitude = a->magnitude; + r->precomputed = a->precomputed; + r->normalized = a->normalized; + } +#endif +} + +static SECP256K1_INLINE void secp256k1_fe_storage_cmov(secp256k1_fe_storage *r, const secp256k1_fe_storage *a, int flag) { + uint64_t mask0, mask1; + VG_CHECK_VERIFY(r->n, sizeof(r->n)); + mask0 = flag + ~((uint64_t)0); + mask1 = ~mask0; + r->n[0] = (r->n[0] & mask0) | (a->n[0] & mask1); + r->n[1] = (r->n[1] & mask0) | (a->n[1] & mask1); + r->n[2] = (r->n[2] & mask0) | (a->n[2] & mask1); + r->n[3] = (r->n[3] & mask0) | (a->n[3] & mask1); +} + +static void secp256k1_fe_to_storage(secp256k1_fe_storage *r, const secp256k1_fe *a) { +#ifdef VERIFY + VERIFY_CHECK(a->normalized); +#endif + r->n[0] = a->n[0]; + r->n[1] = a->n[1]; + r->n[2] = a->n[2]; + r->n[3] = a->n[3]; +} + +static SECP256K1_INLINE void secp256k1_fe_from_storage(secp256k1_fe *r, const secp256k1_fe_storage *a) { + r->n[0] = a->n[0]; + r->n[1] = a->n[1]; + r->n[2] = a->n[2]; + r->n[3] = a->n[3]; + r->n[4] = 0; +#ifdef VERIFY + r->magnitude = 1; + r->precomputed = 1; + r->normalized = 1; +#endif +} + +static void secp256k1_fe_from_signed62(secp256k1_fe *r, const secp256k1_modinv64_signed62 *a) { + const uint64_t a0 = a->v[0], a1 = a->v[1], a2 = a->v[2], a3 = a->v[3], a4 = a->v[4]; + + /* The output from secp256k1_modinv64{_var} should be normalized to range [0,modulus), and + * have limbs in [0,2^62). The modulus is < 2^256, so the top limb must be below 2^(256-62*4). 
+ */ + VERIFY_CHECK(a0 >> 62 == 0); + VERIFY_CHECK(a1 >> 62 == 0); + VERIFY_CHECK(a2 >> 62 == 0); + VERIFY_CHECK(a3 >> 62 == 0); + VERIFY_CHECK(a4 >> 8 == 0); + + r->n[0] = (a0) | (a1 << 62); + r->n[1] = (a1 >> 2) | (a2 << 60); + r->n[2] = (a2 >> 4) | (a3 << 58); + r->n[3] = (a3 >> 6) | (a4 << 56); + r->n[4] = 0; + +#ifdef VERIFY + r->magnitude = 1; + r->precomputed = 1; + r->normalized = 1; + secp256k1_fe_verify(r); +#endif +} + +static void secp256k1_fe_to_signed62(secp256k1_modinv64_signed62 *r, const secp256k1_fe *a) { + const uint64_t M62 = UINT64_MAX >> 2; + const uint64_t a0 = a->n[0], a1 = a->n[1], a2 = a->n[2], a3 = a->n[3]; + +#ifdef VERIFY + VERIFY_CHECK(a->normalized); +#endif + + r->v[0] = a0 & M62; + r->v[1] = (a0 >> 62 | a1 << 2) & M62; + r->v[2] = (a1 >> 60 | a2 << 4) & M62; + r->v[3] = (a2 >> 58 | a3 << 6) & M62; + r->v[4] = (a3 >> 56) & M62; +} + +static const secp256k1_modinv64_modinfo secp256k1_const_modinfo_fe = { + {{-0x1000003D1LL, 0, 0, 0, 256}}, + 0x27C7F6E22DDACACFLL +}; + +static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *x) { + secp256k1_fe tmp; + secp256k1_modinv64_signed62 s; + + tmp = *x; + secp256k1_fe_normalize(&tmp); + secp256k1_fe_to_signed62(&s, &tmp); + secp256k1_modinv64(&s, &secp256k1_const_modinfo_fe); + secp256k1_fe_from_signed62(r, &s); + +#ifdef VERIFY + VERIFY_CHECK(secp256k1_fe_normalizes_to_zero(r) == secp256k1_fe_normalizes_to_zero(&tmp)); +#endif +} + +static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *x) { + secp256k1_fe tmp; + secp256k1_modinv64_signed62 s; + + tmp = *x; + secp256k1_fe_normalize_var(&tmp); + secp256k1_fe_to_signed62(&s, &tmp); + secp256k1_modinv64_var(&s, &secp256k1_const_modinfo_fe); + secp256k1_fe_from_signed62(r, &s); + +#ifdef VERIFY + VERIFY_CHECK(secp256k1_fe_normalizes_to_zero(r) == secp256k1_fe_normalizes_to_zero(&tmp)); +#endif +} + +#endif /* SECP256K1_FIELD_REPR_IMPL_H */ diff --git a/src/field_impl.h b/src/field_impl.h index 374284a1f4..30bf48bc57 100644 --- a/src/field_impl.h +++ b/src/field_impl.h @@ -14,7 +14,7 @@ #include "util.h" #if defined(SECP256K1_WIDEMUL_INT128) -#include "field_5x52_impl.h" +#include "field_5x64_impl.h" #elif defined(SECP256K1_WIDEMUL_INT64) #include "field_10x26_impl.h" #else diff --git a/src/group.h b/src/group.h index b9cd334dae..4c31b40dd0 100644 --- a/src/group.h +++ b/src/group.h @@ -44,13 +44,13 @@ static void secp256k1_ge_set_xy(secp256k1_ge *r, const secp256k1_fe *x, const se /** Set a group element (affine) equal to the point with the given X coordinate, and given oddness * for Y. Return value indicates whether the result is valid. */ -static int secp256k1_ge_set_xo_var(secp256k1_ge *r, const secp256k1_fe *x, int odd); +static int secp256k1_ge_set_xo_var(secp256k1_ge *r, secp256k1_fe *x, int odd); /** Check whether a group element is the point at infinity. */ static int secp256k1_ge_is_infinity(const secp256k1_ge *a); /** Check whether a group element is valid (i.e., on the curve). 
*/ -static int secp256k1_ge_is_valid_var(const secp256k1_ge *a); +static int secp256k1_ge_is_valid_var(secp256k1_ge *a); /** Set r equal to the inverse of a (i.e., mirrored around the X axis) */ static void secp256k1_ge_neg(secp256k1_ge *r, const secp256k1_ge *a); diff --git a/src/group_impl.h b/src/group_impl.h index 47aea32be1..b741671490 100644 --- a/src/group_impl.h +++ b/src/group_impl.h @@ -59,12 +59,13 @@ static const secp256k1_ge secp256k1_ge_const_g = SECP256K1_GE_CONST( static const secp256k1_fe secp256k1_fe_const_b = SECP256K1_FE_CONST(0, 0, 0, 0, 0, 0, 0, 7); #endif -static void secp256k1_ge_set_gej_zinv(secp256k1_ge *r, const secp256k1_gej *a, const secp256k1_fe *zi) { +static void secp256k1_ge_set_gej_zinv(secp256k1_ge *r, const secp256k1_gej *a, secp256k1_fe *zi) { secp256k1_fe zi2; secp256k1_fe zi3; - secp256k1_fe_sqr(&zi2, zi); - secp256k1_fe_mul(&zi3, &zi2, zi); - secp256k1_fe_mul(&r->x, &a->x, &zi2); + secp256k1_fe_normalize_prec(zi); + secp256k1_fe_sqr_prec_oprec(&zi2, zi); + secp256k1_fe_mul_2prec(&zi3, &zi2, zi); + secp256k1_fe_mul_prec(&r->x, &a->x, &zi2); secp256k1_fe_mul(&r->y, &a->y, &zi3); r->infinity = a->infinity; } @@ -89,9 +90,9 @@ static void secp256k1_ge_set_gej(secp256k1_ge *r, secp256k1_gej *a) { secp256k1_fe z2, z3; r->infinity = a->infinity; secp256k1_fe_inv(&a->z, &a->z); - secp256k1_fe_sqr(&z2, &a->z); - secp256k1_fe_mul(&z3, &a->z, &z2); - secp256k1_fe_mul(&a->x, &a->x, &z2); + secp256k1_fe_sqr_prec_oprec(&z2, &a->z); + secp256k1_fe_mul_prec(&z3, &a->z, &z2); + secp256k1_fe_mul_prec(&a->x, &a->x, &z2); secp256k1_fe_mul(&a->y, &a->y, &z3); secp256k1_fe_set_int(&a->z, 1); r->x = a->x; @@ -105,9 +106,9 @@ static void secp256k1_ge_set_gej_var(secp256k1_ge *r, secp256k1_gej *a) { return; } secp256k1_fe_inv_var(&a->z, &a->z); - secp256k1_fe_sqr(&z2, &a->z); - secp256k1_fe_mul(&z3, &a->z, &z2); - secp256k1_fe_mul(&a->x, &a->x, &z2); + secp256k1_fe_sqr_prec_oprec(&z2, &a->z); + secp256k1_fe_mul_prec(&z3, &a->z, &z2); + secp256k1_fe_mul_prec(&a->x, &a->x, &z2); secp256k1_fe_mul(&a->y, &a->y, &z3); secp256k1_fe_set_int(&a->z, 1); secp256k1_ge_set_xy(r, &a->x, &a->y); @@ -206,11 +207,12 @@ static void secp256k1_ge_clear(secp256k1_ge *r) { secp256k1_fe_clear(&r->y); } -static int secp256k1_ge_set_xo_var(secp256k1_ge *r, const secp256k1_fe *x, int odd) { +static int secp256k1_ge_set_xo_var(secp256k1_ge *r, secp256k1_fe *x, int odd) { secp256k1_fe x2, x3; r->x = *x; - secp256k1_fe_sqr(&x2, x); - secp256k1_fe_mul(&x3, x, &x2); + secp256k1_fe_normalize_prec(x); + secp256k1_fe_sqr_prec(&x2, x); + secp256k1_fe_mul_prec(&x3, &x2, x); r->infinity = 0; secp256k1_fe_add(&x3, &secp256k1_fe_const_b); if (!secp256k1_fe_sqrt(&r->y, &x3)) { @@ -221,7 +223,6 @@ static int secp256k1_ge_set_xo_var(secp256k1_ge *r, const secp256k1_fe *x, int o secp256k1_fe_negate(&r->y, &r->y, 1); } return 1; - } static void secp256k1_gej_set_ge(secp256k1_gej *r, const secp256k1_ge *a) { @@ -252,13 +253,14 @@ static int secp256k1_gej_is_infinity(const secp256k1_gej *a) { return a->infinity; } -static int secp256k1_ge_is_valid_var(const secp256k1_ge *a) { +static int secp256k1_ge_is_valid_var(secp256k1_ge *a) { secp256k1_fe y2, x3; if (a->infinity) { return 0; } /* y^2 = x^3 + 7 */ secp256k1_fe_sqr(&y2, &a->y); + secp256k1_fe_normalize_prec(&a->x); secp256k1_fe_sqr(&x3, &a->x); secp256k1_fe_mul(&x3, &x3, &a->x); secp256k1_fe_add(&x3, &secp256k1_fe_const_b); secp256k1_fe_normalize_weak(&x3); @@ -273,20 +275,25 @@ static SECP256K1_INLINE void secp256k1_gej_double(secp256k1_gej *r, const secp25 * 
which trades a multiply for a square, but in practice this is actually slower, * mainly because it requires more normalizations. */ - secp256k1_fe t1,t2,t3,t4; + secp256k1_fe t1,t2,t3,t4,ax=a->x,ay=a->y; + + secp256k1_fe_normalize_prec(&ax); + secp256k1_fe_normalize_prec(&ay); r->infinity = a->infinity; - secp256k1_fe_mul(&r->z, &a->z, &a->y); + secp256k1_fe_mul_prec(&r->z, &a->z, &ay); secp256k1_fe_mul_int(&r->z, 2); /* Z' = 2*Y*Z (2) */ - secp256k1_fe_sqr(&t1, &a->x); + secp256k1_fe_sqr_prec(&t1, &ax); secp256k1_fe_mul_int(&t1, 3); /* T1 = 3*X^2 (3) */ - secp256k1_fe_sqr(&t2, &t1); /* T2 = 9*X^4 (1) */ - secp256k1_fe_sqr(&t3, &a->y); + secp256k1_fe_normalize_prec(&t1); + secp256k1_fe_sqr_prec(&t2, &t1); /* T2 = 9*X^4 (1) */ + secp256k1_fe_sqr_prec(&t3, &ay); secp256k1_fe_mul_int(&t3, 2); /* T3 = 2*Y^2 (2) */ - secp256k1_fe_sqr(&t4, &t3); + secp256k1_fe_normalize_prec(&t3); + secp256k1_fe_sqr_prec(&t4, &t3); secp256k1_fe_mul_int(&t4, 2); /* T4 = 8*Y^4 (2) */ - secp256k1_fe_mul(&t3, &t3, &a->x); /* T3 = 2*X*Y^2 (1) */ + secp256k1_fe_mul_2prec(&t3, &t3, &ax);/* T3 = 2*X*Y^2 (1) */ r->x = t3; secp256k1_fe_mul_int(&r->x, 4); /* X' = 8*X*Y^2 (4) */ secp256k1_fe_negate(&r->x, &r->x, 4); /* X' = -8*X*Y^2 (5) */ @@ -294,7 +301,7 @@ static SECP256K1_INLINE void secp256k1_gej_double(secp256k1_gej *r, const secp25 secp256k1_fe_negate(&t2, &t2, 1); /* T2 = -9*X^4 (2) */ secp256k1_fe_mul_int(&t3, 6); /* T3 = 12*X*Y^2 (6) */ secp256k1_fe_add(&t3, &t2); /* T3 = 12*X*Y^2 - 9*X^4 (8) */ - secp256k1_fe_mul(&r->y, &t1, &t3); /* Y' = 36*X^3*Y^2 - 27*X^6 (1) */ + secp256k1_fe_mul_prec(&r->y, &t3, &t1);/*Y' = 36*X^3*Y^2 - 27*X^6 (1) */ secp256k1_fe_negate(&t2, &t4, 2); /* T2 = -8*Y^4 (3) */ secp256k1_fe_add(&r->y, &t2); /* Y' = 36*X^3*Y^2 - 27*X^6 - 8*Y^4 (4) */ } @@ -329,7 +336,7 @@ static void secp256k1_gej_double_var(secp256k1_gej *r, const secp256k1_gej *a, s static void secp256k1_gej_add_var(secp256k1_gej *r, const secp256k1_gej *a, const secp256k1_gej *b, secp256k1_fe *rzr) { /* Operations: 12 mul, 4 sqr, 2 normalize, 12 mul_int/add/negate */ - secp256k1_fe z22, z12, u1, u2, s1, s2, h, i, i2, h2, h3, t; + secp256k1_fe z22, z12, u1, u2, s1, s2, h, i, i2, h2, h3, t, az, bz; if (a->infinity) { VERIFY_CHECK(rzr == NULL); @@ -346,15 +353,18 @@ static void secp256k1_gej_add_var(secp256k1_gej *r, const secp256k1_gej *a, cons } r->infinity = 0; - secp256k1_fe_sqr(&z22, &b->z); - secp256k1_fe_sqr(&z12, &a->z); - secp256k1_fe_mul(&u1, &a->x, &z22); - secp256k1_fe_mul(&u2, &b->x, &z12); - secp256k1_fe_mul(&s1, &a->y, &z22); secp256k1_fe_mul(&s1, &s1, &b->z); - secp256k1_fe_mul(&s2, &b->y, &z12); secp256k1_fe_mul(&s2, &s2, &a->z); + az = a->z; secp256k1_fe_normalize_prec(&az); + bz = b->z; secp256k1_fe_normalize_prec(&bz); + secp256k1_fe_sqr_prec_oprec(&z22, &bz); + secp256k1_fe_sqr_prec_oprec(&z12, &az); + secp256k1_fe_mul_prec(&u1, &a->x, &z22); + secp256k1_fe_mul_prec(&u2, &b->x, &z12); + secp256k1_fe_mul_prec(&s1, &a->y, &z22); secp256k1_fe_mul_prec(&s1, &s1, &bz); + secp256k1_fe_mul_prec(&s2, &b->y, &z12); secp256k1_fe_mul_prec(&s2, &s2, &az); secp256k1_fe_negate(&h, &u1, 1); secp256k1_fe_add(&h, &u2); secp256k1_fe_negate(&i, &s1, 1); secp256k1_fe_add(&i, &s2); - if (secp256k1_fe_normalizes_to_zero_var(&h)) { + secp256k1_fe_normalize_var(&h); + if (secp256k1_fe_is_zero(&h)) { if (secp256k1_fe_normalizes_to_zero_var(&i)) { secp256k1_gej_double_var(r, a, rzr); } else { @@ -365,24 +375,25 @@ static void secp256k1_gej_add_var(secp256k1_gej *r, const secp256k1_gej *a, cons } return; } - secp256k1_fe_sqr(&i2, &i); - 
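/* Illustrative sketch, not code from this patch: throughout this file the changed hunks first
 * call secp256k1_fe_normalize_prec on an operand (zeroing its top limb) so that the narrower
 * *_prec / *_2prec kernels, i.e. the 5x4- and 4x4-limb multiplies and squarings, can replace
 * the general 5x5 ones. A hypothetical helper restating the pattern already used by
 * secp256k1_ge_set_gej_zinv above: */
static void gej_zinv_scale_sketch(secp256k1_fe *rx, secp256k1_fe *ry,
                                  const secp256k1_fe *x, const secp256k1_fe *y,
                                  secp256k1_fe *zi) {
    secp256k1_fe zi2, zi3;
    secp256k1_fe_normalize_prec(zi);        /* zi now has a zero top limb ("prec" operand) */
    secp256k1_fe_sqr_prec_oprec(&zi2, zi);  /* 4x4 square; the output is also prec */
    secp256k1_fe_mul_2prec(&zi3, &zi2, zi); /* both inputs prec: 4x4 multiply */
    secp256k1_fe_mul_prec(rx, x, &zi2);     /* one prec input: 5x4 multiply */
    secp256k1_fe_mul(ry, y, &zi3);          /* zi3 is not prec, so the 5x5 multiply remains */
}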
secp256k1_fe_sqr(&h2, &h); - secp256k1_fe_mul(&h3, &h, &h2); - secp256k1_fe_mul(&h, &h, &b->z); + secp256k1_fe_normalize_prec(&i); + secp256k1_fe_sqr_prec(&i2, &i); + secp256k1_fe_sqr_prec_oprec(&h2, &h); + secp256k1_fe_mul_2prec(&h3, &h2, &h); + secp256k1_fe_mul_2prec(&h, &h, &bz); if (rzr != NULL) { *rzr = h; } - secp256k1_fe_mul(&r->z, &a->z, &h); - secp256k1_fe_mul(&t, &u1, &h2); + secp256k1_fe_mul_prec(&r->z, &h, &az); + secp256k1_fe_mul_prec(&t, &u1, &h2); r->x = t; secp256k1_fe_mul_int(&r->x, 2); secp256k1_fe_add(&r->x, &h3); secp256k1_fe_negate(&r->x, &r->x, 3); secp256k1_fe_add(&r->x, &i2); - secp256k1_fe_negate(&r->y, &r->x, 5); secp256k1_fe_add(&r->y, &t); secp256k1_fe_mul(&r->y, &r->y, &i); + secp256k1_fe_negate(&r->y, &r->x, 5); secp256k1_fe_add(&r->y, &t); secp256k1_fe_mul_prec(&r->y, &r->y, &i); secp256k1_fe_mul(&h3, &h3, &s1); secp256k1_fe_negate(&h3, &h3, 1); secp256k1_fe_add(&r->y, &h3); } static void secp256k1_gej_add_ge_var(secp256k1_gej *r, const secp256k1_gej *a, const secp256k1_ge *b, secp256k1_fe *rzr) { /* 8 mul, 3 sqr, 4 normalize, 12 mul_int/add/negate */ - secp256k1_fe z12, u1, u2, s1, s2, h, i, i2, h2, h3, t; + secp256k1_fe z12, u1, u2, s1, s2, h, i, i2, h2, h3, t, az; if (a->infinity) { VERIFY_CHECK(rzr == NULL); secp256k1_gej_set_ge(r, b); @@ -397,14 +408,16 @@ static void secp256k1_gej_add_ge_var(secp256k1_gej *r, const secp256k1_gej *a, c } r->infinity = 0; - secp256k1_fe_sqr(&z12, &a->z); + az = a->z; secp256k1_fe_normalize_prec(&az); + secp256k1_fe_sqr_prec_oprec(&z12, &az); u1 = a->x; secp256k1_fe_normalize_weak(&u1); - secp256k1_fe_mul(&u2, &b->x, &z12); - s1 = a->y; secp256k1_fe_normalize_weak(&s1); - secp256k1_fe_mul(&s2, &b->y, &z12); secp256k1_fe_mul(&s2, &s2, &a->z); + secp256k1_fe_mul_prec(&u2, &b->x, &z12); + s1 = a->y; secp256k1_fe_normalize_weak_prec(&s1); + secp256k1_fe_mul_prec(&s2, &b->y, &z12); secp256k1_fe_mul_prec(&s2, &s2, &az); secp256k1_fe_negate(&h, &u1, 1); secp256k1_fe_add(&h, &u2); secp256k1_fe_negate(&i, &s1, 1); secp256k1_fe_add(&i, &s2); - if (secp256k1_fe_normalizes_to_zero_var(&h)) { + secp256k1_fe_normalize_var(&h); + if (secp256k1_fe_is_zero(&h)) { if (secp256k1_fe_normalizes_to_zero_var(&i)) { secp256k1_gej_double_var(r, a, rzr); } else { @@ -415,17 +428,18 @@ static void secp256k1_gej_add_ge_var(secp256k1_gej *r, const secp256k1_gej *a, c } return; } - secp256k1_fe_sqr(&i2, &i); - secp256k1_fe_sqr(&h2, &h); - secp256k1_fe_mul(&h3, &h, &h2); + secp256k1_fe_normalize_prec(&i); + secp256k1_fe_sqr_prec(&i2, &i); + secp256k1_fe_sqr_prec_oprec(&h2, &h); + secp256k1_fe_mul_2prec(&h3, &h2, &h); if (rzr != NULL) { *rzr = h; } - secp256k1_fe_mul(&r->z, &a->z, &h); - secp256k1_fe_mul(&t, &u1, &h2); + secp256k1_fe_mul_2prec(&r->z, &h, &az); + secp256k1_fe_mul_prec(&t, &u1, &h2); r->x = t; secp256k1_fe_mul_int(&r->x, 2); secp256k1_fe_add(&r->x, &h3); secp256k1_fe_negate(&r->x, &r->x, 3); secp256k1_fe_add(&r->x, &i2); - secp256k1_fe_negate(&r->y, &r->x, 5); secp256k1_fe_add(&r->y, &t); secp256k1_fe_mul(&r->y, &r->y, &i); - secp256k1_fe_mul(&h3, &h3, &s1); secp256k1_fe_negate(&h3, &h3, 1); + secp256k1_fe_negate(&r->y, &r->x, 5); secp256k1_fe_add(&r->y, &t); secp256k1_fe_mul_prec(&r->y, &r->y, &i); + secp256k1_fe_mul_prec(&h3, &h3, &s1); secp256k1_fe_negate(&h3, &h3, 1); secp256k1_fe_add(&r->y, &h3); } @@ -440,9 +454,10 @@ static void secp256k1_gej_add_zinv_var(secp256k1_gej *r, const secp256k1_gej *a, if (a->infinity) { secp256k1_fe bzinv2, bzinv3; r->infinity = b->infinity; - secp256k1_fe_sqr(&bzinv2, bzinv); - secp256k1_fe_mul(&bzinv3, 
&bzinv2, bzinv); - secp256k1_fe_mul(&r->x, &b->x, &bzinv2); + bzinv3 = *bzinv; secp256k1_fe_normalize_prec(&bzinv3); + secp256k1_fe_sqr_prec_oprec(&bzinv2, &bzinv3); + secp256k1_fe_mul_prec(&bzinv3, &bzinv3, &bzinv2); + secp256k1_fe_mul_prec(&r->x, &b->x, &bzinv2); secp256k1_fe_mul(&r->y, &b->y, &bzinv3); secp256k1_fe_set_int(&r->z, 1); return; @@ -458,15 +473,17 @@ static void secp256k1_gej_add_zinv_var(secp256k1_gej *r, const secp256k1_gej *a, * for the computation of rx and ry, but not for rz. */ secp256k1_fe_mul(&az, &a->z, bzinv); + secp256k1_fe_normalize_prec(&az); - secp256k1_fe_sqr(&z12, &az); + secp256k1_fe_sqr_prec_oprec(&z12, &az); u1 = a->x; secp256k1_fe_normalize_weak(&u1); - secp256k1_fe_mul(&u2, &b->x, &z12); - s1 = a->y; secp256k1_fe_normalize_weak(&s1); - secp256k1_fe_mul(&s2, &b->y, &z12); secp256k1_fe_mul(&s2, &s2, &az); + secp256k1_fe_mul_prec(&u2, &b->x, &z12); + s1 = a->y; secp256k1_fe_normalize_weak_prec(&s1); + secp256k1_fe_mul_prec(&s2, &b->y, &z12); secp256k1_fe_mul_prec(&s2, &s2, &az); secp256k1_fe_negate(&h, &u1, 1); secp256k1_fe_add(&h, &u2); secp256k1_fe_negate(&i, &s1, 1); secp256k1_fe_add(&i, &s2); - if (secp256k1_fe_normalizes_to_zero_var(&h)) { + secp256k1_fe_normalize_var(&h); + if (secp256k1_fe_is_zero(&h)) { if (secp256k1_fe_normalizes_to_zero_var(&i)) { secp256k1_gej_double_var(r, a, NULL); } else { @@ -474,14 +491,15 @@ static void secp256k1_gej_add_zinv_var(secp256k1_gej *r, const secp256k1_gej *a, } return; } - secp256k1_fe_sqr(&i2, &i); - secp256k1_fe_sqr(&h2, &h); - secp256k1_fe_mul(&h3, &h, &h2); - r->z = a->z; secp256k1_fe_mul(&r->z, &r->z, &h); - secp256k1_fe_mul(&t, &u1, &h2); + secp256k1_fe_normalize_prec(&i); + secp256k1_fe_sqr_prec(&i2, &i); + secp256k1_fe_sqr_prec_oprec(&h2, &h); + secp256k1_fe_mul_2prec(&h3, &h2, &h); + r->z = a->z; secp256k1_fe_mul_prec(&r->z, &r->z, &h); + secp256k1_fe_mul_prec(&t, &u1, &h2); r->x = t; secp256k1_fe_mul_int(&r->x, 2); secp256k1_fe_add(&r->x, &h3); secp256k1_fe_negate(&r->x, &r->x, 3); secp256k1_fe_add(&r->x, &i2); - secp256k1_fe_negate(&r->y, &r->x, 5); secp256k1_fe_add(&r->y, &t); secp256k1_fe_mul(&r->y, &r->y, &i); - secp256k1_fe_mul(&h3, &h3, &s1); secp256k1_fe_negate(&h3, &h3, 1); + secp256k1_fe_negate(&r->y, &r->x, 5); secp256k1_fe_add(&r->y, &t); secp256k1_fe_mul_prec(&r->y, &r->y, &i); + secp256k1_fe_mul_prec(&h3, &h3, &s1); secp256k1_fe_negate(&h3, &h3, 1); secp256k1_fe_add(&r->y, &h3); } @@ -489,7 +507,7 @@ static void secp256k1_gej_add_zinv_var(secp256k1_gej *r, const secp256k1_gej *a, static void secp256k1_gej_add_ge(secp256k1_gej *r, const secp256k1_gej *a, const secp256k1_ge *b) { /* Operations: 7 mul, 5 sqr, 4 normalize, 21 mul_int/add/negate/cmov */ static const secp256k1_fe fe_1 = SECP256K1_FE_CONST(0, 0, 0, 0, 0, 0, 0, 1); - secp256k1_fe zz, u1, u2, s1, s2, t, tt, m, n, q, rr; + secp256k1_fe zz, u1, u2, s1, s2, t, tt, m, n, q, rr, az; secp256k1_fe m_alt, rr_alt; int infinity, degenerate; VERIFY_CHECK(!b->infinity); @@ -545,12 +563,13 @@ static void secp256k1_gej_add_ge(secp256k1_gej *r, const secp256k1_gej *a, const * so this covers everything. 
*/ - secp256k1_fe_sqr(&zz, &a->z); /* z = Z1^2 */ + az = a->z; secp256k1_fe_normalize_prec(&az); + secp256k1_fe_sqr_prec_oprec(&zz, &az); /* z = Z1^2 */ u1 = a->x; secp256k1_fe_normalize_weak(&u1); /* u1 = U1 = X1*Z2^2 (1) */ - secp256k1_fe_mul(&u2, &b->x, &zz); /* u2 = U2 = X2*Z1^2 (1) */ + secp256k1_fe_mul_prec(&u2, &b->x, &zz); /* u2 = U2 = X2*Z1^2 (1) */ s1 = a->y; secp256k1_fe_normalize_weak(&s1); /* s1 = S1 = Y1*Z2^3 (1) */ - secp256k1_fe_mul(&s2, &b->y, &zz); /* s2 = Y2*Z1^2 (1) */ - secp256k1_fe_mul(&s2, &s2, &a->z); /* s2 = S2 = Y2*Z1^3 (1) */ + secp256k1_fe_mul_prec(&s2, &b->y, &zz); /* s2 = Y2*Z1^2 (1) */ + secp256k1_fe_mul_prec(&s2, &s2, &az); /* s2 = S2 = Y2*Z1^3 (1) */ t = u1; secp256k1_fe_add(&t, &u2); /* t = T = U1+U2 (2) */ m = s1; secp256k1_fe_add(&m, &s2); /* m = M = S1+S2 (2) */ secp256k1_fe_sqr(&rr, &t); /* rr = T^2 (1) */ @@ -584,8 +603,9 @@ static void secp256k1_gej_add_ge(secp256k1_gej *r, const secp256k1_gej *a, const * versus two multiplications. */ secp256k1_fe_sqr(&n, &n); secp256k1_fe_cmov(&n, &m, degenerate); /* n = M^3 * Malt (2) */ - secp256k1_fe_sqr(&t, &rr_alt); /* t = Ralt^2 (1) */ - secp256k1_fe_mul(&r->z, &a->z, &m_alt); /* r->z = Malt*Z (1) */ + secp256k1_fe_normalize_prec(&rr_alt); + secp256k1_fe_sqr_prec(&t, &rr_alt); /* t = Ralt^2 (1) */ + secp256k1_fe_mul_prec(&r->z, &m_alt, &az); /* r->z = Malt*Z (1) */ infinity = secp256k1_fe_normalizes_to_zero(&r->z) & ~a->infinity; secp256k1_fe_mul_int(&r->z, 2); /* r->z = Z3 = 2*Malt*Z (2) */ secp256k1_fe_negate(&q, &q, 1); /* q = -Q (2) */ @@ -594,7 +614,7 @@ static void secp256k1_gej_add_ge(secp256k1_gej *r, const secp256k1_gej *a, const r->x = t; /* r->x = Ralt^2-Q (1) */ secp256k1_fe_mul_int(&t, 2); /* t = 2*x3 (2) */ secp256k1_fe_add(&t, &q); /* t = 2*x3 - Q: (4) */ - secp256k1_fe_mul(&t, &t, &rr_alt); /* t = Ralt*(2*x3 - Q) (1) */ + secp256k1_fe_mul_prec(&t, &t, &rr_alt); /* t = Ralt*(2*x3 - Q) (1) */ secp256k1_fe_add(&t, &n); /* t = Ralt*(2*x3 - Q) + M^3*Malt (3) */ secp256k1_fe_negate(&r->y, &t, 3); /* r->y = Ralt*(Q - 2x3) - M^3*Malt (4) */ secp256k1_fe_normalize_weak(&r->y); @@ -610,13 +630,15 @@ static void secp256k1_gej_add_ge(secp256k1_gej *r, const secp256k1_gej *a, const static void secp256k1_gej_rescale(secp256k1_gej *r, const secp256k1_fe *s) { /* Operations: 4 mul, 1 sqr */ - secp256k1_fe zz; + secp256k1_fe zz, sc; VERIFY_CHECK(!secp256k1_fe_is_zero(s)); - secp256k1_fe_sqr(&zz, s); - secp256k1_fe_mul(&r->x, &r->x, &zz); /* r->x *= s^2 */ - secp256k1_fe_mul(&r->y, &r->y, &zz); - secp256k1_fe_mul(&r->y, &r->y, s); /* r->y *= s^3 */ - secp256k1_fe_mul(&r->z, &r->z, s); /* r->z *= s */ + sc = *s; + secp256k1_fe_normalize_prec(&sc); + secp256k1_fe_sqr_prec_oprec(&zz, &sc); + secp256k1_fe_mul_prec(&r->x, &r->x, &zz); /* r->x *= s^2 */ + secp256k1_fe_mul_prec(&r->y, &r->y, &zz); + secp256k1_fe_mul_prec(&r->y, &r->y, &sc); /* r->y *= s^3 */ + secp256k1_fe_mul_prec(&r->z, &r->z, &sc); /* r->z *= s */ } static void secp256k1_ge_to_storage(secp256k1_ge_storage *r, const secp256k1_ge *a) { @@ -647,7 +669,7 @@ static void secp256k1_ge_mul_lambda(secp256k1_ge *r, const secp256k1_ge *a) { 0x9cf04975ul, 0x12f58995ul, 0xc1396c28ul, 0x719501eeul ); *r = *a; - secp256k1_fe_mul(&r->x, &r->x, &beta); + secp256k1_fe_mul_prec(&r->x, &r->x, &beta); } static int secp256k1_ge_is_in_correct_subgroup(const secp256k1_ge* ge) {
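/* Illustrative sketch, not code from this patch: under the 5x64 layout, SECP256K1_FE_CONST_INNER
 * (see the field_5x64.h hunk) packs the eight 32-bit words of a constant directly into four
 * 64-bit limbs and zeroes the top limb, so compile-time constants such as beta in
 * secp256k1_ge_mul_lambda are already fully reduced "prec" operands and can be fed to
 * secp256k1_fe_mul_prec. A restatement of that packing with a hypothetical helper name: */
static void fe_const_pack_sketch(uint64_t n[5], const uint32_t d[8]) {
    /* d[0] is the least significant 32-bit word (d0 in the macro), d[7] the most (d7). */
    n[0] = (uint64_t)d[0] | ((uint64_t)d[1] << 32);
    n[1] = (uint64_t)d[2] | ((uint64_t)d[3] << 32);
    n[2] = (uint64_t)d[4] | ((uint64_t)d[5] << 32);
    n[3] = (uint64_t)d[6] | ((uint64_t)d[7] << 32);
    n[4] = 0; /* constants are fully reduced, so the top limb is zero */
}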