From aa8499080e2a657113781921096b59a74d7bc0e7 Mon Sep 17 00:00:00 2001 From: CryptoGuru Date: Mon, 9 Jan 2017 23:18:57 +0000 Subject: [PATCH 1/3] Unroll secp256k1_fe_(get|set)_b32 for 5x52. field_get_b32: min 0.647us / avg 0.666us / max 0.751us field_set_b32: min 0.551us / avg 0.571us / max 0.624us becomes field_get_b32: min 0us / avg 0.0000000477us / max 0.000000238us field_set_b32: min 0us / avg 0.0000000238us / max 0.000000238us (Patch from https://bitcointalk.org/index.php?topic=1740973.0 _get was reversed from the patch because this order appeared somewhat faster in testing.) Signed-off-by: Gregory Maxwell --- src/field_5x52_impl.h | 87 ++++++++++++++++++++++++++++++++----------- 1 file changed, 66 insertions(+), 21 deletions(-) diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index dd88f38c77..8e8b286baf 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -284,16 +284,40 @@ static int secp256k1_fe_cmp_var(const secp256k1_fe *a, const secp256k1_fe *b) { } static int secp256k1_fe_set_b32(secp256k1_fe *r, const unsigned char *a) { - int i; - r->n[0] = r->n[1] = r->n[2] = r->n[3] = r->n[4] = 0; - for (i=0; i<32; i++) { - int j; - for (j=0; j<2; j++) { - int limb = (8*i+4*j)/52; - int shift = (8*i+4*j)%52; - r->n[limb] |= (uint64_t)((a[31-i] >> (4*j)) & 0xF) << shift; - } - } + r->n[0] = (uint64_t)a[31] + | ((uint64_t)a[30] << 8) + | ((uint64_t)a[29] << 16) + | ((uint64_t)a[28] << 24) + | ((uint64_t)a[27] << 32) + | ((uint64_t)a[26] << 40) + | ((uint64_t)(a[25] & 0xF) << 48); + r->n[1] = (uint64_t)((a[25] >> 4) & 0xF) + | ((uint64_t)a[24] << 4) + | ((uint64_t)a[23] << 12) + | ((uint64_t)a[22] << 20) + | ((uint64_t)a[21] << 28) + | ((uint64_t)a[20] << 36) + | ((uint64_t)a[19] << 44); + r->n[2] = (uint64_t)a[18] + | ((uint64_t)a[17] << 8) + | ((uint64_t)a[16] << 16) + | ((uint64_t)a[15] << 24) + | ((uint64_t)a[14] << 32) + | ((uint64_t)a[13] << 40) + | ((uint64_t)(a[12] & 0xF) << 48); + r->n[3] = (uint64_t)((a[12] >> 4) & 0xF) + | ((uint64_t)a[11] << 4) + | ((uint64_t)a[10] << 12) + | ((uint64_t)a[9] << 20) + | ((uint64_t)a[8] << 28) + | ((uint64_t)a[7] << 36) + | ((uint64_t)a[6] << 44); + r->n[4] = (uint64_t)a[5] + | ((uint64_t)a[4] << 8) + | ((uint64_t)a[3] << 16) + | ((uint64_t)a[2] << 24) + | ((uint64_t)a[1] << 32) + | ((uint64_t)a[0] << 40); if (r->n[4] == 0x0FFFFFFFFFFFFULL && (r->n[3] & r->n[2] & r->n[1]) == 0xFFFFFFFFFFFFFULL && r->n[0] >= 0xFFFFEFFFFFC2FULL) { return 0; } @@ -307,21 +331,42 @@ static int secp256k1_fe_set_b32(secp256k1_fe *r, const unsigned char *a) { /** Convert a field element to a 32-byte big endian value. Requires the input to be normalized */ static void secp256k1_fe_get_b32(unsigned char *r, const secp256k1_fe *a) { - int i; #ifdef VERIFY VERIFY_CHECK(a->normalized); secp256k1_fe_verify(a); #endif - for (i=0; i<32; i++) { - int j; - int c = 0; - for (j=0; j<2; j++) { - int limb = (8*i+4*j)/52; - int shift = (8*i+4*j)%52; - c |= ((a->n[limb] >> shift) & 0xF) << (4 * j); - } - r[31-i] = c; - } + r[0] = (a->n[4] >> 40) & 0xFF; + r[1] = (a->n[4] >> 32) & 0xFF; + r[2] = (a->n[4] >> 24) & 0xFF; + r[3] = (a->n[4] >> 16) & 0xFF; + r[4] = (a->n[4] >> 8) & 0xFF; + r[5] = a->n[4] & 0xFF; + r[6] = (a->n[3] >> 44) & 0xFF; + r[7] = (a->n[3] >> 36) & 0xFF; + r[8] = (a->n[3] >> 28) & 0xFF; + r[9] = (a->n[3] >> 20) & 0xFF; + r[10] = (a->n[3] >> 12) & 0xFF; + r[11] = (a->n[3] >> 4) & 0xFF; + r[12] = ((a->n[2] >> 48) & 0xF) | ((a->n[3] & 0xF) << 4); + r[13] = (a->n[2] >> 40) & 0xFF; + r[14] = (a->n[2] >> 32) & 0xFF; + r[15] = (a->n[2] >> 24) & 0xFF; + r[16] = (a->n[2] >> 16) & 0xFF; + r[17] = (a->n[2] >> 8) & 0xFF; + r[18] = a->n[2] & 0xFF; + r[19] = (a->n[1] >> 44) & 0xFF; + r[20] = (a->n[1] >> 36) & 0xFF; + r[21] = (a->n[1] >> 28) & 0xFF; + r[22] = (a->n[1] >> 20) & 0xFF; + r[23] = (a->n[1] >> 12) & 0xFF; + r[24] = (a->n[1] >> 4) & 0xFF; + r[25] = ((a->n[0] >> 48) & 0xF) | ((a->n[1] & 0xF) << 4); + r[26] = (a->n[0] >> 40) & 0xFF; + r[27] = (a->n[0] >> 32) & 0xFF; + r[28] = (a->n[0] >> 24) & 0xFF; + r[29] = (a->n[0] >> 16) & 0xFF; + r[30] = (a->n[0] >> 8) & 0xFF; + r[31] = a->n[0] & 0xFF; } SECP256K1_INLINE static void secp256k1_fe_negate(secp256k1_fe *r, const secp256k1_fe *a, int m) { From 8b7680a826498a786eca5737e0e97ee4d2e63713 Mon Sep 17 00:00:00 2001 From: Gregory Maxwell Date: Tue, 10 Jan 2017 01:54:49 +0000 Subject: [PATCH 2/3] Unroll secp256k1_fe_(get|set)_b32 for 10x26. field_get_b32: min 0.890us / avg 0.905us / max 0.956us field_set_b32: min 1.12us / avg 1.15us / max 1.19us becomes field_get_b32: min 0us / avg 0.000000119us / max 0.000000238us field_set_b32: min 0.0532us / avg 0.0584us / max 0.0782us --- src/field_10x26_impl.h | 65 ++++++++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 22 deletions(-) diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h index 5fb092f1be..234c13a644 100644 --- a/src/field_10x26_impl.h +++ b/src/field_10x26_impl.h @@ -321,17 +321,17 @@ static int secp256k1_fe_cmp_var(const secp256k1_fe *a, const secp256k1_fe *b) { } static int secp256k1_fe_set_b32(secp256k1_fe *r, const unsigned char *a) { - int i; - r->n[0] = r->n[1] = r->n[2] = r->n[3] = r->n[4] = 0; - r->n[5] = r->n[6] = r->n[7] = r->n[8] = r->n[9] = 0; - for (i=0; i<32; i++) { - int j; - for (j=0; j<4; j++) { - int limb = (8*i+2*j)/26; - int shift = (8*i+2*j)%26; - r->n[limb] |= (uint32_t)((a[31-i] >> (2*j)) & 0x3) << shift; - } - } + r->n[0] = (uint32_t)a[31] | ((uint32_t)a[30] << 8) | ((uint32_t)a[29] << 16) | ((uint32_t)(a[28] & 0x3) << 24); + r->n[1] = (uint32_t)((a[28] >> 2) & 0x3f) | ((uint32_t)a[27] << 6) | ((uint32_t)a[26] << 14) | ((uint32_t)(a[25] & 0xf) << 22); + r->n[2] = (uint32_t)((a[25] >> 4) & 0xf) | ((uint32_t)a[24] << 4) | ((uint32_t)a[23] << 12) | ((uint32_t)(a[22] & 0x3f) << 20); + r->n[3] = (uint32_t)((a[22] >> 6) & 0x3) | ((uint32_t)a[21] << 2) | ((uint32_t)a[20] << 10) | ((uint32_t)a[19] << 18); + r->n[4] = (uint32_t)a[18] | ((uint32_t)a[17] << 8) | ((uint32_t)a[16] << 16) | ((uint32_t)(a[15] & 0x3) << 24); + r->n[5] = (uint32_t)((a[15] >> 2) & 0x3f) | ((uint32_t)a[14] << 6) | ((uint32_t)a[13] << 14) | ((uint32_t)(a[12] & 0xf) << 22); + r->n[6] = (uint32_t)((a[12] >> 4) & 0xf) | ((uint32_t)a[11] << 4) | ((uint32_t)a[10] << 12) | ((uint32_t)(a[9] & 0x3f) << 20); + r->n[7] = (uint32_t)((a[9] >> 6) & 0x3) | ((uint32_t)a[8] << 2) | ((uint32_t)a[7] << 10) | ((uint32_t)a[6] << 18); + r->n[8] = (uint32_t)a[5] | ((uint32_t)a[4] << 8) | ((uint32_t)a[3] << 16) | ((uint32_t)(a[2] & 0x3) << 24); + r->n[9] = (uint32_t)((a[2] >> 2) & 0x3f) | ((uint32_t)a[1] << 6) | ((uint32_t)a[0] << 14); + if (r->n[9] == 0x3FFFFFUL && (r->n[8] & r->n[7] & r->n[6] & r->n[5] & r->n[4] & r->n[3] & r->n[2]) == 0x3FFFFFFUL && (r->n[1] + 0x40UL + ((r->n[0] + 0x3D1UL) >> 26)) > 0x3FFFFFFUL) { return 0; } @@ -345,21 +345,42 @@ static int secp256k1_fe_set_b32(secp256k1_fe *r, const unsigned char *a) { /** Convert a field element to a 32-byte big endian value. Requires the input to be normalized */ static void secp256k1_fe_get_b32(unsigned char *r, const secp256k1_fe *a) { - int i; #ifdef VERIFY VERIFY_CHECK(a->normalized); secp256k1_fe_verify(a); #endif - for (i=0; i<32; i++) { - int j; - int c = 0; - for (j=0; j<4; j++) { - int limb = (8*i+2*j)/26; - int shift = (8*i+2*j)%26; - c |= ((a->n[limb] >> shift) & 0x3) << (2 * j); - } - r[31-i] = c; - } + r[0] = (a->n[9] >> 14) & 0xff; + r[1] = (a->n[9] >> 6) & 0xff; + r[2] = ((a->n[9] & 0x3F) << 2) | ((a->n[8] >> 24) & 0x3); + r[3] = (a->n[8] >> 16) & 0xff; + r[4] = (a->n[8] >> 8) & 0xff; + r[5] = a->n[8] & 0xff; + r[6] = (a->n[7] >> 18) & 0xff; + r[7] = (a->n[7] >> 10) & 0xff; + r[8] = (a->n[7] >> 2) & 0xff; + r[9] = ((a->n[7] & 0x3) << 6) | ((a->n[6] >> 20) & 0x3f); + r[10] = (a->n[6] >> 12) & 0xff; + r[11] = (a->n[6] >> 4) & 0xff; + r[12] = ((a->n[6] & 0xf) << 4) | ((a->n[5] >> 22) & 0xf); + r[13] = (a->n[5] >> 14) & 0xff; + r[14] = (a->n[5] >> 6) & 0xff; + r[15] = ((a->n[5] & 0x3f) << 2) | ((a->n[4] >> 24) & 0x3); + r[16] = (a->n[4] >> 16) & 0xff; + r[17] = (a->n[4] >> 8) & 0xff; + r[18] = a->n[4] & 0xff; + r[19] = (a->n[3] >> 18) & 0xff; + r[20] = (a->n[3] >> 10) & 0xff; + r[21] = (a->n[3] >> 2) & 0xff; + r[22] = ((a->n[3] & 0x3) << 6) | ((a->n[2] >> 20) & 0x3f); + r[23] = (a->n[2] >> 12) & 0xff; + r[24] = (a->n[2] >> 4) & 0xff; + r[25] = ((a->n[2] & 0xf) << 4) | ((a->n[1] >> 22) & 0xf); + r[26] = (a->n[1] >> 14) & 0xff; + r[27] = (a->n[1] >> 6) & 0xff; + r[28] = ((a->n[1] & 0x3f) << 2) | ((a->n[0] >> 24) & 0x3); + r[29] = (a->n[0] >> 16) & 0xff; + r[30] = (a->n[0] >> 8) & 0xff; + r[31] = a->n[0] & 0xff; } SECP256K1_INLINE static void secp256k1_fe_negate(secp256k1_fe *r, const secp256k1_fe *a, int m) { From a2b6b1914f5146e563697438a8cb54faefeefb92 Mon Sep 17 00:00:00 2001 From: Gregory Maxwell Date: Mon, 24 Apr 2017 06:02:36 +0000 Subject: [PATCH 3/3] Fix benchmark print_number infinite loop. --- src/bench.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bench.h b/src/bench.h index 3a71b4aafa..d67f08a426 100644 --- a/src/bench.h +++ b/src/bench.h @@ -23,7 +23,7 @@ void print_number(double x) { if (y < 0.0) { y = -y; } - while (y < 100.0) { + while (y > 0 && y < 100.0) { y *= 10.0; c++; }