From 5983351f130583ae343b76b755fcfdcff261b279 Mon Sep 17 00:00:00 2001
From: "Jonathan M. Henson"
Date: Thu, 8 Feb 2024 13:19:57 -0800
Subject: [PATCH] Fix memory read for stack allocated buffers.

---
 source/arm/crc64_arm.c | 267 ++++++++++++++++++-----------------------
 1 file changed, 118 insertions(+), 149 deletions(-)

diff --git a/source/arm/crc64_arm.c b/source/arm/crc64_arm.c
index 9cee3f3..c3378df 100644
--- a/source/arm/crc64_arm.c
+++ b/source/arm/crc64_arm.c
@@ -49,176 +49,145 @@
 #    endif // defined(__ARM_FEATURE_SHA3)
 
 /** Compute CRC64XZ using ARMv8 NEON +crypto/pmull64 instructions. */
-uint64_t aws_checksums_crc64xz_arm_pmull(const uint8_t *input, int length, const uint64_t previousCrc64) {
+uint64_t aws_checksums_crc64xz_arm_pmull(const uint8_t *input, int length, const uint64_t previous_crc64) {
     if (!input || length <= 0) {
-        return previousCrc64;
+        return previous_crc64;
+    }
+
+    // the amount of complexity required to handle vector instructions on
+    // memory regions smaller than a NEON register does not justify the very negligible performance gains
+    // we would get for using it on an input this small.
+    if (length < 16) {
+        return aws_checksums_crc64xz_sw(input, length, previous_crc64);
     }
 
     // Invert the previous crc bits and load into the lower half of a neon register
-    poly64x2_t a1 = vreinterpretq_p64_u64(vcombine_u64(vcreate_u64(~previousCrc64), vcreate_u64(0)));
+    poly64x2_t a1 = vreinterpretq_p64_u64(vcombine_u64(vcreate_u64(~previous_crc64), vcreate_u64(0)));
 
     // Load the x^128 and x^192 constants - they'll (very likely) be needed
     const poly64x2_t x128 = load_p64(aws_checksums_crc64xz_constants.x128);
 
-    if (length < 16) {
-        // Neon register loads are 16 bytes at once, so for lengths less than 16 we need to
-        // carefully load from memory to prevent reading beyond the end of the input buffer
-        int alignment = (intptr_t)input & 15;
-        if (alignment + length <= 16) {
-            // The input falls in a single 16 byte segment so we load from a 16 byte aligned address
-            // The input data will be loaded "into the middle" of the neon register
-            // Right shift the input data register to eliminate any leading bytes and move the data to the least
-            // significant bytes, then mask out the most significant bytes that may contain garbage
-            uint8x16_t load = mask_low_u8(right_shift_u8(load_u8(input - alignment), alignment), length);
-            // XOR the masked input data with the previous crc
-            a1 = xor_p64(a1, vreinterpretq_p64_u8(load));
-        } else {
-            // The input spans two 16 byte segments so it's safe to load the input from its actual starting address
-            // The input data will be in the least significant bytes of the neon register
-            // Mask out the most significant bytes that may contain garbage
-            uint8x16_t load = mask_low_u8(load_u8(input), length);
-            // XOR the masked input data with the previous crc
-            a1 = xor_p64(a1, vreinterpretq_p64_u8(load));
-        }
-
-        if (length <= 8) {
-            // For 8 or less bytes of input just left shift to effectively multiply by x^64
-            a1 = left_shift_p64(a1, 8 - length);
-        } else {
-            // For 8-15 bytes of input we can't left shift without losing the most significant bytes
-            // We need to fold the lower half of the crc register into the upper half
-            a1 = left_shift_p64(a1, 16 - length);
-            // Multiply the lower half of the crc register by x^128 (swapping upper and lower halves)
-            // XOR the result with the right shifted upper half of the crc
-            a1 = xor_p64(right_shift_imm_p64(a1, 8), pmull_lo(a1, vextq_p64(x128, x128, 1)));
-        }
-
-        // Fall through to Barrett modular reduction
-
-    } else { // There are 16 or more bytes of input
-
-        // Load the next 16 bytes of input and XOR with the previous crc
-        a1 = xor_p64(a1, load_p64_u8(input));
-        input += 16;
-        length -= 16;
-
-        if (length < 112) {
-
-            const poly64x2_t x256 = load_p64(aws_checksums_crc64xz_constants.x256);
-
-            if (length & 64) {
-                // Fold the current crc register with 64 bytes of input by multiplying 64-bit chunks by x^576 through
-                // x^128
-                const poly64x2_t x512 = load_p64(aws_checksums_crc64xz_constants.x512);
-                const poly64x2_t x384 = load_p64(aws_checksums_crc64xz_constants.x384);
-                poly64x2_t b1 = load_p64_u8(input + 0);
-                poly64x2_t c1 = load_p64_u8(input + 16);
-                poly64x2_t d1 = load_p64_u8(input + 32);
-                poly64x2_t e1 = load_p64_u8(input + 48);
-                a1 = xor3_p64(pmull_lo(x512, a1), pmull_hi(x512, a1), pmull_lo(x384, b1));
-                b1 = xor3_p64(pmull_hi(x384, b1), pmull_lo(x256, c1), pmull_hi(x256, c1));
-                c1 = xor3_p64(pmull_lo(x128, d1), pmull_hi(x128, d1), e1);
-                a1 = xor3_p64(a1, b1, c1);
-                input += 64;
-            }
-
-            if (length & 32) {
-                // Fold the current running value with 32 bytes of input by multiplying 64-bit chunks by x^320 through
-                // x^128
-                poly64x2_t b1 = load_p64_u8(input + 0);
-                poly64x2_t c1 = load_p64_u8(input + 16);
-                a1 = xor3_p64(c1, pmull_lo(x256, a1), pmull_hi(x256, a1));
-                a1 = xor3_p64(a1, pmull_lo(x128, b1), pmull_hi(x128, b1));
-                input += 32;
-            }
-        } else { // There are 112 or more bytes of input
-
-            const poly64x2_t x1024 = load_p64(aws_checksums_crc64xz_constants.x1024);
-
-            // Load another 112 bytes of input
-            poly64x2_t b1 = load_p64_u8(input + 0);
-            poly64x2_t c1 = load_p64_u8(input + 16);
-            poly64x2_t d1 = load_p64_u8(input + 32);
-            poly64x2_t e1 = load_p64_u8(input + 48);
-            poly64x2_t f1 = load_p64_u8(input + 64);
-            poly64x2_t g1 = load_p64_u8(input + 80);
-            poly64x2_t h1 = load_p64_u8(input + 96);
-            input += 112;
-            length -= 112;
-
-            // Spin through additional chunks of 128 bytes, if any
-            int loops = length / 128;
-            while (loops--) {
-                // Fold input values in parallel by multiplying by x^1088 and x^1024 constants
-                a1 = xor3_p64(pmull_lo(x1024, a1), pmull_hi(x1024, a1), load_p64_u8(input + 0));
-                b1 = xor3_p64(pmull_lo(x1024, b1), pmull_hi(x1024, b1), load_p64_u8(input + 16));
-                c1 = xor3_p64(pmull_lo(x1024, c1), pmull_hi(x1024, c1), load_p64_u8(input + 32));
-                d1 = xor3_p64(pmull_lo(x1024, d1), pmull_hi(x1024, d1), load_p64_u8(input + 48));
-                e1 = xor3_p64(pmull_lo(x1024, e1), pmull_hi(x1024, e1), load_p64_u8(input + 64));
-                f1 = xor3_p64(pmull_lo(x1024, f1), pmull_hi(x1024, f1), load_p64_u8(input + 80));
-                g1 = xor3_p64(pmull_lo(x1024, g1), pmull_hi(x1024, g1), load_p64_u8(input + 96));
-                h1 = xor3_p64(pmull_lo(x1024, h1), pmull_hi(x1024, h1), load_p64_u8(input + 112));
-                input += 128;
-            }
-
-            // Fold 128 bytes down to 64 bytes by multiplying by the x^576 and x^512 constants
-            const poly64x2_t x512 = load_p64(aws_checksums_crc64xz_constants.x512);
-            a1 = xor3_p64(e1, pmull_lo(x512, a1), pmull_hi(x512, a1));
-            b1 = xor3_p64(f1, pmull_lo(x512, b1), pmull_hi(x512, b1));
-            c1 = xor3_p64(g1, pmull_lo(x512, c1), pmull_hi(x512, c1));
-            d1 = xor3_p64(h1, pmull_lo(x512, d1), pmull_hi(x512, d1));
-
-            if (length & 64) {
-                // Fold the current 64 bytes with 64 bytes of input by multiplying by x^576 and x^512 constants
-                a1 = xor3_p64(pmull_lo(x512, a1), pmull_hi(x512, a1), load_p64_u8(input + 0));
-                b1 = xor3_p64(pmull_lo(x512, b1), pmull_hi(x512, b1), load_p64_u8(input + 16));
-                c1 = xor3_p64(pmull_lo(x512, c1), pmull_hi(x512, c1), load_p64_u8(input + 32));
-                d1 = xor3_p64(pmull_lo(x512, d1), pmull_hi(x512, d1), load_p64_u8(input + 48));
-                input += 64;
-            }
-
-            // Fold 64 bytes down to 32 bytes by multiplying by the x^320 and x^256 constants
-            const poly64x2_t x256 = load_p64(aws_checksums_crc64xz_constants.x256);
-            a1 = xor3_p64(c1, pmull_lo(x256, a1), pmull_hi(x256, a1));
-            b1 = xor3_p64(d1, pmull_lo(x256, b1), pmull_hi(x256, b1));
-
-            if (length & 32) {
-                // Fold the current running value with 32 bytes of input by multiplying by x^320 and x^256 constants
-                a1 = xor3_p64(pmull_lo(x256, a1), pmull_hi(x256, a1), load_p64_u8(input + 0));
-                b1 = xor3_p64(pmull_lo(x256, b1), pmull_hi(x256, b1), load_p64_u8(input + 16));
-                input += 32;
-            }
-
-            // Fold 32 bytes down to 16 bytes by multiplying by x^192 and x^128 constants
-            a1 = xor3_p64(b1, pmull_lo(x128, a1), pmull_hi(x128, a1));
-        }
-
-        if (length & 16) {
-            // Fold the current 16 bytes with 16 bytes of input by multiplying by x^192 and x^128 constants
-            a1 = xor3_p64(pmull_lo(x128, a1), pmull_hi(x128, a1), load_p64_u8(input + 0));
-            input += 16;
-        }
-
-        // There must only be 0-15 bytes of input left
-        length &= 15;
-
-        if (length == 0) {
-            // Multiply the lower half of the crc register by x^128 (swapping upper and lower halves)
-            poly64x2_t mul_by_x128 = pmull_lo(a1, vextq_p64(x128, x128, 1));
-            // XOR the result with the right shifted upper half of the crc
-            a1 = xor_p64(right_shift_imm_p64(a1, 8), mul_by_x128);
-        } else {
-            // Handle any trailing input from 1-15 bytes
-            const poly64x2_t trailing_constants = load_p64(aws_checksums_crc64xz_constants.trailing[length - 1]);
-            // Multiply the crc by a pair of trailing length constants in order to fold it into the trailing input
-            a1 = xor_p64(pmull_lo(a1, trailing_constants), pmull_hi(a1, trailing_constants));
-            // Safely load ending at the last byte of trailing input and mask out any leading garbage
-            poly64x2_t trailing_input = mask_high_p64(load_p64_u8(input + length - 16), length);
-            // Multiply the lower half of the trailing input register by x^128 (swapping x^192 and x^128 halves)
-            poly64x2_t mul_by_x128 = pmull_lo(trailing_input, vextq_p64(x128, x128, 1));
-            // XOR the results with the right shifted upper half of the trailing input
-            a1 = xor3_p64(a1, right_shift_imm_p64(trailing_input, 8), mul_by_x128);
-        }
+    // Load the next 16 bytes of input and XOR with the previous crc
+    a1 = xor_p64(a1, load_p64_u8(input));
+    input += 16;
+    length -= 16;
+
+    if (length < 112) {
+
+        const poly64x2_t x256 = load_p64(aws_checksums_crc64xz_constants.x256);
+
+        if (length & 64) {
+            // Fold the current crc register with 64 bytes of input by multiplying 64-bit chunks by x^576 through
+            // x^128
+            const poly64x2_t x512 = load_p64(aws_checksums_crc64xz_constants.x512);
+            const poly64x2_t x384 = load_p64(aws_checksums_crc64xz_constants.x384);
+            poly64x2_t b1 = load_p64_u8(input + 0);
+            poly64x2_t c1 = load_p64_u8(input + 16);
+            poly64x2_t d1 = load_p64_u8(input + 32);
+            poly64x2_t e1 = load_p64_u8(input + 48);
+            a1 = xor3_p64(pmull_lo(x512, a1), pmull_hi(x512, a1), pmull_lo(x384, b1));
+            b1 = xor3_p64(pmull_hi(x384, b1), pmull_lo(x256, c1), pmull_hi(x256, c1));
+            c1 = xor3_p64(pmull_lo(x128, d1), pmull_hi(x128, d1), e1);
+            a1 = xor3_p64(a1, b1, c1);
+            input += 64;
+        }
+
+        if (length & 32) {
+            // Fold the current running value with 32 bytes of input by multiplying 64-bit chunks by x^320 through
+            // x^128
+            poly64x2_t b1 = load_p64_u8(input + 0);
+            poly64x2_t c1 = load_p64_u8(input + 16);
+            a1 = xor3_p64(c1, pmull_lo(x256, a1), pmull_hi(x256, a1));
+            a1 = xor3_p64(a1, pmull_lo(x128, b1), pmull_hi(x128, b1));
+            input += 32;
+        }
+    } else { // There are 112 or more bytes of input
+
+        const poly64x2_t x1024 = load_p64(aws_checksums_crc64xz_constants.x1024);
+
+        // Load another 112 bytes of input
+        poly64x2_t b1 = load_p64_u8(input + 0);
+        poly64x2_t c1 = load_p64_u8(input + 16);
+        poly64x2_t d1 = load_p64_u8(input + 32);
+        poly64x2_t e1 = load_p64_u8(input + 48);
+        poly64x2_t f1 = load_p64_u8(input + 64);
+        poly64x2_t g1 = load_p64_u8(input + 80);
+        poly64x2_t h1 = load_p64_u8(input + 96);
+        input += 112;
+        length -= 112;
+
+        // Spin through additional chunks of 128 bytes, if any
+        int loops = length / 128;
+        while (loops--) {
+            // Fold input values in parallel by multiplying by x^1088 and x^1024 constants
+            a1 = xor3_p64(pmull_lo(x1024, a1), pmull_hi(x1024, a1), load_p64_u8(input + 0));
+            b1 = xor3_p64(pmull_lo(x1024, b1), pmull_hi(x1024, b1), load_p64_u8(input + 16));
+            c1 = xor3_p64(pmull_lo(x1024, c1), pmull_hi(x1024, c1), load_p64_u8(input + 32));
+            d1 = xor3_p64(pmull_lo(x1024, d1), pmull_hi(x1024, d1), load_p64_u8(input + 48));
+            e1 = xor3_p64(pmull_lo(x1024, e1), pmull_hi(x1024, e1), load_p64_u8(input + 64));
+            f1 = xor3_p64(pmull_lo(x1024, f1), pmull_hi(x1024, f1), load_p64_u8(input + 80));
+            g1 = xor3_p64(pmull_lo(x1024, g1), pmull_hi(x1024, g1), load_p64_u8(input + 96));
+            h1 = xor3_p64(pmull_lo(x1024, h1), pmull_hi(x1024, h1), load_p64_u8(input + 112));
+            input += 128;
+        }
+
+        // Fold 128 bytes down to 64 bytes by multiplying by the x^576 and x^512 constants
+        const poly64x2_t x512 = load_p64(aws_checksums_crc64xz_constants.x512);
+        a1 = xor3_p64(e1, pmull_lo(x512, a1), pmull_hi(x512, a1));
+        b1 = xor3_p64(f1, pmull_lo(x512, b1), pmull_hi(x512, b1));
+        c1 = xor3_p64(g1, pmull_lo(x512, c1), pmull_hi(x512, c1));
+        d1 = xor3_p64(h1, pmull_lo(x512, d1), pmull_hi(x512, d1));
+
+        if (length & 64) {
+            // Fold the current 64 bytes with 64 bytes of input by multiplying by x^576 and x^512 constants
+            a1 = xor3_p64(pmull_lo(x512, a1), pmull_hi(x512, a1), load_p64_u8(input + 0));
+            b1 = xor3_p64(pmull_lo(x512, b1), pmull_hi(x512, b1), load_p64_u8(input + 16));
+            c1 = xor3_p64(pmull_lo(x512, c1), pmull_hi(x512, c1), load_p64_u8(input + 32));
+            d1 = xor3_p64(pmull_lo(x512, d1), pmull_hi(x512, d1), load_p64_u8(input + 48));
+            input += 64;
+        }
+
+        // Fold 64 bytes down to 32 bytes by multiplying by the x^320 and x^256 constants
+        const poly64x2_t x256 = load_p64(aws_checksums_crc64xz_constants.x256);
+        a1 = xor3_p64(c1, pmull_lo(x256, a1), pmull_hi(x256, a1));
+        b1 = xor3_p64(d1, pmull_lo(x256, b1), pmull_hi(x256, b1));
+
+        if (length & 32) {
+            // Fold the current running value with 32 bytes of input by multiplying by x^320 and x^256 constants
+            a1 = xor3_p64(pmull_lo(x256, a1), pmull_hi(x256, a1), load_p64_u8(input + 0));
+            b1 = xor3_p64(pmull_lo(x256, b1), pmull_hi(x256, b1), load_p64_u8(input + 16));
+            input += 32;
+        }
+
+        // Fold 32 bytes down to 16 bytes by multiplying by x^192 and x^128 constants
+        a1 = xor3_p64(b1, pmull_lo(x128, a1), pmull_hi(x128, a1));
+    }
+
+    if (length & 16) {
+        // Fold the current 16 bytes with 16 bytes of input by multiplying by x^192 and x^128 constants
+        a1 = xor3_p64(pmull_lo(x128, a1), pmull_hi(x128, a1), load_p64_u8(input + 0));
+        input += 16;
+    }
+
+    // There must only be 0-15 bytes of input left
+    length &= 15;
+
+    if (length == 0) {
+        // Multiply the lower half of the crc register by x^128 (swapping upper and lower halves)
+        poly64x2_t mul_by_x128 = pmull_lo(a1, vextq_p64(x128, x128, 1));
+        // XOR the result with the right shifted upper half of the crc
+        a1 = xor_p64(right_shift_imm_p64(a1, 8), mul_by_x128);
+    } else {
+        // Handle any trailing input from 1-15 bytes
+        const poly64x2_t trailing_constants = load_p64(aws_checksums_crc64xz_constants.trailing[length - 1]);
+        // Multiply the crc by a pair of trailing length constants in order to fold it into the trailing input
+        a1 = xor_p64(pmull_lo(a1, trailing_constants), pmull_hi(a1, trailing_constants));
+        // Safely load ending at the last byte of trailing input and mask out any leading garbage
+        poly64x2_t trailing_input = mask_high_p64(load_p64_u8(input + length - 16), length);
+        // Multiply the lower half of the trailing input register by x^128 (swapping x^192 and x^128 halves)
+        poly64x2_t mul_by_x128 = pmull_lo(trailing_input, vextq_p64(x128, x128, 1));
+        // XOR the results with the right shifted upper half of the trailing input
+        a1 = xor3_p64(a1, right_shift_imm_p64(trailing_input, 8), mul_by_x128);
     }
 
     // Barrett modular reduction
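A minimal sketch of the case the subject line describes, assuming a hypothetical caller that invokes the patched function directly; the prototype is copied from the function above, and the helper name checksum_small_stack_buffer is invented for illustration only:

#include <stdint.h>
#include <string.h>

// Prototype of the function modified above (post-patch signature).
uint64_t aws_checksums_crc64xz_arm_pmull(const uint8_t *input, int length, const uint64_t previous_crc64);

// Hypothetical caller: checksums a 5-byte stack buffer.
// Before this patch, the length < 16 path loaded a full 16-byte NEON register that
// overlaps but extends beyond the 5 live bytes of msg, which tools such as
// AddressSanitizer flag as an out-of-bounds read on stack allocations. With this
// patch, inputs shorter than 16 bytes are delegated to aws_checksums_crc64xz_sw,
// which reads only the bytes that exist.
static uint64_t checksum_small_stack_buffer(void) {
    uint8_t msg[5];
    memcpy(msg, "hello", sizeof(msg));
    return aws_checksums_crc64xz_arm_pmull(msg, (int)sizeof(msg), 0);
}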