From 7f8a7f2ba2c6a5cb31426b8c13618d57aa97c687 Mon Sep 17 00:00:00 2001
From: Yagiz Nizipli
Date: Sat, 15 Nov 2025 15:03:59 -0500
Subject: [PATCH] add SSSE3 SIMD instructions

---
Review notes (free-form text in this section is ignored by git-am):
- Restored tokens that were missing from the flattened patch text: the
  <tmmintrin.h> include targets and static_cast<uint32_t>(mask), which
  matches the trailing_zeroes(uint32_t) signature.
- The context lines `template <typename out_iter>` and `#include <algorithm>`,
  and the indentation of context lines, are best-effort reconstructions;
  verify them against the target tree before applying.
- Corrected lookup-table comments so that they agree with the table contents
  and documented why the overlapping tail load is safe. Line counts per hunk
  are unchanged.

 include/ada/common_defs.h |   4 ++
 src/helpers.cpp           | 128 +++++++++++++++++++++++++++++++++++++-
 src/unicode.cpp           |  38 ++++++++++-
 3 files changed, 166 insertions(+), 4 deletions(-)

diff --git a/include/ada/common_defs.h b/include/ada/common_defs.h
index 4820a9070..ac6e660eb 100644
--- a/include/ada/common_defs.h
+++ b/include/ada/common_defs.h
@@ -233,6 +233,10 @@ namespace ada {
   } while (0)
 #endif
 
+#if defined(__SSSE3__)
+#define ADA_SSSE3 1
+#endif
+
 #if defined(__SSE2__) || defined(__x86_64__) || defined(__x86_64) || \
     (defined(_M_AMD64) || defined(_M_X64) ||                         \
      (defined(_M_IX86_FP) && _M_IX86_FP == 2))
diff --git a/src/helpers.cpp b/src/helpers.cpp
index 502cbbbd5..65512a090 100644
--- a/src/helpers.cpp
+++ b/src/helpers.cpp
@@ -5,6 +5,10 @@
 #include "ada/common_defs.h"
 #include "ada/scheme.h"
 
+#if ADA_SSSE3
+#include <tmmintrin.h>
+#endif
+
 namespace ada::helpers {
 
 template <typename out_iter>
@@ -178,7 +182,64 @@ ada_really_inline int trailing_zeroes(uint32_t input_num) noexcept {
 // starting at index location, this finds the next location of a character
 // :, /, \\, ? or [. If none is found, view.size() is returned.
 // For use within get_host_delimiter_location.
-#if ADA_NEON
+#if ADA_SSSE3
+ada_really_inline size_t find_next_host_delimiter_special(
+    std::string_view view, size_t location) noexcept {
+  // first check for short strings in which case we do it naively.
+  if (view.size() - location < 16) {  // slow path
+    for (size_t i = location; i < view.size(); i++) {
+      if (view[i] == ':' || view[i] == '/' || view[i] == '\\' ||
+          view[i] == '?' || view[i] == '[') {
+        return i;
+      }
+    }
+    return size_t(view.size());
+  }
+  // fast path for long strings (expected to be common)
+  // Using SSSE3's _mm_shuffle_epi8 for table lookup (same approach as NEON)
+  size_t i = location;
+  const __m128i low_mask =
+      _mm_setr_epi8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                    0x01, 0x04, 0x04, 0x00, 0x00, 0x03);
+  const __m128i high_mask =
+      _mm_setr_epi8(0x00, 0x00, 0x02, 0x01, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00,
+                    0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+  const __m128i fmask = _mm_set1_epi8(0xf);
+  const __m128i zero = _mm_setzero_si128();
+  for (; i + 15 < view.size(); i += 16) {
+    __m128i word = _mm_loadu_si128((const __m128i*)(view.data() + i));
+    __m128i lowpart = _mm_shuffle_epi8(low_mask, _mm_and_si128(word, fmask));
+    __m128i highpart = _mm_shuffle_epi8(
+        high_mask, _mm_and_si128(_mm_srli_epi16(word, 4), fmask));
+    __m128i classify = _mm_and_si128(lowpart, highpart);
+    __m128i is_zero = _mm_cmpeq_epi8(classify, zero);
+    // _mm_movemask_epi8 returns a 16-bit mask in bits 0-15, with bits 16-31
+    // zero. After NOT (~), bits 16-31 become 1. We must mask to 16 bits to
+    // avoid false positives.
+    int mask = ~_mm_movemask_epi8(is_zero) & 0xFFFF;
+    if (mask != 0) {
+      return i + trailing_zeroes(static_cast<uint32_t>(mask));
+    }
+  }
+  if (i < view.size()) {
+    __m128i word =
+        _mm_loadu_si128((const __m128i*)(view.data() + view.length() - 16));
+    __m128i lowpart = _mm_shuffle_epi8(low_mask, _mm_and_si128(word, fmask));
+    __m128i highpart = _mm_shuffle_epi8(
+        high_mask, _mm_and_si128(_mm_srli_epi16(word, 4), fmask));
+    __m128i classify = _mm_and_si128(lowpart, highpart);
+    __m128i is_zero = _mm_cmpeq_epi8(classify, zero);
+    // `word` is the last 16 bytes of view; any overlap with bytes scanned in
+    // the loop is harmless since those had no match. As above, keep only the
+    // low 16 bits after the NOT.
+    int mask = ~_mm_movemask_epi8(is_zero) & 0xFFFF;
+    if (mask != 0) {
+      return view.length() - 16 + trailing_zeroes(static_cast<uint32_t>(mask));
+    }
+  }
+  return size_t(view.size());
+}
+#elif ADA_NEON
 // The ada_make_uint8x16_t macro is necessary because Visual Studio does not
 // support direct initialization of uint8x16_t. See
 // https://developercommunity.visualstudio.com/t/error-C2078:-too-many-initializers-whe/402911?q=backend+neon
@@ -417,7 +478,70 @@ ada_really_inline size_t find_next_host_delimiter_special(
 // starting at index location, this finds the next location of a character
 // :, /, ? or [. If none is found, view.size() is returned.
 // For use within get_host_delimiter_location.
-#if ADA_NEON
+#if ADA_SSSE3
+ada_really_inline size_t find_next_host_delimiter(std::string_view view,
+                                                  size_t location) noexcept {
+  // first check for short strings in which case we do it naively.
+  if (view.size() - location < 16) {  // slow path
+    for (size_t i = location; i < view.size(); i++) {
+      if (view[i] == ':' || view[i] == '/' || view[i] == '?' ||
+          view[i] == '[') {
+        return i;
+      }
+    }
+    return size_t(view.size());
+  }
+  // fast path for long strings (expected to be common)
+  size_t i = location;
+  // Lookup tables for bit classification (match when low & high != 0):
+  // ':' (0x3A): low[0xA]=0x01 & high[0x3]=0x01 -> 0x01, match
+  // '/' (0x2F): low[0xF]=0x03 & high[0x2]=0x02 -> 0x02, match
+  // '?' (0x3F): low[0xF]=0x03 & high[0x3]=0x01 -> 0x01, match
+  // '[' (0x5B): low[0xB]=0x04 & high[0x5]=0x04 -> 0x04, match
+  const __m128i low_mask =
+      _mm_setr_epi8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                    0x01, 0x04, 0x00, 0x00, 0x00, 0x03);
+  const __m128i high_mask =
+      _mm_setr_epi8(0x00, 0x00, 0x02, 0x01, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00,
+                    0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+  const __m128i fmask = _mm_set1_epi8(0xf);
+  const __m128i zero = _mm_setzero_si128();
+
+  for (; i + 15 < view.size(); i += 16) {
+    __m128i word = _mm_loadu_si128((const __m128i*)(view.data() + i));
+    __m128i lowpart = _mm_shuffle_epi8(low_mask, _mm_and_si128(word, fmask));
+    __m128i highpart = _mm_shuffle_epi8(
+        high_mask, _mm_and_si128(_mm_srli_epi16(word, 4), fmask));
+    __m128i classify = _mm_and_si128(lowpart, highpart);
+    __m128i is_zero = _mm_cmpeq_epi8(classify, zero);
+    // _mm_movemask_epi8 returns a 16-bit mask in bits 0-15, with bits 16-31
+    // zero. After NOT (~), bits 16-31 become 1. We must mask to 16 bits to
+    // avoid false positives.
+    int mask = ~_mm_movemask_epi8(is_zero) & 0xFFFF;
+    if (mask != 0) {
+      return i + trailing_zeroes(static_cast<uint32_t>(mask));
+    }
+  }
+
+  if (i < view.size()) {
+    __m128i word =
+        _mm_loadu_si128((const __m128i*)(view.data() + view.length() - 16));
+    __m128i lowpart = _mm_shuffle_epi8(low_mask, _mm_and_si128(word, fmask));
+    __m128i highpart = _mm_shuffle_epi8(
+        high_mask, _mm_and_si128(_mm_srli_epi16(word, 4), fmask));
+    __m128i classify = _mm_and_si128(lowpart, highpart);
+    __m128i is_zero = _mm_cmpeq_epi8(classify, zero);
+    // `word` is the last 16 bytes of view; any overlap with bytes scanned in
+    // the loop is harmless since those had no match. As above, keep only the
+    // low 16 bits after the NOT.
+    int mask = ~_mm_movemask_epi8(is_zero) & 0xFFFF;
+    if (mask != 0) {
+      return view.length() - 16 + trailing_zeroes(static_cast<uint32_t>(mask));
+    }
+  }
+  return size_t(view.size());
+}
+#elif ADA_NEON
 ada_really_inline size_t find_next_host_delimiter(std::string_view view,
                                                   size_t location) noexcept {
   // first check for short strings in which case we do it naively.
diff --git a/src/unicode.cpp b/src/unicode.cpp
index 69b3ddb4c..e1f887be5 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -10,7 +10,9 @@ ADA_PUSH_DISABLE_ALL_WARNINGS
 ADA_POP_DISABLE_WARNINGS
 
 #include <algorithm>
-#if ADA_NEON
+#if ADA_SSSE3
+#include <tmmintrin.h>
+#elif ADA_NEON
 #include <arm_neon.h>
 #elif ADA_SSE2
 #include <emmintrin.h>
@@ -57,7 +59,39 @@ constexpr bool to_lower_ascii(char* input, size_t length) noexcept {
   }
   return non_ascii == 0;
 }
-#if ADA_NEON
+#if ADA_SSSE3
+ada_really_inline bool has_tabs_or_newline(
+    std::string_view user_input) noexcept {
+  // first check for short strings in which case we do it naively.
+  if (user_input.size() < 16) {  // slow path
+    return std::ranges::any_of(user_input, is_tabs_or_newline);
+  }
+  // fast path for long strings (expected to be common)
+  // Using SSSE3's _mm_shuffle_epi8 for table lookup (same approach as NEON)
+  size_t i = 0;
+  // Lookup table where positions 9, 10, 13 contain their own values.
+  // Position 0 is 1 (so NUL cannot match itself); all other entries are 0.
+  const __m128i rnt =
+      _mm_setr_epi8(1, 0, 0, 0, 0, 0, 0, 0, 0, 9, 10, 0, 0, 13, 0, 0);
+  __m128i running = _mm_setzero_si128();
+  for (; i + 15 < user_input.size(); i += 16) {
+    __m128i word = _mm_loadu_si128((const __m128i*)(user_input.data() + i));
+    // Shuffle the lookup table using input bytes as indices
+    __m128i shuffled = _mm_shuffle_epi8(rnt, word);
+    // Compare: if shuffled value matches input, we found \t, \n, or \r
+    __m128i matches = _mm_cmpeq_epi8(shuffled, word);
+    running = _mm_or_si128(running, matches);
+  }
+  if (i < user_input.size()) {
+    __m128i word = _mm_loadu_si128(
+        (const __m128i*)(user_input.data() + user_input.length() - 16));
+    __m128i shuffled = _mm_shuffle_epi8(rnt, word);
+    __m128i matches = _mm_cmpeq_epi8(shuffled, word);
+    running = _mm_or_si128(running, matches);
+  }
+  return _mm_movemask_epi8(running) != 0;
+}
+#elif ADA_NEON
 ada_really_inline bool has_tabs_or_newline(
     std::string_view user_input) noexcept {
   // first check for short strings in which case we do it naively.