Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions include/ada/common_defs.h
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,10 @@ namespace ada {
} while (0)
#endif

// ADA_SSSE3 is defined to 1 when the compiler advertises SSSE3 support through
// the predefined __SSSE3__ macro. Code elsewhere tests it with `#if ADA_SSSE3`;
// when it is left undefined, the preprocessor evaluates that test as 0.
#if defined(__SSSE3__)
#define ADA_SSSE3 1
#endif

#if defined(__SSE2__) || defined(__x86_64__) || defined(__x86_64) || \
(defined(_M_AMD64) || defined(_M_X64) || \
(defined(_M_IX86_FP) && _M_IX86_FP == 2))
Expand Down
128 changes: 126 additions & 2 deletions src/helpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@
#include "ada/common_defs.h"
#include "ada/scheme.h"

#if ADA_SSSE3
#include <tmmintrin.h>
#endif

namespace ada::helpers {

template <typename out_iter>
Expand Down Expand Up @@ -178,7 +182,64 @@ ada_really_inline int trailing_zeroes(uint32_t input_num) noexcept {
// starting at index location, this finds the next location of a character
// :, /, \\, ? or [. If none is found, view.size() is returned.
// For use within get_host_delimiter_location.
#if ADA_NEON
#if ADA_SSSE3
// SSSE3 implementation.
// Returns the index of the first ':', '/', '\\', '?' or '[' found in `view` at
// or after `location`. Returns view.size() when there is no such character,
// which includes the case where `location` is at or beyond the end of `view`.
ada_really_inline size_t find_next_host_delimiter_special(
    std::string_view view, size_t location) noexcept {
  const size_t size = view.size();
  // Slow path: fewer than 16 bytes remain. `location >= size` is tested first
  // because `size - location` is unsigned and would wrap to a huge value for
  // an out-of-range `location`, sending us into the vector path with
  // out-of-bounds loads.
  if (location >= size || size - location < 16) {
    for (size_t i = location; i < size; i++) {
      const char c = view[i];
      if (c == ':' || c == '/' || c == '\\' || c == '?' || c == '[') {
        return i;
      }
    }
    return size;
  }
  // Fast path for long strings (expected to be common).
  // Each byte is classified with two 16-entry table lookups, one indexed by
  // its low nibble and one by its high nibble. A byte is a delimiter exactly
  // when the two looked-up values share a bit:
  //   ':'  (0x3A): low[0xA]=0x01, high[0x3]=0x01
  //   '/'  (0x2F): low[0xF]=0x03, high[0x2]=0x02
  //   '?'  (0x3F): low[0xF]=0x03, high[0x3]=0x01
  //   '['  (0x5B): low[0xB]=0x04, high[0x5]=0x04
  //   '\\' (0x5C): low[0xC]=0x04, high[0x5]=0x04
  const __m128i low_mask =
      _mm_setr_epi8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                    0x01, 0x04, 0x04, 0x00, 0x00, 0x03);
  const __m128i high_mask =
      _mm_setr_epi8(0x00, 0x00, 0x02, 0x01, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00,
                    0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
  const __m128i fmask = _mm_set1_epi8(0xf);
  const __m128i zero = _mm_setzero_si128();

  // Loads the 16 bytes starting at `offset` and returns a 16-bit mask in which
  // bit k is set when byte k is one of the delimiters.
  const auto delimiter_bits = [&](size_t offset) noexcept -> uint32_t {
    const __m128i word = _mm_loadu_si128(
        reinterpret_cast<const __m128i*>(view.data() + offset));
    const __m128i lowpart =
        _mm_shuffle_epi8(low_mask, _mm_and_si128(word, fmask));
    // The 16-bit shift drags bits across byte lanes; masking with 0x0f leaves
    // only each byte's own high nibble.
    const __m128i highpart = _mm_shuffle_epi8(
        high_mask, _mm_and_si128(_mm_srli_epi16(word, 4), fmask));
    const __m128i classify = _mm_and_si128(lowpart, highpart);
    const __m128i is_zero = _mm_cmpeq_epi8(classify, zero);
    // _mm_movemask_epi8 only populates bits 0-15; after the bitwise NOT the
    // upper bits are all ones, so they must be masked off to avoid false
    // positives.
    return ~static_cast<uint32_t>(_mm_movemask_epi8(is_zero)) & 0xFFFFu;
  };

  size_t i = location;
  for (; i + 15 < size; i += 16) {
    const uint32_t mask = delimiter_bits(i);
    if (mask != 0) {
      return i + trailing_zeroes(mask);
    }
  }
  if (i < size) {
    // Fewer than 16 bytes are left: re-read the last 16 bytes of the view. The
    // bytes this block shares with the previous one were already found to be
    // delimiter-free, so the lowest set bit is a genuinely new match.
    const uint32_t mask = delimiter_bits(size - 16);
    if (mask != 0) {
      return size - 16 + trailing_zeroes(mask);
    }
  }
  return size;
}
#elif ADA_NEON
// The ada_make_uint8x16_t macro is necessary because Visual Studio does not
// support direct initialization of uint8x16_t. See
// https://developercommunity.visualstudio.com/t/error-C2078:-too-many-initializers-whe/402911?q=backend+neon
Expand Down Expand Up @@ -417,7 +478,70 @@ ada_really_inline size_t find_next_host_delimiter_special(
// starting at index location, this finds the next location of a character
// :, /, ? or [. If none is found, view.size() is returned.
// For use within get_host_delimiter_location.
#if ADA_NEON
#if ADA_SSSE3
// SSSE3 implementation.
// Returns the index of the first ':', '/', '?' or '[' found in `view` at or
// after `location`. Returns view.size() when there is no such character, which
// includes the case where `location` is at or beyond the end of `view`.
ada_really_inline size_t find_next_host_delimiter(std::string_view view,
                                                  size_t location) noexcept {
  const size_t size = view.size();
  // Slow path: fewer than 16 bytes remain. `location >= size` is tested first
  // because `size - location` is unsigned and would wrap to a huge value for
  // an out-of-range `location`, sending us into the vector path with
  // out-of-bounds loads.
  if (location >= size || size - location < 16) {
    for (size_t i = location; i < size; i++) {
      const char c = view[i];
      if (c == ':' || c == '/' || c == '?' || c == '[') {
        return i;
      }
    }
    return size;
  }
  // Fast path for long strings (expected to be common).
  // Each byte is classified with two 16-entry table lookups, one indexed by
  // its low nibble and one by its high nibble. A byte is a delimiter exactly
  // when the two looked-up values share a bit:
  //   ':' (0x3A): low[0xA]=0x01, high[0x3]=0x01 -> match
  //   '/' (0x2F): low[0xF]=0x03, high[0x2]=0x02 -> match
  //   '?' (0x3F): low[0xF]=0x03, high[0x3]=0x01 -> match
  //   '[' (0x5B): low[0xB]=0x04, high[0x5]=0x04 -> match
  // low[0xC] is 0x00 here, so '\\' (0x5C) is not reported.
  const __m128i low_mask =
      _mm_setr_epi8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                    0x01, 0x04, 0x00, 0x00, 0x00, 0x03);
  const __m128i high_mask =
      _mm_setr_epi8(0x00, 0x00, 0x02, 0x01, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00,
                    0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
  const __m128i fmask = _mm_set1_epi8(0xf);
  const __m128i zero = _mm_setzero_si128();

  // Loads the 16 bytes starting at `offset` and returns a 16-bit mask in which
  // bit k is set when byte k is one of the delimiters.
  const auto delimiter_bits = [&](size_t offset) noexcept -> uint32_t {
    const __m128i word = _mm_loadu_si128(
        reinterpret_cast<const __m128i*>(view.data() + offset));
    const __m128i lowpart =
        _mm_shuffle_epi8(low_mask, _mm_and_si128(word, fmask));
    // The 16-bit shift drags bits across byte lanes; masking with 0x0f leaves
    // only each byte's own high nibble.
    const __m128i highpart = _mm_shuffle_epi8(
        high_mask, _mm_and_si128(_mm_srli_epi16(word, 4), fmask));
    const __m128i classify = _mm_and_si128(lowpart, highpart);
    const __m128i is_zero = _mm_cmpeq_epi8(classify, zero);
    // _mm_movemask_epi8 only populates bits 0-15; after the bitwise NOT the
    // upper bits are all ones, so they must be masked off to avoid false
    // positives.
    return ~static_cast<uint32_t>(_mm_movemask_epi8(is_zero)) & 0xFFFFu;
  };

  size_t i = location;
  for (; i + 15 < size; i += 16) {
    const uint32_t mask = delimiter_bits(i);
    if (mask != 0) {
      return i + trailing_zeroes(mask);
    }
  }
  if (i < size) {
    // Fewer than 16 bytes are left: re-read the last 16 bytes of the view. The
    // bytes this block shares with the previous one were already found to be
    // delimiter-free, so the lowest set bit is a genuinely new match.
    const uint32_t mask = delimiter_bits(size - 16);
    if (mask != 0) {
      return size - 16 + trailing_zeroes(mask);
    }
  }
  return size;
}
#elif ADA_NEON
ada_really_inline size_t find_next_host_delimiter(std::string_view view,
size_t location) noexcept {
// first check for short strings in which case we do it naively.
Expand Down
38 changes: 36 additions & 2 deletions src/unicode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@ ADA_PUSH_DISABLE_ALL_WARNINGS
ADA_POP_DISABLE_WARNINGS

#include <algorithm>
#if ADA_NEON
#if ADA_SSSE3
#include <tmmintrin.h>
#elif ADA_NEON
#include <arm_neon.h>
#elif ADA_SSE2
#include <emmintrin.h>
Expand Down Expand Up @@ -57,7 +59,39 @@ constexpr bool to_lower_ascii(char* input, size_t length) noexcept {
}
return non_ascii == 0;
}
#if ADA_NEON
#if ADA_SSSE3
// SSSE3 implementation.
// Reports whether `user_input` contains a tab (0x09), line feed (0x0A) or
// carriage return (0x0D).
ada_really_inline bool has_tabs_or_newline(
    std::string_view user_input) noexcept {
  const size_t length = user_input.size();
  // Inputs shorter than one 16-byte vector are scanned with the scalar
  // predicate.
  if (length < 16) {
    return std::ranges::any_of(user_input, is_tabs_or_newline);
  }
  // Vector path for long strings (expected to be common).
  // Lookup table for _mm_shuffle_epi8, which selects an entry by the low four
  // bits of each input byte and yields 0 when the byte's top bit is set.
  // Entries 9, 10 and 13 hold their own index, so '\t', '\n' and '\r' look up
  // to themselves. Entry 0 is 1 so that a 0x00 byte does not look up to
  // itself. Every other entry is 0, which can only equal a 0x00 input byte,
  // and that byte selects entry 0.
  const __m128i table =
      _mm_setr_epi8(1, 0, 0, 0, 0, 0, 0, 0, 0, 9, 10, 0, 0, 13, 0, 0);
  const char* const base = user_input.data();

  // Produces 0xFF in every lane of the 16-byte block at `offset` whose byte
  // equals its own table lookup, i.e. is '\t', '\n' or '\r'.
  const auto match_block = [&](size_t offset) noexcept {
    const __m128i block =
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(base + offset));
    return _mm_cmpeq_epi8(_mm_shuffle_epi8(table, block), block);
  };

  // Matches from all blocks are OR-ed together and inspected once at the end.
  __m128i found = _mm_setzero_si128();
  size_t offset = 0;
  while (offset + 16 <= length) {
    found = _mm_or_si128(found, match_block(offset));
    offset += 16;
  }
  if (offset < length) {
    // Leftover bytes: re-read the final 16 bytes of the input. Overlapping
    // bytes that were already examined is harmless because results are only
    // accumulated.
    found = _mm_or_si128(found, match_block(length - 16));
  }
  return _mm_movemask_epi8(found) != 0;
}
#elif ADA_NEON
ada_really_inline bool has_tabs_or_newline(
std::string_view user_input) noexcept {
// first check for short strings in which case we do it naively.
Expand Down
Loading