Skip to content

Commit

Permalink
One simple SIMD optimization (#402)
Browse files Browse the repository at this point in the history
* Adding some SIMD.

* SSE2 version.

* Reformat.

---------

Co-authored-by: Daniel Lemire <dlemire@lemire.me>
  • Loading branch information
lemire and Daniel Lemire committed May 14, 2023
1 parent 349d926 commit 90ed1dd
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 3 deletions.
11 changes: 11 additions & 0 deletions include/ada/common_defs.h
Expand Up @@ -285,4 +285,15 @@ namespace ada {
if (!(COND)) __builtin_unreachable(); \
} while (0)
#endif

// ADA_SSE2 is defined to 1 when the build target is known to provide SSE2:
// either the compiler advertises __SSE2__, the target is x86-64 (as reported
// by the GCC/Clang or MSVC architecture macros), or MSVC reports
// _M_IX86_FP == 2 for a 32-bit x86 build.
#if defined(__SSE2__) || defined(__x86_64__) || defined(__x86_64) || \
(defined(_M_AMD64) || defined(_M_X64) || \
(defined(_M_IX86_FP) && _M_IX86_FP == 2))
#define ADA_SSE2 1
#endif

// ADA_NEON is defined to 1 on 64-bit ARM targets (__aarch64__ or _M_ARM64).
#if defined(__aarch64__) || defined(_M_ARM64)
#define ADA_NEON 1
#endif

#endif // ADA_COMMON_DEFS_H
2 changes: 1 addition & 1 deletion include/ada/unicode.h
Expand Up @@ -71,7 +71,7 @@ std::string to_unicode(std::string_view input);
* @attention The has_tabs_or_newline function is a bottleneck and it is simple
* enough that compilers like GCC can 'autovectorize' it.
*/
ada_really_inline constexpr bool has_tabs_or_newline(
ada_really_inline bool has_tabs_or_newline(
std::string_view user_input) noexcept;

/**
Expand Down
60 changes: 58 additions & 2 deletions src/unicode.cpp
Expand Up @@ -8,6 +8,11 @@ ADA_PUSH_DISABLE_ALL_WARNINGS
ADA_POP_DISABLE_WARNINGS

#include <algorithm>
#if ADA_NEON
#include <arm_neon.h>
#elif ADA_SSE2
#include <emmintrin.h>
#endif

namespace ada::unicode {

Expand Down Expand Up @@ -39,8 +44,58 @@ constexpr bool to_lower_ascii(char* input, size_t length) noexcept {
}
return non_ascii == 0;
}

ada_really_inline constexpr bool has_tabs_or_newline(
#if ADA_NEON
/**
 * Checks whether the input contains an ASCII tab ('\t'), line feed ('\n') or
 * carriage return ('\r'). ARM NEON implementation processing 16 bytes per
 * step.
 *
 * @param user_input the bytes to scan; may be empty.
 * @return true if at least one of the three characters is present.
 */
ada_really_inline bool has_tabs_or_newline(
    std::string_view user_input) noexcept {
  size_t i = 0;
  const uint8x16_t mask1 = vmovq_n_u8('\r');
  const uint8x16_t mask2 = vmovq_n_u8('\n');
  const uint8x16_t mask3 = vmovq_n_u8('\t');
  // Accumulates the comparison results of every block; a non-zero lane means
  // that some byte matched one of the three characters.
  uint8x16_t running = vmovq_n_u8(0);
  for (; i + 15 < user_input.size(); i += 16) {
    const uint8x16_t word =
        vld1q_u8(reinterpret_cast<const uint8_t*>(user_input.data()) + i);
    running = vorrq_u8(vorrq_u8(running, vorrq_u8(vceqq_u8(word, mask1),
                                                  vceqq_u8(word, mask2))),
                       vceqq_u8(word, mask3));
  }
  if (i < user_input.size()) {
    // Fewer than 16 bytes remain. Stage them in a zero-filled buffer and load
    // from that buffer so the 16-byte load never reads past the end of the
    // input; the zero padding cannot match '\r', '\n' or '\t'.
    uint8_t buffer[16]{};
    memcpy(buffer, user_input.data() + i, user_input.size() - i);
    const uint8x16_t word = vld1q_u8(buffer);
    running = vorrq_u8(vorrq_u8(running, vorrq_u8(vceqq_u8(word, mask1),
                                                  vceqq_u8(word, mask2))),
                       vceqq_u8(word, mask3));
  }
  return vmaxvq_u8(running) != 0;
}
#elif ADA_SSE2
/**
 * Checks whether the input contains an ASCII tab ('\t'), line feed ('\n') or
 * carriage return ('\r'). SSE2 implementation processing 16 bytes per step.
 *
 * @param user_input the bytes to scan; may be empty.
 * @return true if at least one of the three characters is present.
 */
ada_really_inline bool has_tabs_or_newline(
    std::string_view user_input) noexcept {
  size_t i = 0;
  const __m128i mask1 = _mm_set1_epi8('\r');
  const __m128i mask2 = _mm_set1_epi8('\n');
  const __m128i mask3 = _mm_set1_epi8('\t');
  // Accumulates the comparison results of every block; any set bit means
  // that some byte matched one of the three characters.
  __m128i running = _mm_setzero_si128();
  for (; i + 15 < user_input.size(); i += 16) {
    const __m128i word = _mm_loadu_si128(
        reinterpret_cast<const __m128i*>(user_input.data() + i));
    running = _mm_or_si128(
        _mm_or_si128(running, _mm_or_si128(_mm_cmpeq_epi8(word, mask1),
                                           _mm_cmpeq_epi8(word, mask2))),
        _mm_cmpeq_epi8(word, mask3));
  }
  if (i < user_input.size()) {
    // Fewer than 16 bytes remain. Stage them in a zero-filled buffer so the
    // 16-byte load stays in bounds; the buffer must be zero-initialized,
    // otherwise leftover stack bytes could compare equal to '\r', '\n' or
    // '\t' and produce a false positive.
    uint8_t buffer[16]{};
    memcpy(buffer, user_input.data() + i, user_input.size() - i);
    const __m128i word =
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer));
    running = _mm_or_si128(
        _mm_or_si128(running, _mm_or_si128(_mm_cmpeq_epi8(word, mask1),
                                           _mm_cmpeq_epi8(word, mask2))),
        _mm_cmpeq_epi8(word, mask3));
  }
  return _mm_movemask_epi8(running) != 0;
}
#else
ada_really_inline bool has_tabs_or_newline(
std::string_view user_input) noexcept {
auto has_zero_byte = [](uint64_t v) {
return ((v - 0x0101010101010101) & ~(v)&0x8080808080808080);
Expand Down Expand Up @@ -71,6 +126,7 @@ ada_really_inline constexpr bool has_tabs_or_newline(
}
return running;
}
#endif

// A forbidden host code point is U+0000 NULL, U+0009 TAB, U+000A LF, U+000D CR,
// U+0020 SPACE, U+0023 (#), U+002F (/), U+003A (:), U+003C (<), U+003E (>),
Expand Down

0 comments on commit 90ed1dd

Please sign in to comment.