Permalink
Browse files

Merge

  • Loading branch information...
2 parents 49694d7 + 0a3f580 commit c8840d45857b0152e4dc43a4049d1d6b511d0318 @bos committed Oct 3, 2012
Showing with 109 additions and 0 deletions.
  1. +14 −0 benchmarks/Benchmarks.hs
  2. +93 −0 benchmarks/cbits/siphash-sse41.c
  3. +2 −0 benchmarks/hashable-benchmarks.cabal
View
@@ -55,6 +55,9 @@ main = do
cSipHash (PS fp off len) =
inlinePerformIO . withForeignPtr fp $ \ptr ->
return $! c_siphash 2 4 k0 k1 (ptr `plusPtr` off) (fromIntegral len)
+ -- Hash the len bytes that start off bytes into the buffer fp by calling the
+ -- foreign routine sse41_siphash with the keys k0 and k1 (bound outside this
+ -- hunk). The result is forced with $! inside withForeignPtr, i.e. while the
+ -- foreign pointer is still being kept alive.
+ sse41SipHash (PS fp off len) =
+ inlinePerformIO . withForeignPtr fp $ \ptr ->
+ return $! sse41_siphash k0 k1 (ptr `plusPtr` off) (fromIntegral len)
withForeignPtr fp5 $ \ p5 ->
withForeignPtr fp8 $ \ p8 ->
@@ -111,6 +114,15 @@ main = do
, bench "512" $ whnf cSipHash bs512
, bench "2^20" $ whnf cSipHash bs1Mb
]
+ , bgroup "sse41SipHash"
+ [ bench "5" $ whnf sse41SipHash bs5
+ , bench "8" $ whnf sse41SipHash bs8
+ , bench "11" $ whnf sse41SipHash bs11
+ , bench "40" $ whnf sse41SipHash bs40
+ , bench "128" $ whnf sse41SipHash bs128
+ , bench "512" $ whnf sse41SipHash bs512
+ , bench "2^20" $ whnf sse41SipHash bs1Mb
+ ]
, bgroup "pkgSipHash"
[ bench "5" $ whnf hsSipHash bs5
, bench "8" $ whnf hsSipHash bs8
@@ -132,3 +144,5 @@ new (I# n#) = unBA (runST $ ST $ \s1 ->
-- Binding to the C symbol "siphash", imported as a pure, unsafe call.
-- The call site in this file passes 2 and 4 for the two CInt arguments,
-- followed by the two key words, a pointer to the input and its length.
-- NOTE(review): the C definition is not visible here; presumably the two
-- CInts are the compression/finalization round counts -- confirm.
foreign import ccall unsafe "siphash" c_siphash
:: CInt -> CInt -> Word64 -> Word64 -> Ptr Word8 -> CSize -> Word64
+-- Binding to the C function siphash_sse41 (benchmarks/cbits/siphash-sse41.c),
+-- imported as a pure, unsafe call. Arguments, in order: the two 64-bit key
+-- words, a pointer to the input bytes, and the number of bytes to hash.
+foreign import ccall unsafe "siphash_sse41" sse41_siphash
+ :: Word64 -> Word64 -> Ptr Word8 -> CSize -> Word64
@@ -0,0 +1,93 @@
+#include <smmintrin.h>
+
+#include <stdint.h>
+
+typedef uint64_t u64;
+typedef uint32_t u32;
+typedef uint8_t u8;
+
+
+#define SIPHASH_ROUNDS 2
+#define SIPHASH_FINALROUNDS 4
+
+/*
+typedef uint8_t u8;
+typedef uint32_t u32;
+typedef uint64_t u64;
+*/
+
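+// rotate16 rotates only the upper 64-bit lane left by 16 bits; the lower
+// 64-bit lane is passed through unchanged. That is sufficient for HALF_ROUND,
+// which only takes the upper lane from a 16-bit rotation.
+// Specialized for siphash, do not reuse.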
+#define rotate16(x) _mm_shufflehi_epi16((x), _MM_SHUFFLE(2,1,0,3))
+
+#define _mm_roti_epi64(x, c) (((c) == 16) ? rotate16((x)) : _mm_xor_si128(_mm_slli_epi64((x), (c)), _mm_srli_epi64((x), 64-(c))))
+//#define _mm_roti_epi64(x, c) _mm_xor_si128(_mm_slli_epi64((x), (c)), _mm_srli_epi64((x), 64-(c)))
+
+
+/*
+ * Compute a 64-bit SipHash of the n bytes at m under the key (_k0, _k1),
+ * using SIPHASH_ROUNDS compression rounds per 8-byte word and
+ * SIPHASH_FINALROUNDS finalization rounds.
+ *
+ * The four 64-bit state words are packed into two SSE registers:
+ *   v02 = (v0 in the low lane, v2 in the high lane)
+ *   v13 = (v1 in the low lane, v3 in the high lane)
+ * so every HALF_ROUND performs two add/rotate/xor steps at once.
+ *
+ * Parameters:
+ *   _k0, _k1  the two 64-bit halves of the key
+ *   m         input bytes; only m[0 .. n-1] are read
+ *   n         number of input bytes
+ *
+ * Returns the 64-bit hash value.
+ */
+u64 siphash_sse41(u64 _k0, u64 _k1, const unsigned char *m, size_t n)
+{
+  __m128i v0, v1, v02, v13;
+  __m128i k0;
+  __m128i mi;
+  const __m128i zero = _mm_setzero_si128();
+  const size_t tail_len = n % 8;
+  const size_t body_len = n - tail_len;
+  size_t i, k;
+  union { u64 gpr; __m128i xmm; } hash;
+  /* Zero-initialised staging buffer for the final, partial 8-byte word. */
+  unsigned char tail[8] = { 0 };
+
+  /* Low lane = _k0, high lane = _k1. Built directly from the integers
+     rather than by writing through a u64* cast of a byte array, which
+     would violate strict aliasing. */
+  k0 = _mm_set_epi64x((long long)_k1, (long long)_k0);
+
+  /* v0 = (k0 ^ "somepseu", k1 ^ "dorandom"),
+     v1 = (k0 ^ "lygenera", k1 ^ "tedbytes"). */
+  v0 = _mm_xor_si128(k0, _mm_set_epi32(0x646f7261, 0x6e646f6d, 0x736f6d65, 0x70736575));
+  v1 = _mm_xor_si128(k0, _mm_set_epi32(0x74656462, 0x79746573, 0x6c796765, 0x6e657261));
+
+  /* Regroup into the (v0, v2) / (v1, v3) layout described above. */
+  v02 = _mm_unpacklo_epi64(v0, v1);
+  v13 = _mm_unpackhi_epi64(v0, v1);
+
+/* a += b; rotate the low lane of b by s and the high lane by t; b ^= a.
+   The blend mask 0xF0 takes the upper four 16-bit words from b2. */
+#define HALF_ROUND(a,b,s,t) \
+do \
+{ \
+  __m128i b1,b2; \
+  a = _mm_add_epi64(a, b); \
+  b1 = _mm_roti_epi64(b, s); b2 = _mm_roti_epi64(b, t); b = _mm_blend_epi16(b1, b2, 0xF0); \
+  b = _mm_xor_si128(b, a); \
+} while(0)
+
+/* One full round: two half rounds, each followed by a shuffle of v02. */
+#define COMPRESS(v02,v13) \
+  do \
+  { \
+    HALF_ROUND(v02,v13,13,16); \
+    v02 = _mm_shuffle_epi32(v02, _MM_SHUFFLE(0,1,3,2)); \
+    HALF_ROUND(v02,v13,17,21); \
+    v02 = _mm_shuffle_epi32(v02, _MM_SHUFFLE(0,1,3,2)); \
+  } while(0)
+
+  /* Absorb every complete 8-byte word: v3 ^= word, rounds, v0 ^= word. */
+  for(i = 0; i < body_len; i += 8)
+  {
+    mi = _mm_loadl_epi64((const __m128i*)(m + i));
+    v13 = _mm_xor_si128(v13, _mm_unpacklo_epi64(zero, mi));
+    for(k = 0; k < SIPHASH_ROUNDS; ++k) COMPRESS(v02,v13);
+    v02 = _mm_xor_si128(v02, mi);
+  }
+
+  /* Final word: the remaining n % 8 bytes, zero padding, and (n & 0xff) in
+     the most significant byte. The bytes are copied into a local buffer so
+     that nothing beyond m[n-1] is ever read (an 8-byte load from m + i
+     would run past the end of the input, and would touch m even for n == 0). */
+  for(k = 0; k < tail_len; ++k) tail[k] = m[i + k];
+  tail[7] = (unsigned char)(n & 0xff);
+  mi = _mm_loadl_epi64((const __m128i*)tail);
+
+  v13 = _mm_xor_si128(v13, _mm_unpacklo_epi64(zero, mi));
+  for(k = 0; k < SIPHASH_ROUNDS; ++k) COMPRESS(v02,v13);
+  v02 = _mm_xor_si128(v02, mi);
+
+  /* Finalization: v2 ^= 0xff (v2 is the high lane of v02), then the
+     finalization rounds. */
+  v02 = _mm_xor_si128(v02, _mm_set_epi32(0, 0xff, 0, 0));
+  for(k = 0; k < SIPHASH_FINALROUNDS; ++k) COMPRESS(v02,v13);
+
+  /* Fold the state: low lane becomes v0 ^ v1 ^ v2 ^ v3. */
+  v0 = _mm_xor_si128(v02, v13);
+  v0 = _mm_xor_si128(v0, _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(zero), _mm_castsi128_ps(v0))));
+  hash.xmm = v0;
+
+#undef COMPRESS
+#undef HALF_ROUND
+  /* The union exposes the low 64 bits of the register as the result. */
+  return hash.gpr;
+}
@@ -6,9 +6,11 @@ cabal-version: >=1.2
executable hashable-benchmarks
ghc-options: -Wall -O2
+ cc-options: -msse4.1
c-sources:
../cbits/hashByteString.c
cbits/siphash.c
+ cbits/siphash-sse41.c
hs-source-dirs: .. .
main-is: Benchmarks.hs
other-modules:

0 comments on commit c8840d4

Please sign in to comment.