Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP

Loading…

speedup encode and decode #1

Merged
merged 3 commits into from

2 participants

@vincenthz

Hi, the following patchset replaces some computations with lookup tables and improves performance a bit more. It uses GHC primitives directly, but the portability line already says GHC, so hopefully this is not a problem. If it is a problem, it might be possible to use bytestring with OverloadedStrings and unsafe indexing to get almost the same performance increase (although that approach was consistently slower on some other benchmarks of mine).

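The benchmarks below show the mean, lower bound (lb) and upper bound (ub) on my machine; for each benchmark, the first line is before the optimization and the second line is after it.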

benchmarking encode/8
mean: 100.5066 ns, lb 100.4542 ns, ub 100.6346 ns, ci 0.950
mean: 89.37258 ns, lb 89.34455 ns, ub 89.42334 ns, ci 0.950

benchmarking encode/32
mean: 292.5857 ns, lb 292.5121 ns, ub 292.7101 ns, ci 0.950
mean: 254.6374 ns, lb 249.8842 ns, ub 261.5605 ns, ci 0.950

benchmarking encode/128
mean: 1.038159 us, lb 1.037985 us, ub 1.038537 us, ci 0.950
mean: 850.7781 ns, lb 850.6118 ns, ub 851.1164 ns, ci 0.950

benchmarking encode/1024
mean: 7.768683 us, lb 7.766970 us, ub 7.772629 us, ci 0.950
mean: 6.838571 us, lb 6.693303 us, ub 7.040833 us, ci 0.950

benchmarking encode/65536
mean: 511.2344 us, lb 510.7413 us, ub 511.9317 us, ci 0.950
mean: 426.9267 us, lb 426.2879 us, ub 427.5551 us, ci 0.950

benchmarking decode/8
mean: 491.4554 ns, lb 491.3104 ns, ub 491.7289 ns, ci 0.950
mean: 420.7300 ns, lb 419.1382 ns, ub 427.0422 ns, ci 0.950

benchmarking decode/32
mean: 1.146035 us, lb 1.145754 us, ub 1.146530 us, ci 0.950
mean: 815.8817 ns, lb 815.1795 ns, ub 817.5643 ns, ci 0.950

benchmarking decode/128
mean: 3.738534 us, lb 3.737629 us, ub 3.740522 us, ci 0.950
mean: 2.379186 us, lb 2.376243 us, ub 2.382636 us, ci 0.950

benchmarking decode/1024
mean: 29.88323 us, lb 29.87664 us, ub 29.89754 us, ci 0.950
mean: 16.65282 us, lb 16.65004 us, ub 16.65840 us, ci 0.950

benchmarking decode/65536
mean: 1.915516 ms, lb 1.914163 ms, ub 1.919997 ms, ci 0.950
mean: 1.056637 ms, lb 1.055106 ms, ub 1.057994 ms, ci 0.950

@bos bos merged commit 9af3bd4 into from
@bos
Owner

Nice work, thanks!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
This page is out of date. Refresh to see the latest.
View
81 Data/ByteString/Base16.hs
@@ -1,4 +1,4 @@
-{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE BangPatterns, MagicHash #-}
-- |
-- Module : Data.ByteString.Base16
@@ -17,16 +17,16 @@ module Data.ByteString.Base16
, decode
) where
-import Control.Monad (forM_)
-import Data.Bits ((.&.), shiftL, shiftR)
import Data.ByteString.Char8 (empty)
import Data.ByteString.Internal (ByteString(..), createAndTrim', unsafeCreate)
-import Data.Word (Word8)
+import Data.Bits (shiftL)
import Foreign.ForeignPtr (ForeignPtr, withForeignPtr)
-import Foreign.Marshal.Alloc (mallocBytes)
import Foreign.Ptr (Ptr, minusPtr, plusPtr)
import Foreign.Storable (peek, poke)
import System.IO.Unsafe (unsafePerformIO)
+import GHC.Prim
+import GHC.Types
+import GHC.Word
-- | Encode a string into base16 form. The result will always be a
-- multiple of 2 bytes in length.
@@ -47,16 +47,45 @@ encode (PS sfp soff slen)
go s d | s == e = return ()
| otherwise = do
x <- peek8 s
- poke d =<< (peek (digits `plusPtr` (x `shiftR` 4)) :: IO Word8)
- poke (d `plusPtr` 1) =<< (peek (digits `plusPtr` (x .&. 0xf)) :: IO Word8)
+ poke d (tlookup tableHi x)
+ poke (d `plusPtr` 1) (tlookup tableLo x)
go (s `plusPtr` 1) (d `plusPtr` 2)
- digits :: Ptr Word8
- !digits = unsafePerformIO $ do
- ptr <- mallocBytes 16
- forM_ (zip [0..] "0123456789abcdef") $ \(i,c) ->
- poke (ptr `plusPtr` i) ((fromIntegral (fromEnum c)) :: Word8)
- return ptr
- {-# NOINLINE digits #-}
+ tlookup :: Addr# -> Int -> Word8
+ tlookup table (I# index) = W8# (indexWord8OffAddr# table index)
+ !tableLo =
+ "\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x61\x62\x63\x64\x65\x66\
+ \\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x61\x62\x63\x64\x65\x66\
+ \\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x61\x62\x63\x64\x65\x66\
+ \\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x61\x62\x63\x64\x65\x66\
+ \\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x61\x62\x63\x64\x65\x66\
+ \\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x61\x62\x63\x64\x65\x66\
+ \\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x61\x62\x63\x64\x65\x66\
+ \\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x61\x62\x63\x64\x65\x66\
+ \\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x61\x62\x63\x64\x65\x66\
+ \\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x61\x62\x63\x64\x65\x66\
+ \\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x61\x62\x63\x64\x65\x66\
+ \\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x61\x62\x63\x64\x65\x66\
+ \\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x61\x62\x63\x64\x65\x66\
+ \\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x61\x62\x63\x64\x65\x66\
+ \\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x61\x62\x63\x64\x65\x66\
+ \\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x61\x62\x63\x64\x65\x66"#
+ !tableHi =
+ "\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\
+ \\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\
+ \\x32\x32\x32\x32\x32\x32\x32\x32\x32\x32\x32\x32\x32\x32\x32\x32\
+ \\x33\x33\x33\x33\x33\x33\x33\x33\x33\x33\x33\x33\x33\x33\x33\x33\
+ \\x34\x34\x34\x34\x34\x34\x34\x34\x34\x34\x34\x34\x34\x34\x34\x34\
+ \\x35\x35\x35\x35\x35\x35\x35\x35\x35\x35\x35\x35\x35\x35\x35\x35\
+ \\x36\x36\x36\x36\x36\x36\x36\x36\x36\x36\x36\x36\x36\x36\x36\x36\
+ \\x37\x37\x37\x37\x37\x37\x37\x37\x37\x37\x37\x37\x37\x37\x37\x37\
+ \\x38\x38\x38\x38\x38\x38\x38\x38\x38\x38\x38\x38\x38\x38\x38\x38\
+ \\x39\x39\x39\x39\x39\x39\x39\x39\x39\x39\x39\x39\x39\x39\x39\x39\
+ \\x61\x61\x61\x61\x61\x61\x61\x61\x61\x61\x61\x61\x61\x61\x61\x61\
+ \\x62\x62\x62\x62\x62\x62\x62\x62\x62\x62\x62\x62\x62\x62\x62\x62\
+ \\x63\x63\x63\x63\x63\x63\x63\x63\x63\x63\x63\x63\x63\x63\x63\x63\
+ \\x64\x64\x64\x64\x64\x64\x64\x64\x64\x64\x64\x64\x64\x64\x64\x64\
+ \\x65\x65\x65\x65\x65\x65\x65\x65\x65\x65\x65\x65\x65\x65\x65\x65\
+ \\x66\x66\x66\x66\x66\x66\x66\x66\x66\x66\x66\x66\x66\x66\x66\x66"#
-- | Decode a string from base16 form. The first element of the
-- returned tuple contains the decoded data. The second element starts
@@ -78,11 +107,6 @@ decode (PS sfp soff slen) =
go s d | s == e = let len = e `minusPtr` sptr
in return (0, len `div` 2, ps sfp (soff+len) (slen-len))
| otherwise = do
- let hex w
- | w >= 48 && w <= 57 = w - 48
- | w >= 97 && w <= 102 = w - 97 + 10
- | w >= 65 && w <= 70 = w - 65 + 10
- | otherwise = 0xff
hi <- hex `fmap` peek8 s
lo <- hex `fmap` peek8 (s `plusPtr` 1)
if lo == 0xff || hi == 0xff
@@ -92,6 +116,25 @@ decode (PS sfp soff slen) =
poke d . fromIntegral $ lo + (hi `shiftL` 4)
go (s `plusPtr` 2) (d `plusPtr` 1)
+ hex (I# index) = W8# $ indexWord8OffAddr# table index
+ !table =
+ "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\
+ \\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\
+ \\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\
+ \\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\xff\xff\xff\xff\xff\xff\
+ \\x0a\x0b\x0c\x0d\x0e\x0f\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\
+ \\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\
+ \\xff\x0a\x0b\x0c\x0d\x0e\x0f\xff\xff\xff\xff\xff\xff\xff\xff\xff\
+ \\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\
+ \\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\
+ \\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\
+ \\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\
+ \\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\
+ \\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\
+ \\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\
+ \\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\
+ \\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"#
+
peek8 :: Ptr Word8 -> IO Int
peek8 p = fromIntegral `fmap` peek p
View
3  base16-bytestring.cabal
@@ -23,7 +23,8 @@ library
build-depends:
base == 4.*,
- bytestring == 0.9.*
+ bytestring == 0.9.*,
+ ghc-prim
ghc-options: -Wall -funbox-strict-fields
ghc-prof-options: -auto-all
View
7 benchmarks/Benchmarks.hs
@@ -13,4 +13,11 @@ main = defaultMain [
, bench "1024" $ whnf B16.encode (generate 1024)
, bench "65536" $ whnf B16.encode (generate 65536)
]
+ , bgroup "decode" [
+ bench "8" $ whnf (B16.decode . B16.encode) (generate 8)
+ , bench "32" $ whnf (B16.decode . B16.encode) (generate 32)
+ , bench "128" $ whnf (B16.decode . B16.encode) (generate 128)
+ , bench "1024" $ whnf (B16.decode . B16.encode) (generate 1024)
+ , bench "65536" $ whnf (B16.decode . B16.encode) (generate 65536)
+ ]
]
Something went wrong with that request. Please try again.