Permalink
Browse files

A valiant attempt at improving UTF-8 encoding performance.

This didn't actually work - it slowed down aeson encoding by almost 2x!
  • Loading branch information...
1 parent f0158d3 commit d90cb3aab27e81cf786d2318b053024bcb6d1ca8 @bos committed Dec 23, 2011
Showing with 86 additions and 46 deletions.
  1. +15 −45 Data/Text/Encoding.hs
  2. +70 −1 cbits/cbits.c
  3. +1 −0 text.cabal
View
@@ -53,24 +53,22 @@ import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
#else
import Control.Monad.ST (unsafeIOToST, unsafeSTToIO)
#endif
-import Data.Bits ((.&.))
import Data.ByteString as B
import Data.ByteString.Internal as B
import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
import Data.Text.Internal (Text(..), textP)
-import Data.Text.UnsafeChar (ord, unsafeWrite)
-import Data.Text.UnsafeShift (shiftL, shiftR)
+import Data.Text.UnsafeChar (unsafeWrite)
import Data.Word (Word8)
import Foreign.C.Types (CSize)
-import Foreign.ForeignPtr (withForeignPtr)
+import Foreign.ForeignPtr (newForeignPtr, withForeignPtr)
import Foreign.Marshal.Utils (with)
import Foreign.Ptr (Ptr, plusPtr)
import Foreign.Storable (peek, poke)
-import GHC.Base (MutableByteArray#)
+import GHC.Base (ByteArray#, MutableByteArray#)
import System.IO.Unsafe (unsafePerformIO)
import qualified Data.Text.Array as A
import qualified Data.Text.Encoding.Fusion as E
-import qualified Data.Text.Encoding.Utf16 as U16
+
import qualified Data.Text.Fusion as F
-- $strict
@@ -145,45 +143,14 @@ decodeUtf8' = unsafePerformIO . try . evaluate . decodeUtf8With strictDecode
-- | Encode text using UTF-8 encoding.
encodeUtf8 :: Text -> ByteString
-encodeUtf8 (Text arr off len) = unsafePerformIO $ do
- let size0 = max len 4
- mallocByteString size0 >>= start size0 off 0
- where
- start size n0 m0 fp = withForeignPtr fp $ loop n0 m0
- where
- loop n1 m1 ptr = go n1 m1
- where
- go !n !m
- | n == off+len = return (PS fp 0 m)
- | otherwise = do
- let poke8 k v = poke (ptr `plusPtr` k) (fromIntegral v :: Word8)
- ensure k act
- | size-m >= k = act
- | otherwise = {-# SCC "resizeUtf8/ensure" #-} do
- let newSize = size `shiftL` 1
- fp' <- mallocByteString newSize
- withForeignPtr fp' $ \ptr' ->
- memcpy ptr' ptr (fromIntegral m)
- start newSize n m fp'
- {-# INLINE ensure #-}
- case A.unsafeIndex arr n of
- w| w <= 0x7F -> poke8 m w >> go (n+1) (m+1)
- | w <= 0x7FF -> ensure 2 $ do
- poke8 m $ (w `shiftR` 6) + 0xC0
- poke8 (m+1) $ (w .&. 0x3f) + 0x80
- go (n+1) (m+2)
- | 0xD800 <= w && w <= 0xDBFF -> ensure 4 $ do
- let c = ord $ U16.chr2 w (A.unsafeIndex arr (n+1))
- poke8 m $ (c `shiftR` 18) + 0xF0
- poke8 (m+1) $ ((c `shiftR` 12) .&. 0x3F) + 0x80
- poke8 (m+2) $ ((c `shiftR` 6) .&. 0x3F) + 0x80
- poke8 (m+3) $ (c .&. 0x3F) + 0x80
- go (n+2) (m+4)
- | otherwise -> ensure 3 $ do
- poke8 m $ (w `shiftR` 12) + 0xE0
- poke8 (m+1) $ ((w `shiftR` 6) .&. 0x3F) + 0x80
- poke8 (m+2) $ (w .&. 0x3F) + 0x80
- go (n+1) (m+3)
+encodeUtf8 (Text arr off len)
+ | len == 0 = empty
+ | otherwise = unsafePerformIO $ do
+ with (fromIntegral len ::CSize) $ \lenPtr -> do
+ dptr <- c_encode_utf8 lenPtr (A.aBA arr) (fromIntegral off)
+ fp <- newForeignPtr c_free_finalizer dptr
+ dlen <- peek lenPtr
+ return (PS fp 0 (fromIntegral dlen))
-- | Decode text from little endian UTF-16 encoding.
decodeUtf16LEWith :: OnDecodeError -> ByteString -> Text
@@ -264,3 +231,6 @@ encodeUtf32BE txt = E.unstream (E.restreamUtf32BE (F.stream txt))
foreign import ccall unsafe "_hs_text_decode_utf8" c_decode_utf8
:: MutableByteArray# s -> Ptr CSize
-> Ptr Word8 -> Ptr Word8 -> IO (Ptr Word8)
+
+foreign import ccall unsafe "_hs_text_encode_utf8" c_encode_utf8
+ :: Ptr CSize -> ByteArray# -> CSize -> IO (Ptr Word8)
View
@@ -6,9 +6,10 @@
* See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
*/
-#include <string.h>
#include <stdint.h>
#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
void _hs_text_memcpy(void *dest, size_t doff, const void *src, size_t soff,
size_t n)
@@ -131,3 +132,71 @@ _hs_text_decode_utf8(uint16_t *dest, size_t *destoff,
return s;
}
+
+uint8_t *_hs_text_encode_utf8(size_t *plen,
+ const uint16_t const *src, size_t srcoff)
+{
+ size_t srclen = *plen;
+ size_t destlen = srclen > 4 ? srclen : 4;
+ uint8_t *dest = malloc(destlen);
+ uint8_t *d = dest;
+ const uint8_t const *dend = dest + destlen;
+ const uint16_t const *s = src + srcoff;
+ const uint16_t const *srcend = s + srclen;
+
+ do {
+ const uint16_t w = *s++;
+ if (w < 0x80) {
+ if (dend - d < 1) {
+ destlen *= 2;
+ uint8_t *newdest = realloc(dest, destlen);
+ d = newdest + (d - dest);
+ dest = newdest;
+ dend = newdest + destlen;
+ }
+ *d++ = w;
+ while (s < srcend && d < dend && *s < 0x80) {
+ *d++ = *s++;
+ }
+ } else if (w < 0x800) {
+ if (dend - d < 2) {
+ destlen *= 2;
+ uint8_t *newdest = realloc(dest, destlen);
+ d = newdest + (d - dest);
+ dest = newdest;
+ dend = newdest + destlen;
+ }
+ *d++ = 0xc0 | (w >> 6);
+ *d++ = 0x80 | (w & 0x3f);
+ } else if (w >= 0xd800 && w < 0xdc00) {
+ if (dend - d < 4) {
+ destlen *= 2;
+ uint8_t *newdest = realloc(dest, destlen);
+ d = newdest + (d - dest);
+ dest = newdest;
+ dend = newdest + destlen;
+ }
+ const uint32_t c = (((((uint32_t) w) - 0xd800) << 10) |
+ (((uint32_t) *s++) - 0xdc00)) + 0x10000;
+ *d++ = 0xf0 | (c >> 18);
+ *d++ = 0x80 | ((c >> 12) & 0x3f);
+ *d++ = 0x80 | ((c >> 6) & 0x3f);
+ *d++ = 0x80 | (c & 0x3f);
+ } else {
+ if (dend - d < 3) {
+ destlen *= 2;
+ uint8_t *newdest = realloc(dest, destlen);
+ d = newdest + (d - dest);
+ dest = newdest;
+ dend = newdest + destlen;
+ }
+ *d++ = 0xe0 | (w >> 12);
+ *d++ = 0x80 | ((w >> 6) & 0x3f);
+ *d++ = 0x80 | (w & 0x3f);
+ }
+ } while (s < srcend);
+
+ *plen = d - dest;
+
+ return dest;
+}
View
@@ -65,6 +65,7 @@ flag developer
library
c-sources: cbits/cbits.c
+ cc-options: -Wall
exposed-modules:
Data.Text

0 comments on commit d90cb3a

Please sign in to comment.