Skip to content
Newer
Older
100644 265 lines (240 sloc) 9.67 KB
efa8732 @bos Portable native UTF-8 decoder gives 3.7x faster decoding
authored Jul 10, 2011
1 {-# LANGUAGE BangPatterns, ForeignFunctionInterface, MagicHash,
2 UnliftedFFITypes #-}
a70e3a0 @bos Split encoding support out into new modules
authored Jan 27, 2009
3 -- |
4 -- Module : Data.Text.Encoding
da1bc90 @bos Update copyright
authored Oct 14, 2010
5 -- Copyright : (c) 2008, 2009 Tom Harper,
6 -- (c) 2009, 2010 Bryan O'Sullivan,
7 -- (c) 2009 Duncan Coutts
a70e3a0 @bos Split encoding support out into new modules
authored Jan 27, 2009
8 --
9 -- License : BSD-style
01bb6aa @bos Change Tom's email address
authored Apr 29, 2010
10 -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com,
a70e3a0 @bos Split encoding support out into new modules
authored Jan 27, 2009
11 -- duncan@haskell.org
12 -- Stability : experimental
13 -- Portability : portable
14 --
15 -- Functions for converting 'Text' values to and from 'ByteString',
960d36a @bos Fix Haddocks
authored Feb 27, 2009
16 -- using several standard encodings.
17 --
f97e59a @bos Many small documentation improvements.
authored Nov 29, 2010
18 -- To gain access to a much larger family of encodings, use the
19 -- @text-icu@ package: <http://hackage.haskell.org/package/text-icu>
a70e3a0 @bos Split encoding support out into new modules
authored Jan 27, 2009
20
21 module Data.Text.Encoding
22 (
23 -- * Decoding ByteStrings to Text
f97e59a @bos Many small documentation improvements.
authored Nov 30, 2010
24 -- $strict
a70e3a0 @bos Split encoding support out into new modules
authored Jan 27, 2009
25 decodeASCII
26 , decodeUtf8
27 , decodeUtf16LE
28 , decodeUtf16BE
29 , decodeUtf32LE
30 , decodeUtf32BE
f97e59a @bos Many small documentation improvements.
authored Nov 30, 2010
31
f8d5f17 @bos Add decodeUtf8'.
authored Mar 15, 2011
32 -- ** Catchable failure
33 , decodeUtf8'
34
8766bac @bos Add controllable error handling and recovery code.
authored Jun 6, 2009
35 -- ** Controllable error handling
36 , decodeUtf8With
37 , decodeUtf16LEWith
38 , decodeUtf16BEWith
39 , decodeUtf32LEWith
40 , decodeUtf32BEWith
a70e3a0 @bos Split encoding support out into new modules
authored Jan 27, 2009
41
42 -- * Encoding Text to ByteStrings
43 , encodeUtf8
44 , encodeUtf16LE
45 , encodeUtf16BE
46 , encodeUtf32LE
47 , encodeUtf32BE
48 ) where
be3f9d8 @bos Add a rewrite rule for fusion
authored Oct 14, 2010
49
f8d5f17 @bos Add decodeUtf8'.
authored Mar 16, 2011
50 import Control.Exception (evaluate, try)
efa8732 @bos Portable native UTF-8 decoder gives 3.7x faster decoding
authored Jul 10, 2011
51 import Control.Monad.ST (unsafeIOToST, unsafeSTToIO)
8dbe7ad @bos Rewrite encodeUtf8 for speed
authored Oct 14, 2010
52 import Data.Bits ((.&.))
53 import Data.ByteString as B
54 import Data.ByteString.Internal as B
f8d5f17 @bos Add decodeUtf8'.
authored Mar 16, 2011
55 import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
2189300 @bos Write a faster UTF-8 decoder
authored Oct 14, 2010
56 import Data.Text.Internal (Text(..), textP)
57 import Data.Text.UnsafeChar (ord, unsafeWrite)
8dbe7ad @bos Rewrite encodeUtf8 for speed
authored Oct 14, 2010
58 import Data.Text.UnsafeShift (shiftL, shiftR)
59 import Data.Word (Word8)
efa8732 @bos Portable native UTF-8 decoder gives 3.7x faster decoding
authored Jul 10, 2011
60 import Foreign.C.Types (CSize)
8dbe7ad @bos Rewrite encodeUtf8 for speed
authored Oct 14, 2010
61 import Foreign.ForeignPtr (withForeignPtr)
efa8732 @bos Portable native UTF-8 decoder gives 3.7x faster decoding
authored Jul 10, 2011
62 import Foreign.Marshal.Utils (with)
63 import Foreign.Ptr (Ptr, plusPtr)
64 import Foreign.Storable (peek, poke)
65 import GHC.Base (MutableByteArray#)
8dbe7ad @bos Rewrite encodeUtf8 for speed
authored Oct 14, 2010
66 import System.IO.Unsafe (unsafePerformIO)
67 import qualified Data.Text.Array as A
a70e3a0 @bos Split encoding support out into new modules
authored Jan 27, 2009
68 import qualified Data.Text.Encoding.Fusion as E
2189300 @bos Write a faster UTF-8 decoder
authored Oct 14, 2010
69 import qualified Data.Text.Encoding.Utf16 as U16
8dbe7ad @bos Rewrite encodeUtf8 for speed
authored Oct 14, 2010
70 import qualified Data.Text.Fusion as F
a70e3a0 @bos Split encoding support out into new modules
authored Jan 27, 2009
71
f97e59a @bos Many small documentation improvements.
authored Nov 30, 2010
72 -- $strict
73 --
74 -- All of the single-parameter functions for decoding bytestrings
75 -- encoded in one of the Unicode Transformation Formats (UTF) operate
76 -- in a /strict/ mode: each will throw an exception if given invalid
77 -- input.
78 --
79 -- Each function has a variant, whose name is suffixed with -'With',
80 -- that gives greater control over the handling of decoding errors.
81 -- For instance, 'decodeUtf8' will throw an exception, but
82 -- 'decodeUtf8With' allows the programmer to determine what to do on a
83 -- decoding error.
84
9e9d83e @bos Mark the ASCII decoding functions as deprecated.
authored Jul 11, 2011
-- | /Deprecated/. Decode a 'ByteString' containing 7-bit ASCII
-- encoded text.
--
-- This function is nothing more than an alias for 'decodeUtf8'; call
-- 'decodeUtf8' directly instead.
decodeASCII :: ByteString -> Text
decodeASCII = decodeUtf8
{-# DEPRECATED decodeASCII "Use decodeUtf8 instead" #-}
a70e3a0 @bos Split encoding support out into new modules
authored Jan 27, 2009
92
f97e59a @bos Many small documentation improvements.
authored Nov 30, 2010
-- | Decode a 'ByteString' containing UTF-8 encoded text, consulting
-- the supplied 'OnDecodeError' handler for every byte at which the
-- native decoder stops.
--
-- For each such byte the handler is called with a description and the
-- byte itself:
--
-- * if it returns 'Nothing', the byte is dropped;
--
-- * if it returns @'Just' c@, @c@ is written to the output in place of
--   the byte.
--
-- The replacement character must lie within the Basic Multilingual
-- Plane (at most U+FFFF).  The output array is sized at one UTF-16
-- code unit per input byte, and a replacement above U+FFFF needs two
-- code units for a single input byte, which could write past the end
-- of the array.  Such a replacement is therefore rejected with a call
-- to 'error'.
decodeUtf8With :: OnDecodeError -> ByteString -> Text
decodeUtf8With onErr (PS fp off len) = textP (fst a) 0 (snd a)
 where
  -- One code unit per input byte; see the note on replacement
  -- characters above.
  a = A.run2 (A.new len >>= unsafeIOToST . go)
  desc = "Data.Text.Encoding.decodeUtf8: Invalid UTF-8 stream"
  nonBmp = "Data.Text.Encoding.decodeUtf8With: " ++
           "non-BMP replacement characters not supported"
  go dest = withForeignPtr fp $ \ptr ->
    -- destOffPtr holds the current write offset into dest; it is shared
    -- with the C decoder and read back here.
    with (0::CSize) $ \destOffPtr -> do
      let end = ptr `plusPtr` (off + len)
          loop curPtr = do
            curPtr' <- c_decode_utf8 (A.maBA dest) destOffPtr curPtr end
            if curPtr' == end
              then do
                -- Whole input consumed: report the array and its length.
                n <- peek destOffPtr
                return (dest,fromIntegral n)
              else do
                -- The decoder stopped early; let the handler decide
                -- what to do with the byte it stopped at.
                x <- peek curPtr'
                case onErr desc (Just x) of
                  Nothing -> loop $ curPtr' `plusPtr` 1
                  Just c
                    | c > '\xFFFF' -> error nonBmp
                    | otherwise -> do
                        destOff <- peek destOffPtr
                        w <- unsafeSTToIO $
                             unsafeWrite dest (fromIntegral destOff) c
                        poke destOffPtr (destOff + fromIntegral w)
                        loop $ curPtr' `plusPtr` 1
      loop (ptr `plusPtr` off)
-- NOTE(review): the line below opens with "{-" rather than "{-#", so it
-- is an ordinary block comment and GHC does not treat it as an INLINE
-- pragma.  This may be deliberate; confirm before re-enabling.
{- INLINE[0] decodeUtf8With #-}
2189300 @bos Write a faster UTF-8 decoder
authored Oct 14, 2010
120
3b0634b @bos Improve error message.
authored Mar 15, 2011
-- | Decode a 'ByteString' containing UTF-8 encoded text that is known
-- to be valid.
--
-- Invalid input is handled by 'strictDecode': an exception will be
-- thrown that cannot be caught in pure code.  To receive the failure
-- as a value use 'decodeUtf8''; to supply your own error handler use
-- 'decodeUtf8With'.
decodeUtf8 :: ByteString -> Text
decodeUtf8 = decodeUtf8With strictDecode
-- Inlining is held back until phase 0 so that the rewrite rule below,
-- which is active from phase 1, can still match on 'decodeUtf8'.
{-# INLINE[0] decodeUtf8 #-}
-- When the decoded text is streamed straight away, stream directly
-- from the bytes instead of building the intermediate 'Text'.
{-# RULES "STREAM stream/decodeUtf8 fusion" [1]
    forall bs. F.stream (decodeUtf8 bs) = E.streamUtf8 strictDecode bs #-}
2189300 @bos Write a faster UTF-8 decoder
authored Oct 14, 2010
133
f8d5f17 @bos Add decodeUtf8'.
authored Mar 16, 2011
-- | Decode a 'ByteString' containing UTF-8 encoded text.
--
-- If the input contains any invalid UTF-8 data, the relevant
-- exception is returned as a 'Left'; otherwise the decoded text is
-- returned as a 'Right'.
--
-- The 'unsafePerformIO' here only wraps 'evaluate' and 'try' around
-- the pure result of 'decodeUtf8With', so that an exception raised
-- while forcing it can be turned into a value.
decodeUtf8' :: ByteString -> Either UnicodeException Text
decodeUtf8' = unsafePerformIO . try . evaluate . decodeUtf8With strictDecode
{-# INLINE decodeUtf8' #-}
141
a70e3a0 @bos Split encoding support out into new modules
authored Jan 27, 2009
-- | Encode text using UTF-8 encoding.
--
-- The UTF-16 code units of the 'Text' are visited in order and the
-- matching UTF-8 bytes are poked into a freshly allocated buffer.  The
-- buffer starts out at @max len 4@ bytes; whenever the next character
-- does not fit, a buffer of twice the size is allocated, the bytes
-- written so far are copied over, and encoding resumes there.
encodeUtf8 :: Text -> ByteString
encodeUtf8 (Text arr off len) = unsafePerformIO $ do
  let initialSize = max len 4
  buf <- mallocByteString initialSize
  fill initialSize off 0 buf
 where
  -- Encode from code unit index srcStart onwards, writing from byte
  -- offset dstStart into the buffer fp, which holds cap bytes.
  fill cap srcStart dstStart fp = withForeignPtr fp $ \ptr ->
    let
      -- Store the low eight bits of v at byte offset k.
      write8 :: Integral a => Int -> a -> IO ()
      write8 k v = poke (ptr `plusPtr` k) (fromIntegral v :: Word8)

      step !src !dst
        | src - off == len = return (PS fp 0 dst)
        | otherwise =
            case A.unsafeIndex arr src of
              w | w <= 0x7F -> reserve 1 $ do
                    write8 dst w
                    step (src+1) (dst+1)
                | w <= 0x7FF -> reserve 2 $ do
                    write8 dst $ (w `shiftR` 6) + 0xC0
                    write8 (dst+1) $ (w .&. 0x3f) + 0x80
                    step (src+1) (dst+2)
                -- High surrogate: combine it with the following code
                -- unit and emit a four-byte sequence.
                | 0xD800 <= w && w <= 0xDBFF -> reserve 4 $ do
                    let c = ord $ U16.chr2 w (A.unsafeIndex arr (src+1))
                    write8 dst $ (c `shiftR` 18) + 0xF0
                    write8 (dst+1) $ ((c `shiftR` 12) .&. 0x3F) + 0x80
                    write8 (dst+2) $ ((c `shiftR` 6) .&. 0x3F) + 0x80
                    write8 (dst+3) $ (c .&. 0x3F) + 0x80
                    step (src+2) (dst+4)
                | otherwise -> reserve 3 $ do
                    write8 dst $ (w `shiftR` 12) + 0xE0
                    write8 (dst+1) $ ((w `shiftR` 6) .&. 0x3F) + 0x80
                    write8 (dst+2) $ (w .&. 0x3F) + 0x80
                    step (src+1) (dst+3)
        where
          -- Run act if need more bytes fit in the current buffer;
          -- otherwise move to a buffer of twice the size and restart
          -- from the current position (act is then not run here).
          reserve need act
            | cap - dst >= need = act
            | otherwise = {-# SCC "resizeUtf8/ensure" #-} do
                let cap' = cap `shiftL` 1
                fp' <- mallocByteString cap'
                withForeignPtr fp' $ \ptr' ->
                  memcpy ptr' ptr (fromIntegral dst)
                fill cap' src dst fp'
          {-# INLINE reserve #-}
    in step srcStart dstStart
a70e3a0 @bos Split encoding support out into new modules
authored Jan 27, 2009
185
-- | Decode text from little endian UTF-16 encoding, using the given
-- 'OnDecodeError' handler to deal with invalid input.
decodeUtf16LEWith :: OnDecodeError -> ByteString -> Text
decodeUtf16LEWith handler input = F.unstream (E.streamUtf16LE handler input)
{-# INLINE decodeUtf16LEWith #-}
190
-- | Decode text from little endian UTF-16 encoding.
--
-- Invalid little endian UTF-16 data is handled by 'strictDecode', so
-- an exception will be thrown.  For more control over the handling of
-- invalid data, use 'decodeUtf16LEWith'.
decodeUtf16LE :: ByteString -> Text
decodeUtf16LE = decodeUtf16LEWith strictDecode
{-# INLINE decodeUtf16LE #-}
199
-- | Decode text from big endian UTF-16 encoding, using the given
-- 'OnDecodeError' handler to deal with invalid input.
decodeUtf16BEWith :: OnDecodeError -> ByteString -> Text
decodeUtf16BEWith handler input = F.unstream (E.streamUtf16BE handler input)
{-# INLINE decodeUtf16BEWith #-}
204
-- | Decode text from big endian UTF-16 encoding.
--
-- Invalid big endian UTF-16 data is handled by 'strictDecode', so an
-- exception will be thrown.  For more control over the handling of
-- invalid data, use 'decodeUtf16BEWith'.
decodeUtf16BE :: ByteString -> Text
decodeUtf16BE = decodeUtf16BEWith strictDecode
{-# INLINE decodeUtf16BE #-}
213
-- | Encode text using little endian UTF-16 encoding.
--
-- The text is streamed, re-streamed as little endian UTF-16 bytes, and
-- collected into a 'ByteString'.
encodeUtf16LE :: Text -> ByteString
encodeUtf16LE t = E.unstream (E.restreamUtf16LE (F.stream t))
{-# INLINE encodeUtf16LE #-}
218
-- | Encode text using big endian UTF-16 encoding.
--
-- The text is streamed, re-streamed as big endian UTF-16 bytes, and
-- collected into a 'ByteString'.
encodeUtf16BE :: Text -> ByteString
encodeUtf16BE t = E.unstream (E.restreamUtf16BE (F.stream t))
{-# INLINE encodeUtf16BE #-}
223
-- | Decode text from little endian UTF-32 encoding, using the given
-- 'OnDecodeError' handler to deal with invalid input.
decodeUtf32LEWith :: OnDecodeError -> ByteString -> Text
decodeUtf32LEWith handler input = F.unstream (E.streamUtf32LE handler input)
{-# INLINE decodeUtf32LEWith #-}
228
-- | Decode text from little endian UTF-32 encoding.
--
-- Invalid little endian UTF-32 data is handled by 'strictDecode', so
-- an exception will be thrown.  For more control over the handling of
-- invalid data, use 'decodeUtf32LEWith'.
decodeUtf32LE :: ByteString -> Text
decodeUtf32LE = decodeUtf32LEWith strictDecode
{-# INLINE decodeUtf32LE #-}
237
-- | Decode text from big endian UTF-32 encoding, using the given
-- 'OnDecodeError' handler to deal with invalid input.
decodeUtf32BEWith :: OnDecodeError -> ByteString -> Text
decodeUtf32BEWith handler input = F.unstream (E.streamUtf32BE handler input)
{-# INLINE decodeUtf32BEWith #-}
242
-- | Decode text from big endian UTF-32 encoding.
--
-- Invalid big endian UTF-32 data is handled by 'strictDecode', so an
-- exception will be thrown.  For more control over the handling of
-- invalid data, use 'decodeUtf32BEWith'.
decodeUtf32BE :: ByteString -> Text
decodeUtf32BE = decodeUtf32BEWith strictDecode
{-# INLINE decodeUtf32BE #-}
251
-- | Encode text using little endian UTF-32 encoding.
--
-- The text is streamed, re-streamed as little endian UTF-32 bytes, and
-- collected into a 'ByteString'.
encodeUtf32LE :: Text -> ByteString
encodeUtf32LE t = E.unstream (E.restreamUtf32LE (F.stream t))
{-# INLINE encodeUtf32LE #-}
256
-- | Encode text using big endian UTF-32 encoding.
--
-- The text is streamed, re-streamed as big endian UTF-32 bytes, and
-- collected into a 'ByteString'.
encodeUtf32BE :: Text -> ByteString
encodeUtf32BE t = E.unstream (E.restreamUtf32BE (F.stream t))
{-# INLINE encodeUtf32BE #-}
efa8732 @bos Portable native UTF-8 decoder gives 3.7x faster decoding
authored Jul 10, 2011
261
-- Binding to the native UTF-8 decoder, as used by 'decodeUtf8With'.
--
-- Arguments, in order: the destination array, a pointer to the current
-- write offset into that array (the caller reads it back after the
-- call), the current input position, and the end of the input.
--
-- The caller treats a result equal to the end pointer as "all input
-- consumed" and any other result as the address of a byte to hand to
-- the error handler.
-- NOTE(review): the C implementation is not visible here; confirm
-- these semantics against it.
foreign import ccall unsafe "_hs_text_decode_utf8" c_decode_utf8
    :: MutableByteArray# s
    -> Ptr CSize
    -> Ptr Word8
    -> Ptr Word8
    -> IO (Ptr Word8)
Something went wrong with that request. Please try again.