Skip to content
Newer
Older
100644 224 lines (201 sloc) 7.67 KB
8dbe7ad @bos Rewrite encodeUtf8 for speed
authored Oct 14, 2010
1 {-# LANGUAGE BangPatterns #-}
a70e3a0 @bos Split encoding support out into new modules
authored Jan 27, 2009
2 -- |
3 -- Module : Data.Text.Encoding
da1bc90 @bos Update copyright
authored Oct 14, 2010
4 -- Copyright : (c) 2008, 2009 Tom Harper,
5 -- (c) 2009, 2010 Bryan O'Sullivan,
6 -- (c) 2009 Duncan Coutts
a70e3a0 @bos Split encoding support out into new modules
authored Jan 27, 2009
7 --
8 -- License : BSD-style
01bb6aa @bos Change Tom's email address
authored Apr 29, 2010
9 -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com,
a70e3a0 @bos Split encoding support out into new modules
authored Jan 27, 2009
10 -- duncan@haskell.org
11 -- Stability : experimental
12 -- Portability : portable
13 --
14 -- Functions for converting 'Text' values to and from 'ByteString',
960d36a @bos Fix Haddocks
authored Feb 27, 2009
15 -- using several standard encodings.
16 --
17 -- To make use of a much larger variety of encodings, use the @text-icu@
18 -- package.
a70e3a0 @bos Split encoding support out into new modules
authored Jan 27, 2009
19
20 module Data.Text.Encoding
21 (
22 -- * Decoding ByteStrings to Text
23 decodeASCII
24 , decodeUtf8
2189300 @bos Write a faster UTF-8 decoder
authored Oct 14, 2010
25 , decodeUtf8'
a70e3a0 @bos Split encoding support out into new modules
authored Jan 27, 2009
26 , decodeUtf16LE
27 , decodeUtf16BE
28 , decodeUtf32LE
29 , decodeUtf32BE
8766bac @bos Add controllable error handling and recovery code.
authored Jun 6, 2009
30 -- ** Controllable error handling
31 , decodeUtf8With
2189300 @bos Write a faster UTF-8 decoder
authored Oct 14, 2010
32 , decodeUtf8With'
8766bac @bos Add controllable error handling and recovery code.
authored Jun 6, 2009
33 , decodeUtf16LEWith
34 , decodeUtf16BEWith
35 , decodeUtf32LEWith
36 , decodeUtf32BEWith
a70e3a0 @bos Split encoding support out into new modules
authored Jan 27, 2009
37
38 -- * Encoding Text to ByteStrings
39 , encodeUtf8
40 , encodeUtf16LE
41 , encodeUtf16BE
42 , encodeUtf32LE
43 , encodeUtf32BE
44 ) where
be3f9d8 @bos Add a rewrite rule for fusion
authored Oct 14, 2010
45
8dbe7ad @bos Rewrite encodeUtf8 for speed
authored Oct 14, 2010
46 import Data.Bits ((.&.))
47 import Data.ByteString as B
48 import Data.ByteString.Internal as B
2189300 @bos Write a faster UTF-8 decoder
authored Oct 14, 2010
49 import Data.ByteString.Unsafe as B
8766bac @bos Add controllable error handling and recovery code.
authored Jun 6, 2009
50 import Data.Text.Encoding.Error (OnDecodeError, strictDecode)
2189300 @bos Write a faster UTF-8 decoder
authored Oct 14, 2010
51 import Data.Text.Internal (Text(..), textP)
52 import Data.Text.UnsafeChar (ord, unsafeWrite)
8dbe7ad @bos Rewrite encodeUtf8 for speed
authored Oct 14, 2010
53 import Data.Text.UnsafeShift (shiftL, shiftR)
54 import Data.Word (Word8)
55 import Foreign.ForeignPtr (withForeignPtr)
56 import Foreign.Ptr (plusPtr)
57 import Foreign.Storable (poke)
58 import System.IO.Unsafe (unsafePerformIO)
59 import qualified Data.Text.Array as A
a70e3a0 @bos Split encoding support out into new modules
authored Jan 27, 2009
60 import qualified Data.Text.Encoding.Fusion as E
2189300 @bos Write a faster UTF-8 decoder
authored Oct 14, 2010
61 import qualified Data.Text.Encoding.Utf16 as U16
62 import qualified Data.Text.Encoding.Utf8 as U8
8dbe7ad @bos Rewrite encodeUtf8 for speed
authored Oct 14, 2010
63 import qualified Data.Text.Fusion as F
a70e3a0 @bos Split encoding support out into new modules
authored Jan 27, 2009
64
{-# INLINE decodeASCII #-}
-- | Interpret the bytes of a 'ByteString' as 7-bit ASCII text.
--
-- The input is turned into a stream by @E.streamASCII@ and that stream
-- is then materialised as a 'Text' value by @F.unstream@.
decodeASCII :: ByteString -> Text
decodeASCII bytes = F.unstream $ E.streamASCII bytes
69
{-# INLINE decodeUtf8With #-}
-- | Decode a 'ByteString' containing UTF-8 encoded text, with a
-- caller-chosen 'OnDecodeError' handler.
--
-- The handler is passed unchanged to the streaming decoder
-- (@E.streamUtf8@); the resulting stream is materialised with
-- @F.unstream@.
decodeUtf8With :: OnDecodeError -> ByteString -> Text
decodeUtf8With handler bytes = F.unstream $ E.streamUtf8 handler bytes
74
{-# INLINE decodeUtf8 #-}
-- | Decode a 'ByteString' containing UTF-8 encoded text.
--
-- This is 'decodeUtf8With' with 'strictDecode' supplied as the error
-- handler.
decodeUtf8 :: ByteString -> Text
decodeUtf8 = decodeUtf8With strictDecode
79
2189300 @bos Write a faster UTF-8 decoder
authored Oct 14, 2010
-- | Decode a 'ByteString' containing UTF-8 encoded text, writing the
-- result directly into a freshly allocated array.
--
-- When the bytes at the current position do not validate as a 1-, 2-,
-- 3- or 4-byte sequence, @onErr@ is called with a description and the
-- byte at that position.  If it returns 'Nothing' the byte is dropped;
-- if it returns @'Just' c@, @c@ is written to the output instead.  In
-- both cases decoding resumes one byte further on.
decodeUtf8With' :: OnDecodeError -> ByteString -> Text
decodeUtf8With' onErr bs = textP (fst a) 0 (snd a)
  where
    -- The output array is allocated with one slot per input byte.
    a   = A.run2 (A.new len >>= outer 0 0)
    len = B.length bs
    -- Description handed to the error handler.  It names this decoder
    -- (it previously, and wrongly, referred to @encodeUtf8@).
    desc = "Data.Text.Encoding.decodeUtf8With': Invalid UTF-8 stream"
    outer n0 m0 arr = go n0 m0
      where
        -- @n@ is the next output slot, @m@ the next input byte.
        go !n !m = do
          -- These bindings are lazy: each is only forced by a guard that
          -- has already checked its index against @len@.
          let x1 = idx m
              x2 = idx (m + 1)
              x3 = idx (m + 2)
              x4 = idx (m + 3)
              idx = B.unsafeIndex bs
          case () of
            _ | m >= len -> return (arr,n)
              | U8.validate1 x1 -> do
                  A.unsafeWrite arr n (fromIntegral x1)
                  go (n+1) (m+1)
              | m+1 < len && U8.validate2 x1 x2 -> do
                  w <- unsafeWrite arr n (U8.chr2 x1 x2)
                  go (n+w) (m+2)
              | m+2 < len && U8.validate3 x1 x2 x3 -> do
                  w <- unsafeWrite arr n (U8.chr3 x1 x2 x3)
                  go (n+w) (m+3)
              | m+3 < len && U8.validate4 x1 x2 x3 x4 -> do
                  w <- unsafeWrite arr n (U8.chr4 x1 x2 x3 x4)
                  go (n+w) (m+4)
              | otherwise -> case onErr desc (Just x1) of
                  Nothing -> go n (m+1)
                  Just c -> do
                    -- NOTE(review): only one input byte is consumed here,
                    -- but 'unsafeWrite' reports how many slots it used
                    -- (@w@).  If a handler can return a character that
                    -- needs more than one slot, this may write past the
                    -- @len@ slots allocated above -- TODO confirm.
                    w <- unsafeWrite arr n c
                    go (n+w) (m+1)
{-# INLINE[0] decodeUtf8With' #-}
2189300 @bos Write a faster UTF-8 decoder
authored Oct 14, 2010
114
{-# INLINE[0] decodeUtf8' #-}
-- | Decode a 'ByteString' containing UTF-8 encoded text.
--
-- This is @decodeUtf8With'@ with 'strictDecode' supplied as the error
-- handler.
decodeUtf8' :: ByteString -> Text
decodeUtf8' = decodeUtf8With' strictDecode

-- When the result of @decodeUtf8'@ is immediately streamed again, skip
-- the intermediate 'Text' and stream straight from the bytes.
{-# RULES "STREAM stream/decodeUtf8' fusion" [1]
    forall bs. F.stream (decodeUtf8' bs) = E.streamUtf8 strictDecode bs #-}
2189300 @bos Write a faster UTF-8 decoder
authored Oct 14, 2010
122
a70e3a0 @bos Split encoding support out into new modules
authored Jan 27, 2009
-- | Encode text using UTF-8 encoding.
--
-- The output buffer is allocated and filled locally inside
-- 'unsafePerformIO' and is only exposed once it has been wrapped in the
-- resulting 'ByteString'.
encodeUtf8 :: Text -> ByteString
encodeUtf8 (Text arr off len) = unsafePerformIO $ do
  -- Start with one byte per input code unit, and never fewer than the 4
  -- free bytes the loop insists on before each write.  (This used to be
  -- @min len 4@, which capped the initial buffer at 4 bytes and forced a
  -- chain of grow-and-copy steps for any non-trivial input.)
  let size0 = max len 4
  mallocByteString size0 >>= start size0 off 0
  where
    -- (Re)enter the loop with a buffer @fp@ of capacity @size@, input
    -- index @n0@ and output index @m0@.
    start size n0 m0 fp = withForeignPtr fp $ loop n0 m0
      where
        loop n1 m1 ptr = go n1 m1
          where
            -- @n@ indexes the input array, @m@ the output buffer.
            go !n !m
              -- All @len@ input code units consumed: @m@ bytes were written.
              | n-off == len = return $! PS fp 0 m
              -- Fewer than 4 bytes free (the most one step writes): double
              -- the buffer, copy what has been written, and carry on.
              | size-m < 4 = {-# SCC "encodeUtf8/resize" #-} do
                  let newSize = size `shiftL` 1
                  fp' <- mallocByteString newSize
                  withForeignPtr fp' $ \ptr' -> memcpy ptr' ptr (fromIntegral m)
                  start newSize n m fp'
              | otherwise = do
                  let poke8 k v = poke (ptr `plusPtr` k) (fromIntegral v :: Word8)
                      w = A.unsafeIndex arr n
                  case () of
                    -- One byte.
                    _ | w <= 0x7F -> do
                          poke8 m w
                          go (n+1) (m+1)
                      -- Two bytes.
                      | w <= 0x7FF -> do
                          poke8 m     $ (w `shiftR` 6) + 0xC0
                          poke8 (m+1) $ (w .&. 0x3f) + 0x80
                          go (n+1) (m+2)
                      -- High surrogate: combine with the following code
                      -- unit and emit four bytes.
                      | 0xD800 <= w && w <= 0xDBFF -> do
                          let c = ord $ U16.chr2 w (A.unsafeIndex arr (n+1))
                          poke8 m     $ (c `shiftR` 18) + 0xF0
                          poke8 (m+1) $ ((c `shiftR` 12) .&. 0x3F) + 0x80
                          poke8 (m+2) $ ((c `shiftR` 6) .&. 0x3F) + 0x80
                          poke8 (m+3) $ (c .&. 0x3F) + 0x80
                          go (n+2) (m+4)
                      -- Everything else: three bytes.
                      | otherwise -> do
                          poke8 m     $ (w `shiftR` 12) + 0xE0
                          poke8 (m+1) $ ((w `shiftR` 6) .&. 0x3F) + 0x80
                          poke8 (m+2) $ (w .&. 0x3F) + 0x80
                          go (n+1) (m+3)
-- The next line is an ordinary block comment, not a pragma (there is no
-- '#' after the opening brace), so no inlining is requested.
{- INLINE encodeUtf8 #-}
a70e3a0 @bos Split encoding support out into new modules
authored Jan 27, 2009
164
{-# INLINE decodeUtf16LEWith #-}
-- | Decode text from little endian UTF-16 encoding, with a
-- caller-chosen 'OnDecodeError' handler.
--
-- The handler is passed unchanged to the streaming decoder
-- (@E.streamUtf16LE@).
decodeUtf16LEWith :: OnDecodeError -> ByteString -> Text
decodeUtf16LEWith handler bytes = F.unstream $ E.streamUtf16LE handler bytes
169
{-# INLINE decodeUtf16LE #-}
-- | Decode text from little endian UTF-16 encoding.
--
-- This is 'decodeUtf16LEWith' with 'strictDecode' supplied as the error
-- handler.
decodeUtf16LE :: ByteString -> Text
decodeUtf16LE = decodeUtf16LEWith strictDecode
174
{-# INLINE decodeUtf16BEWith #-}
-- | Decode text from big endian UTF-16 encoding, with a caller-chosen
-- 'OnDecodeError' handler.
--
-- The handler is passed unchanged to the streaming decoder
-- (@E.streamUtf16BE@).
decodeUtf16BEWith :: OnDecodeError -> ByteString -> Text
decodeUtf16BEWith handler bytes = F.unstream $ E.streamUtf16BE handler bytes
179
{-# INLINE decodeUtf16BE #-}
-- | Decode text from big endian UTF-16 encoding.
--
-- This is 'decodeUtf16BEWith' with 'strictDecode' supplied as the error
-- handler.
decodeUtf16BE :: ByteString -> Text
decodeUtf16BE = decodeUtf16BEWith strictDecode
184
{-# INLINE encodeUtf16LE #-}
-- | Serialise a 'Text' value as UTF-16 with little endian byte order.
--
-- The text is streamed, re-encoded by @E.restreamUtf16LE@, and the
-- resulting stream is collected into a 'ByteString'.
encodeUtf16LE :: Text -> ByteString
encodeUtf16LE txt = E.unstream $ E.restreamUtf16LE $ F.stream txt
189
{-# INLINE encodeUtf16BE #-}
-- | Serialise a 'Text' value as UTF-16 with big endian byte order.
--
-- The text is streamed, re-encoded by @E.restreamUtf16BE@, and the
-- resulting stream is collected into a 'ByteString'.
encodeUtf16BE :: Text -> ByteString
encodeUtf16BE txt = E.unstream $ E.restreamUtf16BE $ F.stream txt
194
{-# INLINE decodeUtf32LEWith #-}
-- | Decode text from little endian UTF-32 encoding, with a
-- caller-chosen 'OnDecodeError' handler.
--
-- The handler is passed unchanged to the streaming decoder
-- (@E.streamUtf32LE@).
decodeUtf32LEWith :: OnDecodeError -> ByteString -> Text
decodeUtf32LEWith handler bytes = F.unstream $ E.streamUtf32LE handler bytes
199
{-# INLINE decodeUtf32LE #-}
-- | Decode text from little endian UTF-32 encoding.
--
-- This is 'decodeUtf32LEWith' with 'strictDecode' supplied as the error
-- handler.
decodeUtf32LE :: ByteString -> Text
decodeUtf32LE = decodeUtf32LEWith strictDecode
204
{-# INLINE decodeUtf32BEWith #-}
-- | Decode text from big endian UTF-32 encoding, with a caller-chosen
-- 'OnDecodeError' handler.
--
-- The handler is passed unchanged to the streaming decoder
-- (@E.streamUtf32BE@).
decodeUtf32BEWith :: OnDecodeError -> ByteString -> Text
decodeUtf32BEWith handler bytes = F.unstream $ E.streamUtf32BE handler bytes
209
{-# INLINE decodeUtf32BE #-}
-- | Decode text from big endian UTF-32 encoding.
--
-- This is 'decodeUtf32BEWith' with 'strictDecode' supplied as the error
-- handler.
decodeUtf32BE :: ByteString -> Text
decodeUtf32BE = decodeUtf32BEWith strictDecode
214
{-# INLINE encodeUtf32LE #-}
-- | Serialise a 'Text' value as UTF-32 with little endian byte order.
--
-- The text is streamed, re-encoded by @E.restreamUtf32LE@, and the
-- resulting stream is collected into a 'ByteString'.
encodeUtf32LE :: Text -> ByteString
encodeUtf32LE txt = E.unstream $ E.restreamUtf32LE $ F.stream txt
219
{-# INLINE encodeUtf32BE #-}
-- | Serialise a 'Text' value as UTF-32 with big endian byte order.
--
-- The text is streamed, re-encoded by @E.restreamUtf32BE@, and the
-- resulting stream is collected into a 'ByteString'.
encodeUtf32BE :: Text -> ByteString
encodeUtf32BE txt = E.unstream $ E.restreamUtf32BE $ F.stream txt
Something went wrong with that request. Please try again.