Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Newer
Older
100644 221 lines (199 sloc) 7.729 kb
8dbe7ad @bos Rewrite encodeUtf8 for speed
authored
1 {-# LANGUAGE BangPatterns #-}
a70e3a0 @bos Split encoding support out into new modules
authored
2 -- |
3 -- Module : Data.Text.Encoding
da1bc90 @bos Update copyright
authored
4 -- Copyright : (c) 2008, 2009 Tom Harper,
5 -- (c) 2009, 2010 Bryan O'Sullivan,
6 -- (c) 2009 Duncan Coutts
a70e3a0 @bos Split encoding support out into new modules
authored
7 --
8 -- License : BSD-style
01bb6aa @bos Change Tom's email address
authored
9 -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com,
a70e3a0 @bos Split encoding support out into new modules
authored
10 -- duncan@haskell.org
11 -- Stability : experimental
12 -- Portability : portable
13 --
14 -- Functions for converting 'Text' values to and from 'ByteString',
960d36a @bos Fix Haddocks
authored
15 -- using several standard encodings.
16 --
17 -- To make use of a much larger variety of encodings, use the @text-icu@
18 -- package.
a70e3a0 @bos Split encoding support out into new modules
authored
19
20 module Data.Text.Encoding
21 (
22 -- * Decoding ByteStrings to Text
23 decodeASCII
24 , decodeUtf8
2189300 @bos Write a faster UTF-8 decoder
authored
25 , decodeUtf8'
a70e3a0 @bos Split encoding support out into new modules
authored
26 , decodeUtf16LE
27 , decodeUtf16BE
28 , decodeUtf32LE
29 , decodeUtf32BE
8766bac @bos Add controllable error handling and recovery code.
authored
30 -- ** Controllable error handling
31 , decodeUtf8With
2189300 @bos Write a faster UTF-8 decoder
authored
32 , decodeUtf8With'
8766bac @bos Add controllable error handling and recovery code.
authored
33 , decodeUtf16LEWith
34 , decodeUtf16BEWith
35 , decodeUtf32LEWith
36 , decodeUtf32BEWith
a70e3a0 @bos Split encoding support out into new modules
authored
37
38 -- * Encoding Text to ByteStrings
39 , encodeUtf8
40 , encodeUtf16LE
41 , encodeUtf16BE
42 , encodeUtf32LE
43 , encodeUtf32BE
44 ) where
45
8dbe7ad @bos Rewrite encodeUtf8 for speed
authored
46 import Data.Bits ((.&.))
47 import Data.ByteString as B
48 import Data.ByteString.Internal as B
2189300 @bos Write a faster UTF-8 decoder
authored
49 import Data.ByteString.Unsafe as B
8766bac @bos Add controllable error handling and recovery code.
authored
50 import Data.Text.Encoding.Error (OnDecodeError, strictDecode)
2189300 @bos Write a faster UTF-8 decoder
authored
51 import Data.Text.Internal (Text(..), textP)
52 import Data.Text.UnsafeChar (ord, unsafeWrite)
8dbe7ad @bos Rewrite encodeUtf8 for speed
authored
53 import Data.Text.UnsafeShift (shiftL, shiftR)
54 import Data.Word (Word8)
55 import Foreign.ForeignPtr (withForeignPtr)
56 import Foreign.Ptr (plusPtr)
57 import Foreign.Storable (poke)
58 import System.IO.Unsafe (unsafePerformIO)
59 import qualified Data.Text.Array as A
a70e3a0 @bos Split encoding support out into new modules
authored
60 import qualified Data.Text.Encoding.Fusion as E
2189300 @bos Write a faster UTF-8 decoder
authored
61 import qualified Data.Text.Encoding.Utf16 as U16
62 import qualified Data.Text.Encoding.Utf8 as U8
8dbe7ad @bos Rewrite encodeUtf8 for speed
authored
63 import qualified Data.Text.Fusion as F
a70e3a0 @bos Split encoding support out into new modules
authored
64
-- | Decode a 'ByteString' containing 7-bit ASCII encoded text.
--
-- The bytes are streamed with @E.streamASCII@ and the resulting stream
-- is materialised as a 'Text' with @F.unstream@.
decodeASCII :: ByteString -> Text
decodeASCII bytes = F.unstream $ E.streamASCII bytes
{-# INLINE decodeASCII #-}
69
-- | Decode a 'ByteString' containing UTF-8 encoded text, passing the
-- supplied 'OnDecodeError' handler to the underlying stream decoder
-- (@E.streamUtf8@).
decodeUtf8With :: OnDecodeError -> ByteString -> Text
decodeUtf8With handler bytes = F.unstream $ E.streamUtf8 handler bytes
{-# INLINE decodeUtf8With #-}
74
75 -- | Decode a 'ByteString' containing UTF-8 encoded text. This is 'decodeUtf8With' applied to the 'strictDecode' error handler (see "Data.Text.Encoding.Error" for that handler's behaviour).
a70e3a0 @bos Split encoding support out into new modules
authored
76 decodeUtf8 :: ByteString -> Text
8766bac @bos Add controllable error handling and recovery code.
authored
77 decodeUtf8 = decodeUtf8With strictDecode
a70e3a0 @bos Split encoding support out into new modules
authored
78 {-# INLINE decodeUtf8 #-}
79
2189300 @bos Write a faster UTF-8 decoder
authored
-- | Decode a 'ByteString' containing UTF-8 encoded text, writing the
-- decoded characters straight into an array obtained from @A.new@.
--
-- At each input position the bytes are tried, in order, as a 1-, 2-, 3-
-- and 4-byte sequence using the @U8.validate*@ predicates.  If none of
-- them accepts, the byte at that position is handed to @onErr@:
--
-- * 'Nothing' drops the byte;
--
-- * @'Just' c@ writes @c@ in its place.
--
-- In both cases decoding resumes at the next byte.
decodeUtf8With' :: OnDecodeError -> ByteString -> Text
decodeUtf8With' onErr bs = textP (fst a) 0 (snd a)
 where
  -- NOTE(review): @A.new@ is given the input length in bytes.  A handler
  -- that returns a character needing more than one array element for a
  -- single rejected byte could presumably write past the end of the
  -- array -- confirm what handlers are allowed to return.
  a = A.run2 (A.new len >>= outer 0 0)
  len = B.length bs
  -- Description passed to the error handler; it names this function.
  desc = "Data.Text.Encoding.decodeUtf8With': Invalid UTF-8 stream"
  outer n0 m0 arr = go n0 m0
   where
    -- n is the next write offset into arr, m the next read offset into bs.
    go !n !m = do
      -- These bindings are lazy: each is only forced by a guard that has
      -- already checked the index against len.
      let x1 = idx m
          x2 = idx (m + 1)
          x3 = idx (m + 2)
          x4 = idx (m + 3)
          idx = B.unsafeIndex bs
      case () of
        _ | m >= len -> return (arr, n)
          | U8.validate1 x1 -> do
              A.unsafeWrite arr n (fromIntegral x1)
              go (n + 1) (m + 1)
          | m + 1 < len && U8.validate2 x1 x2 -> do
              w <- unsafeWrite arr n (U8.chr2 x1 x2)
              go (n + w) (m + 2)
          | m + 2 < len && U8.validate3 x1 x2 x3 -> do
              w <- unsafeWrite arr n (U8.chr3 x1 x2 x3)
              go (n + w) (m + 3)
          | m + 3 < len && U8.validate4 x1 x2 x3 x4 -> do
              w <- unsafeWrite arr n (U8.chr4 x1 x2 x3 x4)
              go (n + w) (m + 4)
          | otherwise -> case onErr desc (Just x1) of
              Nothing -> go n (m + 1)
              Just c -> do
                w <- unsafeWrite arr n c
                go (n + w) (m + 1)
{-# INLINE decodeUtf8With' #-}
114
115 -- | Decode a 'ByteString' containing UTF-8 encoded text. This is 'decodeUtf8With'' applied to the 'strictDecode' error handler (see "Data.Text.Encoding.Error" for that handler's behaviour).
116 decodeUtf8' :: ByteString -> Text
117 decodeUtf8' = decodeUtf8With' strictDecode
118 {-# INLINE decodeUtf8' #-}
119
a70e3a0 @bos Split encoding support out into new modules
authored
-- | Encode text using UTF-8 encoding.
--
-- The 16-bit code units of the 'Text' are read one at a time and written
-- into a malloc'd byte buffer:
--
-- * units up to @0x7F@ become one byte;
--
-- * units up to @0x7FF@ become two bytes;
--
-- * a unit in @0xD800..0xDBFF@ is combined with the following unit via
--   @U16.chr2@ and becomes four bytes;
--
-- * every other unit becomes three bytes.
--
-- Whenever fewer than four bytes remain free, the buffer is doubled and
-- the bytes written so far are copied across.
encodeUtf8 :: Text -> ByteString
-- unsafePerformIO: the action only reads the input array and writes to
-- buffers it allocates itself, so the result depends only on the argument.
encodeUtf8 (Text arr off len) = unsafePerformIO $ do
  -- Start with one byte per input code unit, and never fewer than the four
  -- free bytes the loop requires before a write.  (Using 'min' here would
  -- cap the initial buffer at 4 bytes and force the buffer to be grown by
  -- repeated doubling-and-copying for any non-trivial input.)
  let size0 = max len 4
  mallocByteString size0 >>= start size0 off 0
 where
  -- (Re)enter the encoding loop with buffer fp of capacity size, resuming
  -- at read offset n0 in arr and write offset m0 in the buffer.
  start size n0 m0 fp = withForeignPtr fp $ loop n0 m0
   where
    loop n1 m1 ptr = go n1 m1
     where
      go !n !m
        -- All len code units consumed: m bytes of fp are the result.
        | n - off == len = return $! PS fp 0 m
        -- Not enough room for the widest (4-byte) encoding: grow.
        | size - m < 4 = {-# SCC "encodeUtf8/resize" #-} do
            let newSize = size `shiftL` 1
            fp' <- mallocByteString newSize
            withForeignPtr fp' $ \ptr' -> memcpy ptr' ptr (fromIntegral m)
            start newSize n m fp'
        | otherwise = do
            let poke8 k v = poke (ptr `plusPtr` k) (fromIntegral v :: Word8)
                w = A.unsafeIndex arr n
            case () of
              _ | w <= 0x7F -> do
                    poke8 m w
                    go (n + 1) (m + 1)
                | w <= 0x7FF -> do
                    poke8 m $ (w `shiftR` 6) + 0xC0
                    poke8 (m + 1) $ (w .&. 0x3f) + 0x80
                    go (n + 1) (m + 2)
                | 0xD800 <= w && w <= 0xDBFF -> do
                    -- Combine this unit with the next one into a single
                    -- code point; two input units are consumed.
                    let c = ord $ U16.chr2 w (A.unsafeIndex arr (n + 1))
                    poke8 m $ (c `shiftR` 18) + 0xF0
                    poke8 (m + 1) $ ((c `shiftR` 12) .&. 0x3F) + 0x80
                    poke8 (m + 2) $ ((c `shiftR` 6) .&. 0x3F) + 0x80
                    poke8 (m + 3) $ (c .&. 0x3F) + 0x80
                    go (n + 2) (m + 4)
                | otherwise -> do
                    poke8 m $ (w `shiftR` 12) + 0xE0
                    poke8 (m + 1) $ ((w `shiftR` 6) .&. 0x3F) + 0x80
                    poke8 (m + 2) $ (w .&. 0x3F) + 0x80
                    go (n + 1) (m + 3)
-- NOTE(review): the line below is an ordinary block comment, not a pragma
-- (it lacks the '#' after the opening brace), so encodeUtf8 is NOT marked
-- INLINE.  Confirm whether that is intended before "fixing" it.
{- INLINE encodeUtf8 #-}
a70e3a0 @bos Split encoding support out into new modules
authored
161
-- | Decode text from little endian UTF-16 encoding, passing the supplied
-- 'OnDecodeError' handler to the underlying stream decoder
-- (@E.streamUtf16LE@).
decodeUtf16LEWith :: OnDecodeError -> ByteString -> Text
decodeUtf16LEWith handler bytes = F.unstream $ E.streamUtf16LE handler bytes
{-# INLINE decodeUtf16LEWith #-}
166
167 -- | Decode text from little endian UTF-16 encoding. This is 'decodeUtf16LEWith' applied to the 'strictDecode' error handler (see "Data.Text.Encoding.Error" for that handler's behaviour).
a70e3a0 @bos Split encoding support out into new modules
authored
168 decodeUtf16LE :: ByteString -> Text
8766bac @bos Add controllable error handling and recovery code.
authored
169 decodeUtf16LE = decodeUtf16LEWith strictDecode
a70e3a0 @bos Split encoding support out into new modules
authored
170 {-# INLINE decodeUtf16LE #-}
171
-- | Decode text from big endian UTF-16 encoding, passing the supplied
-- 'OnDecodeError' handler to the underlying stream decoder
-- (@E.streamUtf16BE@).
decodeUtf16BEWith :: OnDecodeError -> ByteString -> Text
decodeUtf16BEWith handler bytes = F.unstream $ E.streamUtf16BE handler bytes
{-# INLINE decodeUtf16BEWith #-}
176
177 -- | Decode text from big endian UTF-16 encoding. This is 'decodeUtf16BEWith' applied to the 'strictDecode' error handler (see "Data.Text.Encoding.Error" for that handler's behaviour).
a70e3a0 @bos Split encoding support out into new modules
authored
178 decodeUtf16BE :: ByteString -> Text
8766bac @bos Add controllable error handling and recovery code.
authored
179 decodeUtf16BE = decodeUtf16BEWith strictDecode
a70e3a0 @bos Split encoding support out into new modules
authored
180 {-# INLINE decodeUtf16BE #-}
181
-- | Encode text using little endian UTF-16 encoding.
--
-- The 'Text' is streamed with @F.stream@, re-streamed as bytes by
-- @E.restreamUtf16LE@, and collected into a 'ByteString' by @E.unstream@.
encodeUtf16LE :: Text -> ByteString
encodeUtf16LE t = E.unstream $ E.restreamUtf16LE $ F.stream t
{-# INLINE encodeUtf16LE #-}
186
-- | Encode text using big endian UTF-16 encoding.
--
-- The 'Text' is streamed with @F.stream@, re-streamed as bytes by
-- @E.restreamUtf16BE@, and collected into a 'ByteString' by @E.unstream@.
encodeUtf16BE :: Text -> ByteString
encodeUtf16BE t = E.unstream $ E.restreamUtf16BE $ F.stream t
{-# INLINE encodeUtf16BE #-}
191
-- | Decode text from little endian UTF-32 encoding, passing the supplied
-- 'OnDecodeError' handler to the underlying stream decoder
-- (@E.streamUtf32LE@).
decodeUtf32LEWith :: OnDecodeError -> ByteString -> Text
decodeUtf32LEWith handler bytes = F.unstream $ E.streamUtf32LE handler bytes
{-# INLINE decodeUtf32LEWith #-}
196
197 -- | Decode text from little endian UTF-32 encoding. This is 'decodeUtf32LEWith' applied to the 'strictDecode' error handler (see "Data.Text.Encoding.Error" for that handler's behaviour).
a70e3a0 @bos Split encoding support out into new modules
authored
198 decodeUtf32LE :: ByteString -> Text
8766bac @bos Add controllable error handling and recovery code.
authored
199 decodeUtf32LE = decodeUtf32LEWith strictDecode
a70e3a0 @bos Split encoding support out into new modules
authored
200 {-# INLINE decodeUtf32LE #-}
201
-- | Decode text from big endian UTF-32 encoding, passing the supplied
-- 'OnDecodeError' handler to the underlying stream decoder
-- (@E.streamUtf32BE@).
decodeUtf32BEWith :: OnDecodeError -> ByteString -> Text
decodeUtf32BEWith handler bytes = F.unstream $ E.streamUtf32BE handler bytes
{-# INLINE decodeUtf32BEWith #-}
206
207 -- | Decode text from big endian UTF-32 encoding. This is 'decodeUtf32BEWith' applied to the 'strictDecode' error handler (see "Data.Text.Encoding.Error" for that handler's behaviour).
a70e3a0 @bos Split encoding support out into new modules
authored
208 decodeUtf32BE :: ByteString -> Text
8766bac @bos Add controllable error handling and recovery code.
authored
209 decodeUtf32BE = decodeUtf32BEWith strictDecode
a70e3a0 @bos Split encoding support out into new modules
authored
210 {-# INLINE decodeUtf32BE #-}
211
-- | Encode text using little endian UTF-32 encoding.
--
-- The 'Text' is streamed with @F.stream@, re-streamed as bytes by
-- @E.restreamUtf32LE@, and collected into a 'ByteString' by @E.unstream@.
encodeUtf32LE :: Text -> ByteString
encodeUtf32LE t = E.unstream $ E.restreamUtf32LE $ F.stream t
{-# INLINE encodeUtf32LE #-}
216
-- | Encode text using big endian UTF-32 encoding.
--
-- The 'Text' is streamed with @F.stream@, re-streamed as bytes by
-- @E.restreamUtf32BE@, and collected into a 'ByteString' by @E.unstream@.
encodeUtf32BE :: Text -> ByteString
encodeUtf32BE t = E.unstream $ E.restreamUtf32BE $ F.stream t
{-# INLINE encodeUtf32BE #-}
Something went wrong with that request. Please try again.