Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Newer
Older
100644 212 lines (191 sloc) 7.445 kb
8dbe7ad @bos Rewrite encodeUtf8 for speed
authored
1 {-# LANGUAGE BangPatterns #-}
a70e3a0 @bos Split encoding support out into new modules
authored
2 -- |
3 -- Module : Data.Text.Encoding
da1bc90 @bos Update copyright
authored
4 -- Copyright : (c) 2008, 2009 Tom Harper,
5 -- (c) 2009, 2010 Bryan O'Sullivan,
6 -- (c) 2009 Duncan Coutts
a70e3a0 @bos Split encoding support out into new modules
authored
7 --
8 -- License : BSD-style
01bb6aa @bos Change Tom's email address
authored
9 -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com,
a70e3a0 @bos Split encoding support out into new modules
authored
10 -- duncan@haskell.org
11 -- Stability : experimental
12 -- Portability : portable
13 --
14 -- Functions for converting 'Text' values to and from 'ByteString',
960d36a @bos Fix Haddocks
authored
15 -- using several standard encodings.
16 --
17 -- To make use of a much larger variety of encodings, use the @text-icu@
18 -- package.
a70e3a0 @bos Split encoding support out into new modules
authored
19
20 module Data.Text.Encoding
21 (
22 -- * Decoding ByteStrings to Text
23 decodeASCII
24 , decodeUtf8
25 , decodeUtf16LE
26 , decodeUtf16BE
27 , decodeUtf32LE
28 , decodeUtf32BE
8766bac @bos Add controllable error handling and recovery code.
authored
29 -- ** Controllable error handling
30 , decodeUtf8With
31 , decodeUtf16LEWith
32 , decodeUtf16BEWith
33 , decodeUtf32LEWith
34 , decodeUtf32BEWith
a70e3a0 @bos Split encoding support out into new modules
authored
35
36 -- * Encoding Text to ByteStrings
37 , encodeUtf8
38 , encodeUtf16LE
39 , encodeUtf16BE
40 , encodeUtf32LE
41 , encodeUtf32BE
42 ) where
be3f9d8 @bos Add a rewrite rule for fusion
authored
43
8dbe7ad @bos Rewrite encodeUtf8 for speed
authored
44 import Data.Bits ((.&.))
45 import Data.ByteString as B
46 import Data.ByteString.Internal as B
2189300 @bos Write a faster UTF-8 decoder
authored
47 import Data.ByteString.Unsafe as B
8766bac @bos Add controllable error handling and recovery code.
authored
48 import Data.Text.Encoding.Error (OnDecodeError, strictDecode)
2189300 @bos Write a faster UTF-8 decoder
authored
49 import Data.Text.Internal (Text(..), textP)
50 import Data.Text.UnsafeChar (ord, unsafeWrite)
8dbe7ad @bos Rewrite encodeUtf8 for speed
authored
51 import Data.Text.UnsafeShift (shiftL, shiftR)
52 import Data.Word (Word8)
53 import Foreign.ForeignPtr (withForeignPtr)
54 import Foreign.Ptr (plusPtr)
55 import Foreign.Storable (poke)
56 import System.IO.Unsafe (unsafePerformIO)
57 import qualified Data.Text.Array as A
a70e3a0 @bos Split encoding support out into new modules
authored
58 import qualified Data.Text.Encoding.Fusion as E
2189300 @bos Write a faster UTF-8 decoder
authored
59 import qualified Data.Text.Encoding.Utf16 as U16
60 import qualified Data.Text.Encoding.Utf8 as U8
8dbe7ad @bos Rewrite encodeUtf8 for speed
authored
61 import qualified Data.Text.Fusion as F
a70e3a0 @bos Split encoding support out into new modules
authored
62
-- | Convert a 'ByteString' of 7-bit ASCII encoded text into 'Text'.
--
-- The bytes are turned into a character stream and that stream is then
-- materialised as a 'Text' value.
decodeASCII :: ByteString -> Text
decodeASCII bytes = F.unstream $ E.streamASCII bytes
{-# INLINE decodeASCII #-}
67
8766bac @bos Add controllable error handling and recovery code.
authored
-- | Decode a 'ByteString' containing UTF-8 encoded text, consulting the
-- supplied 'OnDecodeError' handler for every byte that does not start a
-- valid sequence.
--
-- When the handler returns 'Nothing' the offending byte is skipped; when it
-- returns @'Just' c@ the character @c@ is written in its place.  In both
-- cases decoding resumes at the following byte.
decodeUtf8With :: OnDecodeError -> ByteString -> Text
decodeUtf8With onErr bs = textP (fst decoded) 0 (snd decoded)
 where
  -- The filled array paired with the number of array slots actually used.
  decoded  = A.run2 (A.new inputLen >>= fill 0 0)
  inputLen = B.length bs
  errMsg   = "Data.Text.Encoding.decodeUtf8: Invalid UTF-8 stream"

  -- Fill the mutable array, starting at output slot dst0 and input byte src0.
  fill dst0 src0 marr = step dst0 src0
   where
    byteAt = B.unsafeIndex bs

    -- dst: next free output slot; src: next unread input byte.
    step !dst !src
      | src >= inputLen = return (marr, dst)
      | U8.validate1 b1 = do
          A.unsafeWrite marr dst (fromIntegral b1)
          step (dst + 1) (src + 1)
      | src + 1 < inputLen && U8.validate2 b1 b2 =
          emit 2 (U8.chr2 b1 b2)
      | src + 2 < inputLen && U8.validate3 b1 b2 b3 =
          emit 3 (U8.chr3 b1 b2 b3)
      | src + 3 < inputLen && U8.validate4 b1 b2 b3 b4 =
          emit 4 (U8.chr4 b1 b2 b3 b4)
      | otherwise = case onErr errMsg (Just b1) of
          Nothing -> step dst (src + 1)
          Just c  -> emit 1 c
     where
      -- These bindings are lazy: each byte is only read once a guard that
      -- has already bounds-checked its index demands it.
      b1 = byteAt src
      b2 = byteAt (src + 1)
      b3 = byteAt (src + 2)
      b4 = byteAt (src + 3)

      -- Write one character, then continue after the bytes it consumed.
      emit used c = do
        w <- unsafeWrite marr dst c
        step (dst + w) (src + used)
{-# INLINE[0] decodeUtf8With #-}
2189300 @bos Write a faster UTF-8 decoder
authored
102
-- | Decode a 'ByteString' holding UTF-8 encoded text.
--
-- This is 'decodeUtf8With' with 'strictDecode' as the error handler.
decodeUtf8 :: ByteString -> Text
decodeUtf8 = decodeUtf8With strictDecode
{-# INLINE[0] decodeUtf8 #-}

-- Fusion rule: streaming the result of 'decodeUtf8' is rewritten to stream
-- straight from the bytes.  The rule is active from phase 1, while
-- 'decodeUtf8' itself is only inlined from phase 0, so the rule gets the
-- chance to match first.
{-# RULES "STREAM stream/decodeUtf8 fusion" [1]
    forall bs. F.stream (decodeUtf8 bs) = E.streamUtf8 strictDecode bs #-}
2189300 @bos Write a faster UTF-8 decoder
authored
110
a70e3a0 @bos Split encoding support out into new modules
authored
-- | Encode text using UTF-8 encoding.
--
-- The output buffer starts with one byte per input array element (but never
-- fewer than the 4 bytes of headroom the loop insists on) and is doubled,
-- with its contents copied across, whenever fewer than 4 free bytes remain.
encodeUtf8 :: Text -> ByteString
encodeUtf8 (Text arr off len) = unsafePerformIO $ do
  -- 'max', not 'min': with 'min' the buffer never started above 4 bytes, so
  -- any non-trivial input was pushed through a long chain of
  -- double-and-copy resizes.
  let size0 = max len 4
  mallocByteString size0 >>= start size0 off 0
 where
  -- (Re)enter the encoding loop with buffer fp of capacity size, resuming at
  -- input index n0 and output offset m0.
  start size n0 m0 fp = withForeignPtr fp $ loop n0 m0
   where
    loop n1 m1 ptr = go n1 m1
     where
      -- n: index of the next input element; m: bytes written so far.
      go !n !m
        | n-off == len = return $! PS fp 0 m
        | size-m < 4 = {-# SCC "encodeUtf8/resize" #-} do
            -- Fewer than 4 free bytes (the longest sequence written below):
            -- move to a buffer twice as large and carry on from there.
            let newSize = size `shiftL` 1
            fp' <- mallocByteString newSize
            withForeignPtr fp' $ \ptr' -> memcpy ptr' ptr (fromIntegral m)
            start newSize n m fp'
        | otherwise = do
            let poke8 k v = poke (ptr `plusPtr` k) (fromIntegral v :: Word8)
                w = A.unsafeIndex arr n
            case () of
              _ | w <= 0x7F -> do
                    -- One-byte sequence.
                    poke8 m w
                    go (n+1) (m+1)
                | w <= 0x7FF -> do
                    -- Two-byte sequence.
                    poke8 m     $ (w `shiftR` 6) + 0xC0
                    poke8 (m+1) $ (w .&. 0x3f) + 0x80
                    go (n+1) (m+2)
                | 0xD800 <= w && w <= 0xDBFF -> do
                    -- First half of a surrogate pair: combine it with the
                    -- following element and emit a four-byte sequence.
                    let c = ord $ U16.chr2 w (A.unsafeIndex arr (n+1))
                    poke8 m     $ (c `shiftR` 18) + 0xF0
                    poke8 (m+1) $ ((c `shiftR` 12) .&. 0x3F) + 0x80
                    poke8 (m+2) $ ((c `shiftR` 6) .&. 0x3F) + 0x80
                    poke8 (m+3) $ (c .&. 0x3F) + 0x80
                    go (n+2) (m+4)
                | otherwise -> do
                    -- Three-byte sequence.
                    poke8 m     $ (w `shiftR` 12) + 0xE0
                    poke8 (m+1) $ ((w `shiftR` 6) .&. 0x3F) + 0x80
                    poke8 (m+2) $ (w .&. 0x3F) + 0x80
                    go (n+1) (m+3)
-- NOTE(review): this definition was followed by @{- INLINE encodeUtf8 #-}@,
-- which lacks the opening @#@ and is therefore an ordinary comment, not a
-- pragma.  No INLINE pragma is in effect; behaviour is kept that way here.
-- Confirm whether inlining was intended.
a70e3a0 @bos Split encoding support out into new modules
authored
152
-- | Decode little endian UTF-16 encoded bytes into 'Text', handing the
-- supplied 'OnDecodeError' handler to the underlying stream decoder.
decodeUtf16LEWith :: OnDecodeError -> ByteString -> Text
decodeUtf16LEWith handler bytes = F.unstream $ E.streamUtf16LE handler bytes
{-# INLINE decodeUtf16LEWith #-}
157
-- | Decode little endian UTF-16 encoded bytes into 'Text'.
--
-- This is 'decodeUtf16LEWith' with 'strictDecode' as the error handler.
decodeUtf16LE :: ByteString -> Text
decodeUtf16LE = decodeUtf16LEWith strictDecode
{-# INLINE decodeUtf16LE #-}
162
-- | Decode big endian UTF-16 encoded bytes into 'Text', handing the
-- supplied 'OnDecodeError' handler to the underlying stream decoder.
decodeUtf16BEWith :: OnDecodeError -> ByteString -> Text
decodeUtf16BEWith handler bytes = F.unstream $ E.streamUtf16BE handler bytes
{-# INLINE decodeUtf16BEWith #-}
167
-- | Decode big endian UTF-16 encoded bytes into 'Text'.
--
-- This is 'decodeUtf16BEWith' with 'strictDecode' as the error handler.
decodeUtf16BE :: ByteString -> Text
decodeUtf16BE = decodeUtf16BEWith strictDecode
{-# INLINE decodeUtf16BE #-}
172
-- | Serialise 'Text' as little endian UTF-16 bytes.
--
-- The text is streamed, the stream is re-encoded as little endian UTF-16,
-- and the result is collected into a 'ByteString'.
encodeUtf16LE :: Text -> ByteString
encodeUtf16LE text = E.unstream $ E.restreamUtf16LE (F.stream text)
{-# INLINE encodeUtf16LE #-}
177
-- | Serialise 'Text' as big endian UTF-16 bytes.
--
-- The text is streamed, the stream is re-encoded as big endian UTF-16,
-- and the result is collected into a 'ByteString'.
encodeUtf16BE :: Text -> ByteString
encodeUtf16BE text = E.unstream $ E.restreamUtf16BE (F.stream text)
{-# INLINE encodeUtf16BE #-}
182
-- | Decode little endian UTF-32 encoded bytes into 'Text', handing the
-- supplied 'OnDecodeError' handler to the underlying stream decoder.
decodeUtf32LEWith :: OnDecodeError -> ByteString -> Text
decodeUtf32LEWith handler bytes = F.unstream $ E.streamUtf32LE handler bytes
{-# INLINE decodeUtf32LEWith #-}
187
-- | Decode little endian UTF-32 encoded bytes into 'Text'.
--
-- This is 'decodeUtf32LEWith' with 'strictDecode' as the error handler.
decodeUtf32LE :: ByteString -> Text
decodeUtf32LE = decodeUtf32LEWith strictDecode
{-# INLINE decodeUtf32LE #-}
192
-- | Decode big endian UTF-32 encoded bytes into 'Text', handing the
-- supplied 'OnDecodeError' handler to the underlying stream decoder.
decodeUtf32BEWith :: OnDecodeError -> ByteString -> Text
decodeUtf32BEWith handler bytes = F.unstream $ E.streamUtf32BE handler bytes
{-# INLINE decodeUtf32BEWith #-}
197
-- | Decode big endian UTF-32 encoded bytes into 'Text'.
--
-- This is 'decodeUtf32BEWith' with 'strictDecode' as the error handler.
decodeUtf32BE :: ByteString -> Text
decodeUtf32BE = decodeUtf32BEWith strictDecode
{-# INLINE decodeUtf32BE #-}
202
-- | Serialise 'Text' as little endian UTF-32 bytes.
--
-- The text is streamed, the stream is re-encoded as little endian UTF-32,
-- and the result is collected into a 'ByteString'.
encodeUtf32LE :: Text -> ByteString
encodeUtf32LE text = E.unstream $ E.restreamUtf32LE (F.stream text)
{-# INLINE encodeUtf32LE #-}
207
-- | Serialise 'Text' as big endian UTF-32 bytes.
--
-- The text is streamed, the stream is re-encoded as big endian UTF-32,
-- and the result is collected into a 'ByteString'.
encodeUtf32BE :: Text -> ByteString
encodeUtf32BE text = E.unstream $ E.restreamUtf32BE (F.stream text)
{-# INLINE encodeUtf32BE #-}
Something went wrong with that request. Please try again.