Permalink
Browse files

Add controllable error handling and recovery code.

--HG--
extra : convert_revision : 3795901067732c91b235f9281f8e3691756dc5d3
  • Loading branch information...
1 parent 62aa968 commit 8766bacfca7a17c97b43d942f87685fcdae6f976 @bos committed Jun 6, 2009
Showing with 203 additions and 43 deletions.
  1. +37 −5 Data/Text/Encoding.hs
  2. +108 −0 Data/Text/Encoding/Error.hs
  3. +26 −19 Data/Text/Encoding/Fusion.hs
  4. +8 −1 Data/Text/Lazy/Encoding.hs
  5. +23 −18 Data/Text/Lazy/Encoding/Fusion.hs
  6. +1 −0 text.cabal
View
@@ -25,6 +25,12 @@ module Data.Text.Encoding
, decodeUtf16BE
, decodeUtf32LE
, decodeUtf32BE
+ -- ** Controllable error handling
+ , decodeUtf8With
+ , decodeUtf16LEWith
+ , decodeUtf16BEWith
+ , decodeUtf32LEWith
+ , decodeUtf32BEWith
-- * Encoding Text to ByteStrings
, encodeUtf8
@@ -36,6 +42,7 @@ module Data.Text.Encoding
import Data.ByteString (ByteString)
import qualified Data.Text.Fusion as F
+import Data.Text.Encoding.Error (OnDecodeError, strictDecode)
import qualified Data.Text.Encoding.Fusion as E
import Data.Text.Internal (Text)
@@ -45,8 +52,13 @@ decodeASCII bs = F.unstream (E.streamASCII bs)
{-# INLINE decodeASCII #-}
-- | Decode a 'ByteString' containing UTF-8 encoded text.
+decodeUtf8With :: OnDecodeError -> ByteString -> Text
+decodeUtf8With onErr bs = F.unstream (E.streamUtf8 onErr bs)
+{-# INLINE decodeUtf8With #-}
+
+-- | Decode a 'ByteString' containing UTF-8 encoded text.
decodeUtf8 :: ByteString -> Text
-decodeUtf8 bs = F.unstream (E.streamUtf8 bs)
+decodeUtf8 = decodeUtf8With strictDecode
{-# INLINE decodeUtf8 #-}
-- | Encode text using UTF-8 encoding.
@@ -55,13 +67,23 @@ encodeUtf8 txt = E.unstream (E.restreamUtf8 (F.stream txt))
{-# INLINE encodeUtf8 #-}
-- | Decode text from little endian UTF-16 encoding.
+decodeUtf16LEWith :: OnDecodeError -> ByteString -> Text
+decodeUtf16LEWith onErr bs = F.unstream (E.streamUtf16LE onErr bs)
+{-# INLINE decodeUtf16LEWith #-}
+
+-- | Decode text from little endian UTF-16 encoding.
decodeUtf16LE :: ByteString -> Text
-decodeUtf16LE bs = F.unstream (E.streamUtf16LE bs)
+decodeUtf16LE = decodeUtf16LEWith strictDecode
{-# INLINE decodeUtf16LE #-}
-- | Decode text from big endian UTF-16 encoding.
+decodeUtf16BEWith :: OnDecodeError -> ByteString -> Text
+decodeUtf16BEWith onErr bs = F.unstream (E.streamUtf16BE onErr bs)
+{-# INLINE decodeUtf16BEWith #-}
+
+-- | Decode text from big endian UTF-16 encoding.
decodeUtf16BE :: ByteString -> Text
-decodeUtf16BE bs = F.unstream (E.streamUtf16BE bs)
+decodeUtf16BE = decodeUtf16BEWith strictDecode
{-# INLINE decodeUtf16BE #-}
-- | Encode text using little endian UTF-16 encoding.
@@ -75,13 +97,23 @@ encodeUtf16BE txt = E.unstream (E.restreamUtf16BE (F.stream txt))
{-# INLINE encodeUtf16BE #-}
-- | Decode text from little endian UTF-32 encoding.
+decodeUtf32LEWith :: OnDecodeError -> ByteString -> Text
+decodeUtf32LEWith onErr bs = F.unstream (E.streamUtf32LE onErr bs)
+{-# INLINE decodeUtf32LEWith #-}
+
+-- | Decode text from little endian UTF-32 encoding.
decodeUtf32LE :: ByteString -> Text
-decodeUtf32LE bs = F.unstream (E.streamUtf32LE bs)
+decodeUtf32LE = decodeUtf32LEWith strictDecode
{-# INLINE decodeUtf32LE #-}
-- | Decode text from big endian UTF-32 encoding.
+decodeUtf32BEWith :: OnDecodeError -> ByteString -> Text
+decodeUtf32BEWith onErr bs = F.unstream (E.streamUtf32BE onErr bs)
+{-# INLINE decodeUtf32BEWith #-}
+
+-- | Decode text from big endian UTF-32 encoding.
decodeUtf32BE :: ByteString -> Text
-decodeUtf32BE bs = F.unstream (E.streamUtf32BE bs)
+decodeUtf32BE = decodeUtf32BEWith strictDecode
{-# INLINE decodeUtf32BE #-}
-- | Encode text using little endian UTF-32 encoding.
View
@@ -0,0 +1,108 @@
+{-# LANGUAGE DeriveDataTypeable #-}
+-- |
+-- Module : Data.Text.Encoding.Error
+-- Copyright : (c) Bryan O'Sullivan 2009
+--
+-- License : BSD-style
+-- Maintainer : bos@serpentine.com, rtharper@aftereternity.co.uk,
+-- duncan@haskell.org
+-- Stability : experimental
+-- Portability : GHC
+--
+-- Types and functions for dealing with encoding and decoding errors
+-- in Unicode text.
+--
+-- The standard functions for encoding and decoding text are strict,
+-- which is to say that they throw exceptions on invalid input. This
+-- is often unhelpful on real world input, so alternative functions
+-- exist that accept custom handlers for dealing with invalid inputs.
+-- These 'OnError' handlers are normal Haskell functions. You can use
+-- one of the presupplied functions in this module, or you can write a
+-- custom handler of your own.
+
+module Data.Text.Encoding.Error
+ (
+ -- * Error handling types
+ UnicodeException(..)
+ , OnError
+ , OnDecodeError
+ , OnEncodeError
+ -- * Useful error handling functions
+ , lenientDecode
+ , strictDecode
+ , strictEncode
+ , ignore
+ , replace
+ ) where
+
+import Control.Exception (Exception, throw)
+import Data.Typeable (Typeable)
+import Data.Word (Word8)
+import Numeric (showHex)
+
+-- | Function type for handling a coding error. It is supplied with
+-- two inputs:
+--
+-- * A 'String' that describes the error.
+--
+-- * The input value that caused the error. If the error arose
+-- because the end of input was reached or could not be identified
+-- precisely, this value will be 'Nothing'.
+--
+-- If the handler returns a value wrapped with 'Just', that value will
+-- be used in the output as the replacement for the invalid input. If
+-- it returns 'Nothing', no value will be used in the output.
+--
+-- Should the handler need to abort processing, it should use 'error'
+-- or 'throw' an exception (preferably a 'UnicodeException'). It may
+-- use the description provided to construct a more helpful error
+-- report.
+type OnError a b = String -> Maybe a -> Maybe b
+type OnDecodeError = OnError Word8 Char
+type OnEncodeError = OnError Char Word8
+
+-- | An exception type for representing Unicode encoding errors.
+data UnicodeException =
+ DecodeError String (Maybe Word8)
+ -- ^ Could not decode a byte sequence because it was invalid under
+ -- the given encoding, or ran out of input in mid-decode.
+ | EncodeError String (Maybe Char)
+ -- ^ Tried to encode a character that could not be represented
+ -- under the given encoding, or ran out of input in mid-encode.
+ deriving (Typeable)
+
+showUnicodeException :: UnicodeException -> String
+showUnicodeException (DecodeError desc (Just w))
+ = "Cannot decode byte '\\x" ++ showHex w ("': " ++ desc)
+showUnicodeException (DecodeError desc Nothing)
+ = "Cannot decode input: " ++ desc
+showUnicodeException (EncodeError desc (Just c))
+ = "Cannot encode character '\\x" ++ showHex (fromEnum c) ("': " ++ desc)
+showUnicodeException (EncodeError desc Nothing)
+ = "Cannot encode input: " ++ desc
+
+instance Show UnicodeException where
+ show = showUnicodeException
+
+instance Exception UnicodeException
+
+-- | Throw a 'UnicodeException' if decoding fails.
+strictDecode :: OnError Word8 Char
+strictDecode desc c = throw (DecodeError desc c)
+
+-- | Replace an invalid input byte with the Unicode replacement
+-- character U+FFFD.
+lenientDecode :: OnError Word8 Char
+lenientDecode _ _ = Just '\xfffd'
+
+-- | Throw a 'UnicodeException' if encoding fails.
+strictEncode :: OnError Char Word8
+strictEncode desc c = throw (EncodeError desc c)
+
+-- | Ignore an invalid input, substituting nothing in the output.
+ignore :: OnError a b
+ignore _ _ = Nothing
+
+-- | Replace an invalid input with a valid output.
+replace :: b -> OnError a b
+replace c _ _ = Just c
@@ -1,4 +1,4 @@
-{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE BangPatterns, Rank2Types #-}
-- |
-- Module : Data.Text.Encoding.Fusion
@@ -35,6 +35,7 @@ import Control.Exception (assert)
import Data.ByteString as B
import Data.ByteString.Internal (ByteString(..), mallocByteString, memcpy)
import Data.Text.Fusion (Step(..), Stream(..))
+import Data.Text.Encoding.Error
import Data.Text.Encoding.Fusion.Common
import Data.Text.UnsafeChar (unsafeChr, unsafeChr8, unsafeChr32)
import Data.Text.UnsafeShift (shiftL)
@@ -62,8 +63,8 @@ streamASCII bs = Stream next 0 l
-- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using UTF-8
-- encoding.
-streamUtf8 :: ByteString -> Stream Char
-streamUtf8 bs = Stream next 0 l
+streamUtf8 :: OnDecodeError -> ByteString -> Stream Char
+streamUtf8 onErr bs = Stream next 0 l
where
l = B.length bs
{-# INLINE next #-}
@@ -73,8 +74,9 @@ streamUtf8 bs = Stream next 0 l
| i+1 < l && U8.validate2 x1 x2 = Yield (U8.chr2 x1 x2) (i+2)
| i+2 < l && U8.validate3 x1 x2 x3 = Yield (U8.chr3 x1 x2 x3) (i+3)
| i+3 < l && U8.validate4 x1 x2 x3 x4 = Yield (U8.chr4 x1 x2 x3 x4) (i+4)
- | otherwise = encodingError "UTF-8"
+ | otherwise = decodeError "streamUtf8" "UTF-8" onErr mx (i+1)
where
+ mx = if i >= l then Nothing else Just x1
x1 = idx i
x2 = idx (i + 1)
x3 = idx (i + 2)
@@ -84,16 +86,16 @@ streamUtf8 bs = Stream next 0 l
-- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using little
-- endian UTF-16 encoding.
-streamUtf16LE :: ByteString -> Stream Char
-streamUtf16LE bs = Stream next 0 l
+streamUtf16LE :: OnDecodeError -> ByteString -> Stream Char
+streamUtf16LE onErr bs = Stream next 0 l
where
l = B.length bs
{-# INLINE next #-}
next i
| i >= l = Done
| i+1 < l && U16.validate1 x1 = Yield (unsafeChr x1) (i+2)
| i+3 < l && U16.validate2 x1 x2 = Yield (U16.chr2 x1 x2) (i+4)
- | otherwise = encodingError "UTF-16LE"
+ | otherwise = decodeError "streamUtf16LE" "UTF-16LE" onErr Nothing (i+1)
where
x1 = idx i + (idx (i + 1) `shiftL` 8)
x2 = idx (i + 2) + (idx (i + 3) `shiftL` 8)
@@ -102,16 +104,16 @@ streamUtf16LE bs = Stream next 0 l
-- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using big
-- endian UTF-16 encoding.
-streamUtf16BE :: ByteString -> Stream Char
-streamUtf16BE bs = Stream next 0 l
+streamUtf16BE :: OnDecodeError -> ByteString -> Stream Char
+streamUtf16BE onErr bs = Stream next 0 l
where
l = B.length bs
{-# INLINE next #-}
next i
| i >= l = Done
| i+1 < l && U16.validate1 x1 = Yield (unsafeChr x1) (i+2)
| i+3 < l && U16.validate2 x1 x2 = Yield (U16.chr2 x1 x2) (i+4)
- | otherwise = encodingError "UTF16-BE"
+ | otherwise = decodeError "streamUtf16BE" "UTF-16BE" onErr Nothing (i+1)
where
x1 = (idx i `shiftL` 8) + idx (i + 1)
x2 = (idx (i + 2) `shiftL` 8) + idx (i + 3)
@@ -120,15 +122,15 @@ streamUtf16BE bs = Stream next 0 l
-- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using big
-- endian UTF-32 encoding.
-streamUtf32BE :: ByteString -> Stream Char
-streamUtf32BE bs = Stream next 0 l
+streamUtf32BE :: OnDecodeError -> ByteString -> Stream Char
+streamUtf32BE onErr bs = Stream next 0 l
where
l = B.length bs
{-# INLINE next #-}
next i
| i >= l = Done
| i+3 < l && U32.validate x = Yield (unsafeChr32 x) (i+4)
- | otherwise = encodingError "UTF-32BE"
+ | otherwise = decodeError "streamUtf32BE" "UTF-32BE" onErr Nothing (i+1)
where
x = shiftL x1 24 + shiftL x2 16 + shiftL x3 8 + x4
x1 = idx i
@@ -140,15 +142,15 @@ streamUtf32BE bs = Stream next 0 l
-- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using little
-- endian UTF-32 encoding.
-streamUtf32LE :: ByteString -> Stream Char
-streamUtf32LE bs = Stream next 0 l
+streamUtf32LE :: OnDecodeError -> ByteString -> Stream Char
+streamUtf32LE onErr bs = Stream next 0 l
where
l = B.length bs
{-# INLINE next #-}
next i
| i >= l = Done
| i+3 < l && U32.validate x = Yield (unsafeChr32 x) (i+4)
- | otherwise = encodingError "UTF-32LE"
+ | otherwise = decodeError "streamUtf32LE" "UTF-32LE" onErr Nothing (i+1)
where
x = shiftL x4 24 + shiftL x3 16 + shiftL x2 8 + x1
x1 = idx i
@@ -187,6 +189,11 @@ unstream (Stream next s0 len) = unsafePerformIO $ do
memcpy dest' src' (fromIntegral srcLen)
return dest
-encodingError :: String -> a
-encodingError encoding =
- error $ "Data.Text.Encoding.Fusion: Bad " ++ encoding ++ " stream"
+decodeError :: forall s. String -> String -> OnDecodeError -> Maybe Word8
+ -> s -> Step s Char
+decodeError func kind onErr mb i =
+ case onErr desc mb of
+ Nothing -> Skip i
+ Just c -> Yield c i
+ where desc = "Data.Text.Encoding.Fusion." ++ func ++ ": Invalid " ++
+ kind ++ " stream"
@@ -19,6 +19,7 @@ module Data.Text.Lazy.Encoding
-- * Decoding ByteStrings to Text
-- decodeASCII
decodeUtf8
+ , decodeUtf8With
--, decodeUtf16LE
--, decodeUtf16BE
--, decodeUtf32LE
@@ -33,13 +34,19 @@ module Data.Text.Lazy.Encoding
) where
import Data.ByteString.Lazy (ByteString)
+import Data.Text.Encoding.Error (OnDecodeError, strictDecode)
import Data.Text.Lazy (Text)
import qualified Data.Text.Lazy.Fusion as F
import qualified Data.Text.Lazy.Encoding.Fusion as E
-- | Decode a 'ByteString' containing UTF-8 encoded text.
+decodeUtf8With :: OnDecodeError -> ByteString -> Text
+decodeUtf8With onErr bs = F.unstream (E.streamUtf8 onErr bs)
+{-# INLINE decodeUtf8With #-}
+
+-- | Decode a 'ByteString' containing UTF-8 encoded text.
decodeUtf8 :: ByteString -> Text
-decodeUtf8 bs = F.unstream (E.streamUtf8 bs)
+decodeUtf8 = decodeUtf8With strictDecode
{-# INLINE decodeUtf8 #-}
-- | Encode text using UTF-8 encoding.
Oops, something went wrong.

0 comments on commit 8766bac

Please sign in to comment.