Oops! I accidentally released 0.10.0.1 with backwards-incompatible API changes

This change reapplies the reserved surrogate code space bug fix to 0.10.0.0, and bumps the version to 0.10.0.2.
haskell · Dec 3, 2010 · b00c8a5 · b00c8a5
1 parent d904602
commit b00c8a5
Show file tree

Hide file tree

Showing 4 changed files with 282 additions and 109 deletions.
diff --git a/Data/Text.hs b/Data/Text.hs
@@ -13,20 +13,31 @@
 -- Stability   : experimental
 -- Portability : GHC
 --
--- A time and space-efficient implementation of Unicode text using
--- packed Word16 arrays.  Suitable for performance critical use, both
--- in terms of large data quantities and high speed.
+-- A time and space-efficient implementation of Unicode text.
+-- Suitable for performance critical use, both in terms of large data
+-- quantities and high speed.
+--
+-- /Note/: Read below the synopsis for important notes on the use of
+-- this module.
 --
 -- This module is intended to be imported @qualified@, to avoid name
 -- clashes with "Prelude" functions, e.g.
 --
 -- > import qualified Data.Text as T
+--
+-- To use an extended and very rich family of functions for working
+-- with Unicode text (including normalization, regular expressions,
+-- non-standard encodings, text breaking, and locales), see the
+-- @text-icu@ package: <http://hackage.haskell.org/package/text-icu>
 
 module Data.Text
     (
     -- * Strict vs lazy types
     -- $strict
 
+    -- * Acceptable data
+    -- $replacement
+
     -- * Fusion
     -- $fusion
 
@@ -196,7 +207,7 @@ import Data.String (IsString(..))
 import qualified Data.Text.Fusion as S
 import qualified Data.Text.Fusion.Common as S
 import Data.Text.Fusion (stream, reverseStream, unstream)
-import Data.Text.Internal (Text(..), empty, text, textP)
+import Data.Text.Internal (Text(..), empty, firstf, safe, text, textP)
 import qualified Prelude as P
 import Data.Text.Unsafe (Iter(..), iter, iter_, lengthWord16, reverseIter,
                          unsafeHead, unsafeTail)
@@ -230,6 +241,33 @@ import Data.Int (Int64)
 -- difference being that the strict module uses 'Int' values for
 -- lengths and counts, while the lazy module uses 'Int64' lengths.
 
+-- $replacement
+--
+-- A 'Text' value is a sequence of Unicode scalar values, as defined
+-- in &#xa7;3.9, definition D76 of the Unicode 5.2 standard:
+-- <http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf#page=35>. As
+-- such, a 'Text' cannot contain values in the range U+D800 to U+DFFF
+-- inclusive. Haskell implementations admit all Unicode code points
+-- (&#xa7;3.4, definition D10) as 'Char' values, including code points
+-- from this invalid range.  This means that there are some 'Char'
+-- values that are not valid Unicode scalar values, and the functions
+-- in this module must handle those cases.
+--
+-- Within this module, many functions construct a 'Text' from one or
+-- more 'Char' values. Those functions will substitute 'Char' values
+-- that are not valid Unicode scalar values with the replacement
+-- character \"&#xfffd;\" (U+FFFD).  Functions that perform this
+-- inspection and replacement are documented with the phrase
+-- \"Performs replacement on invalid scalar values\".
+--
+-- (One reason for this policy of replacement is that internally, a
+-- 'Text' value is represented as packed UTF-16 data. Values in the
+-- range U+D800 through U+DFFF are used by UTF-16 to denote surrogate
+-- code points, and so cannot be represented. The functions replace
+-- invalid scalar values, instead of dropping them, as a security
+-- measure. For details, see Unicode Technical Report 36, &#xa7;3.5:
+-- <http://unicode.org/reports/tr36/#Deletion_of_Noncharacters>)
+
 -- $fusion
 --
 -- Most of the functions in this module are subject to /fusion/,
@@ -240,6 +278,7 @@ import Data.Int (Int64)
 --
 -- > import Data.Text as T
 -- > import Data.Text.Encoding as E
+-- > import Data.ByteString (ByteString)
 -- >
 -- > countChars :: ByteString -> Int
 -- > countChars = T.length . T.toUpper . E.decodeUtf8
@@ -251,7 +290,7 @@ import Data.Int (Int64)
 -- function will be compiled down to a single loop over the source
 -- 'ByteString'.
 --
--- Functions that can be fused by the compiler are marked with the
+-- Functions that can be fused by the compiler are documented with the
 -- phrase \"Subject to fusion\".
 
 instance Eq Text where
@@ -320,36 +359,39 @@ compareText ta@(Text _arrA _offA lenA) tb@(Text _arrB _offB lenB)
 -- -----------------------------------------------------------------------------
 -- * Conversion to/from 'Text'
 
--- | /O(n)/ Convert a 'String' into a 'Text'.  Subject to fusion.
+-- | /O(n)/ Convert a 'String' into a 'Text'.  Subject to
+-- fusion.  Performs replacement on invalid scalar values.
 pack :: String -> Text
-pack = unstream . S.streamList
+pack = unstream . S.streamList . L.map safe
 {-# INLINE [1] pack #-}
 
 -- | /O(n)/ Convert a Text into a String.  Subject to fusion.
 unpack :: Text -> String
 unpack = S.unstreamList . stream
 {-# INLINE [1] unpack #-}
 
--- | /O(1)/ Convert a character into a Text.
--- Subject to fusion.
+-- | /O(1)/ Convert a character into a Text.  Subject to fusion.
+-- Performs replacement on invalid scalar values.
 singleton :: Char -> Text
-singleton = unstream . S.singleton
+singleton = unstream . S.singleton . safe
 {-# INLINE [1] singleton #-}
 
 -- -----------------------------------------------------------------------------
 -- * Basic functions
 
 -- | /O(n)/ Adds a character to the front of a 'Text'.  This function
 -- is more costly than its 'List' counterpart because it requires
--- copying a new array.  Subject to fusion.
+-- copying a new array.  Subject to fusion.  Performs replacement on
+-- invalid scalar values.
 cons :: Char -> Text -> Text
-cons c t = unstream (S.cons c (stream t))
+cons c t = unstream (S.cons (safe c) (stream t))
 {-# INLINE cons #-}
 
 -- | /O(n)/ Adds a character to the end of a 'Text'.  This copies the
 -- entire array in the process, unless fused.  Subject to fusion.
+-- Performs replacement on invalid scalar values.
 snoc :: Text -> Char -> Text
-snoc t c = unstream (S.snoc (stream t) c)
+snoc t c = unstream (S.snoc (stream t) (safe c))
 {-# INLINE snoc #-}
 
 -- | /O(n)/ Appends one 'Text' to the other by copying both of them
@@ -523,9 +565,10 @@ compareLength t n = S.compareLengthI (stream t) n
 -- -----------------------------------------------------------------------------
 -- * Transformations
 -- | /O(n)/ 'map' @f@ @t@ is the 'Text' obtained by applying @f@ to
--- each element of @t@.  Subject to fusion.
+-- each element of @t@.  Subject to fusion.  Performs replacement on
+-- invalid scalar values.
 map :: (Char -> Char) -> Text -> Text
-map f t = unstream (S.map f (stream t))
+map f t = unstream (S.map (safe . f) (stream t))
 {-# INLINE [1] map #-}
 
 -- | /O(n)/ The 'intercalate' function takes a 'Text' and a list of
@@ -536,9 +579,10 @@ intercalate t = concat . (U.intersperse t)
 {-# INLINE intercalate #-}
 
 -- | /O(n)/ The 'intersperse' function takes a character and places it
--- between the characters of a 'Text'.  Subject to fusion.
+-- between the characters of a 'Text'.  Subject to fusion.  Performs
+-- replacement on invalid scalar values.
 intersperse     :: Char -> Text -> Text
-intersperse c t = unstream (S.intersperse c (stream t))
+intersperse c t = unstream (S.intersperse (safe c) (stream t))
 {-# INLINE intersperse #-}
 
 -- | /O(n)/ Reverse the characters of a string. Subject to fusion.
@@ -614,7 +658,10 @@ toUpper t = unstream (S.toUpper (stream t))
 {-# INLINE toUpper #-}
 
 -- | /O(n)/ Left-justify a string to the given length, using the
--- specified fill character on the right. Subject to fusion. Examples:
+-- specified fill character on the right. Subject to fusion.
+-- Performs replacement on invalid scalar values.
+--
+-- Examples:
 --
 -- > justifyLeft 7 'x' "foo"    == "fooxxxx"
 -- > justifyLeft 3 'x' "foobar" == "foobar"
@@ -633,7 +680,10 @@ justifyLeft k c t
   #-}
 
 -- | /O(n)/ Right-justify a string to the given length, using the
--- specified fill character on the left. Examples:
+-- specified fill character on the left.  Performs replacement on
+-- invalid scalar values.
+--
+-- Examples:
 --
 -- > justifyRight 7 'x' "bar"    == "xxxxbar"
 -- > justifyRight 3 'x' "foobar" == "foobar"
@@ -644,8 +694,11 @@ justifyRight k c t
   where len = length t
 {-# INLINE justifyRight #-}
 
--- | /O(n)/ Center a string to the given length, using the
--- specified fill character on either side. Examples:
+-- | /O(n)/ Center a string to the given length, using the specified
+-- fill character on either side.  Performs replacement on invalid
+-- scalar values.
+--
+-- Examples:
 --
 -- > center 8 'x' "HS" = "xxxHSxxx"
 center :: Int -> Char -> Text -> Text
@@ -760,53 +813,63 @@ minimum t = S.minimum (stream t)
 
 -- | /O(n)/ 'scanl' is similar to 'foldl', but returns a list of
 -- successive reduced values from the left. Subject to fusion.
+-- Performs replacement on invalid scalar values, including the seed.
 --
 -- > scanl f z [x1, x2, ...] == [z, z `f` x1, (z `f` x1) `f` x2, ...]
 --
 -- Note that
 --
 -- > last (scanl f z xs) == foldl f z xs.
 scanl :: (Char -> Char -> Char) -> Char -> Text -> Text
-scanl f z t = unstream (S.scanl f z (stream t))
+scanl f z t = unstream (S.scanl g (safe z) (stream t))
+    where g a b = safe (f a b)
 {-# INLINE scanl #-}
 
 -- | /O(n)/ 'scanl1' is a variant of 'scanl' that has no starting
--- value argument.  Subject to fusion.
+-- value argument.  Subject to fusion.  Performs replacement on
+-- invalid scalar values.
 --
 -- > scanl1 f [x1, x2, ...] == [x1, x1 `f` x2, ...]
 scanl1 :: (Char -> Char -> Char) -> Text -> Text
 scanl1 f t | null t    = empty
            | otherwise = scanl f (unsafeHead t) (unsafeTail t)
 {-# INLINE scanl1 #-}
 
--- | /O(n)/ 'scanr' is the right-to-left dual of 'scanl'.
+-- | /O(n)/ 'scanr' is the right-to-left dual of 'scanl'.  Performs
+-- replacement on invalid scalar values, including the seed value.
 --
 -- > scanr f v == reverse . scanl (flip f) v . reverse
 scanr :: (Char -> Char -> Char) -> Char -> Text -> Text
-scanr f z = S.reverse . S.reverseScanr f z . reverseStream
+scanr f z = S.reverse . S.reverseScanr g (safe z) . reverseStream
+    where g a b = safe (f a b)
 {-# INLINE scanr #-}
 
 -- | /O(n)/ 'scanr1' is a variant of 'scanr' that has no starting
--- value argument.  Subject to fusion.
+-- value argument.  Subject to fusion.  Performs replacement on
+-- invalid scalar values.
 scanr1 :: (Char -> Char -> Char) -> Text -> Text
 scanr1 f t | null t    = empty
            | otherwise = scanr f (last t) (init t)
 {-# INLINE scanr1 #-}
 
 -- | /O(n)/ Like a combination of 'map' and 'foldl''. Applies a
 -- function to each element of a 'Text', passing an accumulating
--- parameter from left to right, and returns a final 'Text'.
+-- parameter from left to right, and returns a final 'Text'.  Performs
+-- replacement on invalid scalar values.
 mapAccumL :: (a -> Char -> (a,Char)) -> a -> Text -> (a, Text)
-mapAccumL f z0 = S.mapAccumL f z0 . stream
+mapAccumL f z0 = S.mapAccumL g z0 . stream
+    where g a b = second safe (f a b)
 {-# INLINE mapAccumL #-}
 
 -- | The 'mapAccumR' function behaves like a combination of 'map' and
 -- a strict 'foldr'; it applies a function to each element of a
 -- 'Text', passing an accumulating parameter from right to left, and
 -- returning a final value of this accumulator together with the new
 -- 'Text'.
+-- Performs replacement on invalid scalar values.
 mapAccumR :: (a -> Char -> (a,Char)) -> a -> Text -> (a, Text)
-mapAccumR f z0 = second reverse . S.mapAccumL f z0 . reverseStream
+mapAccumR f z0 = second reverse . S.mapAccumL g z0 . reverseStream
+    where g a b = second safe (f a b)
 {-# INLINE mapAccumR #-}
 
 -- -----------------------------------------------------------------------------
@@ -839,7 +902,7 @@ replicate n t@(Text a o l)
 -- | /O(n)/ 'replicateChar' @n@ @c@ is a 'Text' of length @n@ with @c@ the
 -- value of every element. Subject to fusion.
 replicateChar :: Int -> Char -> Text
-replicateChar n c = unstream (S.replicateCharI n c)
+replicateChar n c = unstream (S.replicateCharI n (safe c))
 {-# INLINE replicateChar #-}
 
 -- | /O(n)/, where @n@ is the length of the result. The 'unfoldr'
@@ -848,19 +911,19 @@ replicateChar n c = unstream (S.replicateCharI n c)
 -- returns 'Nothing' if it is done producing the 'Text', otherwise
 -- 'Just' @(a,b)@.  In this case, @a@ is the next 'Char' in the
 -- string, and @b@ is the seed value for further production. Subject
--- to fusion.
+-- to fusion.  Performs replacement on invalid scalar values.
 unfoldr     :: (a -> Maybe (Char,a)) -> a -> Text
-unfoldr f s = unstream (S.unfoldr f s)
+unfoldr f s = unstream (S.unfoldr (firstf safe . f) s)
 {-# INLINE unfoldr #-}
 
 -- | /O(n)/ Like 'unfoldr', 'unfoldrN' builds a 'Text' from a seed
 -- value. However, the length of the result should be limited by the
 -- first argument to 'unfoldrN'. This function is more efficient than
 -- 'unfoldr' when the maximum length of the result is known and
 -- correct, otherwise its performance is similar to 'unfoldr'. Subject
--- to fusion.
+-- to fusion.  Performs replacement on invalid scalar values.
 unfoldrN     :: Int -> (a -> Maybe (Char,a)) -> a -> Text
-unfoldrN n f s = unstream (S.unfoldrN n f s)
+unfoldrN n f s = unstream (S.unfoldrN n (firstf safe . f) s)
 {-# INLINE unfoldrN #-}
 
 -- -----------------------------------------------------------------------------
@@ -1297,8 +1360,10 @@ zip a b = S.unstreamList $ S.zipWith (,) (stream a) (stream b)
 
 -- | /O(n)/ 'zipWith' generalises 'zip' by zipping with the function
 -- given as the first argument, instead of a tupling function.
+-- Performs replacement on invalid scalar values.
 zipWith :: (Char -> Char -> Char) -> Text -> Text -> Text
-zipWith f t1 t2 = unstream (S.zipWith f (stream t1) (stream t2))
+zipWith f t1 t2 = unstream (S.zipWith g (stream t1) (stream t2))
+    where g a b = safe (f a b)
 {-# INLINE [0] zipWith #-}
 
 -- | /O(n)/ Breaks a 'Text' up into a list of words, delimited by 'Char's
@@ -1362,8 +1427,7 @@ unwords = intercalate (singleton ' ')
 {-# INLINE unwords #-}
 
 -- | /O(n)/ The 'isPrefixOf' function takes two 'Text's and returns
--- 'True' iff the first is a prefix of the second.  This function is
--- subject to fusion.
+-- 'True' iff the first is a prefix of the second.  Subject to fusion.
 isPrefixOf :: Text -> Text -> Bool
 isPrefixOf a@(Text _ _ alen) b@(Text _ _ blen) =
     alen <= blen && S.isPrefixOf (stream a) (stream b)

diff --git a/Data/Text/Internal.hs b/Data/Text/Internal.hs
@@ -22,16 +22,22 @@ module Data.Text.Internal
     -- * Construction
     , text
     , textP
+    -- * Safety
+    , safe
     -- * Code that must be here for accessibility
     , empty
+    -- * Utilities
+    , firstf
     -- * Debugging
     , showText
     ) where
 
 #if defined(ASSERTS)
 import Control.Exception (assert)
 #endif
+import Data.Bits ((.&.))
 import qualified Data.Text.Array as A
+import Data.Text.UnsafeChar (ord)
 import Data.Typeable (Typeable)
 
 -- | A space efficient, packed, unboxed Unicode text type.
@@ -72,3 +78,21 @@ showText :: Text -> String
 showText (Text arr off len) =
     "Text " ++ show (A.toList arr off len) ++ ' ' :
             show off ++ ' ' : show len
+
+-- | Map a 'Char' to a 'Text'-safe value.
+--
+-- UTF-16 surrogate code points are not included in the set of Unicode
+-- scalar values, but are unfortunately admitted as valid 'Char'
+-- values by Haskell.  They cannot be represented in a 'Text'.  This
+-- function remaps those code points to the Unicode replacement
+-- character \"&#xfffd;\", and leaves other code points unchanged.
+safe :: Char -> Char
+safe c
+    | ord c .&. 0x1ff800 /= 0xd800 = c
+    | otherwise                    = '\xfffd'
+{-# INLINE safe #-}
+
+-- | Apply a function to the first element of an optional pair.
+firstf :: (a -> c) -> Maybe (a,b) -> Maybe (c,b)
+firstf f (Just (a, b)) = Just (f a, b)
+firstf _  Nothing      = Nothing