Skip to content

Commit

Permalink
Fix size hints
Browse files Browse the repository at this point in the history
This fixes a variety of size hint bugs in text's fusion framework. These
issues fell broadly into two classes,

 * Code point/code unit confusion
 * Inappropriate bounds

It seems the most of the latter were introduced when the Size type was
extended to track both upper and lower bounds in f4fc30c. These could
manifest in a variety of issues similar to haskell#197.
  • Loading branch information
bgamari committed Oct 20, 2017
1 parent c91deda commit 424451e
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 28 deletions.
50 changes: 28 additions & 22 deletions Data/Text/Internal/Fusion/Common.hs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{-# LANGUAGE BangPatterns, MagicHash, Rank2Types #-}
{-# LANGUAGE PatternGuards, BangPatterns, MagicHash, Rank2Types #-}
-- |
-- Module : Data.Text.Internal.Fusion.Common
-- Copyright : (c) Bryan O'Sullivan 2009, 2012
Expand Down Expand Up @@ -117,7 +117,7 @@ import GHC.Prim (Addr#, chr#, indexCharOffAddr#, ord#)
import GHC.Types (Char(..), Int(..))

singleton :: Char -> Stream Char
singleton c = Stream next False 1
singleton c = Stream next False (codePointsSize 1)
where next False = Yield c True
next True = Done
{-# INLINE [0] singleton #-}
Expand Down Expand Up @@ -175,7 +175,7 @@ data C s = C0 !s

-- | /O(n)/ Adds a character to the front of a Stream Char.
cons :: Char -> Stream Char -> Stream Char
cons !w (Stream next0 s0 len) = Stream next (C1 s0) (len+1)
cons !w (Stream next0 s0 len) = Stream next (C1 s0) (len + codePointsSize 1)
where
next (C1 s) = Yield w (C0 s)
next (C0 s) = case next0 s of
Expand All @@ -189,7 +189,7 @@ data Snoc a = N

-- | /O(n)/ Adds a character to the end of a stream.
snoc :: Stream Char -> Char -> Stream Char
snoc (Stream next0 xs0 len) w = Stream next (J xs0) (len+1)
snoc (Stream next0 xs0 len) w = Stream next (J xs0) (len + codePointsSize 1)
where
next (J xs) = case next0 xs of
Done -> Yield w N
Expand Down Expand Up @@ -237,7 +237,7 @@ uncons :: Stream Char -> Maybe (Char, Stream Char)
uncons (Stream next s0 len) = loop_uncons s0
where
loop_uncons !s = case next s of
Yield x s1 -> Just (x, Stream next s1 (len-1))
Yield x s1 -> Just (x, Stream next s1 (len - codePointsSize 1))
Skip s' -> loop_uncons s'
Done -> Nothing
{-# INLINE [0] uncons #-}
Expand All @@ -260,7 +260,7 @@ last (Stream next s0 _len) = loop0_last s0
-- | /O(1)/ Returns all characters after the head of a Stream Char, which must
-- be non-empty.
tail :: Stream Char -> Stream Char
tail (Stream next0 s0 len) = Stream next (C0 s0) (len-1)
tail (Stream next0 s0 len) = Stream next (C0 s0) (len - codePointsSize 1)
where
next (C0 s) = case next0 s of
Done -> emptyError "tail"
Expand All @@ -278,7 +278,7 @@ data Init s = Init0 !s
-- | /O(1)/ Returns all but the last character of a Stream Char, which
-- must be non-empty.
init :: Stream Char -> Stream Char
init (Stream next0 s0 len) = Stream next (Init0 s0) (len-1)
init (Stream next0 s0 len) = Stream next (Init0 s0) (len - codePointsSize 1)
where
next (Init0 s) = case next0 s of
Done -> emptyError "init"
Expand Down Expand Up @@ -318,11 +318,14 @@ lengthI (Stream next s0 _len) = loop_length 0 s0
-- greater than the number or if the stream can't possibly be as long
-- as the number supplied, and hence be more efficient.
compareLengthI :: Integral a => Stream Char -> a -> Ordering
compareLengthI (Stream next s0 len) n =
case compareSize len (fromIntegral n) of
Just o -> o
Nothing -> loop_cmp 0 s0
compareLengthI (Stream next s0 len) n
-- Note that @len@ tracks code units whereas we want to compare the length
-- in code points. Specifically, a stream with hint @len@ may consist of
-- anywhere from @len/2@ to @len@ code points.
| Just r <- compareSize len n' = r
| otherwise = loop_cmp 0 s0
where
n' = codePointsSize $ fromIntegral n
loop_cmp !z s = case next s of
Done -> compare z n
Skip s' -> loop_cmp z s'
Expand Down Expand Up @@ -368,7 +371,7 @@ data I s = I1 !s
-- | /O(n)/ Take a character and place it between each of the
-- characters of a 'Stream Char'.
intersperse :: Char -> Stream Char -> Stream Char
intersperse c (Stream next0 s0 len) = Stream next (I1 s0) len
intersperse c (Stream next0 s0 len) = Stream next (I1 s0) (len + unknownSize)
where
next (I1 s) = case next0 s of
Done -> Done
Expand All @@ -393,9 +396,11 @@ intersperse c (Stream next0 s0 len) = Stream next (I1 s0) len
-- functions may map one input character to two or three output
-- characters.

-- | Map a 'Stream' through the given case-mapping function.
caseConvert :: (forall s. Char -> s -> Step (CC s) Char)
-> Stream Char -> Stream Char
caseConvert remap (Stream next0 s0 len) = Stream next (CC s0 '\0' '\0') len
caseConvert remap (Stream next0 s0 len) =
Stream next (CC s0 '\0' '\0') (len `unionSize` fromIntegral 3*len)
where
next (CC s '\0' _) =
case next0 s of
Expand Down Expand Up @@ -458,7 +463,7 @@ toLower = caseConvert lowerMapping
-- Fox\" is correctly title cased&#x2014;but this function will
-- capitalize /every/ word.
toTitle :: Stream Char -> Stream Char
toTitle (Stream next0 s0 len) = Stream next (CC (False :*: s0) '\0' '\0') len
toTitle (Stream next0 s0 len) = Stream next (CC (False :*: s0) '\0' '\0') (len + unknownSize)
where
next (CC (letter :*: s) '\0' _) =
case next0 s of
Expand All @@ -479,7 +484,7 @@ data Justify i s = Just1 !i !s

justifyLeftI :: Integral a => a -> Char -> Stream Char -> Stream Char
justifyLeftI k c (Stream next0 s0 len) =
Stream next (Just1 0 s0) (larger (fromIntegral k) len)
Stream next (Just1 0 s0) (larger (fromIntegral k * charSize c + len) len)
where
next (Just1 n s) =
case next0 s of
Expand Down Expand Up @@ -699,7 +704,7 @@ replicateI n (Stream next0 s0 len) =
-- (a,b), in which case, a is the next Char in the string, and b is
-- the seed value for further production.
unfoldr :: (a -> Maybe (Char,a)) -> a -> Stream Char
unfoldr f s0 = Stream next s0 1 -- HINT maybe too low
unfoldr f s0@(Stream _ _ size) = Stream next s0 (2*size)
where
{-# INLINE next #-}
next !s = case f s of
Expand All @@ -713,7 +718,7 @@ unfoldr f s0 = Stream next s0 1 -- HINT maybe too low
-- 'unfoldr' when the length of the result is known.
unfoldrNI :: Integral a => a -> (b -> Maybe (Char,b)) -> b -> Stream Char
unfoldrNI n f s0 | n < 0 = empty
| otherwise = Stream next (0 :*: s0) (fromIntegral (n*2)) -- HINT maybe too high
| otherwise = Stream next (0 :*: s0) (maxSize $ fromIntegral (n*2))
where
{-# INLINE next #-}
next (z :*: s) = case f s of
Expand All @@ -730,7 +735,7 @@ unfoldrNI n f s0 | n < 0 = empty
-- length of the stream.
take :: Integral a => a -> Stream Char -> Stream Char
take n0 (Stream next0 s0 len) =
Stream next (n0 :*: s0) (smaller len (fromIntegral (max 0 n0)))
Stream next (n0 :*: s0) (smaller len (codePointsSize $ fromIntegral n0))
where
{-# INLINE next #-}
next (n :*: s) | n <= 0 = Done
Expand All @@ -748,7 +753,7 @@ data Drop a s = NS !s
-- is greater than the length of the stream.
drop :: Integral a => a -> Stream Char -> Stream Char
drop n0 (Stream next0 s0 len) =
Stream next (JS n0 s0) (len - fromIntegral (max 0 n0))
Stream next (JS n0 s0) (len - codePointsSize (fromIntegral n0))
where
{-# INLINE next #-}
next (JS n s)
Expand All @@ -766,7 +771,7 @@ drop n0 (Stream next0 s0 len) =
-- | takeWhile, applied to a predicate @p@ and a stream, returns the
-- longest prefix (possibly empty) of elements that satisfy p.
takeWhile :: (Char -> Bool) -> Stream Char -> Stream Char
takeWhile p (Stream next0 s0 len) = Stream next s0 len -- HINT maybe too high
takeWhile p (Stream next0 s0 len) = Stream next s0 (len - unknownSize)
where
{-# INLINE next #-}
next !s = case next0 s of
Expand All @@ -778,7 +783,7 @@ takeWhile p (Stream next0 s0 len) = Stream next s0 len -- HINT maybe too high

-- | dropWhile @p @xs returns the suffix remaining after takeWhile @p @xs.
dropWhile :: (Char -> Bool) -> Stream Char -> Stream Char
dropWhile p (Stream next0 s0 len) = Stream next (L s0) len -- HINT maybe too high
dropWhile p (Stream next0 s0 len) = Stream next (L s0) (len - unknownSize)
where
{-# INLINE next #-}
next (L s) = case next0 s of
Expand Down Expand Up @@ -857,7 +862,8 @@ indexI (Stream next s0 _len) n0
-- returns a stream containing those characters that satisfy the
-- predicate.
filter :: (Char -> Bool) -> Stream Char -> Stream Char
filter p (Stream next0 s0 len) = Stream next s0 len -- HINT maybe too high
filter p (Stream next0 s0 len) =
Stream next s0 (len - unknownSize) -- HINT maybe too high
where
next !s = case next0 s of
Done -> Done
Expand Down
38 changes: 32 additions & 6 deletions Data/Text/Internal/Fusion/Size.hs
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,16 @@
module Data.Text.Internal.Fusion.Size
(
Size
, exactly
-- * Sizes
, exactSize
, maxSize
, betweenSize
, unknownSize
, unionSize
, charSize
, codePointsSize
-- * Querying sizes
, exactly
, smaller
, larger
, upperBound
Expand All @@ -32,11 +37,13 @@ module Data.Text.Internal.Fusion.Size
, isEmpty
) where

import Data.Char (ord)
import Data.Text.Internal (mul)
#if defined(ASSERTS)
import Control.Exception (assert)
#endif

-- | A size in UTF-16 code units.
data Size = Between {-# UNPACK #-} !Int {-# UNPACK #-} !Int -- ^ Lower and upper bounds on size.
| Unknown -- ^ Unknown size.
deriving (Eq, Show)
Expand All @@ -46,6 +53,17 @@ exactly (Between na nb) | na == nb = Just na
exactly _ = Nothing
{-# INLINE exactly #-}

-- | The 'Size' of the given code point.
charSize :: Char -> Size
charSize c
| ord c < 0x10000 = exactSize 1
| otherwise = exactSize 2

-- | The 'Size' of @n@ code points.
codePointsSize :: Int -> Size
codePointsSize n = Between n (2*n)
{-# INLINE codePointsSize #-}

exactSize :: Int -> Size
exactSize n =
#if defined(ASSERTS)
Expand All @@ -71,6 +89,10 @@ betweenSize m n =
Between m n
{-# INLINE betweenSize #-}

unionSize :: Size -> Size -> Size
unionSize (Between a b) (Between c d) = Between (min a c) (max b d)
unionSize _ _ = Unknown

unknownSize :: Size
unknownSize = Unknown
{-# INLINE unknownSize #-}
Expand Down Expand Up @@ -140,11 +162,15 @@ lowerBound _ (Between n _) = n
lowerBound k _ = k
{-# INLINE lowerBound #-}

compareSize :: Size -> Int -> Maybe Ordering
compareSize (Between ma mb) n
| mb < n = Just LT
| ma > n = Just GT
| ma == n && mb == n = Just EQ
-- | Determine the ordering relationship between two 'Size's, or 'Nothing' in
-- the indeterminate case.
compareSize :: Size -> Size -> Maybe Ordering
compareSize (Between ma mb) (Between na nb)
| mb < na = Just LT
| ma > nb = Just GT
| ma == mb
, ma == na
, ma == nb = Just EQ
compareSize _ _ = Nothing


Expand Down

0 comments on commit 424451e

Please sign in to comment.