Permalink
Browse files

Seemingly sane (!?) changes to string unescaping

  • Loading branch information...
bos committed May 11, 2015
1 parent deb5982 commit 05c9e0cbbebc861303fc7dd6b3dfd03844490621
Showing with 71 additions and 21 deletions.
  1. +69 −20 Data/Aeson/Parser/Internal.hs
  2. +2 −1 aeson.cabal
@@ -28,26 +28,31 @@ module Data.Aeson.Parser.Internal
, eitherDecodeStrictWith
) where
import Data.ByteString.Builder
(Builder, byteString, toLazyByteString, charUtf8, word8)
import Control.Applicative ((*>), (<$>), (<*), liftA2, pure)
import Control.Monad.IO.Class (liftIO)
import Data.Aeson.Types (Result(..), Value(..))
import Data.Attoparsec.ByteString.Char8 (Parser, char, endOfInput, scientific,
skipSpace, string)
import Data.Bits ((.|.), shiftL)
import Data.ByteString (ByteString)
import Data.ByteString.Internal (ByteString(..))
import Data.Char (chr)
import Data.Monoid (mappend, mempty)
import Data.Text (Text)
import Data.Text.Encoding (decodeUtf8')
import Data.Text.Internal.Encoding.Utf8 (ord2, ord3, ord4)
import Data.Text.Internal.Unsafe.Char (ord)
import Data.Vector as Vector (Vector, fromList)
import Data.Word (Word8)
import Foreign.ForeignPtr (withForeignPtr)
import Foreign.Ptr (minusPtr)
import Foreign.Ptr (Ptr, plusPtr)
import Foreign.Storable (poke)
import System.IO.Unsafe (unsafePerformIO)
import qualified Data.Attoparsec.ByteString as A
import qualified Data.Attoparsec.Lazy as L
import qualified Data.Attoparsec.Zepto as Z
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString.Internal as B
import qualified Data.ByteString.Unsafe as B
import qualified Data.HashMap.Strict as H
@@ -198,7 +203,7 @@ jstring_ = {-# SCC "jstring_" #-} do
else Just (c == BACKSLASH)
_ <- A.word8 DOUBLE_QUOTE
s1 <- if BACKSLASH `B.elem` s
then case Z.parse unescape s of
then case unescape s of
Right r -> return r
Left err -> fail err
else return s
@@ -209,9 +214,24 @@ jstring_ = {-# SCC "jstring_" #-} do
{-# INLINE jstring_ #-}
unescape :: Z.Parser ByteString
unescape = toByteString <$> go mempty where
go acc = do
unescape :: ByteString -> Either String ByteString
unescape s = unsafePerformIO $ do
let len = B.length s
fp <- B.mallocByteString len
-- We perform no bounds checking when writing to the destination
-- string, as unescaping always makes it shorter than the source.
withForeignPtr fp $ \ptr -> do
ret <- Z.parseT (go ptr) s
case ret of
Left err -> return (Left err)
Right p -> do
let newlen = p `minusPtr` ptr
slop = len - newlen
Right <$> if slop >= 128 && slop >= len `quot` 4
then B.create newlen $ \np -> B.memcpy np ptr newlen
else return (PS fp 0 newlen)
where
go ptr = do
h <- Z.takeWhile (/=BACKSLASH)
let rest = do
start <- Z.take 2
@@ -222,29 +242,27 @@ unescape = toByteString <$> go mempty where
_ -> 255
if slash /= BACKSLASH || escape == 255
then fail "invalid JSON escape sequence"
else do
let cont m = go (acc `mappend` byteString h `mappend` m)
{-# INLINE cont #-}
else
if t /= 117 -- 'u'
then cont (word8 (B.unsafeIndex mapping escape))
then copy h ptr >>= word8 (B.unsafeIndex mapping escape) >>= go
else do
a <- hexQuad
if a < 0xd800 || a > 0xdfff
then cont (charUtf8 (chr a))
then copy h ptr >>= charUtf8 (chr a) >>= go
else do
b <- Z.string "\\u" *> hexQuad
if a <= 0xdbff && b >= 0xdc00 && b <= 0xdfff
then let !c = ((a - 0xd800) `shiftL` 10) +
(b - 0xdc00) + 0x10000
in cont (charUtf8 (chr c))
in copy h ptr >>= charUtf8 (chr c) >>= go
else fail "invalid UTF-16 surrogates"
done <- Z.atEnd
if done
then return (acc `mappend` byteString h)
then copy h ptr
else rest
mapping = "\"\\/\n\t\b\r\f"
hexQuad :: Z.Parser Int
hexQuad :: Z.ZeptoT IO Int
hexQuad = do
s <- Z.take 4
let hex n | w >= C_0 && w <= C_9 = w - C_0
@@ -321,6 +339,37 @@ jsonEOF = json <* skipSpace <* endOfInput
jsonEOF' :: Parser Value
jsonEOF' = json' <* skipSpace <* endOfInput
toByteString :: Builder -> ByteString
toByteString = L.toStrict . toLazyByteString
{-# INLINE toByteString #-}
word8 :: Word8 -> Ptr Word8 -> Z.ZeptoT IO (Ptr Word8)
word8 w ptr = do
liftIO $ poke ptr w
return $! ptr `plusPtr` 1
copy :: ByteString -> Ptr Word8 -> Z.ZeptoT IO (Ptr Word8)
copy (PS fp off len) ptr =
liftIO . withForeignPtr fp $ \src -> do
B.memcpy ptr (src `plusPtr` off) len
return $! ptr `plusPtr` len
charUtf8 :: Char -> Ptr Word8 -> Z.ZeptoT IO (Ptr Word8)
charUtf8 ch ptr
| ch < '\x80' = liftIO $ do
poke ptr (fromIntegral (ord ch))
return $! ptr `plusPtr` 1
| ch < '\x800' = liftIO $ do
let (a,b) = ord2 ch
poke ptr a
poke (ptr `plusPtr` 1) b
return $! ptr `plusPtr` 2
| ch < '\xffff' = liftIO $ do
let (a,b,c) = ord3 ch
poke ptr a
poke (ptr `plusPtr` 1) b
poke (ptr `plusPtr` 2) c
return $! ptr `plusPtr` 3
| otherwise = liftIO $ do
let (a,b,c,d) = ord4 ch
poke ptr a
poke (ptr `plusPtr` 1) b
poke (ptr `plusPtr` 2) c
poke (ptr `plusPtr` 3) d
return $! ptr `plusPtr` 4
View
@@ -99,7 +99,7 @@ library
Data.Aeson.Types.Internal
build-depends:
attoparsec >= 0.11.3.4,
attoparsec >= 0.13.0.0,
base == 4.*,
bytestring >= 0.10.4.0,
containers,
@@ -112,6 +112,7 @@ library
syb,
template-haskell >= 2.4,
text >= 1.1.1.0,
transformers,
unordered-containers >= 0.2.3.0,
vector >= 0.7.1

2 comments on commit 05c9e0c

@HaskellZhangSong

This comment has been minimized.

Show comment
Hide comment
@HaskellZhangSong

HaskellZhangSong May 14, 2015

It seems that you are manually operating ByteString and Text with very low level operations. Dose that just affect stream with backslash? Maybe run an other performance test that can be compared with original versions of aeson

It seems that you are manually operating ByteString and Text with very low level operations. Dose that just affect stream with backslash? Maybe run an other performance test that can be compared with original versions of aeson

@patmoore

This comment has been minimized.

Show comment
Hide comment
@patmoore

patmoore Jun 26, 2015

Probably should reference your blog post so devs understand the reason for change: http://www.serpentine.com/blog/2015/05/13/sometimes-the-old-ways-are-the-best/

I don't understand why the code itself doesn't have comments about why a low-level api is used. Right now anyone ready the code is confused as to why this code is low-level. A maintainer that comes after you will be tempted to reverse the change.

Probably should reference your blog post so devs understand the reason for change: http://www.serpentine.com/blog/2015/05/13/sometimes-the-old-ways-are-the-best/

I don't understand why the code itself doesn't have comments about why a low-level api is used. Right now anyone ready the code is confused as to why this code is low-level. A maintainer that comes after you will be tempted to reverse the change.

Please sign in to comment.