Permalink
Browse files

Clean up and document the stricter parsing functions.

--HG--
rename : Data/Aeson/Parser.hs => Data/Aeson/Parser/Internal.hs
  • Loading branch information...
1 parent 323c18f commit 33cf2475f96c64b5f81439596a351e090f77d8fb @bos committed Dec 1, 2011
Showing with 341 additions and 197 deletions.
  1. +17 −7 Data/Aeson.hs
  2. +17 −7 Data/Aeson/Generic.hs
  3. +33 −182 Data/Aeson/Parser.hs
  4. +249 −0 Data/Aeson/Parser/Internal.hs
  5. +1 −0 aeson.cabal
  6. +6 −1 benchmarks/bench-parse.py
  7. +18 −0 release-notes.markdown
View
@@ -14,6 +14,7 @@ module Data.Aeson
(
-- * Encoding and decoding
decode
+ , decode'
, encode
-- * Core JSON types
, Value(..)
@@ -34,21 +35,30 @@ module Data.Aeson
, object
-- * Parsing
, json
+ , json'
) where
import Data.Aeson.Encode (encode)
-import Data.Aeson.Parser (json)
+import Data.Aeson.Parser.Internal (decodeWith, json, json')
import Data.Aeson.Types
import qualified Data.ByteString.Lazy as L
-import qualified Data.Attoparsec.Lazy as L
-- | Efficiently deserialize a JSON value from a lazy 'L.ByteString'.
-- If this fails due to incomplete or invalid input, 'Nothing' is
-- returned.
+--
+-- This function parses immediately, but defers conversion. See
+-- 'json' for details.
decode :: (FromJSON a) => L.ByteString -> Maybe a
-decode s = case L.parse json s of
- L.Done _ v -> case fromJSON v of
- Success a -> Just a
- _ -> Nothing
- _ -> Nothing
+decode = decodeWith json fromJSON
{-# INLINE decode #-}
+
+-- | Efficiently deserialize a JSON value from a lazy 'L.ByteString'.
+-- If this fails due to incomplete or invalid input, 'Nothing' is
+-- returned.
+--
+-- This function parses and performs conversion immediately. See
+-- 'json'' for details.
+decode' :: (FromJSON a) => L.ByteString -> Maybe a
+decode' = decodeWith json' fromJSON
+{-# INLINE decode' #-}
View
@@ -18,6 +18,7 @@ module Data.Aeson.Generic
(
-- * Decoding and encoding
decode
+ , decode'
, encode
-- * Lower-level conversion functions
, fromJSON
@@ -39,11 +40,10 @@ import Data.Text (Text, pack, unpack)
import Data.Text.Encoding (encodeUtf8)
import Data.Time.Clock (UTCTime)
import Data.Word (Word, Word8, Word16, Word32, Word64)
-import Data.Aeson.Parser (json)
+import Data.Aeson.Parser.Internal (decodeWith, json, json')
import qualified Data.Aeson.Encode as E
import qualified Data.Aeson.Functions as F
import qualified Data.Aeson.Types as T
-import qualified Data.Attoparsec.Lazy as L
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as L
import qualified Data.HashMap.Strict as H
@@ -62,12 +62,22 @@ encode = E.encode . toJSON
-- | Efficiently deserialize a JSON value from a lazy 'L.ByteString'.
-- If this fails due to incomplete or invalid input, 'Nothing' is
-- returned.
+--
+-- This function parses immediately, but defers conversion. See
+-- 'json' for details.
decode :: (Data a) => L.ByteString -> Maybe a
-decode s = case L.parse json s of
- L.Done _ v -> case fromJSON v of
- Success a -> Just a
- _ -> Nothing
- _ -> Nothing
+decode = decodeWith json fromJSON
+{-# INLINE decode #-}
+
+-- | Efficiently deserialize a JSON value from a lazy 'L.ByteString'.
+-- If this fails due to incomplete or invalid input, 'Nothing' is
+-- returned.
+--
+-- This function parses and performs conversion immediately. See
+-- 'json'' for details.
+decode' :: (Data a) => L.ByteString -> Maybe a
+decode' = decodeWith json' fromJSON
+{-# INLINE decode' #-}
type T a = a -> Value
View
@@ -10,196 +10,47 @@
--
-- Efficiently and correctly parse a JSON string. The string must be
-- encoded as UTF-8.
+--
+-- It can be useful to think of parsing as occurring in two phases:
+--
+-- * Identification of the textual boundaries of a JSON value. This
+-- is always strict, so that an invalid JSON document can be
+-- rejected as soon as possible.
+--
+-- * Conversion of a JSON value to a Haskell value. This may be
+-- either immediate (strict) or deferred (lazy); see below for
+-- details.
module Data.Aeson.Parser
(
+ -- * Lazy parsers
+ -- $lazy
json
- , json'
, value
- , value'
, jstring
+ -- * Strict parsers
+ -- $strict
+ , json'
+ , value'
) where
-import Blaze.ByteString.Builder (fromByteString, toByteString)
-import Blaze.ByteString.Builder.Char.Utf8 (fromChar)
-import Blaze.ByteString.Builder.Word (fromWord8)
-import Control.Applicative as A
-import Data.Aeson.Types (Value(..))
-import Data.Attoparsec.Char8
-import Data.Bits ((.|.), shiftL)
-import Data.ByteString as B
-import Data.Char (chr)
-import Data.Monoid (mappend, mempty)
-import Data.Text as T
-import Data.Text.Encoding (decodeUtf8)
-import Data.Vector as Vector hiding ((++))
-import Data.Word (Word8)
-import qualified Data.Attoparsec as A
-import qualified Data.Attoparsec.Zepto as Z
-import qualified Data.ByteString.Char8 as B8
-import qualified Data.ByteString.Unsafe as B
-import qualified Data.HashMap.Strict as H
-
--- | Parse a top-level JSON value. This must be either an object or
--- an array.
-json :: Parser Value
-json = json_ object_ array_
+import Data.Aeson.Parser.Internal (json, json', jstring, value, value')
--- | Parse a top-level JSON value. This must be either an object or
--- an array.
+-- $lazy
--
--- This is a strict parser version of 'json' which avoids
--- building up thunks during parsing. Prefer this version if most of
--- the JSON data needs to be accessed.
-json' :: Parser Value
-json' = json_ object_' array_'
-
-json_ :: Parser Value -> Parser Value -> Parser Value
-json_ obj ary = do
- w <- skipSpace *> A.satisfy (\w -> w == 123 || w == 91)
- if w == 123
- then obj
- else ary
-{-# INLINE json_ #-}
-
-object_ :: Parser Value
-object_ = {-# SCC "object_" #-} Object <$> objectValues value
-
-object_' :: Parser Value
-object_' = {-# SCC "object_'" #-} do
- !vals <- objectValues value'
- return (Object vals)
-
-objectValues :: Parser Value -> Parser (H.HashMap Text Value)
-objectValues val = do
- skipSpace
- let pair = do
- a <- jstring <* skipSpace
- b <- char ':' *> skipSpace *> val
- return (a,b)
- vals <- ((pair <* skipSpace) `sepBy` (char ',' *> skipSpace)) <* char '}'
- return (H.fromList vals)
-{-# INLINE objectValues #-}
-
-array_ :: Parser Value
-array_ = {-# SCC "array_" #-} Array <$> arrayValues value
-
-array_' :: Parser Value
-array_' = {-# SCC "array_'" #-} do
- !vals <- arrayValues value'
- return (Array vals)
-
-arrayValues :: Parser Value -> Parser (Vector Value)
-arrayValues val = do
- skipSpace
- vals <- ((val <* skipSpace) `sepBy` (char ',' *> skipSpace)) <* char ']'
- return (Vector.fromList vals)
-{-# INLINE arrayValues #-}
-
--- | Parse any JSON value. Use 'json' in preference to this function
--- if you are parsing data from an untrusted source.
-value :: Parser Value
-value = most <|> (Number <$> number)
- where
- most = do
- c <- satisfy (`B8.elem` "{[\"ftn")
- case c of
- '{' -> object_
- '[' -> array_
- '"' -> String <$> jstring_
- 'f' -> string "alse" *> pure (Bool False)
- 't' -> string "rue" *> pure (Bool True)
- 'n' -> string "ull" *> pure Null
- _ -> error "attoparsec panic! the impossible happened!"
-
--- | Strict version of 'value'. See also 'json''.
-value' :: Parser Value
-value' = most <|> num
- where
- most = do
- c <- satisfy (`B8.elem` "{[\"ftn")
- case c of
- '{' -> object_'
- '[' -> array_'
- '"' -> do
- !s <- jstring_
- return (String s)
- 'f' -> string "alse" *> pure (Bool False)
- 't' -> string "rue" *> pure (Bool True)
- 'n' -> string "ull" *> pure Null
- _ -> error "attoparsec panic! the impossible happened!"
- num = do
- !n <- number
- return (Number n)
-
-doubleQuote, backslash :: Word8
-doubleQuote = 34
-backslash = 92
-{-# INLINE backslash #-}
-{-# INLINE doubleQuote #-}
-
-jstring :: Parser Text
-jstring = A.word8 doubleQuote *> jstring_
-
--- | Parse a string without a leading quote.
-jstring_ :: Parser Text
-jstring_ = {-# SCC "jstring_" #-} do
- s <- A.scan False $ \s c -> if s then Just False
- else if c == doubleQuote
- then Nothing
- else Just (c == backslash)
- _ <- A.word8 doubleQuote
- if backslash `B.elem` s
- then case Z.parse unescape s of
- Right r -> return (decodeUtf8 r)
- Left err -> fail err
- else return (decodeUtf8 s)
-{-# INLINE jstring_ #-}
-
-unescape :: Z.Parser ByteString
-unescape = toByteString <$> go mempty where
- go acc = do
- h <- Z.takeWhile (/=backslash)
- let rest = do
- start <- Z.take 2
- let !slash = B.unsafeHead start
- !t = B.unsafeIndex start 1
- escape = case B.findIndex (==t) "\"\\/ntbrfu" of
- Just i -> i
- _ -> 255
- if slash /= backslash || escape == 255
- then fail "invalid JSON escape sequence"
- else do
- let cont m = go (acc `mappend` fromByteString h `mappend` m)
- {-# INLINE cont #-}
- if t /= 117 -- 'u'
- then cont (fromWord8 (B.unsafeIndex mapping escape))
- else do
- a <- hexQuad
- if a < 0xd800 || a > 0xdfff
- then cont (fromChar (chr a))
- else do
- b <- Z.string "\\u" *> hexQuad
- if a <= 0xdbff && b >= 0xdc00 && b <= 0xdfff
- then let !c = ((a - 0xd800) `shiftL` 10) +
- (b - 0xdc00) + 0x10000
- in cont (fromChar (chr c))
- else fail "invalid UTF-16 surrogates"
- done <- Z.atEnd
- if done
- then return (acc `mappend` fromByteString h)
- else rest
- mapping = "\"\\/\n\t\b\r\f"
+-- The 'json' and 'value' parsers decouple identification from
+-- conversion. Identification occurs immediately (so that an invalid
+-- JSON document can be rejected as early as possible), but conversion
+-- to a Haskell value is deferred until that value is needed.
+--
+-- This decoupling can be time-efficient if only a smallish subset of
+-- elements in a JSON value need to be inspected, since the cost of
+-- conversion is zero for uninspected elements. The trade off is an
+-- increase in memory usage, due to allocation of thunks for values
+-- that have not yet been converted.
-hexQuad :: Z.Parser Int
-hexQuad = do
- s <- Z.take 4
- let hex n | w >= 48 && w <= 57 = w - 48
- | w >= 97 && w <= 122 = w - 87
- | w >= 65 && w <= 90 = w - 55
- | otherwise = 255
- where w = fromIntegral $ B.unsafeIndex s n
- a = hex 0; b = hex 1; c = hex 2; d = hex 3
- if (a .|. b .|. c .|. d) /= 255
- then return $! d .|. (c `shiftL` 4) .|. (b `shiftL` 8) .|. (a `shiftL` 12)
- else fail "invalid hex escape"
+-- $strict
+--
+-- The 'json'' and 'value'' parsers combine identification with
+-- conversion. They consume more CPU cycles up front, but have a
+-- smaller memory footprint.
Oops, something went wrong.

0 comments on commit 33cf247

Please sign in to comment.