Permalink
Browse files

Rewrote XmlParser with global and local states to allow for adherence…

… to spec.

Ignore-this: e05acf779f1446caaceb5f6713c3a99a

darcs-hash:20110130203651-1786f-448a2b484ce133f58e5c02b84d044423715c1a4a.gz
  • Loading branch information...
1 parent a8b734f commit dfeabd84ecfc34617b6c1f9d3d4790ff62718547 @amccausl committed Jan 30, 2011
Showing with 72 additions and 36 deletions.
  1. +72 −36 src/Text/RDF/RDF4H/XmlParser.hs
@@ -16,13 +16,23 @@ import Text.XML.HXT.Core
import Data.ByteString.Lazy.Char8(ByteString)
import Data.String.Utils
+-- |Global state for the parser
+data GParseState = GParseState { stateGenId :: Int
+ }
+
+-- |Local state for the parser (dependant on the parent xml elements)
+data LParseState = LParseState { stateBaseUrl :: Maybe BaseUrl
+ , stateLang :: Maybe String
+ , stateSubject :: Subject
+ }
+
-- |Parse a xml ByteString to an RDF representation
parseXmlRDF :: forall rdf. (RDF rdf)
=> Maybe BaseUrl -- ^ The base URL for the RDF if required
-> Maybe ByteString -- ^ DocUrl: The request URL for the RDF if available
-> ByteString -- ^ The contents to parse
-> Either ParseFailure rdf -- ^ The RDF representation of the triples or ParseFailure
-parseXmlRDF bUrl dUrl xmlStr = case runSLA (xread >>> addMetaData bUrl dUrl >>> getRDF) 0 (b2s xmlStr) of
+parseXmlRDF bUrl dUrl xmlStr = case runSLA (xread >>> addMetaData bUrl dUrl >>> getRDF) (GParseState { stateGenId = 0 }) (b2s xmlStr) of
(_,r:rest) -> Right r
_ -> Left (ParseFailure "XML parsing failed")
@@ -40,51 +50,77 @@ addMetaData bUrlM dUrlM = mkelem "/"
mkBase Nothing = []
-- |Arrow that translates HXT XmlTree to an RDF representation
-getRDF :: forall rdf a. (RDF rdf, ArrowXml a, ArrowState Int a) => a XmlTree rdf
+getRDF :: forall rdf a. (RDF rdf, ArrowXml a, ArrowState GParseState a) => a XmlTree rdf
getRDF = proc xml -> do
rdf <- hasName "rdf:RDF" <<< isElem <<< getChildren -< xml
bUrl <- arr (Just . BaseUrl . s2b) <<< getAttrValue "transfer-URI" -< xml
prefixMap <- arr toPrefixMap <<< toAttrMap -< rdf
- triples <- (parseDescription <<< isElem <<< getChildren) >. id -< rdf
+ desc <- isElem <<< getChildren -< rdf
+ state <- arr (\(bUrl, o) -> LParseState bUrl Nothing o) <<< second(mkNode (LParseState Nothing Nothing undefined)) -< (bUrl, desc)
+ triples <- parseDescription >. id -< (state, desc)
returnA -< mkRdf triples bUrl prefixMap
where toAttrMap = (getAttrl >>> (getName &&& (getChildren >>> getText))) >. id
toPrefixMap = PrefixMappings . Map.fromList . map (\(n, m) -> (s2b (drop 6 n), s2b m)) . filter (startswith "xmlns:" . fst)
- parseDescription = proc desc -> do
- s0 <- mkNode -< desc
- (s, p, o) <- (arr2A getPredicatesFromAttr <+> arr2A getPredicatesFromChildren) -< (s0, desc)
- returnA -< Triple s p o
-
-getPredicatesFromChildren :: forall a. (ArrowXml a, ArrowState Int a) => Subject -> a XmlTree (Subject, Predicate, Object)
-getPredicatesFromChildren s0 = proc rdf -> do
- cp <- isElem <<< getChildren -< rdf
- p0 <- arr (unode . s2b) <<< getName -< cp
- co <- isElem <<< getChildren -< cp
- o0 <- mkNode -< co
- (s, p, o) <- arr2A constA <+> (wrapKnownDescription <<< arr snd) -< ((s0, p0, o0), (o0, co))
- returnA -< (s, p, o)
-
-recursiveParse :: forall a. (ArrowXml a, ArrowState Int a) => a ((Subject, Predicate, Object), (Object, XmlTree)) (Subject, Predicate, Object)
-recursiveParse = arr2A constA <+> (arr (snd) >>> wrapKnownDescription)
-wrapKnownDescription :: forall a. (ArrowXml a, ArrowState Int a) => a (Object, XmlTree) (Subject, Predicate, Object)
-wrapKnownDescription = arr2A parseKnownDescription
-
-parseKnownDescription :: forall a. (ArrowXml a, ArrowState Int a) => Subject -> a XmlTree (Subject, Predicate, Object)
-parseKnownDescription s0 = proc desc -> do
- (s, p, o) <- arr2A getPredicatesFromAttr <+> arr2A getPredicatesFromChildren -< (s0, desc)
- returnA -< (s, p, o)
-
-getPredicatesFromAttr :: forall a. (ArrowXml a, ArrowState Int a) => Subject -> a XmlTree (Subject, Predicate, Object)
-getPredicatesFromAttr s = getAttrl >>> ((getName >>> isA (/= "rdf:about") >>> (arr (unode . s2b))) &&& (getChildren >>> getText >>> arr (lnode . plainL . s2b))) >>> arr (attachSubject s)
-
-attachSubject :: Subject -> (Predicate, Object) -> (Subject, Predicate, Object)
-attachSubject s (p, o) = (s, p, o)
+-- |Read an rdf:Description tag to its corresponding Triples
+parseDescription :: forall a. (ArrowXml a, ArrowState GParseState a) => a (LParseState, XmlTree) Triple
+parseDescription = updateState
+ >>> (arr2A parsePredicatesFromAttr <+> parsePredicatesFromChildren)
+
+-- |Read the attributes of an rdf:Description element. These correspond to the Predicate Object pairs of the Triple
+parsePredicatesFromAttr :: forall a. (ArrowXml a, ArrowState GParseState a) => LParseState -> a XmlTree Triple
+parsePredicatesFromAttr s = getAttrl >>> ((getName >>> isA (/= "rdf:about") >>> (arr (unode . s2b))) &&& (getChildren >>> getText >>> arr (lnode . plainL . s2b))) >>> arr (attachSubject (stateSubject s))
+
+-- |Read a children of an rdf:Description element. These correspond to the Predicate portion of the Triple
+parsePredicatesFromChildren :: forall a. (ArrowXml a, ArrowState GParseState a) => a (LParseState, XmlTree) Triple
+parsePredicatesFromChildren = updateState
+ >>> choiceA [ second (hasAttrValue "rdf:parseType" (== "Literal")) :-> arr2A getLiteralTriple
+ , second (hasAttrValue "rdf:parseType" (== "Collection")) :-> arr2A getCollectionTriples
+ , second (hasAttr "rdf:resource") :-> arr2A getResourceTriple
+ , this :-> proc (state, predXml) -> do
+ p <- arr(unode . s2b) <<< getName -< predXml
+ desc <- isElem <<< getChildren -< predXml
+ o <- arr2A mkNode -< (state, desc)
+ t0 <- arr (\(s, (p, o)) -> Triple s p o) -< (stateSubject state, (p, o))
+ t <- arr fst <+> (parseDescription <<< arr snd) -< (t0, (state { stateSubject = o }, desc))
+ returnA -< t
+ ]
+
+attachSubject :: Subject -> (Predicate, Object) -> Triple
+attachSubject s (p, o) = Triple s p o
+
+-- |Updates the local state at a given node
+updateState :: forall a. (ArrowXml a, ArrowState GParseState a) => a (LParseState, XmlTree) (LParseState, XmlTree)
+updateState = (ifA (second (hasAttr "rdf:lang")) (arr2A readLang) (arr id))
+ >>> (ifA (second (hasAttr "xml:base")) (arr2A readBase) (arr id))
+ where readLang state = (getAttrValue0 "rdf:lang" >>> arr (\lang -> state { stateLang = Just lang } ) ) &&& arr id
+ readBase state = (getAttrValue0 "xml:base" >>> arr (\base -> state { stateBaseUrl = (Just . BaseUrl . s2b) base } ) ) &&& arr id
+
+-- |Read a Triple with an rdf:parseType of Literal
+getLiteralTriple :: forall a. (ArrowXml a, ArrowState GParseState a) => LParseState -> a XmlTree Triple
+getLiteralTriple state = ((getName >>> arr (unode . s2b)) &&& (xshow ( getChildren ) >>> arr (mkLiteralNode state))) >>> arr (attachSubject (stateSubject state))
+
+-- TODO
+getCollectionTriples :: forall a. (ArrowXml a, ArrowState GParseState a) => LParseState -> a XmlTree Triple
+getCollectionTriples state = none
+
+-- TODO: include use of BaseURL
+getResourceTriple :: forall a. (ArrowXml a, ArrowState GParseState a) => LParseState -> a XmlTree Triple
+getResourceTriple state = ((getName >>> arr (unode . s2b)) &&& (getAttrValue "rdf:resource" >>> arr (mkLiteralNode state))) >>> arr (attachSubject (stateSubject state))
-- |Read a Node from the "rdf:about" property or generate a blank node
-mkNode :: forall a b. (ArrowXml a, ArrowState Int a) => a XmlTree Node
-mkNode = ((getAttrValue0 "rdf:about" `orElse` getAttrValue0 "rdf:resource") >>> arr (unode . s2b)) `orElse` mkBlankNode
+mkNode :: forall a. (ArrowXml a, ArrowState GParseState a) => LParseState -> a XmlTree Node
+mkNode s = choiceA [ hasAttr "rdf:about" :-> (getAttrValue "rdf:about" >>> arr (unode . s2b))
+ , hasAttr "rdf:resource" :-> (getAttrValue "rdf:resource" >>> arr (unode . s2b))
+ , this :-> mkBlankNode
+ ]
+
+-- |Use the given state to create a literal node
+mkLiteralNode :: LParseState -> String -> Node
+mkLiteralNode (LParseState _ (Just lang) _) content = (lnode (plainLL (s2b content) (s2b lang)))
+mkLiteralNode (LParseState _ Nothing _) content = (lnode . plainL . s2b) content
-- |Generate an RDF blank node with incrementing IDs from the arrow state
-mkBlankNode :: forall a b. (ArrowState Int a) => a b Node
-mkBlankNode = nextState (+1) >>> arr (BNodeGen)
+mkBlankNode :: forall a b. (ArrowState GParseState a) => a b Node
+mkBlankNode = nextState (\gState -> gState { stateGenId = stateGenId gState + 1 } ) >>> arr (BNodeGen . stateGenId)

0 comments on commit dfeabd8

Please sign in to comment.