Navigation Menu

Skip to content

Commit

Permalink
Rewrote XmlParser with global and local states to allow for adherence…
Browse files Browse the repository at this point in the history
… to spec.

Ignore-this: e05acf779f1446caaceb5f6713c3a99a

darcs-hash:20110130203651-1786f-448a2b484ce133f58e5c02b84d044423715c1a4a.gz
  • Loading branch information
amccausl committed Jan 30, 2011
1 parent a8b734f commit dfeabd8
Showing 1 changed file with 72 additions and 36 deletions.
108 changes: 72 additions & 36 deletions src/Text/RDF/RDF4H/XmlParser.hs
Expand Up @@ -16,13 +16,23 @@ import Text.XML.HXT.Core
import Data.ByteString.Lazy.Char8(ByteString)
import Data.String.Utils

-- |Global state for the parser
data GParseState = GParseState { stateGenId :: Int
}

-- |Local state for the parser (dependant on the parent xml elements)
data LParseState = LParseState { stateBaseUrl :: Maybe BaseUrl
, stateLang :: Maybe String
, stateSubject :: Subject
}

-- |Parse a xml ByteString to an RDF representation
parseXmlRDF :: forall rdf. (RDF rdf)
=> Maybe BaseUrl -- ^ The base URL for the RDF if required
-> Maybe ByteString -- ^ DocUrl: The request URL for the RDF if available
-> ByteString -- ^ The contents to parse
-> Either ParseFailure rdf -- ^ The RDF representation of the triples or ParseFailure
parseXmlRDF bUrl dUrl xmlStr = case runSLA (xread >>> addMetaData bUrl dUrl >>> getRDF) 0 (b2s xmlStr) of
parseXmlRDF bUrl dUrl xmlStr = case runSLA (xread >>> addMetaData bUrl dUrl >>> getRDF) (GParseState { stateGenId = 0 }) (b2s xmlStr) of
(_,r:rest) -> Right r
_ -> Left (ParseFailure "XML parsing failed")

Expand All @@ -40,51 +50,77 @@ addMetaData bUrlM dUrlM = mkelem "/"
mkBase Nothing = []

-- |Arrow that translates HXT XmlTree to an RDF representation
getRDF :: forall rdf a. (RDF rdf, ArrowXml a, ArrowState Int a) => a XmlTree rdf
getRDF :: forall rdf a. (RDF rdf, ArrowXml a, ArrowState GParseState a) => a XmlTree rdf
getRDF = proc xml -> do
rdf <- hasName "rdf:RDF" <<< isElem <<< getChildren -< xml
bUrl <- arr (Just . BaseUrl . s2b) <<< getAttrValue "transfer-URI" -< xml
prefixMap <- arr toPrefixMap <<< toAttrMap -< rdf
triples <- (parseDescription <<< isElem <<< getChildren) >. id -< rdf
desc <- isElem <<< getChildren -< rdf
state <- arr (\(bUrl, o) -> LParseState bUrl Nothing o) <<< second(mkNode (LParseState Nothing Nothing undefined)) -< (bUrl, desc)
triples <- parseDescription >. id -< (state, desc)
returnA -< mkRdf triples bUrl prefixMap
where toAttrMap = (getAttrl >>> (getName &&& (getChildren >>> getText))) >. id
toPrefixMap = PrefixMappings . Map.fromList . map (\(n, m) -> (s2b (drop 6 n), s2b m)) . filter (startswith "xmlns:" . fst)
parseDescription = proc desc -> do
s0 <- mkNode -< desc
(s, p, o) <- (arr2A getPredicatesFromAttr <+> arr2A getPredicatesFromChildren) -< (s0, desc)
returnA -< Triple s p o

getPredicatesFromChildren :: forall a. (ArrowXml a, ArrowState Int a) => Subject -> a XmlTree (Subject, Predicate, Object)
getPredicatesFromChildren s0 = proc rdf -> do
cp <- isElem <<< getChildren -< rdf
p0 <- arr (unode . s2b) <<< getName -< cp
co <- isElem <<< getChildren -< cp
o0 <- mkNode -< co
(s, p, o) <- arr2A constA <+> (wrapKnownDescription <<< arr snd) -< ((s0, p0, o0), (o0, co))
returnA -< (s, p, o)

recursiveParse :: forall a. (ArrowXml a, ArrowState Int a) => a ((Subject, Predicate, Object), (Object, XmlTree)) (Subject, Predicate, Object)
recursiveParse = arr2A constA <+> (arr (snd) >>> wrapKnownDescription)

wrapKnownDescription :: forall a. (ArrowXml a, ArrowState Int a) => a (Object, XmlTree) (Subject, Predicate, Object)
wrapKnownDescription = arr2A parseKnownDescription

parseKnownDescription :: forall a. (ArrowXml a, ArrowState Int a) => Subject -> a XmlTree (Subject, Predicate, Object)
parseKnownDescription s0 = proc desc -> do
(s, p, o) <- arr2A getPredicatesFromAttr <+> arr2A getPredicatesFromChildren -< (s0, desc)
returnA -< (s, p, o)

getPredicatesFromAttr :: forall a. (ArrowXml a, ArrowState Int a) => Subject -> a XmlTree (Subject, Predicate, Object)
getPredicatesFromAttr s = getAttrl >>> ((getName >>> isA (/= "rdf:about") >>> (arr (unode . s2b))) &&& (getChildren >>> getText >>> arr (lnode . plainL . s2b))) >>> arr (attachSubject s)

attachSubject :: Subject -> (Predicate, Object) -> (Subject, Predicate, Object)
attachSubject s (p, o) = (s, p, o)
-- |Read an rdf:Description tag to its corresponding Triples
parseDescription :: forall a. (ArrowXml a, ArrowState GParseState a) => a (LParseState, XmlTree) Triple
parseDescription = updateState
>>> (arr2A parsePredicatesFromAttr <+> parsePredicatesFromChildren)

-- |Read the attributes of an rdf:Description element. These correspond to the Predicate Object pairs of the Triple
parsePredicatesFromAttr :: forall a. (ArrowXml a, ArrowState GParseState a) => LParseState -> a XmlTree Triple
parsePredicatesFromAttr s = getAttrl >>> ((getName >>> isA (/= "rdf:about") >>> (arr (unode . s2b))) &&& (getChildren >>> getText >>> arr (lnode . plainL . s2b))) >>> arr (attachSubject (stateSubject s))

-- |Read a children of an rdf:Description element. These correspond to the Predicate portion of the Triple
parsePredicatesFromChildren :: forall a. (ArrowXml a, ArrowState GParseState a) => a (LParseState, XmlTree) Triple
parsePredicatesFromChildren = updateState
>>> choiceA [ second (hasAttrValue "rdf:parseType" (== "Literal")) :-> arr2A getLiteralTriple
, second (hasAttrValue "rdf:parseType" (== "Collection")) :-> arr2A getCollectionTriples
, second (hasAttr "rdf:resource") :-> arr2A getResourceTriple
, this :-> proc (state, predXml) -> do
p <- arr(unode . s2b) <<< getName -< predXml
desc <- isElem <<< getChildren -< predXml
o <- arr2A mkNode -< (state, desc)
t0 <- arr (\(s, (p, o)) -> Triple s p o) -< (stateSubject state, (p, o))
t <- arr fst <+> (parseDescription <<< arr snd) -< (t0, (state { stateSubject = o }, desc))
returnA -< t
]

attachSubject :: Subject -> (Predicate, Object) -> Triple
attachSubject s (p, o) = Triple s p o

-- |Updates the local state at a given node
updateState :: forall a. (ArrowXml a, ArrowState GParseState a) => a (LParseState, XmlTree) (LParseState, XmlTree)
updateState = (ifA (second (hasAttr "rdf:lang")) (arr2A readLang) (arr id))
>>> (ifA (second (hasAttr "xml:base")) (arr2A readBase) (arr id))
where readLang state = (getAttrValue0 "rdf:lang" >>> arr (\lang -> state { stateLang = Just lang } ) ) &&& arr id
readBase state = (getAttrValue0 "xml:base" >>> arr (\base -> state { stateBaseUrl = (Just . BaseUrl . s2b) base } ) ) &&& arr id

-- |Read a Triple with an rdf:parseType of Literal
getLiteralTriple :: forall a. (ArrowXml a, ArrowState GParseState a) => LParseState -> a XmlTree Triple
getLiteralTriple state = ((getName >>> arr (unode . s2b)) &&& (xshow ( getChildren ) >>> arr (mkLiteralNode state))) >>> arr (attachSubject (stateSubject state))

-- TODO
getCollectionTriples :: forall a. (ArrowXml a, ArrowState GParseState a) => LParseState -> a XmlTree Triple
getCollectionTriples state = none

-- TODO: include use of BaseURL
getResourceTriple :: forall a. (ArrowXml a, ArrowState GParseState a) => LParseState -> a XmlTree Triple
getResourceTriple state = ((getName >>> arr (unode . s2b)) &&& (getAttrValue "rdf:resource" >>> arr (mkLiteralNode state))) >>> arr (attachSubject (stateSubject state))

-- |Read a Node from the "rdf:about" property or generate a blank node
mkNode :: forall a b. (ArrowXml a, ArrowState Int a) => a XmlTree Node
mkNode = ((getAttrValue0 "rdf:about" `orElse` getAttrValue0 "rdf:resource") >>> arr (unode . s2b)) `orElse` mkBlankNode
mkNode :: forall a. (ArrowXml a, ArrowState GParseState a) => LParseState -> a XmlTree Node
mkNode s = choiceA [ hasAttr "rdf:about" :-> (getAttrValue "rdf:about" >>> arr (unode . s2b))
, hasAttr "rdf:resource" :-> (getAttrValue "rdf:resource" >>> arr (unode . s2b))
, this :-> mkBlankNode
]

-- |Use the given state to create a literal node
mkLiteralNode :: LParseState -> String -> Node
mkLiteralNode (LParseState _ (Just lang) _) content = (lnode (plainLL (s2b content) (s2b lang)))
mkLiteralNode (LParseState _ Nothing _) content = (lnode . plainL . s2b) content

-- |Generate an RDF blank node with incrementing IDs from the arrow state
mkBlankNode :: forall a b. (ArrowState Int a) => a b Node
mkBlankNode = nextState (+1) >>> arr (BNodeGen)
mkBlankNode :: forall a b. (ArrowState GParseState a) => a b Node
mkBlankNode = nextState (\gState -> gState { stateGenId = stateGenId gState + 1 } ) >>> arr (BNodeGen . stateGenId)

0 comments on commit dfeabd8

Please sign in to comment.