-
Notifications
You must be signed in to change notification settings - Fork 2
/
NTriplesParser.hs
231 lines (190 loc) · 8.98 KB
/
NTriplesParser.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
{-# LANGUAGE RankNTypes #-}
-- |A parser for RDF in N-Triples format
-- <http://www.w3.org/TR/rdf-testcases/#ntriples>.
module Text.RDF.RDF4H.NTriplesParser(
NTriplesParser(NTriplesParser),
parseNTriplesRDF
) where
-- TODO: switch to OverloadedStrings and use ByteString literals (?).
import Data.RDF
import Data.Char(isLetter, isDigit, isLower)
import qualified Data.Map as Map
import Text.Parsec
import Text.Parsec.ByteString.Lazy
import Data.ByteString.Lazy.Char8(ByteString)
import qualified Data.ByteString.Lazy.Char8 as B
-- |NTriplesParser is an 'RdfParser' implementation for parsing RDF in the
-- NTriples format. It requires no configuration options. To use this parser,
-- pass an 'NTriplesParser' value as the first argument to any of the
-- 'parseString', 'parseFile', or 'parseURL' methods of the 'RdfParser' type
-- class.
data NTriplesParser = NTriplesParser
-- |'NTriplesParser' is an instance of 'RdfParser'.
instance RdfParser NTriplesParser where
parseString _ bs = handleParse mkRdf (runParser nt_ntripleDoc () "" bs)
parseFile _ path = B.readFile path >>= return . runParser nt_ntripleDoc () path >>= return . handleParse mkRdf
parseURL p url = parseURL' (parseString p) url
parseNTriplesRDF :: forall rdf. (RDF rdf)
=> ByteString -- ^ The contents to parse
-> Either ParseFailure rdf -- ^ The RDF representation of the triples or ParseFailure
parseNTriplesRDF bs = handleParse mkRdf (runParser nt_ntripleDoc () "" bs)
-- We define or redefine all here using same names as the spec, but with an
-- 'nt_' prefix in order to avoid name clashes (e.g., ntripleDoc becomes
-- nt_ntripleDoc).
-- |nt_ntripleDoc is simply zero or more lines.
nt_ntripleDoc :: GenParser ByteString () [Maybe Triple]
nt_ntripleDoc = manyTill nt_line eof
nt_line :: GenParser ByteString () (Maybe Triple)
nt_line =
skipMany nt_space >>
(nt_comment <|> nt_triple <|> nt_empty) >>=
\res -> nt_eoln >> return res
-- A comment consists of an initial # character, followed by any number of
-- characters except cr or lf. The spec is redundant in specifying that
-- comment is hash followed by "character - (cr | lf)", since character
-- is already defined as the range #x0020-#x007E, so cr #x000D and
-- lf #x000A are both already excluded. This returns Nothing as we are
-- ignoring comments for now.
nt_comment :: GenParser ByteString () (Maybe Triple)
nt_comment = char '#' >> skipMany nt_character >> return Nothing
-- A triple consists of whitespace-delimited subject, predicate, and object,
-- followed by optional whitespace and a period, and possibly more
-- whitespace.
nt_triple :: GenParser ByteString () (Maybe Triple)
nt_triple =
do
subj <- nt_subject
skipMany1 nt_space
pred <- nt_predicate
skipMany1 nt_space
obj <- nt_object
skipMany nt_space
char '.'
many nt_space
return $ Just (Triple subj pred obj)
-- A literal is either a language literal (with optional language
-- specified) or a datatype literal (with required datatype
-- specified). The literal value is always enclosed in double
-- quotes. A language literal may have '@' after the closing quote,
-- followed by a language specifier. A datatype literal follows
-- the closing quote with ^^ followed by the URI of the datatype.
nt_literal :: GenParser ByteString () LValue
nt_literal =
do lit_str <- between_chars '"' '"' inner_literal
(char '@' >> nt_language >>= return . plainLL lit_str) <|>
(count 2 (char '^') >> nt_uriref >>= return . typedL lit_str . mkFastString) <|>
(return $ plainL lit_str)
where inner_literal = (manyTill inner_string (lookAhead $ char '"') >>= return . B.concat)
-- A language specifier of a language literal is any number of lowercase
-- letters followed by any number of blocks consisting of a hyphen followed
-- by one or more lowercase letters or digits.
nt_language :: GenParser ByteString () ByteString
nt_language =
do str <- fmap B.pack (many (satisfy (\ c -> c == '-' || isLower c)))
if B.null str || B.last str == '-' || B.head str == '-'
then fail ("Invalid language string: '" ++ B.unpack str ++ "'")
else return str
-- nt_empty is a line that isn't a comment or a triple. They appear in the
-- parsed output as Nothing, whereas a real triple appears as (Just triple).
nt_empty :: GenParser ByteString () (Maybe Triple)
nt_empty = skipMany nt_space >> return Nothing
-- A subject is either a URI reference for a resource or a node id for a
-- blank node.
nt_subject :: GenParser ByteString () Node
nt_subject = fmap unode nt_uriref
<|>fmap bnode nt_nodeID
-- A predicate may only be a URI reference to a resource.
nt_predicate :: GenParser ByteString () Node
nt_predicate = fmap unode nt_uriref
-- An object may be either a resource (represented by a URI reference),
-- a blank node (represented by a node id), or an object literal.
nt_object :: GenParser ByteString () Node
nt_object = fmap unode nt_uriref
<|>fmap bnode nt_nodeID
<|>fmap LNode nt_literal
-- A URI reference is one or more nrab_character inside angle brackets.
nt_uriref :: GenParser ByteString () ByteString
nt_uriref = between_chars '<' '>' (fmap B.pack (many (satisfy (/= '>'))))
-- A node id is "_:" followed by a name.
nt_nodeID :: GenParser ByteString () ByteString
nt_nodeID = char '_' >> char ':' >> nt_name >>= \n ->
return ('_' `B.cons'` (':' `B.cons'` n))
-- A name is a letter followed by any number of alpha-numeric characters.
nt_name :: GenParser ByteString () ByteString
nt_name =
do init <- letter
rest <- many (satisfy isLetterOrDigit)
return $ B.pack (init:rest)
isLetterOrDigit :: Char -> Bool
isLetterOrDigit c = isLetter c || isDigit c
-- An nt_character is any character except a double quote character.
nt_character :: GenParser ByteString () Char
nt_character = satisfy is_nonquote_char
-- A character is any Unicode value from ASCII space to decimal 126 (tilde).
is_character :: Char -> Bool
is_character c = c >= '\x0020' && c <= '\x007E'
-- A non-quote character is a character that isn't the double-quote character.
is_nonquote_char :: Char -> Bool
is_nonquote_char c = is_character c && c/= '"'
-- End-of-line consists of either lf or crlf.
-- We also test for eof and consider that to match as well.
nt_eoln :: GenParser ByteString () ()
nt_eoln = eof
<|> (nt_cr >> nt_lf >> return ())
<|> (nt_lf >> return ())
-- Whitespace is either a space or tab character. We must avoid using the
-- built-in space combinator here, because it includes newline.
nt_space :: GenParser ByteString () Char
nt_space = char ' ' <|> nt_tab
-- Carriage return is \r.
nt_cr :: GenParser ByteString () Char
nt_cr = char '\r'
-- Line feed is \n.
nt_lf :: GenParser ByteString () Char
nt_lf = char '\n'
-- Tab is \t.
nt_tab :: GenParser ByteString () Char
nt_tab = char '\t'
-- An inner_string is a fragment of a string (this is used inside double
-- quotes), and consists of the non-quote characters allowed and the
-- standard escapes for a backslash (\\), a tab (\t), a carriage return (\r),
-- a newline (\n), a double-quote (\"), a 4-digit Unicode escape (\uxxxx
-- where x is a hexadecimal digit), and an 8-digit Unicode escape
-- (\Uxxxxxxxx where x is a hexadecimaldigit).
inner_string :: GenParser ByteString () ByteString
inner_string =
try (char '\\' >>
((char 't' >> return b_tab) <|>
(char 'r' >> return b_ret) <|>
(char 'n' >> return b_nl) <|>
(char '\\' >> return b_slash) <|>
(char '"' >> return b_quote) <|>
(char 'u' >> count 4 hexDigit >>= \cs -> return $ B.pack ('\\':'u':cs)) <|>
(char 'U' >> count 8 hexDigit >>= \cs -> return $ B.pack ('\\':'U':cs))))
<|> fmap B.pack (many (satisfy (\ c -> is_nonquote_char c && c /= '\\')))
b_tab = B.singleton '\t'
b_ret = B.singleton '\r'
b_nl = B.singleton '\n'
b_slash = B.singleton '\\'
b_quote = B.singleton '"'
between_chars :: Char -> Char -> GenParser ByteString () ByteString -> GenParser ByteString () ByteString
between_chars start end parser = char start >> parser >>= \res -> char end >> return res
handleParse :: forall rdf. (RDF rdf) => (Triples -> Maybe BaseUrl -> PrefixMappings -> rdf) ->
Either ParseError [Maybe Triple] ->
Either ParseFailure rdf
handleParse _mkRdf result
-- | B.length rem /= 0 = (Left $ ParseFailure $ "Invalid Document. Unparseable end of document: " ++ B.unpack rem)
| otherwise =
case result of
Left err -> Left $ ParseFailure $ "Parse failure: \n" ++ show err
Right ts -> Right $ _mkRdf (conv ts) Nothing (PrefixMappings Map.empty)
where
conv [] = []
conv (Nothing:ts) = conv ts
conv ((Just t):ts) = t : conv ts
_test :: GenParser ByteString () a -> String -> IO a
_test p str =
case result of
(Left err) -> putStr "ParseError: '" >> putStr (show err) >> putStr "\n" >> error ""
(Right a) -> return a
where result = runParser p () "" (B.pack str)