diff --git a/README.md b/README.md index f60fc39..7151c65 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ open Markup;; string s "

Markup.ml

rocks!" (* malformed HTML *) |> parse_html `Start_element "p" - `Start_element "em" +|> signals `Start_element "em" `Text ["Markup.ml"] ~report (1, 4) (`Unmatched_start_tag "em") `End_element (* /em: recovery *) @@ -32,7 +32,7 @@ string s "

Markup.ml

rocks!" (* malformed HTML *) `Text ["rocks!"] `End_element (* /em *) `End_element (* /p *) -|> drop_locations + |> pretty_print (* adjusts the `Text signals *) |> write_html @@ -117,7 +117,7 @@ let () = >|= Markup_lwt.lwt_stream (* Now a Markup.stream. *) >|= Markup.strings_to_bytes >|= Markup.parse_html - >|= Markup.drop_locations + >|= Markup.signals >|= Markup.elements (fun name _ -> snd name = "h3") >>= Markup_lwt.iter begin fun h3_subtree -> h3_subtree diff --git a/src/kstream.ml b/src/kstream.ml index 5aaa66a..f28afa3 100644 --- a/src/kstream.ml +++ b/src/kstream.ml @@ -15,6 +15,8 @@ let construct c = | Some s' -> s'.f throw e k) |> make +let empty () = (fun _ e _ -> e ()) |> make + let next {f} throw e k = f throw e k let next_option {f} throw k = f throw (fun () -> k None) (fun v -> k (Some v)) diff --git a/src/kstream.mli b/src/kstream.mli index 2a9440f..043495c 100644 --- a/src/kstream.mli +++ b/src/kstream.mli @@ -21,6 +21,7 @@ type 'a t val make : (exn cont -> unit cont -> 'a cont -> unit) -> 'a t val construct : 'a t cps -> 'a t +val empty : unit -> 'a t val next : 'a t -> exn cont -> unit cont -> 'a cont -> unit val next_option : 'a t -> 'a option cps diff --git a/src/markup.ml b/src/markup.ml index 602bf42..1d56fdf 100644 --- a/src/markup.ml +++ b/src/markup.ml @@ -74,6 +74,19 @@ type content_signal = Common.content_signal let signal_to_string = Common.signal_to_string +type 's parser = + {mutable location : location; + mutable signals : (signal, 's) stream} + +let signals parser = parser.signals +let location parser = parser.location + +let stream_to_parser s = + let parser = {location = (1, 1); signals = Kstream.empty ()} in + parser.signals <- + s |> Kstream.map (fun (l, v) _ k -> parser.location <- l; k v); + parser + module Cps = struct let parse_xml @@ -96,6 +109,7 @@ struct in Kstream.construct constructor + |> stream_to_parser let write_xml report prefix signals = signals @@ -121,6 +135,7 @@ struct in Kstream.construct constructor + |> stream_to_parser let write_html signals = signals @@ -163,7 +178,7 @@ sig ?namespace:(string -> string option) -> ?entity:(string -> string option) -> ?context:[ `Document | `Fragment ] -> - (char, _) stream -> (location * signal, async) stream + (char, _) stream -> async parser val write_xml : ?report:((signal * int) -> Error.t -> unit io) -> @@ -174,7 +189,7 @@ sig ?report:(location -> Error.t -> unit io) -> ?encoding:Encoding.t -> ?context:[ `Document | `Fragment of string ] -> - (char, _) stream -> (location * signal, async) stream + (char, _) stream -> async parser val write_html : ([< signal ], _) stream -> (char, async) stream diff --git a/src/markup.mli b/src/markup.mli index 28216f4..92dd935 100644 --- a/src/markup.mli +++ b/src/markup.mli @@ -28,7 +28,7 @@ open Markup (* Correct and pretty-print HTML. *) channel stdin -|> parse_html |> drop_locations |> pretty_print +|> parse_html |> signals |> pretty_print |> write_html |> to_channel stdout (* Show up to 10 XML well-formedness errors to the user. Stop after @@ -40,13 +40,14 @@ let report = count := !count + 1; if !count >= 10 then raise_notrace Exit -string "some xml" |> parse_xml ~report |> drain +string "some xml" |> parse_xml ~report |> signals |> drain (* Load HTML into a custom document tree data type. *) type html = Text of string | Element of string * html list file "some_file" |> parse_html +|> signals |> tree ~text:(fun ss -> Text (String.concat "" ss)) ~element:(fun (_, name) _ children -> Element (name, children)) @@ -302,6 +303,24 @@ type content_signal = val signal_to_string : [< signal ] -> string (** Provides a human-readable representation of signals for debugging. *) + + +(** {2 Parsers} *) + +type 's parser +(** A ['s parser] is a thin wrapper around a [(signal, 's) stream] that supports + access to additional information that is not carried directly in the stream, + such as source locations. *) + +val signals : 's parser -> (signal, 's) stream +(** Converts a parser to its underlying signal stream. *) + +val location : _ parser -> location +(** Evaluates to the location of the last signal emitted on the parser's signal + stream. If no signals have yet been emitted, evaluates to [(1, 1)]. *) + + + (** {2 XML} *) val parse_xml : @@ -310,8 +329,10 @@ val parse_xml : ?namespace:(string -> string option) -> ?entity:(string -> string option) -> ?context:[ `Document | `Fragment ] -> - (char, 's) stream -> (location * signal, 's) stream -(** Converts an XML byte stream to a signal stream. + (char, 's) stream -> 's parser +(** Creates a parser that converts an XML byte stream to a signal stream. + + For simple usage, [bytes |> parse_xml |> signals]. If [~report] is provided, [report] is called for every error encountered. You may raise an exception in [report], and it will propagate to the code @@ -348,13 +369,15 @@ val write_xml : for a namespace URI. If it evaluates to [Some s], the writer uses [s] for the URI. Otherwise, the writer reports [`Bad_namespace]. *) + + (** {2 HTML} *) val parse_html : ?report:(location -> Error.t -> unit) -> ?encoding:Encoding.t -> ?context:[ `Document | `Fragment of string ] -> - (char, 's) stream -> (location * signal, 's) stream + (char, 's) stream -> 's parser (** Similar to {!parse_xml}, but parses HTML with embedded SVG and MathML, never emits signals [`Xml] or [`PI], and [~context] has a different type on tag [`Fragment]. @@ -496,8 +519,7 @@ val to_list : ('a, sync) stream -> 'a list (** {2 Utility} *) -val content : - (location * [< signal ], 's) stream -> (location * content_signal, 's) stream +val content : ([< signal ], 's) stream -> (content_signal, 's) stream (** Converts a {!signal} stream into a {!content_signal} stream by filtering out all signals besides [`Start_element], [`End_element], and [`Text]. *) @@ -531,7 +553,7 @@ type dom = Text of string | Element of name * dom list "

HTML5 is easy to parse" |> string |> parse_html -|> drop_locations +|> signals |> tree ~text:(fun ss -> Text (String.concat "" ss)) ~element:(fun (name, _) children -> Element (name, children)) @@ -566,10 +588,6 @@ val elements : at all. However, once the using code has tried to get the next substream, it should not try to read a previous one. *) -val drop_locations : (location * 'a, 's) stream -> ('a, 's) stream -(** Forgets location information emitted by the parsers. It is equivalent to - [map snd]. *) - val text : ([< signal ], 's) stream -> (char, 's) stream (** Extracts all the text in a signal stream by discarding all markup, i.e. for each [`Text ss] signal, the result stream has the bytes of the strings [ss], @@ -687,7 +705,7 @@ sig ?namespace:(string -> string option) -> ?entity:(string -> string option) -> ?context:[ `Document | `Fragment ] -> - (char, _) stream -> (location * signal, async) stream + (char, _) stream -> async parser val write_xml : ?report:((signal * int) -> Error.t -> unit io) -> @@ -700,7 +718,7 @@ sig ?report:(location -> Error.t -> unit io) -> ?encoding:Encoding.t -> ?context:[ `Document | `Fragment of string ] -> - (char, _) stream -> (location * signal, async) stream + (char, _) stream -> async parser val write_html : ([< signal ], _) stream -> (char, async) stream diff --git a/src/utility.ml b/src/utility.ml index 5a076db..cbadbbd 100644 --- a/src/utility.ml +++ b/src/utility.ml @@ -5,10 +5,9 @@ open Common open Kstream let content s = - let filter (l, signal) _ k = + let filter signal _ k = match signal with - | `Start_element _ | `End_element | `Text _ as signal -> - k (Some (l, signal)) + | `Start_element _ | `End_element | `Text _ as signal -> k (Some signal) | `Comment _ | `PI _ | `Doctype _ | `Xml _ -> k None in filter_map filter s @@ -226,8 +225,6 @@ let pretty_print s = |> make |> normalize_text -let drop_locations s = s |> map (fun v _ k -> k (snd v)) - let html5 s = let remove_markup v _ k = match v with diff --git a/test/dependency/dep_core.ml b/test/dependency/dep_core.ml index 6c95d9e..3097467 100644 --- a/test/dependency/dep_core.ml +++ b/test/dependency/dep_core.ml @@ -6,6 +6,4 @@ open Markup let (|>) x f = f x let () = - "foo" - |> string |> parse_html - |> drop_locations |> write_html |> to_string |> ignore + string "foo" |> parse_html |> signals |> write_html |> to_string |> ignore diff --git a/test/performance/performance_markup.ml b/test/performance/performance_markup.ml index 4fca63b..dcb9b30 100644 --- a/test/performance/performance_markup.ml +++ b/test/performance/performance_markup.ml @@ -8,7 +8,7 @@ let (|>) x f = f x let () = measure 100 "markup.ml" google_page "html" (fun () -> - file google_page |> fst |> parse_html |> drain); + file google_page |> fst |> parse_html |> signals |> drain); measure 100 "markup.ml" xml_spec "xml" (fun () -> - file xml_spec |> fst |> parse_xml |> drain) + file xml_spec |> fst |> parse_xml |> signals |> drain) diff --git a/test/test_integration.ml b/test/test_integration.ml index f214c18..dd6006c 100644 --- a/test/test_integration.ml +++ b/test/test_integration.ml @@ -11,7 +11,7 @@ let tests = [ "\xa0" |> string |> parse_xml - |> drop_locations + |> signals |> write_xml |> to_string |> assert_equal @@ -21,7 +21,7 @@ let tests = [ "\xfe\xff\x00f\x00o\x00o" |> string |> parse_xml - |> drop_locations + |> signals |> write_xml |> to_string |> assert_equal "foo"); @@ -30,7 +30,7 @@ let tests = [ "

foo

bar" |> string |> parse_html - |> drop_locations + |> signals |> write_html |> to_string |> assert_equal @@ -41,11 +41,24 @@ let tests = [ "foobarbaz" |> string |> parse_xml - |> drop_locations + |> signals |> pretty_print |> write_xml |> to_string |> assert_equal ("\n foo\n \n bar\n \n" ^ - " \n baz\n \n\n")) + " \n baz\n \n\n")); + + ("integration.locations" >:: fun _ -> + let parser = "foo" |> string |> parse_xml in + + assert_equal (location parser) (1, 1); + parser |> signals |> next |> ignore; + assert_equal (location parser) (1, 1); + parser |> signals |> next |> ignore; + assert_equal (location parser) (1, 7); + parser |> signals |> next |> ignore; + assert_equal (location parser) (1, 10); + parser |> signals |> next |> ignore; + assert_equal (location parser) (1, 10)) ] diff --git a/test/test_utility.ml b/test/test_utility.ml index f5d16e0..b02859d 100644 --- a/test/test_utility.ml +++ b/test/test_utility.ml @@ -20,8 +20,8 @@ let tests = [ "

foo

" |> string |> parse_xml + |> signals |> content - |> drop_locations |> write_xml |> to_string |> assert_equal "

foo

"); @@ -128,17 +128,5 @@ let tests = [ `End_element; `Text ["\n"]; `End_element; - `Text ["\n"]]); - - ("utility.drop_locations" >:: fun _ -> - [(1, 1), start_element "a"; - (1, 4), `Text ["foo"]; - (1, 7), `End_element] - |> of_list - |> drop_locations - |> to_list - |> assert_equal [ - start_element "a"; - `Text ["foo"]; - `End_element]) + `Text ["\n"]]) ]