diff --git a/README.md b/README.md index f60fc39..7151c65 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ open Markup;; string s "
Markup.ml rocks!" (* malformed HTML *)
|> parse_html `Start_element "p"
- `Start_element "em"
+|> signals `Start_element "em"
`Text ["Markup.ml"]
~report (1, 4) (`Unmatched_start_tag "em")
`End_element (* /em: recovery *)
@@ -32,7 +32,7 @@ string s " Markup.ml rocks!" (* malformed HTML *)
`Text ["rocks!"]
`End_element (* /em *)
`End_element (* /p *)
-|> drop_locations
+
|> pretty_print (* adjusts the `Text signals *)
|> write_html
@@ -117,7 +117,7 @@ let () =
>|= Markup_lwt.lwt_stream (* Now a Markup.stream. *)
>|= Markup.strings_to_bytes
>|= Markup.parse_html
- >|= Markup.drop_locations
+ >|= Markup.signals
>|= Markup.elements (fun name _ -> snd name = "h3")
>>= Markup_lwt.iter begin fun h3_subtree ->
h3_subtree
diff --git a/src/kstream.ml b/src/kstream.ml
index 5aaa66a..f28afa3 100644
--- a/src/kstream.ml
+++ b/src/kstream.ml
@@ -15,6 +15,8 @@ let construct c =
| Some s' -> s'.f throw e k)
|> make
+let empty () = (fun _ e _ -> e ()) |> make
+
let next {f} throw e k = f throw e k
let next_option {f} throw k = f throw (fun () -> k None) (fun v -> k (Some v))
diff --git a/src/kstream.mli b/src/kstream.mli
index 2a9440f..043495c 100644
--- a/src/kstream.mli
+++ b/src/kstream.mli
@@ -21,6 +21,7 @@ type 'a t
val make : (exn cont -> unit cont -> 'a cont -> unit) -> 'a t
val construct : 'a t cps -> 'a t
+val empty : unit -> 'a t
val next : 'a t -> exn cont -> unit cont -> 'a cont -> unit
val next_option : 'a t -> 'a option cps
diff --git a/src/markup.ml b/src/markup.ml
index 602bf42..1d56fdf 100644
--- a/src/markup.ml
+++ b/src/markup.ml
@@ -74,6 +74,19 @@ type content_signal = Common.content_signal
let signal_to_string = Common.signal_to_string
+type 's parser =
+ {mutable location : location;
+ mutable signals : (signal, 's) stream}
+
+let signals parser = parser.signals
+let location parser = parser.location
+
+let stream_to_parser s =
+ let parser = {location = (1, 1); signals = Kstream.empty ()} in
+ parser.signals <-
+ s |> Kstream.map (fun (l, v) _ k -> parser.location <- l; k v);
+ parser
+
module Cps =
struct
let parse_xml
@@ -96,6 +109,7 @@ struct
in
Kstream.construct constructor
+ |> stream_to_parser
let write_xml report prefix signals =
signals
@@ -121,6 +135,7 @@ struct
in
Kstream.construct constructor
+ |> stream_to_parser
let write_html signals =
signals
@@ -163,7 +178,7 @@ sig
?namespace:(string -> string option) ->
?entity:(string -> string option) ->
?context:[ `Document | `Fragment ] ->
- (char, _) stream -> (location * signal, async) stream
+ (char, _) stream -> async parser
val write_xml :
?report:((signal * int) -> Error.t -> unit io) ->
@@ -174,7 +189,7 @@ sig
?report:(location -> Error.t -> unit io) ->
?encoding:Encoding.t ->
?context:[ `Document | `Fragment of string ] ->
- (char, _) stream -> (location * signal, async) stream
+ (char, _) stream -> async parser
val write_html : ([< signal ], _) stream -> (char, async) stream
diff --git a/src/markup.mli b/src/markup.mli
index 28216f4..92dd935 100644
--- a/src/markup.mli
+++ b/src/markup.mli
@@ -28,7 +28,7 @@ open Markup
(* Correct and pretty-print HTML. *)
channel stdin
-|> parse_html |> drop_locations |> pretty_print
+|> parse_html |> signals |> pretty_print
|> write_html |> to_channel stdout
(* Show up to 10 XML well-formedness errors to the user. Stop after
@@ -40,13 +40,14 @@ let report =
count := !count + 1;
if !count >= 10 then raise_notrace Exit
-string "some xml" |> parse_xml ~report |> drain
+string "some xml" |> parse_xml ~report |> signals |> drain
(* Load HTML into a custom document tree data type. *)
type html = Text of string | Element of string * html list
file "some_file"
|> parse_html
+|> signals
|> tree
~text:(fun ss -> Text (String.concat "" ss))
~element:(fun (_, name) _ children -> Element (name, children))
@@ -302,6 +303,24 @@ type content_signal =
val signal_to_string : [< signal ] -> string
(** Provides a human-readable representation of signals for debugging. *)
+
+
+(** {2 Parsers} *)
+
+type 's parser
+(** A ['s parser] is a thin wrapper around a [(signal, 's) stream] that supports
+ access to additional information that is not carried directly in the stream,
+ such as source locations. *)
+
+val signals : 's parser -> (signal, 's) stream
+(** Converts a parser to its underlying signal stream. *)
+
+val location : _ parser -> location
+(** Evaluates to the location of the last signal emitted on the parser's signal
+ stream. If no signals have yet been emitted, evaluates to [(1, 1)]. *)
+
+
+
(** {2 XML} *)
val parse_xml :
@@ -310,8 +329,10 @@ val parse_xml :
?namespace:(string -> string option) ->
?entity:(string -> string option) ->
?context:[ `Document | `Fragment ] ->
- (char, 's) stream -> (location * signal, 's) stream
-(** Converts an XML byte stream to a signal stream.
+ (char, 's) stream -> 's parser
+(** Creates a parser that converts an XML byte stream to a signal stream.
+
+ For simple usage, [bytes |> parse_xml |> signals].
If [~report] is provided, [report] is called for every error encountered.
You may raise an exception in [report], and it will propagate to the code
@@ -348,13 +369,15 @@ val write_xml :
for a namespace URI. If it evaluates to [Some s], the writer uses [s] for
the URI. Otherwise, the writer reports [`Bad_namespace]. *)
+
+
(** {2 HTML} *)
val parse_html :
?report:(location -> Error.t -> unit) ->
?encoding:Encoding.t ->
?context:[ `Document | `Fragment of string ] ->
- (char, 's) stream -> (location * signal, 's) stream
+ (char, 's) stream -> 's parser
(** Similar to {!parse_xml}, but parses HTML with embedded SVG and MathML, never
emits signals [`Xml] or [`PI], and [~context] has a different type on tag
[`Fragment].
@@ -496,8 +519,7 @@ val to_list : ('a, sync) stream -> 'a list
(** {2 Utility} *)
-val content :
- (location * [< signal ], 's) stream -> (location * content_signal, 's) stream
+val content : ([< signal ], 's) stream -> (content_signal, 's) stream
(** Converts a {!signal} stream into a {!content_signal} stream by filtering out
all signals besides [`Start_element], [`End_element], and [`Text]. *)
@@ -531,7 +553,7 @@ type dom = Text of string | Element of name * dom list
" HTML5 is easy to parse"
|> string
|> parse_html
-|> drop_locations
+|> signals
|> tree
~text:(fun ss -> Text (String.concat "" ss))
~element:(fun (name, _) children -> Element (name, children))
@@ -566,10 +588,6 @@ val elements :
at all. However, once the using code has tried to get the next substream, it
should not try to read a previous one. *)
-val drop_locations : (location * 'a, 's) stream -> ('a, 's) stream
-(** Forgets location information emitted by the parsers. It is equivalent to
- [map snd]. *)
-
val text : ([< signal ], 's) stream -> (char, 's) stream
(** Extracts all the text in a signal stream by discarding all markup, i.e. for
each [`Text ss] signal, the result stream has the bytes of the strings [ss],
@@ -687,7 +705,7 @@ sig
?namespace:(string -> string option) ->
?entity:(string -> string option) ->
?context:[ `Document | `Fragment ] ->
- (char, _) stream -> (location * signal, async) stream
+ (char, _) stream -> async parser
val write_xml :
?report:((signal * int) -> Error.t -> unit io) ->
@@ -700,7 +718,7 @@ sig
?report:(location -> Error.t -> unit io) ->
?encoding:Encoding.t ->
?context:[ `Document | `Fragment of string ] ->
- (char, _) stream -> (location * signal, async) stream
+ (char, _) stream -> async parser
val write_html : ([< signal ], _) stream -> (char, async) stream
diff --git a/src/utility.ml b/src/utility.ml
index 5a076db..cbadbbd 100644
--- a/src/utility.ml
+++ b/src/utility.ml
@@ -5,10 +5,9 @@ open Common
open Kstream
let content s =
- let filter (l, signal) _ k =
+ let filter signal _ k =
match signal with
- | `Start_element _ | `End_element | `Text _ as signal ->
- k (Some (l, signal))
+ | `Start_element _ | `End_element | `Text _ as signal -> k (Some signal)
| `Comment _ | `PI _ | `Doctype _ | `Xml _ -> k None
in
filter_map filter s
@@ -226,8 +225,6 @@ let pretty_print s =
|> make
|> normalize_text
-let drop_locations s = s |> map (fun v _ k -> k (snd v))
-
let html5 s =
let remove_markup v _ k =
match v with
diff --git a/test/dependency/dep_core.ml b/test/dependency/dep_core.ml
index 6c95d9e..3097467 100644
--- a/test/dependency/dep_core.ml
+++ b/test/dependency/dep_core.ml
@@ -6,6 +6,4 @@ open Markup
let (|>) x f = f x
let () =
- "foo"
- |> string |> parse_html
- |> drop_locations |> write_html |> to_string |> ignore
+ string "foo" |> parse_html |> signals |> write_html |> to_string |> ignore
diff --git a/test/performance/performance_markup.ml b/test/performance/performance_markup.ml
index 4fca63b..dcb9b30 100644
--- a/test/performance/performance_markup.ml
+++ b/test/performance/performance_markup.ml
@@ -8,7 +8,7 @@ let (|>) x f = f x
let () =
measure 100 "markup.ml" google_page "html" (fun () ->
- file google_page |> fst |> parse_html |> drain);
+ file google_page |> fst |> parse_html |> signals |> drain);
measure 100 "markup.ml" xml_spec "xml" (fun () ->
- file xml_spec |> fst |> parse_xml |> drain)
+ file xml_spec |> fst |> parse_xml |> signals |> drain)
diff --git a/test/test_integration.ml b/test/test_integration.ml
index f214c18..dd6006c 100644
--- a/test/test_integration.ml
+++ b/test/test_integration.ml
@@ -11,7 +11,7 @@ let tests = [
" foo bar"
|> string
|> parse_html
- |> drop_locations
+ |> signals
|> write_html
|> to_string
|> assert_equal
@@ -41,11 +41,24 @@ let tests = [
" foo foo