Skip to content

Commit

Permalink
Explicit parser type.
Browse files Browse the repository at this point in the history
The immediate purpose of this change is to take location information
out of the signal streams emitted by parsing functions. Locations can
now be queried from values of the new parser type, from which signal
streams can also be obtained.

In the future, additional bits of information may be queryable from
parser values.

Removed drop_locations.

Closes #2.
  • Loading branch information
aantron committed Jan 15, 2016
1 parent 4fbe1c6 commit 22351e2
Show file tree
Hide file tree
Showing 10 changed files with 80 additions and 48 deletions.
6 changes: 3 additions & 3 deletions README.md
Expand Up @@ -22,7 +22,7 @@ open Markup;;
string s "<p><em>Markup.ml<p>rocks!" (* malformed HTML *)
|> parse_html `Start_element "p"
`Start_element "em"
|> signals `Start_element "em"
`Text ["Markup.ml"]
~report (1, 4) (`Unmatched_start_tag "em")
`End_element (* /em: recovery *)
Expand All @@ -32,7 +32,7 @@ string s "<p><em>Markup.ml<p>rocks!" (* malformed HTML *)
`Text ["rocks!"]
`End_element (* /em *)
`End_element (* /p *)
|> drop_locations
|> pretty_print (* adjusts the `Text signals *)
|> write_html
Expand Down Expand Up @@ -117,7 +117,7 @@ let () =
>|= Markup_lwt.lwt_stream (* Now a Markup.stream. *)
>|= Markup.strings_to_bytes
>|= Markup.parse_html
>|= Markup.drop_locations
>|= Markup.signals
>|= Markup.elements (fun name _ -> snd name = "h3")
>>= Markup_lwt.iter begin fun h3_subtree ->
h3_subtree
Expand Down
2 changes: 2 additions & 0 deletions src/kstream.ml
Expand Up @@ -15,6 +15,8 @@ let construct c =
| Some s' -> s'.f throw e k)
|> make

let empty () = (fun _ e _ -> e ()) |> make

let next {f} throw e k = f throw e k

let next_option {f} throw k = f throw (fun () -> k None) (fun v -> k (Some v))
Expand Down
1 change: 1 addition & 0 deletions src/kstream.mli
Expand Up @@ -21,6 +21,7 @@ type 'a t

val make : (exn cont -> unit cont -> 'a cont -> unit) -> 'a t
val construct : 'a t cps -> 'a t
val empty : unit -> 'a t

val next : 'a t -> exn cont -> unit cont -> 'a cont -> unit
val next_option : 'a t -> 'a option cps
Expand Down
19 changes: 17 additions & 2 deletions src/markup.ml
Expand Up @@ -74,6 +74,19 @@ type content_signal = Common.content_signal

let signal_to_string = Common.signal_to_string

type 's parser =
{mutable location : location;
mutable signals : (signal, 's) stream}

let signals parser = parser.signals
let location parser = parser.location

let stream_to_parser s =
let parser = {location = (1, 1); signals = Kstream.empty ()} in
parser.signals <-
s |> Kstream.map (fun (l, v) _ k -> parser.location <- l; k v);
parser

module Cps =
struct
let parse_xml
Expand All @@ -96,6 +109,7 @@ struct
in

Kstream.construct constructor
|> stream_to_parser

let write_xml report prefix signals =
signals
Expand All @@ -121,6 +135,7 @@ struct
in

Kstream.construct constructor
|> stream_to_parser

let write_html signals =
signals
Expand Down Expand Up @@ -163,7 +178,7 @@ sig
?namespace:(string -> string option) ->
?entity:(string -> string option) ->
?context:[ `Document | `Fragment ] ->
(char, _) stream -> (location * signal, async) stream
(char, _) stream -> async parser

val write_xml :
?report:((signal * int) -> Error.t -> unit io) ->
Expand All @@ -174,7 +189,7 @@ sig
?report:(location -> Error.t -> unit io) ->
?encoding:Encoding.t ->
?context:[ `Document | `Fragment of string ] ->
(char, _) stream -> (location * signal, async) stream
(char, _) stream -> async parser

val write_html : ([< signal ], _) stream -> (char, async) stream

Expand Down
46 changes: 32 additions & 14 deletions src/markup.mli
Expand Up @@ -28,7 +28,7 @@ open Markup
(* Correct and pretty-print HTML. *)
channel stdin
|> parse_html |> drop_locations |> pretty_print
|> parse_html |> signals |> pretty_print
|> write_html |> to_channel stdout
(* Show up to 10 XML well-formedness errors to the user. Stop after
Expand All @@ -40,13 +40,14 @@ let report =
count := !count + 1;
if !count >= 10 then raise_notrace Exit
string "some xml" |> parse_xml ~report |> drain
string "some xml" |> parse_xml ~report |> signals |> drain
(* Load HTML into a custom document tree data type. *)
type html = Text of string | Element of string * html list
file "some_file"
|> parse_html
|> signals
|> tree
~text:(fun ss -> Text (String.concat "" ss))
~element:(fun (_, name) _ children -> Element (name, children))
Expand Down Expand Up @@ -302,6 +303,24 @@ type content_signal =
val signal_to_string : [< signal ] -> string
(** Provides a human-readable representation of signals for debugging. *)



(** {2 Parsers} *)

type 's parser
(** A ['s parser] is a thin wrapper around a [(signal, 's) stream] that supports
access to additional information that is not carried directly in the stream,
such as source locations. *)

val signals : 's parser -> (signal, 's) stream
(** Converts a parser to its underlying signal stream. *)

val location : _ parser -> location
(** Evaluates to the location of the last signal emitted on the parser's signal
stream. If no signals have yet been emitted, evaluates to [(1, 1)]. *)



(** {2 XML} *)

val parse_xml :
Expand All @@ -310,8 +329,10 @@ val parse_xml :
?namespace:(string -> string option) ->
?entity:(string -> string option) ->
?context:[ `Document | `Fragment ] ->
(char, 's) stream -> (location * signal, 's) stream
(** Converts an XML byte stream to a signal stream.
(char, 's) stream -> 's parser
(** Creates a parser that converts an XML byte stream to a signal stream.
For simple usage, [bytes |> parse_xml |> signals].
If [~report] is provided, [report] is called for every error encountered.
You may raise an exception in [report], and it will propagate to the code
Expand Down Expand Up @@ -348,13 +369,15 @@ val write_xml :
for a namespace URI. If it evaluates to [Some s], the writer uses [s] for
the URI. Otherwise, the writer reports [`Bad_namespace]. *)



(** {2 HTML} *)

val parse_html :
?report:(location -> Error.t -> unit) ->
?encoding:Encoding.t ->
?context:[ `Document | `Fragment of string ] ->
(char, 's) stream -> (location * signal, 's) stream
(char, 's) stream -> 's parser
(** Similar to {!parse_xml}, but parses HTML with embedded SVG and MathML, never
emits signals [`Xml] or [`PI], and [~context] has a different type on tag
[`Fragment].
Expand Down Expand Up @@ -496,8 +519,7 @@ val to_list : ('a, sync) stream -> 'a list

(** {2 Utility} *)

val content :
(location * [< signal ], 's) stream -> (location * content_signal, 's) stream
val content : ([< signal ], 's) stream -> (content_signal, 's) stream
(** Converts a {!signal} stream into a {!content_signal} stream by filtering out
all signals besides [`Start_element], [`End_element], and [`Text]. *)

Expand Down Expand Up @@ -531,7 +553,7 @@ type dom = Text of string | Element of name * dom list
"<p>HTML5 is <em>easy</em> to parse"
|> string
|> parse_html
|> drop_locations
|> signals
|> tree
~text:(fun ss -> Text (String.concat "" ss))
~element:(fun (name, _) children -> Element (name, children))
Expand Down Expand Up @@ -566,10 +588,6 @@ val elements :
at all. However, once the using code has tried to get the next substream, it
should not try to read a previous one. *)

val drop_locations : (location * 'a, 's) stream -> ('a, 's) stream
(** Forgets location information emitted by the parsers. It is equivalent to
[map snd]. *)

val text : ([< signal ], 's) stream -> (char, 's) stream
(** Extracts all the text in a signal stream by discarding all markup, i.e. for
each [`Text ss] signal, the result stream has the bytes of the strings [ss],
Expand Down Expand Up @@ -687,7 +705,7 @@ sig
?namespace:(string -> string option) ->
?entity:(string -> string option) ->
?context:[ `Document | `Fragment ] ->
(char, _) stream -> (location * signal, async) stream
(char, _) stream -> async parser

val write_xml :
?report:((signal * int) -> Error.t -> unit io) ->
Expand All @@ -700,7 +718,7 @@ sig
?report:(location -> Error.t -> unit io) ->
?encoding:Encoding.t ->
?context:[ `Document | `Fragment of string ] ->
(char, _) stream -> (location * signal, async) stream
(char, _) stream -> async parser

val write_html : ([< signal ], _) stream -> (char, async) stream

Expand Down
7 changes: 2 additions & 5 deletions src/utility.ml
Expand Up @@ -5,10 +5,9 @@ open Common
open Kstream

let content s =
let filter (l, signal) _ k =
let filter signal _ k =
match signal with
| `Start_element _ | `End_element | `Text _ as signal ->
k (Some (l, signal))
| `Start_element _ | `End_element | `Text _ as signal -> k (Some signal)
| `Comment _ | `PI _ | `Doctype _ | `Xml _ -> k None
in
filter_map filter s
Expand Down Expand Up @@ -226,8 +225,6 @@ let pretty_print s =
|> make
|> normalize_text

let drop_locations s = s |> map (fun v _ k -> k (snd v))

let html5 s =
let remove_markup v _ k =
match v with
Expand Down
4 changes: 1 addition & 3 deletions test/dependency/dep_core.ml
Expand Up @@ -6,6 +6,4 @@ open Markup
let (|>) x f = f x

let () =
"foo"
|> string |> parse_html
|> drop_locations |> write_html |> to_string |> ignore
string "foo" |> parse_html |> signals |> write_html |> to_string |> ignore
4 changes: 2 additions & 2 deletions test/performance/performance_markup.ml
Expand Up @@ -8,7 +8,7 @@ let (|>) x f = f x

let () =
measure 100 "markup.ml" google_page "html" (fun () ->
file google_page |> fst |> parse_html |> drain);
file google_page |> fst |> parse_html |> signals |> drain);

measure 100 "markup.ml" xml_spec "xml" (fun () ->
file xml_spec |> fst |> parse_xml |> drain)
file xml_spec |> fst |> parse_xml |> signals |> drain)
23 changes: 18 additions & 5 deletions test/test_integration.ml
Expand Up @@ -11,7 +11,7 @@ let tests = [
"<?xml version='1.0' encoding='windows-1252'?><root>\xa0</root><a></a>"
|> string
|> parse_xml
|> drop_locations
|> signals
|> write_xml
|> to_string
|> assert_equal
Expand All @@ -21,7 +21,7 @@ let tests = [
"\xfe\xff\x00f\x00o\x00o"
|> string
|> parse_xml
|> drop_locations
|> signals
|> write_xml
|> to_string
|> assert_equal "foo");
Expand All @@ -30,7 +30,7 @@ let tests = [
"<!DOCTYPE html><html><body><p><em>foo<p>bar"
|> string
|> parse_html
|> drop_locations
|> signals
|> write_html
|> to_string
|> assert_equal
Expand All @@ -41,11 +41,24 @@ let tests = [
"<root>foo<nested>bar</nested><nested>baz</nested></root>"
|> string
|> parse_xml
|> drop_locations
|> signals
|> pretty_print
|> write_xml
|> to_string
|> assert_equal
("<root>\n foo\n <nested>\n bar\n </nested>\n" ^
" <nested>\n baz\n </nested>\n</root>\n"))
" <nested>\n baz\n </nested>\n</root>\n"));

("integration.locations" >:: fun _ ->
let parser = "<root>foo</root>" |> string |> parse_xml in

assert_equal (location parser) (1, 1);
parser |> signals |> next |> ignore;
assert_equal (location parser) (1, 1);
parser |> signals |> next |> ignore;
assert_equal (location parser) (1, 7);
parser |> signals |> next |> ignore;
assert_equal (location parser) (1, 10);
parser |> signals |> next |> ignore;
assert_equal (location parser) (1, 10))
]
16 changes: 2 additions & 14 deletions test/test_utility.ml
Expand Up @@ -20,8 +20,8 @@ let tests = [
"<?xml version='1.0'?><!DOCTYPE html><!--blah--><p>foo</p><?bar baz?>"
|> string
|> parse_xml
|> signals
|> content
|> drop_locations
|> write_xml
|> to_string
|> assert_equal "<p>foo</p>");
Expand Down Expand Up @@ -128,17 +128,5 @@ let tests = [
`End_element;
`Text ["\n"];
`End_element;
`Text ["\n"]]);

("utility.drop_locations" >:: fun _ ->
[(1, 1), start_element "a";
(1, 4), `Text ["foo"];
(1, 7), `End_element]
|> of_list
|> drop_locations
|> to_list
|> assert_equal [
start_element "a";
`Text ["foo"];
`End_element])
`Text ["\n"]])
]

0 comments on commit 22351e2

Please sign in to comment.