From 163106f0a3fe64c1c71785d6c9dc3e91c8986f48 Mon Sep 17 00:00:00 2001 From: Christophe Raffalli Date: Sun, 16 Jul 2023 15:07:32 -1000 Subject: [PATCH 1/5] add the list of opened elements to report --- src/common.ml | 2 ++ src/html_parser.ml | 20 ++++++++++++++------ src/html_parser.mli | 2 +- src/markup.ml | 29 +++++++++++++++++------------ src/markup.mli | 11 +++++++---- src/xml_parser.ml | 13 ++++++++----- src/xml_parser.mli | 2 +- 7 files changed, 50 insertions(+), 29 deletions(-) diff --git a/src/common.ml b/src/common.ml index cbf60a4..e394f64 100644 --- a/src/common.ml +++ b/src/common.ml @@ -12,6 +12,8 @@ let compare_locations (line, column) (line', column') = | order -> order type name = string * string +type attributes = (name * string) list +type open_elements = (name * location * attributes) list let xml_ns = "http://www.w3.org/XML/1998/namespace" let xmlns_ns = "http://www.w3.org/2000/xmlns/" diff --git a/src/html_parser.ml b/src/html_parser.ml index 16dd3fd..18fe85f 100644 --- a/src/html_parser.ml +++ b/src/html_parser.ml @@ -1022,12 +1022,6 @@ let parse requested_context report (tokens, set_tokenizer_state, set_foreign) = let ended = ref (fun _ -> ()) in let output = ref (fun _ -> ()) in - let report_if = Error.report_if report in - let unmatched_end_tag l name k = - report l (`Unmatched_end_tag name) !throw k in - let misnested_tag l t context_name k = - report l (`Misnested_tag (t.name, context_name, t.Token_tag.attributes)) !throw k in - let open_elements = Stack.create () in let active_formatting_elements = Active.create () in let subtree_buffer = Subtree.create open_elements in @@ -1042,6 +1036,20 @@ let parse requested_context report (tokens, set_tokenizer_state, set_foreign) = set_foreign (fun () -> Stack.current_element_is_foreign context open_elements); + let report l error throw k = + let elts = List.map + (fun e -> + let (ns, s) = e.element_name in + ((Ns.to_string ns, s), e.location, e.attributes)) + !open_elements + in + report elts l error throw k in + let report_if = Error.report_if report in + let unmatched_end_tag l name k = + report l (`Unmatched_end_tag name) !throw k in + let misnested_tag l t context_name k = + report l (`Misnested_tag (t.name, context_name, t.Token_tag.attributes)) !throw k in + let report_if_stack_has_other_than names k = let rec iterate = function | [] -> k () diff --git a/src/html_parser.mli b/src/html_parser.mli index d605fe3..29ea8ee 100644 --- a/src/html_parser.mli +++ b/src/html_parser.mli @@ -5,7 +5,7 @@ open Common val parse : [< `Document | `Fragment of string ] option -> - Error.parse_handler -> + (open_elements -> Error.parse_handler) -> (location * Html_tokenizer.token) Kstream.t * (Html_tokenizer.state -> unit) * ((unit -> bool) -> unit) -> diff --git a/src/markup.ml b/src/markup.ml index 04ba209..ca19ef4 100644 --- a/src/markup.ml +++ b/src/markup.ml @@ -56,6 +56,8 @@ module Error = Error type name = Common.name +type attributes = (name * string) list +type open_elements = (name * location * attributes) list type xml_declaration = Common.xml_declaration = {version : string; @@ -91,10 +93,11 @@ struct let parse_xml report ?encoding namespace entity context source = let with_encoding (encoding : Encoding.t) k = + let report' = report [] in source - |> encoding ~report - |> Input.preprocess Common.is_valid_xml_char report - |> Xml_tokenizer.tokenize report entity + |> encoding ~report:report' + |> Input.preprocess Common.is_valid_xml_char report' + |> Xml_tokenizer.tokenize report' entity |> Xml_parser.parse context namespace report |> k in @@ -116,11 +119,12 @@ struct |> Utility.strings_to_bytes let parse_html report ?encoding context source = + let report' = report [] in let with_encoding (encoding : Encoding.t) k = source - |> encoding ~report - |> Input.preprocess Common.is_valid_html_char report - |> Html_tokenizer.tokenize report + |> encoding ~report:report' + |> Input.preprocess Common.is_valid_html_char report' + |> Html_tokenizer.tokenize report' |> Html_parser.parse context report |> k in @@ -189,7 +193,7 @@ sig end val parse_xml : - ?report:(location -> Error.t -> unit io) -> + ?report:(open_elements -> location -> Error.t -> unit io) -> ?encoding:Encoding.t -> ?namespace:(string -> string option) -> ?entity:(string -> string option) -> @@ -202,7 +206,7 @@ sig ([< signal ], _) stream -> (char, async) stream val parse_html : - ?report:(location -> Error.t -> unit io) -> + ?report:(open_elements -> location -> Error.t -> unit io) -> ?encoding:Encoding.t -> ?context:[< `Document | `Fragment of string ] -> (char, _) stream -> async parser @@ -248,6 +252,7 @@ end module Asynchronous (IO : IO) = struct + let wrap_report_ops report = fun ops l e -> IO.to_cps (fun () -> report ops l e) let wrap_report report = fun l e -> IO.to_cps (fun () -> report l e) module Encoding = @@ -259,7 +264,7 @@ struct end let parse_xml - ?(report = fun _ _ -> IO.return ()) + ?(report = fun _ _ _ -> IO.return ()) ?encoding ?(namespace = fun _ -> None) ?(entity = fun _ -> None) @@ -267,7 +272,7 @@ struct source = Cps.parse_xml - (wrap_report report) ?encoding namespace entity context source + (wrap_report_ops report) ?encoding namespace entity context source let write_xml ?(report = fun _ _ -> IO.return ()) @@ -277,12 +282,12 @@ struct Cps.write_xml (wrap_report report) prefix signals let parse_html - ?(report = fun _ _ -> IO.return ()) + ?(report = fun _ _ _ -> IO.return ()) ?encoding ?context source = - Cps.parse_html (wrap_report report) ?encoding context source + Cps.parse_html (wrap_report_ops report) ?encoding context source let write_html ?escape_attribute ?escape_text signals = Cps.write_html ?escape_attribute ?escape_text signals diff --git a/src/markup.mli b/src/markup.mli index a47580b..82451b4 100644 --- a/src/markup.mli +++ b/src/markup.mli @@ -228,6 +228,9 @@ end (** {2 Signals} *) type name = string * string +type attributes = (name * string) list +type open_elements = (name * location * attributes) list + (** Expanded name: a namespace URI followed by a local name. *) type xml_declaration = @@ -320,7 +323,7 @@ val location : _ parser -> location (** {2 XML} *) val parse_xml : - ?report:(location -> Error.t -> unit) -> + ?report:(open_elements -> location -> Error.t -> unit) -> ?encoding:Encoding.t -> ?namespace:(string -> string option) -> ?entity:(string -> string option) -> @@ -370,7 +373,7 @@ val write_xml : (** {2 HTML} *) val parse_html : - ?report:(location -> Error.t -> unit) -> + ?report:(open_elements -> location -> Error.t -> unit) -> ?encoding:Encoding.t -> ?context:[< `Document | `Fragment of string ] -> (char, 's) stream -> 's parser @@ -825,7 +828,7 @@ sig (** {2 XML} *) val parse_xml : - ?report:(location -> Error.t -> unit io) -> + ?report:(open_elements -> location -> Error.t -> unit io) -> ?encoding:Encoding.t -> ?namespace:(string -> string option) -> ?entity:(string -> string option) -> @@ -840,7 +843,7 @@ sig (** {2 HTML} *) val parse_html : - ?report:(location -> Error.t -> unit io) -> + ?report:(open_elements -> location -> Error.t -> unit io) -> ?encoding:Encoding.t -> ?context:[< `Document | `Fragment of string ] -> (char, _) stream -> async parser diff --git a/src/xml_parser.ml b/src/xml_parser.ml index 781a022..f6a9117 100644 --- a/src/xml_parser.ml +++ b/src/xml_parser.ml @@ -9,6 +9,9 @@ let is_whitespace_only strings = List.for_all is_whitespace_only strings let parse context namespace report tokens = let open_elements = ref [] in + let report = + report (List.map (fun (l,name,_,attrs) -> (name,l,attrs)) !open_elements) + in let namespaces = Namespace.Parsing.init namespace in let is_fragment = ref false in let fragment_allowed = ref true in @@ -45,7 +48,7 @@ let parse context namespace report tokens = in deduplicate [] attributes (fun attributes -> - open_elements := (l, expanded_name, raw_name)::!open_elements; + open_elements := (l, expanded_name, raw_name, attributes)::!open_elements; emit l (`Start_element (expanded_name, attributes)) state)) and pop l state = @@ -190,7 +193,7 @@ let parse context namespace report tokens = let is_on_stack = !open_elements - |> List.exists (fun (_, name, _) -> name = expanded_name) + |> List.exists (fun (_, name, _, _) -> name = expanded_name) in if not is_on_stack then @@ -198,13 +201,13 @@ let parse context namespace report tokens = else let rec pop_until_match () = match !open_elements with - | (_, name, _)::_ when name = expanded_name -> + | (_, name, _, _)::_ when name = expanded_name -> pop l (fun () -> match !open_elements with | [] when not !is_fragment -> after_root_state () | _ -> content_state ()) - | (l', _, name)::_ -> + | (l', _, name, _)::_ -> report l' (`Unmatched_start_tag name) !throw (fun () -> pop l pop_until_match) @@ -225,7 +228,7 @@ let parse context namespace report tokens = let rec pop_stack () = match !open_elements with | [] -> emit_end () - | (l', _, raw_name)::_ -> + | (l', _, raw_name, _)::_ -> report l' (`Unmatched_start_tag raw_name) !throw (fun () -> pop l pop_stack) in diff --git a/src/xml_parser.mli b/src/xml_parser.mli index f6fa9e5..35de0df 100644 --- a/src/xml_parser.mli +++ b/src/xml_parser.mli @@ -6,6 +6,6 @@ open Common val parse : [< `Document | `Fragment ] option -> (string -> string option) -> - Error.parse_handler -> + (open_elements -> Error.parse_handler) -> (location * Xml_tokenizer.token) Kstream.t -> (location * signal) Kstream.t From 2dd9dbcdd46d45c4d5b144cc09e8a701305e073a Mon Sep 17 00:00:00 2001 From: Christophe Raffalli Date: Sun, 16 Jul 2023 15:18:32 -1000 Subject: [PATCH 2/5] doc + introduce ~detailed_report + fix tests --- src/markup.ml | 24 ++++++++++++++++++------ src/markup.mli | 16 ++++++++++++---- test/test_html_parser.ml | 2 +- test/test_xml_parser.ml | 2 +- 4 files changed, 32 insertions(+), 12 deletions(-) diff --git a/src/markup.ml b/src/markup.ml index ca19ef4..5d7ec2c 100644 --- a/src/markup.ml +++ b/src/markup.ml @@ -193,7 +193,8 @@ sig end val parse_xml : - ?report:(open_elements -> location -> Error.t -> unit io) -> + ?report:(location -> Error.t -> unit io) -> + ?detailed_report:(open_elements -> location -> Error.t -> unit io) -> ?encoding:Encoding.t -> ?namespace:(string -> string option) -> ?entity:(string -> string option) -> @@ -206,7 +207,8 @@ sig ([< signal ], _) stream -> (char, async) stream val parse_html : - ?report:(open_elements -> location -> Error.t -> unit io) -> + ?report:(location -> Error.t -> unit io) -> + ?detailed_report:(open_elements -> location -> Error.t -> unit io) -> ?encoding:Encoding.t -> ?context:[< `Document | `Fragment of string ] -> (char, _) stream -> async parser @@ -264,13 +266,18 @@ struct end let parse_xml - ?(report = fun _ _ _ -> IO.return ()) + ?report ?detailed_report ?encoding ?(namespace = fun _ -> None) ?(entity = fun _ -> None) ?context source = - + let report = match detailed_report, report with + | Some f, None -> f + | None, Some f -> (fun _ -> f) + | Some _, Some _ -> invalid_arg "both report and detailed_report given" + | None, None -> (fun _ _ _ -> IO.return ()) + in Cps.parse_xml (wrap_report_ops report) ?encoding namespace entity context source @@ -282,11 +289,16 @@ struct Cps.write_xml (wrap_report report) prefix signals let parse_html - ?(report = fun _ _ _ -> IO.return ()) + ?report ?detailed_report ?encoding ?context source = - + let report = match detailed_report, report with + | Some f, None -> f + | None, Some f -> (fun _ -> f) + | Some _, Some _ -> invalid_arg "both report and detailed_report given" + | None, None -> (fun _ _ _ -> IO.return ()) + in Cps.parse_html (wrap_report_ops report) ?encoding context source let write_html ?escape_attribute ?escape_text signals = diff --git a/src/markup.mli b/src/markup.mli index 82451b4..b87c838 100644 --- a/src/markup.mli +++ b/src/markup.mli @@ -323,7 +323,8 @@ val location : _ parser -> location (** {2 XML} *) val parse_xml : - ?report:(open_elements -> location -> Error.t -> unit) -> + ?report:(location -> Error.t -> unit) -> + ?detailed_report:(open_elements -> location -> Error.t -> unit) -> ?encoding:Encoding.t -> ?namespace:(string -> string option) -> ?entity:(string -> string option) -> @@ -337,6 +338,10 @@ val parse_xml : You may raise an exception in [report], and it will propagate to the code reading the signal stream. + [~detailed_report] is similar to report, but receive the list of + open_elements. You must only give [~report] or [~detailed_report] but not + both. + If [~encoding] is {e not} specified, the parser detects the input encoding automatically. Otherwise, the given encoding is used. @@ -373,7 +378,8 @@ val write_xml : (** {2 HTML} *) val parse_html : - ?report:(open_elements -> location -> Error.t -> unit) -> + ?report:(location -> Error.t -> unit) -> + ?detailed_report:(open_elements -> location -> Error.t -> unit) -> ?encoding:Encoding.t -> ?context:[< `Document | `Fragment of string ] -> (char, 's) stream -> 's parser @@ -828,7 +834,8 @@ sig (** {2 XML} *) val parse_xml : - ?report:(open_elements -> location -> Error.t -> unit io) -> + ?report:(location -> Error.t -> unit io) -> + ?detailed_report:(open_elements -> location -> Error.t -> unit io) -> ?encoding:Encoding.t -> ?namespace:(string -> string option) -> ?entity:(string -> string option) -> @@ -843,7 +850,8 @@ sig (** {2 HTML} *) val parse_html : - ?report:(open_elements -> location -> Error.t -> unit io) -> + ?report:(location -> Error.t -> unit io) -> + ?detailed_report:(open_elements -> location -> Error.t -> unit io) -> ?encoding:Encoding.t -> ?context:[< `Document | `Fragment of string ] -> (char, _) stream -> async parser diff --git a/test/test_html_parser.ml b/test/test_html_parser.ml index 7def980..cf091eb 100644 --- a/test/test_html_parser.ml +++ b/test/test_html_parser.ml @@ -26,7 +26,7 @@ let expect ?prefix ?(context = Some `Document) text signals = |> Markup__Encoding.utf_8 |> Markup__Input.preprocess is_valid_html_char Error.ignore_errors |> Markup__Html_tokenizer.tokenize Error.ignore_errors - |> Markup__Html_parser.parse context report + |> Markup__Html_parser.parse context (fun _ -> report) |> iter iterate; ended () diff --git a/test/test_xml_parser.ml b/test/test_xml_parser.ml index 21b5dd1..c262fda 100644 --- a/test/test_xml_parser.ml +++ b/test/test_xml_parser.ml @@ -24,7 +24,7 @@ let expect ?context ?(namespace = no_top_level_namespaces) text signals = |> Markup__Encoding.utf_8 |> Markup__Input.preprocess is_valid_xml_char Error.ignore_errors |> Markup__Xml_tokenizer.tokenize Error.ignore_errors no_custom_entities - |> Markup__Xml_parser.parse context namespace report + |> Markup__Xml_parser.parse context namespace (fun _ -> report) |> iter iterate; ended () From b7fb99876bef16de553d70cd96987054319d2cf6 Mon Sep 17 00:00:00 2001 From: Christophe Raffalli Date: Sun, 16 Jul 2023 15:23:39 -1000 Subject: [PATCH 3/5] python3-bs4 in test.yml --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 81cd5af..37c5cdd 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -25,7 +25,7 @@ jobs: - uses: ocaml/setup-ocaml@v2 with: ocaml-compiler: ${{matrix.ocaml}} - - run: sudo apt-get install python-bs4 + - run: sudo apt-get install python3-bs4 - run: opam install --deps-only --with-test . --yes - run: opam install js_of_ocaml --yes From 1132dba013ea552e6f3f47a5d4c40bb32d5c1a5f Mon Sep 17 00:00:00 2001 From: Christophe Raffalli Date: Sun, 16 Jul 2023 15:41:29 -1000 Subject: [PATCH 4/5] hask to get open elements in all cases --- src/html_parser.ml | 11 ++++++----- src/html_parser.mli | 1 + src/markup.ml | 6 ++++-- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/html_parser.ml b/src/html_parser.ml index 18fe85f..7925223 100644 --- a/src/html_parser.ml +++ b/src/html_parser.ml @@ -1015,7 +1015,8 @@ end -let parse requested_context report (tokens, set_tokenizer_state, set_foreign) = +let parse ?(get_opens=ref None) requested_context report + (tokens, set_tokenizer_state, set_foreign) = let context = Context.uninitialized () in let throw = ref (fun _ -> ()) in @@ -1036,14 +1037,14 @@ let parse requested_context report (tokens, set_tokenizer_state, set_foreign) = set_foreign (fun () -> Stack.current_element_is_foreign context open_elements); - let report l error throw k = - let elts = List.map + let opens () = List.map (fun e -> let (ns, s) = e.element_name in ((Ns.to_string ns, s), e.location, e.attributes)) !open_elements - in - report elts l error throw k in + in + get_opens := Some opens; + let report l error throw k = report (opens ()) l error throw k in let report_if = Error.report_if report in let unmatched_end_tag l name k = report l (`Unmatched_end_tag name) !throw k in diff --git a/src/html_parser.mli b/src/html_parser.mli index 29ea8ee..221fc3f 100644 --- a/src/html_parser.mli +++ b/src/html_parser.mli @@ -4,6 +4,7 @@ open Common val parse : + ?get_opens:((unit -> open_elements) option ref) -> [< `Document | `Fragment of string ] option -> (open_elements -> Error.parse_handler) -> (location * Html_tokenizer.token) Kstream.t * diff --git a/src/markup.ml b/src/markup.ml index 5d7ec2c..3048c87 100644 --- a/src/markup.ml +++ b/src/markup.ml @@ -119,13 +119,15 @@ struct |> Utility.strings_to_bytes let parse_html report ?encoding context source = - let report' = report [] in + let get_opens = ref None in + let parse = Html_parser.parse ~get_opens in + let report' = report (match !get_opens with None -> [] | Some f -> f ()) in let with_encoding (encoding : Encoding.t) k = source |> encoding ~report:report' |> Input.preprocess Common.is_valid_html_char report' |> Html_tokenizer.tokenize report' - |> Html_parser.parse context report + |> parse context report |> k in From 90a2cefbfda553bff0a804ba23bd900bdc2a20fd Mon Sep 17 00:00:00 2001 From: Christophe Raffalli Date: Sun, 16 Jul 2023 16:06:04 -1000 Subject: [PATCH 5/5] same for xml_parser --- src/markup.ml | 14 +++++++++----- src/xml_parser.ml | 8 +++++--- src/xml_parser.mli | 1 + 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/markup.ml b/src/markup.ml index 3048c87..3c34664 100644 --- a/src/markup.ml +++ b/src/markup.ml @@ -93,12 +93,15 @@ struct let parse_xml report ?encoding namespace entity context source = let with_encoding (encoding : Encoding.t) k = - let report' = report [] in + let get_opens = ref None in + let parse = Xml_parser.parse ~get_opens context namespace report in + let report' x = + report (match !get_opens with None -> [] | Some f -> f ()) x in source |> encoding ~report:report' |> Input.preprocess Common.is_valid_xml_char report' |> Xml_tokenizer.tokenize report' entity - |> Xml_parser.parse context namespace report + |> parse |> k in @@ -120,14 +123,15 @@ struct let parse_html report ?encoding context source = let get_opens = ref None in - let parse = Html_parser.parse ~get_opens in - let report' = report (match !get_opens with None -> [] | Some f -> f ()) in + let parse = Html_parser.parse ~get_opens context report in + let report' x = + report (match !get_opens with None -> [] | Some f -> f ()) x in let with_encoding (encoding : Encoding.t) k = source |> encoding ~report:report' |> Input.preprocess Common.is_valid_html_char report' |> Html_tokenizer.tokenize report' - |> parse context report + |> parse |> k in diff --git a/src/xml_parser.ml b/src/xml_parser.ml index f6a9117..532ed1b 100644 --- a/src/xml_parser.ml +++ b/src/xml_parser.ml @@ -7,11 +7,13 @@ open Token_tag let is_whitespace_only strings = List.for_all is_whitespace_only strings -let parse context namespace report tokens = +let parse ?(get_opens=ref None) context namespace report tokens = let open_elements = ref [] in - let report = - report (List.map (fun (l,name,_,attrs) -> (name,l,attrs)) !open_elements) + let opens () = + List.map (fun (l,name,_,attrs) -> (name,l,attrs)) !open_elements in + get_opens := Some opens; + let report l error throw k = report (opens ()) l error throw k in let namespaces = Namespace.Parsing.init namespace in let is_fragment = ref false in let fragment_allowed = ref true in diff --git a/src/xml_parser.mli b/src/xml_parser.mli index 35de0df..96a89ca 100644 --- a/src/xml_parser.mli +++ b/src/xml_parser.mli @@ -4,6 +4,7 @@ open Common val parse : + ?get_opens : ((unit -> open_elements) option ref) -> [< `Document | `Fragment ] option -> (string -> string option) -> (open_elements -> Error.parse_handler) ->