diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 81cd5af..37c5cdd 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -25,7 +25,7 @@ jobs: - uses: ocaml/setup-ocaml@v2 with: ocaml-compiler: ${{matrix.ocaml}} - - run: sudo apt-get install python-bs4 + - run: sudo apt-get install python3-bs4 - run: opam install --deps-only --with-test . --yes - run: opam install js_of_ocaml --yes diff --git a/src/common.ml b/src/common.ml index cbf60a4..e394f64 100644 --- a/src/common.ml +++ b/src/common.ml @@ -12,6 +12,8 @@ let compare_locations (line, column) (line', column') = | order -> order type name = string * string +type attributes = (name * string) list +type open_elements = (name * location * attributes) list let xml_ns = "http://www.w3.org/XML/1998/namespace" let xmlns_ns = "http://www.w3.org/2000/xmlns/" diff --git a/src/html_parser.ml b/src/html_parser.ml index 55f8529..ea49201 100644 --- a/src/html_parser.ml +++ b/src/html_parser.ml @@ -1013,19 +1013,14 @@ end -let parse requested_context report (tokens, set_tokenizer_state, set_foreign) = +let parse ?(get_opens=ref None) requested_context report + (tokens, set_tokenizer_state, set_foreign) = let context = Context.uninitialized () in let throw = ref (fun _ -> ()) in let ended = ref (fun _ -> ()) in let output = ref (fun _ -> ()) in - let report_if = Error.report_if report in - let unmatched_end_tag l name k = - report l (`Unmatched_end_tag name) !throw k in - let misnested_tag l t context_name k = - report l (`Misnested_tag (t.name, context_name, t.Token_tag.attributes)) !throw k in - let open_elements = Stack.create () in let active_formatting_elements = Active.create () in let subtree_buffer = Subtree.create open_elements in @@ -1040,6 +1035,20 @@ let parse requested_context report (tokens, set_tokenizer_state, set_foreign) = set_foreign (fun () -> Stack.current_element_is_foreign context open_elements); + let opens () = List.map + (fun e -> + let (ns, s) = e.element_name in + ((Ns.to_string ns, s), e.location, e.attributes)) + !open_elements + in + get_opens := Some opens; + let report l error throw k = report (opens ()) l error throw k in + let report_if = Error.report_if report in + let unmatched_end_tag l name k = + report l (`Unmatched_end_tag name) !throw k in + let misnested_tag l t context_name k = + report l (`Misnested_tag (t.name, context_name, t.Token_tag.attributes)) !throw k in + let report_if_stack_has_other_than names k = let rec iterate = function | [] -> k () diff --git a/src/html_parser.mli b/src/html_parser.mli index d605fe3..221fc3f 100644 --- a/src/html_parser.mli +++ b/src/html_parser.mli @@ -4,8 +4,9 @@ open Common val parse : + ?get_opens:((unit -> open_elements) option ref) -> [< `Document | `Fragment of string ] option -> - Error.parse_handler -> + (open_elements -> Error.parse_handler) -> (location * Html_tokenizer.token) Kstream.t * (Html_tokenizer.state -> unit) * ((unit -> bool) -> unit) -> diff --git a/src/markup.ml b/src/markup.ml index 04ba209..3c34664 100644 --- a/src/markup.ml +++ b/src/markup.ml @@ -56,6 +56,8 @@ module Error = Error type name = Common.name +type attributes = (name * string) list +type open_elements = (name * location * attributes) list type xml_declaration = Common.xml_declaration = {version : string; @@ -91,11 +93,15 @@ struct let parse_xml report ?encoding namespace entity context source = let with_encoding (encoding : Encoding.t) k = + let get_opens = ref None in + let parse = Xml_parser.parse ~get_opens context namespace report in + let report' x = + report (match !get_opens with None -> [] | Some f -> f ()) x in source - |> encoding ~report - |> Input.preprocess Common.is_valid_xml_char report - |> Xml_tokenizer.tokenize report entity - |> Xml_parser.parse context namespace report + |> encoding ~report:report' + |> Input.preprocess Common.is_valid_xml_char report' + |> Xml_tokenizer.tokenize report' entity + |> parse |> k in @@ -116,12 +122,16 @@ struct |> Utility.strings_to_bytes let parse_html report ?encoding context source = + let get_opens = ref None in + let parse = Html_parser.parse ~get_opens context report in + let report' x = + report (match !get_opens with None -> [] | Some f -> f ()) x in let with_encoding (encoding : Encoding.t) k = source - |> encoding ~report - |> Input.preprocess Common.is_valid_html_char report - |> Html_tokenizer.tokenize report - |> Html_parser.parse context report + |> encoding ~report:report' + |> Input.preprocess Common.is_valid_html_char report' + |> Html_tokenizer.tokenize report' + |> parse |> k in @@ -190,6 +200,7 @@ sig val parse_xml : ?report:(location -> Error.t -> unit io) -> + ?detailed_report:(open_elements -> location -> Error.t -> unit io) -> ?encoding:Encoding.t -> ?namespace:(string -> string option) -> ?entity:(string -> string option) -> @@ -203,6 +214,7 @@ sig val parse_html : ?report:(location -> Error.t -> unit io) -> + ?detailed_report:(open_elements -> location -> Error.t -> unit io) -> ?encoding:Encoding.t -> ?context:[< `Document | `Fragment of string ] -> (char, _) stream -> async parser @@ -248,6 +260,7 @@ end module Asynchronous (IO : IO) = struct + let wrap_report_ops report = fun ops l e -> IO.to_cps (fun () -> report ops l e) let wrap_report report = fun l e -> IO.to_cps (fun () -> report l e) module Encoding = @@ -259,15 +272,20 @@ struct end let parse_xml - ?(report = fun _ _ -> IO.return ()) + ?report ?detailed_report ?encoding ?(namespace = fun _ -> None) ?(entity = fun _ -> None) ?context source = - + let report = match detailed_report, report with + | Some f, None -> f + | None, Some f -> (fun _ -> f) + | Some _, Some _ -> invalid_arg "both report and detailed_report given" + | None, None -> (fun _ _ _ -> IO.return ()) + in Cps.parse_xml - (wrap_report report) ?encoding namespace entity context source + (wrap_report_ops report) ?encoding namespace entity context source let write_xml ?(report = fun _ _ -> IO.return ()) @@ -277,12 +295,17 @@ struct Cps.write_xml (wrap_report report) prefix signals let parse_html - ?(report = fun _ _ -> IO.return ()) + ?report ?detailed_report ?encoding ?context source = - - Cps.parse_html (wrap_report report) ?encoding context source + let report = match detailed_report, report with + | Some f, None -> f + | None, Some f -> (fun _ -> f) + | Some _, Some _ -> invalid_arg "both report and detailed_report given" + | None, None -> (fun _ _ _ -> IO.return ()) + in + Cps.parse_html (wrap_report_ops report) ?encoding context source let write_html ?escape_attribute ?escape_text signals = Cps.write_html ?escape_attribute ?escape_text signals diff --git a/src/markup.mli b/src/markup.mli index a47580b..b87c838 100644 --- a/src/markup.mli +++ b/src/markup.mli @@ -228,6 +228,9 @@ end (** {2 Signals} *) type name = string * string +type attributes = (name * string) list +type open_elements = (name * location * attributes) list + (** Expanded name: a namespace URI followed by a local name. *) type xml_declaration = @@ -321,6 +324,7 @@ val location : _ parser -> location val parse_xml : ?report:(location -> Error.t -> unit) -> + ?detailed_report:(open_elements -> location -> Error.t -> unit) -> ?encoding:Encoding.t -> ?namespace:(string -> string option) -> ?entity:(string -> string option) -> @@ -334,6 +338,10 @@ val parse_xml : You may raise an exception in [report], and it will propagate to the code reading the signal stream. + [~detailed_report] is similar to report, but receive the list of + open_elements. You must only give [~report] or [~detailed_report] but not + both. + If [~encoding] is {e not} specified, the parser detects the input encoding automatically. Otherwise, the given encoding is used. @@ -371,6 +379,7 @@ val write_xml : val parse_html : ?report:(location -> Error.t -> unit) -> + ?detailed_report:(open_elements -> location -> Error.t -> unit) -> ?encoding:Encoding.t -> ?context:[< `Document | `Fragment of string ] -> (char, 's) stream -> 's parser @@ -826,6 +835,7 @@ sig val parse_xml : ?report:(location -> Error.t -> unit io) -> + ?detailed_report:(open_elements -> location -> Error.t -> unit io) -> ?encoding:Encoding.t -> ?namespace:(string -> string option) -> ?entity:(string -> string option) -> @@ -841,6 +851,7 @@ sig val parse_html : ?report:(location -> Error.t -> unit io) -> + ?detailed_report:(open_elements -> location -> Error.t -> unit io) -> ?encoding:Encoding.t -> ?context:[< `Document | `Fragment of string ] -> (char, _) stream -> async parser diff --git a/src/xml_parser.ml b/src/xml_parser.ml index 781a022..532ed1b 100644 --- a/src/xml_parser.ml +++ b/src/xml_parser.ml @@ -7,8 +7,13 @@ open Token_tag let is_whitespace_only strings = List.for_all is_whitespace_only strings -let parse context namespace report tokens = +let parse ?(get_opens=ref None) context namespace report tokens = let open_elements = ref [] in + let opens () = + List.map (fun (l,name,_,attrs) -> (name,l,attrs)) !open_elements + in + get_opens := Some opens; + let report l error throw k = report (opens ()) l error throw k in let namespaces = Namespace.Parsing.init namespace in let is_fragment = ref false in let fragment_allowed = ref true in @@ -45,7 +50,7 @@ let parse context namespace report tokens = in deduplicate [] attributes (fun attributes -> - open_elements := (l, expanded_name, raw_name)::!open_elements; + open_elements := (l, expanded_name, raw_name, attributes)::!open_elements; emit l (`Start_element (expanded_name, attributes)) state)) and pop l state = @@ -190,7 +195,7 @@ let parse context namespace report tokens = let is_on_stack = !open_elements - |> List.exists (fun (_, name, _) -> name = expanded_name) + |> List.exists (fun (_, name, _, _) -> name = expanded_name) in if not is_on_stack then @@ -198,13 +203,13 @@ let parse context namespace report tokens = else let rec pop_until_match () = match !open_elements with - | (_, name, _)::_ when name = expanded_name -> + | (_, name, _, _)::_ when name = expanded_name -> pop l (fun () -> match !open_elements with | [] when not !is_fragment -> after_root_state () | _ -> content_state ()) - | (l', _, name)::_ -> + | (l', _, name, _)::_ -> report l' (`Unmatched_start_tag name) !throw (fun () -> pop l pop_until_match) @@ -225,7 +230,7 @@ let parse context namespace report tokens = let rec pop_stack () = match !open_elements with | [] -> emit_end () - | (l', _, raw_name)::_ -> + | (l', _, raw_name, _)::_ -> report l' (`Unmatched_start_tag raw_name) !throw (fun () -> pop l pop_stack) in diff --git a/src/xml_parser.mli b/src/xml_parser.mli index f6fa9e5..96a89ca 100644 --- a/src/xml_parser.mli +++ b/src/xml_parser.mli @@ -4,8 +4,9 @@ open Common val parse : + ?get_opens : ((unit -> open_elements) option ref) -> [< `Document | `Fragment ] option -> (string -> string option) -> - Error.parse_handler -> + (open_elements -> Error.parse_handler) -> (location * Xml_tokenizer.token) Kstream.t -> (location * signal) Kstream.t diff --git a/test/test_html_parser.ml b/test/test_html_parser.ml index c4be125..9ffcd76 100644 --- a/test/test_html_parser.ml +++ b/test/test_html_parser.ml @@ -26,7 +26,7 @@ let expect ?prefix ?(context = Some `Document) text signals = |> Markup__Encoding.utf_8 |> Markup__Input.preprocess is_valid_html_char Error.ignore_errors |> Markup__Html_tokenizer.tokenize Error.ignore_errors - |> Markup__Html_parser.parse context report + |> Markup__Html_parser.parse context (fun _ -> report) |> iter iterate; ended () diff --git a/test/test_xml_parser.ml b/test/test_xml_parser.ml index 21b5dd1..c262fda 100644 --- a/test/test_xml_parser.ml +++ b/test/test_xml_parser.ml @@ -24,7 +24,7 @@ let expect ?context ?(namespace = no_top_level_namespaces) text signals = |> Markup__Encoding.utf_8 |> Markup__Input.preprocess is_valid_xml_char Error.ignore_errors |> Markup__Xml_tokenizer.tokenize Error.ignore_errors no_custom_entities - |> Markup__Xml_parser.parse context namespace report + |> Markup__Xml_parser.parse context namespace (fun _ -> report) |> iter iterate; ended ()