Skip to content

Commit

Permalink
Merge 90a2cef into d77da22
Browse files Browse the repository at this point in the history
  • Loading branch information
craff committed Jul 17, 2023
2 parents d77da22 + 90a2cef commit 9876dfb
Show file tree
Hide file tree
Showing 10 changed files with 84 additions and 32 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Expand Up @@ -25,7 +25,7 @@ jobs:
- uses: ocaml/setup-ocaml@v2
with:
ocaml-compiler: ${{matrix.ocaml}}
- run: sudo apt-get install python-bs4
- run: sudo apt-get install python3-bs4
- run: opam install --deps-only --with-test . --yes
- run: opam install js_of_ocaml --yes

Expand Down
2 changes: 2 additions & 0 deletions src/common.ml
Expand Up @@ -12,6 +12,8 @@ let compare_locations (line, column) (line', column') =
| order -> order

type name = string * string
type attributes = (name * string) list
type open_elements = (name * location * attributes) list

let xml_ns = "http://www.w3.org/XML/1998/namespace"
let xmlns_ns = "http://www.w3.org/2000/xmlns/"
Expand Down
23 changes: 16 additions & 7 deletions src/html_parser.ml
Expand Up @@ -1013,19 +1013,14 @@ end



let parse requested_context report (tokens, set_tokenizer_state, set_foreign) =
let parse ?(get_opens=ref None) requested_context report
(tokens, set_tokenizer_state, set_foreign) =
let context = Context.uninitialized () in

let throw = ref (fun _ -> ()) in
let ended = ref (fun _ -> ()) in
let output = ref (fun _ -> ()) in

let report_if = Error.report_if report in
let unmatched_end_tag l name k =
report l (`Unmatched_end_tag name) !throw k in
let misnested_tag l t context_name k =
report l (`Misnested_tag (t.name, context_name, t.Token_tag.attributes)) !throw k in

let open_elements = Stack.create () in
let active_formatting_elements = Active.create () in
let subtree_buffer = Subtree.create open_elements in
Expand All @@ -1040,6 +1035,20 @@ let parse requested_context report (tokens, set_tokenizer_state, set_foreign) =
set_foreign (fun () ->
Stack.current_element_is_foreign context open_elements);

let opens () = List.map
(fun e ->
let (ns, s) = e.element_name in
((Ns.to_string ns, s), e.location, e.attributes))
!open_elements
in
get_opens := Some opens;
let report l error throw k = report (opens ()) l error throw k in
let report_if = Error.report_if report in
let unmatched_end_tag l name k =
report l (`Unmatched_end_tag name) !throw k in
let misnested_tag l t context_name k =
report l (`Misnested_tag (t.name, context_name, t.Token_tag.attributes)) !throw k in

let report_if_stack_has_other_than names k =
let rec iterate = function
| [] -> k ()
Expand Down
3 changes: 2 additions & 1 deletion src/html_parser.mli
Expand Up @@ -4,8 +4,9 @@
open Common

val parse :
?get_opens:((unit -> open_elements) option ref) ->
[< `Document | `Fragment of string ] option ->
Error.parse_handler ->
(open_elements -> Error.parse_handler) ->
(location * Html_tokenizer.token) Kstream.t *
(Html_tokenizer.state -> unit) *
((unit -> bool) -> unit) ->
Expand Down
51 changes: 37 additions & 14 deletions src/markup.ml
Expand Up @@ -56,6 +56,8 @@ module Error = Error


type name = Common.name
type attributes = (name * string) list
type open_elements = (name * location * attributes) list

type xml_declaration = Common.xml_declaration =
{version : string;
Expand Down Expand Up @@ -91,11 +93,15 @@ struct
let parse_xml
report ?encoding namespace entity context source =
let with_encoding (encoding : Encoding.t) k =
let get_opens = ref None in
let parse = Xml_parser.parse ~get_opens context namespace report in
let report' x =
report (match !get_opens with None -> [] | Some f -> f ()) x in
source
|> encoding ~report
|> Input.preprocess Common.is_valid_xml_char report
|> Xml_tokenizer.tokenize report entity
|> Xml_parser.parse context namespace report
|> encoding ~report:report'
|> Input.preprocess Common.is_valid_xml_char report'
|> Xml_tokenizer.tokenize report' entity
|> parse
|> k
in

Expand All @@ -116,12 +122,16 @@ struct
|> Utility.strings_to_bytes

let parse_html report ?encoding context source =
let get_opens = ref None in
let parse = Html_parser.parse ~get_opens context report in
let report' x =
report (match !get_opens with None -> [] | Some f -> f ()) x in
let with_encoding (encoding : Encoding.t) k =
source
|> encoding ~report
|> Input.preprocess Common.is_valid_html_char report
|> Html_tokenizer.tokenize report
|> Html_parser.parse context report
|> encoding ~report:report'
|> Input.preprocess Common.is_valid_html_char report'
|> Html_tokenizer.tokenize report'
|> parse
|> k
in

Expand Down Expand Up @@ -190,6 +200,7 @@ sig

val parse_xml :
?report:(location -> Error.t -> unit io) ->
?detailed_report:(open_elements -> location -> Error.t -> unit io) ->
?encoding:Encoding.t ->
?namespace:(string -> string option) ->
?entity:(string -> string option) ->
Expand All @@ -203,6 +214,7 @@ sig

val parse_html :
?report:(location -> Error.t -> unit io) ->
?detailed_report:(open_elements -> location -> Error.t -> unit io) ->
?encoding:Encoding.t ->
?context:[< `Document | `Fragment of string ] ->
(char, _) stream -> async parser
Expand Down Expand Up @@ -248,6 +260,7 @@ end

module Asynchronous (IO : IO) =
struct
let wrap_report_ops report = fun ops l e -> IO.to_cps (fun () -> report ops l e)
let wrap_report report = fun l e -> IO.to_cps (fun () -> report l e)

module Encoding =
Expand All @@ -259,15 +272,20 @@ struct
end
let parse_xml
?(report = fun _ _ -> IO.return ())
?report ?detailed_report
?encoding
?(namespace = fun _ -> None)
?(entity = fun _ -> None)
?context
source =
let report = match detailed_report, report with
| Some f, None -> f
| None, Some f -> (fun _ -> f)
| Some _, Some _ -> invalid_arg "both report and detailed_report given"
| None, None -> (fun _ _ _ -> IO.return ())
in
Cps.parse_xml
(wrap_report report) ?encoding namespace entity context source
(wrap_report_ops report) ?encoding namespace entity context source
let write_xml
?(report = fun _ _ -> IO.return ())
Expand All @@ -277,12 +295,17 @@ struct
Cps.write_xml (wrap_report report) prefix signals
let parse_html
?(report = fun _ _ -> IO.return ())
?report ?detailed_report
?encoding
?context
source =
Cps.parse_html (wrap_report report) ?encoding context source
let report = match detailed_report, report with
| Some f, None -> f
| None, Some f -> (fun _ -> f)
| Some _, Some _ -> invalid_arg "both report and detailed_report given"
| None, None -> (fun _ _ _ -> IO.return ())
in
Cps.parse_html (wrap_report_ops report) ?encoding context source
let write_html ?escape_attribute ?escape_text signals =
Cps.write_html ?escape_attribute ?escape_text signals
Expand Down
11 changes: 11 additions & 0 deletions src/markup.mli
Expand Up @@ -228,6 +228,9 @@ end
(** {2 Signals} *)

type name = string * string
type attributes = (name * string) list
type open_elements = (name * location * attributes) list

(** Expanded name: a namespace URI followed by a local name. *)

type xml_declaration =
Expand Down Expand Up @@ -321,6 +324,7 @@ val location : _ parser -> location

val parse_xml :
?report:(location -> Error.t -> unit) ->
?detailed_report:(open_elements -> location -> Error.t -> unit) ->
?encoding:Encoding.t ->
?namespace:(string -> string option) ->
?entity:(string -> string option) ->
Expand All @@ -334,6 +338,10 @@ val parse_xml :
You may raise an exception in [report], and it will propagate to the code
reading the signal stream.
[~detailed_report] is similar to report, but receive the list of
open_elements. You must only give [~report] or [~detailed_report] but not
both.
If [~encoding] is {e not} specified, the parser detects the input encoding
automatically. Otherwise, the given encoding is used.
Expand Down Expand Up @@ -371,6 +379,7 @@ val write_xml :

val parse_html :
?report:(location -> Error.t -> unit) ->
?detailed_report:(open_elements -> location -> Error.t -> unit) ->
?encoding:Encoding.t ->
?context:[< `Document | `Fragment of string ] ->
(char, 's) stream -> 's parser
Expand Down Expand Up @@ -826,6 +835,7 @@ sig

val parse_xml :
?report:(location -> Error.t -> unit io) ->
?detailed_report:(open_elements -> location -> Error.t -> unit io) ->
?encoding:Encoding.t ->
?namespace:(string -> string option) ->
?entity:(string -> string option) ->
Expand All @@ -841,6 +851,7 @@ sig

val parse_html :
?report:(location -> Error.t -> unit io) ->
?detailed_report:(open_elements -> location -> Error.t -> unit io) ->
?encoding:Encoding.t ->
?context:[< `Document | `Fragment of string ] ->
(char, _) stream -> async parser
Expand Down
17 changes: 11 additions & 6 deletions src/xml_parser.ml
Expand Up @@ -7,8 +7,13 @@ open Token_tag

let is_whitespace_only strings = List.for_all is_whitespace_only strings

let parse context namespace report tokens =
let parse ?(get_opens=ref None) context namespace report tokens =
let open_elements = ref [] in
let opens () =
List.map (fun (l,name,_,attrs) -> (name,l,attrs)) !open_elements
in
get_opens := Some opens;
let report l error throw k = report (opens ()) l error throw k in
let namespaces = Namespace.Parsing.init namespace in
let is_fragment = ref false in
let fragment_allowed = ref true in
Expand Down Expand Up @@ -45,7 +50,7 @@ let parse context namespace report tokens =
in

deduplicate [] attributes (fun attributes ->
open_elements := (l, expanded_name, raw_name)::!open_elements;
open_elements := (l, expanded_name, raw_name, attributes)::!open_elements;
emit l (`Start_element (expanded_name, attributes)) state))

and pop l state =
Expand Down Expand Up @@ -190,21 +195,21 @@ let parse context namespace report tokens =

let is_on_stack =
!open_elements
|> List.exists (fun (_, name, _) -> name = expanded_name)
|> List.exists (fun (_, name, _, _) -> name = expanded_name)
in

if not is_on_stack then
report l (`Unmatched_end_tag raw_name) !throw content_state
else
let rec pop_until_match () =
match !open_elements with
| (_, name, _)::_ when name = expanded_name ->
| (_, name, _, _)::_ when name = expanded_name ->
pop l (fun () ->
match !open_elements with
| [] when not !is_fragment -> after_root_state ()
| _ -> content_state ())

| (l', _, name)::_ ->
| (l', _, name, _)::_ ->
report l' (`Unmatched_start_tag name) !throw (fun () ->
pop l pop_until_match)

Expand All @@ -225,7 +230,7 @@ let parse context namespace report tokens =
let rec pop_stack () =
match !open_elements with
| [] -> emit_end ()
| (l', _, raw_name)::_ ->
| (l', _, raw_name, _)::_ ->
report l' (`Unmatched_start_tag raw_name) !throw (fun () ->
pop l pop_stack)
in
Expand Down
3 changes: 2 additions & 1 deletion src/xml_parser.mli
Expand Up @@ -4,8 +4,9 @@
open Common

val parse :
?get_opens : ((unit -> open_elements) option ref) ->
[< `Document | `Fragment ] option ->
(string -> string option) ->
Error.parse_handler ->
(open_elements -> Error.parse_handler) ->
(location * Xml_tokenizer.token) Kstream.t ->
(location * signal) Kstream.t
2 changes: 1 addition & 1 deletion test/test_html_parser.ml
Expand Up @@ -26,7 +26,7 @@ let expect ?prefix ?(context = Some `Document) text signals =
|> Markup__Encoding.utf_8
|> Markup__Input.preprocess is_valid_html_char Error.ignore_errors
|> Markup__Html_tokenizer.tokenize Error.ignore_errors
|> Markup__Html_parser.parse context report
|> Markup__Html_parser.parse context (fun _ -> report)
|> iter iterate;

ended ()
Expand Down
2 changes: 1 addition & 1 deletion test/test_xml_parser.ml
Expand Up @@ -24,7 +24,7 @@ let expect ?context ?(namespace = no_top_level_namespaces) text signals =
|> Markup__Encoding.utf_8
|> Markup__Input.preprocess is_valid_xml_char Error.ignore_errors
|> Markup__Xml_tokenizer.tokenize Error.ignore_errors no_custom_entities
|> Markup__Xml_parser.parse context namespace report
|> Markup__Xml_parser.parse context namespace (fun _ -> report)
|> iter iterate;

ended ()
Expand Down

0 comments on commit 9876dfb

Please sign in to comment.