Skip to content

Commit

Permalink
Merge b635e4f into f2a99b4
Browse files Browse the repository at this point in the history
  • Loading branch information
aantron committed Sep 17, 2018
2 parents f2a99b4 + b635e4f commit aea6ecb
Show file tree
Hide file tree
Showing 5 changed files with 274 additions and 73 deletions.
45 changes: 39 additions & 6 deletions src/markup.mli
Expand Up @@ -666,15 +666,26 @@ val text : ([< signal ], 's) stream -> (char, 's) stream
[`Text ss] signal, the result stream has the bytes of the strings [ss], and
all other signals are ignored. *)

val trim : ([> `Text of string list ] as 'a, 's) stream -> ('a, 's) stream
(** Trims whitespace in a signal stream. For each signal [`Text ss], transforms
[ss] so that the result strings [ss'] satisfy
val trim : ([> content_signal ] as 'a, 's) stream -> ('a, 's) stream
(** Trims insignificant whitespace in an HTML signal stream. Whitespace around
flow ("block") content does not matter, but whitespace in phrasing
("inline") content does. So, if the input stream is
{[
String.concat "" ss' = String.trim (String.concat "" ss)
<div>
<p>
<em>foo</em> bar
</p>
</div>
]}
All signals for which [String.concat "" ss' = ""] are then dropped. *)
passing it through [Markup.trim] will result in
{[
<div><p><em>foo</em> bar</p></div>
]}
Note that whitespace around the [</em>] tag was preserved. *)

val normalize_text :
([> `Text of string list ] as 'a, 's) stream -> ('a, 's) stream
Expand All @@ -688,7 +699,29 @@ val normalize_text :
val pretty_print : ([> content_signal ] as 'a, 's) stream -> ('a, 's) stream
(** Adjusts the whitespace in the [`Text] signals in the given stream so that
the output appears nicely-indented when the stream is converted to bytes and
written. *)
written.
This function is aware of the significance of whitespace in HTML, so it
avoids changing the whitespace in phrasing ("inline") content. For example,
pretty printing
{[
<div><p><em>foo</em>bar</p></div>
]}
results in
{[
<div>
<p>
<em>foo</em>bar
</p>
</div>
]}
Note that no whitespace was inserted around [<em>] and [</em>], because
doing so would create a word break that wasn't present in the original
stream. *)

val html5 : ([< signal ], 's) stream -> (signal, 's) stream
(** Converts a signal stream into an HTML5 signal stream by stripping any
Expand Down
177 changes: 141 additions & 36 deletions src/utility.ml
Expand Up @@ -187,28 +187,6 @@ let text s =
|> unwrap_lists
|> strings_to_bytes

let trim s =
let rec trim_string_list trim = function
| [] -> []
| s::more ->
match trim s with
| "" -> trim_string_list trim more
| s -> s::more
in

s |> filter_map (fun v _ k ->
match v with
| `Text ss ->
ss
|> trim_string_list trim_string_left
|> List.rev
|> trim_string_list trim_string_right
|> List.rev
|> (function
| [] -> k None
| ss -> k (Some (`Text ss)))
| _ -> k (Some v))

let normalize_text s =
let rec match_text acc throw e k =
next_option s throw begin function
Expand Down Expand Up @@ -236,32 +214,159 @@ let normalize_text s =

make match_other

let tab_width = 2
let is_phrasing_element (namespace, element_name) =
if namespace <> html_ns then
false
else
match element_name with
| "a" | "abbr" | "b" | "bdi" | "bdo" | "br" | "button" | "cite" | "code"
| "data" | "dfn" | "em" | "i" | "img" | "input" | "kbd" | "label" | "mark"
| "pre" | "q" | "rb" | "rt" | "ruby" | "s" | "samp" | "select" | "small"
| "span" | "strong" | "sub" | "sup" | "textarea" | "time" | "u" | "var"
| "wbr" ->
true
| _ ->
false

let rec trim_string_list trim = function
| [] -> []
| s::more ->
match trim s with
| "" -> trim_string_list trim more
| s -> s::more

let trim signals =
let signals = normalize_text signals in

let signals_and_flow : ('signal * bool) Kstream.t =
Kstream.transform begin fun phrasing_nesting_level signal _throw k ->
match signal with
| `Start_element (name, _) ->
if phrasing_nesting_level > 0 then
k ([signal, false], Some (phrasing_nesting_level + 1))
else
if is_phrasing_element name then
k ([signal, false], Some 1)
else
k ([signal, true], Some 0)

| `End_element ->
if phrasing_nesting_level > 0 then
k ([signal, false], Some (phrasing_nesting_level - 1))
else
k ([signal, true], Some 0)

| _ ->
k ([signal, false], Some phrasing_nesting_level)
end 0 signals
in

let signals =
Kstream.transform begin fun saw_flow_tag (signal, is_flow_tag) throw k ->
match signal with
| `Text ss ->
let ss =
if saw_flow_tag then
trim_string_list Common.trim_string_left ss
else
ss
in

Kstream.peek_option signals_and_flow throw (fun maybe_signal ->
let ss =
match maybe_signal with
| Some (_, true) ->
ss
|> List.rev
|> trim_string_list Common.trim_string_right
|> List.rev
| _ ->
ss
in

k ([`Text ss], Some false))

| _ ->
k ([signal], Some is_flow_tag)
end true signals_and_flow
in

let pretty_print s =
let s = s |> normalize_text |> trim in
normalize_text signals

let tab_width = 1

let pretty_print signals =
let signals = trim signals in

let indent n =
let n = if n < 0 then 0 else n in
String.make (n * tab_width) ' '
in

let rec current_state = ref (fun throw e k -> row 0 throw e k)
let rec current_state = ref (fun throw e k -> flow 0 throw e k)

and row depth throw e k =
next s throw e begin fun v ->
match v with
| `Start_element _ ->
list [`Text [indent depth]; v; `Text ["\n"]]
(row (depth + 1)) throw e k
and flow indentation throw e k =
next signals throw e begin fun signal ->
match signal with
| `Start_element (name, _) when not @@ is_phrasing_element name ->
(* If the next signal is `End_element, don't insert a line break. This
is mainly for collapsing inherently empty tags like <meta> and
<br>. *)
peek_expected signals throw begin fun next_signal ->
match next_signal with
| `End_element ->
next_expected signals throw begin fun _ ->
list
[`Text [indent indentation]; signal; next_signal; `Text ["\n"]]
(flow indentation) throw e k
end

| _ ->
list
[`Text [indent indentation]; signal; `Text ["\n"]]
(flow (indentation + 1)) throw e k
end

| `End_element ->
list [`Text [indent (depth - 1)]; v; `Text ["\n"]]
(row (depth - 1)) throw e k
list
[`Text [indent (indentation - 1)]; signal; `Text ["\n"]]
(flow (indentation - 1)) throw e k

| `Start_element _ | `Text _ ->
push signals signal;
list
[`Text [indent indentation]]
(phrasing indentation 0) throw e k

| _ ->
list
[signal]
(flow indentation) throw e k
end

and phrasing indentation phrasing_nesting_level throw e k =
next signals throw e begin fun signal ->
match signal with
| `Start_element (name, _) when is_phrasing_element name ->
list
[signal]
(phrasing indentation (phrasing_nesting_level + 1)) throw e k

| `End_element when phrasing_nesting_level > 0 ->
list
[signal]
(phrasing indentation (phrasing_nesting_level - 1)) throw e k

| `Text _ ->
list
[signal]
(phrasing indentation phrasing_nesting_level) throw e k

| _ ->
list [`Text [indent depth]; v; `Text ["\n"]]
(row depth) throw e k
push signals signal;
list
[`Text ["\n"]]
(flow indentation) throw e k
end

and list signals state throw e k =
Expand Down
2 changes: 1 addition & 1 deletion test/dune
Expand Up @@ -6,4 +6,4 @@
(alias
(name runtest)
(package markup)
(action (run %{exe:test.exe})))
(action (run %{exe:test.exe} -runner sequential)))
4 changes: 2 additions & 2 deletions test/test_integration.ml
Expand Up @@ -69,8 +69,8 @@ let tests = [
|> write_xml
|> to_string
|> assert_equal
("<root>\n foo\n <nested>\n bar\n </nested>\n" ^
" <nested>\n baz\n </nested>\n</root>\n"));
("<root>\n foo\n <nested>\n bar\n </nested>\n" ^
" <nested>\n baz\n </nested>\n</root>\n"));

("integration.locations" >:: fun _ ->
let parser = "<root>foo</root>" |> string |> parse_xml in
Expand Down

0 comments on commit aea6ecb

Please sign in to comment.