From 49fbba73a3f5a599c6a29bf31b9fcd9b627ccf8b Mon Sep 17 00:00:00 2001 From: benwa Date: Tue, 30 May 2017 17:23:12 +0700 Subject: [PATCH] JAMES-2018 Manage list levels well --- .../jmap/utils/JsoupHtmlTextExtractor.java | 59 +++++++++++++++---- .../utils/JsoupHtmlTextExtractorTest.java | 22 +++++++ 2 files changed, 71 insertions(+), 10 deletions(-) diff --git a/server/protocols/jmap/src/main/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractor.java b/server/protocols/jmap/src/main/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractor.java index c153ae5bc39..d5b359e8155 100644 --- a/server/protocols/jmap/src/main/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractor.java +++ b/server/protocols/jmap/src/main/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractor.java @@ -22,6 +22,7 @@ import java.util.Optional; import java.util.stream.Stream; +import org.apache.commons.lang.StringUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; @@ -40,6 +41,7 @@ public class JsoupHtmlTextExtractor implements HtmlTextExtractor { public static final String P_TAG = "p"; public static final String IMG_TAG = "img"; public static final String ALT_TAG = "alt"; + public static final int INITIAL_LIST_NESTED_LEVEL = 0; @Override public String toPlainText(String html) { @@ -48,7 +50,7 @@ public String toPlainText(String html) { Element body = Optional.ofNullable(document.body()).orElse(document); - return flatten(body) + return flatten(body, INITIAL_LIST_NESTED_LEVEL) .map(this::convertNodeToText) .reduce("", (s1, s2) -> s1 + s2); } catch (Exception e) { @@ -57,7 +59,8 @@ public String toPlainText(String html) { } } - private String convertNodeToText(Node node) { + private String convertNodeToText(HTMLNode htmlNode) { + Node node = htmlNode.underlyingNode; if (node instanceof TextNode) { TextNode textNode = (TextNode) node; return textNode.getWholeText(); @@ -67,14 +70,14 @@ private String convertNodeToText(Node node) { if 
(element.tagName().equals(BR_TAG)) { return "\n"; } - if (element.tagName().equals(UL_TAG)) { - return "\n\n"; + if (isList(element)) { + return convertListElement(htmlNode.listNestedLevel); } if (element.tagName().equals(OL_TAG)) { return "\n\n"; } if (element.tagName().equals(LI_TAG)) { - return "\n - "; + return "\n" + StringUtils.repeat(" ", htmlNode.listNestedLevel) + "- "; } if (element.tagName().equals(P_TAG)) { return "\n\n"; @@ -86,21 +89,47 @@ private String convertNodeToText(Node node) { return ""; } - Stream<Node> flatten(Node base) { + private String convertListElement(int nestedLevel) { + if (nestedLevel == 0) { + return "\n\n"; + } else { + return ""; + } + } + + Stream<HTMLNode> flatten(Node base, int listNestedLevel) { Position position = getPosition(base); - Stream<Node> flatChildren = base.childNodes() + int nextElementLevel = getNewNestedLevel(listNestedLevel, base); + + Stream<HTMLNode> baseStream = Stream.of(new HTMLNode(base, listNestedLevel)); + Stream<HTMLNode> flatChildren = base.childNodes() .stream() - .flatMap(this::flatten); + .flatMap(node -> flatten(node, nextElementLevel)); + switch (position) { case PREFIX: - return Stream.concat(Stream.of(base), flatChildren); + return Stream.concat(baseStream, flatChildren); case SUFFIX: - return Stream.concat(flatChildren, Stream.of(base)); + return Stream.concat(flatChildren, baseStream); default: throw new RuntimeException("Unexpected POSITION for node element: " + position); } } + private int getNewNestedLevel(int listNestedLevel, Node node) { + if (node instanceof Element) { + Element element = (Element) node; + if (isList(element)) { + return listNestedLevel + 1; + } + } + return listNestedLevel; + } + + private boolean isList(Element element) { + return element.tagName().equals(UL_TAG) || element.tagName().equals(OL_TAG); + } + private enum Position { PREFIX, SUFFIX @@ -116,4 +145,14 @@ private Position getPosition(Node node) { return Position.SUFFIX; } + private static class HTMLNode { + private final Node underlyingNode; +
private final int listNestedLevel; + + public HTMLNode(Node underlyingNode, int listNestedLevel) { + this.underlyingNode = underlyingNode; + this.listNestedLevel = listNestedLevel; + } + } + } diff --git a/server/protocols/jmap/src/test/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractorTest.java b/server/protocols/jmap/src/test/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractorTest.java index 75ba62af2af..30e858a9cb5 100644 --- a/server/protocols/jmap/src/test/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractorTest.java +++ b/server/protocols/jmap/src/test/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractorTest.java @@ -154,6 +154,28 @@ public void imgShouldBeWellHandled() { assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); } + @Test + public void nestedListsShouldBeWellHandled() { + String html = "<ul>" + " <li>Coffee</li>" + " <li>Tea" + " <ul>" + " <li>Black tea</li>" + " <li>Green tea</li>" + " </ul>" + " </li>" + " <li>Milk</li>" + "</ul>"; String expectedPlainText = " \n" + " - Coffee \n" + " - Tea \n" + " - Black tea \n" + " - Green tea \n" + " - Milk\n" + "\n"; assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); + } + @Test public void nonClosedHtmlShouldBeTranslated() { String html = "This is an HTML text !";