diff --git a/server/container/guice/protocols/jmap/src/main/java/org/apache/james/jmap/JMAPModule.java b/server/container/guice/protocols/jmap/src/main/java/org/apache/james/jmap/JMAPModule.java index a43b0d694b0..a04a866d6f5 100644 --- a/server/container/guice/protocols/jmap/src/main/java/org/apache/james/jmap/JMAPModule.java +++ b/server/container/guice/protocols/jmap/src/main/java/org/apache/james/jmap/JMAPModule.java @@ -32,7 +32,7 @@ import org.apache.james.jmap.methods.RequestHandler; import org.apache.james.jmap.send.PostDequeueDecoratorFactory; import org.apache.james.jmap.utils.HtmlTextExtractor; -import org.apache.james.jmap.utils.MailboxBasedHtmlTextExtractor; +import org.apache.james.jmap.utils.JsoupHtmlTextExtractor; import org.apache.james.jmap.utils.SystemMailboxesProvider; import org.apache.james.jmap.utils.SystemMailboxesProviderImpl; import org.apache.james.jwt.JwtConfiguration; @@ -84,10 +84,10 @@ protected void configure() { bind(JMAPServer.class).in(Scopes.SINGLETON); bind(RequestHandler.class).in(Scopes.SINGLETON); bind(UploadHandler.class).in(Scopes.SINGLETON); - bind(MailboxBasedHtmlTextExtractor.class).in(Scopes.SINGLETON); + bind(JsoupHtmlTextExtractor.class).in(Scopes.SINGLETON); bind(SystemMailboxesProviderImpl.class).in(Scopes.SINGLETON); - bind(HtmlTextExtractor.class).to(MailboxBasedHtmlTextExtractor.class); + bind(HtmlTextExtractor.class).to(JsoupHtmlTextExtractor.class); Multibinder.newSetBinder(binder(), ConfigurationPerformer.class).addBinding().to(RequiredCapabilitiesPrecondition.class); Multibinder transportProcessorChecks = Multibinder.newSetBinder(binder(), CamelMailetContainerModule.TransportProcessorCheck.class); diff --git a/server/protocols/jmap/pom.xml b/server/protocols/jmap/pom.xml index cf74ca93351..4546b5036ec 100644 --- a/server/protocols/jmap/pom.xml +++ b/server/protocols/jmap/pom.xml @@ -358,6 +358,11 @@ jgrapht-core 0.9.1 + + org.jsoup + jsoup + 1.9.2 + org.mockito mockito-core diff --git 
a/server/protocols/jmap/src/main/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractor.java b/server/protocols/jmap/src/main/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractor.java new file mode 100644 index 00000000000..912a617cc77 --- /dev/null +++ b/server/protocols/jmap/src/main/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractor.java @@ -0,0 +1,78 @@ +/**************************************************************** + * Licensed to the Apache Software Foundation (ASF) under one * + * or more contributor license agreements. See the NOTICE file * + * distributed with this work for additional information * + * regarding copyright ownership. The ASF licenses this file * + * to you under the Apache License, Version 2.0 (the * + * "License"); you may not use this file except in compliance * + * with the License. You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, * + * software distributed under the License is distributed on an * + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * + * KIND, either express or implied. See the License for the * + * specific language governing permissions and limitations * + * under the License. 
* + ****************************************************************/ + +package org.apache.james.jmap.utils; + +import java.util.Optional; +import java.util.stream.Stream; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; +import org.jsoup.nodes.TextNode; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class JsoupHtmlTextExtractor implements HtmlTextExtractor { + + private static final Logger LOGGER = LoggerFactory.getLogger(JsoupHtmlTextExtractor.class); + + @Override + public String toPlainText(String html) { + try { + Document document = Jsoup.parse(html); + + Element body = Optional.ofNullable(document.body()).orElse(document); + + return flatten(body) + .map(this::convertNodeToText) + .reduce("", (s1, s2) -> s1 + s2); + } catch (Exception e) { + LOGGER.warn("Failed extracting text from html", e); + return html; + } + } + + private String convertNodeToText(Node node) { + if (node instanceof TextNode) { + TextNode textNode = (TextNode) node; + return textNode.getWholeText(); + } + if (node instanceof Element) { + Element element = (Element) node; + if (element.tagName().equals("br")) { + return "\n"; + } + if (element.tagName().equals("p")) { + return "\n\n"; + } + } + return ""; + } + + Stream flatten(Node base) { + return Stream.concat( + base.childNodes() + .stream() + .flatMap(this::flatten), + Stream.of(base)); + } + +} diff --git a/server/protocols/jmap/src/main/java/org/apache/james/jmap/utils/MailboxBasedHtmlTextExtractor.java b/server/protocols/jmap/src/main/java/org/apache/james/jmap/utils/MailboxBasedHtmlTextExtractor.java deleted file mode 100644 index 99cb01e3ffc..00000000000 --- a/server/protocols/jmap/src/main/java/org/apache/james/jmap/utils/MailboxBasedHtmlTextExtractor.java +++ /dev/null @@ -1,33 +0,0 @@ - - -package org.apache.james.jmap.utils; - -import java.io.ByteArrayInputStream; - -import javax.inject.Inject; - -import 
org.apache.james.mailbox.extractor.TextExtractor; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class MailboxBasedHtmlTextExtractor implements HtmlTextExtractor { - - private static final Logger LOGGER = LoggerFactory.getLogger(MailboxBasedHtmlTextExtractor.class); - - private final TextExtractor textExtractor; - - @Inject - public MailboxBasedHtmlTextExtractor(TextExtractor textExtractor) { - this.textExtractor = textExtractor; - } - - @Override - public String toPlainText(String html) { - try { - return textExtractor.extractContent(new ByteArrayInputStream(html.getBytes()), "text/html", "").getTextualContent(); - } catch (Exception e) { - LOGGER.warn("Error extracting text from HTML", e); - return html; - } - } -} diff --git a/server/protocols/jmap/src/test/java/org/apache/james/jmap/methods/GetMessagesMethodTest.java b/server/protocols/jmap/src/test/java/org/apache/james/jmap/methods/GetMessagesMethodTest.java index db400db86f5..7a0ab78f490 100644 --- a/server/protocols/jmap/src/test/java/org/apache/james/jmap/methods/GetMessagesMethodTest.java +++ b/server/protocols/jmap/src/test/java/org/apache/james/jmap/methods/GetMessagesMethodTest.java @@ -41,7 +41,7 @@ import org.apache.james.jmap.model.MessagePreviewGenerator; import org.apache.james.jmap.model.MessageProperties.MessageProperty; import org.apache.james.jmap.utils.HtmlTextExtractor; -import org.apache.james.jmap.utils.MailboxBasedHtmlTextExtractor; +import org.apache.james.jmap.utils.JsoupHtmlTextExtractor; import org.apache.james.mailbox.MailboxManager; import org.apache.james.mailbox.MailboxSession; import org.apache.james.mailbox.MessageIdManager; @@ -54,7 +54,6 @@ import org.apache.james.mailbox.model.ComposedMessageId; import org.apache.james.mailbox.model.MailboxId; import org.apache.james.mailbox.model.MailboxPath; -import org.apache.james.mailbox.tika.extractor.TikaTextExtractor; import org.apache.james.metrics.logger.DefaultMetricFactory; import 
org.apache.james.util.mime.MessageContentExtractor; import org.assertj.core.api.Condition; @@ -116,7 +115,7 @@ public boolean isSameUser(String username) { @Before public void setup() throws Exception { clientId = ClientId.of("#0"); - HtmlTextExtractor htmlTextExtractor = new MailboxBasedHtmlTextExtractor(new TikaTextExtractor()); + HtmlTextExtractor htmlTextExtractor = new JsoupHtmlTextExtractor(); MessagePreviewGenerator messagePreview = new MessagePreviewGenerator(); MessageContentExtractor messageContentExtractor = new MessageContentExtractor(); MessageFactory messageFactory = new MessageFactory(messagePreview, messageContentExtractor, htmlTextExtractor); diff --git a/server/protocols/jmap/src/test/java/org/apache/james/jmap/model/MessageFactoryTest.java b/server/protocols/jmap/src/test/java/org/apache/james/jmap/model/MessageFactoryTest.java index f95998f449c..cc65cc7d188 100644 --- a/server/protocols/jmap/src/test/java/org/apache/james/jmap/model/MessageFactoryTest.java +++ b/server/protocols/jmap/src/test/java/org/apache/james/jmap/model/MessageFactoryTest.java @@ -31,14 +31,13 @@ import org.apache.commons.lang3.StringUtils; import org.apache.james.jmap.model.MessageFactory.MetaDataWithContent; import org.apache.james.jmap.utils.HtmlTextExtractor; -import org.apache.james.jmap.utils.MailboxBasedHtmlTextExtractor; +import org.apache.james.jmap.utils.JsoupHtmlTextExtractor; import org.apache.james.mailbox.MessageUid; import org.apache.james.mailbox.inmemory.InMemoryId; import org.apache.james.mailbox.model.AttachmentId; import org.apache.james.mailbox.model.Cid; import org.apache.james.mailbox.model.MessageAttachment; import org.apache.james.mailbox.model.TestMessageId; -import org.apache.james.mailbox.tika.extractor.TikaTextExtractor; import org.apache.james.util.mime.MessageContentExtractor; import org.junit.Before; import org.junit.Test; @@ -57,7 +56,7 @@ public class MessageFactoryTest { @Before public void setUp() { - htmlTextExtractor = new 
MailboxBasedHtmlTextExtractor(new TikaTextExtractor()); + htmlTextExtractor = new JsoupHtmlTextExtractor(); messagePreview = new MessagePreviewGenerator(); MessageContentExtractor messageContentExtractor = new MessageContentExtractor(); diff --git a/server/protocols/jmap/src/test/java/org/apache/james/jmap/utils/MailboxBasedHtmlTextExtractorTest.java b/server/protocols/jmap/src/test/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractorTest.java similarity index 88% rename from server/protocols/jmap/src/test/java/org/apache/james/jmap/utils/MailboxBasedHtmlTextExtractorTest.java rename to server/protocols/jmap/src/test/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractorTest.java index 9a44dde4c6d..28e9d1dfc1a 100644 --- a/server/protocols/jmap/src/test/java/org/apache/james/jmap/utils/MailboxBasedHtmlTextExtractorTest.java +++ b/server/protocols/jmap/src/test/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractorTest.java @@ -24,17 +24,16 @@ import java.nio.charset.StandardCharsets; import org.apache.commons.io.IOUtils; -import org.apache.james.mailbox.tika.extractor.TikaTextExtractor; import org.junit.Before; import org.junit.Test; -public class MailboxBasedHtmlTextExtractorTest { +public class JsoupHtmlTextExtractorTest { - private MailboxBasedHtmlTextExtractor textExtractor; + private JsoupHtmlTextExtractor textExtractor; @Before public void setUp() { - textExtractor = new MailboxBasedHtmlTextExtractor(new TikaTextExtractor()); + textExtractor = new JsoupHtmlTextExtractor(); } @Test @@ -53,14 +52,21 @@ public void toPlainTextShouldRemoveSimpleHtmlTag() { @Test public void toPlainTextShouldReplaceSkipLine() { String html = "

This is an
HTML text !

"; - String expectedPlainText = "This is an\nHTML text !\n"; + String expectedPlainText = "This is an\nHTML text !\n\n"; assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); } @Test public void toPlainTextShouldSkipLinesBetweenParagraph() { String html = "

para1

para2

"; - String expectedPlainText = "para1\npara2\n"; + String expectedPlainText = "para1\n\npara2\n\n"; + assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); + } + + @Test + public void toPlainTextShouldConciderUpperCaseLabelsAsLowerCase() { + String html = "

para1

para2

"; + String expectedPlainText = "para1\n\npara2\n\n"; assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); } @@ -85,19 +91,20 @@ public void toPlainTextShouldWorkWithMoreComplexHTML() throws Exception { " Why a new Logo?\n" + "\n" + "\n" + - "\n" + " We are happy with our current logo, but for the\n" + " upcoming James Server 3.0 release, we would like to\n" + " give our community the opportunity to create a new image for James.\n" + "\n" + "\n" + "\n" + + "\n" + " Don't be shy, take your inkscape and gimp, and send us on\n" + " the James Server User mailing list\n" + " your creations. We will publish them on this page.\n" + "\n" + "\n" + "\n" + + "\n" + " We need an horizontal logo (100p height) to be show displayed on the upper\n" + " left corner of this page, an avatar (48x48p) to be used on a Twitter stream for example.\n" + " The used fonts should be redistributable (or commonly available on Windows and Linux).\n" +