Skip to content

Commit

Permalink
JAMES-2018 Use Jsoup in JMAP project
Browse files Browse the repository at this point in the history
  • Loading branch information
chibenwa committed Jun 1, 2017
1 parent 1925eeb commit 84d7a31
Show file tree
Hide file tree
Showing 7 changed files with 104 additions and 49 deletions.
Expand Up @@ -32,7 +32,7 @@
import org.apache.james.jmap.methods.RequestHandler; import org.apache.james.jmap.methods.RequestHandler;
import org.apache.james.jmap.send.PostDequeueDecoratorFactory; import org.apache.james.jmap.send.PostDequeueDecoratorFactory;
import org.apache.james.jmap.utils.HtmlTextExtractor; import org.apache.james.jmap.utils.HtmlTextExtractor;
import org.apache.james.jmap.utils.MailboxBasedHtmlTextExtractor; import org.apache.james.jmap.utils.JsoupHtmlTextExtractor;
import org.apache.james.jmap.utils.SystemMailboxesProvider; import org.apache.james.jmap.utils.SystemMailboxesProvider;
import org.apache.james.jmap.utils.SystemMailboxesProviderImpl; import org.apache.james.jmap.utils.SystemMailboxesProviderImpl;
import org.apache.james.jwt.JwtConfiguration; import org.apache.james.jwt.JwtConfiguration;
Expand Down Expand Up @@ -84,10 +84,10 @@ protected void configure() {
bind(JMAPServer.class).in(Scopes.SINGLETON); bind(JMAPServer.class).in(Scopes.SINGLETON);
bind(RequestHandler.class).in(Scopes.SINGLETON); bind(RequestHandler.class).in(Scopes.SINGLETON);
bind(UploadHandler.class).in(Scopes.SINGLETON); bind(UploadHandler.class).in(Scopes.SINGLETON);
bind(MailboxBasedHtmlTextExtractor.class).in(Scopes.SINGLETON); bind(JsoupHtmlTextExtractor.class).in(Scopes.SINGLETON);
bind(SystemMailboxesProviderImpl.class).in(Scopes.SINGLETON); bind(SystemMailboxesProviderImpl.class).in(Scopes.SINGLETON);


bind(HtmlTextExtractor.class).to(MailboxBasedHtmlTextExtractor.class); bind(HtmlTextExtractor.class).to(JsoupHtmlTextExtractor.class);
Multibinder.newSetBinder(binder(), ConfigurationPerformer.class).addBinding().to(RequiredCapabilitiesPrecondition.class); Multibinder.newSetBinder(binder(), ConfigurationPerformer.class).addBinding().to(RequiredCapabilitiesPrecondition.class);


Multibinder<CamelMailetContainerModule.TransportProcessorCheck> transportProcessorChecks = Multibinder.newSetBinder(binder(), CamelMailetContainerModule.TransportProcessorCheck.class); Multibinder<CamelMailetContainerModule.TransportProcessorCheck> transportProcessorChecks = Multibinder.newSetBinder(binder(), CamelMailetContainerModule.TransportProcessorCheck.class);
Expand Down
5 changes: 5 additions & 0 deletions server/protocols/jmap/pom.xml
Expand Up @@ -358,6 +358,11 @@
<artifactId>jgrapht-core</artifactId> <artifactId>jgrapht-core</artifactId>
<version>0.9.1</version> <version>0.9.1</version>
</dependency> </dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.9.2</version>
</dependency>
<dependency> <dependency>
<groupId>org.mockito</groupId> <groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId> <artifactId>mockito-core</artifactId>
Expand Down
@@ -0,0 +1,78 @@
/****************************************************************
* Licensed to the Apache Software Foundation (ASF) under one *
* or more contributor license agreements. See the NOTICE file *
* distributed with this work for additional information *
* regarding copyright ownership. The ASF licenses this file *
* to you under the Apache License, Version 2.0 (the *
* "License"); you may not use this file except in compliance *
* with the License. You may obtain a copy of the License at *
* *
* http://www.apache.org/licenses/LICENSE-2.0 *
* *
* Unless required by applicable law or agreed to in writing, *
* software distributed under the License is distributed on an *
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY *
* KIND, either express or implied. See the License for the *
* specific language governing permissions and limitations *
* under the License. *
****************************************************************/

package org.apache.james.jmap.utils;

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Deque;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * {@link HtmlTextExtractor} that parses HTML with Jsoup and rebuilds a plain
 * text rendering from the parsed node tree.
 *
 * <p>Text nodes contribute their whole text, {@code <br>} contributes a single
 * line feed and {@code <p>} contributes two line feeds. Nodes are visited in
 * post-order, so the line feeds of a paragraph are emitted after its content.
 */
public class JsoupHtmlTextExtractor implements HtmlTextExtractor {

    private static final Logger LOGGER = LoggerFactory.getLogger(JsoupHtmlTextExtractor.class);

    private static final String BR_TAG = "br";
    private static final String P_TAG = "p";

    /**
     * Converts an HTML fragment or document into plain text.
     *
     * @param html the HTML to convert
     * @return the extracted text; when extraction throws an exception the
     *         input is returned unchanged (best-effort behaviour) and a
     *         warning is logged
     */
    @Override
    public String toPlainText(String html) {
        try {
            Document document = Jsoup.parse(html);

            // Fall back on the document itself when no <body> element is available.
            Element body = Optional.ofNullable(document.body()).orElse(document);

            // joining() accumulates in a single builder instead of re-copying
            // the whole accumulated string for every visited node.
            return flatten(body)
                .map(this::convertNodeToText)
                .collect(Collectors.joining());
        } catch (Exception e) {
            LOGGER.warn("Failed extracting text from html", e);
            return html;
        }
    }

    /**
     * Computes the plain text contribution of a single node.
     *
     * @param node the node to convert
     * @return the node text for a text node, {@code "\n"} for a {@code br}
     *         element, {@code "\n\n"} for a {@code p} element, and an empty
     *         string otherwise
     */
    private String convertNodeToText(Node node) {
        if (node instanceof TextNode) {
            TextNode textNode = (TextNode) node;
            return textNode.getWholeText();
        }
        if (node instanceof Element) {
            Element element = (Element) node;
            if (BR_TAG.equals(element.tagName())) {
                return "\n";
            }
            if (P_TAG.equals(element.tagName())) {
                return "\n\n";
            }
        }
        return "";
    }

    /**
     * Lists {@code base} and all its descendants in post-order: the children
     * of a node, left to right, come before the node itself.
     *
     * <p>The traversal is iterative, so the depth of the HTML nesting (which
     * is controlled by the message author) cannot exhaust the call stack.
     *
     * @param base the root of the subtree to flatten
     * @return a stream over the subtree nodes, {@code base} being the last one
     */
    Stream<Node> flatten(Node base) {
        List<Node> visited = new ArrayList<>();
        Deque<Node> pending = new ArrayDeque<>();
        pending.push(base);

        // Visit "node, then children right to left"; reversing that sequence
        // yields the post-order "children left to right, then node".
        while (!pending.isEmpty()) {
            Node current = pending.pop();
            visited.add(current);
            for (Node child : current.childNodes()) {
                pending.push(child);
            }
        }

        Collections.reverse(visited);
        return visited.stream();
    }

}

This file was deleted.

Expand Up @@ -41,7 +41,7 @@
import org.apache.james.jmap.model.MessagePreviewGenerator; import org.apache.james.jmap.model.MessagePreviewGenerator;
import org.apache.james.jmap.model.MessageProperties.MessageProperty; import org.apache.james.jmap.model.MessageProperties.MessageProperty;
import org.apache.james.jmap.utils.HtmlTextExtractor; import org.apache.james.jmap.utils.HtmlTextExtractor;
import org.apache.james.jmap.utils.MailboxBasedHtmlTextExtractor; import org.apache.james.jmap.utils.JsoupHtmlTextExtractor;
import org.apache.james.mailbox.MailboxManager; import org.apache.james.mailbox.MailboxManager;
import org.apache.james.mailbox.MailboxSession; import org.apache.james.mailbox.MailboxSession;
import org.apache.james.mailbox.MessageIdManager; import org.apache.james.mailbox.MessageIdManager;
Expand All @@ -54,7 +54,6 @@
import org.apache.james.mailbox.model.ComposedMessageId; import org.apache.james.mailbox.model.ComposedMessageId;
import org.apache.james.mailbox.model.MailboxId; import org.apache.james.mailbox.model.MailboxId;
import org.apache.james.mailbox.model.MailboxPath; import org.apache.james.mailbox.model.MailboxPath;
import org.apache.james.mailbox.tika.extractor.TikaTextExtractor;
import org.apache.james.metrics.logger.DefaultMetricFactory; import org.apache.james.metrics.logger.DefaultMetricFactory;
import org.apache.james.util.mime.MessageContentExtractor; import org.apache.james.util.mime.MessageContentExtractor;
import org.assertj.core.api.Condition; import org.assertj.core.api.Condition;
Expand Down Expand Up @@ -116,7 +115,7 @@ public boolean isSameUser(String username) {
@Before @Before
public void setup() throws Exception { public void setup() throws Exception {
clientId = ClientId.of("#0"); clientId = ClientId.of("#0");
HtmlTextExtractor htmlTextExtractor = new MailboxBasedHtmlTextExtractor(new TikaTextExtractor()); HtmlTextExtractor htmlTextExtractor = new JsoupHtmlTextExtractor();
MessagePreviewGenerator messagePreview = new MessagePreviewGenerator(); MessagePreviewGenerator messagePreview = new MessagePreviewGenerator();
MessageContentExtractor messageContentExtractor = new MessageContentExtractor(); MessageContentExtractor messageContentExtractor = new MessageContentExtractor();
MessageFactory messageFactory = new MessageFactory(messagePreview, messageContentExtractor, htmlTextExtractor); MessageFactory messageFactory = new MessageFactory(messagePreview, messageContentExtractor, htmlTextExtractor);
Expand Down
Expand Up @@ -31,14 +31,13 @@
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.james.jmap.model.MessageFactory.MetaDataWithContent; import org.apache.james.jmap.model.MessageFactory.MetaDataWithContent;
import org.apache.james.jmap.utils.HtmlTextExtractor; import org.apache.james.jmap.utils.HtmlTextExtractor;
import org.apache.james.jmap.utils.MailboxBasedHtmlTextExtractor; import org.apache.james.jmap.utils.JsoupHtmlTextExtractor;
import org.apache.james.mailbox.MessageUid; import org.apache.james.mailbox.MessageUid;
import org.apache.james.mailbox.inmemory.InMemoryId; import org.apache.james.mailbox.inmemory.InMemoryId;
import org.apache.james.mailbox.model.AttachmentId; import org.apache.james.mailbox.model.AttachmentId;
import org.apache.james.mailbox.model.Cid; import org.apache.james.mailbox.model.Cid;
import org.apache.james.mailbox.model.MessageAttachment; import org.apache.james.mailbox.model.MessageAttachment;
import org.apache.james.mailbox.model.TestMessageId; import org.apache.james.mailbox.model.TestMessageId;
import org.apache.james.mailbox.tika.extractor.TikaTextExtractor;
import org.apache.james.util.mime.MessageContentExtractor; import org.apache.james.util.mime.MessageContentExtractor;
import org.junit.Before; import org.junit.Before;
import org.junit.Test; import org.junit.Test;
Expand All @@ -57,7 +56,7 @@ public class MessageFactoryTest {


@Before @Before
public void setUp() { public void setUp() {
htmlTextExtractor = new MailboxBasedHtmlTextExtractor(new TikaTextExtractor()); htmlTextExtractor = new JsoupHtmlTextExtractor();


messagePreview = new MessagePreviewGenerator(); messagePreview = new MessagePreviewGenerator();
MessageContentExtractor messageContentExtractor = new MessageContentExtractor(); MessageContentExtractor messageContentExtractor = new MessageContentExtractor();
Expand Down
Expand Up @@ -24,17 +24,16 @@
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;


import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.james.mailbox.tika.extractor.TikaTextExtractor;
import org.junit.Before; import org.junit.Before;
import org.junit.Test; import org.junit.Test;


public class MailboxBasedHtmlTextExtractorTest { public class JsoupHtmlTextExtractorTest {


private MailboxBasedHtmlTextExtractor textExtractor; private JsoupHtmlTextExtractor textExtractor;


@Before @Before
public void setUp() { public void setUp() {
textExtractor = new MailboxBasedHtmlTextExtractor(new TikaTextExtractor()); textExtractor = new JsoupHtmlTextExtractor();
} }


@Test @Test
Expand All @@ -53,14 +52,21 @@ public void toPlainTextShouldRemoveSimpleHtmlTag() {
@Test @Test
public void toPlainTextShouldReplaceSkipLine() { public void toPlainTextShouldReplaceSkipLine() {
String html = "<p>This is an<br/>HTML text !</p>"; String html = "<p>This is an<br/>HTML text !</p>";
String expectedPlainText = "This is an\nHTML text !\n"; String expectedPlainText = "This is an\nHTML text !\n\n";
assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText);
} }


@Test @Test
public void toPlainTextShouldSkipLinesBetweenParagraph() { public void toPlainTextShouldSkipLinesBetweenParagraph() {
String html = "<p>para1</p><p>para2</p>"; String html = "<p>para1</p><p>para2</p>";
String expectedPlainText = "para1\npara2\n"; String expectedPlainText = "para1\n\npara2\n\n";
assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText);
}

@Test
public void toPlainTextShouldConciderUpperCaseLabelsAsLowerCase() {
String html = "<P>para1</P><p>para2</p>";
String expectedPlainText = "para1\n\npara2\n\n";
assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText); assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText);
} }


Expand All @@ -85,19 +91,20 @@ public void toPlainTextShouldWorkWithMoreComplexHTML() throws Exception {
" Why a new Logo?\n" + " Why a new Logo?\n" +
"\n" + "\n" +
"\n" + "\n" +
"\n" +
" We are happy with our current logo, but for the\n" + " We are happy with our current logo, but for the\n" +
" upcoming James Server 3.0 release, we would like to\n" + " upcoming James Server 3.0 release, we would like to\n" +
" give our community the opportunity to create a new image for James.\n" + " give our community the opportunity to create a new image for James.\n" +
"\n" + "\n" +
"\n" + "\n" +
"\n" + "\n" +
"\n" +
" Don't be shy, take your inkscape and gimp, and send us on\n" + " Don't be shy, take your inkscape and gimp, and send us on\n" +
" the James Server User mailing list\n" + " the James Server User mailing list\n" +
" your creations. We will publish them on this page.\n" + " your creations. We will publish them on this page.\n" +
"\n" + "\n" +
"\n" + "\n" +
"\n" + "\n" +
"\n" +
" We need an horizontal logo (100p height) to be show displayed on the upper\n" + " We need an horizontal logo (100p height) to be show displayed on the upper\n" +
" left corner of this page, an avatar (48x48p) to be used on a Twitter stream for example.\n" + " left corner of this page, an avatar (48x48p) to be used on a Twitter stream for example.\n" +
" The used fonts should be redistributable (or commonly available on Windows and Linux).\n" + " The used fonts should be redistributable (or commonly available on Windows and Linux).\n" +
Expand Down

0 comments on commit 84d7a31

Please sign in to comment.