From fec80a1308bbc8328cc5b9a9382fba7c7bf5980b Mon Sep 17 00:00:00 2001 From: Antoine Duprat Date: Fri, 8 Sep 2017 14:12:17 +0200 Subject: [PATCH] JAMES-2142 Text content in attachments may be empty --- .../mailbox/extractor/ParsedContent.java | 7 +-- .../mailbox/elasticsearch/json/MimePart.java | 2 +- .../elasticsearch/json/MimePartTest.java | 50 +++++++++++++++++++ .../inmemory/JsoupTextExtractorTest.java | 4 +- .../store/search/PDFTextExtractorTest.java | 4 +- .../mailbox/store/search/MessageSearches.java | 14 +++--- .../extractor/DefaultTextExtractorTest.java | 4 +- .../mailbox/tika/TikaTextExtractorTest.java | 22 ++++---- 8 files changed, 80 insertions(+), 27 deletions(-) create mode 100644 mailbox/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/MimePartTest.java diff --git a/mailbox/api/src/main/java/org/apache/james/mailbox/extractor/ParsedContent.java b/mailbox/api/src/main/java/org/apache/james/mailbox/extractor/ParsedContent.java index fc5af9f6def..6dcdc81ef49 100644 --- a/mailbox/api/src/main/java/org/apache/james/mailbox/extractor/ParsedContent.java +++ b/mailbox/api/src/main/java/org/apache/james/mailbox/extractor/ParsedContent.java @@ -21,18 +21,19 @@ import java.util.List; import java.util.Map; +import java.util.Optional; public class ParsedContent { - private final String textualContent; + private final Optional textualContent; private final Map> metadata; public ParsedContent(String textualContent, Map> metadata) { - this.textualContent = textualContent; + this.textualContent = Optional.ofNullable(textualContent); this.metadata = metadata; } - public String getTextualContent() { + public Optional getTextualContent() { return textualContent; } diff --git a/mailbox/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/MimePart.java b/mailbox/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/MimePart.java index ea15e97710e..366cdd42e90 100644 --- a/mailbox/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/MimePart.java +++ b/mailbox/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/MimePart.java @@ -120,7 +120,7 @@ public MimePart build() { Optional parsedContent = parseContent(textExtractor); return new MimePart( headerCollectionBuilder.build(), - parsedContent.map(ParsedContent::getTextualContent), + parsedContent.flatMap(ParsedContent::getTextualContent), mediaType, subType, fileName, diff --git a/mailbox/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/MimePartTest.java b/mailbox/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/MimePartTest.java new file mode 100644 index 00000000000..2bca61a2c38 --- /dev/null +++ b/mailbox/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/MimePartTest.java @@ -0,0 +1,50 @@ +/**************************************************************** + * Licensed to the Apache Software Foundation (ASF) under one * + * or more contributor license agreements. See the NOTICE file * + * distributed with this work for additional information * + * regarding copyright ownership. The ASF licenses this file * + * to you under the Apache License, Version 2.0 (the * + * "License"); you may not use this file except in compliance * + * with the License. You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, * + * software distributed under the License is distributed on an * + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * + * KIND, either express or implied. See the License for the * + * specific language governing permissions and limitations * + * under the License. * + ****************************************************************/ +package org.apache.james.mailbox.elasticsearch.json; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.ByteArrayInputStream; +import java.nio.charset.StandardCharsets; + +import org.junit.Test; + +public class MimePartTest { + + @Test + public void buildShouldWorkWhenTextualContentFromParserIsEmpty() { + MimePart.builder() + .addBodyContent(new ByteArrayInputStream(new byte[] {})) + .addMediaType("text") + .addSubType("plain") + .build(); + } + + @Test + public void buildShouldWorkWhenTextualContentFromParserIsNonEmpty() { + String body = "text"; + MimePart mimePart = MimePart.builder() + .addBodyContent(new ByteArrayInputStream(body.getBytes(StandardCharsets.UTF_8))) + .addMediaType("text") + .addSubType("plain") + .build(); + + assertThat(mimePart.getTextualBody()).contains(body); + } +} diff --git a/mailbox/memory/src/test/java/org/apache/james/mailbox/inmemory/JsoupTextExtractorTest.java b/mailbox/memory/src/test/java/org/apache/james/mailbox/inmemory/JsoupTextExtractorTest.java index 3a91e685452..20140a73e32 100644 --- a/mailbox/memory/src/test/java/org/apache/james/mailbox/inmemory/JsoupTextExtractorTest.java +++ b/mailbox/memory/src/test/java/org/apache/james/mailbox/inmemory/JsoupTextExtractorTest.java @@ -20,10 +20,10 @@ package org.apache.james.mailbox.inmemory; import static org.assertj.core.api.Assertions.assertThat; -import org.apache.james.mailbox.extractor.TextExtractor; import java.io.InputStream; +import org.apache.james.mailbox.extractor.TextExtractor; import org.junit.Before; import org.junit.Test; @@ -39,7 +39,7 @@ public void setUp() { public void extractedTextFromHtmlShouldNotContainTheContentOfTitleTag() throws Exception { InputStream inputStream = ClassLoader.getSystemResourceAsStream("documents/html.txt"); - assertThat(textExtractor.extractContent(inputStream, "text/html").getTextualContent()) + assertThat(textExtractor.extractContent(inputStream, "text/html").getTextualContent().get()) .doesNotContain("*|MC:SUBJECT|*"); } diff --git a/mailbox/scanning-search/src/test/java/org/apache/james/mailbox/store/search/PDFTextExtractorTest.java b/mailbox/scanning-search/src/test/java/org/apache/james/mailbox/store/search/PDFTextExtractorTest.java index df52009dc2f..65c28fda647 100644 --- a/mailbox/scanning-search/src/test/java/org/apache/james/mailbox/store/search/PDFTextExtractorTest.java +++ b/mailbox/scanning-search/src/test/java/org/apache/james/mailbox/store/search/PDFTextExtractorTest.java @@ -59,12 +59,12 @@ public void extractContentShouldExtractPlainText() throws Exception { assertThat(testee.extractContent(inputStream, "text/plain") .getTextualContent()) - .isEqualTo(content); + .contains(content); } @Test public void extractContentShouldExtractPDF() throws Exception { - String content = "Little PDF"; + String content = "Little PDF\n"; InputStream inputStream = ClassLoader.getSystemResourceAsStream("pdf.pdf"); assertThat(testee.extractContent(inputStream, PDFTextExtractor.PDF_TYPE) diff --git a/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/MessageSearches.java b/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/MessageSearches.java index dc3555996c0..cb14f6781c3 100644 --- a/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/MessageSearches.java +++ b/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/MessageSearches.java @@ -74,6 +74,7 @@ import org.apache.james.mime4j.stream.MimeConfig; import org.apache.james.mime4j.util.MimeUtil; import org.apache.james.mime4j.utils.search.MessageMatcher; +import org.apache.james.util.OptionalUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -256,12 +257,13 @@ private boolean isInAttachments(String value, List attachment private Stream toAttachmentContent(Attachment attachment) { try { - return Stream.of(textExtractor - .extractContent( - attachment.getStream(), - attachment.getType()) - .getTextualContent()); - } catch (Exception e) { + return OptionalUtils.toStream( + textExtractor + .extractContent( + attachment.getStream(), + attachment.getType()) + .getTextualContent()); + } catch (Exception e) { LOGGER.error("Error while parsing attachment content", e); return Stream.of(); } diff --git a/mailbox/store/src/test/java/org/apache/james/mailbox/store/extractor/DefaultTextExtractorTest.java b/mailbox/store/src/test/java/org/apache/james/mailbox/store/extractor/DefaultTextExtractorTest.java index 353bf58cde5..a5c5c0cd7b3 100644 --- a/mailbox/store/src/test/java/org/apache/james/mailbox/store/extractor/DefaultTextExtractorTest.java +++ b/mailbox/store/src/test/java/org/apache/james/mailbox/store/extractor/DefaultTextExtractorTest.java @@ -41,7 +41,7 @@ public void textTest() throws Exception { assertThat(inputStream).isNotNull(); assertThat(textExtractor.extractContent(inputStream, "text/plain") .getTextualContent()) - .isEqualTo("This is some awesome text text.\n\n"); + .contains("This is some awesome text text.\n\n"); } @Test @@ -52,6 +52,6 @@ public void textMicrosoftWorldTest() throws Exception { inputStream, "application/vnd.openxmlformats-officedocument.wordprocessingml.document") .getTextualContent()) - .isNull(); + .isEmpty(); } } diff --git a/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/TikaTextExtractorTest.java b/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/TikaTextExtractorTest.java index a75a5add2b4..9794ead30b8 100644 --- a/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/TikaTextExtractorTest.java +++ b/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/TikaTextExtractorTest.java @@ -64,7 +64,7 @@ public void setUp() throws Exception { @Test public void textualContentShouldReturnNullWhenInputStreamIsEmpty() throws Exception { assertThat(textExtractor.extractContent(IOUtils.toInputStream("", Charsets.UTF_8), "text/plain").getTextualContent()) - .isNull(); + .isEmpty(); } @Test @@ -72,7 +72,7 @@ public void textTest() throws Exception { InputStream inputStream = ClassLoader.getSystemResourceAsStream("documents/Text.txt"); assertThat(inputStream).isNotNull(); assertThat(textExtractor.extractContent(inputStream, "text/plain").getTextualContent()) - .isEqualTo("This is some awesome text text.\n\n\n"); + .contains("This is some awesome text text.\n\n\n"); } @Test @@ -80,7 +80,7 @@ public void textMicrosoftWorldTest() throws Exception { InputStream inputStream = ClassLoader.getSystemResourceAsStream("documents/writter.docx"); assertThat(inputStream).isNotNull(); assertThat(textExtractor.extractContent(inputStream, "application/vnd.openxmlformats-officedocument.wordprocessingml.document").getTextualContent()) - .isEqualTo("This is an awesome document on libroffice writter !\n"); + .contains("This is an awesome document on libroffice writter !\n"); } @Test @@ -88,7 +88,7 @@ public void textOdtTest() throws Exception { InputStream inputStream = ClassLoader.getSystemResourceAsStream("documents/writter.odt"); assertThat(inputStream).isNotNull(); assertThat(textExtractor.extractContent(inputStream, "application/vnd.oasis.opendocument.text").getTextualContent()) - .isEqualTo("This is an awesome document on libroffice writter !\n"); + .contains("This is an awesome document on libroffice writter !\n"); } @Test @@ -96,7 +96,7 @@ public void documentWithBadDeclaredMetadataShouldBeWellHandled() throws Exceptio InputStream inputStream = ClassLoader.getSystemResourceAsStream("documents/fake.txt"); assertThat(inputStream).isNotNull(); assertThat(textExtractor.extractContent(inputStream, "application/vnd.oasis.opendocument.text").getTextualContent()) - .isEqualTo("This is an awesome document on libroffice writter !\n"); + .contains("This is an awesome document on libroffice writter !\n"); } @Test @@ -104,7 +104,7 @@ public void slidePowerPointTest() throws Exception { InputStream inputStream = ClassLoader.getSystemResourceAsStream("documents/slides.pptx"); assertThat(inputStream).isNotNull(); assertThat(textExtractor.extractContent(inputStream, "application/vnd.openxmlformats-officedocument.presentationml.presentation").getTextualContent()) - .isEqualTo("James is awesome\nIt manages attachments so well !\n\n\n"); + .contains("James is awesome\nIt manages attachments so well !\n\n\n"); } @Test @@ -112,7 +112,7 @@ public void slideOdpTest() throws Exception { InputStream inputStream = ClassLoader.getSystemResourceAsStream("documents/slides.odp"); assertThat(inputStream).isNotNull(); assertThat(textExtractor.extractContent(inputStream, "application/vnd.oasis.opendocument.presentation").getTextualContent()) - .isEqualTo("James is awesome\n\nIt manages attachments so well !\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"); + .contains("James is awesome\n\nIt manages attachments so well !\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"); } @Test @@ -120,7 +120,7 @@ public void pdfTest() throws Exception { InputStream inputStream = ClassLoader.getSystemResourceAsStream("documents/PDF.pdf"); assertThat(inputStream).isNotNull(); assertThat(textExtractor.extractContent(inputStream, "application/pdf").getTextualContent()) - .isEqualTo("This is an awesome document on libroffice writter !\n\n\n"); + .contains("This is an awesome document on libroffice writter !\n\n\n"); } @Test @@ -128,7 +128,7 @@ public void odsTest() throws Exception { InputStream inputStream = ClassLoader.getSystemResourceAsStream("documents/calc.ods"); assertThat(inputStream).isNotNull(); assertThat(textExtractor.extractContent(inputStream, "application/vnd.oasis.opendocument.spreadsheet").getTextualContent()) - .isEqualTo("This is an aesome LibreOffice document !\n" + + .contains("This is an aesome LibreOffice document !\n" + "\n" + "\n" + "???\n" + @@ -143,7 +143,7 @@ public void excelTest() throws Exception { InputStream inputStream = ClassLoader.getSystemResourceAsStream("documents/calc.xlsx"); assertThat(inputStream).isNotNull(); assertThat(textExtractor.extractContent(inputStream, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet").getTextualContent()) - .isEqualTo("Feuille1\n" + + .contains("Feuille1\n" + "\tThis is an aesome LibreOffice document !\n" + "\n" + "&A\t\n" + @@ -173,7 +173,7 @@ public void deserializerShouldTakeFirstNodeWhenSeveral() throws Exception { InputStream inputStream = null; ParsedContent parsedContent = textExtractor.extractContent(inputStream, "text/plain"); - assertThat(parsedContent.getTextualContent()).isEqualTo(expectedExtractedContent); + assertThat(parsedContent.getTextualContent()).contains(expectedExtractedContent); } @Test