Skip to content

Commit

Permalink
JAMES-2142 Text content in attachments may be empty
Browse files Browse the repository at this point in the history
  • Loading branch information
aduprat committed Sep 13, 2017
1 parent a9a40c7 commit fec80a1
Show file tree
Hide file tree
Showing 8 changed files with 80 additions and 27 deletions.
Expand Up @@ -21,18 +21,19 @@

import java.util.List;
import java.util.Map;
import java.util.Optional;

public class ParsedContent {

private final String textualContent;
private final Optional<String> textualContent;
private final Map<String, List<String>> metadata;

public ParsedContent(String textualContent, Map<String, List<String>> metadata) {
this.textualContent = textualContent;
this.textualContent = Optional.ofNullable(textualContent);
this.metadata = metadata;
}

public String getTextualContent() {
public Optional<String> getTextualContent() {
return textualContent;
}

Expand Down
Expand Up @@ -120,7 +120,7 @@ public MimePart build() {
Optional<ParsedContent> parsedContent = parseContent(textExtractor);
return new MimePart(
headerCollectionBuilder.build(),
parsedContent.map(ParsedContent::getTextualContent),
parsedContent.flatMap(ParsedContent::getTextualContent),
mediaType,
subType,
fileName,
Expand Down
@@ -0,0 +1,50 @@
/****************************************************************
* Licensed to the Apache Software Foundation (ASF) under one *
* or more contributor license agreements. See the NOTICE file *
* distributed with this work for additional information *
* regarding copyright ownership. The ASF licenses this file *
* to you under the Apache License, Version 2.0 (the *
* "License"); you may not use this file except in compliance *
* with the License. You may obtain a copy of the License at *
* *
* http://www.apache.org/licenses/LICENSE-2.0 *
* *
* Unless required by applicable law or agreed to in writing, *
* software distributed under the License is distributed on an *
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY *
* KIND, either express or implied. See the License for the *
* specific language governing permissions and limitations *
* under the License. *
****************************************************************/
package org.apache.james.mailbox.elasticsearch.json;

import static org.assertj.core.api.Assertions.assertThat;

import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;

import org.junit.Test;

/**
 * Checks that {@link MimePart}'s builder can assemble a {@code text/plain} part
 * both when the supplied body is empty and when it carries text.
 */
public class MimePartTest {

    private static final String MEDIA_TYPE = "text";
    private static final String SUB_TYPE = "plain";

    /**
     * Builds a part from a zero-length body. There is no assertion on the
     * result: the test passes as long as {@code build()} does not throw.
     */
    @Test
    public void buildShouldWorkWhenTextualContentFromParserIsEmpty() {
        plainTextPartOf(new byte[0]);
    }

    /**
     * Builds a part from a non-empty UTF-8 body and checks that the textual
     * body exposed by the part contains the original text.
     */
    @Test
    public void buildShouldWorkWhenTextualContentFromParserIsNonEmpty() {
        String expected = "text";

        MimePart built = plainTextPartOf(expected.getBytes(StandardCharsets.UTF_8));

        assertThat(built.getTextualBody()).contains(expected);
    }

    /** Runs the builder chain shared by both tests over the given raw body bytes. */
    private static MimePart plainTextPartOf(byte[] payload) {
        return MimePart.builder()
            .addBodyContent(new ByteArrayInputStream(payload))
            .addMediaType(MEDIA_TYPE)
            .addSubType(SUB_TYPE)
            .build();
    }
}
Expand Up @@ -20,10 +20,10 @@
package org.apache.james.mailbox.inmemory;

import static org.assertj.core.api.Assertions.assertThat;
import org.apache.james.mailbox.extractor.TextExtractor;

import java.io.InputStream;

import org.apache.james.mailbox.extractor.TextExtractor;
import org.junit.Before;
import org.junit.Test;

Expand All @@ -39,7 +39,7 @@ public void setUp() {
public void extractedTextFromHtmlShouldNotContainTheContentOfTitleTag() throws Exception {
InputStream inputStream = ClassLoader.getSystemResourceAsStream("documents/html.txt");

assertThat(textExtractor.extractContent(inputStream, "text/html").getTextualContent())
assertThat(textExtractor.extractContent(inputStream, "text/html").getTextualContent().get())
.doesNotContain("*|MC:SUBJECT|*");
}

Expand Down
Expand Up @@ -59,12 +59,12 @@ public void extractContentShouldExtractPlainText() throws Exception {

assertThat(testee.extractContent(inputStream, "text/plain")
.getTextualContent())
.isEqualTo(content);
.contains(content);
}

@Test
public void extractContentShouldExtractPDF() throws Exception {
String content = "Little PDF";
String content = "Little PDF\n";
InputStream inputStream = ClassLoader.getSystemResourceAsStream("pdf.pdf");

assertThat(testee.extractContent(inputStream, PDFTextExtractor.PDF_TYPE)
Expand Down
Expand Up @@ -74,6 +74,7 @@
import org.apache.james.mime4j.stream.MimeConfig;
import org.apache.james.mime4j.util.MimeUtil;
import org.apache.james.mime4j.utils.search.MessageMatcher;
import org.apache.james.util.OptionalUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand Down Expand Up @@ -256,12 +257,13 @@ private boolean isInAttachments(String value, List<MessageAttachment> attachment

private Stream<String> toAttachmentContent(Attachment attachment) {
try {
return Stream.of(textExtractor
.extractContent(
attachment.getStream(),
attachment.getType())
.getTextualContent());
} catch (Exception e) {
return OptionalUtils.toStream(
textExtractor
.extractContent(
attachment.getStream(),
attachment.getType())
.getTextualContent());
} catch (Exception e) {
LOGGER.error("Error while parsing attachment content", e);
return Stream.of();
}
Expand Down
Expand Up @@ -41,7 +41,7 @@ public void textTest() throws Exception {
assertThat(inputStream).isNotNull();
assertThat(textExtractor.extractContent(inputStream, "text/plain")
.getTextualContent())
.isEqualTo("This is some awesome text text.\n\n");
.contains("This is some awesome text text.\n\n");
}

@Test
Expand All @@ -52,6 +52,6 @@ public void textMicrosoftWorldTest() throws Exception {
inputStream,
"application/vnd.openxmlformats-officedocument.wordprocessingml.document")
.getTextualContent())
.isNull();
.isEmpty();
}
}
Expand Up @@ -64,71 +64,71 @@ public void setUp() throws Exception {
// Extracts content from an empty "text/plain" stream and checks the textual content.
// NOTE(review): the method name still says "ReturnNull", but the result is now asserted with
// isEmpty(); the isNull() line below appears to be the pre-change assertion kept by this diff
// view. Consider renaming the test to say "ReturnEmpty" — confirm against the final file.
@Test
public void textualContentShouldReturnNullWhenInputStreamIsEmpty() throws Exception {
assertThat(textExtractor.extractContent(IOUtils.toInputStream("", Charsets.UTF_8), "text/plain").getTextualContent())
.isNull();
.isEmpty();
}

@Test
public void textTest() throws Exception {
InputStream inputStream = ClassLoader.getSystemResourceAsStream("documents/Text.txt");
assertThat(inputStream).isNotNull();
assertThat(textExtractor.extractContent(inputStream, "text/plain").getTextualContent())
.isEqualTo("This is some awesome text text.\n\n\n");
.contains("This is some awesome text text.\n\n\n");
}

@Test
public void textMicrosoftWorldTest() throws Exception {
InputStream inputStream = ClassLoader.getSystemResourceAsStream("documents/writter.docx");
assertThat(inputStream).isNotNull();
assertThat(textExtractor.extractContent(inputStream, "application/vnd.openxmlformats-officedocument.wordprocessingml.document").getTextualContent())
.isEqualTo("This is an awesome document on libroffice writter !\n");
.contains("This is an awesome document on libroffice writter !\n");
}

@Test
public void textOdtTest() throws Exception {
InputStream inputStream = ClassLoader.getSystemResourceAsStream("documents/writter.odt");
assertThat(inputStream).isNotNull();
assertThat(textExtractor.extractContent(inputStream, "application/vnd.oasis.opendocument.text").getTextualContent())
.isEqualTo("This is an awesome document on libroffice writter !\n");
.contains("This is an awesome document on libroffice writter !\n");
}

@Test
public void documentWithBadDeclaredMetadataShouldBeWellHandled() throws Exception {
InputStream inputStream = ClassLoader.getSystemResourceAsStream("documents/fake.txt");
assertThat(inputStream).isNotNull();
assertThat(textExtractor.extractContent(inputStream, "application/vnd.oasis.opendocument.text").getTextualContent())
.isEqualTo("This is an awesome document on libroffice writter !\n");
.contains("This is an awesome document on libroffice writter !\n");
}

@Test
public void slidePowerPointTest() throws Exception {
InputStream inputStream = ClassLoader.getSystemResourceAsStream("documents/slides.pptx");
assertThat(inputStream).isNotNull();
assertThat(textExtractor.extractContent(inputStream, "application/vnd.openxmlformats-officedocument.presentationml.presentation").getTextualContent())
.isEqualTo("James is awesome\nIt manages attachments so well !\n\n\n");
.contains("James is awesome\nIt manages attachments so well !\n\n\n");
}

@Test
public void slideOdpTest() throws Exception {
InputStream inputStream = ClassLoader.getSystemResourceAsStream("documents/slides.odp");
assertThat(inputStream).isNotNull();
assertThat(textExtractor.extractContent(inputStream, "application/vnd.oasis.opendocument.presentation").getTextualContent())
.isEqualTo("James is awesome\n\nIt manages attachments so well !\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n");
.contains("James is awesome\n\nIt manages attachments so well !\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n");
}

@Test
public void pdfTest() throws Exception {
InputStream inputStream = ClassLoader.getSystemResourceAsStream("documents/PDF.pdf");
assertThat(inputStream).isNotNull();
assertThat(textExtractor.extractContent(inputStream, "application/pdf").getTextualContent())
.isEqualTo("This is an awesome document on libroffice writter !\n\n\n");
.contains("This is an awesome document on libroffice writter !\n\n\n");
}

@Test
public void odsTest() throws Exception {
InputStream inputStream = ClassLoader.getSystemResourceAsStream("documents/calc.ods");
assertThat(inputStream).isNotNull();
assertThat(textExtractor.extractContent(inputStream, "application/vnd.oasis.opendocument.spreadsheet").getTextualContent())
.isEqualTo("This is an aesome LibreOffice document !\n" +
.contains("This is an aesome LibreOffice document !\n" +
"\n" +
"\n" +
"???\n" +
Expand All @@ -143,7 +143,7 @@ public void excelTest() throws Exception {
InputStream inputStream = ClassLoader.getSystemResourceAsStream("documents/calc.xlsx");
assertThat(inputStream).isNotNull();
assertThat(textExtractor.extractContent(inputStream, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet").getTextualContent())
.isEqualTo("Feuille1\n" +
.contains("Feuille1\n" +
"\tThis is an aesome LibreOffice document !\n" +
"\n" +
"&A\t\n" +
Expand Down Expand Up @@ -173,7 +173,7 @@ public void deserializerShouldTakeFirstNodeWhenSeveral() throws Exception {
InputStream inputStream = null;
ParsedContent parsedContent = textExtractor.extractContent(inputStream, "text/plain");

assertThat(parsedContent.getTextualContent()).isEqualTo(expectedExtractedContent);
assertThat(parsedContent.getTextualContent()).contains(expectedExtractedContent);
}

@Test
Expand Down

0 comments on commit fec80a1

Please sign in to comment.