Skip to content

Commit

Permalink
JAMES-2456 Upgrade Tika / Tika client should not throw
Browse files Browse the repository at this point in the history
  • Loading branch information
aduprat authored and chibenwa committed Jul 11, 2018
1 parent 6ef1a29 commit c8bd682
Show file tree
Hide file tree
Showing 6 changed files with 45 additions and 49 deletions.

This file was deleted.

Expand Up @@ -19,8 +19,9 @@
package org.apache.james.mailbox.tika;

import java.io.InputStream;
+ import java.util.Optional;

public interface TikaHttpClient {

- InputStream recursiveMetaDataAsJson(InputStream inputStream, String contentType) throws TikaException;
+ Optional<InputStream> recursiveMetaDataAsJson(InputStream inputStream, String contentType);
}
Expand Up @@ -22,6 +22,7 @@
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
+ import java.util.Optional;

import org.apache.http.client.fluent.Request;
import org.apache.http.client.utils.URIBuilder;
Expand Down Expand Up @@ -51,17 +52,18 @@ private URI buildURI(TikaConfiguration tikaConfiguration) throws URISyntaxExcept
}

@Override
- public InputStream recursiveMetaDataAsJson(InputStream inputStream, String contentType) throws TikaException {
+ public Optional<InputStream> recursiveMetaDataAsJson(InputStream inputStream, String contentType) {
try {
- return Request.Put(recursiveMetaData)
- .socketTimeout(tikaConfiguration.getTimeoutInMillis())
- .bodyStream(inputStream, ContentType.create(contentType))
- .execute()
- .returnContent()
- .asStream();
+ return Optional.ofNullable(
+ Request.Put(recursiveMetaData)
+ .socketTimeout(tikaConfiguration.getTimeoutInMillis())
+ .bodyStream(inputStream, ContentType.create(contentType))
+ .execute()
+ .returnContent()
+ .asStream());
} catch (IOException e) {
- LOGGER.error("Failing to call Tika", e);
- throw new TikaException(e);
+ LOGGER.warn("Failing to call Tika", e);
+ return Optional.empty();
}
}

Expand Down
Expand Up @@ -25,6 +25,7 @@
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
+ import java.util.Optional;
import java.util.function.Predicate;

import javax.inject.Inject;
Expand All @@ -51,6 +52,7 @@
import com.google.common.base.MoreObjects;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
+ import com.google.common.collect.ImmutableMap;

public class TikaTextExtractor implements TextExtractor {

Expand Down Expand Up @@ -82,11 +84,13 @@ public ParsedContent extractContent(InputStream inputStream, String contentType)

public ParsedContent performContentExtraction(InputStream inputStream, String contentType) throws IOException {
ContentAndMetadata contentAndMetadata = convert(tikaHttpClient.recursiveMetaDataAsJson(inputStream, contentType));
- return new ParsedContent(contentAndMetadata.getContent(), contentAndMetadata.getMetadata());
+ return new ParsedContent(contentAndMetadata.getContent().orElse(null), contentAndMetadata.getMetadata());
}

- private ContentAndMetadata convert(InputStream json) throws IOException, JsonParseException, JsonMappingException {
- return objectMapper.readValue(json, ContentAndMetadata.class);
+ private ContentAndMetadata convert(Optional<InputStream> maybeInputStream) throws IOException, JsonParseException, JsonMappingException {
+ return maybeInputStream
+ .map(Throwing.function(inputStream -> objectMapper.readValue(inputStream, ContentAndMetadata.class)))
+ .orElse(ContentAndMetadata.empty());
}

@VisibleForTesting
Expand Down Expand Up @@ -119,8 +123,12 @@ private static class ContentAndMetadata {
private static final String TIKA_HEADER = "X-TIKA";
private static final String CONTENT_METADATA_HEADER_NAME = TIKA_HEADER + ":content";

+ public static ContentAndMetadata empty() {
+ return new ContentAndMetadata();
+ }
+
public static ContentAndMetadata from(Map<String, List<String>> contentAndMetadataMap) {
- return new ContentAndMetadata(content(contentAndMetadataMap),
+ return new ContentAndMetadata(Optional.ofNullable(content(contentAndMetadataMap)),
contentAndMetadataMap.entrySet().stream()
.filter(allHeadersButTika())
.collect(Guavate.toImmutableMap(Entry::getKey, Entry::getValue)));
Expand All @@ -139,15 +147,19 @@ private static String content(Map<String, List<String>> contentAndMetadataMap) {
return StringUtils.stripStart(content.get(0), onlySpaces);
}

- private final String content;
+ private final Optional<String> content;
private final Map<String, List<String>> metadata;

- private ContentAndMetadata(String content, Map<String, List<String>> metadata) {
+ private ContentAndMetadata() {
+ this(Optional.empty(), ImmutableMap.of());
+ }
+
+ private ContentAndMetadata(Optional<String> content, Map<String, List<String>> metadata) {
this.content = content;
this.metadata = metadata;
}

- public String getContent() {
+ public Optional<String> getContent() {
return content;
}

Expand Down
Expand Up @@ -27,6 +27,7 @@
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.List;
+ import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.james.mailbox.extractor.ParsedContent;
Expand Down Expand Up @@ -157,8 +158,10 @@ public void excelTest() throws Exception {
@Test
public void deserializerShouldNotThrowWhenMoreThanOneNode() throws Exception {
TikaTextExtractor textExtractor = new TikaTextExtractor(
- new NoopMetricFactory(), (inputStream, contentType) -> new ByteArrayInputStream(("[{\"X-TIKA:content\": \"This is an awesome LibreOffice document !\"}, " +
- "{\"Chroma BlackIsZero\": \"true\"}]").getBytes(StandardCharsets.UTF_8)));
+ new NoopMetricFactory(),
+ (inputStream, contentType) -> Optional.of(new ByteArrayInputStream(("[{\"X-TIKA:content\": \"This is an awesome LibreOffice document !\"}, " +
+ "{\"Chroma BlackIsZero\": \"true\"}]")
+ .getBytes(StandardCharsets.UTF_8))));

InputStream inputStream = null;
textExtractor.extractContent(inputStream, "text/plain");
Expand All @@ -168,8 +171,10 @@ public void deserializerShouldNotThrowWhenMoreThanOneNode() throws Exception {
public void deserializerShouldTakeFirstNodeWhenSeveral() throws Exception {
String expectedExtractedContent = "content A";
TikaTextExtractor textExtractor = new TikaTextExtractor(
- new NoopMetricFactory(), (inputStream, contentType) -> new ByteArrayInputStream(("[{\"X-TIKA:content\": \"" + expectedExtractedContent + "\"}, " +
- "{\"X-TIKA:content\": \"content B\"}]").getBytes(StandardCharsets.UTF_8)));
+ new NoopMetricFactory(),
+ (inputStream, contentType) -> Optional.of(new ByteArrayInputStream(("[{\"X-TIKA:content\": \"" + expectedExtractedContent + "\"}, " +
+ "{\"X-TIKA:content\": \"content B\"}]")
+ .getBytes(StandardCharsets.UTF_8))));

InputStream inputStream = null;
ParsedContent parsedContent = textExtractor.extractContent(inputStream, "text/plain");
Expand All @@ -183,7 +188,9 @@ public void deserializerShouldThrowWhenNodeIsNotAnObject() throws Exception {
expectedException.expectMessage("The element should be a Json object");

TikaTextExtractor textExtractor = new TikaTextExtractor(
- new NoopMetricFactory(), (inputStream, contentType) -> new ByteArrayInputStream("[\"value1\"]".getBytes(StandardCharsets.UTF_8)));
+ new NoopMetricFactory(),
+ (inputStream, contentType) -> Optional.of(new ByteArrayInputStream("[\"value1\"]"
+ .getBytes(StandardCharsets.UTF_8))));

InputStream inputStream = null;
textExtractor.extractContent(inputStream, "text/plain");
Expand Down
Expand Up @@ -24,6 +24,6 @@ public interface Images {
String RABBITMQ = "rabbitmq:3.7.5";
String ELASTICSEARCH = "elasticsearch:2.2.2";
String NGINX = "nginx:1.7.1";
- String TIKA = "logicalspark/docker-tikaserver:1.15rc2";
+ String TIKA = "linagora/docker-tikaserver:1.18-SNAPSHOT-plus-TIKA-2520";
String SPAMASSASSIN = "dinkel/spamassassin:3.4.0";
}

0 comments on commit c8bd682

Please sign in to comment.