Skip to content

Commit

Permalink
TIKA-2626
Browse files Browse the repository at this point in the history
  • Loading branch information
tballison committed Apr 6, 2018
1 parent b928453 commit d1a7cab
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 11 deletions.
Expand Up @@ -149,14 +149,13 @@ private static DateFormat createDateFormat(String format, TimeZone timezone, boo
private final boolean extractAllAlternatives;
private final EmbeddedDocumentExtractor extractor;
private final Detector detector;

//buffers the parts of a multipart body so that a
//multipart/alternative part and its children can be tracked together
private Stack<Part> alternativePartBuffer = new Stack<>();

private Stack<BodyDescriptor> parts = new Stack<>();

MailContentHandler(XHTMLContentHandler xhtml, Metadata metadata,
MailContentHandler(XHTMLContentHandler xhtml, Detector detector, Metadata metadata,
ParseContext context, boolean strictParsing, boolean extractAllAlternatives) {
this.handler = xhtml;
this.metadata = metadata;
Expand All @@ -169,7 +168,7 @@ private static DateFormat createDateFormat(String format, TimeZone timezone, boo

// Was an EmbeddedDocumentExtractor explicitly supplied?
this.extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
this.detector = new EmbeddedDocumentUtil(context).getDetector();
this.detector = detector;
}

@Override
Expand Down Expand Up @@ -221,7 +220,7 @@ public void body(BodyDescriptor body, InputStream is) throws MimeException,
ByteArrayOutputStream bos = new ByteArrayOutputStream();
IOUtils.copy(is, bos);
byte[] bytes = bos.toByteArray();
if (isTextOrHtml(submd, bytes)) {
if (detectTextOrHtml(submd, bytes)) {
handleInlineBodyPart(new BodyContents(submd, bos.toByteArray()));
} else {
//else handle as you would any other embedded content
Expand All @@ -237,15 +236,23 @@ public void body(BodyDescriptor body, InputStream is) throws MimeException,
}
}

private boolean isTextOrHtml(Metadata submd, byte[] bytes) {
private boolean detectTextOrHtml(Metadata submd, byte[] bytes) {
String mediaTypeString = submd.get(Metadata.CONTENT_TYPE);
if (mediaTypeString != null && mediaTypeString.startsWith("text")) {
return true;
if (mediaTypeString != null) {
if (mediaTypeString.startsWith("text")) {
return true;
} else {
return false;
}
}
try (TikaInputStream tis = TikaInputStream.get(bytes)) {
MediaType mediaType = detector.detect(tis, submd);
if (mediaType != null && mediaType.toString().startsWith("text")) {
return true;
if (mediaType != null) {
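//record the detected type on the part's metadata so that detection
//only has to run once (presumably later handling reuses this override)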
submd.set(TikaCoreProperties.CONTENT_TYPE_OVERRIDE, mediaType.toString());
if (mediaType.toString().startsWith("text")) {
return true;
}
}
} catch (IOException e) {

Expand Down
Expand Up @@ -26,7 +26,9 @@
import org.apache.james.mime4j.parser.MimeStreamParser;
import org.apache.james.mime4j.stream.MimeConfig;
import org.apache.tika.config.Field;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
Expand Down Expand Up @@ -54,6 +56,10 @@ public class RFC822Parser extends AbstractParser {
private static final Set<MediaType> SUPPORTED_TYPES = Collections
.singleton(MediaType.parse("message/rfc822"));

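//Relies on the detector itself being thread-safe.
//Built lazily by the first parse call that finds no Detector in the
//ParseContext, then reused by later calls.
//NOTE(review): the lazy initialization is not synchronized.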
private Detector detector;

@Field
private boolean extractAllAlternatives = false;

Expand All @@ -71,12 +77,20 @@ public void parse(InputStream stream, ContentHandler handler,
.build();

config = context.get(MimeConfig.class, config);

Detector localDetector = context.get(Detector.class);
if (localDetector == null) {
//no Detector was supplied in the context: build the shared one lazily if it does not exist yet
if (detector == null) {
EmbeddedDocumentUtil embeddedDocumentUtil = new EmbeddedDocumentUtil(context);
detector = embeddedDocumentUtil.getDetector();
}
localDetector = detector;
}
MimeStreamParser parser = new MimeStreamParser(config, null, new DefaultBodyDescriptorBuilder());
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);

MailContentHandler mch = new MailContentHandler(
xhtml, metadata, context, config.isStrictParsing(),
xhtml, localDetector, metadata, context, config.isStrictParsing(),
extractAllAlternatives);
parser.setContentHandler(mch);
parser.setContentDecoding(true);
Expand Down

0 comments on commit d1a7cab

Please sign in to comment.