diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
index b8735bf161..0bf15cf1a8 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
@@ -41,6 +41,7 @@
 import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
 import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
 import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
@@ -50,6 +51,7 @@
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MediaTypeRegistry;
 import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.PasswordProvider;
@@ -85,6 +87,12 @@ public class PackageParser extends AbstractParser {
     private static final Set SUPPORTED_TYPES =
             MediaType.set(ZIP, JAR, AR, CPIO, DUMP, TAR, SEVENZ);
 
+    //this can't be static because of the ForkParser; it is lazily loaded
+    //in parse() if null, and volatile so the double-checked lock there is safe.
+    private volatile MediaTypeRegistry bufferedMediaTypeRegistry;
+
+    private final Object lock = new Object[0];
+
     static MediaType getMediaType(ArchiveInputStream stream) {
         if (stream instanceof JarArchiveInputStream) {
             return JAR;
@@ -117,7 +125,27 @@ public void parse(
             InputStream stream, ContentHandler handler,
             Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
-        
+
+        //lazily load the MediaTypeRegistry at parse time
+        //only want to call getDefaultConfig() once, and can't
+        //load statically because of the ForkParser
+        TikaConfig config = context.get(TikaConfig.class);
+        MediaTypeRegistry mediaTypeRegistry = null;
+        if (config != null) {
+            mediaTypeRegistry = config.getMediaTypeRegistry();
+        } else {
+            if (bufferedMediaTypeRegistry == null) {
+                //buffer this for next time.
+                synchronized (lock) {
+                    //now that we're locked, check again
+                    if (bufferedMediaTypeRegistry == null) {
+                        bufferedMediaTypeRegistry = TikaConfig.getDefaultConfig().getMediaTypeRegistry();
+                    }
+                }
+            }
+            mediaTypeRegistry = bufferedMediaTypeRegistry;
+        }
+
         // Ensure that the stream supports the mark feature
         if (! stream.markSupported()) {
             stream = new BufferedInputStream(stream);
@@ -165,10 +193,7 @@ public void parse(
             throw new TikaException("Unable to unpack document stream", e);
         }
 
-        MediaType type = getMediaType(ais);
-        if (!type.equals(MediaType.OCTET_STREAM)) {
-            metadata.set(CONTENT_TYPE, type.toString());
-        }
+        updateMediaType(ais, mediaTypeRegistry, metadata);
 
         // Use the delegate parser to parse the contained document
         EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
@@ -200,6 +225,40 @@ public void parse(
         xhtml.endDocument();
     }
 
+    /**
+     * Sets the content type in {@code metadata} to the type detected from the
+     * archive stream. Does nothing if the detected type is octet-stream, or if the
+     * incoming content type is already a specialization of the detected type.
+     *
+     * @param ais archive stream passed to getMediaType to detect the type
+     * @param mediaTypeRegistry registry used for the specialization check
+     * @param metadata metadata whose content type is read and possibly overwritten
+     */
+    private void updateMediaType(ArchiveInputStream ais, MediaTypeRegistry mediaTypeRegistry, Metadata metadata) {
+        MediaType type = getMediaType(ais);
+        if (type.equals(MediaType.OCTET_STREAM)) {
+            return;
+        }
+
+        //now see if the user or an earlier step has passed in a content type
+        String incomingContentTypeString = metadata.get(CONTENT_TYPE);
+        if (incomingContentTypeString == null) {
+            metadata.set(CONTENT_TYPE, type.toString());
+            return;
+        }
+
+        MediaType incomingMediaType = MediaType.parse(incomingContentTypeString);
+        if (incomingMediaType == null) {
+            metadata.set(CONTENT_TYPE, type.toString());
+            return;
+        }
+        //if the existing type is a specialization of the detected type,
+        //leave in the specialization; otherwise set the detected
+        if (!mediaTypeRegistry.isSpecializationOf(incomingMediaType, type)) {
+            metadata.set(CONTENT_TYPE, type.toString());
+        }
+    }
+
     private void parseEntry(
             ArchiveInputStream archive, ArchiveEntry entry,
             EmbeddedDocumentExtractor extractor, Metadata parentMetadata, XHTMLContentHandler xhtml)
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 0b9cb182ab..af1ba27359 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -27,6 +27,7 @@
 import javax.xml.transform.sax.TransformerHandler;
 import javax.xml.transform.stream.StreamResult;
 import java.io.ByteArrayOutputStream;
+import java.io.EOFException;
 import java.io.File;
 import java.io.InputStream;
 import java.io.PrintStream;
@@ -43,6 +44,7 @@
 import org.apache.tika.TikaTest;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Office;
@@ -62,6 +64,7 @@
 import org.junit.Ignore;
 import org.junit.Test;
 import org.xml.sax.ContentHandler;
+import org.xml.sax.helpers.DefaultHandler;
 
 public class OOXMLParserTest extends TikaTest {
 
@@ -1470,8 +1473,22 @@ public void testXLSBVarious() throws Exception {
         assertContains("OddLeftFooter OddCenterFooter OddRightFooter", xml);
         assertContains("EvenLeftFooter EvenCenterFooter EvenRightFooter", xml);
         assertContains("FirstPageLeftFooter FirstPageCenterFooter FirstPageRightFooter", xml);
+    }
 
-    
+    @Test
+    public void testTruncated() throws Exception {
+        Parser p = new AutoDetectParser();
+        ContentHandler handler = new DefaultHandler();
+        Metadata metadata = new Metadata();
+        ParseContext parseContext = new ParseContext();
+        try (InputStream is = getTestDocument("testWORD_truncated.docx")) {
+            p.parse(is, handler, metadata, parseContext);
+            fail("should have thrown an EOF exception?!");
+        } catch (TikaException e) {
+            Throwable cause = e.getCause();
+            assertTrue(cause instanceof EOFException);
+            assertEquals("application/x-tika-ooxml", metadata.get(Metadata.CONTENT_TYPE));
+        }
     }
 
 }
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
index 4b4fb63999..7bf36502df 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
@@ -46,7 +46,7 @@ public void testTarParsing() throws Exception {
             parser.parse(stream, handler, metadata, recursingContext);
         }
 
-        assertEquals("application/x-tar", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("application/x-gtar", metadata.get(Metadata.CONTENT_TYPE));
         String content = handler.toString();
         assertContains("test-documents/testEXCEL.xls", content);
         assertContains("Sample Excel Worksheet", content);
diff --git a/tika-parsers/src/test/resources/test-documents/testWORD_truncated.docx b/tika-parsers/src/test/resources/test-documents/testWORD_truncated.docx
new file mode 100644
index 0000000000..ebe5e1a5ce
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testWORD_truncated.docx differ