Skip to content

Commit

Permalink
TIKA-2311 -- maintain mime information for truncated ooxml
Browse files Browse the repository at this point in the history
  • Loading branch information
tballison committed Apr 13, 2017
1 parent f3db573 commit 3aab15f
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
Expand All @@ -50,6 +51,7 @@
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.PasswordProvider;
Expand Down Expand Up @@ -85,6 +87,12 @@ public class PackageParser extends AbstractParser {
private static final Set<MediaType> SUPPORTED_TYPES =
MediaType.set(ZIP, JAR, AR, CPIO, DUMP, TAR, SEVENZ);

//This can't be static because of the ForkParser;
//it is lazily loaded when parse is called if it is still null.
//It must be volatile: parse reads it without holding the lock
//(check, lock, check again), and without volatile another thread
//is not guaranteed to see a fully published registry.
private volatile MediaTypeRegistry bufferedMediaTypeRegistry;

//Guards the one-time lazy initialization of bufferedMediaTypeRegistry.
//NOTE(review): presumably an Object[0] rather than a plain Object so the
//lock stays serializable along with the parser -- confirm.
private final Object lock = new Object[0];

static MediaType getMediaType(ArchiveInputStream stream) {
if (stream instanceof JarArchiveInputStream) {
return JAR;
Expand Down Expand Up @@ -117,7 +125,27 @@ public void parse(
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {


//lazily load the MediaTypeRegistry at parse time
//only want to call getDefaultConfig() once, and can't
//load statically because of the ForkParser
TikaConfig config = context.get(TikaConfig.class);
MediaTypeRegistry mediaTypeRegistry = null;
if (config != null) {
mediaTypeRegistry = config.getMediaTypeRegistry();
} else {
if (bufferedMediaTypeRegistry == null) {
//buffer this for next time.
synchronized (lock) {
//now that we're locked, check again
if (bufferedMediaTypeRegistry == null) {
bufferedMediaTypeRegistry = TikaConfig.getDefaultConfig().getMediaTypeRegistry();
}
}
}
mediaTypeRegistry = bufferedMediaTypeRegistry;
}

// Ensure that the stream supports the mark feature
if (! stream.markSupported()) {
stream = new BufferedInputStream(stream);
Expand Down Expand Up @@ -165,10 +193,7 @@ public void parse(
throw new TikaException("Unable to unpack document stream", e);
}

MediaType type = getMediaType(ais);
if (!type.equals(MediaType.OCTET_STREAM)) {
metadata.set(CONTENT_TYPE, type.toString());
}
updateMediaType(ais, mediaTypeRegistry, metadata);
// Use the delegate parser to parse the contained document
EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);

Expand Down Expand Up @@ -200,6 +225,34 @@ public void parse(
xhtml.endDocument();
}

/**
 * Writes the media type detected from the archive stream into the metadata,
 * unless the metadata already carries a more specific type.
 * <p>
 * Nothing is written when the detected type is {@code MediaType.OCTET_STREAM}.
 * Otherwise the detected type replaces the current {@code CONTENT_TYPE} value
 * unless that value parses to a media type that the registry reports as a
 * specialization of the detected type.
 *
 * @param ais               the archive stream whose type is looked up via {@code getMediaType}
 * @param mediaTypeRegistry registry used to test whether the existing type specializes the detected one
 * @param metadata          metadata whose {@code CONTENT_TYPE} is read and possibly overwritten
 */
private void updateMediaType(ArchiveInputStream ais, MediaTypeRegistry mediaTypeRegistry, Metadata metadata) {
    MediaType detected = getMediaType(ais);
    if (detected.equals(MediaType.OCTET_STREAM)) {
        //nothing useful was detected; leave the metadata untouched
        return;
    }

    //the user or an earlier step may already have supplied a content type
    String priorTypeString = metadata.get(CONTENT_TYPE);
    MediaType priorType = (priorTypeString == null) ? null : MediaType.parse(priorTypeString);

    //keep the existing value only if it parsed and is a specialization
    //of what was detected; in every other case the detected type wins
    boolean keepPrior = priorType != null
            && mediaTypeRegistry.isSpecializationOf(priorType, detected);
    if (!keepPrior) {
        metadata.set(CONTENT_TYPE, detected.toString());
    }
}

private void parseEntry(
ArchiveInputStream archive, ArchiveEntry entry,
EmbeddedDocumentExtractor extractor, Metadata parentMetadata, XHTMLContentHandler xhtml)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
import java.io.ByteArrayOutputStream;
import java.io.EOFException;
import java.io.File;
import java.io.InputStream;
import java.io.PrintStream;
Expand All @@ -43,6 +44,7 @@
import org.apache.tika.TikaTest;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
Expand All @@ -62,6 +64,7 @@
import org.junit.Ignore;
import org.junit.Test;
import org.xml.sax.ContentHandler;
import org.xml.sax.helpers.DefaultHandler;

public class OOXMLParserTest extends TikaTest {

Expand Down Expand Up @@ -1470,8 +1473,22 @@ public void testXLSBVarious() throws Exception {
assertContains("OddLeftFooter OddCenterFooter OddRightFooter", xml);
assertContains("EvenLeftFooter EvenCenterFooter EvenRightFooter", xml);
assertContains("FirstPageLeftFooter FirstPageCenterFooter FirstPageRightFooter", xml);
}


/**
 * Parses a truncated .docx with the auto-detecting parser and expects a
 * TikaException caused by an EOFException, while the content type
 * recorded in the metadata remains "application/x-tika-ooxml".
 */
@Test
public void testTruncated() throws Exception {
    Parser autoDetectParser = new AutoDetectParser();
    Metadata truncatedMetadata = new Metadata();
    try (InputStream truncatedDocx = getTestDocument("testWORD_truncated.docx")) {
        autoDetectParser.parse(truncatedDocx, new DefaultHandler(), truncatedMetadata, new ParseContext());
        fail("should have thrown an EOF exception?!");
    } catch (TikaException expected) {
        //the truncation must surface as an EOF wrapped by the TikaException
        assertTrue(expected.getCause() instanceof EOFException);
        //and the OOXML mime information must survive the failure
        assertEquals("application/x-tika-ooxml", truncatedMetadata.get(Metadata.CONTENT_TYPE));
    }
}

}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ public void testTarParsing() throws Exception {
parser.parse(stream, handler, metadata, recursingContext);
}

assertEquals("application/x-tar", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("application/x-gtar", metadata.get(Metadata.CONTENT_TYPE));
String content = handler.toString();
assertContains("test-documents/testEXCEL.xls", content);
assertContains("Sample Excel Worksheet", content);
Expand Down
Binary file not shown.

0 comments on commit 3aab15f

Please sign in to comment.