Skip to content

Commit

Permalink
TIKA-2483 -- revert loading of mime repository in PackageParser from …
Browse files Browse the repository at this point in the history
…TIKA-2311 to avoid NPE in ForkParser
  • Loading branch information
tballison committed Nov 14, 2017
1 parent b19c2d7 commit 06486c8
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 35 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
Expand All @@ -51,7 +50,6 @@
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.PasswordProvider;
Expand Down Expand Up @@ -85,14 +83,15 @@ public class PackageParser extends AbstractParser {
private static final MediaType TAR = MediaType.application("x-tar");
private static final MediaType SEVENZ = MediaType.application("x-7z-compressed");

private static final MediaType TIKA_OOXML = MediaType.application("tika-ooxml");
private static final MediaType GTAR = MediaType.application("x-gtar");


private static final Set<MediaType> SUPPORTED_TYPES =
MediaType.set(ZIP, JAR, AR, ARJ, CPIO, DUMP, TAR, SEVENZ);

//this can't be static because of the ForkParser
//lazily load this when parse is called if it is null.
private MediaTypeRegistry bufferedMediaTypeRegistry;

private final Object lock = new Object[0];
private static final Set<MediaType> DONT_OVERWRITE_CONTENT_TYPE =
MediaType.set(TIKA_OOXML, GTAR);

@Deprecated
static MediaType getMediaType(ArchiveInputStream stream) {
Expand Down Expand Up @@ -149,26 +148,6 @@ public void parse(
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {

//lazily load the MediaTypeRegistry at parse time
//only want to call getDefaultConfig() once, and can't
//load statically because of the ForkParser
TikaConfig config = context.get(TikaConfig.class);
MediaTypeRegistry mediaTypeRegistry = null;
if (config != null) {
mediaTypeRegistry = config.getMediaTypeRegistry();
} else {
if (bufferedMediaTypeRegistry == null) {
//buffer this for next time.
synchronized (lock) {
//now that we're locked, check again
if (bufferedMediaTypeRegistry == null) {
bufferedMediaTypeRegistry = TikaConfig.getDefaultConfig().getMediaTypeRegistry();
}
}
}
mediaTypeRegistry = bufferedMediaTypeRegistry;
}

// Ensure that the stream supports the mark feature
if (! stream.markSupported()) {
stream = new BufferedInputStream(stream);
Expand Down Expand Up @@ -216,7 +195,7 @@ public void parse(
throw new TikaException("Unable to unpack document stream", e);
}

updateMediaType(ais, mediaTypeRegistry, metadata);
updateMediaType(ais, metadata);
// Use the delegate parser to parse the contained document
EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);

Expand Down Expand Up @@ -248,7 +227,7 @@ public void parse(
xhtml.endDocument();
}

private void updateMediaType(ArchiveInputStream ais, MediaTypeRegistry mediaTypeRegistry, Metadata metadata) {
private void updateMediaType(ArchiveInputStream ais, Metadata metadata) {
MediaType type = getMediaType(ais);
if (type.equals(MediaType.OCTET_STREAM)) {
return;
Expand All @@ -267,13 +246,10 @@ private void updateMediaType(ArchiveInputStream ais, MediaTypeRegistry mediaType
metadata.set(CONTENT_TYPE, type.toString());
return;
}
//if the existing type is a specialization of the detected type,
//leave in the specialization; otherwise set the detected
if (! mediaTypeRegistry.isSpecializationOf(incomingMediaType, type)) {

if (! DONT_OVERWRITE_CONTENT_TYPE.contains(incomingMediaType)) {
metadata.set(CONTENT_TYPE, type.toString());
return;
}

}

private void parseEntry(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package org.apache.tika.parser.fork;

import static org.apache.tika.TikaTest.assertContains;
import static org.apache.tika.TikaTest.debug;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.fail;
Expand All @@ -29,6 +30,7 @@
import java.util.Set;

import org.apache.tika.Tika;
import org.apache.tika.TikaTest;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.fork.ForkParser;
Expand All @@ -46,7 +48,7 @@
* Tests that the ForkParser behaves correctly when
* wired in to the regular Parsers and their test data
*/
public class ForkParserIntegrationTest {
public class ForkParserIntegrationTest extends TikaTest {

private Tika tika = new Tika(); // TODO Use TikaConfig instead, when it works

Expand Down Expand Up @@ -269,4 +271,20 @@ public void testForkedPDFParsing() throws Exception {
parser.close();
}
}

/**
 * Parses the zip archive {@code /test-documents/moby.zip} through a
 * {@link ForkParser} that wraps the default Tika parser and checks that the
 * extracted text contains "Moby Dick".
 *
 * <p>NOTE(review): presumably the regression test for the NPE described in
 * TIKA-2483 (package parsing via the ForkParser) -- confirm against the issue.
 *
 * @throws Exception if the resource cannot be read or parsing fails
 */
@Test
public void testForkedPackageParsing() throws Exception {
    ForkParser parser = new ForkParser(ForkParserIntegrationTest.class.getClassLoader(),
            tika.getParser());
    // The test stream is scoped to the try block so it is closed even when
    // parsing or the assertion fails; the parser itself is shut down in the
    // finally clause, which runs after the stream has been closed.
    try (InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream(
            "/test-documents/moby.zip")) {
        ContentHandler output = new BodyContentHandler();
        ParseContext context = new ParseContext();
        parser.parse(stream, output, new Metadata(), context);
        assertContains("Moby Dick", output.toString());
    } finally {
        parser.close();
    }
}
}

0 comments on commit 06486c8

Please sign in to comment.