Skip to content

Commit

Permalink
TIKA-2311 -- try OPC before ZipFile. This can work better on some truncated files.
Browse files Browse the repository at this point in the history
  • Loading branch information
tballison committed May 1, 2017
1 parent 4e1e87f commit 6930ff0
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,7 @@ public MediaType detect(InputStream input, Metadata metadata) throws IOException

return type;
} catch (InvalidFormatException e) {
// TODO Auto-generated catch block
e.printStackTrace();
//swallow
}finally {
tmp.close();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,13 +131,20 @@ private static MediaType detectArchiveFormat(byte[] prefix, int length) {
}

private MediaType detectZipFormat(TikaInputStream tis) {

//try opc first because opening a package
//will not necessarily throw an exception for
//truncated files.
MediaType type = detectOPCBased(tis);
if (type != null) {
return type;
}

try {
ZipFile zip = new ZipFile(tis.getFile()); // TODO: hasFile()?
try {
MediaType type = detectOpenDocument(zip);
if (type == null) {
type = detectOPCBased(zip, tis);
}
type = detectOpenDocument(zip);

if (type == null) {
type = detectIWork(zip);
}
Expand Down Expand Up @@ -191,18 +198,16 @@ private static MediaType detectOpenDocument(ZipFile zip) {
}
}

private MediaType detectOPCBased(ZipFile zip, TikaInputStream stream) {
private MediaType detectOPCBased(TikaInputStream stream) {
try {
if (zip.getEntry("_rels/.rels") != null
|| zip.getEntry("[Content_Types].xml") != null) {
// if (zip.getEntry("_rels/.rels") != null
// || zip.getEntry("[Content_Types].xml") != null) {
MediaType type = this.opcDetector.detect(stream, null);
if (type != null) return type;

// We don't know what it is, sorry
return null;
} else {
return null;
}

} catch (IOException e) {
return null;
} catch (RuntimeException e) {
Expand Down

0 comments on commit 6930ff0

Please sign in to comment.