Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,25 @@ public MediaType detect(ZipFile zipFile, TikaInputStream stream) throws IOExcept
//no need to close zipEntrySource because it
//only closes the underlying zipFile, not any other resources
//as of this writing.... :'(
return null;
//fall through to [Content_Types].xml fallback below
}
// POI may have failed (caught above) OR returned null because the
// rels were malformed and POI silently produced an empty relationship
// collection. Either way, fall back to parsing [Content_Types].xml
// directly — same approach as the streaming detector.
if (type == null) {
ZipArchiveEntry ctEntry = zipEntrySource.getEntry("[Content_Types].xml");
if (ctEntry != null) {
try (InputStream contentTypesStream =
zipEntrySource.getInputStream(ctEntry)) {
type = parseOOXMLContentTypes(contentTypesStream);
} catch (IOException ignore) {
//swallow
}
}
if (type == null || pkg == null) {
return type;
}
}
//this will now be closed eventually when the wrapper closes
//the pkg which will close this
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
import org.xml.sax.helpers.DefaultHandler;

import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
Expand Down Expand Up @@ -376,6 +377,10 @@ private OOXMLTikaBodyPartHandler handlePart(PackagePart packagePart,
linkedRelationships, config.isIncludeShapeBasedContent(),
config.isConcatenatePhoneticRuns(),
config.isPreferAlternateContentChoice())), context);
} catch (SAXException e) {
WriteLimitReachedException.throwIfWriteLimitReached(e);
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
ExceptionUtils.getStackTrace(e));
} catch (TikaException | IOException e) {
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
ExceptionUtils.getStackTrace(e));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@ private void outputCell() {
break;
case SST_STRING:
String sstIndex = value.toString().trim();
if (!sstIndex.isEmpty()) {
if (!sstIndex.isEmpty() && sharedStringsShim != null) {
try {
int idx = Integer.parseInt(sstIndex);
thisStr = sharedStringsShim.getItemAt(idx);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,13 +51,15 @@

import org.apache.tika.exception.RuntimeSAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.parser.microsoft.TikaExcelDataFormatter;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.ExceptionUtils;
import org.apache.tika.utils.StringUtils;
import org.apache.tika.utils.XMLReaderUtils;

Expand Down Expand Up @@ -142,24 +144,56 @@ protected void buildXHTML(XHTMLContentHandler xhtml)
throws SAXException, IOException {
OPCPackage container = opcPackage;

XSSFSharedStringsShim stringsShim;
XSSFSharedStringsShim stringsShim = null;
XSSFReader.SheetIterator iter;
XSSFReader xssfReader;
XSSFStylesShim stylesShim;
XSSFStylesShim stylesShim = null;
try {
xssfReader = new XSSFReader(container);
stylesShim = new XSSFStylesShim(xssfReader.getStylesData(), parseContext);

iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
} catch (OpenXML4JException | RuntimeException e) {
throw new IOException(e);
}
// Styles and shared strings are optional — if either part is missing or
// unreadable, log to metadata and continue with degraded extraction.
try {
stylesShim = new XSSFStylesShim(xssfReader.getStylesData(), parseContext);
} catch (Exception e) {
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
ExceptionUtils.getStackTrace(e));
}
try {
stringsShim = new XSSFSharedStringsShim(xssfReader.getSharedStringsData(),
config.isConcatenatePhoneticRuns(), parseContext);
} catch (OpenXML4JException | TikaException e) {
throw new IOException(e);
} catch (Exception e) {
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
ExceptionUtils.getStackTrace(e));
}
while (iter.hasNext()) {
while (true) {
try {
if (!iter.hasNext()) {
break;
}
} catch (RuntimeException e) {
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
ExceptionUtils.getStackTrace(e));
break;
}
SheetTextAsHTML sheetExtractor = new SheetTextAsHTML(config, xhtml);
PackagePart sheetPart = null;
try (InputStream stream = iter.next()) {
InputStream nextStream;
try {
nextStream = iter.next();
} catch (RuntimeException e) {
// POI can throw POIXMLException for missing sheet parts (e.g.,
// truncated workbook references a sheet that isn't in the zip).
// Break rather than continue — POI's iterator state may not have
// advanced, which would cause an infinite loop.
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
ExceptionUtils.getStackTrace(e));
break;
}
try (InputStream stream = nextStream) {
sheetPart = iter.getSheetPart();

addDrawingHyperLinks(sheetPart);
Expand All @@ -178,7 +212,15 @@ protected void buildXHTML(XHTMLContentHandler xhtml)
xhtml.startElement("table");
xhtml.startElement("tbody");

processSheet(sheetExtractor, commentsShim, stylesShim, stringsShim, stream);
try {
processSheet(sheetExtractor, commentsShim, stylesShim, stringsShim, stream);
} catch (SAXException e) {
// Truncated/malformed sheet XML — keep prior sheets and
// record the failure as a warning.
WriteLimitReachedException.throwIfWriteLimitReached(e);
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
ExceptionUtils.getStackTrace(e));
}
try {
getThreadedComments(container, sheetPart, xhtml);
} catch (InvalidFormatException | TikaException | IOException e) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.ParserUtils;

/**
* Parser for ZIP and JAR archives using file-based access for complete metadata extraction.
Expand Down Expand Up @@ -374,12 +375,22 @@ private void parseStreamEntries(ZipArchiveInputStream zis, Metadata metadata,
throws TikaException, IOException, SAXException {

try {
ArchiveEntry entry = zis.getNextEntry();
while (entry != null) {
ArchiveEntry entry;
while (true) {
try {
entry = zis.getNextEntry();
} catch (java.util.zip.ZipException ze) {
Comment thread
tballison marked this conversation as resolved.
// Truncated/corrupt central directory: stop iteration but keep
// entries already extracted. Record the failure as a warning.
ParserUtils.recordParserFailure(this, ze, metadata);
break;
}
if (entry == null) {
break;
}
if (shouldUseDataDescriptor && entryCnt.get() > 0) {
// Skip already-processed entries on re-read
entryCnt.decrementAndGet();
entry = zis.getNextEntry();
continue;
}

Expand All @@ -405,8 +416,6 @@ private void parseStreamEntries(ZipArchiveInputStream zis, Metadata metadata,
if (!shouldUseDataDescriptor) {
entryCnt.incrementAndGet();
}

entry = zis.getNextEntry();
}
} catch (UnsupportedZipFeatureException zfe) {
if (zfe.getFeature() == Feature.ENCRYPTION) {
Expand Down
Loading