diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/ooxml/OPCPackageDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/ooxml/OPCPackageDetector.java index 52108e88854..8378d9b19c8 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/ooxml/OPCPackageDetector.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/ooxml/OPCPackageDetector.java @@ -281,7 +281,25 @@ public MediaType detect(ZipFile zipFile, TikaInputStream stream) throws IOExcept //no need to close zipEntrySource because it //only closes the underlying zipFile, not any other resources //as of this writing.... :'( - return null; + //fall through to [Content_Types].xml fallback below + } + // POI may have failed (caught above) OR returned null because the + // rels were malformed and POI silently produced an empty relationship + // collection. Either way, fall back to parsing [Content_Types].xml + // directly — same approach as the streaming detector. + if (type == null) { + ZipArchiveEntry ctEntry = zipEntrySource.getEntry("[Content_Types].xml"); + if (ctEntry != null) { + try (InputStream contentTypesStream = + zipEntrySource.getInputStream(ctEntry)) { + type = parseOOXMLContentTypes(contentTypesStream); + } catch (IOException ignore) { + //swallow + } + } + if (type == null || pkg == null) { + return type; + } } //this will now be closed eventually when the wrapper closes //the pkg which will close this diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java index d990e6f96ae..de3ff518359 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java @@ -38,6 +38,7 @@ import org.xml.sax.helpers.DefaultHandler; import org.apache.tika.exception.TikaException; +import org.apache.tika.exception.WriteLimitReachedException; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Office; @@ -376,6 +377,10 @@ private OOXMLTikaBodyPartHandler handlePart(PackagePart packagePart, linkedRelationships, config.isIncludeShapeBasedContent(), config.isConcatenatePhoneticRuns(), config.isPreferAlternateContentChoice())), context); + } catch (SAXException e) { + WriteLimitReachedException.throwIfWriteLimitReached(e); + metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, + ExceptionUtils.getStackTrace(e)); } catch (TikaException | IOException e) { metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e)); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/TikaSheetXMLHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/TikaSheetXMLHandler.java index 3ba83dd255c..ec96a40c2e4 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/TikaSheetXMLHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/TikaSheetXMLHandler.java @@ -299,7 +299,7 @@ private void outputCell() { break; case SST_STRING: String sstIndex = value.toString().trim(); - if (!sstIndex.isEmpty()) { + if (!sstIndex.isEmpty() && sharedStringsShim != null) { try { int idx = Integer.parseInt(sstIndex); thisStr = sharedStringsShim.getItemAt(idx); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java index 10f4c1daf51..d968cdb8562 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java @@ -51,6 +51,7 @@ import org.apache.tika.exception.RuntimeSAXException; import org.apache.tika.exception.TikaException; +import org.apache.tika.exception.WriteLimitReachedException; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Office; import org.apache.tika.metadata.TikaCoreProperties; @@ -58,6 +59,7 @@ import org.apache.tika.parser.microsoft.OfficeParserConfig; import org.apache.tika.parser.microsoft.TikaExcelDataFormatter; import org.apache.tika.sax.XHTMLContentHandler; +import org.apache.tika.utils.ExceptionUtils; import org.apache.tika.utils.StringUtils; import org.apache.tika.utils.XMLReaderUtils; @@ -142,24 +144,56 @@ protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, IOException { OPCPackage container = opcPackage; - XSSFSharedStringsShim stringsShim; + XSSFSharedStringsShim stringsShim = null; XSSFReader.SheetIterator iter; XSSFReader xssfReader; - XSSFStylesShim stylesShim; + XSSFStylesShim stylesShim = null; try { xssfReader = new XSSFReader(container); - stylesShim = new XSSFStylesShim(xssfReader.getStylesData(), parseContext); - iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData(); + } catch (OpenXML4JException | RuntimeException e) { + throw new IOException(e); + } + // Styles and shared strings are optional — if either part is missing or + // unreadable, log to metadata and continue with degraded extraction. + try { + stylesShim = new XSSFStylesShim(xssfReader.getStylesData(), parseContext); + } catch (Exception e) { + metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, + ExceptionUtils.getStackTrace(e)); + } + try { stringsShim = new XSSFSharedStringsShim(xssfReader.getSharedStringsData(), config.isConcatenatePhoneticRuns(), parseContext); - } catch (OpenXML4JException | TikaException e) { - throw new IOException(e); + } catch (Exception e) { + metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, + ExceptionUtils.getStackTrace(e)); } - while (iter.hasNext()) { + while (true) { + try { + if (!iter.hasNext()) { + break; + } + } catch (RuntimeException e) { + metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, + ExceptionUtils.getStackTrace(e)); + break; + } SheetTextAsHTML sheetExtractor = new SheetTextAsHTML(config, xhtml); PackagePart sheetPart = null; - try (InputStream stream = iter.next()) { + InputStream nextStream; + try { + nextStream = iter.next(); + } catch (RuntimeException e) { + // POI can throw POIXMLException for missing sheet parts (e.g., + // truncated workbook references a sheet that isn't in the zip). + // Break rather than continue — POI's iterator state may not have + // advanced, which would cause an infinite loop. + metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, + ExceptionUtils.getStackTrace(e)); + break; + } + try (InputStream stream = nextStream) { sheetPart = iter.getSheetPart(); addDrawingHyperLinks(sheetPart); @@ -178,7 +212,15 @@ protected void buildXHTML(XHTMLContentHandler xhtml) xhtml.startElement("table"); xhtml.startElement("tbody"); - processSheet(sheetExtractor, commentsShim, stylesShim, stringsShim, stream); + try { + processSheet(sheetExtractor, commentsShim, stylesShim, stringsShim, stream); + } catch (SAXException e) { + // Truncated/malformed sheet XML — keep prior sheets and + // record the failure as a warning. + WriteLimitReachedException.throwIfWriteLimitReached(e); + metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, + ExceptionUtils.getStackTrace(e)); + } try { getThreadedComments(container, sheetPart, xhtml); } catch (InvalidFormatException | TikaException | IOException e) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java index 698bd269919..eec152cb7e5 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java @@ -59,6 +59,7 @@ import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.XHTMLContentHandler; +import org.apache.tika.utils.ParserUtils; /** * Parser for ZIP and JAR archives using file-based access for complete metadata extraction. @@ -374,12 +375,22 @@ private void parseStreamEntries(ZipArchiveInputStream zis, Metadata metadata, throws TikaException, IOException, SAXException { try { - ArchiveEntry entry = zis.getNextEntry(); - while (entry != null) { + ArchiveEntry entry; + while (true) { + try { + entry = zis.getNextEntry(); + } catch (java.util.zip.ZipException ze) { + // Truncated/corrupt central directory: stop iteration but keep + // entries already extracted. Record the failure as a warning. + ParserUtils.recordParserFailure(this, ze, metadata); + break; + } + if (entry == null) { + break; + } if (shouldUseDataDescriptor && entryCnt.get() > 0) { // Skip already-processed entries on re-read entryCnt.decrementAndGet(); - entry = zis.getNextEntry(); continue; } @@ -405,8 +416,6 @@ private void parseStreamEntries(ZipArchiveInputStream zis, Metadata metadata, if (!shouldUseDataDescriptor) { entryCnt.incrementAndGet(); } - - entry = zis.getNextEntry(); } } catch (UnsupportedZipFeatureException zfe) { if (zfe.getFeature() == Feature.ENCRYPTION) {