From e22685b3eb507cd84fb5caf587f83b788c30b46f Mon Sep 17 00:00:00 2001
From: tallison <tallison@apache.org>
Date: Wed, 29 Apr 2026 10:38:56 -0400
Subject: [PATCH] improve epub handling of truncated files

---
 .../apache/tika/parser/epub/EpubParser.java   | 112 +++++++++++++++++-
 .../org/apache/tika/parser/pkg/ZipParser.java |  15 +++
 2 files changed, 124 insertions(+), 3 deletions(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
index ae131407d01..4460946c521 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
@@ -37,6 +37,8 @@
 import org.apache.commons.compress.archivers.zip.ZipFile;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
@@ -68,6 +70,8 @@
 @TikaComponent
 public class EpubParser implements Parser {
 
+    private static final Logger LOG = LoggerFactory.getLogger(EpubParser.class);
+
     /**
      * Serial version UID
      */
@@ -155,12 +159,24 @@ private Set<String> bufferedParseZipFile(ZipFile zipFile, ContentHandler bodyHan
             throws IOException, TikaException, SAXException {
 
         String rootOPF = getRoot(zipFile, context);
+        LOG.trace("epub bufferedParseZipFile: rootOPF={}", rootOPF);
         if (rootOPF == null) {
-            return Collections.EMPTY_SET;
+            // No container.xml and no .opf — typical of truncated epubs where
+            // the OPF lives past the truncation point.  Fall back to iterating
+            // the recoverable HTML/XHTML entries in stored order so we still
+            // emit partial content (matching 3.x's streamingParse contract),
+            // then throw to signal the result is incomplete.
+            LOG.trace("epub fallback: rootOPF=null, streaming all html entries");
+            return fallbackParseAllHtmlEntries(zipFile, bodyHandler, metadata, context,
+                    "no OPF found in (possibly truncated) container");
         }
         ZipArchiveEntry zae = zipFile.getEntry(rootOPF);
+        LOG.trace("epub OPF entry: zae={} canReadEntryData={}",
+                zae, zae == null ? "n/a" : zipFile.canReadEntryData(zae));
         if (zae == null || !zipFile.canReadEntryData(zae)) {
-            return Collections.EMPTY_SET;
+            LOG.trace("epub fallback: OPF entry missing/unreadable, streaming all html entries");
+            return fallbackParseAllHtmlEntries(zipFile, bodyHandler, metadata, context,
+                    "OPF entry missing or unreadable in (possibly truncated) container");
         }
         try (TikaInputStream tis = TikaInputStream.get(zipFile.getInputStream(zae))) {
             opf.parse(tis, new DefaultHandler(), metadata, context);
@@ -170,8 +186,13 @@ private Set<String> bufferedParseZipFile(ZipFile zipFile, ContentHandler bodyHan
         try (InputStream is = zipFile.getInputStream(zae)) {
             XMLReaderUtils.parseSAX(is, contentOrderScraper, context);
         }
+        LOG.trace("epub OPF parsed: spine items={}, manifest entries={}",
+                contentOrderScraper.contentItems.size(),
+                contentOrderScraper.locationMap.size());
         if (contentOrderScraper.contentItems.isEmpty()) {
-            return Collections.EMPTY_SET;
+            LOG.trace("epub fallback: empty spine, streaming all html entries");
+            return fallbackParseAllHtmlEntries(zipFile, bodyHandler, metadata, context,
+                    "OPF declared no spine items in (possibly truncated) container");
         }
         String relativePath = "";
         if (rootOPF.lastIndexOf("/") > -1) {
@@ -182,7 +203,9 @@ private Set<String> bufferedParseZipFile(ZipFile zipFile, ContentHandler bodyHan
         Set<String> encryptedItems = checkForDRM(zipFile);
         Set<String> processed = new HashSet<>();
         Set<SAXException> saxExceptions = new HashSet<>();
+        int spineSeen = 0, spineParsed = 0, spineMissing = 0, spineNonHtml = 0;
         for (String id : contentOrderScraper.contentItems) {
+            spineSeen++;
             HRefMediaPair hRefMediaPair = contentOrderScraper.locationMap.get(id);
             if (hRefMediaPair != null && hRefMediaPair.href != null) {
                 //we need to test for xhtml/xml because the content parser
@@ -207,18 +230,29 @@ private Set<String> bufferedParseZipFile(ZipFile zipFile, ContentHandler bodyHan
                     if (zae != null) {
                         try (TikaInputStream tis = TikaInputStream.get(zipFile.getInputStream(zae))) {
                             content.parse(tis, bodyHandler, metadata, context);
+                            spineParsed++;
                         } catch (SAXException e) {
                             if (WriteLimitReachedException.isWriteLimitReached(e)) {
                                 throw e;
                             }
                             saxExceptions.add(e);
+                        } catch (IOException ioe) {
+                            LOG.trace("epub spine read IOException on {}: {}", path, ioe.toString());
+                            throw ioe;
                         } finally {
                             processed.add(id);
                         }
+                    } else {
+                        spineMissing++;
+                        LOG.trace("epub spine: getEntry({}) returned null (truncated?)", path);
                     }
+                } else {
+                    spineNonHtml++;
                 }
             }
         }
+        LOG.trace("epub spine summary: seen={} parsed={} missing={} non-html={}",
+                spineSeen, spineParsed, spineMissing, spineNonHtml);
 
         //now handle embedded files
         EmbeddedDocumentExtractor embeddedDocumentExtractor =
@@ -240,9 +274,81 @@ private Set<String> bufferedParseZipFile(ZipFile zipFile, ContentHandler bodyHan
         for (SAXException e : saxExceptions) {
             throw e;
         }
+        // If spine items referenced entries not in the (possibly salvaged)
+        // zip — typical of truncated epubs where the OPF survived but later
+        // chapters didn't — throw IOException so the outer parse() flushes
+        // the partial content already in xhtml and signals incompleteness.
+        // This restores 3.x's partial-content-plus-exception contract.
+        if (spineMissing > 0) {
+            throw new IOException("EPUB: " + spineMissing + " of "
+                    + spineSeen + " spine items missing from (possibly truncated) "
+                    + "container; emitted " + spineParsed + " recovered chapters");
+        }
         return encryptedItems;
     }
 
+    /**
+     * Fallback used when the spine can't be followed (typically a truncated epub
+     * where the OPF lives past the truncation point, or an OPF with no spine).
+     * Iterates {@code zipFile.getEntries()} and parses every readable
+     * HTML/XHTML/XML entry, mirroring 3.x's {@code streamingParse} behaviour.
+     *
+     * @param reason why the fallback ran; embedded in the thrown message
+     * @return never returns normally; the type only lets callers {@code return} it
+     * @throws IOException always, once every entry has been attempted, so the
+     *         outer parse() flushes the partial content and the caller learns that
+     *         extraction was incomplete
+     * @throws SAXException if the write limit is reached while emitting content
+     */
+    private Set<String> fallbackParseAllHtmlEntries(ZipFile zipFile,
+                                                   ContentHandler bodyHandler,
+                                                   Metadata metadata,
+                                                   ParseContext context,
+                                                   String reason)
+            throws IOException, TikaException, SAXException {
+        // Try to recover mimetype + metadata.xml even in the fallback path,
+        // since they may be present even when the OPF isn't.
+        try {
+            extractMetadata(zipFile, metadata, context);
+        } catch (Exception e) {
+            LOG.trace("epub fallback: extractMetadata threw {}", e.toString());
+        }
+        int parsed = 0;
+        int failed = 0;
+        Enumeration<ZipArchiveEntry> entries = zipFile.getEntries();
+        while (entries.hasMoreElements()) {
+            ZipArchiveEntry entry = entries.nextElement();
+            String name = entry.getName().toLowerCase(Locale.US);
+            // A name ending in .opf can never match these suffixes, so the OPF is
+            // skipped implicitly.  NOTE(review): ".xml" also matches container.xml
+            // and metadata.xml -- confirm they belong in the body.
+            boolean markup = name.endsWith(".xhtml") || name.endsWith(".html")
+                    || name.endsWith(".htm") || name.endsWith(".xml");
+            if (!markup || !zipFile.canReadEntryData(entry)) {
+                continue;
+            }
+            try (TikaInputStream tis = TikaInputStream.get(zipFile.getInputStream(entry))) {
+                content.parse(tis, bodyHandler, metadata, context);
+                parsed++;
+            } catch (SAXException e) {
+                if (WriteLimitReachedException.isWriteLimitReached(e)) {
+                    throw e;
+                }
+                failed++;
+                LOG.trace("epub fallback: SAX failure on {}: {}", entry.getName(), e.toString());
+            } catch (IOException e) {
+                failed++;
+                LOG.trace("epub fallback: IO failure on {}: {}", entry.getName(), e.toString());
+            }
+        }
+        LOG.trace("epub fallback summary: parsed={} failed={}", parsed, failed);
+        // Always throw: the caller asked for an EPUB and we couldn't follow the
+        // spine.  Partial content was emitted to xhtml; outer parse() flushes it.
+        throw new IOException("EPUB: fallback recovery (" + reason
+                + "); recovered " + parsed + " HTML/XHTML entries"
+                + (failed > 0 ? " (" + failed + " failed)" : ""));
+    }
+
     private Set<String> checkForDRM(ZipFile zipFile) throws IOException, TikaException,
             SAXException {
         ZipArchiveEntry zae = zipFile.getEntry(META_INF_ENCRYPTION);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
index eec152cb7e5..d206cae6492 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
@@ -475,6 +475,21 @@ private void parseZipFileEntry(ZipFile zipFile, ZipArchiveEntry entry,
             return;
         }
 
+        // Defensive: mirror the streaming path's canReadEntryData gate so a
+        // truncated / unsupported entry in a salvaged ZipFile records an
+        // embedded-stream exception (caller-visible signal) instead of
+        // silently disappearing when getInputStream/parseEmbedded fail
+        // partway through.
+        if (!zipFile.canReadEntryData(entry)) {
+            EmbeddedDocumentUtil.recordEmbeddedStreamException(
+                    new TikaException("Can't read archive stream (" + name + ")"),
+                    parentMetadata);
+            if (name != null && !name.isEmpty()) {
+                xhtml.element("p", name);
+            }
+            return;
+        }
+
         Metadata entryMetadata = buildEntryMetadata(entry, name, context);
 
         writeEntryXhtml(name, xhtml);