diff --git a/poi-scratchpad/src/test/java/org/apache/poi/hwpf/TestHWPFParser.java b/poi-scratchpad/src/test/java/org/apache/poi/hwpf/TestHWPFParser.java
index 093b860f5d4..6bda9d9e4c1 100644
--- a/poi-scratchpad/src/test/java/org/apache/poi/hwpf/TestHWPFParser.java
+++ b/poi-scratchpad/src/test/java/org/apache/poi/hwpf/TestHWPFParser.java
@@ -17,15 +17,21 @@ Licensed to the Apache Software Foundation (ASF) under one or more
 
 package org.apache.poi.hwpf;
 
+import org.apache.poi.POIDataSamples;
+import org.apache.poi.hwpf.extractor.WordExtractor;
 import org.apache.poi.poifs.filesystem.OfficeXmlFileException;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.junit.jupiter.api.Test;
 
+import java.io.File;
 import java.io.InputStream;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
 import static org.junit.jupiter.api.Assertions.assertInstanceOf;
 import static org.junit.jupiter.api.Assertions.assertNotNull;
 import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
 
 public class TestHWPFParser {
     @Test
@@ -39,6 +45,120 @@ void testDoc() throws Exception {
         }
     }
 
+    /** System property that makes POIFS tolerate reads of blocks beyond EOF. */
+    private static final String ALLOW_CORRUPT_BLOCKS = "org.apache.poi.poifs.allowCorruptBlocks";
+
+    /** Upper bound on cause-chain traversal in {@link #getRootCause(Throwable)}. */
+    private static final int MAX_CAUSE_DEPTH = 20;
+
+    /**
+     * Reads a real-world .doc file (issue_1041.doc) through {@link HWPFParser} with
+     * tolerant mode enabled and checks that text can be extracted from it.
+     */
+    @Test
+    void testDocRead() throws Exception {
+        String previous = System.setProperty(ALLOW_CORRUPT_BLOCKS, "true");
+        try (InputStream stream = HWPFTestDataSamples.openSampleFileStream("issue_1041.doc");
+             HWPFDocument doc = HWPFParser.parse(stream)) {
+            assertDocumentHasText(doc);
+        } finally {
+            // Runs after the resources are closed; puts back whatever was configured before.
+            restoreAllowCorruptBlocks(previous);
+        }
+    }
+
+    /**
+     * Checks that without tolerant mode the same file is rejected with an
+     * {@link HWPFReadException} whose root cause reports a read beyond EOF.
+     */
+    @Test
+    void testDocReadStrictMode() {
+        // Force strict mode even if the JVM was started with the property set.
+        String previous = System.clearProperty(ALLOW_CORRUPT_BLOCKS);
+        try {
+            HWPFReadException exception = assertThrows(HWPFReadException.class, () -> {
+                try (InputStream stream = HWPFTestDataSamples.openSampleFileStream("issue_1041.doc");
+                     HWPFDocument doc = HWPFParser.parse(stream)) {
+                    // parse() is expected to throw before this body is reached
+                }
+            });
+
+            Throwable rootCause = getRootCause(exception);
+            assertInstanceOf(IndexOutOfBoundsException.class, rootCause,
+                    "Expected root cause to be IndexOutOfBoundsException, but was: " + rootCause.getClass().getName());
+            String message = rootCause.getMessage();
+            assertTrue(message != null && message.contains("beyond EOF"),
+                    "Expected exception message to contain 'beyond EOF', but got: " + message);
+        } finally {
+            restoreAllowCorruptBlocks(previous);
+        }
+    }
+
+    /**
+     * Reads the same file through a {@link POIFSFileSystem} opened from a {@link File},
+     * again with tolerant mode enabled.
+     */
+    @Test
+    void testWpsDocByFs() throws Exception {
+        File file = POIDataSamples.getDocumentInstance().getFile("issue_1041.doc");
+        String previous = System.setProperty(ALLOW_CORRUPT_BLOCKS, "true");
+        try (POIFSFileSystem fs = new POIFSFileSystem(file);
+             WordExtractor extractor = new WordExtractor(fs)) {
+            assertExtractsText(extractor);
+        } finally {
+            restoreAllowCorruptBlocks(previous);
+        }
+    }
+
+    /** Reads a second sample (issue_1041_2.doc) without enabling tolerant mode. */
+    @Test
+    void testOffice97_2003DocRead() throws Exception {
+        try (InputStream stream = HWPFTestDataSamples.openSampleFileStream("issue_1041_2.doc");
+             HWPFDocument doc = HWPFParser.parse(stream)) {
+            assertDocumentHasText(doc);
+        }
+    }
+
+    /** Wraps {@code doc} in a {@link WordExtractor}, closes it afterwards, and checks the text. */
+    private static void assertDocumentHasText(HWPFDocument doc) throws Exception {
+        assertNotNull(doc);
+        try (WordExtractor extractor = new WordExtractor(doc)) {
+            assertExtractsText(extractor);
+        }
+    }
+
+    /** Asserts that the extractor yields text that is neither {@code null} nor blank. */
+    private static void assertExtractsText(WordExtractor extractor) {
+        String text = extractor.getText();
+        assertNotNull(text, "Extracted text should not be null");
+        assertFalse(text.trim().isEmpty(), "Extracted text should not be blank");
+    }
+
+    /** Puts the tolerant-mode property back to {@code previous}; {@code null} means it was unset. */
+    private static void restoreAllowCorruptBlocks(String previous) {
+        if (previous == null) {
+            System.clearProperty(ALLOW_CORRUPT_BLOCKS);
+        } else {
+            System.setProperty(ALLOW_CORRUPT_BLOCKS, previous);
+        }
+    }
+
+    /**
+     * Follows {@link Throwable#getCause()} to the end of the chain, stopping after
+     * {@link #MAX_CAUSE_DEPTH} steps or at a self-referencing cause.
+     */
+    private static Throwable getRootCause(Throwable throwable) {
+        Throwable cause = throwable;
+        for (int depth = 0; depth < MAX_CAUSE_DEPTH; depth++) {
+            Throwable next = cause.getCause();
+            if (next == null || next == cause) {
+                break;
+            }
+            cause = next;
+        }
+        return cause;
+    }
+
     @Test
     void testFailOnDocx() throws Exception {
         try (InputStream stream = HWPFTestDataSamples.openSampleFileStream("sample.docx")) {
diff --git a/poi/src/main/java/org/apache/poi/poifs/nio/ByteArrayBackedDataSource.java b/poi/src/main/java/org/apache/poi/poifs/nio/ByteArrayBackedDataSource.java
index 15c4f6ff078..5aedabbedbe 100644
--- a/poi/src/main/java/org/apache/poi/poifs/nio/ByteArrayBackedDataSource.java
+++ b/poi/src/main/java/org/apache/poi/poifs/nio/ByteArrayBackedDataSource.java
@@ -44,14 +44,22 @@ public ByteArrayBackedDataSource(byte[] data) {
 
     @Override
     public ByteBuffer read(int length, long position) {
+        // Handle non-standard files that have references to blocks beyond EOF
         if(position >= size) {
-            throw new IndexOutOfBoundsException(
-                    "Unable to read " + length + " bytes from " +
-                    position + " in stream of length " + size
-            );
+            // Check system property dynamically to allow runtime configuration
+            boolean allowCorruptBlocks = Boolean.getBoolean("org.apache.poi.poifs.allowCorruptBlocks");
+            if (!allowCorruptBlocks) {
+                throw new IndexOutOfBoundsException(
+                    "Position " + position + " is beyond EOF (" + size + "). " +
+                    "Set system property 'org.apache.poi.poifs.allowCorruptBlocks' to true " +
+                    "to allow reading corrupt files with missing blocks.");
+            }
+            // Return a zero-filled buffer in tolerant mode
+            // This allows processing of documents with corrupted block chains (e.g., some WPS files)
+            return ByteBuffer.allocate(length);
         }
 
         int toRead = (int)Math.min(length, size - position);
         return ByteBuffer.wrap(buffer, (int)position, toRead);
     }
 
diff --git a/poi/src/main/java/org/apache/poi/poifs/nio/FileBackedDataSource.java b/poi/src/main/java/org/apache/poi/poifs/nio/FileBackedDataSource.java
index 52ff3a23814..35589370cef 100644
--- a/poi/src/main/java/org/apache/poi/poifs/nio/FileBackedDataSource.java
+++ b/poi/src/main/java/org/apache/poi/poifs/nio/FileBackedDataSource.java
@@ -95,7 +95,16 @@ public FileChannel getChannel() {
     @Override
     public ByteBuffer read(int length, long position) throws IOException {
         if (position >= size()) {
-            throw new IndexOutOfBoundsException("Position " + position + " past the end of the file");
+            // Check system property dynamically to allow runtime configuration
+            boolean allowCorruptBlocks = Boolean.getBoolean("org.apache.poi.poifs.allowCorruptBlocks");
+            if (!allowCorruptBlocks) {
+                throw new IndexOutOfBoundsException(
+                    "Position " + position + " is beyond EOF (" + size() + "). " +
+                    "Set system property 'org.apache.poi.poifs.allowCorruptBlocks' to true " +
+                    "to allow reading corrupt files with missing blocks.");
+            }
+            // Return a zero-filled buffer in tolerant mode
+            return ByteBuffer.allocate(length);
         }
 
         // TODO Could we do the read-only case with MapMode.PRIVATE instead?
diff --git a/test-data/document/issue_1041.doc b/test-data/document/issue_1041.doc
new file mode 100644
index 00000000000..cb3c17d9180
Binary files /dev/null and b/test-data/document/issue_1041.doc differ
diff --git a/test-data/document/issue_1041_2.doc b/test-data/document/issue_1041_2.doc
new file mode 100644
index 00000000000..bc5f7c5b648
Binary files /dev/null and b/test-data/document/issue_1041_2.doc differ