Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 113 additions & 4 deletions poi-scratchpad/src/test/java/org/apache/poi/hwpf/TestHWPFParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,16 @@ Licensed to the Apache Software Foundation (ASF) under one or more

package org.apache.poi.hwpf;

import org.apache.poi.POIDataSamples;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.OfficeXmlFileException;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.junit.jupiter.api.Test;

import java.io.File;
import java.io.InputStream;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertInstanceOf;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.*;

public class TestHWPFParser {
@Test
Expand All @@ -39,6 +40,114 @@ void testDoc() throws Exception {
}
}

/**
 * Reads the real-world sample {@code issue_1041.doc} with the
 * {@code org.apache.poi.poifs.allowCorruptBlocks} system property set to {@code "true"}
 * and verifies that non-blank text can be extracted from the parsed document.
 *
 * <p>The sample is described as having non-standard formatting that WPS/Word can still open.
 *
 * <p>System properties are JVM-global, so the value the property had before this test is
 * captured and restored afterwards instead of being cleared unconditionally; otherwise a
 * value supplied by the build (or by another test) would be silently lost.
 *
 * @throws Exception if the sample cannot be opened or parsed
 */
@Test
void testDocRead() throws Exception {
    final String key = "org.apache.poi.poifs.allowCorruptBlocks";
    // setProperty hands back the prior value, or null when the property was unset
    final String previous = System.setProperty(key, "true");
    try (
        InputStream stream = HWPFTestDataSamples.openSampleFileStream("issue_1041.doc");
        HWPFDocument doc = HWPFParser.parse(stream)
    ) {
        assertNotNull(doc);
        WordExtractor extractor = new WordExtractor(doc);
        String text = extractor.getText();

        // Verify actual text content, not just non-null
        assertNotNull(text, "Extracted text should not be null");
        assertFalse(text.isEmpty(), "Extracted text should not be empty");
        assertFalse(text.trim().isEmpty(), "Extracted text should not be blank");
    } finally {
        // Runs after the resources above are closed; put the property back exactly as found
        if (previous == null) {
            System.clearProperty(key);
        } else {
            System.setProperty(key, previous);
        }
    }
}

/**
 * Verifies that with the {@code org.apache.poi.poifs.allowCorruptBlocks} system property
 * unset (strict mode), parsing {@code issue_1041.doc} fails with an
 * {@link HWPFReadException} whose root cause is an {@link IndexOutOfBoundsException}
 * mentioning "beyond EOF".
 *
 * <p>The property is JVM-global: its previous value is remembered when it is cleared and
 * restored once the test finishes, so this test does not change the configuration seen by
 * other tests.
 *
 * @throws Exception if the test set-up fails unexpectedly
 */
@Test
void testDocReadStrictMode() throws Exception {
    final String key = "org.apache.poi.poifs.allowCorruptBlocks";
    // Ensure strict mode is enabled (default behavior); clearProperty returns the old value or null
    final String previous = System.clearProperty(key);
    try {
        // Should throw HWPFReadException (wrapping IndexOutOfBoundsException) in strict mode
        HWPFReadException exception = assertThrows(HWPFReadException.class, () -> {
            try (
                InputStream stream = HWPFTestDataSamples.openSampleFileStream("issue_1041.doc");
                HWPFDocument doc = HWPFParser.parse(stream)
            ) {
                // This should not succeed in strict mode
            }
        });

        // Verify the root cause is IndexOutOfBoundsException about corrupt blocks
        Throwable rootCause = getRootCause(exception);
        assertInstanceOf(IndexOutOfBoundsException.class, rootCause, "Expected root cause to be IndexOutOfBoundsException, but was: " + rootCause.getClass().getName());

        // Guard against a message-less exception so the failure is an assertion, not an NPE
        String message = rootCause.getMessage();
        assertNotNull(message, "Expected root cause to carry a message");
        assertTrue(message.contains("beyond EOF"),
            "Expected exception message to contain 'beyond EOF', but got: " + message);
    } finally {
        // Nothing to undo when the property was not set before the test
        if (previous != null) {
            System.setProperty(key, previous);
        }
    }
}

/**
 * Follows {@link Throwable#getCause()} links starting at {@code throwable} and returns the
 * last throwable reached.
 *
 * <p>The walk stops when a throwable has no cause, when a throwable reports itself as its
 * own cause, or after 20 links have been followed, whichever comes first. The hop limit
 * keeps the walk finite even if the chain loops back on itself.
 *
 * @param throwable the throwable to start from
 * @return the deepest throwable reached within the hop limit
 */
private static Throwable getRootCause(Throwable throwable) {
    final int maxHops = 20; // Reasonable limit for exception chains
    Throwable current = throwable;

    for (int hops = 0; hops < maxHops; hops++) {
        Throwable next = current.getCause();
        if (next == null || next == current) {
            break;
        }
        current = next;
    }
    return current;
}

/**
 * Opens the sample {@code issue_1041.doc} through a file-backed {@link POIFSFileSystem}
 * with the {@code org.apache.poi.poifs.allowCorruptBlocks} system property set to
 * {@code "true"} and verifies that non-blank text can be extracted.
 *
 * <p>Both the file system and the extractor are closed via try-with-resources so the
 * underlying file handle is released even when an assertion fails. The system property is
 * restored to whatever value it had before the test rather than being cleared outright.
 *
 * @throws Exception if the sample cannot be opened or read
 */
@Test
void testWpsDocByFs() throws Exception {
    final String key = "org.apache.poi.poifs.allowCorruptBlocks";
    // setProperty hands back the prior value, or null when the property was unset
    final String previous = System.setProperty(key, "true");
    try {
        POIDataSamples instance = POIDataSamples.getDocumentInstance();
        File file = instance.getFile("issue_1041.doc");
        try (
            POIFSFileSystem fs = new POIFSFileSystem(file);
            WordExtractor extractor = new WordExtractor(fs)
        ) {
            String text = extractor.getText();

            // Verify actual text content, not just non-null
            assertNotNull(text, "Extracted text should not be null");
            assertFalse(text.isEmpty(), "Extracted text should not be empty");
            assertFalse(text.trim().isEmpty(), "Extracted text should not be blank");
        }
    } finally {
        // Put the property back exactly as it was found
        if (previous == null) {
            System.clearProperty(key);
        } else {
            System.setProperty(key, previous);
        }
    }
}

/**
 * Parses the sample {@code issue_1041_2.doc} and checks that the text extracted from the
 * resulting document is present and not blank.
 *
 * @throws Exception if the sample cannot be opened or parsed
 */
@Test
void testOffice97_2003DocRead() throws Exception {
    try (InputStream in = HWPFTestDataSamples.openSampleFileStream("issue_1041_2.doc");
         HWPFDocument document = HWPFParser.parse(in)) {
        assertNotNull(document);

        final String extracted = new WordExtractor(document).getText();

        // A non-null result is not enough: the content itself must be non-blank
        assertNotNull(extracted, "Extracted text should not be null");
        assertFalse(extracted.isEmpty(), "Extracted text should not be empty");
        final String trimmed = extracted.trim();
        assertFalse(trimmed.isEmpty(), "Extracted text should not be blank");
    }
}


@Test
void testFailOnDocx() throws Exception {
try (InputStream stream = HWPFTestDataSamples.openSampleFileStream("sample.docx")) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,14 +44,23 @@ public ByteArrayBackedDataSource(byte[] data) {

/**
 * Returns a buffer over at most {@code length} bytes of the backing array, starting at
 * {@code position}.
 *
 * <p>When {@code position} is at or beyond {@code size}, the outcome depends on the
 * {@code org.apache.poi.poifs.allowCorruptBlocks} system property, which is consulted on
 * each such call:
 * <ul>
 *   <li>unset or not {@code "true"}: an {@link IndexOutOfBoundsException} is thrown;</li>
 *   <li>{@code "true"}: a newly allocated, zero-filled buffer of {@code length} bytes is
 *       returned so that files referencing blocks past EOF can still be processed.</li>
 * </ul>
 *
 * <p>Otherwise the returned buffer wraps the backing array directly and is shortened when
 * fewer than {@code length} bytes remain before {@code size}.
 *
 * @param length   the number of bytes requested
 * @param position the offset at which to start reading
 * @return a buffer over the requested range, or a zero-filled buffer in tolerant mode
 * @throws IndexOutOfBoundsException if {@code position} is beyond EOF and tolerant mode is off
 */
@Override
public ByteBuffer read(int length, long position) {
    // Handle non-standard files that have references to blocks beyond EOF
    if (position >= size) {
        // Check system property dynamically to allow runtime configuration
        boolean allowCorruptBlocks = Boolean.getBoolean("org.apache.poi.poifs.allowCorruptBlocks");
        if (!allowCorruptBlocks) {
            throw new IndexOutOfBoundsException(
                "Position " + position + " is beyond EOF (" + size + "). " +
                "Set system property 'org.apache.poi.poifs.allowCorruptBlocks' to true " +
                "to allow reading corrupt files with missing blocks.");
        }
        // Return a zero-filled buffer in tolerant mode
        // This allows processing of documents with corrupted block chains (e.g., some WPS files)
        return ByteBuffer.allocate(length);
    }

    // Clamp to the bytes actually available so the wrap below never runs past the data
    int toRead = (int) Math.min(length, size - position);

    return ByteBuffer.wrap(buffer, (int) position, toRead);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,16 @@ public FileChannel getChannel() {
@Override
public ByteBuffer read(int length, long position) throws IOException {
if (position >= size()) {
throw new IndexOutOfBoundsException("Position " + position + " past the end of the file");
// Check system property dynamically to allow runtime configuration
boolean allowCorruptBlocks = Boolean.getBoolean("org.apache.poi.poifs.allowCorruptBlocks");
if (!allowCorruptBlocks) {
throw new IndexOutOfBoundsException(
"Position " + position + " is beyond EOF (" + size() + "). " +
"Set system property 'org.apache.poi.poifs.allowCorruptBlocks' to true " +
"to allow reading corrupt files with missing blocks.");
}
// Return a zero-filled buffer in tolerant mode
return ByteBuffer.allocate(length);
}

// TODO Could we do the read-only case with MapMode.PRIVATE instead?
Expand Down
Binary file added test-data/document/issue_1041.doc
Binary file not shown.
Binary file added test-data/document/issue_1041_2.doc
Binary file not shown.