Skip to content

Commit

Permalink
TIKA-3677 -- remove sanity check language and other cleanups
Browse files Browse the repository at this point in the history
  • Loading branch information
tballison committed Feb 10, 2022
1 parent 2f01a19 commit 9a144ab
Show file tree
Hide file tree
Showing 19 changed files with 215 additions and 118 deletions.
2 changes: 1 addition & 1 deletion tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
Original file line number Diff line number Diff line change
Expand Up @@ -884,7 +884,7 @@ private void compareFileMagic(String magicDir) throws Exception {
Set<String> tikaLacking = new TreeSet<>();
Set<String> tikaNoMagic = new TreeSet<>();

// Sanity check
// Plausibility check
File dir = new File(magicDir);
if ((new File(dir, "elf")).exists() && (new File(dir, "mime")).exists() &&
(new File(dir, "vorbis")).exists()) {
Expand Down
4 changes: 3 additions & 1 deletion tika-core/src/main/java/org/apache/tika/io/EndianUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,9 @@ public static long readLongBE(InputStream stream) throws IOException, BufferUnde
public static long readUE7(InputStream stream) throws IOException {
int i;
long v = 0;
while ((i = stream.read()) >= 0) {
int max = 6;
int read = 0;
while ((i = stream.read()) >= 0 && read++ < max) {
v = v << 7;
if ((i & 128) == 128) {
// Continues
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,10 @@ private long readUInt32(DataInputStream input) throws IOException {
}

private int readUInt24(DataInputStream input) throws IOException {
int uint = input.read() << 16;
uint += input.read() << 8;
uint += input.read();
//readUnsignedByte ensures EOFException
int uint = input.readUnsignedByte() << 16;
uint += input.readUnsignedByte() << 8;
uint += input.readUnsignedByte();
return uint;
}

Expand Down Expand Up @@ -206,7 +207,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
break;
}

int datalen = readUInt24(datainput); //body length
final int datalen = readUInt24(datainput); //body length
readUInt32(datainput); // timestamp
readUInt24(datainput); // streamid

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -266,23 +266,23 @@ private boolean skipToPropertyInfoSection(InputStream stream, byte[] header)
// The offset is stored in the header from 0x20 onwards
long offsetToSection = EndianUtils.getLongLE(header, 0x20);

// Sanity check the offset. Some files seem to use a different format,
// Bounds check the offset. Some files seem to use a different format,
// and the offset isn't available at 0x20. Until we can work out how
// to find the offset in those files, skip them if detected
if (offsetToSection > 0xa00000l) {
// Header should never be more than 10mb into the file, something is wrong
offsetToSection = 0;
}

// Work out how far to skip, and sanity check
// Work out how far to skip, and bounds check
long toSkip = offsetToSection - header.length;
if (offsetToSection == 0) {
return false;
}
while (toSkip > 0) {
byte[] skip = new byte[Math.min((int) toSkip, 0x4000)];
IOUtils.readFully(stream, skip);
toSkip -= skip.length;
long skipped = IOUtils.skipFully(stream, toSkip);
if (skipped != toSkip) {
throw new TikaException("Failed to skip: " + toSkip +
" bytes; skipped: " + skipped);
}
return true;
}
Expand Down Expand Up @@ -329,7 +329,7 @@ private int skipToCustomProperties(InputStream stream) throws IOException, TikaE
// We should now have the count
int count = EndianUtils.readUShortLE(stream);

// Sanity check it
// Plausibility check it
if (count > 0 && count < 0x7f) {
// Looks plausible
return count;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ public class PRTParser extends AbstractParser {
* How long do we allow a text run to claim to be, before we
* decide we're confused and it's not really text after all?
*/
private static final int MAX_SANE_TEXT_LENGTH = 0x0800;
private static final int MAX_TEXT_LENGTH = 0x0800;

public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
Expand Down Expand Up @@ -145,8 +145,8 @@ private void handleNoteText(InputStream stream, XHTMLContentHandler xhtml)
}

int length = EndianUtils.readUShortLE(stream);
if (length <= MAX_SANE_TEXT_LENGTH) {
// Length sanity check passed
if (length <= MAX_TEXT_LENGTH) {
// Length check passed
handleText(length, stream, xhtml);
}
}
Expand All @@ -170,15 +170,15 @@ private void handleViewName(int typeA, int typeB, InputStream stream, XHTMLConte
byte[] b2 = new byte[2];
IOUtils.readFully(stream, b2);
int length = EndianUtils.getUShortLE(b2);
if (length > 1 && length <= MAX_SANE_TEXT_LENGTH) {
// Length sanity check passed
if (length > 1 && length <= MAX_TEXT_LENGTH) {
// Length check passed
handleText(length, stream, xhtml);
} else {
// Was probably something else
l5.record(b2[0]);
l5.record(b2[1]);
}
} else if (maybeLength > 0 && maybeLength < MAX_SANE_TEXT_LENGTH) {
} else if (maybeLength > 0 && maybeLength < MAX_TEXT_LENGTH) {
// Looks like it's straight into the text
handleText(maybeLength, stream, xhtml);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ public void parsePE(XHTMLContentHandler xhtml, Metadata metadata, InputStream st
// Grab the PE header offset
int peOffset = EndianUtils.readIntLE(stream);

// Sanity check - while it may go anywhere, it's normally in the first few kb
// Reasonability check - while it may go anywhere, it's normally in the first few kb
if (peOffset > 4096 || peOffset < 0x3f) {
return;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import org.apache.tika.config.Field;
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.TikaMemoryLimitException;
import org.apache.tika.io.EndianUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Photoshop;
Expand All @@ -46,6 +48,10 @@ public class BPGParser extends AbstractImageParser {
protected static final int EXTENSION_TAG_ICC_PROFILE = 2;
protected static final int EXTENSION_TAG_XMP = 3;
protected static final int EXTENSION_TAG_THUMBNAIL = 4;

//50 MB -- throw TikaMemoryLimitException if xmp or exif is allegedly longer than this
private static final int DEFAULT_MAX_RECORD_LENGTH = 50 * 1024 * 1024;

private static final long serialVersionUID = -161736541253892772L;
private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
new HashSet<>(
Expand All @@ -55,6 +61,8 @@ public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
}

private int maxRecordLength = DEFAULT_MAX_RECORD_LENGTH;

@Override
void extractMetadata(InputStream stream, ContentHandler contentHandler, Metadata metadata,
ParseContext parseContext)
Expand Down Expand Up @@ -145,6 +153,12 @@ void extractMetadata(InputStream stream, ContentHandler contentHandler, Metadata
while (extensionsDataSeen < extensionDataLength) {
int extensionType = (int) EndianUtils.readUE7(stream);
int extensionLength = (int) EndianUtils.readUE7(stream);
if (extensionLength > maxRecordLength) {
throw new TikaMemoryLimitException("extension length (" +
extensionLength + " bytes) is greater than 'maxRecordLength' (" +
maxRecordLength + " bytes). If this file is not corrupt, " +
"consider bumping the maxRecordLength via tika-config.xml");
}
switch (extensionType) {
case EXTENSION_TAG_EXIF:
metadataExtractor.parseRawExif(stream, extensionLength, true);
Expand All @@ -153,7 +167,7 @@ void extractMetadata(InputStream stream, ContentHandler contentHandler, Metadata
handleXMP(stream, extensionLength, metadataExtractor);
break;
default:
stream.skip(extensionLength);
IOUtils.skipFully(stream, extensionLength);
}
extensionsDataSeen += extensionLength;
}
Expand All @@ -163,8 +177,22 @@ void extractMetadata(InputStream stream, ContentHandler contentHandler, Metadata
// We can't do anything with these parts
}

/**
 * Sets the upper bound, in bytes, on the declared length of a single
 * extension record that this parser will accept.
 * <p>
 * When an extension's declared length exceeds this value, parsing is aborted
 * with a {@code TikaMemoryLimitException} instead of allocating a buffer of
 * that size. The field is initialised to {@code DEFAULT_MAX_RECORD_LENGTH}
 * (50 MB). The value is stored as given; no range validation is done here.
 *
 * @param maxRecordLength maximum accepted record length in bytes
 */
@Field
public void setMaxRecordLength(int maxRecordLength) {
this.maxRecordLength = maxRecordLength;
}

protected void handleXMP(InputStream stream, int xmpLength, ImageMetadataExtractor extractor)
throws IOException, TikaException, SAXException {
if (xmpLength < 0) {
throw new TikaException("xmp length must be >= 0");
}
if (xmpLength > maxRecordLength) {
throw new TikaMemoryLimitException("xmplength (" + xmpLength + " bytes) is larger than maxXMPLength (" +
maxRecordLength + "). Consider setting maxXMPLength to a greater value for " +
"this parser via" +
" tika-config.xml if this file is not corrupt.");
}
byte[] xmp = new byte[xmpLength];
IOUtils.readFully(stream, xmp);
extractor.parseRawXMP(xmp);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
image_length -= 8;//for the bytes read so far
if (image_length > MAX_IMAGE_LENGTH_BYTES) {
throw new TikaMemoryLimitException(image_length, MAX_IMAGE_LENGTH_BYTES);
} else if (image_length < 0) {
throw new TikaException("image length must be >= 0");
}

byte[] full_file = new byte[image_length];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@

import org.apache.tika.config.Field;
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.TikaMemoryLimitException;
import org.apache.tika.io.EndianUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Photoshop;
Expand Down Expand Up @@ -234,7 +235,7 @@ private ResourceBlock(InputStream stream, int maxDataLengthBytes)
// Do we have use for the data segment?
if (captureData(id)) {
if (dataLen > maxDataLengthBytes) {
throw new TikaException(
throw new TikaMemoryLimitException(
"data length must be < " + maxDataLengthBytes + ": " + dataLen);
}
data = new byte[dataLen];
Expand Down
Loading

0 comments on commit 9a144ab

Please sign in to comment.