Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.Arrays;
Expand Down Expand Up @@ -111,6 +112,7 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
MediaType.image("jpx"), MediaType.image("x-portable-pixmap")
})));
private static Map<String,Boolean> TESSERACT_PRESENT = new HashMap<>();
private static Map<String,Boolean> IMAGE_MAGICK_PRESENT = new HashMap<>();


@Override
Expand Down Expand Up @@ -145,6 +147,16 @@ public boolean hasTesseract(TesseractOCRConfig config) {
if (TESSERACT_PRESENT.containsKey(tesseract)) {
return TESSERACT_PRESENT.get(tesseract);
}
//prevent memory bloat
if (TESSERACT_PRESENT.size() > 100) {
TESSERACT_PRESENT.clear();
}
//check that the parent directory exists
if (! Files.isDirectory(Paths.get(config.getTesseractPath()))) {
TESSERACT_PRESENT.put(tesseract, false);
return false;
}

// Try running Tesseract from there, and see if it exists + works
String[] checkCmd = { tesseract };
boolean hasTesseract = ExternalParser.check(checkCmd);
Expand All @@ -158,14 +170,22 @@ private boolean hasImageMagick(TesseractOCRConfig config) {
String ImageMagick = getImageMagickPath(config);

// Have we already checked for a copy of ImageMagick Program there?
if (TESSERACT_PRESENT.containsKey(ImageMagick)) {
return TESSERACT_PRESENT.get(ImageMagick);
if (IMAGE_MAGICK_PRESENT.containsKey(ImageMagick)) {
return IMAGE_MAGICK_PRESENT.get(ImageMagick);
}
//prevent memory bloat
if (IMAGE_MAGICK_PRESENT.size() > 100) {
IMAGE_MAGICK_PRESENT.clear();
}
//check that directory exists
if (! Files.isDirectory(Paths.get(config.getImageMagickPath()))) {
IMAGE_MAGICK_PRESENT.put(ImageMagick, false);
return false;
}

// Try running ImageMagick program from there, and see if it exists + works
String[] checkCmd = { ImageMagick };
boolean hasImageMagick = ExternalParser.check(checkCmd);
TESSERACT_PRESENT.put(ImageMagick, hasImageMagick);
IMAGE_MAGICK_PRESENT.put(ImageMagick, hasImageMagick);

return hasImageMagick;

Expand All @@ -178,9 +198,9 @@ private String getImageMagickPath(TesseractOCRConfig config) {
static boolean hasPython() {
// check if python is installed and it has the required dependencies for the rotation program to run
boolean hasPython = false;

TemporaryResources tmp = null;
try {
TemporaryResources tmp = new TemporaryResources();
tmp = new TemporaryResources();
File importCheck = tmp.createTemporaryFile();
String prg = "import numpy, matplotlib, skimage";
OutputStreamWriter out = new OutputStreamWriter(new FileOutputStream(importCheck), Charset.forName("UTF-8"));
Expand All @@ -192,10 +212,11 @@ static boolean hasPython() {
hasPython = true;
}

tmp.close();

} catch (Exception e) {

} finally {
IOUtils.closeQuietly(tmp);
}

return hasPython;
Expand Down Expand Up @@ -311,20 +332,26 @@ public void parseInline(InputStream stream, XHTMLContentHandler xhtml, ParseCont

/**
* This method is used to process the image to an OCR-friendly format.
* @param streamingObject input image to be processed
* @param scratchFile input image to be processed
* @param config TesseractOCRconfig class to get ImageMagick properties
* @throws IOException if an input error occurred
* @throws TikaException if an exception timed out
*/
private void processImage(File streamingObject, TesseractOCRConfig config) throws IOException, TikaException {
private void processImage(File scratchFile, TesseractOCRConfig config) throws IOException, TikaException {

// fetch rotation script from resources
InputStream in = getClass().getResourceAsStream("rotation.py");
TemporaryResources tmp = new TemporaryResources();
File rotationScript = tmp.createTemporaryFile();
Files.copy(in, rotationScript.toPath(), StandardCopyOption.REPLACE_EXISTING);

String cmd = "python -W ignore " + rotationScript.getAbsolutePath() + " -f " + streamingObject.getAbsolutePath();

CommandLine commandLine = new CommandLine("python");
String[] args = {"-W",
"ignore",
rotationScript.getAbsolutePath(),
"-f",
scratchFile.getAbsolutePath()};
commandLine.addArguments(args, true);
String angle = "0";

DefaultExecutor executor = new DefaultExecutor();
Expand All @@ -333,24 +360,33 @@ private void processImage(File streamingObject, TesseractOCRConfig config) throw
executor.setStreamHandler(streamHandler);

// determine the angle of rotation required to make the text horizontal
CommandLine cmdLine = CommandLine.parse(cmd);
if(config.getApplyRotation() && hasPython()) {
try {
executor.execute(cmdLine);
angle = outputStream.toString("UTF-8").trim();
executor.execute(commandLine);
String tmpAngle = outputStream.toString("UTF-8").trim();
//verify that you've gotten a numeric value out
Double.parseDouble(tmpAngle);
angle = tmpAngle;
} catch(Exception e) {

}
}

// process the image - parameter values can be set in TesseractOCRConfig.properties
String line = getImageMagickPath(config) + " -density " + config.getDensity() + " -depth " + config.getDepth() +
" -colorspace " + config.getColorspace() + " -filter " + config.getFilter() +
" -resize " + config.getResize() + "% -rotate "+ angle + " " + streamingObject.getAbsolutePath() +
" " + streamingObject.getAbsolutePath();
cmdLine = CommandLine.parse(line);
commandLine = new CommandLine(getImageMagickPath(config));
args = new String[]{
"-density", Integer.toString(config.getDensity()),
"-depth ", Integer.toString(config.getDepth()),
"-colorspace", config.getColorspace(),
" -filter ", config.getFilter(),
"-resize", config.getResize() + "%",
"-rotate", angle,
scratchFile.getAbsolutePath(),
scratchFile.getAbsolutePath()
};
commandLine.addArguments(args, true);
try {
executor.execute(cmdLine);
executor.execute(commandLine);
} catch(Exception e) {

}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -171,4 +171,60 @@ public void testValidateResize() {
config.setResize(1000);
}

/**
 * Setting a tessdata path that contains a NUL character is expected to be
 * refused with an {@link IllegalArgumentException}.
 */
@Test(expected=IllegalArgumentException.class)
public void testDataPathCheck() {
    new TesseractOCRConfig().setTessdataPath("blah\u0000deblah");
}

/**
 * Setting a Tesseract path that contains a NUL character is expected to be
 * refused with an {@link IllegalArgumentException}.
 */
@Test(expected=IllegalArgumentException.class)
public void testPathCheck() {
    new TesseractOCRConfig().setTesseractPath("blah\u0000deblah");
}

/**
 * An extra Tesseract config entry whose key contains a space is expected to
 * be refused with an {@link IllegalArgumentException}.
 */
@Test(expected=IllegalArgumentException.class)
public void testBadOtherKey() {
    new TesseractOCRConfig().addOtherTesseractConfig("bad bad", "bad");
}

/**
 * An extra Tesseract config entry whose value contains a space is expected to
 * be refused with an {@link IllegalArgumentException}.
 */
@Test(expected=IllegalArgumentException.class)
public void testBadOtherValue() {
    new TesseractOCRConfig().addOtherTesseractConfig("bad", "bad bad");
}

/**
 * An extra Tesseract config entry whose value contains a backslash is
 * expected to be refused with an {@link IllegalArgumentException}.
 */
@Test(expected=IllegalArgumentException.class)
public void testBadOtherValueSlash() {
    new TesseractOCRConfig().addOtherTesseractConfig("bad", "bad\\bad");
}

/**
 * An extra Tesseract config entry whose value contains a control character
 * is expected to be refused with an {@link IllegalArgumentException}.
 */
@Test(expected=IllegalArgumentException.class)
public void testBadOtherValueControl() {
    new TesseractOCRConfig().addOtherTesseractConfig("bad", "bad\u0001bad");
}

/**
 * A plain alphabetic key/value pair must be accepted; the test passes as long
 * as no exception is thrown.
 */
@Test
public void testGoodOtherParameters() {
    new TesseractOCRConfig().addOtherTesseractConfig("good", "good");
}

/**
 * The path setter does not require the path to exist: the value handed to
 * the setter must be returned unchanged by the getter.
 */
@Test
public void testBogusPathCheck() {
    final TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
    // deliberately a path that is not expected to exist
    ocrConfig.setTesseractPath("blahdeblahblah");
    final String stored = ocrConfig.getTesseractPath();
    assertEquals("blahdeblahblah", stored);
}

/**
 * A colorspace name containing punctuation is expected to be refused with an
 * {@link IllegalArgumentException}.
 */
@Test(expected=IllegalArgumentException.class)
public void testBadColorSpace() {
    new TesseractOCRConfig().setColorspace("someth!ng");
}
}
2 changes: 1 addition & 1 deletion tika-serialization/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
<parent>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parent</artifactId>
<version>1.19-SNAPSHOT</version>
<version>1.18-SNAPSHOT</version>
<relativePath>../tika-parent/pom.xml</relativePath>
</parent>

Expand Down
2 changes: 1 addition & 1 deletion tika-server/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
<parent>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parent</artifactId>
<version>1.19-SNAPSHOT</version>
<version>1.18-SNAPSHOT</version>
<relativePath>../tika-parent/pom.xml</relativePath>
</parent>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;
import org.apache.cxf.jaxrs.ext.multipart.Attachment;
Expand Down Expand Up @@ -82,6 +84,9 @@

@Path("/tika")
public class TikaResource {

private static Pattern ALLOWABLE_HEADER_CHARS = Pattern.compile("(?i)^[-/_\\.A-Z0-9 ]+$");

public static final String GREETING = "This is Tika Server (" + new Tika().toString() + "). Please PUT\n";
public static final String X_TIKA_OCR_HEADER_PREFIX = "X-Tika-OCR";
public static final String X_TIKA_PDF_HEADER_PREFIX = "X-Tika-PDF";
Expand Down Expand Up @@ -190,40 +195,103 @@ public static InputStream getInputStream(InputStream is, HttpHeaders headers) {
* @throws WebApplicationException thrown when field cannot be found.
*/
private static void processHeaderConfig(MultivaluedMap<String, String> httpHeaders, Object object, String key, String prefix) {
// NOTE(review): this span looks like a rendered diff in which an earlier implementation
// (direct reflective field writes) and a later one (setter lookup) are shown together;
// the braces do not balance as displayed. The comments below describe the visible
// statements only -- confirm against the real file before relying on them.
try {
String property = StringUtils.removeStart(key, prefix);
Field field = object.getClass().getDeclaredField(StringUtils.uncapitalize(property));

// Field-based variant: make the declared field writable and assign the first header
// value, converted according to the field's declared type.
field.setAccessible(true);
if (field.getType() == String.class) {
field.set(object, httpHeaders.getFirst(key));
} else if (field.getType() == int.class) {
field.setInt(object, Integer.parseInt(httpHeaders.getFirst(key)));
} else if (field.getType() == double.class) {
field.setDouble(object, Double.parseDouble(httpHeaders.getFirst(key)));
} else if (field.getType() == boolean.class) {
field.setBoolean(object, Boolean.parseBoolean(httpHeaders.getFirst(key)));
} else {
//couldn't find a directly accessible field
//try for setX(String s)
String setter = StringUtils.uncapitalize(property);
setter = "set"+setter.substring(0,1).toUpperCase(Locale.US)+setter.substring(1);
Method m = null;
try {
m = object.getClass().getMethod(setter, String.class);
} catch (NoSuchMethodException e) {
//swallow

// Setter-based variant: the header name minus the prefix is the property name.
try {String property = StringUtils.removeStart(key, prefix);
// The declared field is only consulted for its type; a missing field leaves this null.
Field field = null;
try {
field = object.getClass().getDeclaredField(StringUtils.uncapitalize(property));
} catch (NoSuchFieldException e) {
//swallow
}
// Setter name is "set" followed by the property with its first character upper-cased.
String setter = property;
setter = "set"+setter.substring(0,1).toUpperCase(Locale.US)+setter.substring(1);
//default assume string class
//if there's a more specific type, e.g. double, int, boolean
//try that.
Class clazz = String.class;
if (field != null) {
// int and Integer fields both map to int.class; for double, float and boolean the
// primitive and boxed field types are kept as distinct setter parameter types.
if (field.getType() == int.class || field.getType() == Integer.class) {
clazz = int.class;
} else if (field.getType() == double.class) {
clazz = double.class;
} else if (field.getType() == Double.class) {
clazz = Double.class;
} else if (field.getType() == float.class) {
clazz = float.class;
} else if (field.getType() == Float.class) {
clazz = Float.class;
} else if (field.getType() == boolean.class) {
clazz = boolean.class;
} else if (field.getType() == Boolean.class) {
clazz = Boolean.class;
}
if (m != null) {
m.invoke(object, httpHeaders.getFirst(key));
}

Method m = tryToGetMethod(object, setter, clazz);
//if you couldn't find more specific setter, back off
//to string setter and try that.
// NOTE(review): clazz is not reset to String.class when this fallback succeeds, so the
// dispatch below still parses the value by the field's type and would hand a
// non-String argument to the String setter -- confirm whether that is intended.
if (m == null && clazz != String.class) {
m = tryToGetMethod(object, setter, String.class);
}

if (m != null) {
// NOTE(review): the first header value is trimmed without a null check; a null here
// would surface through the catch-all below -- verify getFirst's contract.
String val = httpHeaders.getFirst(key);
val = val.trim();
// Only String-typed values go through checkTrustWorthy; the other types are
// converted with the corresponding parse method before the setter is invoked.
if (clazz == String.class) {
checkTrustWorthy(setter, val);
m.invoke(object, val);
} else if (clazz == int.class || clazz == Integer.class) {
m.invoke(object, Integer.parseInt(val));
} else if (clazz == double.class || clazz == Double.class) {
m.invoke(object, Double.parseDouble(val));
} else if (clazz == boolean.class || clazz == Boolean.class) {
m.invoke(object, Boolean.parseBoolean(val));
} else if (clazz == float.class || clazz == Float.class) {
m.invoke(object, Float.parseFloat(val));
} else {
throw new IllegalArgumentException("setter must be String, int, float, double or boolean...for now");
}
} else {
throw new NoSuchMethodException("Couldn't find: "+setter);
}

// Any failure above (lookup, validation, parsing, invocation) is reported to the client
// as a WebApplicationException.
// NOTE(review): the original exception is not attached as the cause, and the message
// always names X_TIKA_OCR_HEADER_PREFIX rather than the prefix argument.
} catch (Throwable ex) {
throw new WebApplicationException(String.format(Locale.ROOT,
"%s is an invalid %s header", key, X_TIKA_OCR_HEADER_PREFIX));
}
}

/**
 * Validates a String header value before it is passed to a setter.
 * <p>
 * Rejects the call when either argument is <code>null</code>, when the setter
 * name contains "trusted" (case-insensitive), or when the value is not made up
 * entirely of characters permitted by <code>ALLOWABLE_HEADER_CHARS</code>.
 *
 * @param setter name of the setter that is about to be invoked
 * @param val header value that would be handed to the setter
 * @throws IllegalArgumentException if any of the checks above fails
 */
private static void checkTrustWorthy(String setter, String val) {
    if (setter == null || val == null) {
        throw new IllegalArgumentException("setter and val must not be null");
    }
    if (setter.toLowerCase(Locale.US).contains("trusted")) {
        throw new IllegalArgumentException("Can't call a trusted method via tika-server headers");
    }
    // matches() requires the whole value to fit the pattern. find() with ^...$ is weaker:
    // '$' also matches just before a final line terminator, so a value ending in a
    // newline would have been accepted.
    if (!ALLOWABLE_HEADER_CHARS.matcher(val).matches()) {
        throw new IllegalArgumentException("Header val: "+val +" contains illegal characters. " +
                "Must contain: TikaResource.ALLOWABLE_HEADER_CHARS");
    }
}

/**
 * Looks up a public one-argument method on the runtime class of
 * <code>object</code>. A {@link NoSuchMethodException} is swallowed on
 * purpose so that callers can probe for optional setters.
 *
 * @param object instance whose runtime class is searched
 * @param method name of the method to look for
 * @param clazz type of the method's single parameter
 * @return the matching method, or <code>null</code> if there is none
 */
private static Method tryToGetMethod(Object object, String method, Class<?> clazz) {
    try {
        return object.getClass().getMethod(method, clazz);
    } catch (NoSuchMethodException ignored) {
        // a missing method is an expected outcome here; report it as null
        return null;
    }
}

@SuppressWarnings("serial")
public static void fillMetadata(Parser parser, Metadata metadata, ParseContext context, MultivaluedMap<String, String> httpHeaders) {
String fileName = detectFilename(httpHeaders);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,6 @@

package org.apache.tika.server;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

import javax.ws.rs.core.Response;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
import org.apache.cxf.jaxrs.client.WebClient;
import org.apache.cxf.jaxrs.ext.multipart.Attachment;
Expand All @@ -35,6 +26,15 @@
import org.apache.tika.server.resource.TikaResource;
import org.junit.Test;

import javax.ws.rs.core.Response;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

public class TikaResourceTest extends CXFTestBase {
public static final String TEST_DOC = "test.doc";
public static final String TEST_PASSWORD_PROTECTED = "password.xls";
Expand Down Expand Up @@ -279,4 +279,52 @@ public void testExtractTextAcceptPlainText() throws Exception {
responseMsg
);
}

/**
 * Sends the Tesseract path as an OCR header twice. A value ending in a NUL
 * character is expected to yield a 500; a value consisting only of letters
 * and a space is expected to yield a 200.
 */
@Test
public void testDataIntegrityCheck() throws Exception {
    final String pathHeader = TikaResource.X_TIKA_OCR_HEADER_PREFIX + "tesseractPath";

    // first request: header value carries a trailing NUL character
    WebClient rejectedClient = WebClient.create(endPoint + TIKA_PATH)
            .type("application/pdf")
            .accept("text/plain")
            .header(pathHeader, "C://tmp//hello.bat\u0000");
    Response rejected = rejectedClient.put(ClassLoader.getSystemResourceAsStream("testOCR.pdf"));
    assertEquals(500, rejected.getStatus());

    // second request: header value is a path that need not exist
    WebClient acceptedClient = WebClient.create(endPoint + TIKA_PATH)
            .type("application/pdf")
            .accept("text/plain")
            .header(pathHeader, "bogus path");
    Response accepted = acceptedClient.put(ClassLoader.getSystemResourceAsStream("testOCR.pdf"));
    assertEquals(200, accepted.getStatus());
}

/**
 * A header that targets a property whose name starts with "trusted" is
 * expected to be answered with a 500.
 */
@Test
public void testTrustedMethodPrevention() {
    final String trustedHeader = TikaResource.X_TIKA_OCR_HEADER_PREFIX + "trustedPageSeparator";

    WebClient client = WebClient.create(endPoint + TIKA_PATH)
            .type("application/pdf")
            .accept("text/plain")
            .header(trustedHeader, "\u0010");
    Response result = client.put(ClassLoader.getSystemResourceAsStream("testOCR.pdf"));

    assertEquals(500, result.getStatus());
}

/**
 * A decimal value supplied through a PDF header for the average character
 * tolerance is expected to be accepted, giving a 200.
 */
@Test
public void testFloatInHeader() {
    final String toleranceHeader = TikaResource.X_TIKA_PDF_HEADER_PREFIX + "averageCharTolerance";

    WebClient client = WebClient.create(endPoint + TIKA_PATH)
            .type("application/pdf")
            .accept("text/plain")
            .header(toleranceHeader, "2.0");
    Response result = client.put(ClassLoader.getSystemResourceAsStream("testOCR.pdf"));

    assertEquals(200, result.getStatus());
}
}
2 changes: 1 addition & 1 deletion tika-translate/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
<parent>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parent</artifactId>
<version>1.19-SNAPSHOT</version>
<version>1.18-SNAPSHOT</version>
<relativePath>../tika-parent/pom.xml</relativePath>
</parent>

Expand Down
2 changes: 1 addition & 1 deletion tika-xmp/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
<parent>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parent</artifactId>
<version>1.19-SNAPSHOT</version>
<version>1.18-SNAPSHOT</version>
<relativePath>../tika-parent/pom.xml</relativePath>
</parent>

Expand Down