From e8ba769a52f68046335576b18523c76888efaf72 Mon Sep 17 00:00:00 2001 From: epugh Date: Sun, 20 Oct 2019 16:12:42 -0400 Subject: [PATCH 1/8] Adding in a demo test.. There is some debugging as well --- .../java/org/apache/tika/cli/TikaCLITest.java | 10 ++++++++ .../test-data/tika-config-ocr-pdf.xml | 23 +++++++++++++++++++ .../org/apache/tika/parser/ParseContext.java | 2 ++ .../tika/parser/ocr/TesseractOCRParser.java | 13 ++++++++++- .../parser/ocr/TesseractOCRParserTest.java | 6 +++++ 5 files changed, 53 insertions(+), 1 deletion(-) create mode 100644 tika-app/src/test/resources/test-data/tika-config-ocr-pdf.xml diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java index ab17b68e1b4..997d3ac73de 100644 --- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java +++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java @@ -470,6 +470,16 @@ public void testConfig() throws Exception { assertTrue(content.contains("apple")); assertTrue(content.contains("org.apache.tika.parser.html.HtmlParser")); } + + @Test + public void testStupidTesseractConfig() throws Exception { + String[] params = new String[]{"--config="+testDataFile.toString()+"/tika-config-ocr-pdf.xml", "--jsonRecursive","--extract","--pretty-print","-v","-x", resourcePrefix+"testPDF_childAttachments.pdf"}; + TikaCLI.main(params); + String content = outContent.toString(UTF_8.name()); + System.err.println(content); + //assertTrue(content.contains("apple")); + //assertTrue(content.contains("org.apache.tika.parser.html.HtmlParser")); + } @Test public void testConfigIgnoreInit() throws Exception { diff --git a/tika-app/src/test/resources/test-data/tika-config-ocr-pdf.xml b/tika-app/src/test/resources/test-data/tika-config-ocr-pdf.xml new file mode 100644 index 00000000000..5e60684ef82 --- /dev/null +++ b/tika-app/src/test/resources/test-data/tika-config-ocr-pdf.xml @@ -0,0 +1,23 @@ + + + + + + + + + + /usr/local/bin/ + /usr/local/Cellar/tesseract/4.1.0/share/tessdata + hocr + + + + + ocr_only + true + + + + + diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java index f5c3169bf5a..f97dfcd59a1 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java +++ b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java @@ -59,6 +59,8 @@ public class ParseContext implements Serializable { * @param value the value to be added, or null to remove */ public void set(Class key, T value) { + System.err.println("Here is the key:" + key); + System.err.println("Here is the val:" + value); if (value != null) { context.put(key.getName(), value); } else { diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java index 655aaf8aa66..fb2a8b35fb0 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java @@ -123,6 +123,9 @@ public class TesseractOCRParser extends AbstractParser implements Initializable public Set getSupportedTypes(ParseContext context) { // If Tesseract is installed, offer our supported image types TesseractOCRConfig config = context.get(TesseractOCRConfig.class, defaultConfig); + System.err.println("In getSupportedTypes"); + System.err.println("Default Cojnfig:" + defaultConfig.getOutputType()); + System.err.println("config:" + config.getOutputType()); if (hasTesseract(config)) { return SUPPORTED_TYPES; } @@ -405,6 +408,7 @@ private void processImage(File scratchFile, TesseractOCRConfig config) throws IO private void parse(TikaInputStream tikaInputStream, File tmpOCROutputFile, ParseContext parseContext, XHTMLContentHandler xhtml, TesseractOCRConfig config) throws IOException, SAXException, TikaException { + System.err.println("The Config has output type of " + config.getOutputType()); File tmpTxtOutput = null; try { File input = tikaInputStream.getFile(); @@ -459,7 +463,13 @@ private void parse(TikaInputStream tikaInputStream, File tmpOCROutputFile, Parse */ @Override public void initialize(Map params) throws TikaConfigException { - + System.err.println("Boom, here I am!!!!!!!!!!!!!!"); + System.err.println(params.keySet()); + System.err.println("Params:" + params.toString()); + //String s = params.get("OutputType").getValue().toString(); + //System.err.println("I got " + s); + //defaultConfig.setOutputType(s); + } @Override @@ -467,6 +477,7 @@ public void checkInitialization(InitializableProblemHandler problemHandler) throws TikaConfigException { //this will incorrectly trigger for people who turn off Tesseract //by sending in a bogus tesseract path via a custom TesseractOCRConfig. + System.out.println("Ack, here I am"); //TODO: figure out how to solve that. if (! hasWarned()) { if (hasTesseract(defaultConfig)) { diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java index 9ebcee0685b..bf8f721feef 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java @@ -51,11 +51,15 @@ public class TesseractOCRParserTest extends TikaTest { public static boolean canRun() { TesseractOCRConfig config = new TesseractOCRConfig(); + config.setTesseractPath("/usr/local/bin"); + config.setTessdataPath("/usr/local/Cellar/tesseract/4.1.0/share/tessdata"); TesseractOCRParserTest tesseractOCRTest = new TesseractOCRParserTest(); return tesseractOCRTest.canRun(config); } private boolean canRun(TesseractOCRConfig config) { + config.setTesseractPath("/usr/local/bin"); + config.setTessdataPath("/usr/local/Cellar/tesseract/4.1.0/share/tessdata"); String[] checkCmd = {config.getTesseractPath() + getTesseractProg()}; // If Tesseract is not on the path, do not run the test. return ExternalParser.check(checkCmd); @@ -164,6 +168,8 @@ private String runOCR(String resource, String[] nonOCRContains, int numMetadatas BasicContentHandlerFactory.HANDLER_TYPE handlerType, TesseractOCRConfig.OUTPUT_TYPE outputType) throws Exception { TesseractOCRConfig config = new TesseractOCRConfig(); + config.setTesseractPath("/usr/local/bin"); + config.setTessdataPath("/usr/local/Cellar/tesseract/4.1.0/share/tessdata"); config.setOutputType(outputType); Parser parser = new RecursiveParserWrapper(new AutoDetectParser(), From 9e3bb49072353f6e9e83fb175c2e8eedb2234972 Mon Sep 17 00:00:00 2001 From: epugh Date: Mon, 21 Oct 2019 08:25:44 -0400 Subject: [PATCH 2/8] confirm Tesseract ran --- .../src/test/java/org/apache/tika/cli/TikaCLITest.java | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java index 997d3ac73de..f8c67de424b 100644 --- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java +++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java @@ -472,13 +472,14 @@ public void testConfig() throws Exception { } @Test - public void testStupidTesseractConfig() throws Exception { + public void testModifyAndRunTesseractViaConfig() throws Exception { String[] params = new String[]{"--config="+testDataFile.toString()+"/tika-config-ocr-pdf.xml", "--jsonRecursive","--extract","--pretty-print","-v","-x", resourcePrefix+"testPDF_childAttachments.pdf"}; TikaCLI.main(params); String content = outContent.toString(UTF_8.name()); - System.err.println(content); - //assertTrue(content.contains("apple")); - //assertTrue(content.contains("org.apache.tika.parser.html.HtmlParser")); + assertTrue(content.contains("org.apache.tika.parser.ocr.TesseractOCRParser")); + assertTrue(content.contains("bbox")); + assertTrue(content.contains("ocr_line")); + } @Test From 2e3a60d8b813efc0b565aa8bd75cf202c4d32d9a Mon Sep 17 00:00:00 2001 From: epugh Date: Mon, 21 Oct 2019 08:26:03 -0400 Subject: [PATCH 3/8] remove debugging lines --- .../java/org/apache/tika/parser/ParseContext.java | 2 -- .../apache/tika/parser/ocr/TesseractOCRParser.java | 13 +------------ 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java index f97dfcd59a1..f5c3169bf5a 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java +++ b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java @@ -59,8 +59,6 @@ public class ParseContext implements Serializable { * @param value the value to be added, or null to remove */ public void set(Class key, T value) { - System.err.println("Here is the key:" + key); - System.err.println("Here is the val:" + value); if (value != null) { context.put(key.getName(), value); } else { diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java index fb2a8b35fb0..655aaf8aa66 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java @@ -123,9 +123,6 @@ public class TesseractOCRParser extends AbstractParser implements Initializable public Set getSupportedTypes(ParseContext context) { // If Tesseract is installed, offer our supported image types TesseractOCRConfig config = context.get(TesseractOCRConfig.class, defaultConfig); - System.err.println("In getSupportedTypes"); - System.err.println("Default Cojnfig:" + defaultConfig.getOutputType()); - System.err.println("config:" + config.getOutputType()); if (hasTesseract(config)) { return SUPPORTED_TYPES; } @@ -408,7 +405,6 @@ private void processImage(File scratchFile, TesseractOCRConfig config) throws IO private void parse(TikaInputStream tikaInputStream, File tmpOCROutputFile, ParseContext parseContext, XHTMLContentHandler xhtml, TesseractOCRConfig config) throws IOException, SAXException, TikaException { - System.err.println("The Config has output type of " + config.getOutputType()); File tmpTxtOutput = null; try { File input = tikaInputStream.getFile(); @@ -463,13 +459,7 @@ private void parse(TikaInputStream tikaInputStream, File tmpOCROutputFile, Parse */ @Override public void initialize(Map params) throws TikaConfigException { - System.err.println("Boom, here I am!!!!!!!!!!!!!!"); - System.err.println(params.keySet()); - System.err.println("Params:" + params.toString()); - //String s = params.get("OutputType").getValue().toString(); - //System.err.println("I got " + s); - //defaultConfig.setOutputType(s); - + } @Override @@ -477,7 +467,6 @@ public void checkInitialization(InitializableProblemHandler problemHandler) throws TikaConfigException { //this will incorrectly trigger for people who turn off Tesseract //by sending in a bogus tesseract path via a custom TesseractOCRConfig. - System.out.println("Ack, here I am"); //TODO: figure out how to solve that. if (! hasWarned()) { if (hasTesseract(defaultConfig)) { From ccb8abe7b8ecb43a2b42a695e31bb37e70d51202 Mon Sep 17 00:00:00 2001 From: epugh Date: Mon, 21 Oct 2019 08:27:26 -0400 Subject: [PATCH 4/8] bool not boolean --- tika-app/src/test/resources/test-data/tika-config-ocr-pdf.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tika-app/src/test/resources/test-data/tika-config-ocr-pdf.xml b/tika-app/src/test/resources/test-data/tika-config-ocr-pdf.xml index 5e60684ef82..8199edbeacf 100644 --- a/tika-app/src/test/resources/test-data/tika-config-ocr-pdf.xml +++ b/tika-app/src/test/resources/test-data/tika-config-ocr-pdf.xml @@ -14,8 +14,8 @@ - ocr_only - true + ocr_only + true From d61ab3978689b4e9f671f5dfdbb1aa7527632584 Mon Sep 17 00:00:00 2001 From: epugh Date: Mon, 21 Oct 2019 08:28:17 -0400 Subject: [PATCH 5/8] if we have a TesseractOCRConfig, then populate context so the PDFParser picks it up --- .../java/org/apache/tika/cli/TikaCLI.java | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java index bb3f91af5cf..5a3c7aa1eb5 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java +++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java @@ -49,6 +49,7 @@ import java.util.Enumeration; import java.util.HashMap; import java.util.HashSet; +import java.util.Iterator; import java.util.List; import java.util.Locale; import java.util.Map; @@ -98,6 +99,8 @@ import org.apache.tika.parser.PasswordProvider; import org.apache.tika.parser.RecursiveParserWrapper; import org.apache.tika.parser.html.BoilerpipeContentHandler; +import org.apache.tika.parser.ocr.TesseractOCRConfig; +import org.apache.tika.parser.ocr.TesseractOCRParser; import org.apache.tika.parser.pdf.PDFParserConfig; import org.apache.tika.parser.utils.CommonsDigester; import org.apache.tika.sax.BasicContentHandlerFactory; @@ -193,6 +196,33 @@ private void extractInlineImagesFromPDFs() { context.set(PDFParserConfig.class, pdfParserConfig); } } + + private void enableOCRPDF() throws TikaException, IOException, SAXException { + + configure(); + + CompositeParser parser = (CompositeParser)config.getParser(); + Iterator iter = parser.getAllComponentParsers().iterator(); + while(iter.hasNext()) { + Parser p = iter.next(); + if (p instanceof TesseractOCRParser) { + TesseractOCRConfig tesseractOCRConfig = ((TesseractOCRParser) p).getDefaultConfig(); + context.set(TesseractOCRConfig.class, tesseractOCRConfig); + // Should we mimic the warn? + } + } + + if (configFilePath == null && context.get(PDFParserConfig.class) == null) { + PDFParserConfig pdfParserConfig = new PDFParserConfig(); + pdfParserConfig.setExtractInlineImages(true); + String warn = "As a convenience, TikaCLI has turned on extraction of\n" + + "inline images for the PDFParser (TIKA-2374).\n" + + "Aside from the -z option, this is not the default behavior\n"+ + "in Tika generally or in tika-server."; + LOG.info(warn); + context.set(PDFParserConfig.class, pdfParserConfig); + } + } private class OutputType { public void process( @@ -452,6 +482,7 @@ public void process(String arg) throws Exception { extractDir = new File(dirPath); } else if (arg.equals("-z") || arg.equals("--extract")) { extractInlineImagesFromPDFs(); + enableOCRPDF(); type = NO_OUTPUT; context.set(EmbeddedDocumentExtractor.class, new FileEmbeddedDocumentExtractor()); } else if (arg.equals("-r") || arg.equals("--pretty-print")) { From 239c92a617c17bea1f39f5eac9750c572079cbdb Mon Sep 17 00:00:00 2001 From: epugh Date: Mon, 21 Oct 2019 08:38:05 -0400 Subject: [PATCH 6/8] use small PDF to drastically speed up test --- .../java/org/apache/tika/cli/TikaCLITest.java | 3 ++- .../src/test/resources/test-data/testPDF_bom.pdf | Bin 0 -> 7645 bytes 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 tika-app/src/test/resources/test-data/testPDF_bom.pdf diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java index f8c67de424b..0702c7650cf 100644 --- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java +++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java @@ -473,10 +473,11 @@ public void testConfig() throws Exception { @Test public void testModifyAndRunTesseractViaConfig() throws Exception { - String[] params = new String[]{"--config="+testDataFile.toString()+"/tika-config-ocr-pdf.xml", "--jsonRecursive","--extract","--pretty-print","-v","-x", resourcePrefix+"testPDF_childAttachments.pdf"}; + String[] params = new String[]{"--config="+testDataFile.toString()+"/tika-config-ocr-pdf.xml", "--jsonRecursive","--extract","--pretty-print","-v","-x", resourcePrefix+"testPDF_bom.pdf"}; TikaCLI.main(params); String content = outContent.toString(UTF_8.name()); assertTrue(content.contains("org.apache.tika.parser.ocr.TesseractOCRParser")); + assertTrue(content.contains("Hello")); assertTrue(content.contains("bbox")); assertTrue(content.contains("ocr_line")); diff --git a/tika-app/src/test/resources/test-data/testPDF_bom.pdf b/tika-app/src/test/resources/test-data/testPDF_bom.pdf new file mode 100644 index 0000000000000000000000000000000000000000..3b236e6ab6d6712bb7ae58ba15e4ea2f637c02c7 GIT binary patch literal 7645 zcmb_>cRX9)|9^|p+Dd7uimKTXiNqeYYi~+XLCjha#HyCos8wRstWkT_UbR=vnx*!p zX6;h^5^dl6`}q9xyEl@1p7);Dd7anmc%J)szBa4tm{p}@I6+)Igv`wg+Y751J8wTV zw-Q1C+yFacD?(vm08kNbYmT%4KrxUy04QVm1PMoAexJaQa4EQnohck3DoW^xM8IJ- zgs6+~P!O5-ed3ocUM+kGpSDO!QtjXX2d2sw4g8GCw=bT08D9}6$fzxwa#xbIl97;U zCt6J)yNM95;mmF%(vWS6+l3IV2btJjb15Ty=kjjgxyhMzbo{ilO)u#7Xw`5mlf;^w z$)No#n(kxEz2Ts=vF243VzYAa^$*!kOkNw|ek^=H@Oh7WJV2*MEh$?pY53kTA>7vV zmw%Wm?6k0R!Ty3E0@$0_OYmQ2tm$eG2LM%J=I7rCxGfUE^K32tf$ldwYp9&uv;r~nke4*?qxVj}cw9BjYekyS<5nP|X~fJd0Y zq+|dN@LUdtW_U5$$2SQ7UI}`5LQ)=N&2MHxIkR{*S zk3aU@N)k!l^OUtz&q}htVK!~vxxn6s(uJJ&K!8Y0xxT_kdK68ow>GumgmZYM zEwmR@Rd}ea_UhrY#&NSGHZ!zNDfonvyt}}3u44K@U~0}r)(ZdPm%>c_+sYQ!olsq zY=FA9=);tdt2;+o--`o;evs>dv2G!_#k!prz3N8VBcHvGiWfDx$%$)6P&Nfg1!2jf>1b1A;Z*@yDzX6rK7lLWD=U+rzWD|+%p zuRIov(iG~iE7M)?0=^l00~NM2vt`ci1B;p2&eV=-MAHG@ zZk(>J=7y9z*w0GWb5cz^De}ohKh!YH$?`FHHgcy^NT}vg|M0v*!Q^rs;?xTY^L%I? zcr|-)f~eZ#Y`!fweNTJEFd)izH}J^dgz1Ff#KlwT?7}kNiopueiuFp-?mJ{8#8O%> zvNo(XQWRa=_<5-?Up_kQ(e_}eitf@bU|?SWGDq3Ex5ZBCf6a*ZbMG+8yB?$bIQFFV z?z@FO6O-a5?>?#5k?Cdh;pVBh1nruQ6E8fDJ5L`mE zwLVQ({hh&XL(VkX?GkHe)ImEF0<_VP>v6$=e{>);b^Kk&3vj-voCba*2rebi!Ji!| zq^YrQ>`dGnV7l+0IK9i}U8iD^(Bu`!w)CB-QFP*R=}FaTP*jUufB&m|0pYY8A*s?D z&slQoukL8u#2hkoJ(cpRy?*jYW2_*S6Esz(#^Dsa{P?KK&;8#Ne9lSd9L&wn|C_t8 zEQiG~geWH|DFJhYn*y+WrVcRp{RozbfAi;`Jni^@F)x-(oiK!nAwC65Q%4NdVu>DO z?=OrA0{zRTBn%0AVrTwy6L9>+{(o{TmUjQ{#5pSiHJpr*=X-}H!U_J|(#8UqMm-E~b9~kS~|9uKTu>0SCpHL_- z0Q<-N7oWht?49G7emp!pe^1H3w_|J&{|wg8;3>mwFx%LF1ojEc9FsLngs{CN&i7+Z z0VoK-3E>9;KoAHN00MzA`!Y}tlV?j4aa;2zaEu#3aYqyEZqLVuxyBBLZE%8lFfB=# z{R6nAxdrAXhz|hNK*DXb0RsF0;KQFVAr#E>6Mg{Thhi)_o1f^mRvTw zKe%3&Loq*poyK?(kE)sAa-{#26#o>9xQAJIEh$6t?C?QdgD-hxpQ`fmN=kQD21TDl z>J`+t1sr8r8&rSo=^5-vB`dEUP+m7VW*?i-9b3t~UFqs;;`X*a%<_S z*V1YD%`aoOp7?@3UAZ~PNm`zNVXx?e)V-o7N)^!jxGC1B#=5fAJo!Wd7skI{%R9&o zSnWq&5Ih#i`Ain7;zn|R$3wvp8T&|IuHTv8CYx_D9g@dxM8La`Og ze)Q)TpGgYH7{WJdfHmm?lT*WT!Z}Xsxa&O&)5yV?`eoa>I=>`&25`l^gF_+TRF#Y4?$V{}q#-{)AF_@KK*+>+HN&^|T&wulu+eyJ`%aEq< z^$gJ%rAQo!dz1G4Dn09ZufXZYr=fFrYIfsJfM~ay4r}EF&cb0g8bmxp4E8Vg$F$9@ zi3^d4%sxbq%{??sJ6hV6dT5uF8u7#Ba5v;5GgsT4W9hcTHUAfBG$Tv=t<)zSVbZu+ zacGvkL&Xr{W6F@M`Hm8TqKvEbZMcW`f?CXvg^TIi9;CIcCq+K$r))^t*IA-+yM%^*6T%f;Z|_jk)PeJs>I1yR+!8MG@{mG%O>}j%|apm%fN>9s`NK zEF{Jrz4fTS70${g(RHh8cmknxXgGNasEob4vu#D4ySZs$PJsFerL}3)1?_wr*V>vYp<%C*2ReL;t=`x=^VzM zc+%H~JQG?Y+vC$Jg9hp4SexdRb=1Areh1c#E7C>_ZupkcQn{duw0+J$=mI$d1=qv+ zj-w8OFYH1I&m*DA0vU#hP%j5nXJ5(F%NtCoXX50_-fAdZ#0%xZZ*vV7$&a5cdpD*A z8Yf*bufJ|UdHgs(h<7hj;+~!wDeH_HikkE}z9F*!!G>?f%+3517xyc~_0=&~=9bqM z7;zk@r?{!z3AF1K2J7gR8*KTB@o9_k)!VBnMc$L79Y3)3mNf?6r;eznM&Zg^4kvJ5 zA7fZ$-Nbo_NANbf_3Y}z8g~+vyNENzofs8IQHewsV@G4ZdgAY@SDZ{e@s!{CRg+s^ zB#KZ&iuoip8KNj_8~q#r!&g`R6GA-|v#g#IXJ(X95Nw%jkk>Kq7mq~0XJZV*HL)C$ zLE#vteJ4{Bl9RQ*u-W45f#br%;2uM=75B1anSACo5l@y@lirBT<(n@Dwpb5|o_61h z53^pHe5Lg2gkd?^H&NX++-&)6S>zXGxAeKp28sCRP8AeM%@c=@c%wLt#rG-8LOff$ z7ozM}E2p-}wmch2XDow7kCfI>Yl>?JukoELh`tz6icGNclSNO15fB}Gw?g8(-)yZC z4(|!Qa{tDb(}YGP($|YkvmQbXVK_X6R8ehTMk#Zn6O8K?wGq{o<`rW0Q(FUgENj<9 zhfvL4p}f-7DRm){EutZwva-2&TwmVE#}FwnDyobTQqF9yu#>6_NmV=3PI^099pmvB zk=HU&UFf!Gy7ExLhO4LPTE*3*CX+$s%=aFh7ECcwZywTi8tvr<IAQXEGn5H?yaA*3D7q=uE<)z zmZ?UMk#)G!Sjg1L}yU)1HoWU8XBl7@5RT@U7wml+l=w|pf! zQf3xA>Eq(i>wBh^c8&ro?Y$4mbvZ4w*gwjdp&f$63kKvh?ng$>6gUyQUPA? z6df9!n-B-v>l6$SD2F;qPIa-7@eP#2@y+%;r2Mh+*jCm;!B;4I;hbPbm$25%;s(d) z=Zb`Stw9-SqD??v`lQ@IQFrp&pg2DtRbzeWFpim4F-qa!ocN+)gB9Bq!=VS$jowB{ z-R2r(k8gbS_S8<&(jwdr)#Ht~4 zo)O|~=u%~@>8EqerW+|lJaW8(1m0zgUnXP}S(JZe?%Ty?&L4UYW%hER=g1mXayZuj z&EGz-%;VjY%mhoF&{YR(SY~9_^{ua+`ikb!P53+xE_>m}NgMi&PBp$mCUpK)(2O<0 zZyUCfoMHRE;!(uOy6Qm$vYgpI;W_dWEwh=x8`1hzZ?4fBw{bn=x(WiGm2si2JD9V1 zzwf5l8Vl`r(X?HenD5b1w|SQ|%=!+HL~LSkC%Dr^yRSn_KA*ojVWK$7MJOkpGeWUO zc46LPgEW{GZ`J1o<-N3Pp2@ou?**u0<#&Q~7LyqVKL|YL|6> za5vV5Rj{|OQ01veS8P)=3eTC5?@v)RJ`Z~!%cw66BA|LvM*0L z1d`Y8n30oCQZ*)HP1nR2ld@l)f_fJM4SZL7XL{&D3@R1~cEVQy8FupCH#l_Jzws36 zctm8=klnwc22Fg&{T0fT{}15G`yXv>SOy*>J%^Cj^?GE`ockB zpkcLGuqn$y-RI8)p;tj?Qd*U&dx&;8%J4jBMRVMENg_$TQUcGp{Ukz}=9=*%$)=M} zpQ6%QjE^MCsVDefu=6i@#{00-q{K}I+zQ&SrCg_)tSIu|)LPyqJ{^p-ETJQoH>Rl& zpP^cKPXBZ^j++@~)|hamVC#6`QfNfi)peQIYtA6KwlzyPn0`OMN&d$tj?2Y1!-{R% zI**v&wT>&4`h2UYKv8hu-(Bw2wBq38h<`t6UTkXN^vt5qT|7>luMiAe?q>!rF7!=F zl8NAvEDp|IL9;6DR@n`t-Pf~<4Oc>6{p3lf7;C|G>il^!*!{p}q>Q0@mB>!)P@=`! z`ZzVZ?o>tpsKtHh>G$5Hvq+^kF}h7_(HZN8)Xm9DW$Y?!Q zlfo0n7S0iVfmZA{Uku&lTMD>X3tp$7!B3!%3h7 zP%(4(bcg&n+Nzgh$lo}-Gm-J@T9#Yrp=Rsd8+Gj{pwB@XiyIg(rw0&wwTNC0zw0*mF_jG97+)ye9PmRSMq`hv9=sWS29QV!brA821I;;yfC z$#8BfHpt8*95mW)O`JH^UtELiAN>$x`8hqn@KeSel!K!8*y3&UeZG$=JB`C)wCtEC|$-EKJ+rRjxhQje;5g<7l>Lf@864e{3S$bTK?h$EjOaU$Ag z3_kdZE_sm>;Hsv`BzIjhoNILY5~N>Aeh3w&@HXyMF`_e7vj?man~4(>9AA-^EgN`hHZV(^DSoji1a9)lLSEzg*VoQ|L=xUX5|N zM)`@g8Dvs@_{PZFRn(4miJs?W+{c|O;bF`cN_vuIcl_UiR_;8y%CX7vuv?CTc(*#1 zH!DEaNPUrAbyz$8mSCRgt?U4N61f)k0XAIq0rl3toOC%*Wwo1zVUSBG)`tSJ&{(V*8Q@67O zzU8lHdHHk&qPUC95e|(|a<&{43iuuG|_#gV+?-CIN#0U8;&VhaxR@k&`k(S6O za8vd_Zv8HNuwg_kfQkm-Appe9h57IR)SO@lq#FQJ0y(dyppbud`Yi~uVM`;@rj~!X z^;?5ulM)1TV`LR5=9>os<>A)j=4Qow{xtHtBEt3u1jZ^=KZV9WgvGy!fLPxF=d;0- zO)zB|QzsKl(F^pu=!2MhH5ut4LzomUgyM*s2QXye5N@eC$lIZNlGe1-Pq= z9Redkf_^HRKyh(ZMZrJNbC=ZrqDrC=a5Icz1_lvw|8W3#p->(uzzpyU;}rm7+{PXN z+us;Bm=}|pzhfXithV@fjGG&)!TkdRW0bLfVm!Q<8U6h^5HBBQ>i@v_|D!L2_h0rv zyjU&pAAR{Sg|dHOU=T)K`zHp0{0rm#kGb*kVf4g*^yTH_#bh0O{241=eyrT~PYm)O z&+-2&R=fh-e~CR30mJBdi1UJ}hNT-ElQ#fR-Odi9JYyFYW);cVn%QAw=%3e87$I7i z7X$-?VWzw$Cfs0%35XvA=7X6+`MCM`V8&)J0e(@!|NY2M)zlG*`42i@Fnrv+P(o&A JX%!j5{|9J$0@45g literal 0 HcmV?d00001 From f50546ec16105ee31b2cb2d18a4bf7dff0db6f1a Mon Sep 17 00:00:00 2001 From: epugh Date: Mon, 21 Oct 2019 08:39:32 -0400 Subject: [PATCH 7/8] unwind hard coded change --- .../org/apache/tika/parser/ocr/TesseractOCRParserTest.java | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java index bf8f721feef..9ebcee0685b 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java @@ -51,15 +51,11 @@ public class TesseractOCRParserTest extends TikaTest { public static boolean canRun() { TesseractOCRConfig config = new TesseractOCRConfig(); - config.setTesseractPath("/usr/local/bin"); - config.setTessdataPath("/usr/local/Cellar/tesseract/4.1.0/share/tessdata"); TesseractOCRParserTest tesseractOCRTest = new TesseractOCRParserTest(); return tesseractOCRTest.canRun(config); } private boolean canRun(TesseractOCRConfig config) { - config.setTesseractPath("/usr/local/bin"); - config.setTessdataPath("/usr/local/Cellar/tesseract/4.1.0/share/tessdata"); String[] checkCmd = {config.getTesseractPath() + getTesseractProg()}; // If Tesseract is not on the path, do not run the test. return ExternalParser.check(checkCmd); @@ -168,8 +164,6 @@ private String runOCR(String resource, String[] nonOCRContains, int numMetadatas BasicContentHandlerFactory.HANDLER_TYPE handlerType, TesseractOCRConfig.OUTPUT_TYPE outputType) throws Exception { TesseractOCRConfig config = new TesseractOCRConfig(); - config.setTesseractPath("/usr/local/bin"); - config.setTessdataPath("/usr/local/Cellar/tesseract/4.1.0/share/tessdata"); config.setOutputType(outputType); Parser parser = new RecursiveParserWrapper(new AutoDetectParser(), From 19c6a85ff51763eb709717f6d35b4cc5706744d1 Mon Sep 17 00:00:00 2001 From: epugh Date: Mon, 21 Oct 2019 08:40:46 -0400 Subject: [PATCH 8/8] clean method, and add better todo --- .../src/main/java/org/apache/tika/cli/TikaCLI.java | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java index 5a3c7aa1eb5..5a81c048299 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java +++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java @@ -208,20 +208,10 @@ private void enableOCRPDF() throws TikaException, IOException, SAXException { if (p instanceof TesseractOCRParser) { TesseractOCRConfig tesseractOCRConfig = ((TesseractOCRParser) p).getDefaultConfig(); context.set(TesseractOCRConfig.class, tesseractOCRConfig); - // Should we mimic the warn? + // TODO Should we mimic the warn from extractInlineImagesFromPDFs? } } - if (configFilePath == null && context.get(PDFParserConfig.class) == null) { - PDFParserConfig pdfParserConfig = new PDFParserConfig(); - pdfParserConfig.setExtractInlineImages(true); - String warn = "As a convenience, TikaCLI has turned on extraction of\n" + - "inline images for the PDFParser (TIKA-2374).\n" + - "Aside from the -z option, this is not the default behavior\n"+ - "in Tika generally or in tika-server."; - LOG.info(warn); - context.set(PDFParserConfig.class, pdfParserConfig); - } } private class OutputType {