Skip to content
Permalink
Browse files
Merge pull request #561 from Ewocker/OAK-9758
OAK-9758 error out if tika dependencies are missing and improve loggi…
  • Loading branch information
thomasmueller committed May 12, 2022
2 parents ad59229 + 2fab417 commit ed36bf1ac9781e8b3db0cf6d30f644a3c27f7e1f
Show file tree
Hide file tree
Showing 8 changed files with 182 additions and 77 deletions.
@@ -19,14 +19,6 @@

package org.apache.jackrabbit.oak.index;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import com.google.common.base.Strings;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Sets;
@@ -37,6 +29,14 @@
import org.apache.jackrabbit.oak.run.cli.OptionsBean;
import org.apache.jackrabbit.oak.run.cli.OptionsBeanFactory;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class IndexOptions implements OptionsBean {

public static final OptionsBeanFactory FACTORY = IndexOptions::new;
@@ -50,6 +50,7 @@ public class IndexOptions implements OptionsBean {
private final OptionSpec<Void> definitions;
private final OptionSpec<Void> dumpIndex;
private final OptionSpec<Void> reindex;
private final OptionSpec<Void> ignoreMissingTikaDep;
private final OptionSpec<Void> asyncIndex;
private final OptionSpec<Void> importIndex;
private final OptionSpec<Void> docTraversal;
@@ -99,6 +100,7 @@ public IndexOptions(OptionParser parser){

dumpIndex = parser.accepts("index-dump", "Dumps index content");
reindex = parser.accepts("reindex", "Reindex the indexes specified by --index-paths or --index-definitions-file");
ignoreMissingTikaDep = parser.accepts("ignore-missing-tika-dep", "Ignore when there are missing tika dependencies and continue to run");
asyncIndex = parser.accepts("async-index", "Runs async index cycle");

asyncIndexLanes = parser.accepts("async-index-lanes", "Comma separated list of async index lanes for which the " +
@@ -207,6 +209,10 @@ public boolean isReindex() {
return options.has(reindex);
}

/**
 * Tells whether the {@code ignore-missing-tika-dep} option was supplied.
 *
 * <p>The option is registered in this class's constructor with the description
 * "Ignore when there are missing tika dependencies and continue to run".
 *
 * @return whatever {@code options.has(ignoreMissingTikaDep)} reports
 */
public boolean isIgnoreMissingTikaDep() {
    final boolean ignoreRequested = options.has(ignoreMissingTikaDep);
    return ignoreRequested;
}

/**
 * Tells whether the {@code async-index} option ("Runs async index cycle")
 * was supplied.
 *
 * @return whatever {@code options.has(asyncIndex)} reports
 */
public boolean isAsyncIndex() {
    final boolean asyncRequested = options.has(asyncIndex);
    return asyncRequested;
}
@@ -19,14 +19,14 @@

package org.apache.jackrabbit.oak.run.cli;

import java.util.Collections;
import java.util.List;
import java.util.Set;

import joptsimple.OptionParser;
import joptsimple.OptionSet;
import joptsimple.OptionSpec;

import java.util.Collections;
import java.util.List;
import java.util.Set;

import static java.util.Arrays.asList;

public class CommonOptions implements OptionsBean {
@@ -19,10 +19,6 @@

package org.apache.jackrabbit.oak.run.cli;

import java.io.IOException;
import java.util.EnumSet;
import java.util.Set;

import com.google.common.collect.ClassToInstanceMap;
import com.google.common.collect.Iterables;
import com.google.common.collect.MutableClassToInstanceMap;
@@ -32,6 +28,10 @@
import org.apache.jackrabbit.oak.spi.whiteboard.DefaultWhiteboard;
import org.apache.jackrabbit.oak.spi.whiteboard.Whiteboard;

import java.io.IOException;
import java.util.EnumSet;
import java.util.Set;

import static com.google.common.base.Preconditions.checkNotNull;
import static java.util.Arrays.asList;

@@ -19,17 +19,6 @@

package org.apache.jackrabbit.oak.index;

import java.io.File;
import java.io.IOException;
import java.lang.management.ManagementFactory;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.TimeUnit;

import com.google.common.base.Joiner;
import com.google.common.base.Stopwatch;
import com.google.common.collect.ImmutableMap;
@@ -55,6 +44,17 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.lang.management.ManagementFactory;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.TimeUnit;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static java.util.Collections.emptyMap;
@@ -78,6 +78,10 @@ public class IndexCommand implements Command {
private IndexOptions indexOpts;
private static boolean disableExitOnError;

/**
 * Probes for the Tika parser dependencies by loading the PDF parser class by name.
 *
 * <p>The loaded {@code Class} object is discarded; the method only matters for
 * whether the lookup succeeds or throws.
 *
 * @throws ClassNotFoundException if {@code org.apache.tika.parser.pdf.PDFParser}
 *         cannot be located
 */
public void checkTikaDependency() throws ClassNotFoundException {
    final String pdfParserClassName = "org.apache.tika.parser.pdf.PDFParser";
    Class.forName(pdfParserClassName);
}

@Override
public void execute(String... args) throws Exception {
OptionParser parser = new OptionParser();
@@ -91,6 +95,15 @@ public void execute(String... args) throws Exception {

indexOpts = opts.getOptionBean(IndexOptions.class);

if (indexOpts.isReindex() && !opts.getCommonOpts().isHelpRequested() && !indexOpts.isIgnoreMissingTikaDep()) {
try {
checkTikaDependency();
} catch (Throwable e) {
System.err.println("Missing tika parser dependencies, use --ignore-missing-tika-dep to force continue");
System.exit(1);
}
}

//Clean up before setting up NodeStore as the temp
//directory might be used by NodeStore for cache stuff like persistentCache
setupDirectories(indexOpts);
@@ -19,6 +19,17 @@

package org.apache.jackrabbit.oak.plugins.tika;

import com.google.common.io.ByteSource;
import com.google.common.io.CountingInputStream;
import org.apache.jackrabbit.oak.commons.IOUtils;
import org.apache.jackrabbit.oak.commons.io.LazyInputStream;
import org.apache.jackrabbit.oak.plugins.blob.datastore.TextWriter;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.WriteOutContentHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.Closeable;
import java.io.File;
import java.io.IOException;
@@ -34,17 +45,6 @@
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Supplier;

import com.google.common.io.ByteSource;
import com.google.common.io.CountingInputStream;
import org.apache.jackrabbit.oak.commons.IOUtils;
import org.apache.jackrabbit.oak.commons.io.LazyInputStream;
import org.apache.jackrabbit.oak.plugins.blob.datastore.TextWriter;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.WriteOutContentHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

class TextExtractor implements Closeable {
private static final Logger log = LoggerFactory.getLogger(TextExtractor.class);
private static final Logger parserError = LoggerFactory.getLogger("org.apache.jackrabbit.oak.plugins.tika.ParserError");
@@ -79,6 +79,8 @@ class TextExtractor implements Closeable {
private boolean initialized;
private BinaryStats stats;
private boolean closed;
private boolean linkageErrorFound;
private boolean throwableErrorFound;

public TextExtractor(TextWriter textWriter) {
this.textWriter = textWriter;
@@ -270,22 +272,24 @@ public InputStream get() {
// not being present. This is equivalent to disabling
// selected media types in configuration, so we can simply
// ignore these errors.
log.debug("Failed to extract text from a binary property: {}."
String format = "Failed to extract text from a binary property: {}."
+ " This often happens when some media types are disabled by configuration."
+ " The stack trace is included to flag some 'unintended' failures",
path, e);
+ " The stack trace is included to flag some 'unintended' failures";
log.warn(format, linkageErrorFound ? path : new Object[]{path, e});
linkageErrorFound = true;
parserErrorCount.incrementAndGet();
return ERROR_TEXT;
} catch (Throwable t) {
// Capture and report any other full text extraction problems.
// The special STOP exception is used for normal termination.
if (!handler.isWriteLimitReached(t)) {
parserErrorCount.incrementAndGet();
parserError.debug("Failed to extract text from a binary property: "
+ path
String format = "Failed to extract text from a binary property: {}"
+ " This is a fairly common case, and nothing to"
+ " worry about. The stack trace is included to"
+ " help improve the text extraction feature.", t);
+ " help improve the text extraction feature.";
parserError.info(format, throwableErrorFound ? path : new Object[]{path, t});
throwableErrorFound = true;
return ERROR_TEXT;
} else {
parserError.debug("Extracted text size exceeded configured limit({})", maxExtractedLength);
@@ -19,15 +19,18 @@

package org.apache.jackrabbit.oak.index;

import java.util.List;

import joptsimple.OptionParser;
import org.apache.jackrabbit.oak.run.cli.Options;
import org.junit.Before;
import org.junit.Test;

import java.util.List;

import static org.hamcrest.core.IsCollectionContaining.hasItems;
import static org.junit.Assert.*;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertThat;
import static org.junit.Assert.assertTrue;

public class IndexOptionsTest {

0 comments on commit ed36bf1

Please sign in to comment.