Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions conf/tika-config.xml.template
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,11 @@
-->
<properties>
<service-loader initializableProblemHandler="ignore" loadErrorHandler="warn" />
<!--
Set the pool size of SAX parsers to a higher value if the fetcher is
parsing with many threads and Tika complains about "Consider
increasing the XMLReaderUtils.POOL_SIZE". Tika's default pool
size is 10. Cf. NUTCH-2578, TIKA-2645, NUTCH-2582.
-->
<xml-reader-utils poolSize="20" />
</properties>
4 changes: 4 additions & 0 deletions src/java/org/apache/nutch/fetcher/Fetcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.util.MimeUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.NutchTool;
Expand Down Expand Up @@ -201,6 +202,9 @@ public void run(Context innerContext) throws IOException {
int threadCount = conf.getInt("fetcher.threads.fetch", 10);
LOG.info("Fetcher: threads: {}", threadCount);

// NUTCH-2582: adapt Tika MIME detector pool size to thread count
MimeUtil.setPoolSize(Math.max(10, threadCount / 2));

int timeoutDivisor = conf.getInt("fetcher.threads.timeout.divisor", 2);
LOG.info("Fetcher: time-out divisor: {}", timeoutDivisor);

Expand Down
28 changes: 15 additions & 13 deletions src/java/org/apache/nutch/util/MimeUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,30 +22,24 @@
import java.lang.invoke.MethodHandles;

import org.apache.hadoop.conf.Configuration;

import org.apache.nutch.protocol.ProtocolOutput;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MimeType;
import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.mime.MimeTypesFactory;

import org.apache.tika.mime.MimeTypesReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.nutch.protocol.ProtocolOutput;

/**
* @author mattmann
* @since NUTCH-608
*
* <p>
* This is a facade class to insulate Nutch from its underlying Mime Type
* substrate library, <a href="http://incubator.apache.org/tika/">Apache
* Tika</a>. Any mime handling code should be placed in this utility
* class, and hidden from the Nutch classes that rely on it.
* </p>
* This is a facade class to insulate Nutch from its underlying Mime Type
* substrate library, <a href="https://tika.apache.org/">Apache Tika</a>. Any
* Mime handling code should be placed in this utility class, and hidden from
* the Nutch classes that rely on it.
*/
public final class MimeUtil {

Expand All @@ -64,6 +58,14 @@ public final class MimeUtil {
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());

/**
 * Sets the pool size of Tika's {@link MimeTypesReader}.
 * <p>
 * Delegates to {@link MimeTypesReader#setPoolSize(int)}. A
 * {@link TikaException} thrown by that call is logged as an error
 * (including the requested size) and is not propagated to the caller.
 *
 * @param poolSize
 *          the pool size handed on to {@link MimeTypesReader}
 */
public static void setPoolSize(int poolSize) {
  try {
    MimeTypesReader.setPoolSize(poolSize);
  } catch (TikaException e) {
    // Not rethrown: the caller proceeds regardless. The trailing throwable
    // argument makes SLF4J log the stack trace in addition to the message.
    LOG.error("Failed to set pool size to {}", poolSize, e);
  }
}

public MimeUtil(Configuration conf) {
ObjectCache objectCache = ObjectCache.get(conf);
tika = (Tika) objectCache.getObject(Tika.class.getName());
Expand Down