diff --git a/CHANGES.txt b/CHANGES.txt index c5e26f7130..0beca25181 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,6 +1,8 @@ Nutch Change Log Release nutchgora - Current Development +* NUTCH-1354 nutchgora support fetcher.queue.depth.multiplier property (ferdy) + * NUTCH-1353 nutchgora DomainStatistics support crawlId, counter bug and reformatting (ferdy) * NUTCH-1350 remove unused dependancy because of access restriction (ferdy) diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index e0811d8bc6..71312ebb41 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -686,6 +686,16 @@ + + fetcher.queue.depth.multiplier + 50 + (EXPERT) The fetcher buffers the incoming URLs into queues based on the [host|domain|IP] + (see param fetcher.queue.mode). The depth of the queue is the number of threads times the value of this parameter. + A large value requires more memory but can improve the performance of the fetch when the order of the URLs in the fetch list + is not optimal. + + + diff --git a/src/java/org/apache/nutch/fetcher/FetcherReducer.java b/src/java/org/apache/nutch/fetcher/FetcherReducer.java index fbdba823d8..056be60995 100644 --- a/src/java/org/apache/nutch/fetcher/FetcherReducer.java +++ b/src/java/org/apache/nutch/fetcher/FetcherReducer.java @@ -775,7 +775,8 @@ public void run(Context context) } LOG.info("Fetcher: threads: " + threadCount); - feeder = new QueueFeeder(context, fetchQueues, threadCount * 50); + int maxFeedPerThread = conf.getInt("fetcher.queue.depth.multiplier", 50); + feeder = new QueueFeeder(context, fetchQueues, threadCount * maxFeedPerThread); feeder.start(); for (int i = 0; i < threadCount; i++) { // spawn threads