Permalink
Browse files

NUTCH-1370 Expose exact number of urls injected @runtime

git-svn-id: https://svn.apache.org/repos/asf/nutch/trunk@1412573 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information...
1 parent fc785b3 commit eee3decbac5544d89bee0ad4d739ed5a233c96c2 Lewis John McGibbney committed Nov 22, 2012
Showing with 13 additions and 2 deletions.
  1. +2 −0 CHANGES.txt
  2. +11 −2 src/java/org/apache/nutch/crawl/Injector.java
View
@@ -2,6 +2,8 @@ Nutch Change Log
(trunk) Current Development:
+* NUTCH-1370 Expose exact number of urls injected @runtime (snagel via lewismc)
+
* NUTCH-1117 JUnit test for index-anchor (lewismc)
* NUTCH-1451 Upgrade automaton jar to 1.11-8 (lewismc)
@@ -134,7 +134,9 @@ else if (metaname.equals(nutchFixedFetchIntervalMDName)) {
if (LOG.isWarnEnabled()) { LOG.warn("Skipping " +url+":"+e); }
url = null;
}
- if (url != null) { // if it passes
+ if (url == null) {
+ reporter.getCounter("injector", "urls_filtered").increment(1);
+ } else { // if it passes
value.set(url); // collect it
CrawlDatum datum = new CrawlDatum();
datum.setStatus(CrawlDatum.STATUS_INJECTED);
@@ -166,6 +168,7 @@ else if (metaname.equals(nutchFixedFetchIntervalMDName)) {
+ ", using default (" + e.getMessage() + ")");
}
}
+ reporter.getCounter("injector", "urls_injected").increment(1);
output.collect(value, datum);
}
}
@@ -275,7 +278,13 @@ public void inject(Path crawlDb, Path urlDir) throws IOException {
sortJob.setOutputKeyClass(Text.class);
sortJob.setOutputValueClass(CrawlDatum.class);
sortJob.setLong("injector.current.time", System.currentTimeMillis());
- JobClient.runJob(sortJob);
+ RunningJob mapJob = JobClient.runJob(sortJob);
+
+ long urlsInjected = mapJob.getCounters().findCounter("injector", "urls_injected").getValue();
+ long urlsFiltered = mapJob.getCounters().findCounter("injector", "urls_filtered").getValue();
+ LOG.info("Injector: total number of urls rejected by filters: " + urlsFiltered);
+ LOG.info("Injector: total number of urls injected after normalization and filtering: "
+ + urlsInjected);
// merge with existing crawl db
if (LOG.isInfoEnabled()) {

0 comments on commit eee3dec

Please sign in to comment.