Skip to content

Commit

Permalink
NUTCH-1370 Expose exact number of urls injected @runtime
Browse files Browse the repository at this point in the history
git-svn-id: https://svn.apache.org/repos/asf/nutch/trunk@1412573 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information
Lewis John McGibbney committed Nov 22, 2012
1 parent fc785b3 commit eee3dec
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 2 deletions.
2 changes: 2 additions & 0 deletions CHANGES.txt
Expand Up @@ -2,6 +2,8 @@ Nutch Change Log

(trunk) Current Development:

+ * NUTCH-1370 Expose exact number of urls injected @runtime (snagel via lewismc)
+
* NUTCH-1117 JUnit test for index-anchor (lewismc)

* NUTCH-1451 Upgrade automaton jar to 1.11-8 (lewismc)
Expand Down
13 changes: 11 additions & 2 deletions src/java/org/apache/nutch/crawl/Injector.java
Expand Up @@ -134,7 +134,9 @@ else if (metaname.equals(nutchFixedFetchIntervalMDName)) {
if (LOG.isWarnEnabled()) { LOG.warn("Skipping " +url+":"+e); }
url = null;
}
- if (url != null) { // if it passes
+ if (url == null) {
+ reporter.getCounter("injector", "urls_filtered").increment(1);
+ } else { // if it passes
value.set(url); // collect it
CrawlDatum datum = new CrawlDatum();
datum.setStatus(CrawlDatum.STATUS_INJECTED);
Expand Down Expand Up @@ -166,6 +168,7 @@ else if (metaname.equals(nutchFixedFetchIntervalMDName)) {
+ ", using default (" + e.getMessage() + ")");
}
}
+ reporter.getCounter("injector", "urls_injected").increment(1);
output.collect(value, datum);
}
}
Expand Down Expand Up @@ -275,7 +278,13 @@ public void inject(Path crawlDb, Path urlDir) throws IOException {
sortJob.setOutputKeyClass(Text.class);
sortJob.setOutputValueClass(CrawlDatum.class);
sortJob.setLong("injector.current.time", System.currentTimeMillis());
- JobClient.runJob(sortJob);
+ RunningJob mapJob = JobClient.runJob(sortJob);
+
+ long urlsInjected = mapJob.getCounters().findCounter("injector", "urls_injected").getValue();
+ long urlsFiltered = mapJob.getCounters().findCounter("injector", "urls_filtered").getValue();
+ LOG.info("Injector: total number of urls rejected by filters: " + urlsFiltered);
+ LOG.info("Injector: total number of urls injected after normalization and filtering: "
+ + urlsInjected);

// merge with existing crawl db
if (LOG.isInfoEnabled()) {
Expand Down

0 comments on commit eee3dec

Please sign in to comment.