Permalink
Browse files

NUTCH-1441 AnchorIndexingFilter should use plain HashSet

git-svn-id: https://svn.apache.org/repos/asf/nutch/trunk@1387341 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information...
1 parent d2831a8 commit a56783048cc08196d27c8deff267fb797ee1ff3d Lewis John McGibbney committed Sep 18, 2012
View
@@ -2,6 +2,8 @@ Nutch Change Log
(trunk) Current Development:
+* NUTCH-1441 AnchorIndexingFilter should use plain HashSet (ferdy via lewismc)
+
* NUTCH-1470 Ensure test files are included for runtime testing (lewismc)
* NUTCH-1434 Indexer to delete robots noindex (markus)
@@ -16,10 +16,8 @@
*/
package org.apache.nutch.indexer.anchor;
-import java.util.WeakHashMap;
+import java.util.HashSet;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
@@ -28,6 +26,8 @@
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.parse.Parse;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
/**
* Indexing filter that indexes all inbound anchor text for a document.
@@ -56,19 +56,19 @@ public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum
String[] anchors = (inlinks != null ? inlinks.getAnchors()
: new String[0]);
- // https://issues.apache.org/jira/browse/NUTCH-1037
- WeakHashMap<String,Integer> map = new WeakHashMap<String,Integer>();
+ HashSet<String> set = null;
for (int i = 0; i < anchors.length; i++) {
if (deduplicate) {
+ if (set == null) set = new HashSet<String>();
String lcAnchor = anchors[i].toLowerCase();
// Check if already processed the current anchor
- if (!map.containsKey(lcAnchor)) {
+ if (!set.contains(lcAnchor)) {
doc.add("anchor", anchors[i]);
// Add to set
- map.put(lcAnchor, 1);
+ set.add(lcAnchor);
}
} else {
doc.add("anchor", anchors[i]);

0 comments on commit a567830

Please sign in to comment.