Permalink
Browse files

NUTCH-1118 JUnit test for index-basic

git-svn-id: https://svn.apache.org/repos/asf/nutch/trunk@1425494 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information...
1 parent 2b3fe9c commit 618a80c8bf38e4488e0096933efa1fb0af27c14f Lewis John McGibbney committed Dec 23, 2012
View
@@ -2,6 +2,8 @@ Nutch Change Log
(trunk): Current Development
+* NUTCH-1118 JUnit test for index-basic (Tejas Patil via lewismc)
+
* NUTCH-1331 limit crawler to defined depth (jnioche)
Release 1.6 - 23/11/2012
View
@@ -77,6 +77,7 @@
<target name="test">
<parallel threadCount="2">
<ant dir="creativecommons" target="test"/>
+ <ant dir="index-basic" target="test"/>
<ant dir="index-anchor" target="test"/>
<ant dir="index-more" target="test"/>
<ant dir="language-identifier" target="test"/>
@@ -38,7 +38,14 @@
import org.apache.hadoop.conf.Configuration;
-/** Adds basic searchable fields to a document. */
+/**
+ * Adds basic searchable fields to a document.
+ * The fields added are: domain, host, url, content, title, cache, tstamp.
+ * domain is included depending on {@code indexer.add.domain} in nutch-default.xml.
+ * title is truncated as per {@code indexer.max.title.length} in nutch-default.xml.
+ * (As per NUTCH-1004, a zero-length title is not added)
+ * content is truncated as per {@code indexer.max.content.length} in nutch-default.xml.
+ */
public class BasicIndexingFilter implements IndexingFilter {
public static final Logger LOG = LoggerFactory.getLogger(BasicIndexingFilter.class);
@@ -47,6 +54,19 @@
private boolean addDomain = false;
private Configuration conf;
+ /**
+ * The {@link BasicIndexingFilter} filter object, which supports a few
+ * configuration settings for adding basic searchable fields.
+ * See {@code indexer.add.domain}, {@code indexer.max.title.length},
+ * {@code indexer.max.content.length} in nutch-default.xml.
+ *
+ * @param doc The {@link NutchDocument} object
+ * @param parse The relevant {@link Parse} object passing through the filter
+ * @param url URL of the page being indexed
+ * @param datum The {@link CrawlDatum} entry
+ * @param inlinks The {@link Inlinks} containing anchor text
+ * @return filtered NutchDocument
+ */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
throws IndexingException {
@@ -108,13 +128,19 @@ public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum
return doc;
}
+ /**
+ * Set the {@link Configuration} object
+ */
public void setConf(Configuration conf) {
  // Retain the configuration first so getConf() can hand it back.
  this.conf = conf;

  // Each property is looked up once; the second argument is the fallback
  // passed to the lookup for when the property is not configured.
  MAX_TITLE_LENGTH = conf.getInt("indexer.max.title.length", 100);
  addDomain = conf.getBoolean("indexer.add.domain", false);
  // NOTE(review): -1 presumably means "do not truncate content" -- confirm
  // against the truncation logic in filter().
  MAX_CONTENT_LENGTH = conf.getInt("indexer.max.content.length", -1);
}
+ /**
+ * Get the {@link Configuration} object
+ */
public Configuration getConf() {
  // Returns whatever setConf() stored most recently.
  return conf;
}
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.basic;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.basic.BasicIndexingFilter;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.util.NutchConfiguration;
+
+import java.util.Date;
+import junit.framework.TestCase;
+
+/**
+ * JUnit test case which tests
+ * 1. that basic searchable fields are added to a document
+ * 2. that domain is added as per {@code indexer.add.domain} in nutch-default.xml.
+ * 3. that title is truncated as per {@code indexer.max.title.length} in nutch-default.xml.
+ * 4. that content is truncated as per {@code indexer.max.content.length} in nutch-default.xml.
+ *
+ * @author tejasp
+ *
+ */
+
+public class TestBasicIndexingFilter extends TestCase {
+
+  /**
+   * Pushes one synthetic page through {@link BasicIndexingFilter} and checks
+   * the fields asserted below: title, domain, host, url, content and tstamp.
+   * The title limit is set to 10 characters, the content limit to 20
+   * characters, and domain indexing is switched on.
+   *
+   * @throws Exception if the filter throws; the exception is left to
+   *         propagate so that JUnit reports it together with its stack trace
+   */
+  public void testBasicIndexingFilter() throws Exception {
+    Configuration conf = NutchConfiguration.create();
+    conf.setInt("indexer.max.title.length", 10);
+    conf.setBoolean("indexer.add.domain", true);
+    conf.setInt("indexer.max.content.length", 20);
+
+    BasicIndexingFilter filter = new BasicIndexingFilter();
+    filter.setConf(conf);
+
+    NutchDocument doc = new NutchDocument();
+
+    // Title (12 chars) and content (49 chars) are both longer than the limits
+    // configured above, so the assertions below exercise truncation.
+    String title = "The Foo Page";
+    Outlink[] outlinks = new Outlink[] { new Outlink("http://foo.com/", "Foo") };
+    Metadata metaData = new Metadata();
+    metaData.add("Language", "en/us");
+    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metaData);
+    ParseImpl parse = new ParseImpl("this is a sample foo bar page. hope you enjoy it.", parseData);
+
+    CrawlDatum crawlDatum = new CrawlDatum();
+    crawlDatum.setFetchTime(100L);
+
+    Inlinks inlinks = new Inlinks();
+
+    // No try/catch here: catching and calling fail(e.getMessage()) would drop
+    // the cause and produce a null message for exceptions without one.
+    filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), crawlDatum, inlinks);
+
+    assertEquals("test title, expect \"The Foo Pa\"", "The Foo Pa", doc.getField("title").getValues().get(0));
+    assertEquals("test domain, expect \"apache.org\"", "apache.org", doc.getField("domain").getValues().get(0));
+    assertEquals("test host, expect \"nutch.apache.org\"", "nutch.apache.org", doc.getField("host").getValues().get(0));
+    assertEquals("test url, expect \"http://nutch.apache.org/index.html\"", "http://nutch.apache.org/index.html",
+        doc.getField("url").getValues().get(0));
+    assertEquals("test content", "this is a sample foo", doc.getField("content").getValues().get(0));
+    assertEquals("test fetch time", new Date(100L), doc.getField("tstamp").getValues().get(0));
+  }
+}

0 comments on commit 618a80c

Please sign in to comment.