Permalink
Browse files

Nutch 1.2 release.

git-svn-id: https://svn.apache.org/repos/asf/nutch/tags/release-1.2@1001087 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information...
2 parents 550a5ee + afc3e91 commit 5c39b8d4106202f06f7b5782452493f77314ddb8 @chrismattmann chrismattmann committed Sep 24, 2010
View
@@ -1,6 +1,14 @@
Nutch Change Log
-Release 1.2 - 08/07/2010
+Release 1.2 - 09/18/2010
+
+* NUTCH-901 Make index-more plug-in configurable (Markus Jelsma via mattmann)
+
+* NUTCH-908 Infinite Loop and Null Pointer Bugs in Searching (kubes via mattmann)
+
+* NUTCH-906 Nutch OpenSearch sometimes raises DOMExceptions (Asheesh Laroia via ab)
+
+* NUTCH-862 HttpClient null pointer exception (Sebastian Nagel via ab)
* NUTCH-905 Configurable file protocol parent directory crawling (Thorsten Scherler, mattmann, ab)
View
@@ -764,6 +764,19 @@
</description>
</property>
+
+<!-- moreindexingfilter plugin properties -->
+
+<property>
+ <name>moreIndexingFilter.indexMimeTypeParts</name>
+ <value>true</value>
+ <description>Determines whether the index-more plugin splits the mime-type
+ into sub parts; splitting requires the type field to be multi-valued. Set to
+ true for backward compatibility. If false, the mime-type is not split.
+ </description>
+</property>
+
+
<!-- indexingfilter plugin properties -->
<property>
@@ -181,7 +181,10 @@ public Summary getSummary(HitDetails details, Query query)
detailsList[i] = new ArrayList<HitDetails>();
}
for (HitDetails details : detailsArr) {
- detailsList[segmentMap.get(details.getValue("segment"))].add(details);
+ String segment = details.getValue("segment");
+ if (segmentMap.containsKey(segment)) {
+ detailsList[segmentMap.get(segment)].add(details);
+ }
}
for (int i = 0; i < summaryTasks.size(); i++) {
DistSummmaryTask task = (DistSummmaryTask)summaryTasks.get(i);
@@ -56,6 +56,9 @@
/** BooleanQuery won't permit more than 32 required/prohibited clauses. We
* don't want to use too many of those. */
private static final int MAX_PROHIBITED_TERMS = 20;
+
+ // upper bound on query re-optimization passes, so the search loop cannot run forever
+ private static final int MAX_OPTIMIZE_LOOPS = 3;
private final Configuration conf;
@@ -186,9 +189,15 @@ public Hits search(Query query) throws IOException {
final Set<Hit> seen = new HashSet<Hit>();
final List<String> excludedValues = new ArrayList<String>();
boolean totalIsExact = true;
- for (int rawHitNum = 0; rawHitNum < hits.getTotal(); rawHitNum++) {
+ int optimizeNum = 0;
+
+ for (int rawHitNum = 0; rawHitNum < hits.getLength(); rawHitNum++) {
// get the next raw hit
- if (rawHitNum >= hits.getLength()) {
+ if (rawHitNum == (hits.getLength() - 1) && (optimizeNum < MAX_OPTIMIZE_LOOPS)) {
+
+ // count this optimization pass against MAX_OPTIMIZE_LOOPS
+ optimizeNum++;
+
// optimize query by prohibiting more matches on some excluded values
final Query optQuery = (Query)query.clone();
for (int i = 0; i < excludedValues.size(); i++) {
@@ -271,21 +271,21 @@ public void doGet(HttpServletRequest request, HttpServletResponse response)
}
private static Element addNode(Document doc, Node parent, String name) {
- Element child = doc.createElement(name);
+ Element child = doc.createElement(getLegalTagName(name));
parent.appendChild(child);
return child;
}
private static void addNode(Document doc, Node parent,
String name, String text) {
- Element child = doc.createElement(name);
+ Element child = doc.createElement(getLegalTagName(name));
child.appendChild(doc.createTextNode(getLegalXml(text)));
parent.appendChild(child);
}
private static void addNode(Document doc, Node parent,
String ns, String name, String text) {
- Element child = doc.createElementNS(NS_MAP.get(ns), ns+":"+name);
+ Element child = doc.createElementNS(NS_MAP.get(ns), ns+":"+getLegalTagName(name));
child.appendChild(doc.createTextNode(getLegalXml(text)));
parent.appendChild(child);
}
@@ -332,4 +332,12 @@ private static boolean isLegalXml(final char c) {
|| (c >= 0xe000 && c <= 0xfffd) || (c >= 0x10000 && c <= 0x10ffff);
}
+ /**
+  * Makes the first character of a field name legal for use as an XML tag name.
+  *
+  * An XML name may contain digits, '-' and '.', but may not start with any of
+  * them. A name that starts with one of these characters is prefixed with
+  * "nutch_"; every other name is returned unchanged. Characters after the
+  * first one are not inspected.
+  *
+  * @param string the candidate tag name; may be null or empty
+  * @return the name, prefixed with "nutch_" when its first character cannot
+  *         start an XML name; null or the empty string is returned as-is
+  */
+ static String getLegalTagName(String string) {
+   // nothing to repair, and charAt(0) would throw on an empty name
+   if (string == null || string.length() == 0) {
+     return string;
+   }
+   char firstChar = string.charAt(0);
+   // digits, '-' and '.' are valid inside an XML name but not as its first character
+   boolean illegalStart = (firstChar >= '0' && firstChar <= '9')
+       || firstChar == '-'
+       || firstChar == '.';
+   if (illegalStart) {
+     string = "nutch_" + string;
+   }
+   return string;
+ }
}
@@ -225,10 +225,13 @@ private NutchDocument addType(NutchDocument doc, ParseData data, String url) {
doc.add("type", contentType);
- String[] parts = getParts(contentType);
+ // Check if we need to split the content type in sub parts
+ if (conf.getBoolean("moreIndexingFilter.indexMimeTypeParts", true)) {
+ String[] parts = getParts(contentType);
- for(String part: parts) {
- doc.add("type", part);
+ for(String part: parts) {
+ doc.add("type", part);
+ }
}
// leave this for future improvement
@@ -46,6 +46,31 @@ public void testGetParts() {
}
+ /**
+ * @since NUTCH-901
+ */
+ public void testNoParts(){
+ Configuration conf = NutchConfiguration.create();
+ conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);
+ MoreIndexingFilter filter = new MoreIndexingFilter();
+ filter.setConf(conf);
+ assertNotNull(filter);
+ NutchDocument doc = new NutchDocument();
+ ParseImpl parse = new ParseImpl("foo bar", new ParseData());
+
+ try{
+ filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), new CrawlDatum(), new Inlinks());
+ }
+ catch(Exception e){
+ e.printStackTrace();
+ fail(e.getMessage());
+ }
+ assertNotNull(doc);
+ assertTrue(doc.getFieldNames().contains("type"));
+ assertEquals(1, doc.getField("type").getValues().size());
+ assertEquals("text/html", doc.getFieldValue("type"));
+ }
+
private void assertParts(String[] parts, int count, String... expected) {
assertEquals(count, parts.length);
for (int i = 0; i < expected.length; i++) {
@@ -134,7 +134,9 @@
if (code == 200) throw new IOException(e.toString());
// for codes other than 200 OK, we are fine with empty content
} finally {
- in.close();
+ if (in != null) {
+ in.close();
+ }
get.abort();
}
@@ -30,4 +30,24 @@ public void testGetLegalXml(){
assertEquals("hello",OpenSearchServlet.getLegalXml("\u0000he\u0000llo\u0000"));
}
+ /**
+ * Test turning Lucene column names into valid XML names.
+ *
+ * The Nutch FAQ explains that OpenSearch includes "all fields that are available
+ * at search result time." However, some Lucene column names can start
+ * with numbers. Valid XML tags cannot. If Nutch is generating OpenSearch results
+ * for a document with a Lucene document column whose name starts with numbers,
+ * the underlying Xerces library throws this exception:
+ *
+ *
+ * org.w3c.dom.DOMException: INVALID_CHARACTER_ERR: An invalid or illegal XML character is specified.
+ *
+ * Therefore, we test here that Nutch can turn strings into valid XML tags.
+ */
+ public void testGetLegalTagName(){
+ assertEquals("nutch_000_numbers_first", OpenSearchServlet.getLegalTagName("000_numbers_first"));
+ assertEquals("letters_first_000", OpenSearchServlet.getLegalTagName("letters_first_000"));
+ }
+
+
}

0 comments on commit 5c39b8d

Please sign in to comment.