Permalink
Browse files

NUTCH-1491 Strip UTF-8 non-character codepoints in title

git-svn-id: https://svn.apache.org/repos/asf/nutch/trunk@1406076 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information...
1 parent 4c4e208 commit f146015f6390df37bf2f1b51dc4bc2dc6c9d2d1b Markus Jelsma committed Nov 6, 2012
Showing with 8 additions and 4 deletions.
  1. +4 −0 CHANGES.txt
  2. +4 −4 src/java/org/apache/nutch/indexer/solr/SolrWriter.java
View
@@ -2,6 +2,10 @@ Nutch Change Log
(trunk) Current Development:
+* NUTCH-1491 Strip UTF-8 non-character codepoints in title (Nathan Gass via markus)
+
+* NUTCH-1480 SolrIndexer to write to multiple servers (markus)
+
* NUTCH-1421 RegexURLNormalizer to only skip rules with invalid patterns (snagel)
* NUTCH-1341 NotModified time set to now but page not modified (markus)
@@ -37,7 +37,7 @@
import org.apache.solr.common.util.DateUtil;
public class SolrWriter implements NutchIndexWriter {
-
+
public static final Logger LOG = LoggerFactory.getLogger(SolrWriter.class);
private SolrServer solr;
@@ -55,7 +55,7 @@ public void open(JobConf job, String name) throws IOException {
SolrServer server = SolrUtils.getCommonsHttpSolrServer(job);
init(server, job);
}
-
+
// package protected for tests
void init(SolrServer server, JobConf job) throws IOException {
solr = server;
@@ -99,14 +99,14 @@ public void write(NutchDocument doc) throws IOException {
val2 = DateUtil.getThreadLocalDateFormat().format(val);
}
- if (e.getKey().equals("content")) {
+ if (e.getKey().equals("content") || e.getKey().equals("title")) {
val2 = SolrUtils.stripNonCharCodepoints((String)val);
}
inputDoc.addField(solrMapping.mapKey(e.getKey()), val2, e.getValue().getWeight());
String sCopy = solrMapping.mapCopyKey(e.getKey());
if (sCopy != e.getKey()) {
- inputDoc.addField(sCopy, val);
+ inputDoc.addField(sCopy, val);
}
}
}

0 comments on commit f146015

Please sign in to comment.