Skip to content

Commit

Permalink
NUTCH-1757 ParserChecker to take custom metadata as input
Browse files (browse the repository at this point in the history)
git-svn-id: https://svn.apache.org/repos/asf/nutch/trunk@1596662 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information
jnioche committed May 21, 2014
1 parent f4cfd6a commit 7089e15
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 2 deletions.
2 changes: 2 additions & 0 deletions CHANGES.txt
Expand Up @@ -2,6 +2,8 @@ Nutch Change Log

Nutch Current Development

* NUTCH-1757 ParserChecker to take custom metadata as input (jnioche)

* NUTCH-1676 Add rudimentary SSL support to protocol-http (jnioche, markus)

* NUTCH-1772 Injector does not need merging if no pre-existing crawldb (jnioche)
Expand Down
52 changes: 50 additions & 2 deletions src/java/org/apache/nutch/parse/ParserChecker.java
Expand Up @@ -17,6 +17,9 @@

package org.apache.nutch.parse;

import java.util.HashMap;
import java.util.Iterator;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
Expand All @@ -29,6 +32,7 @@
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.protocol.ProtocolOutput;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.URLUtil;
import org.apache.nutch.util.StringUtil;
Expand Down Expand Up @@ -73,19 +77,32 @@ public int run(String[] args) throws Exception {
String contentType = null;
String url = null;

- String usage = "Usage: ParserChecker [-dumpText] [-forceAs mimeType] url";
+ String usage = "Usage: ParserChecker [-dumpText] [-forceAs mimeType] [-md key=value] url";

if (args.length == 0) {
LOG.error(usage);
return (-1);
}

// used to simulate the metadata propagated from injection
HashMap<String, String> metadata = new HashMap<String, String>();

for (int i = 0; i < args.length; i++) {
if (args[i].equals("-forceAs")) {
force = true;
contentType = args[++i];
} else if (args[i].equals("-dumpText")) {
dumpText = true;
} else if (args[i].equals("-md")) {
String k = null, v = null;
String nextOne = args[++i];
int firstEquals = nextOne.indexOf("=");
if (firstEquals != -1) {
k = nextOne.substring(0, firstEquals);
v = nextOne.substring(firstEquals + 1);
} else
k = nextOne;
metadata.put(k, v);
} else if (i != args.length - 1) {
LOG.error(usage);
System.exit(-1);
Expand All @@ -98,9 +115,21 @@ public int run(String[] args) throws Exception {
LOG.info("fetching: " + url);
}

CrawlDatum cd = new CrawlDatum();

Iterator<String> iter = metadata.keySet().iterator();
while (iter.hasNext()) {
String key = iter.next();
String value = metadata.get(key);
if (value == null)
value = "";
cd.getMetaData().put(new Text(key), new Text(value));
}

ProtocolFactory factory = new ProtocolFactory(conf);
Protocol protocol = factory.getProtocol(url);
- ProtocolOutput output = protocol.getProtocolOutput(new Text(url), new CrawlDatum());
+ Text turl = new Text(url);
+ ProtocolOutput output = protocol.getProtocolOutput(turl, cd);

if (!output.getStatus().isSuccess()) {
System.err.println("Fetch failed with protocol status: " + output.getStatus());
Expand Down Expand Up @@ -129,6 +158,16 @@ public int run(String[] args) throws Exception {
LOG.warn("Content is truncated, parse may fail!");
}

ScoringFilters scfilters = new ScoringFilters(conf);
// call the scoring filters
try {
scfilters.passScoreBeforeParsing(turl, cd, content);
} catch (Exception e) {
if (LOG.isWarnEnabled()) {
LOG.warn("Couldn't pass score, url " + turl.toString() + " (" + e + ")");
}
}

ParseResult parseResult = new ParseUtil(conf).parse(content);

if (parseResult == null) {
Expand All @@ -145,6 +184,15 @@ public int run(String[] args) throws Exception {
LOG.info("signature: " + StringUtil.toHexString(signature));
}

// call the scoring filters
try {
scfilters.passScoreAfterParsing(turl, content, parseResult.get(turl));
} catch (Exception e) {
if (LOG.isWarnEnabled()) {
LOG.warn("Couldn't pass score, url " + turl + " (" + e + ")");
}
}

for (java.util.Map.Entry<Text, Parse> entry : parseResult) {
Parse parse = entry.getValue();
LOG.info("---------\nUrl\n---------------\n");
Expand Down

0 comments on commit 7089e15

Please sign in to comment.