Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 25 additions & 2 deletions src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ public class IndexingFiltersChecker extends AbstractChecker {
protected URLNormalizers normalizers = null;
protected boolean dumpText = false;
protected boolean followRedirects = false;
protected boolean doIndex = false;
// used to simulate the metadata propagated from injection
protected HashMap<String, String> metadata = new HashMap<>();

Expand All @@ -68,14 +69,34 @@ public class IndexingFiltersChecker extends AbstractChecker {
public int run(String[] args) throws Exception {
String url = null;

usage = "Usage: IndexingFiltersChecker [-normalize] [-followRedirects] [-dumpText] [-md key=value] (-stdin | -listen <port> [-keepClientCnxOpen])";
String usage = "Usage:\n" //
+ " IndexingFiltersChecker [OPTIONS] <url>\n" //
+ " Fetch single URL and index it\n" //
+ " IndexingFiltersChecker [OPTIONS] -stdin\n" //
+ " Read URLs to be indexed from stdin\n" //
+ " IndexingFiltersChecker [OPTIONS] -listen <port> [-keepClientCnxOpen]\n" //
+ " Listen on <port> for URLs to be indexed\n" //
+ "Options:\n" //
+ " -D<property>=<value>\tset/overwrite Nutch/Hadoop properties\n" //
+ " \t(a generic Hadoop option to be passed\n" //
+ " \t before other command-specific options)\n"
+ " -normalize \tnormalize URLs\n" //
+ " -followRedirects\tfollow redirects when fetching URL\n" //
+ " -dumpText \tshow the entire plain-text content,\n" //"
+ " \tnot only the first 100 characters\n" //
+ " -doIndex \tpass document to configured index writers\n" //
+ " \tand let them index it\n" //
+ " -md <key>=<value>\tmetadata added to CrawlDatum before parsing\n";

// Print help when no args given
if (args.length < 1) {
System.err.println(usage);
System.exit(-1);
}

// read property "doIndex" for backward compatibility
doIndex = getConf().getBoolean("doIndex", false);

int numConsumed;
for (int i = 0; i < args.length; i++) {
if (args[i].equals("-normalize")) {
Expand All @@ -84,6 +105,8 @@ public int run(String[] args) throws Exception {
followRedirects = true;
} else if (args[i].equals("-dumpText")) {
dumpText = true;
} else if (args[i].equals("-doIndex")) {
doIndex = true;
} else if (args[i].equals("-md")) {
String k = null, v = null;
String nextOne = args[++i];
Expand Down Expand Up @@ -268,7 +291,7 @@ protected int process(String url, StringBuilder output) throws Exception {

output.append("\n"); // For readability if keepClientCnxOpen

if (getConf().getBoolean("doIndex", false)) {
if (doIndex) {
IndexWriters writers = IndexWriters.get(getConf());
writers.open(getConf(), "IndexingFilterChecker");
writers.write(doc);
Expand Down
2 changes: 1 addition & 1 deletion src/java/org/apache/nutch/parse/ParserChecker.java
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ public int run(String[] args) throws Exception {
+ "Options:\n" //
+ " -D<property>=<value>\tset/overwrite Nutch/Hadoop properties\n" //
+ " \t(a generic Hadoop option to be passed\n" //
+ " \t before other command-specific options)"
+ " \t before other command-specific options)\n"
+ " -normalize \tnormalize URLs\n" //
+ " -followRedirects\tfollow redirects when fetching URL\n" //
+ " -dumpText \talso show the plain-text extracted by parsers\n" //
Expand Down