From 76c8cff1402e217049942bac88a8a005d45abf43 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 14 Mar 2019 16:46:25 +0100 Subject: [PATCH] NUTCH-2700 Indexchecker: improve command-line help - add option `-doIndex` to pass "checked" document to index writers (the property `doIndex` is kept to ensure backward compatibility) --- .../nutch/indexer/IndexingFiltersChecker.java | 27 +++++++++++++++++-- .../org/apache/nutch/parse/ParserChecker.java | 2 +- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java b/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java index 08c85c3dda..fa62a00a53 100644 --- a/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java +++ b/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java @@ -59,6 +59,7 @@ public class IndexingFiltersChecker extends AbstractChecker { protected URLNormalizers normalizers = null; protected boolean dumpText = false; protected boolean followRedirects = false; + protected boolean doIndex = false; // used to simulate the metadata propagated from injection protected HashMap metadata = new HashMap<>(); @@ -68,7 +69,24 @@ public class IndexingFiltersChecker extends AbstractChecker { public int run(String[] args) throws Exception { String url = null; - usage = "Usage: IndexingFiltersChecker [-normalize] [-followRedirects] [-dumpText] [-md key=value] (-stdin | -listen [-keepClientCnxOpen])"; + String usage = "Usage:\n" // + + " IndexingFiltersChecker [OPTIONS] \n" // + + " Fetch single URL and index it\n" // + + " IndexingFiltersChecker [OPTIONS] -stdin\n" // + + " Read URLs to be indexed from stdin\n" // + + " IndexingFiltersChecker [OPTIONS] -listen [-keepClientCnxOpen]\n" // + + " Listen on for URLs to be indexed\n" // + + "Options:\n" // + + " -D=\tset/overwrite Nutch/Hadoop properties\n" // + + " \t(a generic Hadoop option to be passed\n" // + + " \t before other command-specific options)\n" + + " -normalize 
\tnormalize URLs\n" // + + " -followRedirects\tfollow redirects when fetching URL\n" // + + " -dumpText \tshow the entire plain-text content,\n" //" + + " \tnot only the first 100 characters\n" // + + " -doIndex \tpass document to configured index writers\n" // + + " \tand let them index it\n" // + + " -md =\tmetadata added to CrawlDatum before parsing\n"; // Print help when no args given if (args.length < 1) { @@ -76,6 +94,9 @@ public int run(String[] args) throws Exception { System.exit(-1); } + // read property "doIndex" for back-ward compatibility + doIndex = getConf().getBoolean("doIndex", false); + int numConsumed; for (int i = 0; i < args.length; i++) { if (args[i].equals("-normalize")) { @@ -84,6 +105,8 @@ public int run(String[] args) throws Exception { followRedirects = true; } else if (args[i].equals("-dumpText")) { dumpText = true; + } else if (args[i].equals("-doIndex")) { + doIndex = true; } else if (args[i].equals("-md")) { String k = null, v = null; String nextOne = args[++i]; @@ -268,7 +291,7 @@ protected int process(String url, StringBuilder output) throws Exception { output.append("\n"); // For readability if keepClientCnxOpen - if (getConf().getBoolean("doIndex", false)) { + if (doIndex) { IndexWriters writers = IndexWriters.get(getConf()); writers.open(getConf(), "IndexingFilterChecker"); writers.write(doc); diff --git a/src/java/org/apache/nutch/parse/ParserChecker.java b/src/java/org/apache/nutch/parse/ParserChecker.java index 454068bb8f..8419fa3789 100644 --- a/src/java/org/apache/nutch/parse/ParserChecker.java +++ b/src/java/org/apache/nutch/parse/ParserChecker.java @@ -91,7 +91,7 @@ public int run(String[] args) throws Exception { + "Options:\n" // + " -D=\tset/overwrite Nutch/Hadoop properties\n" // + " \t(a generic Hadoop option to be passed\n" // - + " \t before other command-specific options)" + + " \t before other command-specific options)\n" + " -normalize \tnormalize URLs\n" // + " -followRedirects\tfollow redirects when fetching 
URL\n" // + " -dumpText \talso show the plain-text extracted by parsers\n" //