From 47e130720a0c1d8d9fa2618b75e3d38c1dbb8fa4 Mon Sep 17 00:00:00 2001 From: Michael Joyce Date: Fri, 12 Feb 2016 15:15:34 -0800 Subject: [PATCH] NUTCH-2218 - Update CrawlCompletionStats util with Commons CLI arg parsing - Switch all argument parsing and checking to Commons CLI. - Update input directory processing so that the 'crawldb' folder is no longer included in the input paths. The user now simply points to the crawl folder instead of specific crawldb directories, and 'crawldb/current' is appended to each input path automatically. --- .../nutch/util/CrawlCompletionStats.java | 77 ++++++++++++++----- 1 file changed, 59 insertions(+), 18 deletions(-) diff --git a/src/java/org/apache/nutch/util/CrawlCompletionStats.java b/src/java/org/apache/nutch/util/CrawlCompletionStats.java index 38fa598fcd..f1b3aacd04 100644 --- a/src/java/org/apache/nutch/util/CrawlCompletionStats.java +++ b/src/java/org/apache/nutch/util/CrawlCompletionStats.java @@ -23,6 +23,14 @@ import java.text.SimpleDateFormat; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.CommandLineParser; +import org.apache.commons.cli.GnuParser; +import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.OptionBuilder; +import org.apache.commons.cli.Options; +import org.apache.commons.cli.MissingOptionException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.Path; @@ -61,27 +69,60 @@ public class CrawlCompletionStats extends Configured implements Tool { private int mode = 0; public int run(String[] args) throws Exception { - if (args.length < 3) { - System.err.println("Usage: CrawlCompletionStats inputDirs outDir mode [numOfReducer]"); + Option helpOpt = new Option("h", "help", false, "Show this message"); + Option inDirs = OptionBuilder + .withArgName("inputDirs") + .isRequired() + .withDescription("Comma separated list of crawl directories") + .hasArgs() +
.create("inputDirs"); + Option outDir = OptionBuilder + .withArgName("outputDir") + .isRequired() + .withDescription("Output directory where results should be dumped") + .hasArgs() + .create("outputDir"); + Option modeOpt = OptionBuilder + .withArgName("mode") + .isRequired() + .withDescription("Set statistics gathering mode (by 'host' or by 'domain')") + .hasArgs() + .create("mode"); + Option numReducers = OptionBuilder + .withArgName("numReducers") + .withDescription("Optional number of reduce jobs to use. Defaults to 1") + .hasArgs() + .create("numReducers"); + + Options options = new Options(); + options.addOption(helpOpt); + options.addOption(inDirs); + options.addOption(outDir); + options.addOption(modeOpt); + options.addOption(numReducers); + + CommandLineParser parser = new GnuParser(); + CommandLine cli; - System.err.println("\tinputDirs\tComma separated list of crawldb input directories"); - System.err.println("\t\t\tE.g.: crawl/crawldb/"); - - System.err.println("\toutDir\t\tOutput directory where results should be dumped"); - - System.err.println("\tmode\t\tSet statistics gathering mode"); - System.err.println("\t\t\t\thost\tGather statistics by host"); - System.err.println("\t\t\t\tdomain\tGather statistics by domain"); + try { + cli = parser.parse(options, args); + } catch (MissingOptionException e) { + HelpFormatter formatter = new HelpFormatter(); + formatter.printHelp("CrawlCompletionStats", options, true); + return 1; + } - System.err.println("\t[numOfReducers]\tOptional number of reduce jobs to use. 
Defaults to 1."); + if (cli.hasOption("help")) { + HelpFormatter formatter = new HelpFormatter(); + formatter.printHelp("CrawlCompletionStats", options, true); return 1; } - String inputDir = args[0]; - String outputDir = args[1]; - int numOfReducers = 1; + String inputDir = cli.getOptionValue("inputDirs"); + String outputDir = cli.getOptionValue("outputDir"); - if (args.length > 3) { + int numOfReducers = 1; + if (cli.hasOption("numReducers")) { numOfReducers = Integer.parseInt(args[3]); } @@ -91,10 +132,10 @@ public int run(String[] args) throws Exception { int mode = 0; String jobName = "CrawlCompletionStats"; - if (args[2].equals("host")) { + if (cli.getOptionValue("mode").equals("host")) { jobName = "Host CrawlCompletionStats"; mode = MODE_HOST; - } else if (args[2].equals("domain")) { + } else if (cli.getOptionValue("mode").equals("domain")) { jobName = "Domain CrawlCompletionStats"; mode = MODE_DOMAIN; } @@ -108,7 +149,7 @@ public int run(String[] args) throws Exception { String[] inputDirsSpecs = inputDir.split(","); for (int i = 0; i < inputDirsSpecs.length; i++) { - File completeInputPath = new File(new File(inputDirsSpecs[i]), "current"); + File completeInputPath = new File(new File(inputDirsSpecs[i]), "crawldb/current"); FileInputFormat.addInputPath(job, new Path(completeInputPath.toString())); }