Skip to content

Commit

Permalink
NUTCH-2785 FreeGenerator: command-line option to define number of generated fetch lists
Browse files Browse the repository at this point in the history

- add command-line option `-numFetchers` to FreeGenerator
- in local mode: generate one single fetch list
  • Loading branch information
sebastian-nagel committed Apr 29, 2020
1 parent 73880df commit 72f3ff2
Showing 1 changed file with 21 additions and 5 deletions.
26 changes: 21 additions & 5 deletions src/java/org/apache/nutch/tools/FreeGenerator.java
Expand Up @@ -146,27 +146,33 @@ public void reduce(Text key, Iterable<Generator.SelectorEntry> values,
@Override
public int run(String[] args) throws Exception {
if (args.length < 2) {
System.err
.println("Usage: FreeGenerator <inputDir> <segmentsDir> [-filter] [-normalize]");
System.err.println(
"Usage: FreeGenerator <inputDir> <segmentsDir> [-filter] [-normalize] [-numFetchers <n>]");
System.err
.println("\tinputDir\tinput directory containing one or more input files.");
System.err
.println("\t\tEach text file contains a list of URLs, one URL per line");
.println("\t \tEach text file contains a list of URLs, one URL per line");
System.err
.println("\tsegmentsDir\toutput directory, where new segment will be created");
System.err.println("\t-filter\trun current URLFilters on input URLs");
System.err.println("\t-filter \trun current URLFilters on input URLs");
System.err
.println("\t-normalize\trun current URLNormalizers on input URLs");
System.err.println(
"\t-numFetchers <n>\tnumber of generated fetch lists, determines number of fetcher tasks");
return -1;
}
boolean filter = false;
boolean normalize = false;
int numFetchers = -1;
if (args.length > 2) {
for (int i = 2; i < args.length; i++) {
if (args[i].equals("-filter")) {
filter = true;
} else if (args[i].equals("-normalize")) {
normalize = true;
} else if ("-numFetchers".equals(args[i])) {
numFetchers = Integer.parseInt(args[i + 1]);
i++;
} else {
LOG.error("Unknown argument: " + args[i] + ", exiting ...");
return -1;
Expand All @@ -191,7 +197,17 @@ public int run(String[] args) throws Exception {
job.setPartitionerClass(URLPartitioner.class);
job.setReducerClass(FG.FGReducer.class);
String segName = Generator.generateSegmentName();
job.setNumReduceTasks(Integer.parseInt(conf.get("mapreduce.job.maps")));
if (numFetchers == -1) {
/* for politeness create exactly one partition per fetch task */
numFetchers = Integer.parseInt(conf.get("mapreduce.job.maps"));
}
if ("local".equals(conf.get("mapreduce.framework.name"))
&& numFetchers != 1) {
// override
LOG.info(
"FreeGenerator: running in local mode, generating exactly one partition.");
numFetchers = 1;
}
job.setOutputFormatClass(SequenceFileOutputFormat.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(CrawlDatum.class);
Expand Down

0 comments on commit 72f3ff2

Please sign in to comment.