NUTCH-2645 Webgraph tools ignore command-line options
- must set the values of command-line options in the job configuration
  to pass them to the job's tasks
- use a separate job configuration for each web graph job/step
- make the NodeDumper job/tool log to stdout
sebastian-nagel committed Sep 13, 2018
1 parent af37024 commit 497db00
Showing 3 changed files with 18 additions and 15 deletions.
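
Why the options were lost: in Hadoop MapReduce, Job.getInstance(conf) makes a copy of the passed Configuration, so any property set on the original conf after the job has been created never reaches the map and reduce tasks. The commit therefore sets option values on each job's own configuration (job.getConfiguration()) and gives every web graph step its own job configuration. Below is a minimal, self-contained sketch of this Hadoop behavior, not Nutch code; the class name and the property key "my.custom.option" are made up for illustration.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class JobConfExample {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();

    // Broken pattern: Job.getInstance() copies conf, so later changes
    // to conf are invisible to the submitted job and its tasks.
    Job broken = Job.getInstance(conf);
    conf.setBoolean("my.custom.option", true);  // never reaches the tasks
    System.out.println(
        broken.getConfiguration().getBoolean("my.custom.option", false));  // false

    // Fixed pattern (as applied in this commit): mutate the job's own configuration.
    Job fixed = Job.getInstance(conf);
    fixed.getConfiguration().setBoolean("my.custom.option", true);  // visible to the tasks
    System.out.println(
        fixed.getConfiguration().getBoolean("my.custom.option", false));  // true
  }
}

With this fix, webgraph command-line options such as -filter and -normalize (the flags behind the filter and normalize variables seen in the diff below) actually take effect in the submitted jobs.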
conf/log4j.properties (1 change: 1 addition, 0 deletions)
@@ -56,6 +56,7 @@ log4j.logger.org.apache.nutch.scoring.webgraph.LinkRank=INFO,cmdstdout
 log4j.logger.org.apache.nutch.scoring.webgraph.Loops=INFO,cmdstdout
 log4j.logger.org.apache.nutch.scoring.webgraph.ScoreUpdater=INFO,cmdstdout
 log4j.logger.org.apache.nutch.scoring.webgraph.WebGraph=INFO,cmdstdout
+log4j.logger.org.apache.nutch.scoring.webgraph.NodeDumper=INFO,cmdstdout
 log4j.logger.org.apache.nutch.segment.SegmentChecker=INFO,cmdstdout
 log4j.logger.org.apache.nutch.segment.SegmentMerger=INFO,cmdstdout
 log4j.logger.org.apache.nutch.segment.SegmentReader=INFO,cmdstdout
src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java (4 changes: 2 additions, 2 deletions)
@@ -285,9 +285,9 @@ public void dumpNodes(Path webGraphDb, DumpType type, long topN, Path output,
 long start = System.currentTimeMillis();
 LOG.info("NodeDumper: starting at " + sdf.format(start));
 Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
-Configuration conf = getConf();
 
-Job dumper = NutchJob.getInstance(conf);
+Job dumper = NutchJob.getInstance(getConf());
+Configuration conf = dumper.getConfiguration();
 dumper.setJobName("NodeDumper: " + webGraphDb);
 FileInputFormat.addInputPath(dumper, nodeDb);
 dumper.setInputFormatClass(SequenceFileInputFormat.class);
src/java/org/apache/nutch/scoring/webgraph/WebGraph.java (28 changes: 15 additions, 13 deletions)
@@ -527,8 +527,7 @@ public void createWebGraph(Path webGraphDb, Path[] segments,
 LOG.info("WebGraphDb: URL filter: " + filter);
 }
 
-Configuration conf = getConf();
-FileSystem fs = webGraphDb.getFileSystem(conf);
+FileSystem fs = webGraphDb.getFileSystem(getConf());
 
 // lock an existing webgraphdb to prevent multiple simultaneous updates
 Path lock = new Path(webGraphDb, LOCK_NAME);
@@ -548,11 +547,12 @@ public void createWebGraph(Path webGraphDb, Path[] segments,
 
 Path tempOutlinkDb = new Path(outlinkDb + "-"
 + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
-Job outlinkJob = NutchJob.getInstance(conf);
+Job outlinkJob = NutchJob.getInstance(getConf());
+Configuration outlinkJobConf = outlinkJob.getConfiguration();
 outlinkJob.setJobName("Outlinkdb: " + outlinkDb);
 
-boolean deleteGone = conf.getBoolean("link.delete.gone", false);
-boolean preserveBackup = conf.getBoolean("db.preserve.backup", true);
+boolean deleteGone = outlinkJobConf.getBoolean("link.delete.gone", false);
+boolean preserveBackup = outlinkJobConf.getBoolean("db.preserve.backup", true);
 
 if (deleteGone) {
 LOG.info("OutlinkDb: deleting gone links");
@@ -561,7 +561,7 @@ public void createWebGraph(Path webGraphDb, Path[] segments,
 // get the parse data and crawl fetch data for all segments
 if (segments != null) {
 for (int i = 0; i < segments.length; i++) {
-FileSystem sfs = segments[i].getFileSystem(conf);
+FileSystem sfs = segments[i].getFileSystem(outlinkJobConf);
 Path parseData = new Path(segments[i], ParseData.DIR_NAME);
 if (sfs.exists(parseData)) {
 LOG.info("OutlinkDb: adding input: " + parseData);
@@ -582,8 +582,8 @@ public void createWebGraph(Path webGraphDb, Path[] segments,
 LOG.info("OutlinkDb: adding input: " + outlinkDb);
 FileInputFormat.addInputPath(outlinkJob, outlinkDb);
 
-conf.setBoolean(OutlinkDb.URL_NORMALIZING, normalize);
-conf.setBoolean(OutlinkDb.URL_FILTERING, filter);
+outlinkJobConf.setBoolean(OutlinkDb.URL_NORMALIZING, normalize);
+outlinkJobConf.setBoolean(OutlinkDb.URL_FILTERING, filter);
 
 outlinkJob.setInputFormatClass(SequenceFileInputFormat.class);
 outlinkJob.setJarByClass(OutlinkDb.class);
@@ -595,7 +595,7 @@ public void createWebGraph(Path webGraphDb, Path[] segments,
 outlinkJob.setOutputValueClass(LinkDatum.class);
 FileOutputFormat.setOutputPath(outlinkJob, tempOutlinkDb);
 outlinkJob.setOutputFormatClass(MapFileOutputFormat.class);
-conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs",
+outlinkJobConf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs",
 false);
 
 // run the outlinkdb job and replace any old outlinkdb with the new one
@@ -628,7 +628,8 @@ public void createWebGraph(Path webGraphDb, Path[] segments,
 Path tempInlinkDb = new Path(inlinkDb + "-"
 + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
 
-Job inlinkJob = NutchJob.getInstance(conf);
+Job inlinkJob = NutchJob.getInstance(getConf());
+Configuration inlinkJobConf = inlinkJob.getConfiguration();
 inlinkJob.setJobName("Inlinkdb " + inlinkDb);
 LOG.info("InlinkDb: adding input: " + outlinkDb);
 FileInputFormat.addInputPath(inlinkJob, outlinkDb);
@@ -641,7 +642,7 @@ public void createWebGraph(Path webGraphDb, Path[] segments,
 inlinkJob.setOutputValueClass(LinkDatum.class);
 FileOutputFormat.setOutputPath(inlinkJob, tempInlinkDb);
 inlinkJob.setOutputFormatClass(MapFileOutputFormat.class);
-conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs",
+inlinkJobConf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs",
 false);
 
 try {
@@ -672,7 +673,8 @@ public void createWebGraph(Path webGraphDb, Path[] segments,
 Path tempNodeDb = new Path(nodeDb + "-"
 + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
 
-Job nodeJob = NutchJob.getInstance(conf);
+Job nodeJob = NutchJob.getInstance(getConf());
+Configuration nodeJobConf = nodeJob.getConfiguration();
 nodeJob.setJobName("NodeDb " + nodeDb);
 LOG.info("NodeDb: adding input: " + outlinkDb);
 LOG.info("NodeDb: adding input: " + inlinkDb);
@@ -687,7 +689,7 @@ public void createWebGraph(Path webGraphDb, Path[] segments,
 nodeJob.setOutputValueClass(Node.class);
 FileOutputFormat.setOutputPath(nodeJob, tempNodeDb);
 nodeJob.setOutputFormatClass(MapFileOutputFormat.class);
-conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs",
+nodeJobConf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs",
 false);
 
 try {
