WebGraph.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.scoring.webgraph;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.util.FSUtils;
import org.apache.nutch.util.HadoopFSUtil;
import org.apache.nutch.util.LockUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.TimingUtil;
import org.apache.nutch.util.URLUtil;
/**
 * Creates three databases, one for inlinks, one for outlinks, and a node
 * database that holds the number of in- and outlinks for a url and the
 * current score for the url.
*
* The score is set by an analysis program such as LinkRank. The WebGraph is an
* update-able database. Outlinks are stored by their fetch time or by the
* current system time if no fetch time is available. Only the most recent
* version of outlinks for a given url is stored. As more crawls are executed
* and the WebGraph updated, newer Outlinks will replace older Outlinks. This
* allows the WebGraph to adapt to changes in the link structure of the web.
*
* The Inlink database is created from the Outlink database and is regenerated
* when the WebGraph is updated. The Node database is created from both the
 * Inlink and Outlink databases. Because the Node database is overwritten when
 * the WebGraph is updated, and because it holds the current scores for urls,
 * it is recommended that a crawl cycle (one or more full crawls) fully
 * complete before the WebGraph is updated, and that an analysis program such
 * as LinkRank then be run to update the scores in the Node database in a
 * stable fashion.
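 *
 * <p>
 * A typical command line invocation looks like the sketch below (running the
 * class through the {@code bin/nutch} wrapper script is an assumption; the
 * options themselves are defined in {@link #run(String[])}):
 *
 * <pre>
 * bin/nutch org.apache.nutch.scoring.webgraph.WebGraph \
 *     -webgraphdb crawl/webgraphdb \
 *     -segmentDir crawl/segments \
 *     -normalize -filter
 * </pre>
 *
 * The resulting layout under the WebGraph database directory is
 * {@code inlinks}, {@code outlinks/current}, {@code outlinks/old} and
 * {@code nodes}, matching the {@code *_DIR} constants declared below.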
*/
public class WebGraph extends Configured implements Tool {
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
public static final String LOCK_NAME = ".locked";
public static final String INLINK_DIR = "inlinks";
public static final String OUTLINK_DIR = "outlinks/current";
public static final String OLD_OUTLINK_DIR = "outlinks/old";
public static final String NODE_DIR = "nodes";
/**
   * The OutlinkDb creates a database of all outlinks. Outlinks to internal
   * urls (by domain or host) can be ignored. The number of outlinks to a
   * given page or domain can also be limited.
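   *
   * <p>
   * Both behaviors are controlled by configuration properties read in the
   * {@code OutlinkDbReducer} setup (all default to true; setting them in
   * {@code nutch-site.xml} is the usual, assumed deployment):
   *
   * <pre>
   * link.ignore.internal.host   = true   // drop links within the same host
   * link.ignore.internal.domain = true   // drop links within the same domain
   * link.ignore.limit.page      = true   // at most one link to a given page
   * link.ignore.limit.domain    = true   // at most one link to a given domain
   * </pre>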
*/
public static class OutlinkDb extends Configured {
public static final String URL_NORMALIZING = "webgraph.url.normalizers";
public static final String URL_FILTERING = "webgraph.url.filters";
/**
* Returns the fetch time from the parse data or the current system time if
* the fetch time doesn't exist.
*
* @param data
* The parse data.
*
* @return The fetch time as a long.
*/
    private static long getFetchTime(ParseData data) {
      String fetchTimeStr = data.getContentMeta().get(Nutch.FETCH_TIME_KEY);
      try {
        // get the fetch time from the parse data
        return Long.parseLong(fetchTimeStr);
      } catch (NumberFormatException e) {
        // default to the current system time; this also covers a missing
        // value, since Long.parseLong(null) throws NumberFormatException
        return System.currentTimeMillis();
      }
    }
/**
* Default constructor.
*/
public OutlinkDb() {
}
/**
* Configurable constructor.
*/
public OutlinkDb(Configuration conf) {
setConf(conf);
}
/**
     * Passes through existing LinkDatum objects from an existing OutlinkDb
     * and maps out new LinkDatum objects from the ParseData of new crawls.
*/
public static class OutlinkDbMapper extends
Mapper<Text, Writable, Text, NutchWritable> {
// using normalizers and/or filters
private boolean normalize = false;
private boolean filter = false;
// url normalizers, filters and job configuration
private URLNormalizers urlNormalizers;
private URLFilters filters;
private Configuration conf;
/**
* Normalizes and trims extra whitespace from the given url.
*
* @param url
* The url to normalize.
*
* @return The normalized url.
*/
private String normalizeUrl(String url) {
if (!normalize) {
return url;
}
String normalized = null;
if (urlNormalizers != null) {
try {
// normalize and trim the url
normalized = urlNormalizers.normalize(url,
URLNormalizers.SCOPE_DEFAULT);
normalized = normalized.trim();
} catch (Exception e) {
LOG.warn("Skipping " + url + ":" + e);
normalized = null;
}
}
return normalized;
}
/**
* Filters the given url.
*
* @param url
* The url to filter.
*
* @return The filtered url or null.
*/
private String filterUrl(String url) {
if (!filter) {
return url;
}
try {
url = filters.filter(url);
} catch (Exception e) {
url = null;
}
return url;
}
      /**
       * Configures the OutlinkDb job mapper, setting up the URL normalizers
       * and filters if enabled.
       */
@Override
public void setup(Mapper<Text, Writable, Text, NutchWritable>.Context context) {
Configuration config = context.getConfiguration();
conf = config;
normalize = conf.getBoolean(URL_NORMALIZING, false);
filter = conf.getBoolean(URL_FILTERING, false);
if (normalize) {
urlNormalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_DEFAULT);
}
if (filter) {
filters = new URLFilters(conf);
}
}
@Override
public void map(Text key, Writable value,
Context context)
throws IOException, InterruptedException {
// normalize url, stop processing if null
String url = normalizeUrl(key.toString());
if (url == null) {
return;
}
// filter url
if (filterUrl(url) == null) {
return;
}
// Overwrite the key with the normalized URL
key.set(url);
if (value instanceof CrawlDatum) {
CrawlDatum datum = (CrawlDatum) value;
if (datum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP
|| datum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM
|| datum.getStatus() == CrawlDatum.STATUS_FETCH_GONE) {
// Tell the reducer to get rid of all instances of this key
context.write(key, new NutchWritable(new BooleanWritable(true)));
}
} else if (value instanceof ParseData) {
// get the parse data and the outlinks from the parse data, along with
// the fetch time for those links
ParseData data = (ParseData) value;
long fetchTime = getFetchTime(data);
Outlink[] outlinkAr = data.getOutlinks();
Map<String, String> outlinkMap = new LinkedHashMap<>();
// normalize urls and put into map
if (outlinkAr != null && outlinkAr.length > 0) {
for (int i = 0; i < outlinkAr.length; i++) {
Outlink outlink = outlinkAr[i];
String toUrl = normalizeUrl(outlink.getToUrl());
if (filterUrl(toUrl) == null) {
continue;
}
              // only put the url into the map if it isn't there yet, or
              // replace the existing entry if its anchor is null
              boolean existingUrl = outlinkMap.containsKey(toUrl);
              if (toUrl != null
                  && (!existingUrl || outlinkMap.get(toUrl) == null)) {
                outlinkMap.put(toUrl, outlink.getAnchor());
              }
}
}
// collect the outlinks under the fetch time
for (String outlinkUrl : outlinkMap.keySet()) {
String anchor = outlinkMap.get(outlinkUrl);
LinkDatum datum = new LinkDatum(outlinkUrl, anchor, fetchTime);
context.write(key, new NutchWritable(datum));
}
} else if (value instanceof LinkDatum) {
LinkDatum datum = (LinkDatum) value;
String linkDatumUrl = normalizeUrl(datum.getUrl());
if (filterUrl(linkDatumUrl) != null) {
datum.setUrl(linkDatumUrl);
// collect existing outlinks from existing OutlinkDb
context.write(key, new NutchWritable(datum));
}
}
}
}
public static class OutlinkDbReducer extends
Reducer<Text, NutchWritable, Text, LinkDatum> {
// ignoring internal domains, internal hosts
private boolean ignoreDomain = true;
private boolean ignoreHost = true;
// limiting urls out to a page or to a domain
private boolean limitPages = true;
private boolean limitDomains = true;
// url normalizers, filters and job configuration
private Configuration conf;
/**
* Configures the OutlinkDb job reducer. Sets up internal links and link limiting.
*/
public void setup(Reducer<Text, NutchWritable, Text, LinkDatum>.Context context) {
Configuration config = context.getConfiguration();
conf = config;
ignoreHost = conf.getBoolean("link.ignore.internal.host", true);
ignoreDomain = conf.getBoolean("link.ignore.internal.domain", true);
limitPages = conf.getBoolean("link.ignore.limit.page", true);
limitDomains = conf.getBoolean("link.ignore.limit.domain", true);
}
public void reduce(Text key, Iterable<NutchWritable> values,
Context context)
throws IOException, InterruptedException {
// aggregate all outlinks, get the most recent timestamp for a fetch
// which should be the timestamp for all of the most recent outlinks
long mostRecent = 0L;
List<LinkDatum> outlinkList = new ArrayList<>();
for (NutchWritable val : values) {
final Writable value = val.get();
if (value instanceof LinkDatum) {
// loop through, change out most recent timestamp if needed
LinkDatum next = (LinkDatum) value;
long timestamp = next.getTimestamp();
if (mostRecent == 0L || mostRecent < timestamp) {
mostRecent = timestamp;
}
outlinkList.add(WritableUtils.clone(next, conf));
context.getCounter("WebGraph.outlinks", "added links").increment(1);
} else if (value instanceof BooleanWritable) {
BooleanWritable delete = (BooleanWritable) value;
            // delete is always true here, otherwise the mapper would not
            // have emitted it in the first place
            if (delete.get()) {
              // this page is gone, do not emit its outlinks
context.getCounter("WebGraph.outlinks", "removed links").increment(1);
return;
}
}
}
// get the url, domain, and host for the url
String url = key.toString();
String domain = URLUtil.getDomainName(url);
String host = URLUtil.getHost(url);
// setup checking sets for domains and pages
Set<String> domains = new HashSet<>();
Set<String> pages = new HashSet<>();
// loop through the link datums
for (LinkDatum datum : outlinkList) {
// get the url, host, domain, and page for each outlink
String toUrl = datum.getUrl();
String toDomain = URLUtil.getDomainName(toUrl);
String toHost = URLUtil.getHost(toUrl);
String toPage = URLUtil.getPage(toUrl);
datum.setLinkType(LinkDatum.OUTLINK);
          // collect the outlink only if it is the most recent and it
          // conforms to the internal url and limiting rules
          if (datum.getTimestamp() == mostRecent
              && (!limitPages || !pages.contains(toPage))
              && (!limitDomains || !domains.contains(toDomain))
              && (!ignoreHost || !toHost.equalsIgnoreCase(host))
              && (!ignoreDomain || !toDomain.equalsIgnoreCase(domain))) {
context.write(key, datum);
pages.add(toPage);
domains.add(toDomain);
}
}
}
}
public void close() {
}
}
/**
* The InlinkDb creates a database of Inlinks. Inlinks are inverted from the
* OutlinkDb LinkDatum objects and are regenerated each time the WebGraph is
* updated.
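   *
   * <p>
   * For example, an outlink record keyed by {@code http://a.example/} that
   * points to {@code http://b.example/} becomes an inlink record keyed by
   * {@code http://b.example/} pointing back to {@code http://a.example/},
   * with the anchor text preserved. (The urls are illustrative only.)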
*/
  private static class InlinkDb extends Configured {
private static long timestamp;
/**
* Inverts the Outlink LinkDatum objects into new LinkDatum objects with a
* new system timestamp, type and to and from url switched.
*/
public static class InlinkDbMapper extends
Mapper<Text, LinkDatum, Text, LinkDatum> {
/**
* Configures job mapper. Sets timestamp for all Inlink LinkDatum objects to the
* current system time.
*/
public void setup(Mapper<Text, LinkDatum, Text, LinkDatum>.Context context) {
timestamp = System.currentTimeMillis();
}
public void map(Text key, LinkDatum datum,
Context context)
throws IOException, InterruptedException {
// get the to and from url and the anchor
String fromUrl = key.toString();
String toUrl = datum.getUrl();
String anchor = datum.getAnchor();
// flip the from and to url and set the new link type
LinkDatum inlink = new LinkDatum(fromUrl, anchor, timestamp);
inlink.setLinkType(LinkDatum.INLINK);
context.write(new Text(toUrl), inlink);
}
}
}
/**
* Creates the Node database which consists of the number of in and outlinks
* for each url and a score slot for analysis programs such as LinkRank.
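   *
   * <p>
   * For example, a url that appears in three INLINK records and two OUTLINK
   * records yields a Node with {@code numInlinks=3}, {@code numOutlinks=2}
   * and an initial {@code inlinkScore} of 0.0 (the counts are illustrative
   * only).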
*/
private static class NodeDb extends Configured {
/**
* Counts the number of inlinks and outlinks for each url and sets a default
* score of 0.0 for each url (node) in the webgraph.
*/
public static class NodeDbReducer extends
Reducer<Text, LinkDatum, Text, Node> {
/**
* Configures job reducer.
*/
public void setup(Reducer<Text, LinkDatum, Text, Node>.Context context) {
}
public void reduce(Text key, Iterable<LinkDatum> values,
Context context) throws IOException, InterruptedException {
Node node = new Node();
int numInlinks = 0;
int numOutlinks = 0;
// loop through counting number of in and out links
for (LinkDatum next : values) {
if (next.getLinkType() == LinkDatum.INLINK) {
numInlinks++;
} else if (next.getLinkType() == LinkDatum.OUTLINK) {
numOutlinks++;
}
}
// set the in and outlinks and a default score of 0
node.setNumInlinks(numInlinks);
node.setNumOutlinks(numOutlinks);
node.setInlinkScore(0.0f);
context.write(key, node);
}
}
}
/**
* Creates the three different WebGraph databases, Outlinks, Inlinks, and
   * Node. If a current WebGraph exists it is updated; if it doesn't exist, a
   * new WebGraph database is created.
*
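   * A minimal programmatic sketch (the paths are hypothetical):
   *
   * <pre>
   * WebGraph webGraph = new WebGraph();
   * webGraph.setConf(NutchConfiguration.create());
   * webGraph.createWebGraph(new Path("crawl/webgraphdb"),
   *     new Path[] { new Path("crawl/segments/20240101000000") }, true, true);
   * </pre>
   *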
* @param webGraphDb
* The WebGraph to create or update.
* @param segments
* The array of segments used to update the WebGraph. Newer segments
* and fetch times will overwrite older segments.
* @param normalize
* whether to use URLNormalizers on URL's in the segment
* @param filter
* whether to use URLFilters on URL's in the segment
*
* @throws IOException
* If an error occurs while processing the WebGraph.
*/
public void createWebGraph(Path webGraphDb, Path[] segments,
boolean normalize, boolean filter) throws IOException,
InterruptedException, ClassNotFoundException {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
long start = System.currentTimeMillis();
if (LOG.isInfoEnabled()) {
LOG.info("WebGraphDb: starting at " + sdf.format(start));
LOG.info("WebGraphDb: webgraphdb: " + webGraphDb);
LOG.info("WebGraphDb: URL normalize: " + normalize);
LOG.info("WebGraphDb: URL filter: " + filter);
}
FileSystem fs = webGraphDb.getFileSystem(getConf());
// lock an existing webgraphdb to prevent multiple simultaneous updates
Path lock = new Path(webGraphDb, LOCK_NAME);
if (!fs.exists(webGraphDb)) {
fs.mkdirs(webGraphDb);
}
LockUtil.createLockFile(fs, lock, false);
// outlink and temp outlink database paths
Path outlinkDb = new Path(webGraphDb, OUTLINK_DIR);
Path oldOutlinkDb = new Path(webGraphDb, OLD_OUTLINK_DIR);
if (!fs.exists(outlinkDb)) {
fs.mkdirs(outlinkDb);
}
Path tempOutlinkDb = new Path(outlinkDb + "-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
Job outlinkJob = NutchJob.getInstance(getConf());
Configuration outlinkJobConf = outlinkJob.getConfiguration();
outlinkJob.setJobName("Outlinkdb: " + outlinkDb);
boolean deleteGone = outlinkJobConf.getBoolean("link.delete.gone", false);
boolean preserveBackup = outlinkJobConf.getBoolean("db.preserve.backup", true);
if (deleteGone) {
LOG.info("OutlinkDb: deleting gone links");
}
// get the parse data and crawl fetch data for all segments
if (segments != null) {
for (int i = 0; i < segments.length; i++) {
FileSystem sfs = segments[i].getFileSystem(outlinkJobConf);
Path parseData = new Path(segments[i], ParseData.DIR_NAME);
if (sfs.exists(parseData)) {
LOG.info("OutlinkDb: adding input: " + parseData);
FileInputFormat.addInputPath(outlinkJob, parseData);
}
if (deleteGone) {
Path crawlFetch = new Path(segments[i], CrawlDatum.FETCH_DIR_NAME);
if (sfs.exists(crawlFetch)) {
LOG.info("OutlinkDb: adding input: " + crawlFetch);
FileInputFormat.addInputPath(outlinkJob, crawlFetch);
}
}
}
}
// add the existing webgraph
LOG.info("OutlinkDb: adding input: " + outlinkDb);
FileInputFormat.addInputPath(outlinkJob, outlinkDb);
outlinkJobConf.setBoolean(OutlinkDb.URL_NORMALIZING, normalize);
outlinkJobConf.setBoolean(OutlinkDb.URL_FILTERING, filter);
outlinkJob.setInputFormatClass(SequenceFileInputFormat.class);
outlinkJob.setJarByClass(OutlinkDb.class);
outlinkJob.setMapperClass(OutlinkDb.OutlinkDbMapper.class);
outlinkJob.setReducerClass(OutlinkDb.OutlinkDbReducer.class);
outlinkJob.setMapOutputKeyClass(Text.class);
outlinkJob.setMapOutputValueClass(NutchWritable.class);
outlinkJob.setOutputKeyClass(Text.class);
outlinkJob.setOutputValueClass(LinkDatum.class);
FileOutputFormat.setOutputPath(outlinkJob, tempOutlinkDb);
outlinkJob.setOutputFormatClass(MapFileOutputFormat.class);
outlinkJobConf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs",
false);
// run the outlinkdb job and replace any old outlinkdb with the new one
try {
LOG.info("OutlinkDb: running");
boolean success = outlinkJob.waitForCompletion(true);
if (!success) {
String message = "OutlinkDb job did not succeed, job status:"
+ outlinkJob.getStatus().getState() + ", reason: "
+ outlinkJob.getStatus().getFailureInfo();
LOG.error(message);
NutchJob.cleanupAfterFailure(tempOutlinkDb, lock, fs);
throw new RuntimeException(message);
}
LOG.info("OutlinkDb: installing " + outlinkDb);
FSUtils.replace(fs, oldOutlinkDb, outlinkDb, true);
FSUtils.replace(fs, outlinkDb, tempOutlinkDb, true);
if (!preserveBackup && fs.exists(oldOutlinkDb))
fs.delete(oldOutlinkDb, true);
LOG.info("OutlinkDb: finished");
} catch (IOException | InterruptedException | ClassNotFoundException e) {
LOG.error("OutlinkDb failed:", e);
      // remove lock file and temporary directory if an error occurs
NutchJob.cleanupAfterFailure(tempOutlinkDb, lock, fs);
throw e;
}
// inlink and temp link database paths
Path inlinkDb = new Path(webGraphDb, INLINK_DIR);
Path tempInlinkDb = new Path(inlinkDb + "-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
Job inlinkJob = NutchJob.getInstance(getConf());
Configuration inlinkJobConf = inlinkJob.getConfiguration();
inlinkJob.setJobName("Inlinkdb " + inlinkDb);
LOG.info("InlinkDb: adding input: " + outlinkDb);
FileInputFormat.addInputPath(inlinkJob, outlinkDb);
inlinkJob.setInputFormatClass(SequenceFileInputFormat.class);
inlinkJob.setJarByClass(InlinkDb.class);
inlinkJob.setMapperClass(InlinkDb.InlinkDbMapper.class);
inlinkJob.setMapOutputKeyClass(Text.class);
inlinkJob.setMapOutputValueClass(LinkDatum.class);
inlinkJob.setOutputKeyClass(Text.class);
inlinkJob.setOutputValueClass(LinkDatum.class);
FileOutputFormat.setOutputPath(inlinkJob, tempInlinkDb);
inlinkJob.setOutputFormatClass(MapFileOutputFormat.class);
inlinkJobConf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs",
false);
try {
// run the inlink and replace any old with new
LOG.info("InlinkDb: running");
boolean success = inlinkJob.waitForCompletion(true);
if (!success) {
String message = "InlinkDb job did not succeed, job status:"
+ inlinkJob.getStatus().getState() + ", reason: "
+ inlinkJob.getStatus().getFailureInfo();
LOG.error(message);
NutchJob.cleanupAfterFailure(tempInlinkDb, lock, fs);
throw new RuntimeException(message);
}
LOG.info("InlinkDb: installing " + inlinkDb);
FSUtils.replace(fs, inlinkDb, tempInlinkDb, true);
LOG.info("InlinkDb: finished");
} catch (IOException | InterruptedException | ClassNotFoundException e) {
LOG.error("InlinkDb failed:", e);
      // remove lock file and temporary directory if an error occurs
NutchJob.cleanupAfterFailure(tempInlinkDb, lock, fs);
throw e;
}
// node and temp node database paths
Path nodeDb = new Path(webGraphDb, NODE_DIR);
Path tempNodeDb = new Path(nodeDb + "-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
Job nodeJob = NutchJob.getInstance(getConf());
Configuration nodeJobConf = nodeJob.getConfiguration();
nodeJob.setJobName("NodeDb " + nodeDb);
LOG.info("NodeDb: adding input: " + outlinkDb);
LOG.info("NodeDb: adding input: " + inlinkDb);
FileInputFormat.addInputPath(nodeJob, outlinkDb);
FileInputFormat.addInputPath(nodeJob, inlinkDb);
nodeJob.setInputFormatClass(SequenceFileInputFormat.class);
nodeJob.setJarByClass(NodeDb.class);
nodeJob.setReducerClass(NodeDb.NodeDbReducer.class);
nodeJob.setMapOutputKeyClass(Text.class);
nodeJob.setMapOutputValueClass(LinkDatum.class);
nodeJob.setOutputKeyClass(Text.class);
nodeJob.setOutputValueClass(Node.class);
FileOutputFormat.setOutputPath(nodeJob, tempNodeDb);
nodeJob.setOutputFormatClass(MapFileOutputFormat.class);
nodeJobConf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs",
false);
try {
// run the node job and replace old nodedb with new
LOG.info("NodeDb: running");
boolean success = nodeJob.waitForCompletion(true);
if (!success) {
String message = "NodeDb job did not succeed, job status:"
+ nodeJob.getStatus().getState() + ", reason: "
+ nodeJob.getStatus().getFailureInfo();
LOG.error(message);
        // remove lock file and temporary directory if an error occurs
NutchJob.cleanupAfterFailure(tempNodeDb, lock, fs);
throw new RuntimeException(message);
}
LOG.info("NodeDb: installing " + nodeDb);
FSUtils.replace(fs, nodeDb, tempNodeDb, true);
LOG.info("NodeDb: finished");
} catch (IOException | InterruptedException | ClassNotFoundException e) {
LOG.error("NodeDb failed:", e);
      // remove lock file and temporary directory if an error occurs
NutchJob.cleanupAfterFailure(tempNodeDb, lock, fs);
throw e;
}
// remove the lock file for the webgraph
LockUtil.removeLockFile(fs, lock);
long end = System.currentTimeMillis();
LOG.info("WebGraphDb: finished at " + sdf.format(end) + ", elapsed: "
+ TimingUtil.elapsedTime(start, end));
}
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(NutchConfiguration.create(), new WebGraph(), args);
System.exit(res);
}
/**
   * Parses command line arguments and runs the WebGraph jobs.
*/
public int run(String[] args) throws Exception {
// boolean options
Option helpOpt = new Option("h", "help", false, "show this help message");
Option normOpt = new Option("n", "normalize", false,
"whether to use URLNormalizers on the URL's in the segment");
Option filtOpt = new Option("f", "filter", false,
"whether to use URLFilters on the URL's in the segment");
// argument options
@SuppressWarnings("static-access")
Option graphOpt = OptionBuilder
.withArgName("webgraphdb")
.hasArg()
.withDescription(
"the web graph database to create (if none exists) or use if one does")
.create("webgraphdb");
@SuppressWarnings("static-access")
Option segOpt = OptionBuilder.withArgName("segment").hasArgs()
.withDescription("the segment(s) to use").create("segment");
@SuppressWarnings("static-access")
Option segDirOpt = OptionBuilder.withArgName("segmentDir").hasArgs()
.withDescription("the segment directory to use").create("segmentDir");
// create the options
Options options = new Options();
options.addOption(helpOpt);
options.addOption(normOpt);
options.addOption(filtOpt);
options.addOption(graphOpt);
options.addOption(segOpt);
options.addOption(segDirOpt);
CommandLineParser parser = new GnuParser();
try {
CommandLine line = parser.parse(options, args);
if (line.hasOption("help") || !line.hasOption("webgraphdb")
|| (!line.hasOption("segment") && !line.hasOption("segmentDir"))) {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp("WebGraph", options, true);
return -1;
}
String webGraphDb = line.getOptionValue("webgraphdb");
Path[] segPaths = null;
// Handle segment option
if (line.hasOption("segment")) {
String[] segments = line.getOptionValues("segment");
segPaths = new Path[segments.length];
for (int i = 0; i < segments.length; i++) {
segPaths[i] = new Path(segments[i]);
}
}
// Handle segmentDir option
if (line.hasOption("segmentDir")) {
Path dir = new Path(line.getOptionValue("segmentDir"));
FileSystem fs = dir.getFileSystem(getConf());
FileStatus[] fstats = fs.listStatus(dir,
HadoopFSUtil.getPassDirectoriesFilter(fs));
segPaths = HadoopFSUtil.getPaths(fstats);
}
boolean normalize = false;
if (line.hasOption("normalize")) {
normalize = true;
}
boolean filter = false;
if (line.hasOption("filter")) {
filter = true;
}
createWebGraph(new Path(webGraphDb), segPaths, normalize, filter);
return 0;
} catch (Exception e) {
LOG.error("WebGraph: " + StringUtils.stringifyException(e));
return -2;
}
}
}