Skip to content

Commit

Permalink
NUTCH-3044 Generator: NPE when extracting the host part of a URL fails
Browse files Browse the repository at this point in the history
- replace deprecated method call
- improve and format Javadoc
  • Loading branch information
sebastian-nagel committed Apr 27, 2024
1 parent 4729786 commit b153279
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 44 deletions.
135 changes: 93 additions & 42 deletions src/java/org/apache/nutch/crawl/Generator.java
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@
import org.apache.nutch.util.URLUtil;

/**
* Generates a subset of a crawl db to fetch. This version allows to generate
* Generates a subset of a CrawlDb to fetch. This version allows generating
* fetchlists for several segments in one go. Unlike in the initial version
* (OldGenerator), the IP resolution is done ONLY on the entries which have been
* selected for fetching. The URLs are partitioned by IP, domain or host within
Expand Down Expand Up @@ -694,6 +694,29 @@ public Generator(Configuration conf) {
setConf(conf);
}

/**
* @param dbDir
* Crawl database directory
* @param segments
* Segments directory
* @param numLists
* Number of fetch lists (partitions) per segment or number of
* fetcher map tasks. (One fetch list partition is fetched in one
* fetcher map task.)
* @param topN
* Number of top URLs to be selected
* @param curTime
* Current time in milliseconds
* @return Paths to generated segments or null if no entries were selected
* @throws IOException
* if an I/O exception occurs.
* @see LockUtil#createLockFile(Configuration, Path, boolean)
* @throws InterruptedException
* if a thread is waiting, sleeping, or otherwise occupied, and the
* thread is interrupted, either before or during the activity.
* @throws ClassNotFoundException
* if runtime class(es) are not available
*/
public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
long curTime)
throws IOException, InterruptedException, ClassNotFoundException {
Expand All @@ -707,31 +730,39 @@ public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
}

/**
* This is an old signature used for compatibility - does not specify whether or not to
* normalise and set the number of segments to 1
* This is an old signature kept for compatibility: it does not specify
* whether or not to normalise, and it sets the number of segments to 1
*
* @param dbDir
* Crawl database directory
* @param segments
* Segments directory
* @param numLists
* Number of reduce tasks
* Number of fetch lists (partitions) per segment or number of
* fetcher map tasks. (One fetch list partition is fetched in one
* fetcher map task.)
* @param topN
* Number of top URLs to be selected
* @param curTime
* Current time in milliseconds
* @param filter whether to apply filtering operation
* @param force if true, and the target lockfile exists, consider it valid. If false
* and the target file exists, throw an IOException.
* @deprecated since 1.19 use
* {@link #generate(Path, Path, int, long, long, boolean, boolean, boolean, int, String, String)}
* or {@link #generate(Path, Path, int, long, long, boolean, boolean, boolean, int, String)}
* in the instance that no hostdb is available
* @throws IOException if an I/O exception occurs.
* @param filter
* whether to apply filtering operation
* @param force
* if true, and the target lockfile exists, consider it valid. If
* false and the target file exists, throw an IOException.
* @deprecated since 1.19 use
* {@link #generate(Path, Path, int, long, long, boolean, boolean, boolean, int, String, String)}
* or
* {@link #generate(Path, Path, int, long, long, boolean, boolean, boolean, int, String)}
* in the instance that no hostdb is available
* @throws IOException
* if an I/O exception occurs.
* @see LockUtil#createLockFile(Configuration, Path, boolean)
* @throws InterruptedException if a thread is waiting, sleeping, or
* otherwise occupied, and the thread is interrupted, either before or
* during the activity.
* @throws ClassNotFoundException if runtime class(es) are not available
* @throws InterruptedException
* if a thread is waiting, sleeping, or otherwise occupied, and the
* thread is interrupted, either before or during the activity.
* @throws ClassNotFoundException
* if runtime class(es) are not available
* @return Paths to generated segments or null if no entries were selected
**/
@Deprecated
Expand All @@ -748,29 +779,39 @@ public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
* is read from the "generate.filter" property set for the job from
* command-line. If the property is not found, the URLs are filtered. Same for
* the normalisation.
*
* @param dbDir
* Crawl database directory
* @param segments
* Segments directory
* @param numLists
* Number of reduce tasks
* Number of fetch lists (partitions) per segment or number of
* fetcher map tasks. (One fetch list partition is fetched in one
* fetcher map task.)
* @param topN
* Number of top URLs to be selected
* @param curTime
* Current time in milliseconds
* @param filter whether to apply filtering operation
* @param norm whether to apply normalization operation
* @param force if true, and the target lockfile exists, consider it valid. If false
* and the target file exists, throw an IOException.
* @param maxNumSegments maximum number of segments to generate
* @param expr a Jexl expression to use in the Generator job.
* @param filter
* whether to apply filtering operation
* @param norm
* whether to apply normalization operation
* @param force
* if true, and the target lockfile exists, consider it valid. If
* false and the target file exists, throw an IOException.
* @param maxNumSegments
* maximum number of segments to generate
* @param expr
* a Jexl expression to use in the Generator job.
* @see JexlUtil#parseExpression(String)
* @throws IOException if an I/O exception occurs.
* @throws IOException
* if an I/O exception occurs.
* @see LockUtil#createLockFile(Configuration, Path, boolean)
* @throws InterruptedException if a thread is waiting, sleeping, or
* otherwise occupied, and the thread is interrupted, either before or
* during the activity.
* @throws ClassNotFoundException if runtime class(es) are not available
* @throws InterruptedException
* if a thread is waiting, sleeping, or otherwise occupied, and the
* thread is interrupted, either before or during the activity.
* @throws ClassNotFoundException
* if runtime class(es) are not available
* @return Paths to generated segments or null if no entries were selected
**/
public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
Expand All @@ -792,26 +833,36 @@ public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
* @param segments
* Segments directory
* @param numLists
* Number of reduce tasks
* Number of fetch lists (partitions) per segment or number of
* fetcher map tasks. (One fetch list partition is fetched in one
* fetcher map task.)
* @param topN
* Number of top URLs to be selected
* @param curTime
* Current time in milliseconds
* @param filter whether to apply filtering operation
* @param norm whether to apply normalization operation
* @param force if true, and the target lockfile exists, consider it valid. If false
* and the target file exists, throw an IOException.
* @param maxNumSegments maximum number of segments to generate
* @param expr a Jexl expression to use in the Generator job.
* @param hostdb name of a hostdb from which to execute Jexl expressions in a bid
* to determine the maximum URL count and/or fetch delay per host.
* @param filter
* whether to apply filtering operation
* @param norm
* whether to apply normalization operation
* @param force
* if true, and the target lockfile exists, consider it valid. If
* false and the target file exists, throw an IOException.
* @param maxNumSegments
* maximum number of segments to generate
* @param expr
* a Jexl expression to use in the Generator job.
* @param hostdb
* name of a hostdb from which to execute Jexl expressions in a bid
* to determine the maximum URL count and/or fetch delay per host.
* @see JexlUtil#parseExpression(String)
* @throws IOException if an I/O exception occurs.
* @throws IOException
* if an I/O exception occurs.
* @see LockUtil#createLockFile(Configuration, Path, boolean)
* @throws InterruptedException if a thread is waiting, sleeping, or
* otherwise occupied, and the thread is interrupted, either before or
* during the activity.
* @throws ClassNotFoundException if runtime class(es) are not available
* @throws InterruptedException
* if a thread is waiting, sleeping, or otherwise occupied, and the
* thread is interrupted, either before or during the activity.
* @throws ClassNotFoundException
* if runtime class(es) are not available
* @return Paths to generated segments or null if no entries were selected
*/
public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
Expand Down
5 changes: 3 additions & 2 deletions src/test/org/apache/nutch/crawl/TestGenerator.java
Original file line number Diff line number Diff line change
Expand Up @@ -397,7 +397,7 @@ private Path generateFetchlist(int numResults, Configuration config,
// generate segment
Generator g = new Generator(config);
Path[] generatedSegment = g.generate(dbDir, segmentsDir, -1, numResults,
Long.MAX_VALUE, filter, false);
Long.MAX_VALUE, filter, false, false, 1, null, null);
if (generatedSegment == null)
return null;
return generatedSegment[0];
Expand All @@ -407,7 +407,8 @@ private Path generateFetchlist(int numResults, Configuration config,
* Creates CrawlDB.
*
* @param list
* database contents
* database contents. The list must be lexicographically sorted by
* URL.
* @throws IOException
* @throws Exception
*/
Expand Down

0 comments on commit b153279

Please sign in to comment.