From 4b263533a9cdea208383fdbb0a8cc0b537423d7f Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Thu, 25 Apr 2024 17:44:08 +0200 Subject: [PATCH 1/3] NUTCH-3044 Generator: NPE when extracting the host part of a URL fails --- src/java/org/apache/nutch/crawl/Generator.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java index 33f743a37a..e7c52e2d4d 100644 --- a/src/java/org/apache/nutch/crawl/Generator.java +++ b/src/java/org/apache/nutch/crawl/Generator.java @@ -436,7 +436,10 @@ public void reduce(FloatWritable key, Iterable values, URL u = null; String hostname = URLUtil.getHost(urlString); - if (!hostname.equals(currentHostname)) { + if (hostname == null) { + currentHostname = hostname; + // malformed URLs are counted later on when extracting host or domain + } else if (!hostname.equals(currentHostname)) { currentHostname = hostname; host = hostDatumCache.get(hostname); From 4729786e4d7f9e1136580ceb191274862d03ba5b Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Sat, 27 Apr 2024 14:54:55 +0200 Subject: [PATCH 2/3] NUTCH-3044 Generator: NPE when extracting the host part of a URL fails - add unit test to prove that URLs without a host part do not cause errors --- .../org/apache/nutch/crawl/TestGenerator.java | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/src/test/org/apache/nutch/crawl/TestGenerator.java b/src/test/org/apache/nutch/crawl/TestGenerator.java index 56b24c1a23..151e365bb3 100644 --- a/src/test/org/apache/nutch/crawl/TestGenerator.java +++ b/src/test/org/apache/nutch/crawl/TestGenerator.java @@ -301,6 +301,56 @@ public void testFilter() throws IOException, Exception { } + /** + * Test that Generator can process URLs without a host part. 
+ * + * @throws Exception + * @throws IOException + */ + @Test + public void testURLNoHost() throws IOException, Exception { + + ArrayList list = new ArrayList(); + + list.add(createURLCrawlDatum("file:/path/index.html", 1, 1)); + int numValidURLs = 1; + // The following URL strings will cause a MalformedURLException: + // - unsupported scheme + list.add(createURLCrawlDatum("xyz://foobar/path/index.html", 1, 1)); + + createCrawlDB(list); + + Configuration myConfiguration = new Configuration(conf); + myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, -1); + myConfiguration.set(Generator.GENERATOR_COUNT_MODE, + Generator.GENERATOR_COUNT_VALUE_HOST); + + Path generatedSegment = generateFetchlist(Integer.MAX_VALUE, + myConfiguration, false); + + Path fetchlistPath = new Path(new Path(generatedSegment, + CrawlDatum.GENERATE_DIR_NAME), "part-r-00000"); + + ArrayList fetchList = readContents(fetchlistPath); + + Assert.assertEquals("Size of fetch list does not fit", + numValidURLs, fetchList.size()); + + myConfiguration.set(Generator.GENERATOR_COUNT_MODE, + Generator.GENERATOR_COUNT_VALUE_DOMAIN); + + generatedSegment = generateFetchlist(Integer.MAX_VALUE, + myConfiguration, false); + + fetchlistPath = new Path(new Path(generatedSegment, + CrawlDatum.GENERATE_DIR_NAME), "part-r-00000"); + + fetchList = readContents(fetchlistPath); + + Assert.assertEquals("Size of fetch list does not fit", + numValidURLs, fetchList.size()); + } + /** * Read contents of fetchlist. 
* From b153279ad5844b32560ecf62a8e7f83f8ecbd43c Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Sat, 27 Apr 2024 14:56:11 +0200 Subject: [PATCH 3/3] NUTCH-3044 Generator: NPE when extracting the host part of a URL fails - replace deprecated method call - improve and format Javadoc --- .../org/apache/nutch/crawl/Generator.java | 135 ++++++++++++------ .../org/apache/nutch/crawl/TestGenerator.java | 5 +- 2 files changed, 96 insertions(+), 44 deletions(-) diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java index e7c52e2d4d..7bb6159f01 100644 --- a/src/java/org/apache/nutch/crawl/Generator.java +++ b/src/java/org/apache/nutch/crawl/Generator.java @@ -81,7 +81,7 @@ import org.apache.nutch.util.URLUtil; /** - * Generates a subset of a crawl db to fetch. This version allows to generate + * Generates a subset of a CrawlDb to fetch. This version allows to generate * fetchlists for several segments in one go. Unlike in the initial version * (OldGenerator), the IP resolution is done ONLY on the entries which have been * selected for fetching. The URLs are partitioned by IP, domain or host within @@ -694,6 +694,29 @@ public Generator(Configuration conf) { setConf(conf); } + /** + * @param dbDir + * Crawl database directory + * @param segments + * Segments directory + * @param numLists + * Number of fetch lists (partitions) per segment or number of + * fetcher map tasks. (One fetch list partition is fetched in one + * fetcher map task.) + * @param topN + * Number of top URLs to be selected + * @param curTime + * Current time in milliseconds + * @return Path to generated segment or null if no entries were selected + * @throws IOException + * if an I/O exception occurs. + * @see LockUtil#createLockFile(Configuration, Path, boolean) + * @throws InterruptedException + * if a thread is waiting, sleeping, or otherwise occupied, and the + * thread is interrupted, either before or during the activity. 
+ * @throws ClassNotFoundException + * if runtime class(es) are not available + */ public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime) throws IOException, InterruptedException, ClassNotFoundException { @@ -707,31 +730,39 @@ public Path[] generate(Path dbDir, Path segments, int numLists, long topN, } /** - * This is an old signature used for compatibility - does not specify whether or not to - * normalise and set the number of segments to 1 + * This is an old signature used for compatibility - does not specify whether + * or not to normalise and set the number of segments to 1 + * * @param dbDir * Crawl database directory * @param segments * Segments directory * @param numLists - * Number of reduce tasks + * Number of fetch lists (partitions) per segment or number of + * fetcher map tasks. (One fetch list partition is fetched in one + * fetcher map task.) * @param topN * Number of top URLs to be selected * @param curTime * Current time in milliseconds - * @param filter whether to apply filtering operation - * @param force if true, and the target lockfile exists, consider it valid. If false - * and the target file exists, throw an IOException. - * @deprecated since 1.19 use - * {@link #generate(Path, Path, int, long, long, boolean, boolean, boolean, int, String, String)} - * or {@link #generate(Path, Path, int, long, long, boolean, boolean, boolean, int, String)} - * in the instance that no hostdb is available - * @throws IOException if an I/O exception occurs. + * @param filter + * whether to apply filtering operation + * @param force + * if true, and the target lockfile exists, consider it valid. If + * false and the target file exists, throw an IOException. 
+ * @deprecated since 1.19 use + * {@link #generate(Path, Path, int, long, long, boolean, boolean, boolean, int, String, String)} + * or + * {@link #generate(Path, Path, int, long, long, boolean, boolean, boolean, int, String)} + * in the instance that no hostdb is available + * @throws IOException + * if an I/O exception occurs. * @see LockUtil#createLockFile(Configuration, Path, boolean) - * @throws InterruptedException if a thread is waiting, sleeping, or - * otherwise occupied, and the thread is interrupted, either before or - * during the activity. - * @throws ClassNotFoundException if runtime class(es) are not available + * @throws InterruptedException + * if a thread is waiting, sleeping, or otherwise occupied, and the + * thread is interrupted, either before or during the activity. + * @throws ClassNotFoundException + * if runtime class(es) are not available * @return Path to generated segment or null if no entries were selected **/ @Deprecated @@ -748,29 +779,39 @@ public Path[] generate(Path dbDir, Path segments, int numLists, long topN, * is read from the "generate.filter" property set for the job from * command-line. If the property is not found, the URLs are filtered. Same for * the normalisation. + * * @param dbDir * Crawl database directory * @param segments * Segments directory * @param numLists - * Number of reduce tasks + * Number of fetch lists (partitions) per segment or number of + * fetcher map tasks. (One fetch list partition is fetched in one + * fetcher map task.) * @param topN * Number of top URLs to be selected * @param curTime * Current time in milliseconds - * @param filter whether to apply filtering operation - * @param norm whether to apply normalization operation - * @param force if true, and the target lockfile exists, consider it valid. If false - * and the target file exists, throw an IOException. - * @param maxNumSegments maximum number of segments to generate - * @param expr a Jexl expression to use in the Generator job. 
+ * @param filter + * whether to apply filtering operation + * @param norm + * whether to apply normalization operation + * @param force + * if true, and the target lockfile exists, consider it valid. If + * false and the target file exists, throw an IOException. + * @param maxNumSegments + * maximum number of segments to generate + * @param expr + * a Jexl expression to use in the Generator job. * @see JexlUtil#parseExpression(String) - * @throws IOException if an I/O exception occurs. + * @throws IOException + * if an I/O exception occurs. * @see LockUtil#createLockFile(Configuration, Path, boolean) - * @throws InterruptedException if a thread is waiting, sleeping, or - * otherwise occupied, and the thread is interrupted, either before or - * during the activity. - * @throws ClassNotFoundException if runtime class(es) are not available + * @throws InterruptedException + * if a thread is waiting, sleeping, or otherwise occupied, and the + * thread is interrupted, either before or during the activity. + * @throws ClassNotFoundException + * if runtime class(es) are not available * @return Path to generated segment or null if no entries were selected **/ public Path[] generate(Path dbDir, Path segments, int numLists, long topN, @@ -792,26 +833,36 @@ public Path[] generate(Path dbDir, Path segments, int numLists, long topN, * @param segments * Segments directory * @param numLists - * Number of reduce tasks + * Number of fetch lists (partitions) per segment or number of + * fetcher map tasks. (One fetch list partition is fetched in one + * fetcher map task.) * @param topN * Number of top URLs to be selected * @param curTime * Current time in milliseconds - * @param filter whether to apply filtering operation - * @param norm whether to apply normalization operation - * @param force if true, and the target lockfile exists, consider it valid. If false - * and the target file exists, throw an IOException. 
- * @param maxNumSegments maximum number of segments to generate - * @param expr a Jexl expression to use in the Generator job. - * @param hostdb name of a hostdb from which to execute Jexl expressions in a bid - * to determine the maximum URL count and/or fetch delay per host. + * @param filter + * whether to apply filtering operation + * @param norm + * whether to apply normalization operation + * @param force + * if true, and the target lockfile exists, consider it valid. If + * false and the target file exists, throw an IOException. + * @param maxNumSegments + * maximum number of segments to generate + * @param expr + * a Jexl expression to use in the Generator job. + * @param hostdb + * name of a hostdb from which to execute Jexl expressions in a bid + * to determine the maximum URL count and/or fetch delay per host. * @see JexlUtil#parseExpression(String) - * @throws IOException if an I/O exception occurs. + * @throws IOException + * if an I/O exception occurs. * @see LockUtil#createLockFile(Configuration, Path, boolean) - * @throws InterruptedException if a thread is waiting, sleeping, or - * otherwise occupied, and the thread is interrupted, either before or - * during the activity. - * @throws ClassNotFoundException if runtime class(es) are not available + * @throws InterruptedException + * if a thread is waiting, sleeping, or otherwise occupied, and the + * thread is interrupted, either before or during the activity. 
+ * @throws ClassNotFoundException + * if runtime class(es) are not available * @return Path to generated segment or null if no entries were selected */ public Path[] generate(Path dbDir, Path segments, int numLists, long topN, diff --git a/src/test/org/apache/nutch/crawl/TestGenerator.java b/src/test/org/apache/nutch/crawl/TestGenerator.java index 151e365bb3..1003e40c54 100644 --- a/src/test/org/apache/nutch/crawl/TestGenerator.java +++ b/src/test/org/apache/nutch/crawl/TestGenerator.java @@ -397,7 +397,7 @@ private Path generateFetchlist(int numResults, Configuration config, // generate segment Generator g = new Generator(config); Path[] generatedSegment = g.generate(dbDir, segmentsDir, -1, numResults, - Long.MAX_VALUE, filter, false); + Long.MAX_VALUE, filter, false, false, 1, null, null); if (generatedSegment == null) return null; return generatedSegment[0]; @@ -407,7 +407,8 @@ private Path generateFetchlist(int numResults, Configuration config, * Creates CrawlDB. * * @param list - * database contents + * database contents. The list must be lexicographically sorted by + * URL. * @throws IOException * @throws Exception */