From 0ea78907dee6b07058b66a99e395aea8cf623e92 Mon Sep 17 00:00:00 2001
From: Furkan KAMACI
* NOTE: values of DEC_FACTOR and INC_FACTOR higher than 0.4f may destabilize
* the algorithm, so that the fetch interval either increases or decreases
- * infinitely, with little relevance to the page changes. Please use
- * {@link #main(String[])} method to test the values before applying them in a
- * production system.
- * nutch-site.xml to enable HTTP basic authentication,
* digest authentication or SSL when communicating with RESTAPI.
- * Set restapi.auth.username and restapi.auth.password properties at <code>nutch-site.xml</code> to configure
+ * Set restapi.auth.username and restapi.auth.password properties at nutch-site.xml to configure
* credentials when BASIC or DIGEST authentication is used.
* Set restapi.auth.ssl.storepath, restapi.auth.ssl.storepass and restapi.auth.ssl.keypass when SSL is used.
*
@@ -117,12 +117,14 @@ public NutchServer() {
* 'INFO' however best attempts should always be made to specify a logging
* level.<br>
* {@link org.apache.nutch.api.NutchServer} can be run as secure. restapi.auth property
- * should be set to BASIC, DIGEST or SSL at <code>nutch-site.xml</code> to enable HTTP basic authentication,
+ * should be set to BASIC, DIGEST or SSL at nutch-site.xml to enable HTTP basic authentication,
* digest authentication or SSL when communicating with RESTAPI.
- * Set restapi.auth.username and restapi.auth.password properties at <code>nutch-site.xml</code> to configure
+ * Set restapi.auth.username and restapi.auth.password properties at nutch-site.xml to configure
* credentials when BASIC or DIGEST authentication is used.
* Set restapi.auth.ssl.storepath, restapi.auth.ssl.storepass and restapi.auth.ssl.keypass when SSL is used.
*
+ * @param ramConfManager {@link RAMConfManager}
+ *
* @see org.apache.nutch.api.security.AuthenticationTypeEnum
*/
public NutchServer(RAMConfManager ramConfManager) {
@@ -137,12 +139,15 @@ public NutchServer(RAMConfManager ramConfManager) {
* 'INFO' however best attempts should always be made to specify a logging
* level.<br>
* {@link org.apache.nutch.api.NutchServer} can be run as secure. restapi.auth property
- * should be set to BASIC, DIGEST or SSL at <code>nutch-site.xml</code> to enable HTTP basic authentication,
+ * should be set to BASIC, DIGEST or SSL at nutch-site.xml to enable HTTP basic authentication,
* digest authentication or SSL when communicating with RESTAPI.
- * Set restapi.auth.username and restapi.auth.password properties at <code>nutch-site.xml</code> to configure
+ * Set restapi.auth.username and restapi.auth.password properties at nutch-site.xml to configure
* credentials when BASIC or DIGEST authentication is used.
* Set restapi.auth.ssl.storepath, restapi.auth.ssl.storepass and restapi.auth.ssl.keypass when SSL is used.
*
+ * @param ramConfManager {@link RAMConfManager}
+ * @param confId active configuration id
+ *
* @see org.apache.nutch.api.security.AuthenticationTypeEnum
*/
public NutchServer(RAMConfManager ramConfManager, String confId) {
@@ -305,7 +310,7 @@ public void start() {
/**
* Safety and convenience method to determine whether or not it is safe to
* shut down the server. We make this assertion by consulting the
- * {@link org.apache.nutch.api.NutchApp#jobManager} for a list of jobs with
+ * {@link #getJobMgr()} for a list of jobs with
* {@link org.apache.nutch.api.model.response.JobInfo#state} equal to
* 'RUNNING'.
*
@@ -356,8 +361,8 @@ public boolean stop(boolean force) {
/**
* Main method for NutchServer to run via command line.
*
- * @param args arguments for log level, stopping the Server and port.
- * @throws Exception
+ * @param args arguments for log level, stopping the Server and port.
+ * @throws Exception exception
*/
public static void main(String[] args) throws Exception {
CommandLineParser parser = new PosixParser();
diff --git a/src/java/org/apache/nutch/api/impl/RAMConfManager.java b/src/java/org/apache/nutch/api/impl/RAMConfManager.java
index 13c05fd285..356a8bdd25 100644
--- a/src/java/org/apache/nutch/api/impl/RAMConfManager.java
+++ b/src/java/org/apache/nutch/api/impl/RAMConfManager.java
@@ -50,6 +50,9 @@ public RAMConfManager() {
/**
* Public constructor which accepts a configuration id and {@link Configuration} type configuration.
+ *
+ * @param confId configuration id
+ * @param configuration configuration
*/
public RAMConfManager(String confId, Configuration configuration) {
configurations.put(confId, configuration);
diff --git a/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java b/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
index 045f4cd777..8070c7b376 100755
--- a/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
+++ b/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
@@ -76,6 +76,7 @@ public void setConf(Configuration conf) {
* @param url
* URL of the page.
* @param page
+ * {@link WebPage} object relative to the URL
*/
@Override
public void initializeSchedule(String url, WebPage page) {
@@ -104,13 +105,7 @@ public void setFetchSchedule(String url, WebPage page, long prevFetchTime,
* @param url
* URL of the page
* @param page
- * @return adjusted page information, including all original information.
- * NOTE: this may be a different instance than
- * @param datum
- * , but implementations should make sure that it contains at least
- * all information from
- * @param datum
- * .
+ * {@link WebPage} object relative to the URL
*/
@Override
public void setPageGoneSchedule(String url, WebPage page, long prevFetchTime,
@@ -134,6 +129,7 @@ public void setPageGoneSchedule(String url, WebPage page, long prevFetchTime,
* @param url
* URL of the page
* @param page
+ * {@link WebPage} object relative to the URL
* @param prevFetchTime
* previous fetch time
* @param prevModifiedTime
@@ -163,15 +159,15 @@ public long calculateLastFetchTime(WebPage page) {
* in the current fetchlist. NOTE: a true return value does not guarantee that
* the page will be fetched, it just allows it to be included in the further
* selection process based on scores. The default implementation checks
- * fetchTime, if it is higher than the
- *
- * @param curTime
- * it returns false, and true otherwise. It will also check that
- * fetchTime is not too remote (more than maxIntervalfetchTime, if it is higher than the current time
+ * it returns false, and true otherwise. It will also check that
+ * fetchTime is not too remote (more than maxInterval),
+ * in which case it lowers the interval and returns true.
+ *
* @param url
* URL of the page
* @param page
+ * {@link WebPage} object relative to the URL
* @param curTime
* reference time (usually set to the time when the fetchlist
* generation process was started).
@@ -200,6 +196,7 @@ public boolean shouldFetch(String url, WebPage page, long curTime) {
* @param url
* URL of the page
* @param page
+ * {@link WebPage} object relative to the URL
* @param asap
* if true, force refetch as soon as possible - this sets the
* fetchTime to now. If false, force refetch whenever the next fetch
diff --git a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
index 1c2780a224..30c6ec7375 100755
--- a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
+++ b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
@@ -46,10 +46,8 @@
* fetchTime, if
@@ -90,13 +86,10 @@ public void setFetchSchedule(String url, WebPage page, long prevFetchTime,
/**
* This method specifies how to schedule refetching of pages marked as GONE.
- * Default implementation increases fetchInterval by 50%, and if it exceeds
- * the maxInterval it calls
- * {@link #forceRefetch(Text, CrawlDatum, boolean)}.
- *
* @param url
* URL of the page
* @param page
+ * {@link WebPage} object relative to the URL
*/
public void setPageGoneSchedule(String url, WebPage page, long prevFetchTime,
long prevModifiedTime, long fetchTime);
@@ -109,6 +102,7 @@ public void setPageGoneSchedule(String url, WebPage page, long prevFetchTime,
* @param url
* URL of the page
* @param page
+ * {@link WebPage} object relative to the URL
* @param prevFetchTime
* previous fetch time
* @param prevModifiedTime
@@ -133,14 +127,8 @@ public void setPageRetrySchedule(String url, WebPage page,
* selection process based on scores. The default implementation checks
* fetchTime, if it is higher than the
*
- * @param curTime
- * it returns false, and true otherwise. It will also check that
- * fetchTime is not too remote (more than maxInterval getFields(Job job) {
return fields;
}
- /** Generate a random batch id */
+ /**
+ * Generates a random batch id
+ *
+ * @return random batch id
+ */
public static String randomBatchId() {
long curTime = System.currentTimeMillis();
int randomSeed = Math.abs(new Random().nextInt());
@@ -173,6 +177,13 @@ public static String randomBatchId() {
return batchId;
}
+ /**
+ * Runs generator
+ *
+ * @param args map of arguments
+ * @return results
+ * @throws Exception
+ */
public Map
* The algorithm to calculate a page "profile" takes the plain text version of a * page and performs the following steps: diff --git a/src/java/org/apache/nutch/fetcher/FetcherJob.java b/src/java/org/apache/nutch/fetcher/FetcherJob.java index a7f3df8efc..015c209da3 100644 --- a/src/java/org/apache/nutch/fetcher/FetcherJob.java +++ b/src/java/org/apache/nutch/fetcher/FetcherJob.java @@ -79,19 +79,17 @@ public class FetcherJob extends NutchTool implements Tool { /** *
* Mapper class for Fetcher. - *
+ * ** This class reads the random integer written by {@link GeneratorJob} as its * key while outputting the actual key and value arguments through a * {@link FetchEntry} instance. - *
+ * *- * This approach (combined with the use of {@link PartitionUrlByHost}) makes - * sure that Fetcher is still polite while also randomizing the key order. If * one host has a huge number of URLs in your table while other hosts have * not, {@link FetcherReducer} will not be stuck on one host but process URLs * from other hosts as well. - *
+ * */ public static class FetcherMapper extends GoraMapper
* You can define a set of contexts (or scopes) in which normalizers may be
* called. Each scope can have its own list of normalizers (defined in
- * "urlnormalizer.scope.
* In case no normalizers are required for any given scope, a
* org.apache.nutch.net.urlnormalizer.pass.PassURLNormalizer should
* be used.
- *
* Each normalizer may further select among many configurations, depending on * the scope in which it is called, because the scope name is passed as a * parameter to each normalizer. You can also use the same normalizer for many * scopes. - *
+ * ** Several scopes have been defined, and various Nutch tools will attempt using * scope-specific normalizers first (and fall back to default config if * scope-specific configuration is missing). - *
+ * ** Normalizers may be run several times, to ensure that modifications introduced * by normalizers at the end of the list can be further reduced by normalizers @@ -83,7 +83,7 @@ * want to run this loop up to the number of activated normalizers. This loop * count can be configured through urlnormalizer.loop.count property. * As soon as the url is unchanged the loop will stop and return the result. - *
+ * * * @author Andrzej Bialecki */ diff --git a/src/java/org/apache/nutch/parse/NutchSitemapParse.java b/src/java/org/apache/nutch/parse/NutchSitemapParse.java index c0a9d9b6ca..0e57339ba3 100644 --- a/src/java/org/apache/nutch/parse/NutchSitemapParse.java +++ b/src/java/org/apache/nutch/parse/NutchSitemapParse.java @@ -6,9 +6,9 @@ * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at - * + ** http://www.apache.org/licenses/LICENSE-2.0 - *
+ *
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/src/java/org/apache/nutch/parse/ParsePluginsReader.java b/src/java/org/apache/nutch/parse/ParsePluginsReader.java
index dddd025163..b4c6f4e810 100644
--- a/src/java/org/apache/nutch/parse/ParsePluginsReader.java
+++ b/src/java/org/apache/nutch/parse/ParsePluginsReader.java
@@ -69,12 +69,10 @@ public ParsePluginsReader() {
/**
* Reads the parse-plugins.xml file and returns the
- * {@link #ParsePluginList} defined by it.
+ * {@link ParsePluginList} defined by it.
*
- * @return A {@link #ParsePluginList} specified by the
+ * @return A {@link ParsePluginList} specified by the
* parse-plugins.xml file.
- * @throws Exception
- * If any parsing error occurs.
*/
public ParsePluginList parse(Configuration conf) {
diff --git a/src/java/org/apache/nutch/parse/Parser.java b/src/java/org/apache/nutch/parse/Parser.java
index b623fd0262..9a8c2b7bf0 100644
--- a/src/java/org/apache/nutch/parse/Parser.java
+++ b/src/java/org/apache/nutch/parse/Parser.java
@@ -34,7 +34,7 @@ public interface Parser extends FieldPluggable, Configurable {
/**
*
* This method parses content in WebPage instance - *
+ * * * @param url * Page's URL diff --git a/src/java/org/apache/nutch/parse/ParserChecker.java b/src/java/org/apache/nutch/parse/ParserChecker.java index 12faeae90a..4d5c572968 100644 --- a/src/java/org/apache/nutch/parse/ParserChecker.java +++ b/src/java/org/apache/nutch/parse/ParserChecker.java @@ -48,13 +48,13 @@ * is used to remove duplicates during the dedup procedure. It is calculated * using {@link org.apache.nutch.crawl.MD5Signature} or * {@link org.apache.nutch.crawl.TextProfileSignature}.
* The ArchRecordReader class provides a record reader which reads
* records from arc files.
- *
* Arc files are essentially tars of gzips. Each record in an arc file is a * compressed gzip. Multiple records are concatenated together to form a - * complete arc. For more information on the arc file format see {@link http - * ://www.archive.org/web/researcher/ArcFileFormat.php}. - *
- * + * complete arc. For more information on the arc file format see + * + * http://www.archive.org/web/researcher/ArcFileFormat.php. + * ** Arc files are used by the internet archive and grub projects. - *
+ * * - * @see http://www.archive.org/ - * @see http://www.grub.org/ + * @see http://www.archive.org/ + * @see http://www.grub.org/ */ public class ArcRecordReader implements RecordReader* Returns true if the byte array passed matches the gzip header magic number. - *
+ * * * @param input * The byte array to check. @@ -174,7 +174,7 @@ public float getProgress() throws IOException { * Returns true if the next record in the split is read into the key and value * pair. The key will be the arc record header and the values will be the raw * content bytes of the arc record. - * + * * * @param key * The record key diff --git a/src/java/org/apache/nutch/util/Bytes.java b/src/java/org/apache/nutch/util/Bytes.java index db9f4689c9..043a89761d 100644 --- a/src/java/org/apache/nutch/util/Bytes.java +++ b/src/java/org/apache/nutch/util/Bytes.java @@ -980,7 +980,7 @@ public static long readVLong(final byte[] buffer, final int offset) * left operand * @param right * right operand - * @return 0 if equal, < 0 if left is less than right, etc. + * @return 0 if equal, < 0 if left is less than right, etc. */ public static int compareTo(final byte[] left, final byte[] right) { return compareTo(left, 0, left.length, right, 0, right.length); @@ -1001,7 +1001,7 @@ public static int compareTo(final byte[] left, final byte[] right) { * How much to compare from the left buffer * @param length2 * How much to compare from the right buffer - * @return 0 if equal, < 0 if left is less than right, etc. + * @return 0 if equal, < 0 if left is less than right, etc. */ public static int compareTo(byte[] buffer1, int offset1, int length1, byte[] buffer2, int offset2, int length2) { @@ -1050,7 +1050,7 @@ public static boolean startsWith(byte[] bytes, byte[] prefix) { * @return Runs {@link WritableComparator#hashBytes(byte[], int)} on the * passed in array. This method is what * {@link org.apache.hadoop.io.Text} and - * {@link ImmutableBytesWritable} use calculating hash code. + * org.apache.hadoop.hbase.io.ImmutableBytesWritable use calculating hash code. 
*/ public static int hashCode(final byte[] b) { return hashCode(b, b.length); @@ -1064,7 +1064,7 @@ public static int hashCode(final byte[] b) { * @return Runs {@link WritableComparator#hashBytes(byte[], int)} on the * passed in array. This method is what * {@link org.apache.hadoop.io.Text} and - * {@link ImmutableBytesWritable} use calculating hash code. + * org.apache.hadoop.hbase.io.ImmutableBytesWritable use calculating hash code. */ public static int hashCode(final byte[] b, final int length) { return WritableComparator.hashBytes(b, length); @@ -1366,12 +1366,12 @@ else if (cmp < 0) * given amount. * * @param value - * - array of bytes containing long (length <= SIZEOF_LONG) + * - array of bytes containing long (length <= SIZEOF_LONG) * @param amount * value will be incremented on (deincremented if negative) * @return array of bytes containing incremented long (length == SIZEOF_LONG) * @throws IOException - * - if value.length > SIZEOF_LONG + * - if value.length > SIZEOF_LONG */ public static byte[] incrementBytes(byte[] value, long amount) throws IOException { diff --git a/src/java/org/apache/nutch/util/EncodingDetector.java b/src/java/org/apache/nutch/util/EncodingDetector.java index 5b40e29e5d..25f8eefccd 100644 --- a/src/java/org/apache/nutch/util/EncodingDetector.java +++ b/src/java/org/apache/nutch/util/EncodingDetector.java @@ -45,7 +45,7 @@ ** A caller will often have some extra information about what the encoding might @@ -56,7 +56,7 @@ *
null is returned. null is returned. * This method was copied from org.apache.catalina.util.RequestUtil, which is * licensed under the Apache License, Version 2.0 (the "License"). * diff --git a/src/java/org/apache/nutch/util/MimeUtil.java b/src/java/org/apache/nutch/util/MimeUtil.java index 198fdee596..241087c91a 100644 --- a/src/java/org/apache/nutch/util/MimeUtil.java +++ b/src/java/org/apache/nutch/util/MimeUtil.java @@ -50,7 +50,7 @@ * substrate library, Apache * Tika. Any mime handling code should be placed in this utility * class, and hidden from the Nutch classes that rely on it. - *
+ * */ public final class MimeUtil { @@ -229,7 +229,7 @@ public String autoResolveContentType(String typeName, String url, byte[] data) { * method. * * @param url - * A string representation of the document {@link URL} to sense the + * A string representation of the document. URL to sense the * {@link MimeType} for. * @return An appropriate {@link MimeType}, identified from the given Document * url in string form. diff --git a/src/java/org/apache/nutch/util/NodeWalker.java b/src/java/org/apache/nutch/util/NodeWalker.java index 16e84c3598..3e0b0e1827 100644 --- a/src/java/org/apache/nutch/util/NodeWalker.java +++ b/src/java/org/apache/nutch/util/NodeWalker.java @@ -27,12 +27,12 @@ * of recursion. As the node tree is walked the next node is popped off of the * stack and all of its children are automatically added to the stack to be * called in tree order. - * + * * *
* Currently this class is not thread safe. It is assumed that only one thread
* will be accessing the NodeWalker at any given time.
- *
Node on the stack or null if there isn't
* a next node.
@@ -90,12 +90,12 @@ public Node nextNode() {
* When getting a next node from the walker, that node's children are
* automatically added to the stack. You can call this method to remove those
* children from the stack.
- *
+ *
*
* * This is useful when you don't want to process deeper into the current path * of the node tree but you want to continue processing sibling nodes. - *
+ * * */ public void skipChildren() { diff --git a/src/java/org/apache/nutch/util/NutchJob.java b/src/java/org/apache/nutch/util/NutchJob.java index c0456c150c..029e7aece7 100644 --- a/src/java/org/apache/nutch/util/NutchJob.java +++ b/src/java/org/apache/nutch/util/NutchJob.java @@ -63,7 +63,7 @@ public NutchJob(Configuration conf, String jobName) throws IOException { * Creates a new {@link NutchJob} with no particular {@link org.apache.hadoop.mapreduce.Cluster} and a * given {@link org.apache.hadoop.conf.Configuration}. * - * TheNutchJob makes a copy of the Configuration so
+ * The NutchJob makes a copy of the Configuration so
* that any necessary internal modifications do not reflect on the incoming
* parameter.
*
@@ -87,7 +87,7 @@ public static NutchJob getInstance(Configuration conf) throws IOException {
* and a given jobName.
* A Cluster will be created from the conf parameter only when it's needed.
*
- * The NutchJob makes a copy of the Configuration so
+ * The NutchJob makes a copy of the Configuration so
* that any necessary internal modifications do not reflect on the incoming
* parameter.
*
diff --git a/src/java/org/apache/nutch/util/NutchTool.java b/src/java/org/apache/nutch/util/NutchTool.java
index 1f5789a608..443d1da681 100644
--- a/src/java/org/apache/nutch/util/NutchTool.java
+++ b/src/java/org/apache/nutch/util/NutchTool.java
@@ -36,11 +36,19 @@ public abstract class NutchTool extends Configured {
/**
* Runs the tool, using a map of arguments. May return results, or null.
+ *
+ * @param args map of arguments
+ * @return results or null
+ * @throws Exception
*/
public abstract Mapinput that is matched,
- * or null if no match exists.
+ * Returns the shortest prefix of input that is matched,
+ * or null if no match exists.
*/
public String shortestMatch(String input) {
TrieNode node = root;
@@ -86,8 +86,8 @@ public String shortestMatch(String input) {
}
/**
- * Returns the longest prefix of input that is matched,
- * or null if no match exists.
+ * Returns the longest prefix of input that is matched,
+ * or null if no match exists.
*/
public String longestMatch(String input) {
TrieNode node = root;
diff --git a/src/java/org/apache/nutch/util/SuffixStringMatcher.java b/src/java/org/apache/nutch/util/SuffixStringMatcher.java
index a967c0177f..6e070b935c 100644
--- a/src/java/org/apache/nutch/util/SuffixStringMatcher.java
+++ b/src/java/org/apache/nutch/util/SuffixStringMatcher.java
@@ -65,8 +65,8 @@ public boolean matches(String input) {
}
/**
- * Returns the shortest suffix of input that is matched,
- * or null if no match exists.
+ * Returns the shortest suffix of input that is matched,
+ * or null if no match exists.
*/
public String shortestMatch(String input) {
TrieNode node = root;
@@ -81,8 +81,8 @@ public String shortestMatch(String input) {
}
/**
- * Returns the longest suffix of input that is matched,
- * or null if no match exists.
+ * Returns the longest suffix of input that is matched,
+ * or null if no match exists.
*/
public String longestMatch(String input) {
TrieNode node = root;
diff --git a/src/java/org/apache/nutch/util/TableUtil.java b/src/java/org/apache/nutch/util/TableUtil.java
index 68ded699fe..e6ccbbc476 100644
--- a/src/java/org/apache/nutch/util/TableUtil.java
+++ b/src/java/org/apache/nutch/util/TableUtil.java
@@ -33,7 +33,7 @@ public class TableUtil {
* E.g. "http://bar.foo.com:8983/to/index.html?a=b" becomes
* "com.foo.bar:8983:http/to/index.html?a=b".
*
- * @param url
+ * @param urlString
* url to be reversed
* @return Reversed url
* @throws MalformedURLException
@@ -111,7 +111,7 @@ public static String unreverseUrl(String reversedUrl) {
/**
* Given a reversed url, returns the reversed host E.g
- * "com.foo.bar:http:8983/to/index.html?a=b" -> "com.foo.bar"
+ * "com.foo.bar:http:8983/to/index.html?a=b" -> "com.foo.bar"
*
* @param reversedUrl
* Reversed url
diff --git a/src/java/org/apache/nutch/util/TimingUtil.java b/src/java/org/apache/nutch/util/TimingUtil.java
index 524bee6ff6..497716c4bf 100644
--- a/src/java/org/apache/nutch/util/TimingUtil.java
+++ b/src/java/org/apache/nutch/util/TimingUtil.java
@@ -32,7 +32,7 @@ public class TimingUtil {
* @param end
* The end of the time period
* @return a string of the form "XhYmZs" when the elapsed time is X hours, Y
- * minutes and Z seconds or null if start > end.
+ * minutes and Z seconds or null if start > end.
*/
public static String elapsedTime(long start, long end) {
if (start > end) {
diff --git a/src/java/org/apache/nutch/util/TrieStringMatcher.java b/src/java/org/apache/nutch/util/TrieStringMatcher.java
index 95f06ad6f6..e7773cb668 100644
--- a/src/java/org/apache/nutch/util/TrieStringMatcher.java
+++ b/src/java/org/apache/nutch/util/TrieStringMatcher.java
@@ -186,15 +186,15 @@ protected final void addPatternBackward(String s) {
public abstract boolean matches(String input);
/**
- * Returns the shortest substring of input that is
- * matched by a pattern in the trie, or null if no match
+ * Returns the shortest substring of input that is
+ * matched by a pattern in the trie, or null if no match
* exists.
*/
public abstract String shortestMatch(String input);
/**
- * Returns the longest substring of input that is
- * matched by a pattern in the trie, or null if no match
+ * Returns the longest substring of input that is
+ * matched by a pattern in the trie, or null if no match
* exists.
*/
public abstract String longestMatch(String input);
diff --git a/src/java/org/apache/nutch/util/URLUtil.java b/src/java/org/apache/nutch/util/URLUtil.java
index 5183ba10b0..e1df9e3604 100644
--- a/src/java/org/apache/nutch/util/URLUtil.java
+++ b/src/java/org/apache/nutch/util/URLUtil.java
@@ -219,52 +219,49 @@ public static String[] getHostBatches(String url)
* Yahoo! Slurp crawler described here:
* How
- * does the Yahoo! webcrawler handle redirects?
+ * does the Yahoo! webcrawler handle redirects?
*
- *
- * - Choose target url if either url is malformed.
- * - If different domains the keep the destination whether or not the
- * redirect is temp or perm
- *
- * - a.com -> b.com*
- *
- * - If the redirect is permanent and the source is root, keep the source.
- *
- * - *a.com -> a.com?y=1 || *a.com -> a.com/xyz/index.html
- *
- * - If the redirect is permanent and the source is not root and the
- * destination is root, keep the destination
- *
- * - a.com/xyz/index.html -> a.com*
- *
- * - If the redirect is permanent and neither the source nor the destination
- * is root, then keep the destination
- *
- * - a.com/xyz/index.html -> a.com/abc/page.html*
- *
- * - If the redirect is temporary and source is root and destination is not
- * root, then keep the source
- *
- * - *a.com -> a.com/xyz/index.html
- *
- * - If the redirect is temporary and source is not root and destination is
- * root, then keep the destination
- *
- * - a.com/xyz/index.html -> a.com*
- *
- * - If the redirect is temporary and neither the source or the destination
+ *
+ *
+ *
+ * - Choose target url if either url is malformed.
+ *
+ * - If different domains the keep the destination whether or not the
+ * redirect is temp or perm
+ * - a.com -> b.com*
+ *
+ * - If the redirect is permanent and the source is root, keep the source.
+ * - *a.com -> a.com?y=1 || *a.com -> a.com/xyz/index.html
+ *
+ * - If the redirect is permanent and the source is not root and the
+ * destination is root, keep the destination
+ * - a.com/xyz/index.html -> a.com*
+ *
+ * - If the redirect is permanent and neither the source nor the destination
+ * is root, then keep the destination
+ * - a.com/xyz/index.html -> a.com/abc/page.html*
+ *
+ * - If the redirect is temporary and source is root and destination is not
+ * root, then keep the source
+ * - *a.com -> a.com/xyz/index.html
+ *
+ * - If the redirect is temporary and source is not root and destination is
+ * root, then keep the destination
+ * - a.com/xyz/index.html -> a.com*
+ *
+ * - If the redirect is temporary and neither the source or the destination
* is root, then keep the shortest url. First check for the shortest host, and
* if both are equal then check by path. Path is first by length then by the
- * number of / path separators.
- *
- * - a.com/xyz/index.html -> a.com/abc/page.html*
- * - *www.a.com/xyz/index.html -> www.news.a.com/xyz/index.html
- *
- * - If the redirect is temporary and both the source and the destination
- * are root, then keep the shortest sub-domain
- *
- * - *www.a.com -> www.news.a.com
- *
+ * number of / path separators.
+ * - a.com/xyz/index.html -> a.com/abc/page.html*
+ * - *www.a.com/xyz/index.html -> www.news.a.com/xyz/index.html
+ *
+ * - If the redirect is temporary and both the source and the destination
+ * are root, then keep the shortest sub-domain
+ * - *www.a.com -> www.news.a.com
+ *
+ *
+ *
*
* While not in this logic there is a further piece of representative url
* logic that occurs during indexing and after scoring. During creation of the
diff --git a/src/java/org/apache/nutch/util/domain/DomainSuffix.java b/src/java/org/apache/nutch/util/domain/DomainSuffix.java
index ae03ec4eea..0e0b7b02fc 100644
--- a/src/java/org/apache/nutch/util/domain/DomainSuffix.java
+++ b/src/java/org/apache/nutch/util/domain/DomainSuffix.java
@@ -23,17 +23,16 @@
* name of a host. The domain name of a host is defined to be the last part
* before the domain suffix, w/o subdomain names. As an example the domain name
* of
- * http://lucene.apache.org/
+ * http://lucene.apache.org/
*
* is apache.org
* This class holds three fields, domain field represents the
* suffix (such as "co.uk") boost is a float for boosting score
* of url's with this suffix status field represents domain's
- * status
+ * status. Check also domain-suffixes.xml
*
* @author Enis Soztutar <enis.soz.nutch@gmail.com>
* @see TopLevelDomain
- * @see domain-suffixes.xml
*/
public class DomainSuffix {
diff --git a/src/java/org/apache/nutch/util/domain/TopLevelDomain.java b/src/java/org/apache/nutch/util/domain/TopLevelDomain.java
index 6386335e06..87e370e5af 100644
--- a/src/java/org/apache/nutch/util/domain/TopLevelDomain.java
+++ b/src/java/org/apache/nutch/util/domain/TopLevelDomain.java
@@ -24,8 +24,8 @@
* top-level domain is com.
*
* @author Enis Soztutar <enis.soz.nutch@gmail.com>
- * @see http://www.iana.org/
- * @see http://en.wikipedia.org/wiki/Top-level_domain
+ * @see http://www.iana.org/
+ * @see http://en.wikipedia.org/wiki/Top-level_domain
*/
public class TopLevelDomain extends DomainSuffix {
diff --git a/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java b/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java
index 9df4e2724c..0751ddc1be 100644
--- a/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java
+++ b/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java
@@ -64,7 +64,7 @@
*
* A new RSS/ATOM Feed{@link Parser} that rapidly parses all referenced links
* and content present in the feed.
- *
+ *
*
*/
public class FeedParser implements Parser {
diff --git a/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java b/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
index 9e2e75bee1..7e0e24688b 100644
--- a/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
+++ b/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
@@ -36,7 +36,7 @@
* Indexing filter that offers an option to either index all inbound anchor text
* for a document or deduplicate anchors. Deduplication does have it's con's,
*
- * @see {@code anchorIndexingFilter.deduplicate} in nutch-default.xml.
+ * Check {@code anchorIndexingFilter.deduplicate} in nutch-default.xml.
*/
public class AnchorIndexingFilter implements IndexingFilter {
diff --git a/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java b/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
index fdd3b8120f..a97e9edb29 100644
--- a/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
+++ b/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
@@ -37,7 +37,7 @@
* Indexer which can be configured to extract metadata from the crawldb, parse
* metadata or content metadata. You can specify the properties "index.db",
* "index.parse" or "index.content" who's values are comma-delimited
- * key1,key2,key3 .
+ * {@code key1,key2,key3}.
*/
public class MetadataIndexer implements IndexingFilter {
diff --git a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java b/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
index b1d99e5ed6..9171b1cebb 100644
--- a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
+++ b/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
@@ -148,14 +148,14 @@ private NutchDocument addLength(NutchDocument doc, WebPage page, String url) {
* primaryType and subType to field "type" as un-stored, indexed and
* un-tokenized, so that search results can be confined by contentType or its
* primaryType or its subType.
- *
+ *
*
* For example, if contentType is application/vnd.ms-powerpoint, search can be
* done with one of the following qualifiers
* type:application/vnd.ms-powerpoint type:application type:vnd.ms-powerpoint
* all case insensitive. The query filter is implemented in
* {@link TypeQueryFilter}.
- *
+ *
*
* @param doc
* @param data
diff --git a/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java b/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
index f3af6a987f..064cd8db2a 100644
--- a/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
+++ b/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
@@ -84,12 +84,19 @@ public class HTMLLanguageParser implements ParseFilter {
/**
* Scan the HTML document looking at possible indications of content language
- * - 1. html lang attribute
- * (http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1)
- 2. meta
+ *
+ * - html lang attribute
+ * (http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1)
+ *
+ * - meta
* dc.language
* (http://dublincore.org/documents/2000/07/16/usageguide/qualified
- * -html.shtml#language)
- 3. meta http-equiv (content-language)
- * (http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2)
+ * -html.shtml#language)
+ *
+ * - meta http-equiv (content-language)
+ * (http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2)
+ *
+ *
*/
public Parse filter(String url, WebPage page, Parse parse,
HTMLMetaTags metaTags, DocumentFragment doc) {
diff --git a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
index d374e95a35..a1475a71c7 100644
--- a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
+++ b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
@@ -44,19 +44,13 @@
* expressions.
*
*
- * The regular expressions rules are expressed in a file. The file of rules is
- * provided by each implementation using the
- * {@link #getRulesFile(Configuration)} method.
- *
- *
- *
- * The format of this file is made of many rules (one per line):
+ * The format of this file is made of many rules (one per line):
*
* [+-]<regex>
- *
+ *
* where plus (+)means go ahead and index it and minus (
* -)means no.
- *
+ *
*/
public abstract class RegexURLFilterBase implements URLFilter {
diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java
index 31b54dab3d..6bd430555c 100644
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java
+++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java
@@ -174,7 +174,7 @@ protected void append(Node newNode) throws org.xml.sax.SAXException {
* supply a locator: if it does so, it must supply the locator to the
* application by invoking this method before invoking any of the other
* methods in the ContentHandler interface.
- *
+ *
*
*
* The locator allows the application to determine the end position of any
@@ -183,13 +183,13 @@ protected void append(Node newNode) throws org.xml.sax.SAXException {
* errors (such as character content that does not match an application's
* business rules). The information returned by the locator is probably not
* sufficient for use with a search engine.
- *
+ *
*
*
* Note that the locator will return correct information only during the
* invocation of the events in this interface. The application should not
* attempt to use it at any other time.
- *
+ *
*
* @param locator
* An object that can return the location of any SAX document event.
@@ -206,7 +206,7 @@ public void setDocumentLocator(Locator locator) {
*
* The SAX parser will invoke this method only once, before any other methods
* in this interface or in DTDHandler (except for setDocumentLocator).
- *
+ *
*/
public void startDocument() throws org.xml.sax.SAXException {
@@ -221,7 +221,7 @@ public void startDocument() throws org.xml.sax.SAXException {
* method invoked during the parse. The parser shall not invoke this method
* until it has either abandoned parsing (because of an unrecoverable error)
* or reached the end of input.
- *
+ *
*/
public void endDocument() throws org.xml.sax.SAXException {
@@ -237,14 +237,14 @@ public void endDocument() throws org.xml.sax.SAXException {
* startElement() event (even when the element is empty). All of the element's
* content will be reported, in order, before the corresponding endElement()
* event.
- *
+ *
*
*
* If the element name has a namespace prefix, the prefix will still be
* attached. Note that the attribute list provided will contain only
* attributes with explicit values (specified or defaulted): #IMPLIED
* attributes will be omitted.
- *
+ *
*
*
* @param ns
@@ -328,12 +328,12 @@ public void startElement(String ns, String localName, String name,
* The SAX parser will invoke this method at the end of every element in the
* XML document; there will be a corresponding startElement() event for every
* endElement() event (even when the element is empty).
- *
+ *
*
*
* If the element name has a namespace prefix, the prefix will still be
* attached to the name.
- *
+ *
*
*
* @param ns
@@ -371,18 +371,18 @@ public void setIDAttribute(String id, Element elem) {
* they may split it into several chunks; however, all of the characters in
* any single event must come from the same external entity, so that the
* Locator provides useful information.
- *
+ *
*
*
* The application must not attempt to read from the array outside of the
* specified range.
- *
+ *
*
*
* Note that some parsers will report whitespace using the
* ignorableWhitespace() method rather than this one (validating parsers must
* do so).
- *
+ *
*
* @param ch
* The characters from the XML document.
@@ -489,19 +489,19 @@ public void entityReference(String name) throws org.xml.sax.SAXException {
* whitespace (see the W3C XML 1.0 recommendation, section 2.10):
* non-validating parsers may also use this method if they are capable of
* parsing and using content models.
- *
+ *
*
*
* SAX parsers may return all contiguous whitespace in a single chunk, or they
* may split it into several chunks; however, all of the characters in any
* single event must come from the same external entity, so that the Locator
* provides useful information.
- *
+ *
*
*
* The application must not attempt to read from the array outside of the
* specified range.
- *
+ *
*
* @param ch
* The characters from the XML document.
@@ -539,12 +539,12 @@ private boolean isOutsideDocElem() {
* The Parser will invoke this method once for each processing instruction
* found: note that processing instructions may occur before or after the main
* document element.
- *
+ *
*
*
* A SAX parser should never report an XML declaration (XML 1.0, section 2.8)
* or a text declaration (XML 1.0, section 4.3.1) using this method.
- *
+ *
*
* @param target
* The processing instruction target.
@@ -608,18 +608,18 @@ public void endCDATA() throws org.xml.sax.SAXException {
* they may split it into several chunks; however, all of the characters in
* any single event must come from the same external entity, so that the
* Locator provides useful information.
- *
+ *
*
*
* The application must not attempt to read from the array outside of the
* specified range.
- *
+ *
*
*
* Note that some parsers will report whitespace using the
* ignorableWhitespace() method rather than this one (validating parsers must
* do so).
- *
+ *
*
* @param ch
* The characters from the XML document.
@@ -687,14 +687,14 @@ public void endDTD() throws org.xml.sax.SAXException {
* processing: the SAX XML reader will automatically replace prefixes for
* element and attribute names when the http://xml.org/sax/features/namespaces
* feature is true (the default).
- *
+ *
*
*
* There are cases, however, when applications need to use prefixes in
* character data or in attribute values, where they cannot safely be expanded
* automatically; the start/endPrefixMapping event supplies the information to
* the application to expand prefixes in those contexts itself, if necessary.
- *
+ *
*
*
* Note that start/endPrefixMapping events are not guaranteed to be properly
@@ -702,7 +702,7 @@ public void endDTD() throws org.xml.sax.SAXException {
* before the corresponding startElement event, and all endPrefixMapping
* events will occur after the corresponding endElement event, but their order
* is not guaranteed.
- *
+ *
*
* @param prefix
* The Namespace prefix being declared.
@@ -735,7 +735,7 @@ public void startPrefixMapping(String prefix, String uri)
* See startPrefixMapping for details. This event will always occur after the
* corresponding endElement event, but the order of endPrefixMapping events is
* not otherwise guaranteed.
- *
+ *
*
* @param prefix
* The prefix that was being mapping.
@@ -755,7 +755,7 @@ public void endPrefixMapping(String prefix) throws org.xml.sax.SAXException {
* DTD subset). All processors may skip external entities, depending on the
* values of the http://xml.org/sax/features/external-general-entities and the
* http://xml.org/sax/features/external-parameter-entities properties.
- *
+ *
*
* @param name
* The name of the skipped entity. If it is a parameter entity, the
diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
index 8e079fb992..488cacd657 100644
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
+++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
@@ -113,7 +113,7 @@ public boolean getText(StringBuilder sb, Node node,
/**
* This is a convinience method, equivalent to
- * {@link #getText(StringBuffer,Node,boolean) getText(sb, node, false)}.
+ * {@link #getText(StringBuilder, Node, boolean)}, passing {@code false} as the third argument.
*
*/
public void getText(StringBuilder sb, Node node) {
diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java
index cfef10cc56..0143f06545 100644
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java
+++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java
@@ -35,7 +35,7 @@ public class XMLCharacterRecognizer {
* Returns whether the specified ch conforms to the XML 1.0
* definition of whitespace. Refer to the definition of
- * S for details.
+ * S for details.
*
* @param ch
* Character to check as XML whitespace.
diff --git a/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java b/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
index a48175548e..a68584444e 100644
--- a/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
+++ b/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
@@ -79,10 +79,10 @@ public class JSParseFilter implements ParseFilter, Parser {
* {@link WebPage} object relative to the URL
* @param parse
* {@link Parse} object holding parse status
- * @param metatags
- * within the {@link NutchDocument}
+ * @param metaTags
+ * the {@link HTMLMetaTags} extracted from the page
* @param doc
- * The {@link NutchDocument} object
+ * The {@link DocumentFragment} object
* @return parse the actual {@link Parse} object
*/
@Override
diff --git a/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java b/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java
index a3f779ad89..4fbcad3cc4 100644
--- a/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java
+++ b/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java
@@ -280,7 +280,7 @@ public void setY(int y) {
/*
* There are some issues with this method: sometimes SWF files define their
- * own font, so short of OCR we cannot guess what is the glyph code -> character
+ * own font, so short of OCR we cannot guess what is the glyph code -> character
* mapping. Additionally, some files don't use literal space character, instead
* they adjust glyphAdvances. We don't handle it at all - in such cases the text
* will be all glued together.
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java
index 4f4c8a78fa..db59d13e9c 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java
@@ -174,7 +174,7 @@ protected void append(Node newNode) throws org.xml.sax.SAXException {
* supply a locator: if it does so, it must supply the locator to the
* application by invoking this method before invoking any of the other
* methods in the ContentHandler interface.
- *
+ *
*
*
* The locator allows the application to determine the end position of any
@@ -183,13 +183,13 @@ protected void append(Node newNode) throws org.xml.sax.SAXException {
* errors (such as character content that does not match an application's
* business rules). The information returned by the locator is probably not
* sufficient for use with a search engine.
- *
+ *
*
*
* Note that the locator will return correct information only during the
* invocation of the events in this interface. The application should not
* attempt to use it at any other time.
- *
+ *
*
* @param locator
* An object that can return the location of any SAX document event.
@@ -206,7 +206,7 @@ public void setDocumentLocator(Locator locator) {
*
* The SAX parser will invoke this method only once, before any other methods
* in this interface or in DTDHandler (except for setDocumentLocator).
- *
+ *
*/
public void startDocument() throws org.xml.sax.SAXException {
@@ -221,7 +221,7 @@ public void startDocument() throws org.xml.sax.SAXException {
* method invoked during the parse. The parser shall not invoke this method
* until it has either abandoned parsing (because of an unrecoverable error)
* or reached the end of input.
- *
+ *
*/
public void endDocument() throws org.xml.sax.SAXException {
@@ -237,14 +237,14 @@ public void endDocument() throws org.xml.sax.SAXException {
* startElement() event (even when the element is empty). All of the element's
* content will be reported, in order, before the corresponding endElement()
* event.
- *
+ *
*
*
* If the element name has a namespace prefix, the prefix will still be
* attached. Note that the attribute list provided will contain only
* attributes with explicit values (specified or defaulted): #IMPLIED
* attributes will be omitted.
- *
+ *
*
*
* @param ns
@@ -328,12 +328,12 @@ public void startElement(String ns, String localName, String name,
* The SAX parser will invoke this method at the end of every element in the
* XML document; there will be a corresponding startElement() event for every
* endElement() event (even when the element is empty).
- *
+ *
*
*
* If the element name has a namespace prefix, the prefix will still be
* attached to the name.
- *
+ *
*
*
* @param ns
@@ -373,18 +373,18 @@ public void setIDAttribute(String id, Element elem) {
* they may split it into several chunks; however, all of the characters in
* any single event must come from the same external entity, so that the
* Locator provides useful information.
- *
+ *
*
*
* The application must not attempt to read from the array outside of the
* specified range.
- *
+ *
*
*
* Note that some parsers will report whitespace using the
* ignorableWhitespace() method rather than this one (validating parsers must
* do so).
- *
+ *
*
* @param ch
* The characters from the XML document.
@@ -491,19 +491,19 @@ public void entityReference(String name) throws org.xml.sax.SAXException {
* whitespace (see the W3C XML 1.0 recommendation, section 2.10):
* non-validating parsers may also use this method if they are capable of
* parsing and using content models.
- *
+ *
*
*
* SAX parsers may return all contiguous whitespace in a single chunk, or they
* may split it into several chunks; however, all of the characters in any
* single event must come from the same external entity, so that the Locator
* provides useful information.
- *
+ *
*
*
* The application must not attempt to read from the array outside of the
* specified range.
- *
+ *
*
* @param ch
* The characters from the XML document.
@@ -541,12 +541,12 @@ private boolean isOutsideDocElem() {
* The Parser will invoke this method once for each processing instruction
* found: note that processing instructions may occur before or after the main
* document element.
- *
+ *
*
*
* A SAX parser should never report an XML declaration (XML 1.0, section 2.8)
* or a text declaration (XML 1.0, section 4.3.1) using this method.
- *
+ *
*
* @param target
* The processing instruction target.
@@ -610,18 +610,18 @@ public void endCDATA() throws org.xml.sax.SAXException {
* they may split it into several chunks; however, all of the characters in
* any single event must come from the same external entity, so that the
* Locator provides useful information.
- *
+ *
*
*
* The application must not attempt to read from the array outside of the
* specified range.
- *
+ *
*
*
* Note that some parsers will report whitespace using the
* ignorableWhitespace() method rather than this one (validating parsers must
* do so).
- *
+ *
*
* @param ch
* The characters from the XML document.
@@ -689,14 +689,14 @@ public void endDTD() throws org.xml.sax.SAXException {
* processing: the SAX XML reader will automatically replace prefixes for
* element and attribute names when the http://xml.org/sax/features/namespaces
* feature is true (the default).
- *
+ *
*
*
* There are cases, however, when applications need to use prefixes in
* character data or in attribute values, where they cannot safely be expanded
* automatically; the start/endPrefixMapping event supplies the information to
* the application to expand prefixes in those contexts itself, if necessary.
- *
+ *
*
*
* Note that start/endPrefixMapping events are not guaranteed to be properly
@@ -704,7 +704,7 @@ public void endDTD() throws org.xml.sax.SAXException {
* before the corresponding startElement event, and all endPrefixMapping
* events will occur after the corresponding endElement event, but their order
* is not guaranteed.
- *
+ *
*
* @param prefix
* The Namespace prefix being declared.
@@ -737,7 +737,7 @@ public void startPrefixMapping(String prefix, String uri)
* See startPrefixMapping for details. This event will always occur after the
* corresponding endElement event, but the order of endPrefixMapping events is
* not otherwise guaranteed.
- *
+ *
*
* @param prefix
* The prefix that was being mapping.
@@ -757,7 +757,7 @@ public void endPrefixMapping(String prefix) throws org.xml.sax.SAXException {
* DTD subset). All processors may skip external entities, depending on the
* values of the http://xml.org/sax/features/external-general-entities and the
* http://xml.org/sax/features/external-parameter-entities properties.
- *
+ *
*
* @param name
* The name of the skipped entity. If it is a parameter entity, the
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java
index d625c33119..b5c95ce9ae 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java
@@ -35,7 +35,7 @@ class XMLCharacterRecognizer {
* Returns whether the specified ch conforms to the XML 1.0
* definition of whitespace. Refer to the definition of
- * S for details.
+ * S for details.
*
* @param ch
* Character to check as XML whitespace.
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java
index 67d3dcc5ce..19035c0f60 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java
@@ -59,7 +59,7 @@ public class TestRSSParser {
/**
*
* The test method: tests out the following 2 asserts:
- *
+ *
*
*
* - There are 3 outlinks read from the sample rss file
diff --git a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
index 06954394b0..20ba474aa4 100644
--- a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
+++ b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
@@ -99,8 +99,8 @@ public void setMaxContentLength(int maxContentLength) {
*
* @param url
* Text containing the url
- * @param datum
- * The CrawlDatum object corresponding to the url
+ * @param page
+ * {@link WebPage} object relative to the URL
*
* @return {@link ProtocolOutput} object for the content of the file indicated
* by url
diff --git a/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java b/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java
index 856a649a86..952648ff33 100644
--- a/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java
+++ b/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java
@@ -39,7 +39,7 @@
*
*
* Unit tests for the {@link File}Protocol.
- *
+ *
* .
*/
public class TestProtocolFile {
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
index ffa2091760..8b272ec432 100644
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
+++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
@@ -54,7 +54,7 @@
* servers out there, when partial downloading is enforeced by closing data
* channel socket on our client side, the server side immediately closes control
* channel (socket). Our codes deal with such a bad behavior. (4) LIST is used
- * to obtain remote file attributes if possible. MDTM & SIZE would be nice, but
+ * to obtain remote file attributes if possible. MDTM &amp; SIZE would be nice, but
* not as ubiquitously implemented as LIST. (5) Avoid using ABOR in single
* thread? Do not use it at all.
*
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
index 3f3a7e8e98..9f3f9c5e35 100644
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
+++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
@@ -114,8 +114,8 @@ public void setKeepConnection(boolean keepConnection) {
*
* @param url
* Text containing the ftp url
- * @param datum
- * The CrawlDatum object corresponding to the url
+ * @param page
+ * {@link WebPage} object relative to the URL
*
* @return {@link ProtocolOutput} object for the url
*/
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
index afcf24aa39..92baf298e9 100644
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
+++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
@@ -97,7 +97,7 @@ public Socket createSocket(String host, int port, InetAddress clientHost,
* create a new socket within the given limit of time. If socket constructor
* does not return until the timeout expires, the controller terminates and
* throws an {@link ConnectTimeoutException}
- *
+ *
*
* @param host
* the host name/IP
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
index d4d7eba062..69abab75a0 100644
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
+++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
@@ -161,8 +161,8 @@ public static void main(String[] args) throws Exception {
*
* @param url
* URL to be fetched
- * @param datum
- * Crawl data
+ * @param page
+ * {@link WebPage} object relative to the URL
* @param redirect
* Follow redirects if and only if true
* @return HTTP response
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java
index a15f91be5a..cb09e697a1 100644
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java
+++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java
@@ -39,7 +39,7 @@
/**
* Implementation of RFC 2617 Basic Authentication. Usernames and passwords are
* stored in standard Nutch configuration files using the following properties:
- * http.auth.basic..user http.auth.basic..pass
+ * {@code http.auth.basic.<realm>.user} and {@code http.auth.basic.<realm>.pass}
*/
public class HttpBasicAuthentication implements HttpAuthentication,
Configurable {
@@ -128,7 +128,7 @@ public Configuration getConf() {
* Gets the Basic credentials generated by this HttpBasicAuthentication object
*
* @return Credentials in the form of
- * Authorization: Basic <Base64 encoded userid:password>
+ * {@code Authorization: Basic <Base64 encoded userid:password>}
*
*/
public List getCredentials() {
diff --git a/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java b/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java
index 9dc0c35823..5006266467 100644
--- a/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java
+++ b/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java
@@ -16,8 +16,7 @@
*/
/**
- * Scoring filter used in conjunction with
- * {@link org.apache.nutch.scoring.webgraph.WebGraph}.
+ * Scoring filter used in conjunction with the Nutch WebGraph.
*/
package org.apache.nutch.scoring.link;
diff --git a/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java b/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
index c3119226dc..1e07e6adc2 100644
--- a/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
+++ b/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
@@ -39,7 +39,7 @@
/**
* This plugin implements a variant of an Online Page Importance Computation
* (OPIC) score, described in this paper:
+ * http://www2003.org/cdrom/papers/refereed/p007/p7-abiteboul.html :
* Abiteboul, Serge and Preda, Mihai and Cobena, Gregory (2003), Adaptive
* On-Line Page Importance Computation .
*
diff --git a/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java b/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java
index 6c78df5c5f..c905411091 100644
--- a/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java
+++ b/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java
@@ -201,7 +201,7 @@ public Collection getAll() {
/**
* Save collections into file
*
- * @throws Exception
+ * @throws IOException
*/
public void save() throws IOException {
try {
diff --git a/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java b/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
index bf1ef4232f..5b7d5816cd 100644
--- a/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
+++ b/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
@@ -43,7 +43,6 @@
* Filters URLs based on a file containing domain suffixes, domain names, and
* hostnames. Only a url that matches one of the suffixes, domains, or hosts
* present in the file is allowed.
- *
*
*
* Urls are checked in order of domain suffix, domain name, and hostname against
@@ -61,18 +60,17 @@
* only urls from www.apache.org. There is no specific ordering to entries. The
* entries are from more general to more specific with the more general
* overridding the more specific.
- *
*
* The domain file defaults to domain-urlfilter.txt in the classpath but can be
* overridden using the:
*
*
- *
+ * -
* property "urlfilter.domain.file" in ./conf/nutch-*.xml, and
- *
- *
+ *
+ * -
* attribute "file" in plugin.xml of this plugin
- *
+ *
*
*
* the attribute "file" has higher precedence if defined.
@@ -114,7 +112,6 @@ public DomainURLFilter() {
* @param domainFile
* The domain file, overrides domain-urlfilter.text default.
*
- * @throws IOException
*/
public DomainURLFilter(String domainFile) {
this.domainFile = domainFile;
diff --git a/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/package-info.java b/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/package-info.java
index d2eba1f763..1ff46f69c6 100644
--- a/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/package-info.java
+++ b/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/package-info.java
@@ -18,8 +18,6 @@
/**
* URL filter plugin to include only URLs which match an element in a given list of
* domain suffixes, domain names, and/or host names.
- * See {@link org.apache.nutch.urlfilter.domainblacklist} for the counterpart
- * (exclude URLs by host or domain).
*/
package org.apache.nutch.urlfilter.domain;
diff --git a/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java b/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
index 366c11e46f..75ece12c48 100644
--- a/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
+++ b/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
@@ -49,7 +49,7 @@
*
*
* The format of this file is one URL prefix per line.
- *
+ *
*/
public class PrefixURLFilter implements URLFilter {
diff --git a/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java b/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
index 1a7492ab85..ccad47d4e8 100644
--- a/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
+++ b/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
@@ -68,22 +68,22 @@
* The format of this config file is one URL suffix per line, with no preceding
* whitespace. Order, in which suffixes are specified, doesn't matter. Blank
* lines and comments (#) are allowed.
- *
+ *
*
* A single '+' or '-' sign not followed by any suffix must be used once, to
* signify the mode this plugin operates in. An optional single 'I' can be
* appended, to signify that suffix matches should be case-insensitive. The
* default, if not specified, is to use case-sensitive matches, i.e. suffix
* '.JPG' does not match '.jpg'.
- *
+ *
*
* NOTE: the format of this file is different from urlfilter-prefix, because
* that plugin doesn't support allowed/prohibited prefixes (only supports
* allowed prefixes). Please note that this plugin does not support regular
* expressions, it only accepts literal suffixes. I.e. a suffix "+*.jpg" is most
* probably wrong, you should use "+.jpg" instead.
- *
- * Example 1
+ *
+ * Example 1
*
* The configuration shown below will accept all URLs with '.html' or '.htm'
* suffixes (case-sensitive - '.HTML' or '.HTM' will be rejected), and prohibit
@@ -101,8 +101,8 @@
* .htm
*
*
- *
- * Example 2
+ *
+ * Example 2
*
* The configuration shown below will accept all URLs except common graphical
* formats.
@@ -122,7 +122,7 @@
* .bmp
*
*
- *
+ *
*
* @author Andrzej Bialecki
*/
diff --git a/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java b/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java
index 9b84eae01e..7132a61b45 100644
--- a/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java
+++ b/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java
@@ -25,13 +25,13 @@
/**
*
* Validates URLs.
- *
+ *
*
*
* Originally based in on php script by Debbie Dyer, validation.php v1.2b, Date:
* 03/07/02, http://javascript.internet.com. However, this validation now bears
* little resemblance to the php original.
- *
+ *
*
*
* Example of usage:
@@ -47,7 +47,7 @@
*
*
* Based on UrlValidator code from Apache commons-validator.
- *
+ *
*
* @see Uniform Resource
* Identifiers (URI): Generic Syntax
@@ -159,7 +159,7 @@ public void setConf(Configuration conf) {
/**
*
* Checks if a field has a valid url address.
- *
+ *
*
* @param value
* The value validation is being performed on. A null
@@ -323,7 +323,7 @@ private boolean isValidAuthority(String authority) {
*
* Checks if the field isn't null and length of the field is greater than zero
* not including whitespace.
- *
+ *
*
* @param value
* The value validation is being performed on.
diff --git a/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java b/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
index d460d9e5ad..66f7a1b363 100644
--- a/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
+++ b/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
@@ -58,11 +58,11 @@
* This class uses the urlnormalizer.regex.file property. It should be
* set to the file name of an xml file which should contain the patterns and
* substitutions to be done on encountered URLs.
- *
+ *
*
* This class also supports different rules depending on the scope. Please see
* the javadoc in {@link org.apache.nutch.net.URLNormalizers} for more details.
- *
+ *
*
* @author Luke Baker
* @author Andrzej Bialecki
diff --git a/src/test/org/apache/nutch/api/AbstractNutchAPITestBase.java b/src/test/org/apache/nutch/api/AbstractNutchAPITestBase.java
index 2e4e61ce51..f7d73880ab 100644
--- a/src/test/org/apache/nutch/api/AbstractNutchAPITestBase.java
+++ b/src/test/org/apache/nutch/api/AbstractNutchAPITestBase.java
@@ -141,7 +141,7 @@ public void testRequest(int expectedStatusCode, int port, String username, Strin
*/
if (ChallengeScheme.HTTP_DIGEST.equals(challengeScheme)) {
- // User server's data to complete the challengeResponse object
+ // Use server's data to complete the challengeResponse object
ChallengeRequest digestChallengeRequest = retrieveDigestChallengeRequest(resource);
ChallengeResponse challengeResponse = new ChallengeResponse(digestChallengeRequest, resource.getResponse(),
username, password.toCharArray());
diff --git a/src/test/org/apache/nutch/crawl/TestGenerator.java b/src/test/org/apache/nutch/crawl/TestGenerator.java
index 95d1db3609..2345299003 100644
--- a/src/test/org/apache/nutch/crawl/TestGenerator.java
+++ b/src/test/org/apache/nutch/crawl/TestGenerator.java
@@ -42,7 +42,7 @@
*
- Generates entries to fetch
* - Verifies that number of generated urls match, and finally
* - Verifies that highest scoring urls are generated.
- *
+ *
*
*/
public class TestGenerator extends AbstractNutchTest {
diff --git a/src/test/org/apache/nutch/fetcher/TestFetcher.java b/src/test/org/apache/nutch/fetcher/TestFetcher.java
index 8a8fa42154..9d62d50a6d 100644
--- a/src/test/org/apache/nutch/fetcher/TestFetcher.java
+++ b/src/test/org/apache/nutch/fetcher/TestFetcher.java
@@ -161,13 +161,15 @@ public void testFetch() throws Exception {
/**
* Tests a refetch of a URL. This process consists of two consecutive
* inject, generate, fetch, parse then update cycles. The test configuration
- * is defined such that db.fetch.interval.default is set to
+ * is defined such that db.fetch.interval.default is set to
* a very low value (indicating that the URL should be fetched again immediately).
* In addition, configuration tests that relevant
* {@link org.apache.nutch.metadata.Metadata} is present and the values consistent
* and therefore not overwritten.
- * @see https://issues.apache.org/jira/browse/NUTCH-2222
+ *
* @throws Exception
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/NUTCH-2222">NUTCH-2222</a>
*/
@Test
public void testReFetch() throws Exception {
diff --git a/src/test/org/apache/nutch/util/CrawlTestUtil.java b/src/test/org/apache/nutch/util/CrawlTestUtil.java
index 5165b38a5a..046c51ec8a 100644
--- a/src/test/org/apache/nutch/util/CrawlTestUtil.java
+++ b/src/test/org/apache/nutch/util/CrawlTestUtil.java
@@ -71,7 +71,8 @@ public static Configuration createConfiguration() {
/**
* Generate seedlist
*
- * @see TestInjector
+ * @see org.apache.nutch.crawl.TestInjector
+ *
* @throws IOException
*/
public static void generateSeedList(FileSystem fs, Path urlPath,
@@ -148,11 +149,11 @@ public static Server getServer(int port, String staticContent)
}
/**
- * Generate Fetchlist.
+ * Generates Fetchlist
*
* @param numResults number of results to generate
* @param config Configuration to use
- * @return path to generated batch
+ *
* @throws IOException
*/
public static void generateFetchlist(int numResults, Configuration config,