From 0ea78907dee6b07058b66a99e395aea8cf623e92 Mon Sep 17 00:00:00 2001 From: Furkan KAMACI Date: Sun, 4 Sep 2016 00:53:31 +0300 Subject: [PATCH] NUTCH-2089 Nutch 2.x is moved to compile on JDK 8 --- .../org/apache/nutch/api/NutchServer.java | 23 +++-- .../apache/nutch/api/impl/RAMConfManager.java | 3 + .../nutch/crawl/AbstractFetchSchedule.java | 23 +++-- .../nutch/crawl/AdaptiveFetchSchedule.java | 6 +- .../org/apache/nutch/crawl/FetchSchedule.java | 25 ++---- .../org/apache/nutch/crawl/GeneratorJob.java | 20 ++++- .../apache/nutch/crawl/SignatureFactory.java | 7 +- .../nutch/crawl/TextProfileSignature.java | 2 +- .../org/apache/nutch/fetcher/FetcherJob.java | 12 ++- .../org/apache/nutch/indexer/IndexUtil.java | 4 +- .../org/apache/nutch/net/URLNormalizers.java | 16 ++-- .../apache/nutch/parse/NutchSitemapParse.java | 4 +- .../nutch/parse/ParsePluginsReader.java | 6 +- src/java/org/apache/nutch/parse/Parser.java | 2 +- .../org/apache/nutch/parse/ParserChecker.java | 6 +- .../apache/nutch/plugin/PluginRepository.java | 6 +- .../apache/nutch/scoring/ScoringFilter.java | 19 ++--- .../apache/nutch/storage/StorageUtils.java | 2 +- .../nutch/tools/arc/ArcRecordReader.java | 20 ++--- src/java/org/apache/nutch/util/Bytes.java | 12 +-- .../apache/nutch/util/EncodingDetector.java | 10 +-- src/java/org/apache/nutch/util/MimeUtil.java | 4 +- .../org/apache/nutch/util/NodeWalker.java | 10 +-- src/java/org/apache/nutch/util/NutchJob.java | 4 +- src/java/org/apache/nutch/util/NutchTool.java | 17 +++- .../nutch/util/PrefixStringMatcher.java | 8 +- .../nutch/util/SuffixStringMatcher.java | 8 +- src/java/org/apache/nutch/util/TableUtil.java | 4 +- .../org/apache/nutch/util/TimingUtil.java | 2 +- .../apache/nutch/util/TrieStringMatcher.java | 8 +- src/java/org/apache/nutch/util/URLUtil.java | 83 +++++++++---------- .../nutch/util/domain/DomainSuffix.java | 5 +- .../nutch/util/domain/TopLevelDomain.java | 4 +- .../apache/nutch/parse/feed/FeedParser.java | 2 +- .../indexer/anchor/AnchorIndexingFilter.java | 2 +- .../indexer/metadata/MetadataIndexer.java | 2 +- .../indexer/more/MoreIndexingFilter.java | 4 +- .../analysis/lang/HTMLLanguageParser.java | 15 +++- .../urlfilter/api/RegexURLFilterBase.java | 12 +-- .../apache/nutch/parse/html/DOMBuilder.java | 50 +++++------ .../nutch/parse/html/DOMContentUtils.java | 2 +- .../parse/html/XMLCharacterRecognizer.java | 2 +- .../apache/nutch/parse/js/JSParseFilter.java | 6 +- .../org/apache/nutch/parse/swf/SWFParser.java | 2 +- .../apache/nutch/parse/tika/DOMBuilder.java | 50 +++++------ .../parse/tika/XMLCharacterRecognizer.java | 2 +- .../nutch/parse/tika/TestRSSParser.java | 2 +- .../org/apache/nutch/protocol/file/File.java | 4 +- .../nutch/protocol/file/TestProtocolFile.java | 2 +- .../org/apache/nutch/protocol/ftp/Client.java | 2 +- .../org/apache/nutch/protocol/ftp/Ftp.java | 4 +- .../DummySSLProtocolSocketFactory.java | 2 +- .../nutch/protocol/httpclient/Http.java | 4 +- .../httpclient/HttpBasicAuthentication.java | 4 +- .../nutch/scoring/link/package-info.java | 3 +- .../nutch/scoring/opic/OPICScoringFilter.java | 2 +- .../nutch/collection/CollectionManager.java | 2 +- .../urlfilter/domain/DomainURLFilter.java | 11 +-- .../nutch/urlfilter/domain/package-info.java | 2 - .../urlfilter/prefix/PrefixURLFilter.java | 2 +- .../urlfilter/suffix/SuffixURLFilter.java | 14 ++-- .../urlfilter/validator/UrlValidator.java | 10 +-- .../regex/RegexURLNormalizer.java | 4 +- .../nutch/api/AbstractNutchAPITestBase.java | 2 +- .../org/apache/nutch/crawl/TestGenerator.java | 2 +- 
.../org/apache/nutch/fetcher/TestFetcher.java | 6 +- .../org/apache/nutch/util/CrawlTestUtil.java | 7 +- 67 files changed, 318 insertions(+), 309 deletions(-) diff --git a/src/java/org/apache/nutch/api/NutchServer.java b/src/java/org/apache/nutch/api/NutchServer.java index 802bbef7a6..5118497f12 100644 --- a/src/java/org/apache/nutch/api/NutchServer.java +++ b/src/java/org/apache/nutch/api/NutchServer.java @@ -98,9 +98,9 @@ public class NutchServer extends Application { * 'INFO' however best attempts should always be made to specify a logging * level.<br> * {@link org.apache.nutch.api.NutchServer} can be run as secure. restapi.auth property - * should be set to BASIC, DIGEST or SSL at <code>nutch-site.xml</code> to enable HTTP basic authentication, + * should be set to BASIC, DIGEST or SSL at nutch-site.xml to enable HTTP basic authentication, * digest authentication or SSL when communicating with RESTAPI. - * Set restapi.auth.username and restapi.auth.password properties at <code>nutch-site.xml</code> to configure + * Set restapi.auth.username and restapi.auth.password properties at nutch-site.xml to configure * credentials when BASIC or DIGEST authentication is used. * Set restapi.auth.ssl.storepath, restapi.auth.ssl.storepass and restapi.auth.ssl.keypass when SSL is used. * @@ -117,12 +117,14 @@ public NutchServer() { * 'INFO' however best attempts should always be made to specify a logging * level.<br> * {@link org.apache.nutch.api.NutchServer} can be run as secure. restapi.auth property - * should be set to BASIC, DIGEST or SSL at <code>nutch-site.xml</code> to enable HTTP basic authentication, + * should be set to BASIC, DIGEST or SSL at nutch-site.xml to enable HTTP basic authentication, * digest authentication or SSL when communicating with RESTAPI. - * Set restapi.auth.username and restapi.auth.password properties at <code>nutch-site.xml</code> to configure + * Set restapi.auth.username and restapi.auth.password properties at nutch-site.xml to configure * credentials when BASIC or DIGEST authentication is used. * Set restapi.auth.ssl.storepath, restapi.auth.ssl.storepass and restapi.auth.ssl.keypass when SSL is used. * + * @param ramConfManager {@link RAMConfManager} + * * @see org.apache.nutch.api.security.AuthenticationTypeEnum */ public NutchServer(RAMConfManager ramConfManager) { @@ -137,12 +139,15 @@ public NutchServer(RAMConfManager ramConfManager) { * 'INFO' however best attempts should always be made to specify a logging * level.<br> * {@link org.apache.nutch.api.NutchServer} can be run as secure. restapi.auth property - * should be set to BASIC, DIGEST or SSL at <code>nutch-site.xml</code> to enable HTTP basic authentication, + * should be set to BASIC, DIGEST or SSL at nutch-site.xml to enable HTTP basic authentication, * digest authentication or SSL when communicating with RESTAPI. - * Set restapi.auth.username and restapi.auth.password properties at <code>nutch-site.xml</code> to configure + * Set restapi.auth.username and restapi.auth.password properties at nutch-site.xml to configure * credentials when BASIC or DIGEST authentication is used. * Set restapi.auth.ssl.storepath, restapi.auth.ssl.storepass and restapi.auth.ssl.keypass when SSL is used. 
* + * @param ramConfManager {@link RAMConfManager} + * @param confId active configuration id + * * @see org.apache.nutch.api.security.AuthenticationTypeEnum */ public NutchServer(RAMConfManager ramConfManager, String confId) { @@ -305,7 +310,7 @@ public void start() { /** * Safety and convenience method to determine whether or not it is safe to * shut down the server. We make this assertion by consulting the - * {@link org.apache.nutch.api.NutchApp#jobManager} for a list of jobs with + * {@link #getJobMgr()} for a list of jobs with * {@link org.apache.nutch.api.model.response.JobInfo#state} equal to * 'RUNNING'. * @@ -356,8 +361,8 @@ public boolean stop(boolean force) { /** * Main method for NutchServer to run via command line. * - * @param args arguments for log level, stopping the Server and port. - * @throws Exception + * @param args arguments for log level, stopping the Server and port. + * @throws Exception exception */ public static void main(String[] args) throws Exception { CommandLineParser parser = new PosixParser(); diff --git a/src/java/org/apache/nutch/api/impl/RAMConfManager.java b/src/java/org/apache/nutch/api/impl/RAMConfManager.java index 13c05fd285..356a8bdd25 100644 --- a/src/java/org/apache/nutch/api/impl/RAMConfManager.java +++ b/src/java/org/apache/nutch/api/impl/RAMConfManager.java @@ -50,6 +50,9 @@ public RAMConfManager() { /** * Public constructor which accepts a configuration id and {@link Configuration} type configuration. + * + * @param confId configuration id + * @param configuration configuration */ public RAMConfManager(String confId, Configuration configuration) { configurations.put(confId, configuration); diff --git a/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java b/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java index 045f4cd777..8070c7b376 100755 --- a/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java +++ b/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java @@ -76,6 +76,7 @@ public void setConf(Configuration conf) { * @param url * URL of the page. * @param page + * {@link WebPage} object relative to the URL */ @Override public void initializeSchedule(String url, WebPage page) { @@ -104,13 +105,7 @@ public void setFetchSchedule(String url, WebPage page, long prevFetchTime, * @param url * URL of the page * @param page - * @return adjusted page information, including all original information. - * NOTE: this may be a different instance than - * @param datum - * , but implementations should make sure that it contains at least - * all information from - * @param datum - * . + * {@link WebPage} object relative to the URL */ @Override public void setPageGoneSchedule(String url, WebPage page, long prevFetchTime, @@ -134,6 +129,7 @@ public void setPageGoneSchedule(String url, WebPage page, long prevFetchTime, * @param url * URL of the page * @param page + * {@link WebPage} object relative to the URL * @param prevFetchTime * previous fetch time * @param prevModifiedTime @@ -163,15 +159,15 @@ public long calculateLastFetchTime(WebPage page) { * in the current fetchlist. NOTE: a true return value does not guarantee that * the page will be fetched, it just allows it to be included in the further * selection process based on scores. The default implementation checks - * fetchTime, if it is higher than the - * - * @param curTime - * it returns false, and true otherwise. 
It will also check that - * fetchTime is not too remote (more than maxIntervalfetchTime, if it is higher than the current time + * it returns false, and true otherwise. It will also check that + * fetchTime is not too remote (more than maxInterval), + * in which case it lowers the interval and returns true. + * * @param url * URL of the page * @param page + * {@link WebPage} object relative to the URL * @param curTime * reference time (usually set to the time when the fetchlist * generation process was started). @@ -200,6 +196,7 @@ public boolean shouldFetch(String url, WebPage page, long curTime) { * @param url * URL of the page * @param page + * {@link WebPage} object relative to the URL * @param asap * if true, force refetch as soon as possible - this sets the * fetchTime to now. If false, force refetch whenever the next fetch diff --git a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java index 1c2780a224..30c6ec7375 100755 --- a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java +++ b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java @@ -46,10 +46,8 @@ *

* NOTE: values of DEC_FACTOR and INC_FACTOR higher than 0.4f may destabilize * the algorithm, so that the fetch interval either increases or decreases - * infinitely, with little relevance to the page changes. Please use - * {@link #main(String[])} method to test the values before applying them in a - * production system. - *

+ * infinitely, with little relevance to the page changes. + * * * @author Andrzej Bialecki */ diff --git a/src/java/org/apache/nutch/crawl/FetchSchedule.java b/src/java/org/apache/nutch/crawl/FetchSchedule.java index eb896a6aec..8219a61f80 100755 --- a/src/java/org/apache/nutch/crawl/FetchSchedule.java +++ b/src/java/org/apache/nutch/crawl/FetchSchedule.java @@ -49,6 +49,7 @@ public interface FetchSchedule extends Configurable { * @param url * URL of the page. * @param page + * {@link WebPage} object relative to the URL */ public void initializeSchedule(String url, WebPage page); @@ -60,21 +61,16 @@ public interface FetchSchedule extends Configurable { * @param url * url of the page * @param page + * {@link WebPage} object relative to the URL * @param prevFetchTime * previous value of fetch time, or -1 if not available * @param prevModifiedTime * previous value of modifiedTime, or -1 if not available * @param fetchTime - * the latest time, when the page was recently re-fetched. Most - * FetchSchedule implementations should update the value in - * @param datum - * to something greater than this value. + * the latest time, when the page was recently re-fetched. * @param modifiedTime * last time the content was modified. This information comes from - * the protocol implementations, or is set to < 0 if not available. - * Most FetchSchedule implementations should update the value in - * @param datum - * to this value. + * the protocol implementations, or is set to < 0 if not available. * @param state * if {@link #STATUS_MODIFIED}, then the content is considered to be * "changed" before the fetchTime, if @@ -90,13 +86,10 @@ public void setFetchSchedule(String url, WebPage page, long prevFetchTime, /** * This method specifies how to schedule refetching of pages marked as GONE. - * Default implementation increases fetchInterval by 50%, and if it exceeds - * the maxInterval it calls - * {@link #forceRefetch(Text, CrawlDatum, boolean)}. - * * @param url * URL of the page * @param page + * {@link WebPage} object relative to the URL */ public void setPageGoneSchedule(String url, WebPage page, long prevFetchTime, long prevModifiedTime, long fetchTime); @@ -109,6 +102,7 @@ public void setPageGoneSchedule(String url, WebPage page, long prevFetchTime, * @param url * URL of the page * @param page + * {@link WebPage} object relative to the URL * @param prevFetchTime * previous fetch time * @param prevModifiedTime @@ -133,14 +127,8 @@ public void setPageRetrySchedule(String url, WebPage page, * selection process based on scores. The default implementation checks * fetchTime, if it is higher than the * - * @param curTime - * it returns false, and true otherwise. 
It will also check that - * fetchTime is not too remote (more than maxInterval getFields(Job job) { return fields; } - /** Generate a random batch id */ + /** + * Generates a random batch id + * + * @return random batch id + */ public static String randomBatchId() { long curTime = System.currentTimeMillis(); int randomSeed = Math.abs(new Random().nextInt()); @@ -173,6 +177,13 @@ public static String randomBatchId() { return batchId; } + /** + * Runs generator + * + * @param args map of arguments + * @return results + * @throws Exception + */ public Map run(Map args) throws Exception { String batchId = (String) args.get(Nutch.ARG_BATCH); if (batchId == null) { @@ -290,6 +301,13 @@ public String generate(long topN, long curTime, boolean filter, boolean norm, return batchId; } + /** + * Runs generator from commandline + * + * @param args arguments + * @return returns -1 + * @throws Exception + */ public int run(String[] args) throws Exception { if (args.length <= 0) { System.out diff --git a/src/java/org/apache/nutch/crawl/SignatureFactory.java b/src/java/org/apache/nutch/crawl/SignatureFactory.java index 15776340cb..8cf7471a26 100644 --- a/src/java/org/apache/nutch/crawl/SignatureFactory.java +++ b/src/java/org/apache/nutch/crawl/SignatureFactory.java @@ -40,7 +40,12 @@ public class SignatureFactory { private SignatureFactory() { } // no public ctor - /** Return the default Signature implementation. */ + /** + * Returns the default {@link Signature} implementation + * + * @param conf configuration + * @return default {@link Signature} implementation + */ public static Signature getSignature(Configuration conf) { String clazz = conf.get("db.signature.class", MD5Signature.class.getName()); ObjectCache objectCache = ObjectCache.get(conf); diff --git a/src/java/org/apache/nutch/crawl/TextProfileSignature.java b/src/java/org/apache/nutch/crawl/TextProfileSignature.java index 6d7e5e0163..f797b10a62 100644 --- a/src/java/org/apache/nutch/crawl/TextProfileSignature.java +++ b/src/java/org/apache/nutch/crawl/TextProfileSignature.java @@ -33,7 +33,7 @@ * An implementation of a page signature. It calculates an MD5 hash of a plain * text "profile" of a page. In case there is no text, it calculates a hash * using the {@link MD5Signature}. - *

+ * *

* The algorithm to calculate a page "profile" takes the plain text version of a * page and performs the following steps: diff --git a/src/java/org/apache/nutch/fetcher/FetcherJob.java b/src/java/org/apache/nutch/fetcher/FetcherJob.java index a7f3df8efc..015c209da3 100644 --- a/src/java/org/apache/nutch/fetcher/FetcherJob.java +++ b/src/java/org/apache/nutch/fetcher/FetcherJob.java @@ -79,19 +79,17 @@ public class FetcherJob extends NutchTool implements Tool { /** *

* Mapper class for Fetcher. - *

+ * *

* This class reads the random integer written by {@link GeneratorJob} as its * key while outputting the actual key and value arguments through a * {@link FetchEntry} instance. - *

+ * *

- * This approach (combined with the use of {@link PartitionUrlByHost}) makes - * sure that Fetcher is still polite while also randomizing the key order. If * one host has a huge number of URLs in your table while other hosts have * not, {@link FetcherReducer} will not be stuck on one host but process URLs * from other hosts as well. - *

+ * */ public static class FetcherMapper extends GoraMapper { @@ -246,7 +244,7 @@ private MapFieldValueFilter getBatchIdFilter(String batchId) { * number of threads per map task * @param shouldResume * @param numTasks - * number of fetching tasks (reducers). If set to < 1 then use the + * number of fetching tasks (reducers). If set to < 1 then use the * default, which is mapred.map.tasks. * @return 0 on success * @throws Exception @@ -266,7 +264,7 @@ public int fetch(String batchId, int threads, boolean shouldResume, * number of threads per map task * @param shouldResume * @param numTasks - * number of fetching tasks (reducers). If set to < 1 then use the + * number of fetching tasks (reducers). If set to < 1 then use the * default, which is mapred.map.tasks. * @param stmDetect * If set true, sitemap detection is run. diff --git a/src/java/org/apache/nutch/indexer/IndexUtil.java b/src/java/org/apache/nutch/indexer/IndexUtil.java index 6d1238228e..ddb6f0d650 100644 --- a/src/java/org/apache/nutch/indexer/IndexUtil.java +++ b/src/java/org/apache/nutch/indexer/IndexUtil.java @@ -42,7 +42,7 @@ public IndexUtil(Configuration conf) { } /** - * Index a {@link Webpage}, here we add the following fields: + * Index a {@link WebPage}, here we add the following fields: *
     *
     * 1. id: default uniqueKey for the {@link NutchDocument}.
     *
  3. digest: Digest is used to identify pages (like unique ID) and @@ -60,7 +60,7 @@ public IndexUtil(Configuration conf) { * @param key * The key of the page (reversed url). * @param page - * The {@link Webpage}. + * The {@link WebPage}. * @return The indexed document, or null if skipped by index filters. */ public NutchDocument index(String key, WebPage page) { diff --git a/src/java/org/apache/nutch/net/URLNormalizers.java b/src/java/org/apache/nutch/net/URLNormalizers.java index 03a0b855ca..1fc1df849e 100644 --- a/src/java/org/apache/nutch/net/URLNormalizers.java +++ b/src/java/org/apache/nutch/net/URLNormalizers.java @@ -51,30 +51,30 @@ * order). If there are more normalizers activated than explicitly named on this * list, the remaining ones will be run in random order after the ones specified * on the list are executed. - *

    + * *

    * You can define a set of contexts (or scopes) in which normalizers may be * called. Each scope can have its own list of normalizers (defined in - * "urlnormalizer.scope." property) and its own order (defined in - * "urlnormalizer.order." property). If any of these properties are + * "urlnormalizer.scope.<scope_name>" property) and its own order (defined in + * "urlnormalizer.order.<scope_name>" property). If any of these properties are * missing, default settings are used for the global scope. - *

    + * *

    * In case no normalizers are required for any given scope, a * org.apache.nutch.net.urlnormalizer.pass.PassURLNormalizer should * be used. - *

    + * *

    * Each normalizer may further select among many configurations, depending on * the scope in which it is called, because the scope name is passed as a * parameter to each normalizer. You can also use the same normalizer for many * scopes. - *

    + * *

    * Several scopes have been defined, and various Nutch tools will attempt using * scope-specific normalizers first (and fall back to default config if * scope-specific configuration is missing). - *

    + * *

    * Normalizers may be run several times, to ensure that modifications introduced * by normalizers at the end of the list can be further reduced by normalizers @@ -83,7 +83,7 @@ * want to run this loop up to the number of activated normalizers. This loop * count can be configured through urlnormalizer.loop.count property. * As soon as the url is unchanged the loop will stop and return the result. - *

    + * * * @author Andrzej Bialecki */ diff --git a/src/java/org/apache/nutch/parse/NutchSitemapParse.java b/src/java/org/apache/nutch/parse/NutchSitemapParse.java index c0a9d9b6ca..0e57339ba3 100644 --- a/src/java/org/apache/nutch/parse/NutchSitemapParse.java +++ b/src/java/org/apache/nutch/parse/NutchSitemapParse.java @@ -6,9 +6,9 @@ * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at - *

    + *

    * http://www.apache.org/licenses/LICENSE-2.0 - *

    + *

    * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. diff --git a/src/java/org/apache/nutch/parse/ParsePluginsReader.java b/src/java/org/apache/nutch/parse/ParsePluginsReader.java index dddd025163..b4c6f4e810 100644 --- a/src/java/org/apache/nutch/parse/ParsePluginsReader.java +++ b/src/java/org/apache/nutch/parse/ParsePluginsReader.java @@ -69,12 +69,10 @@ public ParsePluginsReader() { /** * Reads the parse-plugins.xml file and returns the - * {@link #ParsePluginList} defined by it. + * {@link ParsePluginList} defined by it. * - * @return A {@link #ParsePluginList} specified by the + * @return A {@link ParsePluginList} specified by the * parse-plugins.xml file. - * @throws Exception - * If any parsing error occurs. */ public ParsePluginList parse(Configuration conf) { diff --git a/src/java/org/apache/nutch/parse/Parser.java b/src/java/org/apache/nutch/parse/Parser.java index b623fd0262..9a8c2b7bf0 100644 --- a/src/java/org/apache/nutch/parse/Parser.java +++ b/src/java/org/apache/nutch/parse/Parser.java @@ -34,7 +34,7 @@ public interface Parser extends FieldPluggable, Configurable { /** *

 * This method parses content in a WebPage instance - *

    + * * * @param url * Page's URL diff --git a/src/java/org/apache/nutch/parse/ParserChecker.java b/src/java/org/apache/nutch/parse/ParserChecker.java index 12faeae90a..4d5c572968 100644 --- a/src/java/org/apache/nutch/parse/ParserChecker.java +++ b/src/java/org/apache/nutch/parse/ParserChecker.java @@ -48,13 +48,13 @@ * is used to remove duplicates during the dedup procedure. It is calculated * using {@link org.apache.nutch.crawl.MD5Signature} or * {@link org.apache.nutch.crawl.TextProfileSignature}.
- * Version: From {@link org.apache.nutch.parse.ParseData}.
- * Status: From {@link org.apache.nutch.parse.ParseData}.
+ * Version: From org.apache.nutch.parse.ParseData.
+ * Status: From org.apache.nutch.parse.ParseData.
  * Title: of the URL
  * Outlinks: associated with the URL
  * Content Metadata: such as X-AspNet-Version, Date,
  * Content-length, servedBy, Content-Type,
  * Cache-Control, etc.
  * Parse Metadata: such as CharEncodingForConversion,
  * OriginalCharEncoding, language, etc.
  *
  21. ParseText: The page parse text which varies in length depdnecing diff --git a/src/java/org/apache/nutch/plugin/PluginRepository.java b/src/java/org/apache/nutch/plugin/PluginRepository.java index 346ae310de..6486f63f53 100644 --- a/src/java/org/apache/nutch/plugin/PluginRepository.java +++ b/src/java/org/apache/nutch/plugin/PluginRepository.java @@ -59,8 +59,10 @@ public class PluginRepository { .getLogger(PluginRepository.class); /** - * @throws PluginRuntimeException - * @see java.lang.Object#Object() + * Pluging repository constructor + * + * @param conf Configuration + * @throws RuntimeException */ public PluginRepository(Configuration conf) throws RuntimeException { fActivatedPlugins = new HashMap(); diff --git a/src/java/org/apache/nutch/scoring/ScoringFilter.java b/src/java/org/apache/nutch/scoring/ScoringFilter.java index 8c06ef6cb0..17c6350fc8 100644 --- a/src/java/org/apache/nutch/scoring/ScoringFilter.java +++ b/src/java/org/apache/nutch/scoring/ScoringFilter.java @@ -72,8 +72,8 @@ public void initialScore(String url, WebPage page) * * @param url * url of the page - * @param datum - * page row. Modifications will be persisted. + * @param page + * {@link WebPage} object relative to the URL * @param initSort * initial sort value, or a value from previous filters in chain */ @@ -85,13 +85,8 @@ public float generatorSortValue(String url, WebPage page, float initSort) * * @param fromUrl * url of the source page - * @param row - * page row * @param scoreData - * A list of {@link OutlinkedScoreDatum}s for every outlink. These - * {@link OutlinkedScoreDatum}s will be passed to - * {@link #updateScore(String, OldWebTableRow, List)} for every - * outlinked URL. + * A list of {@link ScoreDatum} * @param allCount * number of all collected outlinks from the source page * @throws ScoringFilterException @@ -106,9 +101,9 @@ public void distributeScoreToOutlinks(String fromUrl, WebPage page, * * @param url * url of the page - * @param page - * @param inlinked - * list of {@link OutlinkedScoreDatum}s for all inlinks pointing to + * @param page {@link WebPage} object relative to the URL + * @param inlinkedScoreData + * list of {@link ScoreDatum}s for all inlinks pointing to * this URL. * @throws ScoringFilterException */ @@ -124,8 +119,6 @@ public void updateScore(String url, WebPage page, * document. NOTE: this already contains all information collected by * indexing filters. Implementations may modify this instance, in * order to store/remove some information. - * @param row - * page row * @param initScore * initial boost value for the Lucene document. * @return boost value for the Lucene document. This value is passed as an diff --git a/src/java/org/apache/nutch/storage/StorageUtils.java b/src/java/org/apache/nutch/storage/StorageUtils.java index b68e8f84fe..e82a3c505b 100644 --- a/src/java/org/apache/nutch/storage/StorageUtils.java +++ b/src/java/org/apache/nutch/storage/StorageUtils.java @@ -82,7 +82,7 @@ public static DataStore createWebStore( /** * Return the Persistent Gora class used to persist Nutch Web data. * - * @param the + * @param conf * Nutch configuration * @return the Gora DataStore persistent class * @throws ClassNotFoundException diff --git a/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java b/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java index d3f9799be4..7f36b52d81 100644 --- a/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java +++ b/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java @@ -38,21 +38,21 @@ *

 * The ArcRecordReader class provides a record reader which reads * records from arc files. - *

    + * * *

    * Arc files are essentially tars of gzips. Each record in an arc file is a * compressed gzip. Multiple records are concatenated together to form a - * complete arc. For more information on the arc file format see {@link http - * ://www.archive.org/web/researcher/ArcFileFormat.php}. - *

    - * + * complete arc. For more information on the arc file format see + * + * http://www.archive.org/web/researcher/ArcFileFormat.php. + * *

    * Arc files are used by the internet archive and grub projects. - *

    + * * - * @see http://www.archive.org/ - * @see http://www.grub.org/ + * @see http://www.archive.org/ + * @see http://www.grub.org/ */ public class ArcRecordReader implements RecordReader { @@ -72,7 +72,7 @@ public class ArcRecordReader implements RecordReader { /** *

    * Returns true if the byte array passed matches the gzip header magic number. - *

    + * * * @param input * The byte array to check. @@ -174,7 +174,7 @@ public float getProgress() throws IOException { * Returns true if the next record in the split is read into the key and value * pair. The key will be the arc record header and the values will be the raw * content bytes of the arc record. - *

    + * * * @param key * The record key diff --git a/src/java/org/apache/nutch/util/Bytes.java b/src/java/org/apache/nutch/util/Bytes.java index db9f4689c9..043a89761d 100644 --- a/src/java/org/apache/nutch/util/Bytes.java +++ b/src/java/org/apache/nutch/util/Bytes.java @@ -980,7 +980,7 @@ public static long readVLong(final byte[] buffer, final int offset) * left operand * @param right * right operand - * @return 0 if equal, < 0 if left is less than right, etc. + * @return 0 if equal, < 0 if left is less than right, etc. */ public static int compareTo(final byte[] left, final byte[] right) { return compareTo(left, 0, left.length, right, 0, right.length); @@ -1001,7 +1001,7 @@ public static int compareTo(final byte[] left, final byte[] right) { * How much to compare from the left buffer * @param length2 * How much to compare from the right buffer - * @return 0 if equal, < 0 if left is less than right, etc. + * @return 0 if equal, < 0 if left is less than right, etc. */ public static int compareTo(byte[] buffer1, int offset1, int length1, byte[] buffer2, int offset2, int length2) { @@ -1050,7 +1050,7 @@ public static boolean startsWith(byte[] bytes, byte[] prefix) { * @return Runs {@link WritableComparator#hashBytes(byte[], int)} on the * passed in array. This method is what * {@link org.apache.hadoop.io.Text} and - * {@link ImmutableBytesWritable} use calculating hash code. + * org.apache.hadoop.hbase.io.ImmutableBytesWritable use calculating hash code. */ public static int hashCode(final byte[] b) { return hashCode(b, b.length); @@ -1064,7 +1064,7 @@ public static int hashCode(final byte[] b) { * @return Runs {@link WritableComparator#hashBytes(byte[], int)} on the * passed in array. This method is what * {@link org.apache.hadoop.io.Text} and - * {@link ImmutableBytesWritable} use calculating hash code. + * org.apache.hadoop.hbase.io.ImmutableBytesWritable use calculating hash code. */ public static int hashCode(final byte[] b, final int length) { return WritableComparator.hashBytes(b, length); @@ -1366,12 +1366,12 @@ else if (cmp < 0) * given amount. * * @param value - * - array of bytes containing long (length <= SIZEOF_LONG) + * - array of bytes containing long (length <= SIZEOF_LONG) * @param amount * value will be incremented on (deincremented if negative) * @return array of bytes containing incremented long (length == SIZEOF_LONG) * @throws IOException - * - if value.length > SIZEOF_LONG + * - if value.length > SIZEOF_LONG */ public static byte[] incrementBytes(byte[] value, long amount) throws IOException { diff --git a/src/java/org/apache/nutch/util/EncodingDetector.java b/src/java/org/apache/nutch/util/EncodingDetector.java index 5b40e29e5d..25f8eefccd 100644 --- a/src/java/org/apache/nutch/util/EncodingDetector.java +++ b/src/java/org/apache/nutch/util/EncodingDetector.java @@ -45,7 +45,7 @@ *
 * Taking a set of clues and making a "best guess" as to the "real"
 * encoding.
- *

+ * * *

* A caller will often have some extra information about what the encoding might @@ -56,7 +56,7 @@ *

 * Run step (1) to generate a set of auto-detected clues;
 * Combine these clues with the caller-dependent "extra clues" available;
 * Run step (2) to guess what the most probable answer is.
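// Illustrative sketch, not part of this patch: the two-step flow described above.
// EncodingDetector(Configuration), autoDetectClues(page, filter) and
// guessEncoding(page, defaultValue) are assumed names inferred from the
// surrounding hunks; only addClue(String, String) is visible verbatim here.
EncodingDetector detector = new EncodingDetector(conf);         // conf: a Hadoop Configuration
detector.autoDetectClues(page, true);                           // step (1): clues sniffed from the fetched content
detector.addClue("utf-8", "header");                            // caller-supplied "extra clue", e.g. from a Content-Type header
String encoding = detector.guessEncoding(page, "windows-1252"); // step (2): best guess, falling back to the default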

    + * */ public class EncodingDetector { @@ -211,9 +211,7 @@ public void addClue(String value, String source) { /** * Guess the encoding with the previously specified list of clues. - * - * @param row - * URL's row + * * @param defaultValue * Default encoding to return if no encoding can be detected with * enough confidence. Note that this will not be normalized @@ -340,7 +338,7 @@ public static String resolveEncodingAlias(String encoding) { /** * Parse the character encoding from the specified content type header. If the * content type is null, or there is no explicit character encoding, - * null is returned.
    + * null is returned.

    * This method was copied from org.apache.catalina.util.RequestUtil, which is * licensed under the Apache License, Version 2.0 (the "License"). * diff --git a/src/java/org/apache/nutch/util/MimeUtil.java b/src/java/org/apache/nutch/util/MimeUtil.java index 198fdee596..241087c91a 100644 --- a/src/java/org/apache/nutch/util/MimeUtil.java +++ b/src/java/org/apache/nutch/util/MimeUtil.java @@ -50,7 +50,7 @@ * substrate library, Apache * Tika. Any mime handling code should be placed in this utility * class, and hidden from the Nutch classes that rely on it. - *

    + * */ public final class MimeUtil { @@ -229,7 +229,7 @@ public String autoResolveContentType(String typeName, String url, byte[] data) { * method. * * @param url - * A string representation of the document {@link URL} to sense the + * A string representation of the document. URL to sense the * {@link MimeType} for. * @return An appropriate {@link MimeType}, identified from the given Document * url in string form. diff --git a/src/java/org/apache/nutch/util/NodeWalker.java b/src/java/org/apache/nutch/util/NodeWalker.java index 16e84c3598..3e0b0e1827 100644 --- a/src/java/org/apache/nutch/util/NodeWalker.java +++ b/src/java/org/apache/nutch/util/NodeWalker.java @@ -27,12 +27,12 @@ * of recursion. As the node tree is walked the next node is popped off of the * stack and all of its children are automatically added to the stack to be * called in tree order. - *

    + * * *

    * Currently this class is not thread safe. It is assumed that only one thread * will be accessing the NodeWalker at any given time. - *

    + * */ public class NodeWalker { @@ -58,7 +58,7 @@ public NodeWalker(Node rootNode) { * children onto the stack, allowing us to walk the node tree without the use * of recursion. If there are no more nodes on the stack then null is * returned. - *

    + * * * @return Node The next Node on the stack or null if there isn't * a next node. @@ -90,12 +90,12 @@ public Node nextNode() { * When getting a next node from the walker, that node's children are * automatically added to the stack. You can call this method to remove those * children from the stack. - *

    + * * *

    * This is useful when you don't want to process deeper into the current path * of the node tree but you want to continue processing sibling nodes. - *
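// Illustrative sketch, not part of this patch: a typical traversal with NodeWalker,
// relying only on the behaviour documented above (nextNode() returns null once the
// stack is exhausted, skipChildren() prunes the current node's subtree).
org.w3c.dom.Node root = doc.getDocumentElement();   // doc: an org.w3c.dom.Document parsed elsewhere
NodeWalker walker = new NodeWalker(root);
org.w3c.dom.Node current;
while ((current = walker.nextNode()) != null) {
  if ("script".equalsIgnoreCase(current.getNodeName())) {
    walker.skipChildren();                          // continue with siblings, not <script> children
    continue;
  }
  // process 'current' here
}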

    + * * */ public void skipChildren() { diff --git a/src/java/org/apache/nutch/util/NutchJob.java b/src/java/org/apache/nutch/util/NutchJob.java index c0456c150c..029e7aece7 100644 --- a/src/java/org/apache/nutch/util/NutchJob.java +++ b/src/java/org/apache/nutch/util/NutchJob.java @@ -63,7 +63,7 @@ public NutchJob(Configuration conf, String jobName) throws IOException { * Creates a new {@link NutchJob} with no particular {@link org.apache.hadoop.mapreduce.Cluster} and a * given {@link org.apache.hadoop.conf.Configuration}. * - * The NutchJob makes a copy of the Configuration so + * The NutchJob makes a copy of the Configuration so * that any necessary internal modifications do not reflect on the incoming * parameter. * @@ -87,7 +87,7 @@ public static NutchJob getInstance(Configuration conf) throws IOException { * and a given jobName. * A Cluster will be created from the conf parameter only when it's needed. * - * The NutchJob makes a copy of the Configuration so + * The NutchJob makes a copy of the Configuration so * that any necessary internal modifications do not reflect on the incoming * parameter. * diff --git a/src/java/org/apache/nutch/util/NutchTool.java b/src/java/org/apache/nutch/util/NutchTool.java index 1f5789a608..443d1da681 100644 --- a/src/java/org/apache/nutch/util/NutchTool.java +++ b/src/java/org/apache/nutch/util/NutchTool.java @@ -36,11 +36,19 @@ public abstract class NutchTool extends Configured { /** * Runs the tool, using a map of arguments. May return results, or null. + * + * @param args map of arguments + * @return results or null + * @throws Exception */ public abstract Map run(Map args) throws Exception; - /** Returns relative progress of the tool, a float in range [0,1]. */ + /** + * Returns relative progress of the tool, a float in range [0,1] + * + * @return relative progress of the tool, a float in range [0,1] + */ public float getProgress() { float res = 0; if (currentJob != null) { @@ -62,7 +70,11 @@ public float getProgress() { return res; } - /** Returns current status of the running tool. */ + /** + * Returns current status of the running tool + * + * @return current status of the running tool + */ public Map getStatus() { return status; } @@ -72,6 +84,7 @@ public Map getStatus() { * this, since by default it calls {@link #killJob()}. * * @return true if succeeded, false otherwise + * @throws Exception */ public boolean stopJob() throws Exception { return killJob(); diff --git a/src/java/org/apache/nutch/util/PrefixStringMatcher.java b/src/java/org/apache/nutch/util/PrefixStringMatcher.java index e323b67531..6ca48c8b71 100644 --- a/src/java/org/apache/nutch/util/PrefixStringMatcher.java +++ b/src/java/org/apache/nutch/util/PrefixStringMatcher.java @@ -70,8 +70,8 @@ public boolean matches(String input) { } /** - * Returns the shortest prefix of input that is matched, - * or null if no match exists. + * Returns the shortest prefix of input that is matched, + * or null if no match exists. */ public String shortestMatch(String input) { TrieNode node = root; @@ -86,8 +86,8 @@ public String shortestMatch(String input) { } /** - * Returns the longest prefix of input that is matched, - * or null if no match exists. + * Returns the longest prefix of input that is matched, + * or null if no match exists. 
*/ public String longestMatch(String input) { TrieNode node = root; diff --git a/src/java/org/apache/nutch/util/SuffixStringMatcher.java b/src/java/org/apache/nutch/util/SuffixStringMatcher.java index a967c0177f..6e070b935c 100644 --- a/src/java/org/apache/nutch/util/SuffixStringMatcher.java +++ b/src/java/org/apache/nutch/util/SuffixStringMatcher.java @@ -65,8 +65,8 @@ public boolean matches(String input) { } /** - * Returns the shortest suffix of input that is matched, - * or null if no match exists. + * Returns the shortest suffix of input that is matched, + * or null if no match exists. */ public String shortestMatch(String input) { TrieNode node = root; @@ -81,8 +81,8 @@ public String shortestMatch(String input) { } /** - * Returns the longest suffix of input that is matched, - * or null if no match exists. + * Returns the longest suffix of input that is matched, + * or null if no match exists. */ public String longestMatch(String input) { TrieNode node = root; diff --git a/src/java/org/apache/nutch/util/TableUtil.java b/src/java/org/apache/nutch/util/TableUtil.java index 68ded699fe..e6ccbbc476 100644 --- a/src/java/org/apache/nutch/util/TableUtil.java +++ b/src/java/org/apache/nutch/util/TableUtil.java @@ -33,7 +33,7 @@ public class TableUtil { * E.g. "http://bar.foo.com:8983/to/index.html?a=b" becomes * "com.foo.bar:8983:http/to/index.html?a=b". * - * @param url + * @param urlString * url to be reversed * @return Reversed url * @throws MalformedURLException @@ -111,7 +111,7 @@ public static String unreverseUrl(String reversedUrl) { /** * Given a reversed url, returns the reversed host E.g - * "com.foo.bar:http:8983/to/index.html?a=b" -> "com.foo.bar" + * "com.foo.bar:http:8983/to/index.html?a=b" -> "com.foo.bar" * * @param reversedUrl * Reversed url diff --git a/src/java/org/apache/nutch/util/TimingUtil.java b/src/java/org/apache/nutch/util/TimingUtil.java index 524bee6ff6..497716c4bf 100644 --- a/src/java/org/apache/nutch/util/TimingUtil.java +++ b/src/java/org/apache/nutch/util/TimingUtil.java @@ -32,7 +32,7 @@ public class TimingUtil { * @param end * The end of the time period * @return a string of the form "XhYmZs" when the elapsed time is X hours, Y - * minutes and Z seconds or null if start > end. + * minutes and Z seconds or null if start > end. */ public static String elapsedTime(long start, long end) { if (start > end) { diff --git a/src/java/org/apache/nutch/util/TrieStringMatcher.java b/src/java/org/apache/nutch/util/TrieStringMatcher.java index 95f06ad6f6..e7773cb668 100644 --- a/src/java/org/apache/nutch/util/TrieStringMatcher.java +++ b/src/java/org/apache/nutch/util/TrieStringMatcher.java @@ -186,15 +186,15 @@ protected final void addPatternBackward(String s) { public abstract boolean matches(String input); /** - * Returns the shortest substring of input that is - * matched by a pattern in the trie, or null if no match + * Returns the shortest substring of input that is + * matched by a pattern in the trie, or null if no match * exists. */ public abstract String shortestMatch(String input); /** - * Returns the longest substring of input that is - * matched by a pattern in the trie, or null if no match + * Returns the longest substring of input that is + * matched by a pattern in the trie, or null if no match * exists. 
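// Illustrative sketch, not part of this patch: typical use of the matchers documented
// above. The String[]-based constructors are an assumption; only matches(),
// shortestMatch() and longestMatch() appear in the surrounding javadoc.
PrefixStringMatcher prefixMatcher = new PrefixStringMatcher(new String[] { "http://", "https://" });
SuffixStringMatcher suffixMatcher = new SuffixStringMatcher(new String[] { ".html", ".htm" });
String url = "https://example.org/index.html";
boolean isWebUrl = prefixMatcher.matches(url);            // true
String scheme = prefixMatcher.longestMatch(url);          // "https://"
String extension = suffixMatcher.shortestMatch(url);      // ".html"; null if no suffix pattern matched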
*/ public abstract String longestMatch(String input); diff --git a/src/java/org/apache/nutch/util/URLUtil.java b/src/java/org/apache/nutch/util/URLUtil.java index 5183ba10b0..e1df9e3604 100644 --- a/src/java/org/apache/nutch/util/URLUtil.java +++ b/src/java/org/apache/nutch/util/URLUtil.java @@ -219,52 +219,49 @@ public static String[] getHostBatches(String url) * Yahoo! Slurp crawler described here:
    * How - * does the Yahoo! webcrawler handle redirects?
- * 1. Choose target url if either url is malformed.
- * 2. If different domains the keep the destination whether or not the
- *    redirect is temp or perm
- *       a.com -> b.com*
- * 3. If the redirect is permanent and the source is root, keep the source.
- *       *a.com -> a.com?y=1 || *a.com -> a.com/xyz/index.html
- * 4. If the redirect is permanent and the source is not root and the
- *    destination is root, keep the destination
- *       a.com/xyz/index.html -> a.com*
- * 5. If the redirect is permanent and neither the source nor the destination
- *    is root, then keep the destination
- *       a.com/xyz/index.html -> a.com/abc/page.html*
- * 6. If the redirect is temporary and source is root and destination is not
- *    root, then keep the source
- *       *a.com -> a.com/xyz/index.html
- * 7. If the redirect is temporary and source is not root and destination is
- *    root, then keep the destination
- *       a.com/xyz/index.html -> a.com*
- * 8. If the redirect is temporary and neither the source or the destination
- *    is root, then keep the shortest url. First check for the shortest host, and
- *    if both are equal then check by path. Path is first by length then by the
- *    number of / path separators.
- *       a.com/xyz/index.html -> a.com/abc/page.html*
- *       *www.a.com/xyz/index.html -> www.news.a.com/xyz/index.html
- * 9. If the redirect is temporary and both the source and the destination
- *    are root, then keep the shortest sub-domain
- *       *www.a.com -> www.news.a.com
+ *
+ * Choose target url if either url is malformed.
+ *
+ * If different domains the keep the destination whether or not the
+ * redirect is temp or perm
+ *    a.com -> b.com*
+ *
+ * If the redirect is permanent and the source is root, keep the source.
+ *    *a.com -> a.com?y=1 || *a.com -> a.com/xyz/index.html
+ *
+ * If the redirect is permanent and the source is not root and the
+ * destination is root, keep the destination
+ *    a.com/xyz/index.html -> a.com*
+ *
+ * If the redirect is permanent and neither the source nor the destination
+ * is root, then keep the destination
+ *    a.com/xyz/index.html -> a.com/abc/page.html*
+ *
+ * If the redirect is temporary and source is root and destination is not
+ * root, then keep the source
+ *    *a.com -> a.com/xyz/index.html
+ *
+ * If the redirect is temporary and source is not root and destination is
+ * root, then keep the destination
+ *    a.com/xyz/index.html -> a.com*
+ *
+ * If the redirect is temporary and neither the source or the destination
+ * is root, then keep the shortest url. First check for the shortest host, and
+ * if both are equal then check by path. Path is first by length then by the
+ * number of / path separators.
+ *    a.com/xyz/index.html -> a.com/abc/page.html*
+ *    *www.a.com/xyz/index.html -> www.news.a.com/xyz/index.html
+ *
+ * If the redirect is temporary and both the source and the destination
+ * are root, then keep the shortest sub-domain
+ *    *www.a.com -> www.news.a.com
+ *
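// Illustrative sketch, not part of this patch: exercising the redirect rules above.
// In Nutch this logic is commonly exposed through a helper along the lines of
// URLUtil.chooseRepr(src, dst, temp); that name and signature are an assumption
// here, not something this hunk states.
String src = "http://a.com/xyz/index.html";
String dst = "http://a.com/";
boolean temporary = false;                                // permanent redirect
// Permanent redirect, source is not root, destination is root
// -> per the rules above the destination is kept as the representative url.
String repr = URLUtil.chooseRepr(src, dst, temporary);    // expected: "http://a.com/"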
      * While not in this logic there is a further piece of representative url * logic that occurs during indexing and after scoring. During creation of the diff --git a/src/java/org/apache/nutch/util/domain/DomainSuffix.java b/src/java/org/apache/nutch/util/domain/DomainSuffix.java index ae03ec4eea..0e0b7b02fc 100644 --- a/src/java/org/apache/nutch/util/domain/DomainSuffix.java +++ b/src/java/org/apache/nutch/util/domain/DomainSuffix.java @@ -23,17 +23,16 @@ * name of a host. The domain name of a host is defined to be the last part * before the domain suffix, w/o subdomain names. As an example the domain name * of
      - * http://lucene.apache.org/ + * http://lucene.apache.org/ *
      * is apache.org
      * This class holds three fields, domain field represents the * suffix (such as "co.uk") boost is a float for boosting score * of url's with this suffix status field represents domain's - * status + * status. Check also domain-suffixes.xml * * @author Enis Soztutar <enis.soz.nutch@gmail.com> * @see TopLevelDomain - * @see domain-suffixes.xml */ public class DomainSuffix { diff --git a/src/java/org/apache/nutch/util/domain/TopLevelDomain.java b/src/java/org/apache/nutch/util/domain/TopLevelDomain.java index 6386335e06..87e370e5af 100644 --- a/src/java/org/apache/nutch/util/domain/TopLevelDomain.java +++ b/src/java/org/apache/nutch/util/domain/TopLevelDomain.java @@ -24,8 +24,8 @@ * top-level domain is com. * * @author Enis Soztutar <enis.soz.nutch@gmail.com> - * @see http://www.iana.org/ - * @see http://en.wikipedia.org/wiki/Top-level_domain + * @see http://www.iana.org/ + * @see http://en.wikipedia.org/wiki/Top-level_domain */ public class TopLevelDomain extends DomainSuffix { diff --git a/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java b/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java index 9df4e2724c..0751ddc1be 100644 --- a/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java +++ b/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java @@ -64,7 +64,7 @@ *

      * A new RSS/ATOM Feed{@link Parser} that rapidly parses all referenced links * and content present in the feed. - *

      + * * */ public class FeedParser implements Parser { diff --git a/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java b/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java index 9e2e75bee1..7e0e24688b 100644 --- a/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java +++ b/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java @@ -36,7 +36,7 @@ * Indexing filter that offers an option to either index all inbound anchor text * for a document or deduplicate anchors. Deduplication does have it's con's, * - * @see {@code anchorIndexingFilter.deduplicate} in nutch-default.xml. + * Check {@code anchorIndexingFilter.deduplicate} in nutch-default.xml. */ public class AnchorIndexingFilter implements IndexingFilter { diff --git a/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java b/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java index fdd3b8120f..a97e9edb29 100644 --- a/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java +++ b/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java @@ -37,7 +37,7 @@ * Indexer which can be configured to extract metadata from the crawldb, parse * metadata or content metadata. You can specify the properties "index.db", * "index.parse" or "index.content" who's values are comma-delimited - * key1,key2,key3. + * <value>key1,key2,key3</value>. */ public class MetadataIndexer implements IndexingFilter { diff --git a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java b/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java index b1d99e5ed6..9171b1cebb 100644 --- a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java +++ b/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java @@ -148,14 +148,14 @@ private NutchDocument addLength(NutchDocument doc, WebPage page, String url) { * primaryType and subType to field "type" as un-stored, indexed and * un-tokenized, so that search results can be confined by contentType or its * primaryType or its subType. - *

      + * *

      * For example, if contentType is application/vnd.ms-powerpoint, search can be * done with one of the following qualifiers * type:application/vnd.ms-powerpoint type:application type:vnd.ms-powerpoint * all case insensitive. The query filter is implemented in * {@link TypeQueryFilter}. - *

      + * * * @param doc * @param data diff --git a/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java b/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java index f3af6a987f..064cd8db2a 100644 --- a/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java +++ b/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java @@ -84,12 +84,19 @@ public class HTMLLanguageParser implements ParseFilter { /** * Scan the HTML document looking at possible indications of content language
- *
- * 1. html lang attribute
- * (http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1)
- * 2. meta dc.language
- * (http://dublincore.org/documents/2000/07/16/usageguide/qualified-html.shtml#language)
- * 3. meta http-equiv (content-language)
- * (http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2)
+ *
+ * 1. html lang attribute
+ * (http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1)
+ *
+ * 2. meta dc.language
+ * (http://dublincore.org/documents/2000/07/16/usageguide/qualified-html.shtml#language)
+ *
+ * 3. meta http-equiv (content-language)
+ * (http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2)
+ *
      */ public Parse filter(String url, WebPage page, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc) { diff --git a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java index d374e95a35..a1475a71c7 100644 --- a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java +++ b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java @@ -44,19 +44,13 @@ * expressions. * *

      - * The regular expressions rules are expressed in a file. The file of rules is - * provided by each implementation using the - * {@link #getRulesFile(Configuration)} method. - *

      - * - *

      - * The format of this file is made of many rules (one per line):
      + * The format of this file is made of many rules (one per line):
      * * [+-]<regex> - *
      + *

      * where plus (+)means go ahead and index it and minus ( * -)means no. - *

      + * */ public abstract class RegexURLFilterBase implements URLFilter { diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java index 31b54dab3d..6bd430555c 100644 --- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java +++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java @@ -174,7 +174,7 @@ protected void append(Node newNode) throws org.xml.sax.SAXException { * supply a locator: if it does so, it must supply the locator to the * application by invoking this method before invoking any of the other * methods in the ContentHandler interface. - *

      + * * *

      * The locator allows the application to determine the end position of any @@ -183,13 +183,13 @@ protected void append(Node newNode) throws org.xml.sax.SAXException { * errors (such as character content that does not match an application's * business rules). The information returned by the locator is probably not * sufficient for use with a search engine. - *

      + * * *

      * Note that the locator will return correct information only during the * invocation of the events in this interface. The application should not * attempt to use it at any other time. - *

      + * * * @param locator * An object that can return the location of any SAX document event. @@ -206,7 +206,7 @@ public void setDocumentLocator(Locator locator) { *

      * The SAX parser will invoke this method only once, before any other methods * in this interface or in DTDHandler (except for setDocumentLocator). - *

      + * */ public void startDocument() throws org.xml.sax.SAXException { @@ -221,7 +221,7 @@ public void startDocument() throws org.xml.sax.SAXException { * method invoked during the parse. The parser shall not invoke this method * until it has either abandoned parsing (because of an unrecoverable error) * or reached the end of input. - *

      + * */ public void endDocument() throws org.xml.sax.SAXException { @@ -237,14 +237,14 @@ public void endDocument() throws org.xml.sax.SAXException { * startElement() event (even when the element is empty). All of the element's * content will be reported, in order, before the corresponding endElement() * event. - *

      + * * *

      * If the element name has a namespace prefix, the prefix will still be * attached. Note that the attribute list provided will contain only * attributes with explicit values (specified or defaulted): #IMPLIED * attributes will be omitted. - *

      + * * * * @param ns @@ -328,12 +328,12 @@ public void startElement(String ns, String localName, String name, * The SAX parser will invoke this method at the end of every element in the * XML document; there will be a corresponding startElement() event for every * endElement() event (even when the element is empty). - *

      + * * *

      * If the element name has a namespace prefix, the prefix will still be * attached to the name. - *

      + * * * * @param ns @@ -371,18 +371,18 @@ public void setIDAttribute(String id, Element elem) { * they may split it into several chunks; however, all of the characters in * any single event must come from the same external entity, so that the * Locator provides useful information. - *

      + * * *

      * The application must not attempt to read from the array outside of the * specified range. - *

      + * * *

      * Note that some parsers will report whitespace using the * ignorableWhitespace() method rather than this one (validating parsers must * do so). - *

      + * * * @param ch * The characters from the XML document. @@ -489,19 +489,19 @@ public void entityReference(String name) throws org.xml.sax.SAXException { * whitespace (see the W3C XML 1.0 recommendation, section 2.10): * non-validating parsers may also use this method if they are capable of * parsing and using content models. - *

      + * * *

      * SAX parsers may return all contiguous whitespace in a single chunk, or they * may split it into several chunks; however, all of the characters in any * single event must come from the same external entity, so that the Locator * provides useful information. - *

      + * * *

      * The application must not attempt to read from the array outside of the * specified range. - *

      + * * * @param ch * The characters from the XML document. @@ -539,12 +539,12 @@ private boolean isOutsideDocElem() { * The Parser will invoke this method once for each processing instruction * found: note that processing instructions may occur before or after the main * document element. - *

      + * * *

      * A SAX parser should never report an XML declaration (XML 1.0, section 2.8) * or a text declaration (XML 1.0, section 4.3.1) using this method. - *

      + * * * @param target * The processing instruction target. @@ -608,18 +608,18 @@ public void endCDATA() throws org.xml.sax.SAXException { * they may split it into several chunks; however, all of the characters in * any single event must come from the same external entity, so that the * Locator provides useful information. - *

      + * * *

      * The application must not attempt to read from the array outside of the * specified range. - *

      + * * *

      * Note that some parsers will report whitespace using the * ignorableWhitespace() method rather than this one (validating parsers must * do so). - *

      + * * * @param ch * The characters from the XML document. @@ -687,14 +687,14 @@ public void endDTD() throws org.xml.sax.SAXException { * processing: the SAX XML reader will automatically replace prefixes for * element and attribute names when the http://xml.org/sax/features/namespaces * feature is true (the default). - *

      + * * *

      * There are cases, however, when applications need to use prefixes in * character data or in attribute values, where they cannot safely be expanded * automatically; the start/endPrefixMapping event supplies the information to * the application to expand prefixes in those contexts itself, if necessary. - *

      + * * *

      * Note that start/endPrefixMapping events are not guaranteed to be properly @@ -702,7 +702,7 @@ public void endDTD() throws org.xml.sax.SAXException { * before the corresponding startElement event, and all endPrefixMapping * events will occur after the corresponding endElement event, but their order * is not guaranteed. - *

      + * * * @param prefix * The Namespace prefix being declared. @@ -735,7 +735,7 @@ public void startPrefixMapping(String prefix, String uri) * See startPrefixMapping for details. This event will always occur after the * corresponding endElement event, but the order of endPrefixMapping events is * not otherwise guaranteed. - *

      + * * * @param prefix * The prefix that was being mapping. @@ -755,7 +755,7 @@ public void endPrefixMapping(String prefix) throws org.xml.sax.SAXException { * DTD subset). All processors may skip external entities, depending on the * values of the http://xml.org/sax/features/external-general-entities and the * http://xml.org/sax/features/external-parameter-entities properties. - *

      + * * * @param name * The name of the skipped entity. If it is a parameter entity, the diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java index 8e079fb992..488cacd657 100644 --- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java +++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java @@ -113,7 +113,7 @@ public boolean getText(StringBuilder sb, Node node, /** * This is a convinience method, equivalent to - * {@link #getText(StringBuffer,Node,boolean) getText(sb, node, false)}. + * {@link #getText(StringBuilder, Node, boolean)} which passes false as third argument * */ public void getText(StringBuilder sb, Node node) { diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java index cfef10cc56..0143f06545 100644 --- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java +++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java @@ -35,7 +35,7 @@ public class XMLCharacterRecognizer { * Returns whether the specified ch conforms to the XML 1.0 * definition of whitespace. Refer to the definition of - * S for details. + * S for details. * * @param ch * Character to check as XML whitespace. diff --git a/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java b/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java index a48175548e..a68584444e 100644 --- a/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java +++ b/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java @@ -79,10 +79,10 @@ public class JSParseFilter implements ParseFilter, Parser { * {@link WebPage} object relative to the URL * @param parse * {@link Parse} object holding parse status - * @param metatags - * within the {@link NutchDocument} + * @param metaTags + * within the {@link HTMLMetaTags} * @param doc - * The {@link NutchDocument} object + * The {@link DocumentFragment} object * @return parse the actual {@link Parse} object */ @Override diff --git a/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java b/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java index a3f779ad89..4fbcad3cc4 100644 --- a/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java +++ b/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java @@ -280,7 +280,7 @@ public void setY(int y) { /* * There are some issues with this method: sometimes SWF files define their - * own font, so short of OCR we cannot guess what is the glyph code -> character + * own font, so short of OCR we cannot guess what is the glyph code -> character * mapping. Additionally, some files don't use literal space character, instead * they adjust glyphAdvances. We don't handle it at all - in such cases the text * will be all glued together. 
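For reference, the ContentHandler contract documented in the DOMBuilder Javadoc above (locator supplied first, then startDocument, element and character events in document order, endDocument last) can be seen with a minimal sketch built only on the JDK's SAX classes. The class name and XML snippet below are illustrative placeholders, not Nutch code:

import java.io.StringReader;

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.helpers.DefaultHandler;

/** Prints the SAX event sequence described above; illustrative only, not Nutch code. */
public class SaxEventDemo extends DefaultHandler {

  private Locator locator;

  @Override
  public void setDocumentLocator(Locator locator) {
    // Supplied before any other event, as the contract above requires.
    this.locator = locator;
  }

  @Override
  public void startDocument() {
    System.out.println("startDocument");
  }

  @Override
  public void startElement(String uri, String localName, String qName, Attributes atts) {
    int line = (locator == null) ? -1 : locator.getLineNumber();
    System.out.println("startElement " + qName + " (line " + line + ")");
  }

  @Override
  public void characters(char[] ch, int start, int length) {
    // Only the range [start, start + length) may be read, per the contract above.
    System.out.println("characters '" + new String(ch, start, length) + "'");
  }

  @Override
  public void endElement(String uri, String localName, String qName) {
    System.out.println("endElement " + qName);
  }

  @Override
  public void endDocument() {
    System.out.println("endDocument");
  }

  public static void main(String[] args) throws Exception {
    SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
    parser.parse(new InputSource(new StringReader("<a><b>text</b></a>")), new SaxEventDemo());
  }
}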
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java index 4f4c8a78fa..db59d13e9c 100644 --- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java +++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java @@ -174,7 +174,7 @@ protected void append(Node newNode) throws org.xml.sax.SAXException { * supply a locator: if it does so, it must supply the locator to the * application by invoking this method before invoking any of the other * methods in the ContentHandler interface. - *

      + * * *

      * The locator allows the application to determine the end position of any @@ -183,13 +183,13 @@ protected void append(Node newNode) throws org.xml.sax.SAXException { * errors (such as character content that does not match an application's * business rules). The information returned by the locator is probably not * sufficient for use with a search engine. - *

      + * * *

      * Note that the locator will return correct information only during the * invocation of the events in this interface. The application should not * attempt to use it at any other time. - *

      + * * * @param locator * An object that can return the location of any SAX document event. @@ -206,7 +206,7 @@ public void setDocumentLocator(Locator locator) { *

      * The SAX parser will invoke this method only once, before any other methods * in this interface or in DTDHandler (except for setDocumentLocator). - *

      + * */ public void startDocument() throws org.xml.sax.SAXException { @@ -221,7 +221,7 @@ public void startDocument() throws org.xml.sax.SAXException { * method invoked during the parse. The parser shall not invoke this method * until it has either abandoned parsing (because of an unrecoverable error) * or reached the end of input. - *

      + * */ public void endDocument() throws org.xml.sax.SAXException { @@ -237,14 +237,14 @@ public void endDocument() throws org.xml.sax.SAXException { * startElement() event (even when the element is empty). All of the element's * content will be reported, in order, before the corresponding endElement() * event. - *

      + * * *

      * If the element name has a namespace prefix, the prefix will still be * attached. Note that the attribute list provided will contain only * attributes with explicit values (specified or defaulted): #IMPLIED * attributes will be omitted. - *

      + * * * * @param ns @@ -328,12 +328,12 @@ public void startElement(String ns, String localName, String name, * The SAX parser will invoke this method at the end of every element in the * XML document; there will be a corresponding startElement() event for every * endElement() event (even when the element is empty). - *

      + * * *

      * If the element name has a namespace prefix, the prefix will still be * attached to the name. - *

      + * * * * @param ns @@ -373,18 +373,18 @@ public void setIDAttribute(String id, Element elem) { * they may split it into several chunks; however, all of the characters in * any single event must come from the same external entity, so that the * Locator provides useful information. - *

      + * * *

      * The application must not attempt to read from the array outside of the * specified range. - *

      + * * *

      * Note that some parsers will report whitespace using the * ignorableWhitespace() method rather than this one (validating parsers must * do so). - *

      + * * * @param ch * The characters from the XML document. @@ -491,19 +491,19 @@ public void entityReference(String name) throws org.xml.sax.SAXException { * whitespace (see the W3C XML 1.0 recommendation, section 2.10): * non-validating parsers may also use this method if they are capable of * parsing and using content models. - *

      + * * *

      * SAX parsers may return all contiguous whitespace in a single chunk, or they * may split it into several chunks; however, all of the characters in any * single event must come from the same external entity, so that the Locator * provides useful information. - *

      + * * *

      * The application must not attempt to read from the array outside of the * specified range. - *

      + * * * @param ch * The characters from the XML document. @@ -541,12 +541,12 @@ private boolean isOutsideDocElem() { * The Parser will invoke this method once for each processing instruction * found: note that processing instructions may occur before or after the main * document element. - *

      + * * *

      * A SAX parser should never report an XML declaration (XML 1.0, section 2.8) * or a text declaration (XML 1.0, section 4.3.1) using this method. - *

      + * * * @param target * The processing instruction target. @@ -610,18 +610,18 @@ public void endCDATA() throws org.xml.sax.SAXException { * they may split it into several chunks; however, all of the characters in * any single event must come from the same external entity, so that the * Locator provides useful information. - *

      + * * *

      * The application must not attempt to read from the array outside of the * specified range. - *

      + * * *

      * Note that some parsers will report whitespace using the * ignorableWhitespace() method rather than this one (validating parsers must * do so). - *

      + * * * @param ch * The characters from the XML document. @@ -689,14 +689,14 @@ public void endDTD() throws org.xml.sax.SAXException { * processing: the SAX XML reader will automatically replace prefixes for * element and attribute names when the http://xml.org/sax/features/namespaces * feature is true (the default). - *

      + * * *

      * There are cases, however, when applications need to use prefixes in * character data or in attribute values, where they cannot safely be expanded * automatically; the start/endPrefixMapping event supplies the information to * the application to expand prefixes in those contexts itself, if necessary. - *

      + * * *

      * Note that start/endPrefixMapping events are not guaranteed to be properly @@ -704,7 +704,7 @@ public void endDTD() throws org.xml.sax.SAXException { * before the corresponding startElement event, and all endPrefixMapping * events will occur after the corresponding endElement event, but their order * is not guaranteed. - *

      + * * * @param prefix * The Namespace prefix being declared. @@ -737,7 +737,7 @@ public void startPrefixMapping(String prefix, String uri) * See startPrefixMapping for details. This event will always occur after the * corresponding endElement event, but the order of endPrefixMapping events is * not otherwise guaranteed. - *

      + * * * @param prefix * The prefix that was being mapping. @@ -757,7 +757,7 @@ public void endPrefixMapping(String prefix) throws org.xml.sax.SAXException { * DTD subset). All processors may skip external entities, depending on the * values of the http://xml.org/sax/features/external-general-entities and the * http://xml.org/sax/features/external-parameter-entities properties. - *

      + * * * @param name * The name of the skipped entity. If it is a parameter entity, the diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java index d625c33119..b5c95ce9ae 100644 --- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java +++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java @@ -35,7 +35,7 @@ class XMLCharacterRecognizer { * Returns whether the specified ch conforms to the XML 1.0 * definition of whitespace. Refer to the definition of - * S for details. + * S for details. * * @param ch * Character to check as XML whitespace. diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java index 67d3dcc5ce..19035c0f60 100644 --- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java +++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java @@ -59,7 +59,7 @@ public class TestRSSParser { /** *

      * The test method: tests out the following 2 asserts: - *

      + * * *
        *
      • There are 3 outlinks read from the sample rss file
      • diff --git a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java index 06954394b0..20ba474aa4 100644 --- a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java +++ b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java @@ -99,8 +99,8 @@ public void setMaxContentLength(int maxContentLength) { * * @param url * Text containing the url - * @param datum - * The CrawlDatum object corresponding to the url + * @param page + * {@link WebPage} object relative to the URL * * @return {@link ProtocolOutput} object for the content of the file indicated * by url diff --git a/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java b/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java index 856a649a86..952648ff33 100644 --- a/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java +++ b/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java @@ -39,7 +39,7 @@ * *

        * Unit tests for the {@link File}Protocol. - *

        + * * . */ public class TestProtocolFile { diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java index ffa2091760..8b272ec432 100644 --- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java +++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java @@ -54,7 +54,7 @@ * servers out there, when partial downloading is enforeced by closing data * channel socket on our client side, the server side immediately closes control * channel (socket). Our codes deal with such a bad behavior. (4) LIST is used - * to obtain remote file attributes if possible. MDTM & SIZE would be nice, but + * to obtain remote file attributes if possible. MDTM & SIZE would be nice, but * not as ubiquitously implemented as LIST. (5) Avoid using ABOR in single * thread? Do not use it at all. * diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java index 3f3a7e8e98..9f3f9c5e35 100644 --- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java +++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java @@ -114,8 +114,8 @@ public void setKeepConnection(boolean keepConnection) { * * @param url * Text containing the ftp url - * @param datum - * The CrawlDatum object corresponding to the url + * @param page + * {@link WebPage} object relative to the URL * * @return {@link ProtocolOutput} object for the url */ diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java index afcf24aa39..92baf298e9 100644 --- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java +++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java @@ -97,7 +97,7 @@ public Socket createSocket(String host, int port, InetAddress clientHost, * create a new socket within the given limit of time. If socket constructor * does not return until the timeout expires, the controller terminates and * throws an {@link ConnectTimeoutException} - *
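The DummySSLProtocolSocketFactory Javadoc above describes opening a socket within a given time limit and failing with a ConnectTimeoutException otherwise. As a rough sketch of the same idea with plain JDK APIs (not the plugin's controller-thread implementation), Socket.connect accepts a timeout; the host, port and timeout below are placeholders:

import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.net.SocketTimeoutException;

public class ConnectWithTimeoutDemo {
  public static void main(String[] args) {
    String host = "example.org"; // placeholder host
    int port = 443;              // placeholder port
    int timeoutMs = 5000;        // give up after five seconds

    try (Socket socket = new Socket()) {
      // Fails with SocketTimeoutException if no connection is made within timeoutMs.
      socket.connect(new InetSocketAddress(host, port), timeoutMs);
      System.out.println("connected to " + socket.getRemoteSocketAddress());
    } catch (SocketTimeoutException e) {
      System.err.println("connect timed out after " + timeoutMs + " ms");
    } catch (IOException e) {
      System.err.println("connect failed: " + e.getMessage());
    }
  }
}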

        + * * * @param host * the host name/IP diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java index d4d7eba062..69abab75a0 100644 --- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java +++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java @@ -161,8 +161,8 @@ public static void main(String[] args) throws Exception { * * @param url * URL to be fetched - * @param datum - * Crawl data + * @param page + * {@link WebPage} object relative to the URL * @param redirect * Follow redirects if and only if true * @return HTTP response diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java index a15f91be5a..cb09e697a1 100644 --- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java +++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java @@ -39,7 +39,7 @@ /** * Implementation of RFC 2617 Basic Authentication. Usernames and passwords are * stored in standard Nutch configuration files using the following properties: - * http.auth.basic..user http.auth.basic..pass + * http.auth.basic.<realm>.user http.auth.basic.<realm>.pass */ public class HttpBasicAuthentication implements HttpAuthentication, Configurable { @@ -128,7 +128,7 @@ public Configuration getConf() { * Gets the Basic credentials generated by this HttpBasicAuthentication object * * @return Credentials in the form of - * Authorization: Basic <Base64 encoded userid:password> + * Authorization: Basic Base64 encoded userid:password * */ public List getCredentials() { diff --git a/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java b/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java index 9dc0c35823..5006266467 100644 --- a/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java +++ b/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java @@ -16,8 +16,7 @@ */ /** - * Scoring filter used in conjunction with - * {@link org.apache.nutch.scoring.webgraph.WebGraph}. + * Scoring filter */ package org.apache.nutch.scoring.link; diff --git a/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java b/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java index c3119226dc..1e07e6adc2 100644 --- a/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java +++ b/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java @@ -39,7 +39,7 @@ /** * This plugin implements a variant of an Online Page Importance Computation * (OPIC) score, described in this paper: + * href="http://www2003.org/cdrom/papers/refereed/p007/p7-abiteboul.html"> * Abiteboul, Serge and Preda, Mihai and Cobena, Gregory (2003), Adaptive * On-Line Page Importance Computation . 
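The HttpBasicAuthentication Javadoc above reads usernames and passwords from http.auth.basic.<realm>.user and http.auth.basic.<realm>.pass and sends them as "Authorization: Basic <Base64 encoded userid:password>". A small sketch of how that header value is formed with java.util.Base64, which is available on JDK 8, the target of this patch; the username and password are made-up placeholders:

import java.nio.charset.StandardCharsets;
import java.util.Base64;

public class BasicAuthHeaderDemo {
  public static void main(String[] args) {
    // Placeholder values; in Nutch they would come from properties such as
    // http.auth.basic.<realm>.user and http.auth.basic.<realm>.pass.
    String username = "agent";
    String password = "secret";

    String token = Base64.getEncoder()
        .encodeToString((username + ":" + password).getBytes(StandardCharsets.UTF_8));

    // Prints: Authorization: Basic YWdlbnQ6c2VjcmV0
    System.out.println("Authorization: Basic " + token);
  }
}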
* diff --git a/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java b/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java index 6c78df5c5f..c905411091 100644 --- a/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java +++ b/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java @@ -201,7 +201,7 @@ public Collection getAll() { /** * Save collections into file * - * @throws Exception + * @throws IOException */ public void save() throws IOException { try { diff --git a/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java b/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java index bf1ef4232f..5b7d5816cd 100644 --- a/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java +++ b/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java @@ -43,7 +43,6 @@ * Filters URLs based on a file containing domain suffixes, domain names, and * hostnames. Only a url that matches one of the suffixes, domains, or hosts * present in the file is allowed. - *

        * *

        * Urls are checked in order of domain suffix, domain name, and hostname against @@ -61,18 +60,17 @@ * only urls from www.apache.org. There is no specific ordering to entries. The * entries are from more general to more specific with the more general * overridding the more specific. - *

        * * The domain file defaults to domain-urlfilter.txt in the classpath but can be * overridden using the: * *
          - *
            + *
* property "urlfilter.domain.file" in ./conf/nutch-*.xml, and - *
          - *
            + * + *
* attribute "file" in plugin.xml of this plugin - *
          + * *
        * * the attribute "file" has higher precedence if defined. @@ -114,7 +112,6 @@ public DomainURLFilter() { * @param domainFile * The domain file, overrides domain-urlfilter.text default. * - * @throws IOException */ public DomainURLFilter(String domainFile) { this.domainFile = domainFile; diff --git a/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/package-info.java b/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/package-info.java index d2eba1f763..1ff46f69c6 100644 --- a/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/package-info.java +++ b/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/package-info.java @@ -18,8 +18,6 @@ /** * URL filter plugin to include only URLs which match an element in a given list of * domain suffixes, domain names, and/or host names. - * See {@link org.apache.nutch.urlfilter.domainblacklist} for the counterpart - * (exclude URLs by host or domain). */ package org.apache.nutch.urlfilter.domain; diff --git a/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java b/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java index 366c11e46f..75ece12c48 100644 --- a/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java +++ b/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java @@ -49,7 +49,7 @@ * *
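The DomainURLFilter description above accepts a URL when its host matches a configured hostname, domain name, or domain suffix. A stripped-down sketch of that style of check, under the stated rules only and not the plugin's actual implementation; the rule set and URLs are placeholders:

import java.net.MalformedURLException;
import java.net.URL;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;

public class DomainAllowListDemo {

  /** Accepts the URL if its host, or any parent domain of that host, is in the rule set. */
  static boolean accepted(Set<String> rules, String url) throws MalformedURLException {
    String candidate = new URL(url).getHost().toLowerCase(Locale.ROOT);
    while (true) {
      if (rules.contains(candidate)) {
        return true;                              // hostname, domain, or suffix matched
      }
      int dot = candidate.indexOf('.');
      if (dot < 0) {
        return false;                             // no rule matched
      }
      candidate = candidate.substring(dot + 1);   // www.apache.org -> apache.org -> org
    }
  }

  public static void main(String[] args) throws MalformedURLException {
    // Placeholder rules in the spirit of the file format described above:
    // a domain suffix, a domain name, and a hostname.
    Set<String> rules = new HashSet<>(Arrays.asList("net", "apache.org", "www.example.com"));

    System.out.println(accepted(rules, "http://lucene.apache.org/"));     // true  (domain)
    System.out.println(accepted(rules, "http://www.example.com/a.html")); // true  (hostname)
    System.out.println(accepted(rules, "http://www.example.org/"));       // false (no match)
  }
}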

        * The format of this file is one URL prefix per line. - *

        + * */ public class PrefixURLFilter implements URLFilter { diff --git a/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java b/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java index 1a7492ab85..ccad47d4e8 100644 --- a/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java +++ b/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java @@ -68,22 +68,22 @@ * The format of this config file is one URL suffix per line, with no preceding * whitespace. Order, in which suffixes are specified, doesn't matter. Blank * lines and comments (#) are allowed. - *

        + * *

        * A single '+' or '-' sign not followed by any suffix must be used once, to * signify the mode this plugin operates in. An optional single 'I' can be * appended, to signify that suffix matches should be case-insensitive. The * default, if not specified, is to use case-sensitive matches, i.e. suffix * '.JPG' does not match '.jpg'. - *

        + * *

        * NOTE: the format of this file is different from urlfilter-prefix, because * that plugin doesn't support allowed/prohibited prefixes (only supports * allowed prefixes). Please note that this plugin does not support regular * expressions, it only accepts literal suffixes. I.e. a suffix "+*.jpg" is most * probably wrong, you should use "+.jpg" instead. - *

- * Example 1

        + * + * Example 1 *

        * The configuration shown below will accept all URLs with '.html' or '.htm' * suffixes (case-sensitive - '.HTML' or '.HTM' will be rejected), and prohibit @@ -101,8 +101,8 @@ * .htm * * - *

- * Example 2

        + * + * Example 2 *

        * The configuration shown below will accept all URLs except common graphical * formats. @@ -122,7 +122,7 @@ * .bmp * * - *
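Following the SuffixURLFilter description and the two examples above (literal suffixes, a '+'/'-' mode sign, an optional 'I' for case-insensitive matches), here is a tiny sketch of just the matching step. It is not the plugin's code, and the URLs are placeholders:

import java.util.Arrays;
import java.util.List;
import java.util.Locale;

public class SuffixRuleDemo {

  /**
   * True if the URL ends with one of the literal suffixes. Whether a match means
   * "accept" or "reject" depends on the '+' or '-' mode sign described above.
   */
  static boolean matches(List<String> suffixes, String url, boolean caseInsensitive) {
    String candidate = caseInsensitive ? url.toLowerCase(Locale.ROOT) : url;
    for (String suffix : suffixes) {
      String s = caseInsensitive ? suffix.toLowerCase(Locale.ROOT) : suffix;
      if (candidate.endsWith(s)) {
        return true;
      }
    }
    return false;
  }

  public static void main(String[] args) {
    // Mirrors the spirit of Example 1 above: literal suffixes .html and .htm.
    List<String> suffixes = Arrays.asList(".html", ".htm");

    System.out.println(matches(suffixes, "http://example.org/index.html", false)); // true
    System.out.println(matches(suffixes, "http://example.org/INDEX.HTML", false)); // false (case-sensitive)
    System.out.println(matches(suffixes, "http://example.org/INDEX.HTML", true));  // true  (the 'I' flag)
  }
}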

        + * * * @author Andrzej Bialecki */ diff --git a/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java b/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java index 9b84eae01e..7132a61b45 100644 --- a/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java +++ b/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java @@ -25,13 +25,13 @@ /** *

        * Validates URLs. - *

        + * * *

        * Originally based in on php script by Debbie Dyer, validation.php v1.2b, Date: * 03/07/02, http://javascript.internet.com. However, this validation now bears * little resemblance to the php original. - *

        + * * *
          *   Example of usage:
        @@ -47,7 +47,7 @@
          * 
          * 

        * Based on UrlValidator code from Apache commons-validator. - *
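The UrlValidator Javadoc above describes a syntactic check of URL strings. As a rough, generic illustration only (the plugin applies its own RFC-based rules rather than java.net parsing), a first-pass check might look like this:

import java.net.URI;
import java.net.URISyntaxException;
import java.util.Arrays;

public class SimpleUrlCheckDemo {

  /** Rough first-pass check: parses as a URI with an http/https/ftp scheme and a host. */
  static boolean looksLikeUrl(String value) {
    if (value == null) {
      return false;
    }
    try {
      URI uri = new URI(value);
      String scheme = uri.getScheme();
      return scheme != null
          && Arrays.asList("http", "https", "ftp").contains(scheme.toLowerCase())
          && uri.getHost() != null;
    } catch (URISyntaxException e) {
      return false;
    }
  }

  public static void main(String[] args) {
    System.out.println(looksLikeUrl("http://www.apache.org/"));   // true
    System.out.println(looksLikeUrl("http://bad host/"));         // false (space in authority)
    System.out.println(looksLikeUrl(null));                       // false
  }
}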

        + * * * @see Uniform Resource * Identifiers (URI): Generic Syntax @@ -159,7 +159,7 @@ public void setConf(Configuration conf) { /** *

        * Checks if a field has a valid url address. - *

        + * * * @param value * The value validation is being performed on. A null @@ -323,7 +323,7 @@ private boolean isValidAuthority(String authority) { *

        * Checks if the field isn't null and length of the field is greater than zero * not including whitespace. - *

        + * * * @param value * The value validation is being performed on. diff --git a/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java b/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java index d460d9e5ad..66f7a1b363 100644 --- a/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java +++ b/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java @@ -58,11 +58,11 @@ * This class uses the urlnormalizer.regex.file property. It should be * set to the file name of an xml file which should contain the patterns and * substitutions to be done on encountered URLs. - *

        + * *

        * This class also supports different rules depending on the scope. Please see * the javadoc in {@link org.apache.nutch.net.URLNormalizers} for more details. - *
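The RegexURLNormalizer description above applies pattern/substitution pairs, read from the XML file named by urlnormalizer.regex.file, to each URL in turn. A minimal sketch of that pattern/substitution loop with java.util.regex; the two rules below are invented for illustration and are not the plugin's defaults:

import java.util.LinkedHashMap;
import java.util.Map;
import java.util.regex.Pattern;

public class RegexNormalizeDemo {

  /** Applies each (pattern, substitution) rule in order, as described above. */
  static String normalize(String url, Map<Pattern, String> rules) {
    String result = url;
    for (Map.Entry<Pattern, String> rule : rules.entrySet()) {
      result = rule.getKey().matcher(result).replaceAll(rule.getValue());
    }
    return result;
  }

  public static void main(String[] args) {
    // Invented rules for illustration; the real ones are read from the XML file
    // named by the urlnormalizer.regex.file property.
    Map<Pattern, String> rules = new LinkedHashMap<>();
    rules.put(Pattern.compile("(?<!:)/{2,}"), "/"); // collapse duplicate slashes in the path
    rules.put(Pattern.compile("\\?$"), "");         // drop a trailing '?' with no query

    System.out.println(normalize("http://example.org//a//b?", rules));
    // -> http://example.org/a/b
  }
}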

        + * * * @author Luke Baker * @author Andrzej Bialecki diff --git a/src/test/org/apache/nutch/api/AbstractNutchAPITestBase.java b/src/test/org/apache/nutch/api/AbstractNutchAPITestBase.java index 2e4e61ce51..f7d73880ab 100644 --- a/src/test/org/apache/nutch/api/AbstractNutchAPITestBase.java +++ b/src/test/org/apache/nutch/api/AbstractNutchAPITestBase.java @@ -141,7 +141,7 @@ public void testRequest(int expectedStatusCode, int port, String username, Strin */ if (ChallengeScheme.HTTP_DIGEST.equals(challengeScheme)) { - // User server's data to complete the challengeResponse object + // Use server's data to complete the challengeResponse object ChallengeRequest digestChallengeRequest = retrieveDigestChallengeRequest(resource); ChallengeResponse challengeResponse = new ChallengeResponse(digestChallengeRequest, resource.getResponse(), username, password.toCharArray()); diff --git a/src/test/org/apache/nutch/crawl/TestGenerator.java b/src/test/org/apache/nutch/crawl/TestGenerator.java index 95d1db3609..2345299003 100644 --- a/src/test/org/apache/nutch/crawl/TestGenerator.java +++ b/src/test/org/apache/nutch/crawl/TestGenerator.java @@ -42,7 +42,7 @@ *
      • Generates entries to fetch
      • Verifies that number of generated urls match, and finally
      • Verifies that highest scoring urls are generated.
- *
          + *
        * */ public class TestGenerator extends AbstractNutchTest { diff --git a/src/test/org/apache/nutch/fetcher/TestFetcher.java b/src/test/org/apache/nutch/fetcher/TestFetcher.java index 8a8fa42154..9d62d50a6d 100644 --- a/src/test/org/apache/nutch/fetcher/TestFetcher.java +++ b/src/test/org/apache/nutch/fetcher/TestFetcher.java @@ -161,13 +161,15 @@ public void testFetch() throws Exception { /** * Tests a refetch of a URL. This process consists of two consecutive * inject, generate, fetch, parse then update cycles. The test configuration - * is defined such that db.fetch.interval.default is set to + * is defined such that db.fetch.interval.default is set to * a very low value (indicating that the URL should be fetched again immediately). * In addition, configuration tests that relevant * {@link org.apache.nutch.metadata.Metadata} is present and the values consistent * and therefore not overwritten. - * @see https://issues.apache.org/jira/browse/NUTCH-2222 + * * @throws Exception + * + * @see https://issues.apache.org/jira/browse/NUTCH-2222 */ @Test public void testReFetch() throws Exception { diff --git a/src/test/org/apache/nutch/util/CrawlTestUtil.java b/src/test/org/apache/nutch/util/CrawlTestUtil.java index 5165b38a5a..046c51ec8a 100644 --- a/src/test/org/apache/nutch/util/CrawlTestUtil.java +++ b/src/test/org/apache/nutch/util/CrawlTestUtil.java @@ -71,7 +71,8 @@ public static Configuration createConfiguration() { /** * Generate seedlist * - * @see TestInjector + * @see org.apache.nutch.crawl.TestInjector + * * @throws IOException */ public static void generateSeedList(FileSystem fs, Path urlPath, @@ -148,11 +149,11 @@ public static Server getServer(int port, String staticContent) } /** - * Generate Fetchlist. + * Generates Fetchlist * * @param numResults number of results to generate * @param config Configuration to use - * @return path to generated batch + * * @throws IOException */ public static void generateFetchlist(int numResults, Configuration config,