From 0ea78907dee6b07058b66a99e395aea8cf623e92 Mon Sep 17 00:00:00 2001 From: Furkan KAMACI Date: Sun, 4 Sep 2016 00:53:31 +0300 Subject: [PATCH] NUTCH-2089 Nutch 2.x is moved to compile on JDK 8 --- .../org/apache/nutch/api/NutchServer.java | 23 +++-- .../apache/nutch/api/impl/RAMConfManager.java | 3 + .../nutch/crawl/AbstractFetchSchedule.java | 23 +++-- .../nutch/crawl/AdaptiveFetchSchedule.java | 6 +- .../org/apache/nutch/crawl/FetchSchedule.java | 25 ++---- .../org/apache/nutch/crawl/GeneratorJob.java | 20 ++++- .../apache/nutch/crawl/SignatureFactory.java | 7 +- .../nutch/crawl/TextProfileSignature.java | 2 +- .../org/apache/nutch/fetcher/FetcherJob.java | 12 ++- .../org/apache/nutch/indexer/IndexUtil.java | 4 +- .../org/apache/nutch/net/URLNormalizers.java | 16 ++-- .../apache/nutch/parse/NutchSitemapParse.java | 4 +- .../nutch/parse/ParsePluginsReader.java | 6 +- src/java/org/apache/nutch/parse/Parser.java | 2 +- .../org/apache/nutch/parse/ParserChecker.java | 6 +- .../apache/nutch/plugin/PluginRepository.java | 6 +- .../apache/nutch/scoring/ScoringFilter.java | 19 ++--- .../apache/nutch/storage/StorageUtils.java | 2 +- .../nutch/tools/arc/ArcRecordReader.java | 20 ++--- src/java/org/apache/nutch/util/Bytes.java | 12 +-- .../apache/nutch/util/EncodingDetector.java | 10 +-- src/java/org/apache/nutch/util/MimeUtil.java | 4 +- .../org/apache/nutch/util/NodeWalker.java | 10 +-- src/java/org/apache/nutch/util/NutchJob.java | 4 +- src/java/org/apache/nutch/util/NutchTool.java | 17 +++- .../nutch/util/PrefixStringMatcher.java | 8 +- .../nutch/util/SuffixStringMatcher.java | 8 +- src/java/org/apache/nutch/util/TableUtil.java | 4 +- .../org/apache/nutch/util/TimingUtil.java | 2 +- .../apache/nutch/util/TrieStringMatcher.java | 8 +- src/java/org/apache/nutch/util/URLUtil.java | 83 +++++++++---------- .../nutch/util/domain/DomainSuffix.java | 5 +- .../nutch/util/domain/TopLevelDomain.java | 4 +- .../apache/nutch/parse/feed/FeedParser.java | 2 +- .../indexer/anchor/AnchorIndexingFilter.java | 2 +- .../indexer/metadata/MetadataIndexer.java | 2 +- .../indexer/more/MoreIndexingFilter.java | 4 +- .../analysis/lang/HTMLLanguageParser.java | 15 +++- .../urlfilter/api/RegexURLFilterBase.java | 12 +-- .../apache/nutch/parse/html/DOMBuilder.java | 50 +++++------ .../nutch/parse/html/DOMContentUtils.java | 2 +- .../parse/html/XMLCharacterRecognizer.java | 2 +- .../apache/nutch/parse/js/JSParseFilter.java | 6 +- .../org/apache/nutch/parse/swf/SWFParser.java | 2 +- .../apache/nutch/parse/tika/DOMBuilder.java | 50 +++++------ .../parse/tika/XMLCharacterRecognizer.java | 2 +- .../nutch/parse/tika/TestRSSParser.java | 2 +- .../org/apache/nutch/protocol/file/File.java | 4 +- .../nutch/protocol/file/TestProtocolFile.java | 2 +- .../org/apache/nutch/protocol/ftp/Client.java | 2 +- .../org/apache/nutch/protocol/ftp/Ftp.java | 4 +- .../DummySSLProtocolSocketFactory.java | 2 +- .../nutch/protocol/httpclient/Http.java | 4 +- .../httpclient/HttpBasicAuthentication.java | 4 +- .../nutch/scoring/link/package-info.java | 3 +- .../nutch/scoring/opic/OPICScoringFilter.java | 2 +- .../nutch/collection/CollectionManager.java | 2 +- .../urlfilter/domain/DomainURLFilter.java | 11 +-- .../nutch/urlfilter/domain/package-info.java | 2 - .../urlfilter/prefix/PrefixURLFilter.java | 2 +- .../urlfilter/suffix/SuffixURLFilter.java | 14 ++-- .../urlfilter/validator/UrlValidator.java | 10 +-- .../regex/RegexURLNormalizer.java | 4 +- .../nutch/api/AbstractNutchAPITestBase.java | 2 +- .../org/apache/nutch/crawl/TestGenerator.java | 2 +- 
.../org/apache/nutch/fetcher/TestFetcher.java | 6 +- .../org/apache/nutch/util/CrawlTestUtil.java | 7 +- 67 files changed, 318 insertions(+), 309 deletions(-) diff --git a/src/java/org/apache/nutch/api/NutchServer.java b/src/java/org/apache/nutch/api/NutchServer.java index 802bbef7a6..5118497f12 100644 --- a/src/java/org/apache/nutch/api/NutchServer.java +++ b/src/java/org/apache/nutch/api/NutchServer.java @@ -98,9 +98,9 @@ public class NutchServer extends Application { * 'INFO' however best attempts should always be made to specify a logging * level.<br> * {@link org.apache.nutch.api.NutchServer} can be run as secure. restapi.auth property - * should be set to BASIC, DIGEST or SSL at <code>nutch-site.xml</code> to enable HTTP basic authentication, + * should be set to BASIC, DIGEST or SSL at nutch-site.xml to enable HTTP basic authentication, * digest authentication or SSL when communicating with RESTAPI. - * Set restapi.auth.username and restapi.auth.password properties at <code>nutch-site.xml</code> to configure + * Set restapi.auth.username and restapi.auth.password properties at nutch-site.xml to configure * credentials when BASIC or DIGEST authentication is used. * Set restapi.auth.ssl.storepath, restapi.auth.ssl.storepass and restapi.auth.ssl.keypass when SSL is used. * @@ -117,12 +117,14 @@ public NutchServer() { * 'INFO' however best attempts should always be made to specify a logging * level.<br> * {@link org.apache.nutch.api.NutchServer} can be run as secure. restapi.auth property - * should be set to BASIC, DIGEST or SSL at <code>nutch-site.xml</code> to enable HTTP basic authentication, + * should be set to BASIC, DIGEST or SSL at nutch-site.xml to enable HTTP basic authentication, * digest authentication or SSL when communicating with RESTAPI. - * Set restapi.auth.username and restapi.auth.password properties at <code>nutch-site.xml</code> to configure + * Set restapi.auth.username and restapi.auth.password properties at nutch-site.xml to configure * credentials when BASIC or DIGEST authentication is used. * Set restapi.auth.ssl.storepath, restapi.auth.ssl.storepass and restapi.auth.ssl.keypass when SSL is used. * + * @param ramConfManager {@link RAMConfManager} + * * @see org.apache.nutch.api.security.AuthenticationTypeEnum */ public NutchServer(RAMConfManager ramConfManager) { @@ -137,12 +139,15 @@ public NutchServer(RAMConfManager ramConfManager) { * 'INFO' however best attempts should always be made to specify a logging * level.<br> * {@link org.apache.nutch.api.NutchServer} can be run as secure. restapi.auth property - * should be set to BASIC, DIGEST or SSL at <code>nutch-site.xml</code> to enable HTTP basic authentication, + * should be set to BASIC, DIGEST or SSL at nutch-site.xml to enable HTTP basic authentication, * digest authentication or SSL when communicating with RESTAPI. - * Set restapi.auth.username and restapi.auth.password properties at <code>nutch-site.xml</code> to configure + * Set restapi.auth.username and restapi.auth.password properties at nutch-site.xml to configure * credentials when BASIC or DIGEST authentication is used. * Set restapi.auth.ssl.storepath, restapi.auth.ssl.storepass and restapi.auth.ssl.keypass when SSL is used. 
* + * @param ramConfManager {@link RAMConfManager} + * @param confId active configuration id + * * @see org.apache.nutch.api.security.AuthenticationTypeEnum */ public NutchServer(RAMConfManager ramConfManager, String confId) { @@ -305,7 +310,7 @@ public void start() { /** * Safety and convenience method to determine whether or not it is safe to * shut down the server. We make this assertion by consulting the - * {@link org.apache.nutch.api.NutchApp#jobManager} for a list of jobs with + * {@link #getJobMgr()} for a list of jobs with * {@link org.apache.nutch.api.model.response.JobInfo#state} equal to * 'RUNNING'. * @@ -356,8 +361,8 @@ public boolean stop(boolean force) { /** * Main method for NutchServer to run via command line. * - * @param args arguments for log level, stopping the Server and port. - * @throws Exception + * @param args arguments for log level, stopping the Server and port. + * @throws Exception exception */ public static void main(String[] args) throws Exception { CommandLineParser parser = new PosixParser(); diff --git a/src/java/org/apache/nutch/api/impl/RAMConfManager.java b/src/java/org/apache/nutch/api/impl/RAMConfManager.java index 13c05fd285..356a8bdd25 100644 --- a/src/java/org/apache/nutch/api/impl/RAMConfManager.java +++ b/src/java/org/apache/nutch/api/impl/RAMConfManager.java @@ -50,6 +50,9 @@ public RAMConfManager() { /** * Public constructor which accepts a configuration id and {@link Configuration} type configuration. + * + * @param confId configuration id + * @param configuration configuration */ public RAMConfManager(String confId, Configuration configuration) { configurations.put(confId, configuration); diff --git a/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java b/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java index 045f4cd777..8070c7b376 100755 --- a/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java +++ b/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java @@ -76,6 +76,7 @@ public void setConf(Configuration conf) { * @param url * URL of the page. * @param page + * {@link WebPage} object relative to the URL */ @Override public void initializeSchedule(String url, WebPage page) { @@ -104,13 +105,7 @@ public void setFetchSchedule(String url, WebPage page, long prevFetchTime, * @param url * URL of the page * @param page - * @return adjusted page information, including all original information. - * NOTE: this may be a different instance than - * @param datum - * , but implementations should make sure that it contains at least - * all information from - * @param datum - * . + * {@link WebPage} object relative to the URL */ @Override public void setPageGoneSchedule(String url, WebPage page, long prevFetchTime, @@ -134,6 +129,7 @@ public void setPageGoneSchedule(String url, WebPage page, long prevFetchTime, * @param url * URL of the page * @param page + * {@link WebPage} object relative to the URL * @param prevFetchTime * previous fetch time * @param prevModifiedTime @@ -163,15 +159,15 @@ public long calculateLastFetchTime(WebPage page) { * in the current fetchlist. NOTE: a true return value does not guarantee that * the page will be fetched, it just allows it to be included in the further * selection process based on scores. The default implementation checks - * fetchTime, if it is higher than the - * - * @param curTime - * it returns false, and true otherwise. 
It will also check that - * fetchTime is not too remote (more than maxIntervalfetchTime, if it is higher than the current time + * it returns false, and true otherwise. It will also check that + * fetchTime is not too remote (more than maxInterval), + * in which case it lowers the interval and returns true. + * * @param url * URL of the page * @param page + * {@link WebPage} object relative to the URL * @param curTime * reference time (usually set to the time when the fetchlist * generation process was started). @@ -200,6 +196,7 @@ public boolean shouldFetch(String url, WebPage page, long curTime) { * @param url * URL of the page * @param page + * {@link WebPage} object relative to the URL * @param asap * if true, force refetch as soon as possible - this sets the * fetchTime to now. If false, force refetch whenever the next fetch diff --git a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java index 1c2780a224..30c6ec7375 100755 --- a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java +++ b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java @@ -46,10 +46,8 @@ *

* NOTE: values of DEC_FACTOR and INC_FACTOR higher than 0.4f may destabilize * the algorithm, so that the fetch interval either increases or decreases - * infinitely, with little relevance to the page changes. Please use - * {@link #main(String[])} method to test the values before applying them in a - * production system. - *

+ * infinitely, with little relevance to the page changes. + * * * @author Andrzej Bialecki */ diff --git a/src/java/org/apache/nutch/crawl/FetchSchedule.java b/src/java/org/apache/nutch/crawl/FetchSchedule.java index eb896a6aec..8219a61f80 100755 --- a/src/java/org/apache/nutch/crawl/FetchSchedule.java +++ b/src/java/org/apache/nutch/crawl/FetchSchedule.java @@ -49,6 +49,7 @@ public interface FetchSchedule extends Configurable { * @param url * URL of the page. * @param page + * {@link WebPage} object relative to the URL */ public void initializeSchedule(String url, WebPage page); @@ -60,21 +61,16 @@ public interface FetchSchedule extends Configurable { * @param url * url of the page * @param page + * {@link WebPage} object relative to the URL * @param prevFetchTime * previous value of fetch time, or -1 if not available * @param prevModifiedTime * previous value of modifiedTime, or -1 if not available * @param fetchTime - * the latest time, when the page was recently re-fetched. Most - * FetchSchedule implementations should update the value in - * @param datum - * to something greater than this value. + * the latest time, when the page was recently re-fetched. * @param modifiedTime * last time the content was modified. This information comes from - * the protocol implementations, or is set to < 0 if not available. - * Most FetchSchedule implementations should update the value in - * @param datum - * to this value. + * the protocol implementations, or is set to < 0 if not available. * @param state * if {@link #STATUS_MODIFIED}, then the content is considered to be * "changed" before the fetchTime, if @@ -90,13 +86,10 @@ public void setFetchSchedule(String url, WebPage page, long prevFetchTime, /** * This method specifies how to schedule refetching of pages marked as GONE. - * Default implementation increases fetchInterval by 50%, and if it exceeds - * the maxInterval it calls - * {@link #forceRefetch(Text, CrawlDatum, boolean)}. - * * @param url * URL of the page * @param page + * {@link WebPage} object relative to the URL */ public void setPageGoneSchedule(String url, WebPage page, long prevFetchTime, long prevModifiedTime, long fetchTime); @@ -109,6 +102,7 @@ public void setPageGoneSchedule(String url, WebPage page, long prevFetchTime, * @param url * URL of the page * @param page + * {@link WebPage} object relative to the URL * @param prevFetchTime * previous fetch time * @param prevModifiedTime @@ -133,14 +127,8 @@ public void setPageRetrySchedule(String url, WebPage page, * selection process based on scores. The default implementation checks * fetchTime, if it is higher than the * - * @param curTime - * it returns false, and true otherwise. 
It will also check that - * fetchTime is not too remote (more than maxInterval getFields(Job job) { return fields; } - /** Generate a random batch id */ + /** + * Generates a random batch id + * + * @return random batch id + */ public static String randomBatchId() { long curTime = System.currentTimeMillis(); int randomSeed = Math.abs(new Random().nextInt()); @@ -173,6 +177,13 @@ public static String randomBatchId() { return batchId; } + /** + * Runs generator + * + * @param args map of arguments + * @return results + * @throws Exception + */ public Map run(Map args) throws Exception { String batchId = (String) args.get(Nutch.ARG_BATCH); if (batchId == null) { @@ -290,6 +301,13 @@ public String generate(long topN, long curTime, boolean filter, boolean norm, return batchId; } + /** + * Runs generator from commandline + * + * @param args arguments + * @return returns -1 + * @throws Exception + */ public int run(String[] args) throws Exception { if (args.length <= 0) { System.out diff --git a/src/java/org/apache/nutch/crawl/SignatureFactory.java b/src/java/org/apache/nutch/crawl/SignatureFactory.java index 15776340cb..8cf7471a26 100644 --- a/src/java/org/apache/nutch/crawl/SignatureFactory.java +++ b/src/java/org/apache/nutch/crawl/SignatureFactory.java @@ -40,7 +40,12 @@ public class SignatureFactory { private SignatureFactory() { } // no public ctor - /** Return the default Signature implementation. */ + /** + * Returns the default {@link Signature} implementation + * + * @param conf configuration + * @return default {@link Signature} implementation + */ public static Signature getSignature(Configuration conf) { String clazz = conf.get("db.signature.class", MD5Signature.class.getName()); ObjectCache objectCache = ObjectCache.get(conf); diff --git a/src/java/org/apache/nutch/crawl/TextProfileSignature.java b/src/java/org/apache/nutch/crawl/TextProfileSignature.java index 6d7e5e0163..f797b10a62 100644 --- a/src/java/org/apache/nutch/crawl/TextProfileSignature.java +++ b/src/java/org/apache/nutch/crawl/TextProfileSignature.java @@ -33,7 +33,7 @@ * An implementation of a page signature. It calculates an MD5 hash of a plain * text "profile" of a page. In case there is no text, it calculates a hash * using the {@link MD5Signature}. - *

+ * *

* The algorithm to calculate a page "profile" takes the plain text version of a * page and performs the following steps: diff --git a/src/java/org/apache/nutch/fetcher/FetcherJob.java b/src/java/org/apache/nutch/fetcher/FetcherJob.java index a7f3df8efc..015c209da3 100644 --- a/src/java/org/apache/nutch/fetcher/FetcherJob.java +++ b/src/java/org/apache/nutch/fetcher/FetcherJob.java @@ -79,19 +79,17 @@ public class FetcherJob extends NutchTool implements Tool { /** *

* Mapper class for Fetcher. - *

+ * *

* This class reads the random integer written by {@link GeneratorJob} as its * key while outputting the actual key and value arguments through a * {@link FetchEntry} instance. - *

+ * *

- * This approach (combined with the use of {@link PartitionUrlByHost}) makes - * sure that Fetcher is still polite while also randomizing the key order. If * one host has a huge number of URLs in your table while other hosts have * not, {@link FetcherReducer} will not be stuck on one host but process URLs * from other hosts as well. - *

+ * */ public static class FetcherMapper extends GoraMapper { @@ -246,7 +244,7 @@ private MapFieldValueFilter getBatchIdFilter(String batchId) { * number of threads per map task * @param shouldResume * @param numTasks - * number of fetching tasks (reducers). If set to < 1 then use the + * number of fetching tasks (reducers). If set to < 1 then use the * default, which is mapred.map.tasks. * @return 0 on success * @throws Exception @@ -266,7 +264,7 @@ public int fetch(String batchId, int threads, boolean shouldResume, * number of threads per map task * @param shouldResume * @param numTasks - * number of fetching tasks (reducers). If set to < 1 then use the + * number of fetching tasks (reducers). If set to < 1 then use the * default, which is mapred.map.tasks. * @param stmDetect * If set true, sitemap detection is run. diff --git a/src/java/org/apache/nutch/indexer/IndexUtil.java b/src/java/org/apache/nutch/indexer/IndexUtil.java index 6d1238228e..ddb6f0d650 100644 --- a/src/java/org/apache/nutch/indexer/IndexUtil.java +++ b/src/java/org/apache/nutch/indexer/IndexUtil.java @@ -42,7 +42,7 @@ public IndexUtil(Configuration conf) { } /** - * Index a {@link Webpage}, here we add the following fields: + * Index a {@link WebPage}, here we add the following fields: *
     *
     * 1. id: default uniqueKey for the {@link NutchDocument}.
     *
  3. digest: Digest is used to identify pages (like unique ID) and @@ -60,7 +60,7 @@ public IndexUtil(Configuration conf) { * @param key * The key of the page (reversed url). * @param page - * The {@link Webpage}. + * The {@link WebPage}. * @return The indexed document, or null if skipped by index filters. */ public NutchDocument index(String key, WebPage page) { diff --git a/src/java/org/apache/nutch/net/URLNormalizers.java b/src/java/org/apache/nutch/net/URLNormalizers.java index 03a0b855ca..1fc1df849e 100644 --- a/src/java/org/apache/nutch/net/URLNormalizers.java +++ b/src/java/org/apache/nutch/net/URLNormalizers.java @@ -51,30 +51,30 @@ * order). If there are more normalizers activated than explicitly named on this * list, the remaining ones will be run in random order after the ones specified * on the list are executed. - *

    + * *

    * You can define a set of contexts (or scopes) in which normalizers may be * called. Each scope can have its own list of normalizers (defined in - * "urlnormalizer.scope." property) and its own order (defined in - * "urlnormalizer.order." property). If any of these properties are + * "urlnormalizer.scope.<scope_name>" property) and its own order (defined in + * "urlnormalizer.order.<scope_name>" property). If any of these properties are * missing, default settings are used for the global scope. - *

    + * *

    * In case no normalizers are required for any given scope, a * org.apache.nutch.net.urlnormalizer.pass.PassURLNormalizer should * be used. - *

    + * *

    * Each normalizer may further select among many configurations, depending on * the scope in which it is called, because the scope name is passed as a * parameter to each normalizer. You can also use the same normalizer for many * scopes. - *

    + * *

    * Several scopes have been defined, and various Nutch tools will attempt using * scope-specific normalizers first (and fall back to default config if * scope-specific configuration is missing). - *

    + * *

    * Normalizers may be run several times, to ensure that modifications introduced * by normalizers at the end of the list can be further reduced by normalizers @@ -83,7 +83,7 @@ * want to run this loop up to the number of activated normalizers. This loop * count can be configured through urlnormalizer.loop.count property. * As soon as the url is unchanged the loop will stop and return the result. - *

    + * * * @author Andrzej Bialecki */ diff --git a/src/java/org/apache/nutch/parse/NutchSitemapParse.java b/src/java/org/apache/nutch/parse/NutchSitemapParse.java index c0a9d9b6ca..0e57339ba3 100644 --- a/src/java/org/apache/nutch/parse/NutchSitemapParse.java +++ b/src/java/org/apache/nutch/parse/NutchSitemapParse.java @@ -6,9 +6,9 @@ * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at - *

    + *

    * http://www.apache.org/licenses/LICENSE-2.0 - *

    + *

    * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. diff --git a/src/java/org/apache/nutch/parse/ParsePluginsReader.java b/src/java/org/apache/nutch/parse/ParsePluginsReader.java index dddd025163..b4c6f4e810 100644 --- a/src/java/org/apache/nutch/parse/ParsePluginsReader.java +++ b/src/java/org/apache/nutch/parse/ParsePluginsReader.java @@ -69,12 +69,10 @@ public ParsePluginsReader() { /** * Reads the parse-plugins.xml file and returns the - * {@link #ParsePluginList} defined by it. + * {@link ParsePluginList} defined by it. * - * @return A {@link #ParsePluginList} specified by the + * @return A {@link ParsePluginList} specified by the * parse-plugins.xml file. - * @throws Exception - * If any parsing error occurs. */ public ParsePluginList parse(Configuration conf) { diff --git a/src/java/org/apache/nutch/parse/Parser.java b/src/java/org/apache/nutch/parse/Parser.java index b623fd0262..9a8c2b7bf0 100644 --- a/src/java/org/apache/nutch/parse/Parser.java +++ b/src/java/org/apache/nutch/parse/Parser.java @@ -34,7 +34,7 @@ public interface Parser extends FieldPluggable, Configurable { /** *

 * This method parses content in a WebPage instance - *

    + * * * @param url * Page's URL diff --git a/src/java/org/apache/nutch/parse/ParserChecker.java b/src/java/org/apache/nutch/parse/ParserChecker.java index 12faeae90a..4d5c572968 100644 --- a/src/java/org/apache/nutch/parse/ParserChecker.java +++ b/src/java/org/apache/nutch/parse/ParserChecker.java @@ -48,13 +48,13 @@ * is used to remove duplicates during the dedup procedure. It is calculated * using {@link org.apache.nutch.crawl.MD5Signature} or * {@link org.apache.nutch.crawl.TextProfileSignature}.
- * Version: From {@link org.apache.nutch.parse.ParseData}.
- * Status: From {@link org.apache.nutch.parse.ParseData}.
+ * Version: From org.apache.nutch.parse.ParseData.
+ * Status: From org.apache.nutch.parse.ParseData.
  * Title: of the URL
  * Outlinks: associated with the URL
  * Content Metadata: such as X-AspNet-Version, Date,
  * Content-length, servedBy, Content-Type,
  * Cache-Control, etc.
  * Parse Metadata: such as CharEncodingForConversion,
  * OriginalCharEncoding, language, etc.
  *
  21. ParseText: The page parse text which varies in length depdnecing diff --git a/src/java/org/apache/nutch/plugin/PluginRepository.java b/src/java/org/apache/nutch/plugin/PluginRepository.java index 346ae310de..6486f63f53 100644 --- a/src/java/org/apache/nutch/plugin/PluginRepository.java +++ b/src/java/org/apache/nutch/plugin/PluginRepository.java @@ -59,8 +59,10 @@ public class PluginRepository { .getLogger(PluginRepository.class); /** - * @throws PluginRuntimeException - * @see java.lang.Object#Object() + * Pluging repository constructor + * + * @param conf Configuration + * @throws RuntimeException */ public PluginRepository(Configuration conf) throws RuntimeException { fActivatedPlugins = new HashMap(); diff --git a/src/java/org/apache/nutch/scoring/ScoringFilter.java b/src/java/org/apache/nutch/scoring/ScoringFilter.java index 8c06ef6cb0..17c6350fc8 100644 --- a/src/java/org/apache/nutch/scoring/ScoringFilter.java +++ b/src/java/org/apache/nutch/scoring/ScoringFilter.java @@ -72,8 +72,8 @@ public void initialScore(String url, WebPage page) * * @param url * url of the page - * @param datum - * page row. Modifications will be persisted. + * @param page + * {@link WebPage} object relative to the URL * @param initSort * initial sort value, or a value from previous filters in chain */ @@ -85,13 +85,8 @@ public float generatorSortValue(String url, WebPage page, float initSort) * * @param fromUrl * url of the source page - * @param row - * page row * @param scoreData - * A list of {@link OutlinkedScoreDatum}s for every outlink. These - * {@link OutlinkedScoreDatum}s will be passed to - * {@link #updateScore(String, OldWebTableRow, List)} for every - * outlinked URL. + * A list of {@link ScoreDatum} * @param allCount * number of all collected outlinks from the source page * @throws ScoringFilterException @@ -106,9 +101,9 @@ public void distributeScoreToOutlinks(String fromUrl, WebPage page, * * @param url * url of the page - * @param page - * @param inlinked - * list of {@link OutlinkedScoreDatum}s for all inlinks pointing to + * @param page {@link WebPage} object relative to the URL + * @param inlinkedScoreData + * list of {@link ScoreDatum}s for all inlinks pointing to * this URL. * @throws ScoringFilterException */ @@ -124,8 +119,6 @@ public void updateScore(String url, WebPage page, * document. NOTE: this already contains all information collected by * indexing filters. Implementations may modify this instance, in * order to store/remove some information. - * @param row - * page row * @param initScore * initial boost value for the Lucene document. * @return boost value for the Lucene document. This value is passed as an diff --git a/src/java/org/apache/nutch/storage/StorageUtils.java b/src/java/org/apache/nutch/storage/StorageUtils.java index b68e8f84fe..e82a3c505b 100644 --- a/src/java/org/apache/nutch/storage/StorageUtils.java +++ b/src/java/org/apache/nutch/storage/StorageUtils.java @@ -82,7 +82,7 @@ public static DataStore createWebStore( /** * Return the Persistent Gora class used to persist Nutch Web data. * - * @param the + * @param conf * Nutch configuration * @return the Gora DataStore persistent class * @throws ClassNotFoundException diff --git a/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java b/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java index d3f9799be4..7f36b52d81 100644 --- a/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java +++ b/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java @@ -38,21 +38,21 @@ *

 * The ArcRecordReader class provides a record reader which reads * records from arc files. - *

    + * * *

    * Arc files are essentially tars of gzips. Each record in an arc file is a * compressed gzip. Multiple records are concatenated together to form a - * complete arc. For more information on the arc file format see {@link http - * ://www.archive.org/web/researcher/ArcFileFormat.php}. - *

    - * + * complete arc. For more information on the arc file format see + * + * http://www.archive.org/web/researcher/ArcFileFormat.php. + * *

    * Arc files are used by the internet archive and grub projects. - *

    + * * - * @see http://www.archive.org/ - * @see http://www.grub.org/ + * @see http://www.archive.org/ + * @see http://www.grub.org/ */ public class ArcRecordReader implements RecordReader { @@ -72,7 +72,7 @@ public class ArcRecordReader implements RecordReader { /** *

    * Returns true if the byte array passed matches the gzip header magic number. - *

    + * * * @param input * The byte array to check. @@ -174,7 +174,7 @@ public float getProgress() throws IOException { * Returns true if the next record in the split is read into the key and value * pair. The key will be the arc record header and the values will be the raw * content bytes of the arc record. - *

    + * * * @param key * The record key diff --git a/src/java/org/apache/nutch/util/Bytes.java b/src/java/org/apache/nutch/util/Bytes.java index db9f4689c9..043a89761d 100644 --- a/src/java/org/apache/nutch/util/Bytes.java +++ b/src/java/org/apache/nutch/util/Bytes.java @@ -980,7 +980,7 @@ public static long readVLong(final byte[] buffer, final int offset) * left operand * @param right * right operand - * @return 0 if equal, < 0 if left is less than right, etc. + * @return 0 if equal, < 0 if left is less than right, etc. */ public static int compareTo(final byte[] left, final byte[] right) { return compareTo(left, 0, left.length, right, 0, right.length); @@ -1001,7 +1001,7 @@ public static int compareTo(final byte[] left, final byte[] right) { * How much to compare from the left buffer * @param length2 * How much to compare from the right buffer - * @return 0 if equal, < 0 if left is less than right, etc. + * @return 0 if equal, < 0 if left is less than right, etc. */ public static int compareTo(byte[] buffer1, int offset1, int length1, byte[] buffer2, int offset2, int length2) { @@ -1050,7 +1050,7 @@ public static boolean startsWith(byte[] bytes, byte[] prefix) { * @return Runs {@link WritableComparator#hashBytes(byte[], int)} on the * passed in array. This method is what * {@link org.apache.hadoop.io.Text} and - * {@link ImmutableBytesWritable} use calculating hash code. + * org.apache.hadoop.hbase.io.ImmutableBytesWritable use calculating hash code. */ public static int hashCode(final byte[] b) { return hashCode(b, b.length); @@ -1064,7 +1064,7 @@ public static int hashCode(final byte[] b) { * @return Runs {@link WritableComparator#hashBytes(byte[], int)} on the * passed in array. This method is what * {@link org.apache.hadoop.io.Text} and - * {@link ImmutableBytesWritable} use calculating hash code. + * org.apache.hadoop.hbase.io.ImmutableBytesWritable use calculating hash code. */ public static int hashCode(final byte[] b, final int length) { return WritableComparator.hashBytes(b, length); @@ -1366,12 +1366,12 @@ else if (cmp < 0) * given amount. * * @param value - * - array of bytes containing long (length <= SIZEOF_LONG) + * - array of bytes containing long (length <= SIZEOF_LONG) * @param amount * value will be incremented on (deincremented if negative) * @return array of bytes containing incremented long (length == SIZEOF_LONG) * @throws IOException - * - if value.length > SIZEOF_LONG + * - if value.length > SIZEOF_LONG */ public static byte[] incrementBytes(byte[] value, long amount) throws IOException { diff --git a/src/java/org/apache/nutch/util/EncodingDetector.java b/src/java/org/apache/nutch/util/EncodingDetector.java index 5b40e29e5d..25f8eefccd 100644 --- a/src/java/org/apache/nutch/util/EncodingDetector.java +++ b/src/java/org/apache/nutch/util/EncodingDetector.java @@ -45,7 +45,7 @@ *
 * Taking a set of clues and making a "best guess" as to the "real"
 * encoding.
- *

+ * * *

* A caller will often have some extra information about what the encoding might @@ -56,7 +56,7 @@ *

 * Run step (1) to generate a set of auto-detected clues;
 * Combine these clues with the caller-dependent "extra clues" available;
 * Run step (2) to guess what the most probable answer is.
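// Illustrative sketch, not part of this patch: the two-step flow described above.
// EncodingDetector(Configuration), autoDetectClues(page, filter) and
// guessEncoding(page, defaultValue) are assumed names inferred from the
// surrounding hunks; only addClue(String, String) is visible verbatim here.
EncodingDetector detector = new EncodingDetector(conf);         // conf: a Hadoop Configuration
detector.autoDetectClues(page, true);                           // step (1): clues sniffed from the fetched content
detector.addClue("utf-8", "header");                            // caller-supplied "extra clue", e.g. from a Content-Type header
String encoding = detector.guessEncoding(page, "windows-1252"); // step (2): best guess, falling back to the default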

    + * */ public class EncodingDetector { @@ -211,9 +211,7 @@ public void addClue(String value, String source) { /** * Guess the encoding with the previously specified list of clues. - * - * @param row - * URL's row + * * @param defaultValue * Default encoding to return if no encoding can be detected with * enough confidence. Note that this will not be normalized @@ -340,7 +338,7 @@ public static String resolveEncodingAlias(String encoding) { /** * Parse the character encoding from the specified content type header. If the * content type is null, or there is no explicit character encoding, - * null is returned.
    + * null is returned.

    * This method was copied from org.apache.catalina.util.RequestUtil, which is * licensed under the Apache License, Version 2.0 (the "License"). * diff --git a/src/java/org/apache/nutch/util/MimeUtil.java b/src/java/org/apache/nutch/util/MimeUtil.java index 198fdee596..241087c91a 100644 --- a/src/java/org/apache/nutch/util/MimeUtil.java +++ b/src/java/org/apache/nutch/util/MimeUtil.java @@ -50,7 +50,7 @@ * substrate library, Apache * Tika. Any mime handling code should be placed in this utility * class, and hidden from the Nutch classes that rely on it. - *

    + * */ public final class MimeUtil { @@ -229,7 +229,7 @@ public String autoResolveContentType(String typeName, String url, byte[] data) { * method. * * @param url - * A string representation of the document {@link URL} to sense the + * A string representation of the document. URL to sense the * {@link MimeType} for. * @return An appropriate {@link MimeType}, identified from the given Document * url in string form. diff --git a/src/java/org/apache/nutch/util/NodeWalker.java b/src/java/org/apache/nutch/util/NodeWalker.java index 16e84c3598..3e0b0e1827 100644 --- a/src/java/org/apache/nutch/util/NodeWalker.java +++ b/src/java/org/apache/nutch/util/NodeWalker.java @@ -27,12 +27,12 @@ * of recursion. As the node tree is walked the next node is popped off of the * stack and all of its children are automatically added to the stack to be * called in tree order. - *

    + * * *

    * Currently this class is not thread safe. It is assumed that only one thread * will be accessing the NodeWalker at any given time. - *

    + * */ public class NodeWalker { @@ -58,7 +58,7 @@ public NodeWalker(Node rootNode) { * children onto the stack, allowing us to walk the node tree without the use * of recursion. If there are no more nodes on the stack then null is * returned. - *

    + * * * @return Node The next Node on the stack or null if there isn't * a next node. @@ -90,12 +90,12 @@ public Node nextNode() { * When getting a next node from the walker, that node's children are * automatically added to the stack. You can call this method to remove those * children from the stack. - *

    + * * *

    * This is useful when you don't want to process deeper into the current path * of the node tree but you want to continue processing sibling nodes. - *
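// Illustrative sketch, not part of this patch: a typical traversal with NodeWalker,
// relying only on the behaviour documented above (nextNode() returns null once the
// stack is exhausted, skipChildren() prunes the current node's subtree).
org.w3c.dom.Node root = doc.getDocumentElement();   // doc: an org.w3c.dom.Document parsed elsewhere
NodeWalker walker = new NodeWalker(root);
org.w3c.dom.Node current;
while ((current = walker.nextNode()) != null) {
  if ("script".equalsIgnoreCase(current.getNodeName())) {
    walker.skipChildren();                          // continue with siblings, not <script> children
    continue;
  }
  // process 'current' here
}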

    + * * */ public void skipChildren() { diff --git a/src/java/org/apache/nutch/util/NutchJob.java b/src/java/org/apache/nutch/util/NutchJob.java index c0456c150c..029e7aece7 100644 --- a/src/java/org/apache/nutch/util/NutchJob.java +++ b/src/java/org/apache/nutch/util/NutchJob.java @@ -63,7 +63,7 @@ public NutchJob(Configuration conf, String jobName) throws IOException { * Creates a new {@link NutchJob} with no particular {@link org.apache.hadoop.mapreduce.Cluster} and a * given {@link org.apache.hadoop.conf.Configuration}. * - * The NutchJob makes a copy of the Configuration so + * The NutchJob makes a copy of the Configuration so * that any necessary internal modifications do not reflect on the incoming * parameter. * @@ -87,7 +87,7 @@ public static NutchJob getInstance(Configuration conf) throws IOException { * and a given jobName. * A Cluster will be created from the conf parameter only when it's needed. * - * The NutchJob makes a copy of the Configuration so + * The NutchJob makes a copy of the Configuration so * that any necessary internal modifications do not reflect on the incoming * parameter. * diff --git a/src/java/org/apache/nutch/util/NutchTool.java b/src/java/org/apache/nutch/util/NutchTool.java index 1f5789a608..443d1da681 100644 --- a/src/java/org/apache/nutch/util/NutchTool.java +++ b/src/java/org/apache/nutch/util/NutchTool.java @@ -36,11 +36,19 @@ public abstract class NutchTool extends Configured { /** * Runs the tool, using a map of arguments. May return results, or null. + * + * @param args map of arguments + * @return results or null + * @throws Exception */ public abstract Map run(Map args) throws Exception; - /** Returns relative progress of the tool, a float in range [0,1]. */ + /** + * Returns relative progress of the tool, a float in range [0,1] + * + * @return relative progress of the tool, a float in range [0,1] + */ public float getProgress() { float res = 0; if (currentJob != null) { @@ -62,7 +70,11 @@ public float getProgress() { return res; } - /** Returns current status of the running tool. */ + /** + * Returns current status of the running tool + * + * @return current status of the running tool + */ public Map getStatus() { return status; } @@ -72,6 +84,7 @@ public Map getStatus() { * this, since by default it calls {@link #killJob()}. * * @return true if succeeded, false otherwise + * @throws Exception */ public boolean stopJob() throws Exception { return killJob(); diff --git a/src/java/org/apache/nutch/util/PrefixStringMatcher.java b/src/java/org/apache/nutch/util/PrefixStringMatcher.java index e323b67531..6ca48c8b71 100644 --- a/src/java/org/apache/nutch/util/PrefixStringMatcher.java +++ b/src/java/org/apache/nutch/util/PrefixStringMatcher.java @@ -70,8 +70,8 @@ public boolean matches(String input) { } /** - * Returns the shortest prefix of input that is matched, - * or null if no match exists. + * Returns the shortest prefix of input that is matched, + * or null if no match exists. */ public String shortestMatch(String input) { TrieNode node = root; @@ -86,8 +86,8 @@ public String shortestMatch(String input) { } /** - * Returns the longest prefix of input that is matched, - * or null if no match exists. + * Returns the longest prefix of input that is matched, + * or null if no match exists. 
*/ public String longestMatch(String input) { TrieNode node = root; diff --git a/src/java/org/apache/nutch/util/SuffixStringMatcher.java b/src/java/org/apache/nutch/util/SuffixStringMatcher.java index a967c0177f..6e070b935c 100644 --- a/src/java/org/apache/nutch/util/SuffixStringMatcher.java +++ b/src/java/org/apache/nutch/util/SuffixStringMatcher.java @@ -65,8 +65,8 @@ public boolean matches(String input) { } /** - * Returns the shortest suffix of input that is matched, - * or null if no match exists. + * Returns the shortest suffix of input that is matched, + * or null if no match exists. */ public String shortestMatch(String input) { TrieNode node = root; @@ -81,8 +81,8 @@ public String shortestMatch(String input) { } /** - * Returns the longest suffix of input that is matched, - * or null if no match exists. + * Returns the longest suffix of input that is matched, + * or null if no match exists. */ public String longestMatch(String input) { TrieNode node = root; diff --git a/src/java/org/apache/nutch/util/TableUtil.java b/src/java/org/apache/nutch/util/TableUtil.java index 68ded699fe..e6ccbbc476 100644 --- a/src/java/org/apache/nutch/util/TableUtil.java +++ b/src/java/org/apache/nutch/util/TableUtil.java @@ -33,7 +33,7 @@ public class TableUtil { * E.g. "http://bar.foo.com:8983/to/index.html?a=b" becomes * "com.foo.bar:8983:http/to/index.html?a=b". * - * @param url + * @param urlString * url to be reversed * @return Reversed url * @throws MalformedURLException @@ -111,7 +111,7 @@ public static String unreverseUrl(String reversedUrl) { /** * Given a reversed url, returns the reversed host E.g - * "com.foo.bar:http:8983/to/index.html?a=b" -> "com.foo.bar" + * "com.foo.bar:http:8983/to/index.html?a=b" -> "com.foo.bar" * * @param reversedUrl * Reversed url diff --git a/src/java/org/apache/nutch/util/TimingUtil.java b/src/java/org/apache/nutch/util/TimingUtil.java index 524bee6ff6..497716c4bf 100644 --- a/src/java/org/apache/nutch/util/TimingUtil.java +++ b/src/java/org/apache/nutch/util/TimingUtil.java @@ -32,7 +32,7 @@ public class TimingUtil { * @param end * The end of the time period * @return a string of the form "XhYmZs" when the elapsed time is X hours, Y - * minutes and Z seconds or null if start > end. + * minutes and Z seconds or null if start > end. */ public static String elapsedTime(long start, long end) { if (start > end) { diff --git a/src/java/org/apache/nutch/util/TrieStringMatcher.java b/src/java/org/apache/nutch/util/TrieStringMatcher.java index 95f06ad6f6..e7773cb668 100644 --- a/src/java/org/apache/nutch/util/TrieStringMatcher.java +++ b/src/java/org/apache/nutch/util/TrieStringMatcher.java @@ -186,15 +186,15 @@ protected final void addPatternBackward(String s) { public abstract boolean matches(String input); /** - * Returns the shortest substring of input that is - * matched by a pattern in the trie, or null if no match + * Returns the shortest substring of input that is + * matched by a pattern in the trie, or null if no match * exists. */ public abstract String shortestMatch(String input); /** - * Returns the longest substring of input that is - * matched by a pattern in the trie, or null if no match + * Returns the longest substring of input that is + * matched by a pattern in the trie, or null if no match * exists. 
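// Illustrative sketch, not part of this patch: typical use of the matchers documented
// above. The String[]-based constructors are an assumption; only matches(),
// shortestMatch() and longestMatch() appear in the surrounding javadoc.
PrefixStringMatcher prefixMatcher = new PrefixStringMatcher(new String[] { "http://", "https://" });
SuffixStringMatcher suffixMatcher = new SuffixStringMatcher(new String[] { ".html", ".htm" });
String url = "https://example.org/index.html";
boolean isWebUrl = prefixMatcher.matches(url);            // true
String scheme = prefixMatcher.longestMatch(url);          // "https://"
String extension = suffixMatcher.shortestMatch(url);      // ".html"; null if no suffix pattern matched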
*/ public abstract String longestMatch(String input); diff --git a/src/java/org/apache/nutch/util/URLUtil.java b/src/java/org/apache/nutch/util/URLUtil.java index 5183ba10b0..e1df9e3604 100644 --- a/src/java/org/apache/nutch/util/URLUtil.java +++ b/src/java/org/apache/nutch/util/URLUtil.java @@ -219,52 +219,49 @@ public static String[] getHostBatches(String url) * Yahoo! Slurp crawler described here:
    * How - * does the Yahoo! webcrawler handle redirects?
- * 1. Choose target url if either url is malformed.
- * 2. If different domains the keep the destination whether or not the
- *    redirect is temp or perm
- *       a.com -> b.com*
- * 3. If the redirect is permanent and the source is root, keep the source.
- *       *a.com -> a.com?y=1 || *a.com -> a.com/xyz/index.html
- * 4. If the redirect is permanent and the source is not root and the
- *    destination is root, keep the destination
- *       a.com/xyz/index.html -> a.com*
- * 5. If the redirect is permanent and neither the source nor the destination
- *    is root, then keep the destination
- *       a.com/xyz/index.html -> a.com/abc/page.html*
- * 6. If the redirect is temporary and source is root and destination is not
- *    root, then keep the source
- *       *a.com -> a.com/xyz/index.html
- * 7. If the redirect is temporary and source is not root and destination is
- *    root, then keep the destination
- *       a.com/xyz/index.html -> a.com*
- * 8. If the redirect is temporary and neither the source or the destination
- *    is root, then keep the shortest url. First check for the shortest host, and
- *    if both are equal then check by path. Path is first by length then by the
- *    number of / path separators.
- *       a.com/xyz/index.html -> a.com/abc/page.html*
- *       *www.a.com/xyz/index.html -> www.news.a.com/xyz/index.html
- * 9. If the redirect is temporary and both the source and the destination
- *    are root, then keep the shortest sub-domain
- *       *www.a.com -> www.news.a.com
+ *
+ * Choose target url if either url is malformed.
+ *
+ * If different domains the keep the destination whether or not the
+ * redirect is temp or perm
+ *    a.com -> b.com*
+ *
+ * If the redirect is permanent and the source is root, keep the source.
+ *    *a.com -> a.com?y=1 || *a.com -> a.com/xyz/index.html
+ *
+ * If the redirect is permanent and the source is not root and the
+ * destination is root, keep the destination
+ *    a.com/xyz/index.html -> a.com*
+ *
+ * If the redirect is permanent and neither the source nor the destination
+ * is root, then keep the destination
+ *    a.com/xyz/index.html -> a.com/abc/page.html*
+ *
+ * If the redirect is temporary and source is root and destination is not
+ * root, then keep the source
+ *    *a.com -> a.com/xyz/index.html
+ *
+ * If the redirect is temporary and source is not root and destination is
+ * root, then keep the destination
+ *    a.com/xyz/index.html -> a.com*
+ *
+ * If the redirect is temporary and neither the source or the destination
+ * is root, then keep the shortest url. First check for the shortest host, and
+ * if both are equal then check by path. Path is first by length then by the
+ * number of / path separators.
+ *    a.com/xyz/index.html -> a.com/abc/page.html*
+ *    *www.a.com/xyz/index.html -> www.news.a.com/xyz/index.html
+ *
+ * If the redirect is temporary and both the source and the destination
+ * are root, then keep the shortest sub-domain
+ *    *www.a.com -> www.news.a.com
+ *
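// Illustrative sketch, not part of this patch: exercising the redirect rules above.
// In Nutch this logic is commonly exposed through a helper along the lines of
// URLUtil.chooseRepr(src, dst, temp); that name and signature are an assumption
// here, not something this hunk states.
String src = "http://a.com/xyz/index.html";
String dst = "http://a.com/";
boolean temporary = false;                                // permanent redirect
// Permanent redirect, source is not root, destination is root
// -> per the rules above the destination is kept as the representative url.
String repr = URLUtil.chooseRepr(src, dst, temporary);    // expected: "http://a.com/"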
      * While not in this logic there is a further piece of representative url * logic that occurs during indexing and after scoring. During creation of the diff --git a/src/java/org/apache/nutch/util/domain/DomainSuffix.java b/src/java/org/apache/nutch/util/domain/DomainSuffix.java index ae03ec4eea..0e0b7b02fc 100644 --- a/src/java/org/apache/nutch/util/domain/DomainSuffix.java +++ b/src/java/org/apache/nutch/util/domain/DomainSuffix.java @@ -23,17 +23,16 @@ * name of a host. The domain name of a host is defined to be the last part * before the domain suffix, w/o subdomain names. As an example the domain name * of
      - * http://lucene.apache.org/ + * http://lucene.apache.org/ *
      * is apache.org
      * This class holds three fields, domain field represents the * suffix (such as "co.uk") boost is a float for boosting score * of url's with this suffix status field represents domain's - * status + * status. Check also domain-suffixes.xml * * @author Enis Soztutar <enis.soz.nutch@gmail.com> * @see TopLevelDomain - * @see domain-suffixes.xml */ public class DomainSuffix { diff --git a/src/java/org/apache/nutch/util/domain/TopLevelDomain.java b/src/java/org/apache/nutch/util/domain/TopLevelDomain.java index 6386335e06..87e370e5af 100644 --- a/src/java/org/apache/nutch/util/domain/TopLevelDomain.java +++ b/src/java/org/apache/nutch/util/domain/TopLevelDomain.java @@ -24,8 +24,8 @@ * top-level domain is com. * * @author Enis Soztutar <enis.soz.nutch@gmail.com> - * @see http://www.iana.org/ - * @see http://en.wikipedia.org/wiki/Top-level_domain + * @see http://www.iana.org/ + * @see http://en.wikipedia.org/wiki/Top-level_domain */ public class TopLevelDomain extends DomainSuffix { diff --git a/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java b/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java index 9df4e2724c..0751ddc1be 100644 --- a/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java +++ b/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java @@ -64,7 +64,7 @@ *

      * A new RSS/ATOM Feed{@link Parser} that rapidly parses all referenced links * and content present in the feed. - *

      + * * */ public class FeedParser implements Parser { diff --git a/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java b/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java index 9e2e75bee1..7e0e24688b 100644 --- a/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java +++ b/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java @@ -36,7 +36,7 @@ * Indexing filter that offers an option to either index all inbound anchor text * for a document or deduplicate anchors. Deduplication does have it's con's, * - * @see {@code anchorIndexingFilter.deduplicate} in nutch-default.xml. + * Check {@code anchorIndexingFilter.deduplicate} in nutch-default.xml. */ public class AnchorIndexingFilter implements IndexingFilter { diff --git a/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java b/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java index fdd3b8120f..a97e9edb29 100644 --- a/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java +++ b/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java @@ -37,7 +37,7 @@ * Indexer which can be configured to extract metadata from the crawldb, parse * metadata or content metadata. You can specify the properties "index.db", * "index.parse" or "index.content" who's values are comma-delimited - * key1,key2,key3. + * <value>key1,key2,key3</value>. */ public class MetadataIndexer implements IndexingFilter { diff --git a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java b/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java index b1d99e5ed6..9171b1cebb 100644 --- a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java +++ b/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java @@ -148,14 +148,14 @@ private NutchDocument addLength(NutchDocument doc, WebPage page, String url) { * primaryType and subType to field "type" as un-stored, indexed and * un-tokenized, so that search results can be confined by contentType or its * primaryType or its subType. - *

      + * *

      * For example, if contentType is application/vnd.ms-powerpoint, search can be * done with one of the following qualifiers * type:application/vnd.ms-powerpoint type:application type:vnd.ms-powerpoint * all case insensitive. The query filter is implemented in * {@link TypeQueryFilter}. - *

      + * * * @param doc * @param data diff --git a/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java b/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java index f3af6a987f..064cd8db2a 100644 --- a/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java +++ b/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java @@ -84,12 +84,19 @@ public class HTMLLanguageParser implements ParseFilter { /** * Scan the HTML document looking at possible indications of content language
- *
- * 1. html lang attribute
- * (http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1)
- * 2. meta dc.language
- * (http://dublincore.org/documents/2000/07/16/usageguide/qualified-html.shtml#language)
- * 3. meta http-equiv (content-language)
- * (http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2)
+ *
+ * 1. html lang attribute
+ * (http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1)
+ *
+ * 2. meta dc.language
+ * (http://dublincore.org/documents/2000/07/16/usageguide/qualified-html.shtml#language)
+ *
+ * 3. meta http-equiv (content-language)
+ * (http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2)
+ *
      */ public Parse filter(String url, WebPage page, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc) { diff --git a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java index d374e95a35..a1475a71c7 100644 --- a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java +++ b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java @@ -44,19 +44,13 @@ * expressions. * *

      - * The regular expressions rules are expressed in a file. The file of rules is - * provided by each implementation using the - * {@link #getRulesFile(Configuration)} method. - *

      - * - *

      - * The format of this file is made of many rules (one per line):
      + * The format of this file is made of many rules (one per line):
      * * [+-]<regex> - *
      + *

      * where plus (+)means go ahead and index it and minus ( * -)means no. - *

      + * */ public abstract class RegexURLFilterBase implements URLFilter { diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java index 31b54dab3d..6bd430555c 100644 --- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java +++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java @@ -174,7 +174,7 @@ protected void append(Node newNode) throws org.xml.sax.SAXException { * supply a locator: if it does so, it must supply the locator to the * application by invoking this method before invoking any of the other * methods in the ContentHandler interface. - *

      + * * *

      * The locator allows the application to determine the end position of any @@ -183,13 +183,13 @@ protected void append(Node newNode) throws org.xml.sax.SAXException { * errors (such as character content that does not match an application's * business rules). The information returned by the locator is probably not * sufficient for use with a search engine. - *

      + * * *

      * Note that the locator will return correct information only during the * invocation of the events in this interface. The application should not * attempt to use it at any other time. - *

      + * * * @param locator * An object that can return the location of any SAX document event. @@ -206,7 +206,7 @@ public void setDocumentLocator(Locator locator) { *

      * The SAX parser will invoke this method only once, before any other methods * in this interface or in DTDHandler (except for setDocumentLocator). - *

      + * */ public void startDocument() throws org.xml.sax.SAXException { @@ -221,7 +221,7 @@ public void startDocument() throws org.xml.sax.SAXException { * method invoked during the parse. The parser shall not invoke this method * until it has either abandoned parsing (because of an unrecoverable error) * or reached the end of input. - *

      + * */ public void endDocument() throws org.xml.sax.SAXException { @@ -237,14 +237,14 @@ public void endDocument() throws org.xml.sax.SAXException { * startElement() event (even when the element is empty). All of the element's * content will be reported, in order, before the corresponding endElement() * event. - *

      + * * *

      * If the element name has a namespace prefix, the prefix will still be * attached. Note that the attribute list provided will contain only * attributes with explicit values (specified or defaulted): #IMPLIED * attributes will be omitted. - *

      + * * * * @param ns @@ -328,12 +328,12 @@ public void startElement(String ns, String localName, String name, * The SAX parser will invoke this method at the end of every element in the * XML document; there will be a corresponding startElement() event for every * endElement() event (even when the element is empty). - *

      + * * *

      * If the element name has a namespace prefix, the prefix will still be * attached to the name. - *

      + * * * * @param ns @@ -371,18 +371,18 @@ public void setIDAttribute(String id, Element elem) { * they may split it into several chunks; however, all of the characters in * any single event must come from the same external entity, so that the * Locator provides useful information. - *

      + * * *

      * The application must not attempt to read from the array outside of the * specified range. - *

      + * * *

      * Note that some parsers will report whitespace using the * ignorableWhitespace() method rather than this one (validating parsers must * do so). - *

      + * * * @param ch * The characters from the XML document. @@ -489,19 +489,19 @@ public void entityReference(String name) throws org.xml.sax.SAXException { * whitespace (see the W3C XML 1.0 recommendation, section 2.10): * non-validating parsers may also use this method if they are capable of * parsing and using content models. - *

      + * * *

      * SAX parsers may return all contiguous whitespace in a single chunk, or they * may split it into several chunks; however, all of the characters in any * single event must come from the same external entity, so that the Locator * provides useful information. - *

      + * * *

      * The application must not attempt to read from the array outside of the * specified range. - *

      + * * * @param ch * The characters from the XML document. @@ -539,12 +539,12 @@ private boolean isOutsideDocElem() { * The Parser will invoke this method once for each processing instruction * found: note that processing instructions may occur before or after the main * document element. - *

      + * * *

      * A SAX parser should never report an XML declaration (XML 1.0, section 2.8) * or a text declaration (XML 1.0, section 4.3.1) using this method. - *

      + * * * @param target * The processing instruction target. @@ -608,18 +608,18 @@ public void endCDATA() throws org.xml.sax.SAXException { * they may split it into several chunks; however, all of the characters in * any single event must come from the same external entity, so that the * Locator provides useful information. - *

      + * * *

      * The application must not attempt to read from the array outside of the * specified range. - *

      + * * *

      * Note that some parsers will report whitespace using the * ignorableWhitespace() method rather than this one (validating parsers must * do so). - *

      + * * * @param ch * The characters from the XML document. @@ -687,14 +687,14 @@ public void endDTD() throws org.xml.sax.SAXException { * processing: the SAX XML reader will automatically replace prefixes for * element and attribute names when the http://xml.org/sax/features/namespaces * feature is true (the default). - *

      + * * *

      * There are cases, however, when applications need to use prefixes in * character data or in attribute values, where they cannot safely be expanded * automatically; the start/endPrefixMapping event supplies the information to * the application to expand prefixes in those contexts itself, if necessary. - *

      + * * *

      * Note that start/endPrefixMapping events are not guaranteed to be properly @@ -702,7 +702,7 @@ public void endDTD() throws org.xml.sax.SAXException { * before the corresponding startElement event, and all endPrefixMapping * events will occur after the corresponding endElement event, but their order * is not guaranteed. - *

      + * * * @param prefix * The Namespace prefix being declared. @@ -735,7 +735,7 @@ public void startPrefixMapping(String prefix, String uri) * See startPrefixMapping for details. This event will always occur after the * corresponding endElement event, but the order of endPrefixMapping events is * not otherwise guaranteed. - *

      + * * * @param prefix * The prefix that was being mapping. @@ -755,7 +755,7 @@ public void endPrefixMapping(String prefix) throws org.xml.sax.SAXException { * DTD subset). All processors may skip external entities, depending on the * values of the http://xml.org/sax/features/external-general-entities and the * http://xml.org/sax/features/external-parameter-entities properties. - *

      + * * * @param name * The name of the skipped entity. If it is a parameter entity, the diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java index 8e079fb992..488cacd657 100644 --- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java +++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java @@ -113,7 +113,7 @@ public boolean getText(StringBuilder sb, Node node, /** * This is a convinience method, equivalent to - * {@link #getText(StringBuffer,Node,boolean) getText(sb, node, false)}. + * {@link #getText(StringBuilder, Node, boolean)} which passes false as third argument * */ public void getText(StringBuilder sb, Node node) { diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java index cfef10cc56..0143f06545 100644 --- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java +++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java @@ -35,7 +35,7 @@ public class XMLCharacterRecognizer { * Returns whether the specified ch conforms to the XML 1.0 * definition of whitespace. Refer to the definition of - * S for details. + * S for details. * * @param ch * Character to check as XML whitespace. diff --git a/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java b/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java index a48175548e..a68584444e 100644 --- a/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java +++ b/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java @@ -79,10 +79,10 @@ public class JSParseFilter implements ParseFilter, Parser { * {@link WebPage} object relative to the URL * @param parse * {@link Parse} object holding parse status - * @param metatags - * within the {@link NutchDocument} + * @param metaTags + * within the {@link HTMLMetaTags} * @param doc - * The {@link NutchDocument} object + * The {@link DocumentFragment} object * @return parse the actual {@link Parse} object */ @Override diff --git a/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java b/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java index a3f779ad89..4fbcad3cc4 100644 --- a/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java +++ b/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java @@ -280,7 +280,7 @@ public void setY(int y) { /* * There are some issues with this method: sometimes SWF files define their - * own font, so short of OCR we cannot guess what is the glyph code -> character + * own font, so short of OCR we cannot guess what is the glyph code -> character * mapping. Additionally, some files don't use literal space character, instead * they adjust glyphAdvances. We don't handle it at all - in such cases the text * will be all glued together. 
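For reference, the ContentHandler contract documented in the DOMBuilder Javadoc above (locator supplied first, then startDocument, element and character events in document order, endDocument last) can be seen with a minimal sketch built only on the JDK's SAX classes. The class name and XML snippet below are illustrative placeholders, not Nutch code:

import java.io.StringReader;

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.helpers.DefaultHandler;

/** Prints the SAX event sequence described above; illustrative only, not Nutch code. */
public class SaxEventDemo extends DefaultHandler {

  private Locator locator;

  @Override
  public void setDocumentLocator(Locator locator) {
    // Supplied before any other event, as the contract above requires.
    this.locator = locator;
  }

  @Override
  public void startDocument() {
    System.out.println("startDocument");
  }

  @Override
  public void startElement(String uri, String localName, String qName, Attributes atts) {
    int line = (locator == null) ? -1 : locator.getLineNumber();
    System.out.println("startElement " + qName + " (line " + line + ")");
  }

  @Override
  public void characters(char[] ch, int start, int length) {
    // Only the range [start, start + length) may be read, per the contract above.
    System.out.println("characters '" + new String(ch, start, length) + "'");
  }

  @Override
  public void endElement(String uri, String localName, String qName) {
    System.out.println("endElement " + qName);
  }

  @Override
  public void endDocument() {
    System.out.println("endDocument");
  }

  public static void main(String[] args) throws Exception {
    SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
    parser.parse(new InputSource(new StringReader("<a><b>text</b></a>")), new SaxEventDemo());
  }
}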
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java index 4f4c8a78fa..db59d13e9c 100644 --- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java +++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java @@ -174,7 +174,7 @@ protected void append(Node newNode) throws org.xml.sax.SAXException { * supply a locator: if it does so, it must supply the locator to the * application by invoking this method before invoking any of the other * methods in the ContentHandler interface. - *

      + * * *

      * The locator allows the application to determine the end position of any @@ -183,13 +183,13 @@ protected void append(Node newNode) throws org.xml.sax.SAXException { * errors (such as character content that does not match an application's * business rules). The information returned by the locator is probably not * sufficient for use with a search engine. - *

      + * * *

      * Note that the locator will return correct information only during the * invocation of the events in this interface. The application should not * attempt to use it at any other time. - *

      + * * * @param locator * An object that can return the location of any SAX document event. @@ -206,7 +206,7 @@ public void setDocumentLocator(Locator locator) { *

      * The SAX parser will invoke this method only once, before any other methods * in this interface or in DTDHandler (except for setDocumentLocator). - *

      + * */ public void startDocument() throws org.xml.sax.SAXException { @@ -221,7 +221,7 @@ public void startDocument() throws org.xml.sax.SAXException { * method invoked during the parse. The parser shall not invoke this method * until it has either abandoned parsing (because of an unrecoverable error) * or reached the end of input. - *

      + * */ public void endDocument() throws org.xml.sax.SAXException { @@ -237,14 +237,14 @@ public void endDocument() throws org.xml.sax.SAXException { * startElement() event (even when the element is empty). All of the element's * content will be reported, in order, before the corresponding endElement() * event. - *

      + * * *

      * If the element name has a namespace prefix, the prefix will still be * attached. Note that the attribute list provided will contain only * attributes with explicit values (specified or defaulted): #IMPLIED * attributes will be omitted. - *

      + * * * * @param ns @@ -328,12 +328,12 @@ public void startElement(String ns, String localName, String name, * The SAX parser will invoke this method at the end of every element in the * XML document; there will be a corresponding startElement() event for every * endElement() event (even when the element is empty). - *

      + * * *

      * If the element name has a namespace prefix, the prefix will still be * attached to the name. - *

      + * * * * @param ns @@ -373,18 +373,18 @@ public void setIDAttribute(String id, Element elem) { * they may split it into several chunks; however, all of the characters in * any single event must come from the same external entity, so that the * Locator provides useful information. - *

      + * * *

      * The application must not attempt to read from the array outside of the * specified range. - *

      + * * *

      * Note that some parsers will report whitespace using the * ignorableWhitespace() method rather than this one (validating parsers must * do so). - *

      + * * * @param ch * The characters from the XML document. @@ -491,19 +491,19 @@ public void entityReference(String name) throws org.xml.sax.SAXException { * whitespace (see the W3C XML 1.0 recommendation, section 2.10): * non-validating parsers may also use this method if they are capable of * parsing and using content models. - *

      + * * *

      * SAX parsers may return all contiguous whitespace in a single chunk, or they * may split it into several chunks; however, all of the characters in any * single event must come from the same external entity, so that the Locator * provides useful information. - *

      + * * *

      * The application must not attempt to read from the array outside of the * specified range. - *

      + * * * @param ch * The characters from the XML document. @@ -541,12 +541,12 @@ private boolean isOutsideDocElem() { * The Parser will invoke this method once for each processing instruction * found: note that processing instructions may occur before or after the main * document element. - *

      + * * *

      * A SAX parser should never report an XML declaration (XML 1.0, section 2.8) * or a text declaration (XML 1.0, section 4.3.1) using this method. - *

      + * * * @param target * The processing instruction target. @@ -610,18 +610,18 @@ public void endCDATA() throws org.xml.sax.SAXException { * they may split it into several chunks; however, all of the characters in * any single event must come from the same external entity, so that the * Locator provides useful information. - *

      + * * *

      * The application must not attempt to read from the array outside of the * specified range. - *

      + * * *

      * Note that some parsers will report whitespace using the * ignorableWhitespace() method rather than this one (validating parsers must * do so). - *

      + * * * @param ch * The characters from the XML document. @@ -689,14 +689,14 @@ public void endDTD() throws org.xml.sax.SAXException { * processing: the SAX XML reader will automatically replace prefixes for * element and attribute names when the http://xml.org/sax/features/namespaces * feature is true (the default). - *

      + * * *

      * There are cases, however, when applications need to use prefixes in * character data or in attribute values, where they cannot safely be expanded * automatically; the start/endPrefixMapping event supplies the information to * the application to expand prefixes in those contexts itself, if necessary. - *

      + * * *

      * Note that start/endPrefixMapping events are not guaranteed to be properly @@ -704,7 +704,7 @@ public void endDTD() throws org.xml.sax.SAXException { * before the corresponding startElement event, and all endPrefixMapping * events will occur after the corresponding endElement event, but their order * is not guaranteed. - *

      + * * * @param prefix * The Namespace prefix being declared. @@ -737,7 +737,7 @@ public void startPrefixMapping(String prefix, String uri) * See startPrefixMapping for details. This event will always occur after the * corresponding endElement event, but the order of endPrefixMapping events is * not otherwise guaranteed. - *

      + * * * @param prefix * The prefix that was being mapping. @@ -757,7 +757,7 @@ public void endPrefixMapping(String prefix) throws org.xml.sax.SAXException { * DTD subset). All processors may skip external entities, depending on the * values of the http://xml.org/sax/features/external-general-entities and the * http://xml.org/sax/features/external-parameter-entities properties. - *

      + * * * @param name * The name of the skipped entity. If it is a parameter entity, the diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java index d625c33119..b5c95ce9ae 100644 --- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java +++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java @@ -35,7 +35,7 @@ class XMLCharacterRecognizer { * Returns whether the specified ch conforms to the XML 1.0 * definition of whitespace. Refer to the definition of - * S for details. + * S for details. * * @param ch * Character to check as XML whitespace. diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java index 67d3dcc5ce..19035c0f60 100644 --- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java +++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java @@ -59,7 +59,7 @@ public class TestRSSParser { /** *

      * The test method: tests out the following 2 asserts: - *

      + * * *
        *
      • There are 3 outlinks read from the sample rss file
      • diff --git a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java index 06954394b0..20ba474aa4 100644 --- a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java +++ b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java @@ -99,8 +99,8 @@ public void setMaxContentLength(int maxContentLength) { * * @param url * Text containing the url - * @param datum - * The CrawlDatum object corresponding to the url + * @param page + * {@link WebPage} object relative to the URL * * @return {@link ProtocolOutput} object for the content of the file indicated * by url diff --git a/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java b/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java index 856a649a86..952648ff33 100644 --- a/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java +++ b/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java @@ -39,7 +39,7 @@ * *

        * Unit tests for the {@link File}Protocol. - *

        + * * . */ public class TestProtocolFile { diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java index ffa2091760..8b272ec432 100644 --- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java +++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java @@ -54,7 +54,7 @@ * servers out there, when partial downloading is enforeced by closing data * channel socket on our client side, the server side immediately closes control * channel (socket). Our codes deal with such a bad behavior. (4) LIST is used - * to obtain remote file attributes if possible. MDTM & SIZE would be nice, but + * to obtain remote file attributes if possible. MDTM & SIZE would be nice, but * not as ubiquitously implemented as LIST. (5) Avoid using ABOR in single * thread? Do not use it at all. * diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java index 3f3a7e8e98..9f3f9c5e35 100644 --- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java +++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java @@ -114,8 +114,8 @@ public void setKeepConnection(boolean keepConnection) { * * @param url * Text containing the ftp url - * @param datum - * The CrawlDatum object corresponding to the url + * @param page + * {@link WebPage} object relative to the URL * * @return {@link ProtocolOutput} object for the url */ diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java index afcf24aa39..92baf298e9 100644 --- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java +++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java @@ -97,7 +97,7 @@ public Socket createSocket(String host, int port, InetAddress clientHost, * create a new socket within the given limit of time. If socket constructor * does not return until the timeout expires, the controller terminates and * throws an {@link ConnectTimeoutException} - *
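The DummySSLProtocolSocketFactory Javadoc above describes opening a socket within a given time limit and failing with a ConnectTimeoutException otherwise. As a rough sketch of the same idea with plain JDK APIs (not the plugin's controller-thread implementation), Socket.connect accepts a timeout; the host, port and timeout below are placeholders:

import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.net.SocketTimeoutException;

public class ConnectWithTimeoutDemo {
  public static void main(String[] args) {
    String host = "example.org"; // placeholder host
    int port = 443;              // placeholder port
    int timeoutMs = 5000;        // give up after five seconds

    try (Socket socket = new Socket()) {
      // Fails with SocketTimeoutException if no connection is made within timeoutMs.
      socket.connect(new InetSocketAddress(host, port), timeoutMs);
      System.out.println("connected to " + socket.getRemoteSocketAddress());
    } catch (SocketTimeoutException e) {
      System.err.println("connect timed out after " + timeoutMs + " ms");
    } catch (IOException e) {
      System.err.println("connect failed: " + e.getMessage());
    }
  }
}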

        + * * * @param host * the host name/IP diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java index d4d7eba062..69abab75a0 100644 --- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java +++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java @@ -161,8 +161,8 @@ public static void main(String[] args) throws Exception { * * @param url * URL to be fetched - * @param datum - * Crawl data + * @param page + * {@link WebPage} object relative to the URL * @param redirect * Follow redirects if and only if true * @return HTTP response diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java index a15f91be5a..cb09e697a1 100644 --- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java +++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java @@ -39,7 +39,7 @@ /** * Implementation of RFC 2617 Basic Authentication. Usernames and passwords are * stored in standard Nutch configuration files using the following properties: - * http.auth.basic..user http.auth.basic..pass + * http.auth.basic.<realm>.user http.auth.basic.<realm>.pass */ public class HttpBasicAuthentication implements HttpAuthentication, Configurable { @@ -128,7 +128,7 @@ public Configuration getConf() { * Gets the Basic credentials generated by this HttpBasicAuthentication object * * @return Credentials in the form of - * Authorization: Basic <Base64 encoded userid:password> + * Authorization: Basic Base64 encoded userid:password * */ public List getCredentials() { diff --git a/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java b/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java index 9dc0c35823..5006266467 100644 --- a/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java +++ b/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java @@ -16,8 +16,7 @@ */ /** - * Scoring filter used in conjunction with - * {@link org.apache.nutch.scoring.webgraph.WebGraph}. + * Scoring filter */ package org.apache.nutch.scoring.link; diff --git a/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java b/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java index c3119226dc..1e07e6adc2 100644 --- a/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java +++ b/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java @@ -39,7 +39,7 @@ /** * This plugin implements a variant of an Online Page Importance Computation * (OPIC) score, described in this paper: + * href="http://www2003.org/cdrom/papers/refereed/p007/p7-abiteboul.html"> * Abiteboul, Serge and Preda, Mihai and Cobena, Gregory (2003), Adaptive * On-Line Page Importance Computation . 
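The HttpBasicAuthentication Javadoc above reads usernames and passwords from http.auth.basic.<realm>.user and http.auth.basic.<realm>.pass and sends them as "Authorization: Basic <Base64 encoded userid:password>". A small sketch of how that header value is formed with java.util.Base64, which is available on JDK 8, the target of this patch; the username and password are made-up placeholders:

import java.nio.charset.StandardCharsets;
import java.util.Base64;

public class BasicAuthHeaderDemo {
  public static void main(String[] args) {
    // Placeholder values; in Nutch they would come from properties such as
    // http.auth.basic.<realm>.user and http.auth.basic.<realm>.pass.
    String username = "agent";
    String password = "secret";

    String token = Base64.getEncoder()
        .encodeToString((username + ":" + password).getBytes(StandardCharsets.UTF_8));

    // Prints: Authorization: Basic YWdlbnQ6c2VjcmV0
    System.out.println("Authorization: Basic " + token);
  }
}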
* diff --git a/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java b/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java index 6c78df5c5f..c905411091 100644 --- a/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java +++ b/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java @@ -201,7 +201,7 @@ public Collection getAll() { /** * Save collections into file * - * @throws Exception + * @throws IOException */ public void save() throws IOException { try { diff --git a/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java b/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java index bf1ef4232f..5b7d5816cd 100644 --- a/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java +++ b/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java @@ -43,7 +43,6 @@ * Filters URLs based on a file containing domain suffixes, domain names, and * hostnames. Only a url that matches one of the suffixes, domains, or hosts * present in the file is allowed. - *

        * *

        * Urls are checked in order of domain suffix, domain name, and hostname against @@ -61,18 +60,17 @@ * only urls from www.apache.org. There is no specific ordering to entries. The * entries are from more general to more specific with the more general * overridding the more specific. - *

        * * The domain file defaults to domain-urlfilter.txt in the classpath but can be * overridden using the: * *
          - *
            + *
* property "urlfilter.domain.file" in ./conf/nutch-*.xml, and - *
          - *
            + * + *
* attribute "file" in plugin.xml of this plugin - *
          + * *
        * * the attribute "file" has higher precedence if defined. @@ -114,7 +112,6 @@ public DomainURLFilter() { * @param domainFile * The domain file, overrides domain-urlfilter.text default. * - * @throws IOException */ public DomainURLFilter(String domainFile) { this.domainFile = domainFile; diff --git a/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/package-info.java b/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/package-info.java index d2eba1f763..1ff46f69c6 100644 --- a/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/package-info.java +++ b/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/package-info.java @@ -18,8 +18,6 @@ /** * URL filter plugin to include only URLs which match an element in a given list of * domain suffixes, domain names, and/or host names. - * See {@link org.apache.nutch.urlfilter.domainblacklist} for the counterpart - * (exclude URLs by host or domain). */ package org.apache.nutch.urlfilter.domain; diff --git a/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java b/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java index 366c11e46f..75ece12c48 100644 --- a/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java +++ b/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java @@ -49,7 +49,7 @@ * *
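The DomainURLFilter description above accepts a URL when its host matches a configured hostname, domain name, or domain suffix. A stripped-down sketch of that style of check, under the stated rules only and not the plugin's actual implementation; the rule set and URLs are placeholders:

import java.net.MalformedURLException;
import java.net.URL;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;

public class DomainAllowListDemo {

  /** Accepts the URL if its host, or any parent domain of that host, is in the rule set. */
  static boolean accepted(Set<String> rules, String url) throws MalformedURLException {
    String candidate = new URL(url).getHost().toLowerCase(Locale.ROOT);
    while (true) {
      if (rules.contains(candidate)) {
        return true;                              // hostname, domain, or suffix matched
      }
      int dot = candidate.indexOf('.');
      if (dot < 0) {
        return false;                             // no rule matched
      }
      candidate = candidate.substring(dot + 1);   // www.apache.org -> apache.org -> org
    }
  }

  public static void main(String[] args) throws MalformedURLException {
    // Placeholder rules in the spirit of the file format described above:
    // a domain suffix, a domain name, and a hostname.
    Set<String> rules = new HashSet<>(Arrays.asList("net", "apache.org", "www.example.com"));

    System.out.println(accepted(rules, "http://lucene.apache.org/"));     // true  (domain)
    System.out.println(accepted(rules, "http://www.example.com/a.html")); // true  (hostname)
    System.out.println(accepted(rules, "http://www.example.org/"));       // false (no match)
  }
}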

        * The format of this file is one URL prefix per line. - *

        + * */ public class PrefixURLFilter implements URLFilter { diff --git a/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java b/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java index 1a7492ab85..ccad47d4e8 100644 --- a/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java +++ b/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java @@ -68,22 +68,22 @@ * The format of this config file is one URL suffix per line, with no preceding * whitespace. Order, in which suffixes are specified, doesn't matter. Blank * lines and comments (#) are allowed. - *

        + * *

        * A single '+' or '-' sign not followed by any suffix must be used once, to * signify the mode this plugin operates in. An optional single 'I' can be * appended, to signify that suffix matches should be case-insensitive. The * default, if not specified, is to use case-sensitive matches, i.e. suffix * '.JPG' does not match '.jpg'. - *

        + * *

        * NOTE: the format of this file is different from urlfilter-prefix, because * that plugin doesn't support allowed/prohibited prefixes (only supports * allowed prefixes). Please note that this plugin does not support regular * expressions, it only accepts literal suffixes. I.e. a suffix "+*.jpg" is most * probably wrong, you should use "+.jpg" instead. - *

- * Example 1

        + * + * Example 1 *

        * The configuration shown below will accept all URLs with '.html' or '.htm' * suffixes (case-sensitive - '.HTML' or '.HTM' will be rejected), and prohibit @@ -101,8 +101,8 @@ * .htm * * - *

- * Example 2

        + * + * Example 2 *

        * The configuration shown below will accept all URLs except common graphical * formats. @@ -122,7 +122,7 @@ * .bmp * * - *
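Following the SuffixURLFilter description and the two examples above (literal suffixes, a '+'/'-' mode sign, an optional 'I' for case-insensitive matches), here is a tiny sketch of just the matching step. It is not the plugin's code, and the URLs are placeholders:

import java.util.Arrays;
import java.util.List;
import java.util.Locale;

public class SuffixRuleDemo {

  /**
   * True if the URL ends with one of the literal suffixes. Whether a match means
   * "accept" or "reject" depends on the '+' or '-' mode sign described above.
   */
  static boolean matches(List<String> suffixes, String url, boolean caseInsensitive) {
    String candidate = caseInsensitive ? url.toLowerCase(Locale.ROOT) : url;
    for (String suffix : suffixes) {
      String s = caseInsensitive ? suffix.toLowerCase(Locale.ROOT) : suffix;
      if (candidate.endsWith(s)) {
        return true;
      }
    }
    return false;
  }

  public static void main(String[] args) {
    // Mirrors the spirit of Example 1 above: literal suffixes .html and .htm.
    List<String> suffixes = Arrays.asList(".html", ".htm");

    System.out.println(matches(suffixes, "http://example.org/index.html", false)); // true
    System.out.println(matches(suffixes, "http://example.org/INDEX.HTML", false)); // false (case-sensitive)
    System.out.println(matches(suffixes, "http://example.org/INDEX.HTML", true));  // true  (the 'I' flag)
  }
}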

        + * * * @author Andrzej Bialecki */ diff --git a/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java b/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java index 9b84eae01e..7132a61b45 100644 --- a/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java +++ b/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java @@ -25,13 +25,13 @@ /** *

        * Validates URLs. - *

        + * * *

        * Originally based in on php script by Debbie Dyer, validation.php v1.2b, Date: * 03/07/02, http://javascript.internet.com. However, this validation now bears * little resemblance to the php original. - *

        + * * *
          *   Example of usage:
        @@ -47,7 +47,7 @@
          * 
          * 

        * Based on UrlValidator code from Apache commons-validator. - *
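The UrlValidator Javadoc above describes a syntactic check of URL strings. As a rough, generic illustration only (the plugin applies its own RFC-based rules rather than java.net parsing), a first-pass check might look like this:

import java.net.URI;
import java.net.URISyntaxException;
import java.util.Arrays;

public class SimpleUrlCheckDemo {

  /** Rough first-pass check: parses as a URI with an http/https/ftp scheme and a host. */
  static boolean looksLikeUrl(String value) {
    if (value == null) {
      return false;
    }
    try {
      URI uri = new URI(value);
      String scheme = uri.getScheme();
      return scheme != null
          && Arrays.asList("http", "https", "ftp").contains(scheme.toLowerCase())
          && uri.getHost() != null;
    } catch (URISyntaxException e) {
      return false;
    }
  }

  public static void main(String[] args) {
    System.out.println(looksLikeUrl("http://www.apache.org/"));   // true
    System.out.println(looksLikeUrl("http://bad host/"));         // false (space in authority)
    System.out.println(looksLikeUrl(null));                       // false
  }
}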

        + * * * @see Uniform Resource * Identifiers (URI): Generic Syntax @@ -159,7 +159,7 @@ public void setConf(Configuration conf) { /** *

        * Checks if a field has a valid url address. - *

        + * * * @param value * The value validation is being performed on. A null @@ -323,7 +323,7 @@ private boolean isValidAuthority(String authority) { *

        * Checks if the field isn't null and length of the field is greater than zero * not including whitespace. - *

        + * * * @param value * The value validation is being performed on. diff --git a/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java b/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java index d460d9e5ad..66f7a1b363 100644 --- a/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java +++ b/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java @@ -58,11 +58,11 @@ * This class uses the urlnormalizer.regex.file property. It should be * set to the file name of an xml file which should contain the patterns and * substitutions to be done on encountered URLs. - *

        + * *

        * This class also supports different rules depending on the scope. Please see * the javadoc in {@link org.apache.nutch.net.URLNormalizers} for more details. - *
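The RegexURLNormalizer description above applies pattern/substitution pairs, read from the XML file named by urlnormalizer.regex.file, to each URL in turn. A minimal sketch of that pattern/substitution loop with java.util.regex; the two rules below are invented for illustration and are not the plugin's defaults:

import java.util.LinkedHashMap;
import java.util.Map;
import java.util.regex.Pattern;

public class RegexNormalizeDemo {

  /** Applies each (pattern, substitution) rule in order, as described above. */
  static String normalize(String url, Map<Pattern, String> rules) {
    String result = url;
    for (Map.Entry<Pattern, String> rule : rules.entrySet()) {
      result = rule.getKey().matcher(result).replaceAll(rule.getValue());
    }
    return result;
  }

  public static void main(String[] args) {
    // Invented rules for illustration; the real ones are read from the XML file
    // named by the urlnormalizer.regex.file property.
    Map<Pattern, String> rules = new LinkedHashMap<>();
    rules.put(Pattern.compile("(?<!:)/{2,}"), "/"); // collapse duplicate slashes in the path
    rules.put(Pattern.compile("\\?$"), "");         // drop a trailing '?' with no query

    System.out.println(normalize("http://example.org//a//b?", rules));
    // -> http://example.org/a/b
  }
}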

        + * * * @author Luke Baker * @author Andrzej Bialecki diff --git a/src/test/org/apache/nutch/api/AbstractNutchAPITestBase.java b/src/test/org/apache/nutch/api/AbstractNutchAPITestBase.java index 2e4e61ce51..f7d73880ab 100644 --- a/src/test/org/apache/nutch/api/AbstractNutchAPITestBase.java +++ b/src/test/org/apache/nutch/api/AbstractNutchAPITestBase.java @@ -141,7 +141,7 @@ public void testRequest(int expectedStatusCode, int port, String username, Strin */ if (ChallengeScheme.HTTP_DIGEST.equals(challengeScheme)) { - // User server's data to complete the challengeResponse object + // Use server's data to complete the challengeResponse object ChallengeRequest digestChallengeRequest = retrieveDigestChallengeRequest(resource); ChallengeResponse challengeResponse = new ChallengeResponse(digestChallengeRequest, resource.getResponse(), username, password.toCharArray()); diff --git a/src/test/org/apache/nutch/crawl/TestGenerator.java b/src/test/org/apache/nutch/crawl/TestGenerator.java index 95d1db3609..2345299003 100644 --- a/src/test/org/apache/nutch/crawl/TestGenerator.java +++ b/src/test/org/apache/nutch/crawl/TestGenerator.java @@ -42,7 +42,7 @@ *
      • Generates entries to fetch
      • Verifies that number of generated urls match, and finally
      • Verifies that highest scoring urls are generated.
- *
          + *
        * */ public class TestGenerator extends AbstractNutchTest { diff --git a/src/test/org/apache/nutch/fetcher/TestFetcher.java b/src/test/org/apache/nutch/fetcher/TestFetcher.java index 8a8fa42154..9d62d50a6d 100644 --- a/src/test/org/apache/nutch/fetcher/TestFetcher.java +++ b/src/test/org/apache/nutch/fetcher/TestFetcher.java @@ -161,13 +161,15 @@ public void testFetch() throws Exception { /** * Tests a refetch of a URL. This process consists of two consecutive * inject, generate, fetch, parse then update cycles. The test configuration - * is defined such that db.fetch.interval.default is set to + * is defined such that db.fetch.interval.default is set to * a very low value (indicating that the URL should be fetched again immediately). * In addition, configuration tests that relevant * {@link org.apache.nutch.metadata.Metadata} is present and the values consistent * and therefore not overwritten. - * @see https://issues.apache.org/jira/browse/NUTCH-2222 + * * @throws Exception + * + * @see https://issues.apache.org/jira/browse/NUTCH-2222 */ @Test public void testReFetch() throws Exception { diff --git a/src/test/org/apache/nutch/util/CrawlTestUtil.java b/src/test/org/apache/nutch/util/CrawlTestUtil.java index 5165b38a5a..046c51ec8a 100644 --- a/src/test/org/apache/nutch/util/CrawlTestUtil.java +++ b/src/test/org/apache/nutch/util/CrawlTestUtil.java @@ -71,7 +71,8 @@ public static Configuration createConfiguration() { /** * Generate seedlist * - * @see TestInjector + * @see org.apache.nutch.crawl.TestInjector + * * @throws IOException */ public static void generateSeedList(FileSystem fs, Path urlPath, @@ -148,11 +149,11 @@ public static Server getServer(int port, String staticContent) } /** - * Generate Fetchlist. + * Generates Fetchlist * * @param numResults number of results to generate * @param config Configuration to use - * @return path to generated batch + * * @throws IOException */ public static void generateFetchlist(int numResults, Configuration config,