Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

Initial import of Nutch to Apache.

git-svn-id: https://svn.apache.org/repos/asf/incubator/nutch/trunk@155829 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information...
commit bd718ff3ab1f923593e5eccaa0aaeb21370f7169 1 parent 153833c
@cutting cutting authored
Showing with 24,572 additions and 0 deletions.
  1. +368 −0 CHANGES.txt
  2. +15 −0 LICENSE.txt
  3. +25 −0 README.txt
  4. +186 −0 bin/nutch
  5. +85 −0 bin/nutch-daemon.sh
  6. +422 −0 build.xml
  7. +13 −0 conf/common-terms.utf8
  8. +50 −0 conf/crawl-tool.xml
  9. +24 −0 conf/crawl-urlfilter.txt.template
  10. +480 −0 conf/mime.types
  11. +24 −0 conf/nutch-conf.xsl
  12. +664 −0 conf/nutch-default.xml
  13. +8 −0 conf/nutch-site.xml.template
  14. +22 −0 conf/regex-normalize.xml.template
  15. +19 −0 conf/regex-urlfilter.txt.template
  16. +57 −0 default.properties
  17. +169 −0 docs/ca/about.html
  18. +263 −0 docs/ca/developers.html
  19. +209 −0 docs/ca/faq.html
  20. +149 −0 docs/ca/help.html
  21. +119 −0 docs/ca/search.html
  22. +194 −0 docs/de/about.html
  23. +172 −0 docs/de/credits.html
  24. +284 −0 docs/de/faq.html
  25. +160 −0 docs/de/help.html
  26. +119 −0 docs/de/search.html
  27. +143 −0 docs/de/status.html
  28. +186 −0 docs/en/about.html
  29. +276 −0 docs/en/bot.html
  30. +185 −0 docs/en/credits.html
  31. +286 −0 docs/en/developers.html
  32. +216 −0 docs/en/donate.html
  33. +256 −0 docs/en/faq.html
  34. +156 −0 docs/en/help.html
  35. +366 −0 docs/en/i18n.html
  36. +178 −0 docs/en/org.html
  37. +251 −0 docs/en/policies.html
  38. +167 −0 docs/en/press.html
  39. +128 −0 docs/en/search.html
  40. +133 −0 docs/en/status.html
  41. +440 −0 docs/en/tutorial.html
  42. +191 −0 docs/es/about.html
  43. +150 −0 docs/es/help.html
  44. +119 −0 docs/es/search.html
  45. +182 −0 docs/fi/about.html
  46. +161 −0 docs/fi/help.html
  47. +119 −0 docs/fi/search.html
  48. +161 −0 docs/fr/about.html
  49. +252 −0 docs/fr/faq.html
  50. +119 −0 docs/fr/search.html
  51. +182 −0 docs/hu/about.html
  52. +155 −0 docs/hu/help.html
  53. +119 −0 docs/hu/search.html
  54. BIN  docs/img/favicon.ico
  55. BIN  docs/img/lang/arabic.png
  56. BIN  docs/img/lang/brazil.png
  57. BIN  docs/img/lang/bulgarian.png
  58. BIN  docs/img/lang/catala.png
  59. BIN  docs/img/lang/chinese.png
  60. BIN  docs/img/lang/croatian.png
  61. BIN  docs/img/lang/czech.png
  62. BIN  docs/img/lang/danish.png
  63. BIN  docs/img/lang/dutch.png
  64. BIN  docs/img/lang/english.png
  65. BIN  docs/img/lang/french.png
  66. BIN  docs/img/lang/galego.png
  67. BIN  docs/img/lang/german.png
  68. BIN  docs/img/lang/greek.png
  69. BIN  docs/img/lang/hungarian.png
  70. BIN  docs/img/lang/icelandic.png
  71. BIN  docs/img/lang/indonesian.png
  72. BIN  docs/img/lang/italian.png
  73. BIN  docs/img/lang/japanese.png
  74. BIN  docs/img/lang/korean.png
  75. BIN  docs/img/lang/latvian.png
  76. BIN  docs/img/lang/lithuanian.png
  77. BIN  docs/img/lang/malaysian.png
  78. BIN  docs/img/lang/norwegian.png
  79. BIN  docs/img/lang/polish.png
  80. BIN  docs/img/lang/portuguese.png
  81. BIN  docs/img/lang/romanian.png
  82. BIN  docs/img/lang/russian.png
  83. BIN  docs/img/lang/spanish.png
  84. BIN  docs/img/lang/suomi.png
  85. BIN  docs/img/lang/swedish.png
  86. BIN  docs/img/lang/thai.png
  87. BIN  docs/img/lang/turkish.png
  88. BIN  docs/img/poweredbynutch_01.gif
  89. BIN  docs/img/poweredbynutch_02.gif
  90. BIN  docs/img/reiter/002bg_fle.gif
  91. BIN  docs/img/reiter/002bg_fre.gif
  92. BIN  docs/img/reiter/_bg_reiter.gif
  93. BIN  docs/img/reiter/_bg_reiter_inactive.gif
  94. BIN  docs/img/reiter/_spacer_cccccc.gif
  95. BIN  docs/img/reiter/bg_subnavi.gif
  96. BIN  docs/img/reiter/logo_nutch.gif
  97. BIN  docs/img/reiter/reiter_inactive_le.gif
  98. BIN  docs/img/reiter/reiter_inactive_le1.gif
  99. BIN  docs/img/reiter/reiter_inactive_ri.gif
  100. BIN  docs/img/reiter/robots.gif
  101. BIN  docs/img/reiter/spacer_666666.gif
  102. BIN  docs/img/reiter/spacer_ff9900.gif
  103. BIN  docs/img/reiter/ul.gif
  104. +183 −0 docs/jp/about.html
  105. +245 −0 docs/jp/faq.html
  106. +147 −0 docs/jp/help.html
  107. +119 −0 docs/jp/search.html
  108. +176 −0 docs/ms/about.html
  109. +156 −0 docs/ms/help.html
  110. +119 −0 docs/ms/search.html
  111. +189 −0 docs/nl/about.html
  112. +284 −0 docs/nl/faq.html
  113. +163 −0 docs/nl/help.html
  114. +119 −0 docs/nl/search.html
  115. +186 −0 docs/pl/about.html
  116. +281 −0 docs/pl/bot.html
  117. +184 −0 docs/pl/credits.html
  118. +286 −0 docs/pl/developers.html
  119. +211 −0 docs/pl/donate.html
  120. +263 −0 docs/pl/faq.html
  121. +158 −0 docs/pl/help.html
  122. +362 −0 docs/pl/i18n.html
  123. +179 −0 docs/pl/org.html
  124. +252 −0 docs/pl/policies.html
  125. +167 −0 docs/pl/press.html
  126. +128 −0 docs/pl/search.html
  127. +133 −0 docs/pl/status.html
  128. +443 −0 docs/pl/tutorial.html
  129. +192 −0 docs/pt/about.html
  130. +156 −0 docs/pt/help.html
  131. +119 −0 docs/pt/search.html
  132. +185 −0 docs/sv/about.html
  133. +152 −0 docs/sv/help.html
  134. +128 −0 docs/sv/search.html
  135. +188 −0 docs/th/about.html
  136. +154 −0 docs/th/help.html
  137. +119 −0 docs/th/search.html
  138. +180 −0 docs/zh/about.html
  139. +149 −0 docs/zh/help.html
  140. +121 −0 docs/zh/search.html
  141. +9 −0 index.html
  142. +33 −0 lib/concurrent-1.3.4.LICENSE.txt
  143. BIN  lib/concurrent-1.3.4.jar
  144. +40 −0 lib/dom4j-1.4.LICENSE.txt
  145. BIN  lib/dom4j-1.4.jar
  146. BIN  lib/jakarta-oro-2.0.7.jar
  147. +202 −0 lib/jetty-5.1.2.LICENSE.txt
  148. BIN  lib/jetty-5.1.2.jar
  149. +100 −0 lib/junit-3.8.1.LICENSE.txt
  150. BIN  lib/junit-3.8.1.jar
  151. BIN  lib/lucene-1.4.2.jar
  152. BIN  lib/servlet-api.jar
  153. BIN  lib/taglibs-i18n.jar
  154. +406 −0 lib/taglibs-i18n.tld
  155. BIN  lib/xerces-2_6_2-apis.jar
  156. BIN  lib/xerces-2_6_2.jar
  157. +24 −0 src/engines/Altavista.src
  158. +25 −0 src/engines/FAST.src
  159. +24 −0 src/engines/Google.src
  160. +25 −0 src/engines/Inktomi.src
  161. +110 −0 src/java/org/apache/nutch/analysis/CharStream.java
  162. +255 −0 src/java/org/apache/nutch/analysis/CommonGrams.java
  163. +120 −0 src/java/org/apache/nutch/analysis/FastCharStream.java
  164. +852 −0 src/java/org/apache/nutch/analysis/NutchAnalysis.java
  165. +321 −0 src/java/org/apache/nutch/analysis/NutchAnalysis.jj
  166. +52 −0 src/java/org/apache/nutch/analysis/NutchAnalysisConstants.java
  167. +458 −0 src/java/org/apache/nutch/analysis/NutchAnalysisTokenManager.java
  168. +88 −0 src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java
  169. +83 −0 src/java/org/apache/nutch/analysis/NutchDocumentTokenizer.java
  170. +192 −0 src/java/org/apache/nutch/analysis/ParseException.java
  171. +81 −0 src/java/org/apache/nutch/analysis/Token.java
  172. +17 −0 src/java/org/apache/nutch/analysis/TokenManager.java
  173. +133 −0 src/java/org/apache/nutch/analysis/TokenMgrError.java
  174. +5 −0 src/java/org/apache/nutch/analysis/package.html
  175. +65 −0 src/java/org/apache/nutch/clustering/HitsCluster.java
  176. +55 −0 src/java/org/apache/nutch/clustering/OnlineClusterer.java
  177. +90 −0 src/java/org/apache/nutch/clustering/OnlineClustererFactory.java
  178. +317 −0 src/java/org/apache/nutch/db/BucketSet.java
  179. +162 −0 src/java/org/apache/nutch/db/DBKeyDivision.java
  180. +331 −0 src/java/org/apache/nutch/db/DBSectionReader.java
  181. +524 −0 src/java/org/apache/nutch/db/DistributedWebDBReader.java
  182. +2,062 −0 src/java/org/apache/nutch/db/DistributedWebDBWriter.java
  183. +186 −0 src/java/org/apache/nutch/db/EditSectionGroupReader.java
  184. +241 −0 src/java/org/apache/nutch/db/EditSectionGroupWriter.java
  185. +120 −0 src/java/org/apache/nutch/db/EditSectionWriter.java
  186. +101 −0 src/java/org/apache/nutch/db/IWebDBReader.java
  187. +110 −0 src/java/org/apache/nutch/db/IWebDBWriter.java
Sorry, we could not display the entire diff because too many files (796) changed.
View
368 CHANGES.txt
@@ -0,0 +1,368 @@
+Nutch Change Log
+
+Release 0.7
+
+ 1. Added support for "type:" in queries. Search results are limited/qualified
+ by mimetype or its primary type or sub type. For example,
+ (1) searching with "type:application/pdf" restricts results
+ to pages which were identified to be of mimetype "application/pdf".
+ (2) with "type:application", nutch will return pages of
+ primary type "application".
+ (3) with "type:pdf", only pages of sub type "pdf" will be listed.
+ (John Xing, 20050120)
+
+ 2. Added support for "date:" in queries. Last-Modified is indexed.
+ Search results are restricted by lower and upper date (inclusive)
+ as date:yyyymmdd-yyyymmdd. For example, date:20040101-20041231
+ only returns pages with Last-Modified in year 2004.
+ (John Xing, 20050122)
+
+ 3. Add URLFilter plugin interface and convert existing url filters into
+ plugins. (John Xing, 20050206)
+
+ 4. Add UpdateSegmentsFromDb tool, which updates the scores and
+ anchors of existing segments with the current values in the web
+ db. This is used by CrawlTool, so that pages are now only fetched
+ once per crawl. (Doug Cutting, 20050221)
+
+ 5. Moved code into org.apache.nutch sub-packages. Changed license to
+ Apache 2.0. Removed jar files whose licenses do not permit
+ redistribution by Apache. Disabled compilation of plugins which
+ require these libraries. (Doug Cutting 20050301)
+
+Release 0.6
+
+ 1. Added clustering-carrot2 plugin, together with introduction of clustering
+ api and modification to search jsp. (Dawid Weiss via John Xing, 20040809)
+
+ 2. Make a number of changes to NDFS (Nutch Distributed File System)
+ to fix bugs, add admin tools, etc.
+
+ Also, modify all command line tools so you can indicate whether to
+ use NDFS or the local filesystem. If you indicate nothing, then
+ it defaults to the local fs.
+
+ I've used this to do a 35m page crawl via NDFS, distributed over a
+ dozen machines. (Mike Cafarella)
+
+ 3. Add support for BASE tags in HTML. Outlinks are now correctly
+ extracted when a BASE tag is present. (cutting)
+
+ 4. Fix two bugs in result pagination. When the last hit on a page
+ was the last hit overall, the "next" button was sometimes shown
+ when the "show all" button should be shown instead. Also, in
+ certain cases, the "show all" button would be shown when the
+ "next" button should have been shown. (cutting)
+
+ 5. Add config parameter "indexer.max.tokens" that determines the
+ maximum number of tokens indexed per field. (Andy Hedges via cutting)
+
+ 6. Add parser for mp3 files. (Andy Hedges via cutting)
+
+ 7. Add RegexUrlNormalizer. This is useful for things like stripping
+ out session IDs from URLs. To use it, add values for
+ urlnormalizer.class and urlnormalizer.regex.file to your
+ nutch-site.xml. The RegexUrlNormalizer class extends the
+ BasicUrlNormalizer, and does basic normalization as well.
+ (Luke Baker via cutting)
+
+ 8. Added Swedish translation (Stefan Verzel via Sami Siren, 20040910)
+
+ 9. Added Polish translation (Andrzej Bialecki, 20040911)
+
+10. Added 3 more language profiles to language identifier (ru,hu,pl).
+ Other changes to language identifier: Profiles converted to utf8,
+ added some test cases, changed the similarity calculation.
+ (Sami Siren, 20040925)
+
+11. Added plugin parse-rtf (Andy Hedges via John Xing, 20040929)
+
+12. Added plugin index-more and more.jsp (John Xing, 20041003)
+
+13. Added "View as Plain Text" feature. A new op OP_PARSETEXT is introduced
+ in DistributedSearch.java. text.jsp is added. (John Xing, 20041006)
+
+14. Fixed a bug that fails cached.jsp, explain.jsp, anchors.jsp and text.jsp
+ (but not search.jsp) with NullPointerException in distributed search.
+ It seems that this bug appears after "hits per site" stuff is added.
+ The fix is done in Hit.java, making sure String site is never null.
+ Hopefully this fix does not have a bad effect on the "hits per site" code.
+ (John Xing, 20041006)
+
+15. Fixed a bug that fails fullyDelete() in FileUtil.java for
+ LocalFileSystem.java. This bug also exposes possible incompleteness
+ of NDFSFile.java, where a few methods are not supported, including
+ delete(). Nothing changed in NDFSFile.java though. Leave it for future
+ improvement (John Xing, 20041022).
+
+16. Introduced option -noParsing to Fetcher.java and added ParseSegment.java.
+ A new status code CANT_PARSE is added to FetcherOutput.java.
+ Without option -noParsing , no change in fetcher behavior. With
+ option -noParsing, the fetcher only crawls; no parsing is carried out.
+ Then, ParseSegment.java should be used to parse in separate pass.
+ (John Xing, 20041025)
+
+17. Added ontology plugin. Currently it is used for query refinement, as
+ exemplified in refine-query-init.jsp and refine-query.jsp. By default,
+ query refinement is disabled in search.jsp. Please check
+ ./src/plugin/ontology/README.txt for further description.
+ Ontology plugin certainly can be used for many other things.
+ (Michael J. Pan via John Xing, 20041129)
+
+18. Changed fetcher.server.delay to be a float, so that sub-second
+ delays can be specified. (cutting)
+
+19. Added plugin.includes config parameter that determines which
+ plugins are included. By default now only http, html and basic
+ indexing and search plugins are enabled, rather than all plugins.
+ This should make default performance more predictable and reliable
+ going forward. (cutting)
+
+20. Cleaned up some filesystem code, including:
+
+ - Replaced BufferedRandomAccessFile with two simpler utilities,
+ NFSDataInputStream and NFSDataOutputStream.
+
+ - Fixed the bug where SequenceFiles were no longer flushed when
+ created, so that, when fetches crashed, segments were
+ unreadable. Now segments are always readable after crashes.
+ Only the contents of the last buffer is lost.
+
+ - Simplified the FSOutputStream API to not include seek(). We
+ should never need that functionality.
+
+ - Simplified LocalFileSystem's implementations of FSInputStream
+ and FSOutputStream and optimized FSInputStream.seek().
+
+ (cutting)
+
+21. Fixed BasicUrlNormalizer to better handle relative urls. The file
+ part of a URL is normalized in the following manner:
+
+ 1. "/aa/../" will be replaced by "/" This is done step by step until
+ the url doesn't change anymore. This ensures that
+ "/aa/bb/../../" will be replaced by "/", too
+
+ 2. leading "/../" will be replaced by "/"
+
+ (Sven Wende via cutting)
+
+22. Fix Page constructors so that next fetch date is less likely to be
+ misconstrued as a float. This patches a problem in WebDBInjector,
+ where new pages were added to the db with nextScore set to the
+ intended nextFetch date. This, in turn, confused link analysis.
+
+23. In ndfs code, replace addLocalFile(), putToLocalFile() with
+ copyFromLocalFile(), moveFromLocalFile(), copyToLocalFile() and
+ moveToLocalFile(). (John Xing, 20041217)
+
+24. Added new config parameter fetcher.threads.per.host. This is used
+ by the Http protocol. When this is one, behavior is as before.
+ When this is greater than one then multiple threads are permitted
+ to access a host at once. Note that fetcher.server.delay is no
+ longer consistently observed when this is greater than one.
+ (Luke Baker via Doug Cutting)
+
+Release 0.5
+
+ 1. Changed plugin directory to be a list of directories.
+
+ 2. Permit Plugin to be the default plugin implementation.
+
+ 3. Added pluggable interface for network protocols in new package
+ net.nutch.protocol. Moved http code from core into a plugin.
+
+ 4. Added pluggable interface for content parsing in new package
+ net.nutch.parse. Moved html parsing code from core into a
+ plugin.
+
+ 5. Fixed a bug in NutchAnalysis where 16-bit characters were not
+ processed correctly.
+
+ 6. Fixed bug #971731: random summaries on result page.
+ (Daniel Naber via cutting)
+
+ 7. Made Nutch logo transparent. (Daniel Naber via cutting)
+
+ 8. Added file protocol plugin. (John Xing via cutting)
+
+ 9. Added ftp protocol plugin. (John Xing via cutting)
+
+10. Added pdf and msword parser plugins. (John Xing via cutting)
+
+11. Added pluggable indexing interface. By default, url, content,
+ anchors and title are indexed, as before, but now one can easily
+ alter this to, e.g., index metadata. A demonstration is provided
+ which extracts and indexes Creative Commons license urls. (cutting)
+
+12. Add language identification plugin.
+
+ The process of identification is as follows:
+
+ 1. html (html only, HTML 4.0 "lang" attribute)
+ 2. meta tags (html only, http-equiv, dc.language)
+ 3. http header (Content-Language)
+ 4. if all above fail "statistical analysis"
+
+ 1 & 2 are run during the fetching phase and 3 & 4 are run on
+ indexing phase.
+
+ Currently supported languages (in "statistical analysis") are
+ da,de,el,en,es,fi,fr,it,nl,sv and pt. The corpus used was grabbed
+ from http://www.isi.edu/~koehn/europarl/ and the profiles were
+ built with the tool supplied in the patch.
+
+ After indexing the language can be found from field named "lang"
+
+ It's not 100% accurate but it's a start.
+ (Sami Siren)
+
+13. Added SegmentMergeTool and "mergesegs" command, to remove
+ duplicated or otherwise not used content from several segments and
+ joining them together into a single new segment. The tool also
+ optionally performs several other steps required for proper
+ operation of Nutch - such as indexing segments, deleting
+ duplicates, merging indices, and indexing the new single segment.
+ (Andrzej Bialecki)
+
+14. Add the ability to retrieve ParseData of a search hit. ParseData
+ contains many valuable properties of a search hit.
+
+ This is required (among others) to properly display the cached
+ content because it's not possible to determine the character
+ encoding from the output of the getContent() method (which returns
+ byte[]). The symptoms are that for HTML pages using non-latin1 or
+ non-UTF8 encodings the cached preview will almost certainly look
+ broken. Using the attached patch it is possible to determine the
+ character encoding from the ParseData (for HTTP: Content-Type
+ metadata), and encode the content accordingly. (Andrzej Bialecki)
+
+15. Add a pluggable query interface. By default, the content, anchor
+ and url fields are searched as before. A sample plugin indexes
+ the host name and adds a "site:" keyword to query parsing.
+
+16. Added support for "lang:" in queries. For example, searching with
+ "lang:en" restricts results to pages which were identified to
+ be in English.
+
+17. Automatically optimize field queries to use cached Lucene filters.
+ This makes, for example, searches restricted by languages or sites
+ that are very common much faster.
+
+18. Improved charset handling in jsp pages. (jshin by cutting)
+
+19. Permit topic filtering when injecting DMOZ pages. (jshin by cutting)
+
+20. When parsing crawled pages, interpret charset specifications in
+ html meta tags. (jshin by cutting)
+
+21. Added support for "cc:licensed" in queries, which searches for documents
+ released under Creative Commons licenses. Attributes of the
+ license may also be queried, with, e.g., "cc:by" for
+ attribution-required licenses, "cc:nc" for non-commercial
+ licenses, etc.
+
+22. Relative paths named in plugin.folders are now searched for on the
+ classpath. This makes, e.g., deployment in a war file much simpler.
+
+23. Modifications to Fetcher.java.
+
+ 1. Make sure it works properly with regard to creation and initialization
+ of plugin instances. The problem was that multiple threads race to
+ startUp() or shutDown() plugin instances. It was solved by synchronizing
+ certain codes in PluginRepository.java and Extension.java.
+ (Stefan Groschupf via John Xing)
+
+ 2. Added code to explicitly shutDown() plugins. Otherwise FetcherThreads
+ may never return (quit) if there are still data or other structures
+ (e.g., persistent socket connections) associated with plugins. (John Xing)
+
+ 3. Fixed one type of Fetcher "hang" problems by monitoring named
+ FetcherThreads. If all FetcherThreads are gone (finished),
+ Fetcher.java is considered done. The problem was: there could be
+ runaway threads started by external libs via FetcherThreads.
+ Those threads never return, thus keep Fetcher from exiting normally.
+ (John Xing)
+
+24. Eliminate excessive hits from sites. This is done efficiently by
+ adding the site name to Hit instances, and, when needed,
+ re-querying with too-frequent sites prohibited in the query.
+
+
+Release 0.4
+
+ 1. Http class refactored. (Kevin Smith via Tom Pierce)
+
+ 2. Add Finnish translation. (Sampo Syreeni via Doug Cutting)
+
+ 3. Added Japanese translation. (Yukio Andoh via Doug Cutting)
+
+ 4. Updated Dutch translation. (Ype Kingma via Doug Cutting)
+
+ 5. Initial version of Distributed DB code. (Mike Cafarella)
+
+ 6. Make things more tolerant of crashed fetcher output files.
+ (Doug Cutting)
+
+ 7. New skin for website. (Frank Henze via Doug Cutting)
+
+ 8. Added Spanish translation. (Diego Basch via Doug Cutting)
+
+ 9. Add FTP support to fetcher. (John Xing via Doug Cutting)
+
+10. Added Thai translation. (Pichai Ongvasith via Doug Cutting)
+
+11. Added Robots.txt & throttling support to Fetcher.java. (Mike
+ Cafarella)
+
+12. Added nightly build. (Doug Cutting)
+
+13. Default all link scores to 1.0. (Doug Cutting)
+
+14. Permit one to keep internal links. (Doug Cutting)
+
+15. Fixed dedup to select shortest URL. (Doug Cutting)
+
+16. Changed index merger so that merged index is written to named
+ directory, rather than to a generated name in that directory.
+ (Doug Cutting)
+
+17. Disable coordination weighting of query clauses and other minor
+ scoring improvements. (Doug Cutting)
+
+18. Added a new command, crawl, that constructs a database, injects a
+ url file and performs a few rounds of generate/fetch/updatedb.
+ This simplifies use for intranet sites. Changed some defaults to
+ be more intranet friendly. (Doug Cutting)
+
+19. Fixed a bug where Fetcher.java didn't construct correct relative
+ links when a page was redirected. (Doug Cutting)
+
+20. Fixed a query parser problem with lookahead over plusses and minuses.
+ (Doug Cutting)
+
+21. Add support for HTTP proxy servers. (Sami Siren via Doug Cutting)
+
+22. Permit searching while fetching and/or indexing.
+ (Sami Siren via Doug Cutting)
+
+23. Fix a bug when throttling is disabled. (Sami Siren via Doug Cutting)
+
+24. Updated Bahasa Malaysia translation. (Michael Lim via Doug Cutting)
+
+25. Added Catalan translation. (Xavier Guardiola via Doug Cutting)
+
+26. Added Brazilian Portuguese translation.
+ (A. Moreir via Doug Cutting)
+
+27. Added a French translation. (Julien Nioche via Doug Cutting)
+
+28. Updated to Lucene 1.4RC3. (Doug Cutting)
+
+29. Add capability to boost by link count & use it in crawl tool.
+ (Doug Cutting)
+
+30. Added plugin system. (Stefan Groschupf via Doug Cutting)
+
+31. Add this change log file, for recording significant changes to
+ Nutch. Populate it with changes from the last few months.
View
15 LICENSE.txt
@@ -0,0 +1,15 @@
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
View
25 README.txt
@@ -0,0 +1,25 @@
+Nutch README
+
+Interesting files include:
+
+ docs/en/tutorial.html
+ The place to start if you want to use Nutch.
+
+ docs/en/developers.html
+ Information for potential Nutch developers.
+
+ docs/api/index.html
+ Javadocs for the Nutch software.
+
+ CHANGES.txt
+ Log of changes to Nutch.
+
+
+For the latest information about Nutch, please visit our website at:
+
+ http://www.nutch.org/
+
+and our wiki, at:
+
+ http://www.nutch.org/cgi-bin/twiki/view/Main/Nutch
+
View
186 bin/nutch
@@ -0,0 +1,186 @@
+#!/bin/sh
+#
+# The Nutch command script
+#
+# Environment Variables
+#
+# NUTCH_JAVA_HOME The java implementation to use. Overrides JAVA_HOME.
+#
+# NUTCH_HEAPSIZE The maximum amount of heap to use, in MB.
+# Default is 1000.
+#
+# NUTCH_OPTS Extra Java runtime options.
+#
+
+# resolve links - $0 may be a softlink
+THIS="$0"
+while [ -h "$THIS" ]; do
+ ls=`ls -ld "$THIS"`
+ link=`expr "$ls" : '.*-> \(.*\)$'`
+ if expr "$link" : '.*/.*' > /dev/null; then
+ THIS="$link"
+ else
+ THIS=`dirname "$THIS"`/"$link"
+ fi
+done
+
+# if no args specified, show usage
+if [ $# = 0 ]; then
+ echo "Usage: nutch COMMAND"
+ echo "where COMMAND is one of:"
+ echo " crawl one-step crawler for intranets"
+ echo " admin database administration, including creation"
+ echo " inject inject new urls into the database"
+ echo " generate generate new segments to fetch"
+ echo " fetchlist print the fetchlist of a segment"
+ echo " fetch fetch a segment's pages"
+ echo " parse parse a segment's pages"
+ echo " index run the indexer on a segment's fetcher output"
+ echo " merge merge several segment indexes"
+ echo " dedup remove duplicates from a set of segment indexes"
+ echo " updatedb update database from a segment's fetcher output"
+ echo " mergesegs merge multiple segments into a single segment"
+ echo " readdb examine arbitrary fields of the database"
+ echo " analyze adjust database link-analysis scoring"
+ echo " prune prune segment index(es) of unwanted content"
+ echo " segread read, fix and dump segment data"
+ echo " segslice append, join and slice segment data"
+ echo " server run a search server"
+ echo " namenode run the NDFS namenode"
+ echo " datanode run an NDFS datanode"
+ echo " ndfs run an NDFS admin client"
+ echo " jobtracker run the MapReduce job Tracker node"
+ echo " tasktracker run a MapReduce task Tracker node"
+ echo " or"
+ echo " CLASSNAME run the class named CLASSNAME"
+ echo "Most commands print help when invoked w/o parameters."
+ exit 1
+fi
+
+# get arguments
+COMMAND=$1
+shift
+
+# some directories
+THIS_DIR=`dirname "$THIS"`
+NUTCH_HOME=`cd "$THIS_DIR/.." ; pwd`
+
+# some Java parameters
+if [ "$NUTCH_JAVA_HOME" != "" ]; then
+ echo "run java in $NUTCH_JAVA_HOME"
+ JAVA_HOME=$NUTCH_JAVA_HOME
+fi
+
+if [ "$JAVA_HOME" = "" ]; then
+ echo "Error: JAVA_HOME is not set."
+ exit 1
+fi
+
+JAVA=$JAVA_HOME/bin/java
+JAVA_HEAP_MAX=-Xmx1000m
+
+# check envvars which might override default args
+if [ "$NUTCH_HEAPSIZE" != "" ]; then
+ echo "run with heapsize $NUTCH_HEAPSIZE"
+ JAVA_HEAP_MAX="-Xmx""$NUTCH_HEAPSIZE""m"
+ echo $JAVA_HEAP_MAX
+fi
+
+# CLASSPATH initially contains $NUTCH_CONF_DIR, or defaults to $NUTCH_HOME/conf
+CLASSPATH=${NUTCH_CONF_DIR:=$NUTCH_HOME/conf}
+
+# for developers, add Nutch classes to CLASSPATH
+if [ -d "$NUTCH_HOME/build/classes" ]; then
+ CLASSPATH=${CLASSPATH}:$NUTCH_HOME/build/classes
+fi
+if [ -d "$NUTCH_HOME/build/plugins" ]; then
+ CLASSPATH=${CLASSPATH}:$NUTCH_HOME/build
+fi
+if [ -d "$NUTCH_HOME/build/test/classes" ]; then
+ CLASSPATH=${CLASSPATH}:$NUTCH_HOME/build/test/classes
+fi
+
+# so that filenames w/ spaces are handled correctly in loops below
+IFS=
+
+# for releases, add Nutch jar to CLASSPATH
+for f in $NUTCH_HOME/nutch-*.jar; do
+ CLASSPATH=${CLASSPATH}:$f;
+done
+
+# add plugins to classpath
+if [ -d "$NUTCH_HOME/plugins" ]; then
+ CLASSPATH=${CLASSPATH}:$NUTCH_HOME
+fi
+
+# add libs to CLASSPATH
+for f in $NUTCH_HOME/lib/*.jar; do
+ CLASSPATH=${CLASSPATH}:$f;
+done
+
+for f in $NUTCH_HOME/lib/jettyext/*.jar; do
+ CLASSPATH=${CLASSPATH}:$f;
+done
+
+# restore ordinary behaviour
+unset IFS
+
+# figure out which class to run
+if [ "$COMMAND" = "crawl" ] ; then
+ CLASS=org.apache.nutch.tools.CrawlTool
+elif [ "$COMMAND" = "admin" ] ; then
+ CLASS=org.apache.nutch.tools.WebDBAdminTool
+elif [ "$COMMAND" = "inject" ] ; then
+ CLASS=org.apache.nutch.db.WebDBInjector
+elif [ "$COMMAND" = "generate" ] ; then
+ CLASS=org.apache.nutch.tools.FetchListTool
+elif [ "$COMMAND" = "fetchlist" ] ; then
+ CLASS=org.apache.nutch.pagedb.FetchListEntry
+elif [ "$COMMAND" = "fetch" ] ; then
+ CLASS=org.apache.nutch.fetcher.Fetcher
+elif [ "$COMMAND" = "parse" ] ; then
+ CLASS=org.apache.nutch.tools.ParseSegment
+elif [ "$COMMAND" = "index" ] ; then
+ CLASS=org.apache.nutch.indexer.IndexSegment
+elif [ "$COMMAND" = "merge" ] ; then
+ CLASS=org.apache.nutch.indexer.IndexMerger
+elif [ "$COMMAND" = "dedup" ] ; then
+ CLASS=org.apache.nutch.indexer.DeleteDuplicates
+elif [ "$COMMAND" = "updatedb" ] ; then
+ CLASS=org.apache.nutch.tools.UpdateDatabaseTool
+elif [ "$COMMAND" = "mergesegs" ] ; then
+ CLASS=org.apache.nutch.tools.SegmentMergeTool
+elif [ "$COMMAND" = "readdb" ] ; then
+ CLASS=org.apache.nutch.db.WebDBReader
+elif [ "$COMMAND" = "prune" ] ; then
+ CLASS=org.apache.nutch.tools.PruneIndexTool
+elif [ "$COMMAND" = "segread" ] ; then
+ CLASS=org.apache.nutch.segment.SegmentReader
+elif [ "$COMMAND" = "segslice" ] ; then
+ CLASS=org.apache.nutch.segment.SegmentSlicer
+elif [ "$COMMAND" = "analyze" ] ; then
+ CLASS=org.apache.nutch.tools.LinkAnalysisTool
+elif [ "$COMMAND" = "server" ] ; then
+ CLASS='org.apache.nutch.searcher.DistributedSearch$Server'
+elif [ "$COMMAND" = "namenode" ] ; then
+ CLASS='org.apache.nutch.ndfs.NDFS$NameNode'
+elif [ "$COMMAND" = "datanode" ] ; then
+ CLASS='org.apache.nutch.ndfs.NDFS$DataNode'
+elif [ "$COMMAND" = "ndfs" ] ; then
+ CLASS=org.apache.nutch.fs.TestClient
+elif [ "$COMMAND" = "jobtracker" ] ; then
+ CLASS=org.apache.nutch.mapReduce.JobTracker
+elif [ "$COMMAND" = "tasktracker" ] ; then
+ CLASS=org.apache.nutch.mapReduce.TaskTracker
+else
+ CLASS=$COMMAND
+fi
+
+# cygwin path translation
+if expr match `uname` 'CYGWIN*' > /dev/null; then
+ CLASSPATH=`cygpath -p -w "$CLASSPATH"`
+fi
+
+# run it
+exec $JAVA $JAVA_HEAP_MAX $NUTCH_OPTS -classpath "$CLASSPATH" $CLASS "$@"
+
View
85 bin/nutch-daemon.sh
@@ -0,0 +1,85 @@
+#!/bin/bash
+#
+# Runs a Nutch command as a daemon.
+#
+# Environment Variables
+#
+# NUTCH_LOG_DIR Where log files are stored. PWD by default.
+#
+
+# resolve links - $0 may be a softlink
+this="$0"
+while [ -h "$this" ]; do
+ ls=`ls -ld "$this"`
+ link=`expr "$ls" : '.*-> \(.*\)$'`
+ if expr "$link" : '.*/.*' > /dev/null; then
+ this="$link"
+ else
+ this=`dirname "$this"`/"$link"
+ fi
+done
+
+usage="Usage: nutch-daemon [start|stop] [nutch-command] [args...]"
+
+# if no args specified, show usage
+if [ $# -le 1 ]; then
+ echo $usage
+ exit 1
+fi
+
+# get arguments
+startStop=$1
+shift
+command=$1
+shift
+
+# some directories
+this_dir=`dirname "$this"`
+
+if [ "$NUTCH_LOG_DIR" = "" ]; then
+ NUTCH_LOG_DIR=.
+fi
+
+# some variables
+log=$NUTCH_LOG_DIR/nutch-$command-`hostname`.log
+pid=/tmp/nutch-$command.pid
+
+case $startStop in
+
+ (start)
+
+ if [ -f $pid ]; then
+ if [ -a /proc/`cat $pid` ]; then
+ echo $command running as process `cat $pid`. Stop it first.
+ exit 1
+ fi
+ fi
+
+ echo starting $command, logging to $log
+ $this_dir/nutch $command "$@" >& $log < /dev/null &
+ echo $! > $pid
+ sleep 1; head $log
+ ;;
+
+ (stop)
+
+ if [ -f $pid ]; then
+ if [ -a /proc/`cat $pid` ]; then
+ echo stopping $command
+ kill `cat $pid`
+ else
+ echo no $command to stop
+ fi
+ else
+ echo no $command to stop
+ fi
+ ;;
+
+ (*)
+ echo $usage
+ exit 1
+ ;;
+
+esac
+
+
View
422 build.xml
@@ -0,0 +1,422 @@
+<?xml version="1.0"?>
+
+<project name="Nutch" default="compile" basedir=".">
+
+ <!-- Load all the default properties, and any the user wants -->
+ <!-- to contribute (without having to type -D or edit this file) -->
+ <property file="${user.home}/build.properties" />
+ <property file="${basedir}/build.properties" />
+ <property file="${basedir}/default.properties" />
+
+ <!-- the normal classpath -->
+ <path id="classpath">
+ <pathelement location="${build.classes}"/>
+ <fileset dir="${lib.dir}">
+ <include name="*.jar" />
+ </fileset>
+ </path>
+
+ <!-- the unit test classpath -->
+ <dirname property="plugins.classpath.dir" file="${build.plugins}"/>
+ <path id="test.classpath">
+ <pathelement location="${test.build.classes}" />
+ <pathelement location="${conf.dir}"/>
+ <pathelement location="${plugins.classpath.dir}"/>
+ <path refid="classpath"/>
+ </path>
+
+
+ <!-- ====================================================== -->
+ <!-- Stuff needed by all targets -->
+ <!-- ====================================================== -->
+ <!-- Creates the build and test output directories and instantiates every -->
+ <!-- *.template file under the conf directory under its un-suffixed name. -->
+ <target name="init">
+ <mkdir dir="${build.dir}"/>
+ <mkdir dir="${build.classes}"/>
+
+ <mkdir dir="${test.build.dir}"/>
+ <mkdir dir="${test.build.classes}"/>
+
+ <!-- Back-date all templates to 1971 before copying them. -->
+ <!-- NOTE(review): copy skips destinations that are newer than their source, -->
+ <!-- so this presumably keeps an already existing (possibly edited) file -->
+ <!-- from being overwritten by its template; confirm against the Ant manual. -->
+ <touch datetime="01/25/1971 2:00 pm">
+ <fileset dir="${conf.dir}" includes="**/*.template"/>
+ </touch>
+
+ <!-- foo.template is copied to foo in the same directory. -->
+ <copy todir="${conf.dir}" verbose="true">
+ <fileset dir="${conf.dir}" includes="**/*.template"/>
+ <mapper type="glob" from="*.template" to="*"/>
+ </copy>
+
+
+ </target>
+
+ <!-- ====================================================== -->
+ <!-- Compile the Java files -->
+ <!-- ====================================================== -->
+ <target name="compile" depends="compile-core, compile-plugins, jar"/>
+
+ <target name="compile-core" depends="init">
+ <javac
+ encoding="${build.encoding}"
+ srcdir="${src.dir}"
+ includes="org/apache/nutch/**/*.java"
+ destdir="${build.classes}"
+ debug="${debug}"
+ optimize="${optimize}"
+ deprecation="${deprecation}">
+ <classpath refid="classpath"/>
+ </javac>
+ </target>
+
+ <!-- Delegates to the "deploy" target of the build in src/plugin. -->
+ <!-- inheritAll="false": properties set in this build are not handed down. -->
+ <target name="compile-plugins">
+ <ant dir="src/plugin" target="deploy" inheritAll="false"/>
+ </target>
+
+ <!-- Runs JavaCC over the two .jj grammars in the source tree. -->
+ <!-- Within this file only the "dynamic" target depends on it; the default -->
+ <!-- "compile" target does not. -->
+ <target name="generate-src" depends="init">
+ <javacc target="${src.dir}/org/apache/nutch/quality/dynamic/PageDescription.jj"
+ javacchome="${javacc.home}">
+ </javacc>
+ <javacc target="${src.dir}/org/apache/nutch/analysis/NutchAnalysis.jj"
+ javacchome="${javacc.home}">
+ </javacc>
+
+ <!-- Normalise every Java source under src.dir to LF line endings. -->
+ <fixcrlf srcdir="${src.dir}" eol="lf" includes="**/*.java"/>
+
+ </target>
+
+ <target name="dynamic" depends="generate-src, compile">
+ </target>
+
+
+ <!-- ================================================================== -->
+ <!-- Make nutch.jar -->
+ <!-- ================================================================== -->
+ <!-- -->
+ <!-- ================================================================== -->
+ <target name="jar" depends="compile-core">
+ <!-- Put both configuration files into the class tree so that they end up -->
+ <!-- inside the jar. nutch-site.xml is produced by the init target from -->
+ <!-- nutch-site.xml.template. -->
+ <copy file="${conf.dir}/nutch-default.xml"
+ todir="${build.classes}"/>
+ <copy file="${conf.dir}/nutch-site.xml"
+ todir="${build.classes}"/>
+ <jar
+ jarfile="${build.dir}/${final.name}.jar"
+ basedir="${build.classes}"
+ />
+ <!-- Create or replace a ${name}.jar symbolic link that points at the -->
+ <!-- versioned jar; the clean target removes this link again. -->
+ <symlink overwrite="true" link="${name}.jar"
+ resource="${build.dir}/${final.name}.jar"/>
+ </target>
+
+ <!-- ================================================================== -->
+ <!-- Make nutch.war -->
+ <!-- ================================================================== -->
+ <!-- -->
+ <!-- ================================================================== -->
+ <target name="war" depends="jar,generate-docs">
+ <war destfile="${build.dir}/${final.name}.war"
+ webxml="${web.src.dir}/web.xml">
+ <fileset dir="${web.src.dir}/jsp"/>
+ <zipfileset dir="${docs.src}" includes="include/*.html"/>
+ <zipfileset dir="${build.docs}" includes="*/include/*.html"/>
+ <fileset dir="${docs.dir}"/>
+ <lib dir="${lib.dir}">
+ <include name="lucene*.jar"/>
+ <include name="taglibs-*.jar"/>
+ <include name="dom4j-*.jar"/>
+ <include name="xerces-*.jar"/>
+ </lib>
+ <lib dir="${build.dir}">
+ <include name="${final.name}.jar"/>
+ </lib>
+ <classes dir="${conf.dir}" excludes="**/*.template"/>
+ <classes dir="${web.src.dir}/locale"/>
+ <zipfileset prefix="WEB-INF/classes/plugins" dir="${build.plugins}"/>
+ <webinf dir="${lib.dir}">
+ <include name="taglibs-*.tld"/>
+ </webinf>
+ </war>
+ </target>
+
+
+ <!-- ================================================================== -->
+ <!-- Compile test code -->
+ <!-- ================================================================== -->
+ <target name="compile-core-test" depends="compile-core">
+ <javac
+ encoding="${build.encoding}"
+ srcdir="${test.src.dir}"
+ includes="org/apache/nutch/**/*.java"
+ destdir="${test.build.classes}"
+ debug="${debug}">
+ <classpath refid="test.classpath"/>
+ </javac>
+ </target>
+
+ <!-- ================================================================== -->
+ <!-- Run unit tests -->
+ <!-- ================================================================== -->
+ <target name="test" depends="test-core, test-plugins"/>
+
+ <!-- Runs the core JUnit tests in a forked JVM. Set the "testcase" property -->
+ <!-- (e.g. -Dtestcase=TestFoo) to run only the matching test source. -->
+ <target name="test-core" depends="compile, compile-core-test">
+
+ <!-- Start from an empty test data directory on every run. -->
+ <delete dir="${test.build.data}"/>
+ <mkdir dir="${test.build.data}"/>
+
+ <!-- The test-specific nutch-site.xml is placed on the test classpath. -->
+ <copy file="${test.src.dir}/nutch-site.xml"
+ todir="${test.build.classes}"/>
+
+ <!-- haltonfailure="no": all tests run; errors and failures only set -->
+ <!-- tests.failed, which is checked by the fail task after the run. -->
+ <junit printsummary="yes" haltonfailure="no" fork="yes" dir="${basedir}"
+ errorProperty="tests.failed" failureProperty="tests.failed">
+ <sysproperty key="test.build.data" value="${test.build.data}"/>
+ <sysproperty key="test.src.dir" value="${test.src.dir}"/>
+ <classpath refid="test.classpath"/>
+ <formatter type="plain" />
+ <!-- Default: every Test*.java except the one named by test.exclude. -->
+ <batchtest todir="${test.build.dir}" unless="testcase">
+ <fileset dir="${test.src.dir}"
+ includes="**/Test*.java" excludes="**/${test.exclude}.java" />
+ </batchtest>
+ <!-- With testcase set: only the source file of that name. -->
+ <batchtest todir="${test.build.dir}" if="testcase">
+ <fileset dir="${test.src.dir}" includes="**/${testcase}.java"/>
+ </batchtest>
+ </junit>
+
+ <fail if="tests.failed">Tests failed!</fail>
+
+ </target>
+
+ <!-- Delegates to the "test" target of the build in src/plugin. -->
+ <!-- inheritAll="false": properties set in this build are not handed down. -->
+ <target name="test-plugins" depends="compile">
+ <ant dir="src/plugin" target="test" inheritAll="false"/>
+ </target>
+
+
+ <!-- ================================================================== -->
+ <!-- Documentation -->
+ <!-- ================================================================== -->
+ <target name="javadoc" depends="compile">
+ <mkdir dir="${build.javadoc}"/>
+ <javadoc
+ overview="${src.dir}/overview.html"
+ destdir="${build.javadoc}"
+ author="true"
+ version="true"
+ use="true"
+ windowtitle="${Name} ${version} API"
+ doctitle="${Name} ${version} API"
+ bottom="Copyright &amp;copy; ${year} The Apache Software Foundation"
+ >
+ <packageset dir="${src.dir}"/>
+ <packageset dir="${plugins.dir}/protocol-file/src/java"/>
+ <packageset dir="${plugins.dir}/protocol-ftp/src/java"/>
+ <packageset dir="${plugins.dir}/protocol-http/src/java"/>
+ <packageset dir="${plugins.dir}/parse-html/src/java"/>
+ <packageset dir="${plugins.dir}/parse-text/src/java"/>
+ <packageset dir="${plugins.dir}/parse-pdf/src/java"/>
+ <packageset dir="${plugins.dir}/parse-rtf/src/java"/>
+ <packageset dir="${plugins.dir}/parse-mp3/src/java"/>
+ <packageset dir="${plugins.dir}/parse-msword/src/java"/>
+ <packageset dir="${plugins.dir}/index-basic/src/java"/>
+ <packageset dir="${plugins.dir}/index-more/src/java"/>
+ <packageset dir="${plugins.dir}/query-more/src/java"/>
+ <packageset dir="${plugins.dir}/urlfilter-regex/src/java"/>
+ <packageset dir="${plugins.dir}/urlfilter-prefix/src/java"/>
+ <packageset dir="${plugins.dir}/creativecommons/src/java"/>
+ <packageset dir="${plugins.dir}/languageidentifier/src/java"/>
+ <packageset dir="${plugins.dir}/clustering-carrot2/src/java"/>
+ <packageset dir="${plugins.dir}/ontology/src/java"/>
+ <link href="${javadoc.link.java}"/>
+ <link href="${javadoc.link.lucene}"/>
+ <classpath refid="classpath"/>
+ <classpath>
+ <fileset dir="${plugins.dir}" >
+ <include name="**/*.jar"/>
+ </fileset>
+ </classpath>
+ <group title="Core" packages="org.apache.nutch.*"/>
+ <group title="Plugins" packages="${plugins.packages}"/>
+ </javadoc>
+ </target>
+
+ <!-- Renders nutch-default.xml through nutch-conf.xsl (an HTML table of -->
+ <!-- property name, value and description) into the docs directory. -->
+ <!-- Uses the xslt task, like generate-locale, rather than its deprecated -->
+ <!-- "style" alias, and locates the stylesheet through conf.dir like the input. -->
+ <target name="default-doc">
+   <xslt basedir="${conf.dir}" destdir="${docs.dir}"
+         includes="nutch-default.xml" style="${conf.dir}/nutch-conf.xsl"/>
+ </target>
+
+ <!-- Generates the HTML pages for one locale. Does nothing unless the -->
+ <!-- doc.locale property is set; generate-docs supplies it through antcall. -->
+ <target name="generate-locale" if="doc.locale">
+ <echo message="Generating docs for locale=${doc.locale}"/>
+
+ <!-- Step 1: transform the locale's header.xml into include/header.html -->
+ <!-- under the build docs directory. -->
+ <mkdir dir="${build.docs}/${doc.locale}/include"/>
+ <xslt in="${docs.src}/include/${doc.locale}/header.xml"
+ out="${build.docs}/${doc.locale}/include/header.html"
+ style="${docs.src}/style/nutch-header.xsl"/>
+
+ <!-- Step 2: the locale's generated pages depend on its include XML and on -->
+ <!-- all stylesheets; dependset discards out-of-date pages so that the -->
+ <!-- transform below regenerates them. -->
+ <dependset>
+ <srcfileset dir="${docs.src}/include/${doc.locale}" includes="*.xml"/>
+ <srcfileset dir="${docs.src}/style" includes="*.xsl"/>
+ <targetfileset dir="${docs.dir}/${doc.locale}" includes="*.html"/>
+ </dependset>
+
+ <!-- Step 3: run the page stylesheet from a per-locale copy. -->
+ <!-- NOTE(review): presumably so that relative references inside -->
+ <!-- nutch-page.xsl resolve against this locale's build directory (where -->
+ <!-- include/header.html was just written); confirm in the stylesheet. -->
+ <copy file="${docs.src}/style/nutch-page.xsl"
+ todir="${build.docs}/${doc.locale}"
+ preservelastmodified="true"/>
+
+ <xslt basedir="${docs.src}/pages/${doc.locale}"
+ destdir="${docs.dir}/${doc.locale}"
+ includes="*.xml"
+ style="${build.docs}/${doc.locale}/nutch-page.xsl"/>
+ </target>
+
+
+ <target name="generate-docs" depends="init">
+ <dependset>
+ <srcfileset dir="${docs.src}/include" includes="*.html"/>
+ <targetfileset dir="${docs.dir}" includes="**/*.html"/>
+ </dependset>
+
+ <mkdir dir="${build.docs}/include"/>
+ <copy todir="${build.docs}/include">
+ <fileset dir="${docs.src}/include"/>
+ </copy>
+
+ <antcall target="generate-locale">
+ <param name="doc.locale" value="ca"/>
+ </antcall>
+
+ <antcall target="generate-locale">
+ <param name="doc.locale" value="de"/>
+ </antcall>
+
+ <antcall target="generate-locale">
+ <param name="doc.locale" value="en"/>
+ </antcall>
+
+ <antcall target="generate-locale">
+ <param name="doc.locale" value="es"/>
+ </antcall>
+
+ <antcall target="generate-locale">
+ <param name="doc.locale" value="fi"/>
+ </antcall>
+
+ <antcall target="generate-locale">
+ <param name="doc.locale" value="fr"/>
+ </antcall>
+
+ <antcall target="generate-locale">
+ <param name="doc.locale" value="hu"/>
+ </antcall>
+
+ <antcall target="generate-locale">
+ <param name="doc.locale" value="jp"/>
+ </antcall>
+
+ <antcall target="generate-locale">
+ <param name="doc.locale" value="ms"/>
+ </antcall>
+
+ <antcall target="generate-locale">
+ <param name="doc.locale" value="nl"/>
+ </antcall>
+
+ <antcall target="generate-locale">
+ <param name="doc.locale" value="pl"/>
+ </antcall>
+
+ <antcall target="generate-locale">
+ <param name="doc.locale" value="pt"/>
+ </antcall>
+
+ <antcall target="generate-locale">
+ <param name="doc.locale" value="sv"/>
+ </antcall>
+
+ <antcall target="generate-locale">
+ <param name="doc.locale" value="th"/>
+ </antcall>
+
+ <antcall target="generate-locale">
+ <param name="doc.locale" value="zh"/>
+ </antcall>
+
+ <fixcrlf srcdir="${docs.dir}" eol="lf" encoding="utf-8"
+ includes="**/*.html"/>
+
+ </target>
+
+ <!-- ================================================================== -->
+ <!-- D I S T R I B U T I O N -->
+ <!-- ================================================================== -->
+ <!-- -->
+ <!-- ================================================================== -->
+ <!-- Assembles the complete distribution tree under dist.dir: libraries, -->
+ <!-- built plugins, the jar and war, launcher scripts, configuration, -->
+ <!-- docs and javadoc, top-level text files, sources and the build files. -->
+ <target name="package" depends="jar, war, javadoc">
+   <mkdir dir="${dist.dir}"/>
+   <mkdir dir="${dist.dir}/lib"/>
+   <mkdir dir="${dist.dir}/bin"/>
+   <mkdir dir="${dist.dir}/docs"/>
+   <mkdir dir="${dist.dir}/docs/api"/>
+   <mkdir dir="${dist.dir}/plugins"/>
+
+   <copy todir="${dist.dir}/lib" includeEmptyDirs="false">
+     <fileset dir="lib"/>
+   </copy>
+
+   <copy todir="${dist.dir}/plugins">
+     <fileset dir="${build.plugins}"/>
+   </copy>
+
+   <copy file="${build.dir}/${final.name}.jar" todir="${dist.dir}"/>
+
+   <copy file="${build.dir}/${final.name}.war" todir="${dist.dir}"/>
+
+   <copy todir="${dist.dir}/bin">
+     <fileset dir="bin"/>
+   </copy>
+
+   <!-- Templates stay out of the distribution; only the files instantiated -->
+   <!-- from them by the init target are shipped. -->
+   <copy todir="${dist.dir}/conf">
+     <fileset dir="${conf.dir}" excludes="**/*.template"/>
+   </copy>
+
+   <!-- Make the launcher scripts executable. The built-in chmod task -->
+   <!-- replaces running the chmod binary through apply: Ant skips the task -->
+   <!-- on platforms without Unix permissions instead of failing the build -->
+   <!-- because no chmod executable can be found. -->
+   <chmod perm="+x">
+     <fileset dir="${dist.dir}/bin"/>
+   </chmod>
+
+   <copy todir="${dist.dir}/docs">
+     <fileset dir="${docs.dir}"/>
+   </copy>
+
+   <copy todir="${dist.dir}/docs/api">
+     <fileset dir="${build.javadoc}"/>
+   </copy>
+
+   <copy todir="${dist.dir}">
+     <fileset dir=".">
+       <include name="*.txt" />
+     </fileset>
+   </copy>
+
+   <copy todir="${dist.dir}/src" includeEmptyDirs="false">
+     <fileset dir="src"/>
+   </copy>
+
+   <copy todir="${dist.dir}/" file="build.xml"/>
+   <copy todir="${dist.dir}/" file="default.properties"/>
+
+ </target>
+
+ <!-- ================================================================== -->
+ <!-- Make release tarball -->
+ <!-- ================================================================== -->
+ <!-- Runs the system tar inside build.dir to pack the directory named -->
+ <!-- final.name into final.name.tar.gz (gzip-compressed). -->
+ <!-- NOTE(review): this only archives the output of the package target if -->
+ <!-- dist.dir is build.dir/final.name; confirm in default.properties. -->
+ <target name="tar" depends="package">
+ <exec executable="tar" dir="${build.dir}">
+ <arg value="czf"/>
+ <arg value="${final.name}.tar.gz"/>
+ <arg value="${final.name}"/>
+ </exec>
+ </target>
+
+ <!-- ================================================================== -->
+ <!-- Clean. Delete the build files, and their directories -->
+ <!-- ================================================================== -->
+ <target name="clean">
+ <delete dir="${build.dir}"/>
+ <!-- Remove the ${name}.jar symbolic link created by the jar target. -->
+ <!-- NOTE(review): rm is presumably used instead of the delete task because -->
+ <!-- the link is already dangling once build.dir is gone; this requires an -->
+ <!-- rm executable on the PATH. Confirm before replacing it. -->
+ <exec executable="rm"><arg value="-f"/><arg value="${name}.jar"/></exec>
+ </target>
+
+</project>
View
13 conf/common-terms.utf8
@@ -0,0 +1,13 @@
+# Common terms and phrases which will be indexed in n-grams
+# in order to optimize search.
+content:a
+content:and
+content:for
+content:in
+content:of
+content:the
+content:to
+url:com
+url:http
+url:http-www
+url:www
View
50 conf/crawl-tool.xml
@@ -0,0 +1,50 @@
+<?xml version="1.0" ?>
+<?xml-stylesheet type="text/xsl" href="nutch-conf.xsl"?>
+
+<!-- Overridden defaults for intranet use. -->
+
+<!-- Do not modify this file directly. Instead, copy entries that you -->
+<!-- wish to modify from this file into nutch-site.xml and change them -->
+<!-- there. If nutch-site.xml does not already exist, create it. -->
+
+<nutch-conf>
+
+<property>
+ <name>urlfilter.regex.file</name>
+ <value>crawl-urlfilter.txt</value>
+</property>
+
+<property>
+ <name>indexer.boost.by.link.count</name>
+ <value>true</value>
+ <description>When true, scores for a page are multiplied by the log of
+ the number of incoming links to the page.</description>
+</property>
+
+<property>
+ <name>db.ignore.internal.links</name>
+ <value>false</value>
+ <description>If true, when adding new links to a page, links from
+ the same host are ignored. This is an effective way to limit the
+ size of the link database, keeping only the highest quality
+ links.
+ </description>
+</property>
+
+<property>
+ <name>fetcher.server.delay</name>
+ <value>1.0</value>
+ <description>The number of seconds the fetcher will delay between
+ successive requests to the same server.</description>
+</property>
+
+<property>
+ <name>http.max.delays</name>
+ <value>100</value>
+ <description>The number of times a thread will delay when trying to
+ fetch a page. When using the crawl tool there are likely to be very
+ few different hosts, so we need to be willing to wait longer for
+ each.</description>
+</property>
+
+</nutch-conf>
View
24 conf/crawl-urlfilter.txt.template
@@ -0,0 +1,24 @@
+# The url filter file used by the crawl command.
+
+# Better for intranet crawling.
+# Be sure to change MY.DOMAIN.NAME to your domain name.
+
+# Each non-comment, non-blank line contains a regular expression
+# prefixed by '+' or '-'. The first matching pattern in the file
+# determines whether a URL is included or ignored. If no pattern
+# matches, the URL is ignored.
+
+# skip file:, ftp:, & mailto: urls
+-^(file|ftp|mailto):
+
+# skip image and other suffixes we can't yet parse
+-\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe)$
+
+# skip URLs containing certain characters as probable queries, etc.
+-[?*!@=]
+
+# accept hosts in MY.DOMAIN.NAME
++^http://([a-z0-9]*\.)*MY.DOMAIN.NAME/
+
+# skip everything else
+-.
View
480 conf/mime.types
@@ -0,0 +1,480 @@
+# This is a comment. I love comments.
+
+# This file controls what Internet media types are sent to the client for
+# given file extension(s). Sending the correct media type to the client
+# is important so they know how to handle the content of the file.
+# Extra types can either be added here or by using an AddType directive
+# in your config files. For more information about Internet media types,
+# please read RFC 2045, 2046, 2047, 2048, and 2077. The Internet media type
+# registry is at <http://www.iana.org/assignments/media-types/>.
+
+# MIME type Extension
+application/EDI-Consent
+application/EDI-X12
+application/EDIFACT
+application/activemessage
+application/andrew-inset ez
+application/applefile
+application/atomicmail
+application/batch-SMTP
+application/beep+xml
+application/cals-1840
+application/commonground
+application/cybercash
+application/dca-rft
+application/dec-dx
+application/dvcs
+application/eshop
+application/http
+application/hyperstudio
+application/iges
+application/index
+application/index.cmd
+application/index.obj
+application/index.response
+application/index.vnd
+application/iotp
+application/ipp
+application/isup
+application/font-tdpfr
+application/mac-binhex40 hqx
+application/mac-compactpro cpt
+application/macwriteii
+application/marc
+application/mathematica
+application/mathematica-old
+application/msword doc
+application/news-message-id
+application/news-transmission
+application/ocsp-request
+application/ocsp-response
+application/octet-stream bin dms lha lzh exe class so dll
+application/oda oda
+application/parityfec
+application/pdf pdf
+application/pgp-encrypted
+application/pgp-keys
+application/pgp-signature
+application/pkcs10
+application/pkcs7-mime
+application/pkcs7-signature
+application/pkix-cert
+application/pkix-crl
+application/pkixcmp
+application/postscript ai eps ps
+application/prs.alvestrand.titrax-sheet
+application/prs.cww
+application/prs.nprend
+application/qsig
+application/remote-printing
+application/riscos
+application/rtf rtf
+application/sdp
+application/set-payment
+application/set-payment-initiation
+application/set-registration
+application/set-registration-initiation
+application/sgml
+application/sgml-open-catalog
+application/sieve
+application/slate
+application/smil smi smil
+application/timestamp-query
+application/timestamp-reply
+application/vemmi
+application/vnd.3M.Post-it-Notes
+application/vnd.FloGraphIt
+application/vnd.accpac.simply.aso
+application/vnd.accpac.simply.imp
+application/vnd.acucobol
+application/vnd.aether.imp
+application/vnd.anser-web-certificate-issue-initiation
+application/vnd.anser-web-funds-transfer-initiation
+application/vnd.audiograph
+application/vnd.businessobjects
+application/vnd.bmi
+application/vnd.canon-cpdl
+application/vnd.canon-lips
+application/vnd.claymore
+application/vnd.commerce-battelle
+application/vnd.commonspace
+application/vnd.comsocaller
+application/vnd.contact.cmsg
+application/vnd.cosmocaller
+application/vnd.cups-postscript
+application/vnd.cups-raster
+application/vnd.cups-raw
+application/vnd.ctc-posml
+application/vnd.cybank
+application/vnd.dna
+application/vnd.dpgraph
+application/vnd.dxr
+application/vnd.ecdis-update
+application/vnd.ecowin.chart
+application/vnd.ecowin.filerequest
+application/vnd.ecowin.fileupdate
+application/vnd.ecowin.series
+application/vnd.ecowin.seriesrequest
+application/vnd.ecowin.seriesupdate
+application/vnd.enliven
+application/vnd.epson.esf
+application/vnd.epson.msf
+application/vnd.epson.quickanime
+application/vnd.epson.salt
+application/vnd.epson.ssf
+application/vnd.ericsson.quickcall
+application/vnd.eudora.data
+application/vnd.fdf
+application/vnd.ffsns
+application/vnd.framemaker
+application/vnd.fsc.weblaunch
+application/vnd.fujitsu.oasys
+application/vnd.fujitsu.oasys2
+application/vnd.fujitsu.oasys3
+application/vnd.fujitsu.oasysgp
+application/vnd.fujitsu.oasysprs
+application/vnd.fujixerox.ddd
+application/vnd.fujixerox.docuworks
+application/vnd.fujixerox.docuworks.binder
+application/vnd.fut-misnet
+application/vnd.grafeq
+application/vnd.groove-account
+application/vnd.groove-identity-message
+application/vnd.groove-injector
+application/vnd.groove-tool-message
+application/vnd.groove-tool-template
+application/vnd.groove-vcard
+application/vnd.hhe.lesson-player
+application/vnd.hp-HPGL
+application/vnd.hp-PCL
+application/vnd.hp-PCLXL
+application/vnd.hp-hpid
+application/vnd.hp-hps
+application/vnd.httphone
+application/vnd.hzn-3d-crossword
+application/vnd.ibm.afplinedata
+application/vnd.ibm.MiniPay
+application/vnd.ibm.modcap
+application/vnd.informix-visionary
+application/vnd.intercon.formnet
+application/vnd.intertrust.digibox
+application/vnd.intertrust.nncp
+application/vnd.intu.qbo
+application/vnd.intu.qfx
+application/vnd.irepository.package+xml
+application/vnd.is-xpr
+application/vnd.japannet-directory-service
+application/vnd.japannet-jpnstore-wakeup
+application/vnd.japannet-payment-wakeup
+application/vnd.japannet-registration
+application/vnd.japannet-registration-wakeup
+application/vnd.japannet-setstore-wakeup
+application/vnd.japannet-verification
+application/vnd.japannet-verification-wakeup
+application/vnd.koan
+application/vnd.lotus-1-2-3
+application/vnd.lotus-approach
+application/vnd.lotus-freelance
+application/vnd.lotus-notes
+application/vnd.lotus-organizer
+application/vnd.lotus-screencam
+application/vnd.lotus-wordpro
+application/vnd.mcd
+application/vnd.mediastation.cdkey
+application/vnd.meridian-slingshot
+application/vnd.mif mif
+application/vnd.minisoft-hp3000-save
+application/vnd.mitsubishi.misty-guard.trustweb
+application/vnd.mobius.daf
+application/vnd.mobius.dis
+application/vnd.mobius.msl
+application/vnd.mobius.plc
+application/vnd.mobius.txf
+application/vnd.motorola.flexsuite
+application/vnd.motorola.flexsuite.adsi
+application/vnd.motorola.flexsuite.fis
+application/vnd.motorola.flexsuite.gotap
+application/vnd.motorola.flexsuite.kmr
+application/vnd.motorola.flexsuite.ttc
+application/vnd.motorola.flexsuite.wem
+application/vnd.mozilla.xul+xml
+application/vnd.ms-artgalry
+application/vnd.ms-asf
+application/vnd.ms-excel xls
+application/vnd.ms-lrm
+application/vnd.ms-powerpoint ppt
+application/vnd.ms-project
+application/vnd.ms-tnef
+application/vnd.ms-works
+application/vnd.mseq
+application/vnd.msign
+application/vnd.music-niff
+application/vnd.musician
+application/vnd.netfpx
+application/vnd.noblenet-directory
+application/vnd.noblenet-sealer
+application/vnd.noblenet-web
+application/vnd.novadigm.EDM
+application/vnd.novadigm.EDX
+application/vnd.novadigm.EXT
+application/vnd.osa.netdeploy
+application/vnd.palm
+application/vnd.pg.format
+application/vnd.pg.osasli
+application/vnd.powerbuilder6
+application/vnd.powerbuilder6-s
+application/vnd.powerbuilder7
+application/vnd.powerbuilder7-s
+application/vnd.powerbuilder75
+application/vnd.powerbuilder75-s
+application/vnd.previewsystems.box
+application/vnd.publishare-delta-tree
+application/vnd.pvi.ptid1
+application/vnd.pwg-xhtml-print+xml
+application/vnd.rapid
+application/vnd.s3sms
+application/vnd.seemail
+application/vnd.shana.informed.formdata
+application/vnd.shana.informed.formtemplate
+application/vnd.shana.informed.interchange
+application/vnd.shana.informed.package
+application/vnd.sss-cod
+application/vnd.sss-dtf
+application/vnd.sss-ntf
+application/vnd.street-stream
+application/vnd.svd
+application/vnd.swiftview-ics
+application/vnd.triscape.mxs
+application/vnd.trueapp
+application/vnd.truedoc
+application/vnd.tve-trigger
+application/vnd.ufdl
+application/vnd.uplanet.alert
+application/vnd.uplanet.alert-wbxml
+application/vnd.uplanet.bearer-choice-wbxml
+application/vnd.uplanet.bearer-choice
+application/vnd.uplanet.cacheop
+application/vnd.uplanet.cacheop-wbxml
+application/vnd.uplanet.channel
+application/vnd.uplanet.channel-wbxml
+application/vnd.uplanet.list
+application/vnd.uplanet.list-wbxml
+application/vnd.uplanet.listcmd
+application/vnd.uplanet.listcmd-wbxml
+application/vnd.uplanet.signal
+application/vnd.vcx
+application/vnd.vectorworks
+application/vnd.vidsoft.vidconference
+application/vnd.visio
+application/vnd.vividence.scriptfile
+application/vnd.wap.sic
+application/vnd.wap.slc
+application/vnd.wap.wbxml wbxml
+application/vnd.wap.wmlc wmlc
+application/vnd.wap.wmlscriptc wmlsc
+application/vnd.webturbo
+application/vnd.wrq-hp3000-labelled
+application/vnd.wt.stf
+application/vnd.xara
+application/vnd.xfdl
+application/vnd.yellowriver-custom-menu
+application/whoispp-query
+application/whoispp-response
+application/wita
+application/wordperfect5.1
+application/x-bcpio bcpio
+application/x-bzip2 bz2
+application/x-cdlink vcd
+application/x-chess-pgn pgn
+application/x-compress
+application/x-cpio cpio
+application/x-csh csh
+application/x-director dcr dir dxr
+application/x-dvi dvi
+application/x-futuresplash spl
+application/x-gtar gtar
+application/x-gzip gz tgz
+application/x-hdf hdf
+application/x-javascript js
+application/x-kword kwd kwt
+application/x-kspread ksp
+application/x-kpresenter kpr kpt
+application/x-kchart chrt
+application/x-killustrator kil
+application/x-koan skp skd skt skm
+application/x-latex latex
+application/x-netcdf nc cdf
+application/x-ogg ogg
+# This conflicts with audio/x-pn-realaudio-plugin, which is commented out below.
+application/x-rpm rpm
+application/x-sh sh
+application/x-shar shar
+application/x-shockwave-flash swf
+application/x-stuffit sit
+application/x-sv4cpio sv4cpio
+application/x-sv4crc sv4crc
+application/x-tar tar
+application/x-tcl tcl
+application/x-tex tex
+application/x-texinfo texinfo texi
+application/x-troff t tr roff
+application/x-troff-man man
+application/x-troff-me me
+application/x-troff-ms ms
+application/x-ustar ustar
+application/x-wais-source src
+application/x400-bp
+application/xhtml+xml xhtml xht
+application/xml
+application/xml-dtd
+application/xml-external-parsed-entity
+application/zip zip
+audio/32kadpcm
+audio/basic au snd
+audio/g.722.1
+audio/l16
+audio/midi mid midi kar
+audio/mp4a-latm
+audio/mpa-robust
+audio/mpeg mpga mp2 mp3
+audio/parityfec
+audio/prs.sid
+audio/telephone-event
+audio/tone
+audio/vnd.cisco.nse
+audio/vnd.cns.anp1
+audio/vnd.cns.inf1
+audio/vnd.digital-winds
+audio/vnd.everad.plj
+audio/vnd.lucent.voice
+audio/vnd.nortel.vbk
+audio/vnd.nuera.ecelp4800
+audio/vnd.nuera.ecelp7470
+audio/vnd.nuera.ecelp9600
+audio/vnd.octel.sbc
+audio/vnd.qcelp
+audio/vnd.rhetorex.32kadpcm
+audio/vnd.vmx.cvsd
+audio/x-aiff aif aiff aifc
+audio/x-mpegurl m3u
+audio/x-pn-realaudio ram rm
+#audio/x-pn-realaudio-plugin rpm
+audio/x-realaudio ra
+audio/x-wav wav
+chemical/x-pdb pdb
+chemical/x-xyz xyz
+image/bmp bmp
+image/cgm
+image/g3fax
+image/gif gif
+image/ief ief
+image/jpeg jpeg jpg jpe
+image/naplps
+image/png png
+image/prs.btif
+image/prs.pti
+image/tiff tiff tif
+image/vnd.cns.inf2
+image/vnd.djvu djvu djv
+image/vnd.dwg
+image/vnd.dxf
+image/vnd.fastbidsheet
+image/vnd.fpx
+image/vnd.fst
+image/vnd.fujixerox.edmics-mmr
+image/vnd.fujixerox.edmics-rlc
+image/vnd.mix
+image/vnd.net-fpx
+image/vnd.svf
+image/vnd.wap.wbmp wbmp
+image/vnd.xiff
+image/x-cmu-raster ras
+image/x-portable-anymap pnm
+image/x-portable-bitmap pbm
+image/x-portable-graymap pgm
+image/x-portable-pixmap ppm
+image/x-rgb rgb
+image/x-xbitmap xbm
+image/x-xpixmap xpm
+image/x-xwindowdump xwd
+message/delivery-status
+message/disposition-notification
+message/external-body
+message/http
+message/news
+message/partial
+message/rfc822
+message/s-http
+model/iges igs iges
+model/mesh msh mesh silo
+model/vnd.dwf
+model/vnd.flatland.3dml
+model/vnd.gdl
+model/vnd.gs-gdl
+model/vnd.gtw
+model/vnd.mts
+model/vnd.vtu
+model/vrml wrl vrml
+multipart/alternative
+multipart/appledouble
+multipart/byteranges
+multipart/digest
+multipart/encrypted
+multipart/form-data
+multipart/header-set
+multipart/mixed
+multipart/parallel
+multipart/related
+multipart/report
+multipart/signed
+multipart/voice-message
+text/calendar
+text/css css
+text/directory
+text/enriched
+text/html html htm
+text/parityfec
+text/plain asc txt
+text/prs.lines.tag
+text/rfc822-headers
+text/richtext rtx
+text/rtf rtf
+text/sgml sgml sgm
+text/tab-separated-values tsv
+text/t140
+text/uri-list
+text/vnd.DMClientScript
+text/vnd.IPTC.NITF
+text/vnd.IPTC.NewsML
+text/vnd.abc
+text/vnd.curl
+text/vnd.flatland.3dml
+text/vnd.fly
+text/vnd.fmi.flexstor
+text/vnd.in3d.3dml
+text/vnd.in3d.spot
+text/vnd.latex-z
+text/vnd.motorola.reflex
+text/vnd.ms-mediapackage
+text/vnd.wap.si
+text/vnd.wap.sl
+text/vnd.wap.wml wml
+text/vnd.wap.wmlscript wmls
+text/x-setext etx
+text/xml xml xsl
+text/xml-external-parsed-entity
+video/mp4v-es
+video/mpeg mpeg mpg mpe
+video/parityfec
+video/pointer
+video/quicktime qt mov
+video/vnd.fvt
+video/vnd.motorola.video
+video/vnd.motorola.videop
+video/vnd.mpegurl mxu
+video/vnd.mts
+video/vnd.nokia.interleaved-multimedia
+video/vnd.vivo
+video/x-msvideo avi
+video/x-sgi-movie movie
+x-conference/x-cooltalk ice
View
24 conf/nutch-conf.xsl
@@ -0,0 +1,24 @@
+<?xml version="1.0"?>
+<!-- Renders a nutch-conf document as an HTML table with one row per property. -->
+<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+  <xsl:output method="html"/>
+
+  <!-- Page skeleton: bordered table, a heading row, then the property rows. -->
+  <xsl:template match="nutch-conf">
+    <html>
+      <body>
+        <table border="1">
+          <tr>
+            <td>name</td>
+            <td>value</td>
+            <td>description</td>
+          </tr>
+          <xsl:apply-templates select="property" mode="row"/>
+        </table>
+      </body>
+    </html>
+  </xsl:template>
+
+  <!-- One row: the property's name, value and description, in that order. -->
+  <!-- The "row" mode keeps this rule from firing outside the table above. -->
+  <xsl:template match="property" mode="row">
+    <tr>
+      <td><xsl:value-of select="name"/></td>
+      <td><xsl:value-of select="value"/></td>
+      <td><xsl:value-of select="description"/></td>
+    </tr>
+  </xsl:template>
+</xsl:stylesheet>
View
664 conf/nutch-default.xml
@@ -0,0 +1,664 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="nutch-conf.xsl"?>
+
+<!-- Do not modify this file directly. Instead, copy entries that you -->
+<!-- wish to modify from this file into nutch-site.xml and change them -->
+<!-- there. If nutch-site.xml does not already exist, create it. -->
+
+<nutch-conf>
+
+<!-- HTTP properties -->
+
+<property>
+ <name>http.agent.name</name>
+ <value>NutchCVS</value>
+ <description>Our HTTP 'User-Agent' request header.</description>
+</property>
+
+<property>
+ <name>http.robots.agents</name>
+ <value>NutchCVS,Nutch,*</value>
+ <description>The agent strings we'll look for in robots.txt files,
+ comma-separated, in decreasing order of precedence.</description>
+</property>
+
+<property>
+ <name>http.agent.description</name>
+ <value>Nutch</value>
+ <description>Further description of our bot- this text is used in
+ the User-Agent header. It appears in parentheses after the agent name.
+ </description>
+</property>
+
+<property>
+ <name>http.agent.url</name>
+ <value>http://www.nutch.org/docs/en/bot.html</value>
+ <description>A URL to advertise in the User-Agent header. This will
+ appear in parentheses after the agent name.
+ </description>
+</property>
+
+<property>
+ <name>http.agent.email</name>
+ <value>nutch-agent@lists.sourceforge.net</value>
+ <description>An email address to advertise in the HTTP 'From' request
+ header and User-Agent header.</description>
+</property>
+
+<property>
+ <name>http.agent.version</name>
+ <value>0.06-dev</value>
+ <description>A version string to advertise in the User-Agent
+ header.</description>
+</property>
+
+<property>
+ <name>http.timeout</name>
+ <value>10000</value>
+ <description>The default network timeout, in milliseconds.</description>
+</property>
+
+<property>
+ <name>http.max.delays</name>
+ <value>3</value>
+ <description>The number of times a thread will delay when trying to
+ fetch a page. Each time it finds that a host is busy, it will wait
+ fetcher.server.delay. After http.max.delays attempts, it will give
+ up on the page for now.</description>
+</property>
+
+<property>
+ <name>http.content.limit</name>
+ <value>65536</value>
+ <description>The length limit for downloaded content, in bytes.
+ If this value is nonnegative (>=0), content longer than it will be truncated;
+ otherwise, no truncation at all.
+ </description>
+</property>
+
+<property>
+ <name>http.proxy.host</name>
+ <value></value>
+ <description>The proxy hostname. If empty, no proxy is used.</description>
+</property>
+
+<property>
+ <name>http.proxy.port</name>
+ <value></value>
+ <description>The proxy port.</description>
+</property>
+
+<property>
+ <name>http.verbose</name>
+ <value>false</value>
+ <description>If true, HTTP will log more verbosely.</description>
+</property>
+
+<property>
+ <name>http.redirect.max</name>
+ <value>3</value>
+ <description>The maximum number of redirects the fetcher will follow when
+ trying to fetch a page.</description>
+</property>
+
+<!-- FILE properties -->
+
+<property>
+ <name>file.content.limit</name>
+ <value>65536</value>
+ <description>The length limit for downloaded content, in bytes.
+ If this value is larger than zero, content longer than it will be
+ truncated; otherwise (zero or negative), no truncation at all.
+ </description>
+</property>
+
+<property>
+ <name>file.content.ignored</name>
+ <value>true</value>
+ <description>If true, no file content will be saved during fetch.
+ And it is probably what we want to set most of time, since file:// URLs
+ are meant to be local and we can always use them directly at parsing
+ and indexing stages. Otherwise file contents will be saved.
+ !! NOT IMPLEMENTED YET !!
+ </description>
+</property>
+
+<!-- FTP properties -->
+
+<property>
+ <name>ftp.username</name>
+ <value>anonymous</value>
+ <description>ftp login username.</description>
+</property>
+
+<property>
+ <name>ftp.password</name>
+ <value>anonymous@example.com</value>
+ <description>ftp login password.</description>
+</property>
+
+<property>
+ <name>ftp.content.limit</name>
+ <value>65536</value>
+ <description>The length limit for downloaded content, in bytes.
+ If this value is larger than zero, content longer than it is truncated;
+ otherwise (zero or negative), no truncation at all. Caution: classical
+ ftp RFCs never define partial transfer and, in fact, some ftp servers
+ out there do not handle client side forced close-down very well.
+ Our implementation tries its best to handle such situations smoothly.
+ </description>
+</property>
+
+<property>
+ <name>ftp.timeout</name>
+ <value>60000</value>
+ <description>Default timeout for ftp client socket, in millisec.
+ Please also see ftp.keep.connection below.</description>
+</property>
+
+<property>
+ <name>ftp.server.timeout</name>
+ <value>100000</value>
+ <description>An estimation of ftp server idle time, in millisec.
+ Typically it is 120000 millisec for many ftp servers out there.
+ Better be conservative here. Together with ftp.timeout, it is used to
+ decide if we need to delete (annihilate) current ftp.client instance and
+ force to start another ftp.client instance anew. This is necessary because
+ a fetcher thread may not be able to obtain next request from queue in time
+ (due to idleness) before our ftp client times out or remote server
+ disconnects. Used only when ftp.keep.connection is true (please see below).
+ </description>
+</property>
+
+<property>
+ <name>ftp.keep.connection</name>
+ <value>false</value>
+ <description>Whether to keep ftp connection. Useful if crawling same host
+ again and again. When set to true, it avoids connection, login and dir list
+ parser setup for subsequent urls. If it is set to true, however, you must
+ make sure (roughly):
+ (1) ftp.timeout is less than ftp.server.timeout
+ (2) ftp.timeout is larger than (fetcher.threads.fetch * fetcher.server.delay)
+ Otherwise there will be too many "delete client because idled too long"
+ messages in thread logs.</description>
+</property>
+
+<property>
+ <name>ftp.follow.talk</name>
+ <value>false</value>
+ <description>Whether to log dialogue between our client and remote
+ server. Useful for debugging.</description>
+</property>
+
+<!-- web db properties -->
+
+<property>
+ <name>db.default.fetch.interval</name>
+ <value>30</value>
+ <description>The default number of days between re-fetches of a page.
+ </description>
+</property>
+
+<property>
+ <name>db.ignore.internal.links</name>
+ <value>true</value>
+ <description>If true, when adding new links to a page, links from
+ the same host are ignored. This is an effective way to limit the
+ size of the link database, keeping only the highest quality
+ links.
+ </description>
+</property>
+
+<property>
+ <name>db.score.injected</name>
+ <value>1.0</value>
+ <description>The score of new pages added by the injector.
+ </description>
+</property>
+
+<property>
+ <name>db.score.link.external</name>
+ <value>1.0</value>
+ <description>The score factor for new pages added due to a link from
+ another host relative to the referencing page's score.
+ </description>
+</property>
+
+<property>
+ <name>db.score.link.internal</name>
+ <value>1.0</value>
+ <description>The score factor for pages added due to a link from the
+ same host, relative to the referencing page's score.
+ </description>
+</property>
+
+<property>
+ <name>db.max.outlinks.per.page</name>
+ <value>100</value>
+ <description>The maximum number of outlinks that we'll process for a page.
+ </description>
+</property>
+
+<property>
+ <name>db.max.anchor.length</name>
+ <value>100</value>
+ <description>The maximum number of characters permitted in an anchor.
+ </description>
+</property>
+
+<!-- fetchlist tool properties -->
+
+<property>
+ <name>fetchlist.score.by.link.count</name>
+ <value>false</value>
+ <description>If true, set page scores on fetchlist entries based on
+ log(number of anchors), instead of using original page scores. This
+ results in prioritization of pages with many incoming links.
+ </description>
+</property>
+
+<!-- fetcher properties -->
+
+<property>
+ <name>fetcher.server.delay</name>
+ <value>5.0</value>
+ <description>The number of seconds the fetcher will delay between
+ successive requests to the same server.</description>
+</property>
+
+<property>
+ <name>fetcher.threads.fetch</name>
+ <value>10</value>
+ <description>The number of FetcherThreads the fetcher should use.
+ This also determines the maximum number of requests that are
+ made at once (each FetcherThread handles one connection).</description>
+</property>
+
+<property>
+ <name>fetcher.threads.per.host</name>
+ <value>1</value>
+ <description>This number is the maximum number of threads that
+ should be allowed to access a host at one time.</description>
+</property>
+
+<property>
+ <name>fetcher.retry.max</name>
+ <value>3</value>
+ <description>The maximum number of times the fetcher will attempt to get
+ a page that has encountered recoverable errors. </description>
+</property>
+
+<property>
+ <name>fetcher.verbose</name>
+ <value>false</value>
+ <description>If true, fetcher will log more verbosely.</description>
+</property>
+
+<!-- parser properties -->
+<property>
+ <name>parser.threads.parse</name>
+ <value>10</value>
+ <description>Number of ParserThreads ParseSegment should use.</description>
+</property>
+
+<!-- i/o properties -->
+
+<property>
+ <name>io.sort.factor</name>
+ <value>100</value>
+ <description>The number of streams to merge at once while sorting
+ files. This determines the number of open file handles.</description>
+</property>
+
+<property>
+ <name>io.sort.mb</name>
+ <value>100</value>
+ <description>The total amount of buffer memory to use while sorting
+ files, in megabytes. By default, gives each merge stream 1MB, which
+ should minimize seeks.</description>
+</property>
+
+<property>
+ <name>io.file.buffer.size</name>
+ <value>131072</value>
+ <description>The size of buffer for use in sequence files.
+ The size of this buffer should probably be a multiple of hardware
+ page size (4096 on Intel x86), and it determines how much data is
+ buffered during read and write operations.</description>
+</property>
+
+<!-- file system properties -->
+
+<property>
+ <name>fs.default.name</name>
+ <value>local</value>
+ <description>The name of the default file system. Either the
+ literal string "local" or a host:port for NDFS.</description>
+</property>
+
+<property>
+ <name>ndfs.name.dir</name>
+ <value>/tmp/nutch/ndfs/name</value>
+ <description>Determines where on the local filesystem the NDFS name node
+ should store the name table.</description>
+</property>
+
+<property>
+ <name>ndfs.data.dir</name>
+ <value>/tmp/nutch/ndfs/data</value>
+ <description>Determines where on the local filesystem an NDFS data node
+ should store its blocks.</description>
+</property>
+
+<!-- map/reduce properties -->
+
+<property>
+ <name>mapred.job.tracker</name>
+ <value>localhost:8010</value>
+ <description>The host and port that the MapReduce job tracker runs at.
+ </description>
+</property>
+
+<property>
+ <name>mapred.local.dir</name>
+ <value>/tmp/nutch/mapred/local</value>
+ <description>The local directory where MapReduce stores temporary files
+ related to tasks and jobs.
+ </description>
+</property>
+
+<!-- indexer properties -->
+
+<property>
+ <name>indexer.score.power</name>
+ <value>0.5</value>
+ <description>Determines the power of link analysis scores. Each
+ page's boost is set to <i>score<sup>scorePower</sup></i> where
+ <i>score</i> is its link analysis score and <i>scorePower</i> is the
+ value of this parameter. This is compiled into indexes, so, when
+ this is changed, pages must be re-indexed for it to take
+ effect.</description>
+</property>
+
+<property>
+ <name>indexer.boost.by.link.count</name>
+ <value>false</value>
+ <description>When true, scores for a page are multiplied by the log of
+ the number of incoming links to the page.</description>
+</property>
+
+<property>
+ <name>indexer.max.title.length</name>
+ <value>100</value>
+ <description>The maximum number of characters of a title that are indexed.
+ </description>
+</property>
+
+<property>
+ <name>indexer.max.tokens</name>
+ <value>10000</value>
+ <description>
+ The maximum number of tokens that will be indexed for a single field
+ in a document. This limits the amount of memory required for
+ indexing, so that collections with very large files will not crash
+ the indexing process by running out of memory.
+
+ Note that this effectively truncates large documents, excluding
+ from the index tokens that occur further in the document. If you
+ know your source documents are large, be sure to set this value
+ high enough to accommodate the expected size. If you set it to
+ Integer.MAX_VALUE, then the only limit is your memory, but you
+ should anticipate an OutOfMemoryError.
+ </description>
+</property>
+
+<property>
+ <name>indexer.mergeFactor</name>
+ <value>50</value>
+ <description>The factor that determines the frequency of Lucene segment
+ merges. This must not be less than 2, higher values increase indexing
+ speed but lead to increased RAM usage, and increase the number of
+ open file handles (which may lead to "Too many open files" errors).
+ NOTE: the "segments" here have nothing to do with Nutch segments, they
+ are a low-level data unit used by Lucene.
+ </description>
+</property>
+
+<property>
+ <name>indexer.minMergeDocs</name>
+ <value>50</value>
+ <description>This number determines the minimum number of Lucene
+ Documents buffered in memory between Lucene segment merges. Larger
+ values increase indexing speed and increase RAM usage.
+ </