NUTCH-193: MapReduce and NDFS code moved to new project, Hadoop. See bug report for details.

git-svn-id: https://svn.apache.org/repos/asf/lucene/nutch/trunk@374796 13f79535-47bb-0310-9956-ffa450edef68
1 parent e0093e6 · commit 25437bc825d50715dcc9a289a1b4c8fbbc6aced1 · cutting committed Feb 4, 2006
Showing 554 additions and 17,137 deletions.
  1. +0 −18 bin/nutch
  2. +0 −101 bin/nutch-daemon.sh
  3. +0 −16 bin/nutch-daemons.sh
  4. +0 −27 bin/slaves.sh
  5. +0 −11 bin/start-all.sh
  6. +0 −11 bin/stop-all.sh
  7. +3 −3 conf/crawl-tool.xml
  8. +237 −0 conf/hadoop-default.xml
  9. +2 −2 conf/mapred-default.xml.template
  10. +3 −229 conf/nutch-default.xml
  11. +3 −3 conf/nutch-site.xml.template
  12. BIN lib/hadoop-0.1-dev.jar
  13. +0 −202 lib/jetty-5.1.4.LICENSE.txt
  14. BIN lib/jetty-5.1.4.jar
  15. BIN lib/jetty-ext/ant.jar
  16. BIN lib/jetty-ext/commons-el.jar
  17. BIN lib/jetty-ext/jasper-compiler.jar
  18. BIN lib/jetty-ext/jasper-runtime.jar
  19. BIN lib/jetty-ext/jsp-api.jar
  20. +10 −9 src/java/org/apache/nutch/analysis/AnalyzerFactory.java
  21. +10 −9 src/java/org/apache/nutch/analysis/CommonGrams.java
  22. +8 −7 src/java/org/apache/nutch/analysis/NutchAnalysis.java
  23. +1 −1 src/java/org/apache/nutch/analysis/NutchAnalysisTokenManager.java
  24. +7 −6 src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java
  25. +5 −5 src/java/org/apache/nutch/clustering/OnlineClustererFactory.java
  26. +21 −19 src/java/org/apache/nutch/crawl/Crawl.java
  27. +2 −1 src/java/org/apache/nutch/crawl/CrawlDatum.java
  28. +12 −9 src/java/org/apache/nutch/crawl/CrawlDb.java
  29. +28 −26 src/java/org/apache/nutch/crawl/CrawlDbReader.java
  30. +2 −2 src/java/org/apache/nutch/crawl/CrawlDbReducer.java
  31. +10 −7 src/java/org/apache/nutch/crawl/Generator.java
  32. +11 −8 src/java/org/apache/nutch/crawl/Injector.java
  33. +1 −1 src/java/org/apache/nutch/crawl/Inlink.java
  34. +1 −1 src/java/org/apache/nutch/crawl/Inlinks.java
  35. +12 −9 src/java/org/apache/nutch/crawl/LinkDb.java
  36. +17 −15 src/java/org/apache/nutch/crawl/LinkDbReader.java
  37. +1 −1 src/java/org/apache/nutch/crawl/MD5Signature.java
  38. +2 −2 src/java/org/apache/nutch/crawl/PartitionUrlByHost.java
  39. +6 −6 src/java/org/apache/nutch/crawl/Signature.java
  40. +5 −5 src/java/org/apache/nutch/crawl/SignatureFactory.java
  41. +4 −3 src/java/org/apache/nutch/crawl/TextProfileSignature.java
  42. +20 −17 src/java/org/apache/nutch/fetcher/Fetcher.java
  43. +12 −10 src/java/org/apache/nutch/fetcher/FetcherOutput.java
  44. +10 −10 src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java
  45. +0 −26 src/java/org/apache/nutch/fs/ChecksumException.java
  46. +0 −25 src/java/org/apache/nutch/fs/FSError.java
  47. +0 −122 src/java/org/apache/nutch/fs/FileUtil.java
  48. +0 −415 src/java/org/apache/nutch/fs/LocalFileSystem.java
  49. +0 −320 src/java/org/apache/nutch/fs/NDFSFileSystem.java
  50. +0 −261 src/java/org/apache/nutch/fs/NDFSShell.java
  51. +0 −248 src/java/org/apache/nutch/fs/NFSDataInputStream.java
  52. +0 −146 src/java/org/apache/nutch/fs/NFSDataOutputStream.java
  53. +0 −38 src/java/org/apache/nutch/fs/NFSInputStream.java
  54. +0 −30 src/java/org/apache/nutch/fs/NFSOutputStream.java
  55. +0 −389 src/java/org/apache/nutch/fs/NutchFileSystem.java
  56. +0 −28 src/java/org/apache/nutch/fs/Seekable.java
  57. +18 −14 src/java/org/apache/nutch/indexer/DeleteDuplicates.java
  58. +17 −17 src/java/org/apache/nutch/indexer/{NdfsDirectory.java → FsDirectory.java}
  59. +24 −21 src/java/org/apache/nutch/indexer/IndexMerger.java
  60. +5 −3 src/java/org/apache/nutch/indexer/IndexSorter.java
  61. +14 −11 src/java/org/apache/nutch/indexer/Indexer.java
  62. +3 −3 src/java/org/apache/nutch/indexer/IndexingFilter.java
  63. +7 −7 src/java/org/apache/nutch/indexer/IndexingFilters.java
  64. +0 −79 src/java/org/apache/nutch/io/ArrayFile.java
  65. +0 −102 src/java/org/apache/nutch/io/ArrayWritable.java
  66. +0 −107 src/java/org/apache/nutch/io/BooleanWritable.java
  67. +0 −50 src/java/org/apache/nutch/io/BytesWritable.java
  68. +0 −83 src/java/org/apache/nutch/io/CompressedWritable.java
  69. +0 −87 src/java/org/apache/nutch/io/DataInputBuffer.java
  70. +0 −91 src/java/org/apache/nutch/io/DataOutputBuffer.java
  71. +0 −85 src/java/org/apache/nutch/io/FloatWritable.java
  72. +0 −84 src/java/org/apache/nutch/io/IntWritable.java
  73. +0 −95 src/java/org/apache/nutch/io/LongWritable.java
  74. +0 −199 src/java/org/apache/nutch/io/MD5Hash.java
  75. +0 −497 src/java/org/apache/nutch/io/MapFile.java
  76. +0 −34 src/java/org/apache/nutch/io/NullWritable.java
  77. +0 −261 src/java/org/apache/nutch/io/ObjectWritable.java
  78. +0 −885 src/java/org/apache/nutch/io/SequenceFile.java
  79. +0 −89 src/java/org/apache/nutch/io/SetFile.java
  80. +0 −89 src/java/org/apache/nutch/io/TwoDArrayWritable.java
  81. +0 −287 src/java/org/apache/nutch/io/UTF8.java
  82. +0 −39 src/java/org/apache/nutch/io/VersionMismatchException.java
  83. +0 −50 src/java/org/apache/nutch/io/VersionedWritable.java
  84. +0 −41 src/java/org/apache/nutch/io/Writable.java
  85. +0 −24 src/java/org/apache/nutch/io/WritableComparable.java
  86. +0 −153 src/java/org/apache/nutch/io/WritableComparator.java
  87. +0 −76 src/java/org/apache/nutch/io/WritableName.java
  88. +0 −193 src/java/org/apache/nutch/io/WritableUtils.java
  89. +0 −6 src/java/org/apache/nutch/io/package.html
  90. +0 −370 src/java/org/apache/nutch/ipc/Client.java
  91. +0 −236 src/java/org/apache/nutch/ipc/RPC.java
  92. +0 −291 src/java/org/apache/nutch/ipc/Server.java
  93. +0 −5 src/java/org/apache/nutch/ipc/package.html
  94. +0 −81 src/java/org/apache/nutch/mapred/CombiningCollector.java
  95. +0 −77 src/java/org/apache/nutch/mapred/FileSplit.java
  96. +0 −50 src/java/org/apache/nutch/mapred/InputFormat.java
  97. +0 −133 src/java/org/apache/nutch/mapred/InputFormatBase.java
  98. +0 −63 src/java/org/apache/nutch/mapred/InterTrackerProtocol.java
  99. +0 −376 src/java/org/apache/nutch/mapred/JobClient.java
  100. +0 −276 src/java/org/apache/nutch/mapred/JobConf.java
  101. +0 −26 src/java/org/apache/nutch/mapred/JobConfigurable.java
  102. +0 −433 src/java/org/apache/nutch/mapred/JobInProgress.java
  103. +0 −85 src/java/org/apache/nutch/mapred/JobProfile.java
  104. +0 −77 src/java/org/apache/nutch/mapred/JobStatus.java
  105. +0 −62 src/java/org/apache/nutch/mapred/JobSubmissionProtocol.java
  106. +0 −814 src/java/org/apache/nutch/mapred/JobTracker.java
  107. +0 −117 src/java/org/apache/nutch/mapred/JobTrackerInfoServer.java
  108. +0 −186 src/java/org/apache/nutch/mapred/LocalJobRunner.java
  109. +0 −35 src/java/org/apache/nutch/mapred/MRConstants.java
  110. +0 −80 src/java/org/apache/nutch/mapred/MapFileOutputFormat.java
  111. +0 −146 src/java/org/apache/nutch/mapred/MapOutputFile.java
  112. +0 −67 src/java/org/apache/nutch/mapred/MapOutputLocation.java
  113. +0 −31 src/java/org/apache/nutch/mapred/MapOutputProtocol.java
  114. +0 −30 src/java/org/apache/nutch/mapred/MapRunnable.java
  115. +0 −56 src/java/org/apache/nutch/mapred/MapRunner.java
  116. +0 −149 src/java/org/apache/nutch/mapred/MapTask.java
  117. +0 −43 src/java/org/apache/nutch/mapred/MapTaskRunner.java
  118. +0 −42 src/java/org/apache/nutch/mapred/Mapper.java
  119. +0 −34 src/java/org/apache/nutch/mapred/OutputCollector.java
  120. +0 −37 src/java/org/apache/nutch/mapred/OutputFormat.java
  121. +0 −33 src/java/org/apache/nutch/mapred/Partitioner.java
  122. +0 −43 src/java/org/apache/nutch/mapred/RecordReader.java
  123. +0 −39 src/java/org/apache/nutch/mapred/RecordWriter.java
  124. +0 −309 src/java/org/apache/nutch/mapred/ReduceTask.java
  125. +0 −138 src/java/org/apache/nutch/mapred/ReduceTaskRunner.java
  126. +0 −41 src/java/org/apache/nutch/mapred/Reducer.java
  127. +0 −28 src/java/org/apache/nutch/mapred/Reporter.java
  128. +0 −77 src/java/org/apache/nutch/mapred/RunningJob.java
  129. +0 −57 src/java/org/apache/nutch/mapred/SequenceFileInputFormat.java
  130. +0 −71 src/java/org/apache/nutch/mapred/SequenceFileOutputFormat.java
  131. +0 −74 src/java/org/apache/nutch/mapred/SequenceFileRecordReader.java
  132. +0 −122 src/java/org/apache/nutch/mapred/Task.java
  133. +0 −442 src/java/org/apache/nutch/mapred/TaskInProgress.java
  134. +0 −182 src/java/org/apache/nutch/mapred/TaskRunner.java
  135. +0 −83 src/java/org/apache/nutch/mapred/TaskStatus.java
  136. +0 −685 src/java/org/apache/nutch/mapred/TaskTracker.java
  137. +0 −143 src/java/org/apache/nutch/mapred/TaskTrackerStatus.java
  138. +0 −54 src/java/org/apache/nutch/mapred/TaskUmbilicalProtocol.java
  139. +0 −96 src/java/org/apache/nutch/mapred/TextInputFormat.java
  140. +0 −51 src/java/org/apache/nutch/mapred/TextOutputFormat.java
  141. +0 −89 src/java/org/apache/nutch/mapred/demo/Grep.java
  142. +0 −36 src/java/org/apache/nutch/mapred/lib/HashPartitioner.java
  143. +0 −42 src/java/org/apache/nutch/mapred/lib/IdentityMapper.java
  144. +0 −45 src/java/org/apache/nutch/mapred/lib/IdentityReducer.java
  145. +0 −41 src/java/org/apache/nutch/mapred/lib/InverseMapper.java
  146. +0 −48 src/java/org/apache/nutch/mapred/lib/LongSumReducer.java
  147. +0 −56 src/java/org/apache/nutch/mapred/lib/RegexMapper.java
  148. +0 −53 src/java/org/apache/nutch/mapred/lib/TokenCountMapper.java
  149. +0 −7 src/java/org/apache/nutch/mapred/lib/package.html
  150. +0 −16 src/java/org/apache/nutch/mapred/package.html
  151. +0 −126 src/java/org/apache/nutch/ndfs/Block.java
  152. +0 −108 src/java/org/apache/nutch/ndfs/BlockCommand.java
  153. +0 −139 src/java/org/apache/nutch/ndfs/ClientProtocol.java
  154. +0 −93 src/java/org/apache/nutch/ndfs/DF.java
  155. +0 −752 src/java/org/apache/nutch/ndfs/DataNode.java
  156. +0 −166 src/java/org/apache/nutch/ndfs/DatanodeInfo.java
  157. +0 −35 src/java/org/apache/nutch/ndfs/DatanodeProtocol.java
  158. +0 −114 src/java/org/apache/nutch/ndfs/FSConstants.java
  159. +0 −429 src/java/org/apache/nutch/ndfs/FSDataset.java
  160. +0 −744 src/java/org/apache/nutch/ndfs/FSDirectory.java
Note: the entire diff could not be displayed because too many files (319) changed; the remaining file diffs are omitted below.
18 bin/nutch
@@ -43,12 +43,6 @@ if [ $# = 0 ]; then
echo " dedup remove duplicates from a set of segment indexes"
echo " plugin load a plugin and run one of its classes main()"
echo " server run a search server"
- echo " namenode run the NDFS namenode"
- echo " datanode run an NDFS datanode"
- echo " ndfs run an NDFS admin client"
- echo " jobtracker run the MapReduce job Tracker node"
- echo " tasktracker run a MapReduce task Tracker node"
- echo " job manipulate MapReduce jobs"
echo " or"
echo " CLASSNAME run the class named CLASSNAME"
echo "Most commands print help when invoked w/o parameters."
@@ -155,18 +149,6 @@ elif [ "$COMMAND" = "plugin" ] ; then
CLASS=org.apache.nutch.plugin.PluginRepository
elif [ "$COMMAND" = "server" ] ; then
CLASS='org.apache.nutch.searcher.DistributedSearch$Server'
-elif [ "$COMMAND" = "namenode" ] ; then
- CLASS='org.apache.nutch.ndfs.NameNode'
-elif [ "$COMMAND" = "datanode" ] ; then
- CLASS='org.apache.nutch.ndfs.DataNode'
-elif [ "$COMMAND" = "ndfs" ] ; then
- CLASS=org.apache.nutch.fs.NDFSShell
-elif [ "$COMMAND" = "jobtracker" ] ; then
- CLASS=org.apache.nutch.mapred.JobTracker
-elif [ "$COMMAND" = "tasktracker" ] ; then
- CLASS=org.apache.nutch.mapred.TaskTracker
-elif [ "$COMMAND" = "job" ] ; then
- CLASS=org.apache.nutch.mapred.JobClient
else
CLASS=$COMMAND
fi
101 bin/nutch-daemon.sh
@@ -1,101 +0,0 @@
-#!/bin/bash
-#
-# Runs a Nutch command as a daemon.
-#
-# Environment Variables
-#
-# NUTCH_LOG_DIR Where log files are stored. PWD by default.
-# NUTCH_MASTER host:path where nutch code should be rsync'd from
-# NUTCH_PID_DIR The pid files are stored. /tmp by default.
-# NUTCH_IDENT_STRING A string representing this instance of nutch. $USER by default
-##
-
-usage="Usage: nutch-daemon [start|stop] [nutch-command] [args...]"
-
-# if no args specified, show usage
-if [ $# -le 1 ]; then
- echo $usage
- exit 1
-fi
-
-# get arguments
-startStop=$1
-shift
-command=$1
-shift
-
-# resolve links - $0 may be a softlink
-this="$0"
-while [ -h "$this" ]; do
- ls=`ls -ld "$this"`
- link=`expr "$ls" : '.*-> \(.*\)$'`
- if expr "$link" : '.*/.*' > /dev/null; then
- this="$link"
- else
- this=`dirname "$this"`/"$link"
- fi
-done
-
-# get log directory
-if [ "$NUTCH_LOG_DIR" = "" ]; then
- NUTCH_LOG_DIR=$PWD
-fi
-
-if [ "$NUTCH_PID_DIR" = "" ]; then
- NUTCH_PID_DIR=/tmp
-fi
-
-if [ "$NUTCH_IDENT_STRING" = "" ]; then
- NUTCH_IDENT_STRING=$USER
-fi
-
-# some variables
-log=$NUTCH_LOG_DIR/nutch-$NUTCH_IDENT_STRING-$command-`hostname`.log
-pid=$NUTCH_PID_DIR/nutch-$NUTCH_IDENT_STRING-$command.pid
-
-case $startStop in
-
- (start)
-
- if [ -f $pid ]; then
- if [ -a /proc/`cat $pid` ]; then
- echo $command running as process `cat $pid`. Stop it first.
- exit 1
- fi
- fi
-
- root=`dirname $this`/..
- if [ "$NUTCH_MASTER" != "" ]; then
- echo rsync from $NUTCH_MASTER
- rsync -a --delete --exclude=.svn $NUTCH_MASTER/ $root
- fi
-
- cd $root
- echo starting $command, logging to $log
- nohup bin/nutch $command "$@" >& $log < /dev/null &
- echo $! > $pid
- sleep 1; head $log
- ;;
-
- (stop)
-
- if [ -f $pid ]; then
- if [ -a /proc/`cat $pid` ]; then
- echo stopping $command
- kill `cat $pid`
- else
- echo no $command to stop
- fi
- else
- echo no $command to stop
- fi
- ;;
-
- (*)
- echo $usage
- exit 1
- ;;
-
-esac
-
-
16 bin/nutch-daemons.sh
@@ -1,16 +0,0 @@
-#!/bin/bash
-#
-# Run a Nutch command on all slave hosts.
-
-usage="Usage: nutch-daemons.sh [start|stop] command args..."
-
-# if no args specified, show usage
-if [ $# -le 1 ]; then
- echo $usage
- exit 1
-fi
-
-bin=`dirname $0`
-bin=`cd $bin; pwd`
-
-exec $bin/slaves.sh $bin/nutch-daemon.sh "$@"
27 bin/slaves.sh
@@ -1,27 +0,0 @@
-#!/bin/bash
-#
-# Run a shell command on all slave hosts.
-#
-# Environment Variables
-#
-# NUTCH_SLAVES File naming remote hosts. Default is ~/.slaves
-##
-
-usage="Usage: slaves.sh command..."
-
-# if no args specified, show usage
-if [ $# -le 0 ]; then
- echo $usage
- exit 1
-fi
-
-if [ "$NUTCH_SLAVES" = "" ]; then
- export NUTCH_SLAVES=$HOME/.slaves
-fi
-
-for slave in `cat $NUTCH_SLAVES`; do
- ssh -o ConnectTimeout=1 $slave "$@" \
- 2>&1 | sed "s/^/$slave: /" &
-done
-
-wait
11 bin/start-all.sh
@@ -1,11 +0,0 @@
-#!/bin/bash
-
-# Start all nutch daemons. Run this on master node.
-
-bin=`dirname $0`
-bin=`cd $bin; pwd`
-
-$bin/nutch-daemons.sh start datanode
-$bin/nutch-daemon.sh start namenode
-$bin/nutch-daemon.sh start jobtracker
-$bin/nutch-daemons.sh start tasktracker
11 bin/stop-all.sh
@@ -1,11 +0,0 @@
-#!/bin/bash
-
-# Stop all nutch daemons. Run this on master node.
-
-bin=`dirname $0`
-bin=`cd $bin; pwd`
-
-$bin/nutch-daemon.sh stop jobtracker
-$bin/nutch-daemons.sh stop tasktracker
-$bin/nutch-daemon.sh stop namenode
-$bin/nutch-daemons.sh stop datanode
6 conf/crawl-tool.xml
@@ -1,13 +1,13 @@
<?xml version="1.0" ?>
-<?xml:stylesheet type="text/xsl" href="nutch-conf.xsl"?>
+<?xml:stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Overidden defaults for intranet use. -->
<!-- Do not modify this file directly. Instead, copy entries that you -->
<!-- wish to modify from this file into nutch-site.xml and change them -->
<!-- there. If nutch-site.xml does not already exist, create it. -->
-<nutch-conf>
+<configuration>
<property>
<name>urlfilter.regex.file</name>
@@ -40,4 +40,4 @@
each.</description>
</property>
-</nutch-conf>
+</configuration>
237 conf/hadoop-default.xml
@@ -0,0 +1,237 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<!-- Do not modify this file directly. Instead, copy entries that you -->
+<!-- wish to modify from this file into hadoop-site.xml and change them -->
+<!-- there. If hadoop-site.xml does not already exist, create it. -->
+
+<configuration>
+
+<!-- file properties -->
+
+<property>
+ <name>file.content.limit</name>
+ <value>65536</value>
+ <description>The length limit for downloaded content, in bytes.
+ If this value is larger than zero, content longer than it will be
+ truncated; otherwise (zero or negative), no truncation at all.
+ </description>
+</property>
+
+<property>
+ <name>file.content.ignored</name>
+ <value>true</value>
+ <description>If true, no file content will be saved during fetch.
+ And it is probably what we want to set most of time, since file:// URLs
+ are meant to be local and we can always use them directly at parsing
+ and indexing stages. Otherwise file contents will be saved.
+ !! NO IMPLEMENTED YET !!
+ </description>
+</property>
+
+<!-- i/o properties -->
+
+<property>
+ <name>io.sort.factor</name>
+ <value>10</value>
+ <description>The number of streams to merge at once while sorting
+ files. This determines the number of open file handles.</description>
+</property>
+
+<property>
+ <name>io.sort.mb</name>
+ <value>100</value>
+ <description>The total amount of buffer memory to use while sorting
+ files, in megabytes. By default, gives each merge stream 1MB, which
+ should minimize seeks.</description>
+</property>
+
+<property>
+ <name>io.file.buffer.size</name>
+ <value>4096</value>
+ <description>The size of buffer for use in sequence files.
+ The size of this buffer should probably be a multiple of hardware
+ page size (4096 on Intel x86), and it determines how much data is
+ buffered during read and write operations.</description>
+</property>
+
+<property>
+ <name>io.bytes.per.checksum</name>
+ <value>512</value>
+ <description>The number of bytes per checksum. Must not be larger than
+ io.file.buffer.size.</description>
+</property>
+
+<property>
+ <name>io.skip.checksum.errors</name>
+ <value>false</value>
+ <description>If true, when a checksum error is encountered while
+ reading a sequence file, entries are skipped, instead of throwing an
+ exception.</description>
+</property>
+
+<property>
+ <name>io.map.index.skip</name>
+ <value>0</value>
+ <description>Number of index entries to skip between each entry.
+ Zero by default. Setting this to values larger than zero can
+ facilitate opening large map files using less memory.</description>
+</property>
+
+<!-- file system properties -->
+
+<property>
+ <name>fs.default.name</name>
+ <value>local</value>
+ <description>The name of the default file system. Either the
+ literal string "local" or a host:port for DFS.</description>
+</property>
+
+<property>
+ <name>dfs.datanode.port</name>
+ <value>50010</value>
+ <description>The port number that the dfs datanode server uses as a starting
+ point to look for a free port to listen on.
+</description>
+</property>
+
+<property>
+ <name>dfs.name.dir</name>
+ <value>/tmp/hadoop/dfs/name</value>
+ <description>Determines where on the local filesystem the DFS name node
+ should store the name table.</description>
+</property>
+
+<property>
+ <name>dfs.data.dir</name>
+ <value>/tmp/hadoop/dfs/data</value>
+ <description>Determines where on the local filesystem an DFS data node
+ should store its blocks. If this is a comma- or space-delimited
+ list of directories, then data will be stored in all named
+ directories, typically on different devices.</description>
+</property>
+
+<property>
+ <name>dfs.replication</name>
+ <value>3</value>
+ <description>How many copies we try to have at all times. The actual
+ number of replications is at max the number of datanodes in the
+ cluster.</description>
+</property>
+
+<!-- map/reduce properties -->
+
+<property>
+ <name>mapred.job.tracker</name>
+ <value>local</value>
+ <description>The host and port that the MapReduce job tracker runs
+ at. If "local", then jobs are run in-process as a single map
+ and reduce task.
+ </description>
+</property>
+
+<property>
+ <name>mapred.job.tracker.info.port</name>
+ <value>50030</value>
+ <description>The port that the MapReduce job tracker info webserver runs at.
+ </description>
+</property>
+
+<property>
+ <name>mapred.task.tracker.output.port</name>
+ <value>50040</value>
+ <description>The port number that the MapReduce task tracker output server uses as a starting
+ point to look for a free port to listen on.
+ </description>
+</property>
+
+<property>
+ <name>mapred.task.tracker.report.port</name>
+ <value>50050</value>
+ <description>The port number that the MapReduce task tracker report server uses as a starting
+ point to look for a free port to listen on.
+ </description>
+</property>
+
+<property>
+ <name>mapred.local.dir</name>
+ <value>/tmp/hadoop/mapred/local</value>
+ <description>The local directory where MapReduce stores intermediate
+ data files. May be a space- or comma- separated list of
+ directories on different devices in order to spread disk i/o.
+ </description>
+</property>
+
+<property>
+ <name>mapred.system.dir</name>
+ <value>/tmp/hadoop/mapred/system</value>
+ <description>The shared directory where MapReduce stores control files.
+ </description>
+</property>
+
+<property>
+ <name>mapred.temp.dir</name>
+ <value>/tmp/hadoop/mapred/temp</value>
+ <description>A shared directory for temporary files.
+ </description>
+</property>
+
+<property>
+ <name>mapred.map.tasks</name>
+ <value>2</value>
+ <description>The default number of map tasks per job. Typically set
+ to a prime several times greater than number of available hosts.
+ Ignored when mapred.job.tracker is "local".
+ </description>
+</property>
+
+<property>
+ <name>mapred.reduce.tasks</name>
+ <value>1</value>
+ <description>The default number of reduce tasks per job. Typically set
+ to a prime close to the number of available hosts. Ignored when
+ mapred.job.tracker is "local".
+ </description>
+</property>
+
+<property>
+ <name>mapred.task.timeout</name>
+ <value>600000</value>
+ <description>The number of milliseconds before a task will be
+ terminated if it neither reads an input, writes an output, nor
+ updates its status string.
+ </description>
+</property>
+
+<property>
+ <name>mapred.tasktracker.tasks.maximum</name>
+ <value>2</value>
+ <description>The maximum number of tasks that will be run
+ simultaneously by a task tracker.
+ </description>
+</property>
+
+<property>
+ <name>mapred.child.heap.size</name>
+ <value>200m</value>
+ <description>The heap size (-Xmx) that will be used for task tracker
+ child processes.</description>
+</property>
+
+<property>
+ <name>mapred.combine.buffer.size</name>
+ <value>100000</value>
+ <description>The number of entries the combining collector caches before
+ combining them and writing to disk.</description>
+</property>
+
+
+<!-- ipc properties -->
+
+<property>
+ <name>ipc.client.timeout</name>
+ <value>60000</value>
+ <description>Defines the timeout for IPC calls in milliseconds.</description>
+</property>
+
+</configuration>
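
These Hadoop-level properties are now read through org.apache.hadoop.conf.Configuration rather than NutchConf, and site-specific overrides belong in conf/hadoop-site.xml, per the header comments above. A minimal sketch of a programmatic read, using only calls that also appear in the Java diffs below (NutchConfiguration.create(), get(), getInt()); the property choices are illustrative:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.nutch.util.NutchConfiguration;

    // a minimal sketch, assuming hadoop-0.1-dev.jar and conf/ are on the classpath
    Configuration conf = NutchConfiguration.create();          // replaces "new NutchConf()"
    String fsName  = conf.get("fs.default.name");              // "local" unless overridden
    int sortFactor = conf.getInt("io.sort.factor", 10);        // defined in hadoop-default.xml above
    int threads    = conf.getInt("fetcher.threads.fetch", 10); // still a Nutch-level property
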
4 conf/mapred-default.xml.template
@@ -3,6 +3,6 @@
<!-- Put mapred-specific property overrides in this file. -->
-<nutch-conf>
+<configuration>
-</nutch-conf>
+</configuration>
232 conf/nutch-default.xml
@@ -1,11 +1,11 @@
<?xml version="1.0"?>
-<?xml-stylesheet type="text/xsl" href="nutch-conf.xsl"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Do not modify this file directly. Instead, copy entries that you -->
<!-- wish to modify from this file into nutch-site.xml and change them -->
<!-- there. If nutch-site.xml does not already exist, create it. -->
-<nutch-conf>
+<configuration>
<!-- HTTP properties -->
@@ -110,28 +110,6 @@
trying to fetch a page.</description>
</property>
-<!-- FILE properties -->
-
-<property>
- <name>file.content.limit</name>
- <value>65536</value>
- <description>The length limit for downloaded content, in bytes.
- If this value is larger than zero, content longer than it will be
- truncated; otherwise (zero or negative), no truncation at all.
- </description>
-</property>
-
-<property>
- <name>file.content.ignored</name>
- <value>true</value>
- <description>If true, no file content will be saved during fetch.
- And it is probably what we want to set most of time, since file:// URLs
- are meant to be local and we can always use them directly at parsing
- and indexing stages. Otherwise file contents will be saved.
- !! NO IMPLEMENTED YET !!
- </description>
-</property>
-
<!-- FTP properties -->
<property>
@@ -338,202 +316,6 @@
<description>If true, fetcher will store content.</description>
</property>
-<!-- i/o properties -->
-
-<property>
- <name>io.sort.factor</name>
- <value>10</value>
- <description>The number of streams to merge at once while sorting
- files. This determines the number of open file handles.</description>
-</property>
-
-<property>
- <name>io.sort.mb</name>
- <value>100</value>
- <description>The total amount of buffer memory to use while sorting
- files, in megabytes. By default, gives each merge stream 1MB, which
- should minimize seeks.</description>
-</property>
-
-<property>
- <name>io.file.buffer.size</name>
- <value>4096</value>
- <description>The size of buffer for use in sequence files.
- The size of this buffer should probably be a multiple of hardware
- page size (4096 on Intel x86), and it determines how much data is
- buffered during read and write operations.</description>
-</property>
-
-<property>
- <name>io.bytes.per.checksum</name>
- <value>512</value>
- <description>The number of bytes per checksum. Must not be larger than
- io.file.buffer.size.</description>
-</property>
-
-<property>
- <name>io.skip.checksum.errors</name>
- <value>false</value>
- <description>If true, when a checksum error is encountered while
- reading a sequence file, entries are skipped, instead of throwing an
- exception.</description>
-</property>
-
-<property>
- <name>io.map.index.skip</name>
- <value>0</value>
- <description>Number of index entries to skip between each entry.
- Zero by default. Setting this to values larger than zero can
- facilitate opening large map files using less memory.</description>
-</property>
-
-<!-- file system properties -->
-
-<property>
- <name>fs.default.name</name>
- <value>local</value>
- <description>The name of the default file system. Either the
- literal string "local" or a host:port for NDFS.</description>
-</property>
-
-<property>
- <name>ndfs.datanode.port</name>
- <value>50010</value>
- <description>The port number that the ndfs datanode server uses as a starting
- point to look for a free port to listen on.
-</description>
-</property>
-
-<property>
- <name>ndfs.name.dir</name>
- <value>/tmp/nutch/ndfs/name</value>
- <description>Determines where on the local filesystem the NDFS name node
- should store the name table.</description>
-</property>
-
-<property>
- <name>ndfs.data.dir</name>
- <value>/tmp/nutch/ndfs/data</value>
- <description>Determines where on the local filesystem an NDFS data node
- should store its blocks. If this is a comma- or space-delimited
- list of directories, then data will be stored in all named
- directories, typically on different devices.</description>
-</property>
-
-<property>
- <name>ndfs.replication</name>
- <value>3</value>
- <description>How many copies we try to have at all times. The actual
- number of replications is at max the number of datanodes in the
- cluster.</description>
-</property>
-
-<!-- map/reduce properties -->
-
-<property>
- <name>mapred.job.tracker</name>
- <value>local</value>
- <description>The host and port that the MapReduce job tracker runs
- at. If "local", then jobs are run in-process as a single map
- and reduce task.
- </description>
-</property>
-
-<property>
- <name>mapred.job.tracker.info.port</name>
- <value>50030</value>
- <description>The port that the MapReduce job tracker info webserver runs at.
- </description>
-</property>
-
-<property>
- <name>mapred.task.tracker.output.port</name>
- <value>50040</value>
- <description>The port number that the MapReduce task tracker output server uses as a starting
- point to look for a free port to listen on.
- </description>
-</property>
-
-<property>
- <name>mapred.task.tracker.report.port</name>
- <value>50050</value>
- <description>The port number that the MapReduce task tracker report server uses as a starting
- point to look for a free port to listen on.
- </description>
-</property>
-
-<property>
- <name>mapred.local.dir</name>
- <value>/tmp/nutch/mapred/local</value>
- <description>The local directory where MapReduce stores intermediate
- data files. May be a space- or comma- separated list of
- directories on different devices in order to spread disk i/o.
- </description>
-</property>
-
-<property>
- <name>mapred.system.dir</name>
- <value>/tmp/nutch/mapred/system</value>
- <description>The shared directory where MapReduce stores control files.
- </description>
-</property>
-
-<property>
- <name>mapred.temp.dir</name>
- <value>/tmp/nutch/mapred/temp</value>
- <description>A shared directory for temporary files.
- </description>
-</property>
-
-<property>
- <name>mapred.map.tasks</name>
- <value>2</value>
- <description>The default number of map tasks per job. Typically set
- to a prime several times greater than number of available hosts.
- Ignored when mapred.job.tracker is "local".
- </description>
-</property>
-
-<property>
- <name>mapred.reduce.tasks</name>
- <value>1</value>
- <description>The default number of reduce tasks per job. Typically set
- to a prime close to the number of available hosts. Ignored when
- mapred.job.tracker is "local".
- </description>
-</property>
-
-<property>
- <name>mapred.task.timeout</name>
- <value>600000</value>
- <description>The number of milliseconds before a task will be
- terminated if it neither reads an input, writes an output, nor
- updates its status string.
- </description>
-</property>
-
-<property>
- <name>mapred.tasktracker.tasks.maximum</name>
- <value>2</value>
- <description>The maximum number of tasks that will be run
- simultaneously by a task tracker.
- </description>
-</property>
-
-<property>
- <name>mapred.child.heap.size</name>
- <value>200m</value>
- <description>The heap size (-Xmx) that will be used for task tracker
- child processes.</description>
-</property>
-
-<property>
- <name>mapred.combine.buffer.size</name>
- <value>100000</value>
- <description>The number of entries the combining collector caches before
- combining them and writing to disk.</description>
-</property>
-
<!-- indexer properties -->
<property>
@@ -727,14 +509,6 @@
</description>
</property>
-<!-- ipc properties -->
-
-<property>
- <name>ipc.client.timeout</name>
- <value>60000</value>
- <description>Defines the timeout for IPC calls in milliseconds.</description>
-</property>
-
<!-- plugin properties -->
<property>
@@ -949,4 +723,4 @@
</description>
</property>
-</nutch-conf>
+</configuration>
6 conf/nutch-site.xml.template
@@ -1,8 +1,8 @@
<?xml version="1.0"?>
-<?xml-stylesheet type="text/xsl" href="nutch-conf.xsl"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->
-<nutch-conf>
+<configuration>
-</nutch-conf>
+</configuration>
BIN lib/hadoop-0.1-dev.jar
Binary file not shown.
202 lib/jetty-5.1.4.LICENSE.txt
@@ -1,202 +0,0 @@
-
- Apache License
- Version 2.0, January 2004
- http://www.apache.org/licenses/
-
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
- 1. Definitions.
-
- "License" shall mean the terms and conditions for use, reproduction,
- and distribution as defined by Sections 1 through 9 of this document.
-
- "Licensor" shall mean the copyright owner or entity authorized by
- the copyright owner that is granting the License.
-
- "Legal Entity" shall mean the union of the acting entity and all
- other entities that control, are controlled by, or are under common
- control with that entity. For the purposes of this definition,
- "control" means (i) the power, direct or indirect, to cause the
- direction or management of such entity, whether by contract or
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
- outstanding shares, or (iii) beneficial ownership of such entity.
-
- "You" (or "Your") shall mean an individual or Legal Entity
- exercising permissions granted by this License.
-
- "Source" form shall mean the preferred form for making modifications,
- including but not limited to software source code, documentation
- source, and configuration files.
-
- "Object" form shall mean any form resulting from mechanical
- transformation or translation of a Source form, including but
- not limited to compiled object code, generated documentation,
- and conversions to other media types.
-
- "Work" shall mean the work of authorship, whether in Source or
- Object form, made available under the License, as indicated by a
- copyright notice that is included in or attached to the work
- (an example is provided in the Appendix below).
-
- "Derivative Works" shall mean any work, whether in Source or Object
- form, that is based on (or derived from) the Work and for which the
- editorial revisions, annotations, elaborations, or other modifications
- represent, as a whole, an original work of authorship. For the purposes
- of this License, Derivative Works shall not include works that remain
- separable from, or merely link (or bind by name) to the interfaces of,
- the Work and Derivative Works thereof.
-
- "Contribution" shall mean any work of authorship, including
- the original version of the Work and any modifications or additions
- to that Work or Derivative Works thereof, that is intentionally
- submitted to Licensor for inclusion in the Work by the copyright owner
- or by an individual or Legal Entity authorized to submit on behalf of
- the copyright owner. For the purposes of this definition, "submitted"
- means any form of electronic, verbal, or written communication sent
- to the Licensor or its representatives, including but not limited to
- communication on electronic mailing lists, source code control systems,
- and issue tracking systems that are managed by, or on behalf of, the
- Licensor for the purpose of discussing and improving the Work, but
- excluding communication that is conspicuously marked or otherwise
- designated in writing by the copyright owner as "Not a Contribution."
-
- "Contributor" shall mean Licensor and any individual or Legal Entity
- on behalf of whom a Contribution has been received by Licensor and
- subsequently incorporated within the Work.
-
- 2. Grant of Copyright License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- copyright license to reproduce, prepare Derivative Works of,
- publicly display, publicly perform, sublicense, and distribute the
- Work and such Derivative Works in Source or Object form.
-
- 3. Grant of Patent License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- (except as stated in this section) patent license to make, have made,
- use, offer to sell, sell, import, and otherwise transfer the Work,
- where such license applies only to those patent claims licensable
- by such Contributor that are necessarily infringed by their
- Contribution(s) alone or by combination of their Contribution(s)
- with the Work to which such Contribution(s) was submitted. If You
- institute patent litigation against any entity (including a
- cross-claim or counterclaim in a lawsuit) alleging that the Work
- or a Contribution incorporated within the Work constitutes direct
- or contributory patent infringement, then any patent licenses
- granted to You under this License for that Work shall terminate
- as of the date such litigation is filed.
-
- 4. Redistribution. You may reproduce and distribute copies of the
- Work or Derivative Works thereof in any medium, with or without
- modifications, and in Source or Object form, provided that You
- meet the following conditions:
-
- (a) You must give any other recipients of the Work or
- Derivative Works a copy of this License; and
-
- (b) You must cause any modified files to carry prominent notices
- stating that You changed the files; and
-
- (c) You must retain, in the Source form of any Derivative Works
- that You distribute, all copyright, patent, trademark, and
- attribution notices from the Source form of the Work,
- excluding those notices that do not pertain to any part of
- the Derivative Works; and
-
- (d) If the Work includes a "NOTICE" text file as part of its
- distribution, then any Derivative Works that You distribute must
- include a readable copy of the attribution notices contained
- within such NOTICE file, excluding those notices that do not
- pertain to any part of the Derivative Works, in at least one
- of the following places: within a NOTICE text file distributed
- as part of the Derivative Works; within the Source form or
- documentation, if provided along with the Derivative Works; or,
- within a display generated by the Derivative Works, if and
- wherever such third-party notices normally appear. The contents
- of the NOTICE file are for informational purposes only and
- do not modify the License. You may add Your own attribution
- notices within Derivative Works that You distribute, alongside
- or as an addendum to the NOTICE text from the Work, provided
- that such additional attribution notices cannot be construed
- as modifying the License.
-
- You may add Your own copyright statement to Your modifications and
- may provide additional or different license terms and conditions
- for use, reproduction, or distribution of Your modifications, or
- for any such Derivative Works as a whole, provided Your use,
- reproduction, and distribution of the Work otherwise complies with
- the conditions stated in this License.
-
- 5. Submission of Contributions. Unless You explicitly state otherwise,
- any Contribution intentionally submitted for inclusion in the Work
- by You to the Licensor shall be under the terms and conditions of
- this License, without any additional terms or conditions.
- Notwithstanding the above, nothing herein shall supersede or modify
- the terms of any separate license agreement you may have executed
- with Licensor regarding such Contributions.
-
- 6. Trademarks. This License does not grant permission to use the trade
- names, trademarks, service marks, or product names of the Licensor,
- except as required for reasonable and customary use in describing the
- origin of the Work and reproducing the content of the NOTICE file.
-
- 7. Disclaimer of Warranty. Unless required by applicable law or
- agreed to in writing, Licensor provides the Work (and each
- Contributor provides its Contributions) on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- implied, including, without limitation, any warranties or conditions
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
- PARTICULAR PURPOSE. You are solely responsible for determining the
- appropriateness of using or redistributing the Work and assume any
- risks associated with Your exercise of permissions under this License.
-
- 8. Limitation of Liability. In no event and under no legal theory,
- whether in tort (including negligence), contract, or otherwise,
- unless required by applicable law (such as deliberate and grossly
- negligent acts) or agreed to in writing, shall any Contributor be
- liable to You for damages, including any direct, indirect, special,
- incidental, or consequential damages of any character arising as a
- result of this License or out of the use or inability to use the
- Work (including but not limited to damages for loss of goodwill,
- work stoppage, computer failure or malfunction, or any and all
- other commercial damages or losses), even if such Contributor
- has been advised of the possibility of such damages.
-
- 9. Accepting Warranty or Additional Liability. While redistributing
- the Work or Derivative Works thereof, You may choose to offer,
- and charge a fee for, acceptance of support, warranty, indemnity,
- or other liability obligations and/or rights consistent with this
- License. However, in accepting such obligations, You may act only
- on Your own behalf and on Your sole responsibility, not on behalf
- of any other Contributor, and only if You agree to indemnify,
- defend, and hold each Contributor harmless for any liability
- incurred by, or claims asserted against, such Contributor by reason
- of your accepting any such warranty or additional liability.
-
- END OF TERMS AND CONDITIONS
-
- APPENDIX: How to apply the Apache License to your work.
-
- To apply the Apache License to your work, attach the following
- boilerplate notice, with the fields enclosed by brackets "[]"
- replaced with your own identifying information. (Don't include
- the brackets!) The text should be enclosed in the appropriate
- comment syntax for the file format. We also recommend that a
- file or class name and description of purpose be included on the
- same "printed page" as the copyright notice for easier
- identification within third-party archives.
-
- Copyright [yyyy] [name of copyright owner]
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
BIN lib/jetty-5.1.4.jar
Binary file not shown.
BIN lib/jetty-ext/ant.jar
Binary file not shown.
BIN lib/jetty-ext/commons-el.jar
Binary file not shown.
BIN lib/jetty-ext/jasper-compiler.jar
Binary file not shown.
BIN lib/jetty-ext/jasper-runtime.jar
Binary file not shown.
BIN lib/jetty-ext/jsp-api.jar
Binary file not shown.
19 src/java/org/apache/nutch/analysis/AnalyzerFactory.java
@@ -22,8 +22,9 @@
import org.apache.nutch.plugin.Extension;
import org.apache.nutch.plugin.ExtensionPoint;
import org.apache.nutch.plugin.PluginRuntimeException;
-import org.apache.nutch.util.LogFormatter;
-import org.apache.nutch.util.NutchConf;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.hadoop.conf.Configuration;
/**
@@ -40,12 +41,12 @@
private NutchAnalyzer DEFAULT_ANALYZER;
private ExtensionPoint extensionPoint;
- private NutchConf nutchConf;
+ private Configuration conf;
- public AnalyzerFactory (NutchConf nutchConf) {
- DEFAULT_ANALYZER = new NutchDocumentAnalyzer(nutchConf);
- this.nutchConf = nutchConf;
- this.extensionPoint = nutchConf.getPluginRepository().getExtensionPoint(NutchAnalyzer.X_POINT_ID);
+ public AnalyzerFactory (Configuration conf) {
+ DEFAULT_ANALYZER = new NutchDocumentAnalyzer(conf);
+ this.conf = conf;
+ this.extensionPoint = PluginRepository.get(conf).getExtensionPoint(NutchAnalyzer.X_POINT_ID);
if(this.extensionPoint == null) {
throw new RuntimeException("x point " + NutchAnalyzer.X_POINT_ID +
" not found.");
@@ -77,10 +78,10 @@ public NutchAnalyzer get(String lang) {
private Extension getExtension(String lang) {
- Extension extension = (Extension) this.nutchConf.getObject(lang);
+ Extension extension = (Extension) this.conf.getObject(lang);
if (extension == null) {
extension = findExtension(lang);
- this.nutchConf.setObject(lang, extension);
+ this.conf.setObject(lang, extension);
}
return extension;
}
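
Note the pattern change here: nutchConf.getPluginRepository() becomes the static PluginRepository.get(conf), which recurs in OnlineClustererFactory below, and plugin lookups stay memoized per Configuration via getObject()/setObject(). A hedged sketch of how a caller now obtains an analyzer (the constructor and get(String) appear in the diff; the fallback to the default analyzer is an assumption):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.nutch.util.NutchConfiguration;

    Configuration conf = NutchConfiguration.create();
    AnalyzerFactory factory = new AnalyzerFactory(conf);  // wires plugins via PluginRepository.get(conf)
    NutchAnalyzer analyzer = factory.get("en");           // assumed to fall back to DEFAULT_ANALYZER
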
19 src/java/org/apache/nutch/analysis/CommonGrams.java
@@ -24,8 +24,9 @@
import java.util.*;
import java.util.logging.Logger;
-import org.apache.nutch.util.*;
-
+import org.apache.hadoop.conf.*;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.searcher.Query.*;
/** Construct n-grams for frequently occuring terms and phrases while indexing.
@@ -40,10 +41,10 @@
/**
* The constructor.
- * @param nutchConf
+ * @param conf
*/
- public CommonGrams(NutchConf nutchConf) {
- init(nutchConf);
+ public CommonGrams(Configuration conf) {
+ init(conf);
}
private static class Filter extends TokenFilter {
@@ -133,10 +134,10 @@ private Token gramToken(Token first, Token second) {
}
/** Construct using the provided config file. */
- private void init(NutchConf nutchConf) {
+ private void init(Configuration conf) {
try {
- Reader reader = nutchConf.getConfResourceAsReader
- (nutchConf.get("analysis.common.terms.file"));
+ Reader reader = conf.getConfResourceAsReader
+ (conf.get("analysis.common.terms.file"));
BufferedReader in = new BufferedReader(reader);
String line;
while ((line = in.readLine()) != null) {
@@ -236,7 +237,7 @@ public static void main(String[] args) throws Exception {
text.append(' ');
}
TokenStream ts = new NutchDocumentTokenizer(new StringReader(text.toString()));
- CommonGrams commonGrams = new CommonGrams(new NutchConf());
+ CommonGrams commonGrams = new CommonGrams(NutchConfiguration.create());
ts = commonGrams.getFilter(ts, "url");
Token token;
while ((token = ts.next()) != null) {
15 src/java/org/apache/nutch/analysis/NutchAnalysis.java
@@ -5,8 +5,9 @@
import org.apache.nutch.searcher.QueryFilters;
import org.apache.nutch.searcher.Query.Clause;
import org.apache.nutch.searcher.Query.Clause;
-import org.apache.nutch.util.NutchConf;
+import org.apache.hadoop.conf.Configuration;
import org.apache.lucene.analysis.StopFilter;
+import org.apache.nutch.util.NutchConfiguration;
import java.io.*;
import java.util.*;
@@ -35,12 +36,12 @@ public static boolean isStopWord(String word) {
}
/** Construct a query parser for the text in a reader. */
- public static Query parseQuery(String queryString, NutchConf nutchConf) throws IOException {
+ public static Query parseQuery(String queryString, Configuration conf) throws IOException {
NutchAnalysis parser =
new NutchAnalysis(new FastCharStream(new StringReader(queryString)));
parser.queryString = queryString;
- parser.queryFilters = new QueryFilters(nutchConf);
- return parser.parse(nutchConf);
+ parser.queryFilters = new QueryFilters(conf);
+ return parser.parse(conf);
}
/** For debugging. */
@@ -49,13 +50,13 @@ public static void main(String[] args) throws Exception {
while (true) {
System.out.print("Query: ");
String line = in.readLine();
- System.out.println(parseQuery(line, new NutchConf()));
+ System.out.println(parseQuery(line, NutchConfiguration.create()));
}
}
/** Parse a query. */
- final public Query parse(NutchConf nutchConf) throws ParseException {
- Query query = new Query(nutchConf);
+ final public Query parse(Configuration conf) throws ParseException {
+ Query query = new Query(conf);
ArrayList terms;
Token token;
String field;
2 src/java/org/apache/nutch/analysis/NutchAnalysisTokenManager.java
@@ -4,7 +4,7 @@
import org.apache.nutch.searcher.QueryFilters;
import org.apache.nutch.searcher.Query.Clause;
import org.apache.nutch.searcher.Query.Clause;
-import org.apache.nutch.util.NutchConf;
+import org.apache.hadoop.conf.Configuration;
import org.apache.lucene.analysis.StopFilter;
import java.io.*;
import java.util.*;
13 src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java
@@ -24,7 +24,8 @@
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
-import org.apache.nutch.util.NutchConf;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.conf.Configuration;
/**
* The analyzer used for Nutch documents. Uses the JavaCC-defined lexical
@@ -44,13 +45,13 @@
public static final int INTER_ANCHOR_GAP = 4;
/** Analyzer used to analyze anchors. */
private static Analyzer ANCHOR_ANALYZER;
- private NutchConf nutchConf;
+ private Configuration conf;
/**
* @param conf
*/
- public NutchDocumentAnalyzer(NutchConf conf) {
- this.nutchConf = conf;
+ public NutchDocumentAnalyzer(Configuration conf) {
+ this.conf = conf;
CONTENT_ANALYZER = new ContentAnalyzer(conf);
ANCHOR_ANALYZER = new AnchorAnalyzer();
}
@@ -59,8 +60,8 @@ public NutchDocumentAnalyzer(NutchConf conf) {
private static class ContentAnalyzer extends Analyzer {
private CommonGrams commonGrams;
- public ContentAnalyzer(NutchConf nutchConf) {
- this.commonGrams = new CommonGrams(nutchConf);
+ public ContentAnalyzer(Configuration conf) {
+ this.commonGrams = new CommonGrams(conf);
}
/** Constructs a {@link NutchDocumentTokenizer}. */
10 src/java/org/apache/nutch/clustering/OnlineClustererFactory.java
@@ -17,9 +17,9 @@
package org.apache.nutch.clustering;
import org.apache.nutch.plugin.*;
-import org.apache.nutch.util.NutchConf;
+import org.apache.hadoop.conf.Configuration;
import java.util.logging.Logger;
-import org.apache.nutch.util.LogFormatter;
+import org.apache.hadoop.util.LogFormatter;
/**
* A factory for retrieving {@link OnlineClusterer} extensions.
@@ -33,9 +33,9 @@
private ExtensionPoint extensionPoint;
private String extensionName;
- public OnlineClustererFactory(NutchConf nutchConf) {
- this.extensionPoint = nutchConf.getPluginRepository().getExtensionPoint(OnlineClusterer.X_POINT_ID);
- this.extensionName = nutchConf.get("extension.clustering.extension-name");
+ public OnlineClustererFactory(Configuration conf) {
+ this.extensionPoint = PluginRepository.get(conf).getExtensionPoint(OnlineClusterer.X_POINT_ID);
+ this.extensionName = conf.get("extension.clustering.extension-name");
}
/**
40 src/java/org/apache/nutch/crawl/Crawl.java
@@ -22,13 +22,15 @@
import java.util.logging.*;
import org.apache.nutch.fetcher.Fetcher;
-import org.apache.nutch.fs.*;
-import org.apache.nutch.util.*;
-import org.apache.nutch.mapred.*;
+import org.apache.hadoop.fs.*;
+import org.apache.hadoop.conf.*;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.hadoop.mapred.*;
import org.apache.nutch.parse.ParseSegment;
import org.apache.nutch.indexer.DeleteDuplicates;
import org.apache.nutch.indexer.IndexMerger;
import org.apache.nutch.indexer.Indexer;
+import org.apache.nutch.util.NutchConfiguration;
public class Crawl {
public static final Logger LOG =
@@ -48,13 +50,13 @@ public static void main(String args[]) throws Exception {
return;
}
- NutchConf nutchConf = new NutchConf();
- nutchConf.addConfResource("crawl-tool.xml");
- JobConf conf = new JobConf(nutchConf);
+ Configuration conf = NutchConfiguration.create();
+ conf.addAppResource("crawl-tool.xml");
+ JobConf job = new JobConf(conf);
File rootUrlDir = null;
File dir = new File("crawl-" + getDate());
- int threads = conf.getInt("fetcher.threads.fetch", 10);
+ int threads = job.getInt("fetcher.threads.fetch", 10);
int depth = 5;
int topN = Integer.MAX_VALUE;
@@ -76,7 +78,7 @@ public static void main(String args[]) throws Exception {
}
}
- NutchFileSystem fs = NutchFileSystem.get(conf);
+ FileSystem fs = FileSystem.get(job);
if (fs.exists(dir)) {
throw new RuntimeException(dir + " already exists.");
}
@@ -95,28 +97,28 @@ public static void main(String args[]) throws Exception {
File indexes = new File(dir + "/indexes");
File index = new File(dir + "/index");
- File tmpDir = conf.getLocalFile("crawl", getDate());
+ File tmpDir = job.getLocalFile("crawl", getDate());
// initialize crawlDb
- new Injector(conf).inject(crawlDb, rootUrlDir);
+ new Injector(job).inject(crawlDb, rootUrlDir);
for (int i = 0; i < depth; i++) { // generate new segment
File segment =
- new Generator(conf).generate(crawlDb, segments, -1,
+ new Generator(job).generate(crawlDb, segments, -1,
topN, System.currentTimeMillis());
- new Fetcher(conf).fetch(segment, threads, Fetcher.isParsing(conf)); // fetch it
- if (!Fetcher.isParsing(conf)) {
- new ParseSegment(conf).parse(segment); // parse it, if needed
+ new Fetcher(job).fetch(segment, threads, Fetcher.isParsing(job)); // fetch it
+ if (!Fetcher.isParsing(job)) {
+ new ParseSegment(job).parse(segment); // parse it, if needed
}
- new CrawlDb(conf).update(crawlDb, segment); // update crawldb
+ new CrawlDb(job).update(crawlDb, segment); // update crawldb
}
- new LinkDb(conf).invert(linkDb, segments); // invert links
+ new LinkDb(job).invert(linkDb, segments); // invert links
// index, dedup & merge
- new Indexer(conf).index(indexes, crawlDb, linkDb, fs.listFiles(segments));
- new DeleteDuplicates(conf).dedup(new File[] { indexes });
- new IndexMerger(fs, fs.listFiles(indexes), index, tmpDir, nutchConf).merge();
+ new Indexer(job).index(indexes, crawlDb, linkDb, fs.listFiles(segments));
+ new DeleteDuplicates(job).dedup(new File[] { indexes });
+ new IndexMerger(fs, fs.listFiles(indexes), index, tmpDir, job).merge();
LOG.info("crawl finished: " + dir);
}
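
Condensing the hunks above, the bootstrap idiom for Nutch tools after this commit looks roughly like this (every call is taken from the diff; only the crawl loop is elided):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.mapred.JobConf;
    import org.apache.nutch.util.NutchConfiguration;

    Configuration conf = NutchConfiguration.create();  // old: new NutchConf()
    conf.addAppResource("crawl-tool.xml");             // old: nutchConf.addConfResource(...)
    JobConf job = new JobConf(conf);                   // Hadoop job configuration wrapping conf
    FileSystem fs = FileSystem.get(job);               // old: NutchFileSystem.get(conf)
    int threads = job.getInt("fetcher.threads.fetch", 10);
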
3 src/java/org/apache/nutch/crawl/CrawlDatum.java
@@ -19,7 +19,8 @@
import java.io.*;
import java.util.*;
-import org.apache.nutch.io.*;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.conf.*;
import org.apache.nutch.util.*;
/* The crawl state of a url. */
21 src/java/org/apache/nutch/crawl/CrawlDb.java
@@ -20,20 +20,23 @@
import java.util.*;
import java.util.logging.*;
-import org.apache.nutch.io.*;
-import org.apache.nutch.fs.*;
-import org.apache.nutch.util.*;
-import org.apache.nutch.mapred.*;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.fs.*;
+import org.apache.hadoop.conf.*;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.hadoop.mapred.*;
+
+import org.apache.nutch.util.NutchConfiguration;
/** This class takes a flat file of URLs and adds them to the of pages to be
* crawled. Useful for bootstrapping the system. */
-public class CrawlDb extends NutchConfigured {
+public class CrawlDb extends Configured {
public static final Logger LOG =
LogFormatter.getLogger("org.apache.nutch.crawl.CrawlDb");
/** Construct an CrawlDb. */
- public CrawlDb(NutchConf conf) {
+ public CrawlDb(Configuration conf) {
super(conf);
}
@@ -53,7 +56,7 @@ public void update(File crawlDb, File segment) throws IOException {
LOG.info("CrawlDb update: done");
}
- public static JobConf createJob(NutchConf config, File crawlDb) {
+ public static JobConf createJob(Configuration config, File crawlDb) {
File newCrawlDb =
new File(crawlDb,
Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
@@ -77,7 +80,7 @@ public static JobConf createJob(NutchConf config, File crawlDb) {
public static void install(JobConf job, File crawlDb) throws IOException {
File newCrawlDb = job.getOutputDir();
- NutchFileSystem fs = new JobClient(job).getFs();
+ FileSystem fs = new JobClient(job).getFs();
File old = new File(crawlDb, "old");
File current = new File(crawlDb, CrawlDatum.DB_DIR_NAME);
fs.delete(old);
@@ -87,7 +90,7 @@ public static void install(JobConf job, File crawlDb) throws IOException {
}
public static void main(String[] args) throws Exception {
- CrawlDb crawlDb = new CrawlDb(new NutchConf());
+ CrawlDb crawlDb = new CrawlDb(NutchConfiguration.create());
if (args.length < 2) {
System.err.println("Usage: <crawldb> <segment>");
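
CrawlDb now extends org.apache.hadoop.conf.Configured instead of NutchConfigured, a rename that recurs in Generator and the other tools in this commit. A minimal sketch of the pattern (the constructor shape is taken from the diff; the class itself is hypothetical):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.conf.Configured;

    // the shape every migrated Nutch tool follows after this commit
    public class SomeTool extends Configured {   // was: extends NutchConfigured
      public SomeTool(Configuration conf) {      // was: NutchConf
        super(conf);                             // Configured stores the configuration
      }
    }
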
54 src/java/org/apache/nutch/crawl/CrawlDbReader.java
@@ -22,27 +22,29 @@
import java.util.TreeMap;
import java.util.logging.Logger;
-import org.apache.nutch.fs.NutchFileSystem;
-import org.apache.nutch.io.LongWritable;
-import org.apache.nutch.io.MapFile;
-import org.apache.nutch.io.SequenceFile;
-import org.apache.nutch.io.UTF8;
-import org.apache.nutch.io.Writable;
-import org.apache.nutch.io.WritableComparable;
-import org.apache.nutch.mapred.JobClient;
-import org.apache.nutch.mapred.JobConf;
-import org.apache.nutch.mapred.MapFileOutputFormat;
-import org.apache.nutch.mapred.Mapper;
-import org.apache.nutch.mapred.OutputCollector;
-import org.apache.nutch.mapred.Reducer;
-import org.apache.nutch.mapred.Reporter;
-import org.apache.nutch.mapred.SequenceFileInputFormat;
-import org.apache.nutch.mapred.SequenceFileOutputFormat;
-import org.apache.nutch.mapred.TextOutputFormat;
-import org.apache.nutch.mapred.lib.HashPartitioner;
-import org.apache.nutch.util.LogFormatter;
-import org.apache.nutch.util.NutchConf;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.io.MapFile.Reader;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapFileOutputFormat;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.hadoop.mapred.TextOutputFormat;
+import org.apache.hadoop.mapred.lib.HashPartitioner;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
/**
* Read utility for the CrawlDB.
*
@@ -130,7 +132,7 @@ public void configure(JobConf job) {
}
}
- public void processStatJob(String crawlDb, NutchConf config) throws IOException {
+ public void processStatJob(String crawlDb, Configuration config) throws IOException {
LOG.info("CrawlDb statistics start: " + crawlDb);
File tmpFolder = new File(crawlDb, "stat_tmp" + System.currentTimeMillis());
@@ -152,7 +154,7 @@ public void processStatJob(String crawlDb, NutchConf config) throws IOException
JobClient.runJob(job);
// reading the result
- NutchFileSystem fileSystem = NutchFileSystem.get(config);
+ FileSystem fileSystem = FileSystem.get(config);
SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(config, tmpFolder);
UTF8 key = new UTF8();
@@ -201,8 +203,8 @@ else if (k.startsWith("min"))
}
- public void readUrl(String crawlDb, String url, NutchConf config) throws IOException {
- NutchFileSystem fs = NutchFileSystem.get(config);
+ public void readUrl(String crawlDb, String url, Configuration config) throws IOException {
+ FileSystem fs = FileSystem.get(config);
UTF8 key = new UTF8(url);
CrawlDatum val = new CrawlDatum();
MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new File(crawlDb, CrawlDatum.DB_DIR_NAME), config);
@@ -215,7 +217,7 @@ public void readUrl(String crawlDb, String url, NutchConf config) throws IOExcep
}
}
- public void processDumpJob(String crawlDb, String output, NutchConf config) throws IOException {
+ public void processDumpJob(String crawlDb, String output, Configuration config) throws IOException {
LOG.info("CrawlDb dump: starting");
LOG.info("CrawlDb db: " + crawlDb);
@@ -249,7 +251,7 @@ public static void main(String[] args) throws IOException {
}
String param = null;
String crawlDb = args[0];
- NutchConf conf = new NutchConf();
+ Configuration conf = NutchConfiguration.create();
for (int i = 1; i < args.length; i++) {
if (args[i].equals("-stats")) {
dbr.processStatJob(crawlDb, conf);
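
readUrl() shows the lookup path after the move: Hadoop's FileSystem.get(Configuration) replaces NutchFileSystem.get(NutchConf), while the MapFile readers and MapFileOutputFormat now come from org.apache.hadoop.io and org.apache.hadoop.mapred. A standalone sketch of the same lookup, assuming the getEntry() helper used by LinkDbReader below; the URL and paths are hypothetical:

import java.io.File;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.UTF8;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.hadoop.mapred.lib.HashPartitioner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.util.NutchConfiguration;

public class CrawlDatumLookupSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = NutchConfiguration.create();
    FileSystem fs = FileSystem.get(conf);        // was NutchFileSystem.get(config)
    UTF8 key = new UTF8("http://example.com/");  // hypothetical URL
    CrawlDatum val = new CrawlDatum();
    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(
        fs, new File("crawl/crawldb", CrawlDatum.DB_DIR_NAME), conf);
    // getEntry hashes the key to the right map file and reads the record.
    CrawlDatum res = (CrawlDatum)
        MapFileOutputFormat.getEntry(readers, new HashPartitioner(), key, val);
    System.out.println(res == null ? "not found" : key + "\t" + res);
  }
}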
4 src/java/org/apache/nutch/crawl/CrawlDbReducer.java
@@ -19,8 +19,8 @@
import java.util.Iterator;
import java.io.IOException;
-import org.apache.nutch.io.*;
-import org.apache.nutch.mapred.*;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.mapred.*;
/** Merge new page entries with existing entries. */
public class CrawlDbReducer implements Reducer {
17 src/java/org/apache/nutch/crawl/Generator.java
@@ -22,13 +22,16 @@
import java.text.*;
import java.util.logging.*;
-import org.apache.nutch.io.*;
-import org.apache.nutch.util.*;
-import org.apache.nutch.mapred.*;
-import org.apache.nutch.mapred.lib.*;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.conf.*;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.hadoop.mapred.*;
+import org.apache.hadoop.mapred.lib.*;
+
+import org.apache.nutch.util.NutchConfiguration;
/** Generates a subset of a crawl db to fetch. */
-public class Generator extends NutchConfigured {
+public class Generator extends Configured {
public static final Logger LOG =
LogFormatter.getLogger("org.apache.nutch.crawl.Generator");
@@ -141,7 +144,7 @@ private static int hash(byte[] bytes, int start, int length) {
}
/** Construct a generator. */
- public Generator(NutchConf conf) {
+ public Generator(Configuration conf) {
super(conf);
}
@@ -258,7 +261,7 @@ public static void main(String args[]) throws Exception {
if (topN != Long.MAX_VALUE)
LOG.info("topN: " + topN);
- Generator gen = new Generator(new NutchConf());
+ Generator gen = new Generator(NutchConfiguration.create());
gen.generate(dbDir, segmentsDir, numFetchers, topN, curTime);
}
}
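
Generator keeps its public surface; only the configuration plumbing changes. A driver sketch under the new API; the argument types are inferred from the main() shown above and the values are arbitrary:

import java.io.File;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.crawl.Generator;
import org.apache.nutch.util.NutchConfiguration;

public class GenerateSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = NutchConfiguration.create();
    Generator gen = new Generator(conf);
    // Select the top-scoring URLs due for fetching into a new segment.
    gen.generate(new File("crawl/crawldb"),    // dbDir
                 new File("crawl/segments"),   // segmentsDir
                 2,                            // numFetchers
                 1000L,                        // topN
                 System.currentTimeMillis());  // curTime
  }
}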
19 src/java/org/apache/nutch/crawl/Injector.java
@@ -20,15 +20,18 @@
import java.util.*;
import java.util.logging.*;
-import org.apache.nutch.io.*;
-import org.apache.nutch.fs.*;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.fs.*;
+import org.apache.hadoop.conf.*;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.hadoop.mapred.*;
+
import org.apache.nutch.net.*;
-import org.apache.nutch.util.*;
-import org.apache.nutch.mapred.*;
+import org.apache.nutch.util.NutchConfiguration;
/** This class takes a flat file of URLs and adds them to the database of pages to be
 * crawled. Useful for bootstrapping the system. */
-public class Injector extends NutchConfigured {
+public class Injector extends Configured {
public static final Logger LOG =
LogFormatter.getLogger("org.apache.nutch.crawl.Injector");
@@ -79,7 +82,7 @@ public void reduce(WritableComparable key, Iterator values,
}
/** Construct an Injector. */
- public Injector(NutchConf conf) {
+ public Injector(Configuration conf) {
super(conf);
}
@@ -114,14 +117,14 @@ public void inject(File crawlDb, File urlDir) throws IOException {
CrawlDb.install(mergeJob, crawlDb);
// clean up
- NutchFileSystem fs = new JobClient(getConf()).getFs();
+ FileSystem fs = new JobClient(getConf()).getFs();
fs.delete(tempDir);
LOG.info("Injector: done");
}
public static void main(String[] args) throws Exception {
- Injector injector = new Injector(new NutchConf());
+ Injector injector = new Injector(NutchConfiguration.create());
if (args.length < 2) {
System.err.println("Usage: Injector <crawldb> <url_dir>");
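
The cleanup step is representative: new JobClient(job).getFs() now hands back a Hadoop FileSystem, so temp-dir deletion needs no Nutch-specific class. A driver sketch against the inject(File, File) signature in the hunk above; the paths are hypothetical:

import java.io.File;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.crawl.Injector;
import org.apache.nutch.util.NutchConfiguration;

public class InjectSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = NutchConfiguration.create();
    Injector injector = new Injector(conf);
    // Seed the crawl db from a directory of flat files of URLs.
    injector.inject(new File("crawl/crawldb"), new File("urls"));
  }
}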
2 src/java/org/apache/nutch/crawl/Inlink.java
@@ -17,7 +17,7 @@
package org.apache.nutch.crawl;
import java.io.*;
-import org.apache.nutch.io.*;
+import org.apache.hadoop.io.*;
/* An incoming link to a page. */
public class Inlink implements Writable {
2 src/java/org/apache/nutch/crawl/Inlinks.java
@@ -20,7 +20,7 @@
import java.net.*;
import java.util.*;
-import org.apache.nutch.io.*;
+import org.apache.hadoop.io.*;
/** A list of {@link Inlink}s. */
public class Inlinks implements Writable {
21 src/java/org/apache/nutch/crawl/LinkDb.java
@@ -21,14 +21,17 @@
import java.util.logging.*;
import java.net.*;
-import org.apache.nutch.io.*;
-import org.apache.nutch.fs.*;
-import org.apache.nutch.util.*;
-import org.apache.nutch.mapred.*;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.fs.*;
+import org.apache.hadoop.conf.*;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.hadoop.mapred.*;
+
import org.apache.nutch.parse.*;
+import org.apache.nutch.util.NutchConfiguration;
/** Maintains an inverted link map, listing incoming links for each URL. */
-public class LinkDb extends NutchConfigured implements Mapper, Reducer {
+public class LinkDb extends Configured implements Mapper, Reducer {
public static final Logger LOG =
LogFormatter.getLogger("org.apache.nutch.crawl.LinkDb");
@@ -44,7 +47,7 @@ public LinkDb() {
}
/** Construct a LinkDb. */
- public LinkDb(NutchConf conf) {
+ public LinkDb(Configuration conf) {
super(conf);
}
@@ -145,7 +148,7 @@ public void invert(File linkDb, File[] segments) throws IOException {
LOG.info("LinkDb: done");
}
- private static JobConf createJob(NutchConf config, File linkDb) {
+ private static JobConf createJob(Configuration config, File linkDb) {
File newLinkDb =
new File(linkDb,
Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
@@ -171,7 +174,7 @@ private static JobConf createJob(NutchConf config, File linkDb) {
public static void install(JobConf job, File linkDb) throws IOException {
File newLinkDb = job.getOutputDir();
- NutchFileSystem fs = new JobClient(job).getFs();
+ FileSystem fs = new JobClient(job).getFs();
File old = new File(linkDb, "old");
File current = new File(linkDb, CURRENT_NAME);
fs.delete(old);
@@ -181,7 +184,7 @@ public static void install(JobConf job, File linkDb) throws IOException {
}
public static void main(String[] args) throws Exception {
- LinkDb linkDb = new LinkDb(new NutchConf());
+ LinkDb linkDb = new LinkDb(NutchConfiguration.create());
if (args.length < 2) {
System.err.println("Usage: <linkdb> <segments>");
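
LinkDb doubles as its own Mapper and Reducer, so the class now extends Configured and implements the Hadoop Mapper and Reducer interfaces. A driver sketch against the invert(File, File[]) signature in the hunk above; the segment name is hypothetical:

import java.io.File;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.crawl.LinkDb;
import org.apache.nutch.util.NutchConfiguration;

public class InvertLinksSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = NutchConfiguration.create();
    LinkDb linkDb = new LinkDb(conf);
    // Invert the outlinks of each segment into per-URL incoming-link lists.
    linkDb.invert(new File("crawl/linkdb"),
                  new File[] { new File("crawl/segments/20060204120000") });
  }
}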
32 src/java/org/apache/nutch/crawl/LinkDbReader.java
@@ -19,12 +19,14 @@
import java.io.IOException;
import java.io.File;
-import org.apache.nutch.io.*;
-import org.apache.nutch.fs.*;
-import org.apache.nutch.mapred.*;
-import org.apache.nutch.mapred.lib.HashPartitioner;
-import org.apache.nutch.util.LogFormatter;
-import org.apache.nutch.util.NutchConf;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.fs.*;
+import org.apache.hadoop.mapred.*;
+import org.apache.hadoop.mapred.lib.HashPartitioner;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.hadoop.conf.Configuration;
+
+import org.apache.nutch.util.NutchConfiguration;
import java.util.logging.Logger;
@@ -34,15 +36,15 @@
private static final Partitioner PARTITIONER = new HashPartitioner();
- private NutchFileSystem fs;
+ private FileSystem fs;
private File directory;
private MapFile.Reader[] readers;
- private NutchConf nutchConf;
+ private Configuration conf;
- public LinkDbReader(NutchFileSystem fs, File directory, NutchConf nutchConf) {
+ public LinkDbReader(FileSystem fs, File directory, Configuration conf) {
this.fs = fs;
this.directory = directory;
- this.nutchConf = nutchConf;
+ this.conf = conf;
}
public String[] getAnchors(UTF8 url) throws IOException {
@@ -57,15 +59,15 @@ public Inlinks getInlinks(UTF8 url) throws IOException {
synchronized (this) {
if (readers == null) {
readers = MapFileOutputFormat.getReaders
- (fs, new File(directory, LinkDb.CURRENT_NAME), this.nutchConf);
+ (fs, new File(directory, LinkDb.CURRENT_NAME), this.conf);
}
}
return (Inlinks)MapFileOutputFormat.getEntry
(readers, PARTITIONER, url, new Inlinks());
}
- public static void processDumpJob(String linkdb, String output, NutchConf config) throws IOException {
+ public static void processDumpJob(String linkdb, String output, Configuration config) throws IOException {
LOG.info("LinkDb dump: starting");
LOG.info("LinkDb db: " + linkdb);
File outFolder = new File(output);
@@ -92,11 +94,11 @@ public static void main(String[] args) throws Exception {
System.err.println("\t-url <url>\tprint information about <url> to System.out");
return;
}
- NutchConf nutchConf = new NutchConf();
+ Configuration conf = NutchConfiguration.create();
if (args[1].equals("-dump")) {
- LinkDbReader.processDumpJob(args[0], args[2], nutchConf);
+ LinkDbReader.processDumpJob(args[0], args[2], conf);
} else if (args[1].equals("-url")) {
- LinkDbReader dbr = new LinkDbReader(NutchFileSystem.get(new NutchConf()), new File(args[0]), nutchConf);
+ LinkDbReader dbr = new LinkDbReader(FileSystem.get(NutchConfiguration.create()), new File(args[0]), conf);
Inlinks links = dbr.getInlinks(new UTF8(args[2]));
if (links == null) {
System.out.println(" - no link information.");
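
The -url branch of main() above builds two Configurations, one for FileSystem.get() and one for the reader; a single instance can serve both. A lookup sketch using only the constructor and getInlinks() shown in this diff; the URL and path are hypothetical:

import java.io.File;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.UTF8;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.crawl.LinkDbReader;
import org.apache.nutch.util.NutchConfiguration;

public class InlinkLookupSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = NutchConfiguration.create();
    // Reuse one Configuration for both the FileSystem and the reader.
    LinkDbReader dbr =
        new LinkDbReader(FileSystem.get(conf), new File("crawl/linkdb"), conf);
    Inlinks links = dbr.getInlinks(new UTF8("http://example.com/"));
    System.out.println(links == null ? " - no link information." : links.toString());
  }
}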
2 src/java/org/apache/nutch/crawl/MD5Signature.java
@@ -16,7 +16,7 @@
package org.apache.nutch.crawl;
-import org.apache.nutch.io.MD5Hash;
+import org.apache.hadoop.io.MD5Hash;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.protocol.Content;
4 src/java/org/apache/nutch/crawl/PartitionUrlByHost.java
@@ -19,8 +19,8 @@
import java.net.URL;
import java.net.MalformedURLException;
-import org.apache.nutch.io.*;
-import org.apache.nutch.mapred.*;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.mapred.*;
/** Partition URLs by hostname. */
public class PartitionUrlByHost implements Partitioner {
12 src/java/org/apache/nutch/crawl/Signature.java
@@ -18,19 +18,19 @@
import org.apache.nutch.parse.Parse;
import org.apache.nutch.protocol.Content;
-import org.apache.nutch.util.NutchConf;
-import org.apache.nutch.util.NutchConfigurable;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configurable;
-public abstract class Signature implements NutchConfigurable {
- protected NutchConf conf;
+public abstract class Signature implements Configurable {
+ protected Configuration conf;
public abstract byte[] calculate(Content content, Parse parse);
- public NutchConf getConf() {