From bf85be9edb79e72012c87971a4507817a294b1f4 Mon Sep 17 00:00:00 2001 From: Alex Holmes Date: Tue, 24 Jan 2012 22:20:25 -0500 Subject: [PATCH] updated configs and docs to reflect new config file --- README.md | 270 +++++++----------- TESTING.md | 17 +- src/main/config/examples/basic.conf | 7 + src/main/config/examples/dynamic-dest.conf | 7 + src/main/config/examples/lzop-verify.conf | 10 + src/main/config/examples/test.conf | 7 + .../config/{example.conf => slurper.conf} | 9 +- 7 files changed, 138 insertions(+), 189 deletions(-) create mode 100644 src/main/config/examples/basic.conf create mode 100644 src/main/config/examples/dynamic-dest.conf create mode 100644 src/main/config/examples/lzop-verify.conf create mode 100644 src/main/config/examples/test.conf rename src/main/config/{example.conf => slurper.conf} (94%) diff --git a/README.md b/README.md index 8ebab09..e55aa22 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,6 @@ The source or destination directories can be local, HDFS, or any other Hadoop Fi know a single destination directory * Customizable pre-processing of file prior to transfer via script and all files are copied into that location. -* A daemon mode which will `nohup` the process and keep polling for files in the source directory * A daemon mode which is compatible with `inittab` respawn * Multi-threaded data transfer @@ -53,182 +52,86 @@ To get started, simply: 1. Download, and run `mvn package` 2. Copy the generated tarball under `target/` to a machine that has access to Hadoop, and untar. -3. Set the `HADOOP_HOME` environment variable to refer to your local hadoop installation (not required if you are -running a packaged version of CDH). -4. Run! +3. Edit `conf/slurper-env` to set your Java and Hadoop home directories +4. Edit `conf/slurper.conf` and set the properties for your environment +5. Run! -Example environment setup: -
# CDH hadoop script location
-export HADOOP_HOME=/usr/lib/hadoop
-
+### Key Settings in the Slurper Properties File
+
+The required properties you'll need to set in `conf/slurper.conf` are:
+
+1. "DATASOURCE_NAME", a logical name for the data being transferred.
+2. "SRC_DIR", the source directory for the Slurper.
+3. "WORK_DIR", where files from the source directory are moved prior to copying them to the destination.
+4. "COMPLETE_DIR", where files are moved once the copy has completed successfully.
+5. "ERROR_DIR", where files are moved if the copy fails.
+6. "DEST_STAGING_DIR", the staging directory on the destination file system where the file is copied. When the copy
+completes, files are moved into "DEST_DIR".
+7. "DEST_DIR", the destination directory for files.
+
+
+### Example 1
-To see all the options available:
-
-
usage: Slurper [-a] [-c ] [-d ] [-e ] [-g ] [-i ]
-                  [-n] [-o ] [-p] [-r] [-s ] [-t ] [-u] [-v] [-w
-                  ] [-x ] [-y] [-z ]
-            -a,--daemon                   Whether to run as a daemon (always up), or
-                                          just process the existing files and exit.
-                                          This option will also 'nohup' the process
-            -c,--compress            The codec to use to compress the file as it
-                                          is being written to the destination.
-                                          (Optional)
-            -d,--datasource-name     The data source name.  This is used to log
-                                          slurper activity to a unique log file.
-            -e,--error-dir           Error directory.  This must be a
-                                          fully-qualified URI.  For example, for a
-                                          local  /tmp directory, this would be
-                                          file:/tmp.  For a /tmp directory in HDFS,
-                                          this would be hdfs://localhost:8020/tmp or
-                                          hdfs:/tmp if you wanted to use the NameNode
-                                          host and port settings in your
-                                          core-site.xml file.
-            -g,--dest-staging-dir    Staging directory.  Files are first copied
-                                          into this directory, and after the copy has
-                                          been completed and verified, they are moved
-                                          into the destination directory.  This must
-                                          be a fully-qualified URI.  For example, for
-                                          a local  /tmp directory, this would be
-                                          file:/tmp.  For a /tmp directory in HDFS,
-                                          this would be hdfs://localhost:8020/tmp or
-                                          hdfs:/tmp if you wanted to use the NameNode
-                                          host and port settings in your
-                                          core-site.xml file.
-            -i,--script              A shell script which can be called to
-                                          determine the destination directory.The
-                                          standard input will contain a single line
-                                          with the fully qualified URI of the source
-                                          file, and the script must put the
-                                          destination  full path on standard output.
-                                          This must be a fully-qualified URI.  For
-                                          example, for a local  /tmp directory, this
-                                          would be file:/tmp.  For a /tmp directory
-                                          in HDFS, this would be
-                                          hdfs://localhost:8020/tmp or hdfs:/tmp if
-                                          you wanted to use the NameNode host and
-                                          port settings in your core-site.xml file.
-                                          Either this or the "hdfsdif" option must be
-                                          set.
-            -n,--create-done-file         Touch a file in the destination directory
-                                          after the file copy process has completed.
-                                          The done filename is the same as the
-                                          destination file appended with ".done"
-                                          (Optional)
-            -o,--complete-dir        Completion directory where file is moved
-                                          after successful copy.  Must be in the same
-                                          filesystem as the source file.  Either this
-                                          or the "remove" option must be set.
-            -p,--poll-period-millis       The time threads wait in milliseconds
-                                          between polling the file system for new
-                                          files. (Optional)
-            -r,--remove-after-copy        Remove the source file after a successful
-                                          copy.  Either this or the "completedir"
-                                          option must be set.
-            -s,--src-dir             Source directory.  This must be a
-                                          fully-qualified URI.  For example, for a
-                                          local  /tmp directory, this would be
-                                          file:/tmp.  For a /tmp directory in HDFS,
-                                          this would be hdfs://localhost:8020/tmp or
-                                          hdfs:/tmp if you wanted to use the NameNode
-                                          host and port settings in your
-                                          core-site.xml file.
-            -t,--dest-dir            Destination directory where files should be
-                                          copied. Either this or the "script" option
-                                          must be set. This must be a fully-qualified
-                                          URI.  For example, for a local  /tmp
-                                          directory, this would be file:/tmp.  For a
-                                          /tmp directory in HDFS, this would be
-                                          hdfs://localhost:8020/tmp or hdfs:/tmp if
-                                          you wanted to use the NameNode host and
-                                          port settings in your core-site.xml file.
-            -u,--daemon-no-bkgrnd         Whether to run as a daemon (always up), or
-                                          just process the existing files and exit.
-                                          This option is suitable for inittab respawn
-                                          execution, where the Java process isn't
-                                          launched in the background.
-            -v,--verify                   Verify the file after it has been copied.
-                                          This is slow as it involves reading the
-                                          entire destination file after the copy has
-                                          completed. (Optional)
-            -w,--work-dir            Work directory.  This must be a
-                                          fully-qualified URI.  For example, for a
-                                          local  /tmp directory, this would be
-                                          file:/tmp.  For a /tmp directory in HDFS,
-                                          this would be hdfs://localhost:8020/tmp or
-                                          hdfs:/tmp if you wanted to use the NameNode
-                                          host and port settings in your
-                                          core-site.xml file.
-            -x,--threads             The number of worker threads.  (Optional)
-            -y,--create-lzo-index         If the compression codec is
-                                          com.hadoop.compression.lzo.LzopCodec,  an
-                                          index file will be created post transfer.
-                                          (Optional)
-            -z,--work-script         A shell script which can be called to after
-                                          the file is moved into the work directory
-                                          but before it is copied to the
-                                          destination.This gives users the chance to
-                                          modify the contents of the file and change
-                                          the filename prior to it being uploaded by
-                                          the Slurper.An example of usage would be if
-                                          files are dumped into the in folder and you
-                                          need to uncompress them and also change the
-                                          filename to include a timestamp.The
-                                          standard input will contain a single line
-                                          with the fully qualified URI of the source
-                                          file in the work directory.The script must
-                                          create a file in the word directory and
-                                          return the fully-qualified URI of the new
-                                          file in the work directory on standard
-                                          output.
+This example copies files from the local directory `/tmp/slurper/in` to the HDFS directory `/incoming/`.  Note that Hadoop URIs are used, which is a
+requirement for all paths.
+
+
shell$ cat conf/examples/basic.conf
+DATASOURCE_NAME = test
+SRC_DIR = file:/tmp/slurper/in
+WORK_DIR = file:/tmp/slurper/work
+COMPLETE_DIR = file:/tmp/slurper/complete
+ERROR_DIR = file:/tmp/slurper/error
+DEST_STAGING_DIR = hdfs:/incoming/stage
+DEST_DIR = hdfs:/incoming
 
-If you wanted a one-time transfer of files from a local /app/slurper/in directory into a /user/ali/in directory in -HDFS your usage would look like this: - -
bin/slurper.sh \
-  --datasource-name test \
-  --src-dir file:/app/slurper/in \
-  --dest-dir hdfs:/user/ali/in \
-  --dest-staging-dir hdfs:/user/ali/staging \
-  --work-dir file:/app/slurper/work \
-  --complete-dir file:/app/slurper/complete \
-  --error-dir file:/app/slurper/error
+Run the Slurper in foreground mode in a console:
+
+
shell$ bin/slurper.sh \
+  --config-file /path/to/slurper/conf/examples/basic.conf
 
-After a file is copied into HDFS there are two options for how the source file is handled,
-you can either supply the `--remove` option to remove it, or specify the `--complete-dir` directory (as in our above example) into which
-the file is moved.
+In another console, create a test file and watch the Slurper copy it into HDFS:
+
+
shell$ echo "test" > /tmp/slurper/in/test.txt
+
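+Once the Slurper picks up the file you can confirm that it arrived using a standard Hadoop
+filesystem listing. The listing below is illustrative; the user, group and timestamp will
+differ on your system:
+
+shell$ hadoop fs -ls /incoming
+Found 1 items
+-rw-r--r--   1 user group          5 2012-01-24 22:20 /incoming/test.txt
+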
-### Compression
+### Example 2
-You can also compress the HDFS output file with the `--compress` option, which takes a Hadoop CompressionCodec
-class. The default behavior is to append the codec-specific extension to the end of the destination file in HDFS. If
-you don't want this to occur, you must provide a script and specify an alternative HDFS filename.
-For example to run the same command as above and use the default (DEFLATE) compression codec in Hadoop, you would:
+This example uses the same directories as Example 1, but LZOP-compresses and indexes files as they are written to the destination.
+We also verify the write, which reads the destination file after the copy has completed and compares its
+checksum with that of the original file. Verification works with or without compression.
-
bin/slurper.sh \
-  --datasource-name test \
-  --src-dir file:/app/slurper/in \
-  --dest-dir hdfs:/user/ali/in \
-  --dest-staging-dir hdfs:/user/ali/staging \
-  --work-dir file:/app/slurper/work \
-  --complete-dir file:/app/slurper/complete \
-  --error-dir file:/app/slurper/error \
-  --compress org.apache.hadoop.io.compress.DefaultCodec
+
shell$ cat conf/examples/lzop-verify.conf
+DATASOURCE_NAME = test
+SRC_DIR = file:/tmp/slurper/in
+WORK_DIR = file:/tmp/slurper/work
+COMPLETE_DIR = file:/tmp/slurper/complete
+ERROR_DIR = file:/tmp/slurper/error
+DEST_STAGING_DIR = hdfs:/incoming/stage
+DEST_DIR = hdfs:/incoming
+COMPRESSION_CODEC = com.hadoop.compression.lzo.LzopCodec
+CREATE_LZO_INDEX = true
+VERIFY = true
 
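+With this configuration the destination file is written with the `.lzo` extension appended, and the
+indexing step produces a companion index file alongside it. A rough sketch of the resulting listing
+(the exact sizes, and the index filename, depend on your hadoop-lzo version):
+
+shell$ hadoop fs -ls /incoming
+Found 2 items
+-rw-r--r--   1 user group         57 2012-01-24 22:20 /incoming/test.txt.lzo
+-rw-r--r--   1 user group          8 2012-01-24 22:20 /incoming/test.txt.lzo.index
+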
-### Fine-grained control over HDFS file destinations
+The Slurper itself is executed in the same fashion as in Example 1.
-If you want to have control on a file-by-file basis as to the destination HDFS directory and file, use the
-"--script" option to specify a local executable script. The source filename in URI form will be supplied to the standard input
-of the script, and the script should produce the target HDFS destination file in URI form on standard output as a single line.
+### Example 3
-For example, this is a simple Python script which uses the date in the filename to partition files into separate
-directories in HDFS by date.
+In our final example we use the same local source directory, but in conjunction with
+a script that dynamically maps each source filename to an HDFS directory.
-
#!/usr/bin/python
+The source filename in URI form will be supplied to the standard input of the script, and the
+script should produce the target destination file in URI form on standard output as a single line.
+
+The script below, which is written in Python (but could be any executable), extracts the date from the
+filename and uses it to write into an HDFS directory of the form `/data/YYYY/MM/DD/`.
+
+
shell$ cat bin/sample-python.py
+#!/usr/bin/python
 
 import sys, os, re
 
@@ -239,26 +142,45 @@ input_file=sys.stdin.readline()
 filename = os.path.basename(input_file)
 
 # extract the date from the filename
-date=re.search(r'([0-9]{4}\-[0-9]{2}\-[0-9]{2})', filename).group(1)
+match=re.search(r'([0-9]{4})([0-9]{2})([0-9]{2})', filename)
+
+year=match.group(1)
+mon=match.group(2)
+day=match.group(3)
 
 # construct our destination HDFS file
-hdfs_dest="hdfs:/data/%s/%s" % (date, filename)
+hdfs_dest="hdfs:/data/%s/%s/%s/%s" % (year, mon, day, filename)
 
 # write it to standard output
 print hdfs_dest,
 
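+You can sanity-check the script from the command line before wiring it into a configuration,
+since its contract is simply a URI on standard input and a URI on standard output:
+
+shell$ echo "file:/tmp/slurper/in/apache-20110202.log" | python bin/sample-python.py
+hdfs:/data/2011/02/02/apache-20110202.log
+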
+Our configuration needs to include the absolute path to the Python script. Note too that we don't +define "DEST_DIR", since it and "SCRIPT" are mutually exclusive. + +
shell$ cat conf/examples/dynamic-dest.conf
+DATASOURCE_NAME = test
+SRC_DIR = file:/tmp/slurper/in
+WORK_DIR = file:/tmp/slurper/work
+COMPLETE_DIR = file:/tmp/slurper/complete
+ERROR_DIR = file:/tmp/slurper/error
+DEST_STAGING_DIR = hdfs:/incoming/stage
+SCRIPT = /path/to/slurper/bin/sample-python.py
+
+ + And you would use it as follows: -
touch /app/apache-20110202.log
-
-bin/slurper.sh \
-  --datasource-name test \
-  --src-dir file:/app/slurper/in \
-  --script "/app/slurper.sh/sample-python.py" \
-  --dest-staging-dir hdfs:/user/ali/staging \
-  --work-dir file:/app/slurper/work  \
-  --complete-dir file:/app/slurper/complete \
-  --error-dir file:/app/slurper/error
-INFO hdfsslurper.Slurper: Copying source file 'file:/app/slurper/in/apache-2011-02-02.log' to destination 'hdfs:/data/2011/02/02/apache-2011-02-02.log
+
shell$ touch /tmp/slurper/in/apache-20110202.log
+
+shell$ bin/slurper.sh \
+  --config-file /path/to/slurper/conf/examples/dynamic-dest.conf
+
+Launching script '/tmp/hdfs-file-slurper/src/main/python/sample-python.py' and piping the following to stdin 'file:/tmp/slurper/work/apache-20110202.log'
+Copying source file 'file:/tmp/slurper/work/apache-20110202.log' to staging destination 'hdfs:/incoming/stage/675861557'
+Attempting creation of target directory: hdfs:/data/2011/02/02
+Local file size = 0, HDFS file size = 0
+Moving staging file 'hdfs:/incoming/stage/675861557' to destination 'hdfs:/data/2011/02/02/apache-20110202.log'
+File copy successful, moving source file:/tmp/slurper/work/apache-20110202.log to completed file file:/tmp/slurper/complete/apache-20110202.log
 
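+As a final check, listing the dated directory should show the file. Illustrative output (the file
+is empty because it was created with `touch`):
+
+shell$ hadoop fs -ls /data/2011/02/02
+Found 1 items
+-rw-r--r--   1 user group          0 2012-01-24 22:20 /data/2011/02/02/apache-20110202.log
+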
+ diff --git a/TESTING.md b/TESTING.md index 9a8e3dd..e729786 100644 --- a/TESTING.md +++ b/TESTING.md @@ -14,23 +14,18 @@ $ sudo dd bs=1048576 count=1 skip=0 if=/dev/sda of=/tmp/slurper-test/in/random-f $ md5sum /tmp/slurper-test/in/random-file 969249981fa294b1273b91ec4dc3d34b /tmp/slurper-test/in/random-file
-3. Run the HDFS Slurper in standalone mode. -
export HADOOP_HOME=/usr/lib/hadoop
+3.  Edit `conf/slurper-env.sh` and set your JAVA_HOME and HADOOP_HOME settings.
+4.  Run the HDFS Slurper in standalone mode.
+

 bin/slurper.sh \
-  --datasource-name test \
-  --src-dir file:/tmp/slurper-test/in \
-  --dest-dir hdfs:/tmp/slurper-test/dest \
-  --dest-staging-dir hdfs:/tmp/slurper-test/staging \
-  --work-dir file:/tmp/slurper-test/work \
-  --complete-dir file:/tmp/slurper-test/complete \
-  --error-dir file:/tmp/slurper-test/error
+  --config-file /path/to/slurper/conf/examples/test.conf
 
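+For reference, `conf/examples/test.conf` points all of the Slurper directories at the
+`/tmp/slurper-test` layout used in this test:
+
+shell$ cat conf/examples/test.conf
+DATASOURCE_NAME = test
+SRC_DIR = file:/tmp/slurper-test/in
+WORK_DIR = file:/tmp/slurper-test/work
+COMPLETE_DIR = file:/tmp/slurper-test/complete
+ERROR_DIR = file:/tmp/slurper-test/error
+DEST_STAGING_DIR = hdfs:/tmp/slurper-test/staging
+DEST_DIR = hdfs:/tmp/slurper-test/dest
+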
-4. Verify that the file was copied into HDFS +5. Verify that the file was copied into HDFS
$ fs -ls /tmp/slurper-test/dest/random-file
 Found 1 items
 -rw-r--r--   1 user group    1048576 2012-01-17 21:09 /tmp/slurper-test/dest/random-file
 
-5. Get the MD5 hash of the file in HDFS and verify it's the same as the original MD5 in step 2 +6. Get the MD5 hash of the file in HDFS and verify it's the same as the original MD5 in step 2
$ fs -cat /tmp/slurper-test/dest/random-file | md5sum
 969249981fa294b1273b91ec4dc3d34b  -
 
diff --git a/src/main/config/examples/basic.conf b/src/main/config/examples/basic.conf new file mode 100644 index 0000000..37c03f1 --- /dev/null +++ b/src/main/config/examples/basic.conf @@ -0,0 +1,7 @@ +DATASOURCE_NAME = test +SRC_DIR = file:/tmp/slurper/in +WORK_DIR = file:/tmp/slurper/work +COMPLETE_DIR = file:/tmp/slurper/complete +ERROR_DIR = file:/tmp/slurper/error +DEST_STAGING_DIR = hdfs:/incoming/stage +DEST_DIR = hdfs:/incoming diff --git a/src/main/config/examples/dynamic-dest.conf b/src/main/config/examples/dynamic-dest.conf new file mode 100644 index 0000000..1bdda7d --- /dev/null +++ b/src/main/config/examples/dynamic-dest.conf @@ -0,0 +1,7 @@ +DATASOURCE_NAME = test +SRC_DIR = file:/tmp/slurper/in +WORK_DIR = file:/tmp/slurper/work +COMPLETE_DIR = file:/tmp/slurper/complete +ERROR_DIR = file:/tmp/slurper/error +DEST_STAGING_DIR = hdfs:/incoming/stage +SCRIPT = /tmp/hdfs-file-slurper/src/main/python/sample-python.py \ No newline at end of file diff --git a/src/main/config/examples/lzop-verify.conf b/src/main/config/examples/lzop-verify.conf new file mode 100644 index 0000000..f2640ae --- /dev/null +++ b/src/main/config/examples/lzop-verify.conf @@ -0,0 +1,10 @@ +DATASOURCE_NAME = test +SRC_DIR = file:/tmp/slurper/in +WORK_DIR = file:/tmp/slurper/work +COMPLETE_DIR = file:/tmp/slurper/complete +ERROR_DIR = file:/tmp/slurper/error +DEST_STAGING_DIR = hdfs:/incoming/stage +DEST_DIR = hdfs:/incoming +COMPRESSION_CODEC = com.hadoop.compression.lzo.LzopCodec +CREATE_LZO_INDEX = true +VERIFY = true diff --git a/src/main/config/examples/test.conf b/src/main/config/examples/test.conf new file mode 100644 index 0000000..2297f24 --- /dev/null +++ b/src/main/config/examples/test.conf @@ -0,0 +1,7 @@ +DATASOURCE_NAME = test +SRC_DIR = file:/tmp/slurper-test/in +WORK_DIR = file:/tmp/slurper-test/work +COMPLETE_DIR = file:/tmp/slurper-test/complete +ERROR_DIR = file:/tmp/slurper-test/error +DEST_STAGING_DIR = hdfs:/tmp/slurper-test/staging +DEST_DIR = hdfs:/tmp/slurper-test/dest \ No newline at end of file diff --git a/src/main/config/example.conf b/src/main/config/slurper.conf similarity index 94% rename from src/main/config/example.conf rename to src/main/config/slurper.conf index 048862d..81b9e5f 100644 --- a/src/main/config/example.conf +++ b/src/main/config/slurper.conf @@ -1,14 +1,14 @@ ############################################################################# # # A configuration file which can be used with the -# "__config_file" option. +# "--config-file" option. # ############################################################################# # A name used for the PID file, as well as the log filename, to support # multiple Slurpers instances working from the same installation directory. # -DATASOURCE_NAME = alex +DATASOURCE_NAME = test # The source directory. This must be a fully_qualified URI. # @@ -45,11 +45,12 @@ DEST_DIR = hdfs:/tmp/slurper/dest # The compression codec which should be used to compress the output. # -COMPRESSION_CODEC = com.hadoop.compression.lzo.LzopCodec +# COMPRESSION_CODEC = com.hadoop.compression.lzo.LzopCodec + # If the destination file is LZOP, this option will create an index file. # -CREATE_LZO_INDEX = true +# CREATE_LZO_INDEX = true # Reads the destination file after the copy has completed and verifies # its integrity.