moved from existing repo to its own repo

commit a7d0b7870099a317795548646f31c2e0a576767f 0 parents
aeden authored
Showing with 7,357 additions and 0 deletions.
  1. +1 −0  .gitignore
  2. +6 −0 0.9-UPGRADE
  3. +190 −0 CHANGELOG
  4. +8 −0 HOW_TO_RELEASE
  5. +7 −0 LICENSE
  6. +85 −0 README
  7. +153 −0 Rakefile
  8. +28 −0 TODO
  9. +78 −0 active_support_logger.patch
  10. +28 −0 bin/etl
  11. +8 −0 bin/etl.cmd
  12. +16 −0 examples/database.example.yml
  13. +78 −0 lib/etl.rb
  14. +2 −0  lib/etl/batch.rb
  15. +111 −0 lib/etl/batch/batch.rb
  16. +55 −0 lib/etl/batch/directives.rb
  17. +2 −0  lib/etl/builder.rb
  18. +96 −0 lib/etl/builder/date_dimension_builder.rb
  19. +31 −0 lib/etl/builder/time_dimension_builder.rb
  20. +89 −0 lib/etl/commands/etl.rb
  21. +3 −0  lib/etl/control.rb
  22. +403 −0 lib/etl/control/control.rb
  23. +420 −0 lib/etl/control/destination.rb
  24. +95 −0 lib/etl/control/destination/database_destination.rb
  25. +124 −0 lib/etl/control/destination/file_destination.rb
  26. +109 −0 lib/etl/control/source.rb
  27. +220 −0 lib/etl/control/source/database_source.rb
  28. +11 −0 lib/etl/control/source/enumerable_source.rb
  29. +90 −0 lib/etl/control/source/file_source.rb
  30. +39 −0 lib/etl/control/source/model_source.rb
  31. +1 −0  lib/etl/core_ext.rb
  32. +5 −0 lib/etl/core_ext/time.rb
  33. +42 −0 lib/etl/core_ext/time/calculations.rb
  34. +552 −0 lib/etl/engine.rb
  35. +20 −0 lib/etl/execution.rb
  36. +9 −0 lib/etl/execution/base.rb
  37. +8 −0 lib/etl/execution/batch.rb
  38. +8 −0 lib/etl/execution/job.rb
  39. +85 −0 lib/etl/execution/migration.rb
  40. +18 −0 lib/etl/execution/record.rb
  41. +2 −0  lib/etl/generator.rb
  42. +20 −0 lib/etl/generator/generator.rb
  43. +39 −0 lib/etl/generator/surrogate_key_generator.rb
  44. +139 −0 lib/etl/http_tools.rb
  45. +11 −0 lib/etl/parser.rb
  46. +49 −0 lib/etl/parser/apache_combined_log_parser.rb
  47. +74 −0 lib/etl/parser/delimited_parser.rb
  48. +65 −0 lib/etl/parser/fixed_width_parser.rb
  49. +41 −0 lib/etl/parser/parser.rb
  50. +218 −0 lib/etl/parser/sax_parser.rb
  51. +65 −0 lib/etl/parser/xml_parser.rb
  52. +11 −0 lib/etl/processor.rb
  53. +14 −0 lib/etl/processor/block_processor.rb
  54. +81 −0 lib/etl/processor/bulk_import_processor.rb
  55. +80 −0 lib/etl/processor/check_exist_processor.rb
  56. +35 −0 lib/etl/processor/check_unique_processor.rb
  57. +26 −0 lib/etl/processor/copy_field_processor.rb
  58. +55 −0 lib/etl/processor/encode_processor.rb
  59. +55 −0 lib/etl/processor/hierarchy_exploder_processor.rb
  60. +12 −0 lib/etl/processor/print_row_processor.rb
  61. +25 −0 lib/etl/processor/processor.rb
  62. +24 −0 lib/etl/processor/rename_processor.rb
  63. +26 −0 lib/etl/processor/require_non_blank_processor.rb
  64. +17 −0 lib/etl/processor/row_processor.rb
  65. +23 −0 lib/etl/processor/sequence_processor.rb
  66. +53 −0 lib/etl/processor/surrogate_key_processor.rb
  67. +35 −0 lib/etl/processor/truncate_processor.rb
  68. +20 −0 lib/etl/row.rb
  69. +14 −0 lib/etl/screen.rb
  70. +20 −0 lib/etl/screen/row_count_screen.rb
  71. +2 −0  lib/etl/transform.rb
  72. +13 −0 lib/etl/transform/block_transform.rb
  73. +20 −0 lib/etl/transform/date_to_string_transform.rb
  74. +51 −0 lib/etl/transform/decode_transform.rb
  75. +20 −0 lib/etl/transform/default_transform.rb
  76. +122 −0 lib/etl/transform/foreign_key_lookup_transform.rb
  77. +49 −0 lib/etl/transform/hierarchy_lookup_transform.rb
  78. +12 −0 lib/etl/transform/ordinalize_transform.rb
  79. +13 −0 lib/etl/transform/sha1_transform.rb
  80. +16 −0 lib/etl/transform/string_to_date_transform.rb
  81. +14 −0 lib/etl/transform/string_to_datetime_transform.rb
  82. +11 −0 lib/etl/transform/string_to_time_transform.rb
  83. +61 −0 lib/etl/transform/transform.rb
  84. +26 −0 lib/etl/transform/trim_transform.rb
  85. +35 −0 lib/etl/transform/type_transform.rb
  86. +59 −0 lib/etl/util.rb
  87. +9 −0 lib/etl/version.rb
  88. +2 −0  test/.ignore
  89. +6 −0 test/all.ebf
  90. +11 −0 test/apache_combined_log.ctl
  91. +41 −0 test/batch_test.rb
  92. +6 −0 test/batch_with_error.ebf
  93. 0  test/batched1.ctl
  94. 0  test/batched2.ctl
  95. +6 −0 test/block_processor.ctl
  96. +1 −0  test/block_processor_error.ctl
  97. +4 −0 test/block_processor_pre_post_process.ctl
  98. +5 −0 test/block_processor_remove_rows.ctl
  99. +38 −0 test/block_processor_test.rb
  100. +9 −0 test/connection/native_mysql/connection.rb
  101. +36 −0 test/connection/native_mysql/schema.sql
  102. +13 −0 test/connection/postgresql/connection.rb
  103. +43 −0 test/connection/postgresql/schema.sql
  104. +43 −0 test/control_test.rb
  105. +3 −0  test/data/apache_combined_log.txt
  106. +3 −0  test/data/bulk_import.txt
  107. +3 −0  test/data/bulk_import_with_empties.txt
  108. +3 −0  test/data/decode.txt
  109. +3 −0  test/data/delimited.txt
  110. +2 −0  test/data/encode_source_latin1.txt
  111. +3 −0  test/data/fixed_width.txt
  112. +3 −0  test/data/multiple_delimited_1.txt
  113. +3 −0  test/data/multiple_delimited_2.txt
  114. +3 −0  test/data/people.txt
  115. +14 −0 test/data/sax.xml
  116. +16 −0 test/data/xml.xml
  117. +18 −0 test/database.example.yml
  118. +18 −0 test/database.mysql.yml
  119. +18 −0 test/database.postgres.yml
  120. +18 −0 test/database.yml
  121. +96 −0 test/date_dimension_builder_test.rb
  122. +30 −0 test/delimited.ctl
  123. +33 −0 test/delimited_absolute.ctl
  124. +25 −0 test/delimited_destination_db.ctl
  125. +34 −0 test/delimited_with_bulk_load.ctl
  126. +171 −0 test/destination_test.rb
  127. +23 −0 test/directive_test.rb
  128. +31 −0 test/encode_processor_test.rb
  129. +32 −0 test/engine_test.rb
  130. +24 −0 test/errors.ctl
  131. +42 −0 test/etl_test.rb
  132. +35 −0 test/fixed_width.ctl
  133. +14 −0 test/generator_test.rb
  134. +17 −0 test/inline_parser.ctl
  135. +26 −0 test/mocks/mock_destination.rb
  136. +25 −0 test/mocks/mock_source.rb
  137. +14 −0 test/model_source.ctl
  138. +22 −0 test/multiple_delimited.ctl
  139. +39 −0 test/multiple_source_delimited.ctl
  140. +1 −0  test/output/.ignore
  141. +3 −0  test/output/delimited.txt
  142. +2 −0  test/output/encode_destination_utf-8.txt
  143. +3 −0  test/output/fixed_width.txt
  144. +3 −0  test/output/inline_parser.txt
  145. +1 −0  test/output/scd_test_type_1.txt
  146. +1 −0  test/output/scd_test_type_1_1.txt
  147. +1 −0  test/output/scd_test_type_1_2.txt
  148. +2 −0  test/output/scd_test_type_2.txt
  149. +2 −0  test/output/test_file_destination.2.txt
  150. +2 −0  test/output/test_file_destination.txt
  151. +1 −0  test/output/test_multiple_unique.txt
  152. +2 −0  test/output/test_unique.txt
  153. +200 −0 test/parser_test.rb
  154. +30 −0 test/performance/delimited.ctl
  155. +38 −0 test/processor_test.rb
  156. +17 −0 test/row_processor_test.rb
  157. +26 −0 test/sax.ctl
  158. +1 −0  test/scd/1.txt
  159. +1 −0  test/scd/2.txt
  160. +1 −0  test/scd/3.txt
  161. +271 −0 test/scd_test.rb
  162. +43 −0 test/scd_test_type_1.ctl
  163. +42 −0 test/scd_test_type_2.ctl
  164. +9 −0 test/screen_test.rb
  165. +3 −0  test/screen_test_error.ctl
  166. +3 −0  test/screen_test_fatal.ctl
  167. +139 −0 test/source_test.rb
  168. +33 −0 test/test_helper.rb
  169. +101 −0 test/transform_test.rb
  170. +31 −0 test/xml.ctl
1  .gitignore
@@ -0,0 +1 @@
+pkg/*
6 0.9-UPGRADE
@@ -0,0 +1,6 @@
+The 0.9 revision of ActiveWarehouse ETL significantly changes how connections are maintained. This release is not backwards compatible.
+
+To upgrade, you must do the following:
+
+1.) All database connections used in ETL control files must be declared in database.yml in the directory that contains your ETL control files.
+2.) All sources, destinations, transforms and processors that use a database connection must include the configuration name/value pair of :target => 'name' where name is replaced with the connection name defined in database.yml. Connection information should no longer be included in control files.
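
A minimal sketch of the 0.9 connection style described above. The connection
name 'datawarehouse' and the table name are illustrative, not part of this
commit:

    # Control file fragment: the destination names a connection from
    # database.yml via :target instead of embedding connection details.
    destination :out, {
      :type => :database,
      :target => 'datawarehouse',    # connection name defined in database.yml
      :table => 'person_dimension'   # hypothetical table
    },
    {
      :order => [:id, :first_name, :last_name]
    }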
190 CHANGELOG
@@ -0,0 +1,190 @@
+0.1.0 - Dec 6, 2006
+* Initial release
+
+0.2.0 - Dec 7, 2006
+* Added an XML parser for source parsing
+* Added support for compound key constraints in destinations via the
+ :unique => [] option
+* Added ability to declare explicit columns in bulk import
+* Added support for generators in destinations
+* Added a SurrogateKeyGenerator for cases where the database doesn't support
+ auto generated surrogate keys
+
+0.3.0 - Dec 19, 2006
+* Added support for calculated values in virtual fields with Proc
+
+0.4.0 - Jan 11, 2007
+* Added :skip_lines option to file source configurations, which can be used
+ to skip the first n lines in the source data file
+* Added better error handling in delimited parser - an error is now raised
+ if the expected and actual field lengths do not match
+* Added :truncate option for database destination. Set to true to truncate
+ before importing data.
+* Added support for :unique => [] option and virtual fields for the database
+ destination
+
+0.5.0 - Feb 17, 2007
+* Changed require_gem to gem and added alias to allow for older versions of
+ rubygems.
+* Added support for Hash in the source configuration where :name => :parser_name
+ defines the parser to use and :options => {} defines options to pass to the
+ parser.
+* Added support for passing a custom Parser class in the source configuration.
+* Removed the need to include Enumerable in each parser implementation.
+* Added new date_to_string and string_to_date transformers.
+* Implemented foreign_key_lookup transform including an ActiveRecordResolver.
+* Added real time activity logging which is called when the etl bin script is
+ invoked.
+* Improved error handling.
+* Default logger level is now WARN.
+
+0.5.1 - Feb 18, 2007
+* Fixed up truncate processor.
+* Updated HOW_TO_RELEASE doc.
+
+0.5.2 - Feb 19, 2007
+* Added error threshold.
+* Fixed problem with transform error handling.
+
+0.6.0 - Mar 8, 2007
+* Fixed missing method problem in validate in Control class.
+* Removed control validation for now (source could be code in the control file).
+* Transform interface now defined as taking 3 arguments, the field name, field
+ value and the row. This is not backwards compatible.
+* Added HierarchyLookupTransform.
+* Added DefaultTransform which will return a specified value if the initial
+ value is blank.
+* Added row-level processing.
+* Added HierarchyExploderProcessor which takes a single hierarchy row and
+ explodes it to multiple rows as used in a hierarchy bridge.
+* Added ApacheCombinedLogParser which parses the Apache Combined Log format,
+  including parsing of the user agent string and the URI, returning a Hash.
+* Fixed bug in SAX parser so that attributes are now set when the start_element
+ event is received.
+* Added an HttpTools module which provides some parsing methods (for user agent
+ and URI).
+* Database source now uses its own class for establishing an ActiveRecord
+ connection.
+* Log files are now timestamped.
+* Source files are now archived automatically during the extraction process
+* Added a :condition option to the destination configuration Hash that accepts
+ a Proc with a single argument passed to it (the row).
+* Added an :append_rows option to the destination configuration Hash that
+ accepts either a Hash (to append a single row) or an Array of Hashes (to
+ append multiple rows).
+* Only print the read and written row counts if there is at least one source
+ and one destination respectively.
+* Added a depends_on directive that accepts a list of arguments of either strings
+ or symbols. Each symbol is converted to a string and .ctl is appended;
+ strings are passed through directly. The dependencies are executed in the order
+ they are specified.
+* The default field separator in the bulk loader is now a comma (was a tab).
+
+0.6.1 - Mar 22, 2007
+* Added support for absolute paths in file sources
+* Added CopyFieldProcessor
+
+0.7 - Apr 8, 2007
+* Job execution is now tracked in a database. This means that ActiveRecord is
+ required regardless of the sources being used in the ETL scripts. An example
+ database configuration for the etl can be found in test/database.example.yml.
+ This file is loaded from either a.) the current working directory or b.) the
+ location specified using the -c command line argument when running the etl
+ command.
+* etl script now supports the following command line arguments:
+** -h or --help: Prints the usage
+** -l or --limit: Specifies a limit for the number of source rows to read,
+ useful for testing your control files before executing a full ETL process
+** -o or --offset: Specifies a start offset for reading from the source, useful
+   for testing your control files before executing a full ETL process
+** -c or --config: Specify the database.yml file to configure the ETL
+ execution data store
+** -n or --newlog: Write to the logfile rather than appending to it
+* Database source now supports specifying the select, join and order parts of
+ the query.
+* Database source understands the limit argument specified on the etl command
+ line
+* Added CheckExistProcessor
+* Added CheckUniqueProcessor
+* Added SurrogateKeyProcessor. The SurrogateKey processor should be used in
+ conjunction with the CheckExistProcessor and CheckUniqueProcessor to provide
+ surrogate keys for all dimension records.
+* Added SequenceProcessor
+* Added OrdinalizeTransform
+* Fixed a bug in the trim transform
+* Sources now provide a trigger file which can be used to indicate that the
+ original source data has been completely extracted to the local file system.
+ This is useful if you need to recover from a failed ETL process.
+* Updated README
+
+0.7.1 - Apr 8, 2007
+* Fixed source caching
+
+0.7.2 - Apr 8, 2007
+* Fixed quoting bug in CheckExistProcessor
+
+0.8.0 - Apr 12, 2007
+* Source now available through the current row source accessor.
+* Added new_rows_only configuration option to DatabaseSource. A date field must
+  be specified; only records whose value in that field is greater than the time
+  of the last successful execution will be returned from the source.
+* Added an (untested) count feature which returns the number of rows for
+ processing.
+* If no natural key is defined then an empty array will now be used, resulting
+ in the row being written to the output without going through change checks.
+* Mapping argument in destination is now optional. An empty hash will be used
+ if the mapping hash is not specified. If the mapping hash is not specified
+ then the order will be determined using the originating source's order.
+* ActiveRecord configurations loaded from database.yml by the etl tool will be
+ merged with ActiveRecord::Base.configurations.
+* Fixed several bugs in how record change detection was implemented.
+* Fixed how the read_locally functionality was implemented so that it will find
+  the last completed local source copy using the source's trigger file (untested).
+
+0.8.1 - Apr 12, 2007
+* Added EnumerableSource
+* Added :type configuration option to the source directive, allowing the source
+  type to be explicitly specified. The source type can be a string or symbol
+  (in which case the class will be constructed by appending Source to the type
+  name), a class (which will be instantiated and passed the control,
+  configuration and mapping), or an actual Source instance.
+
+0.8.2 - April 15, 2007
+* Fixed bug with premature destination closing.
+* Added indexes to execution records table.
+* Added a PrintRowProcessor.
+* Added support for conditions and "group by" in the database source.
+* Added after_initialize hook in Processor base class.
+* Added examples directory
+
+0.8.3 - May 13, 2007
+* Added patches from Andy Triboletti
+
+0.8.4 - May 24, 2007
+* Added fix for backslash in file writer
+
+0.9.0 - August 9, 2007
+* Added support for batch processing through .ebf files. These files are
+ essentially control files that apply settings to an entire ETL process.
+* Implemented support for screen blocks. These blocks can be used to test
+ the data and raise an error if the screens do not pass.
+* Connections are now cached in a Hash available through
+ ETL::Engine.connection(name). This should be used rather than including
+ connection information in the control files.
+* Implemented temp table support throughout.
+* DateDimensionBuilder now included in ActiveWarehouse ETL directly.
+* Time calculations for fiscal year now included in ActiveWarehouse ETL.
+
+0.9.1 -
+* SQLResolver now uses ETL::Engine.table so it may utilize temp tables. (aeden)
+* Added Thibaut Barrère's encode processor.
+* Added MockSource and MockDestination test helpers (thbar)
+* Added the block processor. Can call a block once (pre/post processor)
+ or once for each row (after_read/before_write row processor) (thbar)
+* Changed temp table to use new AdapterExtension copy_table method (aeden)
+* Added bin/etl.cmd windows batch - just add the bin folder to your PATH
+ and it will let you call etl on an unpacked/pistoned version of AW-ETL (thbar)
+* Upgraded to support Rails 2.1. No longer compatible with older versions of Rails.
+* Added ETL::Builder::TimeDimensionBuilder
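
For the 0.9 connection caching noted above, a hedged sketch of how a control
file might use a cached connection; the connection and table names here are
assumptions:

    pre_process do
      # ETL::Engine.connection(name) returns the cached connection for a
      # name defined in database.yml (per the 0.9.0 entry above).
      conn = ETL::Engine.connection('operational')
      conn.execute('TRUNCATE TABLE staging_people')  # hypothetical staging table
    end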
8 HOW_TO_RELEASE
@@ -0,0 +1,8 @@
+cd trunk
+rake release
+cd ..
+svn cp trunk tags/release-x.y.z
+cd tags/release-x.y.z
+svn commit
+cd ../../trunk
+rake pdoc
7 LICENSE
@@ -0,0 +1,7 @@
+Copyright (c) 2006-2007 Anthony Eden
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
85 README
@@ -0,0 +1,85 @@
+Ruby Extract-Transform-Load (ETL) tool.
+
+== Requirements
+
+* Ruby 1.8.5 or higher
+* Rubygems
+
+== Online Documentation
+
+Available at http://activewarehouse.rubyforge.org/docs/activewarehouse-etl.html
+
+== Features
+
+Current supported features:
+
+* ETL Domain Specific Language (DSL) - Control files are specified in a Ruby-based DSL
+* Multiple source types. Current supported types:
+ * Fixed-width and delimited text files
+ * XML files through SAX
+ * Apache combined log format
+* Multiple destination types - file and database destinations
+* Support for extracting from multiple sources in a single job
+* Support for writing to multiple destinations in a single job
+* A variety of built-in transformations are included:
+ * Date-to-string, string-to-date, string-to-datetime, string-to-timestamp
+ * Type transformation supporting strings, integers, floats and big decimals
+ * Trim
+ * SHA-1
+ * Decode from an external decode file
+ * Default replacement for empty values
+ * Ordinalize
+ * Hierarchy lookup
+ * Foreign key lookup
+ * Ruby blocks
+ * Any custom transformation class
+* A variety of built-in row-level processors
+ * Check exists processor to determine if the record already exists in the destination database
+ * Check unique processor to determine whether a matching record was processed during this job execution
+ * Copy field
+ * Rename field
+ * Hierarchy exploder which takes a tree structure defined through a parent id and explodes it into a hierarchy bridge table
+ * Surrogate key generator including support for looking up the last surrogate key from the target table using a custom query
+ * Sequence generator including support for context-sensitive sequences where the context can be defined as a combination of fields from the source data
+ * New row-level processors can easily be defined and applied
+* Pre-processing
+ * Truncate processor
+* Post-processing
+ * Bulk import using native RDBMS bulk loader tools
+* Virtual fields - Add a field to the destination data which doesn't exist in the source data
+* Built in job and record meta data
+* Support for type 1 and type 2 slowly changing dimensions
+ * Automated effective date and end date time stamping for type 2
+ * CRC checking
+
+== Dependencies
+ActiveWarehouse ETL depends on the following gems:
+* ActiveSupport Gem
+* ActiveRecord Gem
+* FasterCSV Gem
+* AdapterExtensions Gem
+
+== Usage
+Once the ActiveWarehouse ETL gem is installed, jobs can be invoked using the
+included `etl` script. The etl script includes several command line options
+and can process multiple control files at a time.
+
+Command line options:
+* <tt>--help, -h</tt>: Display the usage message.
+* <tt>--config, -c</tt>: Specify a database.yml configuration file to use.
+* <tt>--limit, -l</tt>: Specify a limit to the number of rows to process. This option is currently only applicable to database sources.
+* <tt>--offset, -o</tt>: Specify the start offset for reading from the source. This option is currently only applicable to database sources.
+* <tt>--newlog, -n</tt>: Instruct the engine to create a new ETL log rather than append to the last ETL log.
+* <tt>--skip-bulk-import, -s</tt>: Skip any bulk imports.
+* <tt>--read-locally</tt>: Read from the local cache (skip source extraction)
+
+== Control File Examples
+Control file examples can be found in the examples directory.
+
+== Running Tests
+The tests require Shoulda 1.x.
+
+== Feedback
+This is a work in progress. Comments should be made on the
+activewarehouse-discuss mailing list at the moment. Contributions are always
+welcome.
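
To make the DSL concrete, here is a minimal, hypothetical control file
(people.ctl) using only features listed above; the field names and file paths
are illustrative:

    source :in, {
      :file => 'people.txt',
      :parser => :delimited
    },
    [
      :first_name,
      :last_name,
      :email
    ]

    # A block transform receives the field name, value and row.
    transform(:email) { |name, value, row| value.to_s.downcase }

    destination :out, {
      :file => 'people_output.txt'
    },
    {
      :order => [:first_name, :last_name, :email]
    }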
153 Rakefile
@@ -0,0 +1,153 @@
+require 'rake'
+require 'rake/testtask'
+require 'rake/rdoctask'
+require 'rake/packagetask'
+require 'rake/gempackagetask'
+require 'rake/contrib/rubyforgepublisher'
+
+require File.join(File.dirname(__FILE__), 'lib/etl', 'version')
+
+module AWETL
+ PKG_BUILD = ENV['PKG_BUILD'] ? '.' + ENV['PKG_BUILD'] : ''
+ PKG_NAME = 'activewarehouse-etl'
+ PKG_VERSION = ETL::VERSION::STRING + PKG_BUILD
+ PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"
+ PKG_DESTINATION = ENV["PKG_DESTINATION"] || "../#{PKG_NAME}"
+end
+
+desc 'Default: run unit tests.'
+task :default => :test
+
+desc 'Test the ETL application.'
+Rake::TestTask.new(:test) do |t|
+ t.libs << 'lib'
+ t.pattern = 'test/**/*_test.rb'
+ t.verbose = true
+ # TODO: reset the database
+end
+
+desc 'Generate documentation for the ETL application.'
+Rake::RDocTask.new(:rdoc) do |rdoc|
+ rdoc.rdoc_dir = 'rdoc'
+ rdoc.title = 'ActiveWarehouse ETL'
+ rdoc.options << '--line-numbers' << '--inline-source'
+ rdoc.rdoc_files.include('README')
+ rdoc.rdoc_files.include('lib/**/*.rb')
+end
+
+namespace :rcov do
+ desc 'Measures test coverage'
+ task :test do
+ rm_f 'coverage.data'
+ mkdir 'coverage' unless File.exist?('coverage')
+ rcov = "rcov --aggregate coverage.data --text-summary -Ilib"
+ system("#{rcov} test/*_test.rb")
+ # system("open coverage/index.html") if PLATFORM['darwin']
+ end
+end
+
+# Gem Spec
+
+module AWETL
+ def self.package_files(package_prefix)
+ FileList[
+ "#{package_prefix}CHANGELOG",
+ "#{package_prefix}LICENSE",
+ "#{package_prefix}README",
+ "#{package_prefix}TODO",
+ "#{package_prefix}Rakefile",
+ "#{package_prefix}bin/**/*",
+ "#{package_prefix}doc/**/*",
+ "#{package_prefix}lib/**/*",
+ "#{package_prefix}examples/**/*",
+ ] - [ "#{package_prefix}test" ]
+ end
+
+ def self.spec(package_prefix = '')
+ Gem::Specification.new do |s|
+ s.name = 'activewarehouse-etl'
+ s.version = AWETL::PKG_VERSION
+ s.summary = "Pure Ruby ETL package."
+ s.description = <<-EOF
+ ActiveWarehouse ETL is a pure Ruby Extract-Transform-Load application for loading data into a database.
+ EOF
+
+ s.add_dependency('rake', '>= 0.7.1')
+ s.add_dependency('activesupport', '>= 1.3.1')
+ s.add_dependency('activerecord', '>= 1.14.4')
+ s.add_dependency('fastercsv', '>= 1.2.0')
+ s.add_dependency('adapter_extensions', '>= 0.1.0')
+
+ s.rdoc_options << '--exclude' << '.'
+ s.has_rdoc = false
+
+ s.files = package_files(package_prefix).to_a.delete_if {|f| f.include?('.svn')}
+ s.require_path = 'lib'
+
+ s.bindir = "#{package_prefix}bin" # Use these for applications.
+ s.executables = ['etl']
+ s.default_executable = "etl"
+
+ s.author = "Anthony Eden"
+ s.email = "anthonyeden@gmail.com"
+ s.homepage = "http://activewarehouse.rubyforge.org/etl"
+ s.rubyforge_project = "activewarehouse"
+ end
+ end
+end
+
+Rake::GemPackageTask.new(AWETL.spec) do |pkg|
+ pkg.gem_spec = AWETL.spec
+ pkg.need_tar = true
+ pkg.need_zip = true
+end
+
+desc "Generate code statistics"
+task :lines do
+ lines, codelines, total_lines, total_codelines = 0, 0, 0, 0
+
+ for file_name in FileList["lib/**/*.rb"]
+ next if file_name =~ /vendor/
+ f = File.open(file_name)
+
+ while line = f.gets
+ lines += 1
+ next if line =~ /^\s*$/
+ next if line =~ /^\s*#/
+ codelines += 1
+ end
+ puts "L: #{sprintf("%4d", lines)}, LOC #{sprintf("%4d", codelines)} | #{file_name}"
+
+ total_lines += lines
+ total_codelines += codelines
+
+ lines, codelines = 0, 0
+ end
+
+ puts "Total: Lines #{total_lines}, LOC #{total_codelines}"
+end
+
+desc "Publish the release files to RubyForge."
+task :release => [ :package ] do
+ `rubyforge login`
+
+ for ext in %w( gem tgz zip )
+ release_command = "rubyforge add_release activewarehouse #{AWETL::PKG_NAME} 'REL #{AWETL::PKG_VERSION}' pkg/#{AWETL::PKG_NAME}-#{AWETL::PKG_VERSION}.#{ext}"
+ puts release_command
+ system(release_command)
+ end
+end
+
+desc "Publish the API documentation"
+task :pdoc => [:rdoc] do
+ Rake::SshDirPublisher.new("aeden@rubyforge.org", "/var/www/gforge-projects/activewarehouse/etl/rdoc", "rdoc").upload
+end
+
+desc "Reinstall the gem from a local package copy"
+task :reinstall => [:package] do
+ windows = RUBY_PLATFORM =~ /mswin/
+ sudo = windows ? '' : 'sudo'
+ gem = windows ? 'gem.bat' : 'gem'
+ `#{sudo} #{gem} uninstall #{AWETL::PKG_NAME} -x`
+ `#{sudo} #{gem} install pkg/#{AWETL::PKG_NAME}-#{AWETL::PKG_VERSION}`
+end
28 TODO
@@ -0,0 +1,28 @@
+TODO
+
+* Add built-in support for audit_dimension
+* Do not rerun the processing if it isn't needed, i.e. the source and control files have not been modified (allow forced override)
+* Provide greater control in error handling
+** Allow an error threshold
+** Don't die completely on a parse error; just stop processing that specific file if the error threshold is reached
+** Allow mismatched row length errors in the delimited parser to be ignored
+* Improve error messages throughout, but especially in problems with the control files
+* Add support for partitioned views during the insert process. Use specifiable columns as the trigger columns for determining the data output destination.
+* Check if a temp table exists and the last job run was successful, in which case skip during the current run
+* Create models for each of the tables in each of the databases defined in ETL::Engine.connections
+
+Audit Record
+
+Process-Level
+ * Start Time
+ * End Time
+ * (Duration)
+ * Rows Read
+ * Rows Written
+ * Rows Rejected
+ * Errors
+ * Destination
+Record-Level
+ * Source
+ * Timestamp
+ * Transformation Log
78 active_support_logger.patch
@@ -0,0 +1,78 @@
+Index: lib/active_support/clean_logger.rb
+===================================================================
+--- lib/active_support/clean_logger.rb (revision 5963)
++++ lib/active_support/clean_logger.rb (working copy)
+@@ -1,10 +1,21 @@
+ require 'logger'
+ require File.dirname(__FILE__) + '/core_ext/class/attribute_accessors'
+
+-class Logger #:nodoc:
++# Extensions to the built in Ruby logger.
++#
++# If you want to use the default log formatter as defined in the Ruby core, then you
++# will need to set the formatter for the logger as in:
++#
++# logger.formatter = Formatter.new
++#
++# You can then specify the datetime format, for example:
++#
++# logger.datetime_format = "%Y-%m-%d"
++class Logger
++ # Set to false to disable the silencer
+ cattr_accessor :silencer
+ self.silencer = true
+-
++
+ # Silences the logger for the duration of the block.
+ def silence(temporary_level = Logger::ERROR)
+ if silencer
+@@ -18,6 +29,35 @@
+ yield self
+ end
+ end
++
++ alias :old_datetime_format= :datetime_format=
++ # Logging date-time format (string passed to +strftime+). Ignored if the formatter
++ # does not respond to datetime_format=.
++ def datetime_format=(datetime_format)
++ formatter.datetime_format = datetime_format if formatter.respond_to?(:datetime_format=)
++ end
++
++ alias :old_datetime_format :datetime_format
++ # Get the logging datetime format. Returns nil if the formatter does not support
++ # datetime formatting.
++ def datetime_format
++ formatter.datetime_format if formatter.respond_to?(:datetime_format)
++ end
++
++ alias :old_formatter :formatter
++ # Get the current formatter. The default formatter is a SimpleFormatter which only
++ # displays the log message
++ def formatter
++ @formatter ||= SimpleFormatter.new
++ end
++
++ # Simple formatter which only displays the message.
++ class SimpleFormatter < Logger::Formatter
++ # This method is invoked when a log event occurs
++ def call(severity, timestamp, progname, msg)
++ "#{msg}\n"
++ end
++ end
+
+ private
+ alias old_format_message format_message
+@@ -28,11 +68,11 @@
+ # with Logger from 1.8.3 and vice versa.
+ if method_defined?(:formatter=)
+ def format_message(severity, timestamp, progname, msg)
+- "#{msg}\n"
++ formatter.call(severity, timestamp, progname, msg)
+ end
+ else
+ def format_message(severity, timestamp, msg, progname)
+- "#{msg}\n"
++ formatter.call(severity, timestamp, progname, msg)
+ end
+ end
+ end
28 bin/etl
@@ -0,0 +1,28 @@
+#!/usr/bin/env ruby
+
+#--
+# Copyright (c) 2006 Anthony Eden
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#++
+
+$:.unshift(File.dirname(__FILE__) + '/../lib/')
+require 'etl'
+require 'etl/commands/etl'
8 bin/etl.cmd
@@ -0,0 +1,8 @@
+@echo off
+
+rem The purpose of this Windows script is to let you use the etl command line with a non-gem version of AW-ETL (eg: unpacked gem, pistoned trunk).
+rem Just add the current folder on top of your PATH variable to use it instead of the etl command provided with the gem release.
+
+rem %~dp0 returns the absolute path where the current script is. We just append 'etl' to it, and forward all the arguments with %*
+
+ruby "%~dp0etl" %*
16 examples/database.example.yml
@@ -0,0 +1,16 @@
+etl_execution:
+ adapter: mysql
+ username: root
+ host: localhost
+ database: etl_execution
+ encoding: utf8
+datawarehouse:
+ adapter: mysql
+ username: root
+ host: localhost
+ database: datawarehouse_development
+operational:
+ adapter: mysql
+ username: root
+ host: localhost
+ database: operational_production
78 lib/etl.rb
@@ -0,0 +1,78 @@
+# This source file requires all of the necessary gems and source files for ActiveWarehouse ETL. If you
+# load this source file, all of the other required files and gems will also be brought into the
+# runtime.
+
+#--
+# Copyright (c) 2006-2007 Anthony Eden
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#++
+
+require 'logger'
+require 'yaml'
+require 'erb'
+
+require 'rubygems'
+
+unless defined?(REXML::VERSION)
+ require 'rexml/rexml'
+ REXML::VERSION = REXML::Version
+end
+
+require 'active_support'
+require 'active_record'
+require 'adapter_extensions'
+require 'faster_csv'
+
+$:.unshift(File.dirname(__FILE__))
+
+require 'etl/core_ext'
+require 'etl/util'
+require 'etl/http_tools'
+require 'etl/builder'
+require 'etl/version'
+require 'etl/engine'
+require 'etl/control'
+require 'etl/batch'
+require 'etl/row'
+require 'etl/parser'
+require 'etl/transform'
+require 'etl/processor'
+require 'etl/generator'
+require 'etl/screen'
+
+module ETL #:nodoc:
+ class ETLError < StandardError #:nodoc:
+ end
+ class ControlError < ETLError #:nodoc:
+ end
+ class DefinitionError < ControlError #:nodoc:
+ end
+ class ConfigurationError < ControlError #:nodoc:
+ end
+ class MismatchError < ETLError #:nodoc:
+ end
+ class ResolverError < ETLError #:nodoc:
+ end
+ class ScreenError < ETLError #:nodoc:
+ end
+ class FatalScreenError < ScreenError #:nodoc:
+ end
+end
2  lib/etl/batch.rb
@@ -0,0 +1,2 @@
+require 'etl/batch/batch'
+require 'etl/batch/directives'
111 lib/etl/batch/batch.rb
@@ -0,0 +1,111 @@
+module ETL #:nodoc:
+ module Batch
+ class Context
+ attr_reader :batch
+
+ class << self
+ # Create a context that is used when evaluating the batch file
+ def create(batch)
+ Context.new(batch).get_binding
+ end
+ end
+
+ def initialize(batch)
+ @batch = batch
+ end
+
+ def file
+ batch.file
+ end
+
+ def get_binding
+ binding
+ end
+
+ def run(file)
+ batch.run(File.dirname(self.file) + "/" + file)
+ end
+
+ def use_temp_tables(value=true)
+ batch.use_temp_tables(value)
+ end
+
+ end
+ class Batch
+ attr_accessor :file
+ attr_accessor :engine
+
+ class << self
+      # Resolve the given object to an ETL::Batch::Batch instance. Acceptable arguments
+      # are:
+      # * The path to a batch file as a String
+      # * A File object referencing the batch file
+      # * The ETL::Batch::Batch object (which will just be returned)
+      #
+      # Raises a RuntimeError if any other type is given
+ def resolve(batch, engine)
+ batch = do_resolve(batch)
+ batch.engine = engine
+ batch
+ end
+
+ protected
+ def parse(batch_file)
+ batch_file = batch_file.path if batch_file.instance_of?(File)
+ batch = ETL::Batch::Batch.new(batch_file)
+ eval(IO.readlines(batch_file).join("\n"), Context.create(batch), batch_file)
+ batch
+ end
+
+ def do_resolve(batch)
+ case batch
+ when String
+ ETL::Batch::Batch.parse(File.new(batch))
+ when File
+ ETL::Batch::Batch.parse(batch)
+ when ETL::Batch::Batch
+ batch
+ else
+ raise RuntimeError, "Batch must be a String, File or Batch object"
+ end
+ end
+ end
+
+ def initialize(file)
+ @file = file
+ end
+
+ def run(file)
+ directives << Run.new(self, file)
+ end
+
+ def use_temp_tables(value = true)
+ directives << UseTempTables.new(self)
+ end
+
+ def execute
+ engine.say "Executing batch"
+ before_execute
+ directives.each do |directive|
+ directive.execute
+ end
+ engine.say "Finishing batch"
+ after_execute
+ engine.say "Batch complete"
+ end
+
+ def directives
+ @directives ||= []
+ end
+
+ def before_execute
+
+ end
+
+ def after_execute
+ ETL::Engine.finish # TODO: should be moved to the directive?
+ ETL::Engine.use_temp_tables = false # reset the temp tables
+ end
+ end
+ end
+end
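
A hypothetical batch (.ebf) file for the DSL above; the control file names
are illustrative. Paths passed to run are resolved relative to the batch
file itself (see Context#run):

    use_temp_tables
    run 'load_people.ctl'   # executed in the order listed
    run 'load_orders.ctl'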
55 lib/etl/batch/directives.rb
@@ -0,0 +1,55 @@
+module ETL #:nodoc:
+ module Batch #:nodoc:
+ # Abstract base class for directives
+ class Directive
+ # Method to access the batch object
+ attr_reader :batch
+
+ # Initialize the directive with the given batch object
+ def initialize(batch)
+ @batch = batch
+ end
+
+ # Execute the directive
+ def execute
+ do_execute
+ end
+
+ protected
+ # Implemented by subclasses
+ def do_execute
+ raise RuntimeError, "Directive must implement do_execute method"
+ end
+ end
+
+ # Directive indicating that the specified ETL control file should be
+ # run
+ class Run < Directive
+ # The file to execute
+ attr_reader :file
+
+ # Initialize the directive with the given batch object and file
+ def initialize(batch, file)
+ super(batch)
+ @file = file
+ end
+
+ protected
+ # Execute the process
+ def do_execute
+ batch.engine.process(file)
+ end
+ end
+
+ # Directive indicating temp tables should be used.
+ class UseTempTables < Directive
+ def initialize(batch)
+ super(batch)
+ end
+ protected
+ def do_execute
+ ETL::Engine.use_temp_tables = true
+ end
+ end
+ end
+end
2  lib/etl/builder.rb
@@ -0,0 +1,2 @@
+require 'etl/builder/date_dimension_builder'
+require 'etl/builder/time_dimension_builder'
96 lib/etl/builder/date_dimension_builder.rb
@@ -0,0 +1,96 @@
+module ETL #:nodoc:
+ module Builder #:nodoc:
+ # A builder which will build a data structure which can be used to populate a date dimension using
+ # commonly used date dimension columns.
+ class DateDimensionBuilder
+ # Specify the start date for the first record
+ attr_accessor :start_date
+
+ # Specify the end date for the last record
+ attr_accessor :end_date
+
+ # Define any holiday indicators
+ attr_accessor :holiday_indicators
+
+ # Add offset month for fiscal year
+ attr_accessor :fiscal_year_offset_month
+
+ # Define the weekday indicators. The default array begins on Sunday and goes to Saturday.
+ cattr_accessor :weekday_indicators
+ @@weekday_indicators = ['Weekend','Weekday','Weekday','Weekday','Weekday','Weekday','Weekend']
+
+ # Initialize the builder.
+ #
+      # * <tt>start_date</tt>: The start date. Defaults to 5 years before today.
+      # * <tt>end_date</tt>: The end date. Defaults to now.
+      # * <tt>fiscal_year_offset_month</tt>: The month in which the fiscal year begins. Defaults to 10.
+ def initialize(start_date=Time.now.years_ago(5), end_date=Time.now, fiscal_year_offset_month=10)
+ @start_date = start_date.to_date
+ @end_date = end_date.to_date
+ @fiscal_year_offset_month = fiscal_year_offset_month.to_i
+ @holiday_indicators = []
+ end
+
+ # Returns an array of hashes representing records in the dimension.
+ def build(options={})
+ (start_date..end_date).map { |date| record_from_date(date) }
+ end
+
+ private
+
+ # Returns a hash representing a record in the dimension. The values for each record are
+ # accessed by name.
+ def record_from_date(date)
+ time = date.to_time # need methods only available in Time
+ record = {}
+ record[:date] = time.strftime("%m/%d/%Y")
+ record[:full_date_description] = time.strftime("%B %d,%Y")
+ record[:day_of_week] = time.strftime("%A")
+ record[:day_in_week] = record[:day_of_week] # alias
+ #record[:day_number_in_epoch] = time.to_i / 24
+ #record[:week_number_in_epoch] = time.to_i / (24 * 7)
+ #record[:month_number_in_epoch] = time.to_i / (24 * 7 * 30)
+ record[:day_number_in_calendar_month] = time.day
+ record[:day_number_in_calendar_year] = time.yday
+ record[:day_number_in_fiscal_month] = time.day # should this be different from CY?
+ record[:day_number_in_fiscal_year] = time.fiscal_year_yday(fiscal_year_offset_month)
+ #record[:last_day_in_week_indicator] =
+ #record[:last_day_in_month_indicator] =
+ #record[:calendar_week_ending_date] =
+ record[:calendar_week] = "Week #{time.week}"
+ record[:calendar_week_number] = time.week
+ record[:calendar_week_number_in_year] = time.week # DEPRECATED
+ record[:calendar_month_name] = time.strftime("%B")
+ record[:calendar_month_number_in_year] = time.month # DEPRECATED
+ record[:calendar_month_number] = time.month
+ record[:calendar_year_month] = time.strftime("%Y-%m")
+ record[:calendar_quarter] = "Q#{time.quarter}"
+ record[:calendar_quarter_number] = time.quarter
+ record[:calendar_quarter_number_in_year] = time.quarter # DEPRECATED
+ record[:calendar_year_quarter] = "#{time.strftime('%Y')}-#{record[:calendar_quarter]}"
+ #record[:calendar_half_year] =
+ record[:calendar_year] = "#{time.year}"
+ record[:fiscal_week] = "FY Week #{time.fiscal_year_week(fiscal_year_offset_month)}"
+ record[:fiscal_week_number_in_year] = time.fiscal_year_week(fiscal_year_offset_month) # DEPRECATED
+ record[:fiscal_week_number] = time.fiscal_year_week(fiscal_year_offset_month)
+ record[:fiscal_month] = time.fiscal_year_month(fiscal_year_offset_month)
+ record[:fiscal_month_number] = time.fiscal_year_month(fiscal_year_offset_month)
+ record[:fiscal_month_number_in_year] = time.fiscal_year_month(fiscal_year_offset_month) # DEPRECATED
+ record[:fiscal_year_month] = "FY#{time.fiscal_year(fiscal_year_offset_month)}-" + time.fiscal_year_month(fiscal_year_offset_month).to_s.rjust(2, '0')
+ record[:fiscal_quarter] = "FY Q#{time.fiscal_year_quarter(fiscal_year_offset_month)}"
+ record[:fiscal_year_quarter] = "FY#{time.fiscal_year(fiscal_year_offset_month)}-Q#{time.fiscal_year_quarter(fiscal_year_offset_month)}"
+ record[:fiscal_quarter_number] = time.fiscal_year_quarter(fiscal_year_offset_month) # DEPRECATED
+ record[:fiscal_year_quarter_number] = time.fiscal_year_quarter(fiscal_year_offset_month)
+ #record[:fiscal_half_year] =
+ record[:fiscal_year] = "FY#{time.fiscal_year(fiscal_year_offset_month)}"
+ record[:fiscal_year_number] = time.fiscal_year(fiscal_year_offset_month)
+ record[:holiday_indicator] = holiday_indicators.include?(date) ? 'Holiday' : 'Nonholiday'
+ record[:weekday_indicator] = weekday_indicators[time.wday]
+ record[:selling_season] = 'None'
+ record[:major_event] = 'None'
+ record[:sql_date_stamp] = date
+
+ record
+ end
+ end
+ end
+end
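
A short usage sketch for the builder above; the dates are illustrative:

    builder = ETL::Builder::DateDimensionBuilder.new(
      Time.utc(2002, 1, 1),   # start_date
      Time.utc(2007, 12, 31)  # end_date; fiscal_year_offset_month defaults to 10
    )
    records = builder.build
    records.first[:day_of_week]       # => "Tuesday"
    records.first[:calendar_quarter]  # => "Q1"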
31 lib/etl/builder/time_dimension_builder.rb
@@ -0,0 +1,31 @@
+module ETL #:nodoc:
+ module Builder #:nodoc:
+ # Builder that creates a simple time dimension.
+ class TimeDimensionBuilder
+      def initialize
+      end
+
+      # Returns an array of hashes representing records in the dimension. The values for each record are
+      # accessed by name.
+      def build(options={})
+        records = []
+        0.upto(23) do |t_hour|
+          0.upto(59) do |t_minute|
+            0.upto(59) do |t_second|
+              t_hour_string = t_hour.to_s.rjust(2, '0')
+              t_minute_string = t_minute.to_s.rjust(2, '0')
+              t_second_string = t_second.to_s.rjust(2, '0')
+              record = {}
+              record[:hour] = t_hour
+              record[:minute] = t_minute
+              record[:second] = t_second
+              record[:minute_description] = "#{t_hour_string}:#{t_minute_string}"
+              record[:full_description] = "#{t_hour_string}:#{t_minute_string}:#{t_second_string}"
+              records << record
+            end
+          end
+        end
+        records
+      end
+ end
+ end
+end
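
Usage mirrors the date dimension builder; a sketch:

    records = ETL::Builder::TimeDimensionBuilder.new.build
    records.length                    # => 86400 (one row per second of the day)
    records.first[:full_description]  # => "00:00:00"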
89 lib/etl/commands/etl.rb
@@ -0,0 +1,89 @@
+#--
+# Copyright (c) 2006 Anthony Eden
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#++
+
+require 'benchmark'
+require 'getoptlong'
+
+# Print a usage statement
+def usage #:nodoc:
+ puts "Usage: etl file [file file ...]" # TODO: add the command line options
+end
+
+def execute
+ opts = GetoptLong.new(
+ [ '--version', '-v', GetoptLong::NO_ARGUMENT],
+ [ '--help', '-h', GetoptLong::NO_ARGUMENT ],
+ [ '--config', '-c', GetoptLong::REQUIRED_ARGUMENT ],
+ [ '--limit', '-l', GetoptLong::REQUIRED_ARGUMENT ],
+ [ '--offset', '-o', GetoptLong::REQUIRED_ARGUMENT],
+ [ '--newlog', '-n', GetoptLong::NO_ARGUMENT ],
+ [ '--skip-bulk-import', '-s', GetoptLong::NO_ARGUMENT ],
+ [ '--read-locally', GetoptLong::NO_ARGUMENT],
+ [ '--rails-root', GetoptLong::REQUIRED_ARGUMENT]
+ )
+
+ options = {}
+ opts.each do |opt, arg|
+ case opt
+ when '--version'
+ puts "ActiveWarehouse ETL version #{ETL::VERSION::STRING}"
+ return
+ when '--help'
+ usage
+ return
+ when '--config'
+ options[:config] = arg
+ when '--limit'
+ options[:limit] = arg.to_i
+ when '--offset'
+ options[:offset] = arg.to_i
+ when '--newlog'
+ options[:newlog] = true
+ when '--skip-bulk-import'
+ puts "skip bulk import enabled"
+ options[:skip_bulk_import] = true
+ when '--read-locally'
+ puts "read locally enabled"
+ options[:read_locally] = true
+ when '--rails-root'
+ options[:rails_root] = arg
+ puts "rails root set to #{options[:rails_root]}"
+ end
+ end
+
+ if ARGV.length < 1
+ usage
+ else
+ puts "Starting ETL process"
+
+ ETL::Engine.init(options)
+ ARGV.each do |f|
+ ETL::Engine.realtime_activity = true
+ ETL::Engine.process(f)
+ end
+
+ puts "ETL process complete\n\n"
+ end
+end
+
+execute
3  lib/etl/control.rb
@@ -0,0 +1,3 @@
+require 'etl/control/control'
+require 'etl/control/source'
+require 'etl/control/destination'
403 lib/etl/control/control.rb
@@ -0,0 +1,403 @@
+module ETL #:nodoc:
+ module Control #:nodoc:
+ # The Context is passed to eval.
+ class Context
+ require 'test/unit/assertions'
+ include Test::Unit::Assertions
+ attr_reader :control
+
+ class << self
+ # Create a Context instance
+ def create(control)
+ Context.new(control).get_binding
+ end
+ end
+
+ # Initialize the context
+ def initialize(control)
+ @control = control
+ end
+
+ # Get the control file
+ def file
+ control.file
+ end
+
+ # Set the allowed error threshold
+ def set_error_threshold(error_threshold)
+ control.error_threshold = error_threshold
+ end
+
+ # Define a list of control files that this file depends on. Those control
+ # files will be executed prior to this control file. The list may
+ # contain symbols that will be converted to file names by calling
+ # to_s + '.ctl', or they may be strings in which case they will be used
+ # as is
+ def depends_on(*args)
+ (dependencies << args).flatten!
+ end
+
+ # Get the defined dependencies
+ def dependencies
+ control.dependencies
+ end
+
+ # Define a source.
+ def source(name, configuration={}, definition={})
+ if configuration[:type]
+ case configuration[:type]
+ when Class
+ source_class = configuration[:type]
+ sources << source_class.new(self, configuration, definition)
+ when String, Symbol
+ source_class = ETL::Control::Source.class_for_name(configuration[:type])
+ sources << source_class.new(self, configuration, definition)
+ else
+ if configuration[:type].is_a?(ETL::Control::Source)
+ sources << configuration[:type]
+ else
+ raise ControlError, "Type must be a Class, String, Symbol or object extending ETL::Control::Source"
+ end
+ end
+ else
+        source_found = false
+        source_types.each do |source_type|
+          if configuration[source_type]
+            source_class = ETL::Control::Source.class_for_name(source_type)
+            sources << source_class.new(self, configuration, definition)
+            source_found = true
+            break
+          end
+        end
+        raise ControlError, "A source was specified but no matching type was found" unless source_found
+ end
+ end
+
+      # Get the defined sources
+ def sources
+ control.sources
+ end
+
+ # Define a destination
+ def destination(name, configuration={}, mapping={})
+ if configuration[:type]
+ case configuration[:type]
+ when Class
+ dest_class = configuration[:type]
+ destinations << dest_class.new(self, configuration, mapping)
+ when String, Symbol
+ dest_class = ETL::Control::Destination.class_for_name(configuration[:type])
+ destinations << dest_class.new(self, configuration, mapping)
+ else
+ if configuration[:type].is_a?(ETL::Control::Destination)
+ destinations << configuration[:type]
+ else
+ raise ControlError, "Type must be a Class, String, Symbol or object extending ETL::Control::Destination"
+ end
+ end
+ else
+        destination_found = false
+        destination_types.each do |dest_type|
+          if configuration[dest_type]
+            dest_class = ETL::Control::Destination.class_for_name(dest_type)
+            destinations << dest_class.new(self, configuration, mapping)
+            destination_found = true
+            break
+          end
+        end
+        raise ControlError, "A destination was specified but no matching destination type was found" unless destination_found
+ end
+ end
+
+ # Get the defined destinations
+ def destinations
+ control.destinations
+ end
+
+ # Define a transform
+ def transform(name, transformer=nil, configuration={}, &block)
+ if transformer
+ case transformer
+ when String, Symbol
+ class_name = "#{transformer.to_s.camelize}Transform"
+ begin
+ transform_class = ETL::Transform.const_get(class_name)
+ transforms << transform_class.new(self, name, configuration)
+ rescue NameError => e
+ raise ControlError, "Unable to find transformer #{class_name}: #{e}"
+ end
+ else
+ #transformer.class.inspect
+ if transformer.is_a?(ETL::Transform::Transform)
+ Engine.logger.debug "Adding transformer #{transformer.inspect} for field #{name}"
+ t = transformer.dup
+ t.name = name
+ transforms << t
+ else
+ raise ControlError, "Transformer must be a String, Symbol or Transform instance"
+ end
+ end
+ elsif block_given?
+ transforms << ETL::Transform::BlockTransform.new(self, name, :block => block)
+ else
+ raise ControlError, "Either a transformer or a block must be specified"
+ end
+ end
+
+ # Get the defined transforms
+ def transforms
+ control.transforms
+ end
+
+ # Define a before post-process screen block. The type argument must be
+ # one of :fatal, :error or :warn
+ def screen(type, &block)
+ screens[type] << block
+ end
+
+ # Get the before post-process screen blocks
+ def screens
+ control.screens
+ end
+
+      # Define an after post-process screen block. The type argument must be
+      # one of :fatal, :error or :warn
+ def after_post_process_screen(type, &block)
+ after_post_process_screens[type] << block
+ end
+
+ # Get the after post-process screen blocks
+ def after_post_process_screens
+ control.after_post_process_screens
+ end
+
+ # Rename the source field to the destination field
+ def rename(source, destination)
+ after_read :rename, :source => source, :dest => destination
+ end
+
+ # Copy the source field to the destination field
+ def copy(source, destination)
+ after_read :copy_field, :source => source, :dest => destination
+ end
+
+ protected
+      # This method is used to define a processor and insert it into the
+      # specified processor collection.
+ def define_processor(name, processor_collection, configuration, proc)
+ case name
+ when String, Symbol, nil
+ name ||= 'block'
+ class_name = "#{name.to_s.camelize}Processor"
+ begin
+ processor_class = ETL::Processor.const_get(class_name)
+            if name.to_s == 'block'
+ raise ControlError, "A block must be passed for block processor" if proc.nil?
+ configuration[:block] = proc
+ end
+ processor_collection << processor_class.new(self, configuration)
+ rescue NameError => e
+ raise ControlError, "Unable to find processor #{class_name}: #{e}"
+ end
+ when Class
+ processor_collection << name.new(self, configuration)
+ else
+ raise ControlError, "The process declaration requires a String, Symbol or Class, or a Block to be passed"
+ end
+ end
+
+ public
+ # Define an "after read" processor. This must be a row-level processor.
+ def after_read(name='block', configuration={}, &block)
+ define_processor(name, after_read_processors, configuration, block)
+ end
+
+ # Get the defined "after read" processors
+ def after_read_processors
+ control.after_read_processors
+ end
+
+ # Define a "before write" processor. This must be a row-level processor.
+ def before_write(name='block', configuration={}, &block)
+ define_processor(name, before_write_processors, configuration, block)
+ end
+
+ # Get the defined "before write" processors
+ def before_write_processors
+ control.before_write_processors
+ end
+
+ # Define a pre-processor
+ def pre_process(name='block', configuration={}, &block)
+ define_processor(name, pre_processors, configuration, block)
+ end
+
+ # Get the defined pre-processors
+ def pre_processors
+ control.pre_processors
+ end
+
+ # Define a post-processor
+ def post_process(name='block', configuration={}, &block)
+ define_processor(name, post_processors, configuration, block)
+ end
+
+ # Get the defined post-processors
+ def post_processors
+ control.post_processors
+ end
+
+ # Get the binding object
+ def get_binding
+ binding
+ end
+
+ protected
+ # Get an array of supported source types
+ def source_types
+ control.source_types
+ end
+
+ # Get an array of supported destination types
+ def destination_types
+ control.destination_types
+ end
+
+ end
+
+ # Object representation of a control file
+ class Control
+ # The File object
+ attr_reader :file
+
+ # The error threshold
+ attr_accessor :error_threshold
+
+ class << self
+ # Parse a control file and return a Control instance
+ def parse(control_file)
+ control_file = control_file.path if control_file.instance_of?(File)
+ control = ETL::Control::Control.new(control_file)
+ # TODO: better handling of parser errors. Return the line in the control file where the error occurs.
+ eval(IO.readlines(control_file).join("\n"), Context.create(control), control_file)
+ control.validate
+ control
+ end
+
+ def parse_text(text)
+ control = ETL::Control::Control.new(nil)
+ eval(text, Context.create(control), 'inline')
+ control.validate
+ control
+ end
+
+ # Resolve the given object to an ETL::Control::Control instance. Acceptable arguments
+ # are:
+ # * The path to a control file as a String
+ # * A File object referencing the control file
+ # * The ETL::Control::Control object (which will just be returned)
+ #
+ # Raises a ControlError if any other type is given
+ def resolve(control)
+ case control
+ when String
+ ETL::Control::Control.parse(File.new(control))
+ when File
+ ETL::Control::Control.parse(control)
+ when ETL::Control::Control
+ control
+ else
+ raise ControlError, "Control must be a String, File or Control object"
+ end
+ end
+ end
+
+ # Initialize the instance with the given File object
+ def initialize(file)
+ @file = file
+ end
+
+ # Get a list of dependencies
+ def dependencies
+ @dependencies ||= []
+ end
+
+      # Get the defined sources
+ def sources
+ @sources ||= []
+ end
+
+ # Get the defined destinations
+ def destinations
+ @destinations ||= []
+ end
+
+ # Get the transforms with the specified name
+ # def transform(name)
+# transforms[name] ||= []
+# end
+
+      # Get all of the "after read" processors
+      def after_read_processors
+ @after_read_processors ||= []
+ end
+
+ # Get all of the "before write" processors
+ def before_write_processors
+ @before_write_processors ||= []
+ end
+
+ # Get an Array of preprocessors
+ def pre_processors
+ @pre_processors ||= []
+ end
+
+ # Get an Array of post processors
+ def post_processors
+ @post_processors ||= []
+ end
+
+ # Get an Array of all transforms for this control
+ def transforms
+ @transforms ||= []
+ end
+
+ # A hash of the screens executed before post-process
+ def screens
+ @screens ||= {
+ :fatal => [],
+ :error => [],
+ :warn => []
+ }
+ end
+
+ # A hash of the screens executed after post-process
+ def after_post_process_screens
+ @after_post_process_screens ||= {
+ :fatal => [],
+ :error => [],
+ :warn => []
+ }
+ end
+
+ # Get the error threshold. Defaults to 100.
+ def error_threshold
+ @error_threshold ||= 100
+ end
+
+ # Validate the control file
+ def validate
+ #unless sources.length > 0
+ # raise ControlError, "Configuration must include one of the following for the source: #{source_types.join(',')}"
+ #end
+ #unless destinations.length > 0
+ # raise ControlError, "Configuration must include one of the following for the destination: #{destination_types.join(',')}"
+ #end
+ end
+
+ def source_types
+ [:file, :database]
+ end
+
+ def destination_types
+ [:file, :database]
+ end
+
+ end
+ end
+end
420 lib/etl/control/destination.rb
@@ -0,0 +1,420 @@
+module ETL #:nodoc:
+ module Control #:nodoc:
+ # Base class for destinations.
+ class Destination
+ # Read-only accessor for the ETL::Control::Control instance
+ attr_reader :control
+
+ # Read-only accessor for the configuration Hash
+ attr_reader :configuration
+
+ # Read-only accessor for the destination mapping Hash
+ attr_reader :mapping
+
+ # Accessor to the buffer size
+ attr_accessor :buffer_size
+
+ # Unique flag.
+ attr_accessor :unique
+
+ # A condition for writing
+ attr_accessor :condition
+
+ # An array of rows to append to the destination
+ attr_accessor :append_rows
+
+ class << self
+ # Get the destination class for the specified name.
+ #
+ # For example if name is :database or 'database' then the
+ # DatabaseDestination class is returned
+ def class_for_name(name)
+ ETL::Control.const_get("#{name.to_s.camelize}Destination")
+ end
+ end
+
+ # Initialize the destination
+ #
+ # Arguments:
+ # * <tt>control</tt>: The ETL::Control::Control instance
+ # * <tt>configuration</tt>: The configuration Hash
+ # * <tt>mapping</tt>: The mapping Hash
+ #
+ # Options:
+ # * <tt>:buffer_size</tt>: The output buffer size (default 100 records)
+ # * <tt>:condition</tt>: A conditional proc that must return true for the
+ # row to be written
+ # * <tt>:append_rows</tt>: An array of rows to append
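+ #
+ # A hypothetical configuration sketch (the field names and rows are
+ # assumptions):
+ #
+ #   {
+ #     :buffer_size => 100,
+ #     :condition => lambda { |row| row[:active] == true },
+ #     :append_rows => [{ :id => -1, :name => 'Unknown' }]
+ #   }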
+ def initialize(control, configuration, mapping)
+ @control = control
+ @configuration = configuration
+ @mapping = mapping
+ @buffer_size = configuration[:buffer_size] ||= 100
+ @condition = configuration[:condition]
+ @append_rows = configuration[:append_rows]
+ end
+
+ # Get the current row number
+ def current_row
+ @current_row ||= 1
+ end
+
+ # Write the given row
+ def write(row)
+ if @condition.nil? || @condition.call(row)
+ process_change(row)
+ end
+ flush if buffer.length >= buffer_size
+ end
+
+ # Abstract method: subclasses must implement flush to write out the buffer
+ def flush
+ raise NotImplementedError, "flush method must be implemented by subclasses"
+ end
+
+ # Abstract method: subclasses must implement close to finalize the destination
+ def close
+ raise NotImplementedError, "close method must be implemented by subclasses"
+ end
+
+ def errors
+ @errors ||= []
+ end
+
+ protected
+ # Access the buffer
+ def buffer
+ @buffer ||= []
+ end
+
+ # Access the generators map
+ def generators
+ @generators ||= {}
+ end
+
+ # Get the order of elements from the source order
+ def order_from_source
+ order = []
+ control.sources.first.definition.each do |item|
+ case item
+ when Hash
+ order << item[:name]
+ else
+ order << item
+ end
+ end
+ order
+ end
+
+ # Return true if the row is allowed. The row will not be allowed if the
+ # :unique option is specified in the configuration and the compound key
+ # already exists
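+ #
+ # For example, with <tt>:unique => [:first_name, :last_name]</tt> only the
+ # first row seen for each first/last name pair is written (the field
+ # names are hypothetical).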
+ def row_allowed?(row)
+ if unique
+ key = (unique.collect { |k| row[k] }).join('|')
+ return false if compound_key_constraints[key]
+ compound_key_constraints[key] = 1
+ end
+ return true
+ end
+
+ # Get a hash of compound key constraints. This is used to determine if a
+ # row can be written when the unique option is specified
+ def compound_key_constraints
+ @compound_key_constraints ||= {}
+ end
+
+ # Return fields which are Slowly Changing Dimension fields.
+ # Uses the scd_fields specified in the configuration. If that's
+ # missing, uses all of the row's fields.
+ def scd_fields(row)
+ @scd_fields ||= configuration[:scd_fields] || row.keys
+ end
+
+ def non_scd_fields(row)
+ @non_scd_fields ||= row.keys - natural_key - scd_fields(row) -
+ [primary_key, scd_effective_date_field, scd_end_date_field, scd_latest_version_field]
+ end
+
+ def non_evolving_fields
+ (Array(configuration[:scd][:non_evolving_fields]) << primary_key).uniq
+ end
+
+ def scd?
+ !configuration[:scd].nil?
+ end
+
+ def scd_type
+ scd? ? configuration[:scd][:type] : nil
+ end
+
+ # Get the Slowly Changing Dimension effective date field. Defaults to
+ # 'effective_date'.
+ def scd_effective_date_field
+ configuration[:scd][:effective_date_field] || :effective_date if scd?
+ end
+
+ # Get the Slowly Changing Dimension end date field. Defaults to
+ # 'end_date'.
+ def scd_end_date_field
+ configuration[:scd][:end_date_field] || :end_date if scd?
+ end
+
+ # Get the Slowly Changing Dimension latest version field. Defaults to
+ # 'latest_version'.
+ def scd_latest_version_field
+ configuration[:scd][:latest_version_field] || :latest_version if scd?
+ end
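+
+ # The SCD methods above all read from a single :scd configuration hash;
+ # a hypothetical sketch (all names are assumptions):
+ #
+ #   :scd => {
+ #     :type => 2,
+ #     :dimension_table => 'person_dimension',
+ #     :dimension_target => :data_warehouse,
+ #     :effective_date_field => :valid_from,
+ #     :end_date_field => :valid_to,
+ #     :latest_version_field => :is_current
+ #   }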
+
+ # Return the natural key field names, defaults to []
+ def natural_key
+ @natural_key ||= determine_natural_key
+ end
+
+ # Get the dimension table if specified
+ def dimension_table
+ @dimension_table ||= if scd?
+ ETL::Engine.table(configuration[:scd][:dimension_table], dimension_target) or raise ConfigurationError, "dimension_table setting required"
+ end
+ end
+
+ # Get the dimension target if specified
+ def dimension_target
+ @dimension_target ||= if scd?
+ configuration[:scd][:dimension_target] or raise ConfigurationError, "dimension_target setting required"
+ end
+ end
+
+ # Process a row to determine the change type
+ def process_change(row)
+ ETL::Engine.logger.debug "Processing row: #{row.inspect}"
+ return unless row
+
+ # Change processing can only occur if the natural key exists in the row
+ ETL::Engine.logger.debug "Checking for natural key existence"
+ unless has_natural_key?(row)
+ buffer << row
+ return
+ end
+
+ @timestamp = Time.now
+
+ # See if the scd_fields of the current record have changed
+ # from the last time this record was loaded into the data
+ # warehouse. If they match then throw away this row (no need
+ # to process). If they do not match then the record is an
+ # 'update'. If the record doesn't exist then it is an 'insert'
+ ETL::Engine.logger.debug "Checking record for SCD change"
+ if @existing_row = preexisting_row(row)
+ if has_scd_field_changes?(row)
+ process_scd_change(row)
+ else
+ process_scd_match(row)
+ end
+ else
+ schedule_new_record(row)
+ end
+ end
+
+ # Add any virtual fields to the row. Virtual fields will get their value
+ # from one of the following:
+ # * If the mapping is a Class, then an object which implements the next
+ # method
+ # * If the mapping is a Symbol, then the XGenerator where X is the
+ # classified symbol
+ # * If the mapping is a Proc, then it will be called with the row
+ # * Otherwise the value itself will be assigned to the field
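+ #
+ # A hypothetical :virtual mapping sketch (the field names and values are
+ # assumptions):
+ #
+ #   :virtual => {
+ #     :loaded_at => Proc.new { |row| Time.now },
+ #     :source_name => 'legacy_crm'
+ #   }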
+ def add_virtuals!(row)
+ if mapping[:virtual]
+ mapping[:virtual].each do |key,value|
+ # If the row already has the virtual set, assume that's correct
+ next if row[key]
+ # Engine.logger.debug "Mapping virtual #{key}/#{value} for row #{row}"
+ case value
+ when Class
+ generator = generators[key] ||= value.new
+ row[key] = generator.next
+ when Symbol
+ generator = generators[key] ||= ETL::Generator::Generator.class_for_name(value).new(options)
+ row[key] = generator.next
+ when Proc
+ row[key] = value.call(row)
+ else
+ if value.is_a?(ETL::Generator::Generator)
+ row[key] = value.next
+ else
+ row[key] = value
+ end
+ end
+ end
+ end
+ end
+
+ private
+
+ # Determine the natural key. This method will always return an array
+ # of symbols. The default value is [].
+ def determine_natural_key
+ Array(configuration[:natural_key]).collect(&:to_sym)
+ end
+
+ # Check whether a natural key has been defined, and if so, whether
+ # this row has enough information to do searches based on that natural
+ # key.
+ #
+ # TODO: This should be factored out into
+ # ETL::Row#has_all_fields?(field_array) But that's not possible
+ # until *all* sources cast to ETL::Row, instead of sometimes
+ # using Hash
+ def has_natural_key?(row)
+ natural_key.any? && natural_key.all? { |key| row.has_key?(key) }
+ end
+
+ # Helper for generating the SQL where clause that allows searching
+ # by a natural key
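+ #
+ # For example (hypothetical values), a natural key of [:email] and a row
+ # with :email => 'bob@example.com' yields "email = 'bob@example.com'".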
+ def natural_key_equality_for_row(row)
+ statement = []
+ values = []
+ natural_key.each do |nk|
+ statement << "#{nk} = ?"
+ values << row[nk]
+ end
+ statement = statement.join(" AND ")
+ ActiveRecord::Base.send(:sanitize_sql, [statement, *values])
+ end
+
+ # Do all the steps required when a SCD *has* changed. Exact steps
+ # depend on what type of SCD we're handling.
+ def process_scd_change(row)
+ ETL::Engine.logger.debug "SCD fields do not match"
+
+ if scd_type == 2
+ # SCD Type 2: new row should be added and old row should be updated
+ ETL::Engine.logger.debug "type 2 SCD"
+
+ # To update the old row, we delete the version in the database
+ # and insert a new expired version
+
+ # If there is no truncate then the row will exist twice in the database
+ delete_outdated_record
+
+ ETL::Engine.logger.debug "expiring original record"
+ @existing_row[scd_end_date_field] = @timestamp
+ @existing_row[scd_latest_version_field] = false
+
+ buffer << @existing_row
+
+ elsif scd_type == 1
+ # SCD Type 1: only the new row should be added
+ ETL::Engine.logger.debug "type 1 SCD"
+
+ # Copy primary key, and other non-evolving fields over from
+ # original version of record
+ non_evolving_fields.each do |non_evolving_field|
+ row[non_evolving_field] = @existing_row[non_evolving_field]
+ end
+
+ # If there is no truncate then the row will exist twice in the database
+ delete_outdated_record
+ else
+ # Other SCD types (such as type 3) are not supported
+ ETL::Engine.logger.debug "SCD type #{scd_type} not supported"
+ end
+
+ # In all cases, the latest, greatest version of the record
+ # should go into the load
+ schedule_new_record(row)
+ end
+
+ # Do all the steps required when a SCD has *not* changed. Exact
+ # steps depend on what type of SCD we're handling.
+ def process_scd_match(row)
+ ETL::Engine.logger.debug "SCD fields match"
+
+ if scd_type == 2 && has_non_scd_field_changes?(row)
+ ETL::Engine.logger.debug "Non-SCD field changes"
+ # Copy important data over from original version of record
+ row[primary_key] = @existing_row[primary_key]
+ row[scd_end_date_field] = @existing_row[scd_end_date_field]
+ row[scd_effective_date_field] = @existing_row[scd_effective_date_field]
+ row[scd_latest_version_field] = @existing_row[scd_latest_version_field]
+
+ # If there is no truncate then the row will exist twice in the database
+ delete_outdated_record
+
+ buffer << row
+ else
+ # The record is unchanged, so skip it
+ end
+ end
+
+ # Find the version of this row that already exists in the data warehouse.
+ def preexisting_row(row)
+ q = "SELECT * FROM #{dimension_table} WHERE #{natural_key_equality_for_row(row)}"
+ q << " AND #{scd_latest_version_field}" if scd_type == 2
+
+ #puts "looking for original record"
+ result = connection.select_one(q)
+
+ #puts "Result: #{result.inspect}"
+
+ result ? ETL::Row[result.symbolize_keys!] : nil
+ end
+
+ # Check whether any of the SCD fields have changed since the last
+ # load of this record.
+ def has_scd_field_changes?(row)
+ scd_fields(row).any? { |scd_field| row[scd_field].to_s != @existing_row[scd_field].to_s }
+ end
+
+ # Check whether non-scd fields have changed since the last
+ # load of this record.
+ def has_non_scd_field_changes?(row)
+ non_scd_fields(row).any? { |non_scd_field| row[non_scd_field].to_s != @existing_row[non_scd_field].to_s }
+ end
+
+ # Grab, or re-use, a database connection for running queries directly
+ # during the destination processing.
+ def connection
+ @conn ||= ETL::Engine.connection(dimension_target)
+ end
+
+ # Utility for removing a row that has outdated information. Note
+ # that this deletes directly from the database, even if this is a file
+ # destination. It needs to do this because you can't do deletes in a
+ # bulk load.
+ def delete_outdated_record
+ ETL::Engine.logger.debug "deleting old row"
+
+ q = "DELETE FROM #{dimension_table} WHERE #{primary_key} = #{@existing_row[primary_key]}"
+ connection.delete(q)
+ end
+
+ # Schedule the latest, greatest version of the row for insertion
+ # into the database
+ def schedule_new_record(row)
+ ETL::Engine.logger.debug "writing new record"
+ if scd_type == 2
+ row[scd_effective_date_field] = @timestamp
+ row[scd_end_date_field] = '9999-12-31 00:00:00'
+ row[scd_latest_version_field] = true
+ end
+ buffer << row
+ end
+
+ # Get the name of the primary key for this table. Asks the dimension
+ # model class for this information, but if that class hasn't been
+ # defined, just defaults to :id.
+ def primary_key
+ return @primary_key if @primary_key
+ @primary_key = dimension_table.to_s.camelize.constantize.primary_key.to_sym
+ rescue NameError => e
+ ETL::Engine.logger.debug "couldn't get primary_key from dimension model class, using default :id"
+ @primary_key = :id
+ end
+
+ end
+ end
+end
+
+Dir[File.dirname(__FILE__) + "/destination/*.rb"].each { |file| require(file) }
95 lib/etl/control/destination/database_destination.rb
@@ -0,0 +1,95 @@
+module ETL #:nodoc:
+ module Control #:nodoc:
+ # Destination which writes directly to a database. This is useful when you are dealing with
+ # a small amount of data. For larger amounts of data you should probably use the bulk
+ # loader if it is supported by your target database, as it uses a much faster load
+ # method.
+ class DatabaseDestination < Destination
+ # The target connection
+ attr_reader :target
+
+ # The table
+ attr_reader :table
+
+ # The order of fields to write (defaults to the order from the source)
+ attr_reader :order
+
+ # Set to true to truncate the destination table first
+ attr_reader :truncate
+
+ # Initialize the database destination
+ #
+ # * <tt>control</tt>: The ETL::Control::Control instance
+ # * <tt>configuration</tt>: The configuration Hash
+ # * <tt>mapping</tt>: The mapping
+ #
+ # Configuration options:
+ # * <tt>:database</tt>: The database name (REQUIRED)
+ # * <tt>:target</tt>: The target connection (REQUIRED)
+ # * <tt>:table</tt>: The table to write to (REQUIRED)
+ # * <tt>:truncate</tt>: Set to true to truncate before writing (defaults to false)
+ # * <tt>:unique</tt>: Set to true to only insert unique records (defaults to false)
+ # * <tt>:append_rows</tt>: Array of rows to append
+ #
+ # Mapping options:
+ # * <tt>:order</tt>: The order of fields to write (REQUIRED)
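+ #
+ # A hypothetical control-file sketch (the connection, table, and field
+ # names are assumptions):
+ #
+ #   destination :out, {
+ #     :type => :database,
+ #     :target => :data_warehouse,
+ #     :database => 'dw',
+ #     :table => 'person_dimension'
+ #   },
+ #   {
+ #     :order => [:id, :first_name, :last_name]
+ #   }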
+ def initialize(control, configuration, mapping={})
+ super
+ @target = configuration[:target]
+ @table = configuration[:table]
+ @truncate = configuration[:truncate] ||= false
+ @unique = configuration[:unique]
+ @order = mapping[:order] || order_from_source
+ raise ControlError, "Order required in mapping" unless @order
+ raise ControlError, "Table required" unless @table
+ raise ControlError, "Target required" unless @target
+ end
+
+ # Flush the currently buffered data
+ def flush
+ conn.transaction do
+ buffer.flatten.each do |row|
+ # check to see if this row's compound key constraint already exists
+ # note that the compound key constraint may not utilize virtual fields
+ next unless row_allowed?(row)
+
+ # add any virtual fields
+ add_virtuals!(row)
+
+ names = []
+ values = []
+ order.each do |name|
+ names << name
+ values << conn.quote(row[name]) # TODO: this is probably not database agnostic
+ end
+ q = "INSERT INTO #{table_name} (#{names.join(',')}) VALUES (#{values.join(',')})"
+ ETL::Engine.logger.debug("Executing insert: #{q}")
+ conn.insert(q, "Insert row #{current_row}")
+ @current_row += 1
+ end
+ buffer.clear
+ end
+ end
+
+ # Close the connection
+ def close
+ buffer << append_rows if append_rows
+ flush
+ end
+
+ private
+ def conn
+ @conn ||= begin
+ conn = ETL::Engine.connection(target)
+ conn.truncate(table_name) if truncate
+ conn
+ end
+ end
+
+ def table_name
+ ETL::Engine.table(table, ETL::Engine.connection(target))
+ end
+
+ end
+ end
+end
124 lib/etl/control/destination/file_destination.rb
@@ -0,0 +1,124 @@
+# This source file contains the ETL::Control::FileDestination
+
+module ETL #:nodoc:
+ module Control #:nodoc:
+ # File as the final destination.
+ class FileDestination < Destination
+ # The File to write to
+ attr_reader :file
+
+ # The output order
+ attr_reader :order
+
+ # Flag which indicates to append (default is to overwrite)
+ attr_accessor :append
+
+ # The separator
+ attr_accessor :separator
+
+ # The end of line marker
+ attr_accessor :eol
+
+ # The enclosure character
+ attr_accessor :enclose
+
+ # Initialize the object.
+ # * <tt>control</tt>: The Control object
+ # * <tt>configuration</tt>: The configuration map
+ # * <tt>mapping</tt>: The output mapping
+ #
+ # Configuration options:
+ # * <tt>:file</tt>: The file to write to (REQUIRED)
+ # * <tt>:append</tt>: Set to true to append to the file (default is to overwrite)
+ # * <tt>:separator</tt>: Record separator (default is a comma)
+ # * <tt>:eol</tt>: End of line marker (default is \n)
+ # * <tt>:enclose</tt>: Enclosure character (default is none)
+ # * <tt>:unique</tt>: Set to true to only write unique records
+ # * <tt>:append_rows</tt>: Array of rows to append
+ #
+ # Mapping options:
+ # * <tt>:order</tt>: The order array
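+ #
+ # A hypothetical control-file sketch (the file and field names are
+ # assumptions):
+ #
+ #   destination :out, {
+ #     :file => 'people.csv',
+ #     :separator => ',',
+ #     :enclose => '"'
+ #   },
+ #   {
+ #     :order => [:id, :name]
+ #   }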
+ def initialize(control, configuration, mapping={})
+ super
+ @file = File.join(File.dirname(control.file), configuration[:file])
+ @append = configuration[:append] ||= false
+ @separator = configuration[:separator] ||= ','
+ @eol = configuration[:eol] ||= "\n"
+ @enclose = configuration[:enclose]
+ @unique = configuration[:unique]
+
+ @order = mapping[:order] || order_from_source
+ raise ControlError, "Order required in mapping" unless @order
+ end
+
+ # Close the destination. This will flush the buffer and close the underlying stream or connection.
+ def close
+ buffer << append_rows if append_rows
+ flush
+ f.close
+ end
+
+ # Flush the destination buffer
+ def flush
+ #puts "Flushing buffer (#{file}) with #{buffer.length} rows"
+ buffer.flatten.each do |row|
+ #puts "row change type: #{row.change_type}"
+ # check to see if this row's compound key constraint already exists
+ # note that the compound key constraint may not utilize virtual fields
+ next unless row_allowed?(row)
+
+ # add any virtual fields
+ add_virtuals!(row)
+
+ # collect all of the values using the order designated in the configuration
+ values = order.collect do |name|
+ value = row[name]
+ case value
+ when Date, Time, DateTime
+ value.to_s(:db)
+ else
+ value.to_s
+ end
+ end
+
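+ # escape backslashes first, then embedded separators, and strip line
+ # breaks so each row stays on a single line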
+ values.collect! { |v| v.gsub(/\\/, '\\\\\\\\')}
+ values.collect! { |v| v.gsub(separator, "\\#{separator}")}
+ values.collect! { |v| v.gsub(/\n|\r/, '')}
+
+ # enclose the value if required
+ if !enclose.nil?
+ values.collect! { |v| enclose + v.gsub(/(#{enclose})/, '\\\\\1') + enclose }
+ end
+
+ # write the values joined by the separator defined in the configuration
+ f.write(values.join(separator))
+
+ # write the end-of-line
+ f.write(eol)
+ end
+ f.flush
+ buffer.clear
+ #puts "After flush there are #{buffer.length} rows"
+ end
+
+ private
+ # Get the open file stream
+ def f
+ @f ||= open(file, mode)
+ end
+
+ def options
+ @options ||= {
+ :col_sep => separator,
+ :row_sep => eol,
+ :force_quotes => !enclose.nil?
+ }
+ end
+
+ # Get the appropriate mode to open the file stream
+ def mode
+ append ? 'a' : 'w'
+ end
+ end
+ end
+end
109 lib/etl/control/source.rb
@@ -0,0 +1,109 @@
+module ETL #:nodoc:
+ module Control #:nodoc:
+ # ETL source. Subclasses must implement the <tt>each</tt> method.
+ class Source
+ include Enumerable
+
+ # The control object
+ attr_accessor :control
+
+ # The configuration Hash
+ attr_accessor :configuration
+
+ # The definition Hash
+ attr_accessor :definition
+
+ # Returns true if the source data should be stored locally for archival.
+ # Defaults to true.
+ attr_accessor :store_locally
+
+ class << self
+ # Convert the name to a Source class.
+ #
+ # For example if name is :database then this will return a
+ # DatabaseSource class
+ def class_for_name(name)
+ ETL::Control.const_get("#{name.to_s.camelize}Source")
+ end
+ end
+
+ # Initialize the Source instance
+ # * <tt>control</tt>: The control object
+ # * <tt>configuration</tt>: The configuration hash
+ # * <tt>definition</tt>: The source layout definition
+ #
+ # Configuration options:
+ # * <tt>:store_locally</tt>: Set to false to not store source data
+ # locally (defaults to true)
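+ #
+ # A hypothetical control-file sketch (the source name, connection, and
+ # field list are assumptions):
+ #
+ #   source :in, {
+ #     :type => :database,
+ #     :target => :operational,
+ #     :table => 'people',
+ #     :store_locally => false
+ #   },
+ #   [:id, :name]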
+ def initialize(control, configuration, definition)
+ @control = control
+ @configuration = configuration
+ @definition = definition
+
+ # use fetch so an explicit false is honored (|| true would discard it)
+ @store_locally = configuration.fetch(:store_locally, true)
+ end
+
+ # Get an array of errors that occur during reading from the source
+ def errors
+ @errors ||= []
+ end
+
+ # Get a timestamp value as a string
+ def timestamp
+ Engine.timestamp
+ end
+
+ # Writer for the base directory where local files are stored (the
+ # reader below supplies the default).
+ attr_writer :local_base
+
+ # Get the local base, defaults to 'source_data'
+ def local_base
+ @local_base ||= 'source_data'
+ end
+
+ # The local directory for storing. This method must be overridden by
+ # subclasses
+ def local_directory
+ raise NotImplementedError, "local_directory method must be implemented by subclasses"
+ end
+
+ # Return the local file for storing the raw source data. Each call to
+ # this method will result in a timestamped file, so you cannot expect
+ # to call it multiple times and reference the same file
+ #
+ # Optional sequence can be specified if there are multiple source files
+ def local_file(sequence=nil)
+ filename = timestamp.to_s
+ filename += sequence.to_s if sequence
+
+ local_dir = local_directory
+ FileUtils.mkdir_p(local_dir)
+ File.join(local_dir, "#{filename}.csv")
+ end
+
+ # Get the last fully written local file
+ def last_local_file
+ File.join(local_directory, File.basename(last_local_file_trigger, '.trig'))
+ end
+
+ # Get the last local file trigger
+ def last_local_file_trigger
+ Dir.glob(File.join(local_directory, '*.trig')).last
+ end
+
+ # Get the local trigger file that is used to indicate that the file has
+ # been completely written
+ def local_file_trigger(file)
+ Pathname.new(file.to_s + '.trig')
+ end
+
+ # Return true if the source should read locally.
+ def read_locally
+ Engine.read_locally
+ end
+
+ end
+ end
+end
+
+Dir[File.dirname(__FILE__) + "/source/*.rb"].each { |file| require(file) }
220 lib/etl/control/source/database_source.rb
@@ -0,0 +1,220 @@
+require 'fileutils'
+
+module ETL #:nodoc:
+ class Source<