Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP
Browse files

Add a new Excel parser with its tests

  • Loading branch information...
commit 7d4064f74bbed527a219fa41763b4728961526fa 1 parent 0d9a484
@gkfabs gkfabs authored
View
3  README
@@ -86,6 +86,7 @@ The tests require:
- gem install shoulda
- gem install flexmock
- gem install pg (if you want to run the tests on pg)
+- gem install spreadsheet
The tests subfolder contains examples database.yml for mysql and postgres.
@@ -96,4 +97,4 @@ To run the tests:
== Feedback
This is a work in progress. Comments should be made on the
activewarehouse-discuss mailing list at the moment. Contributions are always
-welcome.
+welcome.
View
2  activewarehouse-etl.gemspec
@@ -12,7 +12,7 @@ Gem::Specification.new do |s|
s.description = %q{ActiveWarehouse ETL is a pure Ruby Extract-Transform-Load application for loading data into a database.}
s.email = %q{anthonyeden@gmail.com}
s.executables = ["etl"]
- s.files = ["CHANGELOG", "LICENSE", "README", "TODO", "Rakefile", "bin/etl", "bin/etl.cmd", "lib/etl", "lib/etl.rb", "lib/etl/batch", "lib/etl/batch.rb", "lib/etl/builder", "lib/etl/builder.rb", "lib/etl/commands", "lib/etl/control", "lib/etl/control.rb", "lib/etl/core_ext", "lib/etl/core_ext.rb", "lib/etl/engine.rb", "lib/etl/execution", "lib/etl/execution.rb", "lib/etl/generator", "lib/etl/generator.rb", "lib/etl/http_tools.rb", "lib/etl/parser", "lib/etl/parser.rb", "lib/etl/processor", "lib/etl/processor.rb", "lib/etl/row.rb", "lib/etl/screen", "lib/etl/screen.rb", "lib/etl/transform", "lib/etl/transform.rb", "lib/etl/util.rb", "lib/etl/version.rb", "lib/etl/batch/batch.rb", "lib/etl/batch/directives.rb", "lib/etl/builder/date_dimension_builder.rb", "lib/etl/builder/time_dimension_builder.rb", "lib/etl/commands/etl.rb", "lib/etl/control/control.rb", "lib/etl/control/destination", "lib/etl/control/destination.rb", "lib/etl/control/source", "lib/etl/control/source.rb", "lib/etl/control/destination/database_destination.rb", "lib/etl/control/destination/file_destination.rb", "lib/etl/control/source/database_source.rb", "lib/etl/control/source/enumerable_source.rb", "lib/etl/control/source/file_source.rb", "lib/etl/control/source/model_source.rb", "lib/etl/core_ext/time", "lib/etl/core_ext/time.rb", "lib/etl/core_ext/time/calculations.rb", "lib/etl/execution/base.rb", "lib/etl/execution/batch.rb", "lib/etl/execution/job.rb", "lib/etl/execution/migration.rb", "lib/etl/generator/generator.rb", "lib/etl/generator/surrogate_key_generator.rb", "lib/etl/parser/apache_combined_log_parser.rb", "lib/etl/parser/delimited_parser.rb", "lib/etl/parser/fixed_width_parser.rb", "lib/etl/parser/parser.rb", "lib/etl/parser/sax_parser.rb", "lib/etl/parser/xml_parser.rb", "lib/etl/processor/block_processor.rb", "lib/etl/processor/bulk_import_processor.rb", "lib/etl/processor/check_exist_processor.rb", "lib/etl/processor/check_unique_processor.rb", 
"lib/etl/processor/copy_field_processor.rb", "lib/etl/processor/encode_processor.rb", "lib/etl/processor/hierarchy_exploder_processor.rb", "lib/etl/processor/print_row_processor.rb", "lib/etl/processor/processor.rb", "lib/etl/processor/rename_processor.rb", "lib/etl/processor/require_non_blank_processor.rb", "lib/etl/processor/row_processor.rb", "lib/etl/processor/sequence_processor.rb", "lib/etl/processor/surrogate_key_processor.rb", "lib/etl/processor/truncate_processor.rb", "lib/etl/screen/row_count_screen.rb", "lib/etl/transform/block_transform.rb", "lib/etl/transform/date_to_string_transform.rb", "lib/etl/transform/decode_transform.rb", "lib/etl/transform/default_transform.rb", "lib/etl/transform/foreign_key_lookup_transform.rb", "lib/etl/transform/hierarchy_lookup_transform.rb", "lib/etl/transform/ordinalize_transform.rb", "lib/etl/transform/sha1_transform.rb", "lib/etl/transform/string_to_date_transform.rb", "lib/etl/transform/string_to_datetime_transform.rb", "lib/etl/transform/string_to_time_transform.rb", "lib/etl/transform/transform.rb", "lib/etl/transform/trim_transform.rb", "lib/etl/transform/type_transform.rb", "examples/database.example.yml"]
+ s.files = ["CHANGELOG", "LICENSE", "README", "TODO", "Rakefile", "bin/etl", "bin/etl.cmd", "lib/etl", "lib/etl.rb", "lib/etl/batch", "lib/etl/batch.rb", "lib/etl/builder", "lib/etl/builder.rb", "lib/etl/commands", "lib/etl/control", "lib/etl/control.rb", "lib/etl/core_ext", "lib/etl/core_ext.rb", "lib/etl/engine.rb", "lib/etl/execution", "lib/etl/execution.rb", "lib/etl/generator", "lib/etl/generator.rb", "lib/etl/http_tools.rb", "lib/etl/parser", "lib/etl/parser.rb", "lib/etl/processor", "lib/etl/processor.rb", "lib/etl/row.rb", "lib/etl/screen", "lib/etl/screen.rb", "lib/etl/transform", "lib/etl/transform.rb", "lib/etl/util.rb", "lib/etl/version.rb", "lib/etl/batch/batch.rb", "lib/etl/batch/directives.rb", "lib/etl/builder/date_dimension_builder.rb", "lib/etl/builder/time_dimension_builder.rb", "lib/etl/commands/etl.rb", "lib/etl/control/control.rb", "lib/etl/control/destination", "lib/etl/control/destination.rb", "lib/etl/control/source", "lib/etl/control/source.rb", "lib/etl/control/destination/database_destination.rb", "lib/etl/control/destination/file_destination.rb", "lib/etl/control/source/database_source.rb", "lib/etl/control/source/enumerable_source.rb", "lib/etl/control/source/file_source.rb", "lib/etl/control/source/model_source.rb", "lib/etl/core_ext/time", "lib/etl/core_ext/time.rb", "lib/etl/core_ext/time/calculations.rb", "lib/etl/execution/base.rb", "lib/etl/execution/batch.rb", "lib/etl/execution/job.rb", "lib/etl/execution/migration.rb", "lib/etl/generator/generator.rb", "lib/etl/generator/surrogate_key_generator.rb", "lib/etl/parser/apache_combined_log_parser.rb", "lib/etl/parser/delimited_parser.rb", "lib/etl/parser/excel_parser.rb", "lib/etl/parser/fixed_width_parser.rb", "lib/etl/parser/parser.rb", "lib/etl/parser/sax_parser.rb", "lib/etl/parser/xml_parser.rb", "lib/etl/processor/block_processor.rb", "lib/etl/processor/bulk_import_processor.rb", "lib/etl/processor/check_exist_processor.rb", "lib/etl/processor/check_unique_processor.rb", 
"lib/etl/processor/copy_field_processor.rb", "lib/etl/processor/encode_processor.rb", "lib/etl/processor/hierarchy_exploder_processor.rb", "lib/etl/processor/print_row_processor.rb", "lib/etl/processor/processor.rb", "lib/etl/processor/rename_processor.rb", "lib/etl/processor/require_non_blank_processor.rb", "lib/etl/processor/row_processor.rb", "lib/etl/processor/sequence_processor.rb", "lib/etl/processor/surrogate_key_processor.rb", "lib/etl/processor/truncate_processor.rb", "lib/etl/screen/row_count_screen.rb", "lib/etl/transform/block_transform.rb", "lib/etl/transform/date_to_string_transform.rb", "lib/etl/transform/decode_transform.rb", "lib/etl/transform/default_transform.rb", "lib/etl/transform/foreign_key_lookup_transform.rb", "lib/etl/transform/hierarchy_lookup_transform.rb", "lib/etl/transform/ordinalize_transform.rb", "lib/etl/transform/sha1_transform.rb", "lib/etl/transform/string_to_date_transform.rb", "lib/etl/transform/string_to_datetime_transform.rb", "lib/etl/transform/string_to_time_transform.rb", "lib/etl/transform/transform.rb", "lib/etl/transform/trim_transform.rb", "lib/etl/transform/type_transform.rb", "examples/database.example.yml"]
s.homepage = %q{http://activewarehouse.rubyforge.org/etl}
s.rdoc_options = ["--exclude", "."]
s.require_paths = ["lib"]
View
81 lib/etl/parser/excel_parser.rb
@@ -0,0 +1,81 @@
+require 'spreadsheet'
+
+module ETL
+ module Parser
+ class ExcelParser < ETL::Parser::Parser
+
+ # Initialize the parser
+ # * <tt>source</tt>: The Source object
+ # * <tt>options</tt>: Parser options Hash
+ def initialize(source, options={})
+ super
+ configure
+ end
+
+ # Returns each row
+ def each
+ Dir.glob(file).each do |file|
+ ETL::Engine.logger.debug "parsing #{file}"
+ line = 0
+ lines_skipped = 0
+ book = Spreadsheet.open file
+ book.worksheets.each do |sheet|
+ sheet.each do |raw_row|
+ if lines_skipped < source.skip_lines
+ ETL::Engine.logger.debug "skipping line"
+ lines_skipped += 1
+ next
+ end
+ line += 1
+ row = {}
+ validate_row(raw_row, line, file)
+ raw_row.each_with_index do |value, index|
+ f = fields[index]
+ row[f.name] = value
+ end
+ yield row
+ end
+ end
+ end
+ end
+
+ # Get an array of defined fields
+ def fields
+ @fields ||= []
+ end
+
+ private
+ def validate_row(row, line, file)
+ ETL::Engine.logger.debug "validating line #{line} in file #{file}"
+ if row.length != fields.length
+ raise_with_info( MismatchError,
+ "The number of columns from the source (#{row.length}) does not match the number of columns in the definition (#{fields.length})",
+ line, file
+ )
+ end
+ end
+
+ private
+ def configure
+ source.definition.each do |options|
+ case options
+ when Symbol
+ fields << Field.new(options)
+ when Hash
+ fields << Field.new(options[:name])
+ else
+ raise DefinitionError, "Each field definition must either be a symbol or a hash"
+ end
+ end
+ end
+
+ class Field #:nodoc:
+ attr_reader :name
+ def initialize(name)
+ @name = name
+ end
+ end
+
+ end
+ end
+end
View
BIN  test/data/excel.xls
Binary file not shown
View
21 test/excel.ctl
@@ -0,0 +1,21 @@
+source :in, {
+ :file => 'data/excel.xls',
+ :parser => :excel
+},
+[
+ :first_name,
+ :last_name,
+ :ssn,
+ :age
+]
+
+transform :ssn, :sha1
+transform(:ssn){ |n, v, r| v[0,24] }
+
+
+destination :out, {
+ :file => 'output/excel.out.txt'
+},
+{
+ :order => [:first_name, :last_name, :ssn, :age]
+}
View
11 test/parser_test.rb
@@ -45,6 +45,15 @@ def test_sax_parser
assert_equal 2, rows.length
assert_equal({:first_name=>"Bob", :last_name=>"Smith", :ssn=>"123456789", :age=>"24"}, rows.first)
end
+
+ # Test the Excel parser
+ def test_excel_parser
+ control = ETL::Control::Control.resolve(File.dirname(__FILE__) + '/excel.ctl')
+ parser = control.sources.first.parser
+ rows = parser.collect { |row| row }
+ assert_equal 2, rows.length
+ assert_equal({:first_name=>"Bob", :last_name=>"Smith", :ssn=>123456789, :age=>24}, rows.first)
+ end
# Test the Apache combined log format parser
def test_apache_combined_log_parser
@@ -197,4 +206,4 @@ def test_user_agent_parser
# :browser=>"Konquerer"}, parser.parse_user_agent(agents[6]), 'Agent 6 invalid'
# )
end
-end
+end
Please sign in to comment.
Something went wrong with that request. Please try again.