From 3f93860e1359a49f8063e7c02e193b08f5dd5046 Mon Sep 17 00:00:00 2001 From: Ben Prew Date: Fri, 31 Jan 2020 15:10:11 -0800 Subject: [PATCH 1/8] Pin rubies to OS default versions --- .ruby-version | 2 +- .travis.yml | 11 ++++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/.ruby-version b/.ruby-version index 76521af..633c00d 100644 --- a/.ruby-version +++ b/.ruby-version @@ -1 +1 @@ -ruby-2.2.0 +2.0.0-p648 diff --git a/.travis.yml b/.travis.yml index 86cd0b0..a86d447 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,11 @@ language: ruby rvm: - - 2.0.0 - - 2.3.3 - - 2.4.0 + # Mac High Sierra + - 2.0.0-p648 + # Mac Mojave + - 2.3.7 + # Ubuntu 19.10 + - 2.5 + # Mac Catalina + - 2.6 script: "bundle exec rake" From 7e15a61006ace84f722f348c06bbc75b9f6ea138 Mon Sep 17 00:00:00 2001 From: Ben Prew Date: Fri, 31 Jan 2020 15:12:00 -0800 Subject: [PATCH 2/8] Update gems to highest version supported by Ruby 2.0. Add pry to devel gems --- Gemfile | 1 - Gemfile.lock | 42 ++++++++++++++++++++++++++++-------------- reckon.gemspec | 4 +--- 3 files changed, 29 insertions(+), 18 deletions(-) diff --git a/Gemfile b/Gemfile index 5378d16..22e17d6 100644 --- a/Gemfile +++ b/Gemfile @@ -1,5 +1,4 @@ source "http://rubygems.org" - gemspec gem 'rake' diff --git a/Gemfile.lock b/Gemfile.lock index e69c33a..ba2d0a5 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -3,7 +3,6 @@ PATH specs: reckon (0.4.4) chronic (>= 0.3.0) - fastercsv (>= 1.5.1) highline (>= 1.5.2) terminal-table (>= 1.4.2) @@ -11,24 +10,39 @@ GEM remote: http://rubygems.org/ specs: chronic (0.10.2) - diff-lcs (1.1.3) - fastercsv (1.5.5) - highline (1.6.21) - rake (10.0.4) - rspec (2.11.0) - rspec-core (~> 2.11.0) - rspec-expectations (~> 2.11.0) - rspec-mocks (~> 2.11.0) - rspec-core (2.11.1) - rspec-expectations (2.11.2) - diff-lcs (~> 1.1.3) - rspec-mocks (2.11.1) - terminal-table (1.4.5) + coderay (1.1.2) + diff-lcs (1.3) + highline (2.0.3) + method_source (0.9.2) + pry (0.12.2) + coderay (~> 1.1.0) +
method_source (~> 0.9.0) + rake (12.3.3) + rspec (3.9.0) + rspec-core (~> 3.9.0) + rspec-expectations (~> 3.9.0) + rspec-mocks (~> 3.9.0) + rspec-core (3.9.1) + rspec-support (~> 3.9.1) + rspec-expectations (3.9.0) + diff-lcs (>= 1.2.0, < 2.0) + rspec-support (~> 3.9.0) + rspec-mocks (3.9.1) + diff-lcs (>= 1.2.0, < 2.0) + rspec-support (~> 3.9.0) + rspec-support (3.9.2) + terminal-table (1.8.0) + unicode-display_width (~> 1.1, >= 1.1.1) + unicode-display_width (1.6.1) PLATFORMS ruby DEPENDENCIES + pry (>= 0.12.2) rake reckon! rspec (>= 1.2.9) + +BUNDLED WITH + 1.17.3 diff --git a/reckon.gemspec b/reckon.gemspec index ddc6af4..9b44164 100644 --- a/reckon.gemspec +++ b/reckon.gemspec @@ -1,4 +1,3 @@ -# -*- encoding: utf-8 -*- $:.push File.expand_path("../lib", __FILE__) Gem::Specification.new do |s| @@ -16,9 +15,8 @@ Gem::Specification.new do |s| s.require_paths = ["lib"] s.add_development_dependency "rspec", ">= 1.2.9" - s.add_runtime_dependency "fastercsv", ">= 1.5.1" + s.add_development_dependency "pry", ">= 0.12.2" s.add_runtime_dependency "chronic", ">= 0.3.0" s.add_runtime_dependency "highline", ">= 1.5.2" s.add_runtime_dependency "terminal-table", ">= 1.4.2" end - From 5ce43ae0b36ee25cc313bd63a85129a7b7c49783 Mon Sep 17 00:00:00 2001 From: Ben Prew Date: Fri, 31 Jan 2020 15:14:06 -0800 Subject: [PATCH 3/8] bug: fix order-dependent test Sort isn't stable, so sorting by date in each_row_backwards meant that the "Book Store" transaction wasn't always row 7, so look for the string, instead of by index.
--- spec/reckon/app_spec.rb | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/spec/reckon/app_spec.rb b/spec/reckon/app_spec.rb index acb05dc..f39b674 100644 --- a/spec/reckon/app_spec.rb +++ b/spec/reckon/app_spec.rb @@ -1,5 +1,4 @@ #!/usr/bin/env ruby -# encoding: utf-8 require "spec_helper" require 'rubygems' @@ -8,10 +7,10 @@ describe Reckon::App do context 'with chase csv input' do before do - @chase = Reckon::App.new(:string => BANK_CSV) - @chase.learn_from( BANK_LEDGER ) + @chase = Reckon::App.new(string: BANK_CSV) + @chase.learn_from(BANK_LEDGER) @rows = [] - @chase.each_row_backwards { |row| @rows.push( row ) } + @chase.each_row_backwards { |row| @rows.push(row) } end describe "each_row_backwards" do @@ -27,7 +26,11 @@ describe "weighted_account_match" do it "should guess the correct account" do - @chase.weighted_account_match( @rows[7] ).first[:account].should == "Expenses:Books" + row = @rows.find { |n| n[:description] =~ /Book Store/ } + + result = @chase.weighted_account_match(row).first + result[:account].should == "Expenses:Books" + result[:cosine].should > 0.0 end end end @@ -95,6 +98,5 @@ 2004/05/27 Book Store Expenses:Books $20.00 Liabilities:MasterCard - LEDGER - +LEDGER end From 718909de827f0e757cda16be93bcd8ba6321113a Mon Sep 17 00:00:00 2001 From: Ben Prew Date: Fri, 31 Jan 2020 15:16:12 -0800 Subject: [PATCH 4/8] bug: fix order-dependent test by choosing the lowest index date column first Sorting by date_score isn't stable, so either date field for Broker Canada data could've been returned. Added index to the sort key to use the column that came first. This behavior matches the 3-4 csv files I process from my financial institutions. 
--- lib/reckon/csv_parser.rb | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/lib/reckon/csv_parser.rb b/lib/reckon/csv_parser.rb index d00dc17..d820119 100644 --- a/lib/reckon/csv_parser.rb +++ b/lib/reckon/csv_parser.rb @@ -192,20 +192,17 @@ def detect_columns end end - results.reject! {|i| money_column_indices.include?(i[:index]) } - self.date_column_index = results.sort { |a, b| b[:date_score] <=> a[:date_score] }.first[:index] - results.reject! {|i| i[:index] == date_column_index } - @date_column = DateColumn.new( columns[ self.date_column_index ], @options ) - - if ( money_column_indices.length == 1 ) - @money_column = MoneyColumn.new( columns[money_column_indices[0]], - @options ) + results.reject! { |i| money_column_indices.include?(i[:index]) } + # sort by highest score followed by lowest index + @date_column_index = results.max_by { |n| [n[:date_score], -n[:index]] }[:index] + results.reject! { |i| i[:index] == date_column_index } + @date_column = DateColumn.new(columns[date_column_index], @options) + + @money_column = MoneyColumn.new(columns[money_column_indices[0]], @options) + if money_column_indices.length == 1 detect_sign_column if @money_column.positive? else - @money_column = MoneyColumn.new( columns[money_column_indices[0]], - @options ) - @money_column.merge!( - MoneyColumn.new( columns[money_column_indices[1]], @options ) ) + @money_column.merge! MoneyColumn.new(columns[money_column_indices[1]], @options) end self.description_column_indices = results.map { |i| i[:index] } From 9c953645d1b63145900b528dcc1cec1be6de90ce Mon Sep 17 00:00:00 2001 From: Ben Prew Date: Fri, 31 Jan 2020 15:24:49 -0800 Subject: [PATCH 5/8] Remove fastercsv, ruby 2.0 is our minimum version. High Sierra installs 2.0, so it's unlikely that someone would have a ruby < 2.0 installed. High Sierra is 2 versions behind the current macOS version (Catalina).
--- lib/reckon.rb | 11 +++-------- lib/reckon/csv_parser.rb | 13 +++---------- 2 files changed, 6 insertions(+), 18 deletions(-) diff --git a/lib/reckon.rb b/lib/reckon.rb index 9d33b3c..6b0d17b 100755 --- a/lib/reckon.rb +++ b/lib/reckon.rb @@ -1,19 +1,14 @@ #!/usr/bin/env ruby require 'rubygems' -if RUBY_VERSION =~ /^1\.9/ || RUBY_VERSION =~ /^2/ - require 'csv' -else - require 'fastercsv' -end +require 'chronic' +require 'csv' require 'highline/import' require 'optparse' -require 'chronic' -require 'time' require 'terminal-table' +require 'time' require File.expand_path(File.join(File.dirname(__FILE__), "reckon", "app")) require File.expand_path(File.join(File.dirname(__FILE__), "reckon", "ledger_parser")) require File.expand_path(File.join(File.dirname(__FILE__), "reckon", "csv_parser")) require File.expand_path(File.join(File.dirname(__FILE__), "reckon", "money")) - diff --git a/lib/reckon/csv_parser.rb b/lib/reckon/csv_parser.rb index d820119..b7341a1 100644 --- a/lib/reckon/csv_parser.rb +++ b/lib/reckon/csv_parser.rb @@ -1,5 +1,4 @@ #coding: utf-8 -require 'pp' module Reckon class CSVParser @@ -227,21 +226,15 @@ def columns def parse data = options[:string] || File.read(options[:file]) - - if RUBY_VERSION =~ /^1\.9/ || RUBY_VERSION =~ /^2/ - data = data.force_encoding(options[:encoding] || 'BINARY').encode('UTF-8', :invalid => :replace, :undef => :replace, :replace => '?') - csv_engine = CSV - else - csv_engine = FasterCSV - end - - @csv_data = csv_engine.parse data.strip, :col_sep => options[:csv_separator] || ',' + data = data.force_encoding(options[:encoding] || 'BINARY').encode('UTF-8', :invalid => :replace, :undef => :replace, :replace => '?') + @csv_data = CSV.parse data.strip, :col_sep => options[:csv_separator] || ',' if options[:contains_header] options[:contains_header].times { csv_data.shift } end csv_data end + @settings = { :testing => false } def self.settings From e7753929b924dccfa1ffff093158ee4dcaf1dfb3 Mon Sep 17 00:00:00 2001 From: Ben 
Prew Date: Fri, 31 Jan 2020 15:53:20 -0800 Subject: [PATCH 6/8] bug: don't try to parse rows that the user considers header rows Since we throw them away anyway, we should just skip them --- lib/reckon/csv_parser.rb | 11 ++++++----- spec/reckon/csv_parser_spec.rb | 18 ++++++++++++++++++ 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/lib/reckon/csv_parser.rb b/lib/reckon/csv_parser.rb index b7341a1..f91e766 100644 --- a/lib/reckon/csv_parser.rb +++ b/lib/reckon/csv_parser.rb @@ -225,15 +225,16 @@ def columns end def parse + rows = [] data = options[:string] || File.read(options[:file]) data = data.force_encoding(options[:encoding] || 'BINARY').encode('UTF-8', :invalid => :replace, :undef => :replace, :replace => '?') - @csv_data = CSV.parse data.strip, :col_sep => options[:csv_separator] || ',' - if options[:contains_header] - options[:contains_header].times { csv_data.shift } + data.each_line.with_index do |line, i| + next if i < (options[:contains_header] || 0) + rows << CSV.parse_line(line, col_sep: options[:csv_separator] || ',') end - csv_data - end + @csv_data = rows + end @settings = { :testing => false } diff --git a/spec/reckon/csv_parser_spec.rb b/spec/reckon/csv_parser_spec.rb index c623e81..af1f8e5 100755 --- a/spec/reckon/csv_parser_spec.rb +++ b/spec/reckon/csv_parser_spec.rb @@ -42,6 +42,24 @@ it "should work with other separators" do Reckon::CSVParser.new(:string => "one;two\nthree;four", :csv_separator => ';').columns.should == [['one', 'three'], ['two', 'four']] end + + describe 'file with invalid csv in header' do + file = %q( + +="0234500012345678";21/11/2015;19/02/2016;36;19/02/2016;1234,37 EUR + +Date de l'opération;Libellé;Détail de l'écriture;Montant de l'opération;Devise +19/02/2016;VIR RECU 508160;VIR RECU 1234567834S DE: Francois REF: 123457891234567894561231 PROVENANCE: DE Allemagne ;50,00;EUR +18/02/2016;COTISATION JAZZ;COTISATION JAZZ ;-8,10;EUR +) + it 'should ignore invalid header lines' do + 
Reckon::CSVParser.new(string: file, contains_header: 4) + end + + it 'should fail' do + expect { Reckon::CSVParser.new(string: file, contains_header: 1) }.to raise_error(CSV::MalformedCSVError) + end + end end describe "columns" do From deba42b8065565f014e1e299d6efeacc35d52584 Mon Sep 17 00:00:00 2001 From: Ben Prew Date: Fri, 31 Jan 2020 15:56:36 -0800 Subject: [PATCH 7/8] Use CharDet to detect char encoding, strip BOM from file If the user doesn't pass an encoding option, we try to determine the encoding of the file using CharDet, then convert it to UTF-8 before parsing it as CSV. Also, strip the BOM, if it exists. Fall back to BINARY as a last resort --- Gemfile.lock | 2 ++ lib/reckon.rb | 1 + lib/reckon/csv_parser.rb | 19 ++++++++++++++----- reckon.gemspec | 1 + spec/data_fixtures/bom_utf8_file.csv | 1 + spec/reckon/csv_parser_spec.rb | 17 ++++++++++++++++- 6 files changed, 35 insertions(+), 6 deletions(-) create mode 100644 spec/data_fixtures/bom_utf8_file.csv diff --git a/Gemfile.lock b/Gemfile.lock index ba2d0a5..42cf42f 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -4,6 +4,7 @@ PATH reckon (0.4.4) chronic (>= 0.3.0) highline (>= 1.5.2) + rchardet (>= 1.8.0) terminal-table (>= 1.4.2) GEM @@ -18,6 +19,7 @@ GEM coderay (~> 1.1.0) method_source (~> 0.9.0) rake (12.3.3) + rchardet (1.8.0) rspec (3.9.0) rspec-core (~> 3.9.0) rspec-expectations (~> 3.9.0) diff --git a/lib/reckon.rb b/lib/reckon.rb index 6b0d17b..b0a0097 100755 --- a/lib/reckon.rb +++ b/lib/reckon.rb @@ -1,6 +1,7 @@ #!/usr/bin/env ruby require 'rubygems' +require 'rchardet' require 'chronic' require 'csv' require 'highline/import' diff --git a/lib/reckon/csv_parser.rb b/lib/reckon/csv_parser.rb index f91e766..a561d6e 100644 --- a/lib/reckon/csv_parser.rb +++ b/lib/reckon/csv_parser.rb @@ -7,7 +7,7 @@ class CSVParser def initialize(options = {}) self.options = options self.options[:currency] ||= '$' - parse + @csv_data = parse(options[:string] || File.read(options[:file])) filter_csv 
detect_columns end @@ -224,16 +224,25 @@ def columns end end - def parse + def parse(data) + # Use force_encoding to convert the string to utf-8 with as few invalid characters + # as possible. + data.force_encoding(try_encoding(data)) + data = data.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?') + data.sub!("\xEF\xBB\xBF", '') # strip byte order marker, if it exists + rows = [] - data = options[:string] || File.read(options[:file]) - data = data.force_encoding(options[:encoding] || 'BINARY').encode('UTF-8', :invalid => :replace, :undef => :replace, :replace => '?') data.each_line.with_index do |line, i| next if i < (options[:contains_header] || 0) rows << CSV.parse_line(line, col_sep: options[:csv_separator] || ',') end - @csv_data = rows + rows + end + + def try_encoding(data) + cd = CharDet.detect(data) + options[:encoding] || cd['encoding'] || 'BINARY' end @settings = { :testing => false } diff --git a/reckon.gemspec b/reckon.gemspec index 9b44164..74b8bf5 100644 --- a/reckon.gemspec +++ b/reckon.gemspec @@ -19,4 +19,5 @@ Gem::Specification.new do |s| s.add_runtime_dependency "chronic", ">= 0.3.0" s.add_runtime_dependency "highline", ">= 1.5.2" s.add_runtime_dependency "terminal-table", ">= 1.4.2" + s.add_runtime_dependency "rchardet", ">= 1.8.0" end diff --git a/spec/data_fixtures/bom_utf8_file.csv b/spec/data_fixtures/bom_utf8_file.csv new file mode 100644 index 0000000..5ad16fc --- /dev/null +++ b/spec/data_fixtures/bom_utf8_file.csv @@ -0,0 +1 @@ +"Date","Time","TimeZone","Name","Type","Status","Currency","Gross","Fee","Net","From Email Address","To Email Address","Transaction ID","Shipping Address","Address Status","Item Title","Item ID","Shipping and Handling Amount","Insurance Amount","Sales Tax","Option 1 Name","Option 1 Value","Option 2 Name","Option 2 Value","Reference Txn ID","Invoice Number","Custom Number","Quantity","Receipt ID","Balance","Address Line 1","Address Line 
2/District/Neighborhood","Town/City","State/Province/Region/County/Territory/Prefecture/Republic","Zip/Postal Code","Country","Contact Phone Number","Subject","Note","Country Code","Balance Impact" diff --git a/spec/reckon/csv_parser_spec.rb b/spec/reckon/csv_parser_spec.rb index af1f8e5..6c29ea2 100755 --- a/spec/reckon/csv_parser_spec.rb +++ b/spec/reckon/csv_parser_spec.rb @@ -33,16 +33,31 @@ end describe "parse" do + it "should use binary encoding if none specified and chardet fails" do + allow(CharDet).to receive(:detect).and_return({'encoding' => nil}) + app = Reckon::CSVParser.new(:file => File.expand_path(File.join(File.dirname(__FILE__), "..", "data_fixtures", "extratofake.csv"))) + expect(app.try_encoding("foobarbaz")).to eq("BINARY") + end it "should work with foreign character encodings" do app = Reckon::CSVParser.new(:file => File.expand_path(File.join(File.dirname(__FILE__), "..", "data_fixtures", "extratofake.csv"))) app.columns[0][0..2].should == ["Data", "10/31/2012", "11/01/2012"] - app.columns[2].first.should == "Hist?rico" + app.columns[2].first.should == "Histórico" end it "should work with other separators" do Reckon::CSVParser.new(:string => "one;two\nthree;four", :csv_separator => ';').columns.should == [['one', 'three'], ['two', 'four']] end + it 'should parse quoted lines' do + file = %q("30.03.2015";"29.03.2015";"09.04.2015";"BARAUSZAHLUNGSENTGELT";"5266 xxxx xxxx 9454";"";"0";"EUR";"0,00";"EUR";"-3,50";"0") + Reckon::CSVParser.new(string: file, csv_separator: ';', comma_separates_cents: true).columns.length.should == 12 + end + + it 'should parse csv with BOM' do + file = File.expand_path(File.join(File.dirname(__FILE__), "..", "data_fixtures", "bom_utf8_file.csv")) + Reckon::CSVParser.new(file: file).columns.length.should == 41 + end + describe 'file with invalid csv in header' do file = %q( From 0e9e9771e0b4fc2519664fcd95e8d2dcba9dbd2b Mon Sep 17 00:00:00 2001 From: Ben Prew Date: Fri, 31 Jan 2020 15:58:30 -0800 Subject: [PATCH 8/8] 
Minor cleanup, use require_relative where appropriate --- lib/reckon/csv_parser.rb | 2 +- spec/reckon/csv_parser_spec.rb | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/reckon/csv_parser.rb b/lib/reckon/csv_parser.rb index a561d6e..a9407a2 100644 --- a/lib/reckon/csv_parser.rb +++ b/lib/reckon/csv_parser.rb @@ -159,7 +159,7 @@ def detect_sign_column def detect_columns results, found_likely_money_column = evaluate_columns(columns) - self.money_column_indices = [ results.sort { |a, b| b[:money_score] <=> a[:money_score] }.first[:index] ] + self.money_column_indices = [ results.max_by { |n| n[:money_score] }[:index] ] if !found_likely_money_column found_likely_double_money_columns = false diff --git a/spec/reckon/csv_parser_spec.rb b/spec/reckon/csv_parser_spec.rb index 6c29ea2..319a541 100755 --- a/spec/reckon/csv_parser_spec.rb +++ b/spec/reckon/csv_parser_spec.rb @@ -1,9 +1,9 @@ #!/usr/bin/env ruby -# encoding: utf-8 +# coding: utf-8 -require "spec_helper" +require_relative "../spec_helper" require 'rubygems' -require 'reckon' +require_relative '../../lib/reckon' Reckon::CSVParser.settings[:testing] = true