Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

bug: invalid header lines should be ignored, not parsed. #78

Merged
merged 8 commits into from
Feb 1, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion .ruby-version
Original file line number Diff line number Diff line change
@@ -1 +1 @@
ruby-2.2.0
2.0.0-p648
11 changes: 8 additions & 3 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
language: ruby
rvm:
- 2.0.0
- 2.3.3
- 2.4.0
# Mac High Sierra
- 2.0.0-p648
# Mac Mojave
- 2.3.7
# Ubuntu 19.10
- 2.5
# Mac Catalina
- 2.6
script: "bundle exec rake"
1 change: 0 additions & 1 deletion Gemfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
source "http://rubygems.org"

gemspec

gem 'rake'
44 changes: 30 additions & 14 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -3,32 +3,48 @@ PATH
specs:
reckon (0.4.4)
chronic (>= 0.3.0)
fastercsv (>= 1.5.1)
highline (>= 1.5.2)
rchardet (>= 1.8.0)
terminal-table (>= 1.4.2)

GEM
remote: http://rubygems.org/
specs:
chronic (0.10.2)
diff-lcs (1.1.3)
fastercsv (1.5.5)
highline (1.6.21)
rake (10.0.4)
rspec (2.11.0)
rspec-core (~> 2.11.0)
rspec-expectations (~> 2.11.0)
rspec-mocks (~> 2.11.0)
rspec-core (2.11.1)
rspec-expectations (2.11.2)
diff-lcs (~> 1.1.3)
rspec-mocks (2.11.1)
terminal-table (1.4.5)
coderay (1.1.2)
diff-lcs (1.3)
highline (2.0.3)
method_source (0.9.2)
pry (0.12.2)
coderay (~> 1.1.0)
method_source (~> 0.9.0)
rake (12.3.3)
rchardet (1.8.0)
rspec (3.9.0)
rspec-core (~> 3.9.0)
rspec-expectations (~> 3.9.0)
rspec-mocks (~> 3.9.0)
rspec-core (3.9.1)
rspec-support (~> 3.9.1)
rspec-expectations (3.9.0)
diff-lcs (>= 1.2.0, < 2.0)
rspec-support (~> 3.9.0)
rspec-mocks (3.9.1)
diff-lcs (>= 1.2.0, < 2.0)
rspec-support (~> 3.9.0)
rspec-support (3.9.2)
terminal-table (1.8.0)
unicode-display_width (~> 1.1, >= 1.1.1)
unicode-display_width (1.6.1)

PLATFORMS
ruby

DEPENDENCIES
pry (>= 0.12.2)
rake
reckon!
rspec (>= 1.2.9)

BUNDLED WITH
1.17.3
12 changes: 4 additions & 8 deletions lib/reckon.rb
Original file line number Diff line number Diff line change
@@ -1,19 +1,15 @@
#!/usr/bin/env ruby

require 'rubygems'
if RUBY_VERSION =~ /^1\.9/ || RUBY_VERSION =~ /^2/
require 'csv'
else
require 'fastercsv'
end
require 'rchardet'
require 'chronic'
require 'csv'
require 'highline/import'
require 'optparse'
require 'chronic'
require 'time'
require 'terminal-table'
require 'time'

require File.expand_path(File.join(File.dirname(__FILE__), "reckon", "app"))
require File.expand_path(File.join(File.dirname(__FILE__), "reckon", "ledger_parser"))
require File.expand_path(File.join(File.dirname(__FILE__), "reckon", "csv_parser"))
require File.expand_path(File.join(File.dirname(__FILE__), "reckon", "money"))

54 changes: 27 additions & 27 deletions lib/reckon/csv_parser.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#coding: utf-8
require 'pp'

module Reckon
class CSVParser
Expand All @@ -8,7 +7,7 @@ class CSVParser
def initialize(options = {})
self.options = options
self.options[:currency] ||= '$'
parse
@csv_data = parse(options[:string] || File.read(options[:file]))
filter_csv
detect_columns
end
Expand Down Expand Up @@ -160,7 +159,7 @@ def detect_sign_column

def detect_columns
results, found_likely_money_column = evaluate_columns(columns)
self.money_column_indices = [ results.sort { |a, b| b[:money_score] <=> a[:money_score] }.first[:index] ]
self.money_column_indices = [ results.max_by { |n| n[:money_score] }[:index] ]

if !found_likely_money_column
found_likely_double_money_columns = false
Expand Down Expand Up @@ -192,20 +191,17 @@ def detect_columns
end
end

results.reject! {|i| money_column_indices.include?(i[:index]) }
self.date_column_index = results.sort { |a, b| b[:date_score] <=> a[:date_score] }.first[:index]
results.reject! {|i| i[:index] == date_column_index }
@date_column = DateColumn.new( columns[ self.date_column_index ], @options )
results.reject! { |i| money_column_indices.include?(i[:index]) }
# sort by highest score followed by lowest index
@date_column_index = results.max_by { |n| [n[:date_score], -n[:index]] }[:index]
results.reject! { |i| i[:index] == date_column_index }
@date_column = DateColumn.new(columns[date_column_index], @options)

if ( money_column_indices.length == 1 )
@money_column = MoneyColumn.new( columns[money_column_indices[0]],
@options )
@money_column = MoneyColumn.new(columns[money_column_indices[0]], @options)
if money_column_indices.length == 1
detect_sign_column if @money_column.positive?
else
@money_column = MoneyColumn.new( columns[money_column_indices[0]],
@options )
@money_column.merge!(
MoneyColumn.new( columns[money_column_indices[1]], @options ) )
@money_column.merge! MoneyColumn.new(columns[money_column_indices[1]], @options)
end

self.description_column_indices = results.map { |i| i[:index] }
Expand All @@ -228,21 +224,25 @@ def columns
end
end

def parse
data = options[:string] || File.read(options[:file])

if RUBY_VERSION =~ /^1\.9/ || RUBY_VERSION =~ /^2/
data = data.force_encoding(options[:encoding] || 'BINARY').encode('UTF-8', :invalid => :replace, :undef => :replace, :replace => '?')
csv_engine = CSV
else
csv_engine = FasterCSV
def parse(data)
# Use force_encoding to convert the string to utf-8 with as few invalid characters
# as possible.
data.force_encoding(try_encoding(data))
data = data.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
data.sub!("\xEF\xBB\xBF", '') # strip byte order marker, if it exists

rows = []
data.each_line.with_index do |line, i|
next if i < (options[:contains_header] || 0)
rows << CSV.parse_line(line, col_sep: options[:csv_separator] || ',')
end

@csv_data = csv_engine.parse data.strip, :col_sep => options[:csv_separator] || ','
if options[:contains_header]
options[:contains_header].times { csv_data.shift }
end
csv_data
rows
end

# Chooses the encoding name used to interpret raw CSV input.
#
# Precedence: an explicit options[:encoding] wins; otherwise the 'encoding'
# entry reported by CharDet.detect is used; 'BINARY' is the fallback when
# neither yields a value.
#
# @param data [String] raw CSV contents to inspect
# @return [String] encoding name (the caller passes it to String#force_encoding)
def try_encoding(data)
  # Only run charset detection when the caller has not already chosen an
  # encoding: the detection result would be discarded anyway.
  options[:encoding] || CharDet.detect(data)['encoding'] || 'BINARY'
end

@settings = { :testing => false }
Expand Down
5 changes: 2 additions & 3 deletions reckon.gemspec
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
# -*- encoding: utf-8 -*-
$:.push File.expand_path("../lib", __FILE__)

Gem::Specification.new do |s|
Expand All @@ -16,9 +15,9 @@ Gem::Specification.new do |s|
s.require_paths = ["lib"]

s.add_development_dependency "rspec", ">= 1.2.9"
s.add_runtime_dependency "fastercsv", ">= 1.5.1"
s.add_development_dependency "pry", ">= 0.12.2"
s.add_runtime_dependency "chronic", ">= 0.3.0"
s.add_runtime_dependency "highline", ">= 1.5.2"
s.add_runtime_dependency "terminal-table", ">= 1.4.2"
s.add_runtime_dependency "rchardet", ">= 1.8.0"
end

1 change: 1 addition & 0 deletions spec/data_fixtures/bom_utf8_file.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"Date","Time","TimeZone","Name","Type","Status","Currency","Gross","Fee","Net","From Email Address","To Email Address","Transaction ID","Shipping Address","Address Status","Item Title","Item ID","Shipping and Handling Amount","Insurance Amount","Sales Tax","Option 1 Name","Option 1 Value","Option 2 Name","Option 2 Value","Reference Txn ID","Invoice Number","Custom Number","Quantity","Receipt ID","Balance","Address Line 1","Address Line 2/District/Neighborhood","Town/City","State/Province/Region/County/Territory/Prefecture/Republic","Zip/Postal Code","Country","Contact Phone Number","Subject","Note","Country Code","Balance Impact"
16 changes: 9 additions & 7 deletions spec/reckon/app_spec.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#!/usr/bin/env ruby
# encoding: utf-8

require "spec_helper"
require 'rubygems'
Expand All @@ -8,10 +7,10 @@
describe Reckon::App do
context 'with chase csv input' do
before do
@chase = Reckon::App.new(:string => BANK_CSV)
@chase.learn_from( BANK_LEDGER )
@chase = Reckon::App.new(string: BANK_CSV)
@chase.learn_from(BANK_LEDGER)
@rows = []
@chase.each_row_backwards { |row| @rows.push( row ) }
@chase.each_row_backwards { |row| @rows.push(row) }
end

describe "each_row_backwards" do
Expand All @@ -27,7 +26,11 @@

describe "weighted_account_match" do
it "should guess the correct account" do
@chase.weighted_account_match( @rows[7] ).first[:account].should == "Expenses:Books"
row = @rows.find { |n| n[:description] =~ /Book Store/ }

result = @chase.weighted_account_match(row).first
result[:account].should == "Expenses:Books"
result[:cosine].should > 0.0
end
end
end
Expand Down Expand Up @@ -95,6 +98,5 @@
2004/05/27 Book Store
Expenses:Books $20.00
Liabilities:MasterCard
LEDGER

LEDGER
end
41 changes: 37 additions & 4 deletions spec/reckon/csv_parser_spec.rb
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
#!/usr/bin/env ruby
# encoding: utf-8
# coding: utf-8

require "spec_helper"
require_relative "../spec_helper"
require 'rubygems'
require 'reckon'
require_relative '../../lib/reckon'

Reckon::CSVParser.settings[:testing] = true

Expand Down Expand Up @@ -33,15 +33,48 @@
end

describe "parse" do
it "should use binary encoding if none specified and chardet fails" do
allow(CharDet).to receive(:detect).and_return({'encoding' => nil})
app = Reckon::CSVParser.new(:file => File.expand_path(File.join(File.dirname(__FILE__), "..", "data_fixtures", "extratofake.csv")))
expect(app.try_encoding("foobarbaz")).to eq("BINARY")
end
it "should work with foreign character encodings" do
app = Reckon::CSVParser.new(:file => File.expand_path(File.join(File.dirname(__FILE__), "..", "data_fixtures", "extratofake.csv")))
app.columns[0][0..2].should == ["Data", "10/31/2012", "11/01/2012"]
app.columns[2].first.should == "Hist?rico"
app.columns[2].first.should == "Histórico"
end

it "should work with other separators" do
Reckon::CSVParser.new(:string => "one;two\nthree;four", :csv_separator => ';').columns.should == [['one', 'three'], ['two', 'four']]
end

it 'should parse quoted lines' do
file = %q("30.03.2015";"29.03.2015";"09.04.2015";"BARAUSZAHLUNGSENTGELT";"5266 xxxx xxxx 9454";"";"0";"EUR";"0,00";"EUR";"-3,50";"0")
Reckon::CSVParser.new(string: file, csv_separator: ';', comma_separates_cents: true).columns.length.should == 12
end

it 'should parse csv with BOM' do
file = File.expand_path(File.join(File.dirname(__FILE__), "..", "data_fixtures", "bom_utf8_file.csv"))
Reckon::CSVParser.new(file: file).columns.length.should == 41
end

describe 'file with invalid csv in header' do
file = %q(

="0234500012345678";21/11/2015;19/02/2016;36;19/02/2016;1234,37 EUR

Date de l'opération;Libellé;Détail de l'écriture;Montant de l'opération;Devise
19/02/2016;VIR RECU 508160;VIR RECU 1234567834S DE: Francois REF: 123457891234567894561231 PROVENANCE: DE Allemagne ;50,00;EUR
18/02/2016;COTISATION JAZZ;COTISATION JAZZ ;-8,10;EUR
)
it 'should ignore invalid header lines' do
Reckon::CSVParser.new(string: file, contains_header: 4)
end

it 'should fail' do
expect { Reckon::CSVParser.new(string: file, contains_header: 1) }.to raise_error(CSV::MalformedCSVError)
end
end
end

describe "columns" do
Expand Down