From 7938b4f887ba83bac3f9425e24ea3f0b9022cce8 Mon Sep 17 00:00:00 2001 From: Blake Rosenberg Date: Thu, 10 Nov 2022 10:05:31 -0500 Subject: [PATCH 1/7] test: adds coverage reporting ala lcov --- Gemfile | 3 +++ spec/spec_helper.rb | 9 +++++++++ 2 files changed, 12 insertions(+) diff --git a/Gemfile b/Gemfile index 5f34b77..520895a 100644 --- a/Gemfile +++ b/Gemfile @@ -5,6 +5,9 @@ source "https://rubygems.org" # Specify your gem's dependencies in ocr4pdf.gemspec gemspec +gem "simplecov", ">= 0.18.1", "< 0.22.0" +gem "simplecov-lcov", "~> 0.8.0" + gem "rake", "~> 13.0" gem "rspec", "~> 3.0" diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index 1d708e4..3b34285 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -1,5 +1,14 @@ # frozen_string_literal: true +require "simplecov" + +SimpleCov.start do + add_filter do |source_file| + source_file.filename.include?("spec") && !source_file.filename.include?("fixture") + end + add_filter %r{/.bundle/} +end + require "ocr4pdf" RSpec.configure do |config| From eea02cf0eecb9159df27c865b6e423931f733cf4 Mon Sep 17 00:00:00 2001 From: Blake Rosenberg Date: Thu, 10 Nov 2022 10:27:09 -0500 Subject: [PATCH 2/7] workflows: adds RSpec + Coveralls reporting --- .github/workflows/main.yml | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 .github/workflows/main.yml diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 0000000..917bed2 --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,23 @@ +on: pull_request + +name: Test Coveralls + +jobs: + + build: + name: Build + runs-on: ubuntu-latest + steps: + + - name: Checkout copy + uses: actions/checkout@v3 + - name: Install system dependencies + run: sudo apt -y install poppler-utils tesseract-ocr + - name: Add OEM english LSTM model + run: wget https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata -O $(sudo find /usr -name tessdata -type d)/eng.traineddata + - name: Run tests + run: bundle exec rspec + - name: Coveralls + uses: coverallsapp/github-action@master + with: + github-token: ${{ secrets.GITHUB_TOKEN }} \ No newline at end of file From 00ad7fb146882e37bd457ffc9c7344cc4019a72e Mon Sep 17 00:00:00 2001 From: Blake Rosenberg Date: Thu, 10 Nov 2022 10:30:08 -0500 Subject: [PATCH 3/7] workflows: fixes tessdata write permissions error --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 917bed2..fb80876 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -14,7 +14,7 @@ jobs: - name: Install system dependencies run: sudo apt -y install poppler-utils tesseract-ocr - name: Add OEM english LSTM model - run: wget https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata -O $(sudo find /usr -name tessdata -type d)/eng.traineddata + run: sudo wget https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata -O $(sudo find /usr -name tessdata -type d)/eng.traineddata - name: Run tests run: bundle exec rspec - name: Coveralls From 14732986118445a92b357c3e28510f538401a0ec Mon Sep 17 00:00:00 2001 From: Blake Rosenberg Date: Thu, 10 Nov 2022 10:35:35 -0500 Subject: [PATCH 4/7] workflows: fixes ruby not installed --- .github/workflows/main.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index fb80876..690b012 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -11,10 +11,14 @@ jobs: - name: Checkout copy uses: actions/checkout@v3 - - name: Install system dependencies + - name: Install system dependencies for gem run: sudo apt -y install poppler-utils tesseract-ocr - name: Add OEM english LSTM model run: sudo wget https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata -O $(sudo find /usr -name tessdata -type d)/eng.traineddata + - name: Install Ruby + uses: ruby/setup-ruby@v1 + with: + ruby-version: 2.7 - name: Run tests run: bundle exec rspec - name: Coveralls From 420ffa81ebae68144c62e2c8c35974f7279e8fda Mon Sep 17 00:00:00 2001 From: Blake Rosenberg Date: Thu, 10 Nov 2022 10:39:31 -0500 Subject: [PATCH 5/7] workflows: runs bundle install --- .github/workflows/main.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 690b012..9de4dd5 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -19,6 +19,8 @@ jobs: uses: ruby/setup-ruby@v1 with: ruby-version: 2.7 + - name: Install dependencies + run: bundle install - name: Run tests run: bundle exec rspec - name: Coveralls From bcb5172ceded6b6dc7d1bcc1d377d04dcf5f5309 Mon Sep 17 00:00:00 2001 From: Blake Rosenberg Date: Thu, 10 Nov 2022 10:57:28 -0500 Subject: [PATCH 6/7] test: disables longest tests and formats report w/ lcov --- spec/ocr4pdf_spec.rb | 61 ++++++++++++++++++++++---------------------- spec/spec_helper.rb | 9 +++++++ 2 files changed, 40 insertions(+), 30 deletions(-) diff --git a/spec/ocr4pdf_spec.rb b/spec/ocr4pdf_spec.rb index d2948f9..7027855 100644 --- a/spec/ocr4pdf_spec.rb +++ b/spec/ocr4pdf_spec.rb @@ -21,34 +21,35 @@ File.delete(ocr_file) end - it "can create a PDF with OCR text from a PDF with regular embedded text" do - ocr_blob = Ocr4pdf.new("spec/fixtures/w4-page-2.pdf").create_ocr_pdf - File.binwrite("spec/fixtures/w4-page-2.ocr.pdf", ocr_blob) - ocr_file = File.expand_path("spec/fixtures/w4-page-2.ocr.pdf") - - first_line_text = Open3.capture2("pdftotext -f 1 -l 1 -r 300 #{ocr_file} - | head -n 1")[0].strip - expect(first_line_text).to start_with "Form W-4 (2022)" - - # Delete the file - File.delete(ocr_file) - end - - it "can create a PDF from a multi-page rasterized PDF" do - ocr_blob = Ocr4pdf.new("spec/fixtures/form-w4.pdf").create_ocr_pdf - File.binwrite("spec/fixtures/form-w4.ocr.pdf", ocr_blob) - ocr_file = File.expand_path("spec/fixtures/form-w4.ocr.pdf") - - third_page_first_text_line = Open3.capture2("pdftotext -f 3 -l 3 -r 300 #{ocr_file} - | head -n 3")[0].strip - third_page_first_text_line = third_page_first_text_line.tr("\n", " ").gsub(/\s{2,}/, " ") - # expect(third_page_first_text_line).to start_with "Form W-4 (2022) Page 3" - # TODO: This used to work with the exact same Tesseract model, but now it's not seeing the `-` in `W-4` or the ` ` in `Page 3`... - expect(third_page_first_text_line).to start_with("Form W-4 (2022) Page 3").or start_with "Form W4 (2022) Page3" - # TODO: If relevant, it's worth noting that using `--oem 1` will see the `-` in `W-4` AND the space in `Page 3` still... - - # TODO: Note, using this same fixture file, the first bits of text in page 1 are OCR'd completely wrong: "Form W=4" or slightly worse... - # TODO: Really need to train my own LSTM model for it to perform better than this... - - # Delete the file - File.delete(ocr_file) - end + # TODO: Disabling these last tests to speed up workflow runs while configuration still in progress: + # it "can create a PDF with OCR text from a PDF with regular embedded text" do + # ocr_blob = Ocr4pdf.new("spec/fixtures/w4-page-2.pdf").create_ocr_pdf + # File.binwrite("spec/fixtures/w4-page-2.ocr.pdf", ocr_blob) + # ocr_file = File.expand_path("spec/fixtures/w4-page-2.ocr.pdf") + # + # first_line_text = Open3.capture2("pdftotext -f 1 -l 1 -r 300 #{ocr_file} - | head -n 1")[0].strip + # expect(first_line_text).to start_with "Form W-4 (2022)" + # + # # Delete the file + # File.delete(ocr_file) + # end + # + # it "can create a PDF from a multi-page rasterized PDF" do + # ocr_blob = Ocr4pdf.new("spec/fixtures/form-w4.pdf").create_ocr_pdf + # File.binwrite("spec/fixtures/form-w4.ocr.pdf", ocr_blob) + # ocr_file = File.expand_path("spec/fixtures/form-w4.ocr.pdf") + # + # third_page_first_text_line = Open3.capture2("pdftotext -f 3 -l 3 -r 300 #{ocr_file} - | head -n 3")[0].strip + # third_page_first_text_line = third_page_first_text_line.tr("\n", " ").gsub(/\s{2,}/, " ") + # # expect(third_page_first_text_line).to start_with "Form W-4 (2022) Page 3" + # # TODO: This used to work with the exact same Tesseract model, but now it's not seeing the `-` in `W-4` or the ` ` in `Page 3`... + # expect(third_page_first_text_line).to start_with("Form W-4 (2022) Page 3").or start_with "Form W4 (2022) Page3" + # # TODO: If relevant, it's worth noting that using `--oem 1` will see the `-` in `W-4` AND the space in `Page 3` still... + # + # # TODO: Note, using this same fixture file, the first bits of text in page 1 are OCR'd completely wrong: "Form W=4" or slightly worse... + # # TODO: Really need to train my own LSTM model for it to perform better than this... + # + # # Delete the file + # File.delete(ocr_file) + # end end diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index 3b34285..52fa0e6 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -3,6 +3,15 @@ require "simplecov" SimpleCov.start do + require "simplecov-lcov" + + SimpleCov::Formatter::LcovFormatter.config do |c| + c.report_with_single_file = true + c.single_report_path = "coverage/lcov.info" + end + + formatter SimpleCov::Formatter::LcovFormatter + add_filter do |source_file| source_file.filename.include?("spec") && !source_file.filename.include?("fixture") end From eaf8492f8635ec475a21671456423d6bf3738fbb Mon Sep 17 00:00:00 2001 From: Blake Rosenberg Date: Thu, 10 Nov 2022 11:06:41 -0500 Subject: [PATCH 7/7] test: re-enables all tests --- spec/ocr4pdf_spec.rb | 61 ++++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 31 deletions(-) diff --git a/spec/ocr4pdf_spec.rb b/spec/ocr4pdf_spec.rb index 7027855..d2948f9 100644 --- a/spec/ocr4pdf_spec.rb +++ b/spec/ocr4pdf_spec.rb @@ -21,35 +21,34 @@ File.delete(ocr_file) end - # TODO: Disabling these last tests to speed up workflow runs while configuration still in progress: - # it "can create a PDF with OCR text from a PDF with regular embedded text" do - # ocr_blob = Ocr4pdf.new("spec/fixtures/w4-page-2.pdf").create_ocr_pdf - # File.binwrite("spec/fixtures/w4-page-2.ocr.pdf", ocr_blob) - # ocr_file = File.expand_path("spec/fixtures/w4-page-2.ocr.pdf") - # - # first_line_text = Open3.capture2("pdftotext -f 1 -l 1 -r 300 #{ocr_file} - | head -n 1")[0].strip - # expect(first_line_text).to start_with "Form W-4 (2022)" - # - # # Delete the file - # File.delete(ocr_file) - # end - # - # it "can create a PDF from a multi-page rasterized PDF" do - # ocr_blob = Ocr4pdf.new("spec/fixtures/form-w4.pdf").create_ocr_pdf - # File.binwrite("spec/fixtures/form-w4.ocr.pdf", ocr_blob) - # ocr_file = File.expand_path("spec/fixtures/form-w4.ocr.pdf") - # - # third_page_first_text_line = Open3.capture2("pdftotext -f 3 -l 3 -r 300 #{ocr_file} - | head -n 3")[0].strip - # third_page_first_text_line = third_page_first_text_line.tr("\n", " ").gsub(/\s{2,}/, " ") - # # expect(third_page_first_text_line).to start_with "Form W-4 (2022) Page 3" - # # TODO: This used to work with the exact same Tesseract model, but now it's not seeing the `-` in `W-4` or the ` ` in `Page 3`... - # expect(third_page_first_text_line).to start_with("Form W-4 (2022) Page 3").or start_with "Form W4 (2022) Page3" - # # TODO: If relevant, it's worth noting that using `--oem 1` will see the `-` in `W-4` AND the space in `Page 3` still... - # - # # TODO: Note, using this same fixture file, the first bits of text in page 1 are OCR'd completely wrong: "Form W=4" or slightly worse... - # # TODO: Really need to train my own LSTM model for it to perform better than this... - # - # # Delete the file - # File.delete(ocr_file) - # end + it "can create a PDF with OCR text from a PDF with regular embedded text" do + ocr_blob = Ocr4pdf.new("spec/fixtures/w4-page-2.pdf").create_ocr_pdf + File.binwrite("spec/fixtures/w4-page-2.ocr.pdf", ocr_blob) + ocr_file = File.expand_path("spec/fixtures/w4-page-2.ocr.pdf") + + first_line_text = Open3.capture2("pdftotext -f 1 -l 1 -r 300 #{ocr_file} - | head -n 1")[0].strip + expect(first_line_text).to start_with "Form W-4 (2022)" + + # Delete the file + File.delete(ocr_file) + end + + it "can create a PDF from a multi-page rasterized PDF" do + ocr_blob = Ocr4pdf.new("spec/fixtures/form-w4.pdf").create_ocr_pdf + File.binwrite("spec/fixtures/form-w4.ocr.pdf", ocr_blob) + ocr_file = File.expand_path("spec/fixtures/form-w4.ocr.pdf") + + third_page_first_text_line = Open3.capture2("pdftotext -f 3 -l 3 -r 300 #{ocr_file} - | head -n 3")[0].strip + third_page_first_text_line = third_page_first_text_line.tr("\n", " ").gsub(/\s{2,}/, " ") + # expect(third_page_first_text_line).to start_with "Form W-4 (2022) Page 3" + # TODO: This used to work with the exact same Tesseract model, but now it's not seeing the `-` in `W-4` or the ` ` in `Page 3`... + expect(third_page_first_text_line).to start_with("Form W-4 (2022) Page 3").or start_with "Form W4 (2022) Page3" + # TODO: If relevant, it's worth noting that using `--oem 1` will see the `-` in `W-4` AND the space in `Page 3` still... + + # TODO: Note, using this same fixture file, the first bits of text in page 1 are OCR'd completely wrong: "Form W=4" or slightly worse... + # TODO: Really need to train my own LSTM model for it to perform better than this... + + # Delete the file + File.delete(ocr_file) + end end