Skip to content

Commit

Permalink
Merge bcb5172 into 3d50e4e
Browse files Browse the repository at this point in the history
  • Loading branch information
blarosen95 committed Nov 10, 2022
2 parents 3d50e4e + bcb5172 commit fbf98d8
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 30 deletions.
29 changes: 29 additions & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# CI workflow: on every pull request, install the system tools and Ruby,
# run the RSpec suite, then hand the coverage report to Coveralls.
on: pull_request

name: Test Coveralls

jobs:

  build:
    name: Build
    runs-on: ubuntu-latest
    steps:

      - name: Checkout copy
        uses: actions/checkout@v3
      - name: Install system dependencies for gem
        # Refresh the package index first: the index baked into the runner
        # image can be stale, which makes the install fail intermittently.
        # apt-get is used instead of apt because apt's CLI is not meant for scripts.
        run: |
          sudo apt-get update
          sudo apt-get -y install poppler-utils tesseract-ocr
      - name: Add OEM english LSTM model
        # Downloads eng.traineddata into the tessdata directory found under /usr.
        # NOTE(review): assumes `find` prints exactly one directory; more than
        # one match would produce an invalid -O path - confirm on the runner image.
        run: sudo wget https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata -O $(sudo find /usr -name tessdata -type d)/eng.traineddata
      - name: Install Ruby
        uses: ruby/setup-ruby@v1
        with:
          # Quoted so YAML keeps the version as a string and not a float.
          ruby-version: "2.7"
      - name: Install dependencies
        run: bundle install
      - name: Run tests
        run: bundle exec rspec
      - name: Coveralls
        # NOTE(review): @master tracks a moving branch; consider pinning to a
        # release tag. Also relies on the action's default report path matching
        # coverage/lcov.info written by spec/spec_helper.rb - confirm.
        uses: coverallsapp/github-action@master
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
3 changes: 3 additions & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ source "https://rubygems.org"
# Specify your gem's dependencies in ocr4pdf.gemspec
gemspec

# Coverage tooling: spec/spec_helper.rb starts SimpleCov and uses the LCOV
# formatter from simplecov-lcov to write coverage/lcov.info.
# NOTE(review): the reason for the "< 0.22.0" upper bound is not recorded
# here - confirm it is still required.
gem "simplecov", ">= 0.18.1", "< 0.22.0"
gem "simplecov-lcov", "~> 0.8.0"

gem "rake", "~> 13.0"

gem "rspec", "~> 3.0"
Expand Down
61 changes: 31 additions & 30 deletions spec/ocr4pdf_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -21,34 +21,35 @@
File.delete(ocr_file)
end

# Runs OCR over a PDF that already has embedded text, writes the result next
# to the fixture, and checks the first text line of page 1 of that result.
it "can create a PDF with OCR text from a PDF with regular embedded text" do
  ocr_file = File.expand_path("spec/fixtures/w4-page-2.ocr.pdf")
  File.binwrite(ocr_file, Ocr4pdf.new("spec/fixtures/w4-page-2.pdf").create_ocr_pdf)

  # Argument-list form: no shell is involved, so the path needs no quoting.
  page_text, = Open3.capture2("pdftotext", "-f", "1", "-l", "1", "-r", "300", ocr_file, "-")
  first_line_text = page_text.lines.first.to_s.strip
  expect(first_line_text).to start_with "Form W-4 (2022)"
ensure
  # Delete the generated file even when the expectation above fails.
  File.delete(ocr_file) if ocr_file && File.exist?(ocr_file)
end

# Runs OCR over a multi-page rasterized PDF, writes the result next to the
# fixture, and checks the first three text lines of page 3 (joined into one line).
it "can create a PDF from a multi-page rasterized PDF" do
  ocr_file = File.expand_path("spec/fixtures/form-w4.ocr.pdf")
  File.binwrite(ocr_file, Ocr4pdf.new("spec/fixtures/form-w4.pdf").create_ocr_pdf)

  # Argument-list form: no shell is involved, so the path needs no quoting.
  page_text, = Open3.capture2("pdftotext", "-f", "3", "-l", "3", "-r", "300", ocr_file, "-")
  third_page_first_text_line = page_text.lines.first(3).join.strip
  # Collapse the three lines into one and squeeze runs of whitespace.
  third_page_first_text_line = third_page_first_text_line.tr("\n", " ").gsub(/\s{2,}/, " ")
  # expect(third_page_first_text_line).to start_with "Form W-4 (2022) Page 3"
  # TODO: This used to work with the exact same Tesseract model, but now it's not seeing the `-` in `W-4` or the ` ` in `Page 3`...
  expect(third_page_first_text_line).to start_with("Form W-4 (2022) Page 3").or start_with "Form W4 (2022) Page3"
  # TODO: If relevant, it's worth noting that using `--oem 1` will see the `-` in `W-4` AND the space in `Page 3` still...

  # TODO: Note, using this same fixture file, the first bits of text in page 1 are OCR'd completely wrong: "Form W=4" or slightly worse...
  # TODO: Really need to train my own LSTM model for it to perform better than this...
ensure
  # Delete the generated file even when the expectation above fails.
  File.delete(ocr_file) if ocr_file && File.exist?(ocr_file)
end
# TODO: Disabling these last tests to speed up workflow runs while configuration is still in progress:
# it "can create a PDF with OCR text from a PDF with regular embedded text" do
# ocr_blob = Ocr4pdf.new("spec/fixtures/w4-page-2.pdf").create_ocr_pdf
# File.binwrite("spec/fixtures/w4-page-2.ocr.pdf", ocr_blob)
# ocr_file = File.expand_path("spec/fixtures/w4-page-2.ocr.pdf")
#
# first_line_text = Open3.capture2("pdftotext -f 1 -l 1 -r 300 #{ocr_file} - | head -n 1")[0].strip
# expect(first_line_text).to start_with "Form W-4 (2022)"
#
# # Delete the file
# File.delete(ocr_file)
# end
#
# it "can create a PDF from a multi-page rasterized PDF" do
# ocr_blob = Ocr4pdf.new("spec/fixtures/form-w4.pdf").create_ocr_pdf
# File.binwrite("spec/fixtures/form-w4.ocr.pdf", ocr_blob)
# ocr_file = File.expand_path("spec/fixtures/form-w4.ocr.pdf")
#
# third_page_first_text_line = Open3.capture2("pdftotext -f 3 -l 3 -r 300 #{ocr_file} - | head -n 3")[0].strip
# third_page_first_text_line = third_page_first_text_line.tr("\n", " ").gsub(/\s{2,}/, " ")
# # expect(third_page_first_text_line).to start_with "Form W-4 (2022) Page 3"
# # TODO: This used to work with the exact same Tesseract model, but now it's not seeing the `-` in `W-4` or the ` ` in `Page 3`...
# expect(third_page_first_text_line).to start_with("Form W-4 (2022) Page 3").or start_with "Form W4 (2022) Page3"
# # TODO: If relevant, it's worth noting that using `--oem 1` will see the `-` in `W-4` AND the space in `Page 3` still...
#
# # TODO: Note, using this same fixture file, the first bits of text in page 1 are OCR'd completely wrong: "Form W=4" or slightly worse...
# # TODO: Really need to train my own LSTM model for it to perform better than this...
#
# # Delete the file
# File.delete(ocr_file)
# end
end
18 changes: 18 additions & 0 deletions spec/spec_helper.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,23 @@
# frozen_string_literal: true

require "simplecov"

# Coverage setup. This runs before `require "ocr4pdf"` further down in this file.
require "simplecov-lcov"

# Emit one combined LCOV report at coverage/lcov.info.
SimpleCov::Formatter::LcovFormatter.config do |c|
  c.report_with_single_file = true
  c.single_report_path = "coverage/lcov.info"
end

SimpleCov.start do
  formatter SimpleCov::Formatter::LcovFormatter

  # Filter out files whose path contains "spec", unless the path also contains "fixture".
  add_filter do |source_file|
    source_file.filename.include?("spec") && !source_file.filename.include?("fixture")
  end
  # The dot is escaped so only a literal ".bundle" directory is filtered.
  add_filter %r{/\.bundle/}
end

require "ocr4pdf"

RSpec.configure do |config|
Expand Down

0 comments on commit fbf98d8

Please sign in to comment.