Merge bcb5172 into 3d50e4e

blarosen95 · Nov 10, 2022 · fbf98d8 · fbf98d8
2 parents 3d50e4e + bcb5172
commit fbf98d8
Show file tree

Hide file tree

Showing 4 changed files with 81 additions and 30 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -0,0 +1,29 @@
+on: pull_request
+
+name: Test Coveralls
+
+jobs:
+
+  build:
+    name: Build
+    runs-on: ubuntu-latest
+    steps:
+
+      - name: Checkout copy
+        uses: actions/checkout@v3
+      - name: Install system dependencies for gem
+        run: sudo apt -y install poppler-utils tesseract-ocr
+      - name: Add OEM english LSTM model
+        run: sudo wget https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata -O $(sudo find /usr -name tessdata -type d)/eng.traineddata
+      - name: Install Ruby
+        uses: ruby/setup-ruby@v1
+        with:
+          ruby-version: 2.7
+      - name: Install dependencies
+        run: bundle install
+      - name: Run tests
+        run: bundle exec rspec
+      - name: Coveralls
+        uses: coverallsapp/github-action@master
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/Gemfile b/Gemfile
@@ -5,6 +5,9 @@ source "https://rubygems.org"
 # Specify your gem's dependencies in ocr4pdf.gemspec
 gemspec
 
+gem "simplecov", ">= 0.18.1", "< 0.22.0"
+gem "simplecov-lcov", "~> 0.8.0"
+
 gem "rake", "~> 13.0"
 
 gem "rspec", "~> 3.0"

diff --git a/spec/ocr4pdf_spec.rb b/spec/ocr4pdf_spec.rb
@@ -21,34 +21,35 @@
     File.delete(ocr_file)
   end
 
-  it "can create a PDF with OCR text from a PDF with regular embedded text" do
-    ocr_blob = Ocr4pdf.new("spec/fixtures/w4-page-2.pdf").create_ocr_pdf
-    File.binwrite("spec/fixtures/w4-page-2.ocr.pdf", ocr_blob)
-    ocr_file = File.expand_path("spec/fixtures/w4-page-2.ocr.pdf")
-
-    first_line_text = Open3.capture2("pdftotext -f 1 -l 1 -r 300 #{ocr_file} - | head -n 1")[0].strip
-    expect(first_line_text).to start_with "Form W-4 (2022)"
-
-    # Delete the file
-    File.delete(ocr_file)
-  end
-
-  it "can create a PDF from a multi-page rasterized PDF" do
-    ocr_blob = Ocr4pdf.new("spec/fixtures/form-w4.pdf").create_ocr_pdf
-    File.binwrite("spec/fixtures/form-w4.ocr.pdf", ocr_blob)
-    ocr_file = File.expand_path("spec/fixtures/form-w4.ocr.pdf")
-
-    third_page_first_text_line = Open3.capture2("pdftotext -f 3 -l 3 -r 300 #{ocr_file} - | head -n 3")[0].strip
-    third_page_first_text_line = third_page_first_text_line.tr("\n", " ").gsub(/\s{2,}/, " ")
-    # expect(third_page_first_text_line).to start_with "Form W-4 (2022) Page 3"
-    # TODO: This used to work with the exact same Tesseract model, but now it's not seeing the `-` in `W-4` or the ` ` in `Page 3`...
-    expect(third_page_first_text_line).to start_with("Form W-4 (2022) Page 3").or start_with "Form W4 (2022) Page3"
-    # TODO: If relevant, it's worth noting that using `--oem 1` will see the `-` in `W-4` AND the space in `Page 3` still...
-
-    # TODO: Note, using this same fixture file, the first bits of text in page 1 are OCR'd completely wrong: "Form W=4" or slightly worse...
-    # TODO: Really need to train my own LSTM model for it to perform better than this...
-
-    # Delete the file
-    File.delete(ocr_file)
-  end
+  # TODO: Disabling these last tests to speed up workflow runs while configuration still in progress:
+  # it "can create a PDF with OCR text from a PDF with regular embedded text" do
+  #   ocr_blob = Ocr4pdf.new("spec/fixtures/w4-page-2.pdf").create_ocr_pdf
+  #   File.binwrite("spec/fixtures/w4-page-2.ocr.pdf", ocr_blob)
+  #   ocr_file = File.expand_path("spec/fixtures/w4-page-2.ocr.pdf")
+  #
+  #   first_line_text = Open3.capture2("pdftotext -f 1 -l 1 -r 300 #{ocr_file} - | head -n 1")[0].strip
+  #   expect(first_line_text).to start_with "Form W-4 (2022)"
+  #
+  #   # Delete the file
+  #   File.delete(ocr_file)
+  # end
+  #
+  # it "can create a PDF from a multi-page rasterized PDF" do
+  #   ocr_blob = Ocr4pdf.new("spec/fixtures/form-w4.pdf").create_ocr_pdf
+  #   File.binwrite("spec/fixtures/form-w4.ocr.pdf", ocr_blob)
+  #   ocr_file = File.expand_path("spec/fixtures/form-w4.ocr.pdf")
+  #
+  #   third_page_first_text_line = Open3.capture2("pdftotext -f 3 -l 3 -r 300 #{ocr_file} - | head -n 3")[0].strip
+  #   third_page_first_text_line = third_page_first_text_line.tr("\n", " ").gsub(/\s{2,}/, " ")
+  #   # expect(third_page_first_text_line).to start_with "Form W-4 (2022) Page 3"
+  #   # TODO: This used to work with the exact same Tesseract model, but now it's not seeing the `-` in `W-4` or the ` ` in `Page 3`...
+  #   expect(third_page_first_text_line).to start_with("Form W-4 (2022) Page 3").or start_with "Form W4 (2022) Page3"
+  #   # TODO: If relevant, it's worth noting that using `--oem 1` will see the `-` in `W-4` AND the space in `Page 3` still...
+  #
+  #   # TODO: Note, using this same fixture file, the first bits of text in page 1 are OCR'd completely wrong: "Form W=4" or slightly worse...
+  #   # TODO: Really need to train my own LSTM model for it to perform better than this...
+  #
+  #   # Delete the file
+  #   File.delete(ocr_file)
+  # end
 end
diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb
@@ -1,5 +1,23 @@
 # frozen_string_literal: true
 
+require "simplecov"
+
+SimpleCov.start do
+  require "simplecov-lcov"
+
+  SimpleCov::Formatter::LcovFormatter.config do |c|
+    c.report_with_single_file = true
+    c.single_report_path = "coverage/lcov.info"
+  end
+
+  formatter SimpleCov::Formatter::LcovFormatter
+
+  add_filter do |source_file|
+    source_file.filename.include?("spec") && !source_file.filename.include?("fixture")
+  end
+  add_filter %r{/.bundle/}
+end
+
 require "ocr4pdf"
 
 RSpec.configure do |config|