From 7938b4f887ba83bac3f9425e24ea3f0b9022cce8 Mon Sep 17 00:00:00 2001
From: Blake Rosenberg <blakerosenberg@macroplant.com>
Date: Thu, 10 Nov 2022 10:05:31 -0500
Subject: [PATCH 1/7] test: adds coverage reporting a la lcov

---
 Gemfile             | 3 +++
 spec/spec_helper.rb | 9 +++++++++
 2 files changed, 12 insertions(+)

diff --git a/Gemfile b/Gemfile
index 5f34b77..520895a 100644
--- a/Gemfile
+++ b/Gemfile
@@ -5,6 +5,9 @@ source "https://rubygems.org"
 # Specify your gem's dependencies in ocr4pdf.gemspec
 gemspec
 
+gem "simplecov", ">= 0.18.1", "< 0.22.0"
+gem "simplecov-lcov", "~> 0.8.0"
+
 gem "rake", "~> 13.0"
 
 gem "rspec", "~> 3.0"
diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb
index 1d708e4..3b34285 100644
--- a/spec/spec_helper.rb
+++ b/spec/spec_helper.rb
@@ -1,5 +1,14 @@
 # frozen_string_literal: true
 
+require "simplecov"
+
+SimpleCov.start do
+  add_filter do |source_file|
+    source_file.filename.include?("spec") && !source_file.filename.include?("fixture")
+  end
+  add_filter %r{/.bundle/}
+end
+
 require "ocr4pdf"
 
 RSpec.configure do |config|

From eea02cf0eecb9159df27c865b6e423931f733cf4 Mon Sep 17 00:00:00 2001
From: Blake Rosenberg <blakerosenberg@macroplant.com>
Date: Thu, 10 Nov 2022 10:27:09 -0500
Subject: [PATCH 2/7] workflows: adds RSpec + Coveralls reporting

---
 .github/workflows/main.yml | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100644 .github/workflows/main.yml

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
new file mode 100644
index 0000000..917bed2
--- /dev/null
+++ b/.github/workflows/main.yml
@@ -0,0 +1,23 @@
+on: pull_request
+
+name: Test Coveralls
+
+jobs:
+
+  build:
+    name: Build
+    runs-on: ubuntu-latest
+    steps:
+
+      - name: Checkout copy
+        uses: actions/checkout@v3
+      - name: Install system dependencies
+        run: sudo apt -y install poppler-utils tesseract-ocr
+      - name: Add OEM english LSTM model
+        run: wget https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata -O $(sudo find /usr -name tessdata -type d)/eng.traineddata
+      - name: Run tests
+        run: bundle exec rspec
+      - name: Coveralls
+        uses: coverallsapp/github-action@master
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
\ No newline at end of file

From 00ad7fb146882e37bd457ffc9c7344cc4019a72e Mon Sep 17 00:00:00 2001
From: Blake Rosenberg <blakerosenberg@macroplant.com>
Date: Thu, 10 Nov 2022 10:30:08 -0500
Subject: [PATCH 3/7] workflows: fixes tessdata write permissions error

---
 .github/workflows/main.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 917bed2..fb80876 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -14,7 +14,7 @@ jobs:
       - name: Install system dependencies
         run: sudo apt -y install poppler-utils tesseract-ocr
       - name: Add OEM english LSTM model
-        run: wget https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata -O $(sudo find /usr -name tessdata -type d)/eng.traineddata
+        run: sudo wget https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata -O $(sudo find /usr -name tessdata -type d)/eng.traineddata
       - name: Run tests
         run: bundle exec rspec
       - name: Coveralls

From 14732986118445a92b357c3e28510f538401a0ec Mon Sep 17 00:00:00 2001
From: Blake Rosenberg <blakerosenberg@macroplant.com>
Date: Thu, 10 Nov 2022 10:35:35 -0500
Subject: [PATCH 4/7] workflows: fixes Ruby not being installed

---
 .github/workflows/main.yml | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index fb80876..690b012 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -11,10 +11,14 @@ jobs:
 
       - name: Checkout copy
         uses: actions/checkout@v3
-      - name: Install system dependencies
+      - name: Install system dependencies for gem
         run: sudo apt -y install poppler-utils tesseract-ocr
       - name: Add OEM english LSTM model
         run: sudo wget https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata -O $(sudo find /usr -name tessdata -type d)/eng.traineddata
+      - name: Install Ruby
+        uses: ruby/setup-ruby@v1
+        with:
+          ruby-version: 2.7
       - name: Run tests
         run: bundle exec rspec
       - name: Coveralls

From 420ffa81ebae68144c62e2c8c35974f7279e8fda Mon Sep 17 00:00:00 2001
From: Blake Rosenberg <blakerosenberg@macroplant.com>
Date: Thu, 10 Nov 2022 10:39:31 -0500
Subject: [PATCH 5/7] workflows: runs bundle install

---
 .github/workflows/main.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 690b012..9de4dd5 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -19,6 +19,8 @@ jobs:
         uses: ruby/setup-ruby@v1
         with:
           ruby-version: 2.7
+      - name: Install dependencies
+        run: bundle install
       - name: Run tests
         run: bundle exec rspec
       - name: Coveralls

From bcb5172ceded6b6dc7d1bcc1d377d04dcf5f5309 Mon Sep 17 00:00:00 2001
From: Blake Rosenberg <blakerosenberg@macroplant.com>
Date: Thu, 10 Nov 2022 10:57:28 -0500
Subject: [PATCH 6/7] test: disables longest tests and formats report w/ lcov

---
 spec/ocr4pdf_spec.rb | 61 ++++++++++++++++++++++----------------------
 spec/spec_helper.rb  |  9 +++++++
 2 files changed, 40 insertions(+), 30 deletions(-)

diff --git a/spec/ocr4pdf_spec.rb b/spec/ocr4pdf_spec.rb
index d2948f9..7027855 100644
--- a/spec/ocr4pdf_spec.rb
+++ b/spec/ocr4pdf_spec.rb
@@ -21,34 +21,35 @@
     File.delete(ocr_file)
   end
 
-  it "can create a PDF with OCR text from a PDF with regular embedded text" do
-    ocr_blob = Ocr4pdf.new("spec/fixtures/w4-page-2.pdf").create_ocr_pdf
-    File.binwrite("spec/fixtures/w4-page-2.ocr.pdf", ocr_blob)
-    ocr_file = File.expand_path("spec/fixtures/w4-page-2.ocr.pdf")
-
-    first_line_text = Open3.capture2("pdftotext -f 1 -l 1 -r 300 #{ocr_file} - | head -n 1")[0].strip
-    expect(first_line_text).to start_with "Form W-4 (2022)"
-
-    # Delete the file
-    File.delete(ocr_file)
-  end
-
-  it "can create a PDF from a multi-page rasterized PDF" do
-    ocr_blob = Ocr4pdf.new("spec/fixtures/form-w4.pdf").create_ocr_pdf
-    File.binwrite("spec/fixtures/form-w4.ocr.pdf", ocr_blob)
-    ocr_file = File.expand_path("spec/fixtures/form-w4.ocr.pdf")
-
-    third_page_first_text_line = Open3.capture2("pdftotext -f 3 -l 3 -r 300 #{ocr_file} - | head -n 3")[0].strip
-    third_page_first_text_line = third_page_first_text_line.tr("\n", " ").gsub(/\s{2,}/, " ")
-    # expect(third_page_first_text_line).to start_with "Form W-4 (2022) Page 3"
-    # TODO: This used to work with the exact same Tesseract model, but now it's not seeing the `-` in `W-4` or the ` ` in `Page 3`...
-    expect(third_page_first_text_line).to start_with("Form W-4 (2022) Page 3").or start_with "Form W4 (2022) Page3"
-    # TODO: If relevant, it's worth noting that using `--oem 1` will see the `-` in `W-4` AND the space in `Page 3` still...
-
-    # TODO: Note, using this same fixture file, the first bits of text in page 1 are OCR'd completely wrong: "Form W=4" or slightly worse...
-    # TODO: Really need to train my own LSTM model for it to perform better than this...
-
-    # Delete the file
-    File.delete(ocr_file)
-  end
+  # TODO: Disabling these last tests to speed up workflow runs while configuration still in progress:
+  # it "can create a PDF with OCR text from a PDF with regular embedded text" do
+  #   ocr_blob = Ocr4pdf.new("spec/fixtures/w4-page-2.pdf").create_ocr_pdf
+  #   File.binwrite("spec/fixtures/w4-page-2.ocr.pdf", ocr_blob)
+  #   ocr_file = File.expand_path("spec/fixtures/w4-page-2.ocr.pdf")
+  #
+  #   first_line_text = Open3.capture2("pdftotext -f 1 -l 1 -r 300 #{ocr_file} - | head -n 1")[0].strip
+  #   expect(first_line_text).to start_with "Form W-4 (2022)"
+  #
+  #   # Delete the file
+  #   File.delete(ocr_file)
+  # end
+  #
+  # it "can create a PDF from a multi-page rasterized PDF" do
+  #   ocr_blob = Ocr4pdf.new("spec/fixtures/form-w4.pdf").create_ocr_pdf
+  #   File.binwrite("spec/fixtures/form-w4.ocr.pdf", ocr_blob)
+  #   ocr_file = File.expand_path("spec/fixtures/form-w4.ocr.pdf")
+  #
+  #   third_page_first_text_line = Open3.capture2("pdftotext -f 3 -l 3 -r 300 #{ocr_file} - | head -n 3")[0].strip
+  #   third_page_first_text_line = third_page_first_text_line.tr("\n", " ").gsub(/\s{2,}/, " ")
+  #   # expect(third_page_first_text_line).to start_with "Form W-4 (2022) Page 3"
+  #   # TODO: This used to work with the exact same Tesseract model, but now it's not seeing the `-` in `W-4` or the ` ` in `Page 3`...
+  #   expect(third_page_first_text_line).to start_with("Form W-4 (2022) Page 3").or start_with "Form W4 (2022) Page3"
+  #   # TODO: If relevant, it's worth noting that using `--oem 1` will see the `-` in `W-4` AND the space in `Page 3` still...
+  #
+  #   # TODO: Note, using this same fixture file, the first bits of text in page 1 are OCR'd completely wrong: "Form W=4" or slightly worse...
+  #   # TODO: Really need to train my own LSTM model for it to perform better than this...
+  #
+  #   # Delete the file
+  #   File.delete(ocr_file)
+  # end
 end
diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb
index 3b34285..52fa0e6 100644
--- a/spec/spec_helper.rb
+++ b/spec/spec_helper.rb
@@ -3,6 +3,15 @@
 require "simplecov"
 
 SimpleCov.start do
+  require "simplecov-lcov"
+
+  SimpleCov::Formatter::LcovFormatter.config do |c|
+    c.report_with_single_file = true
+    c.single_report_path = "coverage/lcov.info"
+  end
+
+  formatter SimpleCov::Formatter::LcovFormatter
+
   add_filter do |source_file|
     source_file.filename.include?("spec") && !source_file.filename.include?("fixture")
   end

From eaf8492f8635ec475a21671456423d6bf3738fbb Mon Sep 17 00:00:00 2001
From: Blake Rosenberg <blakerosenberg@macroplant.com>
Date: Thu, 10 Nov 2022 11:06:41 -0500
Subject: [PATCH 7/7] test: re-enables all tests

---
 spec/ocr4pdf_spec.rb | 61 ++++++++++++++++++++++----------------------
 1 file changed, 30 insertions(+), 31 deletions(-)

diff --git a/spec/ocr4pdf_spec.rb b/spec/ocr4pdf_spec.rb
index 7027855..d2948f9 100644
--- a/spec/ocr4pdf_spec.rb
+++ b/spec/ocr4pdf_spec.rb
@@ -21,35 +21,34 @@
     File.delete(ocr_file)
   end
 
-  # TODO: Disabling these last tests to speed up workflow runs while configuration still in progress:
-  # it "can create a PDF with OCR text from a PDF with regular embedded text" do
-  #   ocr_blob = Ocr4pdf.new("spec/fixtures/w4-page-2.pdf").create_ocr_pdf
-  #   File.binwrite("spec/fixtures/w4-page-2.ocr.pdf", ocr_blob)
-  #   ocr_file = File.expand_path("spec/fixtures/w4-page-2.ocr.pdf")
-  #
-  #   first_line_text = Open3.capture2("pdftotext -f 1 -l 1 -r 300 #{ocr_file} - | head -n 1")[0].strip
-  #   expect(first_line_text).to start_with "Form W-4 (2022)"
-  #
-  #   # Delete the file
-  #   File.delete(ocr_file)
-  # end
-  #
-  # it "can create a PDF from a multi-page rasterized PDF" do
-  #   ocr_blob = Ocr4pdf.new("spec/fixtures/form-w4.pdf").create_ocr_pdf
-  #   File.binwrite("spec/fixtures/form-w4.ocr.pdf", ocr_blob)
-  #   ocr_file = File.expand_path("spec/fixtures/form-w4.ocr.pdf")
-  #
-  #   third_page_first_text_line = Open3.capture2("pdftotext -f 3 -l 3 -r 300 #{ocr_file} - | head -n 3")[0].strip
-  #   third_page_first_text_line = third_page_first_text_line.tr("\n", " ").gsub(/\s{2,}/, " ")
-  #   # expect(third_page_first_text_line).to start_with "Form W-4 (2022) Page 3"
-  #   # TODO: This used to work with the exact same Tesseract model, but now it's not seeing the `-` in `W-4` or the ` ` in `Page 3`...
-  #   expect(third_page_first_text_line).to start_with("Form W-4 (2022) Page 3").or start_with "Form W4 (2022) Page3"
-  #   # TODO: If relevant, it's worth noting that using `--oem 1` will see the `-` in `W-4` AND the space in `Page 3` still...
-  #
-  #   # TODO: Note, using this same fixture file, the first bits of text in page 1 are OCR'd completely wrong: "Form W=4" or slightly worse...
-  #   # TODO: Really need to train my own LSTM model for it to perform better than this...
-  #
-  #   # Delete the file
-  #   File.delete(ocr_file)
-  # end
+  it "can create a PDF with OCR text from a PDF with regular embedded text" do
+    ocr_blob = Ocr4pdf.new("spec/fixtures/w4-page-2.pdf").create_ocr_pdf
+    File.binwrite("spec/fixtures/w4-page-2.ocr.pdf", ocr_blob)
+    ocr_file = File.expand_path("spec/fixtures/w4-page-2.ocr.pdf")
+
+    first_line_text = Open3.capture2("pdftotext -f 1 -l 1 -r 300 #{ocr_file} - | head -n 1")[0].strip
+    expect(first_line_text).to start_with "Form W-4 (2022)"
+
+    # Delete the file
+    File.delete(ocr_file)
+  end
+
+  it "can create a PDF from a multi-page rasterized PDF" do
+    ocr_blob = Ocr4pdf.new("spec/fixtures/form-w4.pdf").create_ocr_pdf
+    File.binwrite("spec/fixtures/form-w4.ocr.pdf", ocr_blob)
+    ocr_file = File.expand_path("spec/fixtures/form-w4.ocr.pdf")
+
+    third_page_first_text_line = Open3.capture2("pdftotext -f 3 -l 3 -r 300 #{ocr_file} - | head -n 3")[0].strip
+    third_page_first_text_line = third_page_first_text_line.tr("\n", " ").gsub(/\s{2,}/, " ")
+    # expect(third_page_first_text_line).to start_with "Form W-4 (2022) Page 3"
+    # TODO: This used to work with the exact same Tesseract model, but now it's not seeing the `-` in `W-4` or the ` ` in `Page 3`...
+    expect(third_page_first_text_line).to start_with("Form W-4 (2022) Page 3").or start_with "Form W4 (2022) Page3"
+    # TODO: If relevant, it's worth noting that using `--oem 1` will see the `-` in `W-4` AND the space in `Page 3` still...
+
+    # TODO: Note, using this same fixture file, the first bits of text in page 1 are OCR'd completely wrong: "Form W=4" or slightly worse...
+    # TODO: Really need to train my own LSTM model for it to perform better than this...
+
+    # Delete the file
+    File.delete(ocr_file)
+  end
 end