Merge 5a61403 into 9bcff1e

blarosen95 · Nov 11, 2022 · 84f8677 · 84f8677
2 parents 9bcff1e + 5a61403
commit 84f8677
Show file tree

Hide file tree

Showing 4 changed files with 5 additions and 10 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -14,7 +14,7 @@ jobs:
       - name: Install system dependencies for gem
         run: sudo apt -y install poppler-utils tesseract-ocr
       - name: Add OEM english LSTM model
-        run: sudo wget https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata -O $(sudo find /usr -name tessdata -type d)/eng.traineddata
+        run: sudo wget https://github.com/tesseract-ocr/tessdata_best/raw/main/eng.traineddata -O $(sudo find /usr -name tessdata -type d)/eng.traineddata
       - name: Install Ruby
         uses: ruby/setup-ruby@v1
         with:

diff --git a/lib/ocr4pdf/create_ocr_pdf.rb b/lib/ocr4pdf/create_ocr_pdf.rb
@@ -31,7 +31,7 @@ def self.run(src)
             # Create a TIFF file for the page:
             Open3.capture2("pdftocairo -singlefile -f #{page} -l #{page} -tiff #{src} #{base_name}")
             # Run Tesseract on the TIFF, exporting as a PDF:
-            Open3.capture2("tesseract --dpi 300 #{base_name}.tif #{base_name} pdf quiet")
+            Open3.capture2("tesseract #{base_name}.tif #{base_name} pdf quiet")
           end
           # Unite the pages into a single PDF:
           Open3.capture2("pdfunite #{tmp_dir}/*.pdf #{File.basename(src, ".*")}.ocr.pdf")

diff --git a/lib/ocr4pdf/version.rb b/lib/ocr4pdf/version.rb
@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 
 class Ocr4pdf
-  VERSION = "0.1.0"
+  VERSION = "0.2.0"
 end
diff --git a/spec/ocr4pdf_spec.rb b/spec/ocr4pdf_spec.rb
@@ -40,13 +40,8 @@
 
     third_page_first_text_line = Open3.capture2("pdftotext -f 3 -l 3 -r 300 #{ocr_file} - | head -n 3")[0].strip
     third_page_first_text_line = third_page_first_text_line.tr("\n", " ").gsub(/\s{2,}/, " ")
-    # expect(third_page_first_text_line).to start_with "Form W-4 (2022) Page 3"
-    # TODO: This used to work with the exact same Tesseract model, but now it's not seeing the `-` in `W-4` or the ` ` in `Page 3`...
-    expect(third_page_first_text_line).to start_with("Form W-4 (2022) Page 3").or start_with "Form W4 (2022) Page3"
-    # TODO: If relevant, it's worth noting that using `--oem 1` will see the `-` in `W-4` AND the space in `Page 3` still...
-
-    # TODO: Note, using this same fixture file, the first bits of text in page 1 are OCR'd completely wrong: "Form W=4" or slightly worse...
-    # TODO: Really need to train my own LSTM model for it to perform better than this...
+    # TODO: The following is a flaw of poppler-utils' pdftotext; the OCR result is actually correct:
+    expect(third_page_first_text_line).to start_with "Form W-4 (2022) Page3"
 
     # Delete the file
     File.delete(ocr_file)