Merge pull request #1612 from benwbrum/1576-extract-ocr-from-pdf-rails4-spaces

Handle filenames with spaces
benwbrum · Jan 23, 2020 · b3e9c5b · b3e9c5b
2 parents 1aa7757 + b058760
commit b3e9c5b
Show file tree

Hide file tree

Showing 2 changed files with 7 additions and 4 deletions.
diff --git a/lib/image_helper.rb b/lib/image_helper.rb
@@ -42,8 +42,7 @@ def self.extract_pdf(filename, ocr=false)
       page_count = Dir.glob(File.join(destination, "*.jpg")).count
       1.upto(page_count) do |page_num|
         output_file = pattern % page_num
-        pdftotext = "pdftotext -f #{page_num} -l #{page_num} '#{filename}' #{output_file}"
-        print pdftotext
+        pdftotext = "pdftotext -f #{page_num} -l #{page_num} '#{filename}' '#{output_file}'"
         print "\t\t#{pdftotext}\n"
         system(pdftotext)
       end

diff --git a/lib/tasks/ingestor.rake b/lib/tasks/ingestor.rake
@@ -232,8 +232,12 @@ namespace :fromthepage do
     work.collection = document_upload.collection
 
     work.title = File.basename(path).ljust(3,'.') unless work.title
-    if document_upload.ocr && Dir.glob(File.join(path, "page*.txt")).count > 0
-      work.ocr_correction = true
+    if document_upload.ocr
+      if Dir.glob(File.join(path, "page*.txt")).count > 0
+        work.ocr_correction = true
+      else
+        print "\tOCR correction specifiied but no files found in #{File.join(path, "page*.txt")}\n"
+      end
     end
 
     work.save!