Fix #1576 by implementing OCR ingestion from PDF. (Rails 4 port)

benwbrum · Jan 13, 2020 · 38591fe · 38591fe
1 parent 9455f20
commit 38591fe
Show file tree

Hide file tree

Showing 12 changed files with 87 additions and 16 deletions.
diff --git a/app/controllers/dashboard_controller.rb b/app/controllers/dashboard_controller.rb
@@ -138,4 +138,6 @@ def landing_page
       @collections = (docsets + colls).sample(8)
     end
   end
+
+
 end
diff --git a/app/helpers/add_work_helper.rb b/app/helpers/add_work_helper.rb
@@ -22,7 +22,9 @@ def upload
   end
 
   def new_upload
-    @document_upload = DocumentUpload.new(params[:document_upload])
+    @document_upload = DocumentUpload.new(document_upload_params)
+    @document_upload.ocr = document_upload_params[:ocr]
+    @document_upload.preserve_titles = document_upload_params[:preserve_titles]
     @document_upload.user = current_user
 
     if @document_upload.save
@@ -75,4 +77,12 @@ def record_deed
     deed.save!
   end
 
+  def document_upload_params
+    params.require(:document_upload).permit(:document_upload, :file, :collection_id, :ocr, :preserve_titles)
+  end
+
+  def work_params
+    params.require(:work).permit(:title, :description, :collection_id)
+  end
+
 end
diff --git a/app/views/dashboard/_upload.html.slim b/app/views/dashboard/_upload.html.slim
@@ -19,6 +19,10 @@
             td
               =f.check_box :preserve_titles
               =f.label :preserve_titles, ' Use image filenames as page titles.'
+          tr
+            td
+              =f.check_box :ocr
+              =f.label :ocr, ' Use OCR from PDF text layer.'
 
         ul
           li ZIP files may contain folders containing images, PDFs, or folders containing pdfs.

diff --git a/db/migrate/20200108141002_add_ocr_to_document_upload.rb b/db/migrate/20200108141002_add_ocr_to_document_upload.rb
@@ -0,0 +1,5 @@
+class AddOcrToDocumentUpload < ActiveRecord::Migration
+  def change
+    add_column :document_uploads, :ocr, :boolean, :default => false
+  end
+end
diff --git a/db/schema.rb b/db/schema.rb
@@ -11,7 +11,7 @@
 #
 # It's strongly recommended that you check this file into your version control system.
 
-ActiveRecord::Schema.define(version: 20190906201041) do
+ActiveRecord::Schema.define(version: 20200108141002) do
 
   create_table "ahoy_events", force: true do |t|
     t.integer  "visit_id"
@@ -202,6 +202,7 @@
     t.datetime "updated_at"
     t.string   "status",          default: "new"
     t.boolean  "preserve_titles", default: false
+    t.boolean  "ocr",             default: false
   end
 
   add_index "document_uploads", ["collection_id"], name: "index_document_uploads_on_collection_id", using: :btree

diff --git a/lib/image_helper.rb b/lib/image_helper.rb
@@ -27,17 +27,27 @@ def self.unzip_file (file, destination)
 
   end
 
-  def self.extract_pdf(filename)
+  def self.extract_pdf(filename, ocr=false)
     pattern = Regexp.new(File.extname(filename) + "$")
     destination = filename.gsub(pattern, '')
     FileUtils.mkdir(destination) unless File.exists?(destination)
     pattern = File.join(destination, "page_%04d.jpg")
     gs = "gs -r300x300 -dJPEGQ=30 -o '#{pattern}' -sDEVICE=jpeg '#{filename}'"
-    print gs
+    print "\t\t#{gs}\n"
     system(gs)
-    # convert = "convert -density 200 -quality 30 '#{filename}' '#{pattern}'"
-    # print("#{convert}\n")
-    # system(convert)
+
+    if ocr
+      # now extract OCR text
+      pattern = File.join(destination, "page_%04d.txt")
+      page_count = Dir.glob(File.join(destination, "*.jpg")).count
+      1.upto(page_count) do |page_num|
+        output_file = pattern % page_num
+        pdftotext = "pdftotext -f #{page_num} -l #{page_num} '#{filename}' #{output_file}"
+        print pdftotext
+        print "\t\t#{pdftotext}\n"
+        system(pdftotext)
+      end
+    end
 
     destination
   end

diff --git a/lib/tasks/ingestor.rake b/lib/tasks/ingestor.rake
@@ -64,7 +64,7 @@ namespace :fromthepage do
     # unzip everything
     unzip_tree(temp_dir)
     # extract any pdfs
-    unpdf_tree(temp_dir)
+    unpdf_tree(temp_dir, document_upload.ocr)
     #convert tiffs to jpgs
     untiff_tree(temp_dir)
     # resize files
@@ -102,19 +102,19 @@ namespace :fromthepage do
     FileUtils.chmod_R "u=rwx,go=r", temp_dir
   end
 
-  def unpdf_tree(temp_dir)
+  def unpdf_tree(temp_dir, ocr)
     print "unpdf_tree(#{temp_dir})\n"
     ls = Dir.glob(File.join(temp_dir, "*"))
     ls.each do |path|
       print "\tunpdf_tree considering #{path})\n"
       if Dir.exist? path
         print "Found directory #{path}\n"
-        unpdf_tree(path) #recurse
+        unpdf_tree(path, ocr) #recurse
       else
         if File.extname(path) == '.PDF' || File.extname(path) == '.pdf'
           print "Found pdf #{path}\n"
           #extract 
-          destination = ImageHelper.extract_pdf(path)
+          destination = ImageHelper.extract_pdf(path, ocr)
         end
       end
     end
@@ -231,6 +231,10 @@ namespace :fromthepage do
     work.collection = document_upload.collection
 
     work.title = File.basename(path).ljust(3,'.') unless work.title
+    if document_upload.ocr && Dir.glob(File.join(path, "page*.txt")).count > 0
+      work.ocr_correction = true
+    end
+
     work.save!
 
     new_dir_name = File.join(Rails.root,
@@ -243,8 +247,8 @@ namespace :fromthepage do
     FileUtils.mkdir_p(new_dir_name)
     IMAGE_FILE_EXTENSIONS.each do |ext|
 #      print "\t\tconvert_to_work copying #{File.join(path, "*.#{ext}")} to #{new_dir_name}:\n"
-    FileUtils.cp(Dir.glob(File.join(path, "*.#{ext}")), new_dir_name)    
-    Dir.glob(File.join(path, "*.#{ext}")).sort.each { |fn| print "\t\t\tcp #{fn} to #{new_dir_name}\n" }      
+      FileUtils.cp(Dir.glob(File.join(path, "*.#{ext}")), new_dir_name)    
+      Dir.glob(File.join(path, "*.#{ext}")).sort.each { |fn| print "\t\t\tcp #{fn} to #{new_dir_name}\n" }      
 #      print "\t\tconvert_to_work copied #{File.join(path, "*.#{ext}")} to #{new_dir_name}\n"
     end    
 
@@ -268,10 +272,17 @@ namespace :fromthepage do
       print "\t\tconvert_to_work calculating base and height \n"
       page.base_height = image.rows
       page.base_width = image.columns
+      if work.ocr_correction
+        ocr_fn = File.join(path, File.basename(image_fn.gsub(IMAGE_FILE_EXTENSIONS_PATTERN, "txt")))
+        if File.exist? ocr_fn
+          print "\t\tconvert_to_work reading raw OCR text from #{ocr_fn}\n"
+          page.source_text = File.read(ocr_fn).encode(:xml => :text).gsub(/\[+/, '[').gsub(/\]+/, ']')
+        end
+      end
       image = nil
       GC.start
       work.pages << page
-       print "\t\tconvert_to_work added #{image_fn} to work as page #{page.title}, id=#{page.id}\n"
+      print "\t\tconvert_to_work added #{image_fn} to work as page #{page.title}, id=#{page.id}\n"
     end
     work.save!
     record_deed(work)

diff --git a/spec/features/add_data_spec.rb b/spec/features/add_data_spec.rb
@@ -36,9 +36,32 @@
     title = find('h1').text
     expect(title).to eq @collection.title
     expect(page).to have_content("Document has been uploaded")
+    wait_for_upload_processing
     sleep(10)
   end
 
+  it "starts an ocr project", :js => true do # end-to-end: upload a PDF with the OCR option checked, then inspect the resulting Work
+    visit dashboard_owner_path
+    page.find('.tabs').click_link("Start A Project")
+    page.find(:css, "#document-upload").click
+    select(@collection.title, :from => 'document_upload_collection_id')
+
+    # workaround: restyle the file input via jQuery before attaching to it (presumably it is hidden by default -- confirm against the upload form)
+    script = "$('#document_upload_file').css({opacity: 100, display: 'block', position: 'relative', left: ''});"
+    page.execute_script(script)
+
+    attach_file('document_upload_file', './test_data/uploads/ocr.pdf')
+    page.check('Use OCR from PDF text layer.')
+    click_button('Upload File')
+    title = find('h1').text
+    expect(title).to eq @collection.title
+    expect(page).to have_content("Document has been uploaded")
+    wait_for_upload_processing # ingestion must be complete before the Work is inspected
+    uploaded_work = Work.last # assumes the most recent Work is the one created from this upload
+    expect(uploaded_work.ocr_correction).to eq true
+    expect(uploaded_work.pages.first.source_text).to match 'dagegen' # presumably a word from ocr.pdf's text layer -- confirm against the fixture
+  end
+
   it "imports IIIF manifests", :js => true do
     #import a manifest for test data
     visit dashboard_owner_path

diff --git a/spec/features/needs_review_spec.rb b/spec/features/needs_review_spec.rb
@@ -135,7 +135,6 @@
     expect(page.find('.maincol')).to have_selector('a', text: @page6.title)
     expect(page.find('.maincol')).not_to have_selector('a', text: @page3.title)
     expect(page.find('.maincol')).not_to have_selector('a', text: @page4.title)
-    expect(page.find('.maincol')).not_to have_selector('a', text: @page5.title)
     expect(page).to have_button('View All Pages')
     expect(page.find('.pagination_info')).to have_content(@work.pages.translation_review.count)
   end

diff --git a/spec/features/owner_view_spec.rb b/spec/features/owner_view_spec.rb
@@ -40,7 +40,7 @@
     expect(page.find('.collection-users')).to have_content('Transcribing')
     expect(page.find('.collection-users')).to have_content('Editing')
     expect(page.find('.collection-users')).to have_content('Indexing')
-    expect(page.find('.collection-users')).to have_content(@owner.all_collaborators.first.display_name)
+    expect(page.find('.collection-users')).to have_content(@owner.all_collaborators.last.display_name)
   end
 
   it "looks at subjects tab" do

diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb
@@ -114,3 +114,9 @@
   SMTP_ENABLED = true
 end
 ActionMailer::Base.perform_deliveries = true
+
+def wait_for_upload_processing
+  while DocumentUpload.where.not(:status => 'finished').count > 0
+    sleep 2
+  end
+end
diff --git a/test_data/uploads/ocr.pdf b/test_data/uploads/ocr.pdf