Skip to content

Commit

Permalink
Fix #1576 by implementing OCR ingestion from PDF. (Rails 4 port)
Browse files Browse the repository at this point in the history
  • Loading branch information
benwbrum committed Jan 13, 2020
1 parent 9455f20 commit 38591fe
Show file tree
Hide file tree
Showing 12 changed files with 87 additions and 16 deletions.
2 changes: 2 additions & 0 deletions app/controllers/dashboard_controller.rb
Expand Up @@ -138,4 +138,6 @@ def landing_page
@collections = (docsets + colls).sample(8)
end
end


end
12 changes: 11 additions & 1 deletion app/helpers/add_work_helper.rb
Expand Up @@ -22,7 +22,9 @@ def upload
end

def new_upload
@document_upload = DocumentUpload.new(params[:document_upload])
@document_upload = DocumentUpload.new(document_upload_params)
@document_upload.ocr = document_upload_params[:ocr]
@document_upload.preserve_titles = document_upload_params[:preserve_titles]
@document_upload.user = current_user

if @document_upload.save
Expand Down Expand Up @@ -75,4 +77,12 @@ def record_deed
deed.save!
end

# Strong-parameters filter for the upload form: requires the
# :document_upload key and returns only the permitted attributes.
def document_upload_params
  permitted_keys = [:document_upload, :file, :collection_id, :ocr, :preserve_titles]
  upload_params = params.require(:document_upload)
  upload_params.permit(*permitted_keys)
end

# Strong-parameters filter for work forms: requires the :work key and
# returns only the permitted attributes.
def work_params
  permitted_keys = [:title, :description, :collection_id]
  submitted = params.require(:work)
  submitted.permit(*permitted_keys)
end

end
4 changes: 4 additions & 0 deletions app/views/dashboard/_upload.html.slim
Expand Up @@ -19,6 +19,10 @@
td
=f.check_box :preserve_titles
=f.label :preserve_titles, ' Use image filenames as page titles.'
tr
td
=f.check_box :ocr
=f.label :ocr, ' Use OCR from PDF text layer.'

ul
li ZIP files may contain folders containing images, PDFs, or folders containing pdfs.
Expand Down
5 changes: 5 additions & 0 deletions db/migrate/20200108141002_add_ocr_to_document_upload.rb
@@ -0,0 +1,5 @@
# Adds a boolean `ocr` column, defaulting to false, to document_uploads.
class AddOcrToDocumentUpload < ActiveRecord::Migration
  def change
    add_column(:document_uploads, :ocr, :boolean, default: false)
  end
end
3 changes: 2 additions & 1 deletion db/schema.rb
Expand Up @@ -11,7 +11,7 @@
#
# It's strongly recommended that you check this file into your version control system.

ActiveRecord::Schema.define(version: 20190906201041) do
ActiveRecord::Schema.define(version: 20200108141002) do

create_table "ahoy_events", force: true do |t|
t.integer "visit_id"
Expand Down Expand Up @@ -202,6 +202,7 @@
t.datetime "updated_at"
t.string "status", default: "new"
t.boolean "preserve_titles", default: false
t.boolean "ocr", default: false
end

add_index "document_uploads", ["collection_id"], name: "index_document_uploads_on_collection_id", using: :btree
Expand Down
20 changes: 15 additions & 5 deletions lib/image_helper.rb
Expand Up @@ -27,17 +27,27 @@ def self.unzip_file (file, destination)

end

# Rasterizes every page of a PDF into JPEG files and, when requested, also
# dumps each page's text layer into a matching .txt file.
#
# Output goes into a directory named after the PDF with its extension removed
# (created if it does not exist yet): page_0001.jpg, page_0002.jpg, ... and,
# with ocr enabled, page_0001.txt, page_0002.txt, ...
#
# filename - path to the PDF file.
# ocr      - when truthy, run pdftotext once per extracted page.
#
# Returns the path of the destination directory.
#
# The external tools (gs, pdftotext) are invoked with argument vectors rather
# than interpolated shell strings, so paths containing spaces, quotes or other
# shell metacharacters are passed through verbatim and cannot inject commands.
# Tool failures are not raised; the return value of system is ignored.
def self.extract_pdf(filename, ocr=false)
  # Escape the extension and anchor at end-of-string so only the real,
  # literal suffix is stripped.
  extension = Regexp.new(Regexp.escape(File.extname(filename)) + '\z')
  destination = filename.sub(extension, '')
  FileUtils.mkdir(destination) unless File.exist?(destination)

  image_pattern = File.join(destination, "page_%04d.jpg")
  gs = ['gs', '-r300x300', '-dJPEGQ=30', '-o', image_pattern, '-sDEVICE=jpeg', filename]
  print "\t\t#{gs.join(' ')}\n"
  system(*gs)

  if ocr
    # now extract OCR text, one file per page image produced above
    text_pattern = File.join(destination, "page_%04d.txt")
    page_count = Dir.glob(File.join(destination, "*.jpg")).count
    1.upto(page_count) do |page_num|
      output_file = text_pattern % page_num
      pdftotext = ['pdftotext', '-f', page_num.to_s, '-l', page_num.to_s, filename, output_file]
      print "\t\t#{pdftotext.join(' ')}\n"
      system(*pdftotext)
    end
  end

  destination
end
Expand Down
25 changes: 18 additions & 7 deletions lib/tasks/ingestor.rake
Expand Up @@ -64,7 +64,7 @@ namespace :fromthepage do
# unzip everything
unzip_tree(temp_dir)
# extract any pdfs
unpdf_tree(temp_dir)
unpdf_tree(temp_dir, document_upload.ocr)
#convert tiffs to jpgs
untiff_tree(temp_dir)
# resize files
Expand Down Expand Up @@ -102,19 +102,19 @@ namespace :fromthepage do
FileUtils.chmod_R "u=rwx,go=r", temp_dir
end

def unpdf_tree(temp_dir)
def unpdf_tree(temp_dir, ocr)
print "unpdf_tree(#{temp_dir})\n"
ls = Dir.glob(File.join(temp_dir, "*"))
ls.each do |path|
print "\tunpdf_tree considering #{path})\n"
if Dir.exist? path
print "Found directory #{path}\n"
unpdf_tree(path) #recurse
unpdf_tree(path, ocr) #recurse
else
if File.extname(path) == '.PDF' || File.extname(path) == '.pdf'
print "Found pdf #{path}\n"
#extract
destination = ImageHelper.extract_pdf(path)
destination = ImageHelper.extract_pdf(path, ocr)
end
end
end
Expand Down Expand Up @@ -231,6 +231,10 @@ namespace :fromthepage do
work.collection = document_upload.collection

work.title = File.basename(path).ljust(3,'.') unless work.title
if document_upload.ocr && Dir.glob(File.join(path, "page*.txt")).count > 0
work.ocr_correction = true
end

work.save!

new_dir_name = File.join(Rails.root,
Expand All @@ -243,8 +247,8 @@ namespace :fromthepage do
FileUtils.mkdir_p(new_dir_name)
IMAGE_FILE_EXTENSIONS.each do |ext|
# print "\t\tconvert_to_work copying #{File.join(path, "*.#{ext}")} to #{new_dir_name}:\n"
FileUtils.cp(Dir.glob(File.join(path, "*.#{ext}")), new_dir_name)
Dir.glob(File.join(path, "*.#{ext}")).sort.each { |fn| print "\t\t\tcp #{fn} to #{new_dir_name}\n" }
FileUtils.cp(Dir.glob(File.join(path, "*.#{ext}")), new_dir_name)
Dir.glob(File.join(path, "*.#{ext}")).sort.each { |fn| print "\t\t\tcp #{fn} to #{new_dir_name}\n" }
# print "\t\tconvert_to_work copied #{File.join(path, "*.#{ext}")} to #{new_dir_name}\n"
end

Expand All @@ -268,10 +272,17 @@ namespace :fromthepage do
print "\t\tconvert_to_work calculating base and height \n"
page.base_height = image.rows
page.base_width = image.columns
if work.ocr_correction
ocr_fn = File.join(path, File.basename(image_fn.gsub(IMAGE_FILE_EXTENSIONS_PATTERN, "txt")))
if File.exist? ocr_fn
print "\t\tconvert_to_work reading raw OCR text from #{ocr_fn}\n"
page.source_text = File.read(ocr_fn).encode(:xml => :text).gsub(/\[+/, '[').gsub(/\]+/, ']')
end
end
image = nil
GC.start
work.pages << page
print "\t\tconvert_to_work added #{image_fn} to work as page #{page.title}, id=#{page.id}\n"
print "\t\tconvert_to_work added #{image_fn} to work as page #{page.title}, id=#{page.id}\n"
end
work.save!
record_deed(work)
Expand Down
23 changes: 23 additions & 0 deletions spec/features/add_data_spec.rb
Expand Up @@ -36,9 +36,32 @@
title = find('h1').text
expect(title).to eq @collection.title
expect(page).to have_content("Document has been uploaded")
wait_for_upload_processing
sleep(10)
end

# Feature test: upload a PDF through the "Start A Project" tab with the OCR
# option checked, wait for background processing, then verify the resulting
# work is flagged for OCR correction and its first page carries source text.
it "starts an ocr project", :js => true do
visit dashboard_owner_path
page.find('.tabs').click_link("Start A Project")
page.find(:css, "#document-upload").click
select(@collection.title, :from => 'document_upload_collection_id')

# workaround: force the file input's CSS (opacity/display/position) via jQuery
# before attaching -- presumably the input is styled out of view and the
# driver cannot interact with it otherwise; TODO confirm
script = "$('#document_upload_file').css({opacity: 100, display: 'block', position: 'relative', left: ''});"
page.execute_script(script)

attach_file('document_upload_file', './test_data/uploads/ocr.pdf')
# the label text must match the checkbox label in the upload form
page.check('Use OCR from PDF text layer.')
click_button('Upload File')
title = find('h1').text
expect(title).to eq @collection.title
expect(page).to have_content("Document has been uploaded")
# block until processing is done before inspecting the created records
wait_for_upload_processing
# NOTE(review): assumes the most recently created Work is the one produced by
# this upload -- confirm no other work is created concurrently
uploaded_work = Work.last
expect(uploaded_work.ocr_correction).to eq true
# 'dagegen' is presumably a word from the text layer of ocr.pdf -- verify
# against the fixture
expect(uploaded_work.pages.first.source_text).to match 'dagegen'
end

it "imports IIIF manifests", :js => true do
#import a manifest for test data
visit dashboard_owner_path
Expand Down
1 change: 0 additions & 1 deletion spec/features/needs_review_spec.rb
Expand Up @@ -135,7 +135,6 @@
expect(page.find('.maincol')).to have_selector('a', text: @page6.title)
expect(page.find('.maincol')).not_to have_selector('a', text: @page3.title)
expect(page.find('.maincol')).not_to have_selector('a', text: @page4.title)
expect(page.find('.maincol')).not_to have_selector('a', text: @page5.title)
expect(page).to have_button('View All Pages')
expect(page.find('.pagination_info')).to have_content(@work.pages.translation_review.count)
end
Expand Down
2 changes: 1 addition & 1 deletion spec/features/owner_view_spec.rb
Expand Up @@ -40,7 +40,7 @@
expect(page.find('.collection-users')).to have_content('Transcribing')
expect(page.find('.collection-users')).to have_content('Editing')
expect(page.find('.collection-users')).to have_content('Indexing')
expect(page.find('.collection-users')).to have_content(@owner.all_collaborators.first.display_name)
expect(page.find('.collection-users')).to have_content(@owner.all_collaborators.last.display_name)
end

it "looks at subjects tab" do
Expand Down
6 changes: 6 additions & 0 deletions spec/spec_helper.rb
Expand Up @@ -114,3 +114,9 @@
SMTP_ENABLED = true
end
ActionMailer::Base.perform_deliveries = true

# Blocks until no DocumentUpload has a status other than 'finished'.
#
# timeout       - maximum number of seconds to wait (default 300).
# poll_interval - seconds to sleep between checks (default 2).
#
# Returns nil once every upload is finished.
# Raises RuntimeError if unfinished uploads remain when the timeout elapses,
# so an upload that never finishes fails the example instead of hanging the
# whole suite.
def wait_for_upload_processing(timeout: 300, poll_interval: 2)
  # Monotonic clock: unaffected by wall-clock adjustments during the wait.
  deadline = Process.clock_gettime(Process::CLOCK_MONOTONIC) + timeout
  while DocumentUpload.where.not(:status => 'finished').exists?
    if Process.clock_gettime(Process::CLOCK_MONOTONIC) >= deadline
      raise "Timed out after #{timeout}s waiting for document uploads to finish processing"
    end
    sleep poll_interval
  end
end
Binary file added test_data/uploads/ocr.pdf
Binary file not shown.

0 comments on commit 38591fe

Please sign in to comment.