Merge branch 'development' into 1476-admin-collection-list

benwbrum · Jun 14, 2021 · bcc5ab0
2 parents 22578d2 + e9306cf
commit bcc5ab0
Show file tree

Hide file tree

Showing 12 changed files with 8,718 additions and 88,548 deletions.
diff --git a/app/assets/stylesheets/sections/page.scss b/app/assets/stylesheets/sections/page.scss
@@ -156,6 +156,11 @@
   display: flex;
   box-sizing: border-box;
   flex-direction: column;
+
+  [data-layout-mode='ttb'] &,
+  [data-layout-mode='btt'] & {
+    min-height: calc((100vh - 74px - 3rem) / 2);
+  }
 }
 
 .page-imagescan {

diff --git a/app/helpers/transcribe_helper.rb b/app/helpers/transcribe_helper.rb
@@ -58,6 +58,8 @@ def osd_source(page)
       ["#{@page.sc_canvas.sc_service_id}/info.json"]
     elsif page.ia_leaf
       [@page.ia_leaf.iiif_image_info_url]
+    elsif browser.platform.ios? && browser.webkit?
+      ["#{url_for(:root)}image-service/#{page.id}/info.json"]
     else
       {type: 'image', url: file_to_url(page.canonical_facsimile_url)}.to_json
     end

diff --git a/config/locales/dashboard/dashboard-en.yml b/config/locales/dashboard/dashboard-en.yml
@@ -59,7 +59,7 @@ en:
       click_to_browse_files: "Click to browse files"
       browse: "Browse"
       use_image_filenames: " Use image filenames as page titles."
-      use_ocr_from_pdf: " Use OCR from PDF text layer."
+      use_ocr_from_pdf: " Import text from PDF text layers, text files or XML files."
       zip_files_may_contain: "ZIP files may contain folders containing images, PDFs, or folders containing pdfs."
       each_folder_will_be_treated: "Each folder will be treated as a different document, so do not mix pages from different documents in the same folder."
       each_pdf_will_be_treated: "Each PDF will be treated as its own document, so do not split pages from the same document among more than one PDF."

diff --git a/lib/tasks/ingestor.rake b/lib/tasks/ingestor.rake
@@ -246,10 +246,10 @@ namespace :fromthepage do
 
     if document_upload.ocr
       clean_dir=path.gsub('[','\[').gsub(']','\]')
-      if Dir.glob(File.join(clean_dir, "*.txt")).count > 0
+      if (Dir.glob(File.join(clean_dir, "*.txt")).count + Dir.glob(File.join(clean_dir, "*.xml")).count) > 0
         work.ocr_correction = true
       else
-        print "\tOCR correction specified but no files found in #{File.join(path, "page*.txt")}\n"
+        print "\tOCR correction specified but no files found in #{File.join(path, "page*.txt")} or #{File.join(path, "page*.xml")}\n"
       end
     end
 
@@ -293,7 +293,12 @@ namespace :fromthepage do
       page.base_width = image.columns
       if work.ocr_correction
         ocr_fn = File.join(path, File.basename(image_fn.gsub(IMAGE_FILE_EXTENSIONS_PATTERN, "txt")))
-        if File.exist? ocr_fn
+        xml_fn = File.join(path, File.basename(image_fn.gsub(IMAGE_FILE_EXTENSIONS_PATTERN, "xml")))
+        if File.exist? xml_fn
+          print "\t\tconvert_to_work reading raw XML text from #{xml_fn}\n"
+          page.source_text = File.read(xml_fn).gsub(/\[+/, '[').gsub(/\]+/, ']')
+          # if there are errors, consider escaping
+        elsif File.exist? ocr_fn
           print "\t\tconvert_to_work reading raw OCR text from #{ocr_fn}\n"
           page.source_text = File.read(ocr_fn).encode(:xml => :text).gsub(/\[+/, '[').gsub(/\]+/, ']')
         end

diff --git a/spec/features/add_data_spec.rb b/spec/features/add_data_spec.rb
@@ -51,7 +51,7 @@
     page.execute_script(script)
 
     attach_file('document_upload_file', './test_data/uploads/ocr.pdf')
-    page.check('Use OCR from PDF text layer.')
+    page.check('Import text from PDF text layers, text files or XML files.')
     click_button('Upload File')
     title = find('h1').text
     expect(title).to eq @collection.title

diff --git a/spec/features/export_spec.rb b/spec/features/export_spec.rb
@@ -22,6 +22,36 @@
     expect(page).to have_content(@work.title)
     page.find('#btnExportAll').click
     expect(page.response_headers['Content-Type']).to eq 'text/html; charset=utf-8'
+
+    page.check('bulk_export_html_page')
+    page.check('bulk_export_html_work')
+    page.check('bulk_export_plaintext_verbatim_page')
+    page.check('bulk_export_plaintext_verbatim_work')
+    page.check('bulk_export_plaintext_emended_work')
+    page.check('bulk_export_plaintext_emended_page')
+    page.check('bulk_export_plaintext_searchable_work')
+    page.check('bulk_export_plaintext_searchable_page')
+    page.check('bulk_export_tei_work')
+    page.check('bulk_export_table_csv_work')
+    page.check('bulk_export_table_csv_collection')
+    page.check('bulk_export_subject_csv_collection')
+    page.check('bulk_export_work_metadata_csv')
+
+    page.find('button', text: 'Start Export').click
+    expect(page).to have_content("Queued")
+
+    login_as(User.where(admin: true).first, :scope => :user)
+
+    # wait for the background process to run
+    1.upto(10) do
+      sleep 5
+      if BulkExport.last.status == 'finished'
+        break
+      end
+    end
+
+    visit bulk_export_index_path
+    expect(page).to have_content("Finished")
   end
 
   it "exports the subject index" do