In [None]:
# Sanity check: print the notebook's current working directory.
# The cells below list and OCR the PDFs found in this directory, so confirm
# it is the folder that holds your PDFs before running them.
!pwd

In [None]:
## libraries
import os
import subprocess
import time

import ocrmypdf

In [None]:
## OCR program parameters

# keep_processed_PDFs decides what happens when a '<name>.processed.pdf' already
# exists next to an original '<name>.pdf':
#   True  -> skip the original PDF and leave the existing processed PDF untouched.
#   False -> re-OCR every original PDF and overwrite any existing processed PDF.
keep_processed_PDFs = True

In [None]:
## Stationkeeping variables
starting_time = time.time() # time program started
files_in_directory = !ls    # list of all files in directory
directory = !pwd            # full directory name and path 
PDF_processed_counter = 0   # Number of PDFs processed
PDF_examined_counter = 0    # Number of PDFs examined, including PDFs processed

## Output number and location of PDFs at script start
total_number_of_PDFs = !ls | grep '\.pdf$' | wc -l
processed_number_of_PDFs = !ls | grep '\.processed.pdf$' | wc -l
PDFs_to_OCR = str(int(total_number_of_PDFs[0]) - int(processed_number_of_PDFs[0]))
if keep_processed_PDFs == True:
  PDFs_to_OCR = str(int(total_number_of_PDFs[0]) - int(processed_number_of_PDFs[0])*2)
  print('OCRing ' + PDFs_to_OCR + ' PDFs in ' + directory[0] + '\n')
else:    
  print('OCRing ' + PDFs_to_OCR + ' PDFs in ' + directory[0] + '\n')


## Execute OCR on PDF files

## OCR options shared by the first attempt and the forced re-OCR, so the two calls
## always use the same image clean-up settings.
OCR_options = dict(deskew = True,
                   clean_final = True,
                   rotate_pages = True,
                   remove_background = True)

for filename in files_in_directory:
    filename_stem, filename_extension = os.path.splitext(filename)

    ## only look at non-processed PDF files
    if filename_extension != '.pdf' or filename_stem.endswith('.processed'):
        continue

    running_time = (time.time()-starting_time)/60
    print('PDF ' + str(PDF_examined_counter+1) +': '+ filename + ', '+ str(running_time) + ' minutes.')

    ## generate filenames for PDF
    processed_filename = filename_stem + '.processed.pdf'
    original_OCR_txt_filename = filename_stem + '.original.txt'
    processed_OCR_txt_filename = filename_stem + '.ocr.txt'

    ## Case 1: Ignore PDFs we have previously processed. Do not perform OCR on PDF.
    ## Only applies when keep_processed_PDFs is set.
    if keep_processed_PDFs and os.path.isfile(processed_filename):
        print('PDF already processed. keep_processed_PDFs = True. Moving to next PDF...\n')
        PDF_examined_counter += 1
        continue

    ## Case 2: OCR the PDF, overwriting any existing processed PDF.
    ## The outer try wraps BOTH OCR attempts: an exception raised inside an `except`
    ## clause is not seen by sibling `except` clauses of the same try statement, so
    ## without the nesting a failure of the forced re-OCR would escape the
    ## "Halt Code" handling below and abort the cell with a raw traceback.
    try:
        try:
            ## execute OCR on PDF, and catch exception if OCR text already exists
            ocrmypdf.ocr(input_file = filename, output_file = processed_filename,
                         sidecar = processed_OCR_txt_filename,
                         **OCR_options)

        except ocrmypdf.exceptions.PriorOcrFoundError:
            ## Unprocessed PDF already has OCR and needs to be reOCRed.
            ## First save the existing text layer. pdftotext is called with an
            ## argument list (no shell), so quotes, spaces, '$' or backticks in a
            ## filename cannot break or alter the command.
            try:
                extraction = subprocess.run(['pdftotext', filename, original_OCR_txt_filename])
                original_text_saved = (extraction.returncode == 0)
            except OSError:
                ## pdftotext is not installed or not executable; saving the old
                ## text is best-effort, so carry on with the re-OCR.
                original_text_saved = False

            if original_text_saved:
                print('PDF already has OCR text. Saved original OCR text to a txt file.')
            else:
                print('PDF already has OCR text, but pdftotext could not save it to ' + original_OCR_txt_filename + '.')

            ## OCR unprocessed PDF, & generate new processed PDF
            print('Overwriting existing OCR, and saving in ' + processed_filename)
            ocrmypdf.ocr(input_file = filename, output_file = processed_filename,
                         sidecar = processed_OCR_txt_filename,
                         force_ocr = True,
                         **OCR_options)
            print('moving to next PDF....\n')

    except Exception as general_error:
        ## Anything else is unexpected: report it and stop processing further PDFs.
        print('Unexpected Error!! Halt Code!!!')
        print(general_error)
        break

    ## Reached only when one of the OCR attempts above succeeded.
    PDF_processed_counter += 1
    PDF_examined_counter += 1
                
# Concluding Stats: where the PDFs live, how many were OCRed / looked at, and total runtime.
print('')
print('Processed PDFs in ', directory[0], ' are OCRed.', sep='')
print('PDFs processed: ', PDF_processed_counter, sep='')
print('PDFs examined, including processed: ', PDF_examined_counter, sep='')

# Elapsed wall-clock time since starting_time, converted from seconds to minutes.
elapsed_seconds = time.time() - starting_time
running_time = elapsed_seconds/60
print('Program Completed in ', running_time, ' minutes.', sep='')

In [None]:
## If the script above appears to stall on certain PDFs, interrupt the notebook kernel,
## uncomment and run this cell, and then rerun the cell above.

## Limiting thread limit to 1 empirically appears to decrease time slightly.
## You can ignore this and likely experience no clear performance penalty unless
## you are doing 1000s of PDFs in one directory. Be aware that ocrmypdf *will* spin
## up as many thread processes as possible given the cores you have regardless 
## of OMP_THREAD_LIMIT value. Multithreading in Tesseract only really helps for large PDFs.   

#os.environ['OMP_THREAD_LIMIT'] = '1'
#print(os.environ['OMP_THREAD_LIMIT']) #on my machine it appears to default to 3