In [None]:
## libraries
import os
import subprocess
import time

import ocrmypdf

In [None]:
## OCR program parameters

# Controls what happens when an original '<name>.pdf' already has a
# '<name>.processed.pdf' next to it from an earlier run of this notebook:
#   True  - keep the existing processed PDF; the original is skipped and
#           nothing is overwritten.
#   False - OCR the original again and overwrite the existing processed PDF.
keep_processed_PDFs = True

In [None]:
## OCRing PDFs
##
## Walks every PDF in the current working directory and writes, next to each
## original '<name>.pdf':
##   <name>.processed.pdf - OCRed copy produced by ocrmypdf
##   <name>.ocr.txt       - sidecar text produced by that OCR run
##   <name>.original.txt  - text extracted from the original with pdftotext
##                          (only when ocrmypdf reports an existing text layer)
## Files named '<name>.processed.pdf' are outputs of earlier runs and are skipped.

starting_time = time.time()  # time program started

# Kept as a one-item list so that `directory[0]` keeps working for any code
# written against the old `!pwd` result.
directory = [os.getcwd()]

# Sorted, non-hidden regular files only: the same names `ls` would list, minus
# sub-directories, which can never be OCRed.
files_in_directory = sorted(
    entry for entry in os.listdir(directory[0])
    if not entry.startswith('.') and os.path.isfile(os.path.join(directory[0], entry))
)

PDF_processed_counter = 0   # Number of PDFs actually OCRed in this run
PDF_examined_counter = 0    # Number of PDFs examined, including PDFs processed

# Image clean-up options shared by the first OCR attempt and the forced retry.
ocr_options = dict(deskew=True,
                   clean_final=True,
                   rotate_pages=True,
                   remove_background=True)

for filename in files_in_directory:
    stem, extension = os.path.splitext(filename)

    ## only look at PDF files ('.pdf', '.PDF', ...), and ignore all other files
    if extension.lower() != '.pdf':
        continue

    print('PDF ' + str(PDF_examined_counter + 1))
    PDF_examined_counter += 1

    ## ignore PDFs that are themselves the output of a previous run
    if stem.endswith('.processed'):
        print('skipping ' + filename + '...')
        print('')
        continue

    # announcing PDF to be OCRed
    print(filename + '...')

    ## generate filenames for PDF
    processed_filename = stem + '.processed.pdf'
    original_OCR_txt_filename = stem + '.original.txt'
    processed_OCR_txt_filename = stem + '.ocr.txt'

    try:
        ## Honour keep_processed_PDFs *before* running any OCR, so an existing
        ## processed PDF is never overwritten (or needlessly recomputed),
        ## whether or not the original already contains text.
        if keep_processed_PDFs and os.path.isfile(processed_filename):
            print("PDF has already been processed by this program.")
            print("Processed PDF kept (keep_processed_PDFs = True).")
            print('moving to next PDF....')
            continue

        try:
            ocrmypdf.ocr(input_file=filename,
                         output_file=processed_filename,
                         sidecar=processed_OCR_txt_filename,
                         **ocr_options)

        except ocrmypdf.exceptions.PriorOcrFoundError:
            ## The original already has a text layer: save that text, then
            ## redo the OCR with force_ocr so the processed PDF is still produced.
            ## The filename is passed as an argument list (no shell), so names
            ## containing quotes, '$' or backticks are handled safely.
            try:
                extraction = subprocess.run(
                    ['pdftotext', filename, original_OCR_txt_filename],
                    capture_output=True, text=True)
                original_text_saved = extraction.returncode == 0
                extraction_error = extraction.stderr.strip()
            except OSError as error:
                # pdftotext is not installed / not executable; keep going,
                # as the shell-based version did.
                original_text_saved = False
                extraction_error = str(error)

            if original_text_saved:
                print('PDF already has OCR text. Saved original OCR text to a txt file.')
            else:
                print('PDF already has OCR text, but pdftotext could not save it: '
                      + extraction_error)

            print('Overwriting existing OCR, and saving in ' + processed_filename)
            ocrmypdf.ocr(input_file=filename,
                         output_file=processed_filename,
                         sidecar=processed_OCR_txt_filename,
                         force_ocr=True,
                         **ocr_options)
            print('moving to next PDF....')

        PDF_processed_counter += 1

    finally:
        # blank separator line after every PDF, even if OCR raised
        print('')

# Concluding Stats
print('')
print('Processed PDFs in ' + directory[0] + ' are OCRed.')
print('PDFs processed: ' + str(PDF_processed_counter))
print('PDFs examined, including processed: ' + str(PDF_examined_counter))
running_time = (time.time() - starting_time)
print('Program Completed in ' + str(running_time) + ' seconds')

In [None]:
## FOR REFERENCE ONLY: keyword arguments accepted by ocrmypdf.ocr().
## Every line in this cell is commented out, so running it does nothing.

#ocrmypdf.ocr(input_file: Union[BinaryIO, os.PathLike, str, bytes], 
#             output_file: Union[BinaryIO, os.PathLike, str, bytes], *, 
#             language: Iterable[str] = None, 
#             image_dpi: int = None, 
#             output_type=None, 
#             sidecar: os.PathLike = None, 
#             jobs: int = None, 
#             use_threads: bool = None, 
#             title: str = None, 
#             author: str = None, 
#             subject: str = None, 
#             keywords: str = None, 
#             rotate_pages: bool = None, 
#             remove_background: bool = None, 
#             deskew: bool = None, 
#             clean: bool = None, 
#             clean_final: bool = None, 
#             unpaper_args: str = None, 
#             oversample: int = None, 
#             remove_vectors: bool = None, 
#             threshold: bool = None, 
#             force_ocr: bool = None, 
#             skip_text: bool = None, 
#             redo_ocr: bool = None, 
#             skip_big: float = None, 
#             optimize: int = None, 
#             jpg_quality: int = None, 
#             png_quality: int = None, 
#             jbig2_lossy: bool = None, 
#             jbig2_page_group_size: int = None, 
#             pages: str = None, 
#             max_image_mpixels: float = None, 
#             tesseract_config: Iterable[str] = None, 
#             tesseract_pagesegmode: int = None, 
#             tesseract_oem: int = None, 
#             pdf_renderer=None, 
#             tesseract_timeout: float = None, 
#             rotate_pages_threshold: float = None, 
#             pdfa_image_compression=None, 
#             user_words: os.PathLike = None, 
#             user_patterns: os.PathLike = None, 
#             fast_web_view: float = None, 
#             plugins: Iterable[str] = None, 
#             keep_temporary_files: bool = None, 
#             progress_bar: bool = None, 
#             **kwargs)