In [None]:
## libraries
import os, ocrmypdf, time, pprint

In [None]:
## Limiting thread limit to 1 empirically appears to decrease time slightly
## You can ignore this and likely experience no clear performance penalty

# os.environ['OMP_THREAD_LIMIT'] = '1'
# print(os.environ['OMP_THREAD_LIMIT']) #it appears default is set to 1 since laptops have fewer cores?

In [None]:

## The main OCR function. 'data_dir' is the directory that contains the
## PDFs to OCR; all of its subdirectories are searched as well.
## If 'keep_processed' is True, OCR is not rerun for PDFs that have
## previously been processed by the program; if False, it is rerun.

def ocr_pdfs(data_dir, keep_processed):
    """OCR the PDFs in *data_dir* and all of its sub-directories, then print totals.

    Args:
        data_dir: root directory; it and every sub-directory found by
            list_of_directories() are handed to ocr_pdfs_in_directory().
        keep_processed: forwarded unchanged to ocr_pdfs_in_directory()
            for each directory.

    Returns:
        None. The totals (PDFs processed, PDFs examined, runtime) are printed.
    """
    processed_total = 0
    examined_total = 0
    runtime_total = 0.0

    for directory in list_of_directories(data_dir):
        processed, examined, runtime = ocr_pdfs_in_directory(keep_processed, directory)
        processed_total += processed
        examined_total += examined
        runtime_total += runtime

    # Each per-directory runtime is already rounded to 0.1 s; re-round the sum
    # so binary float noise (e.g. 0.30000000000000004) is not printed.
    runtime_total = round(runtime_total, 1)

    print('pdfs proccessed: ' + str(processed_total))
    print('pdfs examined: ' + str(examined_total))
    print('total runtime: ' + str(runtime_total))

In [None]:
## The following functions are called by the ocr_pdfs function above.
## [You have to run this cell before the ocr_pdfs function will work.]

def list_of_directories(data_dir):
    """Return the paths of *data_dir* and every directory below it.

    The paths come from os.walk(data_dir), so *data_dir* itself is included.
    A short count message is printed before returning.
    """
    found = []
    for path, _subdirs, _files in os.walk(data_dir):
        found.append(path)

    print(f'{len(found)} directories to examine, including all sub-directories')
    print('* * *')
    return found

def _save_existing_text(pdf_path, txt_path):
    """Write the text layer already embedded in *pdf_path* to *txt_path*.

    Runs the external ``pdftotext`` tool. Returns True when the tool exits
    with status 0, and False when it fails or cannot be started.
    """
    import subprocess  # local import so this cell needs no extra top-level import

    try:
        # Argument list (no shell): file names containing spaces, quotes or '$'
        # are passed to pdftotext verbatim.
        result = subprocess.run(['pdftotext', pdf_path, txt_path], check=False)
    except OSError as error:
        # Best effort: a missing pdftotext binary must not abort the OCR run.
        print('Could not run pdftotext: ' + str(error))
        return False
    return result.returncode == 0


def ocr_pdfs_in_directory(keep_processed, dir_name):
    """OCR every not-yet-processed PDF directly inside *dir_name*.

    Sub-directories are not entered. For each ``name.pdf`` the OCRed copy is
    written to ``name.processed.pdf`` and the recognised text to
    ``name.ocr.txt``, both in *dir_name*. Files whose name already ends in
    ``.processed.pdf`` are outputs of this program and are skipped.

    If a PDF already contains text (ocrmypdf raises PriorOcrFoundError or
    TaggedPDFError), that text is first saved to ``name.original.txt`` via
    ``pdftotext`` and OCR is then forced.

    Args:
        keep_processed: if true, a PDF that already has a ``.processed.pdf``
            next to it is skipped and the existing output is left untouched.
            If false, every original PDF is OCRed again and its processed
            output overwritten.
        dir_name: path of the directory to scan.

    Returns:
        Tuple ``(pdfs_processed, pdfs_examined, running_time_seconds)``;
        the running time is rounded to one decimal.
    """
    keep_processed_PDFs = bool(keep_processed)

    starting_time = time.time()  # time program started
    directory = dir_name         # full directory name and path
    PDF_processed_counter = 0    # PDFs processed
    PDF_examined_counter = 0     # PDFs examined, including PDFs already processed

    # List the PDFs of *this* directory (not the current working directory);
    # sorted so the processing order is reproducible.
    pdfs_in_directory = sorted(
        filename for filename in os.listdir(directory) if filename.endswith('.pdf')
    )

    for filename in pdfs_in_directory:
        print('PDF ' + str(PDF_examined_counter + 1))
        stem = os.path.splitext(filename)[0]

        ## ignore PDFs that are themselves output of a previous run
        if stem.endswith('.processed'):
            print('skipping ' + filename + '...')
            PDF_examined_counter += 1
            print('')
            continue

        # announcing PDF to be OCRed
        print(filename + '...')

        ## generate full paths so the files are found regardless of the
        ## current working directory
        input_filename = os.path.join(directory, filename)
        processed_filename = os.path.join(directory, stem + '.processed.pdf')
        original_OCR_txt_filename = os.path.join(directory, stem + '.original.txt')
        processed_OCR_txt_filename = os.path.join(directory, stem + '.ocr.txt')

        ## Case 1: PDF has already been processed, and you do not want to
        ## reprocess it. Checked before any OCR runs so the existing output
        ## is never overwritten.
        if keep_processed_PDFs and os.path.isfile(processed_filename):
            print("PDF has already been processed by this program.")
            print("Processed PDF kept (keep_processed_PDFs = True).")
            print('moving to next PDF....')
            PDF_examined_counter += 1
            print('')
            continue

        # Options common to the normal and the forced OCR pass.
        # remove_background is left out: temporarily not implemented in
        # current OCRmyPDF.
        shared_options = dict(
            deskew=True,
            rotate_pages=True,
            sidecar=processed_OCR_txt_filename,
            tesseract_timeout=600,
        )

        try:
            ## execute OCR on PDF; raises if the PDF already carries text
            ocrmypdf.ocr(input_file=input_filename,
                         output_file=processed_filename,
                         clean=True,
                         **shared_options)

        except (ocrmypdf.exceptions.PriorOcrFoundError,
                ocrmypdf.exceptions.TaggedPDFError):
            ## runs if unprocessed PDF already has text
            if _save_existing_text(input_filename, original_OCR_txt_filename):
                print('PDF already has OCR text. Saved original OCR text to a txt file.')
            else:
                print('PDF already has OCR text, but the original text could not be saved.')

            ## Case 2: Overwrite existing OCR
            print('Overwriting existing OCR, and saving in ' + processed_filename)
            # NOTE(review): this pass uses clean_final while the normal pass
            # uses clean; kept as in the original -- confirm it is intended.
            ocrmypdf.ocr(input_file=input_filename,
                         output_file=processed_filename,
                         clean_final=True,
                         force_ocr=True,
                         **shared_options)
            print('moving to next PDF....')

        finally:
            # blank line after every PDF, whether OCR succeeded or not
            print('')

        # Only reached when one of the OCR passes succeeded.
        PDF_processed_counter += 1
        PDF_examined_counter += 1

    # Concluding Summary
    print('----directory summary----')
    print('-------------------------')
    print(str(PDF_processed_counter) + ' pdfs processed in ' + directory)
    print('PDFs examined in directory, including processed: ' + str(PDF_examined_counter))
    running_time = round((time.time() - starting_time), 1)
    print('Running Time: ' + str(running_time) + ' seconds')
    print('-------------------------')
    return (PDF_processed_counter, PDF_examined_counter, running_time)
    

In [None]:
%%capture output

## The call below OCRs all PDFs in the current working
## directory ('.') and its subdirectories -- normally the
## directory that contains this Jupyter notebook.
## The second argument is keep_processed (False here).

ocr_pdfs('.', False)

In [None]:
## Prints what the OCR cell above captured into `output`
## (via %%capture output): the list of PDFs that were
## OCRed, plus any error messages.

print(output)

In [None]:
## DO NOT RUN THIS CODE. FOR REFERENCE ONLY. 

#ocrmypdf.ocr(input_file: Union[BinaryIO, os.PathLike, str, bytes], 
#             output_file: Union[BinaryIO, os.PathLike, str, bytes], *, 
#             language: Iterable[str] = None, 
#             image_dpi: int = None, 
#             output_type=None, 
#             sidecar: os.PathLike = None, 
#             jobs: int = None, 
#             use_threads: bool = None, 
#             title: str = None, 
#             author: str = None, 
#             subject: str = None, 
#             keywords: str = None, 
#             rotate_pages: bool = None, 
#             remove_background: bool = None, 
#             deskew: bool = None, 
#             clean: bool = None, 
#             clean_final: bool = None, 
#             unpaper_args: str = None, 
#             oversample: int = None, 
#             remove_vectors: bool = None, 
#             threshold: bool = None, 
#             force_ocr: bool = None, 
#             skip_text: bool = None, 
#             redo_ocr: bool = None, 
#             skip_big: float = None, 
#             optimize: int = None, 
#             jpg_quality: int = None, 
#             png_quality: int = None, 
#             jbig2_lossy: bool = None, 
#             jbig2_page_group_size: int = None, 
#             pages: str = None, 
#             max_image_mpixels: float = None, 
#             tesseract_config: Iterable[str] = None, 
#             tesseract_pagesegmode: int = None, 
#             tesseract_oem: int = None, 
#             pdf_renderer=None, 
#             tesseract_timeout: float = None, 
#             rotate_pages_threshold: float = None, 
#             pdfa_image_compression=None, 
#             user_words: os.PathLike = None, 
#             user_patterns: os.PathLike = None, 
#             fast_web_view: float = None, 
#             plugins: Iterable[str] = None, 
#             keep_temporary_files: bool = None, 
#             progress_bar: bool = None, 
#             **kwargs)