### Uses Tesseract-OCR to Create Text from Images
This notebook loops through all the images in the imgs folder and runs tesseract-ocr on them. It saves the output as one JSON file per PDF, where the filename is the name of the PDF. The contents are a nested dictionary keyed first by the PDF name and then by page number. Within each page of data, Tesseract's confidence scores, the locations of the detected text, and the text itself are available.

In [None]:
import json 
import logging
import sys
from pathlib import Path
from PIL import Image # cv2 doesnt actually work well with pathlib and raises an error with some of our filenames
from pytesseract import image_to_data # https://github.com/UB-Mannheim/tesseract/wiki
from pytesseract import Output

sys.path.append(str(Path.cwd().parent.parent))
from utils import create_fh_logger

In [None]:
# Input folder: three levels above the working directory, under
# processing/nro_declassified/imgs. The OCR loop iterates over its entries.
src = Path.cwd().parent.parent.parent.joinpath('processing', 'nro_declassified', 'imgs')

# OCR output goes next to the image folder.
dst = src.parent.joinpath("ocr")
dst.mkdir(exist_ok=True)

# Log files go one level above that.
logs = src.parent.parent.joinpath('logs')
logs.mkdir(exist_ok=True)
logger = create_fh_logger(logs.joinpath("ocr.log"))

# Snapshot of the JSON files already present in dst at this point; the OCR
# loop checks against it so finished PDFs are not processed again.
already_ocrd = [*dst.glob('*.json')]

In [None]:
# Run Tesseract over every page image of every PDF folder in `src` and write
# one JSON file per PDF into `dst`, shaped as {pdf_name: {page_stem: data}}
# where `data` is the dict returned by image_to_data(..., Output.DICT).
# A failure on one PDF is logged and the loop moves on to the next one.
logger.info('Entering for loop.')
for item in src.iterdir():
    translation = {}
    try:  # there are some unallowed characters and conditions in here that haven't all been identified
        # Entries in src should be folders of page images, but a stray file
        # can end up here if an earlier step went wrong. is_dir must be
        # *called*: the bare attribute is a bound method and always truthy.
        if not item.is_dir():
            continue

        pdf_name = item.stem
        file_name = dst / f'{pdf_name}.json'
        if file_name in already_ocrd:
            continue  # OCR output for this PDF already exists

        pgs = list(item.glob('*png'))
        logger.info(f"Starting Translation on {pdf_name.encode('utf-8')} with {len(pgs)} pages.")

        pg_translations = {}
        for pg in pgs:
            # The context manager closes the image file as soon as the page
            # has been OCR'd, so handles do not pile up across pages.
            with Image.open(pg) as img:
                pg_translations[pg.stem] = image_to_data(img, output_type=Output.DICT)
        translation[pdf_name] = pg_translations

        logger.info(f"Saving Translation for {pdf_name.encode('utf-8')} as a JSON.")
        with open(file_name, 'w', encoding='utf-8') as f:
            json.dump(translation, f)
    except Exception:
        # Deliberate best-effort: keep going, but record which item failed
        # and the full traceback. %a keeps the name ASCII-safe in the log.
        logger.exception("OCR failed for %a", item.name)
logger.info('Exiting for loop')