### This notebook converts the NRO pdfs to images
pytesseract only accepts images as input, so each PDF must first be converted into a series of images, one image per page. The images for a PDF are stored in a folder named after that PDF.

In [None]:
import logging
from pathlib import Path 
from pdf2image import convert_from_path

def create_fh_logger(file):
    """Configure the root logger to write timestamped records to ``file``.

    The root logger's level is set to INFO, so INFO and above reach the
    file even though the handler itself is set to DEBUG.

    Calling this more than once with the same file (for example when a
    notebook cell is re-run) does not attach a second handler, so each
    record is written only once.

    Args:
        file: Path (``str`` or ``os.PathLike``) of the log file to append to.

    Returns:
        The root ``logging.Logger``.
    """
    # Local import keeps this function self-contained; the file's import
    # block does not bring in ``os``.
    import os

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # FileHandler stores its target as an absolute path in ``baseFilename``;
    # normalise the requested path the same way before comparing.
    target = os.path.abspath(os.fspath(file))
    for handler in logger.handlers:
        if (
            isinstance(handler, logging.FileHandler)
            and handler.baseFilename == target
        ):
            return logger

    formatter = logging.Formatter('%(asctime)s | %(levelname)s | %(message)s')
    file_handler = logging.FileHandler(file)
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    return logger

In [None]:
# Input PDFs are expected two directory levels above the current working
# directory, under processing/nro_declassified/pdfs.
src = Path.cwd().parent.parent / 'processing' / 'nro_declassified' / 'pdfs'
# Match on the ".pdf" extension (the dot matters: a bare '*pdf' would also
# pick up any name that merely ends in "pdf"). sorted() accepts the glob
# generator directly and gives a deterministic processing order.
pdfs = sorted(src.glob('*.pdf'))
# Page images go in a sibling "imgs" directory next to the PDFs.
dst = src.parent / 'imgs'
dst.mkdir(exist_ok=True)
# Log files go in a "logs" directory one level above nro_declassified.
logs = src.parent.parent / 'logs'
logs.mkdir(exist_ok=True)
logger = create_fh_logger(logs / "imgs2pdfs.log")

In [None]:
# Convert every PDF into per-page PNG files under dst/<pdf stem>/.
for pdf in pdfs:
    logger.info(f"Working on {str(pdf.name).encode('utf-8')}")
    save_loc = dst / pdf.stem
    # The output folder has to exist before conversion writes into it;
    # exist_ok makes re-running the cell safe.
    save_loc.mkdir(parents=True, exist_ok=True)
    pgs = convert_from_path(pdf_path=pdf, dpi=300, fmt='png', thread_count=32, output_folder=save_loc)
    logger.info(f"Finished {str(pdf.name).encode('utf-8')}")