### This notebook converts the NRO pdfs to images
To use pytesseract, the input must be an image. Therefore, we need to convert the PDFs into a series of images, one image per page. The images for each PDF are stored in a folder named after that PDF.

In [None]:
from pathlib import Path 
from pdf2image import convert_from_path
import sys
sys.path.append(str(Path.cwd().parent.parent))
from utils import create_fh_logger

In [None]:
# Locate the input PDFs: four directory levels above the current working
# directory, then processing/nro_declassified/pdfs.
src = Path.cwd().parent.parent.parent.parent / 'processing' / 'nro_declassified' / 'pdfs'
# Match on the '.pdf' extension rather than any name that merely ends in "pdf".
# sorted() consumes the glob generator directly and gives a stable processing order.
pdfs = sorted(src.glob('*.pdf'))
# Page images are written to an 'imgs' folder that sits next to 'pdfs'.
dst = src.parent / 'imgs'
dst.mkdir(exist_ok=True)
# Log files live in a 'logs' folder one level above 'nro_declassified'.
logs = src.parent.parent / 'logs'
logs.mkdir(exist_ok=True)
# NOTE(review): the log name reads "imgs2pdfs" although this notebook converts
# PDFs to images; kept unchanged so existing logs keep being appended to.
logger = create_fh_logger(logs / "imgs2pdfs.log")

In [None]:
# Convert every PDF into one PNG per page, stored in a folder named after the
# PDF, then strip pdf2image's filename prefix so each file is "<page>.png".
for pdf in pdfs:
    # The name is formatted as UTF-8 bytes, so it is logged in its b'...' form.
    pdf_label = str(pdf.name).encode('utf-8')
    logger.info(f"Working on {pdf_label}")
    save_loc = dst / pdf.stem
    save_loc.mkdir(exist_ok=True)
    try:
        pgs = convert_from_path(
            pdf_path=pdf,
            dpi=300,
            fmt='png',
            thread_count=16,
            output_folder=save_loc,
            output_file='',
        )
    except Exception as e:
        # Best effort: record the failure (with traceback) in the log file as
        # well as in the notebook output, then carry on with the next steps.
        logger.exception(f"Failed to convert {pdf_label} into {save_loc}")
        print(e)
        print(save_loc)
    else:
        # The returned images are backed by the files just written; close them
        # explicitly so no handle is still open when the files are renamed.
        for pg in pgs:
            pg.close()
        del pgs
    # Snapshot the listing first: renaming entries while the glob generator is
    # still walking the directory can skip or revisit files.
    for img in list(save_loc.glob('*png')):
        # Files are saved as "<prefix>-<page>.png"; keep only the page number.
        pg_num = img.stem.split('-')[-1]
        new = img.with_name(f'{pg_num}{img.suffix}')
        if new != img:
            # replace() overwrites a stale page from an earlier run on every
            # platform, whereas rename() fails on Windows if the target exists.
            img.replace(new)
    logger.info(f"Finished {pdf_label}")