## Prepare data

In [1]:
from pypdf import PdfReader as pdf_reader
from collections import defaultdict
from google.cloud import vision
from tqdm import tqdm
import logging
import json
import os 

#### Setup logging

In [2]:
# Notebook-wide logger; INFO level so per-page progress messages are shown.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# logging.getLogger returns the same object on every call, so re-running this
# cell would otherwise stack another StreamHandler and duplicate every message.
if not logger.handlers:
    logger.addHandler(logging.StreamHandler())

#### Essentials 

In [3]:
# Where source PDFs are read from and where extracted images / JSONL are written.
LOCAL_INPUT_DIR = './DATA/INPUT'
LOCAL_OUTPUT_DIR = './DATA/OUTPUT'

# The key-file location is exported before the client is constructed so the
# client can pick it up from the environment.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = './credentials/vai-key.json'
image_annotator_client = vision.ImageAnnotatorClient()

#### Process PDFs 

In [4]:
def read_pdf(file_name: str) -> list:
    """
    Extract the text and the embedded images of every page of a PDF.

    The document is read from ``{LOCAL_INPUT_DIR}/{file_name}.pdf``. Every image
    found on a page is written to ``{LOCAL_OUTPUT_DIR}/{file_name}/IMAGES``.

    Args:
        file_name: Name of the PDF without the ``.pdf`` extension.

    Returns:
        A two-element list ``[text_by_page, images_by_page]`` where
        ``text_by_page`` maps the 1-based page number to the extracted text and
        ``images_by_page`` maps the 1-based page number to the list of image
        file paths written for that page (pages without images have no entry).

    Raises:
        OSError: If the input PDF cannot be opened.
    """
    text_by_page = {}
    images_by_page = defaultdict(list)
    image_write_path = f'{LOCAL_OUTPUT_DIR}/{file_name}/IMAGES'
    logger.info(f'Extracting text and images from document: {file_name}')
    with open(f'{LOCAL_INPUT_DIR}/{file_name}.pdf', 'rb') as pdf_file:
        reader = pdf_reader(pdf_file)
        # Wrap the page sequence itself (not enumerate) so tqdm can show a total.
        for page_num, page in enumerate(tqdm(reader.pages), start=1):
            logger.info(f'Processing page num: {page_num}')
            text_by_page[page_num] = page.extract_text()
            try:
                for image in page.images:
                    # Created lazily so documents without images leave no empty directory.
                    os.makedirs(image_write_path, exist_ok=True)
                    # Image names repeat from page to page, so the page number is
                    # part of the file name to stop later pages overwriting
                    # images saved for earlier ones.
                    image_path = f'{image_write_path}/page-{page_num}-{image.name}'
                    with open(image_path, 'wb') as image_file:
                        image_file.write(image.data)
                    # Record the path only once the file really exists on disk.
                    images_by_page[page_num].append(image_path)
            except Exception as e:
                # Best effort: a page whose images cannot be extracted (e.g. Pillow
                # is not installed) still contributes its text.
                logger.error(f'Image extraction failed on page {page_num}: {e}')
    return [text_by_page, images_by_page]
    

In [5]:
def process_images_ocr(text_by_pages: dict, images_by_pages: dict) -> dict:
    """
    Run OCR on every extracted image and append the detected text to its page.

    Args:
        text_by_pages: Mapping of page number to page text. Updated in place.
        images_by_pages: Mapping of page number to a list of image file paths.

    Returns:
        The same ``text_by_pages`` mapping, where each successfully processed
        image has added a ``"Text extracted from the image =>"`` marker line
        followed by the detected text to its page.
    """
    for page, images in tqdm(images_by_pages.items()):
        logger.info(f'Extracting text from images for page: {page}')
        for image_path in images:
            # Everything for one image, including opening the file, sits inside
            # the try so a single bad image is logged instead of aborting the run.
            try:
                with open(image_path, 'rb') as image_file:
                    content = image_file.read()
                image = vision.Image(content=content)
                response = image_annotator_client.text_detection(image=image)
                # A failed request can come back as a populated error field
                # rather than an exception; skip it so no empty marker is added.
                if response.error.message:
                    logger.error(f'Vision API error for image {image_path}: {response.error.message}')
                    continue
                detected_text = response.full_text_annotation.text
                logger.info(f'Adding extracted text from image back into the page: {page}')
                # Re-read the page text each time so several images on the same
                # page accumulate instead of overwriting each other.
                text_by_pages[page] = '\n'.join(
                    [text_by_pages[page], "Text extracted from the image =>", detected_text]
                )
            except Exception as e:
                logger.error(f'OCR failed for image {image_path}: {e}')
    return text_by_pages

#### Write processed pages to local dir as JSON lines

In [6]:
def write_pages_to_local(file_name, text_by_pages):
    """
    Dump the processed pages of one document as a JSON Lines file.

    One JSON object per page, with the keys ``doc_name``, ``page_num`` and
    ``page_content``, is written to
    ``{LOCAL_OUTPUT_DIR}/{file_name}/TEXT/{file_name}.jsonl``.
    Any exception raised while creating the directory or writing the file is
    logged and not re-raised.

    Args:
        file_name: Document name, used for the output folder, the file name and
            the ``doc_name`` field.
        text_by_pages: Mapping of page number to page text.
    """
    logger.info(f'Writing processed pages for {file_name} into local disk')
    target_dir = f'{LOCAL_OUTPUT_DIR}/{file_name}/TEXT'
    try:
        os.makedirs(target_dir, exist_ok=True)
        with open(f'{target_dir}/{file_name}.jsonl', 'w') as sink:
            for page_num, content in tqdm(text_by_pages.items()):
                record = {
                    "doc_name": file_name,
                    "page_num": page_num,
                    "page_content": str(content),
                }
                sink.write(f'{json.dumps(record)}\n')
    except Exception as err:
        logger.error(err)

In [7]:
def process_file(file_name: str):
    """
    Run the full pipeline for one document: read the PDF, OCR its images,
    then write the resulting pages to local disk.

    Args:
        file_name: Name of the PDF without the ``.pdf`` extension.
    """
    page_texts, page_images = read_pdf(file_name)
    enriched_texts = process_images_ocr(page_texts, page_images)
    write_pages_to_local(file_name, enriched_texts)

In [8]:
# Process each input document in turn.
for document_name in ('file-1', 'file-2'):
    process_file(document_name)

Extracting text and images from document: file-1
0it [00:00, ?it/s]Processing page num: 1
1it [00:00,  9.36it/s]Processing page num: 2
Processing page num: 3
Processing page num: 4
4it [00:00, 19.99it/s]Processing page num: 5
pillow is required to do image extraction. It can be installed via 'pip install pypdf[image]'
Processing page num: 6
Processing page num: 7
7it [00:00, 23.53it/s]Processing page num: 8
Processing page num: 9
Processing page num: 10
10it [00:00, 24.65it/s]Processing page num: 11
pillow is required to do image extraction. It can be installed via 'pip install pypdf[image]'
Processing page num: 12
pillow is required to do image extraction. It can be installed via 'pip install pypdf[image]'
Processing page num: 13
pillow is required to do image extraction. It can be installed via 'pip install pypdf[image]'
Processing page num: 14
pillow is required to do image extraction. It can be installed via 'pip install pypdf[image]'
14it [00:00, 28.92it/s]Processing page num: 15
