## Step 1. Setup & Data Preparation for PDF files
---

This notebook does the following:

1. Install the required Python packages and import the relevant files.

1. Utilize the PDF files available in the `pdf_data` directory that are specified in the `config.yaml` file under the `content_info` section.

1. Extracts text from each page of the PDF file using the `PyPDF2` library and stores each page's text in a `.txt` file.

1. Converts each page of the PDF file into an image and, depending on the split options set in `config.yaml`, crops it into vertical halves (left/right) and/or horizontal halves (upper/lower), storing the resulting images as `.jpg` files.

1. Stores the extracted texts and images in an S3 bucket for further analytics and RAG workflow purposes

In [None]:
# Install the packages listed in requirements.txt using the interpreter that runs
# this notebook (sys.executable), so they land in this kernel's environment
import sys
!{sys.executable} -m pip install -r requirements.txt

In [None]:
# import libraries required to run this notebook
import os
import json
import yaml
import fitz
import glob
import boto3
import PyPDF2
import base64
import logging
import sagemaker
import globals as g
from PIL import Image
import requests as req
from typing import List
from pathlib import Path
import pypdfium2 as pdfium
from typing import Dict, Optional
from utils import upload_to_s3, get_bucket_name

In [None]:
# Configure logging at INFO level; every record carries the timestamp, process id,
# source file and line, level and message
LOG_FORMAT = '[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
logger = logging.getLogger(__name__)

In [None]:
# Path of the YAML file that holds all settings used by this notebook
CONFIG_FILE_PATH = "config.yaml"

# Parse the YAML configuration and log it in full for traceability
fpath = CONFIG_FILE_PATH
with open(fpath, mode='r') as config_stream:
    config = yaml.safe_load(config_stream)
pretty_config = json.dumps(config, indent=2)
logger.info(f"config read from {fpath} -> {pretty_config}")

In [None]:
# Look up the bucket name via get_bucket_name, using the `cfn_stack_name`
# configured under the `aws` section of the config
cfn_stack_name = config['aws']['cfn_stack_name']
bucket_name: str = get_bucket_name(cfn_stack_name)
logger.info(f"Bucket name being used to store extracted images and texts from data: {bucket_name}")

## Step 2. Read PDF files from a local directory or download them from a public URL, then store the text and images of each page in a per-PDF folder

For the purpose of this POC we use the sample PDF files in the `pdf_data` folder. To use your own PDF files, either place them in the `pdf_data` folder or list their HTTP(S) URLs under `content_info` → `pdf_local_files` in `config.yaml`.

In [None]:
def ImageCrop(fname: str, left_outfile: str, right_outfile: str, upper_outfile: str, lower_outfile: str):
    """
    Crop an image into its left, right, upper and lower halves and save each
    half as a separate JPEG file.

    :param fname: Path to the image file to crop
    :param left_outfile: Output path for the left vertical half
    :param right_outfile: Output path for the right vertical half
    :param upper_outfile: Output path for the upper horizontal half
    :param lower_outfile: Output path for the lower horizontal half
    :return: None, the four halves are written to the given paths
    """
    # Open through a context manager so the underlying file handle is released
    # even when a crop or save fails
    with Image.open(fname) as img:
        width, height = img.size
        # (left, upper, right, lower) crop box of each half, paired with its output path
        halves = [
            ((0, 0, width / 2, height), left_outfile),
            ((width / 2, 0, width, height), right_outfile),
            ((0, 0, width, height / 2), upper_outfile),
            ((0, height / 2, width, height), lower_outfile),
        ]
        for box, outfile in halves:
            img.crop(box).save(outfile, 'JPEG')

In [None]:
def get_images(file:str, image_dir:str) -> List[str]:
    """
    Render every page of a PDF file to an image and save it to a directory.

    Each page is saved as `<pdf file stem>_page_<n>.jpg`, where n starts at 1.
    The render scale is read from `config['page_split_imgs']['image_scale']` and
    the image format from `config['pdf_dir_info']['image_format']`.

    :param file: Path to the PDF file
    :param image_dir: Directory where the page images are saved (created if missing)
    :return: A list of paths to the saved page images, in page order
    """
    pdf = pdfium.PdfDocument(file)
    try:
        # the image scale is configured in the config.yaml file
        image_scale: float = config['page_split_imgs']['image_scale']
        image_format = config['pdf_dir_info']['image_format']
        n_pages: int = len(pdf)
        file_name: str = Path(file).stem
        os.makedirs(image_dir, exist_ok=True)
        image_paths: List[str] = []
        print(f"Extracting {n_pages} images for {file}")
        for page_number in range(n_pages):
            page = pdf.get_page(page_number)
            try:
                bitmap = page.render(scale=image_scale, rotation=0, crop=(0, 0, 0, 0))
                pil_image = bitmap.to_pil()
                # NOTE: the file name always ends in `.jpg`, whatever image format is configured
                image_path = os.path.join(image_dir, f"{file_name}_page_{page_number + 1}.jpg")
                pil_image.save(image_path, format=image_format)
            finally:
                # release the page before the document is closed
                page.close()
            image_paths.append(image_path)
        return image_paths
    finally:
        # always release the document, even if rendering or saving a page fails
        pdf.close()

In [None]:
# Collect the `.jpg` files found one directory level below `manually_saved_images_path`;
# these are used in place of the generated page images when
# `manually_saved_images_provided` is set in the config
manual_img_root = config['pdf_dir_info']['manually_saved_images_path']
manual_img_path: str = os.path.join(manual_img_root, "*", "*.jpg")
manually_uploaded_img_files = glob.glob(manual_img_path, recursive=True)
manual_img_count = len(manually_uploaded_img_files)
logger.info(f"there are {manual_img_count} files in {manual_img_path}")

### Step 3: Extract the `text files` and `images` from each `page in the PDF file` and store it in S3

In [None]:
def extract_texts_and_images(pdf_file: str, output_dir: str) -> Dict:
    """
    Extract the text and the page images from every page of a PDF file.

    The PDF is read from `config['pdf_dir_info']['source_pdf_dir']`. For a PDF named
    `<stem>.pdf`, the text of each page is written to
    `<output_dir>/<stem>/<pdf_txt_path>/` and the page images are rendered by
    `get_images` into `<output_dir>/<stem>/<pdf_img_path>/`.

    When `config['manually_saved_images_provided']` is `False`, every page image is
    cropped into halves with `ImageCrop`, and the halves selected by the
    `horizontal_split` / `vertical_split` settings are recorded together with the
    full page image. Otherwise the manually uploaded images
    (`manually_uploaded_img_files`) are recorded instead, once per PDF.

    :param pdf_file: Name of the PDF file inside the source PDF directory
    :param output_dir: Root directory under which the per-PDF folder is created
    :return: Dictionary with the keys `page_number` (zero-based page indices),
             `image_paths` and `text_paths` (paths of the files to upload)
    """
    # Dict containing the text and image paths, along with the page number
    path_info: Dict = {
        'page_number': [],
        'image_paths': [],
        'text_paths': []
    }
    pdf_dir_cfg = config['pdf_dir_info']
    # Insert your pdf files in this directory to use custom pdf files
    pdf_fpath: str = os.path.join(pdf_dir_cfg['source_pdf_dir'], pdf_file)
    logger.info(f"Reading PDF file: {pdf_fpath}")
    # directories where the texts and images extracted from each page are saved
    file_name: str = Path(pdf_file).stem
    output_pdf_dir = os.path.join(output_dir, file_name)
    text_dir = os.path.join(output_pdf_dir, pdf_dir_cfg['pdf_txt_path'])
    image_dir = os.path.join(output_pdf_dir, pdf_dir_cfg['pdf_img_path'])

    # Keep the PDF open only while it is being read, so the handle is always released
    with open(pdf_fpath, "rb") as pdf_stream:
        pdf_reader = PyPDF2.PdfReader(pdf_stream)
        num_pages: int = len(pdf_reader.pages)
        os.makedirs(text_dir, exist_ok=True)
        os.makedirs(image_dir, exist_ok=True)
        # Render every page to `<file_name>_page_<n>.jpg` inside image_dir
        get_images(pdf_fpath, image_dir)

        # anything other than an explicit `False` means manual images are used
        use_manual_images: bool = config['manually_saved_images_provided'] is not False
        if use_manual_images and num_pages > 0:
            # The manual images are not tied to a page, so record them once per PDF
            # rather than once per page (which would upload every file repeatedly)
            path_info['image_paths'].extend(manually_uploaded_img_files)

        text_extn = config['content_info']['text_extn']
        for page_number, page in enumerate(pdf_reader.pages):
            page_label = f"{file_name}_page_{page_number + 1}"
            # Extract the text from the page and store it in its own file
            pdf_text = page.extract_text()
            text_path = os.path.join(text_dir, f"{file_name}_text_{page_number + 1}{text_extn}")
            with open(text_path, 'w', encoding='utf-8') as text_file:
                text_file.write(pdf_text)

            if not use_manual_images:
                # Full page image written by get_images for this page
                page_image = os.path.join(image_dir, f"{page_label}.jpg")
                # Vertical split
                left_half_path = os.path.join(image_dir, f"{page_label}_left_half{g.IMAGE_FILE_EXTN}")
                right_half_path = os.path.join(image_dir, f"{page_label}_right_half{g.IMAGE_FILE_EXTN}")
                # Horizontal split
                upper_half_path = os.path.join(image_dir, f"{page_label}_upper_half{g.IMAGE_FILE_EXTN}")
                lower_half_path = os.path.join(image_dir, f"{page_label}_lower_half{g.IMAGE_FILE_EXTN}")
                # All four halves are written to disk; only those selected below are recorded
                ImageCrop(page_image, left_half_path, right_half_path, upper_half_path, lower_half_path)

                horizontal_split = config['page_split_imgs']['horizontal_split']
                vertical_split = config['page_split_imgs']['vertical_split']
                if horizontal_split and vertical_split is True:
                    # split all 4 ways
                    selected = [left_half_path, right_half_path, upper_half_path, lower_half_path, page_image]
                elif horizontal_split is False and vertical_split is True:
                    # vertical split only: left and right halves
                    selected = [left_half_path, right_half_path, page_image]
                elif horizontal_split is True and vertical_split is False:
                    # horizontal split only: upper and lower halves
                    selected = [upper_half_path, lower_half_path, page_image]
                else:
                    # no split requested: keep the page as a single image
                    selected = [page_image]
                path_info['image_paths'].extend(selected)

            # save the text path and page number of the given page from the pdf file
            path_info['text_paths'].append(text_path)
            path_info['page_number'].append(page_number)
    return path_info

In [None]:
# Report whether manually saved page screenshots replace the generated page images
manual_images_flag = config['manually_saved_images_provided']
logger.info(f"Are the images manually given as screenshots from files: {manual_images_flag}")

#### Download a publicly available pdf file or your custom file from the `pdf_data` directory

In [None]:
from urllib.parse import urlparse

# Each entry is either the name of a file inside the source PDF directory or an http(s) URL
content_list: List[str] = config['content_info']['pdf_local_files'] or []
logger.info(f"List of pdf content provided: {content_list}")
# Downloads are stored next to the local PDFs, because the extraction step
# resolves every file name against this directory
source_pdf_dir: str = config['pdf_dir_info']['source_pdf_dir']
local_files: List[str] = []
for pdf_file in content_list:
    if pdf_file.startswith(('https://', 'http://')):
        # name the local copy after the last path segment of the URL, ignoring any query string
        local_file: str = os.path.basename(urlparse(pdf_file).path)
        # a timeout keeps the notebook from hanging on an unresponsive server
        r = req.get(pdf_file, allow_redirects=True, timeout=60)
        if r.status_code != 200:
            logger.error(f"failed to download {pdf_file}, status code: {r.status_code}")
            continue
        logger.info(f"{pdf_file} downloaded successfully")
        os.makedirs(source_pdf_dir, exist_ok=True)
        local_path = os.path.join(source_pdf_dir, local_file)
        with open(local_path, "wb") as f:
            f.write(r.content)
        logger.info(f"{pdf_file} written to {local_path}")
    else:
        local_file = pdf_file
    # register both downloaded and pre-existing files for extraction
    local_files.append(local_file)
    logger.info(f"saved pdf file: {pdf_file}")
logger.info(f"total files saved: {len(local_files)}")

In [None]:
# Run the extraction for every local PDF; each entry of `pages_stored` is the
# dictionary returned by extract_texts_and_images for one PDF file
pages_stored: List[Dict] = [
    extract_texts_and_images(local_file, config['pdf_dir_info']['pdf_extracted_data'])
    for local_file in local_files
]
logger.info(f"Images and Page texts have been extracted from {len(local_files)} PDF file")

In [None]:
# Display everything collected for upload to S3: one dictionary per PDF file holding
# the page numbers, the image paths and the text file paths of its pages
pages_stored

Now we upload the images into an S3 bucket. This is done for two reasons:
1. In a production environment these images could be worked upon in parallel by a batch process.
1. An S3 bucket (that is part of a datalake) provides a secure location for an enterprise to store these images and a multimodal model can read the texts/images directly from the S3 bucket.

In [None]:
# Upload the files recorded for each PDF: first its images under the image prefix,
# then its page texts under the text prefix
for pdf_stored in pages_stored:
    for img_path in pdf_stored['image_paths']:
        upload_to_s3(img_path, bucket_name, g.BUCKET_IMG_PREFIX)
    for txt_path in pdf_stored['text_paths']:
        upload_to_s3(txt_path, bucket_name, g.BUCKET_PDF_TEXT_PREFIX)