# Data preparation
***This notebook works best with the `conda_python3` kernel on an `ml.t3.medium` instance.***

In this notebook we download a publicly available slide deck and convert it into images, one image per slide. These images are then stored in Amazon S3, where they can be made available to an Amazon SageMaker endpoint for inference.


In [1]:
!pip install -r requirements.txt



In [2]:
import os
import json
import glob
import boto3
import base64
import logging
import sagemaker
from globals import *
from PIL import Image
import requests as req
from typing import List
from pathlib import Path
import pypdfium2 as pdfium
from utils import upload_to_s3

# Configure root logging once for the notebook: timestamp, process id,
# source file and line, level, then the message. INFO and above are shown.
logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s',
)

# Logger used by the cells below.
logger = logging.getLogger(__name__)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [3]:
def get_images(file:str, image_dir:str = IMAGE_DIR) -> List[str]:
    """
    Render every page of a PDF to a JPEG file and return the saved file paths.

    Each page is written as ``<pdf file stem>_image_<page number>.jpg``, with
    page numbers starting at 1.

    :param file: Path to the PDF file
    :param image_dir: Directory the JPEG files are written to (created if missing)
    :return: A list of paths to the saved images, in page order
    """

    # Get presentation
    pdf = pdfium.PdfDocument(file)
    n_pages = len(pdf)

    # Extracting file name and creating the directory for images.
    # Use the caller-supplied image_dir, not the global default, so the
    # parameter is actually honored.
    file_name = Path(file).stem  # Gets the file name without extension
    os.makedirs(image_dir, exist_ok=True)

    # Get images
    image_paths = []
    print(f"Extracting {n_pages} images for {file}")
    for page_number in range(n_pages):
        page = pdf.get_page(page_number)
        bitmap = page.render(scale=1, rotation=0, crop=(0, 0, 0, 0))
        pil_image = bitmap.to_pil()

        # Saving the image with the specified naming convention (1-based page number)
        image_path = os.path.join(image_dir, f"{file_name}_image_{page_number + 1}.jpg")
        pil_image.save(image_path, format="JPEG")
        image_paths.append(image_path)

    return image_paths

In [4]:
# Download the slide deck and save it in the working directory under the
# file name taken from the last path component of the URL.
url: str = SLIDE_DECK
local_file: str = os.path.basename(url)

# The timeout (seconds) keeps the cell from hanging forever on an unresponsive server.
r = req.get(url, allow_redirects=True, timeout=60)

# Fail here with a clear HTTP error instead of silently skipping the write
# and breaking later when the PDF is missing (or stale) on disk.
r.raise_for_status()
logger.info(f"{url} downloaded successfully")

with open(local_file, "wb") as f:
    f.write(r.content)
logger.info(f"{url} written to {local_file}")

[2024-01-06 04:51:45,293] p22586 {2780445363.py:5} INFO - https://d1.awsstatic.com/events/Summits/torsummit2023/CMP301_TrainDeploy_E1_20230607_SPEdited.pdf downloaded successfully
[2024-01-06 04:51:45,298] p22586 {2780445363.py:8} INFO - https://d1.awsstatic.com/events/Summits/torsummit2023/CMP301_TrainDeploy_E1_20230607_SPEdited.pdf written to CMP301_TrainDeploy_E1_20230607_SPEdited.pdf


In [5]:
# PDFs to convert; currently only the slide deck downloaded above.
file_names = [local_file]

# Collect the saved image paths for every PDF, in order.
images: List = []
for fi in file_names:
    images += get_images(fi)


Extracting 31 images for CMP301_TrainDeploy_E1_20230607_SPEdited.pdf


In [6]:
# Upload each extracted image to S3; the return values are collected into `_` and not used further.
_ = [upload_to_s3(img_path, BUCKET_NAME, BUCKET_IMG_PREFIX) for img_path in images]

[2024-01-06 04:51:49,172] p22586 {utils.py:20} INFO - File img/CMP301_TrainDeploy_E1_20230607_SPEdited_image_1.jpg uploaded to sagemaker-us-east-1-015469603702/multimodal/img/CMP301_TrainDeploy_E1_20230607_SPEdited_image_1.jpg.
[2024-01-06 04:51:49,229] p22586 {utils.py:20} INFO - File img/CMP301_TrainDeploy_E1_20230607_SPEdited_image_2.jpg uploaded to sagemaker-us-east-1-015469603702/multimodal/img/CMP301_TrainDeploy_E1_20230607_SPEdited_image_2.jpg.
[2024-01-06 04:51:49,297] p22586 {utils.py:20} INFO - File img/CMP301_TrainDeploy_E1_20230607_SPEdited_image_3.jpg uploaded to sagemaker-us-east-1-015469603702/multimodal/img/CMP301_TrainDeploy_E1_20230607_SPEdited_image_3.jpg.
[2024-01-06 04:51:49,361] p22586 {utils.py:20} INFO - File img/CMP301_TrainDeploy_E1_20230607_SPEdited_image_4.jpg uploaded to sagemaker-us-east-1-015469603702/multimodal/img/CMP301_TrainDeploy_E1_20230607_SPEdited_image_4.jpg.
[2024-01-06 04:51:49,423] p22586 {utils.py:20} INFO - File img/CMP301_TrainDeploy_E1_202