# Data preparation
***This notebook works best with the `conda_python3` kernel on an `ml.t3.medium` instance***.

In this notebook we download a publicly available slide deck and convert it into images, one image per slide. These images are then stored in Amazon S3, from where they can be made available to an Amazon SageMaker endpoint for inference.


In [None]:
# Install the packages listed in requirements.txt into the current kernel's environment.
!pip install -r requirements.txt

In [None]:
# Connectivity check: issue a HEAD request and fail this cell if the
# reported HTTP status is 400 or above.
import httplib2
h = httplib2.Http()
# request() returns a (response, content) pair; only the response part is inspected below.
resp = h.request("http://www.google.com", 'HEAD')
assert int(resp[0]['status']) < 400

In [None]:
import os
import json
import glob
import boto3
import base64
import logging
import sagemaker
from PIL import Image
import requests as req
from typing import List
from pathlib import Path
import pypdfium2 as pdfium

# Log at INFO level; each record shows timestamp, process id, file name:line number, level and message.
logging.basicConfig(format='[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s', level=logging.INFO)
# Module-level logger used by the cells below.
logger = logging.getLogger(__name__)

In [None]:
# Execute globals.py inside this notebook's namespace (-i) so the names it defines are usable below.
# NOTE(review): IMAGE_DIR, SLIDE_DECK, BUCKET_IMG_PREFIX and upload_to_s3 are used by later cells
# but are not defined in the visible cells — presumably they come from globals.py; confirm.
%run -i globals.py

In [None]:
def get_images(file:str, image_dir:str = IMAGE_DIR) -> List[str]:
    """
    Render every page of a PDF to a JPEG file and return the saved file paths.

    Images are written to ``<parent directory of file>/<image_dir>`` (created
    if it does not exist) and are named ``<file stem>_image_<page>.jpg``,
    where ``<page>`` starts at 1.

    :param file: Path to the PDF file
    :param image_dir: Directory for the images, joined onto the PDF's parent directory
    :return: A list of paths to the saved JPEG images, in page order
    """

    # Derive the output location from the input path
    source = Path(file)
    file_name = source.stem  # file name without extension
    img_dir = os.path.join(source.parent, image_dir)

    # Open the presentation; the handle is released in the finally block
    # below even if rendering or saving a page fails.
    pdf = pdfium.PdfDocument(file)
    try:
        n_pages = len(pdf)
        os.makedirs(img_dir, exist_ok=True)

        image_paths = []
        print(f"Extracting {n_pages} images for {file}")
        for page_number in range(n_pages):
            page = pdf.get_page(page_number)
            try:
                bitmap = page.render(scale=1, rotation=0, crop=(0, 0, 0, 0))
                pil_image = bitmap.to_pil()

                # Page numbers in the file name are 1-based
                image_path = os.path.join(img_dir, f"{file_name}_image_{page_number + 1}.jpg")
                pil_image.save(image_path, format="JPEG")
            finally:
                # Close each page before the document that owns it is closed
                page.close()
            image_paths.append(image_path)
    finally:
        pdf.close()

    return image_paths

In [None]:
# Download the slide deck and write it to the current working directory,
# using the last path component of the URL as the local file name.
url: str = SLIDE_DECK
local_file: str = os.path.basename(SLIDE_DECK)
r = req.get(url, allow_redirects=True)
if r.status_code == 200:
    logger.info(f"{url} downloaded successfully")
    with open(local_file, "wb") as f:
        f.write(r.content)
    logger.info(f"{url} written to {local_file}")
else:
    # Nothing is written in this case, so report it instead of continuing silently.
    logger.error(f"failed to download {url}, HTTP status code={r.status_code}")

In [None]:
# Convert each downloaded file into per-page images and gather the saved image paths.
file_names = [local_file]
images: List = []
for fi in file_names:
    images += get_images(fi)


In [None]:
# Pass every saved image path to upload_to_s3 together with BUCKET_IMG_PREFIX.
# NOTE(review): upload_to_s3 and BUCKET_IMG_PREFIX are not defined in the visible cells —
# presumably they are provided by globals.py; confirm.
for img_path in images:
    upload_to_s3(img_path, BUCKET_IMG_PREFIX)
    # TODO: delete local images after uploading to s3 (no cleanup is performed here yet)
    