# 2. VGG16 Image Embeddings

_created by Austin Poor_

In this notebook, I use a pretrained VGG-16 model to create image embeddings for each of the film stills.

The notebook [1.format-images.ipynb](./1.format-images.ipynb) has placed uniform images in an S3 bucket. This notebook pulls those images down, processes them, and then uploads the results (as individual parquet files) to another S3 bucket.

In [9]:
!pip install -q --upgrade pip
!pip install -q -r requirements.txt



In [10]:
import datetime as dt
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor

import boto3
import numpy as np
from PIL import Image
from tqdm import tqdm

import pyarrow as pa
import pyarrow.parquet as pq

import tensorflow as tf
from tensorflow.keras.applications.vgg16 import preprocess_input

In [11]:
# Local scratch directory for downloaded images and generated parquet files
tmp_dir = Path("./tmp")
tmp_dir.mkdir(exist_ok=True)
# Remove any regular files already sitting in the scratch directory
for stale_file in tmp_dir.glob("*"):
    if stale_file.is_file():
        stale_file.unlink()

In [12]:
# S3 bucket that the images are listed in and downloaded from
SOURCE_BUCKET = "apoor-clean-movie-stills"
# S3 bucket that the parquet embedding files are uploaded to
DEST_BUCKET = "apoor-vgg-movie-vecs"

# S3 client shared by the listing, download and upload helpers in this notebook
s3 = boto3.client("s3")

In [13]:
# Number of image keys requested per S3 listing call, and therefore the
# number of images handled per iteration of the main loop
batch_size = 1_000 # Max of 1,000 per S3

In [14]:
# Shape of a single input image passed to the model
input_shape = (500, 500, 3)

# VGG16 with ImageNet weights, built without its top layers
# (`include_top=False`) and sized for inputs of `input_shape`
vgg16 = tf.keras.applications.VGG16(
    include_top=False,
    weights='imagenet',
    input_shape=input_shape
)
# Print the layer-by-layer summary of the model
vgg16.summary()

Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 500, 500, 3)]     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 500, 500, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 500, 500, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 250, 250, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 250, 250, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 250, 250, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 125, 125, 128)     0     

In [15]:
def iter_keys(bucket: str, batch_size: int = 1_000) -> [str]:
    """Iterates through keys in an S3 `bucket`
    in groups of `batch_size`

    Pages through the bucket with `list_objects_v2`, starting
    each request after the last key of the previous page.

    :param bucket: Name of the S3 bucket to search
    :param batch_size: The max number of keys to return at a time.
        Note: S3 will return a maximum of 1,000 keys at a time.
    :yields: A list of keys to files in the S3 bucket, `bucket`.
        Nothing is yielded for a listing that contains no keys
        (for example, an empty bucket).
    """
    last_key = ""
    while True:
        resp = s3.list_objects_v2(
            Bucket=bucket,
            MaxKeys=batch_size,
            StartAfter=last_key
        )
        # "Contents" is left out of the response when the listing is empty
        keys = [c["Key"] for c in resp.get("Contents", [])]
        if keys:
            yield keys
        # Stop on an empty page or once S3 reports there is nothing left
        if not keys or not resp.get("IsTruncated"):
            break
        last_key = keys[-1]
            
            
def download_object(bucket: str, key: str, tmp_dir: Path) -> Path:
    """Downloads a single file from an S3 bucket
    and stores it in a temporary directory

    The object is saved at `tmp_dir / key` using the client's
    `download_file` method. Missing parent directories are
    created first, so a key that contains `/` can be saved too.

    :param bucket: The bucket to search in S3
    :param key: The object's key in `bucket`
    :param tmp_dir: The temporary directory to save the
        downloaded file.
    :returns: A path to the downloaded file in `tmp_dir`
    """
    filename = Path(tmp_dir) / key
    # exist_ok avoids a race between download threads creating the same folder
    filename.parent.mkdir(parents=True, exist_ok=True)
    s3.download_file(bucket, key, str(filename))
    return filename
    
    
def batch_download(bucket: str, keys: [str], tmp_dir: Path) -> [Path]:
    """Fetches every key in `keys` from `bucket` into `tmp_dir`.

    Each key is handed to `download_object`, and the calls are
    spread over the worker threads of a
    `concurrent.futures.ThreadPoolExecutor`.

    :param bucket: S3 bucket where objects are stored
    :param keys: List of object keys stored in `bucket`
    :param tmp_dir: Local directory to save downloaded images
    :returns: List of paths for the locally stored objects,
        in the same order as `keys`.
    """
    with ThreadPoolExecutor() as pool:
        local_paths = pool.map(
            lambda key: download_object(bucket, key, tmp_dir),
            keys
        )
        return list(local_paths)

    
def clean_tmp_files(paths: [Path]):
    """Deletes every file listed in `paths`.

    Each entry is wrapped in `pathlib.Path` and removed
    with `Path.unlink`.

    :param paths: List of paths to files that should be deleted.
    """
    for tmp_file in map(Path, paths):
        tmp_file.unlink()
    
    
def load_image(path: Path) -> np.ndarray:
    """Loads a JPEG image at `path` as an array of pixels.

    Uses `tf.io.read_file` and `tf.image.decode_jpeg`. The image
    is always decoded to 3 color channels, so a grayscale JPEG
    still matches the 3-channel input the VGG16 model is built for.

    :param path: Path to an image file
    :returns: The decoded image with dimensions
        (img_height,img_width,3), as returned by
        `tf.image.decode_jpeg`. `load_images` passes it through
        numpy functions to build an ndarray batch.
    """
    return tf.image.decode_jpeg(tf.io.read_file(str(path)), channels=3)

    
def load_images(paths: [Path]) -> np.ndarray:
    """Reads every image in `paths` and joins them into one batch.

    Each image is loaded with `load_image`, given a new leading
    axis, and the results are concatenated along that axis.

    :param paths: List of paths to image files being loaded
    :returns: A single numpy ndarray whose first dimension
        indexes the images and whose remaining dimensions are
        those of the arrays returned by `load_image`.
    """
    batch = []
    for image_path in paths:
        pixels = load_image(image_path)
        batch.append(np.expand_dims(pixels, 0))
    return np.concatenate(batch, 0)


def format_input(data: np.ndarray) -> np.ndarray:
    """Prepares a batch of images for VGG16.

    Delegates to `preprocess_input` from
    `tensorflow.keras.applications.vgg16`.

    :param data: ndarray of images with shape: (n,w,h,c)
    :returns: The value returned by `preprocess_input` for `data`
    """
    prepared = preprocess_input(data)
    return prepared


def vgg_process(data: np.ndarray):
    """Runs a batch of image data through the VGG-16 model.

    :param data: ndarray of image data
    :returns: The output of `vgg16.predict` for `data`
    """
    embeddings = vgg16.predict(data)
    return embeddings


def format_output(data: np.ndarray) -> np.ndarray:
    """Reformats image embeddings for parquet
    storage.

    Keeps the first (image) axis and flattens every other axis
    into one. The flattened width is computed from the shape
    instead of being inferred with `-1`, because numpy cannot
    infer a dimension when the array holds zero images.

    :param data: ndarray of image embeddings (4-dimensional)
    :returns: (Mostly-)flattened ndarray with dimensions:
        (n_images, flattened_vgg_output)
    """
    n_images, *feature_dims = data.shape
    # The product of an empty list is 1, so 1-D input becomes (n_images, 1)
    n_features = int(np.prod(feature_dims))
    return data.reshape((n_images, n_features))


def make_arrow_table(row: np.ndarray, filename: Path) -> pa.Table:
    """Wraps `row` in a single-column arrow Table.

    The column is keyed by the stem of `filename`, i.e. the
    file name without its directories or extension.

    :param row: A flat numpy array
    :param filename: A path whose stem becomes the key
        in the arrow table
    :returns: An arrow table with `filename`'s stem as a key
        and `row` as the data
    """
    column_name = Path(filename).stem
    columns = {column_name: row}
    return pa.table(columns)


def write_parquet(row: np.ndarray, filename: Path) -> Path:
    """Stores `row` in a parquet file named after `filename`.

    `row` is first turned into an arrow table by
    `make_arrow_table`, keyed by the stem of `filename`.
    The table is then written next to `filename`, with the
    extension replaced by `.parquet`.

    For example, if `filename = "dir/test.jpg"` then
    the result will be a parquet file: `dir/test.parquet`
    which stores an arrow table with the key `test`.

    :param row: Data to be stored in a parquet file
    :param filename: Source data's filename
    :returns: Path to the newly created parquet file
    """
    parquet_path = filename.with_suffix(".parquet")
    pq.write_table(make_arrow_table(row, filename), parquet_path)
    return parquet_path


def write_parquets(data: np.ndarray, filenames: [Path]) -> [Path]:
    """Writes the rows in `data` to parquet files
    based on the paths in the list, `filenames`.

    The rows in `data` should correspond to the paths
    in `filenames`. The two lengths are checked up front,
    because pairing them with `zip` would otherwise silently
    drop the unmatched rows or filenames.

    See `write_parquet` for more details.

    :param data: a 2-D ndarray with data to be stored
        as parquet files.
    :param filenames: List of source `Paths` corresponding
        to the rows in `data`.
    :returns: A list of `Paths` to the newly created parquet files
    :raises ValueError: If `data` and `filenames` differ in length
    """
    if len(data) != len(filenames):
        raise ValueError(
            f"Got {len(data)} rows of data but {len(filenames)} filenames; "
            "each row needs exactly one filename."
        )
    return [write_parquet(r, f) for r, f in zip(data, filenames)]

def upload_parquet_file(bucket: str, filename: Path):
    """Sends one local parquet file to an S3 bucket.

    The object's key in S3 is just the file's name and
    extension; directory names are not included.

    For example, if `filename = "path/to/file.ext"`, then
    the object's key in S3 will be `file.ext`.

    :param bucket: S3 bucket to store the file
    :param filename: File to be stored in S3
    """
    local_path = str(filename)
    object_key = filename.name
    s3.upload_file(local_path, bucket, object_key)


def upload_parquet_files(bucket: str, filenames: [Path]):
    """Sends several parquet files to S3.

    Each file is passed to `upload_parquet_file`, with the calls
    spread over the worker threads of a
    `concurrent.futures.ThreadPoolExecutor`.

    :param bucket: S3 bucket to upload files
    :param filenames: List of `Path`s to parquets being uploaded
    """
    with ThreadPoolExecutor() as pool:
        uploads = pool.map(
            lambda parquet_path: upload_parquet_file(bucket, parquet_path),
            filenames
        )
        # Consume the results; iterating re-raises an exception from an upload
        for _ in uploads:
            pass

In [None]:
# Record when the run starts so the total duration can be printed at the end
start_time = dt.datetime.now()
print(f"START TIME: {start_time}")
print(f"Loading batches of {batch_size:,d} images.\n")

# Progress bar over the batches of keys listed from SOURCE_BUCKET.
# Its description is updated at each stage of the loop below.
bar = tqdm(
    enumerate(iter_keys(SOURCE_BUCKET, batch_size)),
    desc="Starting...",
    unit="batch",
    ncols=80
)

for i, image_keys in bar:
    loop_start = dt.datetime.now()
    # Download this batch of images from S3 into tmp_dir
    bar.set_description("Downloading images")
    image_paths = batch_download(SOURCE_BUCKET, image_keys, tmp_dir)

    # Read the downloaded files into one array and preprocess it for VGG16
    bar.set_description("Loading images")
    input_data = load_images(image_paths)
    input_data = format_input(input_data)

    # The pixel data is in memory now, so the downloaded files can be deleted
    bar.set_description("Removing tmp files")
    clean_tmp_files(image_paths)

    # Run the model and flatten each image's output into a single row
    bar.set_description("Embedding with VGG16")
    encoding = vgg_process(input_data)
    output_data = format_output(encoding)
    
    # Write one parquet file per image. image_paths is only used to derive
    # the parquet file names here; the image files themselves are already gone.
    bar.set_description("Saving to parquet")
    parquet_paths = write_parquets(output_data, image_paths)

    # Upload the parquet files to DEST_BUCKET
    bar.set_description("Uploading embeddings")
    upload_parquet_files(DEST_BUCKET, parquet_paths)

    # Delete the local parquet files once they have been uploaded
    bar.set_description("Removing tmp files")
    clean_tmp_files(parquet_paths)
    
    # Report how long this batch took
    bar.write(f"[{i:4,d}] COMPLETED IN {dt.datetime.now() - loop_start}")

print(f"\nFULL TIME TO COMPLETE: {dt.datetime.now() - start_time}")

Starting...: 0batches [00:00, ?batches/s]

START TIME: 2021-05-06 13:55:04.868376
Loading batches of 1,000 images.



Removing tmp files: : 1batches [01:21, 81.45s/batches]

[   0] COMPLETED IN 0:01:21.218090


Removing tmp files: : 2batches [02:39, 79.46s/batches]  

[   1] COMPLETED IN 0:01:17.800552


Removing tmp files: : 3batches [04:00, 80.35s/batches]  

[   2] COMPLETED IN 0:01:21.129284


Removing tmp files: : 4batches [05:23, 81.09s/batches]  

[   3] COMPLETED IN 0:01:21.974962


Removing tmp files: : 5batches [06:46, 81.90s/batches]  

[   4] COMPLETED IN 0:01:23.053065


Removing tmp files: : 6batches [08:09, 82.28s/batches]  

[   5] COMPLETED IN 0:01:22.750251


Removing tmp files: : 7batches [09:32, 82.42s/batches]  

[   6] COMPLETED IN 0:01:22.375641


Removing tmp files: : 8batches [11:03, 85.17s/batches]  

[   7] COMPLETED IN 0:01:30.803342


Removing tmp files: : 9batches [12:25, 84.39s/batches]  

[   8] COMPLETED IN 0:01:22.425338


Removing tmp files: : 10batches [13:48, 83.93s/batches] 

[   9] COMPLETED IN 0:01:22.664082


Removing tmp files: : 11batches [15:10, 83.36s/batches]  

[  10] COMPLETED IN 0:01:21.756764


Removing tmp files: : 12batches [16:34, 83.31s/batches]  

[  11] COMPLETED IN 0:01:22.936897


Removing tmp files: : 13batches [17:56, 82.97s/batches]  

[  12] COMPLETED IN 0:01:21.934721


Removing tmp files: : 14batches [19:18, 82.86s/batches]  

[  13] COMPLETED IN 0:01:22.373098


Removing tmp files: : 15batches [20:40, 82.51s/batches]  

[  14] COMPLETED IN 0:01:21.425813


Removing tmp files: : 16batches [22:01, 82.09s/batches]  

[  15] COMPLETED IN 0:01:20.905906


Removing tmp files: : 17batches [23:24, 82.22s/batches]  

[  16] COMPLETED IN 0:01:22.258253


Removing tmp files: : 18batches [24:46, 82.17s/batches]  

[  17] COMPLETED IN 0:01:21.810187


Removing tmp files: : 19batches [26:08, 82.22s/batches]  

[  18] COMPLETED IN 0:01:22.071658


Removing tmp files: : 20batches [27:30, 82.19s/batches]  

[  19] COMPLETED IN 0:01:21.896925


Removing tmp files: : 21batches [28:52, 82.13s/batches]  

[  20] COMPLETED IN 0:01:21.733811


Removing tmp files: : 22batches [30:17, 82.85s/batches]  

[  21] COMPLETED IN 0:01:24.200487


Removing tmp files: : 23batches [31:40, 82.83s/batches]  

[  22] COMPLETED IN 0:01:22.533489


Removing tmp files: : 24batches [33:04, 83.22s/batches]  

[  23] COMPLETED IN 0:01:23.890829


Removing tmp files: : 25batches [34:28, 83.41s/batches]  

[  24] COMPLETED IN 0:01:23.574825


Removing tmp files: : 26batches [35:52, 83.59s/batches]  

[  25] COMPLETED IN 0:01:23.765675


Removing tmp files: : 27batches [37:16, 83.86s/batches]  

[  26] COMPLETED IN 0:01:24.228661


Removing tmp files: : 28batches [38:40, 83.95s/batches]  

[  27] COMPLETED IN 0:01:23.923524


Removing tmp files: : 29batches [40:05, 84.08s/batches]  

[  28] COMPLETED IN 0:01:24.152013


Removing tmp files: : 30batches [41:28, 84.04s/batches]  

[  29] COMPLETED IN 0:01:23.702417


Removing tmp files: : 31batches [42:52, 84.00s/batches]  

[  30] COMPLETED IN 0:01:23.599866


Removing tmp files: : 32batches [44:16, 83.96s/batches]  

[  31] COMPLETED IN 0:01:23.588867


Removing tmp files: : 33batches [45:40, 83.81s/batches]  

[  32] COMPLETED IN 0:01:23.221220


Removing tmp files: : 34batches [47:04, 83.87s/batches]  

[  33] COMPLETED IN 0:01:23.756622


Removing tmp files: : 35batches [48:27, 83.81s/batches]  

[  34] COMPLETED IN 0:01:23.419515


Removing tmp files: : 36batches [49:50, 83.39s/batches]  

[  35] COMPLETED IN 0:01:22.175730


Removing tmp files: : 37batches [51:25, 86.80s/batches]  

[  36] COMPLETED IN 0:01:34.524845


Removing tmp files: : 38batches [52:49, 86.02s/batches]  

[  37] COMPLETED IN 0:01:23.763733


Removing tmp files: : 39batches [54:12, 85.30s/batches]  

[  38] COMPLETED IN 0:01:23.357362


Removing tmp files: : 40batches [55:36, 84.76s/batches]  

[  39] COMPLETED IN 0:01:23.233902


Removing tmp files: : 41batches [56:59, 84.39s/batches]  

[  40] COMPLETED IN 0:01:23.280997


Removing tmp files: : 42batches [58:23, 84.27s/batches]  

[  41] COMPLETED IN 0:01:23.746328


Removing tmp files: : 43batches [59:48, 84.24s/batches]  

[  42] COMPLETED IN 0:01:23.909210


Removing tmp files: : 44batches [1:01:12, 84.17s/batches]  

[  43] COMPLETED IN 0:01:23.745057


Removing tmp files: : 45batches [1:02:35, 83.99s/batches]  

[  44] COMPLETED IN 0:01:23.322576


Removing tmp files: : 46batches [1:03:59, 83.87s/batches]  

[  45] COMPLETED IN 0:01:23.350089


Removing tmp files: : 47batches [1:05:24, 84.26s/batches]  

[  46] COMPLETED IN 0:01:24.907940


Removing tmp files: : 48batches [1:06:49, 84.58s/batches]  

[  47] COMPLETED IN 0:01:25.073359


Removing tmp files: : 49batches [1:08:14, 84.58s/batches]  

[  48] COMPLETED IN 0:01:24.348637


Removing tmp files: : 50batches [1:09:38, 84.38s/batches]  

[  49] COMPLETED IN 0:01:23.634033


Removing tmp files: : 51batches [1:11:09, 86.32s/batches]  

[  50] COMPLETED IN 0:01:30.570138


Removing tmp files: : 52batches [1:12:33, 85.82s/batches]  

[  51] COMPLETED IN 0:01:24.423017


Removing tmp files: : 53batches [1:13:58, 85.48s/batches]  

[  52] COMPLETED IN 0:01:24.356898


Removing tmp files: : 54batches [1:15:22, 85.12s/batches]  

[  53] COMPLETED IN 0:01:23.897729


Removing tmp files: : 55batches [1:16:47, 85.06s/batches]  

[  54] COMPLETED IN 0:01:24.571772


Removing tmp files: : 56batches [1:18:12, 85.02s/batches]  

[  55] COMPLETED IN 0:01:24.582526


Removing tmp files: : 57batches [1:19:36, 84.69s/batches]  

[  56] COMPLETED IN 0:01:23.554338


Removing tmp files: : 58batches [1:21:01, 84.70s/batches]  

[  57] COMPLETED IN 0:01:24.465447


Removing tmp files: : 59batches [1:22:26, 84.74s/batches]  

[  58] COMPLETED IN 0:01:24.541477


Removing tmp files: : 60batches [1:23:50, 84.60s/batches]  

[  59] COMPLETED IN 0:01:24.047242


Removing tmp files: : 61batches [1:25:15, 84.79s/batches]  

[  60] COMPLETED IN 0:01:24.989291


Removing tmp files: : 62batches [1:26:40, 84.87s/batches]  

[  61] COMPLETED IN 0:01:24.782751


Removing tmp files: : 63batches [1:28:05, 84.78s/batches]  

[  62] COMPLETED IN 0:01:24.327999


Removing tmp files: : 64batches [1:29:29, 84.61s/batches]  

[  63] COMPLETED IN 0:01:23.936512


Removing tmp files: : 65batches [1:30:53, 84.38s/batches]  

[  64] COMPLETED IN 0:01:23.605218


Removing tmp files: : 66batches [1:32:27, 87.23s/batches]  

[  65] COMPLETED IN 0:01:33.621810


Removing tmp files: : 67batches [1:33:51, 86.35s/batches]  

[  66] COMPLETED IN 0:01:23.997787


Removing tmp files: : 68batches [1:35:15, 85.78s/batches]  

[  67] COMPLETED IN 0:01:24.202147


Removing tmp files: : 69batches [1:36:41, 85.64s/batches]  

[  68] COMPLETED IN 0:01:25.075296


Removing tmp files: : 70batches [1:38:05, 85.29s/batches]  

[  69] COMPLETED IN 0:01:24.253078


Removing tmp files: : 71batches [1:39:30, 85.31s/batches]  

[  70] COMPLETED IN 0:01:25.119931


Removing tmp files: : 72batches [1:40:55, 85.11s/batches]  

[  71] COMPLETED IN 0:01:24.393577


Removing tmp files: : 73batches [1:42:19, 84.76s/batches]  

[  72] COMPLETED IN 0:01:23.679943


Removing tmp files: : 74batches [1:43:43, 84.49s/batches]  

[  73] COMPLETED IN 0:01:23.633577


Removing tmp files: : 75batches [1:45:10, 85.32s/batches]  

[  74] COMPLETED IN 0:01:27.023883


Removing tmp files: : 76batches [1:46:35, 85.10s/batches]  

[  75] COMPLETED IN 0:01:24.331988


Removing tmp files: : 77batches [1:48:06, 87.01s/batches]  

[  76] COMPLETED IN 0:01:31.213544


Removing tmp files: : 78batches [1:49:30, 86.17s/batches]  

[  77] COMPLETED IN 0:01:23.968248


Removing tmp files: : 79batches [1:50:54, 85.52s/batches]  

[  78] COMPLETED IN 0:01:23.729976


Removing tmp files: : 80batches [1:52:19, 85.20s/batches]  

[  79] COMPLETED IN 0:01:24.195542


Removing tmp files: : 81batches [1:53:44, 85.06s/batches]  

[  80] COMPLETED IN 0:01:24.467760


Removing tmp files: : 82batches [1:55:08, 84.73s/batches]  

[  81] COMPLETED IN 0:01:23.728993


Removing tmp files: : 83batches [1:56:32, 84.74s/batches]  

[  82] COMPLETED IN 0:01:24.479631


Removing tmp files: : 84batches [1:58:04, 86.76s/batches]  

[  83] COMPLETED IN 0:01:31.234901


Removing tmp files: : 85batches [1:59:30, 86.68s/batches]  

[  84] COMPLETED IN 0:01:26.231493


Removing tmp files: : 86batches [2:00:55, 86.20s/batches]  

[  85] COMPLETED IN 0:01:24.832821


Embedding with VGG16: : 86batches [2:01:15, 86.20s/batches]