In [1]:
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

import boto3

from PIL import Image

Connect to `S3`

In [2]:
s3 = boto3.resource("s3")
s3

s3.ServiceResource()

In [3]:
# Read the scraped film metadata. The encoding is pinned to UTF-8 (the JSON
# interchange encoding) so the result does not depend on the platform default.
with open("data/s3-data.json", encoding="utf-8") as f:
    data = json.load(f)
len(data)

2143

Move the stills info into a DataFrame

In [4]:
# Flatten the per-film records into one row per still. `sid` is the still's
# position inside its film's "stills" list, rendered as a zero-padded string.
movie_stills = pd.DataFrame([
    {
        "mid": film["mid"],
        "title": film["title"],
        "url": film["url"],
        "sid": f"{position:02d}",
        "s3_url": s3_key,
    }
    for film in data
    for position, s3_key in enumerate(film["stills"])
])
movie_stills.head()

Unnamed: 0,mid,title,url,sid,s3_url
0,0,10 Cloverfield Lane,https://film-grab.com/2017/03/24/10-cloverfiel...,0,processed/0000_10CloverfieldLane_00000.png
1,0,10 Cloverfield Lane,https://film-grab.com/2017/03/24/10-cloverfiel...,1,processed/0000_10CloverfieldLane_00001.png
2,0,10 Cloverfield Lane,https://film-grab.com/2017/03/24/10-cloverfiel...,2,processed/0000_10CloverfieldLane_00002.png
3,0,10 Cloverfield Lane,https://film-grab.com/2017/03/24/10-cloverfiel...,3,processed/0000_10CloverfieldLane_00003.png
4,0,10 Cloverfield Lane,https://film-grab.com/2017/03/24/10-cloverfiel...,4,processed/0000_10CloverfieldLane_00004.png


In [5]:
print(f"There are {len(movie_stills):,d} movie stills and {len(movie_stills['title'].unique()):,d} movies.")

There are 132,617 movie stills and 2,127 movies.


Use PyTorch to create an embedding for each image

In [6]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torchvision
import torchvision.models as models

In [7]:
vgg16 = models.vgg16(pretrained=True)
vgg16

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

Keep only the first four layers of the classifier so the model outputs a 4096-dimensional embedding instead of class scores

In [8]:
# Keep only the first four classifier layers so the forward pass returns the
# 4096-d activations rather than class scores.
vgg16.classifier = nn.Sequential(*[vgg16.classifier[i] for i in range(4)])
# Switch to inference mode. A freshly constructed module is in training mode,
# and torchvision's VGG classifier has a Dropout layer among the kept layers;
# left active it randomly zeroes activations, so the same image would produce
# a different embedding on every call.
# `eval()` returns the module itself, so the cell still displays the model.
vgg16.eval()

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

In [9]:
import io

In [10]:
stills_bucket = s3.Bucket("film-stills")
stills_bucket

s3.Bucket(name='film-stills')

In [11]:
def load_image(url=None,i=None):
    """Download one still from S3 and return it as a batched image tensor.

    Parameters
    ----------
    url : str, optional
        Object key inside ``stills_bucket``. Takes precedence over ``i``.
    i : optional
        Row label in ``movie_stills``; used to look up the ``"s3_url"`` key
        when ``url`` is not given.

    Returns
    -------
    torch.Tensor
        ``float32`` tensor of shape ``(1, 3, 224, W)`` with values in
        ``[0, 1]``, where ``W`` is the image's original width.

    Raises
    ------
    AssertionError
        If neither ``url`` nor ``i`` is provided.
    """
    assert url is not None or i is not None
    if url is None:
        # Pick the correct s3 key
        url = movie_stills.loc[i,"s3_url"]
    # Load image into a BytesIO obj
    f = io.BytesIO()
    stills_bucket.download_fileobj(url,f)
    f.seek(0)
    # Load into a PIL Image and force 3-channel RGB. Without this, an RGBA,
    # greyscale or palette PNG yields an array with 4 channels (or none at
    # all), which breaks the permute below or the model's first convolution.
    img = Image.open(f).convert("RGB")
    # Pad (or crop) symmetrically along the vertical axis to a height of 224.
    # Only the height is adjusted -- assumes stored stills are already 224 px
    # wide; TODO confirm.
    w, h = img.size
    pad = 224 - h
    img = img.crop((0,-pad/2,w,h+pad/2))
    # HWC uint8 -> 1xHxWxC float32 in [0, 1].
    # NOTE(review): no ImageNet mean/std normalisation is applied here --
    # confirm whether the pretrained model is expected to receive it.
    aimg = np.expand_dims(np.array(img),0).astype("float32") / 255
    t = torch.from_numpy(aimg)
    # NHWC -> NCHW, the layout the convolutional model consumes.
    return t.permute(0, 3, 1, 2)

In [12]:
class MovieStills(Dataset):
    """Map-style dataset that yields one preprocessed still per index.

    NOTE(review): ``__getitem__`` delegates to ``load_image(i=...)``, which
    resolves the key through the module-level ``movie_stills`` frame rather
    than through ``self.data``; ``self.data`` only determines the length.
    Confirm that this is intended.
    """

    def __init__(self,data):
        # Container whose length defines the dataset size.
        self.data = data

    def __len__(self):
        """Return the number of items in the wrapped container."""
        n_items = len(self.data)
        return n_items

    def __getitem__(self,i):
        """Download and preprocess the still identified by row label ``i``."""
        still = load_image(i=i)
        return still

In [13]:
r = vgg16(load_image(i=5))
r.shape, r.dtype

(torch.Size([1, 4096]), torch.float32)

Moving the data to AWS Postgres

In [14]:
import psycopg2, dotenv, os

dotenv.load_dotenv()

DB_URL  = os.environ.get("AWS_DB_URL")
DB_PORT = os.environ.get("AWS_DB_PORT")
DB_USER = os.environ.get("AWS_DB_USER")
DB_PASS = os.environ.get("AWS_DB_PWD")

In [15]:
# Pass the connection parameters as keywords instead of interpolating them
# into a DSN string: a value containing spaces, quotes or backslashes (most
# likely the password) would otherwise be mis-parsed by libpq.
db = psycopg2.connect(
    host=DB_URL,
    port=DB_PORT,
    dbname="film-stills",
    user=DB_USER,
    password=DB_PASS,
)
cursor = db.cursor()
cursor

<cursor object at 0x7f8970fec9f8; closed: 0>

In [16]:
data[0].keys()

dict_keys(['mid', 'stills', 'title', 'url', 'attrs'])

In [17]:
# Films: one row per film, keyed by `mid`.
cursor.execute("""
CREATE TABLE IF NOT EXISTS Films (
    "mid" INT,
    "url" TEXT,
    "title" TEXT,
    PRIMARY KEY (mid)
);
""")
# FilmAttrs: free-form (attribute, value) pairs attached to a film.
cursor.execute("""CREATE TABLE IF NOT EXISTS FilmAttrs (
    "mid" INT,
    "attr" TEXT,
    "value" TEXT,
    PRIMARY KEY (mid, attr),
    CONSTRAINT fk_mid
        FOREIGN KEY(mid)
            REFERENCES Films(mid)
);
""")
# FilmStills: one row per still, keyed by film id and still index.
cursor.execute("""
CREATE TABLE IF NOT EXISTS FilmStills (
    "mid" INT,
    "sid" INT,
    "path" TEXT,
    PRIMARY KEY (mid,sid),
    CONSTRAINT fk_mid
        FOREIGN KEY (mid)
            REFERENCES Films(mid)
);
""")
# EmbeddingsVGG16: one row per embedding component (`eid`) of each still.
# The primary key must name the declared column `eid`; the previous
# `embed_index` does not exist in the table, so the statement was rejected.
cursor.execute("""
CREATE TABLE IF NOT EXISTS EmbeddingsVGG16 (
    "mid" INT,
    "sid" INT,
    "eid" INT,
    "value" REAL,
    PRIMARY KEY (mid,sid,eid),
    CONSTRAINT fk_mid_sid
        FOREIGN KEY (mid, sid)
            REFERENCES FilmStills(mid, sid)
);
""")

In [18]:
db.commit()

Now let's add `data` to the database

In [19]:
data[0]["attrs"]

{'director': 'Dan Trachtenberg',
 'director of photography': 'Jeff Cutter',
 'production design': 'Ramsey Avery',
 'costume design': 'Meagan McLaughlin',
 'year': '2016',
 'title': '10 Cloverfield Lane'}

In [20]:
from IPython.display import ProgressBar

In [21]:
def progress(iterable,length=None):
    """Yield the items of ``iterable`` while advancing an IPython progress bar.

    Parameters
    ----------
    iterable
        Items to pass through unchanged.
    length : int, optional
        Total used for the bar. When omitted, ``len(iterable)`` is used, so
        it must be supplied for iterables without a length (e.g. generators).

    Yields
    ------
    Each item of ``iterable``, in order.
    """
    total = len(iterable) if length is None else length
    bar = ProgressBar(total)
    bar.display()
    for item in iterable:
        # The bar is advanced before the item is handed to the consumer.
        bar.progress += 1
        yield item

In [None]:
for d in progress(data):
    mid = int(d["mid"])
    # One transaction per film: `with db` commits when the block succeeds and
    # rolls back if any insert raises. A film is therefore never stored
    # without its attributes/stills, and the connection is not left in an
    # aborted transaction after a failure.
    with db:
        # Add the film to the main table
        cursor.execute(
            "INSERT INTO Films (mid, url, title) VALUES (%s,%s,%s);",
            (mid, d["url"], d["title"]))
        # Add each of the movie attributes
        cursor.executemany(
            "INSERT INTO FilmAttrs (mid, attr, value) VALUES (%s, %s, %s);",
            [(mid, k, v) for k, v in d["attrs"].items()])
        # Add each of the stills to the stills table; `sid` is the still's
        # position in the film's list.
        cursor.executemany(
            "INSERT INTO FilmStills (mid, sid, path) VALUES (%s, %s, %s);",
            [(mid, sid, s) for sid, s in enumerate(d["stills"])])

***

In [None]:
# Outer bar counts films; inner bar is reset for each film and counts stills.
p1 = ProgressBar(len(data))
p1.display()
p2 = ProgressBar(1)
p2.display()

for d in data:
    # Key the embeddings by the film's own id, the same value used when the
    # Films/FilmStills rows were inserted, so the (mid, sid) foreign key
    # always resolves.
    mid = int(d["mid"])
    p2.total = len(d["stills"])
    p2.progress = 0
    for sid, url in enumerate(d["stills"]):
        # Inference only: skip autograd bookkeeping for the forward pass.
        with torch.no_grad():
            embedding = vgg16(load_image(url))[0].tolist()
        # One transaction per still: committed on success, rolled back if an
        # insert raises.
        with db:
            cursor.executemany(
                "INSERT INTO EmbeddingsVGG16 (mid,sid,eid,value) VALUES (%s,%s,%s,%s);",
                [(mid, sid, eid, e) for eid, e in enumerate(embedding)]
            )
        p2.progress += 1
    p1.progress += 1

In [None]:
db.rollback()

---

In [22]:
def load_images(urls=None,is_=None):
    """Download several stills from S3 and return them as one image batch.

    Parameters
    ----------
    urls : iterable of str, optional
        Object keys inside ``stills_bucket``. Takes precedence over ``is_``.
    is_ : optional
        Row labels in ``movie_stills``; used to look up the ``"s3_url"`` keys
        when ``urls`` is not given.

    Returns
    -------
    torch.Tensor
        ``float32`` tensor of shape ``(N, 3, 224, W)`` with values in
        ``[0, 1]``. Stacking requires every image to have the same width.

    Raises
    ------
    AssertionError
        If neither ``urls`` nor ``is_`` is provided.
    """
    assert urls is not None or is_ is not None
    if urls is None:
        # Pick the correct s3 key
        urls = movie_stills.loc[is_,"s3_url"]
    d = []
    for url in urls:
        # Load image into a BytesIO obj
        f = io.BytesIO()
        stills_bucket.download_fileobj(url,f)
        f.seek(0)
        # Load into a PIL Image and force 3-channel RGB so every array has
        # the same channel count; mixed modes (RGBA, greyscale, palette)
        # would otherwise prevent stacking into a single batch.
        img = Image.open(f).convert("RGB")
        # Pad (or crop) symmetrically along the vertical axis to a height of
        # 224. Only the height is adjusted -- assumes stored stills are
        # already 224 px wide; TODO confirm.
        w, h = img.size
        pad = 224 - h
        img = img.crop((0,-pad/2,w,h+pad/2))
        d.append(np.array(img))
    # NHWC uint8 -> float32 in [0, 1].
    aimg = np.array(d).astype("float32") / 255
    t = torch.from_numpy(aimg)
    # NHWC -> NCHW, the layout the convolutional model consumes.
    return t.permute(0, 3, 1, 2)

In [23]:
movie_stills.s3_url.head().values

array(['processed/0000_10CloverfieldLane_00000.png',
       'processed/0000_10CloverfieldLane_00001.png',
       'processed/0000_10CloverfieldLane_00002.png',
       'processed/0000_10CloverfieldLane_00003.png',
       'processed/0000_10CloverfieldLane_00004.png'], dtype=object)

In [25]:
def get_urls(mid):
    """Return the list of still keys for the film at position ``mid`` in ``data``."""
    film = data[mid]
    return film["stills"]

In [None]:
def process_movie(d):
    """Embed every still of one film and store the vectors in Postgres.

    Parameters
    ----------
    d : tuple
        ``(mid, urls)``: the film id and the S3 keys of its stills. The
        position of a key in ``urls`` is used as the still id (``sid``).

    Side effects
    ------------
    Inserts one ``EmbeddingsVGG16`` row per embedding component and commits
    after each still, using the module-level ``db`` connection.
    """
    mid, urls = d
    images = load_images(urls)
    # Inference only: skip autograd bookkeeping. Convert to nested Python
    # lists of floats because psycopg2 cannot adapt torch.Tensor values as
    # query parameters.
    with torch.no_grad():
        embeddings = vgg16(images).tolist()
    # The cursor context manager closes the cursor when the film is done.
    with db.cursor() as cursor:
        for sid, still in enumerate(embeddings):
            cursor.executemany(
                """INSERT INTO EmbeddingsVGG16 (
                    mid,sid,eid,value
                ) VALUES (
                    %s,%s,%s,%s
                );""",
                [(mid,sid,eid,e) for eid, e in enumerate(still)]
            )
            db.commit()

In [None]:
# Embed the films one by one. Each film is keyed by its own "mid", the same
# value used when the Films/FilmStills rows were inserted, so the embeddings'
# (mid, sid) foreign key always resolves. The previously built `mid_urls`
# generator was never consumed in this cell and has been dropped.
for movie in progress(data):
    process_movie((int(movie["mid"]), movie["stills"]))

In [None]:
import dill as pickle
from pathos.multiprocessing import ProcessingPool as Pool
from toolz.sandbox.parallel import fold
from functools import reduce

In [None]:
# Lazily pair each film's position in `data` with a copy of its still keys.
mid_urls = enumerate(m["stills"][:] for m in data)
# Fan the films out over a pathos process pool.
# NOTE(review): process_movie uses the module-level `db` connection and
# `vgg16` model. psycopg2 connections are presumably not safe to share across
# worker processes -- confirm whether each worker should open its own.
with Pool() as P:
    P.map(process_movie,mid_urls)

In [None]:
db.rollback()