In [1]:
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import boto3

In [7]:
# Load the two JSON inputs. The encoding is pinned to UTF-8 so that the
# non-ASCII text in these files (e.g. accented titles) decodes the same way
# on every platform instead of depending on the locale's default encoding.
with open("data/bucket-objects.json", encoding="utf-8") as f:
    data = json.load(f)

with open("data/file-info.json", encoding="utf-8") as f:
    file_info = json.load(f)

In [9]:
# Peek at the keys of the first bucket-objects record.
data[0].keys()

dict_keys(['mid', 'stills'])

In [11]:
# Peek at the keys of the first file-info record.
file_info[0].keys()

dict_keys(['title', 'url', 'attrs', 'img-links'])

In [5]:
# S3 service resource (no explicit session or region arguments are passed).
# NOTE(review): no later cell visible here uses `s3` — confirm it is needed.
s3 = boto3.resource("s3")

s3.ServiceResource()

In [13]:
# Handle to the DynamoDB table named "film-stills"; the bare `table` on the
# last line only displays its repr.
dynamodb = boto3.resource("dynamodb")
table = dynamodb.Table("film-stills")
table

dynamodb.Table(name='film-stills')

In [26]:
# Enrich each bucket record with the title, url and attrs of the file_info
# entry at the same list position. Attribute names are lower-cased so that
# differently-capitalised spellings end up under one key.
# NOTE(review): the pairing is positional (list index), not by d["mid"] —
# confirm both lists are in the same order.
merged = []
for idx, record in enumerate(data):
    info = file_info[idx]
    title = info["title"]
    url = info["url"]
    lowered = {}
    for key, value in info["attrs"].items():
        lowered[key.lower()] = value
    merged.append({**record, "title": title, "url": url, "attrs": lowered})
data = merged

In [27]:
# Display the first record to check the added title/url/attrs fields.
data[0]

{'mid': 0,
 'stills': ['stills/0000_10CloverfieldLane_00000.jpg',
  'stills/0000_10CloverfieldLane_00001.jpg',
  'stills/0000_10CloverfieldLane_00002.jpg',
  'stills/0000_10CloverfieldLane_00003.jpg',
  'stills/0000_10CloverfieldLane_00004.jpg',
  'stills/0000_10CloverfieldLane_00005.jpg',
  'stills/0000_10CloverfieldLane_00006.jpg',
  'stills/0000_10CloverfieldLane_00007.jpg',
  'stills/0000_10CloverfieldLane_00008.jpg',
  'stills/0000_10CloverfieldLane_00009.jpg',
  'stills/0000_10CloverfieldLane_00010.jpg',
  'stills/0000_10CloverfieldLane_00011.jpg',
  'stills/0000_10CloverfieldLane_00012.jpg',
  'stills/0000_10CloverfieldLane_00013.jpg',
  'stills/0000_10CloverfieldLane_00014.jpg',
  'stills/0000_10CloverfieldLane_00015.jpg',
  'stills/0000_10CloverfieldLane_00016.jpg',
  'stills/0000_10CloverfieldLane_00017.jpg',
  'stills/0000_10CloverfieldLane_00018.jpg',
  'stills/0000_10CloverfieldLane_00019.jpg',
  'stills/0000_10CloverfieldLane_00020.jpg',
  'stills/0000_10CloverfieldLane_0

In [41]:
import itertools

In [46]:
def get_all_attrs(records=None):
    """Return the set of attribute names used by any record.

    Args:
        records: Iterable of dicts that each carry an ``"attrs"`` mapping.
            Defaults to the notebook-level ``data`` list, which keeps the
            original zero-argument call working.

    Returns:
        A ``set`` holding the union of the keys of every record's
        ``"attrs"``; empty when there are no records.
    """
    if records is None:
        records = data
    # Only `import itertools` is executed in this notebook, so the bare name
    # `chain` raised NameError on a fresh kernel; use the qualified name.
    return set(itertools.chain.from_iterable(r["attrs"] for r in records))

In [49]:
# List every distinct attribute name to see which ones need cleaning up.
get_all_attrs()

{'#bwg_container1_0 #bwg_container2_0 .bwg_masonry_thumb_0 {        text-align',
 'animation sequence director',
 'costume',
 'costume design',
 'costume designer',
 'dir',
 'directed by',
 'directir of photography',
 'director',
 'director of photography',
 'director of production',
 'directors',
 'directors of photography',
 'dop',
 'dorector of photography',
 'note',
 'product design',
 'production design',
 'production designer',
 'purchase 2001',
 'purchase ali',
 'purchase alien',
 'purchase avengers',
 'purchase bad lieutenant',
 'purchase batman v superman',
 'purchase captain america',
 'purchase dominion',
 'purchase dracula',
 'purchase first name',
 'purchase freddy’s dead',
 'purchase ga-ga',
 'purchase ghost dog',
 'purchase hellboy ii',
 'purchase history of the world',
 'purchase kong',
 'purchase lady snowblood',
 'purchase lone wolf and cub',
 'purchase léon',
 'purchase mad max',
 'purchase mishima',
 'purchase mission',
 'purchase mission impossible',
 'purchase par

In [52]:
# Drop every attribute whose name contains "purchase".
# The matching keys are collected first because a dict cannot be
# resized while it is being iterated.
for record in data:
    attrs = record["attrs"]
    doomed = [key for key in attrs if "purchase" in key]
    for key in doomed:
        del attrs[key]

get_all_attrs()

{'#bwg_container1_0 #bwg_container2_0 .bwg_masonry_thumb_0 {        text-align',
 'animation sequence director',
 'costume',
 'costume design',
 'costume designer',
 'dir',
 'directed by',
 'directir of photography',
 'director',
 'director of photography',
 'director of production',
 'directors',
 'directors of photography',
 'dop',
 'dorector of photography',
 'note',
 'product design',
 'production design',
 'production designer',
 'title',
 'year'}

In [53]:
# Remove the one stray key that looks like a CSS rule fragment rather than
# a film attribute.
css_junk_key = "#bwg_container1_0 #bwg_container2_0 .bwg_masonry_thumb_0 {        text-align"
for record in data:
    record["attrs"].pop(css_junk_key, None)

get_all_attrs()

{'animation sequence director',
 'costume',
 'costume design',
 'costume designer',
 'dir',
 'directed by',
 'directir of photography',
 'director',
 'director of photography',
 'director of production',
 'directors',
 'directors of photography',
 'dop',
 'dorector of photography',
 'note',
 'product design',
 'production design',
 'production designer',
 'title',
 'year'}

In [56]:
def rename(attrs, old, new):
    """Move the value stored under ``old`` to the key ``new``, in place.

    Args:
        attrs: Mutable mapping to modify.
        old: Key to rename; if it is absent nothing happens.
        new: Replacement key. A value already stored under ``new`` is
            overwritten.

    Returns:
        The same ``attrs`` object that was passed in.
    """
    if old in attrs:
        # pop-then-assign keeps the value when old == new; the previous
        # assign-then-delete order removed the entry in that case.
        attrs[new] = attrs.pop(old)
    return attrs

In [68]:
# Maps each variant or misspelled attribute name (key) to the canonical name
# (value) it should be stored under.
# Entry order matters when one record holds several variants of the same
# name: renames are applied in this order and each one overwrites whatever
# is already stored under the canonical key.
to_rename = {
    'costume': 'costume design',
    'costume designer': 'costume design',
    'dir': 'director',
    'directed by': 'director',
    'directir of photography': 'director of photography',
    'directors': 'director',
    'directors of photography': 'director of photography',
    'dop': 'director of photography',
    'dorector of photography': 'director of photography',
    'product design': 'production design',
    'production designer': 'production design',
}

In [70]:
# Canonicalise the attribute names on every record, then re-list the
# distinct names to confirm that only canonical ones remain.
for record in data:
    record_attrs = record["attrs"]
    for variant, canonical in to_rename.items():
        rename(record_attrs, variant, canonical)

get_all_attrs()

{'animation sequence director',
 'costume design',
 'director',
 'director of photography',
 'director of production',
 'note',
 'production design',
 'title',
 'year'}

In [76]:
# Discard the "note" attribute wherever it is present.
for record in data:
    record["attrs"].pop("note", None)

get_all_attrs()

{'animation sequence director',
 'costume design',
 'director',
 'director of photography',
 'director of production',
 'production design',
 'title',
 'year'}

In [77]:
# Display the first record again after the attribute clean-up.
data[0]

{'mid': 0,
 'stills': ['stills/0000_10CloverfieldLane_00000.jpg',
  'stills/0000_10CloverfieldLane_00001.jpg',
  'stills/0000_10CloverfieldLane_00002.jpg',
  'stills/0000_10CloverfieldLane_00003.jpg',
  'stills/0000_10CloverfieldLane_00004.jpg',
  'stills/0000_10CloverfieldLane_00005.jpg',
  'stills/0000_10CloverfieldLane_00006.jpg',
  'stills/0000_10CloverfieldLane_00007.jpg',
  'stills/0000_10CloverfieldLane_00008.jpg',
  'stills/0000_10CloverfieldLane_00009.jpg',
  'stills/0000_10CloverfieldLane_00010.jpg',
  'stills/0000_10CloverfieldLane_00011.jpg',
  'stills/0000_10CloverfieldLane_00012.jpg',
  'stills/0000_10CloverfieldLane_00013.jpg',
  'stills/0000_10CloverfieldLane_00014.jpg',
  'stills/0000_10CloverfieldLane_00015.jpg',
  'stills/0000_10CloverfieldLane_00016.jpg',
  'stills/0000_10CloverfieldLane_00017.jpg',
  'stills/0000_10CloverfieldLane_00018.jpg',
  'stills/0000_10CloverfieldLane_00019.jpg',
  'stills/0000_10CloverfieldLane_00020.jpg',
  'stills/0000_10CloverfieldLane_0

In [78]:
import pathlib

In [85]:
# Point every still at its ".png" counterpart under "processed/": keep only
# the file name, swap the extension for ".png", and prefix the new folder.
# The result is rendered with as_posix() so the stored string always uses
# "/" separators, even when this cell runs on Windows (str() on a Windows
# path would produce backslashes).
processed = pathlib.Path("processed")
for d in data:
    d["stills"] = [
        (processed / pathlib.Path(s).with_suffix(".png").name).as_posix()
        for s in d["stills"]
    ]

In [92]:
# Re-key each record with a zero-padded, four-digit string id taken from its
# position in the list (position 0 becomes "0000").
data = [dict(record, mid=f"{position:04d}") for position, record in enumerate(data)]

In [93]:
data[0]

{'mid': '0000',
 'stills': ['processed/0000_10CloverfieldLane_00000.png',
  'processed/0000_10CloverfieldLane_00001.png',
  'processed/0000_10CloverfieldLane_00002.png',
  'processed/0000_10CloverfieldLane_00003.png',
  'processed/0000_10CloverfieldLane_00004.png',
  'processed/0000_10CloverfieldLane_00005.png',
  'processed/0000_10CloverfieldLane_00006.png',
  'processed/0000_10CloverfieldLane_00007.png',
  'processed/0000_10CloverfieldLane_00008.png',
  'processed/0000_10CloverfieldLane_00009.png',
  'processed/0000_10CloverfieldLane_00010.png',
  'processed/0000_10CloverfieldLane_00011.png',
  'processed/0000_10CloverfieldLane_00012.png',
  'processed/0000_10CloverfieldLane_00013.png',
  'processed/0000_10CloverfieldLane_00014.png',
  'processed/0000_10CloverfieldLane_00015.png',
  'processed/0000_10CloverfieldLane_00016.png',
  'processed/0000_10CloverfieldLane_00017.png',
  'processed/0000_10CloverfieldLane_00018.png',
  'processed/0000_10CloverfieldLane_00019.png',
  'processed/0

In [94]:
# Upload every record to the DynamoDB table through a batch writer, which
# (per the boto3 docs) buffers the puts and sends them in batches.
with table.batch_writer() as writer:
    for record in data:
        writer.put_item(Item=record)

In [95]:
# Show the creation timestamp reported for the table.
table.creation_date_time

datetime.datetime(2020, 8, 9, 20, 27, 40, 433000, tzinfo=tzlocal())

In [96]:
# NOTE: DynamoDB documents ItemCount as an approximate value that is
# refreshed roughly every six hours, so a 0 shown right after the batch
# write does not mean the write failed — read an item back to verify.
table.item_count

0

In [98]:
# Read one item back by its "mid" key to confirm the upload landed.
table.get_item(Key={"mid":"0000"})

{'Item': {'stills': ['processed/0000_10CloverfieldLane_00000.png',
   'processed/0000_10CloverfieldLane_00001.png',
   'processed/0000_10CloverfieldLane_00002.png',
   'processed/0000_10CloverfieldLane_00003.png',
   'processed/0000_10CloverfieldLane_00004.png',
   'processed/0000_10CloverfieldLane_00005.png',
   'processed/0000_10CloverfieldLane_00006.png',
   'processed/0000_10CloverfieldLane_00007.png',
   'processed/0000_10CloverfieldLane_00008.png',
   'processed/0000_10CloverfieldLane_00009.png',
   'processed/0000_10CloverfieldLane_00010.png',
   'processed/0000_10CloverfieldLane_00011.png',
   'processed/0000_10CloverfieldLane_00012.png',
   'processed/0000_10CloverfieldLane_00013.png',
   'processed/0000_10CloverfieldLane_00014.png',
   'processed/0000_10CloverfieldLane_00015.png',
   'processed/0000_10CloverfieldLane_00016.png',
   'processed/0000_10CloverfieldLane_00017.png',
   'processed/0000_10CloverfieldLane_00018.png',
   'processed/0000_10CloverfieldLane_00019.png',
  