In [2]:
from pathlib import Path

import boto3
import numpy as np
from PIL import Image

import pyarrow as pa
import pyarrow.parquet as pq

import tensorflow as tf

In [26]:
from concurrent.futures import ThreadPoolExecutor

In [3]:
# Local scratch directory, relative to the working directory (presumably where
# downloaded objects are staged — confirm against callers).
tmp_dir = Path("./tmp")
# exist_ok=True keeps this cell safe to re-run when the directory already exists.
tmp_dir.mkdir(exist_ok=True)

In [12]:
# S3 bucket names: presumably the input images and the destination for the
# computed VGG vectors — confirm against the code that uses them.
SOURCE_BUCKET = "apoor-clean-movie-stills"
DEST_BUCKET = "apoor-vgg-movie-vecs"

# Module-level boto3 S3 client shared by the S3 helper functions in this notebook.
s3 = boto3.client("s3")

In [10]:
# Number of items handled per batch; same value as the default of
# iter_keys' batch_size parameter.
batch_size = 1_000

In [9]:
# Input shape handed to the network: (height, width, channels).
input_shape = (500, 500, 3)

# VGG16 with ImageNet weights and without the top classifier layers
# (include_top=False), so the model outputs convolutional feature maps.
vgg16 = tf.keras.applications.VGG16(
    include_top=False,
    weights='imagenet',
    input_shape=input_shape
)
# Print the layer-by-layer architecture as a sanity check.
vgg16.summary()

Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 500, 500, 3)]     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 500, 500, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 500, 500, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 250, 250, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 250, 250, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 250, 250, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 125, 125, 128)     0     

In [None]:
def iter_keys(bucket: str, batch_size: int = 1_000):
    """Yield the object keys of an S3 bucket in batches.

    Pages through ``list_objects_v2`` on the module-level ``s3`` client,
    using the last key of each page as the ``StartAfter`` marker of the
    next request.

    Args:
        bucket: Name of the bucket to list.
        batch_size: Upper bound on the number of keys per batch (passed to
            S3 as ``MaxKeys``).

    Yields:
        Non-empty lists of key strings. Nothing is yielded for a bucket
        with no objects.
    """
    last_key = ""
    while True:
        resp = s3.list_objects_v2(
            Bucket=bucket,
            MaxKeys=batch_size,
            StartAfter=last_key,
        )
        # "Contents" is omitted from the response when the page has no objects.
        keys = [c["Key"] for c in resp.get("Contents", [])]
        if not keys:
            break
        yield keys
        if not resp.get("IsTruncated"):
            break
        last_key = keys[-1]
            
            
def download_object(bucket: str, key: str, tmp_dir: Path) -> Path:
    """Download one S3 object into ``tmp_dir`` and return the local path.

    Uses the module-level ``s3`` client. The object is written to
    ``tmp_dir / key``; intermediate directories are created when the key
    contains ``/`` separators.

    Args:
        bucket: Name of the bucket holding the object.
        key: Key of the object to fetch.
        tmp_dir: Directory under which the object is stored.

    Returns:
        Path of the file that was written.
    """
    res = s3.get_object(Bucket=bucket, Key=key)
    dest = Path(tmp_dir) / key
    # Keys may contain "/" — make sure the implied sub-directories exist.
    dest.parent.mkdir(parents=True, exist_ok=True)
    with open(dest, "wb") as f:
        f.write(res["Body"].read())
    return dest
    
    
def batch_download(bucket: str, keys: [str], tmp_dir: Path) -> [Path]:
    """Download several S3 objects concurrently.

    Each key is passed to ``download_object`` on a thread pool.

    Args:
        bucket: Name of the bucket holding the objects.
        keys: Keys of the objects to fetch.
        tmp_dir: Directory handed to ``download_object`` for every key.

    Returns:
        A list with the result of ``download_object`` for each key, in the
        same order as ``keys``.

    Raises:
        Any exception raised by a download is re-raised here, because the
        results are collected before returning.
    """
    def curried_download(key: str):
        return download_object(bucket, key, tmp_dir)

    with ThreadPoolExecutor() as pool:
        # Materialise the lazy map iterator so callers get a reusable list
        # and worker errors surface inside this call.
        return list(pool.map(curried_download, keys))

    
def clean_tmp_files(paths: [Path]):
    """Delete the given files.

    Args:
        paths: Paths (``Path`` objects or path strings) of the files to
            remove.

    Raises:
        FileNotFoundError: If one of the files does not exist.
    """
    for p in paths:
        Path(p).unlink()
    
    
def load_image(path: Path) -> np.ndarray:
    """Read an image file into a NumPy array.

    Args:
        path: Location of the image file.

    Returns:
        The pixel data as an array, in whatever mode/shape PIL decodes the
        file to (no colour conversion or resizing is applied).
    """
    # The context manager releases the underlying file handle as soon as the
    # pixels have been copied into the array.
    with Image.open(path) as img:
        return np.array(img)

    
def load_images(keys: [Path]) -> np.ndarray:
    """Load several image files into one batched array.

    Args:
        keys: Paths of the image files to load, each read with
            ``load_image``.

    Returns:
        An array with a new leading axis indexing the images, in the order
        of ``keys``.

    Raises:
        ValueError: If ``keys`` is empty or the images do not all have the
            same shape.
    """
    # np.stack needs a real sequence (a generator is rejected) and adds the
    # leading batch axis itself.
    return np.stack([load_image(p) for p in keys])

def vgg_process(data: np.ndarray):
    """Placeholder — not implemented yet.

    The name suggests this will run ``data`` through the VGG model
    (TODO: implement). For now the argument is ignored and ``None`` is
    returned.
    """
    return None


def write_parquets(data, keys) -> [Path]:
    """Placeholder — not implemented yet.

    The name and return annotation suggest this will write ``data`` to
    Parquet files and return their paths (TODO: implement). For now both
    arguments are ignored and ``None`` is returned.
    """
    return None

In [30]:
# Scratch: small 1-D array of ones used to try out np.expand_dims.
arr = np.ones(3)
arr

array([1., 1., 1.])

In [32]:
# Scratch: inserting axis 0 turns the shape (3,) array into shape (1, 3).
np.expand_dims(arr,0)

array([[1., 1., 1.]])

In [33]:
# Scratch: display the module repr to see which TensorFlow install is loaded.
tf

<module 'tensorflow' from '/home/apoor/anaconda3/lib/python3.7/site-packages/tensorflow/__init__.py'>

In [37]:
# Push a single all-zero input of shape (1, 500, 500, 3) through the network
# to find out the shape of the output.
res = vgg16.predict(np.zeros((1,500,500,3)))
res.shape

(1, 15, 15, 512)

In [38]:
# Total number of values in the prediction for one input.
np.prod(res.shape)

115200

In [41]:
# Values per part when the output is split three ways (presumably to lay it
# out as a 3-channel image — confirm).
np.prod(res.shape) / 3

38400.0

In [42]:
# Square root of one third of the values: not a whole number, so one third
# cannot be arranged as an exact square.
(np.prod(res.shape) // 3) ** 0.5

195.95917942265424

In [45]:
# Find the factor pair (i, j) of `target` closest to a square: scan downwards
# from just above sqrt(target) and stop at the first exact divisor.
target = np.prod(res.shape) // 3

for i in range(int(target**0.5+1),0,-1):
    # Integer modulo is exact; testing `target / i % 1` goes through float
    # division and can misreport divisibility for large values.
    if target % i == 0:
        j = target // i
        print(f"{i} * {j} = {target}")
        break

192 * 200 = 38400
