# Preprocess and Upload Data

In [None]:
!pip install -qr requirements.txt

In [1]:
import boto3
import pandas as pd

import io
import requests
from PIL import Image

from multiprocessing import Pool, cpu_count
from concurrent.futures import ThreadPoolExecutor

from tqdm import tqdm

In [2]:
# Show how many CPUs multiprocessing reports for this machine.
cpu_count()

2

In [3]:
# S3 bucket that the processed images are written to.
BUCKET_NAME = "ap-unsplash-images"

s3 = boto3.resource("s3")
bucket = s3.Bucket(BUCKET_NAME)

In [4]:
# Columns kept from the photos table: id, source URL, dimensions and EXIF fields.
PHOTO_COLUMNS = [
    "photo_id",
    "photo_image_url",
    "photo_width",
    "photo_height",
    "photo_aspect_ratio",
    "exif_camera_make",
    "exif_camera_model",
    "exif_iso",
    "exif_aperture_value",
    "exif_focal_length",
    "exif_exposure_time",
]

# `usecols` lets the parser skip every other column of the TSV instead of
# loading it and throwing it away. The explicit selection afterwards pins the
# column order, because `usecols` ignores the order of the list it is given.
# Rows with a missing value in any kept column are dropped.
df = pd.read_csv(
    "data/photos.tsv000",
    sep="\t",
    usecols=PHOTO_COLUMNS,
)[PHOTO_COLUMNS].dropna()

df.head()

Unnamed: 0,photo_id,photo_image_url,photo_width,photo_height,photo_aspect_ratio,exif_camera_make,exif_camera_model,exif_iso,exif_aperture_value,exif_focal_length,exif_exposure_time
0,2Q8zDWkj0Yw,https://images.unsplash.com/photo-141520117961...,4192,2794,1.5,NIKON CORPORATION,NIKON D700,2000.0,2.5,50.0,1/125
1,tsBDNuCJiLg,https://images.unsplash.com/photo-141768928330...,4324,2880,1.5,NIKON CORPORATION,NIKON D3200,320.0,7.1,200.0,1/2000
2,A93gsuMxVcE,https://images.unsplash.com/photo-142981401899...,2000,1333,1.5,Canon,Canon EOS REBEL T2i,400.0,14.0,18.0,1/640
3,oYIdH6bFssk,https://images.unsplash.com/photo-143275722183...,4288,2848,1.51,NIKON CORPORATION,NIKON D5000,250.0,5.6,105.0,1/30
4,wgLPy2YBXuc,https://images.unsplash.com/photo-143205996405...,5312,2988,1.78,SAMSUNG,SAMSUNG-SM-G870A,40.0,2.2,4.8,1/2384


In [5]:
def load_image(url, timeout=30):
    """Download an image over HTTP and open it with PIL.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds to wait for the server before giving up. It is passed
            to ``requests.get``, which applies no timeout unless one is given.

    Returns:
        The PIL image opened from the response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    resp = requests.get(url, timeout=timeout)
    # Fail on error responses here instead of handing an error page to PIL.
    resp.raise_for_status()
    b = io.BytesIO(resp.content)
    return Image.open(b)

def reshape_image(img,shape=(960,960)):
    """Return ``img`` resized to ``shape``.

    ``shape`` is handed unchanged to ``img.resize``; it defaults to (960, 960).
    """
    resized = img.resize(shape)
    return resized

def upload_img(img,name):
    """Encode ``img`` as JPEG in memory and upload it to the module-level ``bucket``.

    The image is converted to RGB first, written to an in-memory buffer, and
    uploaded under the key ``lite-normalized/unsplash.lite.<name>.JPG``.
    """
    rgb = img.convert("RGB")
    with io.BytesIO() as buffer:
        rgb.save(buffer, "JPEG")
        # Rewind so the upload reads the encoded bytes from the start.
        buffer.seek(0)
        key = f"lite-normalized/unsplash.lite.{name}.JPG"
        bucket.upload_fileobj(buffer, key)
        
def process_image(url,name=None):
    """Download, resize and upload a single image.

    Can be called as ``process_image(url, name)``, or with one ``(url, name)``
    pair as the only argument, in which case the pair is unpacked.
    """
    if name is None:
        # Single-argument form: `url` actually holds the (url, name) pair.
        url, name = url
    upload_img(reshape_image(load_image(url)), name)

In [6]:
# Keys written by this notebook look like "<KEY_PREFIX><photo_id><KEY_SUFFIX>".
KEY_PREFIX = "lite-normalized/unsplash.lite."
KEY_SUFFIX = ".JPG"

# List only keys under the prefix instead of listing the whole bucket and
# blindly discarding the first entry. A blind skip drops a real image whenever
# the first key is not a placeholder, and raises StopIteration on an empty bucket.
objs = bucket.objects.filter(Prefix=KEY_PREFIX)

# Strip prefix and suffix by position so only the photo id remains.
# dtype is pinned so an empty listing still yields an object Series.
existing_objects = pd.Series(
    [
        o.key[len(KEY_PREFIX):-len(KEY_SUFFIX)]
        for o in objs
        if o.key.endswith(KEY_SUFFIX)
    ],
    dtype="object",
)

existing_objects.head()

0    -3cTY-Q6k88
1    -3qSsolbivo
2    -4AR-vVjAbM
3    -4qCLz3r1s8
4    -5WWw6DeQ8w
dtype: object

In [7]:
# Progress so far: objects already in the bucket versus rows in the photo table.
n_existing, n_total = len(existing_objects), len(df)

print(
    f"[{(n_existing/n_total*100):6.2f}% ] Processed {n_existing:,d} of {n_total:,d} images."
)

[ 20.06% ] Processed 4,250 of 21,182 images.


In [8]:
# with Pool() as P:
#     P.map(
#         process_image,
#         df[
#             ["photo_image_url","photo_id"]
#         ][
#             ~df.photo_id.isin(existing_objects)
#         ].values
#     )

In [9]:
# with ThreadPoolExecutor() as ex:
#     ex.map(
#         process_image,
#         df[
#             ["photo_image_url","photo_id"]
#         ][
#             ~df.photo_id.isin(existing_objects)
#         ].values
#     )

In [None]:
# Rows of df whose photo_id is not among the ids already found in the bucket.
not_uploaded = ~df.photo_id.isin(existing_objects)
pending = df.loc[not_uploaded, ["photo_image_url", "photo_id"]]

# Process the remaining images one at a time, with a progress bar.
for url, pid in tqdm(pending.values):
    process_image(url,pid)

  0%|          | 71/16932 [01:50<7:09:21,  1.53s/it] 