In [1]:
import io
import requests

import numpy as np
import pandas as pd

import boto3

In [2]:
from PIL import Image

In [3]:
from tqdm.notebook import tqdm

In [None]:
# Module-level S3 client; upload_image() reads this global when uploading.
# NOTE(review): process_image is later run in multiprocessing.Pool workers,
# so each worker uses whatever copy of this client it ends up with — confirm
# that sharing the client this way is intended.
s3 = boto3.client("s3")

In [None]:
# Load the tab-separated photo table. The "photo_id" and "photo_image_url"
# columns are the ones consumed further down in the notebook.
df = pd.read_csv("data/photos.tsv000",sep="\t")
# Preview the first rows as the cell output.
df.head()

In [None]:
# for img_id, url in tqdm(df[["photo_id","photo_image_url"]].values,desc="Images Downloaded"):
#     resp = requests.get(url)
#     b = io.BytesIO(resp.content)
#     b.seek(0)
#     s3.upload_fileobj(b,"ap-unsplash-images",f"{img_id}.jpg")

In [None]:
def get_crop(img):
    """Return a square box that centres ``img`` on its longer side.

    The box is ``(left, top, right, bottom)`` with side ``max(w, h)``. It
    reaches *outside* the image along the shorter axis (negative ``left`` or
    ``top``), so passing it to ``img.crop`` pads the image to a square rather
    than cutting pixels away.

    All coordinates are integers. When the side difference is odd, the spare
    pixel of padding goes on the bottom/right edge. Using whole numbers avoids
    handing ``x.5`` offsets to the consumer, whose rounding of each edge could
    otherwise produce a box that is one pixel off square.

    Args:
        img: Any object exposing a ``size`` attribute of ``(width, height)``.

    Returns:
        A 4-tuple of ints describing a ``max(w, h)`` x ``max(w, h)`` box.
    """
    w, h = img.size
    side = max(w, h)
    # Floor-divide so the offsets stay integral; one of the two is always 0.
    left = -((side - w) // 2)
    top = -((side - h) // 2)
    # Deriving right/bottom from the side length guarantees a square box.
    return (left, top, left + side, top + side)
    
def transform_img(img,new_size=(960,960)):
    """Square ``img`` using the box from ``get_crop`` and resize the result.

    Args:
        img: Image object providing ``crop`` and ``resize``.
        new_size: Target ``(width, height)`` passed to ``resize``.

    Returns:
        Whatever ``resize`` returns for the squared image.
    """
    square_box = get_crop(img)
    squared = img.crop(square_box)
    return squared.resize(new_size)

def drop_alpha(img):
    """Return a three-channel RGB version of ``img``.

    An RGBA input loses its alpha channel, and other modes (greyscale,
    palette, CMYK, ...) are converted to RGB as well. Slicing the pixel array
    to its first three channels would instead raise ``IndexError`` on
    single-channel (2-D) images and would relabel CMYK channels as RGB.

    Args:
        img: A PIL image (anything exposing ``convert(mode)``).

    Returns:
        A new image in mode ``"RGB"``.
    """
    # Let Pillow do the mode conversion instead of slicing raw arrays.
    return img.convert("RGB")

def load_image(url: str, timeout: float = 30.0):
    """Download ``url`` and open the response body as a PIL image.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout a stalled connection would block the caller indefinitely.

    Returns:
        The image opened from the downloaded bytes.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    resp = requests.get(url, timeout=timeout)
    # Fail on HTTP errors here rather than handing an error page to PIL.
    resp.raise_for_status()
    return Image.open(io.BytesIO(resp.content))

def upload_image(img, img_id: str):
    """JPEG-encode ``img`` in memory and upload it to the images bucket.

    The object is written to bucket ``ap-unsplash-images`` under the key
    ``reshaped/<img_id>.jpg`` using the module-level ``s3`` client.

    Args:
        img: Image object providing ``save(fileobj, format=...)``.
        img_id: Identifier used to build the object key.
    """
    buffer = io.BytesIO()
    try:
        img.save(buffer, format="JPEG")
        # Rewind so the upload reads the encoded bytes from the start.
        buffer.seek(0)
        s3.upload_fileobj(buffer, "ap-unsplash-images", f"reshaped/{img_id}.jpg")
    finally:
        buffer.close()
        
def process_image(url,img_id=None):
    """Download one image, convert and square/resize it, then upload it.

    Can be called as ``process_image(url, img_id)`` or with a single
    two-item sequence ``(url, img_id)`` as the first argument, which is the
    form the ``Pool.map`` call in this notebook supplies.

    Args:
        url: Image URL, or a ``(url, img_id)`` pair when ``img_id`` is omitted.
        img_id: Identifier used for the uploaded object; ``None`` means it is
            unpacked from ``url``.
    """
    if img_id is None:
        # Single-argument form: the pair arrives packed in ``url``.
        url, img_id = url
    prepared = transform_img(drop_alpha(load_image(url)))
    upload_image(prepared, img_id)

In [None]:
from multiprocessing import Pool

In [None]:
# Run process_image over every (url, photo_id) row in a default-sized pool of
# worker processes; each row reaches process_image as one packed argument.
with Pool() as workers:
    workers.map(process_image, df[["photo_image_url","photo_id"]].values)