# Get Image Color Channel Norms

Map-Reduce the preprocessed images to calculate the mean and std values for each color channel in the dataset.

In [1]:
!pip install -qr requirements.txt

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
import boto3
import numpy as np
import pandas as pd

import io
from PIL import Image

from tqdm import tqdm

In [3]:
from functools import reduce

In [4]:
s3 = boto3.resource("s3")
bucket = s3.Bucket("ap-unsplash-images")

In [5]:
with io.BytesIO() as b:
    bucket.download_fileobj("lite-normalized-photos.csv",b)
    b.seek(0)
    df = pd.read_csv(b)
df.head()

Unnamed: 0,photo_id,photo_image_url,photo_width,photo_height,photo_aspect_ratio,exif_camera_make,exif_camera_model,exif_iso,exif_aperture_value,exif_focal_length,exif_exposure_time,s3_key,s3_bucket
0,2Q8zDWkj0Yw,https://images.unsplash.com/photo-141520117961...,4192,2794,1.5,NIKON CORPORATION,NIKON D700,2000.0,2.5,50.0,1/125,lite-normalized/unsplash.lite.2Q8zDWkj0Yw.JPG,ap-unsplash-images
1,tsBDNuCJiLg,https://images.unsplash.com/photo-141768928330...,4324,2880,1.5,NIKON CORPORATION,NIKON D3200,320.0,7.1,200.0,1/2000,lite-normalized/unsplash.lite.tsBDNuCJiLg.JPG,ap-unsplash-images
2,A93gsuMxVcE,https://images.unsplash.com/photo-142981401899...,2000,1333,1.5,Canon,Canon EOS REBEL T2i,400.0,14.0,18.0,1/640,lite-normalized/unsplash.lite.A93gsuMxVcE.JPG,ap-unsplash-images
3,oYIdH6bFssk,https://images.unsplash.com/photo-143275722183...,4288,2848,1.51,NIKON CORPORATION,NIKON D5000,250.0,5.6,105.0,1/30,lite-normalized/unsplash.lite.oYIdH6bFssk.JPG,ap-unsplash-images
4,wgLPy2YBXuc,https://images.unsplash.com/photo-143205996405...,5312,2988,1.78,SAMSUNG,SAMSUNG-SM-G870A,40.0,2.2,4.8,1/2384,lite-normalized/unsplash.lite.wgLPy2YBXuc.JPG,ap-unsplash-images


In [6]:
def get_img_arr(key):
    with io.BytesIO() as f:
        bucket.download_fileobj(key,f)
        f.seek(0)
        return np.array(Image.open(f))
    
def reshape_arr(arr):
    return arr.reshape((-1,3))

def normalize(arr):
    return arr.astype("float32") / 255

def squash_pixels(arr):
    return np.concatenate((arr.mean(0).reshape((1,-1)), arr.std(0).reshape((1,-1))),0)

def process_img(key):
    arr = get_img_arr(key)
    arr = reshape_arr(arr)
    arr = normalize(arr)
    return squash_pixels(arr)

In [7]:
img = process_img(df.s3_key[0])
img

array([[0.55697006, 0.7256846 , 0.94664055],
       [0.31954202, 0.20058027, 0.06601173]], dtype=float32)

In [8]:
image_shapes = 960, 960
flat_image_length = np.prod(image_shapes)

In [9]:
progress_bar = tqdm(df.s3_key,desc="Images Processed",ncols=100)
processed_images = map(process_img,progress_bar)
channel_sums = reduce(lambda a,b: np.mean((a,b),0),processed_images)
means, stds = channel_sums

print("CHANNEL MEANS:", means)
print("CHANNEL STDS: ", stds)

Images Processed: 100%|███████████████████████████████████████| 21178/21178 [59:27<00:00,  5.94it/s]

CHANNEL MEANS: [0.56040615 0.36607313 0.42010993]
CHANNEL STDS:  [0.2895784  0.23586935 0.22863376]



