## Images dataset

In [21]:
import csv
import glob
import os
import tarfile
from zipfile import ZipFile

import cv2
import h5py
import numpy as np
import pandas as pd
import torch
from PIL import Image

In this notebook we set up the training, validation and testing datasets for the image classification task. We used the [UTKFace](https://susanqq.github.io/UTKFace/) dataset for training and validation, and we test on our own scraped profile pictures of the nannies. To obtain them you'll have to run the image_scrapper.R script, providing the urls from the user_ages.csv file. 

First we download the zip file from the link above, then use `ZipFile` (from the `zipfile` module) and the `tarfile` module to extract its contents. 

In [None]:
# Directory the downloaded zip is unpacked into; the archive parts are then
# looked up here. Replace the placeholder before running.
path = 'your_path'
# Directory the contents of the tar archives are extracted into.
target_dir = path

# Step 1: unpack the outer zip archive into `path`.
with ZipFile('UTKface_inthewild.zip', 'r') as f:
    f.extractall(path)

# Step 2: unpack every tar archive found in `path`. The listing is taken once,
# before anything is extracted, so freshly extracted folders are not revisited.
for file in sorted(os.listdir(path)):
    archive_path = os.path.join(path, file)
    # Skip sub-directories and files that are not tar archives instead of
    # failing on them.
    if not (os.path.isfile(archive_path) and tarfile.is_tarfile(archive_path)):
        continue
    with tarfile.open(archive_path, 'r:gz') as f:
        f.extractall(target_dir)

The next cell resizes all images in a directory. It is optional, but if you want to run it, do so on a local machine rather than on Google Colab. 

In [None]:
sizes = []        # original size of every image that could be opened
wild_files = []   # paths of the images that were resized and overwritten
new_size = (256,256)
dir = '../data/part2/'  # NOTE(review): shadows the builtin dir(); name kept in case other cells read it
for file in os.listdir(dir):
    filepath = os.path.join(dir, file)
    try:
        # The context manager releases the source file handle before the
        # file is overwritten with the resized copy.
        with Image.open(filepath) as img:
            sizes.append(img.size)
            resized = None
            if img.size != new_size:
                resized = img.resize(new_size).convert('RGB')
        if resized is not None:
            resized.save(filepath)
            wild_files.append(filepath)
    except Exception as err:
        # Best effort: report the file that could not be processed (and why)
        # and keep going. `Exception` rather than a bare `except` so that
        # KeyboardInterrupt still stops the loop.
        print(filepath, repr(err))

The next functions build a csv file containing the file path and other information (age, age class, gender, race) for each image in the UTKFace dataset. 

In [2]:
def class_labels_reassign(age):
    """
    Map an age in years to the index of its age class.

    Classes (a fractional age counts as its completed years, so 20.5 is
    still in the first class):
    0: 10-20
    1: 21-27
    2: 28-45
    3: 46-65
    4: everything else (66 and older, but also ages below 10 and NaN)

    Args:
    age: age in years, int or float
    Returns:
    int class index between 0 and 4
    """
    # Ages below the first class keep falling into the catch-all class, as
    # before. NOTE(review): build_csv only passes ages >= 14, but other
    # callers should filter young ages themselves.
    if age < 10:
        return 4
    # Half-open upper bounds leave no gap between consecutive classes, so
    # non-integer ages are no longer sent to the catch-all class.
    if age < 21:
        return 0
    if age < 28:
        return 1
    if age < 46:
        return 2
    if age < 66:
        return 3
    # 66+ and NaN (every comparison above is False for NaN).
    return 4


def build_csv(directory, output_csv_name, min_age=14):
    """
    Builds a csv index of the image files found in a directory.

    File names are split on '_' and must have exactly four components, read
    as age, gender, race and a trailing component that is ignored. Files
    with a different number of components, a non-integer age component, or
    an age below min_age are skipped.

    Args:
    directory: string of directory path holding the image files, e.g. './data/train'
    output_csv_name: string of output csv file name, e.g. 'train.csv'
    min_age: smallest age (inclusive) that is written to the csv, default 14
    Returns:
    None. Writes a csv file with the columns file_name, file_path, age,
    age_class, gender and race.
    """
    # Sorted so the row order does not depend on the file system.
    file_lst = sorted(os.listdir(directory))
    with open(output_csv_name, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        writer.writerow(['file_name', 'file_path', 'age', 'age_class', 'gender', 'race']) #create column names
        for img_file in file_lst:
            name_components = img_file.split('_')
            if len(name_components) != 4:
                continue
            age, gender, race, _ = name_components
            try:
                age_years = int(age)
            except ValueError:
                # Four components but no numeric age: not a labelled image.
                continue
            if age_years < min_age:
                continue
            img_path = os.path.join(directory, img_file)
            # The age is written exactly as it appears in the file name.
            writer.writerow([img_file, img_path, age, class_labels_reassign(age_years), gender, race])

In [5]:
datapath = "../../data/input/"
# One csv index per UTKFace part: images/<part> -> <part>.csv
for part_name in ("part1", "part2", "part3"):
    build_csv(datapath + "images/" + part_name, datapath + part_name + ".csv")

In this cell we create a csv file for the users' profile pictures. 

In [15]:
directory = datapath + "images/profile_pics/"
output_csv = datapath + "nannies_test.csv"
file_list = os.listdir(directory)
# Get user ages
user_ages = pd.read_csv("../../data/output/python_tests/user_ages.csv")
# newline="" is what the csv module expects for files it writes; without it
# extra blank rows can appear on platforms that translate line endings.
with open(output_csv, "w", newline="") as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    writer.writerow(['file_name', 'file_path', 'age', 'age_class'])
    for img_file in file_list:
        img_path = os.path.join(directory, img_file)
        # The part of the file name before the first '.' is the user id;
        # files that are not named after a numeric id are skipped.
        try:
            img_id = int(img_file.split('.')[0])
        except ValueError:
            continue
        # Known (non-missing) ages recorded for this user.
        ages_df = user_ages.loc[user_ages.id == img_id, "age"].dropna()
        if ages_df.empty:
            continue
        age = ages_df.iloc[0]
        writer.writerow([img_file, img_path, age, class_labels_reassign(age)])

CSV file ready.


We restrict the train and validation sets to roughly the age range covered by our test set, to make them more similar to it.

In [3]:
datapath = "../../data/input/"
class_names = ["10-20", "21-27", "28-45", "46-65", "65+"]
part1 = pd.read_csv(datapath + 'part1.csv')
part2 = pd.read_csv(datapath + 'part2.csv')
test = pd.read_csv(datapath + 'nannies_test.csv').dropna()
# index=False: otherwise the row index is written as an extra unnamed column,
# which is read back as data (and grows by one column) every time this cell
# is re-run.
test.to_csv(datapath + 'nannies_test.csv', index=False)

# Age window derived from the test set: strictly above its youngest age and
# strictly below its oldest age plus 10 years.
min_age = test.age.min()
max_age = test.age.max() + 10

# Parts 1 and 2 form the training set.
train = pd.concat([part1, part2]).reset_index(drop=True)
train = train[(train.age > min_age) & (train.age < max_age)].reset_index(drop=True)
train.to_csv(datapath + 'train.csv', index=False)

# Part 3 is the validation set, filtered with the same age window.
validation = pd.read_csv(datapath + 'part3.csv')
validation = validation[(validation.age > min_age) & (validation.age < max_age)].reset_index(drop=True)
validation.to_csv(datapath + 'validation.csv', index=False)

In this function we create an HDF5 file (via h5py) that stores the images, ages and age classes as numeric arrays rather than as JPEG files, which makes training much faster. If possible, run this cell on a local machine rather than on Google Colab, since it loops through all the images, which is very slow on Colab. 

In [46]:
def create_h5py_file(filename, dataframe, size: tuple):
    """
    Store the images and labels listed in a dataframe in one HDF5 file.

    The file gets three datasets, all stored as unsigned 8-bit integers:
    "X" with shape (n,) + size, "age" with shape (n,) and "age_class" with
    shape (n,), where n is the number of rows in the dataframe.

    Args:
    filename: path of the HDF5 file to create (an existing file is overwritten)
    dataframe: pandas DataFrame with the columns "file_path", "age" and "age_class"
    size: target image shape as (height, width, channels), e.g. (256, 256, 3)
    Raises:
    FileNotFoundError: if an image listed in "file_path" cannot be read
    """
    size = tuple(size)  # so a list argument still compares equal to img.shape
    n = len(dataframe)
    height, width = size[:2]
    with h5py.File(filename, "w") as out:
        X = out.create_dataset("X", (n,) + size, dtype="u1")
        age = out.create_dataset("age", (n,), dtype="u1")
        age_class = out.create_dataset("age_class", (n,), dtype="u1")

        age[:] = dataframe.loc[:, "age"].values
        age_class[:] = dataframe.loc[:, "age_class"].values

        for i, file in enumerate(dataframe.loc[:, "file_path"]):
            # Pixels are stored exactly as cv2.imread returns them
            # (OpenCV's default channel order is BGR).
            img = cv2.imread(file)
            if img is None:
                # cv2.imread signals a missing/unreadable file by returning None.
                raise FileNotFoundError(f"cv2.imread could not read image: {file}")
            if img.shape != size:
                # cv2.resize expects the target size as (width, height).
                img = cv2.resize(img, (width, height), interpolation=cv2.INTER_AREA)
            X[i] = img
        

In [27]:
correct_size = (256, 256, 3)
# Full-resolution export: one HDF5 file per split.
for split_name, split_frame in (('train', train), ('validation', validation), ('test', test)):
    create_h5py_file(datapath + split_name + '.h5', split_frame, size=correct_size)

In [65]:
correct_size = (30, 30, 3)
# Small 30x30 export: one "-30" HDF5 file per split.
for split_name, split_frame in (('train', train), ('validation', validation), ('test', test)):
    create_h5py_file(datapath + split_name + '-30.h5', split_frame, size=correct_size)

Next we create a PyTorch dataset that reads from the HDF5 file. 

In [48]:
class dataset_h5(torch.utils.data.Dataset):
    """
    PyTorch dataset backed by an HDF5 file with the datasets "X", "age" and
    "age_class" (the layout written by create_h5py_file).

    Each item is the tuple (x, age, age_class) for one row. The file is
    opened read-only when the dataset is created and stays open until
    close() is called; the dataset can also be used as a context manager.
    """

    def __init__(self, in_file, transform=None):
        """
        Args:
        in_file: path of the HDF5 file to read
        transform: optional callable applied to each image before it is returned
        """
        super().__init__()

        self.file = h5py.File(in_file, 'r')
        self.transform = transform

    def __getitem__(self, index):
        """Return (x, age, age_class) for the row at `index`."""
        x = self.file['X'][index, ...]
        age = self.file['age'][index, ...]
        age_class = self.file['age_class'][index, ...]

        # Preprocessing each image
        if self.transform is not None:
            x = self.transform(x)

        return x, age, age_class

    def __len__(self):
        """Number of rows, taken from the first axis of "X"."""
        return self.file['X'].shape[0]

    def close(self):
        """Close the underlying file; calling it again is a no-op."""
        if self.file is not None:
            self.file.close()
            self.file = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

In [54]:
# 'test.h5' is the full-resolution test file written by the export cell above;
# no cell in this notebook creates the previously referenced 'test-1.h5'.
test_dataset = dataset_h5(datapath + 'test.h5')
test_dataset[0][0].shape

(256, 256, 3)

In [None]:
# Open the 30x30 training file and display its first (image, age, age_class) item.
train_30_path = datapath + 'train-30.h5'
train_dataset = dataset_h5(train_30_path)
train_dataset[0]