# Imports

In [None]:
class Object:
    """Bare attribute container; fields are attached to instances at runtime."""


# Single namespace object that holds the notebook-level variables.
t = Object()

In [None]:
from PIL import Image, ImageDraw

Pytorch:

In [None]:
import torch
from torch import nn
import torchvision.transforms as transforms

Numpy and graphs:

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os
import timeit
import datetime
import sys
import random
from typing import List, Tuple

We will keep variables inside an object to avoid polluting the global namespace and to prevent various bugs. The `Object` class defined above allows adding attributes at runtime:

# Settings

You need to add the CelebA dataset to your Kaggle notebook.

In [None]:
# image_root is the folder the attribute/partition CSV files are read from;
# image_folder is the folder the image files are read from.
t.image_root = "../../../../data/vae/img_align_celeba/"
t.image_folder = f"{t.image_root}/img_align_celeba"

# Alternative paths, kept commented out (presumably the Kaggle dataset mount — confirm).
# t.image_root = "../input/celeba-dataset/"
# t.image_folder = f"{t.image_root}/img_align_celeba/img_align_celeba"

t.output_folder = "../working"
# Side length passed to CenterCrop in the transform pipeline.
t.crop_size = 178

For reproducibility, we will set a fixed seed for all randomized functions:

In [None]:
# Seed every random number generator the notebook imports so runs are repeatable.
t.seed = 1
random.seed(t.seed)  # Python's built-in RNG (the `random` module is imported above)
torch.manual_seed(t.seed)
torch.cuda.manual_seed(t.seed)
torch.cuda.manual_seed_all(t.seed)
np.random.seed(t.seed)
# For complete reproducibility
# torch.backends.cudnn.benchmark = False
# torch.backends.cudnn.deterministic = True

# Set Device

We can run our code on GPU or CPU, if GPU is not present. Training will be very slow on CPU. However, we can pre-train our model on GPU, save, and then experiment with it on CPU. That'll be reasonably fast.

In [None]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
t.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
t.device  # bare expression so the notebook shows the selected device

# Data Loader

We will no longer load all images into memory because it's not efficient, and the dataset is too large. We will use PyTorch's Data Loader to load images efficiently.

The data loader concept uses two objects:

1. Data Set - represents the actual data, or, in our case, images.
2. Data Loader - specializes in loading, converting and passing the data to the model.

Let's start by defining the dataset.

In [None]:
import torchvision.datasets as datasets

def plot_images(images):
    """Show a sequence of images side by side in a single row.

    Args:
        images: Sequence of images accepted by ``Axes.imshow``. An empty
            sequence is a no-op.
    """
    if len(images) == 0:
        # plt.subplots cannot build a grid with zero columns.
        return
    # squeeze=False always returns a 2-D array of axes, so a single image
    # works as well (by default one subplot comes back as a bare Axes
    # object that cannot be indexed).
    _, axes = plt.subplots(1, len(images), figsize=(20, 5), squeeze=False)
    for ax, im in zip(axes[0], images):
        ax.axis('off')
        ax.imshow(im, cmap="gray")

# Load Image Metadata

Move images into subfolders for classes "with smile", "no smile" as ImageFolder expects. We will use these classes to transform images from "no smiles", to "smiles".

First, we read the 'smile' attribute from a CSV file that maps images to smile setting:

# Dataset

We will create a custom dataset object to return files in the order we want:

1. Return training subset
1. Use validation subset for validation
1. All faces will be labeled either smile or no smile

In [None]:
import csv

class ImageTensorDataset(torch.utils.data.Dataset):
    """Images of one dataset partition, each labelled with its ``Smiling`` value.

    Metadata comes from two CSV files in ``csv_folder``:
    ``list_attr_celeba.csv`` (columns ``image_id`` and ``Smiling``) and
    ``list_eval_partition.csv`` (columns ``image_id`` and ``partition``,
    where 0/1/2 mean train/valid/test).
    """

    def __init__(self, csv_folder, image_folder, transform, is_train):
        """Load the metadata and select the partition to serve.

        Args:
            csv_folder: Folder containing the two CSV metadata files.
            image_folder: Folder containing the image files.
            transform: Callable applied to each opened PIL image.
            is_train: Serve the "train" partition when true, otherwise "valid".
        """
        self.folder = image_folder
        self.int_to_smile = {-1: "no_smile", 1: "smile"}
        self.smile_to_int = {
            self.int_to_smile[1]: 1,
            self.int_to_smile[-1]: -1
        }
        self.image_to_smile = {}
        self.smile_to_image = {"no_smile": [], "smile": []}
        self.dataset_split = {"train": [], "valid": [], "test": []}
        self.dataset_split_translation = {0: "train", 1: "valid", 2: "test"}
        self.load_metadata(csv_folder)
        self.set_type = "train" if is_train else "valid"
        self.transform = transform

    def load_metadata(self, csv_folder):
        """Fill the label and partition lookup tables from the CSV files."""
        # Context managers close the files deterministically; the csv module
        # documentation recommends opening its input with newline="".
        with open(f"{csv_folder}/list_attr_celeba.csv", newline="") as attr_file:
            for row in csv.DictReader(attr_file):
                smiling = int(row['Smiling'])
                self.image_to_smile[row['image_id']] = smiling
                self.smile_to_image[self.int_to_smile[smiling]].append(row['image_id'])
        with open(f"{csv_folder}/list_eval_partition.csv", newline="") as split_file:
            for row in csv.DictReader(split_file):
                split_name = self.dataset_split_translation[int(row["partition"])]
                self.dataset_split[split_name].append(row['image_id'])

    def __len__(self):
        """Return the number of images in the selected partition."""
        return len(self.dataset_split[self.set_type])

    def __getitem__(self, index):
        """Return ``(transformed image, smile label)`` for ``index``.

        The label is the integer from the ``Smiling`` column (-1 or 1
        according to ``int_to_smile``).
        """
        file_id = self.dataset_split[self.set_type][index]
        file_path = self.folder + "/" + file_id
        im = self.transform(Image.open(file_path))
        return im, self.image_to_smile[file_id]

In [None]:
# ToTensor runs first, so the crop and resize steps receive its output.
t.train_transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.CenterCrop(t.crop_size),
        transforms.Resize(128),
    ]
)


In [None]:
# Training partition; the metadata CSVs are read from image_root.
t.train_dataset = ImageTensorDataset(
    csv_folder=t.image_root,
    image_folder=t.image_folder,
    transform=t.train_transform,
    is_train=True,
)

In [None]:
# Validation partition; it reuses the training transform.
t.valid_dataset = ImageTensorDataset(
    csv_folder=t.image_root,
    image_folder=t.image_folder,
    transform=t.train_transform,
    is_train=False,
)

# Init Data Loader

In [None]:
# Training loader. shuffle=True reshuffles the samples every epoch so the
# optimizer does not see the images in the fixed order of the partition CSV.
t.train_loader = torch.utils.data.DataLoader(
    t.train_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=8,
    pin_memory=True,
)

In [None]:
# Validation loader: no shuffle argument is passed and the batch size is
# larger than the one used for training.
t.valid_loader = torch.utils.data.DataLoader(
    t.valid_dataset,
    batch_size=512,
    num_workers=8,
    pin_memory=True,
)

Test loader is the same dataset, but 1024 samples, sampled from the beginning.

Read a small sample of images:

In [None]:
def read_images(train_loader, count, device=None):
    """Collect the first ``count`` images yielded by a loader.

    Args:
        train_loader: Iterable yielding ``(image_batch, labels)`` pairs.
        count: Number of images to collect. Fewer are returned when the
            loader runs out first.
        device: Device the result is moved to; defaults to ``t.device``.

    Returns:
        Tensor with at most ``count`` images stacked along dimension 0.

    Raises:
        ValueError: If the loader yields no batches at all.
    """
    if device is None:
        device = t.device
    chunks = []
    remain = count
    for batch_images, _labels in train_loader:
        # Slice every batch, not only the first one, so the result never
        # holds more than `count` images.
        taken = batch_images[:max(remain, 0)]
        chunks.append(taken)
        remain -= taken.size(0)
        if remain <= 0:
            break
    if not chunks:
        raise ValueError("train_loader yielded no batches")
    return torch.cat(chunks, 0).to(device)

In [None]:
# Keep the first 10 images the training loader yields as a fixed sample.
t.sample_images = read_images(t.train_loader, 10)
t.sample_images.shape

The function to display our images:

In [None]:
def plot_images_tensor(images_t):
    """Draw a batch of image tensors in a single row.

    The batch is moved to the CPU, detached and permuted with (0, 2, 3, 1)
    before plotting. At least two subplots are always requested, so the
    axes object returned by matplotlib can be indexed.
    """
    batch = images_t.cpu().detach().permute(0, 2, 3, 1).numpy()
    n_cols = max(batch.shape[0], 2)
    fig, axis = plt.subplots(1, n_cols, figsize=(20, 5))
    for idx in range(batch.shape[0]):
        current = axis[idx]
        current.axis('off')
        current.imshow(batch[idx])
        
def plot_images(images):
    """Show a sequence of images side by side in a single row.

    Args:
        images: Sequence of images accepted by ``Axes.imshow``. An empty
            sequence is a no-op.
    """
    if len(images) == 0:
        # plt.subplots cannot build a grid with zero columns.
        return
    # squeeze=False always returns a 2-D array of axes, so a single image
    # works as well (by default one subplot comes back as a bare Axes
    # object that cannot be indexed).
    _, axes = plt.subplots(1, len(images), figsize=(20, 5), squeeze=False)
    for ax, im in zip(axes[0], images):
        ax.axis('off')
        ax.imshow(im, cmap="gray")

In [None]:
# Display the sample images read from the training loader.
plot_images_tensor(t.sample_images)

The dataset labels each image with the integer value of its `Smiling` attribute. In our case, `-1` means no smile, while `1` means smile. This is very reasonable.

# Design Model

Pytorch doesn't provide a "reshape" layer that we will need for our model. They didn't provide "Flatten" for a while either, but then added it. The hope is they will add Reshape at some point as well.

For now, we'll just implement it ourselves. All NN layers inherit from nn.Module.

In [None]:
class Reshape(nn.Module):
    """Layer that returns a view of its input with a fixed target shape."""

    def __init__(self, *args):
        super().__init__()
        # Target dimensions, stored as the tuple of positional arguments.
        self.shape = args

    def forward(self, x):
        """Return ``x`` viewed with the stored shape."""
        target = self.shape
        return x.view(target)

## Define Model

We now need to do extra steps during encoding, e.g. sampling from the normal distribution. So, we'll define our standard layers as `_encoder_main` and will have a function `encode` that will call layers, and do additional work.
    

In [None]:
import pytz 

class VAE(nn.Module):
    """Convolutional variational autoencoder with a 200-dimensional latent space.

    Besides the network, the instance tracks how many epochs it has been
    trained for and the accumulated training time, and carries the file name
    used when the model is saved.
    """

    def __init__(self, device):
        """Build the layers and move the module to ``device``.

        Args:
            device: ``torch.device`` the model is moved to.
        """
        super().__init__()

        # Training bookkeeping (see start_training / end_training / add_epochs).
        self._total_epochs = 0
        self._training_time_s = 0
        self._timer = None

        # Standard normal used to draw the noise in encode(); on CUDA its
        # parameters are moved to the GPU so samples are created there.
        self.N = torch.distributions.Normal(0, 1)
        if device.type == 'cuda':
            self.N.loc = self.N.loc.cuda()
            self.N.scale = self.N.scale.cuda()

        # File name stamped with the creation time in the US/Pacific zone.
        self.model_file_name = "faces_vae_" \
            + datetime.datetime.now().astimezone(pytz.timezone('US/Pacific')).strftime("%Y-%m-%d_%H-%M") \
            + ".pkl"

        # Heads mapping the flattened encoder features to the latent
        # mean and log-variance.
        self.z_mean = torch.nn.Linear(64 * 8 * 8, 200)
        self.log_var = torch.nn.Linear(64 * 8 * 8, 200)

        self.encoder = nn.Sequential(
            # 128 x 128
            nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
            # 64 x 64
            nn.Conv2d(32, 32, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
            # 32 x 32
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
            # 16 x 16
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
            # 8 x 8, flattened to 64 * 8 * 8 features
            nn.Flatten(),
        )

        self.decoder = nn.Sequential(
            nn.Linear(200, 64 * 8 * 8),
            Reshape(-1, 64, 8, 8),

            # 8 x 8
            nn.ConvTranspose2d(64, 64, kernel_size=2, stride=2),
            nn.BatchNorm2d(64),
            nn.ReLU(),

            # 16 x 16
            nn.ConvTranspose2d(64, 32, kernel_size=2, stride=2),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            # 32 x 32
            nn.ConvTranspose2d(32, 32, kernel_size=2, stride=2),
            nn.BatchNorm2d(32),
            nn.ReLU(),

            # 64 x 64
            nn.ConvTranspose2d(32, 3, kernel_size=2, stride=2),
            # No batch norm on the output layer; Sigmoid bounds it to (0, 1).
            nn.Sigmoid(),

            # 128 x 128
        )

        self.to(device)

    def encode(self, x):
        """Encode ``x`` and draw a latent sample.

        Returns:
            Tuple ``(encoded, z_mean, log_var)`` where
            ``encoded = z_mean + eps * exp(0.5 * log_var)`` and ``eps`` is
            sampled from ``self.N``. Noise is drawn on every call.
        """
        x = self.encoder(x)
        z_mean, log_var = self.z_mean(x), self.log_var(x)
        std_dev = torch.exp(0.5 * log_var)
        eps = self.N.sample(z_mean.shape)
        encoded = z_mean + eps * std_dev
        return encoded, z_mean, log_var

    def forward(self, x):
        """Return ``(decoded, z_mean, log_var)`` for the input batch ``x``."""
        encoded, z_mean, log_var = self.encode(x)
        decoded = self.decoder(encoded)
        return decoded, z_mean, log_var

    def start_training(self):
        """Start the training timer unless it is already running."""
        if self._timer is None:
            self._timer = timeit.default_timer()

    def end_training(self):
        """Stop the timer, if running, and add the elapsed time to the total."""
        # Compare against None, mirroring start_training: a truthiness test
        # would treat a timer reading of exactly 0.0 as "not running".
        if self._timer is not None:
            self._training_time_s += timeit.default_timer() - self._timer
            self._timer = None

    def add_epochs(self, epochs):
        """Increase the count of completed training epochs by ``epochs``."""
        self._total_epochs += epochs

    def training_timedelta(self):
        """Return the accumulated training time as a ``datetime.timedelta``."""
        return datetime.timedelta(seconds=self._training_time_s)
    
# Instantiate the model on the selected device.
t.model = VAE(t.device)

In [None]:
# Total number of elements across all model parameters.
print('Parameters: ', sum(p.numel() for p in t.model.parameters()))

# Loss Function

## MSE Loss

In [None]:
# reduction="none" keeps the per-element losses; mse_loss does the reduction itself.
t.mse_loss_base = nn.MSELoss(reduction="none")

def mse_loss(pred, y_true):
    """Squared error summed over each sample, then averaged over the batch."""
    n_samples = y_true.shape[0]
    per_element = t.mse_loss_base(pred, y_true)
    # Flatten everything except the batch dimension and add it up per sample.
    per_sample = per_element.view(n_samples, -1).sum(axis=1)
    return per_sample.mean()

## KL Divergence

In [None]:
def kl_div(z_mean, log_var):
    """KL term ``-0.5 * sum(1 + log_var - mean^2 - exp(log_var))``.

    The sum runs over dimension 1 (the latent dimensions); the result is
    the mean of those per-sample sums over the batch.
    """
    per_dim = 1 + log_var - z_mean**2 - torch.exp(log_var)
    per_sample = -0.5 * per_dim.sum(axis=1)
    return per_sample.mean()

## VAE Loss

In [None]:
def vae_loss(pred, y_true, kl_factor, z_mean, log_var):
    """Return ``(total, reconstruction, kl)`` for one batch.

    ``total`` is the reconstruction loss plus ``kl_factor`` times the KL term.
    """
    reconstruction = mse_loss(pred, y_true)
    divergence = kl_div(z_mean, log_var)
    total = reconstruction + kl_factor * divergence
    return total, reconstruction, divergence

# Analyze Images Before Training

### Inputs

In [None]:
t.i = 1  # start index into the sample batch
# NOTE(review): the sample was read with count=10, so a slice starting at
# index 1 presumably shows 9 images — confirm this is intended.
plot_images_tensor(t.sample_images[t.i:t.i+10])

What will the output, that is, the decoded images, look like now, before we have done any training of the network? Let's pass images through the network, that is, execute the sequence:

    Input Image -> Encoder -> (0.1, -2.3) -> Decoder -> Output Image
    
Let's run the end-to-end model, both encoder and decoder:

In [None]:
# The model returns (decoded, z_mean, log_var); keep only the decoded images.
t.sample_output = t.model(t.sample_images[t.i : t.i+10])[0]

What is the output?

In [None]:
# Shape of the decoded batch.
t.sample_output.shape

Display output images

In [None]:
# Decoded images produced by the model for the sample slice above.
plot_images_tensor(t.sample_output)

# Training Loop

We need to modify our train loop to read images from the data loader and not from memory. Images will no longer fit into memory (at least on Kaggle).

We will do an optimization to speed up our calculation. Instead of reloading images and sending them to GPU again and again, we will pre-allocate memory on GPU, copy images once, then use them directly.

In [None]:
def calculate_validation_loss(model, valid_loader, criterion, device, kl_factor):
    """Average the loss terms over every image produced by ``valid_loader``.

    The model is switched to eval mode and gradients are disabled.
    ``criterion`` is called as
    ``criterion(output, images, kl_factor, z_mean, log_var)`` and must return
    three per-batch means ``(total, reconstruction, kl)``.

    Returns:
        Tuple ``(total, reconstruction, kl)`` of per-image means.
    """
    model.eval()
    loss_sum = 0
    reconstruction_sum = 0
    divergence_sum = 0
    seen = 0

    with torch.no_grad():
        for batch, _ in valid_loader:
            batch = batch.to(device)
            n_images = batch.shape[0]  # the last batch can be incomplete
            output, z_mean, log_var = model(batch)
            batch_loss, batch_r_loss, batch_kl_loss = criterion(
                output, batch, kl_factor, z_mean, log_var)
            del batch  # to help free memory on GPU

            # The criterion returns batch means; weighting them by the batch
            # size recovers the sums needed for a correct global mean.
            loss_sum += batch_loss * n_images
            reconstruction_sum += batch_r_loss * n_images
            divergence_sum += batch_kl_loss * n_images
            seen += n_images

    return loss_sum / seen, reconstruction_sum / seen, divergence_sum / seen

In [None]:
def image_count(image_folder):
    """Return the number of directory entries in ``image_folder``."""
    return len(os.listdir(image_folder))

# Number of entries found in the image folder.
t.image_count = image_count(t.image_folder)
t.image_count

In [None]:
import time
from datetime import timedelta

def train(model, n_epochs, train_loader, opt, report_each_nth_batch, start_lr,
         valid_loader):
    """Train ``model`` until its total epoch counter reaches ``n_epochs``.

    Every ``report_each_nth_batch`` batches the validation loss is computed
    and a CSV-style progress row is printed. After each epoch the learning
    rate scheduler is stepped with the last batch loss of that epoch and the
    whole model is saved to ``model.model_file_name``.

    Args:
        model: Model to train; must provide the bookkeeping methods used here
            (``start_training``, ``end_training``, ``add_epochs``,
            ``training_timedelta``) and ``model_file_name``.
        n_epochs: Target value for ``model._total_epochs``.
        train_loader: Loader yielding ``(image_batch, labels)`` pairs.
        opt: Optimizer over the model parameters.
        report_each_nth_batch: Number of batches between progress rows.
        start_lr: Initial learning rate. Kept for backward compatibility;
            the LR column reports the optimizer's current learning rate.
        valid_loader: Loader used for the validation loss.

    Returns:
        Dict with the per-batch training losses recorded during the run.

    Raises:
        ValueError: If ``train_loader`` yields no batches.
    """
    print("Running on: ", t.device)
    print("Model name:", model.model_file_name)
    # Report on the model that is actually being trained, not a global one.
    print("Model parameters:", sum(p.numel() for p in model.parameters()))
    print()
    print(f"Batch size: {train_loader.batch_size}, Epochs: {n_epochs}")
    print("Epoch, Batch num, Train loss, Valid Loss, MSE loss, KL loss, KL Factor, Total epochs, Train time, Total train time, LR")

    log_dict = {
        'batch_train_loss': [],
        'batch_mse_loss': [],
        'batch_kl_div': [],
        'valid_loss': []
    }

    start_time = time.time()
    batch_num = 0
    kl_factor = 1
    epoch = 0
    train_loss = -1

    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        opt,
        mode='min',
        factor=0.1,
        patience=3,
        threshold=1,
        threshold_mode='abs')

    def report():
        """Run validation and print one row matching the header columns."""
        total, valid_r_loss, valid_kl = calculate_validation_loss(
            model, valid_loader, vae_loss, t.device, kl_factor)
        # Flush the running timer so the printed total training time is current.
        model.end_training()
        # The scheduler may have lowered the rate, so read it from the optimizer.
        current_lr = opt.param_groups[0]['lr']
        print(f"{epoch}, {batch_num}, {train_loss:.2f}, {total.item():.2f}, {valid_r_loss.item():.2f},",
              f"{valid_kl.item():.2f}, {kl_factor}, {model._total_epochs},",
              f"{timedelta(seconds=time.time() - start_time)},",
              f"{model.training_timedelta()}, {current_lr}")

    while model._total_epochs < n_epochs:
        epoch += 1
        # A previous validation or train() call may have left the model in
        # eval mode; make sure every epoch starts in training mode.
        model.train()
        loss = None
        # Start of epoch training
        for image_batch, _labels_smile in train_loader:
            model.start_training()

            image_batch = image_batch.to(t.device, non_blocking=True)

            # Train model
            output, z_mean, log_var = model(image_batch)
            loss, r_loss, kl_loss = vae_loss(output, image_batch, kl_factor, z_mean, log_var)

            log_dict['batch_train_loss'].append(loss.item())
            log_dict['batch_mse_loss'].append(r_loss.item())
            log_dict['batch_kl_div'].append(kl_loss.item())

            model.zero_grad()
            loss.backward()
            opt.step()

            batch_num += 1

            if batch_num % report_each_nth_batch == 0:
                train_loss = loss.item()
                report()  # switches the model to eval mode and pauses the timer
                model.train()
                model.start_training()
        # End of epoch training
        if loss is None:
            raise ValueError("train_loader yielded no batches")
        lr_scheduler.step(loss.item())
        model.add_epochs(1)
        # Save the model that was trained (the `model` argument).
        torch.save(model, model.model_file_name)
    model.end_training()
    model.eval()
    # Final validation row; it has the same columns as the periodic rows.
    report()
    return log_dict

# Run Training

In [None]:
%%time

t.opt = torch.optim.Adam(t.model.parameters(), lr=1e-4)

log_dict = train(
    model = t.model, 
    n_epochs = 20, 
    train_loader  = t.train_loader, 
    opt = t.opt, 
    report_each_nth_batch = 2000,
    start_lr = 1e-4,
    valid_loader=t.valid_loader
)

# Load Model

In [None]:
# Load the saved model with the most recent timestamp. The file names embed
# "%Y-%m-%d_%H-%M", so the lexicographically largest name is the newest one.
t.model_file = max((f for f in os.listdir() if f.startswith("faces_vae_")), default=None)
if t.model_file is None:
    raise FileNotFoundError("No saved 'faces_vae_*' model found in the current directory")
# NOTE(review): torch.load unpickles a complete model object, so only load
# files you trust. Newer PyTorch releases may require weights_only=False for
# this kind of checkpoint — confirm against the installed version.
# map_location lets a model trained on the GPU be opened on a CPU-only machine.
t.model = torch.load(t.model_file, map_location=t.device)
# The model is saved in the middle of training; switch it to inference mode.
t.model.eval()

# Check the Model Output

Original images:

In [None]:
t.i = 0  # start index into the sample batch
# The slice end may exceed the number of sample images; slicing simply
# returns what is available.
plot_images_tensor(t.sample_images[t.i:t.i+12])

Reconstructed images:

In [None]:
# Element [0] of the model output is the decoded (reconstructed) batch.
plot_images_tensor(t.model(t.sample_images[t.i : t.i+12])[0])

# Embedding Space

In [None]:
# Separate namespace object for the analysis cells; it shares the trained model.
a = Object()
a.model = t.model

In [None]:
# Encode validation images until at least 1024 embeddings are collected.
with torch.no_grad():
    a.model.eval()
    # Named `remaining` so the notebook-level image_count() function is not
    # overwritten by an integer.
    remaining = 1024
    emb_batches = []
    for images, _ in t.valid_loader:
        images = images.to(t.device)
        # encode() returns (sampled latent, z_mean, log_var); keep the sample.
        emb = a.model.encode(images)[0]
        emb_batches.append(emb.detach().cpu().numpy())
        remaining -= images.shape[0]
        del images
        if remaining <= 0:
            break
# Concatenate once instead of growing the array with np.append per batch.
a.emb = np.concatenate(emb_batches, axis=0) if emb_batches else None
a.valid_emb = a.emb
torch.cuda.empty_cache()
a.valid_emb.shape

In [None]:
# Shape of a single latent dimension (column 4) across the collected embeddings.
a.valid_emb[:, 4].shape

In [None]:
# 4 x 4 grid of scatter plots: latent dimension i on the x axis against
# latent dimension 10 + j on the y axis.
fig, axes = plt.subplots(4, 4, figsize=(10, 10))
for i, axes_row in enumerate(axes):
    for j, ax in enumerate(axes_row):
        x = a.valid_emb[:, i]
        y = a.valid_emb[:, 10 + j]
        ax.set_ylim(-5, 5)
        ax.set_xlim(-5, 5)
        ax.scatter(x, y, s=1)

# Sample Images from VAE

In [None]:
# Example of standard-normal samples with shape (10, 5).
torch.randn(10, 5)

In [None]:
def sample_images_from_vae(model, emb_size, device, num_images):
    """Decode random latent vectors and show the images in one row.

    Args:
        model: Object exposing a ``decoder`` callable.
        emb_size: Length of each random latent vector.
        device: Device the latent vectors are moved to before decoding.
        num_images: Number of images to draw; nothing is plotted for
            values below 1.
    """
    if num_images <= 0:
        # plt.subplots cannot build a grid with zero columns.
        return
    with torch.no_grad():
        rand_emb = torch.randn(num_images, emb_size).to(device)
        sampled_images = model.decoder(rand_emb)
        # squeeze=False keeps the axes a 2-D array, so num_images == 1 works
        # (a single subplot would otherwise come back as a bare Axes).
        fig, axes = plt.subplots(nrows=1, ncols=num_images, figsize=(10, 3),
                                 sharey=True, squeeze=False)
        for ax, im in zip(axes[0], sampled_images):
            im_np = im.detach().cpu().numpy()
            # Move the first axis last, as done for every image plotted here.
            im_np = np.transpose(im_np, (1, 2, 0))
            ax.imshow(im_np)

In [None]:
# Draw six rows of ten sampled images each (latent size 200).
for row in range(6):
    sample_images_from_vae(t.model, 200, t.device, 10)
    plt.show()

# Latent Space Vector Algebra

1. Load 512 images with no smile.
   1. Get their embeddings.
   1. Calculate the average.
1. Load 512 images with smile.
   1. Get their embeddings.
   1. Calculate the average.
1. Calculate the difference mean_smile - mean_no_smile. We'll get the "add smile" vector.


#### Q. How do we add a smile to an image?

In [None]:
# Separate, shuffled loader over the training dataset, used below to
# collect smile / no-smile examples.
a.train_loader = torch.utils.data.DataLoader(
    t.train_dataset,
    batch_size=64,
    num_workers=2,
    pin_memory=True,
    shuffle=True
)

## Calculate the "Smile" Vector

In [None]:
a.image_count = 512
# The embeddings are computed in a later cell; no image-shaped placeholder
# tensors are needed for them here.
a.smile_emb = None
a.no_smile_emb = None
# Image buffers sized from a.image_count instead of a repeated literal.
a.images_no_smile = torch.zeros(a.image_count, 3, 128, 128)
a.images_smile = torch.zeros(a.image_count, 3, 128, 128)


a.count_smile = 0
a.count_no_smile = 0
a.iter = iter(a.train_loader)
a.classes = a.train_loader.dataset.smile_to_int

while a.count_smile < a.image_count or a.count_no_smile < a.image_count:
    # next() raises StopIteration if the loader is exhausted before both
    # buffers are full.
    images, labels = next(a.iter)
    for im, lbl in zip(images, labels):
        label = int(lbl)
        # Test the label of each class explicitly: once the smile buffer is
        # full, further smiling images must be skipped rather than fall
        # through into the no-smile buffer.
        if label == a.classes['smile']:
            if a.count_smile < a.image_count:
                a.images_smile[a.count_smile, :, :, :] = im
                a.count_smile += 1
        elif label == a.classes['no_smile']:
            if a.count_no_smile < a.image_count:
                a.images_no_smile[a.count_no_smile, :, :, :] = im
                a.count_no_smile += 1

In [None]:
# First ten images collected into the smile buffer.
plot_images_tensor(a.images_smile[0: 10])

In [None]:
# First ten images collected into the no-smile buffer.
plot_images_tensor(a.images_no_smile[0: 10])

In [None]:
# Encode both image sets in one pass each; element [0] of encode() is the
# sampled latent vector.
with torch.no_grad():
    a.smile_emb = a.model.encode(a.images_smile.to(t.device))[0]
    a.no_smile_emb = a.model.encode(a.images_no_smile.to(t.device))[0]
torch.cuda.empty_cache()

In [None]:
# "Add smile" vector: difference of the per-dimension mean embeddings. Both
# means are taken over axis 0 (the images); without the axis argument the
# second mean would collapse to a single scalar.
a.smile_vec = torch.mean(a.smile_emb, axis=0) - torch.mean(a.no_smile_emb, axis=0)

In [None]:
# Show the shape and the values of the smile vector.
a.smile_vec.shape, a.smile_vec

## Add a Smile

In [None]:
# Ten no-smile images (indices 14-23 of the buffer) and their sampled embeddings.
a.test_no_smile_images = a.images_no_smile[14:24]
with torch.no_grad():
    a.test_no_smile = a.model.encode(a.test_no_smile_images.to(t.device))[0]
torch.cuda.empty_cache()

In [None]:
# The original no-smile test images.
plot_images_tensor(a.test_no_smile_images)

In [None]:
# Decode the unmodified embeddings: the smile vector has not been added yet,
# so despite the attribute name these are plain reconstructions.
with torch.no_grad():
    a.test_smile_images = a.model.decoder(a.test_no_smile)
torch.cuda.empty_cache()
plot_images_tensor(a.test_smile_images)

In [None]:
# Shift each embedding along the smile vector, scaled by 1.5.
a.test_smile = a.test_no_smile  + 1.5 * a.smile_vec
a.test_smile.shape

In [None]:
# Decode the shifted embeddings and display the resulting images.
with torch.no_grad():
    a.test_smile_images = a.model.decoder(a.test_smile)
torch.cuda.empty_cache()
plot_images_tensor(a.test_smile_images)