**NOTE: This notebook is written for the Google Colab platform, which provides free hardware acceleration. However, it can also be run (possibly with minor modifications) as a standard Jupyter notebook, using a local GPU.**

In [None]:
#@title -- Installation of Packages -- { display-mode: "form" }
# Fetch the stylegan-encoder repository: later cells add it to sys.path and
# load the latent direction vectors stored inside it.
!git clone https://github.com/Puzer/stylegan-encoder.git

In [None]:
#@title -- Import of Necessary Packages -- { display-mode: "form" }
%tensorflow_version 1.x
import sys
sys.path.append('stylegan-encoder')

import os
import bz2
import dlib
import PIL.Image
import numpy as np
from keras.utils import get_file
import matplotlib.pyplot as plt
from google.colab import files

import pickle
import config
import dnnlib
import dnnlib.tflib as tflib

from tqdm.autonotebook import tqdm
import tensorflow as tf
from keras.models import Model
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.preprocessing import image
import keras.backend as K

from keras.applications.vgg16 import VGG16, preprocess_input
from scipy.optimize import fmin_l_bfgs_b
import scipy

In [None]:
#@title -- Downloading Data -- { display-mode: "form" }
!mkdir -p data
# Sample face photo used in the latent-vector recovery section.
!wget -nc -O data/starr.jpg https://www.dropbox.com/s/oyr35cz55lry5my/starr.jpg?dl=1
# Pickled pretrained StyleGAN (FFHQ, 1024x1024) model; loaded with pickle later on.
!wget -nc -O data/model.pkl https://www.dropbox.com/s/3rxfuwwcia8hxhj/karras2019stylegan-ffhq-1024x1024.pkl?dl=1

In [None]:
#@title -- Auxiliary Functions -- { display-mode: "form" }
# Location of the bz2-compressed dlib 68-point face landmark model.
# NOTE(review): presumably a mirror of the dlib.net file kept below - confirm.
LANDMARKS_MODEL_URL = "https://www.dropbox.com/s/ptx0wgfsnraq4xi/shape_predictor_68_face_landmarks.dat.bz2?dl=1"
# "http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2"
# File name under which the download is cached (still compressed).
LANDMARKS_FILENAME = "shape_predictor_68_face_landmarks.dat.bz2"

def unpack_bz2(src_path):
    """Decompress a bz2 archive into a file next to it.

    :param src_path: path to the compressed file; its last four characters
        (expected to be the ``.bz2`` suffix) are dropped to form the output path
    :return: path of the decompressed file
    """
    dst_path = src_path[:-4]
    # The context manager closes the archive handle deterministically
    # instead of leaving it to the garbage collector.
    with bz2.BZ2File(src_path) as archive:
        data = archive.read()
    with open(dst_path, 'wb') as fp:
        fp.write(data)
    return dst_path
  
def image_align(img, face_landmarks, output_size=1024, transform_size=4096, enable_padding=True):
    """Crop, rotate and rescale a face photo around its landmarks.

    Align function from FFHQ dataset pre-processing step:
    https://github.com/NVlabs/ffhq-dataset/blob/master/download_ffhq.py

    :param img: PIL image containing the face
    :param face_landmarks: sequence of 68 ``(x, y)`` landmark points, laid out
        as in the slices at the top of the function body
    :param output_size: side length of the returned square image (applied when
        it is smaller than ``transform_size``)
    :param transform_size: side length of the intermediate image produced by
        the quad transform
    :param enable_padding: if True, reflect-pad the image (with blurred, faded
        borders) when the crop rectangle reaches beyond the image
    :return: aligned PIL image
    """
    # `import scipy` alone does not guarantee that the `ndimage` submodule is
    # loaded, so import it explicitly where it is used.
    import scipy.ndimage

    lm = np.array(face_landmarks)
    lm_chin          = lm[0  : 17]  # left-right
    lm_eyebrow_left  = lm[17 : 22]  # left-right
    lm_eyebrow_right = lm[22 : 27]  # left-right
    lm_nose          = lm[27 : 31]  # top-down
    lm_nostrils      = lm[31 : 36]  # top-down
    lm_eye_left      = lm[36 : 42]  # left-clockwise
    lm_eye_right     = lm[42 : 48]  # left-clockwise
    lm_mouth_outer   = lm[48 : 60]  # left-clockwise
    lm_mouth_inner   = lm[60 : 68]  # left-clockwise

    # Calculate auxiliary vectors.
    eye_left     = np.mean(lm_eye_left, axis=0)
    eye_right    = np.mean(lm_eye_right, axis=0)
    eye_avg      = (eye_left + eye_right) * 0.5
    eye_to_eye   = eye_right - eye_left
    mouth_left   = lm_mouth_outer[0]
    mouth_right  = lm_mouth_outer[6]
    mouth_avg    = (mouth_left + mouth_right) * 0.5
    eye_to_mouth = mouth_avg - eye_avg

    # Choose oriented crop rectangle.
    x = eye_to_eye - np.flipud(eye_to_mouth) * [-1, 1]
    x /= np.hypot(*x)
    x *= max(np.hypot(*eye_to_eye) * 2.0, np.hypot(*eye_to_mouth) * 1.8)
    y = np.flipud(x) * [-1, 1]
    c = eye_avg + eye_to_mouth * 0.1
    quad = np.stack([c - x - y, c - x + y, c + x + y, c + x - y])
    qsize = np.hypot(*x) * 2

    # Shrink.
    shrink = int(np.floor(qsize / output_size * 0.5))
    if shrink > 1:
        rsize = (int(np.rint(float(img.size[0]) / shrink)), int(np.rint(float(img.size[1]) / shrink)))
        # LANCZOS is the same filter as the ANTIALIAS alias, which newer Pillow releases removed.
        dst_img = img.resize(rsize, PIL.Image.LANCZOS)
        quad /= shrink
        qsize /= shrink
    else:
        dst_img = img.copy()

    # Crop.
    border = max(int(np.rint(qsize * 0.1)), 3)
    crop = (int(np.floor(min(quad[:,0]))), int(np.floor(min(quad[:,1]))),
            int(np.ceil(max(quad[:,0]))), int(np.ceil(max(quad[:,1]))))
    crop = (max(crop[0] - border, 0),
            max(crop[1] - border, 0),
            min(crop[2] + border, dst_img.size[0]),
            min(crop[3] + border, dst_img.size[1]))
    if crop[2] - crop[0] < dst_img.size[0] or crop[3] - crop[1] < dst_img.size[1]:
        dst_img = dst_img.crop(crop)
        quad -= crop[0:2]

    # Pad.
    pad = (int(np.floor(min(quad[:,0]))), int(np.floor(min(quad[:,1]))), int(np.ceil(max(quad[:,0]))), int(np.ceil(max(quad[:,1]))))
    pad = (max(-pad[0] + border, 0), max(-pad[1] + border, 0), max(pad[2] - dst_img.size[0] + border, 0), max(pad[3] - dst_img.size[1] + border, 0))
    if enable_padding and max(pad) > border - 4:
        pad = np.maximum(pad, int(np.rint(qsize * 0.3)))
        dst_img = np.pad(np.float32(dst_img), ((pad[1], pad[3]), (pad[0], pad[2]), (0, 0)), 'reflect')
        h, w, _ = dst_img.shape
        # From here on `x` / `y` are pixel coordinate grids, not the crop axes above.
        y, x, _ = np.ogrid[:h, :w, :1]
        mask = np.maximum(1.0 - np.minimum(np.float32(x) / pad[0], np.float32(w-1-x) / pad[2]), 1.0 - np.minimum(np.float32(y) / pad[1], np.float32(h-1-y) / pad[3]))
        blur = qsize * 0.02
        # Blend the padded border towards a blurred copy, then towards the median colour.
        dst_img += (scipy.ndimage.gaussian_filter(dst_img, [blur, blur, 0]) - dst_img) * np.clip(mask * 3.0 + 1.0, 0.0, 1.0)
        dst_img += (np.median(dst_img, axis=(0,1)) - dst_img) * np.clip(mask, 0.0, 1.0)
        dst_img = PIL.Image.fromarray(np.uint8(np.clip(np.rint(dst_img), 0, 255)), 'RGB')
        quad += pad[:2]

    # Transform.
    dst_img = dst_img.transform((transform_size, transform_size), PIL.Image.QUAD, (quad + 0.5).flatten(), PIL.Image.BILINEAR)
    if output_size < transform_size:
        dst_img = dst_img.resize((output_size, output_size), PIL.Image.LANCZOS)

    return dst_img
  
class LandmarksDetector:
    """Detects faces with dlib and extracts the landmark points of each one."""

    def __init__(self, predictor_model_path):
        """
        :param predictor_model_path: path to shape_predictor_68_face_landmarks.dat file
        """
        # cnn_face_detection_model_v1 can be used here as well
        self.detector = dlib.get_frontal_face_detector()
        self.shape_predictor = dlib.shape_predictor(predictor_model_path)

    def get_landmarks(self, img):
        """Lazily yield one list of ``(x, y)`` landmark tuples per detected face.

        :param img: image handed unchanged to the detector and the shape predictor
        """
        for face_box in self.detector(img, 1):
            shape = self.shape_predictor(img, face_box)
            yield [(point.x, point.y) for point in shape.parts()]
            
def convert_images_loss(images, drange=(-1, 1)):
    """Rescale generator output to floating-point pixel values.

    The tensor is cast to float32, axis 1 is moved to the last position, and
    values are mapped linearly so that ``drange`` covers 0..255, plus an offset
    of 0.5.

    :param images: 4-D image tensor
        (NOTE(review): presumably NCHW generator output - confirm)
    :param drange: ``(low, high)`` value range of the input; the default
        reproduces the previously hard-coded ``[-1, 1]``
    :return: float32 tensor with the axes reordered and the values rescaled
    """
    images = tf.cast(images, tf.float32)
    images = tf.transpose(images, [0, 2, 3, 1])
    low, high = drange
    scale = 255 / (high - low)
    # Same operation order as before, so the default range gives identical values.
    images = images * scale + (-low * scale) + 0.5
    return images
  
def convert_images_gen(images):
    """Return ``images`` cast to ``uint8`` using TensorFlow's saturating cast."""
    return tf.saturate_cast(images, tf.uint8)
  
class Evaluator:
    """Exposes a combined loss/gradient function as two separate callables.

    ``loss`` runs ``loss_grad_func`` once and caches the (clipped) gradient;
    the next ``grads`` call hands the cached gradient out and clears the cache.
    The two methods must therefore be called in strict alternation
    (``loss``, ``grads``, ``loss``, ...), which the assertions enforce.
    """

    def __init__(self, aligned_img, loss_grad_func, latent_shape):
        """
        :param aligned_img: reference image, 3-D or 4-D array-like; a 3-D image
            gets a leading axis of size 1
        :param loss_grad_func: callable taking ``[image_batch, latent]`` and
            returning a sequence whose first item is the loss and whose
            remaining items are gradients
        :param latent_shape: shape the flat optimiser vector is reshaped to
        :raises RuntimeError: if the image is neither 3-D nor 4-D
        """
        self.loss_value = None
        # Fixed attribute name: this used to initialise a misspelt
        # `grads_values`, leaving `grad_values` undefined until the first loss().
        self.grad_values = None
        self.latent_shape = latent_shape

        aligned_img = np.asarray(aligned_img)
        if len(aligned_img.shape) == 4:
            self.aligned_img = aligned_img
        elif len(aligned_img.shape) == 3:
            self.aligned_img = np.expand_dims(aligned_img, 0)
        else:
            raise RuntimeError("Unsupported image shape '{}'.".format(aligned_img.shape))

        self.loss_grad_func = loss_grad_func

        # Number of loss evaluations so far (used for progress output only).
        self.eval_iter = 0

    def loss(self, latent):
        """Evaluate the loss at ``latent`` and cache the matching gradient.

        :param latent: flat latent vector; reshaped to ``latent_shape``
        :return: loss value reported by ``loss_grad_func``
        """
        assert self.loss_value is None
        latent = latent.reshape(self.latent_shape)
        outs = self.loss_grad_func([self.aligned_img, latent])
        self.loss_value = outs[0]
        self.grad_values = np.array(outs[1:]).flatten().astype('float64')

        # clip the gradients
        self.grad_values = np.maximum(np.minimum(self.grad_values, 1.0), -1.0)

        self.eval_iter += 1
        print("eval {}, loss {}".format(self.eval_iter, self.loss_value))

        return self.loss_value

    def grads(self, x):
        """Return the gradient cached by the preceding ``loss`` call.

        :param x: ignored; present so the method fits the optimiser's
            ``fprime(x)`` signature
        :return: flat float64 gradient, clipped to [-1, 1]
        """
        assert self.loss_value is not None
        grad_values = np.copy(self.grad_values)
        self.loss_value = None
        self.grad_values = None
        return grad_values

def move_and_show(dlatent, direction, coeffs):
    """Show the face generated after shifting a latent vector along a direction.

    One image is drawn per coefficient. Only the first 8 layers of the latent
    vector are shifted; the remaining layers keep their original values.

    :param dlatent: disentangled latent vector; reshaped to the global ``dlatent_shape``
    :param direction: direction array, broadcast against the reshaped ``dlatent``
    :param coeffs: sequence of multipliers applied to ``direction``
    """
    # squeeze=False keeps a 2-D axes array, so a single coefficient works too.
    fig, ax = plt.subplots(1, len(coeffs), figsize=(12, 10), dpi=80, squeeze=False)
    ax = ax[0]
    dlatent = dlatent.reshape(dlatent_shape)

    for i, coeff in enumerate(coeffs):
        new_latent_vector = dlatent.copy()
        # `dlatent_shape` starts with a batch axis of size 1, so the layers sit
        # on axis 1. Slicing axis 0 with [:8] selected the whole array and
        # shifted every layer instead of the first 8.
        new_latent_vector[:, :8] = (dlatent + coeff*direction)[:, :8]
        ax[i].imshow(gen_func([new_latent_vector])[0][0])
        ax[i].set_title('Coeff: %0.1f' % coeff)
    for axis in ax:
        axis.axis('off')
    plt.show()
    
def blend_and_show(dlatent1, dlatent2, coeffs):
    """Show faces generated from linear blends of two latent vectors.

    For every coefficient ``c`` the image of ``c * dlatent1 + (1 - c) * dlatent2``
    is drawn.

    :param dlatent1: first disentangled latent vector (weight ``c``)
    :param dlatent2: second disentangled latent vector (weight ``1 - c``)
    :param coeffs: sequence of blend weights
    """
    # squeeze=False keeps a 2-D axes array, so a single coefficient works too.
    fig, ax = plt.subplots(1, len(coeffs), figsize=(12, 10), dpi=80, squeeze=False)
    ax = ax[0]
    # Bring both inputs to the same shape so flat vectors are accepted for either.
    dlatent1 = dlatent1.reshape(dlatent_shape)
    dlatent2 = dlatent2.reshape(dlatent_shape)

    for i, coeff in enumerate(coeffs):
        new_latent_vector = coeff * dlatent1 + (1-coeff) * dlatent2
        ax[i].imshow(gen_func([new_latent_vector])[0][0])
        ax[i].set_title('Coeff: %0.1f' % coeff)
    for axis in ax:
        axis.axis('off')
    plt.show()

# Generating Human Faces using StyleGAN

The notebook will show how the StyleGAN method from NVIDIA, introduced in the paper ["A Style-Based Generator Architecture for Generative Adversarial Networks"](https://arxiv.org/abs/1812.04948), can be used to generate images of human faces. The official implementation of the method can be found in [this GitHub repository](https://github.com/NVlabs/stylegan). However, we will also be using some latent vectors and pieces of code from [another GitHub repository](https://github.com/Puzer/stylegan-encoder.git).

## Loading the Models

As a first step we will load the pretrained GAN model.

In [None]:
tflib.init_tf()
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load model.pkl when it comes from a trusted source.
# The pickle holds three objects; only the last one (Gs) is used in this notebook.
with open("data/model.pkl", "rb") as file:
    _, _, Gs = pickle.load(file)

Models used to align faces.

In [None]:
# Fetch the compressed landmark model via Keras' get_file (stored in the
# 'temp' cache sub-directory) and decompress it next to the download.
landmarks_model_path = unpack_bz2(
    get_file(LANDMARKS_FILENAME,
    LANDMARKS_MODEL_URL, cache_subdir='temp')
)

landmarks_detector = LandmarksDetector(landmarks_model_path)

We will separate the individual parts of the model used for face generation and create functions which will allow us to apply them. The part, which maps the original latent vector to a disentangled latent vector:

In [None]:
# Input/output tensors of the mapping component, wrapped into a callable:
# map_func([latents]) returns a one-element list with the mapped vectors.
tf_map_in = Gs.components.mapping.input_templates[0]
tf_map_out = Gs.components.mapping.output_templates[0]
map_func = K.function([tf_map_in], [tf_map_out])

The part, which generates images from the disentangled latent vector:

In [None]:
tf_dlatents = Gs.components.synthesis.input_templates[0]
tf_output = Gs.components.synthesis.output_templates[0]
# Float image tensor: kept separately because the perceptual loss is built on it.
tf_loss_img_out = convert_images_loss(tf_output)
# uint8 version of the same tensor, used for display.
tf_img_out = convert_images_gen(tf_loss_img_out)
gen_func = K.function([tf_dlatents], [tf_img_out])

We will store the shapes of the original and the disentangled latent vectors.

In [None]:
# Drop the first dimension of each input tensor's shape and put 1 in its
# place, i.e. the shapes describe a single vector per call.
latent_shape = (1,) + K.int_shape(tf_map_in)[1:]
dlatent_shape = (1,) + K.int_shape(tf_dlatents)[1:]

## Random Face Generation

Next we will generate a random face. We will start by generating a latent vector. Its elements will be drawn from the normal distribution and the shape will be according to variable ``latent_shape``.

In [None]:
latents = np.random.randn(*latent_shape)

We will form the disentangled latent vector by applying function ``map_func`` (defined above) to the original one.

In [None]:
dlatents = map_func([latents])[0]

We can next use ``dlatents`` as an input to ``gen_func``, which will generate the image itself.

In [None]:
img = gen_func([dlatents])[0][0]

Afterwards all we need to do is to visualize the image, or perhaps save it into a file.

In [None]:
plt.imshow(img)
plt.axis('off')

Generating more images can be tried here – we just need to generate a new latent vector every time.

In [None]:
fig, axes = plt.subplots(3, 4, figsize=(10, 10))

# One freshly sampled latent vector per grid cell (3 x 4 = 12 faces).
# Note: this overwrites the global `latents`, `dlatents` and `img`.
for row in axes:
    for ax in row:
        latents = np.random.randn(*latent_shape)
        dlatents = map_func([latents])[0]
        img = gen_func([dlatents])[0][0]
        ax.imshow(img)
        ax.axis('off')

## Disentangled Latent Vector of an Existing Face

The GAN could be used to manipulate existing faces in interesting ways. However, we would first need to know their latent vectors. Unfortunately, StyleGAN only works one way: it generates faces out of latent vectors, but not vice versa.

Nevertheless, we can apply the same principle that we use to generate pre-images and adversarial examples. The neural net is differentiable and we can use optimization to find a latent vector whose matching face will be as similar to the target face as possible.

### Face Similarity

We will not measure face similarity in terms of pixel-wise distance, because that would not express the actual similarity well. We will instead preprocess the facial images using a neural net pretrained on ImageNet first. We will be comparing the resulting features instead of the raw pixels.

Let us therefore create all the individual tensors and load the pretrained model.

In [None]:
batch_size = 1
# Side length the images are resized to before VGG16 feature extraction.
feature_img_size = 256
# Side length of the aligned reference image fed to the loss.
aligned_img_size = 1024
# Index into vgg16.layers whose output serves as the feature representation.
vgg_layer = 9

In [None]:
# VGG16 truncated at layer index `vgg_layer` acts as the feature extractor.
vgg16 = VGG16(include_top=False, input_shape=(feature_img_size, feature_img_size, 3))
perceptual_model = Model(vgg16.input, vgg16.layers[vgg_layer].output)

# Use the configured batch size instead of a hard-coded 1.
tf_img_ref = K.placeholder((batch_size, aligned_img_size, aligned_img_size, 3))

# Named constant in place of the bare `method=1` (the same method in TF 1.x).
feature_size = (feature_img_size, feature_img_size)
resize_method = tf.image.ResizeMethod.NEAREST_NEIGHBOR

# Features of the generated image.
tf_out_resized = preprocess_input(
    tf.image.resize_images(tf_loss_img_out, feature_size, method=resize_method))
tf_out_features = perceptual_model(tf_out_resized)

# Features of the reference image, computed through the identical pipeline.
tf_ref_resized = preprocess_input(
    tf.image.resize_images(tf_img_ref, feature_size, method=resize_method))
tf_ref_features = perceptual_model(tf_ref_resized)

Next we will need to define the loss function – we can use the mean square error between the features of the generated image and the original image. We can also rescale the loss to squash the numbers into a more reasonable range:

In [None]:
# 85000 is a fixed rescaling constant that brings the loss into a smaller numeric range.
loss = tf.losses.mean_squared_error(tf_out_features, tf_ref_features) / 85000

As a further step we will define the gradient of the loss function w.r.t. the disentangled latent vector so that we can later use it to minimize the loss.

In [None]:
# tf_grads is concatenated with [loss] below, so the compiled function
# returns the loss first, followed by the gradient(s).
tf_grads = K.gradients(loss, tf_dlatents)
loss_grad_func = K.function([tf_img_ref, tf_dlatents], [loss] + tf_grads)

### Minimizing the Loss

We will now load an image and minimize the loss function so as to find its matching latent vector.

In [None]:
face_img_path = "data/starr.jpg"

You can uncomment the following cell if you would rather upload your own image.

In [None]:
# face_img_path = list(files.upload())[0]

In [None]:
# Force 3-channel RGB: the alignment step pads the pixel array with a
# three-axis spec and rebuilds it as an 'RGB' image, so grayscale or RGBA
# uploads would otherwise fail there.
face_img = PIL.Image.open(face_img_path).convert('RGB')
plt.imshow(face_img)
plt.axis('off');

It is necessary to preprocess the image a bit. We will extract the key points and align the face to match the data on which the GAN was trained.

In [None]:
# Take the first detected face. A default for next() avoids an opaque
# StopIteration when the detector finds no face at all.
face_landmarks = next(landmarks_detector.get_landmarks(np.asarray(face_img)), None)
if face_landmarks is None:
    raise ValueError("No face was detected in '{}'; try a different image.".format(face_img_path))
aligned_img = image_align(face_img, face_landmarks, output_size=aligned_img_size)
plt.imshow(aligned_img)
plt.axis('off');

We will start optimizing from an all-zeros disentangled latent vector using L-BFGS.

In [None]:
# The evaluator lets the optimiser query loss and gradient through separate callbacks.
evaluator = Evaluator(aligned_img, loss_grad_func, dlatent_shape)
# Starting point of the optimisation.
dlatent = np.zeros(dlatent_shape)

In [None]:
# The optimiser works on a flat vector, so `dlatent` is flattened on the way in;
# later cells reshape the result back to dlatent_shape. maxfun caps the number
# of loss evaluations.
dlatent, min_val, info = fmin_l_bfgs_b(evaluator.loss, dlatent.flatten(),
     fprime=evaluator.grads, maxfun=400, disp=1)

### Generating the Image

Having minimized the loss function, we obtain the disentangled latent vector, which approximately matches the original image. When we use it to generate a new face, it should be similar to the original face.

In [None]:
img = gen_func([dlatent.reshape((dlatent_shape))])[0][0]
plt.imshow(img)
plt.axis('off')

## Latent Vector Manipulation

We can now further modify the latent vector of the image. In a way similar to various other types of GANs and embeddings, some arithmetic operations with the vectors make sense semantically. We can, for instance, identify a vector that approximately corresponds to a smile, to age, to gender, etc. Let us now load several such vectors:

In [None]:
# Precomputed direction vectors shipped with the cloned stylegan-encoder repository.
smile_direction = np.load('stylegan-encoder/ffhq_dataset/latent_directions/smile.npy')
gender_direction = np.load('stylegan-encoder/ffhq_dataset/latent_directions/gender.npy')
age_direction = np.load('stylegan-encoder/ffhq_dataset/latent_directions/age.npy')

Let us apply the smile vector:

In [None]:
move_and_show(dlatent.reshape((dlatent_shape)), smile_direction, [-1, 0, 1])

The gender vector:

In [None]:
move_and_show(dlatent.reshape((dlatent_shape)), gender_direction, [-1.5, 0, 2])

The age vector:

In [None]:
move_and_show(dlatent.reshape((dlatent_shape)), age_direction, [-2, 0, 1.5])

## Style Mixing

Alternatively, we can mix styles from multiple images.

In [None]:
# A fixed seed makes the second face reproducible across runs.
latent2 = np.random.RandomState(1855).randn(*latent_shape)
dlatent2 = map_func([latent2])[0]
img2 = gen_func([dlatent2])[0][0]
plt.imshow(img2)
plt.axis('off')

In [None]:
blend_and_show(dlatent.reshape(dlatent_shape), dlatent2, [0, 0.25, 0.5, 0.75, 1])

## Arithmetics

The following cells show how to find more semantic vectors: we generate more photos and keep track of the random seeds corresponding to photos which do or do not contain the target property – e.g. photos with long and short hair. We then compute the difference between the mean latent vectors of both groups.

In [None]:
def find_mean_dlatent(seeds):
    """Average the disentangled latent vectors generated from the given seeds.

    Each seed initialises its own ``RandomState``, a latent vector of shape
    ``latent_shape`` is drawn from it and mapped with ``map_func``; the mapped
    vectors are averaged.

    :param seeds: sized collection of integer random seeds
    :return: array of shape ``dlatent_shape`` holding the mean vector
        (all zeros when ``seeds`` is empty)
    """
    accumulated = np.zeros(dlatent_shape)

    for seed in seeds:
        rng = np.random.RandomState(seed)
        sampled_latent = rng.randn(*latent_shape)
        mapped = map_func([sampled_latent])[0]
        # Each term is divided before summing, so the total is the mean.
        accumulated += mapped / len(seeds)

    return accumulated

In [None]:
female_long_hair = [517, 519, 521, 523, 525, 528, 529, 538, 539, 540, 618, 642, 655]
female_short_hair = [537, 546, 561, 597, 599, 602, 610, 616, 627, 637, 652]

In [None]:
long_hair_dlatent = find_mean_dlatent(female_long_hair)
short_hair_dlatent = find_mean_dlatent(female_short_hair)

In [None]:
# Shift the recovered face half-way along the long-hair -> short-hair direction.
# Dedicated names are used so the `dlatent2` / `img2` of the Style Mixing
# section are not overwritten (re-running the blend cell stays reproducible).
hair_direction = short_hair_dlatent - long_hair_dlatent
dlatent_short_hair = dlatent.reshape(dlatent_shape) + 0.5 * hair_direction
img_short_hair = gen_func([dlatent_short_hair])[0][0]
plt.imshow(img_short_hair)
plt.axis('off');