# [Blog](https://docs.google.com/document/d/1K4prKKw67B_64ljlggcfVN29FOKcMnRuofFIp2XLuwI/edit?usp=sharing)

## Anime GAN

The imports here are pretty standard. They include: 
- numpy (basic array manipulation, opening npy files, etc)
- matplotlib (creating graphs)
- os (file directory)
- tensorflow (machine learning framework)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os
import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Flatten, Reshape, LeakyReLU
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam

Here is the Python script we ran locally on our laptops to prepare the downloaded anime dataset. Once we had the images, we resized them and converted them into arrays so they were properly formatted for our neural network. 

In [None]:
import os
from os import listdir
from os.path import isfile, join
from PIL import Image
import numpy as np


IMAGE_SIZE = 100
IMAGE_CHANNELS = 3
IMAGE_DIR = os.path.expanduser("~/Desktop/AnimeImages/")
NUM_IMAGES = 60000


onlyFiles = [f for f in listdir(IMAGE_DIR) if isfile(join(IMAGE_DIR, f))]

training_data = []

for file in onlyFiles:
    # Only the first NUM_IMAGES readable images are saved, so stop decoding
    # once enough have been collected instead of holding every file in memory.
    if len(training_data) >= NUM_IMAGES:
        break
    path = os.path.join(IMAGE_DIR, file)
    try:
        with Image.open(path) as raw_image:
            # Force three channels: greyscale or RGBA files would otherwise
            # produce arrays that do not fit the
            # (IMAGE_SIZE, IMAGE_SIZE, IMAGE_CHANNELS) reshape below.
            # Image.LANCZOS is the same filter as the old Image.ANTIALIAS alias.
            image = raw_image.convert("RGB").resize(
                (IMAGE_SIZE, IMAGE_SIZE), Image.LANCZOS)
    except (OSError, ValueError):
        # Unreadable or non-image files are skipped on purpose.
        continue
    training_data.append(np.asarray(image))

if not training_data:
    raise RuntimeError(f'No readable images found in {IMAGE_DIR}')

training_data = np.reshape(
    training_data, (-1, IMAGE_SIZE, IMAGE_SIZE, IMAGE_CHANNELS))
# Map pixel values from [0, 255] to [-1, 1] (the range of the generator's tanh
# output). float32 halves the memory and file size compared with float64.
training_data = training_data.astype(np.float32) / 127.5 - 1

np.save('anime_data.npy', training_data[:NUM_IMAGES])

print('done')

Back in the cloud, we manually upload our anime_data.npy file to either Google Colab or Kaggle and load it into our variable training_data. 

In [None]:
# Load the training data
# Fail fast with a clear message when the dataset has not been uploaded yet.
if not os.path.isfile('anime_data.npy'):
  raise Exception('You need to import the anime_data.npy file into the runtime.')

training_data = np.load('anime_data.npy')
print('Training data loaded successfully.')
print(f'Shape of data: {training_data.shape}')

Here we define the structure of our generator and discriminator. Both the generator and the discriminator have three dense hidden layers of 128 nodes each and use LeakyReLU activation functions. At the bottom of the cell we create the GAN by adding both the generator and the discriminator to a single model. 

In [None]:
def build_generator(img_shape, z_dim):
    """Build the fully connected generator network.

    Args:
        img_shape: Shape of the image to generate, e.g. (100, 100, 3).
        z_dim: Length of the noise vector fed to the first layer.

    Returns:
        An uncompiled Sequential model that maps a z_dim-long vector to a
        tensor of shape img_shape with values in [-1, 1] (tanh output).
    """
    model = Sequential()
    # Only the first layer needs to declare the input size (the noise vector);
    # every later layer infers its input from the layer before it.
    model.add(Dense(128, input_dim=z_dim))
    # All hidden layers use LeakyReLU. ReLU-style activations are cheap to
    # compute (as are their derivatives), but plain ReLU maps every negative
    # input to zero, which can make units irreversibly "die". LeakyReLU keeps
    # a small slope (0.01 here) for negative inputs instead.
    model.add(LeakyReLU(alpha=0.01))
    model.add(Dense(128))
    model.add(LeakyReLU(alpha=0.01))
    model.add(Dense(128))
    model.add(LeakyReLU(alpha=0.01))
    # The last dense layer emits one value per pixel channel of the image.
    # np.prod returns a NumPy integer, so convert it to a plain int for Dense.
    model.add(Dense(int(np.prod(img_shape)), activation='tanh'))
    # Finally, fold the flat output vector into the shape of an image.
    model.add(Reshape(img_shape))
    return model

def build_discriminator(img_shape):
    """Build the fully connected discriminator network.

    Args:
        img_shape: Shape of one input image, e.g. (100, 100, 3).

    Returns:
        An uncompiled Sequential model that maps an image to a single sigmoid
        output, read as the probability that the image is real.
    """
    hidden_layer_count = 3

    model = Sequential()
    # The image arrives as a nested (height, width, channels) array; Flatten
    # unrolls it into one long vector so dense layers can consume it.
    model.add(Flatten(input_shape=img_shape))
    for _ in range(hidden_layer_count):
        model.add(Dense(128))
        model.add(LeakyReLU(alpha=0.01))
    # One output neuron is enough: the only question is "real or fake?".
    model.add(Dense(1, activation='sigmoid'))
    return model
    
def build_gan(generator, discriminator):
    """Stack the generator and the discriminator into one end-to-end model.

    The combined model takes a noise vector, passes it through the generator
    and feeds the resulting image straight into the discriminator.
    """
    stacked = Sequential()
    for stage in (generator, discriminator):
        stacked.add(stage)
    return stacked

Here we build actual instances of the generator, discriminator, and GAN. The variables at the top describe how large the images will be, how many layers of an image there will be (here three for RGB) and how many dimensions our noise vector will have. 

In [None]:

# Image geometry and latent size shared by the cells that follow.
img_size = 100
img_channels = 3
img_shape = (img_size, img_size, img_channels)
z_dim = 100

# Create discriminator
discriminator = build_discriminator(img_shape)
# The loss function (used to evaluate the model's performance) is binary cross-entropy because the
# discriminator makes a binary decision -- is the image it received real or fake?
discriminator.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
# The discriminator is compiled above while still trainable and only then frozen, before it is
# placed inside the combined model below. The intent appears to be that training the combined model
# updates only the generator -- NOTE(review): confirm this ordering behaves as intended on the
# installed Keras version.
discriminator.trainable = False

# Create generator
generator = build_generator(img_shape, z_dim)

# Create GAN
gan = build_gan(generator, discriminator)
# Why binary cross-entropy is used here is less intuitive, since the generator outputs a whole
# 100x100x3 image rather than a single number. However, the generator is not judged on its image in
# a vacuum: the combined model's output is the discriminator's verdict on that image (did it fool
# the discriminator or not), which is again a binary decision.
gan.compile(loss='binary_crossentropy', optimizer=Adam())

After every x training iterations of the GAN, sample_images is called to have the generator make predictions in its current state. These pictures are then put onto a plot so we can visualize the GAN's improvement over time.

In [None]:
def sample_images(generator, image_grid_rows=4, image_grid_columns=4, seed=False):
  """Plot a grid of images produced by the generator from random noise.

  Args:
    generator: Model whose predict() maps noise vectors of length z_dim
      (module-level variable) to images with values in [-1, 1].
    image_grid_rows: Number of rows in the plotted grid.
    image_grid_columns: Number of columns in the plotted grid.
    seed: Optional integer seed. When given, the same noise vectors are drawn
      on every call so successive plots are directly comparable. False or
      None (the default) draws fresh noise from the global NumPy generator.

  The figure is left open as the current matplotlib figure so the caller can
  save or show it.
  """
  # Use a private RandomState for seeded sampling. Calling np.random.seed()
  # here would reset the *global* generator and make the caller's subsequent
  # random draws (e.g. training batches) repeat after every snapshot.
  # RandomState(seed) yields the same numbers the seeded global generator would.
  if seed is None or seed is False:
    rng = np.random
  else:
    rng = np.random.RandomState(seed)

  z = rng.normal(0, 1, (image_grid_rows * image_grid_columns, z_dim))
  gen_imgs = generator.predict(z)
  # Rescale from the generator's [-1, 1] range to [0, 1] for imshow.
  gen_imgs = 0.5 * gen_imgs + 0.5
  # squeeze=False keeps axs two-dimensional even for a 1xN or Nx1 grid.
  _, axs = plt.subplots(image_grid_rows, image_grid_columns, figsize=(4, 4),
                        sharey=True, sharex=True, squeeze=False)

  # axs.flat walks the grid row by row, matching the order of gen_imgs.
  for cnt, ax in enumerate(axs.flat):
    ax.imshow(gen_imgs[cnt])
    ax.axis('off')

This is the most complex part of the process -- actually training the GAN. The function is essentially a for-loop that runs n iterations. For each iteration, a random sample of the real images is taken, and an equal number of fake images is generated. Then the discriminator is trained on the real and the fake images separately, recording how accurately it tells them apart, and the generator is trained based on how often its fake images fool the discriminator. Both models are updated accordingly. Every sample_interval iterations, a sample plot is made with the helper function above. 

In [None]:
def train(iterations, batch_size, sample_interval):
  """Run the adversarial training loop on the module-level models.

  Reads the module-level training_data, generator, discriminator, gan and
  z_dim. Every sample_interval iterations the current losses are printed and
  a grid of generated images is saved to iteration_images/<n>.png.

  Args:
    iterations: Number of training iterations (one batch each).
    batch_size: Number of real images, and of fake images, per iteration.
    sample_interval: Log and save a sample grid every this many iterations.

  Returns:
    A tuple (storeloss, accuracies, iteration_checkpoints) of lists with one
    entry per sample point: (d_loss, g_loss) pairs, discriminator accuracy in
    percent, and the iteration numbers at which they were recorded.
  """
  storeloss = []
  accuracies = []
  iteration_checkpoints = []

  X_train = training_data

  # Target labels: 1 for real images, 0 for generated ones.
  real = np.ones((batch_size, 1))
  fake = np.zeros((batch_size, 1))

  # Make sure the output directory exists before the first savefig call.
  os.makedirs('iteration_images', exist_ok=True)

  img_num = 0

  # Training loop
  for iteration in range(iterations):
    # Draw batch_size random indices into X_train and use the images at those
    # indices as this iteration's sample of real images.
    idx = np.random.randint(0, X_train.shape[0], batch_size)
    imgs = X_train[idx]

    # Generate the same number of fake images from fresh noise vectors.
    # verbose=0 keeps predict() from printing a progress bar every iteration.
    z = np.random.normal(0, 1, (batch_size, z_dim))
    gen_imgs = generator.predict(z, verbose=0)

    # Train the discriminator on the real and the fake batch separately. Each
    # call returns [loss, accuracy]; the overall figures are the average of
    # the two.
    d_loss_acc_real = discriminator.train_on_batch(imgs, real)
    d_loss_acc_fake = discriminator.train_on_batch(gen_imgs, fake)
    d_loss, accuracy = 0.5 * np.add(d_loss_acc_real, d_loss_acc_fake)

    # Train the generator through the combined model: new noise is labelled
    # "real", so the loss is low when the discriminator is fooled. The
    # combined model generates the images itself, so no separate predict()
    # call is needed here.
    z = np.random.normal(0, 1, (batch_size, z_dim))
    g_loss = gan.train_on_batch(z, real)

    # At every sample point, record and print the stats and save a grid.
    if iteration % sample_interval == 0:

      storeloss.append((d_loss, g_loss))
      accuracies.append(100.0 * accuracy)
      iteration_checkpoints.append(iteration)

      print("%d [D loss: %f, acc.: %.2f%%] [G loss: %f]" % 
            (iteration, d_loss, 100.0 * accuracy, g_loss))

      # Visualize the generator's progress on a fixed set of noise vectors.
      # The fixed seed must not disturb training randomness, so snapshot and
      # restore the global NumPy RNG state around the call.
      rng_state = np.random.get_state()
      sample_images(generator, seed=42)
      np.random.set_state(rng_state)
      # sample_images creates its own figure, which is now the current figure
      # and therefore the one written by savefig.
      plt.axis('off')
      plt.tight_layout()
      plt.savefig(f'iteration_images/{img_num}.png')
      img_num += 1
      plt.close('all')

  return storeloss, accuracies, iteration_checkpoints

And finally, we can train our model, changing the number of iterations, batch size, and sample interval as we please. 

In [None]:
import shutil

# Start every run with an empty output directory. Unlike the shell commands
# `rm -r` / `mkdir`, these calls do not fail when the directory is missing
# (first run) or already present, and they work outside IPython as well.
shutil.rmtree('iteration_images/', ignore_errors=True)
os.makedirs('iteration_images/', exist_ok=True)

iterations = 20000
batch_size = 128
sample_interval = 200

train(iterations, batch_size, sample_interval)

## GIF Creator

After we have completed all of our training iterations (whew), we can take the grids made and create a GIF to show our progress.

In [None]:
import imageio

frame_dir = "/content/iteration_images/"

# The training loop names its snapshots 0.png, 1.png, 2.png, ... and
# os.listdir returns entries in arbitrary order, so sort numerically by the
# file stem to keep the animation chronological. Anything that is not a
# numbered PNG (e.g. checkpoint folders) is ignored.
frame_names = sorted(
    (name for name in os.listdir(frame_dir)
     if name.endswith(".png") and name[:-4].isdigit()),
    key=lambda name: int(name[:-4]))

images = [imageio.imread(frame_dir + filename) for filename in frame_names]

imageio.mimsave('anime_28.gif', images, duration=0.3)

## Noise Predictor

Finally, after we have a fully trained generator, we can create a model that does the exact opposite process. Instead of taking in a vector of noise and creating an image, this model learns to take in an image and create an appropriate vector of noise. Why would we do this? So that we can feed it an image of a real person, decompose it into noise, and then feed that noise back into the generator to get an anime version of oneself!

This network is a convolutional neural network, which means that instead of passing the whole image through the network directly, the image is first reduced with filters into a smaller grid that represents patterns in the picture. This simplified version of the picture is then passed through to train the model. A more technical explanation can be found in the inline comments below. 

In [None]:
def make_noise_predictor(img_size):
    """Build a convolutional network that maps an image to a noise vector.

    Args:
        img_size: Side length in pixels of the square, 3-channel input image.

    Returns:
        An uncompiled Sequential model: three Conv2D / LeakyReLU / MaxPool2D
        stages (32, 64 and 96 filters) followed by Flatten and three linear
        Dense layers of 100 units each.
    """
    model = tf.keras.Sequential()

    # Stage 1. Unlike the dense networks above, a convolution does not look at
    # the image pixel by pixel. It slides a set of small 5x5 filters across
    # the image; at each position the filter and the patch under it are
    # combined with a dot product, and the result is written into a "feature
    # map" -- a smaller grid whose values say how strongly each region of the
    # image matched that filter's pattern. This is cheaper than treating every
    # pixel independently and helps the model see the forest, not the trees.
    model.add(layers.Conv2D(32, (5, 5), strides=(2, 2), padding='same',
                            input_shape=[img_size, img_size, 3]))
    model.add(layers.LeakyReLU())
    # Max pooling continues the reduction: for each 2x2 window of the feature
    # map only the largest value is kept, giving an even smaller grid. People
    # do something similar -- we recognise a car by its windows, doors and
    # wheels, not by inspecting individual pixels.
    model.add(layers.MaxPool2D(pool_size=(2, 2), strides=None, padding='same'))

    # Stages 2 and 3 repeat the pattern with more filters and unit stride.
    for filter_count in (64, 96):
        model.add(layers.Conv2D(filter_count, (5, 5), strides=(1, 1), padding='same'))
        model.add(layers.LeakyReLU())
        model.add(layers.MaxPool2D(pool_size=(2, 2), strides=None, padding='same'))

    # Head: flatten the feature maps and regress the 100 noise values. The
    # activations are linear because the targets are unbounded real numbers.
    model.add(layers.Flatten())
    for _ in range(3):
        model.add(layers.Dense(100, activation='linear'))

    return model

training_size = 5000
epochs = 50

# Build a supervised dataset from the trained generator: the noise vectors are
# the labels (Y) and the images generated from them are the inputs (X).
Y_train = np.random.normal(0, 1, (training_size, z_dim))
X_train = generator.predict(Y_train)

# Size the predictor's input from the images the generator actually produces,
# so the two models cannot disagree about the image dimensions.
img_size = X_train.shape[1]

noisePredictor = make_noise_predictor(img_size)
# This is a regression onto real-valued noise, so report mean absolute error;
# classification accuracy carries no meaning for these targets.
noisePredictor.compile(loss='MSE', optimizer=Adam(), metrics=['mae'])
noisePredictor.fit(X_train, Y_train, batch_size=128, epochs=epochs)

In [None]:
# Draw one standard-normal noise vector with 100 entries (a batch of size 1).
imageNoise = np.random.normal(loc=0, scale=1, size=(1, 100))

In [None]:
# Round trip: generate an image from the noise as usual, ask the noise
# predictor which noise it thinks produced that image, then feed that
# prediction back into the generator to see how different the result is.
generatedImage = generator.predict(imageNoise)
reverseEngineeredNoise = noisePredictor.predict(generatedImage)
reconstructedImage = generator.predict(reverseEngineeredNoise)


def _load_real_face(npy_path):
    """Load one stored face picture and format it as a single-image batch.

    The array is shifted and scaled with (x - 127.5) / 127.5 -- presumably the
    files hold raw 0-255 pixel values, TODO confirm -- and reshaped to
    (1, 64, 64, 3).
    """
    pixels = np.load(npy_path)
    pixels = (pixels - 127.5) / 127.5
    return pixels.reshape(1, 64, 64, 3)


# Actual photographs of faces, formatted like the generator's images.
# NOTE(review): these are 64x64 images; confirm this matches the input size
# the noise predictor was built with.
face_1 = _load_real_face('/kaggle/input/realfacepictures/colin_anime_data_64.npy')
face_2 = _load_real_face('/kaggle/input/realfacepictures/phil_anime_data_64.npy')

For these four cells below, we are showing the images created above. If the noise predictor is good, generatedImage should look very similar to reconstructedImage. 

In [None]:
# The generator's tanh output lies in [-1, 1]; imshow expects float RGB values
# in [0, 1], so rescale before plotting (as the other plotting cells do).
plt.imshow(generatedImage[0] * .5 + .5)

In [None]:
# Rescale from the generator's [-1, 1] range to the [0, 1] range imshow expects.
plt.imshow(reconstructedImage[0] * .5 + .5)

In [None]:
# Encode the first real face as noise, then decode that noise with the
# generator and shift the result from [-1, 1] to [0, 1] for display.
face_1_noise = noisePredictor.predict(face_1)
face_1_image = generator.predict(face_1_noise) * .5 + .5

plt.imshow(face_1_image[0])


In [None]:
# Same procedure for the second real face: image -> predicted noise ->
# generated image, rescaled to [0, 1] so imshow can draw it.
face_2_noise = noisePredictor.predict(face_2)
face_2_image = generator.predict(face_2_noise) * .5 + .5

plt.imshow(face_2_image[0])

## Improved GAN model

Here is a different way of creating the GAN that uses convolutional neural networks. While in theory this should perform better, I did not tune it enough to actually produce better results. The iterations were slower and the results were worse, so I did not use it in the end. The discriminator works similarly to the noise predictor explained above. The one interesting thing here, though, is that the generator uses convolutions in the reverse direction: it takes smaller bits of information and blows them up.

In [None]:
import tensorflow as tf
import glob
import imageio
import matplotlib.pyplot as plt
import numpy as np
import os
import PIL
from tensorflow.keras import layers
import time
import math

from IPython import display

# BUFFER_SIZE = 5000
# BATCH_SIZE = 256

# Down-sample the dataset to the 28x28 resolution used by the convolutional
# models below. A plain reshape cannot do this: it only rearranges the existing
# values and would need the pixel count per image to stay the same.
# Resizing is done in chunks so the whole dataset is never pushed through
# TensorFlow at once.
resize_chunk = 1024
train_images = np.concatenate([
    tf.image.resize(training_data[start:start + resize_chunk], (28, 28)).numpy()
    for start in range(0, len(training_data), resize_chunk)
]).astype('float32')
# training_data was already scaled to [-1, 1] when anime_data.npy was written,
# so it must not be normalised a second time here.
print(train_images.shape)


def make_generator_model(img_shape, z_dim):
    """Build a convolutional generator that up-samples noise into an image.

    Args:
        img_shape: Side length in pixels of the square output image (an
            integer, despite the name). Must be a positive multiple of 4
            because the network starts at a quarter of the size and doubles
            the resolution twice.
        z_dim: Length of the input noise vector.

    Returns:
        An uncompiled Sequential model mapping a z_dim-long vector to an
        (img_shape, img_shape, 3) image with values in [-1, 1] (tanh).

    Raises:
        ValueError: If img_shape is not a positive multiple of 4.
    """
    side = int(img_shape)
    if side <= 0 or side % 4 != 0:
        raise ValueError(
            f'img_shape must be a positive multiple of 4, got {img_shape!r}')
    quarterImage = side // 4
    halfImage = side // 2

    model = tf.keras.Sequential()
    # Integer arithmetic: Dense needs an integer unit count (math.pow would
    # return a float). The input size follows z_dim instead of a fixed 100.
    model.add(tf.keras.layers.Dense(quarterImage * quarterImage * 128, use_bias=False, input_shape=(z_dim,)))
    model.add(tf.keras.layers.BatchNormalization())
    model.add(tf.keras.layers.LeakyReLU())

    # Fold the flat vector into a small 128-channel feature map.
    model.add(tf.keras.layers.Reshape((quarterImage, quarterImage, 128)))
    assert model.output_shape == (None, quarterImage, quarterImage, 128)  # Note: None is the batch size

    # Stride 1 keeps the resolution and only reduces the channel count.
    model.add(tf.keras.layers.Conv2DTranspose(64, (5, 5), strides=(1, 1), padding='same', use_bias=False))
    assert model.output_shape == (None, quarterImage, quarterImage, 64)
    model.add(tf.keras.layers.BatchNormalization())
    model.add(tf.keras.layers.LeakyReLU())

    # Each stride-2 transposed convolution doubles the height and width.
    model.add(tf.keras.layers.Conv2DTranspose(32, (5, 5), strides=(2, 2), padding='same', use_bias=False))
    assert model.output_shape == (None, halfImage, halfImage, 32)
    model.add(tf.keras.layers.BatchNormalization())
    model.add(tf.keras.layers.LeakyReLU())

    model.add(tf.keras.layers.Conv2DTranspose(3, (5, 5), strides=(2, 2), padding='same', use_bias=False, activation='tanh'))
    assert model.output_shape == (None, side, side, 3)

    return model


#64 -> 128
def make_discriminator_model(img_shape):
    """Build a convolutional discriminator for square RGB images.

    Args:
        img_shape: Side length in pixels of the square input image (an
            integer, despite the name).

    Returns:
        An uncompiled Sequential model mapping an (img_shape, img_shape, 3)
        image to a single sigmoid output, read as the probability that the
        image is real.
    """
    model = tf.keras.Sequential()
    model.add(layers.Conv2D(32, (5, 5), strides=(2, 2), padding='same',
                            input_shape=[img_shape, img_shape, 3]))
    model.add(layers.LeakyReLU())
    # Dropout randomly zeroes 30% of the activations during training.
    model.add(layers.Dropout(0.3))

    model.add(layers.Conv2D(64, (5, 5), strides=(2, 2), padding='same'))
    model.add(layers.LeakyReLU())
    model.add(layers.Dropout(0.3))

    model.add(layers.Flatten())
    # Sigmoid output: this file compiles the model with the string loss
    # 'binary_crossentropy' and an accuracy metric, both of which interpret the
    # output as a probability in [0, 1], not as an unbounded logit.
    model.add(layers.Dense(1, activation='sigmoid'))

    return model

def build_gan(generator, discriminator):
    """Return a model that feeds the generator's output into the discriminator."""
    return Sequential([generator, discriminator])

In [None]:
# Settings for the convolutional GAN. img_shape is defined for reference; the
# builders called below are given the side length img_size instead.
img_size = 28
img_channels = 3
img_shape = (img_size, img_size, img_channels)
z_dim = 100

# Create discriminator
discriminator = make_discriminator_model(img_size)
# NOTE(review): the string loss 'binary_crossentropy' and the accuracy metric
# interpret the model output as a probability -- confirm the discriminator's
# last layer produces values in [0, 1].
discriminator.compile(loss='binary_crossentropy', optimizer=Adam(1e-4), metrics=['accuracy'])
# Frozen after its own compile and before being placed inside the combined
# model; the intent appears to be that training the combined model updates only
# the generator -- TODO confirm on the installed Keras version.
discriminator.trainable = False

# Create generator
generator = make_generator_model(img_size, z_dim)
# Print the layer-by-layer structure of the generator.
generator.summary()

# Create GAN
gan = build_gan(generator, discriminator)
gan.compile(loss='binary_crossentropy', optimizer=Adam(1e-4))

In [None]:
import shutil

# Start every run with an empty output directory. Unlike the shell commands
# `rm -r` / `mkdir`, these calls do not fail when the directory is missing or
# already present, and they work outside IPython as well.
shutil.rmtree('iteration_images/', ignore_errors=True)
os.makedirs('iteration_images/', exist_ok=True)

iterations = 20000
batch_size = 128
sample_interval = 200

# train() reads the module-level training_data. The convolutional models above
# are built for 28x28 inputs, so point it at the 28x28 train_images first.
# Reload anime_data.npy before going back to the dense GAN.
training_data = train_images

train(iterations, batch_size, sample_interval)