In [2]:
## Implementation of Generative Adversarial Nets, by Ian Goodfellow et al. (NIPS 2014)
## Exact implementation on MNIST

In [1]:
## Import packages and modules
import numpy as np
import math
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.transforms as transforms
from torchvision.utils import save_image
from torch.utils.data import DataLoader
from torchvision import datasets
import matplotlib.pyplot as plt
# NOTE(review): hard-coded, machine-specific working directory. Relative paths
# used later in the notebook (e.g. the './dataset' root in mnist_data) resolve
# under it, so this cell fails on any machine without this directory.
os.chdir("/home/agastya/Downloads")
%matplotlib inline

In [None]:
## 1.0 Data Preparation and Preprocessing
# Use transforms.Resize(img_size) to resize the image into (batch_size, 1, img_size, img_size)
def mnist_data():
    """Return the MNIST training split with pixel values rescaled to [-1, 1].

    The dataset is stored under ``./dataset`` (relative to the current working
    directory) and is downloaded on first use.

    Returns:
        A ``torchvision.datasets.MNIST`` instance whose samples are
        ``(image_tensor, label)`` pairs.
    """
    compose = transforms.Compose([
        # ToTensor() yields values in [0, 1].
        transforms.ToTensor(),
        # (x - 0.5) / 0.5 maps [0, 1] onto [-1, 1], matching the range of the
        # generator's tanh output. MNIST images have a single channel, so
        # exactly one mean and one std are supplied; three-element tuples do
        # not match a 1-channel tensor.
        transforms.Normalize((0.5,), (0.5,)),
    ])
    out_dir = './dataset'
    return datasets.MNIST(root=out_dir, train=True, transform=compose, download=True)
# Build the training set once and wrap it in a shuffling mini-batch loader.
# NOTE(review): `batch_size` must already exist when this cell runs; in this
# notebook it is assigned in the hyper-parameter cell, so run that cell first.
data = mnist_data()
data_loader = DataLoader(dataset=data, batch_size=batch_size, shuffle=True)

In [77]:
# --- Training schedule -------------------------------------------------------
# Number of training epochs (int).
n_epochs = 200
# Mini-batch size (int).
batch_size = 64
# Interval between image samples (int).
sample_interval = 400
# Number of CPU threads to use during batch generation (int).
n_cpu = 8

# --- Adam optimizer ----------------------------------------------------------
# Learning rate (float).
lr = 2e-4
# Decay rate of the first-order momentum estimate (float).
b1 = 0.5
# Decay rate of the second-order momentum estimate (float).
b2 = 0.999

# --- Model / data geometry ---------------------------------------------------
# Dimensionality of the latent space (int).
latent_dim = 100
# Size of each image dimension (int).
img_size = 28
# Number of image channels (int).
channels = 1

In [7]:
# Shape of one image, built from the channel count and the (square) image size.
img_shape = (channels, img_size, img_size)
# True when a CUDA device is available; used to decide whether to move the
# models and the loss onto the GPU.
cuda = torch.cuda.is_available()

In [3]:
# Generator NN
# Batch Norm with a momentum of 0.8
# Use Leaky RELU ALWAYS
# Tanh in last layer to mimic distribution of original data
# Progressively increasing mapping
class Generator(nn.Module):
    """Fully connected generator mapping latent vectors to images in [-1, 1].

    Architecture: latent -> 128 -> 256 -> 512 -> 1024 -> ``output_dims``, with
    LeakyReLU after every hidden layer, batch normalisation on the 256/512/1024
    layers, and a final tanh.

    Args:
        latent_dims: Size of each latent input vector.
        output_dims: Number of values produced per sample; must equal the
            product of ``img_shape`` so the output can be reshaped.
        img_shape: Shape of a single output image, e.g. ``(1, 28, 28)``.
    """

    def __init__(self, latent_dims, output_dims, img_shape):
        super(Generator, self).__init__()

        self.negative_slope = 0.2
        self.latent_dims = latent_dims
        self.img_shape = img_shape

        # The second positional argument of BatchNorm1d is `eps`, not
        # `momentum`, so the momentum is passed by keyword. In PyTorch the
        # update is running = (1 - momentum) * running + momentum * batch_stat.
        self.layer1 = nn.Linear(self.latent_dims, 128)
        self.layer2 = nn.Linear(128, 256)
        self.batchnorm2 = nn.BatchNorm1d(256, momentum=0.8)
        self.layer3 = nn.Linear(256, 512)
        self.batchnorm3 = nn.BatchNorm1d(512, momentum=0.8)
        self.layer4 = nn.Linear(512, 1024)
        self.batchnorm4 = nn.BatchNorm1d(1024, momentum=0.8)
        self.layer5 = nn.Linear(1024, output_dims)

    def forward(self, x):
        """Generate a batch of images.

        Args:
            x: Latent tensor; it is flattened to ``(-1, latent_dims)``.

        Returns:
            Tensor of shape ``(batch, *img_shape)`` with values in [-1, 1].
        """
        x = x.view(-1, self.latent_dims)
        x = F.leaky_relu_(self.layer1(x), self.negative_slope)
        x = F.leaky_relu_(self.batchnorm2(self.layer2(x)), self.negative_slope)
        x = F.leaky_relu_(self.batchnorm3(self.layer3(x)), self.negative_slope)
        x = F.leaky_relu_(self.batchnorm4(self.layer4(x)), self.negative_slope)
        # tanh keeps the output in the same [-1, 1] range as the normalised data.
        x = torch.tanh(self.layer5(x))
        return x.view(x.size(0), *self.img_shape)

In [4]:
# Discriminator NN
# Progressively decreasing mapping
# Sigmoid in last layer for probability whether real or fake
class Discriminator(nn.Module):
    """Fully connected discriminator: ``input_dims`` -> 512 -> 256 -> 1.

    Hidden layers use LeakyReLU (slope 0.2); the single output passes through
    a sigmoid, so it lies in [0, 1] and can be fed to a binary cross-entropy
    loss.

    Args:
        input_dims: Number of values per flattened input sample.
    """

    def __init__(self, input_dims):
        super().__init__()

        self.input_dims = input_dims
        self.negative_slope = 0.2

        self.layer1 = nn.Linear(input_dims, 512)
        self.layer2 = nn.Linear(512, 256)
        self.layer3 = nn.Linear(256, 1)

    def forward(self, x):
        """Score a batch of samples.

        Args:
            x: Input tensor; everything after the batch dimension is flattened.

        Returns:
            Tensor of shape ``(batch, 1)`` holding sigmoid outputs.
        """
        flat = x.view(x.size(0), -1)
        hidden = F.leaky_relu_(self.layer1(flat), self.negative_slope)
        hidden = F.leaky_relu_(self.layer2(hidden), self.negative_slope)
        score = self.layer3(hidden)
        return torch.sigmoid(score)

In [5]:
# Weights initialization
# Gaussian Distribution works the best
def init_weights(m):
    """Initialise a module's weights in place; intended for ``module.apply``.

    * Modules whose class name contains ``Linear``: weight ~ N(0, 0.02).
    * Modules whose class name contains ``BatchNorm``: scale ~ N(1, 0.02) and
      bias set to 0. The scale is centred on 1 rather than 0 so the layer does
      not start out by squashing its normalised activations to almost nothing.

    Modules without a ``weight`` tensor (activations, containers, batch norm
    built with ``affine=False``) are left untouched.

    Args:
        m: The module to initialise.
    """
    classname = m.__class__.__name__
    weight = getattr(m, 'weight', None)
    if weight is None:
        return
    if classname.find('BatchNorm') != -1:
        nn.init.normal_(weight, 1.0, 0.02)
        bias = getattr(m, 'bias', None)
        if bias is not None:
            nn.init.constant_(bias, 0.0)
    elif classname.find('Linear') != -1:
        nn.init.normal_(weight, 0.0, 0.02)

In [68]:
# Adversarial objective: binary cross-entropy on the discriminator's output.
#   Discriminator: maximize log(D(x)) + log(1 - D(G(z)))
#   Generator:     maximize log(D(G(z)))
adversarial_loss = nn.BCELoss()

# Build both networks (generator first) and apply the custom weight
# initialisation; Module.apply returns the module itself, so it can be chained.
generator = Generator(latent_dim, img_size**2, img_shape).apply(init_weights)
discriminator = Discriminator(img_size**2).apply(init_weights)

# Move the networks and the loss to the GPU when one is available.
if cuda:
    generator.cuda()
    discriminator.cuda()
    adversarial_loss.cuda()

# One Adam optimizer per network, so each update touches only its own parameters.
gen_optim = optim.Adam(generator.parameters(), lr=lr, betas=(b1, b2))
disc_optim = optim.Adam(discriminator.parameters(), lr=lr, betas=(b1, b2))

In [6]:
# Sample random noise from a standard Gaussian distribution
def noise(size, latent_dims=100):
    """Draw a batch of latent vectors from N(0, 1).

    The tensor is created directly on the GPU when CUDA is available (otherwise
    on the CPU), with no intermediate copy.

    Args:
        size: Number of latent vectors (batch size).
        latent_dims: Length of each latent vector. Defaults to 100.

    Returns:
        Float tensor of shape ``(size, latent_dims)`` with
        ``requires_grad=True``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    return torch.randn(size, latent_dims, device=device, requires_grad=True)

# Returns a tensor of 1's: the target label for real data
def real_data_targets(size):
    """Build the "real" labels for a batch.

    The tensor is allocated directly on the GPU when CUDA is available
    (otherwise on the CPU).

    Args:
        size: Number of samples in the batch.

    Returns:
        Float tensor of ones with shape ``(size, 1)`` that does not require
        gradients.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    return torch.ones(size, 1, device=device)

# Returns a tensor of 0's: the target label for fake (generated) data
def fake_data_targets(size):
    """Build the "fake" labels for a batch.

    The tensor is allocated directly on the GPU when CUDA is available
    (otherwise on the CPU).

    Args:
        size: Number of samples in the batch.

    Returns:
        Float tensor of zeros with shape ``(size, 1)`` that does not require
        gradients.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    return torch.zeros(size, 1, device=device)

In [71]:
def train_disc(optimizer, real_data, fake_data):
    """Run one discriminator update on a real batch and a generated batch.

    Uses the module-level ``discriminator`` and ``adversarial_loss``.

    Args:
        optimizer: Optimizer that is zeroed and stepped here.
        real_data: Batch of real samples.
        fake_data: Batch of generated samples.

    Returns:
        Tuple ``(real_loss + fake_loss, real_prediction, fake_prediction)``.
        The first element is the un-halved sum of the two loss terms.
    """
    optimizer.zero_grad()

    # Real batch: real_loss = -log(D(x))
    real_prediction = discriminator(real_data)
    real_loss = adversarial_loss(real_prediction, real_data_targets(real_data.size(0)))

    # Generated batch: fake_loss = -log(1 - D(G(z)))
    fake_prediction = discriminator(fake_data)
    fake_loss = adversarial_loss(fake_prediction, fake_data_targets(fake_data.size(0)))

    # Average the two terms with a true division applied to their sum.
    # Floor-dividing only the fake term (`real_loss + fake_loss // 2`) rounds it
    # down to 0 whenever it is below 2, so the discriminator would never learn
    # from generated samples.
    total_loss = (real_loss + fake_loss) / 2
    total_loss.backward()
    optimizer.step()

    return real_loss + fake_loss, real_prediction, fake_prediction

In [72]:
def train_gen(optimizer, fake_data):
    """Run one generator update.

    The generated batch is scored by the module-level ``discriminator`` and
    compared against "real" labels, i.e. the loss is -log(D(G(z))). Only the
    optimizer passed in is stepped.

    Args:
        optimizer: Optimizer that is zeroed and stepped here.
        fake_data: Batch of generated samples.

    Returns:
        The loss tensor for this step.
    """
    optimizer.zero_grad()

    scores = discriminator(fake_data)
    wanted = real_data_targets(fake_data.size(0))
    gen_loss = adversarial_loss(scores, wanted)

    gen_loss.backward()
    optimizer.step()
    return gen_loss

In [73]:
def train_GAN(num_epochs):
    """Alternate discriminator and generator updates over ``data_loader``.

    Uses the module-level ``generator``, ``data_loader``, ``disc_optim`` and
    ``gen_optim``.

    Args:
        num_epochs: Number of full passes over ``data_loader``.

    Returns:
        Tuple ``(generator_error, discriminator_error)``: two lists holding one
        float per epoch, the mean per-batch loss of that epoch. An epoch with
        no batches contributes nothing to either list.
    """
    generator_error = []
    discriminator_error = []
    on_gpu = torch.cuda.is_available()

    for _ in range(num_epochs):
        gen_total = 0.0
        disc_total = 0.0
        n_batches = 0

        for real_data, _ in data_loader:
            # Flatten each image. Real samples are plain inputs, so no copy and
            # no gradient tracking is needed for them.
            real_data = real_data.view(real_data.size(0), -1)
            if on_gpu:
                real_data = real_data.cuda()
            batch_len = real_data.size(0)

            # First train the discriminator without training the generator:
            # detach() stops gradients from flowing back into the generator.
            fake_data = generator(noise(batch_len)).detach()
            disc_error, _, _ = train_disc(disc_optim, real_data, fake_data)

            # Then train the generator through the discriminator. Only the
            # generator's optimizer is stepped, so the discriminator's weights
            # stay fixed during this step.
            fake_data = generator(noise(batch_len))
            gen_error = train_gen(gen_optim, fake_data)

            disc_total += disc_error.item()
            gen_total += gen_error.item()
            n_batches += 1

        if n_batches:
            discriminator_error.append(disc_total / n_batches)
            generator_error.append(gen_total / n_batches)

    return generator_error, discriminator_error

In [59]:
# Run a short 10-epoch training session; the trailing ';' keeps the cell output empty.
train_GAN(num_epochs=10);

KeyboardInterrupt: 

In [78]:
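# For testing, use a batch size > 1 (or switch the generator to eval mode first):
# in training mode batch norm computes its statistics from the current batch,
# which it cannot do from a single sample.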