# VAE

## code reference

The theoretical derivation and basic structure of generating grayscale pictures for VAE are provided in the link. The basic structure of VAE generation model is built according to this tutorial, and a lot of code is modified on this infrastructure to achieve VAE image generation and color picture generation.

https://blog.csdn.net/weixin_43845922/article/details/129325896?ops_request_misc=%257B%2522request%255Fid%2522%253A%2522171033905316800185899475%2522%252C%2522scm%2522%253A%252220140713.130102334..%2522%257D&request_id=171033905316800185899475&biz_id=0&utm_medium=distribute.pc_search_result.none-task-blog-2~all~baidu_landing_v2~default-4-129325896-null-null.142^v99^control&utm_term=VAE生成图片&spm=1018.2226.3001.4187


## Import some libraries

In [1]:
import os 
import torch 
import torch.nn as nn 
import torch.nn.functional as F 
import torchvision 
from torchvision import transforms 
from torchvision.utils import save_image

## Parameter configuration

Define parameters and data sets and create sample folders.

In [2]:
image_size = 784 # 28*28 
h_dim = 400 
z_dim = 20 
num_epochs = 300 
batch_size = 128 
learning_rate = 0.001 

# Create a folder
sample_dir = 'samples'
if not os.path.exists(sample_dir):
    os.makedirs(sample_dir)
                                     
dataset = torchvision.datasets.ImageFolder(root='painting/', 
                                            transform=transforms.Compose([
                                                transforms.Resize((28, 28)),
                                                transforms.ToTensor(),
                                            ]))

## Defining the VAE model

A variational autoencoder (VAE) neural network structure is defined, including an encoder and a decoder.

The encoder converts the input image into the mean and variance of the latent space, and the decoder reconstructs the latent vector into the original image.

In [3]:
class VAE(nn.Module):
    # The structure of neural network is defined
    def __init__(self, image_size=784, h_dim=400, z_dim=20):
        super(VAE, self).__init__()
        self.fc1 = nn.Linear(image_size, h_dim)
        self.fc2 = nn.Linear(h_dim, z_dim)
        self.fc3 = nn.Linear(h_dim, z_dim)
        
        self.fc4 = nn.Linear(z_dim, h_dim)
        self.fc5 = nn.Linear(h_dim, image_size)

    def encoder(self, x):
        h = F.relu(self.fc1(x))
        return self.fc2(h), self.fc3(h)
    
    def reparameterize(self, mu, log_var):
        std = torch.exp(log_var/2)
        eps = torch.randn_like(std)
        return mu + eps * std 

    def decoder(self, z):
        h = F.relu(self.fc4(z))
        return F.sigmoid(self.fc5(h))
    
    def forward(self, x):
        mu, log_var = self.encoder(x)
        z = self.reparameterize(mu, log_var)
        x_reconst = self.decoder(z)
        return x_reconst, mu, log_var 
    

# Select the GPU and optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = VAE().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

## Training model

A double loop is used, with an outer loop traversing the number of training rounds and an inner loop traversing the data set.

For each data batch, the data is first passed into the model for forward propagation, and then the reconstruction loss and KL divergence are calculated. Then backpropagation and optimization are carried out to update the model parameters.

For every 100 batches trained, print out the current round, the steps, and the value of the reconstruction loss and KL divergence.

In [4]:
for epoch in range(num_epochs):
    for i, (x, _) in enumerate(dataset):
        # Samples are obtained and propagated forward
        x = x.to(device).view(-1, image_size)
        x_reconst, mu, log_var = model(x)

        # Calculate reconstruction loss and KL divergence
        # KL divergence calculation reference：https://shenxiaohai.me/2018/10/20/pytorch-tutorial-advanced-02/
        reconst_loss = F.binary_cross_entropy(x_reconst, x, size_average=False)
        kl_div = -0.5 * torch.sum(1 + log_var - mu.pow(2) - log_var.exp())

        # Backpropagation and optimization
        loss = reconst_loss + kl_div 
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1)%100 == 0:
            print('Epoch [{}/{}], Step [{}/{}], Reconst Loss: {:.4f}, KL Div:{:.4f}'.
                  format(epoch+1, num_epochs, i+1, len(dataset), reconst_loss.item(), kl_div.item()))

#The images generated by each training step are retained

# with torch.no_grad():
        
#         # Save the sampled image, that is, the new image generated by the latent vector z through the decoder
#         # Randomly generated images
#         z = torch.randn(batch_size, z_dim).to(device)
#         out = model.decoder(z).view(-1, 1, 28, 28)
#         save_image(out, os.path.join(sample_dir, 'sampled-{}.png'.format(epoch+1)))

#         # Save the reconstructed image
#         out, _, _ = model(x)
#         x_concat = torch.cat([x.view(-1, 1, 28, 28), out.view(-1, 1, 28, 28)], dim=3)
#         save_image(x_concat, os.path.join(sample_dir, 'reconst-{}.png'.format(epoch+1)))



Epoch [1/300], Step [100/2042], Reconst Loss: 1250.6622, KL Div:2.9523
Epoch [1/300], Step [200/2042], Reconst Loss: 1099.3325, KL Div:2.8520
Epoch [1/300], Step [300/2042], Reconst Loss: 1413.2888, KL Div:16.4427
Epoch [1/300], Step [400/2042], Reconst Loss: 1186.1074, KL Div:3.6154
Epoch [1/300], Step [500/2042], Reconst Loss: 1683.8351, KL Div:63.3989
Epoch [1/300], Step [600/2042], Reconst Loss: 1356.7648, KL Div:8.9825
Epoch [1/300], Step [700/2042], Reconst Loss: 1324.6372, KL Div:22.0458
Epoch [1/300], Step [800/2042], Reconst Loss: 1163.2579, KL Div:11.8785
Epoch [1/300], Step [900/2042], Reconst Loss: 1195.2729, KL Div:15.4491
Epoch [1/300], Step [1000/2042], Reconst Loss: 1321.5194, KL Div:11.5000
Epoch [1/300], Step [1100/2042], Reconst Loss: 763.2000, KL Div:0.9630
Epoch [1/300], Step [1200/2042], Reconst Loss: 1299.4104, KL Div:18.8847
Epoch [1/300], Step [1300/2042], Reconst Loss: 1588.4285, KL Div:59.3868
Epoch [1/300], Step [1400/2042], Reconst Loss: 751.7977, KL Div:2.

## Generate image

Test with trained models and generated images: 

1. sampled images (a new image generated from the underlying vector z by the decoder)

2. reconstruct images (an image reconstructed from the original image by the encoder and decoder).

In [5]:
# Test with the trained model
with torch.no_grad():
        
        # Save the sampled image, that is, the new image generated by the latent vector z through the decoder
        # Randomly generated images
        z = torch.randn(batch_size, z_dim).to(device)
        out = model.decoder(z).view(-1, 1, 28, 28)
        save_image(out, os.path.join(sample_dir, 'sampled-{}.png'.format(epoch+1)))

        # Save the reconstructed image
        out, _, _ = model(x)
        x_concat = torch.cat([x.view(-1, 1, 28, 28), out.view(-1, 1, 28, 28)], dim=3)
        save_image(x_concat, os.path.join(sample_dir, 'reconst-{}.png'.format(epoch+1)))