# Variational Autoencoders (VAE) from Scratch

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/adiel2012/deep-learning-abc/blob/main/vae.ipynb)

VAEs learn a latent variable model by maximizing the Evidence Lower Bound (ELBO).

Key Concepts:
1. **Encoder:** Maps input $x$ to latent distribution parameters $\mu, \sigma$ (approximate posterior $q_\phi(z|x)$).
2. **Reparameterization Trick:** Sample $z = \mu + \sigma \cdot \epsilon$ where $\epsilon \sim \mathcal{N}(0, 1)$. This allows backpropagation through stochastic sampling.
3. **Decoder:** Maps $z$ back to $x$ (likelihood $p_\theta(x|z)$).

Loss = Reconstruction Loss + KL Divergence (i.e. the negative ELBO, which we minimize)

In [None]:
!pip install torch torchvision matplotlib

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt

# Run on a CUDA GPU when PyTorch reports one as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

## 1. VAE Model

In [None]:
class VAE(nn.Module):
    """Fully connected variational autoencoder with a diagonal-Gaussian latent.

    The encoder maps a flattened input to the mean and log-variance of the
    approximate posterior q(z|x). The decoder maps a latent vector back to
    ``img_dim`` values squashed into [0, 1] by a sigmoid.

    Args:
        img_dim: Number of features in one flattened input (default 28*28).
        h_dim: Width of the single hidden layer used by encoder and decoder.
        z_dim: Dimensionality of the latent space.
    """

    def __init__(self, img_dim=28*28, h_dim=400, z_dim=20):
        super().__init__()
        # Kept so forward() flattens to the configured input size instead of
        # assuming 28x28 images.
        self.img_dim = img_dim

        # Encoder: shared hidden layer, then separate heads for mu and log-variance
        self.fc1 = nn.Linear(img_dim, h_dim)
        self.fc2_mu = nn.Linear(h_dim, z_dim)
        self.fc2_logvar = nn.Linear(h_dim, z_dim)

        # Decoder
        self.fc3 = nn.Linear(z_dim, h_dim)
        self.fc4 = nn.Linear(h_dim, img_dim)

    def encode(self, x):
        """Return ``(mu, logvar)`` of q(z|x) for a flattened batch ``x``."""
        h = F.relu(self.fc1(x))
        return self.fc2_mu(h), self.fc2_logvar(h)

    def reparameterize(self, mu, logvar):
        """Draw ``z = mu + eps * std`` in training mode; return ``mu`` in eval mode.

        Sampling through ``eps`` keeps the path from ``mu``/``logvar`` to ``z``
        differentiable. Outside training the mean is returned, so the output
        is deterministic.
        """
        if self.training:
            std = torch.exp(0.5 * logvar)  # logvar = log(sigma^2)  =>  sigma = exp(logvar / 2)
            eps = torch.randn_like(std)
            return mu + eps * std
        else:
            return mu

    def decode(self, z):
        """Map latent vectors ``z`` to reconstructions with values in [0, 1]."""
        h = F.relu(self.fc3(z))
        return torch.sigmoid(self.fc4(h))  # Output probabilities [0, 1]

    def forward(self, x):
        """Encode ``x``, sample a latent, and decode it.

        ``x`` may have any shape whose trailing dimensions multiply to
        ``img_dim``; it is flattened to ``(-1, img_dim)`` first.

        Returns:
            Tuple ``(reconstruction, mu, logvar)``.
        """
        mu, logvar = self.encode(x.reshape(-1, self.img_dim))
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar

## 2. Loss Function (ELBO)

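$$ \mathcal{L} = \underbrace{\mathbb{E}_{q}[\log p(x|z)]}_{\text{Reconstruction}} - \underbrace{D_{KL}(q(z|x) || p(z))}_{\text{Regularization}} $$

The ELBO above is maximized. The code below minimizes its negative: binary cross-entropy (the negative log-likelihood term) plus the KL divergence.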

In [None]:
def loss_function(recon_x, x, mu, logvar):
    """Return the negative ELBO summed over every element of the batch.

    Args:
        recon_x: Decoder output (probabilities), e.g. shape ``(batch, features)``.
        x: Target batch with the same number of elements as ``recon_x``;
            it is reshaped to ``recon_x``'s shape before comparison.
        mu: Mean of the approximate posterior q(z|x).
        logvar: Log-variance of the approximate posterior q(z|x).

    Returns:
        Scalar tensor ``BCE + KLD`` (sum reduction, not averaged).
    """
    # 1. Reconstruction Loss (Binary Cross Entropy)
    # Reshape the target to match the reconstruction instead of assuming
    # 784 features, so the loss works for any input size.
    target = x.reshape(recon_x.shape)
    BCE = F.binary_cross_entropy(recon_x, target, reduction='sum')

    # 2. KL Divergence: KL(N(mu, sigma^2) || N(0, 1))
    # = -0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2), with logvar = log(sigma^2)
    KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())

    return BCE + KLD

# Data & Training
# MNIST is fetched into "dataset/" (downloaded when missing) and served in
# shuffled mini-batches of 128 images.
dataset = datasets.MNIST(
    root="dataset/",
    transform=transforms.ToTensor(),
    download=True,
)
loader = DataLoader(dataset, batch_size=128, shuffle=True)

model = VAE().to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

print("Training VAE...")
model.train()  # training mode: reparameterize() samples z instead of returning mu
for epoch in range(5):
    total_loss = 0
    # Labels are ignored: the VAE is trained to reconstruct its own input.
    for data, _ in loader:
        data = data.to(device)

        # One optimization step on the summed negative ELBO of this batch.
        optimizer.zero_grad()
        recon_batch, mu, logvar = model(data)
        loss = loss_function(recon_batch, data, mu, logvar)
        loss.backward()
        optimizer.step()

        # The loss is summed over the batch, so accumulate it and divide by
        # the dataset size below to report a per-sample average.
        total_loss += loss.item()

    print(f"Epoch {epoch}: Average Loss {total_loss / len(loader.dataset):.4f}")

## 3. Manifold Learning (Visualizing Latent Space)

Since the KL term pushes the approximate posterior $q(z|x)$ toward a standard Gaussian, we can sample $z \sim \mathcal{N}(0, I)$ directly from the prior and decode it to generate new digits.

In [None]:
with torch.no_grad():
    # Sample random points from standard normal.
    # The latent size is read from the decoder's first layer so the noise
    # always matches the model, rather than repeating the default z_dim of 20.
    z_dim = model.fc3.in_features
    z = torch.randn(16, z_dim).to(device)
    sample = model.decode(z).cpu()

    # Plot the 16 decoded samples on a 2 x 8 grid of 28x28 grayscale images
    fig, axes = plt.subplots(2, 8, figsize=(10, 3))
    for i, ax in enumerate(axes.flatten()):
        ax.imshow(sample[i].view(28, 28), cmap='gray')
        ax.axis('off')
    plt.suptitle('VAE Generated Digits (sampled from N(0,1))')
    plt.show()