In [1]:
#celeba dataset
img_dir = "/kaggle/input/celeba-dataset/img_align_celeba/img_align_celeba"
attributes_file = "/kaggle/input/celeba-dataset/list_attr_celeba.csv"

In [2]:
!rm -r /kaggle/working/output
!rm -r /kaggle/working/checkpoint
!rm -r /kaggle/working/logs
!rm /kaggle/working/state.db

rm: cannot remove '/kaggle/working/output': No such file or directory
rm: cannot remove '/kaggle/working/checkpoint': No such file or directory
rm: cannot remove '/kaggle/working/logs': No such file or directory
rm: cannot remove '/kaggle/working/state.db': No such file or directory


In [3]:
#necessary libraries
import os
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from torchvision.io import read_image
from torchvision import transforms

In [4]:
import matplotlib.pyplot as plt
import numpy as np

image = read_image('/kaggle/input/celeba-dataset/img_align_celeba/img_align_celeba/000001.jpg')
transform = transforms.Resize((224, 224)),

transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToPILImage(),
    transforms.ToTensor()
])
image = transform(image)
# print(image.size())

# image = np.transpose(image, (1, 2, 0))
# plt.imshow(image)  # Assuming the image is grayscale
# plt.axis('off')  # Turn off axis
# plt.show()
# print(image)



In [5]:
# data = pd.read_csv(attributes_file)
# data.dtypes
# data.iloc[0].tolist()[1:]detach

In [6]:
IMAGE_SIZE = 32
CHANNELS = 3
BATCH_SIZE = 256
NUM_FEATURES = 128
Z_DIM = 200
LEARNING_RATE = 0.0005
EPOCHS = 10
BETA = 2000

In [7]:
from torch.utils.data import DataLoader
from torch.utils.data import Subset

class CELEBADataset(Dataset):
    def __init__(self, img_dir, attr_file=None, transform=None, target_transform=None):
        self.img_dir = img_dir
        self.attr = pd.read_csv(attr_file)
        self.transform = transform
        self.target_transform = target_transform
    
    def __len__(self):
        return len(self.attr)
    
    def __getitem__(self, idx):
        img_path = os.path.join(self.img_dir, "{:06d}.jpg".format(idx+1))
        image = read_image(img_path)
#         label = self.attr.iloc[idx]
        if self.transform:
            image = self.transform(image)
        return image

transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToPILImage(),
    transforms.ToTensor()
])
dataset = CELEBADataset(img_dir,attributes_file,transform=transform)
# subset_dataset = Subset(dataset, range(256))
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

In [8]:
#define the sampling layer
class Sampling(nn.Module):
    def forward(self, z_mean, z_log_var):
        epsilon = torch.randn_like(z_log_var)
        return z_mean + torch.exp(0.5 * z_log_var) * epsilon

In [9]:
#encoder
class Encoder(nn.Module):
    def __init__(self):
        super().__init__()
        
        #batch normalization layer
        self.bn = nn.BatchNorm2d(NUM_FEATURES)
        
        #convolutional layers
        self.conv1 = nn.Conv2d(CHANNELS, NUM_FEATURES, kernel_size=3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(NUM_FEATURES, NUM_FEATURES, kernel_size=3, stride=2, padding=1)
        self.conv3 = nn.Conv2d(NUM_FEATURES, NUM_FEATURES, kernel_size=3, stride=2, padding=1)
        self.conv4 = nn.Conv2d(NUM_FEATURES, NUM_FEATURES, kernel_size=3, stride=2, padding=1)
        
        #flatten the last layer's output
        self.flatten = nn.Flatten()
        
        self.fc_mean = nn.Linear(NUM_FEATURES * 8 * 8 , Z_DIM)
        self.fc_log_var = nn.Linear(NUM_FEATURES * 8 * 8, Z_DIM)
        self.sampling = Sampling()
        
    def forward(self, x):
        x = F.leaky_relu(self.bn(self.conv1(x)))
        x = F.leaky_relu(self.bn(self.conv2(x)))
        x = F.leaky_relu(self.bn(self.conv3(x)))
        x = F.leaky_relu(self.bn(self.conv4(x)))
        
        x = self.flatten(x) ##128x8192
        
        z_mean = self.fc_mean(x)
        z_log_var = self.fc_log_var(x)
        z = self.sampling(z_mean, z_log_var)
        return z_mean, z_log_var, z
        
        
        

In [10]:
# enc = Encoder()
# y = enc(b)
# print(y[2].size())

In [11]:
#decoder

class Decoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(Z_DIM, NUM_FEATURES * 8 * 8)
        self.bn1d = nn.BatchNorm1d(NUM_FEATURES * 8 * 8)
        self.bn2d = nn.BatchNorm2d(NUM_FEATURES)
        
        self.conv1 = nn.ConvTranspose2d(NUM_FEATURES, NUM_FEATURES, kernel_size=3, stride=2, padding=1, output_padding=1)
        self.conv2 = nn.ConvTranspose2d(NUM_FEATURES, NUM_FEATURES, kernel_size=3, stride=2, padding=1, output_padding=1)
        self.conv3 = nn.ConvTranspose2d(NUM_FEATURES, NUM_FEATURES, kernel_size=3, stride=2, padding=1, output_padding=1)
        self.conv4 = nn.ConvTranspose2d(NUM_FEATURES, NUM_FEATURES, kernel_size=3, stride=2, padding=1, output_padding=1)
        self.conv5 = nn.ConvTranspose2d(NUM_FEATURES, CHANNELS, kernel_size=3, stride=1, padding=1)
        
    def forward(self, x):
        x = self.fc(x)
        x = F.leaky_relu(self.bn1d(x))

        x = x.view(-1, NUM_FEATURES, 8, 8)
        x = F.leaky_relu(self.bn2d(self.conv1(x)))
        x = F.leaky_relu(self.bn2d(self.conv2(x)))
        x = F.leaky_relu(self.bn2d(self.conv3(x)))
        x = F.leaky_relu(self.bn2d(self.conv4(x)))
        x = torch.sigmoid(self.conv5(x))
        
        return x
        

In [12]:
# model = Decoder()
# x = model(y[2])
# print(x.size())
# print(x)

In [13]:
class VAE(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        
        self.total_loss_tracker = []
        self.reconstruction_loss_tracker = []
        self.kl_loss_tracker = []
        
    def forward(self, inputs):
        z_mean, z_log_var, z = self.encoder(inputs)
        reconstruction = self.decoder(z)
        return z_mean, z_log_var, reconstruction

    def train_step(self, data, optimizer, beta):
        optimizer.zero_grad()
        
        z_mean, z_log_var, reconstruction = self(data)
        reconstruction_loss = F.mse_loss(data, reconstruction)
        kl_loss = -0.5 * torch.sum(1 + z_log_var - z_mean.pow(2) - z_log_var.exp(), dim=1).mean()
        total_loss = reconstruction_loss + beta * kl_loss
        
        total_loss.backward()
        optimizer.step()
        
        self.total_loss_tracker.append(total_loss.item())
        self.reconstruction_loss_tracker.append(reconstruction_loss.item())
        self.kl_loss_tracker.append(kl_loss.item())
        
        return {
            "loss": total_loss.item(),
            "reconstruction_loss": reconstruction_loss.item(),
            "kl_loss": kl_loss.item(),
        }
    
    def test_step(self, data):
        with torch.no_grad():
            z_mean, z_log_var, reconstruction = self(data)
            reconstruction_loss = F.mse_loss(data, reconstruction)
            kl_loss = -0.5 * torch.sum(1 + z_log_var - z_mean.pow(2) - z_log_var.exp(), dim=1).mean()
            total_loss = reconstruction_loss + kl_loss
            
        
        return {
            "loss": total_loss.item(),
            "reconstruction_loss": reconstruction_loss.item(),
            "kl_loss": kl_loss.item(),
        }



In [14]:
encoder = Encoder()
decoder = Decoder()
vae = VAE(encoder, decoder)
optimizer = optim.Adam(vae.parameters(), lr=LEARNING_RATE)
beta = BETA

In [15]:
from torch.utils.tensorboard import SummaryWriter
#define the optimizer
optimizer = optim.Adam(vae.parameters(), lr=LEARNING_RATE)

#model checkpoint
checkpoint_dir = "./checkpoint"
os.makedirs(checkpoint_dir, exist_ok=True)

#save checkpoint
def save_checkpoint(epoch, model, optimizer):
    checkpoint_path = os.path.join(checkpoint_dir, f"checkpoint_epoch_{epoch}.pt")
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, checkpoint_path)

#tensorboard callback
log_dir = "./logs"
os.makedirs(log_dir, exist_ok=True)
writer = SummaryWriter(log_dir=log_dir)


#image generation callback
class ImageGenerator:
    def __init__(self, num_img, latent_dim, decoder, device):
        self.num_img = num_img
        self.latent_dim = latent_dim
        self.decoder = decoder
        self.device = device
        self.output_dir = "./output"
    def __call__(self, epoch):
        with torch.no_grad():
            random_latent_vectors = torch.randn((self.num_img, self.latent_dim)).to(self.device)
            generated_images = decoder(random_latent_vectors).cpu()*255
            
            #save images to local
            for i in range(self.num_img):
                os.makedirs(self.output_dir, exist_ok=True)
                img = transforms.ToPILImage()(generated_images[i])
                img.save(os.path.join(self.output_dir, f"generated_img_{epoch:3d}_{i}.png"))

2024-03-08 15:54:44.830949: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-08 15:54:44.831043: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-08 15:54:44.959312: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [16]:
#training loop
def train_vae(model, train_loader, optimizer, epochs, beta, device, callbacks=[]):
#     if torch.cuda.device_count() > 1:
#         print("Let's use", torch.cuda.device_count(), "GPUs!")
#         model = nn.DataParallel(model)    
    model.to(device)
    model.train()
    for epoch in range(1, epochs + 1):
        total_loss = 0.0
        recon_loss = 0.0
        kl_loss = 0.0
        for batch_idx, data in enumerate(train_loader):
            if batch_idx%100 == 0:
                print(f"Training batch: {batch_idx}")
            data = data.to(device)
            optimizer.zero_grad()
            results = model.train_step(data, optimizer, beta)
            total_loss += results["loss"]
            recon_loss += results["reconstruction_loss"]
            kl_loss += results["kl_loss"]
        total_loss /= len(train_loader.dataset)
        recon_loss /= len(train_loader.dataset)
        kl_loss /= len(train_loader.dataset)
        print(f"Epoch {epoch}: Total Loss: {total_loss:.4f}, Recon Loss: {recon_loss:.4f}, KL Loss: {kl_loss:.4f}")
        writer.add_scalar("Loss/Total", total_loss, epoch)
        writer.add_scalar("Loss/Reconstruction", recon_loss, epoch)
        writer.add_scalar("Loss/KL", kl_loss, epoch)
        for callback in callbacks:
            if isinstance(callback, ImageGenerator):
                callback(epoch)
        save_checkpoint(epoch, model, optimizer)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
image_generator_callback = ImageGenerator(num_img=10, latent_dim=Z_DIM, decoder=decoder, device=device)
train_vae(vae, dataloader, optimizer, EPOCHS, BETA, device, callbacks=[image_generator_callback])



Training batch: 0
Training batch: 100
Training batch: 200
Training batch: 300
Training batch: 400
Training batch: 500
Training batch: 600
Training batch: 700
Epoch 1: Total Loss: 8.7103, Recon Loss: 0.0003, KL Loss: 0.0044
Training batch: 0
Training batch: 100
Training batch: 200
Training batch: 300
Training batch: 400
Training batch: 500
Training batch: 600
Training batch: 700
Epoch 2: Total Loss: 0.0565, Recon Loss: 0.0003, KL Loss: 0.0000
Training batch: 0
Training batch: 100
Training batch: 200
Training batch: 300
Training batch: 400
Training batch: 500
Training batch: 600
Training batch: 700
Epoch 3: Total Loss: 0.0447, Recon Loss: 0.0003, KL Loss: 0.0000
Training batch: 0
Training batch: 100
Training batch: 200
Training batch: 300
Training batch: 400
Training batch: 500
Training batch: 600
Training batch: 700
Epoch 4: Total Loss: 0.0835, Recon Loss: 0.0003, KL Loss: 0.0000
Training batch: 0
Training batch: 100
Training batch: 200
Training batch: 300
Training batch: 400
Training b

In [17]:
torch.cuda.empty_cache()

In [18]:
!nvidia-smi

Fri Mar  8 18:55:07 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off | 00000000:00:04.0 Off |                    0 |
| N/A   59C    P0              42W / 250W |    520MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                         

In [19]:
!kill 

kill: usage: kill [-s sigspec | -n signum | -sigspec] pid | jobspec ... or kill -l [sigspec]
