# Converting FFHQ images to latent space Z

From:

- https://github.com/CompVis/latent-diffusion#pretrained-autoencoding-models


In [2]:
import os
import glob
import wget
import zipfile

import numpy as np

from PIL import Image

import torchvision.transforms as transforms
import torchvision.datasets as dset
import torch

from ldm.models.autoencoder import AutoencoderKL

from tqdm import tqdm

In [3]:
IMAGE_DIR = "/data/imagefolder/ffhq"
OUTPUT_DIR = "/data/imagefolder/latent/klf4"
IMAGE_SIZE = 256
WORKERS = 2
BATCH_SIZE = 128

In [3]:
dataset = dset.ImageFolder(root=IMAGE_DIR,
                           transform=transforms.Compose([
                                transforms.ToTensor(),
                                transforms.Resize(IMAGE_SIZE),
                                transforms.CenterCrop(IMAGE_SIZE),
                                transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
                           ]))

dataloader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=WORKERS)

In [4]:
def download_pre_trained_ae(url, output_dir):
    if os.path.exists(output_dir):
        print("Used cache")
    else:
        filename = wget.download(url)

        with zipfile.ZipFile(filename, 'r') as zip_ref:
            zip_ref.extractall(output_dir)

        if os.path.exists(filename):
            os.remove(filename)

In [5]:
download_pre_trained_ae("https://ommer-lab.com/files/latent-diffusion/kl-f4.zip", './klf4')

Used cache


In [6]:
ddconfig = {
    "double_z": True,
    "z_channels": 3,
    "resolution": 256,
    "in_channels": 3,
    "out_ch": 3,
    "ch": 128,
    "ch_mult": [1,2,4],
    "num_res_blocks": 2,
    "attn_resolutions": [],
    "dropout": 0.0
}
lossconfig = {
      "target": "ldm.modules.losses.LPIPSWithDiscriminator",
      "params": {
        "disc_start": 50001,
        "kl_weight": 1.0e-06,
        "disc_weight": 0.5
      }
}
embed_dim = 3

pl_sd = torch.load("klf4/model.ckpt", map_location="cpu")

model = AutoencoderKL(ddconfig, lossconfig, embed_dim)

model.load_state_dict(pl_sd["state_dict"] ,strict=False)
# model.cuda()
# model.to('cuda:0')
# model.eval()


making attention of type 'vanilla' with 512 in_channels
Working with z of shape (1, 3, 64, 64) = 12288 dimensions.
making attention of type 'vanilla' with 512 in_channels
loaded pretrained LPIPS loss from taming/modules/autoencoder/lpips/vgg.pth


<All keys matched successfully>

In [7]:
# for f in glob.glob(f"{OUTPUT_DIR}/*"):
#     os.remove(f)

In [8]:
for i, data in tqdm(enumerate(dataloader, 0), total=len(dataloader)):
    latent = model.encoder(data[0].to(model.device))
    np.save(f'{OUTPUT_DIR}/latent_klf4_{i}.npy', latent.cpu().detach().numpy())

100%|████████████████████████████████████████████████████████████████████████████| 547/547 [8:20:25<00:00, 54.89s/it]


In [4]:
max = 0
min = 0

for f in glob.glob(f"{OUTPUT_DIR}/*"):
    latent_klf4 = np.load(f)
    max_i = latent_klf4.max()
    min_i = latent_klf4.min()

    if max_i > max:
        max = max_i
    if min_i < min:
        min = min_i

In [5]:
print(max, min)

74.63506 -66.52526
