In [None]:
!pip install git+https://github.com/openai/CLIP.git
!pip install torch torchvision

In [None]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
BATCH_SIZE = 8

In [None]:
import os
import torch
import numpy as np
import matplotlib.pyplot as plt
import scipy.io
from google.colab import drive
import zipfile
import tarfile
import pandas as pd
import seaborn as sns
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import clip
from torchvision import transforms
import random
from tqdm.notebook import tqdm
import nltk
nltk.download('punkt_tab')

In [None]:
# Mount drive and set paths
drive.mount('/content/drive')

# DataLoader

In [None]:
import scipy.io
import h5py
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader, Subset
import clip
from PIL import Image
import time
from tqdm import tqdm
import random

def load_mat_file(file_path):
  """Read a MATLAB ``.mat`` file, returning ``None`` instead of raising.

  Any exception raised by ``scipy.io.loadmat`` is reported on stdout and
  swallowed, so callers must check the result for ``None``.

  Args:
    file_path: Path of the ``.mat`` file to read.

  Returns:
    The dictionary produced by ``scipy.io.loadmat``, or ``None`` on failure.
  """
  contents = None
  try:
    contents = scipy.io.loadmat(file_path)
  except Exception as err:
    print(f"Error loading {file_path}: {err}")
  return contents

class CLIPDataset(Dataset):
  """Image/caption pairs read from a ``.mat`` annotation file and an HDF5 file.

  Captions come from the ``engJ`` entry of the annotation file and images from
  the ``ih`` dataset of the HDF5 file; item ``idx`` pairs ``engJ[idx]`` with
  ``ih[idx]``.
  """

  def __init__(self, anno_mat_path, image_h5_path, transform=None):
    """Load the captions and remember where the images live.

    Args:
      anno_mat_path: Path to the ``.mat`` file containing ``engJ``.
      image_h5_path: Path to the HDF5 file containing the ``ih`` dataset.
      transform: Callable applied to each PIL image. Defaults to the
        preprocessing returned by ``clip.load("ViT-B/32")``.

    Raises:
      ValueError: If the annotation file cannot be loaded or has no ``engJ``.
    """
    # The CLIP model is loaded even when a custom transform is given so that
    # the `model` / `preprocess` attributes keep existing for current users.
    self.model, self.preprocess = clip.load("ViT-B/32")
    self.transform = self.preprocess if transform is None else transform
    self.file_path = image_h5_path
    self.anno_mat_path = anno_mat_path
    anno_data = load_mat_file(self.anno_mat_path)
    if anno_data is None:
      # load_mat_file() reports the underlying error and returns None; fail
      # here with a clear message instead of an AttributeError on `.get`.
      raise ValueError(f"Could not load annotations from {self.anno_mat_path}")
    # The HDF5 dataset is opened lazily on first __getitem__ call.
    self.images = None
    self.texts = anno_data.get('engJ', None)
    if self.texts is None:
      raise ValueError("No 'engJ' found in .mat file.")
    self.original_texts = [text[0] for text in self.texts]

  def __len__(self):
    """Return the number of caption entries."""
    return len(self.texts)

  @staticmethod
  def _to_uint8_hwc(raw_image):
    """Convert one stored image array to a uint8 array usable by PIL.

    Arrays whose first axis has length 3 have their axes reversed
    (``(2, 1, 0)`` transpose). Floating-point arrays whose maximum is <= 1.0
    are scaled by 255 before the cast; every other non-uint8 array is cast
    directly.
    """
    raw_image = np.asarray(raw_image)
    if raw_image.shape[0] == 3:
      raw_image = np.transpose(raw_image, (2, 1, 0))
    if raw_image.dtype == np.uint8:
      return raw_image
    # Any float dtype (not only float32) in [0, 1] must be rescaled; a plain
    # cast would truncate every pixel to 0 or 1.
    if np.issubdtype(raw_image.dtype, np.floating) and raw_image.max() <= 1.0:
      raw_image = raw_image * 255
    return raw_image.astype(np.uint8)

  def __getitem__(self, idx):
    """Return ``(transformed_image, caption, idx)`` for sample ``idx``."""
    text = self.texts[idx][0][0]
    if self.images is None:
      self.images = h5py.File(self.file_path, 'r')["ih"]
    pil_image = Image.fromarray(self._to_uint8_hwc(self.images[idx]))
    transformed_image = self.transform(pil_image)
    return transformed_image, text, idx

def evaluate_clip_accuracy(model, dataset, dataloader, device="mps", batch_size=16, k_values=[1, 5, 10]):
  """Measure image-to-text retrieval accuracy of a CLIP-style model.

  Every image yielded by ``dataloader`` is ranked against every caption
  yielded by it using cosine similarity; the caption at the same position is
  treated as the ground truth.

  Args:
    model: Object exposing ``eval()``, ``encode_image`` and ``encode_text``.
    dataset: Unused; kept so existing callers keep working.
    dataloader: Iterable of ``(images, texts, indices)`` batches.
    device: Device the image batch and the text tokens are moved to.
    batch_size: Unused; kept so existing callers keep working.
    k_values: Cut-offs for the top-k / recall@k metrics (never mutated).

  Returns:
    dict with ``top{k}_accuracy`` and ``recall@{k}`` (percentages) for every
    ``k``, ``mean_rank``, ``median_rank``, and ``best_matches`` /
    ``worst_matches`` (up to five entries each, chosen among all samples).

  Raises:
    ValueError: If the dataloader yields no samples.
  """
  model.eval()
  image_chunks = []
  text_chunks = []
  all_indices = []
  all_texts = []
  print("Extracting features")
  with torch.no_grad():
    for images, texts, indices in tqdm(dataloader):
      images = images.to(device)
      image_features = model.encode_image(images)
      text_tokens = clip.tokenize(texts).to(device)
      text_features = model.encode_text(text_tokens)
      image_features = image_features / image_features.norm(dim=-1, keepdim=True)
      text_features = text_features / text_features.norm(dim=-1, keepdim=True)
      # Cast to float32 before leaving the device: the features may be half
      # precision, which is slow or unsupported for CPU matmul.
      image_chunks.append(image_features.float().cpu())
      text_chunks.append(text_features.float().cpu())
      # Accept both plain lists (custom collate) and tensors (default collate).
      all_indices.extend(indices.tolist() if hasattr(indices, "tolist") else list(indices))
      all_texts.extend(texts)

  if not all_indices:
    raise ValueError("dataloader yielded no samples; nothing to evaluate")

  all_image_features = torch.cat(image_chunks)
  all_text_features = torch.cat(text_chunks)

  print("Calculating similarity matrix")
  similarity_matrix = all_image_features @ all_text_features.T
  print("Calculating accuracy metrics")
  results = {}
  num_samples = len(all_indices)

  correct_at_k = {k: 0 for k in k_values}
  ranks = []
  for i in range(num_samples):
    # 1-based position of the ground-truth caption in the descending ranking.
    order = similarity_matrix[i].argsort(descending=True)
    rank = (order == i).nonzero().item() + 1
    ranks.append(rank)
    for k in k_values:
      if rank <= k:
        correct_at_k[k] += 1

  for k in k_values:
    percentage = correct_at_k[k] / num_samples * 100
    results[f'top{k}_accuracy'] = percentage
    results[f'recall@{k}'] = percentage

  results['mean_rank'] = sum(ranks) / len(ranks)
  results['median_rank'] = np.median(ranks)

  # Inspect every sample (not just the first five) so the sorted top-5 lists
  # below really are the best and worst cases of the whole evaluation set.
  top_matches = similarity_matrix.argmax(dim=1).tolist()
  best_matches = []
  worst_matches = []
  for i, top_match in enumerate(top_matches):
    score = similarity_matrix[i, top_match].item()
    if top_match == i:
      best_matches.append({
        'image_idx': all_indices[i],
        'text': all_texts[i],
        'top_match_text': all_texts[top_match],
        'score': score
      })
    else:
      worst_matches.append({
        'image_idx': all_indices[i],
        'ground_truth_text': all_texts[i],
        'predicted_text': all_texts[top_match],
        'score': score,
        'ground_truth_rank': ranks[i]
      })
  best_matches = sorted(best_matches, key=lambda x: x['score'], reverse=True)[:5]
  worst_matches = sorted(worst_matches, key=lambda x: x['ground_truth_rank'], reverse=True)[:5]

  results['best_matches'] = best_matches
  results['worst_matches'] = worst_matches
  return results


if __name__ == "__main__":
  # Seed Python and torch so the random 30% subset is reproducible.
  random.seed(42)
  torch.manual_seed(42)

  # Prefer CUDA (the accelerator available on Colab), then Apple MPS, then
  # CPU. Later cells read this notebook-level `device`, so picking only
  # between "mps" and "cpu" silently forces them onto the CPU on Colab.
  if torch.cuda.is_available():
    device = "cuda"
  elif torch.backends.mps.is_available():
    device = "mps"
  else:
    device = "cpu"
  print(f"Using device: {device}")

  model, _ = clip.load("ViT-B/32", device=device)

  full_dataset = CLIPDataset("/content/drive/MyDrive/DL/Project/Fashion Synthesis Benchmark/Anno/language_original.mat", "/content/drive/MyDrive/DL/Project/G2.h5")

  total_len = len(full_dataset)
  subset_size = int(total_len * 0.30)

  print(f"Full dataset size: {total_len}")
  print(f"Evaluating on a 30% subset: {subset_size} samples")

  subset_indices = random.sample(range(total_len), subset_size)

  subset_dataset = Subset(full_dataset, subset_indices)

  batch_size = BATCH_SIZE
  dataloader = DataLoader(
      subset_dataset,
      batch_size=batch_size,
      shuffle=False,
      num_workers=2,
      # Stack the image tensors but keep captions and indices as plain lists.
      collate_fn=lambda batch: (
      torch.stack([item[0] for item in batch]),
      [item[1] for item in batch],
      [item[2] for item in batch]
    )
  )

In [None]:
!pip install datasets

In [None]:
from datasets import Dataset
from PIL import Image
import numpy as np
import h5py
import scipy.io

def deepfashion_to_dataset(anno_path, image_h5_path, limit=None):
    """Build a Hugging Face ``Dataset`` of ``{"image", "text"}`` records.

    Captions are read from the ``engJ`` entry of the ``.mat`` file and images
    from the ``ih`` dataset of the HDF5 file; record ``i`` pairs ``engJ[i]``
    with ``ih[i]``.

    Args:
        anno_path: Path to the ``.mat`` annotation file.
        image_h5_path: Path to the HDF5 image file.
        limit: Maximum number of records to build. ``None`` (or any other
            falsy value) means no limit.

    Returns:
        ``Dataset`` created with ``Dataset.from_list``.
    """
    anno_data = scipy.io.loadmat(anno_path)
    texts = [entry[0][0] for entry in anno_data['engJ']]

    data = []
    # The context manager closes the HDF5 file once every image has been
    # copied into a PIL object (the handle used to stay open indefinitely).
    with h5py.File(image_h5_path, 'r') as h5_file:
        images = h5_file['ih']
        for i, text in enumerate(texts):
            if limit and i >= limit:
                break
            raw_image = images[i]
            if raw_image.shape[0] == 3:
                raw_image = np.transpose(raw_image, (2, 1, 0))
            # Assumes pixel values are floats in [0, 1] — TODO confirm against the data file.
            image = Image.fromarray((raw_image * 255).astype(np.uint8))  # convert to proper image
            data.append({"image": image, "text": text})

    return Dataset.from_list(data)


In [None]:
def preprocess_function(examples, vae, tokenizer, text_encoder):
    """Encode a batch of images and captions for UNet fine-tuning.

    Args:
        examples: Batch mapping with an ``"image"`` list of PIL images and a
            ``"text"`` list of captions.
        vae: Autoencoder used to turn images into latents.
        tokenizer: Tokenizer applied to the captions.
        text_encoder: Model producing the caption hidden states.

    Returns:
        dict of nested Python lists with keys ``pixel_values`` (scaled VAE
        latents, despite the name), ``input_ids``, ``attention_mask`` and
        ``encoder_hidden_states``.
    """
    # Follow the models' own placement instead of a notebook-level `device`
    # variable, which may not match where the models actually live.
    vae_param = next(vae.parameters())
    text_device = next(text_encoder.parameters()).device

    side = vae.config.sample_size
    to_tensor = transforms.ToTensor()
    images = torch.stack([
        to_tensor(image.convert("RGB").resize((side, side)))
        for image in examples['image']
    ]).to(device=vae_param.device, dtype=vae_param.dtype)
    # ToTensor yields values in [0, 1]; Stable Diffusion's VAE is trained on
    # images normalised to [-1, 1], so rescale before encoding.
    images = images * 2.0 - 1.0

    with torch.no_grad():
        latents = vae.encode(images).latent_dist.sample().detach()
    # 0.18215 is the latent scaling factor used by Stable Diffusion v1.
    latents = latents * 0.18215
    latents = latents.half()

    inputs = tokenizer(examples["text"], padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")

    with torch.no_grad():
        encoder_hidden_states = text_encoder(
            input_ids=inputs.input_ids.to(text_device),
            attention_mask=inputs.attention_mask.to(text_device),
        ).last_hidden_state.half()

    return {
      "pixel_values": latents.cpu().numpy().tolist(),
      "input_ids": inputs.input_ids.cpu().numpy().tolist(),
      "attention_mask": inputs.attention_mask.cpu().numpy().tolist(),
      "encoder_hidden_states": encoder_hidden_states.cpu().numpy().tolist()
    }


In [None]:
from diffusers import AutoencoderKL, UNet2DConditionModel, PNDMScheduler, StableDiffusionPipeline
from transformers import CLIPTokenizer, CLIPTextModel
import torch

def load_components():
  """Load the pretrained Stable Diffusion v1-4 building blocks.

  The VAE, UNet and text encoder are moved to CUDA when it is available and
  to the CPU otherwise; the scheduler and tokenizer are returned as loaded.

  Returns:
    Tuple ``(vae, unet, scheduler, tokenizer, text_encoder)``.
  """
  sd_repo = "CompVis/stable-diffusion-v1-4"
  clip_repo = "openai/clip-vit-large-patch14"
  target = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  vae = AutoencoderKL.from_pretrained(sd_repo, subfolder="vae")
  vae = vae.to(target)
  unet = UNet2DConditionModel.from_pretrained(sd_repo, subfolder="unet")
  unet = unet.to(target)
  scheduler = PNDMScheduler.from_pretrained(sd_repo, subfolder="scheduler")
  tokenizer = CLIPTokenizer.from_pretrained(clip_repo)
  text_encoder = CLIPTextModel.from_pretrained(clip_repo)
  text_encoder = text_encoder.to(target)

  return vae, unet, scheduler, tokenizer, text_encoder



In [None]:
from functools import partial

In [None]:
def get_data(DATASET_PATH, num_images, vae, tokenizer, text_encoder, image_h5_path=None):
  """Build the raw and the pre-encoded training datasets.

  Args:
    DATASET_PATH: Benchmark root containing ``Anno/language_original.mat``
      and ``Eval/ind.mat``.
    num_images: Maximum number of samples to load.
    vae: Autoencoder forwarded to ``preprocess_function``.
    tokenizer: Tokenizer forwarded to ``preprocess_function``.
    text_encoder: Text encoder forwarded to ``preprocess_function``.
    image_h5_path: Path to the HDF5 image file. When omitted, the path stored
      on the notebook-level ``full_dataset`` is used, as before.

  Returns:
    Tuple ``(processed_dataset, hf_dataset, anno_path, eval_path)``.
  """
  LOCAL_EXTRACT_DIR = '/content/extracted'
  os.makedirs(LOCAL_EXTRACT_DIR, exist_ok=True)
  anno_path = os.path.join(DATASET_PATH, 'Anno/language_original.mat')
  eval_path = os.path.join(DATASET_PATH, 'Eval/ind.mat')
  if image_h5_path is None:
    # Backward-compatible fallback: relies on the cell that creates
    # `full_dataset` having been run earlier in the session.
    image_h5_path = full_dataset.file_path
  # Only the first `num_images` samples are loaded.
  hf_dataset = deepfashion_to_dataset(anno_path, image_h5_path, limit=num_images)
  preprocess_fn = partial(preprocess_function, vae=vae, tokenizer=tokenizer, text_encoder=text_encoder)
  processed_dataset = hf_dataset.map(preprocess_fn, batched=True, batch_size=BATCH_SIZE)
  processed_dataset = processed_dataset.remove_columns(["image", "text"])
  processed_dataset.set_format(type="torch")
  # Print the type of each column of the first sample as a quick sanity check.
  sample = processed_dataset[0]
  print({k: type(v) for k, v in sample.items()})

  return processed_dataset, hf_dataset, anno_path, eval_path

In [None]:
from zipfile import ZipFile
from datetime import datetime

In [None]:
def save_model_info(unet, tokenizer, text_encoder, scheduler):
    """Save the fine-tuned components to Google Drive and zip them.

    The components are written to ``unet/``, ``tokenizer/``, ``text_encoder/``
    and ``scheduler/`` inside a ``finetuned_model_<timestamp>`` directory, and
    a ``.zip`` archive with the same top-level folder is created next to it.

    Returns:
        Path of the created zip archive.
    """
    models_root = "/content/drive/MyDrive/DL/Project/project_models"
    os.makedirs(models_root, exist_ok=True)

    # Minute-resolution timestamp shared by the directory and the archive.
    stamp = datetime.now().strftime("%Y%m%d_%H%M")
    run_name = f"finetuned_model_{stamp}"
    run_dir = os.path.join(models_root, run_name)
    archive_path = os.path.join(models_root, f"finetuned_model_{stamp}.zip")

    pretrained_parts = (
        (unet, "unet"),
        (tokenizer, "tokenizer"),
        (text_encoder, "text_encoder"),
    )
    for component, subdir in pretrained_parts:
        component.save_pretrained(os.path.join(run_dir, subdir))
    scheduler.save_config(os.path.join(run_dir, "scheduler"))

    with ZipFile(archive_path, 'w') as archive:
        for folder, _, filenames in os.walk(run_dir):
            for filename in filenames:
                source = os.path.join(folder, filename)
                relative = os.path.relpath(source, run_dir)
                # Keep everything under one top-level folder inside the zip.
                archive.write(source, arcname=os.path.join(run_name, relative))

    print(f"Model saved and zipped to Google Drive: {archive_path}")
    return archive_path

In [None]:
def train(DATASET_PATH, num_images, epochs, lr):
  """Fine-tune the Stable Diffusion UNet on pre-encoded samples and save it.

  Args:
    DATASET_PATH: Benchmark root passed to ``get_data``.
    num_images: Maximum number of training samples.
    epochs: Number of passes over the training data.
    lr: AdamW learning rate.

  Returns:
    Tuple ``(unet, tokenizer, text_encoder, scheduler, vae, hf_dataset,
    zip_filename)``.

  Raises:
    ValueError: If no training batches are available.
  """
  vae, unet, scheduler, tokenizer, text_encoder = load_components()
  # NOTE: `device` is a notebook-level variable set by another cell; the
  # models are moved to whatever it currently holds.
  vae.to(device)
  text_encoder.to(device)
  unet.to(device)

  # Only the UNet is optimised; the VAE and text encoder stay in eval mode.
  vae.eval()
  text_encoder.eval()

  processed_dataset, hf_dataset, anno_path, eval_path = get_data(DATASET_PATH, num_images, vae, tokenizer, text_encoder)

  train_dataloader = DataLoader(processed_dataset, batch_size=BATCH_SIZE, shuffle=True)
  num_batches = len(train_dataloader)
  if num_batches == 0:
    raise ValueError("No training samples were produced; check DATASET_PATH and num_images.")

  # Feed the UNet tensors in its own parameter dtype, whatever dtype the
  # dataset formatter returned.
  weight_dtype = next(unet.parameters()).dtype

  optimizer = torch.optim.AdamW(unet.parameters(), lr=lr)
  unet.train()
  torch.cuda.empty_cache()

  for epoch in range(epochs):
      running_loss = 0.0
      for batch in train_dataloader:
          latents = batch["pixel_values"].to(device, dtype=weight_dtype)
          encoder_hidden_states = batch["encoder_hidden_states"].to(device, dtype=weight_dtype)
          # Noise-prediction objective: corrupt the latents at a random
          # timestep and train the UNet to recover the added noise.
          noise = torch.randn_like(latents)
          timesteps = torch.randint(0, scheduler.config.num_train_timesteps, (latents.size(0),), device=device).long()
          noisy_latents = scheduler.add_noise(latents, noise, timesteps)

          model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
          loss = torch.nn.functional.mse_loss(model_pred, noise)

          loss.backward()
          optimizer.step()
          optimizer.zero_grad()
          running_loss += loss.item()

      # Report the mean loss over the epoch; the loss of the last batch alone
      # is too noisy to compare between epochs.
      print(f"Epoch {epoch+1}, Loss: {running_loss / num_batches:.4f}")
      del latents, noisy_latents, encoder_hidden_states, model_pred, loss
      torch.cuda.empty_cache()

  zip_filename = save_model_info(unet, tokenizer, text_encoder, scheduler)
  return unet, tokenizer, text_encoder, scheduler, vae, hf_dataset, zip_filename

In [None]:
print(os.getcwd())

In [None]:
!pip install ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git

import scipy.io
import clip
import torch
from PIL import Image
from tqdm import tqdm


In [None]:
def evaluate(eval_path, unet, tokenizer, text_encoder, scheduler, vae, hf_dataset, num_images, num_samples):
  """Score generated images against their prompts with CLIP.

  Images are generated from test-split captions with a Stable Diffusion
  pipeline built from the given components; each image is compared with its
  prompt by CLIP (ViT-B/32) cosine similarity.

  Args:
    eval_path: ``.mat`` file whose ``test_ind`` entry lists test indices.
    unet, tokenizer, text_encoder, scheduler, vae: Pipeline components.
    hf_dataset: Dataset with a ``"text"`` column, indexed by test index.
    num_images: Only test indices below this value are used.
    num_samples: Number of prompts to evaluate; clamped to the number of
      available test samples.

  Returns:
    Mean CLIP similarity over the evaluated prompts (also printed).

  Raises:
    ValueError: If there is no test sample to evaluate.
  """
  eval_data = scipy.io.loadmat(eval_path)
  # atleast_1d keeps a single-element `test_ind` iterable after squeeze().
  # NOTE(review): indices stored by MATLAB are often 1-based, while select()
  # below is 0-based — confirm against the benchmark files.
  raw_indices = np.atleast_1d(eval_data["test_ind"].squeeze())
  test_indices = [int(i) for i in raw_indices if i < num_images]

  hf_test = hf_dataset.select(test_indices)

  available = len(hf_test)
  if num_samples > available:
    print(f"Requested {num_samples} samples but only {available} test samples are available; using {available}.")
    num_samples = available
  if num_samples <= 0:
    raise ValueError("No test samples to evaluate; check num_images and num_samples.")

  device = "cuda" if torch.cuda.is_available() else "cpu"
  clip_model, preprocess_clip = clip.load("ViT-B/32", device=device)

  # Training leaves the UNet in train mode; switch to eval for generation.
  unet.eval()

  pipe = StableDiffusionPipeline.from_pretrained(
      "CompVis/stable-diffusion-v1-4",
      vae=vae,
      unet=unet,
      text_encoder=text_encoder,
      tokenizer=tokenizer,
      scheduler=scheduler,
      # Disable the safety checker through the supported argument instead of
      # loading it and replacing it with a lambda afterwards.
      safety_checker=None,
      torch_dtype=torch.float16,

  ).to(device)

  scores = []

  for i in tqdm(range(num_samples)):
      text_prompt = hf_test[i]["text"]
      generated_image = pipe(text_prompt).images[0]

      image_input = preprocess_clip(generated_image).unsqueeze(0).to(device).to(torch.float32)
      text_input = clip.tokenize([text_prompt]).to(device)

      with torch.no_grad():
          image_features = clip_model.encode_image(image_input)
          text_features = clip_model.encode_text(text_input)

      # Cosine similarity between the L2-normalised embeddings.
      image_features /= image_features.norm(dim=-1, keepdim=True)
      text_features /= text_features.norm(dim=-1, keepdim=True)
      similarity = (image_features @ text_features.T).item()

      scores.append(similarity)

  average_score = sum(scores) / len(scores)
  print("Average CLIP similarity score:", round(average_score, 4))
  return average_score

In [None]:
from transformers import AutoTokenizer, AutoModel
from diffusers import UNet2DConditionModel, DDPMScheduler

In [None]:
def load_model_info(zip_path):
    """Load saved model components from a zip archive.

    The archive is extracted into its own directory unless a path equal to
    ``zip_path`` without its extension already exists.

    Args:
        zip_path: Path of the ``.zip`` archive.

    Returns:
        Tuple ``(unet, tokenizer, text_encoder, scheduler)``.
    """
    model_dir, _ = os.path.splitext(zip_path)
    already_extracted = os.path.exists(model_dir)
    if not already_extracted:
        from zipfile import ZipFile
        with ZipFile(zip_path, 'r') as archive:
            archive.extractall(os.path.dirname(zip_path))

    def component_dir(name):
        # Each component lives in its own sub-folder of the extracted model.
        return os.path.join(model_dir, name)

    unet = UNet2DConditionModel.from_pretrained(component_dir("unet"))
    tokenizer = AutoTokenizer.from_pretrained(component_dir("tokenizer"))
    text_encoder = AutoModel.from_pretrained(component_dir("text_encoder"))
    scheduler = DDPMScheduler.from_pretrained(component_dir("scheduler"))

    print(f"Model loaded from {zip_path}")
    return unet, tokenizer, text_encoder, scheduler

In [None]:
from diffusers import AutoencoderKL, UNet2DConditionModel, PNDMScheduler, StableDiffusionPipeline

In [None]:
def evaluate_from_saved(eval_path, zipfile_path, hf_dataset, num_images, num_samples):
  """Score a previously saved fine-tuned model with CLIP similarity.

  The fine-tuned UNet, tokenizer, text encoder and scheduler are loaded from
  the ``unet/``, ``tokenizer/``, ``text_encoder/`` and ``scheduler/``
  sub-folders of the saved model directory. The VAE and the remaining pipeline
  pieces come from the base ``CompVis/stable-diffusion-v1-4`` checkpoint.

  Args:
    eval_path: ``.mat`` file whose ``test_ind`` entry lists test indices.
    zipfile_path: Saved model directory, or the ``.zip`` archive next to it
      (extracted on demand).
    hf_dataset: Dataset with a ``"text"`` column, indexed by test index.
    num_images: Only test indices below this value are used.
    num_samples: Number of prompts to evaluate; clamped to the number of
      available test samples.

  Returns:
    Mean CLIP similarity over the evaluated prompts (also printed).

  Raises:
    ValueError: If there is no test sample to evaluate.
  """
  eval_data = scipy.io.loadmat(eval_path)
  # atleast_1d keeps a single-element `test_ind` iterable after squeeze().
  # NOTE(review): indices stored by MATLAB are often 1-based, while select()
  # below is 0-based — confirm against the benchmark files.
  raw_indices = np.atleast_1d(eval_data["test_ind"].squeeze())
  test_indices = [int(i) for i in raw_indices if i < num_images]

  hf_test = hf_dataset.select(test_indices)

  available = len(hf_test)
  if num_samples > available:
    print(f"Requested {num_samples} samples but only {available} test samples are available; using {available}.")
    num_samples = available
  if num_samples <= 0:
    raise ValueError("No test samples to evaluate; check num_images and num_samples.")

  device = "cuda" if torch.cuda.is_available() else "cpu"
  clip_model, preprocess_clip = clip.load("ViT-B/32", device=device)

  # Accept either the extracted model directory or the zip archive itself.
  model_dir = zipfile_path[:-len(".zip")] if zipfile_path.endswith(".zip") else zipfile_path
  if not os.path.isdir(model_dir):
    from zipfile import ZipFile
    with ZipFile(model_dir + ".zip", 'r') as archive:
      archive.extractall(os.path.dirname(model_dir))

  # Load the fine-tuned weights. Passing freshly loaded base components to
  # the pipeline would override them and evaluate the base model instead.
  unet = UNet2DConditionModel.from_pretrained(os.path.join(model_dir, "unet")).to(device)
  tokenizer = CLIPTokenizer.from_pretrained(os.path.join(model_dir, "tokenizer"))
  text_encoder = CLIPTextModel.from_pretrained(os.path.join(model_dir, "text_encoder")).to(device)
  scheduler = PNDMScheduler.from_pretrained(os.path.join(model_dir, "scheduler"))
  # The VAE is not part of the saved model, so take it from the base checkpoint.
  vae = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="vae").to(device)
  unet.eval()

  # The saved directory has no pipeline index file, so build the pipeline
  # from the base checkpoint and swap in the components loaded above.
  pipe = StableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    vae=vae,
    unet=unet,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    scheduler=scheduler,
    safety_checker=None,
    torch_dtype=torch.float16,

  )

  pipe = pipe.to(device)

  scores = []

  for i in tqdm(range(num_samples)):
      text_prompt = hf_test[i]["text"]
      generated_image = pipe(text_prompt).images[0]

      image_input = preprocess_clip(generated_image).unsqueeze(0).to(device).to(torch.float32)
      text_input = clip.tokenize([text_prompt]).to(device)

      with torch.no_grad():
          image_features = clip_model.encode_image(image_input)
          text_features = clip_model.encode_text(text_input)

      # Cosine similarity between the L2-normalised embeddings.
      image_features /= image_features.norm(dim=-1, keepdim=True)
      text_features /= text_features.norm(dim=-1, keepdim=True)
      similarity = (image_features @ text_features.T).item()

      scores.append(similarity)

  average_score = sum(scores) / len(scores)
  print("Average CLIP similarity score:", round(average_score, 4))
  return average_score



# **Testing: num_images = 20**

**training**

In [None]:
DATASET_PATH = '/content/drive/MyDrive/DL/Project/Fashion Synthesis Benchmark'

In [None]:
unet, tokenizer, text_encoder, scheduler, vae, hf_dataset, zip_filename = train(DATASET_PATH, 20, 5, 5e-5)

zipfile path: /content/drive/MyDrive/DL/Project/project_models/finetuned_model_20250425_2302.zip

Loss Values:

Epoch 1, Loss: 0.0646

Epoch 2, Loss: 0.0465

Epoch 3, Loss: 0.0438

Epoch 4, Loss: 0.0634

Epoch 5, Loss: 0.0197

In [None]:
eval_path = "/content/drive/MyDrive/DL/Project/Fashion Synthesis Benchmark/Eval/ind.mat"
evaluate(eval_path, unet, tokenizer, text_encoder, scheduler, vae, hf_dataset, 20, 15)

Average CLIP similarity score: 0.2829

# **5 epochs batch size 8 num_images 100**

In [None]:
DATASET_PATH = '/content/drive/MyDrive/DL/Project/Fashion Synthesis Benchmark'

In [None]:
unet, tokenizer, text_encoder, scheduler, vae, hf_dataset, zip_filename = train(DATASET_PATH, 100, 5, 5e-5)

zipfile path: /content/drive/MyDrive/DL/Project/project_models/finetuned_model_20250425_2316.zip

Losses:

Epoch 1, Loss: 0.0389

Epoch 2, Loss: 0.0374

Epoch 3, Loss: 0.1419

Epoch 4, Loss: 0.0394

Epoch 5, Loss: 0.0118

In [None]:
eval_path = "/content/drive/MyDrive/DL/Project/Fashion Synthesis Benchmark/Eval/ind.mat"
evaluate(eval_path, unet, tokenizer, text_encoder, scheduler, vae, hf_dataset, 100, 90)

Average CLIP similarity score: 0.2564

# **5 epochs, batch size 8, 1000 images**

In [None]:
DATASET_PATH = '/content/drive/MyDrive/DL/Project/Fashion Synthesis Benchmark'

In [None]:
unet, tokenizer, text_encoder, scheduler, vae, hf_dataset, zip_filename = train(DATASET_PATH, 1000, 5, 5e-5)

zipfile path: /content/drive/MyDrive/DL/Project/project_models/finetuned_model_20250425_2333.zip

Losses:

Epoch 1, Loss: 0.0748

Epoch 2, Loss: 0.0763

Epoch 3, Loss: 0.0283

Epoch 4, Loss: 0.0639

Epoch 5, Loss: 0.0262

In [None]:
# Saved model for this section's 1000-image run (archive ..._2333.zip noted
# above); the ..._2302 folder belongs to the 20-image run.
zipfolder_path = '/content/drive/MyDrive/DL/Project/project_models/finetuned_model_20250425_2333'
eval_path = "/content/drive/MyDrive/DL/Project/Fashion Synthesis Benchmark/Eval/ind.mat"
# `anno_path` only exists inside get_data(); rebuild it here so this cell
# does not depend on a leftover variable.
anno_path = os.path.join(DATASET_PATH, 'Anno/language_original.mat')
hf_dataset = deepfashion_to_dataset(anno_path, full_dataset.file_path, limit=1000)

In [None]:
evaluate_from_saved(eval_path, zipfolder_path, hf_dataset, 1000, 50)

# **5 epochs, batch size 8, 10000 images**

In [None]:
DATASET_PATH = '/content/drive/MyDrive/DL/Project/Fashion Synthesis Benchmark'

In [None]:
unet, tokenizer, text_encoder, scheduler, vae, hf_dataset, zip_filename = train(DATASET_PATH, 10000, 5, 5e-5)

# **5 epochs, batch size 8, 20000 images**

In [None]:
DATASET_PATH = '/content/drive/MyDrive/DL/Project/Fashion Synthesis Benchmark'

In [None]:
unet, tokenizer, text_encoder, scheduler, vae, hf_dataset, zip_filename = train(DATASET_PATH, 20000, 5, 5e-5)

# Loss Plots

In [None]:
import matplotlib.pyplot as plt

In [None]:
def plot_loss_table(loss_lists, num_images_list, batch_size):
  """Show per-epoch losses as a table with one row per dataset size.

  Args:
    loss_lists: One list of per-epoch losses per run; the length of the
      first list determines the number of epoch columns.
    num_images_list: Dataset size of each run, in the same order.
    batch_size: Batch size mentioned in the figure title.
  """
  num_epochs = len(loss_lists[0])
  header = ["Number of Images"]
  header.extend(f"Epoch {i+1} loss" for i in range(num_epochs))
  row_labels = list(map(str, num_images_list))

  # One row per run: the dataset size followed by the formatted losses.
  table_rows = [
      [label, *(f"{value:.4f}" for value in run_losses)]
      for label, run_losses in zip(row_labels, loss_lists)
  ]

  fig, ax = plt.subplots(figsize=(1+num_epochs, len(row_labels)))

  loss_table = ax.table(
      cellText=table_rows,
      colLabels=header,
      loc='center',
      cellLoc='center'
  )
  loss_table.scale(2, 2)

  # Hide the axes so that only the table is drawn.
  ax.axis("off")

  plt.title(f"Loss per Epoch at Batch Size {batch_size} for Different Number of Images", fontsize=14)
  plt.show()

In [None]:
def plot_loss_line_graph(loss_lists, num_images_list, batch_size):
    """Plot loss against epoch, one line per dataset size.

    Args:
        loss_lists: One list of per-epoch losses per run; the length of the
            first list determines the x-axis ticks.
        num_images_list: Dataset size of each run, used in the legend.
        batch_size: Batch size mentioned in the figure title.
    """
    epochs_list = [epoch for epoch in range(1, len(loss_lists[0]) + 1)]

    fig, ax = plt.subplots(figsize=(8, 6))

    for run_index, num_images in enumerate(num_images_list[:len(loss_lists)]):
        ax.plot(epochs_list, loss_lists[run_index], marker='o', label=f"{num_images} images")

    ax.set_xlabel("Epoch", fontsize=12)
    ax.set_ylabel("Loss", fontsize=12)
    ax.set_title(f"Loss vs Epochs for Different Number of Images at Batch Size {batch_size}", fontsize=14)
    ax.legend(title="Dataset Size")
    ax.grid(True)
    ax.set_xticks(epochs_list)
    plt.tight_layout()
    plt.show()

In [None]:
loss_lists_batch_8 = [[0.0646, 0.0465, 0.0438, 0.0634, 0.0197],
                      [0.0389, 0.0374, 0.1419, 0.0394, 0.0118],
                      [0.0748, 0.0763, 0.0283, 0.0639, 0.0262]]

num_images_batch_8 = [20, 100, 1000]
plot_loss_table(loss_lists_batch_8, num_images_batch_8, 8)

In [None]:
plot_loss_line_graph(loss_lists_batch_8, num_images_batch_8, 8)

### Experiments

### Failure on CUDA
Batch size = 16, Epoch = 3
Batch size = 32, Epoch = 3

#### 3 epochs & batch size = 8 ->
Epoch 1, Loss: 0.0597

Epoch 2, Loss: 0.0617

Epoch 3, Loss: 0.1440

Average CLIP similarity score: 0.2369


#### 5 epochs & batch size = 8 ->
Epoch 1, Loss: 0.0398

Epoch 2, Loss: 0.0768

Epoch 3, Loss: 0.0316

Epoch 4, Loss: 0.0576

Epoch 5, Loss: 0.0356

Average CLIP Similarity score: 0.2296

# Generating Images

In [None]:
vae, unet, scheduler, tokenizer, text_encoder = load_components()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
from diffusers import AutoencoderKL, UNet2DConditionModel, PNDMScheduler, StableDiffusionPipeline

# Saved fine-tuned model; presumably laid out like the output of
# save_model_info() (unet/, tokenizer/, text_encoder/, scheduler/) — verify.
FINETUNED_MODEL_DIR = "/content/drive/MyDrive/GT/Deep Learning/Project/project_models/finetuned_model_20250425_2302"

# Replace the base components loaded by load_components() with the
# fine-tuned ones. Components passed to from_pretrained() take precedence
# over what is stored on disk, so passing the base UNet would generate
# images with the base model.
unet = UNet2DConditionModel.from_pretrained(os.path.join(FINETUNED_MODEL_DIR, "unet")).to(device)
text_encoder = CLIPTextModel.from_pretrained(os.path.join(FINETUNED_MODEL_DIR, "text_encoder")).to(device)
tokenizer = CLIPTokenizer.from_pretrained(os.path.join(FINETUNED_MODEL_DIR, "tokenizer"))
scheduler = PNDMScheduler.from_pretrained(os.path.join(FINETUNED_MODEL_DIR, "scheduler"))

# The remaining pipeline pieces come from the base checkpoint.
pipe = StableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    vae=vae,
    unet=unet,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    scheduler=scheduler,
    torch_dtype=torch.float16,

)

pipe = pipe.to(device)

In [None]:
prompt_list = [
    "navy basketball shorts and shoes", "red dress and black heels", "denim jacket", "yellow summer dress", "white graduation dress",
    "black sweatsuit", "navy suit", "leather pants and jacket", "cocktail dress", "short sleeve hawaiian shirt",
    "girls pajamas", "boys pajamas", "pink soccer uniform", "irish costume", "college party dress", "cowboy boots", "rain boots",
    "women's corporate blouse", "swimming trunks for men", "gray hoodie and joggers", "black leather skirt", "white blouse",
    "green cargo pants", "blue denim jeans", "formal black dress", "plaid button-up shirt", "pink ballet dress", "long-sleeve sweater",
    "fuzzy winter coat", "cheetah print leggings", "blue tank top", "faux fur vest", "red plaid skirt", "tropical floral dress",
    "yellow raincoat", "striped sweater", "beige trench coat", "pink hoodie", "sporty windbreaker", "high-waisted jeans",
    "v-neck t-shirt", "knitted scarf", "short overalls", "black leather boots", "slim fit suit", "sundress", "black mini skirt",
    "sweater dress", "white tennis shoes", "men's athletic shorts", "purple velvet pants", "checkered flannel shirt", "brown leather belt",
    "fleece jacket", "graphic t-shirt", "long-sleeve turtleneck", "blue denim shorts", "high-neck sweater", "white tank top",
    "pleated skirt", "denim overalls", "red leather jacket", "chinos", "beanie hat", "vintage denim jacket", "red satin dress",
    "white button-down shirt", "knee-high boots", "cargo shorts", "green parka", "chambray shirt", "tie-dye hoodie", "black cargo pants",
    "pleated trousers", "floral romper", "denim skirt", "leopard print blouse", "yellow blouse", "thermal leggings",
    "gray plaid pants", "sporty sneakers", "off-the-shoulder blouse", "red crop top", "white tennis skirt", "gray cargo shorts",
    "cozy fleece sweater", "black leggings", "tailored blazer", "high-top sneakers", "leather ankle boots", "light denim jacket",
    "red bomber jacket", "striped leggings", "denim vest", "black cargo shorts", "leopard print jacket", "open-back dress",
    "yellow floral blouse", "denim mini skirt", "puffer jacket", "wrap dress", "black knee-high socks", "brown boots",
    "white sports bra", "floral blouse", "blue cardigan", "black dress pants", "sweatshirt and leggings", "gray hoodie and jeans",
    "fitted t-shirt", "yellow skirt", "blue zip-up hoodie", "orange beanie", "beige shorts", "leather biker jacket",
    "blush pink dress", "beige boots", "black high-waisted skirt", "lumberjack shirt", "fuzzy slippers", "polo shirt",
    "warm winter sweater", "light pink scarf", "beige cardigan", "long skirt", "cashmere sweater", "black slip dress",
    "tartan skirt", "brown cargo pants", "camel coat", "striped t-shirt", "white sneakers", "blue overalls",
    "gray wool coat", "checked pants", "flannel pajama set", "turtleneck sweater", "green bomber jacket"
]

In [None]:
from IPython.display import display

# Generated PIL images, in the same order as `prompt_list`.
images = []

# Sampling settings passed to every pipeline call.
num_inference_steps = 50
guidance_scale = 7.5

# Generate one image per prompt, keep it and show it inline.
for prompt in prompt_list:
  # Run the pipeline under CUDA autocast (mixed precision).
  with torch.autocast("cuda"):
      output = pipe(
          prompt,
          num_inference_steps=num_inference_steps,
          guidance_scale=guidance_scale,
      )

  # Only the first image returned for the prompt is kept.
  image = output.images[0]
  images.append(image)
  display(image)

In [None]:
ind = 117
print(prompt_list[ind])
display(images[ind])

In [None]:
from torchvision.models import inception_v3
import torch.nn.functional as F


def _inception_score_from_probs(preds, splits):
    """Compute the Inception Score (mean, std over splits) from probabilities.

    For each split the score is ``exp(mean_x KL(p(y|x) || p(y)))`` where the
    marginal ``p(y)`` is the mean prediction of that split.
    """
    preds = np.asarray(preds, dtype=np.float64)
    num_samples = preds.shape[0]
    # More splits than samples would create empty splits and NaN scores.
    splits = max(1, min(int(splits), num_samples))

    scores = []
    for i in range(splits):
        part = preds[(i * num_samples // splits):((i + 1) * num_samples // splits), :]
        marginal = np.mean(part, axis=0, keepdims=True)
        # Terms with p(y|x) == 0 contribute 0; mask them so log(0) cannot
        # turn the sum into NaN.
        with np.errstate(divide="ignore", invalid="ignore"):
            kl_div = np.where(part > 0, part * (np.log(part) - np.log(marginal)), 0.0)
        scores.append(np.exp(np.mean(np.sum(kl_div, axis=1))))

    return np.mean(scores), np.std(scores)


def calculate_inception_score(images, batch_size=32, splits=10, device=None):
    """
    Calculate the Inception Score for a list of images

    Args:
        images: List of PIL images, or a tensor of shape [N, 3, H, W]. PIL
            images are converted to RGB, resized to 299x299 and normalised
            with the ImageNet mean/std; a tensor is fed to the network as-is,
            so the caller must have preprocessed it.
        batch_size: Batch size for inference
        splits: Number of splits to compute the score (clamped to the number
            of images)
        device: Device to run the model on (CUDA when available, else CPU,
            if omitted)

    Returns:
        mean and standard deviation of the Inception Score

    Raises:
        ValueError: If ``images`` is empty.
    """
    if len(images) == 0:
        raise ValueError("images must contain at least one image")

    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    inception_model = inception_v3(pretrained=True, transform_input=False)
    inception_model.eval()
    inception_model.to(device)

    if isinstance(images[0], Image.Image):
        preprocess = transforms.Compose([
            transforms.Resize((299, 299)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])
        # convert("RGB") guards against grayscale / RGBA inputs.
        images = torch.stack([preprocess(img.convert("RGB")) for img in images])

    # Slice the tensor directly instead of wrapping it in a Dataset subclass:
    # the name `Dataset` is rebound elsewhere in this notebook, so a nested
    # subclass would depend on which import ran last.
    preds = []
    with torch.no_grad():
        for batch in torch.split(images, batch_size):
            logits = inception_model(batch.to(device))
            preds.append(F.softmax(logits, dim=1).cpu().numpy())

    preds = np.concatenate(preds, axis=0)

    return _inception_score_from_probs(preds, splits)


In [None]:
score_mean, score_std = calculate_inception_score(images)
print(f"Inception Score: {score_mean} ± {score_std}")

Inception Score: 8.114767074584961 ± 1.0872924327850342