In [None]:
import os
import random
import numpy as np
from PIL import Image
from tqdm import tqdm
from pathlib import Path
from collections import Counter
from matplotlib import pyplot as plt

import torchvision.transforms as transforms
from torch.utils.data import random_split, DataLoader
from torch import Generator

from transformers import CLIPProcessor

# Dataset image sizes

Datasets

- Real faces: `ffhq_real_faces`
    - 3143 images
    - these are all in `png` format
- Diffusion-generated faces (set 1): `AIS-4SD/StableDiffusion-3-faces-20250203-1545`
    - 500 images
    - these are all in `png` format
- Diffusion-generated faces (set 2): `SFHQ-T2I`
    - 1724 images
    - these are all in `jpg` format

For each of these datasets, we are interested in the image size and whether it's consistent for all images within the dataset.

In [None]:
def get_image_sizes(image_folder_path: Path) -> list:
    """
    Collect the size of every file in a folder of images.

    Every directory entry is opened with PIL, so the folder is expected to
    contain image files only.

    Args:
        image_folder_path: Directory whose entries are opened as images.

    Returns:
        One PIL ``size`` tuple (width, height) per entry, in sorted filename order.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the result stable.
    image_names = sorted(os.listdir(image_folder_path))
    image_sizes = []
    for image_name in tqdm(image_names):
        # Close each file as soon as its size has been read; otherwise every
        # image in the folder stays open until garbage collection.
        with Image.open(image_folder_path / image_name) as image:
            image_sizes.append(image.size)
    return image_sizes

## Real images

In [None]:
# Tally how many of the real face images share each size returned by get_image_sizes.
real_images_path = Path("data/ffhq_real_faces")
image_sizes = get_image_sizes(real_images_path)
Counter(image_sizes)

All images in this dataset have size (1024, 1024)

## Synthetic images

### AIS-4SD

In [None]:
# Same size tally for the first synthetic set.
# NOTE: `image_sizes` is overwritten here, so it only ever holds the most recently run dataset.
synth_images_1_path = Path("data/AIS-4SD/StableDiffusion-3-faces-20250203-1545")
image_sizes = get_image_sizes(synth_images_1_path)
Counter(image_sizes)

All images in this dataset have size (768, 768)

### SFHQ-T2I

In [None]:
# Same size tally for the second synthetic set.
# NOTE: `image_sizes` is overwritten here, so it only ever holds the most recently run dataset.
synth_images_2_path = Path("data/SFHQ-T2I")
image_sizes = get_image_sizes(synth_images_2_path)
Counter(image_sizes)

All images in this dataset have size (1024, 1024)

## Summary

- All 3143 real images have size (1024, 1024)
- 1724 of the diffusion-generated images have size (1024, 1024), but 500 of them have size (768, 768)

I could upscale the smaller images, but it would be safer (less likely to introduce image artifacts) to reduce the size of the larger images to (768, 768).

# Image pre-processing

I will experiment with image pre-processing techniques in this notebook, as it is easier here to display the images and understand how they are transformed by various functions.

A recap of PyTorch dataset functionality:
- `torch.utils.data.Dataset` stores the samples and their corresponding labels
    - Each time it's called, it returns an [input, label] pair
    - Pre-processing functions can be defined / called inside this class
    - A custom Dataset class must implement three functions: __init__, __len__, and __getitem__
- `torch.utils.data.DataLoader` wraps an iterable around the Dataset to enable easy access to the samples   
    - Enables iteration through the dataset in batches
    - Provides access to in-built functions for shuffling, parallel processing etc
    - Calls the `__getitem__()` function from the Dataset class to create a batch of data


In [None]:
# Integer class ids used as the classification targets.
labels_dict = {"real": 0, "synthetic": 1}

# Maps each image folder (relative to data_root_dir) to its class id.
# The ids are looked up in labels_dict so the two mappings cannot drift apart.
image_folder_names_to_labels = {
    "ffhq_real_faces": labels_dict["real"],
    "AIS-4SD/StableDiffusion-3-faces-20250203-1545": labels_dict["synthetic"],
    "SFHQ-T2I": labels_dict["synthetic"],
}

# Directory that contains all of the image folders listed above.
data_root_dir = Path("data")

Next we'll define the image augmentations.

We already identified a difference in the original sizes of the images, and decided to resize all images to 768 by 768 pixels.

We also want to apply a random horizontal flip to each image as a simple augmentation.

We then need to convert the images to tensors as this is the format required by PyTorch models. This conversion also scales the pixel values to between 0-1.

Finally, we need to normalise the pixel values in the same way that the training images were normalised. Since we are fine-tuning a pre-trained CLIP model, we can find the mean and standard deviation values used to normalise the CLIP training images by importing `CLIPProcessor` from the `transformers` library.

In [None]:
# Load the processor published with the pre-trained CLIP checkpoint; only its
# image normalisation statistics are read from it here.
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")

# Mean and standard deviation values, passed to transforms.Normalize further down.
clip_mean, clip_std = (
    clip_processor.image_processor.image_mean,
    clip_processor.image_processor.image_std,
)

In [None]:
# Common size that every image is resized to (768 x 768 pixels).
target_image_size = (768, 768)

# Steps run in list order: resize -> random horizontal flip -> tensor -> normalise.
# NOTE(review): the flip is random and this same pipeline is handed to the full
# dataset before it is split, so validation/test samples are augmented as well.
image_transforms = transforms.Compose(
    [
        transforms.Resize(target_image_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(clip_mean, clip_std),
    ]
)

In [None]:
class FaceImageDataset:
    """
    Map-style dataset of (image tensor, label) pairs for real vs. synthetic faces.

    Image paths are collected once at construction time; an image file is only
    opened when its sample is requested through ``__getitem__``.
    """

    # Lower-case file extensions that are treated as images.
    IMAGE_EXTENSIONS = (".png", ".jpg")

    def __init__(self, data_root_dir: Path, transform, labels_dict: dict, folder_labels: dict = None):
        """
        Args:
            data_root_dir: Path to directory containing image subdirectories
            transform: Callable applied to each RGB PIL image; must return a
                tensor whose first dimension is the channel dimension
            labels_dict: Maps class name to integer label (only the values are used)
            folder_labels: Maps image folder (relative to data_root_dir) to integer
                label. Defaults to the notebook-level `image_folder_names_to_labels`.
        """
        self.data_root_dir = data_root_dir
        self.transform = transform
        if folder_labels is None:
            # Backward-compatible default: fall back to the notebook-level mapping.
            folder_labels = image_folder_names_to_labels
        self.folder_labels = folder_labels
        self.samples = []
        for class_label_int in labels_dict.values():
            self.get_images_and_labels(class_label_int)

    def get_images_and_labels(self, class_label_int: int):
        """Append (image_path, label) to self.samples for every image of one class."""
        for folder_name, class_label in self.folder_labels.items():
            if class_label != class_label_int:
                continue
            folder_path = self.data_root_dir / folder_name
            # os.listdir returns entries in arbitrary order. Sorting fixes the
            # index -> sample mapping, which is what makes a seeded random_split
            # reproducible across machines and filesystems.
            for image_name in sorted(os.listdir(folder_path)):
                if image_name.lower().endswith(self.IMAGE_EXTENSIONS):
                    self.samples.append((folder_path / image_name, class_label_int))

    def __len__(self):
        """Return the total number of samples"""
        return len(self.samples)

    def __getitem__(self, idx: int):
        """
        Get one sample
        Returns:
            image: Transformed image tensor
            label: 0 for real, 1 for synthetic
        """
        image_path, label = self.samples[idx]
        # convert() loads the pixel data and returns an independent image, so the
        # file can be closed right away. Forcing RGB also means RGBA or greyscale
        # files cannot break the 3-channel normalisation.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
        transformed_image_tensor = self.transform(image)
        assert transformed_image_tensor.shape[0] == 3, "Unexpected number of channels; expected 3 for RGB."
        return transformed_image_tensor, label
        

# Index every labelled image folder; the cell output is the total sample count.
full_dataset = FaceImageDataset(
    data_root_dir=data_root_dir,
    transform=image_transforms,
    labels_dict=labels_dict,
)
len(full_dataset)

In [None]:
# Start with a train/val/test split of 0.6/0.2/0.2 so that the validation and
# test sets each hold just over 1000 images.

# Work out all three sizes up front; integer rounding leftovers go to val/test.
train_size = int(0.6 * len(full_dataset))
val_test_size = len(full_dataset) - train_size
val_size = val_test_size // 2
test_size = val_test_size - val_size

# Stage 1: carve off the training set; the remainder is shared by val and test.
train_dataset, val_test_dataset = random_split(
    full_dataset,
    [train_size, val_test_size],
    generator=Generator().manual_seed(10),
)

# Stage 2: halve the remainder (test receives the extra sample when it is odd).
val_dataset, test_dataset = random_split(
    val_test_dataset,
    [val_size, test_size],
    generator=Generator().manual_seed(10),
)

print(
    f"\nTrain size: {len(train_dataset)}",
    f"Val size: {len(val_dataset)}",
    f"Test size: {len(test_dataset)}",
    sep="\n",
)

In [None]:
batch_size = 64

# Only the training loader shuffles. Evaluation does not benefit from a random
# order, and a fixed order keeps validation/test runs comparable between epochs.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Usage sketch: `for batch_images, batch_labels in train_dataloader: ...`

In [None]:
# Display one training image and its label
train_features, train_labels = next(iter(train_dataloader))
print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")
# [C, H, W] tensor -> [H, W, C] array, the layout matplotlib expects
img = train_features[0].numpy().transpose(1, 2, 0)
# The batch was normalised with the CLIP mean/std, so its values fall outside
# the [0, 1] range that imshow can display. Invert the normalisation first,
# then clip away any floating-point overshoot.
img = np.clip(img * np.array(clip_std) + np.array(clip_mean), 0.0, 1.0)
label = train_labels[0]
plt.imshow(img)
plt.show()
print(f"Label: {label}")

In [None]:
def view_transformed_images(image_paths: list, transform, num_samples=3, seed=10, mean=None, std=None):
    """
    Plot randomly chosen images next to their transformed versions.

    Args:
        image_paths: Paths of the candidate images.
        transform: Callable that maps an RGB PIL image to a [C, H, W] tensor.
        num_samples: Number of images to show; capped at len(image_paths).
        seed: Seed for the random choice of images.
        mean: Optional normalisation mean used by `transform`.
        std: Optional normalisation std used by `transform`. When both `mean`
            and `std` are given, the normalisation is undone before plotting
            so the colours are displayable.
    """
    # A private RNG makes the same seeded draw as random.seed(seed) followed by
    # random.sample, without resetting the global random state for later cells.
    rng = random.Random(seed)
    # Never ask for more samples than there are paths (random.sample would raise).
    sample_count = min(num_samples, len(image_paths))
    random_image_paths = rng.sample(image_paths, k=sample_count)
    for image_path in random_image_paths:
        with Image.open(image_path) as f:
            fig, ax = plt.subplots(1, 2)
            ax[0].imshow(f)
            ax[0].set_title(f"Original \nSize: {f.size}")
            ax[0].axis("off")

            # Transform and plot image
            # Note: permute() will change shape of image to suit matplotlib
            # (PyTorch default is [C, H, W] but Matplotlib is [H, W, C])
            # The image is forced to RGB so a 3-channel Normalize step always applies.
            transformed_image = transform(f.convert("RGB")).permute(1, 2, 0)
            if mean is not None and std is not None:
                # Inverse of Normalize: x * std + mean, broadcast over the last
                # (channel) dimension; clamp removes floating-point overshoot.
                channel_std = transformed_image.new_tensor(std)
                channel_mean = transformed_image.new_tensor(mean)
                transformed_image = (transformed_image * channel_std + channel_mean).clamp(0, 1)
            ax[1].imshow(transformed_image)
            ax[1].set_title(f"Transformed \nSize: {transformed_image.shape}")
            ax[1].axis("off")

            fig.suptitle(f"Class: {image_path.parent.stem}", fontsize=16)

folder_path = data_root_dir / "ffhq_real_faces"
# os.listdir order is arbitrary, so sort the names to make the seeded sample
# pick the same files on every machine. Entries that are not .png/.jpg are
# skipped because they cannot be opened as images.
image_names = sorted(
    image_name
    for image_name in os.listdir(folder_path)
    if image_name.lower().endswith((".png", ".jpg"))
)
image_paths = [folder_path / image_name for image_name in image_names]

# Normalised tensors fall outside the [0, 1] range that imshow can display, so
# the preview runs every step of the pipeline except Normalize.
display_transforms = transforms.Compose(
    [step for step in image_transforms.transforms if not isinstance(step, transforms.Normalize)]
)

view_transformed_images(image_paths, display_transforms)

Next steps:
- apply more transforms; also, do we need to normalise after converting to tensor?
- dealing with class imbalance
- K-fold cross-validation? Since we don't have a huge amount of data
- performance metrics
- batch size (depends on GPU memory capacity)