# Preprocessing The Data

To make the learning procedure smooth and efficient, we first do some preprocessing on the data.

The data is a subset of Post_impressionism folder from <a href="https://www.kaggle.com/datasets/steubk/wikiart">WIKIART</a> collection. <br> You can find the dataset and the csv file at: <a href="https://drive.google.com/drive/folders/1vDTerxVX999kI7wkZHvx4xNGUn5bdPVY?usp=sharing">HERE</a>

In [1]:
import os
import pandas as pd
import numpy as np
import torch
from torch.onnx.symbolic_opset9 import tensor
from torch.utils.data import Dataset
from torchvision import transforms
from torchvision.datasets import ImageFolder
import time

from torchvision.transforms import Compose
from torchvision.transforms.v2.functional import to_pil_image

import utils # Project utilities

In [34]:
# Load our CSV file of image labels into a DataFrame (path comes from utils.CSV_PATH)
classes = pd.read_csv(utils.CSV_PATH)
classes.head()  # Last expression of the cell: previews the first rows

Unnamed: 0,filename,artist,genre,description,phash,width,height,genre_count,subset,is_van_gogh
0,Post_Impressionism/edouard-cortes_the-theater-...,edouard cortes,['Post Impressionism'],the-theater-of-the-comedie-francaise,9491ada9caf05cf1,1675,1382,1,train,0
1,Post_Impressionism/edouard-cortes_theatre-du-c...,edouard cortes,['Post Impressionism'],theatre-du-chatelet-1,c7d69030996f36e4,1896,1382,1,train,0
2,Post_Impressionism/edouard-vuillard_boulevard-...,edouard vuillard,['Post Impressionism'],boulevard-of-battignolles,eb7214d866c638b5,1688,1382,1,train,0
3,Post_Impressionism/edouard-vuillard_figures-ea...,edouard vuillard,['Post Impressionism'],figures-eating-in-a-garden-by-the-water,d3272568d0a95e3d,1684,1382,1,train,0
4,Post_Impressionism/edouard-vuillard_sacha-guit...,edouard vuillard,['Post Impressionism'],sacha-guitry-in-his-dressing-room-1912,dae0254af31a6fa4,1818,1382,1,train,0


We want to make sure that all the images in our dataset are labeled in the csv file:

In [55]:
prefix = 'Post_Impressionism/'
filenames = classes['filename'].values
# Build the lookup once as a set: every membership test below is then a hash
# lookup instead of a scan over the whole 'filename' column.
labeled_paths = set(filenames)
# Collect the files on disk that have no matching row in the CSV
# (CSV paths carry the 'Post_Impressionism/' prefix, directory entries do not).
files_not_in_csv = [
    filename
    for filename in os.listdir(utils.DATASET_DIR)
    if prefix + filename not in labeled_paths
]
print(f'We have {len(files_not_in_csv)} files that are not present in the dataset.')

We have 120 files that are not present in the dataset.


It seems that 120 images have no label in the CSV file. Since we have a large dataset and we only focus on Van Gogh paintings, we'll delete those images.

In [57]:
# Delete every image that has no row in the CSV. NOTE: this is irreversible.
for filename in files_not_in_csv:
    try:
        os.remove(os.path.join(utils.DATASET_DIR, filename))
    except FileNotFoundError:
        # Already gone (e.g. this cell was run twice) - the desired end state holds
        continue
print('Successfully removed files from the dataset.')

Successfully removed files from the dataset.


Now we need to implement a convenient way to access our data, so we created a class that handles the data loading for us:

In [2]:
def map_labels(root, target):
    """
    Reads 'classes.csv' from the root directory and maps every image path to its label.

    The 'filename' column stores relative paths written with forward slashes. Each one is
    split on '/' and joined onto ``root`` with ``os.path.join``, so the keys use the path
    separator of the operating system the code runs on (backslashes on Windows, forward
    slashes elsewhere) instead of hard-coded backslashes.
    NOTE(review): the keys are meant to match the sample paths of ``ImageFolder``, which is
    assumed to build them with ``os.path.join`` as well - confirm against torchvision.

    :param root:
        The root directory containing the 'classes.csv' file and the image folders the
        'filename' column refers to.

    :param target:
        The column name in 'classes.csv' whose values are used as labels.

    :return:
        A dictionary where keys are file paths under ``root`` and values are the label
        values taken from the ``target`` column.

    :raises KeyError:
        If 'classes.csv' has no 'filename' column or no column named ``target``.
    """
    classes_df = pd.read_csv(os.path.join(root, 'classes.csv'))  # Read CSV file
    # Walk the two columns in lockstep; cheaper than building a Series per row with iterrows()
    return {
        os.path.join(root, *filename.split('/')): label
        for filename, label in zip(classes_df['filename'], classes_df[target])
    }

class ImageFolderForBinaryClassification(ImageFolder):
    """
    Extends the ImageFolder class to support binary classification with custom label mapping.

    ImageFolder labels every image with the index of the sub-folder it lives in. This class
    replaces those labels with the values of one column of 'classes.csv' (see ``map_labels``),
    looked up by image path. Images whose path is missing from the CSV get the label -1.

    :ivar target: The name of the CSV column used as the label.
    :type target: str
    :ivar transform: Transformation to be applied on images, such as resizing, normalization, etc.
    :type transform: Callable or None
    :ivar samples: List of (path, label) tuples where label comes from the CSV column.
    :type samples: list
    :ivar imgs: Same list object as ``samples`` (kept in sync with it).
    :type imgs: list
    :ivar targets: The label of every entry in ``samples``, in the same order.
    :type targets: list
    """
    def __init__(self, root, target, transform=None,):
        super().__init__(root, transform=transform)
        self.target = target
        label_mapping = map_labels(root, target)
        self.__pre_process_data(label_mapping) # Apply pre processing

    def _set_samples(self, samples):
        """
        Installs a new sample list and refreshes the attributes derived from it.

        ``imgs`` and ``targets`` are set by the parent class from the folder-based labels;
        they are rewritten here so they never disagree with ``samples``.

        :param samples: List of (path, label) tuples.
        :type samples: list
        :return: None
        """
        self.samples = samples
        self.imgs = samples
        self.targets = [label for _, label in samples]

    def __pre_process_data(self, label_mapping):
        """
        Pre-processes the data by updating the labels of the sample dataset based on the given
        label mapping dictionary. If a path is not found in the dictionary, a default label of -1 is used.

        :param label_mapping: A dictionary where keys are sample paths and values are corresponding
                              labels to be assigned.
        :type label_mapping: dict
        :return: None
        """
        relabeled = [
            (path, label_mapping.get(path, -1))  # -1 marks a path that is missing from the CSV
            for path, _ in self.samples
        ]
        self._set_samples(relabeled)

    # probably redundant
    def get_subset_by_indices(self, indices):
        """
        Returns a subset of the dataset using the specified indices.

        A fresh dataset is built from ``self.root`` (the folder is re-scanned and the CSV is
        re-read), so the indices refer to positions in that full dataset, not to positions in
        a dataset that is itself already a subset.

        :param indices: Positions of the samples to keep, in the order they should appear.
        :type indices: Iterable[int]
        :return: A new dataset that holds only the selected samples.
        :rtype: ImageFolderForBinaryClassification
        """
        subset = ImageFolderForBinaryClassification(self.root, self.target, transform=self.transform)
        subset._set_samples([subset.samples[i] for i in indices])
        return subset

In [3]:
# Baseline preprocessing pipeline and the dataset that applies it
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),  # Bring every image to 224x224
        transforms.ToTensor(),  # Convert the image to a PyTorch tensor
        # Normalize the pixel values based on ImageNet statistics
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

dataset = ImageFolderForBinaryClassification(
    transform=transform,
    target='is_van_gogh',
    root=utils.DATA_DIR,
)

After working with our loader we noticed that it takes a lot of time to load the pictures. Let's see how much time a full epoch over the dataset takes:

In [14]:
# Time one full pass over the dataset: every sample is fetched and then discarded
start_time = time.time()
for sample in dataset:
    pass
print(f'Epoch time: {(time.time() - start_time)/60:.1f} minutes')  # elapsed seconds / 60

Epoch time: 4.1 minutes


It takes about 4 minutes to perform a single pass over the dataset. As a result, we decided to optimize our dataset by converting the transformed images into NumPy format:

In [7]:
def optimize_dataset(dataset, output_name):
    """
    Convert a given dataset into a compressed NumPy file. Every (image, label) pair of the
    dataset is read once, the image is converted to a NumPy array, and all images and labels
    are saved together in a single `.npz` archive under the keys 'images' and 'labels'.

    The output directory is created up front if it does not exist, so a missing directory
    cannot make the save fail after the (slow) conversion loop has already finished.

    :param dataset: The input dataset containing image data and their corresponding labels. Images
        are expected to be in Tensor format (``.numpy()`` is called on them), while labels can be
        in any format compatible with NumPy. Must support ``len()`` and iteration.
    :type dataset: Iterable

    :param output_name: The name of the output file, excluding the file extension, in which the
        optimized dataset will be saved. The file will be saved in the directory defined by
        `utils.OPTIMIZED_DIR` with the file extension `.npz`.
    :type output_name: str

    :return: The function does not return any value. The optimized dataset is saved directly to a
        compressed `.npz` file in the specified output directory.
    :rtype: None
    """
    # Fail-safe first: make sure the destination exists before doing the expensive work
    os.makedirs(utils.OPTIMIZED_DIR, exist_ok=True)
    path = f"{utils.OPTIMIZED_DIR}/{output_name}.npz"

    images = []
    labels = []
    n = len(dataset)
    print(f"Converting {n} images into NumPy format...")
    for i, (data, label) in enumerate(dataset, start=1):
        utils.show_optimization_progress(i, n)  # i is the 1-based count of processed samples
        images.append(data.numpy())  # Convert Tensor to NumPy
        labels.append(label)
    np.savez_compressed(path, images=np.array(images), labels=np.array(labels))
    print(f"Saved dataset to {path}")

optimize_dataset(dataset, 'dataset')

Converting 4551 images into NumPy format...
Optimizing dataset... 100.00%
Saved dataset to ./data/optimized/dataset.npz


Then we created a class to decode our data back from NumPy format into torch tensors:

In [39]:
class NumPyDataset(Dataset):
    """
    Dataset backed by a `.npz` archive that holds an 'images' array and a 'labels' array.

    Both arrays are read fully into memory when the object is created. Items are converted
    to tensors on access: images as ``torch.float32`` and labels as ``torch.long``.

    :ivar images: The 'images' array of the archive.
    :type images: numpy.ndarray
    :ivar labels: The 'labels' array of the archive.
    :type labels: numpy.ndarray
    """
    def __init__(self, file_path):
        """
        :param file_path: Path of the `.npz` archive to load.
        :raises KeyError: If the archive has no 'images' or 'labels' entry.
        """
        # For .npz files np.load returns an archive object that keeps the file open;
        # the context manager closes it once both arrays have been read into memory.
        with np.load(file_path) as data:
            self.images = data["images"]
            self.labels = data["labels"]

    def __len__(self):
        """Return the number of images in the dataset."""
        return len(self.images)

    def __getitem__(self, idx):
        """
        Return the (image, label) pair at position ``idx`` as a pair of tensors.

        ``torch.tensor`` copies the data, so the returned tensors do not share memory
        with the arrays held by the dataset.
        """
        x = torch.tensor(self.images[idx], dtype=torch.float32)
        y = torch.tensor(self.labels[idx], dtype=torch.long)
        return x, y

# Read the converted arrays back from disk into an in-memory dataset
optimized_dataset = NumPyDataset(file_path=f"{utils.OPTIMIZED_DIR}/dataset.npz")

Let's see how much it improved our performance:

In [40]:
# Same measurement as before, this time over the in-memory NumPy-backed dataset
start_time = time.time()
for sample in optimized_dataset:
    pass
print(f'Epoch time: {time.time() - start_time:.1f} seconds')  # elapsed wall-clock seconds

Epoch time: 0.5 seconds


It took us 0.5 SECONDS! to perform a complete pass over the entire dataset. Compared to the previous approach, which took about 4 minutes = 240 seconds, our optimized dataset is roughly 240/0.5 = 480 times faster.

Now we will optimize our data augmentations:

In [8]:
def transform_wrapper(composed_transform: transforms.Compose):
    """
    Appends the ImageNet preprocessing steps to a given augmentation pipeline.

    The steps of ``composed_transform`` come first, followed by ToTensor, Resize to 224x224
    and Normalize with the ImageNet mean / std. ToTensor is left out of the appended part
    when the given pipeline already contains a ToTensor step.
    :param composed_transform: A composite transformation object containing a list
        of transformations to be applied sequentially.
    :type composed_transform: transforms.Compose
    :return: A new composite transformation object with ImageNet preprocessing
        steps integrated.
    :rtype: transforms.Compose
    """
    user_steps = list(composed_transform.transforms)

    # Steps that are appended in every case
    resize_and_normalize = [
        transforms.Resize((224, 224)),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229,0.224, 0.225]),  # ImageNet statistics
    ]

    # Only convert to a tensor here if the caller's pipeline has not done so already
    already_tensor = any(isinstance(step, transforms.ToTensor) for step in user_steps)
    if already_tensor:
        appended = resize_and_normalize
    else:
        appended = [transforms.ToTensor(), *resize_and_normalize]

    return transforms.Compose(user_steps + appended)

In [9]:
# Geometric augmentation: random vertical flip, random rotation, random perspective warp
flip_transform = transforms.Compose(
    [
        transforms.RandomVerticalFlip(p=0.5),
        transforms.RandomRotation(degrees=30),
        transforms.RandomPerspective(distortion_scale=0.3, p=0.5),
    ]
)
# Build the augmented dataset and store it as 'flip.npz'
dataset_to_optimize = ImageFolderForBinaryClassification(
    transform=transform_wrapper(flip_transform),
    target='is_van_gogh',
    root=utils.DATA_DIR,
)
optimize_dataset(dataset=dataset_to_optimize, output_name='flip')

Converting 4551 images into NumPy format...
Optimizing dataset... 100.00%
Saved dataset to ./data/optimized/flip.npz


In [10]:
# "Dropout"-style augmentation on tensors: repeated random erasing, then grayscale
dropout_transform = transforms.Compose(
    [
        transforms.ToTensor(),
        # The same RandomErasing object is listed 25 times, so it is applied 25 times in a row
        *(25 * [transforms.RandomErasing(p=0.5, scale=(0.01, 0.01), ratio=(1, 1))]),
        transforms.Grayscale(num_output_channels=3),  # Grayscale, but still 3 channels
    ]
)
# Build the augmented dataset and store it as 'dropout.npz'
dataset_to_optimize = ImageFolderForBinaryClassification(
    transform=transform_wrapper(dropout_transform),
    target='is_van_gogh',
    root=utils.DATA_DIR,
)
optimize_dataset(dataset=dataset_to_optimize, output_name='dropout')

Converting 4551 images into NumPy format...
Optimizing dataset... 100.00%
Saved dataset to ./data/optimized/dropout.npz


In [14]:
# Affine augmentation (rotation, translation, scaling, shear) followed by random equalization
affine_transform = transforms.Compose(
    [
        transforms.RandomAffine(degrees=180, translate=(0.1, 0.1), scale=(0.8, 1.2), shear=(10,10)),
        transforms.RandomEqualize(p=0.5),
    ]
)
# Build the augmented dataset and store it as 'affine.npz'
dataset_to_optimize = ImageFolderForBinaryClassification(
    transform=transform_wrapper(affine_transform),
    target='is_van_gogh',
    root=utils.DATA_DIR,
)
optimize_dataset(dataset=dataset_to_optimize, output_name='affine')

Converting 4551 images into NumPy format...
Optimizing dataset... 100.00%
Saved dataset to ./data/optimized/affine.npz


In [13]:
# Photometric augmentation: color jitter followed by a Gaussian blur
blur_transform = transforms.Compose(
    [
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
        transforms.GaussianBlur(kernel_size=(3,3), sigma=(0.1, 2)),
    ]
)
# Build the augmented dataset and store it as 'blur.npz'
dataset_to_optimize = ImageFolderForBinaryClassification(
    transform=transform_wrapper(blur_transform),
    target='is_van_gogh',
    root=utils.DATA_DIR,
)
optimize_dataset(dataset=dataset_to_optimize, output_name='blur')

Converting 4551 images into NumPy format...
Optimizing dataset... 100.00%
Saved dataset to ./data/optimized/blur.npz
