In [6]:
import os
import shutil
from PIL import Image
from tqdm import tqdm
import operator

In [None]:
"""
The first step: take the 15 biggest classes of the original dataset, resize their images, and save them in a brand-new directory,
so the original dataset is never altered and we avoid any errors :)
"""

# Root folder of the original dataset; it is expected to hold one sub-folder per class.
DATASET_PATH = "dataset/VGGFace2/VGGFace2"

# Destination of the reduced dataset (selected classes only, images resized).
# NOTE: this folder is deleted and recreated each time the preprocessing runs.
OUTPUT_DATASET_PATH = "dataset/VGGFace2_top15_classes"
# How many of the largest classes (by image count) are kept.
NUM_CLASSES_TO_SELECT = 15

# Every image is resized to this common size so that all model inputs share one shape.
IMAGE_SIZE = (224, 224)

"""
Of course we need our dataset to be balanced, meaning that each class/folder must have the same number of images so the model is not biased towards a certain class.
So we need this function to count the number of images in each class and return a dictionary mapping each class to the number of images inside it.
"""
def get_class_image_counts(dataset_path):
    """Count the image files inside every class folder of a dataset.

    Args:
        dataset_path: Directory containing one sub-folder per class.

    Returns:
        dict mapping class folder name -> number of files whose name ends in
        .png/.jpg/.jpeg (case-insensitive). Folders without any such file are
        left out. An empty dict is returned (after printing an error) when
        ``dataset_path`` is not a directory.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg')
    counts_by_class = {}

    if os.path.isdir(dataset_path):
        # Only sub-directories are treated as classes; loose files are ignored.
        subfolders = []
        for entry in os.listdir(dataset_path):
            if os.path.isdir(os.path.join(dataset_path, entry)):
                subfolders.append(entry)
        print(f"Scanning {len(subfolders)} class folders...")

        # tqdm only wraps the iteration to show a progress bar.
        for folder in tqdm(subfolders, desc="Scanning classes"):
            folder_path = os.path.join(dataset_path, folder)
            image_total = sum(
                1
                for filename in os.listdir(folder_path)
                if filename.lower().endswith(image_suffixes)
            )
            if image_total:
                counts_by_class[folder] = image_total
    else:
        print(f"Error: Dataset path not found - {dataset_path}")

    return counts_by_class

"""
We had to use only 15 classes out of the 400-and-something classes because that number was huge and our poor laptops would take ages to train the models
and reach a good accuracy on a dataset that large; we already tried it, and it took more than 3 days and we only got an accuracy of 0.02% :') so disappointing.
We also need to pick the classes that have the most images so the model can learn well; that's why this function is here.
"""
def select_top_classes(class_counts, num_classes):
    """Return the names of the classes that contain the most images.

    Args:
        class_counts: dict mapping class name -> image count.
        num_classes: How many class names to return.

    Returns:
        List of class names ordered from largest to smallest count, cut to the
        first ``num_classes`` entries. Classes with equal counts keep the order
        they have in ``class_counts``. An empty list is returned (after
        printing a message) when ``class_counts`` is empty.
    """
    if len(class_counts) == 0:
        print("No classes found to select from.")
        return []

    # Rank the class names directly by their count; the sort is stable, so
    # ties stay in dictionary order even with reverse=True.
    ranked_names = sorted(class_counts, key=class_counts.get, reverse=True)

    # Keep only the leading `num_classes` names.
    return ranked_names[:num_classes]

# --- 3. Preprocess and Save Images for Selected Classes ---

def preprocess_and_save_images(dataset_path, output_path, selected_classes, image_size):
    """Write resized RGB copies of all images of the selected classes.

    For every class in ``selected_classes`` a folder with the same name is
    created under ``output_path`` and each .png/.jpg/.jpeg file of the class
    is converted to RGB, resized to ``image_size`` and saved under its
    original file name.

    Args:
        dataset_path: Root of the source dataset (one sub-folder per class).
        output_path: Destination root. If it already exists it is DELETED and
            recreated, so stale files from a previous run cannot survive.
        selected_classes: Iterable of class folder names to process.
        image_size: Target size passed to ``Image.resize``.

    Raises:
        ValueError: If ``output_path`` is the source dataset or a folder that
            contains it; deleting it would destroy the source images.

    Images that cannot be processed are reported and skipped (best effort).
    """
    # Safety check before the destructive rmtree below.
    source_root = os.path.realpath(dataset_path)
    output_root = os.path.realpath(output_path)
    if source_root == output_root or source_root.startswith(output_root + os.sep):
        raise ValueError(
            f"Output path {output_path} would overwrite the source dataset {dataset_path}."
        )

    # Start from an empty output folder to avoid mixing results of different runs.
    if os.path.exists(output_path):
        print(f"Output directory {output_path} already exists. Removing it.")
        shutil.rmtree(output_path)
    os.makedirs(output_path)

    print(f"\nProcessing and saving images for {len(selected_classes)} selected classes...")

    image_extensions = ('.png', '.jpg', '.jpeg')
    failed_count = 0

    # Mirror each selected class folder in the output dataset.
    for class_name in tqdm(selected_classes, desc="Processing images"):
        class_path = os.path.join(dataset_path, class_name)
        output_class_path = os.path.join(output_path, class_name)
        os.makedirs(output_class_path, exist_ok=True)

        image_files = [f for f in os.listdir(class_path) if f.lower().endswith(image_extensions)]

        for image_name in image_files:
            # Built outside the try so the error message can always reference it.
            image_path = os.path.join(class_path, image_name)
            try:
                with Image.open(image_path) as img:
                    # Convert BEFORE resizing: Pillow falls back to NEAREST
                    # resampling for palette ("P") and bilevel ("1") images,
                    # so resizing first would silently ignore LANCZOS.
                    rgb_image = img if img.mode == 'RGB' else img.convert('RGB')
                    resized_image = rgb_image.resize(image_size, Image.LANCZOS)
                    resized_image.save(os.path.join(output_class_path, image_name))
            except Exception as e:
                # Best effort: one unreadable file must not abort the whole run.
                failed_count += 1
                print(f"Error processing image {image_path}: {e}")

    print("\nImage preprocessing complete.")
    if failed_count:
        print(f"Skipped {failed_count} image(s) that could not be processed.")
    print(f"Processed dataset is saved at: {output_path}")


In [None]:
# lets test these functions out

if __name__ == '__main__':
    # Step 1: count the images in every class folder of the original dataset.
    all_class_counts = get_class_image_counts(DATASET_PATH)
    # An empty dict means the path was missing or no class held images,
    # so there is nothing to select or process.
    if all_class_counts:
        # Step 2: keep only the classes with the most images.
        top_classes = select_top_classes(all_class_counts, NUM_CLASSES_TO_SELECT)
        print(f"\nTop {NUM_CLASSES_TO_SELECT} classes selected:")
        for i, class_name in enumerate(top_classes):
            print(f"{i+1}. Class: {class_name}, Images: {all_class_counts[class_name]}")
        # Step 3: write resized RGB copies of the selected classes.
        # This deletes and recreates OUTPUT_DATASET_PATH if it already exists.
        preprocess_and_save_images(DATASET_PATH, OUTPUT_DATASET_PATH, top_classes, IMAGE_SIZE)

Scanning 469 class folders...


Scanning classes: 100%|██████████| 469/469 [00:03<00:00, 148.23it/s]



Top 15 classes selected:
1. Class: Alexa Chung, Images: 720
2. Class: Amy Adams, Images: 689
3. Class: Alex Salmond, Images: 676
4. Class: Andie MacDowell, Images: 658
5. Class: Alberto Núñez Feijóo, Images: 648
6. Class: Bronisław Komorowski, Images: 644
7. Class: Alesha Dixon, Images: 639
8. Class: Aleksander Kwaśniewski, Images: 623
9. Class: Alfredo Pérez Rubalcaba, Images: 617
10. Class: Aléxis Tsípras, Images: 613
11. Class: Amber Heard, Images: 605
12. Class: Adrienne Bailon-Houghton, Images: 603
13. Class: Aleksandra Kwaśniewska, Images: 596
14. Class: Aditi Rao Hydari, Images: 594
15. Class: Boris Tadić, Images: 594

Processing and saving images for 15 selected classes...


Processing images: 100%|██████████| 15/15 [03:30<00:00, 14.02s/it]


Image preprocessing complete.
Processed dataset is saved at: dataset/VGGFace2_top15_classes





In [None]:
import random
import numpy as np
import cv2
import albumentations as A


In [None]:
# Input: the reduced dataset written by the preprocessing step
# (same folder as OUTPUT_DATASET_PATH: top classes only, images already resized).
INPUT_DATASET_PATH = "dataset/VGGFace2_top15_classes"

# Output: the balanced dataset, where smaller classes are topped up with augmented images.
# NOTE: this folder is deleted and recreated each time the balancing runs.
BALANCED_DATASET_PATH = "dataset/VGGFace2_balanced_900_albumentations"

# Number of images every class should contain after balancing.
TARGET_COUNT = 900

"""If a class needs more images, what do we do? We add new augmented versions of randomly selected images from that class. It sounds confusing, but
read it again and you will understand :). All of this is done to artificially create new images: more data, more learning.
"""
# Random augmentation pipeline used to synthesise extra images for classes
# that have fewer images than the target. It is called as
# `augmentation_pipeline(image=...)` and the result is read from the 'image' key.
# Each transform carries its own probability `p`, so a single call may apply
# any combination of them.
# NOTE(review): the units of the limits below (fractions of image size, degrees,
# kernel size in pixels) follow the albumentations documentation — confirm
# against the installed version.
augmentation_pipeline = A.Compose([
    # Mirror the image left/right.
    A.HorizontalFlip(p=0.5),    # probability of 50%
    # Small random translation, zoom and rotation in one transform.
    A.ShiftScaleRotate(
        shift_limit=0.1,
        scale_limit=0.1,
        rotate_limit=10,
        p=0.8                   # probability of 80%
    ),
    # Random brightness/contrast change, applied with probability 70%.
    A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.7),
    # Random Gaussian blur with a kernel size drawn from the given range.
    A.GaussianBlur(blur_limit=(3, 7), p=0.5),   # probability of 50%
])


# Balance every class folder of the input dataset to the same number of images.
def balance_dataset_to_target(input_path, output_path, target_count):
    """Create a copy of a dataset in which every class has ``target_count`` images.

    For each class folder under ``input_path``:
      * classes with more than ``target_count`` images are randomly
        down-sampled to ``target_count`` originals;
      * classes with fewer images get all originals copied and are topped up
        with augmented versions of randomly chosen originals, produced by the
        module-level ``augmentation_pipeline`` and saved as
        ``aug_<index>_<original name>``;
      * classes without any image file are left empty.

    Args:
        input_path: Root of the source dataset (one sub-folder per class).
        output_path: Destination root. If it already exists it is DELETED and
            recreated.
        target_count: Desired number of images per class.

    Raises:
        ValueError: If ``output_path`` is the input dataset or a folder that
            contains it; deleting it would destroy the source images.

    A failed augmentation is reported and retried with another random image
    (up to a bounded number of attempts); a warning is printed if a class
    still ends up below the target.
    """
    # Safety check before the destructive rmtree below.
    input_root = os.path.realpath(input_path)
    output_root = os.path.realpath(output_path)
    if input_root == output_root or input_root.startswith(output_root + os.sep):
        raise ValueError(
            f"Output path {output_path} would overwrite the input dataset {input_path}."
        )

    # Start from an empty output folder so results of earlier runs cannot leak in.
    if os.path.exists(output_path):
        print(f"Output directory {output_path} already exists. Removing it.")
        shutil.rmtree(output_path)
    os.makedirs(output_path)
    print(f"Balancing all classes to a target of {target_count} images per class.\n")

    image_extensions = ('.png', '.jpg', '.jpeg')

    # Every sub-directory of the input dataset is one class.
    class_folders = [d for d in os.listdir(input_path) if os.path.isdir(os.path.join(input_path, d))]

    for class_name in tqdm(class_folders, desc="Balancing classes"):
        input_class_path = os.path.join(input_path, class_name)
        output_class_path = os.path.join(output_path, class_name)
        os.makedirs(output_class_path, exist_ok=True)

        # Only regular image files take part; sub-folders or stray files would
        # otherwise break the copy or the image decoding further down.
        image_files = [
            f for f in os.listdir(input_class_path)
            if f.lower().endswith(image_extensions)
            and os.path.isfile(os.path.join(input_class_path, f))
        ]
        if not image_files:
            continue

        # Too many originals: keep a random subset so the class is not over-represented.
        if len(image_files) > target_count:
            files_to_copy = random.sample(image_files, max(target_count, 0))
        else:
            files_to_copy = image_files

        for image_name in files_to_copy:
            shutil.copy(os.path.join(input_class_path, image_name), os.path.join(output_class_path, image_name))

        num_to_generate = target_count - len(files_to_copy)
        if num_to_generate <= 0:
            continue

        # Too few originals: synthesise the missing images by augmentation.
        image_paths_to_sample = [os.path.join(input_class_path, f) for f in image_files]
        generated = 0
        # Bounded retries: a few broken files must not leave the class short,
        # but a completely unreadable class must not loop forever either.
        attempts_left = 2 * num_to_generate + 10

        while generated < num_to_generate and attempts_left > 0:
            attempts_left -= 1
            # Uniform random pick so no source image is favoured.
            random_image_path = random.choice(image_paths_to_sample)

            try:
                # cv2.imread signals failure by returning None instead of raising.
                img_array = cv2.imread(random_image_path)
                if img_array is None:
                    raise ValueError("cv2.imread could not read the file")

                # OpenCV loads BGR; the pipeline is fed RGB as in the rest of the project.
                img_array = cv2.cvtColor(img_array, cv2.COLOR_BGR2RGB)
                augmented_image_array = augmentation_pipeline(image=img_array)['image']

                # Back to BGR because cv2.imwrite expects that channel order.
                augmented_image_bgr = cv2.cvtColor(augmented_image_array, cv2.COLOR_RGB2BGR)

                # The running index keeps names unique even if one source is picked twice.
                new_image_name = f"aug_{generated}_{os.path.basename(random_image_path)}"
                output_image_path = os.path.join(output_class_path, new_image_name)
                # cv2.imwrite reports failure through its return value.
                if not cv2.imwrite(output_image_path, augmented_image_bgr):
                    raise OSError(f"cv2.imwrite could not write {output_image_path}")
                generated += 1

            except Exception as e:
                print(f"\nError augmenting image {random_image_path}: {e}")

        if generated < num_to_generate:
            print(
                f"\nWarning: class {class_name} only reached "
                f"{len(files_to_copy) + generated} of {target_count} images."
            )

    print("\nDataset balancing complete!")
    # Report the folder that was actually written, not a module-level constant.
    print(f"New balanced dataset is located at: {output_path}")


In [None]:
# lets test all this out
if __name__ == '__main__':
    # Build the balanced dataset, then re-count every class folder to confirm
    # that each one holds exactly TARGET_COUNT entries.
    balance_dataset_to_target(INPUT_DATASET_PATH, BALANCED_DATASET_PATH, TARGET_COUNT)
    print("\nVerifying image counts in the new balanced dataset:")
    all_balanced = True
    for class_name in os.listdir(BALANCED_DATASET_PATH):
        count = len(os.listdir(os.path.join(BALANCED_DATASET_PATH, class_name)))
        print(f"- {class_name}: {count} images")
        # Once a single class misses the target the flag stays False.
        all_balanced = all_balanced and count == TARGET_COUNT

    print(
        "\nSuccess! All classes are perfectly balanced."
        if all_balanced
        else "\nWarning: Some classes did not reach the target count."
    )

Balancing all classes to a target of 900 images per class.



Balancing classes: 100%|██████████| 15/15 [04:56<00:00, 19.77s/it]


Dataset balancing complete!
New balanced dataset is located at: dataset/VGGFace2_balanced_900_albumentations

Verifying image counts in the new balanced dataset:
- Aditi Rao Hydari: 900 images
- Adrienne Bailon-Houghton: 900 images
- Alberto Núñez Feijóo: 900 images
- Aleksander Kwaśniewski: 900 images
- Aleksandra Kwaśniewska: 900 images
- Alesha Dixon: 900 images
- Alex Salmond: 900 images
- Alexa Chung: 900 images
- Alfredo Pérez Rubalcaba: 900 images
- Aléxis Tsípras: 900 images
- Amber Heard: 900 images
- Amy Adams: 900 images
- Andie MacDowell: 900 images
- Boris Tadić: 900 images
- Bronisław Komorowski: 900 images

Success! All classes are perfectly balanced.





In [None]:
# --- 1. Configuration and Setup ---
# Point to your balanced dataset with 900 images per class.
# (BALANCED_DATASET_PATH and IMAGE_SIZE repeat the values used by the earlier
# cells so this part can also be run on its own.)
BALANCED_DATASET_PATH = "dataset/VGGFace2_balanced_900_albumentations"
IMAGE_SIZE = (224, 224)
BATCH_SIZE = 32
NUM_CLASSES = 15
# The official TF Hub handle for the InceptionV1 model
TFHUB_MODEL_HANDLE = "https://tfhub.dev/google/imagenet/inception_v1/feature_vector/5"
# Fixed seed so the shuffled order (and anything derived from it, such as a
# train/validation split) is identical on every run.
SHUFFLE_SEED = 42
# File name endings that count as images, matching the preprocessing step.
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')

# --- 2. Create a Dataset of all File Paths and Shuffle ---
# This is the most memory-efficient way to handle a large dataset.
# We work with file paths first, not the actual image data.


def list_image_paths(root, extensions=IMAGE_EXTENSIONS, seed=SHUFFLE_SEED):
    """Return all image file paths below ``root`` in a reproducible random order.

    Args:
        root: Dataset root; ``~`` is expanded. A missing folder yields ``[]``.
        extensions: File name endings (case-insensitive) accepted as images.
            Other files (e.g. OS metadata such as ``.DS_Store``) are ignored
            so they cannot reach the image decoder later.
        seed: Seed of the private random generator used for shuffling.

    Returns:
        List of file paths. Paths are sorted before shuffling, so the result
        depends only on the folder content and the seed, not on the order in
        which the file system lists the files.
    """
    accepted = tuple(ext.lower() for ext in extensions)
    paths = sorted(
        os.path.join(dir_path, file_name)
        for dir_path, _dir_names, file_names in os.walk(os.path.expanduser(root))
        for file_name in file_names
        if file_name.lower().endswith(accepted)
    )
    # A dedicated Random instance keeps the global `random` state untouched.
    random.Random(seed).shuffle(paths)
    return paths


# Get a shuffled list of all image file paths. Shuffling is a critical step:
# os.walk returns the files grouped class by class.
all_image_paths = list_image_paths(BALANCED_DATASET_PATH)

total_images = len(all_image_paths)
print(f"Found {total_images} total images.")