# 0. Gather information about the dataset

Gather information about the dataset.
- Number of images in test set and train set
- Number of different formats
- Number of cats and dogs in train and test set
- Different types of breeds in each set and their count
- Distinct image sizes in each set and their counts

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import os

In [4]:
# Dataset directories, given relative to the notebook's working directory.
# The "color" folders hold the images and the "label" folders hold the
# matching label files (same file name, stored as .png).
test_path = "../Dataset/Test/color"
test_label_path = "../Dataset/Test/label"
train_path = "../Dataset/TrainVal/color"
train_label_path = "../Dataset/TrainVal/label"

## Number of images in test set and train set

In [5]:
import numpy as np
import matplotlib.pyplot as plt
import os

from PIL import Image

Number of images in test set: 3694
Number of images in train set: 3673


In [2]:
# NOTE(review): this cell re-assigns the dataset paths with the same values
# as the configuration cell near the top of the notebook.
# "color" folders hold the images; "label" folders hold the label files.
test_path = "../Dataset/Test/color"
test_label_path = "../Dataset/Test/label"
train_path = "../Dataset/TrainVal/color"
train_label_path = "../Dataset/TrainVal/label"

In [6]:
def get_file_format(filename):
    """Return the text after the final "." in ``filename``.

    A name that contains no "." is returned unchanged.
    """
    _stem, _dot, extension = filename.rpartition(".")
    return extension

def get_file_dimensions(filename):
    """Read the image stored at ``filename`` and return the shape of the loaded array."""
    pixels = plt.imread(filename)
    return pixels.shape
    

def get_breed_name(filename):
    """Return the breed part of ``filename``: everything before its last "_".

    The segment after the last underscore is assumed to be the image ID.
    A name without any underscore yields an empty string.
    """
    breed, _separator, _image_id = filename.rpartition("_")
    return breed

def count_file_formats_dimensions_and_breeds(image_dir):
    """
    Tally file formats, image shapes and breed names for every entry in a directory.

    Args:
        image_dir (str): Path to the dataset directory.

    Returns:
        tuple: Three dictionaries mapping a value to its number of occurrences,
               in this order: file formats, image shapes, breed names.
    """
    formats, shapes, breeds = {}, {}, {}

    for entry in os.listdir(image_dir):
        # Derive all three keys first, then bump the matching counter of each
        observations = (
            (formats, get_file_format(entry)),
            (shapes, get_file_dimensions(os.path.join(image_dir, entry))),
            (breeds, get_breed_name(entry)),
        )
        for tally, key in observations:
            tally[key] = tally.get(key, 0) + 1

    return formats, shapes, breeds


## 0.1 Get classes in train and test set, and invalid images

In [7]:
def classify_image(image_path):
    """
    Classifies an image as either 'cat' or 'dog' based on pixel colors in the label.
    If the image is invalid, returns (0, 0).

    - Red pixels (R > 0, G = 0, B = 0, A = 1) → Cat
    - Green pixels (R = 0, G > 0, B = 0, A = 1) → Dog
    - White pixels (1,1,1,1) → Silhouette
    - Black pixels (0,0,0,1) → Background
    - Any other pixel values → Invalid

    A label is also invalid when it is not a 3-D array with four (RGBA)
    channels, or when it contains neither red nor green pixels.

    Args:
        image_path (str): Path to the image label file.

    Returns:
        tuple: (is_cat, is_dog)
               is_cat, is_dog: Either 1 or 0.
               If the image is invalid, both values will be 0.
    """
    # Load the image label
    # NOTE(review): the comparisons against 1 below assume channel values are
    # floats in [0, 1], which is what matplotlib returns for PNG files - confirm
    # that every label is a PNG.
    label = plt.imread(image_path)

    # The label must be height x width x RGBA. Checking ndim as well keeps a
    # 2-D (grayscale) array whose width happens to be 4 from reaching the
    # channel indexing below, where it would raise an IndexError.
    if label.ndim != 3 or label.shape[-1] != 4:
        return 0, 0  # Invalid image due to missing alpha channel

    # Extract RGBA channels
    R, G, B, A = label[:, :, 0], label[:, :, 1], label[:, :, 2], label[:, :, 3]

    # Masks for valid colors (all of them require a fully opaque pixel)
    opaque = A == 1
    black_mask = (R == 0) & (G == 0) & (B == 0) & opaque
    white_mask = (R == 1) & (G == 1) & (B == 1) & opaque
    red_mask = (R > 0) & (G == 0) & (B == 0) & opaque
    green_mask = (R == 0) & (G > 0) & (B == 0) & opaque

    # Check if there are any red or green pixels
    has_cat = np.any(red_mask)
    has_dog = np.any(green_mask)

    # Invalid when any pixel falls outside the four accepted colors, or when
    # the label marks neither a cat nor a dog
    valid_mask = white_mask | black_mask | red_mask | green_mask
    is_invalid = not np.all(valid_mask) or (not has_cat and not has_dog)

    if is_invalid:
        return 0, 0

    return int(has_cat), int(has_dog)


### 0.1.1 Get classes and invalid images in test set

In [8]:
# Class totals for the test split, filled in by the loop below
cat_dog_count_test = {"cat": 0, "dog": 0}

# Tallies of file formats, image shapes and breeds over the test images
(
    format_and_count_test,
    dimension_and_count_test,
    breed_and_count_test,
) = count_file_formats_dimensions_and_breeds(test_path)

# Test images whose label was reported as invalid
invalid_test_images = []

for filename in os.listdir(test_path):
    # The label file carries the image's name with a .png extension
    label_name = filename.replace(".jpg", ".png")
    cat, dog = classify_image(os.path.join(test_label_path, label_name))

    if cat or dog:
        cat_dog_count_test["cat"] += cat
        cat_dog_count_test["dog"] += dog
    else:
        # classify_image returns (0, 0) for a label it considers invalid
        invalid_test_images.append(filename)

# Display results
print("Cat/Dog count in test set:", cat_dog_count_test)
print("There are", len(invalid_test_images), "invalid images in the test set:", invalid_test_images)

Cat/Dog count in test set: {'cat': 1203, 'dog': 2491}
There are 0 invalid images in the test set: []


### 0.1.2 Get classes and invalid images in train set

In [6]:
# Class totals for the train split, filled in by the loop below
cat_dog_count_train = {"cat": 0, "dog": 0}

# Tallies of file formats, image shapes and breeds over the train images
(
    format_and_count_train,
    dimension_and_count_train,
    breed_and_count_train,
) = count_file_formats_dimensions_and_breeds(train_path)

# Train images whose label was reported as invalid
invalid_train_images = []

for filename in os.listdir(train_path):
    # The label file carries the image's name with a .png extension
    label_name = filename.replace(".jpg", ".png")
    cat, dog = classify_image(os.path.join(train_label_path, label_name))

    if cat or dog:
        cat_dog_count_train["cat"] += cat
        cat_dog_count_train["dog"] += dog
    else:
        # classify_image returns (0, 0) for a label it considers invalid
        invalid_train_images.append(filename)

# Display results
print("Cat/Dog count in train set:", cat_dog_count_train)
print("There are", len(invalid_train_images), "invalid images in the train set:", invalid_train_images)


Cat/Dog count in train set: {'cat': 1185, 'dog': 2488}
There are 0 invalid images in the train set: []


## 0.2 Summary

In [10]:
# Count the images here, before anything is deleted, so that this cell does
# not depend on variables that are only assigned in a later cell (which
# raises a NameError on Restart & Run All).
number_of_images_in_test = len(os.listdir(test_path))
number_of_images_in_train = len(os.listdir(train_path))

# Images that were not counted as cat or dog by the classification cells
files_with_no_label_test = number_of_images_in_test - cat_dog_count_test["cat"] - cat_dog_count_test["dog"]
files_with_no_label_train = number_of_images_in_train - cat_dog_count_train["cat"] - cat_dog_count_train["dog"]

# Display results
print("Files with no label in test set:", files_with_no_label_test)
print(invalid_test_images)
print("Files with no label in train set:", files_with_no_label_train)
print(invalid_train_images)

# Delete invalid images and their labels.
# WARNING: this permanently removes files from the dataset directories.
for image_dir, label_dir, invalid_images in (
    (test_path, test_label_path, invalid_test_images),
    (train_path, train_label_path, invalid_train_images),
):
    for filename in invalid_images:
        for stale_path in (
            os.path.join(image_dir, filename),
            os.path.join(label_dir, filename.replace(".jpg", ".png")),
        ):
            try:
                os.remove(stale_path)
            except FileNotFoundError:
                # Already removed by an earlier run; keep the cell re-runnable
                pass

number_of_valid_images_in_test = len(os.listdir(test_path))
number_of_valid_images_in_train = len(os.listdir(train_path))

Files with no label in test set: 0
[]
Files with no label in train set: 0
[]


## Summary

In [13]:
# Count the number of images in test and train sets
number_of_images_in_test = len(os.listdir(test_path))
number_of_images_in_train = len(os.listdir(train_path))

# Display counts
print("Number of images in test set:", number_of_images_in_test)
print("Number of images in train set:", number_of_images_in_train)

print("\nFormat and Breed counts in test set:")
print(format_and_count_test)
print(breed_and_count_test)

print("\nFormat and Breed counts in train set:")
print(format_and_count_train)
print(breed_and_count_train)

print("\nCat/dog count in test set: ", cat_dog_count_test)
print("Cat/dog count in train set: ", cat_dog_count_train)

# Images that were not counted as cat or dog by the classification cells
files_with_no_label_test = number_of_images_in_test - cat_dog_count_test["cat"] - cat_dog_count_test["dog"]
files_with_no_label_train = number_of_images_in_train - cat_dog_count_train["cat"] - cat_dog_count_train["dog"]

print("\nFiles with no label in test set: ", files_with_no_label_test)
print("Files with no label in train set: ", files_with_no_label_train)


print("\nDimension counts in test set:")
print(dimension_and_count_test)

print("\nDimension counts in train set:")
print(dimension_and_count_train)

# Number of most frequent shapes to list per split. The heading text is built
# from this value so it always matches the number of rows requested (the
# heading used to say 5 while 20 rows were printed).
num_top_shapes = 20

print("\nThere are {} different shapes of images in the test set and {} in the train set.".format(len(dimension_and_count_test), len(dimension_and_count_train)))
print("The {} most common shapes in the test set are:".format(num_top_shapes))
for shape_and_count in sorted(dimension_and_count_test.items(), key=lambda x: x[1], reverse=True)[:num_top_shapes]:
    print(shape_and_count)
print("The {} most common shapes in the train set are:".format(num_top_shapes))
for shape_and_count in sorted(dimension_and_count_train.items(), key=lambda x: x[1], reverse=True)[:num_top_shapes]:
    print(shape_and_count)

Total number of images in test set:  3694
Total number of images in train set:  3673
Number of valid images in test set:  3694
Number of valid images in train set:  3673

Format and Breed counts in test set:
{'jpg': 3694}
{'pug': 100, 'Siamese': 101, 'leonberger': 100, 'Abyssinian': 99, 'miniature_pinscher': 100, 'wheaten_terrier': 99, 'shiba_inu': 100, 'saint_bernard': 98, 'pomeranian': 100, 'Sphynx': 100, 'Egyptian_Mau': 100, 'British_Shorthair': 100, 'Birman': 100, 'Bombay': 104, 'samoyed': 100, 'Russian_Blue': 100, 'Persian': 99, 'Maine_Coon': 100, 'Bengal': 100, 'english_cocker_spaniel': 104, 'american_pit_bull_terrier': 100, 'keeshond': 99, 'great_pyrenees': 100, 'boxer': 100, 'chihuahua': 100, 'german_shorthaired': 100, 'yorkshire_terrier': 100, 'staffordshire_bull_terrier': 89, 'beagle': 100, 'japanese_chin': 99, 'american_bulldog': 100, 'english_setter': 100, 'Ragdoll': 100, 'basset_hound': 100, 'newfoundland': 104, 'scottish_terrier': 99, 'havanese': 100}

Format and Breed co

# 1. Dataset preprocessing and augmentation 

## 1.1 Dataset preprocessing (convert RGBA to RGB and resize)

### 1.1.1 Remove invalid images

In [8]:
# Images that were not counted as cat or dog by the classification cells
files_with_no_label_test = number_of_images_in_test - cat_dog_count_test["cat"] - cat_dog_count_test["dog"]
files_with_no_label_train = number_of_images_in_train - cat_dog_count_train["cat"] - cat_dog_count_train["dog"]

# Display results
print("Files with no label in test set:", files_with_no_label_test)
print(invalid_test_images)
print("Files with no label in train set:", files_with_no_label_train)
print(invalid_train_images)

# Delete invalid images and their labels.
# WARNING: this permanently removes files from the dataset directories.
for image_dir, label_dir, invalid_images in (
    (test_path, test_label_path, invalid_test_images),
    (train_path, train_label_path, invalid_train_images),
):
    for filename in invalid_images:
        for stale_path in (
            os.path.join(image_dir, filename),
            os.path.join(label_dir, filename.replace(".jpg", ".png")),
        ):
            try:
                os.remove(stale_path)
            except FileNotFoundError:
                # Already removed by an earlier cell or a previous run;
                # keep this cell safe to execute again
                pass

number_of_valid_images_in_test = len(os.listdir(test_path))
number_of_valid_images_in_train = len(os.listdir(train_path))

print("Total number of images in test set: ", number_of_images_in_test)
print("Total number of images in train set: ", number_of_images_in_train)

print("Number of valid images in test set: ", number_of_valid_images_in_test)
print("Number of valid images in train set: ", number_of_valid_images_in_train)

Files with no label in test set: 16
['Egyptian_Mau_167.jpg', 'Egyptian_Mau_20.jpg', 'saint_bernard_78.jpg', 'Egyptian_Mau_177.jpg', 'wheaten_terrier_195.jpg', 'Egyptian_Mau_139.jpg', 'keeshond_7.jpg', 'Egyptian_Mau_129.jpg', 'Abyssinian_34.jpg', 'Persian_259.jpg', 'Egyptian_Mau_191.jpg', 'staffordshire_bull_terrier_2.jpg', 'saint_bernard_60.jpg', 'Egyptian_Mau_145.jpg', 'japanese_chin_199.jpg', 'staffordshire_bull_terrier_22.jpg']
Files with no label in train set: 7
['Egyptian_Mau_165.jpg', 'Egyptian_Mau_162.jpg', 'leonberger_18.jpg', 'miniature_pinscher_14.jpg', 'saint_bernard_15.jpg', 'Egyptian_Mau_196.jpg', 'saint_bernard_108.jpg']
Total number of images in test set:  3710
Total number of images in train set:  3680
Number of valid images in test set:  3694
Number of valid images in train set:  3673


### 1.1.2 Convert RGBA to RGB

We noticed that some images are in RGBA mode; they need to be converted to RGB before they can be processed.

In [35]:
# Randomly remove half of the images (and their labels) of each cat/dog breed.
# WARNING: this permanently deletes files, and every execution of this cell
# removes the same fraction of whatever is still on disk.
from pathlib import Path

REMOVE_FRACTION = 0.5      # share of each breed's images to delete
MIN_IMAGES_PER_BREED = 10  # breeds with fewer images are left untouched
SUBSAMPLE_SEED = 0         # fixed seed so the selection is reproducible


def _remove_fraction_per_breed(image_dir, label_dir, breed_counts, split_name, rng,
                               fraction=REMOVE_FRACTION, min_count=MIN_IMAGES_PER_BREED):
    """Delete a random ``fraction`` of every breed's images and labels in one split.

    Args:
        image_dir (str): Directory holding the images.
        label_dir (str): Directory holding the matching .png labels.
        breed_counts (dict): Breed name -> number of images of that breed.
        split_name (str): Name used in the progress messages ("test"/"train").
        rng (numpy.random.Generator): Source of randomness for the selection.
        fraction (float): Share of each breed's images to delete.
        min_count (int): Breeds with fewer images than this are skipped.
    """
    for breed, count in breed_counts.items():
        if count < min_count:
            print("skiping breed: ", breed)
            continue

        # The glob pattern can also match another breed that shares this
        # prefix, so keep exact breed matches only *before* sampling. Sorting
        # makes the selection independent of the directory listing order.
        breed_files = sorted(
            path.name
            for path in Path(image_dir).glob(breed + "*")
            if get_breed_name(path.name) == breed
        )

        delete_number = int(len(breed_files) * fraction)
        # Pick delete_number distinct positions in breed_files
        doomed_positions = rng.permutation(len(breed_files))[:delete_number]

        for position in sorted(doomed_positions):
            filename = breed_files[position]
            os.remove(os.path.join(image_dir, filename))
            os.remove(os.path.join(label_dir, filename.replace(".jpg", ".png")))
            print("removed from " + split_name + " set: ", filename)


# Current number of images per breed in each split
_, _, breeds_test = count_file_formats_dimensions_and_breeds(test_path)
_, _, breeds_train = count_file_formats_dimensions_and_breeds(train_path)

print("breeds_test: ", breeds_test)
print("breeds_train: ", breeds_train)

rng = np.random.default_rng(SUBSAMPLE_SEED)
_remove_fraction_per_breed(test_path, test_label_path, breeds_test, "test", rng)
_remove_fraction_per_breed(train_path, train_label_path, breeds_train, "train", rng)
    


breeds_test:  {'pug': 44, 'Siamese': 37, 'leonberger': 39, 'wheaten_terrier': 37, 'Abyssinian': 40, 'Sphynx': 41, 'Egyptian_Mau': 39, 'Birman': 41, 'shiba_inu': 39, 'pomeranian': 39, 'samoyed': 44, 'Russian_Blue': 41, 'english_cocker_spaniel': 36, 'keeshond': 39, 'american_pit_bull_terrier': 38, 'great_pyrenees': 42, 'Maine_Coon': 41, 'boxer': 41, 'chihuahua': 43, 'yorkshire_terrier': 46, 'german_shorthaired': 40, 'miniature_pinscher': 38, 'beagle': 43, 'japanese_chin': 46, 'saint_bernard': 38, 'american_bulldog': 45, 'Persian': 41, 'basset_hound': 41, 'english_setter': 44, 'newfoundland': 41, 'scottish_terrier': 40, 'Bombay': 42, 'Ragdoll': 40, 'Bengal': 37, 'staffordshire_bull_terrier': 41, 'havanese': 38, 'British_Shorthair': 38}
breeds_train:  {'basset_hound': 41, 'Siamese': 38, 'shiba_inu': 42, 'scottish_terrier': 41, 'pomeranian': 39, 'newfoundland': 38, 'english_setter': 43, 'British_Shorthair': 42, 'great_pyrenees': 41, 'japanese_chin': 39, 'Maine_Coon': 42, 'havanese': 42, 'bo

In [36]:
# Recount the images per breed in each split; the breed tally is the third
# element of the tuple returned by the counting helper
breeds_test = count_file_formats_dimensions_and_breeds(test_path)[2]
print("breeds_test: ", breeds_test)
breeds_train = count_file_formats_dimensions_and_breeds(train_path)[2]
print("breeds_train: ", breeds_train)

breeds_test:  {'wheaten_terrier': 19, 'Sphynx': 21, 'Egyptian_Mau': 20, 'Birman': 21, 'pomeranian': 20, 'Maine_Coon': 21, 'leonberger': 20, 'chihuahua': 22, 'yorkshire_terrier': 23, 'german_shorthaired': 20, 'miniature_pinscher': 19, 'boxer': 21, 'saint_bernard': 19, 'Persian': 21, 'english_setter': 22, 'Abyssinian': 20, 'american_bulldog': 23, 'Bombay': 21, 'newfoundland': 21, 'Ragdoll': 20, 'japanese_chin': 23, 'Siamese': 19, 'keeshond': 20, 'american_pit_bull_terrier': 19, 'shiba_inu': 20, 'pug': 22, 'basset_hound': 21, 'Russian_Blue': 21, 'beagle': 22, 'samoyed': 22, 'Bengal': 19, 'english_cocker_spaniel': 18, 'great_pyrenees': 21, 'scottish_terrier': 20, 'staffordshire_bull_terrier': 21, 'havanese': 19, 'British_Shorthair': 19}
breeds_train:  {'basset_hound': 21, 'Siamese': 19, 'shiba_inu': 21, 'scottish_terrier': 21, 'english_setter': 22, 'great_pyrenees': 21, 'japanese_chin': 20, 'Maine_Coon': 21, 'boxer': 20, 'miniature_pinscher': 20, 'american_bulldog': 19, 'beagle': 21, 'staf

In [37]:
# Total images per split = sum of that split's per-breed counts
test_sum, train_sum = (
    sum(per_breed.values()) for per_breed in (breeds_test, breeds_train)
)
print("Total number of images in test set: ", test_sum)
print("Total number of images in train set: ", train_sum)

Total number of images in test set:  760
Total number of images in train set:  756
