# Cleaning Training and Validation Dataset
Get the minimum sample exists in the set

In [10]:
import os
train_set = 'dataset/train'
val_set = 'dataset/val'
def count_samples(dataset_dir):
    sample_counts = {}
    for person_folder in os.listdir(dataset_dir):
        person_folder_path = os.path.join(dataset_dir, person_folder)
        if os.path.isdir(person_folder_path):
            sample_counts[person_folder] = len(os.listdir(person_folder_path))
    return sample_counts

train_counts = count_samples(train_set)
val_counts = count_samples(val_set)

In [11]:
def min_samples(counts):
    return min(counts.values())

min_train_samples = min_samples(train_counts)
min_val_samples = min_samples(val_counts)

print(f'train min: {min_train_samples}')
print(f'val min: {min_val_samples}')

# set each to 100 as approximate

train min: 102
val min: 136


Filter our the dataset in minimum of 100 samples

In [12]:
import shutil
import random

target_size = 100
def equalize_sample(original_dir, target_dir, num_samples_per_person):
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    for person_folder in os.listdir(original_dir):
        person_folder_path = os.path.join(original_dir, person_folder)
        target_person_folder_path = os.path.join(target_dir, person_folder)
        if not os.path.exists(target_person_folder_path):
            os.makedirs(target_person_folder_path)
        
        if os.path.isdir(person_folder_path):
            all_images = os.listdir(person_folder_path)
            sampled_images = random.sample(all_images, num_samples_per_person)
            for image in sampled_images:
                shutil.copy(os.path.join(person_folder_path, image), os.path.join(target_person_folder_path, image))

# Define paths and sample sizes
target_train_set = 'dataset/train_eq'
target_val_set = 'dataset/val_eq'

# Create subsets with equal samples
equalize_sample(train_set, target_train_set, target_size)
equalize_sample(val_set, target_val_set, target_size)


Verify if the sample counts are all equal to the target size (100)

In [13]:
def verify_sample_counts(directory):
    counts = count_samples(directory)
    print(f"Sample counts in {directory}:")
    for person, count in counts.items():
        print(f"{person}: {count}")

verify_sample_counts(target_train_set)
verify_sample_counts(target_val_set)

Sample counts in dataset/train_eq:
n000002: 100
n000003: 100
n000004: 100
n000005: 100
n000006: 100
n000007: 100
n000008: 100
n000010: 100
n000011: 100
n000012: 100
n000013: 100
n000014: 100
n000015: 100
n000016: 100
n000017: 100
n000018: 100
n000019: 100
n000020: 100
n000021: 100
n000022: 100
n000023: 100
n000024: 100
n000025: 100
n000026: 100
n000027: 100
n000028: 100
n000030: 100
n000031: 100
n000032: 100
n000033: 100
n000034: 100
n000035: 100
n000036: 100
n000037: 100
n000038: 100
n000039: 100
n000041: 100
n000042: 100
n000043: 100
n000044: 100
n000045: 100
n000046: 100
n000047: 100
n000048: 100
n000049: 100
n000050: 100
n000051: 100
n000052: 100
n000053: 100
n000054: 100
n000055: 100
n000056: 100
n000057: 100
n000058: 100
n000059: 100
n000060: 100
n000061: 100
n000062: 100
n000063: 100
n000064: 100
n000065: 100
n000066: 100
n000067: 100
n000068: 100
n000069: 100
n000070: 100
n000071: 100
n000072: 100
n000073: 100
n000074: 100
n000075: 100
n000076: 100
n000077: 100
n000079: 100
n00

Process the images by cropping and aligning the faces using MTCNN

In [19]:
import cv2
import os
import numpy as np
from mtcnn import MTCNN
from PIL import Image

def extract_face(image, required_size=(160, 160)):
    detector = MTCNN()
    results = detector.detect_faces(image)
    if results:
        x1, y1, width, height = results[0]['box']
        x2, y2 = x1 + width, y1 + height
        face = image[y1:y2, x1:x2]
        image = Image.fromarray(face)
        image = image.resize(required_size)
        face_array = np.asarray(image)
        return face_array, results[0]['keypoints']
    return None, None

def align_face(image, left_eye, right_eye):
    left_eye_center = np.array(left_eye)
    right_eye_center = np.array(right_eye)

    delta_x = right_eye_center[0] - left_eye_center[0]
    delta_y = right_eye_center[1] - left_eye_center[1]
    angle = np.arctan(delta_y / delta_x) * 180 / np.pi

    center = tuple(map(int, ((left_eye_center + right_eye_center) // 2).tolist()))
    rotation_matrix = cv2.getRotationMatrix2D(center, angle, scale=1)
    aligned_image = cv2.warpAffine(image, rotation_matrix, (image.shape[1], image.shape[0]))

    return aligned_image

def process_and_save_images(input_folder, output_folder):
    detector = MTCNN()
    
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    for root, dirs, files in os.walk(input_folder):
        for filename in files:
            filepath = os.path.join(root, filename)
            image = cv2.imread(filepath)
            if image is None:
                print(f"Warning: {filepath} is not a valid image file.")
                continue

            results = detector.detect_faces(image)
            
            if results:
                landmarks = results[0]['keypoints']
                aligned_face = align_face(image, landmarks['left_eye'], landmarks['right_eye'])

                x1, y1, width, height = results[0]['box']
                x2, y2 = x1 + width, y1 + height
                face = aligned_face[y1:y2, x1:x2]
                face = cv2.resize(face, (160, 160))

                relative_path = os.path.relpath(root, input_folder)
                output_dir = os.path.join(output_folder, relative_path)
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)

                output_filepath = os.path.join(output_dir, filename)
                cv2.imwrite(output_filepath, face)
            else:
                print(f"Warning: No face detected in {filepath}") # more or less, MTCNN False Negative case

process_and_save_images('dataset/train', 'dataset/train_mtcnn')
process_and_save_images('dataset/val', 'dataset/val_mtcnn')



Recheck sample counts

In [21]:
def count_samples(dataset_dir):
    sample_counts = {}
    for person_folder in os.listdir(dataset_dir):
        person_folder_path = os.path.join(dataset_dir, person_folder)
        if os.path.isdir(person_folder_path):
            sample_counts[person_folder] = len(os.listdir(person_folder_path))
    return sample_counts

train_mtcnn_counts = count_samples('dataset/train_mtcnn')
val_mtcnn_counts = count_samples('dataset/val_mtcnn')

def min_samples(counts):
    return min(counts.values())

min_train_mtcnn_counts = min_samples(train_mtcnn_counts)
min_val_mtcnn_counts = min_samples(val_mtcnn_counts)

print(f'train min: {min_train_mtcnn_counts}')
print(f'val min: {min_val_mtcnn_counts}')


train min: 73
val min: 103


Due to undetected face, samples are needed to be trimmed down to 70

In [22]:
import shutil
import random

target_size = 70
def equalize_sample(original_dir, target_dir, num_samples_per_person):
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    for person_folder in os.listdir(original_dir):
        person_folder_path = os.path.join(original_dir, person_folder)
        target_person_folder_path = os.path.join(target_dir, person_folder)
        if not os.path.exists(target_person_folder_path):
            os.makedirs(target_person_folder_path)
        
        if os.path.isdir(person_folder_path):
            all_images = os.listdir(person_folder_path)
            sampled_images = random.sample(all_images, num_samples_per_person)
            for image in sampled_images:
                shutil.copy(os.path.join(person_folder_path, image), os.path.join(target_person_folder_path, image))

equalize_sample('dataset/train_mtcnn', 'dataset/train_mtcnn_eq', target_size)
equalize_sample('dataset/val_mtcnn', 'dataset/val_mtcnn_eq', target_size)

verify

In [23]:
def count_samples(dataset_dir):
    sample_counts = {}
    for person_folder in os.listdir(dataset_dir):
        person_folder_path = os.path.join(dataset_dir, person_folder)
        if os.path.isdir(person_folder_path):
            sample_counts[person_folder] = len(os.listdir(person_folder_path))
    return sample_counts

train_mtcnn_counts = count_samples('dataset/train_mtcnn_eq')
val_mtcnn_counts = count_samples('dataset/val_mtcnn_eq')

def min_samples(counts):
    return min(counts.values())

min_train_mtcnn_counts = min_samples(train_mtcnn_counts)
min_val_mtcnn_counts = min_samples(val_mtcnn_counts)

print(f'train min: {min_train_mtcnn_counts}')
print(f'val min: {min_val_mtcnn_counts}')


train min: 70
val min: 70
