In [4]:
import warnings
warnings.filterwarnings('ignore')

In [8]:
import os
import shutil
import random
import hashlib
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
from tqdm import tqdm

import torch
import cv2

In [9]:
DATA_DIR = Path("/kaggle/input/fer2013plus/fer2013plus/fer2013")
OUTPUT_DIR = Path("/kaggle/working/clean_fer2013plus")

TRAIN_DIR = OUTPUT_DIR / "train"
TEST_DIR  = OUTPUT_DIR / "test"

In [10]:
if OUTPUT_DIR.exists():
    shutil.rmtree(OUTPUT_DIR)

shutil.copytree(DATA_DIR, OUTPUT_DIR)
print("âœ… Dataset copied safely")

âœ… Dataset copied safely


In [11]:
print("Train classes:", sorted(os.listdir(TRAIN_DIR)))
print("Test classes :", sorted(os.listdir(TEST_DIR)))
print("Total classes:", len(os.listdir(TRAIN_DIR)))

Train classes: ['anger', 'contempt', 'disgust', 'fear', 'happiness', 'neutral', 'sadness', 'surprise']
Test classes : ['anger', 'contempt', 'disgust', 'fear', 'happiness', 'neutral', 'sadness', 'surprise']
Total classes: 8


In [13]:
for cls in sorted(os.listdir(TRAIN_DIR)):
    imgs = os.listdir(TRAIN_DIR / cls)
    print(f"{cls}: {len(imgs)} images")

anger: 2466 images
contempt: 165 images
disgust: 191 images
fear: 652 images
happiness: 7528 images
neutral: 10308 images
sadness: 3514 images
surprise: 3562 images


In [14]:
def md5_hash(path):
    h = hashlib.md5()
    with open(path, "rb") as f:
        h.update(f.read())
    return h.hexdigest()

def find_duplicates(folder):
    seen = {}
    duplicates = []
    for root, _, files in os.walk(folder):
        for f in files:
            p = os.path.join(root, f)
            try:
                h = md5_hash(p)
                if h in seen:
                    duplicates.append(p)
                else:
                    seen[h] = p
            except:
                pass
    return duplicates

duplicates = find_duplicates(OUTPUT_DIR)
print("Duplicates found:", len(duplicates))

Duplicates found: 1828


In [15]:
for img in duplicates:
    if os.path.exists(img):
        os.remove(img)

print("âœ… Duplicate images deleted")

âœ… Duplicate images deleted


In [16]:
def find_corrupt(folder):
    bad = []
    for root, _, files in os.walk(folder):
        for f in files:
            try:
                Image.open(os.path.join(root, f)).verify()
            except:
                bad.append(os.path.join(root, f))
    return bad

corrupt = find_corrupt(OUTPUT_DIR)
print("Corrupt images:", len(corrupt))

Corrupt images: 0


In [17]:
import cv2
from PIL import Image
from tqdm import tqdm

face_cascade = cv2.CascadeClassifier(
    cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
)

no_face = []

for root, _, files in os.walk(OUTPUT_DIR):
    for f in tqdm(files):
        if f.lower().endswith(".jpg"):
            path = os.path.join(root, f)
            try:
                img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
                faces = face_cascade.detectMultiScale(
                    img,
                    scaleFactor=1.1,
                    minNeighbors=3,
                    minSize=(20, 20)
                )
                if len(faces) == 0:
                    no_face.append(path)
            except:
                no_face.append(path)

print("Images with no detected face:", len(no_face))

0it [00:00, ?it/s]
0it [00:00, ?it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 55/55 [00:00<00:00, 467924.38it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 637/637 [00:00<00:00, 1497629.85it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 862/862 [00:00<00:00, 2411934.66it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1816/1816 [00:00<00:00, 2706771.88it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 163/163 [00:00<00:00, 1106264.65it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2583/2583 [00:00<00:00, 2399000.72it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 848/848 [00:00<00:00, 2042946.46it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 51/51 [00:00<00:00, 620027.55it/s]
0it [00:00, ?it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 155/155 [00:00<00:00, 914370.07it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2296/2296 [00:00<00:00, 1845911.82it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2864/2864 [00:00<00:00, 2486027.87it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 7294/7294 [00:00<00:00, 2836648.44it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 575

Images with no detected face: 0





In [18]:
for img in no_face:
    if os.path.exists(img):
        os.remove(img)

print("âœ… No-face images deleted")

âœ… No-face images deleted


In [20]:
for split in ["train", "test"]:
    total = 0
    print(f"\n{split.upper()}")
    for cls in sorted(os.listdir(OUTPUT_DIR / split)):
        cnt = len(os.listdir(OUTPUT_DIR / split / cls))
        total += cnt
        print(cls, cnt)
    print("TOTAL:", total)


TRAIN
anger 2296
contempt 151
disgust 155
fear 575
happiness 7294
neutral 9988
sadness 3319
surprise 2864
TOTAL: 26642

TEST
anger 637
contempt 51
disgust 55
fear 163
happiness 1816
neutral 2583
sadness 848
surprise 862
TOTAL: 7015


In [21]:
zip_path = "/kaggle/working/clean_fer2013plus_8class.zip"
shutil.make_archive(zip_path.replace(".zip",""), "zip", OUTPUT_DIR)
print("ðŸ“¦ Saved:", zip_path)

ðŸ“¦ Saved: /kaggle/working/clean_fer2013plus_8class.zip
