### **Loading libraries**


In [1]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Conv2D, MaxPool2D, Flatten, Dropout, Dense, RandomFlip, RandomRotation, RandomZoom, Input, BatchNormalization
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import os

# **Data Preparation and Pre-processing**

## **Data exploration (EDA)**


**Class counts**


In [2]:
train_dir = "FER/train"
test_dir = "FER/test"
emotions = ["Angry", "Disgust", "Fear", "Happy", "Neutral", "Sad", "Surprise"]


def count_images(base_dir):
    """Count the directory entries in each emotion sub-folder of ``base_dir``.

    Folder names are matched case-insensitively so that both ``Angry`` and
    ``angry`` resolve on case-sensitive file systems; other helpers in this
    notebook address the same folders in lower case.

    Args:
        base_dir: Dataset split directory holding one sub-folder per emotion.

    Returns:
        dict mapping every name in ``emotions`` to its number of entries.

    Raises:
        FileNotFoundError: if ``base_dir`` or an emotion folder is missing.
    """
    # Map lower-cased entry names to their actual on-disk spelling.
    on_disk = {name.lower(): name for name in os.listdir(base_dir)}
    counts = {}
    for emotion in emotions:
        folder = on_disk.get(emotion.lower(), emotion)
        counts[emotion] = len(os.listdir(os.path.join(base_dir, folder)))
    return counts


train_counts = count_images(train_dir)
test_counts = count_images(test_dir)

In [None]:
# Display the per-emotion entry counts computed for the training split.
train_counts

In [None]:
# Display the per-emotion entry counts computed for the test split.
test_counts

**Class distribution visualization**


In [None]:
# Side-by-side bar charts of the per-class counts for both splits.
# Both panels get the same axis labels and tick rotation so they read alike.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
distribution_panels = [
    (axes[0], train_counts, 'Training Set Distribution'),
    (axes[1], test_counts, 'Test Set Distribution'),
]
for ax, counts, panel_title in distribution_panels:
    ax.bar(list(counts.keys()), list(counts.values()))
    ax.set_title(panel_title)
    ax.set_xlabel('Emotion')
    ax.set_ylabel('Count')
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Percentage of the training set held by each class.
# The total is computed once; an empty training set reports 0% instead of
# raising ZeroDivisionError.
total_train_images = sum(train_counts.values())
print("Imbalance Ratio: ")
for emotion in emotions:
    ratio = train_counts[emotion] / total_train_images * 100 if total_train_images else 0.0
    print(f"{emotion}: {ratio:.2f}%")

**Analyze image dimensions and properties**


In [5]:
from PIL import Image
import cv2
from collections import Counter

In [None]:
def analyze_image_properties(base_dir, sample_per_class=None):
    """Collect size, colour mode and file size for the images of one split.

    Args:
        base_dir: Split directory containing one lower-case folder per emotion.
        sample_per_class: If truthy, only the first N directory entries of
            each emotion folder are inspected; otherwise all of them.

    Returns:
        dict with parallel lists 'dimensions' ((width, height) tuples),
        'color_modes', 'file_sizes' (bytes), 'emotions' and 'file_paths' for
        every readable image, plus 'corrupted_files', a list of
        {'path', 'emotion', 'error'} dicts for files that failed.

    Raises:
        FileNotFoundError: if an emotion folder does not exist.
    """
    results = {
        'dimensions': [],
        'color_modes': [],
        'file_sizes': [],
        'emotions': [],
        'corrupted_files': [],
        'file_paths': []
    }

    for emotion in emotions:
        emotion_path = base_dir + "/" + emotion.lower()
        files = os.listdir(emotion_path)

        if sample_per_class:
            files = files[:sample_per_class]

        print(f"Analyzing {emotion}: {len(files)} images...")

        for file in files:
            img_path = emotion_path + "/" + file

            try:
                # The context manager closes the file handle promptly.
                with Image.open(img_path) as img:
                    dimensions = img.size  # (width, height)
                    color_mode = img.mode
                    # Image.open only parses the header; verify() checks the
                    # file contents so damaged files are reported as well.
                    img.verify()
                file_size = os.path.getsize(img_path)
            except Exception as e:
                results['corrupted_files'].append({
                    'path': img_path,
                    'emotion': emotion,
                    'error': str(e)
                })
                print(f"Corrupted: {file} - {e}")
                continue

            # Append only after every read succeeded so the lists stay aligned.
            results['dimensions'].append(dimensions)
            results['color_modes'].append(color_mode)
            results['file_sizes'].append(file_size)
            results['emotions'].append(emotion)
            results['file_paths'].append(img_path)

    return results

In [None]:
# Inspect every training image and report how many could not be read.
print("TRAINING SET RESULTS")
train_properties = analyze_image_properties(train_dir)
print(f"Corrupted Files: {len(train_properties['corrupted_files'])}")

In [None]:
# Inspect every test image and report how many could not be read.
print("TEST SET RESULTS")
test_properties = analyze_image_properties(test_dir)
print(f"Corrupted Files: {len(test_properties['corrupted_files'])}")

In [None]:
# Frequency of each (width, height) pair per split.
train_dims, test_dims = (
    Counter(props['dimensions']) for props in (train_properties, test_properties)
)

# Frequency of each PIL colour mode per split.
train_modes, test_modes = (
    Counter(props['color_modes']) for props in (train_properties, test_properties)
)

# File sizes converted from bytes to kilobytes.
train_sizes_kb = np.asarray(train_properties['file_sizes']) / 1024
test_sizes_kb = np.asarray(test_properties['file_sizes']) / 1024

In [None]:
# One line per distinct image size in the training split, most common first.
print("\nTraining set unique dimensions:")
train_image_total = len(train_properties['dimensions'])
for (width, height), n_images in train_dims.most_common():
    share = n_images / train_image_total * 100
    print(f"  {width}x{height}: {n_images} images ({share:.2f}%)")


In [None]:
# One line per distinct image size in the test split, most common first.
print("\nTest set unique dimensions:")
test_image_total = len(test_properties['dimensions'])
for (width, height), n_images in test_dims.most_common():
    share = n_images / test_image_total * 100
    print(f"  {width}x{height}: {n_images} images ({share:.2f}%)")


In [None]:
# Bar chart of image-size frequencies, one panel per split.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

dim_labels_train = [f"{w}x{h}" for (w, h) in train_dims]
dim_counts_train = list(train_dims.values())
dim_labels_test = [f"{w}x{h}" for (w, h) in test_dims]
dim_counts_test = list(test_dims.values())

dimension_panels = [
    (axes[0], dim_labels_train, dim_counts_train, 'Training Set: Image Dimensions', {}),
    (axes[1], dim_labels_test, dim_counts_test, 'Test Set: Image Dimensions', {'color': 'orange'}),
]
for ax, labels, heights, panel_title, bar_style in dimension_panels:
    ax.bar(labels, heights, **bar_style)
    ax.set_title(panel_title)
    ax.set_xlabel('Dimension (WxH)')
    ax.set_ylabel('Count')
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig('image_dimensions.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Share of each PIL colour mode in the training split.
print("\nTraining set color modes:")
train_mode_total = len(train_properties['color_modes'])
for mode, count in train_modes.items():
    # Friendly names for the two common modes; anything else prints as-is.
    mode_name = {'L': "Grayscale", 'RGB': "RGB"}.get(mode, mode)
    share = count / train_mode_total * 100
    print(f"  {mode_name} ({mode}): {count} images ({share:.2f}%)")


In [None]:
# Share of each PIL colour mode in the test split.
print("\nTest set color modes:")
test_mode_total = len(test_properties['color_modes'])
for mode, count in test_modes.items():
    # Friendly names for the two common modes; anything else prints as-is.
    mode_name = {'L': "Grayscale", 'RGB': "RGB"}.get(mode, mode)
    share = count / test_mode_total * 100
    print(f"  {mode_name} ({mode}): {count} images ({share:.2f}%)")

In [None]:
# Summary statistics of the training file sizes, in kilobytes.
print("Training set file sizes (KB):")
size_statistics = (
    ("Mean", np.mean),
    ("Median", np.median),
    ("Std Dev", np.std),
    ("Min", np.min),
    ("Max", np.max),
)
for stat_label, stat_fn in size_statistics:
    print(f"  {stat_label}: {stat_fn(train_sizes_kb):.2f} KB")

In [None]:
# Summary statistics of the test file sizes, in kilobytes.
# Reports the same five statistics as the training-set summary, including
# the standard deviation.
print("Test set file sizes (KB):")
print(f"  Mean: {np.mean(test_sizes_kb):.2f} KB")
print(f"  Median: {np.median(test_sizes_kb):.2f} KB")
print(f"  Std Dev: {np.std(test_sizes_kb):.2f} KB")
print(f"  Min: {np.min(test_sizes_kb):.2f} KB")
print(f"  Max: {np.max(test_sizes_kb):.2f} KB")

In [None]:
# Histogram of file sizes per split, with the mean marked by a dashed line.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

size_panels = [
    (axes[0], train_sizes_kb, 'Training Set: File Size Distribution', {}),
    (axes[1], test_sizes_kb, 'Test Set: File Size Distribution', {'color': 'orange'}),
]
for ax, sizes_kb, panel_title, hist_style in size_panels:
    mean_kb = np.mean(sizes_kb)
    ax.hist(sizes_kb, bins=50, edgecolor='black', alpha=0.7, **hist_style)
    ax.set_title(panel_title)
    ax.set_xlabel('File Size (KB)')
    ax.set_ylabel('Frequency')
    ax.axvline(mean_kb, color='red', linestyle='--', label=f'Mean: {mean_kb:.2f} KB')
    ax.legend()

plt.tight_layout()
plt.savefig('file_sizes.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
def analyze_pixel_intensity(base_dir, sample_size=500):
    """Gather the grayscale pixel values of a sample of images.

    The sample takes the first ``sample_size // len(emotions)`` directory
    entries of every emotion folder. Folder names are matched
    case-insensitively so the lookup agrees with the helpers that address
    the folders in lower case.

    Args:
        base_dir: Split directory containing one folder per emotion.
        sample_size: Approximate total number of images to read.

    Returns:
        1-D numpy array with every pixel value of the images that could be
        read; an empty array when none could.

    Raises:
        FileNotFoundError: if ``base_dir`` or an emotion folder is missing.
    """
    per_class = sample_size // len(emotions)
    # Map lower-cased entry names to their actual on-disk spelling.
    on_disk = {name.lower(): name for name in os.listdir(base_dir)}
    pixel_chunks = []

    for emotion in emotions:
        emotion_path = os.path.join(base_dir, on_disk.get(emotion.lower(), emotion))
        files = os.listdir(emotion_path)[:per_class]

        for file in files:
            img_path = os.path.join(emotion_path, file)
            try:
                img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
            except Exception as e:
                # Report instead of silently swallowing; a bare except would
                # also trap KeyboardInterrupt.
                print(f"  Error reading {file}: {e}")
                continue
            # imread returns None for files it cannot decode.
            if img is not None:
                pixel_chunks.append(img.ravel())

    if not pixel_chunks:
        return np.array([])
    # One concatenation instead of growing a Python list pixel by pixel.
    return np.concatenate(pixel_chunks)

train_intensities = analyze_pixel_intensity(train_dir, sample_size=500)

In [None]:
# Summary statistics of the sampled pixel values.
print("Pixel intensity statistics:")
for stat_label, stat_fn in (("Mean", np.mean), ("Median", np.median), ("Std Dev", np.std)):
    print(f"  {stat_label}: {stat_fn(train_intensities):.2f}")
# Extremes are printed unformatted.
for stat_label, stat_fn in (("Min", np.min), ("Max", np.max)):
    print(f"  {stat_label}: {stat_fn(train_intensities)}")

In [None]:
# Histogram of sampled pixel values (one bin per gray level) with the mean marked.
fig, ax = plt.subplots(figsize=(12, 5))
mean_pixel_value = np.mean(train_intensities)
ax.hist(train_intensities, bins=256, range=(0, 255), edgecolor='black', alpha=0.7)
ax.axvline(mean_pixel_value, color='red', linestyle='--', label=f'Mean: {mean_pixel_value:.2f}')
ax.set_title('Pixel Intensity Distribution (Sample of 500 Training Images)')
ax.set_xlabel('Pixel Intensity')
ax.set_ylabel('Frequency')
ax.legend()
fig.savefig('pixel_intensity.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
def intensity_by_emotion(base_dir, sample_per_class=100):
    """Compute the mean gray level of sampled images, grouped by emotion.

    Args:
        base_dir: Split directory containing one lower-case folder per emotion.
        sample_per_class: Number of directory entries read per emotion folder.

    Returns:
        dict mapping every name in ``emotions`` to a list holding the mean
        pixel value of each image that could be read.

    Raises:
        FileNotFoundError: if an emotion folder does not exist.
    """
    emotion_intensities = {emotion: [] for emotion in emotions}

    for emotion in emotions:
        emotion_path = base_dir + "/" + emotion.lower()
        files = os.listdir(emotion_path)[:sample_per_class]

        for file in files:
            img_path = emotion_path + "/" + file
            try:
                img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
            except Exception as e:
                # Report instead of silently swallowing; a bare except would
                # also trap KeyboardInterrupt.
                print(f"  Error reading {file}: {e}")
                continue
            # imread returns None for files it cannot decode.
            if img is not None:
                emotion_intensities[emotion].append(np.mean(img))

    return emotion_intensities

emotion_intensity_data = intensity_by_emotion(train_dir, sample_per_class=200)

for emotion in emotions:
    class_means = emotion_intensity_data[emotion]
    if not class_means:
        # np.mean/np.std of an empty list yields nan plus a RuntimeWarning.
        print(f"{emotion.capitalize():10s}: no readable images")
        continue
    mean_intensity = np.mean(class_means)
    std_intensity = np.std(class_means)
    print(f"{emotion.capitalize():10s}: Mean = {mean_intensity:.2f}, Std = {std_intensity:.2f}")

In [None]:
# Box plot of per-image mean gray level, one box per emotion class.
fig, ax = plt.subplots(figsize=(12, 6))
per_emotion_means = [emotion_intensity_data[name] for name in emotions]
ax.boxplot(per_emotion_means, tick_labels=emotions)
ax.set_title('Pixel Intensity Distribution by Emotion Class')
ax.set_ylabel('Mean Pixel Intensity')
ax.set_xlabel('Emotion')
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(axis='y', alpha=0.3)
fig.savefig('intensity_by_emotion.png', dpi=300, bbox_inches='tight')
plt.show()

## **Detecting and removing dirty data**


In [3]:
import shutil

In [6]:
def detect_extreme_intensity_images(base_dir, min_threshold=20, max_threshold=235):
    """Flag images whose mean gray level lies outside the given thresholds.

    Args:
        base_dir: Split directory containing one lower-case folder per emotion.
        min_threshold: Images with a mean below this are tagged 'too_dark'.
        max_threshold: Images with a mean above this are tagged 'too_bright'.

    Returns:
        list of dicts with the keys 'path', 'emotion', 'filename',
        'mean_intensity' and 'issue', one per flagged image.
    """
    flagged = []

    for emotion in emotions:
        folder = base_dir + "/" + emotion.lower()

        for file in os.listdir(folder):
            img_path = folder + "/" + file
            try:
                img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
                if img is None:
                    # Files that could not be decoded are skipped.
                    continue

                mean_intensity = np.mean(img)
                if mean_intensity < min_threshold:
                    issue = 'too_dark'
                elif mean_intensity > max_threshold:
                    issue = 'too_bright'
                else:
                    continue

                flagged.append({
                    'path': img_path,
                    'emotion': emotion,
                    'filename': file,
                    'mean_intensity': mean_intensity,
                    'issue': issue
                })
            except Exception as e:
                print(f"  Error reading {file}: {e}")

    return flagged

# Scan both splits with the same brightness thresholds.
print("Analyzing training set...")
train_extreme = detect_extreme_intensity_images(train_dir, min_threshold=20, max_threshold=235)
print("Analyzing test set...")
test_extreme = detect_extreme_intensity_images(test_dir, min_threshold=20, max_threshold=235)

Analyzing training set...
Analyzing test set...


In [7]:
# Per-emotion tally of flagged training images; emotions with none are omitted.
print("Training set by emotion:")
for emotion in emotions:
    issues = [img['issue'] for img in train_extreme if img['emotion'] == emotion]
    too_dark = issues.count('too_dark')
    too_bright = issues.count('too_bright')
    if too_dark or too_bright:
        print(f"  {emotion.capitalize():10s}: {too_dark} too dark, {too_bright} too bright")

Training set by emotion:


In [8]:
# Per-emotion tally of flagged test images; emotions with none are omitted.
print("Test set by emotion:")
for emotion in emotions:
    issues = [img['issue'] for img in test_extreme if img['emotion'] == emotion]
    too_dark = issues.count('too_dark')
    too_bright = issues.count('too_bright')
    if too_dark or too_bright:
        print(f"  {emotion.capitalize():10s}: {too_dark} too dark, {too_bright} too bright")

Test set by emotion:


In [9]:
def visualize_extreme_images(extreme_images, n_samples=10, title="Extreme Intensity Images"):
    """Plot a random sample of flagged images in a grid and save it as a PNG.

    The grid has five columns and as many rows as the sample needs, so any
    ``n_samples`` works; unused cells stay blank.

    Args:
        extreme_images: list of dicts with the keys 'path', 'emotion',
            'mean_intensity' and 'issue'.
        n_samples: Maximum number of images to draw.
        title: Figure super-title.

    Returns:
        None. The figure is shown and written to
        'extreme_intensity_samples.png'.
    """
    sample_size = min(n_samples, len(extreme_images))
    if sample_size <= 0:
        print("No extreme images to display")
        return

    samples = np.random.choice(len(extreme_images), sample_size, replace=False)

    n_cols = 5
    n_rows = -(-sample_size // n_cols)  # ceiling division
    # squeeze=False keeps a 2-D axes array even for a single row.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 3 * n_rows), squeeze=False)
    axes = axes.flatten()
    for ax in axes:
        ax.axis('off')  # also blanks the cells left without an image

    for ax, idx in zip(axes, samples):
        img_data = extreme_images[idx]
        img = cv2.imread(img_data['path'], cv2.IMREAD_GRAYSCALE)
        if img is None:
            # imread returns None when the file is missing or undecodable.
            ax.set_title(f"{img_data['emotion']}\n(unreadable)", fontsize=8)
            continue

        ax.imshow(img, cmap='gray', vmin=0, vmax=255)
        ax.set_title(f"{img_data['emotion']}\nMean: {img_data['mean_intensity']:.1f}\n({img_data['issue']})", fontsize=8)

    plt.suptitle(title, fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.savefig('extreme_intensity_samples.png', dpi=300, bbox_inches='tight')
    plt.show()

# Show a random sample of the flagged training images, if there are any.
if train_extreme:
    visualize_extreme_images(
        train_extreme,
        n_samples=10,
        title="Training Set: Extreme Intensity Images",
    )

In [10]:
def quarantine_extreme_images(base_dir, extreme_images, quarantine_dir='FER/quarantine_extreme'):
    """Move flagged images into per-issue sub-folders of ``quarantine_dir``.

    Each file is renamed to ``<emotion>_<filename>`` and placed in a folder
    named after its 'issue' value. The folder is created on demand, so issue
    labels other than 'too_dark' and 'too_bright' are handled too. A file
    that already exists in the quarantine is never overwritten.

    Args:
        base_dir: Unused; kept so existing calls keep working.
        extreme_images: list of dicts with the keys 'path', 'emotion',
            'filename' and 'issue'.
        quarantine_dir: Root folder that receives the quarantined files.

    Returns:
        Number of files actually moved.
    """
    # The two standard folders always exist, even when nothing is moved.
    for issue in ('too_dark', 'too_bright'):
        os.makedirs(os.path.join(quarantine_dir, issue), exist_ok=True)

    moved_count = 0

    for img_data in extreme_images:
        src = img_data['path']
        dst_folder = os.path.join(quarantine_dir, str(img_data['issue']))
        dst = os.path.join(dst_folder, f"{img_data['emotion']}_{img_data['filename']}")

        try:
            os.makedirs(dst_folder, exist_ok=True)
            if os.path.exists(dst):
                # A move onto an existing file may silently replace it.
                print(f"Skipping {src}: {dst} already exists")
                continue
            shutil.move(src, dst)
            moved_count += 1
        except Exception as e:
            print(f"Error moving {src}: {e}")

    print(f"\nMoved {moved_count} images to quarantine folder: {quarantine_dir}")
    return moved_count

# Move the flagged images of both splits into their quarantine folders.
# NOTE: these calls run as written and relocate files on disk; comment them
# out to skip the quarantine step.
separator = "=" * 50
print("\n" + separator)
print("QUARANTINING EXTREME IMAGES")
print(separator)

train_moved = quarantine_extreme_images(train_dir, train_extreme, 'FER/quarantine_extreme_train')
test_moved = quarantine_extreme_images(test_dir, test_extreme, 'FER/quarantine_extreme_test')

print("\n⚠️ RECOMMENDATION: Review quarantined images manually before permanently deleting")


QUARANTINING EXTREME IMAGES

Moved 0 images to quarantine folder: FER/quarantine_extreme_train

Moved 0 images to quarantine folder: FER/quarantine_extreme_test

⚠️ RECOMMENDATION: Review quarantined images manually before permanently deleting


# **Model Training**


In [None]:
# Training configuration.
img_size = (48, 48)
batch_size = 32
epochs = 50

# Both generators only rescale pixel values from [0, 255] to [0, 1].
train_datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255)
test_datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255)

# Loader settings shared by both splits.
shared_flow_options = dict(
    target_size=img_size,
    color_mode="grayscale",
    batch_size=batch_size,
    class_mode="categorical",
)

# Training batches are shuffled; test batches keep directory order.
train_set = train_datagen.flow_from_directory("FER/train", shuffle=True, **shared_flow_options)
test_set = test_datagen.flow_from_directory('FER/test', shuffle=False, **shared_flow_options)



### **Improve data pipeline (prefetch + cache)**


In [None]:
# AUTOTUNE = tf.data.AUTOTUNE

# train_set = train_set.cache().shuffle(1000).prefetch(buffer_size=AUTOTUNE)
# test_set = test_set.cache().prefetch(buffer_size=AUTOTUNE)

### **Data Augmentation Layer**


In [None]:
# Random horizontal flip, rotation and zoom, packaged as a sub-model so it
# can be inserted at the front of the classifier.
augmentation_steps = [
    RandomFlip("horizontal"),
    RandomRotation(0.1),
    RandomZoom(0.1),
]
data_augmentation = Sequential(augmentation_steps)

Number of classes: one per facial-expression category.


In [None]:
# One output unit per entry of the `emotions` list (seven classes).
num_classes = len(emotions)

### **2D CNN Model**


In [None]:
# The network is assembled as a flat layer list. Layers are created in the
# same order as they appear in the model.
cnn_layers = [Input(shape=(48, 48, 1)), data_augmentation]

# Two double-convolution stages (64, then 128 filters), each followed by
# 2x2 max pooling and dropout.
for n_filters in (64, 128):
    cnn_layers += [
        Conv2D(n_filters, kernel_size=(3, 3), activation='relu', padding='same'),
        BatchNormalization(),
        Conv2D(n_filters, kernel_size=(3, 3), activation='relu', padding='same'),
        BatchNormalization(),
        MaxPool2D(pool_size=(2, 2)),
        Dropout(0.25),
    ]

# Final single-convolution stage with 256 filters.
cnn_layers += [
    Conv2D(256, kernel_size=(3, 3), activation='relu', padding='same'),
    BatchNormalization(),
    MaxPool2D(pool_size=(2, 2)),
    Dropout(0.25),
]

# Classifier head: two dense blocks, then the softmax output.
cnn_layers.append(Flatten())
for n_units in (256, 128):
    cnn_layers += [
        Dense(n_units, activation='relu'),
        BatchNormalization(),
        Dropout(0.5),
    ]
cnn_layers.append(Dense(num_classes, activation='softmax'))

model = Sequential()
for cnn_layer in cnn_layers:
    model.add(cnn_layer)
model.summary()

In [None]:
# Callback that keeps only the weights with the best validation accuracy.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath='best_model.keras',
    monitor='val_accuracy',
    save_best_only=True,
    verbose=1,
)

# Adam optimiser with categorical cross-entropy for the one-hot labels.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train for the number of epochs set in the configuration cell (`epochs`)
# instead of a separate literal, so there is one place to tune it.
# NOTE(review): validation_data is the test split and the checkpoint callback
# selects on val_accuracy, so accuracy later measured on test_set is not an
# unbiased estimate. Consider holding out a separate validation split.
history = model.fit(
    train_set,
    validation_data=test_set,
    epochs=epochs,
    callbacks=[checkpoint]
)

In [None]:
# Training curves per epoch: accuracy on the left, loss on the right.
fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 5))

acc_ax.plot(history.history['accuracy'], label='accuracy')
acc_ax.plot(history.history['val_accuracy'], label='val_accuracy')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend()
acc_ax.set_title("Accuracy")

loss_ax.plot(history.history["loss"], label="Train Loss")
loss_ax.plot(history.history["val_loss"], label="Val Loss")
loss_ax.set_xlabel("Epoch")
loss_ax.set_ylabel("Loss")
loss_ax.legend()
loss_ax.set_title("Loss")

# Finish the figure the same way as the other plot cells.
plt.tight_layout()
plt.show()

In [None]:
# Evaluate the in-memory model on the test split.
# NOTE(review): the visible cells never reload 'best_model.keras', so this
# presumably scores the final-epoch weights rather than the best checkpoint.
test_loss, test_acc = model.evaluate(x=test_set)
print(f"Test Accuracy: {test_acc}")