Dataset: https://archive.ics.uci.edu/dataset/124/cmu+face+images

In [1]:
import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image

In [2]:
# Get a list of paths to all the pgm image files from the dataset.
# glob returns matches in arbitrary, OS-dependent order, so sort them to keep
# row positions (e.g. the image index used for display) reproducible.
files = sorted(glob.glob("faces/**/*.pgm", recursive=True))

In [3]:
# Build one record per full-resolution image. The file name (without the .pgm
# extension) is split on "_" into person, direction, emotion and eyes; names
# with a different number of fields are skipped by the length check below.
data = []
for file_path in files:
    # Use os.path instead of splitting on "\\" so parsing works on any OS and
    # does not depend on how deeply the file is nested under faces/.
    file_stem = os.path.splitext(os.path.basename(file_path))[0]
    file_name_split = file_stem.split("_")
    if len(file_name_split) == 4:  # only use the full resolution version of each image
        # The context manager closes the file handle once the pixels are read.
        with Image.open(file_path) as img:
            pixel_string = " ".join(map(str, img.convert("L").getdata()))
        data.append([*file_name_split, pixel_string])

df = pd.DataFrame(data, columns=["person", "direction", "emotion", "eyes", "pixels"])

In [6]:
# Remove rows that are exact copies of an earlier row (all columns equal).
df = df.drop_duplicates()

In [None]:
# Show rows whose pixel data repeats an earlier row; any hits would mean the
# same image appears more than once, e.g. labelled with more than one emotion.
repeated_pixels = df.duplicated(subset=["pixels"])
df.loc[repeated_pixels]

In [None]:
# Show every row that contains at least one null/NaN value.
has_missing = df.isna().any(axis="columns")
df.loc[has_missing]

In [None]:
# Get the overall frequency of the different emotions in CMU Face Images.
emotions = df["emotion"].unique()
# Count once up front rather than recomputing value_counts() per emotion.
emotion_counts = df["emotion"].value_counts()
emotion_freq = dict()

# Fill the dict in order of first appearance so the bars keep that order.
for emotion in emotions:
    emotion_freq[emotion] = emotion_counts[emotion]

fig, ax = plt.subplots()
ax.bar(list(emotion_freq.keys()), list(emotion_freq.values()))
ax.set_title("Frequency of emotions in CMU Face Images")
ax.set_xlabel("Emotion")
ax.set_ylabel("Number of images")

plt.show()

Emotion categories: 
- Angry: 155 images
- Happy: 155 images
- Neutral: 158 images
- Sad: 156 images

Total images: 624

16 of the original 640 images were excluded because of problems with the camera setup (these files are labelled with a .bad suffix).

No duplicates were present in the dataset, and no images were assigned more than 1 emotion

In [10]:
# Pull out the label and pixel columns, then parse each space-separated
# pixel string into an integer vector (one row of `images` per image).
labels = df["emotion"]
pixel_list = df["pixels"]

images = np.array([
    np.fromstring(pixel_string, dtype=int, sep=" ")
    for pixel_string in pixel_list
])

In [None]:
# Pick one image by position and turn its flat pixel vector into a
# 120 x 128 single-channel float array.
img_num = 150
flat_pixels = images[img_num]
image = flat_pixels.astype('float32').reshape(120, 128, 1)

# Render the image in greyscale; the last expression shows its emotion label.
plt.imshow(image, cmap="gist_gray")
df["emotion"].iloc[img_num]

In [14]:
# Save the frame as a CSV file, leaving out the row index.
df.to_csv(path_or_buf="CMU Face Images.csv", index=False)