In [2]:
import pandas as pd
import numpy as np
import plotly.express as px

from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.image import ImageDataGenerator

import matplotlib.pyplot as plt


In [3]:
def import_dataset(filename, verbose):
    """Read a CSV file into a DataFrame, optionally previewing it.

    Args:
        filename: Path or file-like object handed straight to ``pd.read_csv``.
        verbose: When truthy, print the first rows (``DataFrame.head()``).

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    frame = pd.read_csv(filename)
    if verbose:
        print(frame.head())
    return frame

In [2]:
def remove_emotion(data, emotion):
    """Drop every row whose 'emotion' value equals ``emotion``.

    Args:
        data: DataFrame with an 'emotion' column.
        emotion: Label value to exclude.

    Returns:
        A new DataFrame holding the remaining rows; the original index
        values are kept and ``data`` itself is not modified.
    """
    keep_rows = data['emotion'].ne(emotion)
    return data.loc[keep_rows]

In [4]:
def plot_categories_distribution(data):
    """Show a bar chart of how many rows carry each emotion label.

    Every bar is named after its emotion together with that emotion's share
    of the labelled rows; the bar height is the number of rows with that
    label.

    Args:
        data: DataFrame with an 'emotion' column. Integer codes 0-6 are
            translated through CLASS_LABELS; any other value is shown as
            its own string.
    """
    CLASS_LABELS = ['Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', "Neutral"]

    def label_for(code):
        # Resolve the name from the label code itself rather than from the
        # bar's position, so a class that is absent from `data` cannot shift
        # the names of the classes that follow it.
        try:
            position = int(code)
        except (TypeError, ValueError):
            return str(code)
        if position == code and 0 <= position < len(CLASS_LABELS):
            return CLASS_LABELS[position]
        return str(code)

    # Raw counts give exact integer bar heights; the percentages are derived
    # from the same counts so both always agree.
    emotion_counts = data['emotion'].value_counts().sort_index()
    emotion_percentages = 100 * emotion_counts / emotion_counts.sum()

    fig = px.bar(x = [f"{label_for(code)} ({percentage:.2f}%)"
                      for code, percentage in emotion_percentages.items()],
                 y = emotion_counts.values,
                 color = emotion_counts.index,
                 color_continuous_scale="Emrld")
    fig.update_xaxes(title="Emotions")
    fig.update_yaxes(title = "Number of Images")
    fig.update_layout(showlegend = True,
                      title = {
                          'text': 'Data Distribution',
                          'y':0.95,
                          'x':0.5,
                          'xanchor': 'center',
                          'yanchor': 'top'})
    fig.show()


In [5]:
def format_labels_and_tensors(data, verbose):
    """Turn the emotion/pixels table into shuffled train/test/validation arrays.

    Args:
        data: DataFrame with an integer 'emotion' column and a 'pixels'
            column whose entries are 48*48 space-separated integers
            (parsed as uint8).
        verbose: When truthy, print the shapes of the three image arrays.

    Returns:
        A list ``[X_train, y_train, X_test, y_test, X_val, y_val]``. Note
        that the test pair comes BEFORE the validation pair; callers must
        unpack in this order. Images have shape (n, 48, 48, 1) and dtype
        uint8; labels are one-hot rows.

    Raises:
        ValueError: If ``data`` has no rows.
    """
    if len(data) == 0:
        raise ValueError("cannot build train/test/validation splits from an empty dataset")

    # Shuffle once with a fixed seed; the positional (shuffle=False) splits
    # below are then reproducible.
    data = data.sample(frac=1, random_state=42)

    pixel_tokens = data["pixels"].astype(str).str.split(" ").tolist()
    pixels = np.array(pixel_tokens, dtype=np.uint8).reshape((len(pixel_tokens), 48, 48, 1))

    # Size the one-hot encoding by the highest label code rather than by the
    # number of distinct labels: when a label other than the highest one has
    # been filtered out, the remaining codes are not 0..k-1 and a width of k
    # would make to_categorical index out of range. Keeping the original code
    # as the column index also keeps index -> emotion-name lookups valid.
    emotion_codes = data['emotion'].to_numpy()
    num_classes = int(emotion_codes.max()) + 1
    labels = to_categorical(emotion_codes, num_classes=num_classes)

    # First hold out 10% as the test set, then 10% of the remainder as validation.
    X_train, X_test, y_train, y_test = train_test_split(pixels, labels, test_size=0.1, shuffle=False)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, shuffle=False)

    if verbose:
        print(X_train.shape)
        print(X_test.shape)
        print(X_val.shape)

    return [X_train, y_train, X_test, y_test, X_val, y_val]


In [6]:
def plot_emotion_samples(X_train, y_train):
    """Display up to the first seven training images in one row, titled by emotion.

    Args:
        X_train: Image array indexed by sample; each item is squeezed
            (singleton axes removed) before being shown.
        y_train: Per-sample score/one-hot rows aligned with ``X_train``; the
            argmax of a row selects the emotion name used as the title.
    """
    label_dict = {0 : 'Angry', 1 : 'Disgust', 2 : 'Fear', 3 : 'Happy', 4 : 'Sad', 5 : 'Surprise', 6 : 'Neutral'}
    max_samples = 7

    plt.figure(figsize=(15,23))
    # Never index past the data: with fewer than seven samples, show what exists.
    n_shown = min(max_samples, len(X_train), len(y_train))
    for position in range(n_shown):
        plt.subplot(1, max_samples, position + 1)
        plt.imshow(np.squeeze(X_train[position]))
        class_index = int(np.argmax(y_train[position]))
        # Fall back to the raw index for a class that has no name in the table.
        plt.title(label_dict.get(class_index, str(class_index)))
        plt.axis('off')
    plt.show()

In [7]:

def create_data_objects(datasets):
    """Wrap the train/validation/test arrays in Keras batch generators.

    Args:
        datasets: Six arrays laid out as
            ``[X_train, y_train, X_test, y_test, X_val, y_val]`` (test pair
            before validation pair), which is the layout returned by
            ``format_labels_and_tensors``.

    Returns:
        ``(train_generator, val_generator, test_generator)``, each yielding
        batches of 64. All three rescale pixel values by 1/255; only the
        training generator applies random augmentation.
    """
    # Unpack in the producer's order (test before validation) so the
    # validation generator really wraps the validation split and the test
    # generator really wraps the held-out test split.
    X_train, y_train, X_test, y_test, X_val, y_val = datasets

    # Training data: rescale plus random rotation/shift/shear/zoom/flip.
    train_datagen = ImageDataGenerator(
        rescale=1./255,
        rotation_range=30,
        width_shift_range=0.1,
        height_shift_range=0.1,
        shear_range=0.1,
        zoom_range=0.1,
        horizontal_flip=True)

    # Validation and test data are only rescaled, never augmented.
    val_datagen = ImageDataGenerator(rescale=1./255)
    test_datagen = ImageDataGenerator(rescale=1./255)

    # No need to fit the generators in this case.
    train_generator = train_datagen.flow(X_train, y_train, batch_size=64)
    val_generator = val_datagen.flow(X_val, y_val, batch_size=64)
    test_generator = test_datagen.flow(X_test, y_test, batch_size=64)

    return train_generator, val_generator, test_generator

In [8]:
 def print_tensor_check(datasets):
    """Print the type, size and value range of every image and label array.

    Args:
        datasets: Six arrays laid out as
            ``[X_train, y_train, X_test, y_test, X_val, y_val]`` (test pair
            before validation pair), which is the layout returned by
            ``format_labels_and_tensors``.

    For image arrays the max/min are taken from the item at index 1 only, so
    each image array needs at least two items; for label arrays they are
    taken over the whole array.
    """
    # Unpack in the producer's order so the lines printed as "validation" and
    # "test" describe the arrays that actually are the validation and test splits.
    X_train, y_train, X_test, y_test, X_val, y_val = datasets
    image_sets = (('train', X_train), ('validation', X_val), ('test', X_test))
    label_sets = (('train', y_train), ('validation', y_val), ('test', y_test))

    print('Tensors Check')
    for split_name, images in image_sets:
        print(f'\t{split_name} images: type:{type(images)}, num of items:{images.shape[0]}, '
              f'    data in 1 item:{images.shape[1]}, '
              f'max-min value:{np.max(images[1])} - {np.min(images[1])}')

    for split_name, labels in label_sets:
        print(f'\t{split_name} labels: type:{type(labels)}, num of items:{labels.shape[0]}, '
              f'    max-min value:{np.max(labels)} - {np.min(labels)}')

In [33]:
def preprocess_dataset(filename, verbose):
    """Run the full preprocessing pipeline on one CSV file.

    Steps, in order: load the CSV, drop every row labelled 6, then convert
    the remaining rows into image/label arrays split into subsets.

    Args:
        filename: CSV location passed on to ``import_dataset``.
        verbose: When truthy, also show the class distribution before and
            after filtering, a row of sample images, and a summary of the
            resulting arrays.

    Returns:
        Whatever ``format_labels_and_tensors`` returns for the filtered data.
    """
    raw_data = import_dataset(filename, verbose)
    if verbose:
        plot_categories_distribution(raw_data)

    # Label 6 (named 'Neutral' in this notebook's label tables) is excluded
    # before any arrays are built.
    filtered = remove_emotion(raw_data, 6)
    if verbose:
        plot_categories_distribution(filtered)

    splits = format_labels_and_tensors(filtered, verbose)
    if verbose:
        # The first two entries are the training images and their labels.
        plot_emotion_samples(splits[0], splits[1])
        print_tensor_check(splits)
    return splits