# Data validation
The main idea of this notebook is to validate the dataset ready for training (using dataloaders). We will check if the data is consistent and if there are any missing values. First we load the libraries and the data:

In [2]:
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
import yaml
import torch

# To add src to the path so that we can import modules
current_directory = os.getcwd()
if not current_directory.endswith("emotion_recognition"):
    sys.path.append(os.path.join(current_directory, 'emotion_recognition'))

try:
    from src import ROOT_DIR, PROCESSED_AFFECTNET_DIR, NUMBER_OF_EMOT
except ModuleNotFoundError:
    print("Ensure that src is added to PATH and restart the kernel")
    print(sys.path)

The flags for preprocessing the data are:

In [3]:
# Path of the parameters file
params_path = os.path.join(ROOT_DIR,"params.yaml")

# Read data preparation parameters
with open(params_path, "r", encoding='utf-8') as params_file:
    try:
        params = yaml.safe_load(params_file)
        params = params["preprocessing"]
    except yaml.YAMLError as exc:
        print(exc)

# Convert the params dictionary to a DataFrame
params_df = pd.DataFrame.from_dict(params, orient='index', columns=['Value'])
# Print the DataFrame
print(params_df)

                                Value
random_seed                        33
orig_datasets             [affectnet]
train_split                       0.8
categorical_format         hard_label
continuous_format           cartesian
face_detection_algorithm         None


for 

In [4]:
for data_split in os.listdir(PROCESSED_AFFECTNET_DIR):
    if data_split.endswith('.pkl'):
        print(f"Processing {data_split}-------------------")
        data = pd.read_pickle(os.path.join(PROCESSED_AFFECTNET_DIR, data_split))
        print(f"Shape of the data: {data.shape}")
        print(data.info())

Processing test.pkl-------------------


Shape of the data: (3999, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3999 entries, 0 to 3998
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   key        3999 non-null   object
 1   img_path   3999 non-null   object
 2   cat_emot   3999 non-null   object
 3   cont_emot  3999 non-null   object
dtypes: object(4)
memory usage: 125.1+ KB
None
Processing train.pkl-------------------
Shape of the data: (230120, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230120 entries, 0 to 230119
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   key        230120 non-null  object
 1   img_path   230120 non-null  object
 2   cat_emot   230120 non-null  object
 3   cont_emot  230120 non-null  object
dtypes: object(4)
memory usage: 7.0+ MB
None
Processing val.pkl-------------------
Shape of the data: (57531, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 

We check ranges of categorical and continuous variables for any inconsistencies (we take small subset of the data for this purpose to speed up the process):

In [9]:
for data_split in os.listdir(PROCESSED_AFFECTNET_DIR):
    if data_split.endswith('.pkl'):
        print(f"Analyzing {data_split}-------------------")
        data = pd.read_pickle(os.path.join(PROCESSED_AFFECTNET_DIR, data_split))
        random_idx_list = np.random.randint(0, len(data), 2500)
        for idx in random_idx_list:
            sample = data.iloc[idx]               
            # Check if the variables type
            if not isinstance(sample['img_path'], str):
                print(f"Index: {idx}, img_path is not a string.")
            if not isinstance(sample['cat_emot'], torch.Tensor):
                print(f"Index: {idx}, cat_emot is not a PyTorch tensor.")
            if not isinstance(sample['cont_emot'], torch.Tensor):
                print(f"Index: {idx}, cont_emot is not a PyTorch tensor.")

            # Check if the variables are in correct format
            if sample['cat_emot'].numpy().sum() == 1 and len(sample['cat_emot'].numpy()) == NUMBER_OF_EMOT: # Check if the encoding sums to 1, i.e. one-hot encoding (it can be soft or hard)
                print (f"Index: {idx}, cat_emot is has incorrect encoding.")
            if sample['cont_emot'].numpy().shape != (2,):
                    print(f"Index: {idx}, cont_emot has more than 2 values.")
            if params['continuous_format'] == 'cartesian':
                if sample['cont_emot'].numpy()[0] < -1 or sample['cont_emot'].numpy()[0] > 1:
                    print(f"Index: {idx}, valence is not in the range [-1, 1].")
                if sample['cont_emot'].numpy()[1] < -1 or sample['cont_emot'].numpy()[1] > 1:
                    print(f"Index: {idx}, arousal is not in the range [-1, 1].")
            elif params['continuous_format'] == 'polar':
                if sample['cont_emot'].numpy()[0] < 0 or sample['cont_emot'].numpy()[0] > 1:
                    print(f"Index: {idx}, radius is not in the range [0, 1].")
                if sample['cont_emot'].numpy()[1] < 0 or sample['cont_emot'].numpy()[1] > np.pi:
                    print(f"Index: {idx}, phase is not in the range [0, pi].")

Analyzing test.pkl-------------------
Index: 2584, cat_emot is has incorrect encoding.
Index: 437, cat_emot is has incorrect encoding.
Index: 250, cat_emot is has incorrect encoding.
Index: 1172, cat_emot is has incorrect encoding.
Index: 3925, cat_emot is has incorrect encoding.
Index: 1041, cat_emot is has incorrect encoding.
Index: 3356, cat_emot is has incorrect encoding.
Index: 2127, cat_emot is has incorrect encoding.
Index: 3838, cat_emot is has incorrect encoding.
Index: 391, cat_emot is has incorrect encoding.
Index: 1699, cat_emot is has incorrect encoding.
Index: 2927, cat_emot is has incorrect encoding.
Index: 1156, cat_emot is has incorrect encoding.
Index: 1030, cat_emot is has incorrect encoding.
Index: 3862, cat_emot is has incorrect encoding.
Index: 3301, cat_emot is has incorrect encoding.
Index: 2398, cat_emot is has incorrect encoding.
Index: 2468, cat_emot is has incorrect encoding.
Index: 2960, cat_emot is has incorrect encoding.
Index: 750, cat_emot is has incorr

KeyboardInterrupt: 