# Data exploration on AffectNet raw dataset
First we declare the necessary variables and import the necessary libraries.

In [1]:
import glob
import numpy as np
import pandas as pd
import os

try: 
    from src import RAW_AffectNet_DIR
except ModuleNotFoundError:
    print("Ensure that src is added to PATH and restart the kernel")

In [2]:
validation_path = os.path.join(RAW_AffectNet_DIR, "val_set")
print(validation_path)

/mnt/gpid08/datasets/affectnet/val_set


In [3]:
# Obtener una lista de todos los archivos en el subdirectorio
file_list = os.listdir(os.path.join(validation_path, "annotations"))
# Init general dictionary to store all annotations
annotations_dict = dict()
# Iterate through all archives
for file in file_list:
    if file.endswith(".npy"):
        # Get the file id
        photo_idx = file.split("_")[0]
        # Load npy archive
        data = np.load(os.path.join(validation_path,"annotations", file))
        
        if photo_idx not in annotations_dict:
            annotations_dict[photo_idx] = [[],[],[]]

        type_of_data = file.split("_")[1].split(".")[0]
        if type_of_data == "val":
            annotations_dict[photo_idx][0].append(data.item())
        elif type_of_data == "aro":
            annotations_dict[photo_idx][1].append(data.item())
        elif type_of_data == "exp":
            annotations_dict[photo_idx][2].append(data.item())

# Convert annotations dictionary to DataFrame
annotations = pd.DataFrame.from_dict(annotations_dict, orient='index', columns=['val', 'aro', 'exp'])

# Print the DataFrame
print(annotations)

              val           aro  exp
4915  [-0.285714]    [0.404762]  [6]
374   [-0.634921]    [0.626984]  [7]
2007  [-0.229129]    [0.876669]  [4]
2455   [0.912698]    [0.246032]  [1]
591   [-0.587302]    [0.484127]  [5]
...           ...           ...  ...
139    [0.391986]    [0.745257]  [3]
5341  [-0.119048]    [0.809524]  [4]
2452          [0]  [0.00793651]  [0]
984    [0.600506]     [0.13169]  [1]
2553  [-0.319396]    [0.333914]  [7]

[3999 rows x 3 columns]


Count how many have more than one annotation

In [4]:
more_annot = []
for index, row in annotations.iterrows():
    if len(row['val']) > 1 | len(row['aro']) > 1 | len(row['exp']) > 1:
        more_annot.append(index)
print("There are {} files with more than one annotation".format(len(more_annot)))

We redefine pandas to have only the items, not the vectors.

In [None]:
annotations = annotations.applymap(lambda x: x[0] if isinstance(x, list) else x)
print(annotations)

Empty DataFrame
Columns: [val, aro, exp]
Index: []


  annotations = annotations.applymap(lambda x: x[0] if isinstance(x, list) else x)


Correspond to the number of images and each one has its own annotation.

In [None]:
annotations = annotations.assign(path='')

In [None]:
# Obtener una lista de todos los archivos en el subdirectorio
file_list = os.listdir(os.path.join(validation_path, "images"))
no_correspondence = []

# Iterate through all archives
for file in file_list:
    if file.endswith(".npy"):
        # Get the file id
        photo_idx = file.split(".")[0]
        if photo_idx not in annotations_dict:
            no_correspondence.append(photo_idx)
        

# Print the DataFrame
print(annotations)

Empty DataFrame
Columns: [val, aro, exp]
Index: []


# Distribution of data:


In [None]:
continuous_annotations = []
for photo_idx in range(len(annotations[data_split])):
    sample = annotations[data_split].iloc[photo_idx]
    for person in range (sample['people']):
            labels = {'valence': sample["label_cont"][person][0],
             'arousal': sample["label_cont"][person][1],
             'dominance': sample["label_cont"][person][2]}
            continuous_annotations.append(labels)

# Create a dataframe with sample data
continuous_annotations = pd.DataFrame(continuous_annotations).dropna()

In [None]:
# Transform the data for ridge plot
continuous_annotations_rowed = pd.melt(continuous_annotations, var_name='Variable', value_name='Value')

In [None]:
# Define the range
value_range = [continuous_annotations_rowed['Value'].min(), continuous_annotations_rowed['Value'].max()]

# Calculate the maximum count across all variables
max_count = max(
    continuous_annotations_rowed[continuous_annotations_rowed['Variable'] == 'valence']['Value'].value_counts().max(),
    continuous_annotations_rowed[continuous_annotations_rowed['Variable'] == 'arousal']['Value'].value_counts().max(),
    continuous_annotations_rowed[continuous_annotations_rowed['Variable'] == 'dominance']['Value'].value_counts().max()
)

# Create a histogram
valence_hist = alt.Chart(continuous_annotations_rowed).mark_bar(color = "steelblue", width= 20).encode(
    alt.X("Value", scale=alt.Scale(domain=value_range)),
    alt.Y('count()', scale=alt.Scale(domain=[0, max_count])),
).transform_filter(
    alt.datum.Variable == 'valence'
).properties(
    title = alt.TitleParams(text = 'Valence Distribution', fontSize=16),
    width = 300,
    height = 400
)

# Create a histogram
arousal_hist = alt.Chart(continuous_annotations_rowed).mark_bar(color = "steelblue", width= 20).encode(
    alt.X("Value", scale=alt.Scale(domain=value_range)),
    alt.Y('count()', scale=alt.Scale(domain=[0, max_count])),
).transform_filter(
    alt.datum.Variable == 'arousal'
).properties(
    title = alt.TitleParams(text = 'Arousal Distribution', fontSize=16),
    width = 300,
    height = 400
)

# Create a histogram
dominance_hist = alt.Chart(continuous_annotations_rowed).mark_bar(color = "steelblue", width= 20).encode(
    alt.X("Value", scale=alt.Scale(domain=value_range)),
    alt.Y('count()', scale=alt.Scale(domain=[0, max_count])),
).transform_filter(
    alt.datum.Variable == 'dominance'
).properties(
    title = alt.TitleParams(text = 'Dominance Distribution', fontSize=16),
    width = 300,
    height = 400
)
# Concatenate the lollipop chart and bar chart horizontally
combined_chart = alt.hconcat(valence_hist, arousal_hist, dominance_hist).resolve_scale(y='shared').properties(
    title=alt.TitleParams(text = "Continuous annotations distributions", anchor='middle', fontSize=20, dy=-20),
)
combined_chart

NameError: name 'alt' is not defined