# Data exploration on AffectNet raw dataset
First we declare the necessary variables and import the necessary libraries.

In [1]:
import numpy as np
import pandas as pd
import os

import altair as alt
alt.data_transformers.disable_max_rows() # delete the upper bound of entries in altair
alt.renderers.enable("html") # enable altair to render in html
# Suppress AltairDeprecationWarning warnings
import warnings
from altair.utils.deprecation import AltairDeprecationWarning
warnings.filterwarnings("ignore", category=AltairDeprecationWarning)

from prettytable import PrettyTable

try: 
    from src import RAW_AFFECTNET_DIR, INTERIM_COLUMNS_AFFECTNET
except ModuleNotFoundError:
    print("Ensure that src is added to PATH and restart the kernel")

In [2]:
validation_path = os.path.join(RAW_AFFECTNET_DIR, "val_set")
print(validation_path)

/mnt/gpid08/datasets/affectnet/val_set


In [3]:
# Obtener una lista de todos los archivos en el subdirectorio
file_list = os.listdir(os.path.join(validation_path, "annotations"))
# Init general dictionary to store all annotations
annotations_dict = dict()
# Iterate through all archives
for file in file_list:
    if file.endswith(".npy"):
        # Get the file id
        photo_idx = file.split("_")[0]
        # Load npy archive
        data = np.load(os.path.join(validation_path,"annotations", file))
        
        if photo_idx not in annotations_dict:
            annotations_dict[photo_idx] = ['',[],[],[]] # the first element is the path to the image that will be empty by now

        type_of_data = file.split("_")[1].split(".")[0]
        if type_of_data == "exp":
            annotations_dict[photo_idx][1].append(int(data.item()))
        elif type_of_data == "val":
            annotations_dict[photo_idx][2].append(float(data.item()))
        elif type_of_data == "aro":
            annotations_dict[photo_idx][3].append(float(data.item()))

# Convert annotations dictionary to DataFrame
annotations = pd.DataFrame.from_dict(annotations_dict, orient='index', columns=INTERIM_COLUMNS_AFFECTNET)

# Print the DataFrame
print(annotations)

     path label_cat          val           aro
4915            [6]  [-0.285714]    [0.404762]
374             [7]  [-0.634921]    [0.626984]
2007            [4]  [-0.229129]    [0.876669]
2455            [1]   [0.912698]    [0.246032]
591             [5]  [-0.587302]    [0.484127]
...   ...       ...          ...           ...
139             [3]   [0.391986]    [0.745257]
5341            [4]  [-0.119048]    [0.809524]
2452            [0]        [0.0]  [0.00793651]
984             [1]   [0.600506]     [0.13169]
2553            [7]  [-0.319396]    [0.333914]

[3999 rows x 4 columns]


In [4]:
more_annot = []
for index, row in annotations.iterrows():
    if len(row['val']) < 1 | len(row['aro']) < 1 | len(row['label_cat']) < 1:
        more_annot.append(index)
print("There are {} files with less than one annotation in one of its columns".format(len(more_annot)))

There are 0 files with less than one annotation in one of its columns


Count how many have more than one annotation

In [5]:
more_annot = []
for index, row in annotations.iterrows():
    if len(row['val']) > 1 | len(row['aro']) > 1 | len(row['label_cat']) > 1:
        more_annot.append(index)
print("There are {} files with more than one annotation".format(len(more_annot)))

There are 0 files with more than one annotation


We redefine pandas to have only the items, not the vectors.

In [6]:
annotations = annotations.applymap(lambda x: x[0] if isinstance(x, list) else x)
print(annotations)

     path  label_cat       val       aro
4915               6 -0.285714  0.404762
374                7 -0.634921  0.626984
2007               4 -0.229129  0.876669
2455               1  0.912698  0.246032
591                5 -0.587302  0.484127
...   ...        ...       ...       ...
139                3  0.391986  0.745257
5341               4 -0.119048  0.809524
2452               0  0.000000  0.007937
984                1  0.600506  0.131690
2553               7 -0.319396  0.333914

[3999 rows x 4 columns]


  annotations = annotations.applymap(lambda x: x[0] if isinstance(x, list) else x)


Correspond to the number of images and each one has its own annotation. First we add the column to have the path annotated. 

In [7]:
# Obtener una lista de todos los archivos en el subdirectorio
file_list = os.listdir(os.path.join(validation_path, "images"))
no_correspondence = []

# Iterate through all archives
for file in file_list:
    if file.endswith(".jpg"):
        # Get the file id
        photo_idx = file.split(".")[0]
        if photo_idx in annotations_dict:
            annotations.loc[photo_idx,'path'] = os.path.join(validation_path, "images", file)
        else:
            no_correspondence.append(photo_idx)
    else:
        print("File {} is not a jpg".format(file))
# Print the DataFrame
print(annotations)

                                                   path  label_cat       val  \
4915  /mnt/gpid08/datasets/affectnet/val_set/images/...          6 -0.285714   
374   /mnt/gpid08/datasets/affectnet/val_set/images/...          7 -0.634921   
2007  /mnt/gpid08/datasets/affectnet/val_set/images/...          4 -0.229129   
2455  /mnt/gpid08/datasets/affectnet/val_set/images/...          1  0.912698   
591   /mnt/gpid08/datasets/affectnet/val_set/images/...          5 -0.587302   
...                                                 ...        ...       ...   
139   /mnt/gpid08/datasets/affectnet/val_set/images/...          3  0.391986   
5341  /mnt/gpid08/datasets/affectnet/val_set/images/...          4 -0.119048   
2452  /mnt/gpid08/datasets/affectnet/val_set/images/...          0  0.000000   
984   /mnt/gpid08/datasets/affectnet/val_set/images/...          1  0.600506   
2553  /mnt/gpid08/datasets/affectnet/val_set/images/...          7 -0.319396   

           aro  
4915  0.404762  
374  

In [8]:
print("There are {} files without correspondence".format(len(no_correspondence)))

There are 0 files without correspondence


So we can confirm that all photos in this dataset has a complete annotation.

# Distribution of data:
First we compute the data distribution of the continuous variables. 

In [9]:
valence = []
arousal = []
continuous_annotations = []
for photo_idx in range(len(annotations)):
    sample = annotations.iloc[photo_idx]
    valence.append({'valence': round(sample["val"], 2)})
    arousal.append({'arousal': round(sample["aro"], 2)})
    continuous_annotations.append({'valence': round(sample["val"], 2), 'arousal': round(sample["aro"], 2)})
# Create a dataframe with sample data
valence = pd.DataFrame(valence)
arousal = pd.DataFrame(arousal)
continuous_annotations = pd.DataFrame(continuous_annotations)

Now we will see the value distribution:


In [10]:
valence_stats = {
    'Continuous emotion:': 'Valence',
    'Max': round(annotations['val'].max(), 2),
    'Min': round(annotations['val'].min(), 2),
    'Mean': round(annotations['val'].mean(), 2),
    'Q1': round(annotations['val'].quantile(0.25), 2),
    'Q2 (Median)': round(annotations['val'].median(), 2),
    'Q3': round(annotations['val'].quantile(0.75), 2)
}

arousal_stats = {
    'Continuous emotion:': 'Arousal',
    'Max': round(annotations['aro'].max(), 2),
    'Min': round(annotations['aro'].min(), 2),
    'Mean': round(annotations['aro'].mean(), 2),
    'Q1': round(annotations['aro'].quantile(0.25), 2),
    'Q2 (Median)': round(annotations['aro'].median(), 2),
    'Q3': round(annotations['aro'].quantile(0.75), 2)
}

# Create dataframes from the statistics dictionaries
arousal_stats_df = pd.DataFrame(arousal_stats, index=['Arousal'])
valence_stats_df = pd.DataFrame(valence_stats, index=['Valence'])

# Concatenate the dataframes
statistics_df = pd.concat([arousal_stats_df, valence_stats_df])

# Create a pretty table
table = PrettyTable()
table.field_names = statistics_df.columns

# Add rows to the table
for index, row in statistics_df.iterrows():
    table.add_row(row)

# Display the table
print(table)


+---------------------+------+-------+-------+-------+-------------+------+
| Continuous emotion: | Max  |  Min  |  Mean |   Q1  | Q2 (Median) |  Q3  |
+---------------------+------+-------+-------+-------+-------------+------+
|       Arousal       | 0.98 | -0.67 |  0.35 |  0.02 |     0.44    | 0.67 |
|       Valence       | 0.98 | -0.99 | -0.19 | -0.61 |    -0.18    | 0.03 |
+---------------------+------+-------+-------+-------+-------------+------+


Decide bin size

In [11]:
# Calculate bin size for valence
number_of_bins = 8
valence_range = round(valence['valence'].max()) - round(valence['valence'].min())
valence_bin_size = valence_range / number_of_bins  # Change 5 to the number of bins you want

# Calculate bin size for arousal
arousal_range = round(arousal['arousal'].max()) - round(arousal['arousal'].min())
arousal_bin_size = arousal_range / number_of_bins  # Change 5 to the number of bins you want

In [12]:
# Compute the mean for valence and arousal
valence_mean = round(annotations['val'].mean(), 2); arousal_mean = round(annotations['aro'].mean(), 2)


# Create a histogram with mean line for valence
valence_hist = alt.Chart(valence).mark_bar(color="steelblue", width=20).encode(
    alt.X("valence", bin=alt.Bin(step=valence_bin_size), scale=alt.Scale(domain=[-1, 1]),
          axis=alt.Axis(labelAngle=0, title="Valence intervals", labelFlush=False)),
    alt.Y('count()'),
).properties(
    title=alt.TitleParams(text='Valence Distribution', fontSize=16),
    width=300,
    height=400
)
# Create a histogram with mean line for arousal
arousal_hist = alt.Chart(arousal).mark_bar(color="steelblue", width=20).encode(
    alt.X("arousal", bin=alt.Bin(step=arousal_bin_size), scale=alt.Scale(domain=[-1, 1]),
          axis=alt.Axis(labelAngle=0, title="Arousal intervals", labelFlush=False)),
    y='count()',
).properties(
    title=alt.TitleParams(text='Arousal Distribution', fontSize=16),
    width=300,
    height=400
)


# Add mean line for valence
valence_mean_line = alt.Chart(pd.DataFrame({'mean': [valence_mean]})).mark_rule(color='black', size = 3).encode(
    x='mean'
)

valence_mean_text = alt.Chart(pd.DataFrame({'mean': [valence_mean]})).mark_text(
    align='right', dx=-5, dy = -180
).encode(
    x='mean',
    text=alt.value('Mean: ' + str(valence_mean))
)

# Add mean line for arousal
arousal_mean_line = alt.Chart(pd.DataFrame({'mean': [arousal_mean]})).mark_rule(color='black', size = 3).encode(
    x='mean'
)

arousal_mean_text = alt.Chart(pd.DataFrame({'mean': [arousal_mean]})).mark_text(
    align='right', dx=-5, dy = -180
).encode(
    x='mean',
    text=alt.value('Mean: ' + str(arousal_mean))
)

# Concatenate the histograms and mean lines horizontally
combined_chart = alt.hconcat(
    (valence_hist + valence_mean_line + valence_mean_text), 
    (arousal_hist + arousal_mean_line + arousal_mean_text)
).resolve_scale(y='shared').properties(
    title=alt.TitleParams(text="Continuous annotations distributions", anchor='middle', fontSize=20, dy=-20),
)

combined_chart

Now I will create the plot shown in the paper to see if the results are similar (and confirm that the distribution of data is similar to train). First define general values for the plot.

In [134]:
# Define the general heatmap parameters
total_bins = 40
step_size = 0.25; num_steps = 7
axis_steps= np.arange(-1, 1+step_size, step_size); bin_edges = np.linspace(-1, 1, total_bins+1)  
# Compute the 2D histogram
histogram, xedges, yedges = np.histogram2d(
    continuous_annotations['valence'], 
    continuous_annotations['arousal'], 
    bins=[bin_edges, bin_edges]
)
# Find the maximum count
max_count = histogram.max()
legend_steps = np.logspace(np.log10(1), np.log10(max_count), num_steps).astype(int)
#legend_steps = np.linspace(0, max_count, num=5).astype(int)
print(legend_steps)

# Now add the dotted grid lines:
axis_steps = axis_steps [1:-1] # remove the first and last element of the grid
new_step_size = step_size/10 # define the new step size for plotting the points
axis_steps2 = np.arange(-1+new_step_size, 1, new_step_size)
grid_points = pd.DataFrame({ # create a dataframe with the grid points
    'valence': np.concatenate((np.repeat(axis_steps, len(axis_steps2)), np.tile(axis_steps2, len(axis_steps)))),
    'arousal': np.concatenate((np.tile(axis_steps2, len(axis_steps)), np.repeat(axis_steps, len(axis_steps2))))
})

[  1   2   5  12  28  65 152]


In [144]:
# Create the heatmap using Altair
heatmap = alt.Chart(continuous_annotations).mark_rect().encode(
    x=alt.X('valence:Q', bin=alt.Bin(maxbins=total_bins), scale = alt.Scale(domain=[-1, 1]), 
            axis = alt.Axis(values=axis_steps, labelAngle=0, title="Valence intervals", labelFlush=False)),
    y=alt.Y('arousal:Q', bin=alt.Bin(maxbins=total_bins), scale = alt.Scale(domain=[-1, 1]), 
            axis = alt.Axis(values=axis_steps, labelAngle=0, title="Arousal intervals", labelFlush=False)),
    color=alt.Color('count():Q', scale=alt.Scale(type = 'log', scheme='darkblue ', domain=[1, max_count]), 
                    legend=alt.Legend(title="Count (log-scale)", values=legend_steps)),
    tooltip=[
        alt.Tooltip('valence:Q', bin=alt.Bin(maxbins=total_bins), title='Valence Interval'),
        alt.Tooltip('arousal:Q', bin=alt.Bin(maxbins=total_bins), title='Arousal Interval'),
        'count()'
    ]
).properties(
    title='Heatmap',
    width=400,
    height=400
)
# Create the dotted grid using mark_circle
dotted_grid = alt.Chart(grid_points).mark_circle(
    size=5, color='black', opacity=0.4
).encode(
    x=alt.X('valence:Q', scale=alt.Scale(domain=[-1, 1]), axis = alt.Axis(grid = False)),
    y=alt.Y('arousal:Q', scale=alt.Scale(domain=[-1, 1]), axis = alt.Axis(grid = False))
)

# Overlay the heatmap and the dotted grid
heatmap_with_dotted_grid = (dotted_grid + heatmap).properties(title = 'Valence/Arousal heatmap').interactive()

# Show the heatmap with dotted grid
heatmap_with_dotted_grid

SchemaValidationError: 'darkblue ' is an invalid value for `scheme`. Valid values are:

- One of ['accent', 'category10', 'category20', 'category20b', 'category20c', 'dark2', 'paired', 'pastel1', 'pastel2', 'set1', 'set2', 'set3', 'tableau10', 'tableau20']
- One of ['blues', 'tealblues', 'teals', 'greens', 'browns', 'greys', 'purples', 'warmgreys', 'reds', 'oranges']
- One of ['turbo', 'viridis', 'inferno', 'magma', 'plasma', 'cividis', 'bluegreen', 'bluegreen-3', 'bluegreen-4', 'bluegreen-5', 'bluegreen-6', 'bluegreen-7', 'bluegreen-8', 'bluegreen-9', 'bluepurple', 'bluepurple-3', 'bluepurple-4', 'bluepurple-5', 'bluepurple-6', 'bluepurple-7', 'bluepurple-8', 'bluepurple-9', 'goldgreen', 'goldgreen-3', 'goldgreen-4', 'goldgreen-5', 'goldgreen-6', 'goldgreen-7', 'goldgreen-8', 'goldgreen-9', 'goldorange', 'goldorange-3', 'goldorange-4', 'goldorange-5', 'goldorange-6', 'goldorange-7', 'goldorange-8', 'goldorange-9', 'goldred', 'goldred-3', 'goldred-4', 'goldred-5', 'goldred-6', 'goldred-7', 'goldred-8', 'goldred-9', 'greenblue', 'greenblue-3', 'greenblue-4', 'greenblue-5', 'greenblue-6', 'greenblue-7', 'greenblue-8', 'greenblue-9', 'orangered', 'orangered-3', 'orangered-4', 'orangered-5', 'orangered-6', 'orangered-7', 'orangered-8', 'orangered-9', 'purplebluegreen', 'purplebluegreen-3', 'purplebluegreen-4', 'purplebluegreen-5', 'purplebluegreen-6', 'purplebluegreen-7', 'purplebluegreen-8', 'purplebluegreen-9', 'purpleblue', 'purpleblue-3', 'purpleblue-4', 'purpleblue-5', 'purpleblue-6', 'purpleblue-7', 'purpleblue-8', 'purpleblue-9', 'purplered', 'purplered-3', 'purplered-4', 'purplered-5', 'purplered-6', 'purplered-7', 'purplered-8', 'purplered-9', 'redpurple', 'redpurple-3', 'redpurple-4', 'redpurple-5', 'redpurple-6', 'redpurple-7', 'redpurple-8', 'redpurple-9', 'yellowgreenblue', 'yellowgreenblue-3', 'yellowgreenblue-4', 'yellowgreenblue-5', 'yellowgreenblue-6', 'yellowgreenblue-7', 'yellowgreenblue-8', 'yellowgreenblue-9', 'yellowgreen', 'yellowgreen-3', 'yellowgreen-4', 'yellowgreen-5', 'yellowgreen-6', 'yellowgreen-7', 'yellowgreen-8', 'yellowgreen-9', 'yelloworangebrown', 'yelloworangebrown-3', 'yelloworangebrown-4', 'yelloworangebrown-5', 'yelloworangebrown-6', 'yelloworangebrown-7', 'yelloworangebrown-8', 'yelloworangebrown-9', 'yelloworangered', 'yelloworangered-3', 'yelloworangered-4', 'yelloworangered-5', 'yelloworangered-6', 'yelloworangered-7', 'yelloworangered-8', 'yelloworangered-9', 'darkblue', 'darkblue-3', 'darkblue-4', 'darkblue-5', 'darkblue-6', 'darkblue-7', 'darkblue-8', 'darkblue-9', 'darkgold', 'darkgold-3', 'darkgold-4', 'darkgold-5', 'darkgold-6', 'darkgold-7', 'darkgold-8', 'darkgold-9', 'darkgreen', 'darkgreen-3', 'darkgreen-4', 'darkgreen-5', 'darkgreen-6', 'darkgreen-7', 'darkgreen-8', 'darkgreen-9', 'darkmulti', 'darkmulti-3', 'darkmulti-4', 'darkmulti-5', 'darkmulti-6', 'darkmulti-7', 'darkmulti-8', 'darkmulti-9', 'darkred', 'darkred-3', 'darkred-4', 'darkred-5', 'darkred-6', 'darkred-7', 'darkred-8', 'darkred-9', 'lightgreyred', 'lightgreyred-3', 'lightgreyred-4', 'lightgreyred-5', 'lightgreyred-6', 'lightgreyred-7', 'lightgreyred-8', 'lightgreyred-9', 'lightgreyteal', 'lightgreyteal-3', 'lightgreyteal-4', 'lightgreyteal-5', 'lightgreyteal-6', 'lightgreyteal-7', 'lightgreyteal-8', 'lightgreyteal-9', 'lightmulti', 'lightmulti-3', 'lightmulti-4', 'lightmulti-5', 'lightmulti-6', 'lightmulti-7', 'lightmulti-8', 'lightmulti-9', 'lightorange', 'lightorange-3', 'lightorange-4', 'lightorange-5', 'lightorange-6', 'lightorange-7', 'lightorange-8', 'lightorange-9', 'lighttealblue', 'lighttealblue-3', 'lighttealblue-4', 'lighttealblue-5', 'lighttealblue-6', 'lighttealblue-7', 'lighttealblue-8', 'lighttealblue-9']
- One of ['blueorange', 'blueorange-3', 'blueorange-4', 'blueorange-5', 'blueorange-6', 'blueorange-7', 'blueorange-8', 'blueorange-9', 'blueorange-10', 'blueorange-11', 'brownbluegreen', 'brownbluegreen-3', 'brownbluegreen-4', 'brownbluegreen-5', 'brownbluegreen-6', 'brownbluegreen-7', 'brownbluegreen-8', 'brownbluegreen-9', 'brownbluegreen-10', 'brownbluegreen-11', 'purplegreen', 'purplegreen-3', 'purplegreen-4', 'purplegreen-5', 'purplegreen-6', 'purplegreen-7', 'purplegreen-8', 'purplegreen-9', 'purplegreen-10', 'purplegreen-11', 'pinkyellowgreen', 'pinkyellowgreen-3', 'pinkyellowgreen-4', 'pinkyellowgreen-5', 'pinkyellowgreen-6', 'pinkyellowgreen-7', 'pinkyellowgreen-8', 'pinkyellowgreen-9', 'pinkyellowgreen-10', 'pinkyellowgreen-11', 'purpleorange', 'purpleorange-3', 'purpleorange-4', 'purpleorange-5', 'purpleorange-6', 'purpleorange-7', 'purpleorange-8', 'purpleorange-9', 'purpleorange-10', 'purpleorange-11', 'redblue', 'redblue-3', 'redblue-4', 'redblue-5', 'redblue-6', 'redblue-7', 'redblue-8', 'redblue-9', 'redblue-10', 'redblue-11', 'redgrey', 'redgrey-3', 'redgrey-4', 'redgrey-5', 'redgrey-6', 'redgrey-7', 'redgrey-8', 'redgrey-9', 'redgrey-10', 'redgrey-11', 'redyellowblue', 'redyellowblue-3', 'redyellowblue-4', 'redyellowblue-5', 'redyellowblue-6', 'redyellowblue-7', 'redyellowblue-8', 'redyellowblue-9', 'redyellowblue-10', 'redyellowblue-11', 'redyellowgreen', 'redyellowgreen-3', 'redyellowgreen-4', 'redyellowgreen-5', 'redyellowgreen-6', 'redyellowgreen-7', 'redyellowgreen-8', 'redyellowgreen-9', 'redyellowgreen-10', 'redyellowgreen-11', 'spectral', 'spectral-3', 'spectral-4', 'spectral-5', 'spectral-6', 'spectral-7', 'spectral-8', 'spectral-9', 'spectral-10', 'spectral-11']
- One of ['rainbow', 'sinebow']
- Of type 'object'