# Preparation

In [1]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Root folder (relative to the working directory) of the dataset files read below.
IMPRESSIONV2_DIR = Path("data") / "impressionv2"

In [3]:
def load_pickle(path: Path):
    """Read the pickle file at ``path`` and return the deserialized object.

    The stream is unpickled with ``encoding='latin1'``, the setting that
    controls how 8-bit strings written by Python 2 are decoded.

    NOTE(review): unpickling can execute arbitrary code; only call this on
    files from a trusted source.
    """
    with open(path, mode='rb') as pickle_file:
        payload = pickle.load(pickle_file, encoding='latin1')
    return payload

In [4]:
class CollectPlot:
    """Accumulator that folds objects together with ``+=`` or ``*=``.

    ``value`` starts as ``None``. The first operand is stored as-is; every
    later operand is combined with the stored value using ``+`` (for ``+=``)
    or ``*`` (for ``*=``). Read the combined result from ``value``.
    """

    def __init__(self):
        self.value = None

    def __iadd__(self, value):
        """Store ``value`` if nothing is held yet, otherwise add it to ``self.value``."""
        if self.value is None:
            self.value = value
        else:
            # Rebind with `+` rather than `+=`: the first operand is kept by
            # reference, so an in-place add could mutate the caller's object.
            self.value = self.value + value
        return self

    def __imul__(self, value):
        """Store ``value`` if nothing is held yet, otherwise multiply it into ``self.value``."""
        if self.value is None:
            self.value = value
        else:
            # Same reasoning as in __iadd__: never mutate the stored operand in place.
            self.value = self.value * value
        return self

In [5]:
# Both training pickles live in the `train` sub-folder of the dataset root.
train_dir = IMPRESSIONV2_DIR / 'train'
annot_dict = load_pickle(train_dir / 'annotation_training.pkl')
trans_dict = load_pickle(train_dir / 'transcription_training.pkl')

In [6]:
# The ethnicity/gender file is semicolon-separated; the attractiveness/age file
# uses the default comma separator.
eth_csv_path = IMPRESSIONV2_DIR / 'eth_gender_annotations_dev.csv'
att_age_csv_path = IMPRESSIONV2_DIR / 'FI_caucasian_subset--att_age_gender--labels.csv'
eth_df = pd.read_csv(eth_csv_path, sep=';')
att_age_df = pd.read_csv(att_age_csv_path)

In [7]:
# The top-level keys of the annotation dict are used as the trait names below.
traits = list(annot_dict)

In [8]:
# Each top-level key of the annotation dict becomes one column.
annot_df = pd.DataFrame.from_dict(annot_dict, orient='columns')

In [9]:
# Index the ethnicity/gender table by video name so it can be joined on the index.
# Guarded so that re-running this cell does not raise a KeyError once
# `VideoName` has already been moved from the columns into the index.
if eth_df.index.name != 'VideoName':
    eth_df = eth_df.set_index('VideoName')

In [10]:
# Index-on-index left join: every annotation row is kept.
annot_eth_df = annot_df.join(eth_df, how='left')

In [11]:
# Display labels for the integer codes found in the `Ethnicity` column.
eth_key = {
    1: "asian",
    2: "caucasian",
    3: "black",
}

In [12]:
# Distinct YouTube IDs among the annotated rows.
ids = annot_eth_df['YouTubeID'].unique()
n_unique_ids = len(ids)
f"There are {n_unique_ids} unique videos is the training set."

'There are 2624 unique videos is the training set.'

# Number of videos per person

In [13]:
# Group the annotated rows by their YouTube ID.
gb = annot_eth_df.groupby('YouTubeID')

In [14]:
# Plotting library used for every figure below; select the bokeh backend.
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
import holoviews as hv
hv.extension('bokeh')

In [15]:
# Number of rows with a non-null `Gender` for each YouTube ID.
video_per_person = gb['Gender'].count()
# One unit-wide bin per possible count (1..max), each centred on an integer.
# The bin count is derived from the data so it always matches the range below.
n_bins = int(video_per_person.max())
video_per_person_hist = np.histogram(video_per_person, n_bins, (0.5, n_bins + 0.5))
hv.Histogram(video_per_person_hist, kdims='Number of videos per person')

# Available data for gender, ethnicity

In [16]:
# Row counts per gender code and per ethnicity code, shown together as a tuple.
gender_counts = annot_eth_df.groupby(by='Gender')['YouTubeID'].count()
ethnicity_counts = annot_eth_df.groupby(by='Ethnicity')['YouTubeID'].count()
gender_counts, ethnicity_counts

(Gender
 1    2734
 2    3266
 Name: YouTubeID, dtype: int64,
 Ethnicity
 1     215
 2    5162
 3     623
 Name: YouTubeID, dtype: int64)

# The variance of a trait for the same person between different videos

In [17]:
# Name the per-ID count series; the name becomes its column name when merged.
video_per_person = video_per_person.rename('N_vids')

In [18]:
# Attach the per-ID video count (`N_vids`) to every annotated row.
annot_eth_n_df = pd.merge(annot_eth_df, video_per_person, on='YouTubeID')

In [19]:
# Per-ID variance of each trait, restricted to IDs that have exactly 5 rows.
# The filter and the grouped variance do not depend on the trait, so they are
# computed once, and only over the trait columns.
five_video_rows = annot_eth_n_df[annot_eth_n_df['N_vids'] == 5]
trait_variances = five_video_rows.groupby(by='YouTubeID')[traits].var()

plot = CollectPlot()
for trait in traits:
    plot += hv.Distribution(trait_variances[trait])
plot.value

The above plot shows the distribution of the variance of the different traits for the case where we have 5 videos of a person. The variance is usually between 0 and 0.02.

In [20]:
# Distinct YouTube IDs among the rows whose ID has exactly 5 videos.
five_video_ids = annot_eth_n_df.loc[annot_eth_n_df['N_vids'] == 5, 'YouTubeID'].unique()
print(f"There are {len(five_video_ids)} such people.")

There are 122 such people.


# Trait distribution per sex

In [21]:
# Split the rows once by gender code; codes 1 and 2 are labelled 'male' and 'female' here.
male_rows = annot_eth_df[annot_eth_df['Gender'] == 1]
female_rows = annot_eth_df[annot_eth_df['Gender'] == 2]

plot = CollectPlot()
for trait in traits:
    male_dist = hv.Distribution(male_rows[trait], label='male')
    female_dist = hv.Distribution(female_rows[trait], label='female')
    # Overlay both groups in one panel, then append the panel to the layout.
    plot += (male_dist * female_dist).opts(legend_position='top')
plot.value

# Trait distribution per ethnicity

In [22]:
# Ethnicity codes in order of first appearance; the same order is used for every trait.
ethnicity_codes = annot_eth_df['Ethnicity'].unique()

plot = CollectPlot()
for trait in traits:
    # One overlay per trait, with one distribution per ethnicity code.
    plot_trait = CollectPlot()
    for eth in ethnicity_codes:
        eth_rows = annot_eth_df[annot_eth_df['Ethnicity'] == eth]
        plot_trait *= hv.Distribution(eth_rows[trait], label=eth_key[eth])
    plot += plot_trait.value.opts(legend_position='top')
plot.value

# Impact of age and attractiveness

In [23]:
# Match each attractiveness/age row to its annotations: the file-name column on
# the left is matched against the index of the annotation table on the right.
cauc_annot_att_age_df = pd.merge(
    att_age_df,
    annot_eth_df,
    left_on='# video_file_name',
    right_index=True,
)

In [24]:
eth = 2
# Compare as sets: asserting on `ndarray == list` raises ValueError (ambiguous
# truth value) instead of AssertionError when several codes are present, and
# relies on the truthiness of an empty array when there are none.
observed_ethnicities = set(cauc_annot_att_age_df['Ethnicity'].unique())
assert observed_ethnicities == {eth}, (
    f"expected only ethnicity code {eth}, found {observed_ethnicities}"
)
print(f"The preceived age and attractiveness data is only available for people, who are {eth_key[eth]}.")

The preceived age and attractiveness data is only available for people, who are caucasian.


In [25]:
# The perceived-gender column from the attractiveness/age file must agree
# row by row with the `Gender` column from the ethnicity/gender file.
perceived_gender = cauc_annot_att_age_df[' gender_perceived_annotated']
annotated_gender = cauc_annot_att_age_df['Gender']
assert perceived_gender.eq(annotated_gender).all()
print("The preceived and annotated gender is the same.")

The preceived and annotated gender is the same.


In [26]:
def plot_agains_controll(controll):
    """Build one scatter per trait with the ``controll`` column as the key dimension.

    Each scatter is drawn from ``cauc_annot_att_age_df`` and the scatters are
    combined with ``+``. Returns ``None`` when ``traits`` is empty.
    """
    layout = None
    for trait in traits:
        scatter = hv.Scatter(cauc_annot_att_age_df, kdims=controll, vdims=trait)
        if layout is None:
            layout = scatter
        else:
            layout += scatter
    return layout

# Scatter every trait against the predicted average attractiveness column.
plot_agains_controll(' average_attractiveness_value_predicted')

There doesn't seem to be a correlation between the perceived attractiveness and any of the traits.

In [27]:
# Scatter every trait against the predicted average perceived age column.
plot_agains_controll(' avg_perceived_age_predicted')

The data is skewed toward people who are perceived to be between the ages of 20 and 40. Older people (40-60) are labeled as less extroverted, less open and less neurotic, but more agreeable.

# Bonus: age and attractiveness

In [28]:
# Predicted perceived age (key dimension) against predicted attractiveness (value dimension).
hv.Scatter(cauc_annot_att_age_df, kdims=' avg_perceived_age_predicted', vdims=' average_attractiveness_value_predicted')

Young people (20-40) span all levels of attractiveness. Our figure is plagued by the lack of older people. Those who are in the dataset don't have very high or very low levels of perceived attractiveness.