In [2]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os


In [13]:
# Load the metadata table and the title / keyword embedding matrices.
metadata_df = pd.read_csv('metadata.csv')

# np.load on an .npz returns an NpzFile that keeps the archive open until it
# is closed; reading inside a `with` block releases the file handle as soon as
# the "embeddings" array has been pulled into memory.
with np.load("metadata_title_embeddings.npz") as title_archive:
    title_embeddings = title_archive["embeddings"]
with np.load("metadata_keywords_embeddings.npz") as keyword_archive:
    keyword_embeddings = keyword_archive["embeddings"]

print(f"Shape of title embeddings: {title_embeddings.shape}")
print(f"Shape of keyword embeddings: {keyword_embeddings.shape}")

Shape of title embeddings: (9026, 1024)
Shape of keyword embeddings: (9026, 1024)


In [14]:
# Load the annotation table and the embedding matrix stored for its texts.
annotations_df = pd.read_csv('annotations.csv')

# Close the .npz archive right after the "embeddings" array is read.
with np.load("annotations_text_embeddings.npz") as annotations_archive:
    annotations_embeddings = annotations_archive["embeddings"]

print(f"Shape of annotations embeddings: {annotations_embeddings.shape}")

# A notebook cell only renders its final expression, so the preview of the
# first rows has to be the last statement to show up at all.
annotations_df.head(n=3)

Shape of annotations embeddings: (35826, 1024)


In [50]:
# Load audio features
feature_filename = metadata_df.loc[0, "filename"].replace("mp3", "npz")
features = np.load(os.path.join("audio_features", feature_filename))
# print(list(features.keys())) # keys: ['embeddings', 'melspectrogram', 'mfcc', 'mfcc_delta', 'mfcc_delta2', 'flatness', 'centroid', 'flux', 'energy', 'power', 'bandwidth', 'contrast', 'zerocrossingrate']
print(f"Shape of audio features: {features['embeddings'].shape}")

print("Shape of ZCR feature (time, n_features)", features["zerocrossingrate"].shape)
print("Shape of MFCC features (time, n_features)", features["mfcc"].shape)

Shape of audio features: (233, 768)
Shape of ZCR feature (time, n_features) (233, 1)
Shape of MFCC features (time, n_features) (233, 32)


# Task 1
Find two interesting recordings with at least two annotators and multiple annotations. Compare the temporal and textual annotations, and try to answer the following questions:<br>
(a) Identify similarities or differences between temporal and textual annotations from different annotators.<br>
(b) To what extent do the annotations rely on or deviate from keywords and textual descriptions in the audio’s metadata?<br>
(c) Were the temporal and text annotations done according to the task description?






In [85]:
# Recordings that were annotated by two or more different annotators
annotations_df = pd.read_csv('annotations.csv')

# separate working copy of the loaded table
annotations_df_copy = annotations_df.copy()

# Per filename, count the distinct annotators and keep only the files
# where that count is at least two
filename_multi_annot = (
    annotations_df_copy
    .groupby("filename")["annotator"]
    .nunique()
    .loc[lambda annotator_count: annotator_count >= 2]
)

# every annotation row that belongs to one of those files
has_multiple_annotators = annotations_df_copy["filename"].isin(filename_multi_annot.index)
filtered_only_multi = annotations_df_copy.loc[has_multiple_annotators]

# the rows of one single file: the first filename in the filtered index
first_multi_file = filename_multi_annot.index[0]
filtered_only_multi_single = annotations_df_copy.loc[annotations_df_copy["filename"] == first_multi_file]

filtered_only_multi_single

Unnamed: 0,task_id,filename,annotator,text,onset,offset,filename_unsafe
7568,161982937,102431.mp3,75058291103840756873316169650564417843042967235442422525023433266794403941324,Baby making mid-pitched unrhythmic non-crying vocal noises,19.648504,23.590653,102431_mild_cryingaif.mp3
12072,161982937,102431.mp3,75058291103840756873316169650564417843042967235442422525023433266794403941324,Baby making mid-pitched non-crying vocal noises,3.423446,5.705743,102431_mild_cryingaif.mp3
15463,161982937,102431.mp3,28251014400027049985537852315625094069312034433417461412837429504269879097216,"Baby crying, repeatedly, natural, indoors, nearby.",0.011457,26.225669,102431_mild_cryingaif.mp3
16641,161982937,102431.mp3,75058291103840756873316169650564417843042967235442422525023433266794403941324,Mid-frequency loud constant baby cry,17.32471,18.735585,102431_mild_cryingaif.mp3
17965,161982937,102431.mp3,75058291103840756873316169650564417843042967235442422525023433266794403941324,Mid-frequency loud constant baby cry,6.016965,8.278514,102431_mild_cryingaif.mp3
18846,161982937,102431.mp3,75058291103840756873316169650564417843042967235442422525023433266794403941324,Mid-frequency loud constant baby cry,23.818883,26.018187,102431_mild_cryingaif.mp3
19487,161982937,102431.mp3,75058291103840756873316169650564417843042967235442422525023433266794403941324,Mid-frequency loud constant baby cry,12.552634,14.917924,102431_mild_cryingaif.mp3
26839,161982937,102431.mp3,75058291103840756873316169650564417843042967235442422525023433266794403941324,Baby making mid-pitched unrhythmic non-crying vocal noises,15.374383,17.075732,102431_mild_cryingaif.mp3
34720,161982937,102431.mp3,75058291103840756873316169650564417843042967235442422525023433266794403941324,Baby making mid-pitched unrhythmic non-crying vocal noises,8.506744,12.324405,102431_mild_cryingaif.mp3
34790,161982937,102431.mp3,75058291103840756873316169650564417843042967235442422525023433266794403941324,"Mid-frequency baby crying sound with short, unrhythmic pulses",0.0,2.821749,102431_mild_cryingaif.mp3


# Task 2
## Annotation Quality (6 points)
Use the audio recordings annotated by multiple annotators to answer the following questions:<br>
(a) How precise are the temporal annotations?<br>
(b) How similar are the text annotations that correspond to the same region?<br>

Use the complete data set (or a subset) to address the following points quantitatively:<br>
(a) How many annotations did we collect per file? How many distinct sound events per file?<br>
(b) How detailed are the text annotations? How much does the quality of annotations vary between different annotators?<br>
(c) Are there any obvious inconsistencies, outliers, or poor-quality annotations in the data? Propose a simple method to filter or fix incorrect or poor-quality annotations (e.g., remove outliers, typos, or spelling errors).



In [88]:
# filtered_only_multi holds the annotation rows of every file that has at
# least two different annotators, so it is the basis for this task.
# Ending the cell with the frame itself (instead of print) lets Jupyter render
# it as a table rather than as wrapped plain text.
filtered_only_multi.head(n=10)

Unnamed: 0,task_id,filename,annotator,text,onset,offset,filename_unsafe
7,161983960,141704.mp3,79870316315349615302327941860366026996801340811276116342195412772582394036186,crowd of people howling in arena,0.0,7.187978,141704_Mexican Lucha Libre Crowd yelling whistling and screaming during a match in Mexico City.mp3
11,161985829,56696.mp3,16884412433998590958295201046418790702475322036579696423693650596719891757391,A man talking loudly in the background,25.802332,27.372698,56696_58_GRASSwav.mp3
13,161977997,690473.mp3,42867469332459215984090181974491309564784967972641108761833013123393953003359,birds chirping,12.282189,13.192846,690473_City parking lot ambience with cars passing by restaurant hum bird singing wind in foliage man voice and whistle.mp3
29,161977946,656432.mp3,52071871693074840541222807364176644469330956565548297515640298411415286315997,bird from the background singing,8.580975,17.935011,656432_Ambience - Bird Forrestwav.mp3
33,161986483,114286.mp3,89605346468130623187845926321735893498068174586262008506078141839731327750098,"A washing machine hums loudly, gradually decreasing, with a vibrating sound in a laundry room",10.729478,24.596077,114286_washingmachine4wav.mp3
39,161980218,232200.mp3,64561226020994896301818750922770630849961337120757782389033140156511501318845,quiet voices of people in the background.,0.589127,0.81903,232200_Rain1_FSwav.mp3
53,161978723,620543.mp3,58864092964652614671996668460845143199712161241153187413867581031635218825787,Dog barking loudly in the background,11.925154,13.471761,620543_AMBMisc_Dogs Barking And SquealingDog PlaygroundParkDist Traffic 1_EM_EqOOsprddnswav.mp3
65,161982010,659002.mp3,78768200617931249718756519132900835009181565565677152494580209767523616303610,A wave loudly splashing against something nearby,17.804721,21.278458,659002_Waves Lapping Shore.mp3
74,161987675,560530.mp3,71714977193455752457308303974933244667376894328297014884399140380547807112516,a note playing,6.01405,7.062654,560530_clean blues 3wav.mp3
80,161984202,700522.mp3,55121482465506679224605825803506469881306248144699669975790603067132125825774,Children talking in french pointedly.,7.448445,10.254913,700522_french kids playing at nightwav.mp3


# How precise are temporal annotations?
# How similar are the text annotations that correspond to the same region?

Answer here

# Task 3 (This is what we need to present)
## Audio Features (6 points)
Load and analyze the audio features:<br>
(a) Which audio features appear useful? Select only the most relevant ones or perform a down-projection for the next steps.<br>
(b) Extract a fixed-length feature vector for each annotated region as well as for all the silent parts in between. The most straightforward way to do this is to average the audio features of the corresponding region over time, as shown in the tutorial session.<br>
(c) Cluster the audio features for the extracted regions. Can you identify meaningful clusters of audio features? Do the feature vectors of the silent regions predominantly fall into one large cluster?
