In [None]:
#!pip install librosa seaborn kagglehub pyarrow fastparquet 

In [6]:
import pandas as pd
import numpy as np
import os
import shutil

import kagglehub
from glob import glob

## Downloading the data

In [8]:
path = kagglehub.dataset_download("uwrfkaggler/ravdess-emotional-speech-audio")

destination_folder = "../data/raw/emotional-speech-audio"
os.makedirs(destination_folder, exist_ok=True)
shutil.move(path, destination_folder)

'../data/raw/emotional-speech-audio/1'

In [9]:
filenames = glob("../data/raw/emotional-speech-audio/*/*/*.wav")
len(filenames)

1440

## Feature Extraction

In [10]:
audio_features = []
for filename in filenames:
    modality = int(filename.split("/")[6].split("-")[0])
    voice_channel = int(filename.split("/")[6].split("-")[1])
    emotion = int(filename.split("/")[6].split("-")[2])
    emontion_intensity = int(filename.split("/")[6].split("-")[3])
    statement = int(filename.split("/")[6].split("-")[4])
    repetition = int(filename.split("/")[6].split("-")[5])
    actor = int(filename.split("/")[6].split("-")[5].split(".")[0])
    gender = "female" if int(actor) % 2 == 0 else "male"
    audio_features.append([filename, modality, voice_channel, emotion, emontion_intensity, statement, repetition, actor, gender])

In [11]:
columns = ["filename", "modality", "vocal_channel", "emotion", "emontion_intensity", "statement", "repetition", "actor", "gender"]
df = pd.DataFrame(audio_features, columns=columns)

In [12]:
df.head(3)

Unnamed: 0,filename,modality,vocal_channel,emotion,emontion_intensity,statement,repetition,actor,gender
0,../data/raw/emotional-speech-audio/1/Actor_10/...,3,1,8,1,2,2,2,female
1,../data/raw/emotional-speech-audio/1/Actor_10/...,3,1,6,2,2,2,2,female
2,../data/raw/emotional-speech-audio/1/Actor_10/...,3,1,5,1,1,1,1,male


In [13]:
modality_map = {1: "full-AV", 2: "video-only", 3: "audio-only"}
vocal_channel_map = {1: "speech", 2: "song"}
emotion_map = {1: "neutral", 2: "calm", 3: "happy", 4: "sad", 5: "angry", 6: "fearful", 7: "disgust", 8: "surprised"}
emotion_intensity_map = {1: "normal", 2: "strong"}
statement_map = {1: "Kids are talking by the door", 2: "Dogs are sitting by the door"}
repetition_map = {1: "1st repetition", 2: "2nd repetition"}

df['modality'] = df['modality'].map(modality_map)
df['vocal_channel'] = df['vocal_channel'].map(vocal_channel_map)
df['emotion'] = df['emotion'].map(emotion_map)
df['emontion_intensity'] = df['emontion_intensity'].map(emotion_intensity_map)
df['statement'] = df['statement'].map(statement_map)
df['repetition'] = df['repetition'].map(repetition_map)

df.sample(3)

Unnamed: 0,filename,modality,vocal_channel,emotion,emontion_intensity,statement,repetition,actor,gender
34,../data/raw/emotional-speech-audio/1/Actor_10/...,audio-only,speech,calm,strong,Kids are talking by the door,2nd repetition,2,female
247,../data/raw/emotional-speech-audio/1/Actor_09/...,audio-only,speech,disgust,normal,Kids are talking by the door,2nd repetition,2,female
328,../data/raw/emotional-speech-audio/1/Actor_20/...,audio-only,speech,calm,normal,Dogs are sitting by the door,2nd repetition,2,female


## Save the data

In [14]:
df.to_parquet("../data/processed/audio_features.parquet", index=False)