In [1]:
#pip install librosa
# Uncomment and run the line above if librosa is not installed yet.

In [2]:
#pip install soundfile
# Uncomment and run the line above if soundfile is not installed yet.

In [None]:
import numpy as np
import torch
import os
import librosa as lb
import soundfile

def list_folders(path):
    """Return the paths of every sub-directory found beneath ``path``.

    The tree is traversed recursively with ``os.walk``. ``path`` itself is
    not included in the result; each entry is the walked parent directory
    joined with the sub-directory name.
    """
    return [
        os.path.join(parent, name)
        for parent, subdirs, _ in os.walk(path)
        for name in subdirs
    ]

def list_files(path):
    """Return the paths of the regular files located directly in ``path``.

    Entries whose name starts with a dot are ignored (this also covers
    ``._*`` names), and sub-directories are neither returned nor descended
    into.
    """
    # Dot-prefixed entries are skipped because they caused loading problems;
    # the filter is a precaution rather than a requirement.
    candidates = (
        os.path.join(path, name)
        for name in os.listdir(path)
        if not name.startswith('.')
    )
    return [candidate for candidate in candidates if os.path.isfile(candidate)]

def audio_features(wav_file_path, mfcc = True, chroma = False, mel = False,sample_rate = 22050):
    """Build a 1-D feature vector for a single audio file.

    Args:
        wav_file_path: Path of the audio file handed to ``librosa.load``.
        mfcc: When true, append the mean of 40 MFCC coefficients.
        chroma: When true, append the mean chroma features computed from the
            magnitude of the STFT.
        mel: When true, append the mean of a 40-band mel spectrogram
            covering 0 to ``sample_rate // 2``.
        sample_rate: Sampling rate requested from ``librosa.load``; the rate
            returned by the loader is the one used for the features.

    Returns:
        The enabled features concatenated in the order mfcc, chroma, mel
        (an empty array if none is enabled), or ``None`` when the file could
        not be loaded or the loaded signal is not one-dimensional.
    """
    # Loading problems are reported and signalled with None instead of raised.
    try:
        signal, sample_rate = lb.load(wav_file_path, sr=sample_rate)
    except Exception as e:
        print(f"Error loading {wav_file_path}: {e}")
        return None

    # Only single-channel audio is handled.
    if signal.ndim != 1:
        return None

    # The empty seed keeps the result an array even when no feature is enabled.
    pieces = [np.array([])]

    if mfcc:
        mfcc_matrix = lb.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=40)
        pieces.append(np.mean(mfcc_matrix.T, axis=0))

    if chroma:
        magnitude = np.abs(lb.stft(signal))
        chroma_matrix = lb.feature.chroma_stft(S=magnitude, sr=sample_rate)
        pieces.append(np.mean(chroma_matrix.T, axis=0))

    if mel:
        mel_matrix = lb.feature.melspectrogram(
            y=signal, sr=sample_rate, n_mels=40, fmin=0, fmax=sample_rate//2
        )
        pieces.append(np.mean(mel_matrix.T, axis=0))

    return np.hstack(pieces)

# The labels can be read from the file names: names are dash-separated and
# the third field is the emotion code listed below.
ravdess_label_dict = {"01": "neutral", "02": "calm", "03": "happy", "04": "sad", "05": "angry", "06": "fear", "07": "disgust", "08": "surprise"}

folders = list_folders("/path/to/your/dataset")
label_dataset = []
train_dataset = []
for folder in folders:
    files = list_files(folder)
    for _file in files:
        # basename/splitext rather than splitting on "/" so that the parsing
        # does not depend on the platform's path separator.
        name_fields = os.path.splitext(os.path.basename(_file))[0].split("-")
        label = name_fields[2] if len(name_fields) > 2 else None
        if label not in ravdess_label_dict:
            # A stray file must not abort the whole run with an
            # IndexError/KeyError; report it and move on.
            print(f"Skipping {_file}: no known emotion code in the file name")
            continue

        result = audio_features(_file)
        if result is None:
            # audio_features returns None when the file could not be loaded
            # or is not single channel. Appending None would break the array
            # conversion below, so the file (and its label) is left out.
            print(f"Skipping {_file}: no features could be extracted")
            continue

        label_num = int(label) - 1  # convert the label to a 0-based index
        train_dataset.append(result)
        label_dataset.append(label_num)

# Convert the lists of feature arrays and labels to numpy arrays, then tensors.
train_dataset = np.array(train_dataset)
label_dataset = np.array(label_dataset)

train_dataset = torch.tensor(train_dataset,dtype=torch.float)
label_dataset = torch.tensor(label_dataset,dtype=torch.long)

print(train_dataset.shape)
print(label_dataset.shape)
