In [7]:
# Emotion classes in label-id order: the position in this tuple is the class id.
label2id = {
    name: idx
    for idx, name in enumerate(
        ('Happy', 'Sad', 'Angry', 'Anxious', 'Hurt', 'Embarrassed', 'Neutrality')
    )
}
# Inverse lookup (id -> name), derived from label2id so the two tables stay in sync.
id2label = {idx: name for name, idx in label2id.items()}

In [20]:
import os
import random

def find_wav_files(base_dir, is_train:bool, per_class=100):
    """Collect the .wav paths of the emotion corpus, grouped by emotion folder.

    The function walks
    ``<base_dir>/01.데이터/<split>/원천데이터/<set>/<set>/1.감정/<emotion>/<sub-folder>/*.wav``
    where ``<split>``/``<set>`` are ``1.Training``/``TS1`` for the training
    split and ``2.Validation``/``VS1`` otherwise.

    Args:
        base_dir: Root directory of the dataset.
        is_train: True selects the training split, False the validation split.
        per_class: Currently unused (no sampling is performed); kept so that
            existing callers passing it keep working.

    Returns:
        dict mapping every known emotion folder name (e.g. ``'1.기쁨'``) to the
        list of .wav file paths found beneath it. Known folders without any
        file map to an empty list. Paths are listed in sorted order so the
        result does not depend on the filesystem's enumeration order.

    Raises:
        OSError: propagated from ``os.listdir`` when the emotion directory
            cannot be listed (for example because it does not exist).
    """
    sub_root = '1.Training' if is_train else '2.Validation'
    source_set = ('T' if is_train else 'V') + 'S1'

    # Emotion folder name -> English label. Only the folder names (keys) are
    # used below; the values document what each folder means.
    folder2label = {
        '1.기쁨': 'Happy',
        '2.슬픔': 'Sad',
        '3.분노': 'Angry',
        '4.불안': 'Anxious',
        '5.상처': 'Hurt',
        '6.당황': 'Embarrassed',
        '7.중립': 'Neutrality'
    }
    emotion_dir = os.path.join(
        base_dir,
        '01.데이터',
        sub_root,
        '원천데이터',
        source_set,
        source_set,
        '1.감정'
    )
    wav_path_dict = {folder: [] for folder in folder2label}

    for emo in sorted(os.listdir(emotion_dir)):
        emo_path = os.path.join(emotion_dir, emo)
        # Ignore stray files and any directory that is not a known emotion
        # folder; indexing the result dict with an unknown name used to raise
        # KeyError.
        if emo not in wav_path_dict or not os.path.isdir(emo_path):
            continue

        for sub_folder in sorted(os.listdir(emo_path)):
            sub_emo_path = os.path.join(emo_path, sub_folder)
            if not os.path.isdir(sub_emo_path):
                continue

            wav_path_dict[emo].extend(
                os.path.join(sub_emo_path, f)
                for f in sorted(os.listdir(sub_emo_path))
                if f.endswith('.wav')
            )
    return wav_path_dict


In [21]:
# Dataset root. Set the EMOTION_TTS_DATA_DIR environment variable to point the
# notebook at another location; when it is unset, the original local path is used.
BASE_DIR = os.environ.get(
    'EMOTION_TTS_DATA_DIR',
    r'C:\Users\SSAFY\Downloads\015.감성 및 발화 스타일별 음성합성 데이터',
)
# Training-split .wav paths, grouped by emotion folder name.
wav_path_dict = find_wav_files(BASE_DIR, is_train=True)

In [25]:
# Show one example path and the number of files for every emotion folder.
for emo, paths in wav_path_dict.items():
    # A folder may have yielded no files; indexing [0] on an empty list would
    # raise IndexError, so print a placeholder instead.
    first_path = paths[0] if paths else '(no files)'
    print(f'{emo}: {first_path}')
    print(len(paths))

1.기쁨: C:\Users\SSAFY\Downloads\015.감성 및 발화 스타일별 음성합성 데이터\01.데이터\1.Training\원천데이터\TS1\TS1\1.감정\1.기쁨\0001_G1A3E1S0C0_PSB\0001_G1A3E1S0C0_PSB_000001.wav
66097
2.슬픔: C:\Users\SSAFY\Downloads\015.감성 및 발화 스타일별 음성합성 데이터\01.데이터\1.Training\원천데이터\TS1\TS1\1.감정\2.슬픔\0001_G1A3E2S0C0_PSB\0001_0001_G1A3E2S0C0_PSB_000001.wav
64359
3.분노: C:\Users\SSAFY\Downloads\015.감성 및 발화 스타일별 음성합성 데이터\01.데이터\1.Training\원천데이터\TS1\TS1\1.감정\3.분노\0001_G1A3E3S0C0_PSB\0001_G1A3E3S0C0_PSB_000001.wav
65715
4.불안: C:\Users\SSAFY\Downloads\015.감성 및 발화 스타일별 음성합성 데이터\01.데이터\1.Training\원천데이터\TS1\TS1\1.감정\4.불안\0001_G1A3E4S0C0_PSB\0001_G1A3E4S0C0_PSB_000001.wav
65853
5.상처: C:\Users\SSAFY\Downloads\015.감성 및 발화 스타일별 음성합성 데이터\01.데이터\1.Training\원천데이터\TS1\TS1\1.감정\5.상처\0002_G1A4E5S0C0_LYT\0002_G1A4E5S0C0_LYT_000001.wav
63740
6.당황: C:\Users\SSAFY\Downloads\015.감성 및 발화 스타일별 음성합성 데이터\01.데이터\1.Training\원천데이터\TS1\TS1\1.감정\6.당황\0002_G1A4E6S0C0_LYT\0002_G1A4E6S0C0_LYT_000001.wav
64160
7.중립: C:\Users\SSAFY\Downloads\015.감성 및 발화 스타일별 음성합성 데이터\01

In [26]:
import librosa
import numpy as np

def extract_mfcc(file_path, n_mfcc=20, sr=16000):
    """Load an audio file and compute its MFCC matrix.

    Args:
        file_path: Path of the audio file passed to ``librosa.load``.
        n_mfcc: Number of coefficients requested from ``librosa.feature.mfcc``.
        sr: Sampling rate requested from ``librosa.load``. Defaults to 16000,
            the value that used to be hard-coded.

    Returns:
        The array returned by ``librosa.feature.mfcc``, shape (n_mfcc, frame).
    """
    # Use the rate reported by the loader rather than the requested one, so the
    # MFCC call always matches the signal that was actually returned.
    signal, sample_rate = librosa.load(file_path, sr=sr)
    return librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc)  # shape: (n_mfcc, frame)


In [None]:
from tqdm import tqdm

# Module-level store for MFCC matrices, one inner dict per emotion label.
# NOTE(review): the keys here are the English labels from label2id, whereas
# find_wav_files returns the dataset folder names as keys -- confirm that the
# code filling this store uses the same key set.
mfcc_list_dict = {label: {} for label in label2id}

def get_top_features(wav_path_dict: dict, top_k=10, n_mfcc=20):
    """Return the indices of the MFCC coefficients with the largest mean value.

    Every file's MFCC matrix is averaged along axis 1 into one vector, the
    per-file vectors are averaged into one vector per emotion, and the
    per-emotion vectors are averaged into a single global vector. The indices
    of that vector are returned in descending order of value.

    Side effect: every MFCC matrix is stored in the module-level
    ``mfcc_list_dict`` as ``mfcc_list_dict[emo][path]``.
    NOTE(review): with a large corpus this keeps every matrix in memory.

    NOTE(review): ranking by mean value picks the coefficients that are
    largest on average, which is not necessarily the same as the coefficients
    that separate the emotions best -- confirm this is the intended criterion.

    Args:
        wav_path_dict: Mapping from emotion name to a list of audio paths.
        top_k: Number of indices to return (fewer when ``n_mfcc`` is smaller).
        n_mfcc: Number of coefficients requested from ``extract_mfcc``.

    Returns:
        1-D array of coefficient indices, largest global mean first.

    Raises:
        ValueError: if ``wav_path_dict`` contains no audio path at all.
    """
    emotion_means = []
    for emo, path_list in wav_path_dict.items():
        # mfcc_list_dict is pre-populated with the label2id names, but callers
        # pass other keys (e.g. dataset folder names); create the bucket on
        # demand instead of failing with KeyError.
        emo_store = mfcc_list_dict.setdefault(emo, {})

        file_means = []
        for path in tqdm(path_list, desc=f'{emo}: '):
            mfcc = extract_mfcc(path, n_mfcc=n_mfcc)
            emo_store[path] = mfcc
            file_means.append(np.mean(mfcc, axis=1))

        # An emotion without files would contribute a NaN scalar and make the
        # stack below fail on mismatched shapes; leave it out of the average.
        if not file_means:
            continue
        emotion_means.append(np.mean(file_means, axis=0))

    if not emotion_means:
        raise ValueError('wav_path_dict contains no audio files')

    global_mean = np.mean(np.stack(emotion_means), axis=0)
    return np.argsort(global_mean)[::-1][:top_k]


In [37]:
# Runs MFCC extraction over every file in wav_path_dict. In the recorded run
# below this took roughly 15-17 minutes per emotion class.
top_indices = get_top_features(wav_path_dict=wav_path_dict)

1.기쁨: 100%|██████████| 66097/66097 [15:37<00:00, 70.51it/s]
2.슬픔: 100%|██████████| 64359/64359 [15:19<00:00, 69.99it/s]
3.분노: 100%|██████████| 65715/65715 [15:26<00:00, 70.89it/s]
4.불안: 100%|██████████| 65853/65853 [16:47<00:00, 65.35it/s]
5.상처: 100%|██████████| 63740/63740 [17:29<00:00, 60.75it/s]
6.당황: 100%|██████████| 64160/64160 [16:50<00:00, 63.48it/s]
7.중립: 100%|██████████| 64000/64000 [15:56<00:00, 66.91it/s]


In [61]:
# Display the selected MFCC coefficient indices, ordered from the largest to
# the smallest global mean.
top_indices

array([ 1,  3,  5,  2,  6, 13, 11, 15, 17, 19])

In [None]:
import librosa
import numpy as np

# Stand-alone example on a single clip: load 'sample.wav', requesting a
# 16000 Hz sampling rate from librosa.
y, sr = librosa.load('sample.wav', sr=16000)
# Compute 20 MFCCs for the loaded signal.
mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)