In [2]:
import json
import os
import math
import librosa
import numpy as np

In [3]:
class AudioFeatureExtractor:
    """Base class holding one loaded audio file plus the segmentation settings.

    The constructor loads the audio with ``librosa.load`` and stores every
    argument on the instance; it defines no ``extract`` method itself.
    """

    def __init__(self, audio_path, samples_per_segment, num_mfcc_vectors_per_segment, data,
                 label, num_segments=3, num_mfcc=40, num_mels=40, n_fft=2048, hop_length=512):
        """Load ``audio_path`` and remember the extraction parameters.

        Args:
            audio_path: Path of the audio file handed to ``librosa.load``.
            samples_per_segment: Stored as-is on the instance.
            num_mfcc_vectors_per_segment: Stored as-is on the instance.
            data: Object stored by reference as ``self.data`` (not copied).
            label: Stored as-is on the instance.
            num_segments: Stored as-is on the instance (default 3).
            num_mfcc: Stored as-is on the instance (default 40).
            num_mels: Stored as-is on the instance (default 40).
            n_fft: Stored as-is on the instance (default 2048).
            hop_length: Stored as-is on the instance (default 512).
        """
        # Source file and its decoded contents.
        self.file_path = audio_path
        loaded_signal, loaded_rate = librosa.load(audio_path)
        self.signal = loaded_signal
        self.sr = loaded_rate

        # Segmentation settings.
        self.samples_per_segment = samples_per_segment
        self.num_mfcc_vectors_per_segment = num_mfcc_vectors_per_segment
        self.num_segments = num_segments

        # Spectral analysis settings.
        self.num_mfcc = num_mfcc
        self.n_fft = n_fft
        self.num_mels = num_mels
        self.hop_length = hop_length

        # Output bookkeeping.
        self.label = label
        self.data = data

In [4]:
class MFCCExtractor(AudioFeatureExtractor):
    """Extractor that appends per-segment MFCC matrices to ``self.data["mfcc"]``."""

    def extract(self):
        """Compute MFCCs for each segment and record the ones of the expected length.

        For every segment index in ``range(self.num_segments)`` the matching
        slice of ``self.signal`` is passed to ``librosa.feature.mfcc``. The
        transposed result is kept only when its length equals
        ``self.num_mfcc_vectors_per_segment``; kept segments are appended to
        ``self.data["mfcc"]`` (as nested lists), ``self.label - 1`` is appended
        to ``self.data["labels"]``, and a progress line is printed.
        """
        for seg_idx in range(self.num_segments):
            begin = seg_idx * self.samples_per_segment
            end = begin + self.samples_per_segment
            chunk = self.signal[begin:end]

            coeffs = librosa.feature.mfcc(
                y=chunk,
                sr=self.sr,
                n_mfcc=self.num_mfcc,
                n_fft=self.n_fft,
                hop_length=self.hop_length,
            ).T

            # Skip segments that did not yield the expected number of vectors.
            if len(coeffs) != self.num_mfcc_vectors_per_segment:
                continue

            self.data["mfcc"].append(coeffs.tolist())
            self.data["labels"].append(self.label - 1)
            print("{}, segment:{}".format(self.file_path, seg_idx + 1))

In [5]:
class MelSpectrogramExtractor(AudioFeatureExtractor):
    """Extractor that appends per-segment log-mel spectrograms to ``self.data['mels']``."""

    def extract(self):
        """Compute a dB-scaled mel spectrogram per segment and record valid ones.

        Each segment slice of ``self.signal`` goes through
        ``librosa.feature.melspectrogram`` and ``librosa.power_to_db``. The
        transposed result is kept only when its length equals
        ``self.num_mfcc_vectors_per_segment``; kept segments are appended to
        ``self.data['mels']`` (as nested lists), ``self.label - 1`` is appended
        to ``self.data["labels"]``, and a progress line is printed.
        """
        for seg_idx in range(self.num_segments):
            begin = seg_idx * self.samples_per_segment
            end = begin + self.samples_per_segment
            chunk = self.signal[begin:end]

            power_spec = librosa.feature.melspectrogram(
                y=chunk,
                sr=self.sr,
                n_fft=self.n_fft,
                n_mels=self.num_mels,
                hop_length=self.hop_length,
            )
            db_frames = librosa.power_to_db(power_spec).T

            # Skip segments that did not yield the expected number of vectors.
            if len(db_frames) != self.num_mfcc_vectors_per_segment:
                continue

            self.data['mels'].append(db_frames.tolist())
            self.data["labels"].append(self.label - 1)
            print("{}, segment:{}".format(self.file_path, seg_idx + 1))

In [6]:
class STFTExtractor(AudioFeatureExtractor):
    """Extractor that appends per-segment STFT chromagrams to ``self.data["chroma_stft"]``."""

    def extract(self):
        """Compute a dB-scaled STFT chromagram per segment and record valid ones.

        Each segment slice of ``self.signal`` goes through
        ``librosa.feature.chroma_stft`` (with ``self.num_mels`` used as the
        ``n_chroma`` argument) and ``librosa.power_to_db``. The transposed
        result is kept only when its length equals
        ``self.num_mfcc_vectors_per_segment``; kept segments are appended to
        ``self.data["chroma_stft"]`` (as nested lists), ``self.label - 1`` is
        appended to ``self.data["labels"]``, and a progress line is printed.
        """
        for seg_idx in range(self.num_segments):
            begin = seg_idx * self.samples_per_segment
            end = begin + self.samples_per_segment
            chunk = self.signal[begin:end]

            chroma = librosa.feature.chroma_stft(
                y=chunk,
                sr=self.sr,
                n_chroma=self.num_mels,
                n_fft=self.n_fft,
                hop_length=self.hop_length,
            )
            db_frames = librosa.power_to_db(chroma).T

            # Skip segments that did not yield the expected number of vectors.
            if len(db_frames) != self.num_mfcc_vectors_per_segment:
                continue

            self.data["chroma_stft"].append(db_frames.tolist())
            self.data["labels"].append(self.label - 1)
            print("{}, segment:{}".format(self.file_path, seg_idx + 1))

In [7]:
class CQTExtractor(AudioFeatureExtractor):
    """Extractor that appends per-segment constant-Q chromagrams to ``self.data["chroma_cqt"]``."""

    def extract(self):
        """Compute a dB-scaled CQT chromagram per segment and record valid ones.

        Each segment slice of ``self.signal`` goes through
        ``librosa.feature.chroma_cqt`` (with ``self.num_mels`` used as the
        ``n_chroma`` argument and 80 bins per octave) and
        ``librosa.power_to_db``. The transposed result is kept only when its
        length equals ``self.num_mfcc_vectors_per_segment``; kept segments are
        appended to ``self.data["chroma_cqt"]`` (as nested lists),
        ``self.label - 1`` is appended to ``self.data["labels"]``, and a
        progress line is printed.

        Extraction stays best-effort: a segment whose feature computation
        raises is skipped, but the failure is reported instead of being
        silently discarded.
        """
        for segment in range(self.num_segments):
            start = self.samples_per_segment * segment
            finish = start + self.samples_per_segment

            try:
                # hop_length is forwarded so the frame count matches the
                # expectation derived from the same hop_length, as in the
                # other extractors.
                chroma_cqt = librosa.feature.chroma_cqt(
                    y=self.signal[start:finish],
                    sr=self.sr,
                    n_chroma=self.num_mels,
                    bins_per_octave=80,
                    hop_length=self.hop_length,
                )
                log_chroma_cqt = librosa.power_to_db(chroma_cqt).T
            except Exception as exc:
                # Only Exception is caught (not KeyboardInterrupt/SystemExit),
                # and only around the feature computation, so bookkeeping
                # errors below are not hidden.
                print("{}, segment:{} skipped: {}".format(self.file_path, segment+1, exc))
                continue

            if len(log_chroma_cqt) == self.num_mfcc_vectors_per_segment:
                self.data["chroma_cqt"].append(log_chroma_cqt.tolist())
                self.data["labels"].append(self.label-1)
                print("{}, segment:{}".format(self.file_path, segment+1))

In [8]:
def create_feature_extractor(feature:str):
    """Return the extractor class registered for ``feature``.

    Args:
        feature: One of ``"mfcc"``, ``"mels"``, ``"chroma_cqt"`` or
            ``"chroma_stft"``.

    Returns:
        The matching extractor class (not an instance).

    Raises:
        ValueError: If ``feature`` is not one of the supported names. Without
            this, an unknown name would yield ``None`` and fail later with an
            unhelpful "'NoneType' object is not callable".
    """
    if feature == "mfcc":
        return MFCCExtractor
    if feature == "mels":
        return MelSpectrogramExtractor
    if feature == "chroma_cqt":
        return CQTExtractor
    if feature == "chroma_stft":
        return STFTExtractor
    raise ValueError(
        "Unknown feature {!r}; expected one of 'mfcc', 'mels', 'chroma_cqt', 'chroma_stft'".format(feature)
    )

In [9]:
def save_mfcc(dataset_path, feature, num_segments=3, hop_length=512):
    """Walk ``dataset_path`` and collect ``feature`` data for every file found.

    Every directory yielded by ``os.walk`` other than ``dataset_path`` itself
    is treated as one emotion class: its base name is appended to
    ``data["emotion"]`` and each file in it is run through the extractor
    class returned by ``create_feature_extractor(feature)``.

    Args:
        dataset_path: Root directory to walk.
        feature: Feature name; used both as the key in the returned dict and
            to select the extractor class.
        num_segments: Number of segments each signal is split into. Forwarded
            to the extractor so it agrees with ``samples_per_segment``.
        hop_length: Hop length used to derive the expected number of vectors
            per segment. Forwarded to the extractor so both use the same value.

    Returns:
        dict with keys ``'emotion'``, ``feature`` and ``'labels'``, filled in
        by this function and by the extractors.

    Note:
        ``samples_per_signal`` is read from module (notebook) scope, so the
        configuration cell defining it must run before this function is
        called.
    """
    data = {
              'emotion':[],
               feature:[],
              'labels':[]
           }

    samples_per_segment = int(samples_per_signal / num_segments)
    num_mfcc_vectors_per_segment = math.ceil(samples_per_segment / hop_length)

    # Resolve the extractor class once instead of once per file.
    extractor_cls = create_feature_extractor(feature)

    # os.walk yields the root as os.fspath(top); compare by value, not identity.
    root_path = os.fspath(dataset_path)

    print(f'Extracting {feature}', end='')
    for i, (dirpath, dirnames, filenames) in enumerate(os.walk(dataset_path)):

        if dirpath == root_path:
            continue

        # basename handles the platform's separators instead of a hard-coded "\\".
        emotion = os.path.basename(dirpath)
        data["emotion"].append(emotion)
        print("\nProcessing: {}".format(emotion))

        for file in filenames:
            file_path = os.path.join(dirpath, file)
            # The walk index is passed as the label; the extractors in this
            # file store ``label - 1``.
            extractor = extractor_cls(
                file_path,
                samples_per_segment,
                num_mfcc_vectors_per_segment,
                data,
                label=i,
                num_segments=num_segments,
                hop_length=hop_length,
            )
            extractor.extract()

    return data

In [None]:
# Path to the audio data: one sub-directory per emotion. Edit for your machine.
dataset_path = r"D:\SER-RO-MAHA\Sample Speech Data"
sample_rate = 22050
duration = 3 #in seconds
# Number of samples in one signal of `duration` seconds at `sample_rate`.
samples_per_signal = sample_rate * duration

In [12]:
import warnings

# Silence all warnings for the remainder of the session.
warnings.filterwarnings('ignore')

# Destination of the serialized features.
json_path = 'mfcc_data.json'

# Extract MFCC features for the whole dataset.
mfcc_data = save_mfcc(dataset_path, feature='mfcc')

# Persist the collected features as indented JSON.
with open(json_path, 'w') as out_file:
    json.dump(mfcc_data, out_file, indent=4)

Extracting chroma_cqt
Processing: Anger
D:\SER-RO-MAHA\Sample Speech Data\Anger\03-01-05-01-01-01-01.wav, segment:1
D:\SER-RO-MAHA\Sample Speech Data\Anger\JE_a15.wav, segment:1
D:\SER-RO-MAHA\Sample Speech Data\Anger\JE_a15.wav, segment:2
D:\SER-RO-MAHA\Sample Speech Data\Anger\JE_a15.wav, segment:3

Processing: Disgust
D:\SER-RO-MAHA\Sample Speech Data\Disgust\03-01-07-01-01-01-01.wav, segment:1
D:\SER-RO-MAHA\Sample Speech Data\Disgust\03-01-07-01-01-01-01.wav, segment:2

Processing: Fear
D:\SER-RO-MAHA\Sample Speech Data\Fear\03-01-06-01-01-01-01.wav, segment:1
D:\SER-RO-MAHA\Sample Speech Data\Fear\03-01-06-01-01-01-01.wav, segment:2

Processing: Happy
D:\SER-RO-MAHA\Sample Speech Data\Happy\03-01-03-01-01-01-01.wav, segment:1

Processing: Neutral
D:\SER-RO-MAHA\Sample Speech Data\Neutral\03-01-01-01-01-01-01.wav, segment:1

Processing: Sad
D:\SER-RO-MAHA\Sample Speech Data\Sad\03-01-04-01-01-01-01.wav, segment:1

Processing: Surprise
D:\SER-RO-MAHA\Sample Speech Data\Surprise\03-