In [1]:
import librosa
import os
import glob
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score

def extract_feature(file_name, mfcc=True, chroma=True, mel=True):
    X, sample_rate = librosa.load(file_name, res_type='kaiser_fast')
    result = np.array([])
    
    if mfcc:
        mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
        result = np.hstack((result, mfccs))
    
    if chroma:
        stft = np.abs(librosa.stft(X))
        chroma_feat = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
        result = np.hstack((result, chroma_feat))
    
    if mel:
        mel_feat = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
        result = np.hstack((result, mel_feat))
    
    return result

# Define emotion mapping for RAVDESS
emotions = {
    '01': 'Neutral', '02': 'Calm', '03': 'Happy', '04': 'Sad',
    '05': 'Angry', '06': 'Fearful', '07': 'Disgust', '08': 'Surprised'
}
observed_emo = set(emotions.values())

def load_data(test_size=0.2):
    x, y = [], []
    
    # Load RAVDESS dataset
    for file in glob.glob('E:/RAVDESS/RAVDESS/Actor_*/*.wav'):
        file_name = os.path.basename(file)
        emotion = emotions.get(file_name.split('-')[2], None)
        if emotion in observed_emo:
            feature = extract_feature(file, mfcc=True, chroma=True, mel=True)
            x.append(feature)
            y.append(emotion)
    
    # Load TESS dataset
    tess_base = 'E:/TESS/TESS Toronto emotional speech set data/'
    for folder in os.listdir(tess_base):
        folder_path = os.path.join(tess_base, folder)
        if os.path.isdir(folder_path):
            parts = folder.split('_')
            if len(parts) > 1:  # Ensures there's an emotion label
                emotion_label = parts[1].capitalize()
                if emotion_label in observed_emo:
                    for file in glob.glob(f'{folder_path}/*.wav'):
                        feature = extract_feature(file, mfcc=True, chroma=True, mel=True)
                        x.append(feature)
                        y.append(emotion_label)
    return train_test_split(np.array(x), np.array(y), test_size=test_size, train_size=0.75, random_state=9)

# Load data
x_train, x_test, y_train, y_test = load_data(test_size=0.25)

# Train MLP Classifier
model = MLPClassifier(alpha=0.01, batch_size=256, epsilon=1e-08, hidden_layer_sizes=(300,), learning_rate='adaptive', max_iter=500)
model.fit(x_train, y_train)

# Evaluate model
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
print(f'Accuracy: {accuracy * 100:.2f}%')
print(f'F1 Score: {f1:.2f}')


Accuracy: 74.42%
F1 Score: 0.71
