In [2]:
import librosa
import os
import glob
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score

# Feature extraction function
def extract_feature(file_name, mfcc=True, chroma=True, mel=True, spectral=True, tonnetz=True):
    X, sample_rate = librosa.load(file_name, res_type='kaiser_fast')
    result = np.array([])

    # Ensure n_fft is never larger than the signal
    n_fft = min(512, len(X))  # Reduce to 512 if the file is too short

    if mfcc:
        mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=60).T, axis=0)
        result = np.hstack((result, mfccs))
    
    if chroma:
        stft = np.abs(librosa.stft(X, n_fft=n_fft))
        chroma_feat = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
        result = np.hstack((result, chroma_feat))
    
    if mel:
        mel_feat = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate, n_fft=n_fft).T, axis=0)
        result = np.hstack((result, mel_feat))
    
    if spectral:
        spectral_contrast = np.mean(librosa.feature.spectral_contrast(y=X, sr=sample_rate, n_fft=n_fft).T, axis=0)
        spectral_bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=X, sr=sample_rate).T, axis=0)
        result = np.hstack((result, spectral_contrast, spectral_bandwidth))

    if tonnetz:
        tonnetz_feat = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T, axis=0)
        result = np.hstack((result, tonnetz_feat))

    return result

# Define emotion mapping for RAVDESS
emotions = {
    '01': 'Neutral', '02': 'Calm', '03': 'Happy', '04': 'Sad',
    '05': 'Angry', '06': 'Fearful', '07': 'Disgust', '08': 'Surprised'
}
observed_emo = set(emotions.values())

# Load dataset function
def load_data(test_size=0.25):
    x, y = [], []

    # Load RAVDESS dataset
    for file in glob.glob('E:/RAVDESS/RAVDESS/Actor_*/*.wav'):
        file_name = os.path.basename(file)
        emotion = emotions.get(file_name.split('-')[2], None)
        if emotion in observed_emo:
            feature = extract_feature(file)
            x.append(feature)
            y.append(emotion)

    # Load TESS dataset
    tess_base = 'E:/TESS/'
    for folder in os.listdir(tess_base):
        folder_path = os.path.join(tess_base, folder)
        if os.path.isdir(folder_path):
            parts = folder.split('_')
            if len(parts) > 1:  # Ensures there's an emotion label
                emotion_label = parts[1].capitalize()
                if emotion_label in observed_emo:
                    for file in glob.glob(f'{folder_path}/*.wav'):
                        feature = extract_feature(file)
                        x.append(feature)
                        y.append(emotion_label)
    
    # Convert to NumPy arrays and split
    x = np.array(x)
    y = np.array(y)
    
    return train_test_split(x, y, test_size=test_size, train_size=0.75, random_state=9)

# Load data
x_train, x_test, y_train, y_test = load_data(test_size=0.25)

# Normalize features
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Train MLP Classifier
model = MLPClassifier(
    alpha=0.001,  # Reduced regularization
    batch_size=128,  # Smaller batch size
    epsilon=1e-08,
    hidden_layer_sizes=(512, 256, 128),  # Deeper network
    learning_rate='adaptive',
    max_iter=800  # More iterations
)

model.fit(x_train, y_train)

# Evaluate model
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f'Accuracy: {accuracy * 100:.2f}%')
print(f'F1 Score: {f1:.2f}')




Accuracy: 63.33%
F1 Score: 0.63
