## Importing required libraries

In [17]:
import librosa
import soundfile
import os, glob
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

### Extract features (mfcc, chroma, mel) from a sound file
#### MFCC : Mel-frequency cepstral coefficients (MFCCs) are coefficients that collectively make up an MFC. In the MFC, the frequency bands are equally spaced on the mel scale, which approximates the human auditory system's response more closely than the linearly-spaced frequency bands used in the cepstrum. The MFCCs are derived from a type of cepstral representation of the audio clip (a nonlinear "spectrum-of-a-spectrum").

#### Chroma: It is a representation of the spectral envelope of a sound, computed in a perceptually-motivated way. It is often used as a feature in audio classification.

#### Mel: It is a representation of the short-term power spectrum of a sound. It provides a more robust representation of the overall shape of a spectral envelope.

In [18]:
def feature_extract(name, mfcc, chroma, mel):
    with soundfile.SoundFile(name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate=sound_file.samplerate
        if chroma:
            stft=np.abs(librosa.stft(X))
        output=np.array([])
        if mfcc:
            mfccs=np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
            output=np.hstack((output, mfccs))
        if chroma:
            chroma=np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
            output=np.hstack((output, chroma))
        if mel:
                    mel=np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T,axis=0)
                    output=np.hstack((output, mel))
    return output

## Emotions in the RAVDESS dataset
### RAVDESS dataset contains 7356 files (total size: 24.8 GB). The data is pre-sorted into 24 folders (Actor_*). Each actor has read each of the 60 statements in two emotional states (emotional and neutral) and in two levels of emotional intensity (normal, strong). Each of these two variations constitutes half of the trials. Thus, there are 24 actors * 2 emotional states * 2 levels of intensity * 60 statements = 2880 trials. Each trial has two versions: a 16-bit stereo WAV file, and a corresponding 16-bit mono MP4 file.

In [22]:

emotions={
  '01':'neutral',
  '02':'calm',
  '03':'happy',
  '04':'sad',
  '05':'angry',
  '06':'fearful',
  '07':'disgust',
  '08':'surprised'
}

# Emotions to observe
observed_emotions=['calm', 'happy', 'fearful', 'disgust']

### Load the data and extract features for each sound file

In [25]:
def load_data(test_size=0.2):
    x,y=[],[]
    for file in glob.glob("D:\\AI_DS\\Quarter_3\\Speech_processing\\Final_Project\\speech-emotion-recognition-ravdess-data\\Actor_*\\*.wav"):
        name=os.path.basename(file)
        emotion=emotions[name.split("-")[2]]
        if emotion not in observed_emotions:
            continue
        feature=feature_extract(file, mfcc=True, chroma=True, mel=True)
        x.append(feature)
        y.append(emotion)
    return train_test_split(np.array(x), y, test_size=test_size, random_state=9)

### Split the dataset

In [26]:

x_train,x_test,y_train,y_test=load_data(test_size=0.25)

### Get the shape of the training and testing datasets

In [27]:
print((x_train.shape[0], x_test.shape[0]))

(576, 192)


### Get the number of features extracted

In [28]:
print(f'Features extracted: {x_train.shape[1]}')

Features extracted: 180


### Initialize the Multi Layer Perceptron Classifier

In [29]:
model=MLPClassifier(alpha=0.01, batch_size=256, epsilon=1e-08, hidden_layer_sizes=(112,), learning_rate='adaptive', max_iter=500)

### Train the model

In [30]:
model.fit(x_train,y_train)

### Predict for the test set

In [31]:
y_pred=model.predict(x_test)

### Calculate the accuracy of our model

In [32]:
accuracy=accuracy_score(y_true=y_test, y_pred=y_pred)


### Print the accuracy

In [33]:
print("Accuracy: {:.2f}%".format(accuracy*100))

Accuracy: 63.02%
