# Speech Emotion Recognition using Librosa

### Import Necessary libraries

In [1]:
import librosa
import soundfile
import os, glob, pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn import svm
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
import warnings
warnings.filterwarnings("ignore")

Defining a function extract_feature to extract the mfcc, chroma, and mel features from a sound file. This function takes 4 parameters- the file name and three Boolean parameters for the three features:

    - mfcc: Mel Frequency Cepstral Coefficient, represents the short-term power spectrum of a sound
    - chroma: Pertains to the 12 different pitch classes
    - mel: Mel Spectrogram Frequency

In [2]:
#Extract features (mfcc, chroma, mel) from a sound file

def extract_feature(file_name, mfcc, chroma, mel):
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate=sound_file.samplerate
        if chroma:
            stft=np.abs(librosa.stft(X))
        result=np.array([])
        if mfcc:
            mfccs=np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
            result=np.hstack((result, mfccs))
        if chroma:
            chroma=np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
            result=np.hstack((result, chroma))
        if mel:
            mel=np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
            result=np.hstack((result, mel))
    return result

In [3]:
#Emotions in the RAVDESS dataset
emotions={
  '01':'neutral',
  '02':'calm',
  '03':'happy',
  '04':'sad',
  '05':'angry',
  '06':'fearful',
  '07':'disgust',
  '08':'surprised'
}

#Emotions to observe
observed_emotions=['calm', 'happy', 'fearful', 'disgust']

In [4]:
#Load the data and extract features for each sound file
def load_data():
    x,y=[],[]
    for file in glob.glob("/Users/zaid/Desktop/Speech Emotion Recognition ML Proj/speech-emotion-recognition-ravdess-data/Actor_*/*.wav"):
        file_name=os.path.basename(file)
        emotion=emotions[file_name.split("-")[2]]
        if emotion not in observed_emotions:
            continue
        feature=extract_feature(file, mfcc=True, chroma=True, mel=True)
        x.append(feature)
        y.append(emotion)
    return x,y
    #return train_test_split(np.array(x), y, test_size=test_size, random_state=9)

In [5]:
X,y = load_data()

In [11]:
X = np.array(X)
X = pd.DataFrame(X)
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,170,171,172,173,174,175,176,177,178,179
0,-537.004395,31.666399,-6.542091,4.567605,-7.387805,-13.684702,-17.860582,-11.828494,-0.225548,-11.868469,...,0.000858,0.001073,0.001119,0.001249,0.000946,0.000919,0.001475,0.001867,0.001133,0.000950
1,-383.007202,9.420697,-26.200880,2.628421,-17.898153,-16.619091,-16.220095,-11.781564,-1.197635,-13.881689,...,0.004067,0.004942,0.004713,0.003613,0.003791,0.003923,0.003090,0.002704,0.001470,0.001169
2,-606.815308,42.624638,1.537446,5.558994,-6.486233,-4.224218,-14.775834,-7.948472,-1.429373,-4.777880,...,0.000816,0.000669,0.000448,0.000357,0.000393,0.000801,0.000465,0.000162,0.000173,0.000086
3,-555.719055,43.634399,7.089331,7.287218,-7.250546,-12.701806,-14.231813,-13.405810,0.316495,-10.287708,...,0.006458,0.006884,0.006208,0.006897,0.006973,0.008390,0.007365,0.003125,0.003766,0.003333
4,-526.230774,30.395340,-10.102286,5.732557,-10.387459,-13.651891,-12.542537,-9.009912,0.964212,-7.356021,...,0.001960,0.002502,0.002561,0.005435,0.005965,0.004992,0.003769,0.004138,0.002868,0.001951
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
763,-646.522766,54.202343,10.829182,15.782007,-0.777860,-0.975148,-10.103933,-4.550802,-4.039625,-5.251637,...,0.000128,0.000130,0.000121,0.000157,0.000140,0.000158,0.000147,0.000090,0.000083,0.000072
764,-525.341064,39.242966,-24.850788,14.384969,-11.483785,-7.030712,-12.698215,-8.867377,-2.992077,-9.694036,...,0.000232,0.000190,0.000173,0.000174,0.000319,0.000201,0.000392,0.000203,0.000125,0.000086
765,-627.105347,47.312954,-5.354202,24.124666,-3.931024,2.508424,-6.277518,-1.511968,-0.450381,0.038337,...,0.000038,0.000025,0.000021,0.000023,0.000022,0.000015,0.000011,0.000008,0.000007,0.000004
766,-691.606018,61.090164,12.652380,23.446877,1.102112,9.368806,-1.883122,-3.115121,-2.259609,1.354561,...,0.000016,0.000011,0.000011,0.000023,0.000024,0.000030,0.000013,0.000008,0.000004,0.000002


### K-Means Clustering

In [21]:
from sklearn.cluster import KMeans

In [22]:
kmeans = KMeans(n_clusters=4, random_state=0).fit(X)

In [26]:
y_kmeans = kmeans.labels_

In [24]:
y_numeric = [0]*len(y)
for i in range(len(y)):
    if y[i] == 'fearful':
        y_numeric[i] = 3
    elif y[i] == 'disgust':
        y_numeric[i] = 1
    elif y[i] == 'calm':
        y_numeric[i] = 2
    elif y[i] == 'happy':
        y_numeric[i] = 0

In [25]:
y_numeric = np.array(y_numeric)
y_numeric

array([3, 3, 1, 1, 0, 0, 2, 2, 0, 0, 2, 2, 3, 3, 1, 1, 2, 2, 0, 0, 1, 1,
       3, 3, 1, 1, 3, 3, 2, 2, 0, 0, 2, 2, 0, 0, 1, 1, 3, 3, 1, 1, 3, 3,
       2, 2, 0, 0, 3, 3, 1, 1, 0, 0, 2, 2, 0, 0, 2, 2, 3, 3, 1, 1, 0, 0,
       2, 2, 3, 3, 1, 1, 3, 3, 1, 1, 0, 0, 2, 2, 1, 1, 3, 3, 2, 2, 0, 0,
       2, 2, 0, 0, 1, 1, 3, 3, 1, 1, 3, 3, 2, 2, 0, 0, 2, 2, 0, 0, 1, 1,
       3, 3, 0, 0, 2, 2, 3, 3, 1, 1, 3, 3, 1, 1, 0, 0, 2, 2, 3, 3, 1, 1,
       0, 0, 2, 2, 0, 0, 2, 2, 3, 3, 1, 1, 2, 2, 0, 0, 1, 1, 3, 3, 1, 1,
       3, 3, 2, 2, 0, 0, 2, 2, 0, 0, 1, 1, 3, 3, 1, 1, 3, 3, 2, 2, 0, 0,
       3, 3, 1, 1, 0, 0, 2, 2, 0, 0, 2, 2, 3, 3, 1, 1, 0, 0, 2, 2, 3, 3,
       1, 1, 3, 3, 1, 1, 0, 0, 2, 2, 1, 1, 3, 3, 2, 2, 0, 0, 2, 2, 0, 0,
       1, 1, 3, 3, 1, 1, 3, 3, 2, 2, 0, 0, 2, 2, 0, 0, 1, 1, 3, 3, 0, 0,
       2, 2, 3, 3, 1, 1, 3, 3, 1, 1, 0, 0, 2, 2, 0, 0, 2, 2, 3, 3, 1, 1,
       3, 3, 1, 1, 0, 0, 2, 2, 1, 1, 3, 3, 2, 2, 0, 0, 2, 2, 0, 0, 1, 1,
       3, 3, 1, 1, 3, 3, 2, 2, 0, 0, 2, 2, 0, 0, 1,

In [27]:
#Calculate the accuracy of our model
accuracy=accuracy_score(y_true = y_numeric, y_pred = y_kmeans)

#Print the accuracy
print("Accuracy: {:.2f}%".format(accuracy*100))

Accuracy: 40.49%


In [28]:
x_train,x_test,y_train,y_test=train_test_split(X, y, test_size=0.20, random_state=9)

In [29]:
x_train.shape

(614, 180)

In [30]:
#Get the shape of the training and testing datasets
print((x_train.shape[0], x_test.shape[0]))

(614, 154)


In [31]:
#Get the number of features extracted
print(f'Features extracted: {x_train.shape[1]}')

Features extracted: 180


### Gaussian Naive Bayesian

In [32]:
from sklearn.naive_bayes import GaussianNB

clf = GaussianNB()

In [33]:
clf.fit(x_train, y_train)

GaussianNB()

In [34]:
y_pred_gnb = clf.predict(x_test)

In [35]:
#Calculate the accuracy of our model
accuracy=accuracy_score(y_true=y_test, y_pred=y_pred_gnb)
#Print the accuracy
print("Accuracy: {:.2f}%".format(accuracy*100))

Accuracy: 40.91%


### Support Vector Machine Classifier

In [36]:
model2 = svm.SVC()
model2.fit(x_train,y_train)

SVC()

In [37]:
y_pred_svc = model2.predict(x_test)

In [38]:
#Calculate the accuracy of our model
accuracy=accuracy_score(y_true=y_test, y_pred=y_pred_svc)
#Print the accuracy
print("Accuracy: {:.2f}%".format(accuracy*100))

Accuracy: 42.21%


### Random Forest Classifier

In [39]:
from sklearn.ensemble import RandomForestClassifier

In [40]:
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(x_train, y_train)

RandomForestClassifier(max_depth=2, random_state=0)

In [41]:
y_pred_rf = clf.predict(x_test)

In [42]:
#Calculate the accuracy of our model
accuracy=accuracy_score(y_true=y_test, y_pred=y_pred_rf)
#Print the accuracy
print("Accuracy: {:.2f}%".format(accuracy*100))

Accuracy: 48.70%


### Decision Trees

In [43]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(random_state=0)

In [44]:
clf.fit(x_train, y_train)

DecisionTreeClassifier(random_state=0)

In [45]:
y_pred_dt = clf.predict(x_test)

In [46]:
#Calculate the accuracy of our model
accuracy=accuracy_score(y_true=y_test, y_pred=y_pred_dt)
#Print the accuracy
print("Accuracy: {:.2f}%".format(accuracy*100))

Accuracy: 51.95%


### Perceptron Classifier

In [47]:
from sklearn.linear_model import Perceptron

clf = Perceptron(tol=1e-3, random_state=0)
clf.fit(x_train, y_train)

Perceptron()

In [48]:
y_pred_perceptron = clf.predict(x_test)

In [49]:
#DataFlair - Calculate the accuracy of model
accuracy=accuracy_score(y_true=y_test, y_pred=y_pred_perceptron)
#DataFlair - Print the accuracy
print("Accuracy: {:.2f}%".format(accuracy*100))

Accuracy: 38.31%


### Multi Layer Perceptron Classifier

In [50]:
model=MLPClassifier(alpha=0.01, batch_size=256, epsilon=1e-08, hidden_layer_sizes=(300,), learning_rate='adaptive', max_iter=500)

In [51]:
# Train the model
model.fit(x_train,y_train)

MLPClassifier(alpha=0.01, batch_size=256, hidden_layer_sizes=(300,),
              learning_rate='adaptive', max_iter=500)

In [52]:
# Predict for the test set
y_pred=model.predict(x_test)

In [53]:
#DataFlair - Calculate the accuracy of our model
accuracy=accuracy_score(y_true=y_test, y_pred=y_pred)
#DataFlair - Print the accuracy
print("Accuracy: {:.2f}%".format(accuracy*100))

Accuracy: 72.73%
