## Importing Data

In [1]:
path = '../input/ravdess-emotional-speech-audio'

In [2]:
import pandas as pd
import numpy as np
import os
import random
import matplotlib.pyplot as plt

In [3]:
def preprocess(path):
    file_names = os.listdir(path)
    male_files = file_names[0::2]
    female_files = file_names[1::2]
    audio_files = []
    for i in male_files:
        [audio_files.append([os.path.join(i,file), 'happy', 'male']) for file in os.listdir(os.path.join(path, i)) if file.startswith("03-01-03")]
        [audio_files.append([os.path.join(i,file), 'sad', 'male']) for file in os.listdir(os.path.join(path, i)) if file.startswith("03-01-04")]
    for i in female_files:
        [audio_files.append([os.path.join(i,file), 'happy', 'female']) for file in os.listdir(os.path.join(path, i)) if file.startswith("03-01-03")]
        [audio_files.append([os.path.join(i,file), 'sad', 'female']) for file in os.listdir(os.path.join(path, i)) if file.startswith("03-01-04")]
    return audio_files

In [4]:
audio_files = preprocess(path)
print('audio_files shape: ',np.shape(audio_files))

audio_files shape:  (384, 3)


In [5]:
audio_files = pd.DataFrame(audio_files)
audio_files = audio_files.sample(frac=1).reset_index(drop=True)
audio_files.columns = ['Audio Path', 'Emotion', 'Gender']
audio_files.head()

Unnamed: 0,Audio Path,Emotion,Gender
0,Actor_06/03-01-04-02-02-01-06.wav,sad,male
1,Actor_13/03-01-04-02-01-02-13.wav,sad,male
2,Actor_05/03-01-04-01-01-01-05.wav,sad,male
3,Actor_14/03-01-03-01-01-01-14.wav,happy,male
4,Actor_22/03-01-03-02-02-01-22.wav,happy,male


In [6]:
audio_files.nunique()

Audio Path    384
Emotion         2
Gender          2
dtype: int64

## Extracting MFCC and mel spec features

In [7]:
import librosa

In [8]:
from sklearn.preprocessing import minmax_scale
def normalize(x, axis=0):
    return minmax_scale(x, axis=axis)

In [9]:
def Data(audio_files, function):
    data = []
    for i in range(len(audio_files)):
        x , sr = librosa.load(os.path.join(path, audio_files['Audio Path'][i]))
        if function == 'mfcc':
            data.append(np.mean(librosa.feature.mfcc(y=x, sr=sr, n_mfcc=128).T,axis=0))
        else:
            data.append(np.mean(librosa.feature.melspectrogram(y=x, sr=sr, n_mels=128).T,axis=0))
        
    data = pd.DataFrame(data)
    data['class'] = audio_files['Emotion']
    X = data.iloc[:,:-1]
    X = normalize(X)
    y = data.iloc[:,-1]
    return X,y

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [11]:
encoder = LabelEncoder()
encoder.fit(audio_files['Emotion'])

LabelEncoder()

In [12]:
X, y = Data(audio_files, 'mfcc')
X_train_mfcc, X_test_mfcc, y_train_mfcc, y_test_mfcc = train_test_split(X, y, test_size=0.33, random_state=42)

In [13]:
y_train_mfcc = encoder.transform(y_train_mfcc)
y_test_mfcc = encoder.transform(y_test_mfcc)

In [14]:
X, y = Data(audio_files, 'melspectrogram')
X_train_melspec, X_test_melspec, y_train_melspec, y_test_melspec = train_test_split(X, y, test_size=0.33, random_state=42)

In [15]:
y_train_melspec = encoder.transform(y_train_melspec)
y_test_melspec = encoder.transform(y_test_melspec)

## Logistic Regression

In [121]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [122]:
model_mfcc_lr = LogisticRegression(random_state=0).fit(X_train_mfcc, y_train_mfcc)
pred_mfcc_lr = model_mfcc_lr.predict(X_test_mfcc)
print(accuracy_score(y_test_mfcc, pred_mfcc_lr))
print(confusion_matrix(y_test_mfcc, pred_mfcc_lr))
print(classification_report(y_test_mfcc, pred_mfcc_lr))

0.8110236220472441
[[54  7]
 [17 49]]
              precision    recall  f1-score   support

           0       0.76      0.89      0.82        61
           1       0.88      0.74      0.80        66

    accuracy                           0.81       127
   macro avg       0.82      0.81      0.81       127
weighted avg       0.82      0.81      0.81       127



In [124]:
model_melspec_lr = LogisticRegression(random_state=0).fit(X_train_melspec, y_train_melspec)
pred_melspec_lr = model_melspec_lr.predict(X_test_melspec)
print(accuracy_score(y_test_melspec, pred_melspec_lr))
print(confusion_matrix(y_test_melspec, pred_melspec_lr))
print(classification_report(y_test_melspec, pred_melspec_lr))

0.7322834645669292
[[38 23]
 [11 55]]
              precision    recall  f1-score   support

           0       0.78      0.62      0.69        61
           1       0.71      0.83      0.76        66

    accuracy                           0.73       127
   macro avg       0.74      0.73      0.73       127
weighted avg       0.74      0.73      0.73       127



## CNN

In [125]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, InputLayer, UpSampling2D

In [126]:
from tensorflow.keras.callbacks import EarlyStopping
callback = EarlyStopping(monitor='val_loss', patience=3)

In [127]:
tf.keras.backend.clear_session()
model_mfcc_cnn = Sequential()
model_mfcc_cnn.add(InputLayer(input_shape=(16, 8, 1)))
model_mfcc_cnn.add(Conv2D(filters=64, kernel_size=(3,3), activation='relu', padding = "same"))
model_mfcc_cnn.add(MaxPooling2D(pool_size=(2, 2), strides=(2,2), padding='valid'))
model_mfcc_cnn.add(Conv2D(filters=32, kernel_size=(3, 3), activation='relu', padding = "same"))
model_mfcc_cnn.add(MaxPooling2D(pool_size=(2, 2), strides=(2,2), padding='valid'))
model_mfcc_cnn.add(Conv2D(filters=8, kernel_size=(3, 3), activation='relu', padding = "same"))
model_mfcc_cnn.add(MaxPooling2D(pool_size=(2, 2), strides=(2,2), padding='valid'))
model_mfcc_cnn.add(Flatten())
model_mfcc_cnn.add(Dense(32, activation='relu'))
model_mfcc_cnn.add(Dense(len(audio_files['Emotion'].unique()), activation='softmax'))

In [128]:
model_mfcc_cnn.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [129]:
history_mfcc = model_mfcc_cnn.fit(X_train_mfcc.reshape(X_train_mfcc.shape[0], 16, 8, 1),
                    y_train_mfcc, epochs=500,
                    validation_data=(X_test_mfcc.reshape(X_test_mfcc.shape[0], 16, 8, 1), y_test_mfcc),
                    callbacks=[callback])

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500


In [132]:
tf.keras.backend.clear_session()
model_melspec_cnn = Sequential()
model_melspec_cnn.add(InputLayer(input_shape=(16, 8, 1)))
model_melspec_cnn.add(Conv2D(filters=64, kernel_size=(3,3), activation='relu', padding = "same"))
model_melspec_cnn.add(MaxPooling2D(pool_size=(2, 2), strides=(2,2), padding='valid'))
model_melspec_cnn.add(Conv2D(filters=32, kernel_size=(3, 3), activation='relu', padding = "same"))
model_melspec_cnn.add(MaxPooling2D(pool_size=(2, 2), strides=(2,2), padding='valid'))
model_melspec_cnn.add(Conv2D(filters=8, kernel_size=(3, 3), activation='relu', padding = "same"))
model_melspec_cnn.add(MaxPooling2D(pool_size=(2, 2), strides=(2,2), padding='valid'))
model_melspec_cnn.add(Flatten())
model_melspec_cnn.add(Dense(32, activation='relu'))
model_melspec_cnn.add(Dense(len(audio_files['Emotion'].unique()), activation='softmax'))

In [133]:
model_melspec_cnn.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [134]:
history_melspec = model_melspec_cnn.fit(X_train_melspec.reshape(X_train_melspec.shape[0], 16, 8, 1),
                    y_train_melspec, epochs=500,
                    validation_data=(X_test_melspec.reshape(X_test_melspec.shape[0], 16, 8, 1), y_test_melspec),
                    callbacks=[callback])

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500


## Autoencoder

In [135]:
X, y = Data(audio_files[audio_files['Emotion'] == 'happy'].reset_index(drop=True), 'mfcc')
X_train_happy, X_test_happy, y_train_happy, y_test_happy = train_test_split(X, y, test_size=0.33, random_state=42)

In [136]:
X, y = Data(audio_files[audio_files['Emotion'] == 'sad'].reset_index(drop=True), 'mfcc')
X_train_sad, X_test_sad, y_train_sad, y_test_sad = train_test_split(X, y, test_size=0.33, random_state=42)

In [137]:
tf.keras.backend.clear_session()
encoder = Sequential()
encoder.add(InputLayer(input_shape=(16, 8, 1)))
encoder.add(Conv2D(128, (3, 3), activation='relu', padding='same'))
encoder.add(MaxPooling2D(2, 2))
encoder.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
encoder.add(MaxPooling2D(2, 2))

In [138]:
decoder = Sequential()
decoder.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
decoder.add(UpSampling2D((2, 2)))
decoder.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
decoder.add(UpSampling2D((2, 2)))
decoder.add(Conv2D(128, (3, 3), activation='relu', padding='same'))
decoder.add(Conv2D(1, kernel_size=(3, 3), activation='sigmoid', padding='same'))

In [139]:
autoencoder = Sequential([encoder, decoder])
autoencoder.compile(optimizer='adam', loss='binary_crossentropy')

In [140]:
callback = EarlyStopping(monitor='val_loss', patience=3)

In [141]:
history_autoencoder = autoencoder.fit(X_train_happy.reshape(X_train_happy.shape[0], 16, 8, 1),
                                      X_train_happy.reshape(X_train_happy.shape[0], 16, 8, 1),
                                      epochs=50,
                                      validation_data=(X_test_happy.reshape(X_test_happy.shape[0], 16, 8, 1),
                                                       X_test_sad.reshape(X_test_sad.shape[0], 16, 8, 1)))
                                      #callbacks=[callback])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


## Variational Autoencoder

In [146]:
from tensorflow.keras import layers
from tensorflow import keras

In [211]:
class Sampling(layers.Layer):
    """Uses (z_mean, z_log_var) to sample z, the vector encoding a digit."""

    def call(self, inputs):
        z_mean, z_log_var = inputs
        batch = tf.shape(z_mean)[0]
        dim = tf.shape(z_mean)[1]
        epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon

In [217]:
latent_dim = 2
encoder_inputs = keras.Input(shape=(16, 8, 1))
x = layers.Conv2D(32, 3, activation="relu", strides=2, padding="same")(encoder_inputs)
x = layers.Conv2D(64, 3, activation="relu", strides=2, padding="same")(x)
x = layers.Flatten()(x)
x = layers.Dense(16, activation="relu")(x)
z_mean = layers.Dense(latent_dim, name="z_mean")(x)
z_log_var = layers.Dense(latent_dim, name="z_log_var")(x)
z = Sampling()([z_mean, z_log_var])
encoder = keras.Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")
encoder.summary()

Model: "encoder"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_31 (InputLayer)           [(None, 16, 8, 1)]   0                                            
__________________________________________________________________________________________________
conv2d_24 (Conv2D)              (None, 8, 4, 32)     320         input_31[0][0]                   
__________________________________________________________________________________________________
conv2d_25 (Conv2D)              (None, 4, 2, 64)     18496       conv2d_24[0][0]                  
__________________________________________________________________________________________________
flatten_9 (Flatten)             (None, 512)          0           conv2d_25[0][0]                  
____________________________________________________________________________________________

In [218]:
latent_inputs = keras.Input(shape=(latent_dim,))
x = layers.Dense(8 * 4 * 1, activation="relu")(latent_inputs)
x = layers.Reshape((8, 4, 1))(x)
x = layers.Conv2DTranspose(64, 3, activation="relu", strides=2, padding="same")(x)
x = layers.Conv2DTranspose(32, 3, activation="relu", strides=1, padding="same")(x)
decoder_outputs = layers.Conv2DTranspose(1, 3, activation="sigmoid", padding="same")(x)
decoder = keras.Model(latent_inputs, decoder_outputs, name="decoder")
decoder.summary()

Model: "decoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_32 (InputLayer)        [(None, 2)]               0         
_________________________________________________________________
dense_30 (Dense)             (None, 32)                96        
_________________________________________________________________
reshape_20 (Reshape)         (None, 8, 4, 1)           0         
_________________________________________________________________
conv2d_transpose_51 (Conv2DT (None, 16, 8, 64)         640       
_________________________________________________________________
conv2d_transpose_52 (Conv2DT (None, 16, 8, 32)         18464     
_________________________________________________________________
conv2d_transpose_53 (Conv2DT (None, 16, 8, 1)          289       
Total params: 19,489
Trainable params: 19,489
Non-trainable params: 0
_______________________________________________________

In [219]:
class VAE(keras.Model):
    def __init__(self, encoder, decoder, **kwargs):
        super(VAE, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
        self.reconstruction_loss_tracker = keras.metrics.Mean(
            name="reconstruction_loss"
        )
        self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")

    @property
    def metrics(self):
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
        ]

    def train_step(self, data):
        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = self.encoder(data)
            reconstruction = self.decoder(z)
            reconstruction_loss = tf.reduce_mean(
                tf.reduce_sum(
                    keras.losses.binary_crossentropy(data, reconstruction), axis=(1, 2)
                )
            )
            kl_loss = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
            kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))
            total_loss = reconstruction_loss + kl_loss
        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        self.total_loss_tracker.update_state(total_loss)
        self.reconstruction_loss_tracker.update_state(reconstruction_loss)
        self.kl_loss_tracker.update_state(kl_loss)
        return {
            "loss": self.total_loss_tracker.result(),
            "reconstruction_loss": self.reconstruction_loss_tracker.result(),
            "kl_loss": self.kl_loss_tracker.result(),
        }

In [220]:
vae = VAE(encoder, decoder)
vae.compile(optimizer=keras.optimizers.Adam())
history = vae.fit(X_train_happy.reshape(X_train_happy.shape[0], 16, 8, 1),
                  epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
