## Dataset links

Ryerson Audio-Visual Database of Emotional Speech and Song [RAVDESS](https://smartlaboratory.org/ravdess/)

Surrey Audio-Visual Expressed Emotion [SAVEE](http://kahlan.eps.surrey.ac.uk/savee/)

Toronto emotional speech set [TESS](https://tspace.library.utoronto.ca/handle/1807/24487)

Crowd-sourced Emotional Multimodal Actors Dataset [CREMA-D](https://github.com/CheyneyComputerScience/CREMA-D)

## Importing necessary libraries

In [None]:
import librosa
import librosa.display
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import IPython.display as ipd  # To play sound in the notebook
from IPython.core.display import display
import json
import seaborn as sns
import os

# tensorflow
from tensorflow import keras
from tensorflow.keras import regularizers
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential, Model, model_from_json
from tensorflow.keras import layers, models, Model, optimizers
from tensorflow.keras.layers import Dense, Input, Flatten, Dropout, Activation, BatchNormalization
from tensorflow.keras.layers import Conv1D, MaxPooling1D, AveragePooling1D, MaxPooling2D
from tensorflow.keras.utils import to_categorical

# sklearn
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Seed both NumPy's and TensorFlow's random generators with the same value
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

## Loading Datasets

In [None]:
# Folder of each corpus, all under one common root.
# Every path keeps its trailing slash because file paths are built from
# these constants by plain string concatenation.
DATASETS_ROOT = "./Datasets/"
TESS = DATASETS_ROOT + "TESS/"
RAV = DATASETS_ROOT + "RAVDESS/"
SAVEE = DATASETS_ROOT + "SAVEE/"
CREMA = DATASETS_ROOT + "CREMA-D/"

### Loading SAVEE

In [None]:
dir_list = os.listdir(SAVEE)

# The two characters at positions [-8:-6] of a file name identify the emotion:
# for a name shaped like "DC_a01.wav" the slice is "_a"; the two-letter codes
# "sa" and "su" fill both characters. Every SAVEE clip gets a "male_" label.
savee_labels = {
    '_a': 'male_angry',
    '_d': 'male_disgust',
    '_f': 'male_fear',
    '_h': 'male_happy',
    '_n': 'male_neutral',
    'sa': 'male_sad',
    'su': 'male_surprise',
}

emotion = []
path = []
for i in dir_list:
    label = savee_labels.get(i[-8:-6])
    if label is None:
        # Unrecognised entry (e.g. a stray non-audio file): skip it entirely so
        # that `emotion` and `path` stay the same length and row-aligned.
        continue
    emotion.append(label)
    path.append(SAVEE + i)

# One row per clip: its label and the path to the audio file
SAVEE_df = pd.DataFrame({'labels': emotion, 'path': path})
SAVEE_df.labels.value_counts()

### Loading RAVDESS

In [None]:
dir_list = os.listdir(RAV)

emotion = []
gender = []
path = []

# Each entry of the RAVDESS folder is itself a folder of clips. File names are
# dash-separated numbers: field index 2 is read as the emotion code, and the
# parity of field index 6 decides the gender (even -> female, odd -> male).
for actor_dir in dir_list:
    for wav_name in os.listdir(RAV + actor_dir):
        fields = wav_name.split(".")[0].split("-")
        emotion.append(int(fields[2]))
        gender.append("female" if int(fields[6]) % 2 == 0 else "male")
        path.append(RAV + actor_dir + "/" + wav_name)

# Numeric emotion code -> emotion name
code_to_emotion = {1: "neutral", 2: "calm", 3: "happy", 4: "sad",
                   5: "angry", 6: "fear", 7: "disgust", 8: "surprise"}

RAV_df = pd.DataFrame(emotion, columns=["emotions"]).replace(code_to_emotion)
RAV_df["gender"] = gender
RAV_df["labels"] = RAV_df["gender"] + "_" + RAV_df["emotions"]
RAV_df["path"] = path
# Only the combined label and the file path are kept
RAV_df = RAV_df.drop(columns=["emotions", "gender"])
RAV_df.labels.value_counts()

### Loading TESS

In [None]:
dir_list = os.listdir(TESS)

# Third underscore-separated token of a TESS file name -> label.
# "ps" is mapped to surprise; every TESS clip gets a "female_" label.
tess_labels = {
    "angry": "female_angry",
    "disgust": "female_disgust",
    "fear": "female_fear",
    "happy": "female_happy",
    "neutral": "female_neutral",
    "ps": "female_surprise",
    "sad": "female_sad",
}

emotion = []
path = []

for i in dir_list:
    fname = os.listdir(TESS + i)
    for f in fname:
        part = f.split(".")[0].split("_")
        label = tess_labels.get(part[2]) if len(part) > 2 else None
        if label is None:
            # Unrecognised or oddly named file: skip it so that `emotion`
            # and `path` stay the same length and row-aligned.
            continue
        emotion.append(label)
        path.append(TESS + i + "/" + f)

# One row per clip: its label and the path to the audio file
TESS_df = pd.DataFrame({"labels": emotion, "path": path})
TESS_df.labels.value_counts()

### Loading CREMA-D

In [None]:
dir_list = sorted(os.listdir(CREMA))

gender = []
emotion = []
path = []
# Speaker ids labelled female; any other id is labelled male
female = [1002,1003,1004,1006,1007,1008,1009,1010,1012,1013,1018,1020,1021,1024,1025,1028,1029,1030,1037,1043,1046,1047,1049,
          1052,1053,1054,1055,1056,1058,1060,1061,1063,1072,1073,1074,1075,1076,1078,1079,1082,1084,1089,1091]

# Three-letter emotion code (third "_"-separated token of the file name) -> emotion name
crema_emotions = {'SAD': 'sad', 'ANG': 'angry', 'DIS': 'disgust',
                  'FEA': 'fear', 'HAP': 'happy', 'NEU': 'neutral'}

for clip in dir_list:
    tokens = clip.split('_')
    # The first token is the numeric speaker id
    speaker_gender = 'female' if int(tokens[0]) in female else 'male'
    gender.append(speaker_gender)
    code = tokens[2]
    if code in crema_emotions:
        emotion.append(speaker_gender + '_' + crema_emotions[code])
    else:
        emotion.append('Unknown')
    path.append(CREMA + clip)

# One row per clip: its label and the path to the audio file
CREMA_df = pd.DataFrame({'labels': emotion, 'path': path})
CREMA_df.labels.value_counts()

## Concatenating all 4 datasets and resetting index

In [None]:
# Stack the four per-corpus frames and renumber the rows 0..n-1 in one step
df = pd.concat([SAVEE_df, RAV_df, TESS_df, CREMA_df], axis=0, ignore_index=True)
df

In [None]:
# Number of clips per label across the combined data
df["labels"].value_counts()

## Analysing and visualising example audio file

In [None]:
# Load one SAVEE clip at 44.1 kHz. The path is built from the SAVEE folder
# constant instead of a machine-specific absolute path, so the cell runs
# wherever the Datasets folder is present.
data, sample_rate = librosa.load(SAVEE + "DC_a01.wav", res_type='kaiser_fast', sr=44100)
# Audio player widget for the loaded clip
ipd.Audio(data, rate=sample_rate)

### Spectrogram

In [None]:
# Short-time Fourier transform of the example clip (complex-valued)
spec = librosa.stft(data)
# Plot magnitudes in dB: the raw STFT is complex and its rows are linearly
# spaced frequencies, so it is drawn on a frequency axis rather than a mel axis.
spec_db = librosa.amplitude_to_db(np.abs(spec), ref=np.max)
fig = plt.figure(figsize=(8,6))
librosa.display.specshow(spec_db, sr=sample_rate, x_axis='time', y_axis='log')
plt.colorbar(format='%+2.0f dB')
plt.show()

### Mel-spectrogram

In [None]:
# 128-band mel power spectrogram, limited to 12 kHz.
# The audio is passed as `y=` because recent librosa releases accept it by keyword only.
mel_spec = librosa.feature.melspectrogram(y=data, sr=sample_rate, n_mels=128, fmax=12000)
fig = plt.figure(figsize=(8,6))
# sr and fmax are forwarded so the time and mel axes are labelled with the real scale
librosa.display.specshow(mel_spec, sr=sample_rate, fmax=12000, x_axis='time', y_axis='mel')
plt.colorbar()
plt.show()

### Mel-spectrogram on decibel scale

In [None]:
# Convert power to decibels relative to the loudest bin (ref=np.max), the same
# reference the feature-extraction cells use, so this plot shows the kind of
# image the model is trained on.
db_spec = librosa.power_to_db(mel_spec, ref=np.max)
fig = plt.figure(figsize=(8,6))
# sr and fmax are forwarded so the time and mel axes are labelled with the real scale
librosa.display.specshow(db_spec, sr=sample_rate, fmax=12000, x_axis='time', y_axis='mel')
plt.colorbar(format='%+2.0f dB')
plt.show()

## Datasets split

### Splitting data on target labels.

In [None]:
# First cut: 90 % train, 10 % held out (stratified on the label).
train, holdout = train_test_split(df, test_size=0.1, random_state=42, stratify=df.labels)
# Second cut: the held-out part is divided 80/20 into validation and test.
val, test = train_test_split(holdout, test_size=0.2, random_state=42, stratify=holdout.labels)


# Plain Python lists of labels, in the same row order as each split
labels_train = train["labels"].tolist()
labels_val = val["labels"].tolist()
labels_test = test["labels"].tolist()

## Feature extraction

#### The three cells below convert the complete data into mel spectrograms, read the saved images back as pixel values, and store the resulting arrays in the train, validation and test lists. This takes a while.

In [None]:
%%time

matplotlib.use('Agg') #Does not let the figures show in cells. Saves time while converting complete data

image_path = "./mel-spec-images/"
train_path = "train/"
test_path = "test/"
val_path = "val/"
for i in train_path,test_path, val_path:
    if not os.path.exists(image_path + i):
        os.makedirs(image_path + i)

X_train = []
y_train = []

for index,path in enumerate(train.path):
    '''converting audio data into spectrogram and saving on HD'''
    data, sample_rate = librosa.load(path, res_type='kaiser_fast', sr=44100)
    spectrogram = librosa.feature.melspectrogram(data, sr=sample_rate, n_mels=128,fmax=12000) 
    db_spec = librosa.power_to_db(spectrogram, ref=np.max)
    fig = plt.figure(figsize=(8,6), num=1, clear=True)
    librosa.display.specshow(db_spec)
    plt.savefig(image_path + train_path + str(index) + ".png")  
    
    '''Loading saved spectrogram as array'''
    image=tf.keras.preprocessing.image.load_img(image_path + train_path + str(index) + ".png", color_mode='rgb', target_size= (128,128))
    image=np.array(image)
    X_train.append(image)
    y_train.append(labels_train[index])

In [None]:
%%time

X_val = []
y_val = []

for index,path in enumerate(val.path):
    '''converting audio data into spectrogram and saving on HD'''
    data, sample_rate = librosa.load(path, res_type='kaiser_fast', sr=44100)
    spectrogram = librosa.feature.melspectrogram(data, sr=sample_rate, n_mels=128,fmax=12000) 
    db_spec = librosa.power_to_db(spectrogram, ref=np.max)
    fig = plt.figure(figsize=(8,6), num=1, clear=True)
    librosa.display.specshow(db_spec)
    plt.savefig(image_path + val_path + str(index) + ".png")  
    
    '''Loading saved spectrogram as array'''
    image=tf.keras.preprocessing.image.load_img(image_path + val_path + str(index) + ".png", color_mode='rgb', target_size= (128,128))
    image=np.array(image)
    X_val.append(image)
    y_val.append(labels_val[index])

In [None]:
%%time

X_test = []
y_test = []

for index,path in enumerate(test.path):
    '''converting audio data into spectrogram and saving on HD'''
    data, sample_rate = librosa.load(path, res_type='kaiser_fast', sr=44100)
    spectrogram = librosa.feature.melspectrogram(data, sr=sample_rate, n_mels=128,fmax=12000) 
    db_spec = librosa.power_to_db(spectrogram, ref=np.max)
    fig = plt.figure(figsize=(8,6), num=1, clear=True)
    librosa.display.specshow(db_spec)
    plt.savefig(image_path + test_path + str(index) + ".png") 
    
    '''Loading saved spectrogram as array'''
    image=tf.keras.preprocessing.image.load_img(image_path + test_path + str(index) + ".png", color_mode='rgb', target_size= (128,128))
    image=np.array(image)
    X_test.append(image)
    y_test.append(labels_test[index])

In [None]:
# Report how many spectrogram arrays each split holds
for split_name, split_items in (("X_train", X_train), ("X_val", X_val), ("X_test", X_test)):
    print(f"Instances in {split_name}:", len(split_items))

In [None]:
# Lists -> NumPy arrays. Image arrays are cast to float32 and divided by 255
# so pixel values are scaled down from the 0-255 range; labels are converted as-is.

X_train = np.asarray(X_train).astype('float32') / 255
X_val = np.asarray(X_val).astype('float32') / 255
X_test = np.asarray(X_test).astype('float32') / 255

y_train = np.asarray(y_train)
y_val = np.asarray(y_val)
y_test = np.asarray(y_test)

In [None]:
# Print the shape of every feature and label array, split by split
named_arrays = (("X_train", X_train), ("y_train", y_train),
                ("X_val", X_val), ("y_val", y_val),
                ("X_test", X_test), ("y_test", y_test))
for array_name, array in named_arrays:
    print(f"shape of {array_name}: ", array.shape)

In [None]:
#Encoding the target variables

lb = LabelEncoder()

# The label -> integer mapping is learned from the training labels only and
# then reused unchanged for validation and test. Re-fitting on each split
# would give a different mapping whenever a split lacks one of the classes.
y_train_ids = lb.fit_transform(y_train)
n_classes = len(lb.classes_)

# num_classes pins the one-hot width so all three arrays have the same number of columns
y_train = to_categorical(y_train_ids, num_classes=n_classes)
y_val = to_categorical(lb.transform(y_val), num_classes=n_classes)
y_test = to_categorical(lb.transform(y_test), num_classes=n_classes)

In [None]:
# Class labels known to the encoder; position i is the label encoded as integer i
lb.classes_

In [None]:
#Final labels as a separate variable for use when we are loading the array data directly from HD
#since label encoder won't work in inverse transforming when we load the arrays from HD

# Built as gender x emotion: all female labels first, then all male labels,
# each group with its emotions in alphabetical order.
_emotion_names = ('angry', 'calm', 'disgust', 'fear',
                  'happy', 'neutral', 'sad', 'surprise')
classes = np.array([g + '_' + e for g in ('female', 'male') for e in _emotion_names],
                   dtype='<U15')
classes

In [None]:
#Saving the arrays into HD for future use

# np.save is given each array's name as the file name
arrays_to_save = (("X_train", X_train), ("X_val", X_val), ("X_test", X_test),
                  ("y_train", y_train), ("y_val", y_val), ("y_test", y_test))
for file_stem, array in arrays_to_save:
    np.save(file_stem, array)

In [None]:
# #Loading numpy arrays from HD

# X_train = np.load("./X_train.npy")
# X_val = np.load("./X_val.npy")
# X_test = np.load("./X_test.npy")
# y_train = np.load("./y_train.npy")
# y_val = np.load("./y_val.npy")
# y_test = np.load("./y_test.npy")

## Loading pre-trained VGG19 model and replacing the top (classifier) layers

In [None]:
from tensorflow.keras.applications import VGG19

# VGG19 with ImageNet weights and without its own classifier head
# (include_top=False), sized for the 128x128 RGB spectrogram images.
vgg_model = VGG19(weights='imagenet',
                  include_top=False,
                  input_shape=(128, 128, 3))

# Copy the VGG19 layers into a Sequential model and add a new classifier head
model = Sequential()
for layer in vgg_model.layers:
    model.add(layer)
model.add(Flatten())
model.add(Dropout(0.5))
model.add(Dense(1024, activation='relu'))
model.add(Dense(16, activation='softmax'))  # one output per gender_emotion class

learning_rate= 1e-6
model.compile(loss="categorical_crossentropy", optimizer=optimizers.RMSprop(learning_rate=learning_rate), metrics=["accuracy"])

# Stop once val_accuracy has not improved for 5 epochs. restore_best_weights=True
# rolls the model back to its best epoch; otherwise the model that is evaluated
# and saved afterwards would carry the weights of the last (worse) epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_accuracy", min_delta=0, patience=5,
                                                    verbose=0, mode="auto", baseline=None,
                                                    restore_best_weights=True)

# Multiply the learning rate by 0.4 after 3 epochs without val_accuracy improvement
plateau = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_accuracy', factor=0.4, patience=3, verbose=0)

# Writes the full model to "checkpoint_new_model" whenever val_accuracy improves
checkpoint = tf.keras.callbacks.ModelCheckpoint("checkpoint_new_model", monitor="val_accuracy", verbose=0,
                                                save_best_only=True, save_weights_only=False, 
                                                mode="auto", save_freq="epoch")



In [None]:
# Train for up to 50 epochs. `checkpoint` is passed along with the other
# callbacks so the best model (by val_accuracy) is actually written to disk;
# it was created but never handed to fit().
history = model.fit(X_train, y_train, batch_size=16, epochs=50, validation_data=(X_val, y_val),
                    callbacks=[early_stopping, plateau, checkpoint])

In [None]:
%matplotlib inline

plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epochs')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epochs')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
# Loss and accuracy (the compiled metric) of the trained model on the test split
model.evaluate(X_test, y_test)

## Evaluating the model using confusion matrix and classification report

In [None]:
# Class probabilities for every test clip
preds = model.predict(X_test, batch_size=16, verbose=1)

# Predicted labels: most probable class index per clip, decoded back to its string label
# preds = classes[preds]
predicted_ids = preds.argmax(axis=1).astype(int).flatten()
preds = pd.DataFrame({'predictedvalues': lb.inverse_transform(predicted_ids)})

# Actual labels: position of the 1 in each one-hot row, decoded the same way
# actual = classes[actual]
actual_ids = y_test.argmax(axis=1).astype(int).flatten()
actual = pd.DataFrame({'actualvalues': lb.inverse_transform(actual_ids)})

# Combining predicted and actual values in a single dataframe

finaldf = actual.join(preds)
finaldf.head()

### Overall evaluation

In [None]:
# Index of the most probable class for every test clip
predictions = model.predict(X_test).argmax(axis=1)

In [None]:
# One-hot test labels turned back into integer class indices
new_y_test = y_test.argmax(axis=1)

In [None]:
def print_confusion_matrix(confusion_matrix, class_names, figsize = (10,7), fontsize=14):
    """Draw a confusion matrix as an annotated heatmap on a new figure.

    confusion_matrix -- square matrix of integer counts (cells are formatted with "d")
    class_names      -- labels used for both the rows and the columns
    figsize          -- size of the new matplotlib figure
    fontsize         -- font size of the tick labels on both axes

    Nothing is returned; the plot is left on the current matplotlib figure.
    """
    labelled = pd.DataFrame(confusion_matrix, index=class_names, columns=class_names)
    plt.figure(figsize=figsize)
    ax = sns.heatmap(labelled, annot=True, fmt="d")
    # Row labels stay horizontal, column labels are tilted by 45 degrees
    for axis, angle in ((ax.yaxis, 0), (ax.xaxis, 45)):
        axis.set_ticklabels(axis.get_ticklabels(), rotation=angle, ha='right', fontsize=fontsize)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
# labels= pins the matrix to one row/column per entry of `classes`, even if a
# class never occurs in the test labels or predictions; without it the matrix
# would shrink and no longer line up with the class names.
matrix = confusion_matrix(new_y_test, predictions, labels=np.arange(len(classes)))
print_confusion_matrix(matrix, class_names = classes)

In [None]:
from sklearn.metrics import classification_report
# labels= lists every class index explicitly so that target_names always has
# exactly one name per reported class, even if a class is absent from the test split.
report = classification_report(new_y_test, predictions,
                               labels=np.arange(len(classes)), target_names=classes)
print(report)

### Gender identification evaluation

In [None]:
gender_df = finaldf.copy()

# Lookup table: every "<gender>_<emotion>" label -> its gender part
gender_of_label = {
    g + '_' + e: g
    for g in ('female', 'male')
    for e in ('angry', 'disgust', 'fear', 'happy', 'sad', 'surprise', 'neutral', 'calm')
}

# Collapse both columns to the gender only
for column in ('actualvalues', 'predictedvalues'):
    gender_df[column] = finaldf[column].replace(gender_of_label)

# Sorted gender names that occur among the actual values
gender_classes = gender_df.actualvalues.unique()
gender_classes.sort()

# Confusion matrix
matrix = confusion_matrix(gender_df.actualvalues, gender_df.predictedvalues)
print(accuracy_score(gender_df.actualvalues, gender_df.predictedvalues))
print_confusion_matrix(matrix, class_names = gender_classes)

In [None]:
# Per-gender classification report
gender_report = classification_report(gender_df.actualvalues, gender_df.predictedvalues,
                                      target_names=gender_classes)
print(gender_report)

### Emotion identification evaluation

In [None]:
emotions_df = finaldf.copy()

# Lookup table: every "<gender>_<emotion>" label -> its emotion part
emotion_of_label = {
    g + '_' + e: e
    for g in ('female', 'male')
    for e in ('angry', 'disgust', 'fear', 'happy', 'sad', 'surprise', 'neutral', 'calm')
}

# Collapse both columns to the emotion only
for column in ('actualvalues', 'predictedvalues'):
    emotions_df[column] = emotions_df[column].replace(emotion_of_label)

# Sorted emotion names that occur among the actual values
emotion_classes = emotions_df.actualvalues.unique()
emotion_classes.sort()

# Confusion matrix
c = confusion_matrix(emotions_df.actualvalues, emotions_df.predictedvalues)
print(accuracy_score(emotions_df.actualvalues, emotions_df.predictedvalues))
print_confusion_matrix(c, class_names = emotion_classes)

In [None]:
# Per-emotion classification report
emotion_report = classification_report(emotions_df.actualvalues, emotions_df.predictedvalues,
                                       target_names=emotion_classes)
print(emotion_report)

## Saving and loading model

In [None]:
# Architecture as a JSON string
model_json = model.to_json()

# Full model goes into an HDF5 file ...
model.save("best_model.h5")

# ... and the architecture alone into a JSON file next to it
with open("best_model.json", "w") as architecture_file:
    architecture_file.write(model_json)

In [None]:
# json_file = open('./best_model.json', 'r')
# model_json = json_file.read()
# json_file.close()
# model = model_from_json(model_json)

# model.load_weights("./best_model.h5")

# optimizer=optimizers.RMSprop(learning_rate=0.000001)
# model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

## Evaluating model on own voice

### The modules below must be installed and imported to record audio

In [None]:
!pip install pipwin
!pipwin install pyaudio
!pip install wave

In [None]:
import pyaudio
import wave

### The function below records the voice for 4 seconds, passes the recording through the trained model and outputs its prediction

In [None]:
def testing():
    """Record 4 seconds of audio, classify it with the trained model and play it back.

    Steps:
      1. Record from the audio input with PyAudio and write ./testing.wav.
      2. Render the recording as a dB-scaled mel spectrogram, save it as
         ./testing.png and read it back as a 128x128 RGB array scaled by 1/255,
         mirroring the feature-extraction cells.
      3. Predict with the global `model`, decode the class index with the
         global label encoder `lb`, and print the result.
      4. Show an autoplaying audio widget for the recording.

    Takes no arguments and returns nothing.
    """
    # Recording parameters
    CHUNK = 1024
    FORMAT = pyaudio.paInt16
    CHANNELS = 2
    RATE = 44100
    RECORD_SECONDS = 4
    WAVE_OUTPUT_FILENAME = "./testing.wav"

    p = pyaudio.PyAudio()
    try:
        stream = p.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK)
        try:
            print("* recording")
            # Read as many CHUNK-sized buffers as fit into RECORD_SECONDS
            frames = [stream.read(CHUNK)
                      for _ in range(int(RATE / CHUNK * RECORD_SECONDS))]
            print("* done recording")
        finally:
            # Release the stream even if reading failed
            stream.stop_stream()
            stream.close()
        sample_width = p.get_sample_size(FORMAT)
    finally:
        p.terminate()

    # The context manager closes the file even if writing fails
    with wave.open(WAVE_OUTPUT_FILENAME, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(sample_width)
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))

    # Convert the recorded wav file into a mel spectrogram image and read it
    # back in the format the network expects. The audio is passed as `y=`
    # because recent librosa releases accept it by keyword only.
    data, sample_rate = librosa.load(WAVE_OUTPUT_FILENAME, res_type='kaiser_fast', sr=44100)
    spectrogram = librosa.feature.melspectrogram(y=data, sr=sample_rate, n_mels=128, fmax=12000)
    db_spec = librosa.power_to_db(spectrogram, ref=np.max)
    fig = plt.figure(figsize=(8,6), num=1, clear=True)
    librosa.display.specshow(db_spec)
    plt.savefig("./testing.png")
    image = tf.keras.preprocessing.image.load_img("./testing.png", color_mode='rgb', target_size=(128,128))
    testing_file = np.array(image)
    testing_file = testing_file.astype("float32")
    testing_file /= 255
    # Add the batch dimension expected by model.predict
    testing_file = np.expand_dims(testing_file, axis=0)

    # Pass the converted file to the model for prediction
    preds = model.predict(testing_file)
    preds = preds.argmax(axis=1)
#     preds = classes[preds]
    preds = preds.astype(int).flatten()
    preds = (lb.inverse_transform((preds)))
    print("Voice predicted as: ", preds)

    # Listen to the recorded audio
    display(ipd.Audio(WAVE_OUTPUT_FILENAME, autoplay=True))

In [None]:
# Record a clip and print the model's prediction for it
testing()