In [None]:
import numpy as np 
import pandas as pd 
import os
import matplotlib.pyplot as plt
import librosa
import librosa.display
import matplotlib.pyplot as plt
import IPython
import IPython.display
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.preprocessing import normalize
import sklearn.metrics as metrics
import seaborn as sn
import warnings
warnings.filterwarnings('ignore')

# **Loading data and plotting**

In [None]:
Crema_Path='/kaggle/input/speech-emotion-recognition-en/Crema'

# Maps the emotion code found in a file name to the label used in this notebook.
# The code is the 3rd '_'-separated field of the file name (extension removed).
emotion_codes = {
    'SAD': 'sad',
    'ANG': 'angry',
    'DIS': 'disgust',
    'FEA': 'fear',
    'HAP': 'happy',
    'NEU': 'neutral',
}

crema = []  # list of (label, file path) tuples
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# (and therefore the seeded train/test split further down) reproducible.
for wav in sorted(os.listdir(Crema_Path)):
    fields = wav.partition(".wav")[0].split('_')
    # Names with fewer than three fields have no emotion code; label them
    # 'unknown' like any unrecognised code instead of raising IndexError.
    code = fields[2] if len(fields) > 2 else None
    crema.append((emotion_codes.get(code, 'unknown'), Crema_Path+'/'+wav))

# Naming the columns up front also gives an empty directory a well-formed frame.
Crema_df = pd.DataFrame(crema, columns=['Emotion', 'File_Path'])
Crema_df.head()

In [None]:
# Plot colour (hex) used for each emotion label when drawing waveforms.
colors = {
    'disgust': '#804E2D',
    'happy': '#F19C0E',
    'sad': '#478FB8',
    'neutral': '#4CB847',
    'fear': '#7D55AA',
    'angry': '#C00808',
    'surprise': '#EE00FF',
}

def wave_plot(data,sr,emotion,color):
    """Draw the waveform of one audio clip on a new matplotlib figure.

    data    -- audio samples passed to librosa.display.waveshow as `y`
    sr      -- sampling rate of `data`
    emotion -- label shown in the figure title
    color   -- colour of the waveform

    The figure is created but not shown; the caller is expected to call plt.show().
    """
    plt.figure()
    axes = plt.gca()  # the single axes of the figure that was just created
    axes.set_title(f'{emotion} emotion for waveplot', size=17)
    axes.set_ylabel('Amplitude')
    librosa.display.waveshow(y=data, sr=sr, color=color, ax=axes)

emotion_names = Crema_df['Emotion'].unique() # emotion classes, in order of first appearance

audio_path=[] # one sample path per emotion, in the same order as emotion_names
for emotion in emotion_names:
    # all file paths that carry the current label
    class_paths = Crema_df.loc[Crema_df['Emotion']==emotion, 'File_Path'].to_numpy()
    # use the second file of the class (index 1); fall back to the only file
    # when a class has a single recording instead of raising IndexError
    path = class_paths[min(1, len(class_paths)-1)]
    data,sr = librosa.load(path)
    # labels without a dedicated colour (e.g. 'unknown') are drawn in grey
    # instead of raising KeyError
    wave_plot(data,sr,emotion,colors.get(emotion,'#808080'))
    audio_path.append(path)
plt.show()

In [None]:
# One audio player per emotion class. A bare Audio(...) expression only renders
# when it is the last expression of a cell; inside a loop it has to be passed to
# display() explicitly. Looping also avoids hard-coding the number of classes.
for emotion_name, sample_path in zip(emotion_names, audio_path):
    print(f'{emotion_name} Audio Sample')
    IPython.display.display(IPython.display.Audio(sample_path))

In [None]:
def ml_spectogram(data,sr,emotion):
    """Plot the mel spectrogram (in dB) of one audio clip on a new figure.

    data    -- audio samples
    sr      -- sampling rate of `data`
    emotion -- label shown in the figure title

    The figure is created but not shown; the caller is expected to call plt.show().
    """
    ps = librosa.feature.melspectrogram(y=data, sr=sr) # mel-scaled power spectrogram
    ps_db = librosa.power_to_db(ps, ref=np.max) # power -> dB relative to the loudest bin
    plt.figure()
    plt.title(f'{emotion} emotion for ml spectogram',size=17)
    # The rows are mel bands, so the frequency axis must be labelled with the mel
    # scale ('log' would assume linearly spaced FFT bins). sr is passed so the
    # time and frequency ticks match the clip's real sampling rate.
    librosa.display.specshow(ps_db, sr=sr, x_axis='s', y_axis='mel')
    # the colour encodes energy in dB: the brighter the colour, the more energy
    plt.colorbar(format='%+2.0f dB')

# Rebuild the per-emotion sample list while plotting each sample's spectrogram.
audio_path = []
for emotion in emotion_names:
    is_current = Crema_df['Emotion'] == emotion  # rows that carry the current label
    path = Crema_df['File_Path'][is_current].to_numpy()[1]  # second file of that class
    data, sr = librosa.load(path)
    ml_spectogram(data, sr, emotion)
    audio_path.append(path)
plt.show()

In [None]:
def load_data(crema):
    """Find the longest audio clip in the dataset.

    crema -- 2D array whose rows are (label, file path)

    Every file is loaded with librosa and its number of samples is compared
    against the longest one seen so far. Returns (max_len, index): the sample
    count of the longest clip and the row where it was first found. The result
    is used to pad every other clip to the same length.
    """
    longest = 0
    longest_at = 0
    for row_no in range(crema.shape[0]):
        samples, _ = librosa.load(crema[row_no][1])
        n_samples = samples.shape[0]
        if n_samples <= longest:
            continue  # not strictly longer: keep the earlier record
        longest, longest_at = n_samples, row_no
    return longest, longest_at
def pad_data(length, index, dataset=None):
    """Load every clip and right-pad it with zeros to `length` samples.

    length  -- target number of samples (normally the max_len from load_data)
    index   -- row of the longest clip; kept for backward compatibility, the
               padding amount alone now decides whether a clip is padded
    dataset -- 2D array whose rows are (label, file path); defaults to the
               notebook-level `crema` array so existing calls keep working

    Returns three parallel lists: padded audio arrays, labels, sample rates.
    Raises ValueError when a clip is longer than `length`.
    """
    if dataset is None:
        dataset = crema  # backward-compatible fallback to the notebook-level array
    emotion = []
    audio = []
    sample_rate = []
    for i in range(len(dataset)):
        data, sr = librosa.load(dataset[i][1])
        padd = length - data.shape[0]  # zeros needed to reach the target length
        if padd < 0:
            raise ValueError(
                f'clip {dataset[i][1]} has {data.shape[0]} samples, more than length={length}'
            )
        if padd > 0:
            data = np.pad(data, pad_width=(0, padd))  # zero-pad at the end only
        audio.append(data)
        emotion.append(dataset[i][0])
        sample_rate.append(sr)
    return audio, emotion, sample_rate

## **Load data and get max length of audio**

In [None]:
# Turn the list of (label, path) tuples into a 2D array so rows can be
# addressed as crema[i][0] (label) and crema[i][1] (path).
crema = np.array(crema)
# One pass over all files: length of the longest clip and the row it sits in.
(length, index) = load_data(crema)

## **Padding the data to be of same length**

In [None]:
# Pad every clip to the common length, then convert the three parallel lists
# (audio, labels, sample rates) to numpy arrays.
padded = pad_data(length, index)
audio, emotion, sample_rate = (np.array(part) for part in padded)

## **One hot encoding for the labels**

In [None]:
# One-hot encoding: every label becomes a binary vector with a single 1.
# `unique` holds the sorted class names; `inverse` holds, for each sample, the
# position of its label inside `unique`. Picking rows of the identity matrix by
# those positions yields the one-hot matrix.
unique, inverse = np.unique(emotion, return_inverse=True)
emotion = np.eye(len(unique))[inverse]
for shown in (unique, inverse, emotion, emotion.shape):
    print(shown)

## **Split the data**

In [None]:
# Split into train+validation and test 70:30, then carve 5% of the first part
# out as the validation set (95:5).
X_train_full,X_test,y_train_full,y_test=train_test_split(audio, emotion, random_state=42, test_size=0.3, stratify=emotion)
# X_train / y_train must NOT contain the validation samples: every model below
# is fitted on X_train-derived features and validated on X_valid, so keeping the
# validation rows inside X_train would leak them into training.
X_train,X_valid,y_train,y_valid=train_test_split(X_train_full, y_train_full, random_state=42, test_size=0.05, stratify=y_train_full)
# aliases kept for code that refers to the reduced training set by its old names
X_t, y_t = X_train, y_train

In [None]:
# The zero-crossing rate is the rate of sign changes along a signal.
# librosa computes it frame by frame (2048-sample frames, advancing 512 samples
# between frames); the per-frame rates are then averaged into a single number.
def zcr(data):
    """Return the mean zero-crossing rate of `data` over all frames."""
    frame_rates = librosa.feature.zero_crossing_rate(data, frame_length=2048, hop_length=512)
    return frame_rates.mean()

# RMS energy is an indicator of loudness: the higher the energy, the louder the sound.
# It is computed per frame with the same frame and hop sizes as the zero-crossing
# rate, then averaged over all frames.
def rmse(data):
    """Return the mean root-mean-square energy of `data` over all frames."""
    frame_energy = librosa.feature.rms(y=data, frame_length=2048, hop_length=512)
    return frame_energy.mean()

# Build the 1D feature vector of a clip: [zero-crossing rate, RMS energy].
def extract_features(data):
    """Return a length-2 float64 array holding zcr(data) and rmse(data)."""
    return np.array([zcr(data), rmse(data)], dtype=np.float64)

# 2D feature of a clip: its mel spectrogram.
def extract_features_spect(data, sample_rate):
    """Return the mel spectrogram librosa computes for `data` at `sample_rate`."""
    mel_spec = librosa.feature.melspectrogram(y=data, sr=sample_rate)
    return mel_spec

# **Creating the feature space**

## **Extract the zero crossing rate and the energy features**

In [None]:
# 1D features (zero-crossing rate, RMS energy) for every clip of each split.
# Each result is a 2D array with one row per clip. Building the rows in a list
# and converting once avoids re-copying the whole array on every iteration.
train_audio = np.array([extract_features(signal) for signal in X_train])

test_audio = np.array([extract_features(signal) for signal in X_test])

# The features must be computed from the validation clips themselves; reusing
# the last `result` of the test loop would fill every validation row with the
# features of one unrelated test clip.
valid_audio = np.array([extract_features(signal) for signal in X_valid])

## **Extract the mel spectrogram feature**

In [None]:
# `sample_rate` is ordered like the unsplit dataset, while X_train / X_test /
# X_valid were shuffled by train_test_split, so sample_rate[i] does not belong
# to X_train[i]. All clips are expected to share one rate (they were loaded with
# librosa.load's default resampling), so verify that and use the common value.
distinct_rates = np.unique(sample_rate)
if distinct_rates.size != 1:
    raise ValueError(f'expected a single sample rate for all clips, found {distinct_rates}')
common_sr = distinct_rates[0]

# 2D features (mel spectrogram) for every clip of each split.
train_audio_spec = np.array([extract_features_spect(signal, common_sr) for signal in X_train])

test_audio_spec = np.array([extract_features_spect(signal, common_sr) for signal in X_test])

valid_audio_spec = np.array([extract_features_spect(signal, common_sr) for signal in X_valid])

## **Reshaping the data**

In [None]:
print(train_audio.shape)
print(test_audio.shape)
print(valid_audio.shape)

# Append a trailing axis of size 1 (a single, mono channel) so the arrays have
# the (samples, features, channels) layout the Conv1D layers are given below.
train_audio = train_audio.reshape(train_audio.shape[0], train_audio.shape[1], 1)
print(train_audio.shape)

test_audio = test_audio.reshape(test_audio.shape[0], test_audio.shape[1], 1)
print(test_audio.shape)

valid_audio = valid_audio.reshape(valid_audio.shape[0], valid_audio.shape[1], 1)
print(valid_audio.shape)

print(train_audio_spec.shape)
print(test_audio_spec.shape)
print(valid_audio_spec.shape)

# Same for the spectrograms: (samples, rows, columns, channels) for Conv2D.
train_audio_spec = train_audio_spec.reshape(train_audio_spec.shape[0], train_audio_spec.shape[1], train_audio_spec.shape[2], 1)
print(train_audio_spec.shape)

# each print reports the array that was just reshaped
test_audio_spec = test_audio_spec.reshape(test_audio_spec.shape[0], test_audio_spec.shape[1], test_audio_spec.shape[2], 1)
print(test_audio_spec.shape)

valid_audio_spec = valid_audio_spec.reshape(valid_audio_spec.shape[0], valid_audio_spec.shape[1], valid_audio_spec.shape[2], 1)
print(valid_audio_spec.shape)

# **CNN model for zcr and energy features**

In [None]:
# Conv1D model with three convolution blocks (32, 64, 128 filters).
# Kernel size 7, zero ('same') padding, ReLU activations; each convolution is
# followed by max pooling with pool size 2 and stride 2.
num_classes = y_train.shape[1]  # width of the one-hot labels, not a hard-coded 6
model = tf.keras.Sequential([
    tf.keras.layers.Conv1D(32, kernel_size=7, strides=1, padding='same', activation='relu', input_shape=(train_audio.shape[1],1)),
    tf.keras.layers.MaxPooling1D(pool_size=2, strides=2, padding='same'),
    tf.keras.layers.Conv1D(64, kernel_size=7, strides=1, padding='same', activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2, strides=2, padding='same'),
    tf.keras.layers.Conv1D(128, kernel_size=7, strides=1, padding='same', activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2, strides=2, padding='same'),
    tf.keras.layers.Flatten(),
    # fully connected layer: every neuron sees all outputs of the previous layer
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(num_classes, activation='softmax'),
])

model.summary()  # prints the table itself; wrapping it in print() adds a stray "None"
# metrics is documented as a list; a bare string is rejected by newer Keras versions
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])

history = model.fit(train_audio, y_train, epochs=40, validation_data=(valid_audio, y_valid)) # train the model

In [None]:
# Training vs. validation accuracy per epoch of the model fitted above.
for series in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[series], label=series)
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['Train', 'Val'], loc='lower right')
plt.show()

In [None]:
# Training vs. validation loss per epoch of the model fitted above.
for series in ('loss', 'val_loss'):
    plt.plot(history.history[series], label=series)
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Val'], loc='upper right')
plt.show()

In [None]:
# Loss and accuracy of the current model on the held-out test features.
scores = model.evaluate(test_audio, y_test, verbose=2)
test_loss, test_acc = scores
for caption, value in (('Accuracy: ', test_acc), ('Loss: ', test_loss)):
    print(caption, value)

In [None]:
y_pred = model.predict(test_audio) # class probabilities for the test data

true_classes = np.argmax(y_test,axis=-1)
pred_classes = np.argmax(y_pred,axis=-1)
# The one-hot columns follow the sorted class names in `unique`, so the axes
# must be labelled with `unique`; `emotion_names` is in order of appearance and
# would attach the wrong name to each row/column.
# Passing `labels` keeps the matrix square even if a class never occurs.
cm = metrics.confusion_matrix(true_classes, pred_classes, labels=np.arange(len(unique)))
df_cm = pd.DataFrame(cm, index=list(unique), columns=list(unique))
plt.figure()
ax = sn.heatmap(df_cm, annot=True, fmt='d') # integer counts instead of scientific notation
ax.set(xlabel='Predicted', ylabel='True');

In [None]:
# Weighted F1 score of the predictions above against the true test classes.
f1_score(
    y_true=np.argmax(y_test, axis=-1),
    y_pred=np.argmax(y_pred, axis=-1),
    average='weighted',
)

In [None]:
# Second Conv1D model: two convolution blocks (64, 128 filters).
# Kernel size 3, zero ('same') padding, ReLU activations.
num_classes = y_train.shape[1]  # width of the one-hot labels, not a hard-coded 6
model = tf.keras.Sequential([
    tf.keras.layers.Conv1D(64, kernel_size=3, strides=1, padding='same', activation='relu', input_shape=(train_audio.shape[1],1)),
    tf.keras.layers.MaxPooling1D(pool_size=2, strides=2, padding='same'),
    tf.keras.layers.Conv1D(128, kernel_size=3, strides=1, padding='same', activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2, strides=2, padding='same'),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_classes, activation='softmax'),
])

model.summary()  # prints the table itself; wrapping it in print() adds a stray "None"
# metrics is documented as a list; a bare string is rejected by newer Keras versions
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])

history = model.fit(train_audio, y_train, epochs=50, validation_data=(valid_audio, y_valid))

In [None]:
# Training vs. validation accuracy per epoch of the model fitted above.
for series in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[series], label=series)
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['Train', 'Val'], loc='lower right')
plt.show()

In [None]:
# Training vs. validation loss per epoch of the model fitted above.
for series in ('loss', 'val_loss'):
    plt.plot(history.history[series], label=series)
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Val'], loc='upper right')
plt.show()

In [None]:
# Loss and accuracy of the current model on the held-out test features.
scores = model.evaluate(test_audio, y_test, verbose=2)
test_loss, test_acc = scores
for caption, value in (('Accuracy: ', test_acc), ('Loss: ', test_loss)):
    print(caption, value)

In [None]:
y_pred = model.predict(test_audio) # class probabilities for the test data

true_classes = np.argmax(y_test,axis=-1)
pred_classes = np.argmax(y_pred,axis=-1)
# The one-hot columns follow the sorted class names in `unique`, so the axes
# must be labelled with `unique`; `emotion_names` is in order of appearance and
# would attach the wrong name to each row/column.
# Passing `labels` keeps the matrix square even if a class never occurs.
cm = metrics.confusion_matrix(true_classes, pred_classes, labels=np.arange(len(unique)))
df_cm = pd.DataFrame(cm, index=list(unique), columns=list(unique))
plt.figure()
ax = sn.heatmap(df_cm, annot=True, fmt='d') # integer counts instead of scientific notation
ax.set(xlabel='Predicted', ylabel='True');

# **CNN model for mel spectrogram feature**

In [None]:
# Conv2D model on the mel spectrograms: two convolution blocks (32, 64 filters).
# Kernel size (3,3), zero ('same') padding, ReLU activations.
num_classes = y_train.shape[1]  # width of the one-hot labels, not a hard-coded 6
model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, kernel_size=(3,3), strides=(1,1), padding='same', activation='relu', input_shape=(train_audio_spec.shape[1], train_audio_spec.shape[2], 1)),
    tf.keras.layers.MaxPooling2D(pool_size=(2,2), strides=(2,2), padding='same'),
    tf.keras.layers.Conv2D(64, kernel_size=(3,3), strides=(1,1), padding='same', activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=(2,2), strides=(2,2), padding='same'),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_classes, activation='softmax'),
])

model.summary()  # prints the table itself; wrapping it in print() adds a stray "None"
# metrics is documented as a list; a bare string is rejected by newer Keras versions
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])

history = model.fit(train_audio_spec, y_train, epochs=20, validation_data=(valid_audio_spec, y_valid))

In [None]:
# Training vs. validation accuracy per epoch of the model fitted above.
for series in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[series], label=series)
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['Train', 'Val'], loc='lower right')
plt.show()

In [None]:
# Training vs. validation loss per epoch of the model fitted above.
for series in ('loss', 'val_loss'):
    plt.plot(history.history[series], label=series)
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Val'], loc='upper right')
plt.show()

In [None]:
# Loss and accuracy of the current model on the held-out test spectrograms.
scores = model.evaluate(test_audio_spec, y_test, verbose=2)
test_loss, test_acc = scores
for caption, value in (('Accuracy: ', test_acc), ('Loss: ', test_loss)):
    print(caption, value)

In [None]:
y_pred = model.predict(test_audio_spec) # class probabilities for the test data

true_classes = np.argmax(y_test,axis=-1)
pred_classes = np.argmax(y_pred,axis=-1)
# The one-hot columns follow the sorted class names in `unique`, so the axes
# must be labelled with `unique`; `emotion_names` is in order of appearance and
# would attach the wrong name to each row/column.
# Passing `labels` keeps the matrix square even if a class never occurs.
cm = metrics.confusion_matrix(true_classes, pred_classes, labels=np.arange(len(unique)))
df_cm = pd.DataFrame(cm, index=list(unique), columns=list(unique))
plt.figure()
ax = sn.heatmap(df_cm, annot=True, fmt='d') # integer counts instead of scientific notation
ax.set(xlabel='Predicted', ylabel='True');

In [None]:
# Weighted F1 score of the predictions above against the true test classes.
f1_score(
    y_true=np.argmax(y_test, axis=-1),
    y_pred=np.argmax(y_pred, axis=-1),
    average='weighted',
)

# **Comparison between conv1D and conv2D**

### **conv1D with architecture of conv2D**

In [None]:
# To compare the 1D and 2D models, this Conv1D model reuses the parameters of
# the Conv2D model above (32 and 64 filters, kernel size 3, 20 epochs).
num_classes = y_train.shape[1]  # width of the one-hot labels, not a hard-coded 6
model = tf.keras.Sequential([
    tf.keras.layers.Conv1D(32, kernel_size=3, strides=1, padding='same', activation='relu', input_shape=(train_audio.shape[1],1)),
    tf.keras.layers.MaxPooling1D(pool_size=2, strides=2, padding='same'),
    tf.keras.layers.Conv1D(64, kernel_size=3, strides=1, padding='same', activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2, strides=2, padding='same'),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_classes, activation='softmax'),
])

model.summary()  # prints the table itself; wrapping it in print() adds a stray "None"
# metrics is documented as a list; a bare string is rejected by newer Keras versions
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])

history = model.fit(train_audio, y_train, epochs=20, validation_data=(valid_audio, y_valid))

In [None]:
# Training vs. validation accuracy per epoch of the model fitted above.
for series in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[series], label=series)
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['Train', 'Val'], loc='lower right')
plt.show()

In [None]:
# Training vs. validation loss per epoch of the model fitted above.
for series in ('loss', 'val_loss'):
    plt.plot(history.history[series], label=series)
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Val'], loc='upper right')
plt.show()

In [None]:
# Loss and accuracy of the current model on the held-out test features.
scores = model.evaluate(test_audio, y_test, verbose=2)
test_loss, test_acc = scores
for caption, value in (('Accuracy: ', test_acc), ('Loss: ', test_loss)):
    print(caption, value)

In [None]:
y_pred = model.predict(test_audio) # class probabilities for the test data

true_classes = np.argmax(y_test,axis=-1)
pred_classes = np.argmax(y_pred,axis=-1)
# The one-hot columns follow the sorted class names in `unique`, so the axes
# must be labelled with `unique`; `emotion_names` is in order of appearance and
# would attach the wrong name to each row/column.
# Passing `labels` keeps the matrix square even if a class never occurs.
cm = metrics.confusion_matrix(true_classes, pred_classes, labels=np.arange(len(unique)))
df_cm = pd.DataFrame(cm, index=list(unique), columns=list(unique))
plt.figure()
ax = sn.heatmap(df_cm, annot=True, fmt='d') # integer counts instead of scientific notation
ax.set(xlabel='Predicted', ylabel='True');

In [None]:
# Weighted F1 score of the predictions above against the true test classes.
f1_score(
    y_true=np.argmax(y_test, axis=-1),
    y_pred=np.argmax(y_pred, axis=-1),
    average='weighted',
)

### **conv2D with architecture of conv1D**

In [None]:
# Conv2D model that reuses the filter counts of the second Conv1D model
# (64 and 128 filters), kernel size (3,3), zero ('same') padding, ReLU.
num_classes = y_train.shape[1]  # width of the one-hot labels, not a hard-coded 6
model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(64, kernel_size=(3,3), strides=(1,1), padding='same', activation='relu', input_shape=(train_audio_spec.shape[1], train_audio_spec.shape[2], 1)),
    tf.keras.layers.MaxPooling2D(pool_size=(2,2), strides=(2,2), padding='same'),
    tf.keras.layers.Conv2D(128, kernel_size=(3,3), strides=(1,1), padding='same', activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=(2,2), strides=(2,2), padding='same'),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_classes, activation='softmax'),
])

model.summary()  # prints the table itself; wrapping it in print() adds a stray "None"
# metrics is documented as a list; a bare string is rejected by newer Keras versions
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])

history = model.fit(train_audio_spec, y_train, epochs=20, validation_data=(valid_audio_spec, y_valid))

# Paper On Speech Emotion Recognition in Neurological Disorders Using Convolutional Neural Network

This paper is concerned with the detection of emotion in people that suffer a neurological disorder that would make it typically difficult for them to *express* emotion. The proposed SER model is claimed to have helped detect and classify emotions (happiness, calmness, fear, and more) in patients. The system uses tonal properties like **MFCCs** and RAVDESS audio speech and song databases for training and testing. In addition, a custom local dataset is developed to support further training and testing. 

MFCC is an acronym for Mel Frequency Cepstral Co-efficients which are the coefficients that collectively make up an MFC. MFC is a representation of the short-term power spectrum of a sound, based on a linear cosine transform of a log power spectrum on a nonlinear mel scale of frequency. Mel scale is a scale that relates the perceived frequency of a tone to the actual measured frequency. It scales the frequency in order to match more closely what the human ear can hear.
A frequency measured in Hertz (f) can be converted to the Mel scale using the following formula:

$$\mathrm{Mel}(f) = 2595 \log_{10}\left(1 + \frac{f}{700}\right)$$

This system classifies eight emotions of a person with a neurological disorder: calm, angry, fearful, disgust, happy, surprise, neutral and sad. 

This paper's methodology is the following: 

1- The data is taken from RAVDESS dataset\
2- The data is preprocessed\
3- The features are extracted from the data\
4- The model is used on the data\
5- The data is classified

Now let's walk through each step and explain briefly what it does:
 ### 1- RAVDESS and local datasets
The Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS) is a validated database of emotional speech and song. It contains 7356 files including 8 emotions such as anger, happiness, calm, neutral, surprise, sad, fear, and disgust. It has speech and song files under three modality formats: Audio-only (16bit, 48kHz .wav), Audio-Video (720p H.264, AAC 48kHz, .mp4) and Video-only (no sound). All the recordings are in American English.\
We take in this dataset using its path on the device and we read it into a data structure *(ex: list)*. We then use the label given in the name of the file to split data into their respective emotion class *(ex: 'ANG' in filename denotes angry class)*\
The local dataset is created by recording voices from 25 patients from Chittagong, Bangladesh. Ten of them are stroke patients, eight of them are affected with dementia, four of them have epilepsy and rest of them have migraine headache. There are 400 audio files in 8 emotions.
### 2- Preprocessing
All files are used with a sampling rate of 16KHz using the parameter *‘sr = 16000’* in the *load function of the Librosa library*. Augmentation in the audio database usually generates additional audio files by applying some special operation on the original database, such as injecting noise, adjusting pitch, changing vocal tract, adjusting speed, etc. In this case, all of the files are augmented with *injecting noise by using the NoiseAug function from the nlpaug library*.
### 3- Feature extraction
For feature extraction, the *Mfcc function of the Librosa library* is used. The sample rate is 16KHz for each audio file. The number of *MFCC extracted are 100*. The shape of the extracted features would not be the same and the range would not be specific without normalization. The unstructured feature may reduce the accuracy and recognition rate. In this research, after extracting features from each file, we normalized them by *subtracting each feature from the maximum one* to make the shape the same. After normalization, these data are used to train and test the system.
### 4- CNN model
The normalised data is then fed into the proposed model for emotion prediction. There are four convolution layers in this model with 16, 32, 64, and 128 filters and the kernel size for each layer is 2×2. Using keras API, we can use the *conv2D function* to build our model with desired number of filters and kernel size. Rectified Linear Unit (ReLU) is used as the activation function in each convolution layer as shown.
```math
               ReLU(y) = max(0,y)
```
After the convolution layer, there is a max-pooling layer where the pool size is 2×2. It selects the largest value from the rectified feature map and reduces the size of the data, so the number of parameters is decreased. Here, *MaxPooling from keras API* is also used. Like the convolution layer, ReLU has been applied as an activation function in hidden layers. A dropout layer is also inserted with the dropout value of 0.2 which randomly deactivates 20% of the neurons to avoid over-fitting. In the last hidden layer, one Global Average Pooling layer has been added which takes the average which is suitable for feeding into our dense output layer. The output layer of this model consists of eight nodes as it has eight classes. As an activation function, Softmax has been applied as shown in this layer.
```math
                \mathrm{Softmax}(y_i) = \frac{e^{y_i}}{\sum_j e^{y_j}}
```
The dataset was split into training set, validation set and testing set using *train_test_split from sklearn*. Training set and validation set were used to train the model. Testing set was used to test the performance of the model. This model was trained using multiple split ratios (70:20:10, 75:15:10, 80:10:10) and activation functions (relu, sigmoid, softmax, softplus). During the learning process, the performance of this model was best with the 75:15:10 split ratio and the softmax activation function.

### Results
The best accuracy for training and validation was 0.937 and 0.825. The average testing accuracy was 0.813 where average training and validation accuracy was 0.913 and 0.817. Confusion matrix of RAVDESS augmented dataset with this best result is shown.\
[table.html](https://link.springer.com/chapter/10.1007/978-3-030-59277-6_26/tables/3)\
The best testing accuracy for the local dataset was 0.612. The best accuracy for training and validation was 0.685 and 0.625. The average testing accuracy was 0.610 where average training and validation accuracy was 0.680 and 0.619. The confusion matrix of the local augmented dataset with this best result is presented.\
[table.html](https://link.springer.com/chapter/10.1007/978-3-030-59277-6_26/tables/3)

*Numpy library was used for numerical analysis. Matplotlib library was used for graphical representation, such as confusion matrix, accuracy vs epochs graph, loss vs epochs graph, etc.*