# <center>Speech Emotion Recognition Datasets test</center>

In [None]:
# Import libraries 
import librosa
import librosa.display
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from matplotlib.pyplot import specgram
import pandas as pd
import numpy as np
import IPython.display as ipd  # To play sound in the notebook
import os
import sys
import warnings

import soundfile


from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

import keras
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, BatchNormalization
from keras.utils import np_utils
from tensorflow.keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint

# Silence all warnings unless explicit warning options were supplied to the
# interpreter (sys.warnoptions is non-empty in that case).
if not sys.warnoptions:
    warnings.simplefilter("ignore")
# Additionally hide DeprecationWarning regardless of the check above.
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [None]:
# Root folder of each corpus, relative to the notebook's working directory.
# The values end with "/" because several later cells build file paths by
# plain string concatenation (e.g. SAVEE + file_name).
SAVEE = "surrey-audiovisual-expressed-emotion-savee/ALL/"
RAV = "ravdess-emotional-speech-audio/audio_speech_actors_01-24/"
TESS = "toronto-emotional-speech-set-tess/TESS Toronto emotional speech set data/"
CREMA = "cremad/AudioWAV/"
ANAD = "Arabic Natural Audio Dataset/Speech/"
KSUEmotions = "ksu_emotions/data/SPEECH/"


#  <center> About Surrey Audio-Visual Expressed Emotion (SAVEE) Dataset </center>
The SAVEE database was recorded from four native English male speakers (identified as DC, JE, JK, KL), postgraduate students and researchers at the University of Surrey aged from 27 to 31 years. Emotion has been described psychologically in discrete categories: anger, disgust, fear, happiness, sadness and surprise. A neutral category is also added to provide recordings of 7 emotion categories.

The text material consisted of 15 TIMIT sentences per emotion: 3 common, 2 emotion-specific and 10 generic sentences that were different for each emotion and phonetically-balanced. The 3 common and 2 × 6 = 12 emotion-specific sentences were recorded as neutral to give 30 neutral sentences. This resulted in a total of 120 utterances per speaker, for example:

Common: She had your dark suit in greasy wash water all year. <br> 
Anger: Who authorized the unlimited expense account? <br>
Disgust: Please take this dirty table cloth to the cleaners for me. <br> 
Fear: Call an ambulance for medical assistance. <br>
Happiness: Those musicians harmonize marvelously. <br>
Sadness: The prospect of cutting back spending is an unpleasant one for any governor. <br>
Surprise: The carpet cleaners shampooed our oriental rug. <br>
Neutral: The best way to learn is to solve extra problems.<br>
#### Academic citation 
@inproceedings{Vlasenko_combiningframe,
author = {Vlasenko, Bogdan and Schuller, Bjorn and Wendemuth, Andreas and Rigoll, Gerhard},
year = {2007},
month = {01},
pages = {2249-2252},
title = {Combining frame and turn-level information for robust recognition of emotions within speech},
journal = {Proceedings of Interspeech}
}
#### Acquired from
https://www.kaggle.com/datasets/ejlok1/surrey-audiovisual-expressed-emotion-savee 

In [None]:
# Build the SAVEE index: one row per file with its emotion label and path.
dir_list = os.listdir(SAVEE)

# The emotion code is the two characters at positions [-8:-6] of the file
# name, i.e. the two characters before the last six characters.
savee_codes = {
    '_a': 'angry',
    '_d': 'disgust',
    '_f': 'fear',
    '_h': 'happy',
    '_n': 'neutral',
    'sa': 'sad',
    'su': 'surprise',
}

# Files whose code is not in the table are kept and labelled 'error'.
emotion = [savee_codes.get(name[-8:-6], 'error') for name in dir_list]
path = [SAVEE + name for name in dir_list]

# Assemble labels, source tag and paths side by side, then show the
# label count distribution.
labels_part = pd.DataFrame(emotion, columns=['labels'])
labels_part['source'] = 'SAVEE'
paths_part = pd.DataFrame(path, columns=['path'])
SAVEE_df = pd.concat([labels_part, paths_part], axis=1)
SAVEE_df.labels.value_counts()

# <center> Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS)</center>

#### Files

This portion of the RAVDESS contains 1440 files: 60 trials per actor x 24 actors = 1440. The RAVDESS contains 24 professional actors (12 female, 12 male), vocalizing two lexically-matched statements in a neutral North American accent. Speech emotions includes calm, happy, sad, angry, fearful, surprise, and disgust expressions. Each expression is produced at two levels of emotional intensity (normal, strong), with an additional neutral expression.

#### File naming convention

Each of the 1440 files has a unique filename. The filename consists of a 7-part numerical identifier (e.g., 03-01-06-01-02-01-12.wav). These identifiers define the stimulus characteristics:

#### Filename identifiers

Modality (01 = full-AV, 02 = video-only, 03 = audio-only).<br>
Vocal channel (01 = speech, 02 = song).<br>
Emotion (01 = neutral, 02 = calm, 03 = happy, 04 = sad, 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised).<br>
Emotional intensity (01 = normal, 02 = strong). NOTE: There is no strong intensity for the 'neutral' emotion.<br>
Statement (01 = "Kids are talking by the door", 02 = "Dogs are sitting by the door").<br>
Repetition (01 = 1st repetition, 02 = 2nd repetition).<br>
Actor (01 to 24. Odd numbered actors are male, even numbered actors are female).<br>
Filename example: 03-01-06-01-02-01-12.wav<br>
Audio-only (03)<br>
Speech (01)<br>
Fearful (06)<br>
Normal intensity (01)<br>
Statement "dogs" (02)<br>
1st Repetition (01)<br>
12th Actor (12)<br>
Female, as the actor ID number is even.<br>
<br>
#### Academic citation 
Livingstone SR, Russo FA (2018) The Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS): A dynamic, multimodal set of facial and vocal expressions in North American English. PLoS ONE 13(5): e0196391. https://doi.org/10.1371/journal.pone.0196391.

#### Acquired from
https://www.kaggle.com/datasets/uwrfkaggler/ravdess-emotional-speech-audio 

In [None]:
# Build the RAVDESS index: one row per audio file with label, path and source.
dir_list = os.listdir(RAV)
dir_list.sort()


emotion = []
path = []
for actor_dir in dir_list:
    # RAV contains one sub-folder per actor (24 actors according to the
    # dataset description); each folder is listed separately.
    actor_path = RAV + actor_dir
    if not os.path.isdir(actor_path):
        # Stray files next to the actor folders would make os.listdir raise.
        continue
    for file in os.listdir(actor_path):
        # File names look like 03-01-06-01-02-01-12.wav; the third
        # dash-separated field is the numeric emotion code.
        part = file.split('.')[0].split('-')
        emotion.append(int(part[2]))
        path.append(actor_path + '/' + file)

# dataframe for emotion of files
emotion_df = pd.DataFrame(emotion, columns=['labels'])

# dataframe for path of files.
path_df = pd.DataFrame(path, columns=['path'])
RAV_df = pd.concat([emotion_df, path_df], axis=1)
RAV_df['source'] = 'RAVDESS'

# Change integer codes to emotion names; 'calm' (2) is merged into 'neutral'.
# The result is assigned back to the column instead of calling
# replace(inplace=True) through attribute access: that chained in-place form
# acts on a temporary object under pandas Copy-on-Write and leaves RAV_df
# unchanged.
RAV_df['labels'] = RAV_df['labels'].replace(
    {1: 'neutral', 2: 'neutral', 3: 'happy', 4: 'sad',
     5: 'angry', 6: 'fear', 7: 'disgust', 8: 'surprise'}
)
RAV_df.labels.value_counts()

# <center>Toronto emotional speech set (TESS)</center>
A set of 200 target words was spoken in the carrier phrase "Say the word _" by two actresses (aged 26 and 64 years), and recordings were made of the set portraying each of seven emotions (anger, disgust, fear, happiness, pleasant surprise, sadness, and neutral). There are 2800 data points (audio files) in total.

The dataset is organised such that each emotion of each of the two actresses is contained within its own folder, and within each folder the audio files for all 200 target words can be found. The audio files are in WAV format.
#### Academic citation 
“Toronto emotional speech set (TESS) | TSpace Repository.” https://tspace.library.utoronto.ca/handle/1807/24487 (accessed Sep. 11, 2022).
#### Acquired from
https://www.kaggle.com/datasets/ejlok1/toronto-emotional-speech-set-tess

In [None]:
# Build the TESS index. Each sub-folder of TESS holds the clips of one
# speaker/emotion pair, so the label is derived from the folder name.
dir_list = os.listdir(TESS)
dir_list.sort()

# Exact (case-sensitive) folder names and the label they map to.
tess_folder_labels = {
    'OAF_angry': 'angry', 'YAF_angry': 'angry',
    'OAF_disgust': 'disgust', 'YAF_disgust': 'disgust',
    'OAF_Fear': 'fear', 'YAF_fear': 'fear',
    'OAF_happy': 'happy', 'YAF_happy': 'happy',
    'OAF_neutral': 'neutral', 'YAF_neutral': 'neutral',
    'OAF_Pleasant_surprise': 'surprise', 'YAF_pleasant_surprised': 'surprise',
    'OAF_Sad': 'sad', 'YAF_sad': 'sad',
}

path = []
emotion = []

for folder in dir_list:
    # Any folder not listed above contributes rows labelled 'Unknown'.
    folder_label = tess_folder_labels.get(folder, 'Unknown')
    for f in os.listdir(TESS + folder):
        emotion.append(folder_label)
        path.append(TESS + folder + "/" + f)

# Assemble labels, source tag and paths, then show the label distribution.
labels_part = pd.DataFrame(emotion, columns=['labels'])
labels_part['source'] = 'TESS'
paths_part = pd.DataFrame(path, columns=['path'])
TESS_df = pd.concat([labels_part, paths_part], axis=1)
TESS_df.labels.value_counts()

In [None]:
# Remove, in place, the rows whose label is 'Unknown', then re-check the
# label distribution.
unknown_rows = TESS_df.index[TESS_df['labels'] == 'Unknown']
TESS_df.drop(unknown_rows, inplace=True)
TESS_df.labels.value_counts()

# <center> Crowd Sourced Emotional Multimodal Actors Dataset (CREMA-D) </center>

CREMA-D is a data set of 7,442 original clips from 91 actors. These clips were from 48 male and 43 female actors between the ages of 20 and 74 coming from a variety of races and ethnicities (African American, Asian, Caucasian, Hispanic, and Unspecified). Actors spoke from a selection of 12 sentences. The sentences were presented using one of six different emotions (Anger, Disgust, Fear, Happy, Neutral, and Sad) and four different emotion levels (Low, Medium, High, and Unspecified).
#### Academic citation 
Cao H, Cooper DG, Keutmann MK, Gur RC, Nenkova A, Verma R. CREMA-D: Crowd-sourced Emotional Multimodal Actors Dataset. IEEE Trans Affect Comput. 2014 Oct-Dec;5(4):377-390. doi: 10.1109/TAFFC.2014.2336244. PMID: 25653738; PMCID: PMC4313618.
#### Acquired from
https://www.kaggle.com/datasets/ejlok1/cremad 

In [None]:
# Build the CREMA-D index: one row per file with label, source and path.
dir_list = os.listdir(CREMA)
dir_list.sort()

# Third underscore-separated field of the file name -> emotion label.
crema_codes = {
    'SAD': 'sad',
    'ANG': 'angry',
    'DIS': 'disgust',
    'FEA': 'fear',
    'HAP': 'happy',
    'NEU': 'neutral',
}

emotion = []
path = []

# Iterate the sorted listing (previously it was computed but the directory
# was listed a second time, unsorted) so the row order is reproducible.
for wav in dir_list:
    info = wav.partition(".wav")[0].split("_")
    # Names with fewer than three fields cannot carry an emotion code; label
    # them "unknown" instead of failing with an IndexError.
    code = info[2] if len(info) > 2 else None
    emotion.append(crema_codes.get(code, "unknown"))
    path.append(CREMA + wav)


CREMA_df = pd.DataFrame(emotion, columns = ['labels'])
CREMA_df['source'] = 'CREMA'
CREMA_df = pd.concat([CREMA_df,pd.DataFrame(path, columns = ['path'])],axis=1)
CREMA_df.labels.value_counts()

# <center>Combine all the datasets' dataframes into one</center>

In [None]:
# Stack the four English-language corpora row-wise, print the combined label
# distribution and save the index to CSV (without the row index column).
english_frames = [SAVEE_df, RAV_df, TESS_df, CREMA_df]
df_Eng = pd.concat(english_frames, axis=0)
print(df_Eng["labels"].value_counts())
df_Eng.to_csv("English_Data_path.csv", index=False)

In [None]:
# Preview the first rows of the combined English index.
df_Eng.head()

In [None]:
# Print a summary of the combined English index (row count, columns, dtypes).
df_Eng.info()
# Alternative: df_Eng.shape gives just the (rows, columns) tuple.

In [None]:
# Plot the number of clips per emotion in the combined English dataset.
plt.figure(figsize =(21, 3))
# The labels are passed as `x=`: seaborn >= 0.12 treats the only positional
# argument as `data`, so a positional Series is no longer read as the
# x variable.
sns.countplot(x=df_Eng.labels)
plt.ylabel('Count', size=12)
plt.xlabel('Emotions', size=12)
sns.despine(top=True, right=True, left=False, bottom=False)
plt.show()

# <center> Arabic Natural Audio Dataset </center>

The dataset has 3 discrete emotions: happy, angry, and surprised.

Eight videos of live calls between an anchor and a human outside the studio were downloaded from online Arabic talk shows. Each video was then divided into turns: callers and receivers. To label each video, 18 listeners were asked to listen to each video and select whether they perceive a happy, angry or surprised emotion. Silence, laughs and noisy chunks were removed. Every chunk was then automatically divided into 1 sec speech units forming our final corpus composed of 1384 records.


#### Academic citation 
klaylat, Samira; Osman, ziad; Zantout, Rached; Hamandi, Lama (2018), “Arabic Natural Audio Dataset”, Mendeley Data, V1, doi: 10.17632/xm232yxf7t.1
#### Acquired from
https://data.mendeley.com/datasets/xm232yxf7t/1

In [None]:
# Build the ANAD index: one row per file with label, source and path.
dir_list = os.listdir(ANAD)
dir_list.sort()

# Video tag found in the file name -> emotion assigned to that video.
class_emotions = {'V1': 'happy', 'V2': 'surprise', 'V3': 'happy', 'V4': 'angry',
        'V5': 'angry', 'V6': 'surprise', 'V7': 'angry','V8': 'happy'}

emotion = []
path = []

for audio_file in dir_list:
    # The first tag contained in the file name decides the label.
    label = next(
        (class_emotions[key] for key in class_emotions if key in audio_file),
        None,
    )
    if label is None:
        # Files without a known tag are skipped entirely. Label and path are
        # always appended together so the two lists stay aligned row by row
        # (previously the path was appended even when no label was).
        continue
    emotion.append(label)
    path.append(ANAD + audio_file)


ANAD_df = pd.DataFrame(emotion, columns = ['labels'])
ANAD_df['source'] = 'ANAD'
ANAD_df = pd.concat([ANAD_df,pd.DataFrame(path, columns = ['path'])],axis=1)
ANAD_df.labels.value_counts() 

In [None]:
# Preview the first rows of the ANAD index.
ANAD_df.head()

# <center> KSUEmotions </center>

The dataset has 5 emotions: neutral, sadness, happiness, surprise, and questioning.
The data collection was divided into two phases. Phase 1 had 10 female speakers and 10 male speakers and has 1596 records, while Phase 2 had 7 female speakers and 7 male speakers and has 1680 records, which makes the total of the dataset 3276 records.

#### File format
The audio files are named using the DxxExxPgxxSxxTxx

<img src="KSUE_file.png"> 

#### Academic citation 
A. H. Meftah, M. A. Qamhan, Y. Seddiq, Y. A. Alotaibi and S. A. Selouani, "King Saud University Emotions Corpus: Construction, Analysis, Evaluation, and Comparison," in IEEE Access, vol. 9, pp. 54201-54219, 2021, doi: 10.1109/ACCESS.2021.3070751. 

#### Acquired from 
https://catalog.ldc.upenn.edu/LDC2017S12

In [None]:
# Build the KSUEmotions index by walking <root>/<phase>/<emotion folder>/<file>.
dir_list = os.listdir(KSUEmotions)
dir_list.sort()

# Emotion code found in the file name -> emotion label.
class_emotions = {'E00': 'neutral', 'E01': 'happy', 'E02': 'sad', 'E03': 'surprise',
                  'E04': 'questioning', 'E05': 'angry'}

emotion = []
path = []

for phase in dir_list:
    phase_dir = os.path.join(KSUEmotions, phase)
    if not os.path.isdir(phase_dir):
        continue
    for emotion_path in os.listdir(phase_dir):
        emotion_dir = os.path.join(phase_dir, emotion_path)
        if not os.path.isdir(emotion_dir):
            # Stray files at this level would make os.listdir raise.
            continue
        for audio_file in os.listdir(emotion_dir):
            # The first code contained in the file name decides the label.
            label = next(
                (class_emotions[key] for key in class_emotions if key in audio_file),
                None,
            )
            if label is None:
                # Files without a known code are skipped. Label and path are
                # always appended together so the two lists stay aligned
                # (previously the path was appended even when no label was).
                continue
            emotion.append(label)
            path.append(os.path.join(emotion_dir, audio_file))


KSUEmotions_df = pd.DataFrame(emotion, columns = ['labels'])
KSUEmotions_df['source'] = 'KSUEmotions'
KSUEmotions_df = pd.concat([KSUEmotions_df,pd.DataFrame(path, columns = ['path'])],axis=1)
KSUEmotions_df.labels.value_counts()

In [None]:
# Preview the first rows of the KSUEmotions index.
KSUEmotions_df.head()

In [None]:
# Stack the two Arabic corpora row-wise and show the label distribution.
arabic_frames = [ANAD_df, KSUEmotions_df]
df_Arab = pd.concat(arabic_frames, axis=0)
df_Arab["labels"].value_counts()

In [None]:
# Save the combined Arabic index to CSV without the row index column.
df_Arab.to_csv("Arabic_Data_path.csv",index=False)

In [None]:
# Print a summary of the combined Arabic index (row count, columns, dtypes).
df_Arab.info()
# Alternative: df_Arab.shape gives just the (rows, columns) tuple.

In [None]:
# Plot the number of clips per emotion in the combined Arabic dataset.
plt.figure(figsize =(21, 3))
# The labels are passed as `x=`: seaborn >= 0.12 treats the only positional
# argument as `data`, so a positional Series is no longer read as the
# x variable.
sns.countplot(x=df_Arab.labels)
plt.ylabel('Count', size=12)
plt.xlabel('Emotions', size=12)
sns.despine(top=True, right=True, left=False, bottom=False)
plt.show()

# <center> All Datasets </center>

In [None]:
# Stack the English and Arabic indexes row-wise and show the label
# distribution of the full collection.
language_frames = [df_Eng, df_Arab]
data_path = pd.concat(language_frames, axis=0)
data_path["labels"].value_counts()

In [None]:
# Keep every row whose label is neither "questioning" nor "disgust".
excluded_labels = ["questioning", "disgust"]
data_path = data_path[~data_path["labels"].isin(excluded_labels)]

In [None]:
# Save the filtered index of all datasets to CSV (without the row index
# column) and show the remaining label distribution.
data_path.to_csv("Data_path.csv",index=False)
data_path.labels.value_counts()

In [None]:
# Print a summary of the full index (row count, columns, dtypes).
data_path.info()
# Alternative: data_path.shape gives just the (rows, columns) tuple.

In [None]:
# Plot the number of clips per emotion across all datasets combined.
plt.figure(figsize =(21, 3))
# The labels are passed as `x=`: seaborn >= 0.12 treats the only positional
# argument as `data`, so a positional Series is no longer read as the
# x variable.
sns.countplot(x=data_path.labels)
plt.ylabel('Count', size=12)
plt.xlabel('Emotions', size=12)
sns.despine(top=True, right=True, left=False, bottom=False)
plt.show()

# <center> Feature extraction </center>

## Data Augmentation

In [None]:
def noise(data):
    """Return a copy of `data` with random Gaussian noise added.

    The noise amplitude is 0.035 times a uniform random factor in [0, 1)
    times the maximum value of `data`. The input array is not modified.
    """
    random_factor = np.random.uniform()
    amplitude = 0.035 * random_factor * np.amax(data)
    gaussian = np.random.normal(size=data.shape[0])
    return data + amplitude * gaussian

def stretch(data, rate=0.8):
    """Time-stretch the signal `data` by the factor `rate`.

    Thin wrapper around librosa.effects.time_stretch. `rate` is passed by
    keyword because librosa >= 0.10 accepts it only as a keyword argument;
    the keyword form is also valid on older releases.
    """
    return librosa.effects.time_stretch(data, rate=rate)

def shift(data):
    """Circularly rotate `data` by a random number of samples.

    The offset is a uniform draw from [-5, 5) scaled by 1000 and truncated
    to an integer; np.roll wraps the samples around the array ends.
    """
    offset = np.random.uniform(low=-5, high=5) * 1000
    return np.roll(data, int(offset))

def pitch(data, sampling_rate, pitch_factor=0.7):
    """Pitch-shift the signal `data` by `pitch_factor` steps.

    Thin wrapper around librosa.effects.pitch_shift. The sampling rate and
    step count are passed as `sr=` / `n_steps=` because librosa >= 0.10
    accepts them only as keyword arguments; the keyword form is also valid
    on older releases.
    """
    return librosa.effects.pitch_shift(data, sr=sampling_rate, n_steps=pitch_factor)


In [None]:
def extract_features(data, sample_rate, frame_length=2048, hop_length=512):
    """Return one flat feature vector for the audio signal `data`.

    Each feature is averaged over the time axis and the results are
    concatenated in this order: zero-crossing rate, chroma (from the STFT
    magnitude), MFCC, RMS energy, mel spectrogram.

    Parameters:
        data: audio samples (as returned by librosa.load).
        sample_rate: sampling rate of `data`.
        frame_length: analysis window / FFT size used for every feature.
        hop_length: number of samples between successive frames.

    `frame_length` and `hop_length` were previously accepted but ignored;
    they are now forwarded to librosa. Their defaults (2048 / 512) equal
    librosa's own defaults, so default calls produce the same features.
    """
    # Zero-crossing rate
    zcr = np.mean(
        librosa.feature.zero_crossing_rate(
            y=data, frame_length=frame_length, hop_length=hop_length
        ).T,
        axis=0,
    )

    # Chroma computed from the STFT magnitude.
    # NOTE(review): librosa documents `S` as a power spectrogram; the
    # magnitude is passed here and kept as-is to preserve existing feature
    # values - confirm this is intended.
    stft = np.abs(librosa.stft(data, n_fft=frame_length, hop_length=hop_length))
    chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)

    # MFCC
    mfcc = np.mean(
        librosa.feature.mfcc(
            y=data, sr=sample_rate, n_fft=frame_length, hop_length=hop_length
        ).T,
        axis=0,
    )

    # Root mean square energy
    rms = np.mean(
        librosa.feature.rms(
            y=data, frame_length=frame_length, hop_length=hop_length
        ).T,
        axis=0,
    )

    # Mel spectrogram
    mel = np.mean(
        librosa.feature.melspectrogram(
            y=data, sr=sample_rate, n_fft=frame_length, hop_length=hop_length
        ).T,
        axis=0,
    )

    # The leading empty array keeps the result dtype identical to the former
    # incremental stacking, which started from np.array([]).
    return np.hstack((np.array([]), zcr, chroma_stft, mfcc, rms, mel))

def get_features(path):
    """Load one audio file and return features for it and five augmentations.

    Returns a 2-D array with six rows, one feature vector (from
    extract_features) per variant, in this order:
        0. original signal
        1. original + noise
        2. pitch-shifted
        3. pitch-shifted + noise
        4. time-stretched
        5. pitch-shifted + random circular shift
    """
    # offset=0.4 skips the first 0.4 s of every file; no duration limit is set.
    # librosa.load returns a 1-D mono signal by default, so no transpose is needed.
    data, sample_rate = librosa.load(path, offset=0.4)

    # The pitch shift does not depend on random state, so it is computed once
    # and reused by the three variants built on it (it used to be recomputed
    # for each of them).
    data_pitch = pitch(data, sample_rate)

    # The list is evaluated in order, so the random draws made by noise() and
    # shift() happen in the same sequence as before.
    variants = [
        data,
        noise(data),
        data_pitch,
        noise(data_pitch),
        stretch(data),
        shift(data_pitch),
    ]

    # One row per variant.
    return np.vstack([extract_features(variant, sample_rate) for variant in variants])


In [None]:
# Extract features for every file in data_path. get_features returns one row
# per variant of a clip (the original plus its augmentations); each row is
# stored in X with the clip's label repeated in Y.
X, Y = [], []
i = 0  # running count of feature rows, used only for progress output
for path, emotion in zip(data_path["path"], data_path["labels"]):
    feature = get_features(path)
    for ele in feature:
        X.append(ele)
        Y.append(emotion)
        i += 1
        # Print the running row count every 500 rows.
        if not i % 500:
            print(i)

In [None]:
# Sanity check: number of feature rows, number of labels, and number of
# source files (X and Y hold several rows per file).
len(X), len(Y), data_path.path.shape

In [None]:
# Put the feature rows into a DataFrame (one column per feature value),
# attach the labels as the last column and save everything to CSV without
# the row index column.
Features_Augmentation = pd.DataFrame(X)
Features_Augmentation['labels'] = Y
Features_Augmentation.to_csv('features_augmentation.csv', index=False)
# Preview the first 20 rows.
Features_Augmentation.head(20)