<a href="https://colab.research.google.com/github/ayaelsayed25/Speech-Emotion-Recognition/blob/main/Speech_Emotion_Recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Exploring Audios, Extracting Features, Dataset splitting**

**Load the dataset and determine the average length of the sound files (every clip is then padded or cropped to that length so all inputs have the same size)**

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset archive is read from there below.
drive.mount('/content/drive')
# Install KerasTuner (unpinned); the hyperparameter-search cells import it as `kerastuner`.
!pip install keras-tuner

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import librosa
import zipfile
import io
# import drive
# Unpack the CREMA archive from the mounted Drive into ./mydataset.
# The context manager closes the archive handle as soon as extraction is done
# (the handle was previously left open for the lifetime of the kernel).
with zipfile.ZipFile('/content/drive/My Drive/content/Crema.zip') as zf:
    zf.extractall('mydataset')

In [None]:
# Load every .wav clip found under `path`, resampled to 16 kHz, and derive its
# label from characters 9..11 of the file name (e.g. 'DIS' in '1001_DFA_DIS_XX.wav').
path = '/content/mydataset/Crema/'
sounds = []
labels = []
for dirname, _, filenames in os.walk(path):
    # Sort so the sample order does not depend on the filesystem's listing order;
    # otherwise the seeded train/test split below is not reproducible.
    for filename in sorted(filenames):
        # Skip anything that is not an audio clip (hidden files, metadata, ...).
        if not filename.lower().endswith('.wav'):
            continue
        label = filename[9:12]
        # Join with the directory os.walk is currently visiting, not with `path`,
        # so clips inside sub-folders are found as well.
        sound = librosa.load(os.path.join(dirname, filename), sr=16000)[0]
        labels.append(label)
        sounds.append(sound)
print(len(sounds))

7442


In [None]:
import statistics
import math

# Target length (in samples) for every clip: the mean clip length, rounded down.
avg_len = math.floor(statistics.mean(len(clip) for clip in sounds))
print(avg_len)
# Shape of the last loaded clip, for comparison with the average.
# Read it from `sounds` rather than from the `sound` variable leaked by the
# loading loop, so this cell only depends on data it is explicitly given.
print(sounds[-1].shape)

40686
(45912,)


In [None]:
def add_padding(sound, required_len):
    """Crop or zero-pad a 1-D signal so it has exactly ``required_len`` samples.

    Args:
        sound: 1-D array-like of samples.
        required_len: Desired number of samples in the result.

    Returns:
        An array of length ``required_len``. A longer input is cut down to a
        randomly positioned window; a shorter input is placed at a random
        position inside a zero-filled array; an input of the right length is
        returned as an unpadded copy.

    The position is drawn from the global ``np.random`` state, so seed it for
    reproducible results.
    """
    surplus = len(sound) - required_len
    if surplus > 0:
        # Too long: keep a random window. randint's upper bound is exclusive,
        # so use surplus + 1 to make the last possible window reachable.
        offset = np.random.randint(surplus + 1)
        return sound[offset:offset + required_len]

    deficit = -surplus
    # Too short: choose how many zeros go in front (0..deficit inclusive); the
    # rest go behind. Nothing is drawn when the length already matches.
    offset = np.random.randint(deficit + 1) if deficit else 0
    return np.pad(sound, (offset, deficit - offset), "constant")

In [None]:
def extract_features(sounds, required_len, melspectrogram, sr=16000):
    """Bring every clip to a common length and optionally convert it to a log-mel spectrogram.

    No amplitude normalization is applied; clips are only cropped/padded via
    ``add_padding``.

    Args:
        sounds: Iterable of 1-D sample arrays.
        required_len: Number of samples each clip is cropped/padded to.
        melspectrogram: If truthy, return a dB-scaled mel spectrogram per clip
            instead of the raw (padded) waveform.
        sr: Sampling rate of the clips, passed to librosa. Defaults to 16000,
            the rate used when the dataset is loaded.

    Returns:
        A list with one entry per input clip: the padded waveform, or its
        mel spectrogram in dB (relative to the clip's peak, floored at -80 dB).
    """
    data = []
    for sound in sounds:
        sound = add_padding(sound, required_len)
        if melspectrogram:
            # Mel power spectrogram, then convert to dB relative to the clip maximum.
            mel_power = librosa.feature.melspectrogram(y=sound, sr=sr)
            sound = librosa.power_to_db(mel_power, top_db=80, ref=np.max)
        data.append(sound)
    return data

# **Time Domain Features**

In [None]:
# Time-domain features: the raw waveforms, cropped/padded to `avg_len` samples,
# stacked into a single (n_clips, avg_len) array.
sounds = extract_features(sounds, required_len=avg_len, melspectrogram=False)
data = np.array(sounds)
print(data.shape)

(7442, 40686)


In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

# Encode the classification labels as one-hot vectors.
labels = np.array(labels)
if labels.ndim == 1:
    # Only encode raw string labels. If this cell (or the equivalent one in the
    # melspectrogram section) already ran, `labels` is one-hot and feeding it
    # back into LabelEncoder would fail, so it is reused as is.
    le = LabelEncoder()
    labels = to_categorical(le.fit_transform(labels))
print(labels.shape)

# Split the dataset: 70% train; of the remaining 30%, test_size=0.95 sends 95%
# to x_test and leaves the other 5% in x_valid.
x_train, x_rem, y_train, y_rem = train_test_split(data, labels, train_size=0.7, random_state=42)
x_valid, x_test, y_valid, y_test = train_test_split(x_rem, y_rem, test_size=0.95, random_state=42)
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape, x_valid.shape, y_valid.shape)
print(type(x_train))

(7442, 6)
(5209, 128, 80) (5209, 6) (2122, 128, 80) (2122, 6) (111, 128, 80) (111, 6)
<class 'numpy.ndarray'>


In [None]:
from tensorflow.keras.layers import Conv1D, Dense, Dropout, Flatten, Input, MaxPooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

def build_model(hp):
    """Build and compile the tunable 1D-CNN used by the KerasTuner search.

    Architecture: four Conv1D -> MaxPooling1D(3) -> Dropout(0.3) stages, then
    Flatten, two Dense -> Dropout(0.3) stages and a softmax output with one
    unit per class.

    Args:
        hp: KerasTuner hyperparameter container used to sample the number of
            filters and kernel size of each conv stage, the width of each
            dense layer, and the learning rate.

    Returns:
        A compiled Keras model (categorical cross-entropy, Adam, accuracy).

    Reads the module-level ``x_train`` (for the input length) and ``labels``
    (for the number of classes).
    """
    # (min, max, step) of the filter count searched for each conv stage.
    conv_filter_ranges = [(4, 16, 2), (8, 32, 4), (16, 64, 8), (32, 128, 16)]

    inputs = Input(shape=(x_train.shape[1], 1))
    conv = inputs
    for stage, (lo, hi, step) in enumerate(conv_filter_ranges, start=1):
        # Each stage gets its own hyperparameter names. The original reused
        # 'conv_1_filter' / 'conv_1_kernel' for all four stages; KerasTuner
        # identifies hyperparameters by name, so the later ranges were not
        # searched independently.
        conv = Conv1D(filters=hp.Int('conv_%d_filter' % stage, min_value=lo, max_value=hi, step=step),
                      kernel_size=hp.Choice('conv_%d_kernel' % stage, values=[3, 15]),
                      padding='valid', activation='relu', strides=1)(conv)
        conv = MaxPooling1D(3)(conv)
        conv = Dropout(0.3)(conv)

    conv = Flatten()(conv)

    for stage in (1, 2):
        # Same reasoning: 'dense_1_units' was previously used for both layers.
        conv = Dense(units=hp.Int('dense_%d_units' % stage, min_value=32, max_value=128, step=16),
                     activation='relu')(conv)
        conv = Dropout(0.3)(conv)

    outputs = Dense(labels.shape[1], activation='softmax')(conv)
    model = Model(inputs, outputs)
    model.summary()
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(hp.Choice('learning_rate', values=[1e-2, 1e-3])),
                  metrics=['accuracy'])

    return model

In [None]:
from kerastuner import RandomSearch
from kerastuner.engine.hyperparameters import HyperParameters
# Imported here so this cell does not depend on a later cell having been run first.
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Random search over the hyperparameters declared in build_model, ranked by
# validation accuracy.
tuner_search = RandomSearch(build_model, objective='val_accuracy', max_trials=5,
                            directory='output', project_name='haha')
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10, min_delta=0.0001)
# Monitor 'val_accuracy' -- the same key the tuner objective uses -- instead of
# 'val_acc', so the checkpoint tracks a metric that is actually logged.
mc = ModelCheckpoint('best_model2.hdf5', monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')

tuner_search.search(x_train, y_train, epochs=50, callbacks=[es, mc], batch_size=32,
                    validation_data=(x_valid, y_valid))
# The tuner object is named `tuner_search`; the original referenced an undefined `tuner`.
best_model = tuner_search.get_best_models()[0]

# history=model.fit(x_train,y_train,epochs=100,callbacks=[es,mc],batch_size=32,validation_data=(x_valid,y_valid))

In [None]:
# Run the hyperparameter search and keep the best model found.
tuner_search.search(x_train, y_train, epochs=50, callbacks=[es, mc], batch_size=32,
                    validation_data=(x_valid, y_valid))
# The tuner object is named `tuner_search`; the original referenced an undefined `tuner`.
best_model = tuner_search.get_best_models()[0]

In [None]:
# Retrieve the single top-ranked model from the finished search.
best_model = tuner_search.get_best_models(num_models=1)[0]

In [None]:
from keras.layers import  Conv1D, MaxPooling1D, GlobalAveragePooling1D, Input, Dropout, Flatten, Dense
from keras.models import Model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import backend as K

# Hand-tuned 1D CNN on the raw waveform: four Conv1D -> MaxPooling1D(3) ->
# Dropout(0.3) stages with growing filter counts and shrinking kernels,
# followed by two dense layers and a softmax over the emotion classes.
inputs = Input(shape=(x_train.shape[1], 1))
conv = inputs
for n_filters, kernel_width in ((8, 13), (16, 11), (32, 9), (64, 7)):
    conv = Conv1D(n_filters, kernel_width, padding='valid', activation='relu', strides=1)(conv)
    conv = MaxPooling1D(3)(conv)
    conv = Dropout(0.3)(conv)

conv = Flatten()(conv)

for n_units in (256, 128):
    conv = Dense(n_units, activation='relu')(conv)
    conv = Dropout(0.3)(conv)

outputs = Dense(labels.shape[1], activation='softmax')(conv)
model = Model(inputs, outputs)
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Stop when the validation loss has not improved by at least 1e-4 for 10 epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10, min_delta=0.0001)
# Monitor 'val_accuracy' (the key the tuner objective in this notebook also
# uses) instead of 'val_acc', so the best-model checkpoint tracks a logged metric.
mc = ModelCheckpoint('best_model2.hdf5', monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
history = model.fit(x_train, y_train, epochs=100, callbacks=[es, mc], batch_size=32,
                    validation_data=(x_valid, y_valid))


In [None]:
# Evaluate the trained time-domain model on the held-out test split.
score = model.evaluate(x_test, y_test, verbose=1)
accuracy = 100*score[1]

# This cell follows model.fit, so the number is a post-training test accuracy,
# not a "pre-training" one as the message used to say.
print("Test accuracy: %.4f%%" % accuracy)

# **Melspectrogram**

In [None]:
# Melspectrogram features: one dB-scaled mel spectrogram per clip.
# The result is kept in its own variable so `sounds` keeps holding waveforms;
# overwriting it with spectrograms made this cell (and the time-domain cell)
# fail or misbehave when re-run.
mel_features = extract_features(sounds, avg_len, True)
data = np.array(mel_features)
print(data.shape)

(7442, 128, 80)


In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

# Encode the classification labels as one-hot vectors.
labels = np.array(labels)
if labels.ndim == 1:
    # Only encode raw string labels. When the time-domain section has already
    # run, `labels` is one-hot (2-D) and passing it to LabelEncoder again
    # would fail, so it is reused as is.
    le = LabelEncoder()
    labels = to_categorical(le.fit_transform(labels))
print(labels.shape)

# Split the dataset: 70% train; of the remaining 30%, test_size=0.95 sends 95%
# to x_test and leaves the other 5% in x_valid.
x_train, x_rem, y_train, y_rem = train_test_split(data, labels, train_size=0.7, random_state=42)
x_valid, x_test, y_valid, y_test = train_test_split(x_rem, y_rem, test_size=0.95, random_state=42)
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape, x_valid.shape, y_valid.shape)
print(type(x_train))

(7442, 6)
(5209, 128, 80) (5209, 6) (2122, 128, 80) (2122, 6) (111, 128, 80) (111, 6)
<class 'numpy.ndarray'>


**Display Melspectrogram of a Given Audio**

In [None]:
import matplotlib.pyplot as plt
import librosa.display
def display_melspectrogram(filename, sr=16000):
    """Plot the mel-frequency spectrogram (in dB) of one clip from the dataset folder.

    Args:
        filename: Name of the audio file inside the module-level ``path`` directory.
        sr: Sampling rate the clip is resampled to on load. Defaults to 16000,
            the rate used when the dataset is loaded for feature extraction,
            so the plot reflects the audio the models are trained on.
    """
    y, sr = librosa.load(os.path.join(path, filename), sr=sr)
    # 128 mel bands up to 8 kHz, converted to dB relative to the clip maximum.
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
    S_dB = librosa.power_to_db(S, ref=np.max)
    fig, ax = plt.subplots()
    img = librosa.display.specshow(S_dB, x_axis='time', y_axis='mel', sr=sr, fmax=8000, ax=ax)
    fig.colorbar(img, ax=ax, format='%+2.0f dB')
    ax.set(title='Mel-frequency spectrogram')
    plt.show()

In [None]:
display_melspectrogram('1001_DFA_DIS_XX.wav')

# **Melspectrogram Model**

**Preparation**

In [None]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, Conv2D, MaxPooling2D, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam
from keras.utils import np_utils
from sklearn import metrics

# Spectrogram dimensions, taken from the training split, plus a single channel
# axis so each sample has the (rows, columns, channels) shape given to Conv2D.
num_rows, num_columns = x_train.shape[1], x_train.shape[2]
num_channels = 1

# Append the channel axis to all three splits.
x_train, x_test, x_valid = (
    split.reshape(split.shape[0], num_rows, num_columns, num_channels)
    for split in (x_train, x_test, x_valid)
)

# Number of output classes and the shared conv/pool window sizes.
num_labels = labels.shape[1]
filter_size = pool_size = 2

**Construct Model Version 1**

In [None]:
# model = None
# model = Sequential()
# model.add(Conv2D(filters=16, kernel_size=filter_size, input_shape=(num_rows, num_columns, num_channels), activation='relu', strides=1))
# model.add(MaxPooling2D(pool_size=pool_size))
# model.add(Dropout(0.2))

# model.add(Conv2D(filters=32, kernel_size=filter_size, activation='relu', strides=1))
# model.add(MaxPooling2D(pool_size=pool_size))
# model.add(Dropout(0.2))

# model.add(Conv2D(filters=64, kernel_size=filter_size, activation='relu', strides=1))
# model.add(MaxPooling2D(pool_size=pool_size))
# model.add(Dropout(0.2))

# model.add(Conv2D(filters=128, kernel_size=filter_size, activation='relu', strides=1))
# model.add(MaxPooling2D(pool_size=pool_size))
# model.add(Dropout(0.2))
# model.add(Dense(num_labels, activation='softmax'))

**Construct Model Version 2**

In [None]:
# model = Sequential()
# model.add(Conv2D(filters=16, kernel_size=filter_size, input_shape=(num_rows, num_columns, num_channels), activation='relu', strides=1))
# model.add(MaxPooling2D(pool_size=pool_size))
# model.add(Dropout(0.2))

# model.add(Conv2D(filters=32, kernel_size=filter_size, activation='relu', strides=1))
# model.add(MaxPooling2D(pool_size=pool_size))
# model.add(Dropout(0.2))

# model.add(Conv2D(filters=64, kernel_size=filter_size, activation='relu', strides=1))
# model.add(MaxPooling2D(pool_size=pool_size))
# model.add(Dropout(0.2))

# model.add(Flatten())
# model.add(Dense(256, activation='relu'))
# model.add(Dropout(0.2))

# model.add(Dense(128, activation='relu'))
# model.add(Dropout(0.2))

# model.add(Dense(num_labels, activation='softmax'))

**Construct Model Version 3**

In [None]:
# Version 3: three 3x3 Conv2D -> MaxPooling2D -> Dropout(0.2) stages
# (16, 32, 64 filters), then a 512-256-128 dense head with dropout and a
# softmax output over the classes.
model = Sequential()

for stage, n_filters in enumerate((16, 32, 64)):
    conv_kwargs = dict(filters=n_filters, kernel_size=3, activation='relu', strides=1)
    if stage == 0:
        # Only the first layer declares the input shape.
        conv_kwargs['input_shape'] = (num_rows, num_columns, num_channels)
    model.add(Conv2D(**conv_kwargs))
    model.add(MaxPooling2D(pool_size=pool_size))
    model.add(Dropout(0.2))

model.add(Flatten())
for n_units in (512, 256, 128):
    model.add(Dense(n_units, activation='relu'))
    model.add(Dropout(0.2))

model.add(Dense(num_labels, activation='softmax'))

In [None]:
# Compile with Adam and categorical cross-entropy, tracking accuracy.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Print the layer-by-layer architecture.
model.summary()

# Baseline: accuracy of the still-untrained network on the test split.
score = model.evaluate(x_test, y_test, verbose=1)
accuracy = score[1] * 100

print("Pre-training accuracy: %.4f%%" % accuracy)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 126, 78, 16)       160       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 63, 39, 16)       0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 63, 39, 16)        0         
                                                                 
 conv2d_1 (Conv2D)           (None, 61, 37, 32)        4640      
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 30, 18, 32)       0         
 2D)                                                             
                                                                 
 dropout_1 (Dropout)         (None, 30, 18, 32)        0

In [None]:
from keras.callbacks import ModelCheckpoint 
from datetime import datetime 
import os

num_epochs = 100
num_batch_size = 256

# Make sure the checkpoint directory exists before training starts, so saving
# the first checkpoint cannot fail on a missing folder.
checkpoint_path = 'saved_models/weights.best.basic_cnn4.hdf5'
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

# Keep only the best weights seen so far.
checkpointer = ModelCheckpoint(filepath=checkpoint_path,
                               verbose=1, save_best_only=True)
start = datetime.now()

# NOTE(review): here x_test/y_test act as the validation set during training,
# while the Evaluation section reports on x_valid/y_valid -- the roles are the
# reverse of the time-domain section. Confirm this is intended.
model.fit(x_train, y_train, batch_size=num_batch_size, epochs=num_epochs, validation_data=(x_test, y_test), callbacks=[checkpointer], verbose=1)


duration = datetime.now() - start
print("Training completed in time: ", duration)

# **Evaluation**

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Overall accuracy on the x_valid split.
score = model.evaluate(x_valid, y_valid, verbose=0)
print("Accuracy", score[1])

# Turn one-hot targets and softmax outputs into class indices once, then
# reuse them for both the per-class report and the confusion matrix.
y_predict = model.predict(x_valid)
true_classes = np.argmax(y_valid, axis=1)
predicted_classes = np.argmax(y_predict, axis=1)

print(classification_report(true_classes, predicted_classes))
cm = confusion_matrix(true_classes, predicted_classes)
print(cm)

Accuracy 0.45045045018196106
              precision    recall  f1-score   support

           0       0.50      0.70      0.58        23
           1       0.24      0.17      0.20        24
           2       0.50      0.50      0.50        12
           3       0.35      0.43      0.39        14
           4       0.64      0.43      0.51        21
           5       0.47      0.53      0.50        17

    accuracy                           0.45       111
   macro avg       0.45      0.46      0.45       111
weighted avg       0.45      0.45      0.44       111

[[16  2  2  2  1  0]
 [ 8  4  0  6  1  5]
 [ 0  1  6  2  0  3]
 [ 5  1  1  6  1  0]
 [ 3  5  1  1  9  2]
 [ 0  4  2  0  2  9]]
