<a href="https://colab.research.google.com/github/bagustris/ravdess_song/blob/master/ravdess_song_si.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RAVDESS Song Emotion Recognition - Speaker Independent

In [1]:
# Fetch the project repository; the cells below read wav files from its archive/ folder.
!git clone https://github.com/bagustris/ravdess_song.git

Cloning into 'ravdess_song'...
remote: Enumerating objects: 1073, done.[K
remote: Counting objects: 100% (16/16), done.[K
remote: Compressing objects: 100% (16/16), done.[K
remote: Total 1073 (delta 4), reused 6 (delta 0), pack-reused 1057[K
Receiving objects: 100% (1073/1073), 230.46 MiB | 23.83 MiB/s, done.
Resolving deltas: 100% (8/8), done.
Checking out files: 100% (1022/1022), done.


In [2]:
cd ravdess_song/

/content/ravdess_song


In [13]:
import glob
import os
import librosa
import numpy as np
import tensorflow as tf

In [4]:
# Collect every wav file located one directory level below the data folder,
# in sorted (deterministic) order.
data_path = 'archive'
files = sorted(glob.glob(os.path.join(data_path + '/*/', '*.wav')))

In [5]:
# function to extract feature
def extract_feature(file_name):
    """Compute per-file acoustic statistics for one audio file.

    The file is loaded at its native sampling rate (``sr=None``). Five
    frame-level feature matrices are computed (40 MFCCs, chroma, mel
    spectrogram, spectral contrast, and tonnetz of the harmonic component)
    and each is summarised by its mean and standard deviation over time.

    Parameters
    ----------
    file_name : str
        Path of the audio file to read.

    Returns
    -------
    tuple of numpy.ndarray
        Ten 1-D arrays in this order: the means of (mfcc, chroma, mel,
        contrast, tonnetz) followed by the standard deviations in the same
        order. Stacked with ``np.hstack`` they give the 386 values per file
        seen as the model input length in this notebook.
    """
    X, sample_rate = librosa.load(file_name, sr=None)
    stft = np.abs(librosa.stft(X))

    # Compute every frame-level matrix exactly once and reuse it for both the
    # mean and the std (previously each one, including the costly harmonic
    # separation, was computed twice). The audio is passed as ``y=`` because
    # recent librosa releases make it keyword-only.
    # NOTE(review): chroma_stft documents ``S`` as a power spectrogram while a
    # magnitude spectrogram is passed here; kept as-is so feature values do
    # not change - confirm whether this is intended.
    frame_features = (
        librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40),
        librosa.feature.chroma_stft(S=stft, sr=sample_rate),
        librosa.feature.melspectrogram(y=X, sr=sample_rate),
        librosa.feature.spectral_contrast(S=stft, sr=sample_rate),
        librosa.feature.tonnetz(
            y=librosa.effects.harmonic(X), sr=sample_rate),
    )

    # Transpose to (frames, coefficients) and reduce over frames.
    means = tuple(np.mean(feat.T, axis=0) for feat in frame_features)
    stds = tuple(np.std(feat.T, axis=0) for feat in frame_features)
    return means + stds

In [6]:
# Containers filled by the extraction loop: feature vectors and integer
# labels, kept separately for the training and the test partition.
feat_train, feat_test = [], []
lab_train, lab_test = [], []

In [7]:
# iterate over all files
# Empty the containers first so that re-running this cell does not append
# every file a second time.
for bucket in (feat_train, feat_test, lab_train, lab_test):
    bucket.clear()

# Actors with an id above this value are held out as the test partition.
LAST_TRAIN_ACTOR = 20

for file in files:
    print("processing ...", file)
    feat_i = np.hstack(extract_feature(file))
    # File names consist of dash-separated numeric fields, e.g.
    # 03-02-01-01-01-01-01.wav: the third field is the emotion code and the
    # last field is the actor id.
    fields = os.path.splitext(os.path.basename(file))[0].split('-')
    lab_i = fields[2]
    label = int(lab_i) - 1  # make labels start from 0
    actor = int(fields[-1])
    # create speaker independent split
    if actor > LAST_TRAIN_ACTOR:
        feat_test.append(feat_i)
        lab_test.append(label)
    else:
        feat_train.append(feat_i)
        lab_train.append(label)

processing ... archive/Actor_01/03-02-01-01-01-01-01.wav
processing ... archive/Actor_01/03-02-01-01-01-02-01.wav
processing ... archive/Actor_01/03-02-01-01-02-01-01.wav
processing ... archive/Actor_01/03-02-01-01-02-02-01.wav
processing ... archive/Actor_01/03-02-02-01-01-01-01.wav
processing ... archive/Actor_01/03-02-02-01-01-02-01.wav
processing ... archive/Actor_01/03-02-02-01-02-01-01.wav
processing ... archive/Actor_01/03-02-02-01-02-02-01.wav
processing ... archive/Actor_01/03-02-02-02-01-01-01.wav
processing ... archive/Actor_01/03-02-02-02-01-02-01.wav
processing ... archive/Actor_01/03-02-02-02-02-01-01.wav
processing ... archive/Actor_01/03-02-02-02-02-02-01.wav
processing ... archive/Actor_01/03-02-03-01-01-01-01.wav
processing ... archive/Actor_01/03-02-03-01-01-02-01.wav
processing ... archive/Actor_01/03-02-03-01-02-01-01.wav
processing ... archive/Actor_01/03-02-03-01-02-02-01.wav
processing ... archive/Actor_01/03-02-03-02-01-01-01.wav
processing ... archive/Actor_01

In [10]:
# Turn the collected Python lists into numpy arrays (features -> X, labels -> y).
X_train, y_train = np.array(feat_train), np.array(lab_train)
X_test, y_test = np.array(feat_test), np.array(lab_test)

In [11]:
# reshape X for the CNN: append a trailing channel axis of size 1
X_train = X_train.reshape(*X_train.shape[:2], 1)
X_test = X_test.reshape(*X_test.shape[:2], 1)

In [14]:
# callbacks: stop once val_loss has not improved for 10 epochs and restore
# the weights of the best epoch
earlystop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

In [15]:
def model_cnn(input_shape=None, n_classes=6):
    """Build and compile the 1-D CNN classifier.

    Architecture: batch normalisation -> Conv1D (256 filters, kernel size
    128, stride 1, 'same' padding) -> ReLU -> max pooling (2) -> flatten ->
    dropout (0.4) -> softmax output layer.

    Parameters
    ----------
    input_shape : tuple of int, optional
        Shape of one sample, ``(steps, channels)``. When omitted it is read
        from the notebook-level ``X_train`` array, as before.
    n_classes : int, optional
        Number of output classes. Defaults to 6, the value previously
        hard-coded in the output layer.

    Returns
    -------
    tf.keras.Model
        Model compiled with sparse categorical cross-entropy (integer
        labels), the Adam optimizer and accuracy as metric.
    """
    if input_shape is None:
        # Backward-compatible default: derive the shape from the global
        # training array, so X_train must exist when no shape is given.
        input_shape = (X_train.shape[1], X_train.shape[2])
    else:
        input_shape = tuple(input_shape)

    layers = tf.keras.layers
    model = tf.keras.models.Sequential()
    model.add(layers.BatchNormalization(axis=-1, input_shape=input_shape))
    model.add(layers.Conv1D(256, 128, 1, padding='same'))
    model.add(layers.Activation('relu'))
    model.add(layers.MaxPooling1D(2))
    model.add(layers.Flatten())
    model.add(layers.Dropout(0.4))
    model.add(layers.Dense(n_classes, activation='softmax'))

    # compile model: set loss, optimizer, metric
    model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                  optimizer=tf.keras.optimizers.Adam(),
                  metrics=['accuracy'])
    return model

In [16]:
# create the model
model = model_cnn()
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that would emit an extra "None" line).
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
batch_normalization (BatchNo (None, 386, 1)            4         
_________________________________________________________________
conv1d (Conv1D)              (None, 386, 256)          33024     
_________________________________________________________________
activation (Activation)      (None, 386, 256)          0         
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 193, 256)          0         
_________________________________________________________________
flatten (Flatten)            (None, 49408)             0         
_________________________________________________________________
dropout (Dropout)            (None, 49408)             0         
_________________________________________________________________
dense (Dense)                (None, 6)                 2

In [17]:
# train the model
# validation_split holds out 10% of the training rows for validation; the
# early-stopping callback monitors the resulting val_loss.
hist = model.fit(
    X_train,
    y_train,
    batch_size=16,
    epochs=100,
    shuffle=True,
    validation_split=0.1,
    callbacks=[earlystop],  # Keras documents this argument as a list
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100


In [19]:
# Score the trained model on the held-out (speaker-independent) test partition.
# evaluate() returns the loss first, then the compiled metric (accuracy).
evaluate = model.evaluate(X_test, y_test, batch_size=16)
test_loss, test_accuracy = evaluate[0], evaluate[1]
print("Loss: ", test_loss, "--> Accuracy: ", test_accuracy)

Loss:  0.5350328683853149 --> Accuracy:  0.75
