In [18]:
import os
import numpy as np
import librosa
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Dropout
from keras.regularizers import l2
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint

LOADING DATASET & SPLITTING IT INTO TRAIN AND TEST

In [19]:
def load_data(directory):
    labels = []
    features = []
    for label in os.listdir(directory):
        class_path = os.path.join(directory, label)
        if os.path.isdir(class_path):
            for file in os.listdir(class_path):
                file_path = os.path.join(class_path, file)
                if file.endswith('.wav'):
                    y, sr = librosa.load(file_path, sr=None)
                    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
                    mfcc = np.mean(mfcc.T, axis=0)
                    features.append(mfcc)
                    labels.append(label)
    return features, labels

In [20]:
features, labels = load_data('./Sound Source/')

In [21]:
label_dict = {label: num for num, label in enumerate(sorted(set(labels)))}
numeric_labels = [label_dict[label] for label in labels]

In [22]:
X = np.array(features)
y = to_categorical(numeric_labels)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 

CREATING THE MODEL

In [24]:
model = Sequential()
model.add(LSTM(128, input_shape=(X_train.shape[1], 1), return_sequences=True, kernel_regularizer = l2(0.01)))

model.add(Dropout(0.5))
model.add(LSTM(64, return_sequences=False, kernel_regularizer=l2(0.01)))
model.add(Dense(y_train.shape[1], activation='softmax'))

In [25]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [26]:
# define early stopping
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)

In [27]:
# create a model checkpoint callback
mc = ModelCheckpoint('Models/audio_best_model.keras', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True)

In [28]:
model.fit(X_train, y_train, epochs=100, batch_size=64, validation_data=(X_test, y_test), callbacks=[mc])

Epoch 1/100
[1m17/18[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 14ms/step - accuracy: 0.3228 - loss: 2.9658
Epoch 1: val_accuracy improved from -inf to 0.36727, saving model to Models/audio_best_model.keras
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 38ms/step - accuracy: 0.3271 - loss: 2.9476 - val_accuracy: 0.3673 - val_loss: 2.4791
Epoch 2/100
[1m16/18[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 15ms/step - accuracy: 0.3964 - loss: 2.3894
Epoch 2: val_accuracy improved from 0.36727 to 0.38545, saving model to Models/audio_best_model.keras
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.3974 - loss: 2.3735 - val_accuracy: 0.3855 - val_loss: 2.0364
Epoch 3/100
[1m17/18[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 14ms/step - accuracy: 0.4332 - loss: 1.9688
Epoch 3: val_accuracy improved from 0.38545 to 0.42909, saving model to Models/audio_best_model.keras
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x1c85f411b50>

MODEL EVALUATION

In [29]:
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7597 - loss: 1.1510 
Test accuracy: 0.7527272701263428


In [30]:
# model.save('audio_model.h5')

In [31]:
np.save('Train Test Splits/A_X_train.npy', X_train)
np.save('Train Test Splits/A_X_test.npy', X_test)
np.save('Train Test Splits/A_y_train.npy', y_train)
np.save('Train Test Splits/A_y_test.npy', y_test)

LOADING THE MODEL

In [32]:
#model = load_model('./audio_model.h5')
# load the saved model
model = load_model('Models/audio_best_model.keras')
X_test = np.load('Train Test Splits/A_X_test.npy')
y_test = np.load('Train Test Splits/A_y_test.npy')

In [33]:
predictions = model.predict(X_test)
predicted_classes = np.argmax(predictions, axis=1)
true_classes = np.argmax(y_test, axis=1)

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step


In [34]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(true_classes, predicted_classes))
print(confusion_matrix(true_classes, predicted_classes))

              precision    recall  f1-score   support

           0       0.88      0.75      0.81        71
           1       0.71      0.72      0.72        68
           2       0.64      0.80      0.71        59
           3       0.85      0.79      0.82        77

    accuracy                           0.76       275
   macro avg       0.77      0.76      0.76       275
weighted avg       0.78      0.76      0.77       275

[[53  6 11  1]
 [ 1 49  9  9]
 [ 4  7 47  1]
 [ 2  7  7 61]]


TESTING WITH NEW DATA

In [35]:
def predict_emotion(audio_file_path, model):
    y, sr = librosa.load(audio_file_path, sr=None)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    mfcc = np.mean(mfcc.T, axis=0)
    
    input_data = np.expand_dims(mfcc, axis=0)
    predictions = model.predict(input_data)
    predicted_class = np.argmax(predictions[0])

    return predicted_class, predictions[0][predicted_class]

In [36]:
#audio_file_path = './test/Angry/6783_kz_tirtil.wav' 
#audio_file_path = './test/Calm/7895_sk_deyim.wav'  
#audio_file_path = './test/Happy/6783_mt_ordek.wav'  
#audio_file_path = './test/Sad/6783_hl_sepet.wav' 

In [37]:
#audio_file_path = './test/Angry/7895_kz_japon.wav' 
#audio_file_path = './test/Calm/7895_sk_japon.wav' 

#audio_file_path = './test/Sad/7895_hl_japon.wav' 

In [38]:

#audio_file_path = './new sound/angry_gommek.ogg' 
#audio_file_path = './new sound/calm_gommek.ogg' 
#audio_file_path = './new sound/happy_gommek.ogg' 
#audio_file_path = './new sound/sad_gommek.ogg' 

#audio_file_path = './new sound/angry_sira.ogg' 
#audio_file_path = './new sound/calm_sira.ogg' 
#audio_file_path = './new sound/happy_sira.ogg' 
#audio_file_path = './new sound/sad_sira.ogg' 

audio_file_path = './test/Sad/7895_hl_saf.wav'

In [39]:
predicted_class, confidence = predict_emotion(audio_file_path, model)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step


In [40]:
emotions = {0: 'Angry', 1: 'Calm', 2: 'Happy', 3: 'Sad'}
predicted_emotion = emotions[predicted_class]
print("Predicted emotion: ", predicted_emotion)
print("Prediction confidence: %", confidence * 100)

Predicted emotion:  Angry
Prediction confidence: % 99.87126588821411
