REQUIRED LIBRARIES

In [None]:
!pip install librosa soundfile tensorflow scikit-learn




IMPORT LIBRARIES

In [None]:
import os
import librosa
import numpy as np
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from tensorflow.keras import layers, models


CONNECT GOOGLE DRIVE

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


SET DATASET PATH

In [None]:
# Root directory (under the /content/drive mount point) that is walked
# recursively for .wav files during feature extraction.
DATA_PATH = "/content/drive/MyDrive/RAVDESS/audio_speech_actors_01-24"


EMOTION LABEL MAPPING

In [None]:
# Maps the two-digit emotion code found in each filename ("01".."08")
# to a human-readable emotion label. Codes are assigned in tuple order,
# starting at 1 and zero-padded to two digits.
emotion_map = {
    f"{code:02d}": name
    for code, name in enumerate(
        (
            "neutral",
            "calm",
            "happy",
            "sad",
            "angry",
            "fearful",
            "disgust",
            "surprised",
        ),
        start=1,
    )
}


FEATURE EXTRACTION (MFCC)

In [None]:
# Build one fixed-length feature vector (mean of 40 MFCCs over time) and one
# emotion label per .wav file found under DATA_PATH.
features = []
labels = []

for root, dirs, files in os.walk(DATA_PATH):
    # os.walk yields entries in filesystem order; sort in place so the
    # traversal (and therefore the seeded train/test split) is reproducible.
    dirs.sort()
    for file in sorted(files):
        if not file.endswith(".wav"):
            continue

        # extract emotion from filename: third dash-separated field is the code
        name_parts = file.split("-")
        emotion_code = name_parts[2] if len(name_parts) > 2 else None
        emotion = emotion_map.get(emotion_code)
        if emotion is None:
            # Unexpected filename layout or unknown code: report and skip
            # instead of aborting the whole extraction with a bare KeyError.
            print(f"Skipping {file}: cannot determine emotion code")
            continue

        file_path = os.path.join(root, file)

        # load audio: skip the first 0.5 s and keep at most 3 s
        audio, sr = librosa.load(file_path, duration=3, offset=0.5)

        # extract MFCC and average over the time axis -> vector of length 40
        mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40)
        mfcc_scaled = np.mean(mfcc.T, axis=0)

        features.append(mfcc_scaled)
        labels.append(emotion)


CONVERT TO NUMPY ARRAYS

In [None]:
# Stack the per-file feature vectors and labels into arrays and report
# their shapes (one line each).
X, y = np.array(features), np.array(labels)

print(X.shape, y.shape, sep="\n")


(1440, 40)
(1440,)


ENCODE EMOTION LABELS

In [None]:
# Fit the encoder on the string labels, then map every label to its
# integer class index. `encoder` is kept for decoding predictions later.
encoder = LabelEncoder().fit(y)
y_encoded = encoder.transform(y)


TRAIN–TEST SPLIT

In [None]:
# Hold out 20% of the samples for testing. Stratifying on the encoded labels
# keeps the per-emotion class proportions the same in both partitions, which
# matters for a small multi-class dataset; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)


RESHAPE DATA FOR CNN

In [None]:
# Add a trailing channel axis so each sample is (n_features, 1), as Conv1D
# expects. An explicit reshape (instead of appending np.newaxis) makes this
# cell idempotent: re-running it does not keep adding extra axes.
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

print(X_train.shape)


(1152, 40, 1)


BUILD CNN + LSTM MODEL

In [None]:
# Size the output layer from the fitted label encoder so the softmax width
# always matches the set of labels that inverse_transform can decode.
num_classes = len(encoder.classes_)

# Two Conv1D/MaxPool/Dropout stages over the 40-element MFCC vector, followed
# by an LSTM over the resulting sequence and a small dense classifier head.
model = models.Sequential([
    # Explicit Input layer: passing input_shape= to the first layer is
    # discouraged for Sequential models and emits a warning.
    layers.Input(shape=(40, 1)),

    layers.Conv1D(64, 5, activation='relu'),
    layers.MaxPooling1D(2),
    layers.Dropout(0.3),

    layers.Conv1D(128, 5, activation='relu'),
    layers.MaxPooling1D(2),
    layers.Dropout(0.3),

    layers.LSTM(128),

    layers.Dense(64, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(num_classes, activation='softmax'),
])

# Labels are integer class indices, hence the sparse categorical loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


TRAIN MODEL

In [None]:
# Train for 50 epochs in mini-batches of 32, holding out 20% of the training
# data for validation; the returned History object is kept in `history`.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=50,
    validation_split=0.2,
)


Epoch 1/50
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 38ms/step - accuracy: 0.1225 - loss: 2.0870 - val_accuracy: 0.1775 - val_loss: 2.0144
Epoch 2/50
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - accuracy: 0.1857 - loss: 2.0317 - val_accuracy: 0.1991 - val_loss: 1.9927
Epoch 3/50
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step - accuracy: 0.2424 - loss: 1.9720 - val_accuracy: 0.2251 - val_loss: 1.9602
Epoch 4/50
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - accuracy: 0.2605 - loss: 1.9563 - val_accuracy: 0.2597 - val_loss: 1.9232
Epoch 5/50
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - accuracy: 0.2774 - loss: 1.9091 - val_accuracy: 0.2857 - val_loss: 1.8850
Epoch 6/50
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step - accuracy: 0.3016 - loss: 1.8737 - val_accuracy: 0.2727 - val_loss: 1.8848
Epoch 7/50
[1m29/29[0m [32m━━━━

EVALUATE MODEL

In [None]:
# Score the trained model on the held-out test set and report accuracy
# as a percentage.
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print("Test Accuracy:", accuracy * 100)


[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.4351 - loss: 1.6597
Test Accuracy: 45.48611044883728


TEST WITH NEW AUDIO

In [None]:
def predict_emotion(file_path):
    """Predict the emotion label for a single audio file.

    Uses the same feature extraction as the training loop: skip the first
    0.5 s, keep at most 3 s, compute 40 MFCCs and average them over time.

    Args:
        file_path: Path to an audio file readable by ``librosa.load``.

    Returns:
        The predicted emotion label, decoded with the fitted ``encoder``.

    Raises:
        ValueError: If the clip has no samples left after the 0.5 s offset.
    """
    audio, sr = librosa.load(file_path, duration=3, offset=0.5)
    if audio.size == 0:
        # Clips shorter than the offset load as an empty signal; fail with a
        # clear message instead of an obscure error inside the MFCC code.
        raise ValueError(
            f"No audio samples in {file_path!r} after the 0.5 s offset"
        )

    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40)
    mfcc = np.mean(mfcc.T, axis=0)
    # Shape (1, n_mfcc, 1): a batch of one sample with a single channel.
    mfcc = mfcc.reshape(1, -1, 1)

    prediction = model.predict(mfcc)
    # Batch of one, so the first (only) row's argmax is the class index.
    predicted_index = int(np.argmax(prediction, axis=1)[0])
    emotion = encoder.inverse_transform([predicted_index])
    return emotion[0]
