In [21]:
# =================================
# 1. IMPORT LIBRARIES
# =================================
import os
import numpy as np
import librosa

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping


# =================================
# 2. DOWNLOAD DATASET FROM KAGGLE
# =================================
import kagglehub

DATA_PATH = kagglehub.dataset_download(
    "uwrfkaggler/ravdess-emotional-speech-audio"
)

print("Dataset Path:", DATA_PATH)


# =================================
# 3. EMOTION MAPPING (RAVDESS)
# =================================
emotion_map = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "surprised"
}


# =================================
# 4. FEATURE EXTRACTION FUNCTION
# =================================
def extract_mfcc(file_path, n_mfcc=40):
    audio, sr = librosa.load(file_path, duration=3, offset=0.5)
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
    return mfcc.T   # (time_steps, features)


# =================================
# 5. LOAD AUDIO FILES
# =================================
features = []
labels = []

for root, dirs, files in os.walk(DATA_PATH):
    for file in files:
        if file.endswith(".wav"):
            file_path = os.path.join(root, file)

            # RAVDESS filename format:
            # 03-01-05-01-01-01-01.wav → emotion = 05
            emotion_code = file.split("-")[2]

            if emotion_code in emotion_map:
                mfcc = extract_mfcc(file_path)
                features.append(mfcc)
                labels.append(emotion_map[emotion_code])

print("Total Samples:", len(features))
print("Emotion Classes:", set(labels))


# =================================
# 6. PAD SEQUENCES
# =================================
max_len = max(f.shape[0] for f in features)

X = pad_sequences(
    features,
    maxlen=max_len,
    dtype="float32",
    padding="post"
)

# Encode labels
le = LabelEncoder()
y_encoded = le.fit_transform(labels)
y = to_categorical(y_encoded)

print("Encoded Classes:", le.classes_)


# =================================
# 7. TRAIN / TEST SPLIT
# =================================
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


# =================================
# 8. BUILD LSTM MODEL
# =================================
model = Sequential([
    LSTM(128, return_sequences=True, input_shape=(X.shape[1], X.shape[2])),
    Dropout(0.3),

    LSTM(64),
    Dropout(0.3),

    Dense(64, activation="relu"),
    Dense(y.shape[1], activation="softmax")
])

model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()


# =================================
# 9. TRAIN MODEL
# =================================
early_stop = EarlyStopping(
    monitor="val_loss",
    patience=5,
    restore_best_weights=True
)

model.fit(
    X_train,
    y_train,
    epochs=40,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stop]
)


# =================================
# 10. EVALUATE MODEL
# =================================
loss, accuracy = model.evaluate(X_test, y_test)
print(f"\nTest Accuracy: {accuracy*100:.2f}%")


# =================================
# 11. SAMPLE PREDICTIONS
# =================================
y_pred_probs = model.predict(X_test)
y_pred_classes = np.argmax(y_pred_probs, axis=1)
y_true_classes = np.argmax(y_test, axis=1)

pred_labels = le.inverse_transform(y_pred_classes)
true_labels = le.inverse_transform(y_true_classes)

print("\nSample Predictions:")
for i in range(10):
    print(f"True: {true_labels[i]} --> Predicted: {pred_labels[i]}")


# =================================
# 12. PREDICT NEW AUDIO FILE
# =================================
def predict_emotion(audio_path):
    mfcc = extract_mfcc(audio_path)
    mfcc = pad_sequences([mfcc], maxlen=max_len, padding="post")
    pred = model.predict(mfcc)
    return le.inverse_transform([np.argmax(pred)])[0]


# Example Prediction
sample_audio = file_path  # last loaded audio
print("\nPredicted Emotion:", predict_emotion(sample_audio))


Using Colab cache for faster access to the 'ravdess-emotional-speech-audio' dataset.
Dataset Path: /kaggle/input/ravdess-emotional-speech-audio
Total Samples: 2880
Emotion Classes: {'surprised', 'angry', 'sad', 'happy', 'fearful', 'calm', 'neutral', 'disgust'}
Encoded Classes: ['angry' 'calm' 'disgust' 'fearful' 'happy' 'neutral' 'sad' 'surprised']


  super().__init__(**kwargs)


Epoch 1/40
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 270ms/step - accuracy: 0.1917 - loss: 2.0311 - val_accuracy: 0.2842 - val_loss: 1.7897
Epoch 2/40
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 256ms/step - accuracy: 0.3335 - loss: 1.7405 - val_accuracy: 0.3536 - val_loss: 1.6509
Epoch 3/40
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 263ms/step - accuracy: 0.3744 - loss: 1.6270 - val_accuracy: 0.3883 - val_loss: 1.5436
Epoch 4/40
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 260ms/step - accuracy: 0.4436 - loss: 1.4791 - val_accuracy: 0.4230 - val_loss: 1.4872
Epoch 5/40
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 260ms/step - accuracy: 0.4891 - loss: 1.3371 - val_accuracy: 0.4772 - val_loss: 1.3774
Epoch 6/40
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 255ms/step - accuracy: 0.5299 - loss: 1.2746 - val_accuracy: 0.4772 - val_loss: 1.3731
Epoch 7/40
[1m58/58[