In [1]:
!pip install tensorflow librosa numpy pandas scikit-learn



In [32]:
import os
import numpy as np
import pandas as pd
import librosa
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load the dataset
def load_data(data_dirs):
    labels = []
    features = []

    for data_dir in data_dirs:
        for file in os.listdir(data_dir):
            if file.endswith('.wav'):
                # Load audio file
                file_path = os.path.join(data_dir, file)
                signal, sr = librosa.load(file_path, sr=22050)

                # Extract features (MFCC)
                mfccs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=50)
                mfccs = np.mean(mfccs.T, axis=0)

                features.append(mfccs)
                # Extract emotion label from filename (assumes format: xx-xx-xx-emotion-xx.wav)
                emotion = file.split('-')[2]  # Adjust this based on your file naming convention
                labels.append(emotion)

    return np.array(features), np.array(labels)

# Load the dataset
actors = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24']

data_dirs = []
for actor in actors:
    data_dirs.append(f'/content/drive/MyDrive/RAVDESS/Actor_{actor}')

X, y = load_data(data_dirs)

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.1, random_state=42)


In [67]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, MultiHeadAttention, LayerNormalization, Embedding
from tensorflow.keras.optimizers import SGD

def create_model(input_shape, num_classes):
    inputs = Input(shape=input_shape)

    # LSTM Layer
    lstm_out = LSTM(512, return_sequences=True)(inputs) #128
    lstm_out = Dropout(0.2)(lstm_out)

    # Transformer Layer
    for _ in range(4):
        attention_output = MultiHeadAttention(num_heads=4, key_dim=512)(lstm_out, lstm_out)
        attention_output = LayerNormalization(epsilon=1e-6)(attention_output + lstm_out)
        attention_output = Dropout(0.2)(attention_output)
        ff_output  = Dense(512, activation='relu')(attention_output)
        ff_output  = Dropout(0.2)(ff_output)
        #ff_output  = Dense(64)(ff_output)
        lstm_out = LayerNormalization(epsilon=1e-6)(attention_output + ff_output)
        lstm_out = Dropout(0.2)(lstm_out)

    # Fully Connected Layers
    flatten = tf.keras.layers.Flatten()(lstm_out)
    dense_out = Dense(128, activation='relu')(flatten)
    dense_out = Dropout(0.2)(dense_out)
    outputs = Dense(num_classes, activation='softmax')(dense_out)

    model = Model(inputs, outputs)
    return model

# Define model parameters
input_shape = (X_train.shape[1], 1)  # Adjust as needed
num_classes = len(np.unique(y_encoded))

# Create the model
model = create_model(input_shape, num_classes)
optimizer = SGD(learning_rate=0.001, momentum=0.9)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [68]:
# Reshape data for LSTM input
X_train_reshaped = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test_reshaped = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

# Train the model
history = model.fit(X_train_reshaped, y_train, epochs=250, batch_size=120, validation_data=(X_test_reshaped, y_test))


Epoch 1/250
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 424ms/step - accuracy: 0.1603 - loss: 3.0111 - val_accuracy: 0.1736 - val_loss: 2.4589
Epoch 2/250
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 367ms/step - accuracy: 0.1745 - loss: 2.4364 - val_accuracy: 0.2153 - val_loss: 1.9803
Epoch 3/250
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 372ms/step - accuracy: 0.1720 - loss: 2.2056 - val_accuracy: 0.2639 - val_loss: 1.9531
Epoch 4/250
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 376ms/step - accuracy: 0.1796 - loss: 2.0775 - val_accuracy: 0.2361 - val_loss: 1.9505
Epoch 5/250
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 377ms/step - accuracy: 0.1990 - loss: 2.0122 - val_accuracy: 0.1875 - val_loss: 1.9734
Epoch 6/250
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 389ms/step - accuracy: 0.1971 - loss: 1.9733 - val_accuracy: 0.1806 - val_loss: 1.9388
Epoch 7/250
[1m11/11

In [69]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test_reshaped, y_test)
print(f"Test Accuracy: {accuracy:.4f}")


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.4988 - loss: 1.5940
Test Accuracy: 0.5278
