In [7]:
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
import os
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, LSTM, Dense, Dropout, Reshape, BatchNormalization, GRU, Bidirectional
from tensorflow.keras.models import Model
import random
import seaborn as sns
from sklearn.metrics import confusion_matrix


In [2]:
# Function to load audio files and extract MFCCs
def load_audio_files(data_path):
    features, labels = [], []
    for actor in os.listdir(data_path):
        actor_path = os.path.join(data_path, actor)
        if os.path.isdir(actor_path):
            for file in os.listdir(actor_path):
                file_path = os.path.join(actor_path, file)
                if file.endswith(".wav"):
                    signal, sr = librosa.load(file_path, sr=22050)
                    mfccs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=40)
                    features.append(mfccs.T)
                    labels.append(file.split('-')[2])
    return features, labels

# Data Augmentation (Adding noise & shifting time)
def augment_audio(signal):
    noise = np.random.normal(0, 0.005, signal.shape)
    signal_noisy = signal + noise
    shift = int(0.1 * len(signal))  # Shift by 10%
    signal_shifted = np.roll(signal, shift)
    return signal_noisy, signal_shifted

# Function to plot spectrograms
def plot_spectrogram(y, sr, title="Spectrogram"):
    plt.figure(figsize=(10, 4))
    S = librosa.feature.melspectrogram(y=y, sr=sr)
    librosa.display.specshow(librosa.power_to_db(S, ref=np.max), sr=sr, x_axis='time', y_axis='mel')
    plt.colorbar(format='%+2.0f dB')
    plt.title(title)
    plt.show()

In [3]:
# Load data
data_path = "./Audio_Speech_Actors_01-24"  # Change to actual dataset path
features, labels = load_audio_files(data_path)

# Encode labels
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)
labels = to_categorical(labels)

# Padding sequences to the same length
max_len = max(len(feature) for feature in features)
features_padded = np.array([np.pad(f, ((0, max_len - len(f)), (0, 0)), mode='constant') for f in features])

# Splitting dataset
X_train, X_test, y_train, y_test = train_test_split(features_padded, labels, test_size=0.2, random_state=42)

# Reshape input to fit CNN input expectations
X_train = np.expand_dims(X_train, axis=-1)  # Shape: (samples, time, features, 1)
X_test = np.expand_dims(X_test, axis=-1)

In [12]:
class EmotionCNNLSTM(Model):
    def __init__(self, input_shape, num_classes):
        super(EmotionCNNLSTM, self).__init__()
        # Improved CNN layers
        self.conv1 = Conv2D(64, (3, 3), activation='relu', padding='same')
        self.bn1 = BatchNormalization()
        self.pool1 = MaxPooling2D((2, 2))
        self.dropout1 = Dropout(0.2)  # Light dropout after first conv block
        
        self.conv2 = Conv2D(128, (3, 3), activation='relu', padding='same')
        self.bn2 = BatchNormalization()
        self.pool2 = MaxPooling2D((2, 2))
        self.dropout2 = Dropout(0.3)  # Moderate dropout after second conv block
        
        self.conv3 = Conv2D(256, (3, 3), activation='relu', padding='same')
        self.bn3 = BatchNormalization()
        self.pool3 = MaxPooling2D((2, 2))
        
        self.flatten = Flatten()
        self.dense1 = Dense(256, activation='relu')
        self.bn4 = BatchNormalization()
        self.dropout3 = Dropout(0.4)
        
        # Stacked Bi-LSTM layers
        self.reshape = Reshape((1, -1))  # Reshape for LSTM
        self.lstm1 = Bidirectional(LSTM(128, return_sequences=True, 
                                       recurrent_dropout=0.2,
                                       kernel_regularizer=tf.keras.regularizers.l2(1e-4)))
        self.bn5 = BatchNormalization()
        self.lstm2 = Bidirectional(LSTM(64, return_sequences=False,
                                       recurrent_dropout=0.2,
                                       kernel_regularizer=tf.keras.regularizers.l2(1e-4)))
        
        self.dense2 = Dense(128, activation='relu')
        self.bn6 = BatchNormalization()
        self.dropout4 = Dropout(0.4)
        self.dense3 = Dense(num_classes, activation='softmax')

    def call(self, inputs, training=False):
        x = self.conv1(inputs)
        x = self.bn1(x, training=training)
        x = self.pool1(x)
        x = self.dropout1(x, training=training)
        
        x = self.conv2(x)
        x = self.bn2(x, training=training)
        x = self.pool2(x)
        x = self.dropout2(x, training=training)
        
        x = self.conv3(x)
        x = self.bn3(x, training=training)
        x = self.pool3(x)
        
        x = self.flatten(x)
        x = self.dense1(x)
        x = self.bn4(x, training=training)
        x = self.dropout3(x, training=training)
        
        x = self.reshape(x)
        x = self.lstm1(x)
        x = self.bn5(x, training=training)
        x = self.lstm2(x)
        
        x = self.dense2(x)
        x = self.bn6(x, training=training)
        x = self.dropout4(x, training=training)
        return self.dense3(x)


class EmotionCNNGRU(Model):
    def __init__(self, input_shape, num_classes):
        super(EmotionCNNGRU, self).__init__()
        # Improved CNN layers
        self.conv1 = Conv2D(64, (3, 3), activation='relu', padding='same')
        self.bn1 = BatchNormalization()
        self.pool1 = MaxPooling2D((2, 2))
        self.dropout1 = Dropout(0.2)
        
        self.conv2 = Conv2D(128, (3, 3), activation='relu', padding='same')
        self.bn2 = BatchNormalization()
        self.pool2 = MaxPooling2D((2, 2))
        self.dropout2 = Dropout(0.3)
        
        self.flatten = Flatten()
        self.dense1 = Dense(256, activation='relu')
        self.bn3 = BatchNormalization()
        self.dropout3 = Dropout(0.4)
        
        # Stacked Bi-GRU layers with regularization
        self.reshape = Reshape((1, -1))
        self.gru1 = Bidirectional(GRU(128, return_sequences=True, 
                                     recurrent_dropout=0.2,
                                     kernel_regularizer=tf.keras.regularizers.l2(1e-4)))
        self.bn4 = BatchNormalization()
        self.gru2 = Bidirectional(GRU(64, return_sequences=False,
                                     recurrent_dropout=0.2,
                                     kernel_regularizer=tf.keras.regularizers.l2(1e-4)))
        
        self.dense2 = Dense(128, activation='relu')
        self.bn5 = BatchNormalization()
        self.dropout4 = Dropout(0.4)
        self.dense3 = Dense(num_classes, activation='softmax')

    def call(self, inputs, training=False):
        x = self.conv1(inputs)
        x = self.bn1(x, training=training)
        x = self.pool1(x)
        x = self.dropout1(x, training=training)
        
        x = self.conv2(x)
        x = self.bn2(x, training=training)
        x = self.pool2(x)
        x = self.dropout2(x, training=training)
        
        x = self.flatten(x)
        x = self.dense1(x)
        x = self.bn3(x, training=training)
        x = self.dropout3(x, training=training)
        
        x = self.reshape(x)
        x = self.gru1(x)
        x = self.bn4(x, training=training)
        x = self.gru2(x)
        
        x = self.dense2(x)
        x = self.bn5(x, training=training)
        x = self.dropout4(x, training=training)
        return self.dense3(x)


# Alternative Model 2: Pure CNN
class EmotionPureCNN(Model):
    def __init__(self, input_shape, num_classes):
        super(EmotionPureCNN, self).__init__()
        self.conv1 = Conv2D(32, (3, 3), activation='relu', padding='same')
        self.bn1 = BatchNormalization()
        self.pool1 = MaxPooling2D((2, 2))
        self.conv2 = Conv2D(64, (3, 3), activation='relu', padding='same')
        self.bn2 = BatchNormalization()
        self.pool2 = MaxPooling2D((2, 2))
        self.flatten = Flatten()
        self.dense1 = Dense(128, activation='relu')
        self.dropout = Dropout(0.5)
        self.dense2 = Dense(num_classes, activation='softmax')

    def call(self, inputs):
        x = self.conv1(inputs)
        x = self.bn1(x)
        x = self.pool1(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.pool2(x)
        x = self.flatten(x)
        x = self.dense1(x)
        x = self.dropout(x)
        return self.dense2(x)

In [13]:
# Model selection
input_shape = (max_len, 40, 1)
num_classes = labels.shape[1]

# Choose one of the improved models
model = EmotionCNNLSTM(input_shape, num_classes)
# model = ImprovedEmotionCNNGRU(input_shape, num_classes)

# Learning rate scheduler
def lr_schedule(epoch):
    initial_lr = 0.001
    if epoch > 20:
        return initial_lr * 0.1
    elif epoch > 10:
        return initial_lr * 0.5
    else:
        return initial_lr

lr_scheduler = tf.keras.callbacks.LearningRateScheduler(lr_schedule)
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=5, restore_best_weights=True
)

# Compile with Adam optimizer with weight decay
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, clipnorm=1.0)

# Compile model
model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy']
)

# Train with callbacks
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=50,  # Increase epochs, early stopping will prevent overfitting
    batch_size=32,
    callbacks=[lr_scheduler, early_stopping]
)


Epoch 1/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 847ms/step - accuracy: 0.2028 - loss: 2.7918 - val_accuracy: 0.1493 - val_loss: 2.1821 - learning_rate: 0.0010
Epoch 2/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 796ms/step - accuracy: 0.3136 - loss: 2.1782 - val_accuracy: 0.1597 - val_loss: 2.1695 - learning_rate: 0.0010
Epoch 3/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 753ms/step - accuracy: 0.3848 - loss: 1.9475 - val_accuracy: 0.1528 - val_loss: 2.1814 - learning_rate: 0.0010
Epoch 4/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 672ms/step - accuracy: 0.4341 - loss: 1.7524 - val_accuracy: 0.1840 - val_loss: 2.1487 - learning_rate: 0.0010
Epoch 5/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 671ms/step - accuracy: 0.4451 - loss: 1.6352 - val_accuracy: 0.1632 - val_loss: 2.2465 - learning_rate: 0.0010
Epoch 6/50
[1m 5/36[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m 

KeyboardInterrupt: 

In [None]:
input_shape = (max_len, 40)  # (Time, Features)
num_classes = labels.shape[1]
model = EmotionCNNLSTM(input_shape, num_classes)

# Compile and train model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=30, batch_size=32)

Epoch 1/30
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 145ms/step - accuracy: 0.1297 - loss: 2.1215 - val_accuracy: 0.1910 - val_loss: 2.0559
Epoch 2/30
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 138ms/step - accuracy: 0.1889 - loss: 2.0579 - val_accuracy: 0.0938 - val_loss: 2.0685
Epoch 3/30
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 133ms/step - accuracy: 0.1643 - loss: 2.0557 - val_accuracy: 0.1111 - val_loss: 2.0627
Epoch 4/30
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 138ms/step - accuracy: 0.1666 - loss: 2.0531 - val_accuracy: 0.1597 - val_loss: 2.0423
Epoch 5/30
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 132ms/step - accuracy: 0.1901 - loss: 2.0377 - val_accuracy: 0.1389 - val_loss: 2.0603
Epoch 6/30
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 150ms/step - accuracy: 0.1372 - loss: 2.0722 - val_accuracy: 0.1111 - val_loss: 2.0642
Epoch 7/30
[1m36/36[0m [

In [10]:
input_shape = (max_len, 40)  # (Time, Features)
num_classes = labels.shape[1]
model = EmotionCNNGRU(input_shape, num_classes)

# Compile and train model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=30, batch_size=32)

Epoch 1/30
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 156ms/step - accuracy: 0.1344 - loss: 2.2684 - val_accuracy: 0.1111 - val_loss: 2.0710
Epoch 2/30
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 142ms/step - accuracy: 0.1582 - loss: 2.0586 - val_accuracy: 0.1111 - val_loss: 2.0703
Epoch 3/30
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 136ms/step - accuracy: 0.1440 - loss: 2.0580 - val_accuracy: 0.1042 - val_loss: 2.0672
Epoch 4/30
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 138ms/step - accuracy: 0.1369 - loss: 2.0678 - val_accuracy: 0.1076 - val_loss: 2.0748
Epoch 5/30
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 163ms/step - accuracy: 0.1517 - loss: 2.0561 - val_accuracy: 0.1424 - val_loss: 2.0858
Epoch 6/30
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 166ms/step - accuracy: 0.1489 - loss: 2.0713 - val_accuracy: 0.1111 - val_loss: 2.0854
Epoch 7/30
[1m36/36[0m [3

In [11]:
input_shape = (max_len, 40)  # (Time, Features)
num_classes = labels.shape[1]
model = EmotionPureCNN(input_shape, num_classes)

# Compile and train model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=30, batch_size=32)

Epoch 1/30
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 175ms/step - accuracy: 0.1317 - loss: 53.0898 - val_accuracy: 0.1771 - val_loss: 2.0213
Epoch 2/30
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 157ms/step - accuracy: 0.1942 - loss: 1.9711 - val_accuracy: 0.2222 - val_loss: 1.9334
Epoch 3/30
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 162ms/step - accuracy: 0.2562 - loss: 1.8508 - val_accuracy: 0.2778 - val_loss: 1.8731
Epoch 4/30
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 159ms/step - accuracy: 0.4177 - loss: 1.5605 - val_accuracy: 0.3160 - val_loss: 1.8104
Epoch 5/30
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 147ms/step - accuracy: 0.6365 - loss: 1.0686 - val_accuracy: 0.3681 - val_loss: 1.7708
Epoch 6/30
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 148ms/step - accuracy: 0.8257 - loss: 0.5673 - val_accuracy: 0.4340 - val_loss: 1.8760
Epoch 7/30
[1m36/36[0m [