<a href="https://colab.research.google.com/github/atharva753/SPEECH-EMOTION-RECOGNITION-USING-RNN/blob/main/Train_CRNN_Model_RAVDESS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount the user's Google Drive inside the Colab runtime so that files stored
# on Drive can be read through ordinary filesystem paths.
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)


Mounted at /content/drive


In [None]:
# Install necessary libraries if not already installed
!pip install numpy scikit-learn tensorflow

import os
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, TimeDistributed, Conv2D, MaxPooling2D, Flatten, LSTM, Dense, Dropout, BatchNormalization, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# ---------------------------
# STEP 1: Load Processed Data from Google Drive
# ---------------------------
# Folder on the mounted Drive that holds the pre-extracted feature arrays.
features_folder = '/content/drive/MyDrive/ser/extracted_features'

# Load the feature array and the label array from that folder.
# Expected shapes: X -> (2880, 143, 40), y -> (2880, num_classes).
X, y = (
    np.load(os.path.join(features_folder, array_file))
    for array_file in ("X.npy", "y.npy")
)

# Show the loaded shapes so a mismatch is visible before any training starts.
print("Original X shape:", X.shape)
print("y shape:", y.shape)

# ---------------------------
# STEP 2: Preprocess the Data
# ---------------------------
# The model applies Conv2D to every time step, so each step must be a 3D
# "image" of shape (height, width, channels). X is loaded as
# (samples, time_steps, n_mfcc); appending a width axis and a channel axis,
# both of size 1, turns each step into an (n_mfcc, 1, 1) image:
# (2880, 143, 40) -> (2880, 143, 40, 1, 1).
X = X[..., np.newaxis, np.newaxis]
print("New X shape (with channel and extra dim):", X.shape)

# Split data into training and testing sets (80/20 split).
# y is expected to be one-hot encoded (one column per class), so argmax
# recovers an integer class id per sample. Stratifying on those ids keeps the
# per-class proportions the same in both splits instead of leaving the class
# balance of the test set to chance.
class_ids = np.argmax(y, axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=class_ids
)
print("Training samples:", X_train.shape[0])
print("Testing samples:", X_test.shape[0])

# ---------------------------
# STEP 3: Build the CRNN Model using an Input layer
# ---------------------------
# Per-sample shape is (time_steps, n_mfcc, 1, 1); the two trailing singleton
# axes are unpacked only to discard them.
time_steps, n_mfcc, _, _ = X.shape[1:]
num_classes = y.shape[1]


def _conv_block(filters):
    """Return one per-time-step Conv2D -> BatchNorm -> MaxPool -> Dropout stack.

    Every layer is wrapped in TimeDistributed so it is applied to each time
    step's (n_mfcc, 1, 1) "image" independently. The (3, 1) kernel and (2, 1)
    pool act along the first image axis only, because the second has size 1.
    """
    return [
        TimeDistributed(Conv2D(filters=filters, kernel_size=(3, 1), activation='relu', padding='same')),
        TimeDistributed(BatchNormalization()),
        TimeDistributed(MaxPooling2D(pool_size=(2, 1))),
        TimeDistributed(Dropout(0.3)),
    ]


model = Sequential([
    # One sample is a sequence of time_steps images of shape (n_mfcc, 1, 1).
    Input(shape=(time_steps, n_mfcc, 1, 1)),

    # Two convolutional blocks, with 32 and then 64 filters.
    *_conv_block(32),
    *_conv_block(64),

    # Collapse each time step's feature map to a vector for the recurrent layers.
    TimeDistributed(Flatten()),

    # Bidirectional LSTMs over the time axis; the second returns only its
    # final output, which feeds the classifier.
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.5),
    Bidirectional(LSTM(128)),
    Dropout(0.5),

    # Final classification layer: one softmax output per class.
    Dense(num_classes, activation='softmax'),
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# ---------------------------
# STEP 4: Train the Model
# ---------------------------
# Both callbacks monitor validation accuracy.
# Stop once it has not improved for 10 epochs and roll back to the best weights.
stop_when_stalled = EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True)
# Halve the learning rate after 5 epochs without improvement, never below 1e-5.
halve_lr_on_plateau = ReduceLROnPlateau(monitor='val_accuracy', factor=0.5, patience=5, min_lr=1e-5)
callbacks = [stop_when_stalled, halve_lr_on_plateau]

# validation_split carves the validation set out of the training data, so the
# test set is not seen during training.
history = model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    callbacks=callbacks,
)

# ---------------------------
# STEP 5: Evaluate the Model
# ---------------------------
# evaluate() returns the loss followed by the compiled metrics (accuracy here).
test_metrics = model.evaluate(X_test, y_test, verbose=0)
loss, accuracy = test_metrics
print(f"Test Accuracy: {accuracy * 100:.2f}%")


Original X shape: (2880, 143, 40)
y shape: (2880, 8)
New X shape (with channel and extra dim): (2880, 143, 40, 1, 1)
Training samples: 2304
Testing samples: 576


Epoch 1/50
58/58 ━━━━━━━━━━━━━━━━━━━━ 686s 9s/step - accuracy: 0.1895 - loss: 2.0672 - val_accuracy: 0.2581 - val_loss: 1.9392 - learning_rate: 0.0010
Epoch 2/50
58/58 ━━━━━━━━━━━━━━━━━━━━ 525s 9s/step - accuracy: 0.2635 - loss: 1.8709 - val_accuracy: 0.2386 - val_loss: 2.0406 - learning_rate: 0.0010
Epoch 3/50
58/58 ━━━━━━━━━━━━━━━━━━━━ 484s 8s/step - accuracy: 0.3413 - loss: 1.7238 - val_accuracy: 0.2625 - val_loss: 1.8394 - learning_rate: 0.0010
Epoch 4/50
58/58 ━━━━━━━━━━━━━━━━━━━━ 473s 8s/step - accuracy: 0.3722 - loss: 1.6489 - val_accuracy: 0.3102 - val_loss: 1.8927 - learning_rate: 0.0010
Epoch 5/50
58/58 ━━━━━━━━━━━━━━━━━━━━ 509s 8s/step - accuracy: 0.3913 - loss: 1.5500 - val_accuracy: 0.3384 - val_loss: 1.8272 - learning_rate: 0.0010
Epoch 6/50
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m492s[0