In [None]:
from google.colab import files

# Prompt for kaggle.json. The return value (a dict of filename -> raw bytes) is
# assigned rather than left as the cell's last expression, so the API token is
# not echoed into the notebook's saved output.
# NOTE: any token that was ever displayed in a shared notebook should be
# regenerated from the Kaggle account settings.
_uploaded = files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<REDACTED>","key":"<REDACTED>"}'}

In [None]:
import os
import shutil

# Location where the Kaggle CLI looks for its API token.
KAGGLE_DIR = "/root/.kaggle"
KAGGLE_TOKEN = os.path.join(KAGGLE_DIR, "kaggle.json")

# Make a Kaggle directory
os.makedirs(KAGGLE_DIR, exist_ok=True)

# Move kaggle.json to the correct location. The existence check keeps the cell
# re-runnable after the file has already been moved.
if os.path.exists("kaggle.json"):
    shutil.move("kaggle.json", KAGGLE_TOKEN)

# Restrict the token to owner read/write. The mode must be an octal literal:
# the decimal value 600 equals 0o1130, which is not rw-------.
os.chmod(KAGGLE_TOKEN, 0o600)

In [None]:
# Download the ejlok1/cremad dataset with the Kaggle CLI and unzip it.
# Requires the API token to be installed for the CLI beforehand.
!kaggle datasets download -d ejlok1/cremad --unzip


Dataset URL: https://www.kaggle.com/datasets/ejlok1/cremad
License(s): ODC Attribution License (ODC-By)


In [None]:
!pip install tqdm
!pip install librosa
!pip install tensorflow
!pip install sklearn

Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


In [None]:
import os
import numpy as np
import librosa
import librosa.display
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (Input, Conv2D, MaxPooling2D, Flatten, LSTM, Bidirectional, Dense, Dropout,
                                     TimeDistributed, GlobalAveragePooling1D, LayerNormalization)
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import random
from tqdm import tqdm

# Directory (relative to the working directory) that is scanned for .wav files.
AUDIO_PATH = "AudioWAV/"

# Number of time frames every feature matrix is truncated or zero-padded to in
# extract_features, so that all clips produce arrays of the same shape.
MAX_FRAMES = 200  # Adjust based on dataset analysis

In [None]:
def extract_features(file_path, y=None, sr=16000):
    """Build a fixed-width stacked feature matrix for one audio clip.

    Parameters
    ----------
    file_path : str
        Path of the audio file. It is read (resampled to ``sr``) only when
        ``y`` is None.
    y : np.ndarray, optional
        Already-loaded waveform, e.g. an augmented signal. When given, the
        features are computed from it and the file is not read.
    sr : int, optional
        Sampling rate: the target rate when loading from ``file_path``, or the
        rate of ``y`` when a waveform is passed in. Defaults to 16000.

    Returns
    -------
    np.ndarray
        2-D array formed by vertically stacking MFCCs (40 rows), their first
        and second order deltas, a 40-band mel spectrogram and the spectral
        contrast. Every part is truncated or zero-padded to ``MAX_FRAMES``
        columns before stacking.
    """
    if y is None:
        y, sr = librosa.load(file_path, sr=sr)

    # Extract Features
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
    mfcc_delta = librosa.feature.delta(mfcc)
    mfcc_delta2 = librosa.feature.delta(mfcc, order=2)
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=40)
    contrast = librosa.feature.spectral_contrast(y=y, sr=sr)

    def pad_or_truncate(feature):
        """Force ``feature`` to exactly MAX_FRAMES columns (time axis)."""
        n_frames = feature.shape[1]
        if n_frames > MAX_FRAMES:
            return feature[:, :MAX_FRAMES]  # Truncate
        # Zero-pad on the right of the time axis only.
        return np.pad(feature, ((0, 0), (0, MAX_FRAMES - n_frames)), mode='constant')

    parts = (mfcc, mfcc_delta, mfcc_delta2, mel_spec, contrast)
    return np.vstack([pad_or_truncate(part) for part in parts])



In [None]:
#  Data Augmentation
def augment_audio(y, sr):
    """Randomly perturb a waveform and return the result.

    Three independent augmentations are each applied with probability 0.5, in
    this order: time stretch (rate drawn from [0.9, 1.1]), pitch shift (an
    integer number of steps from -2 to 2) and additive Gaussian noise scaled
    by 0.005. ``sr`` is only used by the pitch shift.
    """
    def _coin_flip():
        # One fresh draw per augmentation, taken right before it is decided.
        return random.random() < 0.5

    if _coin_flip():
        stretch_rate = random.uniform(0.9, 1.1)
        y = librosa.effects.time_stretch(y, rate=stretch_rate)

    if _coin_flip():
        semitones = random.randint(-2, 2)
        y = librosa.effects.pitch_shift(y, sr=sr, n_steps=semitones)

    if _coin_flip():
        noise = np.random.randn(len(y))
        y = y + 0.005 * noise

    return y


In [None]:
#  Label Mapping
# Emotion code (third '_'-separated field of the filename) -> emotion label.
_EMOTION_BY_CODE = {'NEU': 'neutral', 'HAP': 'happy', 'SAD': 'sad', 'ANG': 'angry',
                    'FEA': 'fear', 'DIS': 'disgust', 'SUR': 'surprise'}


def get_label(filename):
    """Return the emotion label encoded in an audio filename.

    The emotion code is the third underscore-separated field of ``filename``.
    Returns ``'unknown'`` when that code is not in the mapping, and also when
    the filename has fewer than three fields, so one stray file cannot abort
    a whole dataset scan with an IndexError.
    """
    fields = filename.split('_')
    if len(fields) < 3:
        return 'unknown'
    return _EMOTION_BY_CODE.get(fields[2], 'unknown')

In [None]:
#  Load Dataset
# Collect the .wav files up front. Sorting fixes the sample order (os.listdir
# returns entries in arbitrary order), which keeps the later seeded split
# reproducible, and lets the progress bar count audio files only.
wav_files = sorted(name for name in os.listdir(AUDIO_PATH) if name.endswith(".wav"))
if not wav_files:
    raise FileNotFoundError(f"No .wav files found in {AUDIO_PATH!r}")

features, labels = [], []
for file in tqdm(wav_files):
    file_path = os.path.join(AUDIO_PATH, file)
    # NOTE: this loop used to load each clip and call augment_audio() on it,
    # but the augmented waveform was never handed to extract_features, which
    # reads the file itself. The features therefore always came from the clean
    # audio and every clip was decoded twice. That dead work is removed here;
    # the resulting features are unchanged. Real augmentation should be applied
    # to the training portion only, after the train/test split.
    features.append(extract_features(file_path))
    labels.append(get_label(file))

#  Convert to NumPy Arrays
X = np.array(features)
y = np.array(labels)

100%|██████████| 7442/7442 [05:53<00:00, 21.05it/s]


In [None]:
# Encode Labels
# Encode the string labels as integers, then as one-hot rows.
encoder = LabelEncoder()
y_encoded = keras.utils.to_categorical(encoder.fit_transform(y))

# Add the trailing channel axis expected by Conv2D. The ndim guard makes this
# cell safe to re-run: without it each execution would append another axis.
if X.ndim == 3:
    X = X[..., np.newaxis]

In [None]:
# Train-Test Split
# Hold out a fixed fraction of the samples for testing; the seed makes the
# split repeatable.
TEST_FRACTION = 0.2
SPLIT_SEED = 42
split_parts = train_test_split(X, y_encoded, test_size=TEST_FRACTION, random_state=SPLIT_SEED)
X_train, X_test, y_train, y_test = split_parts

# Model input: one single-channel 2-D feature map per sample.
n_rows, n_cols = X.shape[1], X.shape[2]
input_layer = Input(shape=(n_rows, n_cols, 1))

In [None]:
# CNN Feature Extraction
# Two identical convolution stages that differ only in filter count:
# 3x3 same-padded ReLU convolution -> layer norm -> 2x2 max-pool -> dropout.
x = input_layer
for n_filters in (32, 64):
    x = Conv2D(n_filters, (3, 3), activation='relu', padding='same')(x)
    x = LayerNormalization()(x)
    x = MaxPooling2D((2, 2))(x)
    x = Dropout(0.3)(x)

In [None]:
# Flatten CNN Output and Apply LSTMs
# The CNN output is (batch, feature_bins, time_frames, channels), because each
# input sample is a (features, frames) matrix. TimeDistributed and the LSTM
# treat axis 1 as the sequence axis, so the time axis is moved there first.
# Without this the LSTM would step across feature bins instead of across time.
x = keras.layers.Permute((2, 1, 3))(x)

# Flatten (feature_bins, channels) into one vector per time step.
x = TimeDistributed(Flatten())(x)
x = Bidirectional(LSTM(128, return_sequences=True))(x)

# 🎯 Replace Attention with GlobalAveragePooling1D (More Stable):
# average the per-time-step LSTM outputs into a single vector per clip.
x = GlobalAveragePooling1D()(x)

In [None]:
# Fully Connected Layers
# Classification head: one hidden dense layer with dropout, then a softmax
# with one unit per class known to the label encoder.
n_classes = len(encoder.classes_)
x = Dense(256, activation='relu')(x)
x = Dropout(0.4)(x)
output_layer = Dense(n_classes, activation='softmax')(x)

model = Model(inputs=input_layer, outputs=output_layer)

# Adam with a reduced learning rate; categorical cross-entropy matches the
# one-hot encoded targets.
optimizer = Adam(learning_rate=0.0005)
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# 🎯 Train Model
# 🎯 Train Model
epochs = 50  # Upper bound; early stopping normally ends training sooner
batch_size = 32

# Stop once validation loss has not improved for a few epochs and roll back to
# the best weights, instead of always training the full 50 epochs.
early_stop = keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=5, restore_best_weights=True
)

# Validation data is carved out of the training set so the test set stays
# untouched until the final evaluation; monitoring or early-stopping on
# X_test would make the reported test accuracy optimistic.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.1,
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[early_stop],
)

# 🎯 Evaluate Model on the held-out test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f"🚀 Final Test Accuracy: {accuracy*100:.2f}%")

# Save Model (filename is kept because the download cell refers to it)
model.save("ser_model_real_world_v2.h5")


Epoch 1/50
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 135ms/step - accuracy: 0.2722 - loss: 1.6854 - val_accuracy: 0.4224 - val_loss: 1.4502
Epoch 2/50
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 126ms/step - accuracy: 0.4123 - loss: 1.4518 - val_accuracy: 0.4305 - val_loss: 1.4275
Epoch 3/50
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 128ms/step - accuracy: 0.4267 - loss: 1.4241 - val_accuracy: 0.4298 - val_loss: 1.4090
Epoch 4/50
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 121ms/step - accuracy: 0.4386 - loss: 1.3852 - val_accuracy: 0.4419 - val_loss: 1.3836
Epoch 5/50
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 127ms/step - accuracy: 0.4536 - loss: 1.3501 - val_accuracy: 0.4439 - val_loss: 1.3668
Epoch 6/50
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 128ms/step - accuracy: 0.4654 - loss: 1.3327 - val_accuracy: 0.4426 - val_loss: 1.3966
Epoch 7/50



🚀 Final Test Accuracy: 48.96%


In [None]:
from google.colab import files

# Send the saved model file from the Colab runtime to the local machine.
model_file = 'ser_model_real_world_v2.h5'
files.download(model_file)