In [1]:
import librosa
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, LSTM, Bidirectional, Conv2D, MaxPooling2D, Flatten, TimeDistributed, BatchNormalization, Reshape
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder
import random
import warnings
import sys
if not sys.warnoptions:
    warnings.simplefilter("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning) 

2024-06-11 08:44:46.099775: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-11 08:44:46.099893: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-11 08:44:46.228762: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
df = pd.read_csv('/kaggle/input/audio-dataset/Audios_Final')
df

Unnamed: 0,Emotion,file
0,surprised,/kaggle/input/speech-emotion-recognition-en/Ra...
1,neutral,/kaggle/input/speech-emotion-recognition-en/Ra...
2,disgust,/kaggle/input/speech-emotion-recognition-en/Ra...
3,disgust,/kaggle/input/speech-emotion-recognition-en/Ra...
4,neutral,/kaggle/input/speech-emotion-recognition-en/Ra...
...,...,...
12157,surprised,/kaggle/input/speech-emotion-recognition-en/Te...
12158,surprised,/kaggle/input/speech-emotion-recognition-en/Te...
12159,surprised,/kaggle/input/speech-emotion-recognition-en/Te...
12160,surprised,/kaggle/input/speech-emotion-recognition-en/Te...


In [3]:
def stretch(data, rate=1):
    return librosa.effects.time_stretch(data, rate=rate)


def extract_mfcc_and_deltas_with_augmentation(file_path, max_pad_len=130, stretch_factor=1.2):
    audio, sample_rate = librosa.load(file_path, sr=None, duration=2.5, offset=0.6)
    stretched_audio = stretch(audio, rate=stretch_factor)

    # Function to extract MFCCs and their deltas from an audio sample
    def extract_and_pad_features(audio):
        # Extract MFCCs and their deltas
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
        mfcc_delta = librosa.feature.delta(mfccs)
        mfcc_delta2 = librosa.feature.delta(mfccs, order=2)
        features = np.concatenate((mfccs, mfcc_delta, mfcc_delta2), axis=0)
        
        # Pad or truncate the features to ensure uniform size
        pad_width = max_pad_len - features.shape[1]
        if pad_width < 0:  # Truncate
            features = features[:, :max_pad_len]
        else:  # Pad with zeros
            features = np.pad(features, pad_width=((0, 0), (0, pad_width)), mode='constant')
        
        return features

    normal_features = extract_and_pad_features(audio)
    stretched_features = extract_and_pad_features(stretched_audio)

    return normal_features, stretched_features

In [4]:
from tqdm import tqdm

feature_list = []
y = []

for _, row in tqdm(df.iterrows(), total=df.shape[0]):
    path = row['file']
    emotion = row['Emotion']
    
    normal_features, stretched_features = extract_mfcc_and_deltas_with_augmentation(path)
    
    # Append both sets of features to your feature list
    feature_list.append(normal_features)
    feature_list.append(stretched_features)
    
    # Append the emotion label twice, once for each set of features
    y.extend([emotion] * 2)

100%|██████████| 12162/12162 [14:16<00:00, 14.20it/s]


In [5]:
# Assuming `feature_list` is a list of arrays, you might need to adjust shapes or concatenate along the correct axis
X = np.array(feature_list)
y = np.array(y)

In [6]:
np.save('new_features.npy', X)
np.save('new_labels.npy', y)

# Concatenating MFCCs

In [7]:
import numpy as np

# MFCC Features
X = np.load('/kaggle/input/final-audio-data/new_features.npy')
y = np.load('/kaggle/input/final-audio-data/new_labels.npy')

In [8]:
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Encode class values as integers
encoder = LabelEncoder()
encoder.fit(y)
encoded_y = encoder.transform(y)

# Convert integers to dummy (one-hot encoded) variables
dummy_y = to_categorical(encoded_y)

In [9]:
import numpy as np

# MFCC Features
# data = np.load('/kaggle/input/mfcc-all-audios/features.npy')
# labels = np.load('/kaggle/input/mfcc-all-audios/labels.npy')
data = X
labels = y

In [10]:
from sklearn.model_selection import train_test_split

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, dummy_y, test_size=0.2, random_state=42)

In [11]:
# Extracting only Original
X_test = X_test[::2]  
y_test = y_test[::2]  

In [12]:
from sklearn.preprocessing import StandardScaler
import numpy as np

# Initialize the StandardScaler
scaler = StandardScaler()

# Reshape the data to 2D for scaling (combining the first and third dimensions)
num_samples, num_features, num_steps = X_train.shape
X_train_reshaped = X_train.reshape(-1, num_features*num_steps)
X_test_reshaped = X_test.reshape(-1, num_features*num_steps)

# Fit on the training data and transform both training and test data.
scaler.fit(X_train_reshaped)
X_train_scaled = scaler.transform(X_train_reshaped)
X_test_scaled = scaler.transform(X_test_reshaped)

# Reshape the scaled data back to its original 3D shape
X_train_scaled = X_train_scaled.reshape(num_samples, num_features, num_steps)
X_test_scaled = X_test_scaled.reshape(X_test.shape[0], num_features, num_steps)

In [13]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization, Reshape
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization, LSTM, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Layer
import tensorflow.keras.backend as K
import tensorflow as tf

input_shape = (120, 130)

model = Sequential([
    Conv1D(filters=256, kernel_size=3, activation='relu', input_shape=input_shape, padding='same'),
    BatchNormalization(),
    MaxPooling1D(pool_size=2),
    Dropout(0.25),
    
    Conv1D(filters=256, kernel_size=3, activation='relu', padding='same'),
    BatchNormalization(),
    MaxPooling1D(pool_size=2),
    Dropout(0.25),
    
    Conv1D(filters=256, kernel_size=3, activation='relu', padding='same'),
    BatchNormalization(),
    MaxPooling1D(pool_size=2),
    Dropout(0.25),
    
#     Conv1D(filters=256, kernel_size=3, activation='relu', padding='same'),
#     BatchNormalization(),
#     MaxPooling1D(pool_size=2),
#     Dropout(0.2),
    
    Bidirectional(LSTM(2048, return_sequences=True)),
    Bidirectional(LSTM(1024, return_sequences=True)),
    Bidirectional(LSTM(512)),
    
    # Dense layers
    Dense(128, activation='relu'),
    Dense(8, activation='softmax')  # Assuming 8 classes for emotion classification
])

# Compilation of the model
model.compile(optimizer=Adam(),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Summary of the model
model.summary()

In [14]:
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

early_stopping = EarlyStopping(
    monitor='val_accuracy',
    min_delta=0.00005,
    patience=11,
    verbose=1,
    restore_best_weights=True,
)

lr_scheduler = ReduceLROnPlateau(
    monitor='val_accuracy',
    factor=0.5,
    patience=7,
    min_lr=1e-7,
    verbose=1,
)

callbacks = [
    early_stopping,
    lr_scheduler,
]

In [15]:
history = model.fit(X_train_scaled, y_train, validation_data=(X_test_scaled, y_test), epochs=95, batch_size=64, callbacks=callbacks)

Epoch 1/95
[1m305/305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 240ms/step - accuracy: 0.2020 - loss: 1.9153 - val_accuracy: 0.3872 - val_loss: 1.5093 - learning_rate: 0.0010
Epoch 2/95
[1m305/305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 250ms/step - accuracy: 0.4009 - loss: 1.4692 - val_accuracy: 0.4147 - val_loss: 1.4774 - learning_rate: 0.0010
Epoch 3/95
[1m305/305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 249ms/step - accuracy: 0.4730 - loss: 1.3250 - val_accuracy: 0.5171 - val_loss: 1.1940 - learning_rate: 0.0010
Epoch 4/95
[1m305/305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 248ms/step - accuracy: 0.5315 - loss: 1.1771 - val_accuracy: 0.5442 - val_loss: 1.1423 - learning_rate: 0.0010
Epoch 5/95
[1m305/305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 247ms/step - accuracy: 0.5632 - loss: 1.1105 - val_accuracy: 0.5393 - val_loss: 1.1700 - learning_rate: 0.0010
Epoch 6/95
[1m305/305[0m [32m━━━━━━━━━━━━━━━━━━━━[0

# Testing the Model

In [17]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test_scaled, y_test, verbose=0)
print(f'Test Accuracy: {accuracy*100:.2f}%')

Test Accuracy: 84.67%


In [18]:
import numpy as np

# Assuming a binary classification model; adjust accordingly for multi-class
predictions = model.predict(X_test_scaled)
y_pred = np.argmax(predictions, axis=1)

[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 60ms/step


In [19]:
label_mapping = {
    "angry": 0,
    "calm": 1,
    "disgust": 2,
    "fear": 3,
    "happy": 4,
    "neutral": 5,
    "sad": 6,
    "surprised": 7
}

sorted_labels = sorted(label_mapping, key=label_mapping.get)

In [20]:
from sklearn.metrics import classification_report, confusion_matrix

# Assuming y_pred is already calculated and y_test is properly formatted
y_testing = np.argmax(y_test, axis=1)

# Generate classification report using the sorted_labels for target_names
print(classification_report(y_testing, y_pred, target_names=sorted_labels))

              precision    recall  f1-score   support

       angry       0.91      0.87      0.89       387
        calm       0.61      0.85      0.71        41
     disgust       0.84      0.84      0.84       419
        fear       0.83      0.83      0.83       377
       happy       0.85      0.83      0.84       380
     neutral       0.85      0.88      0.86       307
         sad       0.81      0.83      0.82       413
   surprised       0.94      0.90      0.92       109

    accuracy                           0.85      2433
   macro avg       0.83      0.85      0.84      2433
weighted avg       0.85      0.85      0.85      2433



In [21]:
import plotly.figure_factory as ff

# Define the axis labels based on the number of classes
class_labels = ['angry', 'calm', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprised']

cm = confusion_matrix(y_testing, y_pred, labels=range(len(class_labels)))
# Create a confusion matrix heatmap, note the reversal of cm and y-axis labels
fig = ff.create_annotated_heatmap(z=cm, x=class_labels, y=class_labels, colorscale='Viridis')

# Add labels and title
fig.update_layout(title_text='<b>Confusion Matrix for Multi-Class Classification</b>',
                  xaxis=dict(title='Predicted label'),
                  yaxis=dict(title='True label', autorange="reversed"))  # Ensure y-axis starts from the top

# Add color scale
fig['data'][0]['showscale'] = True
fig.show()

In [22]:
# model.save('Research-Main-Audio-Model.h5')