In [None]:
from pathlib import Path
import os
import pandas as pd

# Dataset root. The SPEECH_DATA_DIR environment variable, when set, overrides
# the original hard-coded location so the notebook can run on other machines.
dataset_dir = Path(os.environ.get(
    'SPEECH_DATA_DIR',
    'C:\\Users\\arnab\\Desktop\\JUPYTER NOTEBOOK\\INTERNSHIP\\ZIDIO\\Speech_Data',
))
# Sort the recursive glob so the file order does not depend on the filesystem.
audio_paths = sorted(dataset_dir.glob('**/*.wav'))
# Each clip is labelled with the name of the directory that contains it.
emotion_labels = [path.parent.name for path in audio_paths]
# Shuffle with a fixed seed: later cells select clips by row number, so the
# row order has to be identical on every Restart & Run All.
audio_df = (
    pd.DataFrame({'audio_file': audio_paths, 'emotion': emotion_labels})
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Quick sanity check: the first discovered path and the first two labels.
print(audio_paths[0:1])
print(emotion_labels[0:2])

In [None]:
import numpy as np
import librosa

***Audio Processing Functions with Some Additional Tunings***

In [None]:
def load_audio(path, duration=2.5, offset=0.6):
    """Load a window of an audio file with ``librosa.load``.

    Parameters
    ----------
    path : str or path-like
        Audio file to read.
    duration : float, optional
        Forwarded to ``librosa.load`` as ``duration``. Defaults to 2.5, the
        value that used to be hard-coded here.
    offset : float, optional
        Forwarded to ``librosa.load`` as ``offset``. Defaults to 0.6, the
        value that used to be hard-coded here.

    Returns
    -------
    tuple
        ``(data, sample_rate)`` as returned by ``librosa.load``, or
        ``(None, None)`` when loading raised an exception.
    """
    try:
        data, sample_rate = librosa.load(path, duration=duration, offset=offset)
        return data, sample_rate
    except Exception as e:
        # Best effort: report the unreadable file and let the caller skip it.
        print(f"Error loading file {path}: {e}")
        return None, None

In [None]:
def add_noise(data, rng=None):
    """Return ``data`` plus Gaussian noise scaled to the signal's peak.

    Parameters
    ----------
    data : array-like
        Signal samples; the noise vector has ``data.shape[0]`` entries.
    rng : optional
        Object providing ``uniform()`` and ``normal(size=...)``, for example
        ``numpy.random.default_rng(seed)``. When omitted, the global
        ``numpy.random`` state is used, as before.

    Returns
    -------
    numpy.ndarray
        The noisy signal. An empty input yields an empty copy.
    """
    data = np.asarray(data)
    if data.size == 0:
        # np.max raises ValueError on an empty array; there is nothing to perturb.
        return data.copy()
    rng = np.random if rng is None else rng
    # Scale by peak magnitude, not the signed maximum, so a signal whose
    # largest sample is zero or negative still receives noise.
    peak = np.max(np.abs(data))
    noise_value = 0.015 * rng.uniform() * peak
    return data + noise_value * rng.normal(size=data.shape[0])

In [None]:
def stretch_audio(data, rate=0.9):
    """Time-stretch ``data`` via ``librosa.effects.time_stretch``.

    ``rate`` is forwarded unchanged as librosa's ``rate`` argument
    (default 0.9). Returns whatever librosa returns.
    """
    stretched = librosa.effects.time_stretch(data, rate=rate)
    return stretched

In [None]:
def change_pitch(data, sampling_rate, pitch_factor=0.7):
    """Pitch-shift ``data`` via ``librosa.effects.pitch_shift``.

    ``sampling_rate`` is forwarded as ``sr`` and ``pitch_factor`` as
    ``n_steps`` (default 0.7). Returns whatever librosa returns.
    """
    shifted = librosa.effects.pitch_shift(
        data,
        sr=sampling_rate,
        n_steps=pitch_factor,
    )
    return shifted

***Feature Extraction***

In [None]:
def extract_features(data, sample_rate):
    """Build a flat feature vector from the MFCCs of ``data``.

    Computes ``librosa.feature.mfcc`` with ``n_fft=512``, transposes the
    result and averages over axis 0, giving one mean value per MFCC
    coefficient. The vector is returned through ``np.hstack``.
    """
    mfcc = librosa.feature.mfcc(y=data, sr=sample_rate, n_fft=512)
    mean_mfcc = np.mean(mfcc.T, axis=0)
    return np.hstack([mean_mfcc])

In [None]:
def export_features(path):
    """Return feature vectors for a clip and two augmented variants.

    The clip is loaded with ``load_audio``. If loading failed, an empty
    array is returned. Otherwise the result stacks three rows, in order:
    features of the original signal, of the noisy signal (``add_noise``),
    and of the stretched-then-pitch-shifted signal.
    """
    data, sample_rate = load_audio(path)
    if data is None:
        return np.array([])

    # Applied lazily, one at a time, so each variant is built right before
    # its features are extracted.
    augmentations = (
        lambda clip: clip,
        add_noise,
        lambda clip: change_pitch(stretch_audio(clip), sample_rate),
    )
    rows = [extract_features(augment(data), sample_rate) for augment in augmentations]
    return np.array(rows)

In [None]:
# Collect one feature row per (clip, augmentation) with the clip's emotion label.
# NOTE: X_train / y_train are rebound later by the train/test split cell.
X_train, y_train = [], []
for path, emotion in zip(audio_df['audio_file'], audio_df['emotion']):
    features = export_features(path)
    if features.size == 0:
        # Clip could not be loaded; skip it.
        continue
    X_train.extend(features)
    y_train.extend([emotion] * len(features))

In [None]:
# One row per feature vector; the label goes into a trailing 'EMOTIONS' column.
feature_dataframe = pd.DataFrame(X_train).assign(EMOTIONS=y_train)

In [None]:
# Show the first five feature rows.
print(feature_dataframe.head(5))

In [None]:
# Number of feature rows per emotion label.
emotion_counts = feature_dataframe['EMOTIONS'].value_counts()
print(emotion_counts)

***Checking Some Audio Samples With Above Tunings & Plotting Them***

In [None]:
from IPython.display import Audio
from scipy.io.wavfile import read

In [None]:
# Play the raw clip stored at row 2342 and print its emotion label.
rate, speech = read(audio_df.at[2342, 'audio_file'])
print(audio_df.at[2342, 'emotion'])
Audio(speech, rate=rate, autoplay=False)

In [None]:
# Play the raw clip stored at row 20 and print its emotion label.
rate, speech = read(audio_df.at[20, 'audio_file'])
print(audio_df.at[20, 'emotion'])
Audio(speech, rate=rate, autoplay=False)

In [None]:
import matplotlib.pyplot as plt
import librosa.display

In [None]:
# Waveform of the clip at row 120, followed by an inline player.
figure = plt.figure(figsize=(13, 6))
audio_speech, rate = librosa.load(audio_df.at[120, 'audio_file'])
print(audio_df.at[120, 'emotion'])
librosa.display.waveshow(audio_speech, sr=rate, color='orange')
Audio(audio_speech, rate=rate)

In [None]:
# Waveform of the clip at row 10, followed by an inline player.
figure = plt.figure(figsize=(13, 6))
audio_speech, rate = librosa.load(audio_df.at[10, 'audio_file'])
print(audio_df.at[10, 'emotion'])
librosa.display.waveshow(audio_speech, sr=rate, color='green')
Audio(audio_speech, rate=rate)

In [None]:
# Waveform and player for the clip at row 2000 after add_noise().
figure = plt.figure(figsize=(13, 6))
plt.title("Tune 1: Voice With Some Noise")
audio_speech, sample_rate = librosa.load(audio_df.at[2000, 'audio_file'])
print(audio_df.at[2000, 'emotion'])
noise_injection = add_noise(audio_speech)
librosa.display.waveshow(noise_injection, sr=sample_rate)
Audio(noise_injection, rate=sample_rate)

In [None]:
# Waveform and player for the clip at row 2000 after stretch_audio().
figure = plt.figure(figsize=(13, 6))
# Title spelling corrected ("Streched" -> "Stretched").
plt.title("Tune 2: Stretched Voice")
audio_speech, sample_rate = librosa.load(audio_df['audio_file'][2000])
print(audio_df['emotion'][2000])
stretching_audio = stretch_audio(audio_speech)
librosa.display.waveshow(stretching_audio, sr=sample_rate, color='red')
Audio(stretching_audio, rate=sample_rate)

***Spectrogram of an Audio Sample***

In [None]:
# STFT magnitude of the clip at row 2000, converted to dB and drawn as a spectrogram.
figure = plt.figure(figsize=(13, 6))
audio_speech, rate = librosa.load(audio_df.at[2000, 'audio_file'])
stft_audio = librosa.stft(audio_speech)
Db_audio = librosa.amplitude_to_db(np.abs(stft_audio))
librosa.display.specshow(
    Db_audio,
    sr=rate,
    x_axis='time',
    y_axis='hz',
)
plt.title('Spectrogram')
plt.colorbar()
plt.show()

***Visualization of MFCCs of an Audio Sample***

In [None]:
# 13 MFCCs for the clip at row 2000 (default librosa.load settings).
audio_file, sr = librosa.load(audio_df.at[2000, 'audio_file'])
mfccs = librosa.feature.mfcc(
    y=audio_file,
    sr=sr,
    n_mfcc=13,
)

In [None]:
# Heat map of the MFCC matrix computed in the previous cell.
plt.figure(figsize=(13, 6))
librosa.display.specshow(
    mfccs,
    sr=sr,
    x_axis='time',
    cmap='cool',
)
plt.title('Mel-Frequency Cepstral Coefficients (MFCCs)')
plt.xlabel('Time (s)')
plt.ylabel('MFCC Coefficients')
plt.colorbar(format='%+2.0f dB')
plt.show()

***Preprocessing***

In [None]:
from sklearn.preprocessing import OneHotEncoder

encoder_label = OneHotEncoder()

# Every column except the trailing 'EMOTIONS' column is a feature.
X = feature_dataframe.iloc[:, :-1].to_numpy()
# Single-column 2-D array, as OneHotEncoder expects.
emotions_array = feature_dataframe[['EMOTIONS']].to_numpy()
Y = encoder_label.fit_transform(emotions_array).toarray()

for encoded in (X, Y):
    print(encoded.shape)


In [None]:
from sklearn.model_selection import GroupShuffleSplit, train_test_split

# export_features() emits exactly three consecutive rows per clip (original,
# noisy, stretched + pitch-shifted). A row-level split would put augmented
# copies of the same recording in both train and test, leaking test
# information into training. Split by clip instead.
ROWS_PER_CLIP = 3
assert len(X) % ROWS_PER_CLIP == 0, "expected three feature rows per clip"
clip_ids = np.arange(len(X)) // ROWS_PER_CLIP

group_splitter = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
train_idx, test_idx = next(group_splitter.split(X, Y, groups=clip_ids))

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = Y[train_idx], Y[test_idx]

In [None]:
# Shapes of the four split arrays, in train-then-test order.
for split_array in (X_train, y_train, X_test, y_test):
    print(split_array.shape)

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply it to both splits.
scaler_data = StandardScaler().fit(X_train)
X_train, X_test = scaler_data.transform(X_train), scaler_data.transform(X_test)

***Model Creation (LSTM)***

In [None]:
# Reshape to (samples, timesteps, features) for the LSTM.
# Reading the feature count from the last axis and reshaping (instead of
# expand_dims) keeps this cell idempotent: re-running it on already 3-D arrays
# leaves them unchanged rather than adding another axis.
timesteps = 1
num_features = X_train.shape[-1]
X_train = X_train.reshape(-1, timesteps, num_features)
X_test = X_test.reshape(-1, timesteps, num_features)

In [None]:
# Shapes after adding the timestep axis.
for lstm_input in (X_train, X_test):
    print(lstm_input.shape)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam

In [None]:
def create_lstm_model(input_shape, num_classes):
    """Build and compile the two-layer LSTM classifier.

    Layers, in order: Input(input_shape), LSTM(128, return_sequences=True),
    Dropout(0.2), LSTM(64), Dropout(0.3), Dense(num_classes, softmax).
    Compiled with Adam (learning rate 0.001), categorical cross-entropy
    loss and the accuracy metric. Returns the compiled model.
    """
    layer_stack = (
        Input(shape=input_shape),
        LSTM(128, return_sequences=True),
        Dropout(0.2),
        LSTM(64),
        Dropout(0.3),
        Dense(num_classes, activation='softmax'),
    )
    model = Sequential()
    for layer in layer_stack:
        model.add(layer)

    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [None]:
# One output unit per one-hot column of Y.
lstm_model = create_lstm_model(
    input_shape=(timesteps, num_features),
    num_classes=Y.shape[1],
)
lstm_model.summary()

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Pick the best checkpoint on a validation split carved out of the training
# data. Validating on (X_test, y_test) would select the model on the same
# data that is later reported as the test score, biasing that score upwards.
callbacks = [
    ModelCheckpoint(
        'speech-emotion-recognition.keras',
        monitor='val_loss',
        verbose=1,
        save_best_only=True,
    )
]
history = lstm_model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=50,
    callbacks=callbacks,
    validation_split=0.2,  # Keras holds out the last 20% of the training rows
)

***Visualization of Accuracy***

In [None]:
# Per-epoch training vs. validation accuracy.
fig, ax = plt.subplots(figsize=(15, 5))
accuracy_curves = (
    ('accuracy', 'Train Accuracy', 'red'),
    ('val_accuracy', 'Validation Accuracy', 'blue'),
)
for history_key, curve_label, curve_color in accuracy_curves:
    ax.plot(history.history[history_key], label=curve_label, color=curve_color, marker='o', linestyle='-')
ax.set_title('Training and Validation Accuracy')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.legend()

In [None]:
# Per-epoch training vs. validation loss.
fig, ax = plt.subplots(figsize=(15, 5))
loss_curves = (
    ('loss', 'Train Loss', 'red'),
    ('val_loss', 'Validation Loss', 'blue'),
)
for history_key, curve_label, curve_color in loss_curves:
    ax.plot(history.history[history_key], label=curve_label, color=curve_color, marker='o', linestyle='-')
ax.set_title('Training and Validation Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend()

***Test Output***

In [None]:
from tensorflow.keras.models import load_model

# Reload the checkpoint written during training and score it on the test split.
model = load_model('speech-emotion-recognition.keras')
results = model.evaluate(X_test, y_test)
test_loss, test_accuracy = results[0], results[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

In [None]:
# Class scores for the test split, then both the predictions and the true
# one-hot labels mapped back to emotion names with the fitted encoder.
test_prediction = model.predict(X_test)
y_pred, y_test_inv = (
    encoder_label.inverse_transform(encoded)
    for encoded in (test_prediction, y_test)
)

In [None]:
# First ten predicted labels.
print(y_pred[:10])

In [None]:
# First ten true labels.
print(y_test_inv[:10])

***Confusion Matrix***

In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix

# One-hot column i corresponds to encoder_label.categories_[0][i], so the
# argmax indices map directly onto these names.
class_names = encoder_label.categories_[0]

# Passing `labels` keeps the matrix square over all classes even if one of
# them is absent from the test split, so the tick labels always line up.
conf_matrix = confusion_matrix(
    np.argmax(y_test, axis=1),
    np.argmax(test_prediction, axis=1),
    labels=np.arange(len(class_names)),
)
plt.figure(figsize=(13, 6))
sns.heatmap(
    conf_matrix,
    linecolor='white',
    cmap='Blues',
    annot=True,
    fmt='d',
    xticklabels=class_names,  # emotion names instead of bare integer indices
    yticklabels=class_names,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('Actual Labels')
plt.show()

***Classification Report***

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the decoded test labels.
report_text = classification_report(y_test_inv, y_pred)
print(report_text)