In [1]:
%pip install librosa
%pip install noisereduce
%pip install noisereduce sounddevice
%pip install audiomentations
%pip install noisereduce

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting audiomentations
  Downloading audiomentations-0.40.0-py3-none-any.whl.metadata (11 kB)
Collecting numpy-minmax<1,>=0.3.0 (from audiomentations)
  Downloading numpy_minmax-0.4.0-cp312-cp312-win_amd64.whl.metadata (4.3 kB)
Collecting numpy-rms<1,>=0.4.2 (from audiomentations)
  Downloading numpy_rms-0.5.0-cp312-cp312-win_amd64.whl.metadata (3.6 kB)
Collecting librosa!=0.10.0,<0.11.0,>=0.8.0 (from audiomentations)
  Downloading librosa-0.10.2.post1-py3-none-any.whl.metadata (8.6 kB)
Collecting python-stretch<1,>=0.3.1 (from audiomentations)
  Downloading python_stretch-0.3.1-cp312-abi3-win_amd64.whl.metadata (3.7 kB)
INFO: pip is looking at multiple versions of numpy-minmax to determine which version is compatible with other requirements. This could take a while.
Collecting numpy-min

In [1]:
import zipfile
import os
import librosa
import soundfile as sf
import noisereduce as nr
from tqdm import tqdm
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models, regularizers
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
from audiomentations import Compose, AddGaussianNoise, PitchShift, TimeStretch
import random
import tempfile
import numpy as np
import tensorflow as tf
import random

ModuleNotFoundError: No module named 'audiomentations'

In [3]:
# Function for unzipping the dataset's zip archive.

def extract_zip_dataset(zip_path, extract_path):
    """Unpack the archive at ``zip_path`` into ``extract_path``.

    The destination directory is created first when it does not exist yet.

    Args:
        zip_path: Path of the zip archive to read.
        extract_path: Directory that receives the archive contents.

    Returns:
        ``extract_path``, unchanged, so the call can be used inline.
    """
    destination_missing = not os.path.exists(extract_path)
    if destination_missing:
        os.makedirs(extract_path)

    # Open read-only and unpack every member under the destination.
    archive = zipfile.ZipFile(zip_path, 'r')
    with archive:
        archive.extractall(extract_path)

    print(f"Dataset extracted to {extract_path}")
    return extract_path



In [5]:
# Unzip the zipped Echo-Heist dataset.
# zip_path is a relative path, so it is resolved against the current working directory.
zip_path = "Audio_Sentiment.zip"
extract_path = "/content/unzipped"  # absolute destination directory for the extracted files
base_folder = extract_zip_dataset(zip_path, extract_path)  # extract_zip_dataset returns extract_path

Dataset extracted to /content/unzipped


In [None]:

# Remove the background noise from each audio file and save the result in WAV format.

def reduce_noise_and_convert(file_path, output_path):
    """Denoise one audio file and write the result as a WAV file.

    The audio is loaded at its native sampling rate, passed through
    ``noisereduce``, peak-normalized with ``librosa.util.normalize`` and
    written to ``output_path`` with ``soundfile``.

    Args:
        file_path: Path of the audio file to read.
        output_path: Path of the WAV file to write. Missing parent
            directories are created.

    Returns:
        True when the file was written, False when any step failed. The
        error is printed so that one unreadable file does not abort a
        whole batch and the caller can count real successes.
    """
    try:
        # sr=None keeps the file's own sampling rate instead of resampling.
        y, sr = librosa.load(file_path, sr=None)

        # Denoise with the noisereduce library, then normalize the result.
        y_denoised = nr.reduce_noise(y=y, sr=sr)
        y_denoised = librosa.util.normalize(y_denoised)

        # os.makedirs('') raises, so only create a directory when the output
        # path actually contains one.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        sf.write(output_path, y_denoised, sr)
        return True
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return False


# Finding each file & Denoising them.
def find_and_process_files(input_dir, output_dir, extensions=None):
    """Denoise every audio file under ``input_dir`` into ``output_dir``.

    The directory layout below ``input_dir`` is mirrored below
    ``output_dir`` and every output file gets a ``.wav`` extension.

    Args:
        input_dir: Root directory that is searched recursively.
        output_dir: Root directory for the denoised WAV files.
        extensions: File extensions to process (matched case-insensitively).
            Defaults to ``['.wav', '.amr', '.mpeg']``.
    """
    if extensions is None:
        extensions = ['.wav', '.amr', '.mpeg']
    # str.endswith accepts a tuple. File names are lower-cased before the
    # comparison, so the extensions must be lower-cased as well.
    suffixes = tuple(ext.lower() for ext in extensions)

    # Gather all matching audio files in a stable (sorted) order.
    audio_files = sorted(
        os.path.join(root, file)
        for root, _, files in os.walk(input_dir)
        for file in files
        if file.lower().endswith(suffixes)
    )

    # Nothing has been processed yet at this point, so report what was found.
    print(f"Found {len(audio_files)} audio files to process.")

    count = 0

    for file_path in tqdm(audio_files, desc="Processing"):
        # Keep the path relative to input_dir and switch the extension to .wav.
        relative_path = os.path.relpath(file_path, input_dir)
        relative_path = os.path.splitext(relative_path)[0] + ".wav"
        output_path = os.path.join(output_dir, relative_path)

        # A single unreadable file must not abort the rest of the batch.
        try:
            succeeded = reduce_noise_and_convert(file_path, output_path)
        except Exception as e:
            print(f"Skipping {file_path}: {e}")
            succeeded = False

        if succeeded:
            count += 1

    print(f"\n  We have successfully processed {count}/{len(audio_files)} files.")


if __name__ == "__main__":
    input_folder = "/content/unzipped"
    # Absolute path: main() later loads the dataset from
    # "/content/denoised_final_folder", so a relative folder name would only
    # line up with it when the working directory happens to be /content.
    output_folder = "/content/denoised_final_folder"
    find_and_process_files(input_folder, output_folder)


In [None]:
# Seed every random number generator in use (Python, NumPy, TensorFlow) for reproducibility
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

# Emotion name -> integer class index
emotion_labels = dict(sad=0, surprised=1, joyfully=2, euphoric=3)

In [None]:
# Function for extracting features from an audio signal.

def extract_features(y, sr, max_pad_len=150):
    """Build a fixed-width feature matrix (MFCC + mel spectrogram) for one signal.

    The signal is peak-normalized and trimmed (``top_db=20``). Twenty MFCCs
    are standardized per coefficient, a 40-band mel spectrogram in dB is
    standardized over the whole matrix, and the two are stacked row-wise.
    The frame axis is then center-cropped or right-padded with zeros to
    exactly ``max_pad_len`` columns.

    Args:
        y: Audio samples.
        sr: Sampling rate of ``y``.
        max_pad_len: Number of frames (columns) in the returned matrix.

    Returns:
        Array of shape ``(60, max_pad_len)``, or None when extraction fails
        (the error is printed).
    """
    try:
        y = librosa.util.normalize(y)
        y_trimmed, _ = librosa.effects.trim(y, top_db=20)

        mfccs = librosa.feature.mfcc(y=y_trimmed, sr=sr, n_mfcc=20)
        mfcc_std = np.std(mfccs, axis=1, keepdims=True)
        # A coefficient that is constant over time (e.g. a one-frame clip) has
        # zero std. Divide by 1 there so the row becomes zeros instead of NaN.
        mfcc_std[mfcc_std == 0] = 1.0
        mfccs_normalized = (mfccs - np.mean(mfccs, axis=1, keepdims=True)) / mfcc_std

        mel_spec = librosa.feature.melspectrogram(y=y_trimmed, sr=sr, n_mels=40, fmax=8000)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
        mel_std = np.std(mel_spec_db)
        if mel_std == 0:
            # Same guard for a completely flat spectrogram.
            mel_std = 1.0
        mel_spec_normalized = (mel_spec_db - np.mean(mel_spec_db)) / mel_std

        features = np.vstack([mfccs_normalized, mel_spec_normalized])

        if features.shape[1] > max_pad_len:
            # Too long: keep the central max_pad_len frames.
            start = (features.shape[1] - max_pad_len) // 2
            features = features[:, start:start + max_pad_len]
        else:
            # Too short: zero-pad on the right.
            pad_width = max_pad_len - features.shape[1]
            features = np.pad(features, pad_width=((0, 0), (0, pad_width)), mode='constant', constant_values=0)

        return features

    except Exception as e:
        print(f"Error extracting features: {e}")

        return None

In [None]:
# Function for loading an audio file and resampling it when its sampling rate does not match the target.

def load_audio(file_path, target_sr=22050):
    """Load an audio file and bring it to ``target_sr``.

    Args:
        file_path: Path of the audio file to read.
        target_sr: Sampling rate the returned samples should have.

    Returns:
        ``(samples, target_sr)`` on success. ``(None, None)`` when the file
        is shorter than 0.5 seconds at its native rate or cannot be loaded.
        A two-element tuple is always returned so callers can safely write
        ``y, sr = load_audio(...)``.
    """
    try:
        # sr=None loads at the native rate so the duration check below is exact.
        y, sr = librosa.load(file_path, sr=None)
        if len(y) < sr * 0.5:
            print(f"Warning: Audio file too short: {file_path}")
            return None, None
        if sr != target_sr:
            y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
        return y, target_sr
    except Exception as e:
        print(f"Error loading audio file {file_path}: {e}")
        # Returning a bare None here would make every unpacking caller raise
        # "cannot unpack non-iterable NoneType".
        return None, None

In [None]:
# Loading the Dataset from base_folder & Extracting the features

def load_dataset(base_folder, target_sr=22050):
    """Walk ``base_folder`` and build feature and label arrays for training.

    A file is used when its extension is ``.wav``, ``.amr`` or ``.mpeg`` and
    one of the ``emotion_labels`` keys occurs in its lower-cased file name or
    in the lower-cased name of its directory. The first matching key (in
    ``emotion_labels`` order) wins.

    Args:
        base_folder: Root directory that is searched recursively.
        target_sr: Sampling rate passed to ``load_audio``.

    Returns:
        ``(features, labels, file_paths)`` where ``features`` and ``labels``
        are NumPy arrays and ``file_paths`` is the list of files that
        produced them, all in the same order.
    """
    data, labels, file_paths = [], [], []
    emotion_files = {emotion: 0 for emotion in emotion_labels.keys()}
    total_files = 0
    valid_files = 0

    print(f"Loading dataset from {base_folder}.")

    for root, dirs, files in os.walk(base_folder):
        # os.walk yields entries in filesystem order. Sorting makes the sample
        # order, and with it the seeded train/val/test split, reproducible.
        dirs.sort()
        for file in sorted(files):
            if not file.lower().endswith(('.wav', '.amr', '.mpeg')):
                continue
            total_files += 1
            file_path = os.path.join(root, file)

            # Look for an emotion name in the file name or its folder name.
            file_emotion = None
            for emotion in emotion_labels.keys():
                if emotion in file.lower() or emotion in os.path.basename(root).lower():
                    file_emotion = emotion
                    break
            if not file_emotion:
                continue

            y, sr = load_audio(file_path, target_sr)
            if y is None:
                continue
            features = extract_features(y, sr)
            if features is None:
                continue

            data.append(features)
            labels.append(emotion_labels[file_emotion])
            file_paths.append(file_path)
            emotion_files[file_emotion] += 1
            valid_files += 1

    print(f"Processed {total_files} files, {valid_files} valid files used.")
    for emotion, count in emotion_files.items():
        print(f"  - {emotion}: {count} files")

    return np.array(data), np.array(labels), file_paths

In [None]:
# Data augmentation, because the dataset is very small.
def create_augmentation_pipeline():
    """Return the audiomentations ``Compose`` used to synthesize extra samples.

    The pipeline chains Gaussian noise, time stretching and pitch shifting,
    in that order, each configured with ``p=0.5``.
    """
    gaussian_noise = AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.01, p=0.5)
    time_stretch = TimeStretch(min_rate=0.9, max_rate=1.1, p=0.5)
    pitch_shift = PitchShift(min_semitones=-2, max_semitones=2, p=0.5)
    return Compose([gaussian_noise, time_stretch, pitch_shift])

def generate_augmented_data(file_paths, labels, target_count=100):
    """Create augmented feature matrices until each class has ``target_count`` samples.

    For every class with fewer than ``target_count`` original samples, source
    files of that class are picked at random, reloaded with ``load_audio``,
    passed through the augmentation pipeline and converted to features with
    ``extract_features``. Files that fail to load or to yield features are
    skipped, so a class can end up slightly below ``target_count``.

    Args:
        file_paths: Paths of the original audio files.
        labels: Integer class label of each entry in ``file_paths``.
        target_count: Desired number of samples per class.

    Returns:
        ``(augmented_features, augmented_labels)`` as NumPy arrays. Both are
        empty when no class needed augmentation.
    """
    print("Start generating the augmented data...")

    # Group the source files by class label.
    emotion_files = {}
    for path, label in zip(file_paths, labels):
        emotion_files.setdefault(label, []).append(path)

    class_counts = np.bincount(labels)
    print("Original class distribution:", class_counts)

    augmentation = create_augmentation_pipeline()
    augmented_features, augmented_labels = [], []

    for label, files in emotion_files.items():
        current_count = class_counts[label]
        if current_count < target_count:
            # The original read "current_countc" here, which raised NameError.
            num_needed = target_count - current_count
            print(f"Class {label}: Adding {num_needed} augmented samples")

            for _ in tqdm(range(num_needed)):
                file_path = random.choice(files)
                y, sr = load_audio(file_path)
                if y is None:
                    continue
                augmented_audio = augmentation(samples=y, sample_rate=sr)
                features = extract_features(augmented_audio, sr)
                if features is not None:
                    augmented_features.append(features)
                    augmented_labels.append(label)

    return np.array(augmented_features), np.array(augmented_labels)

In [None]:
# Now building the Deep-Learning Model.

def build_optimized_cnn(input_shape, num_classes):
    """Assemble and compile the CNN classifier.

    The network has three convolutional stages (32, 64 and 128 filters; each
    stage is conv-BN-conv-BN-maxpool-dropout) followed by two regularized
    dense stages (256 and 128 units) and a softmax output layer.

    Args:
        input_shape: Shape of one input sample, without the batch dimension.
        num_classes: Number of units in the softmax output layer.

    Returns:
        The compiled ``Sequential`` model (Adam with an exponentially
        decaying learning rate, categorical cross-entropy, accuracy metric).
    """
    l2_penalty = regularizers.l2(0.001)

    def conv_stage(filters, drop_rate, first_conv_regularizer=None):
        # Only the very first convolution of the network carries a kernel regularizer.
        return [
            layers.Conv2D(filters, (3, 3), padding='same', activation='relu',
                          kernel_regularizer=first_conv_regularizer),
            layers.BatchNormalization(),
            layers.Conv2D(filters, (3, 3), padding='same', activation='relu'),
            layers.BatchNormalization(),
            layers.MaxPooling2D((2, 2)),
            layers.Dropout(drop_rate),
        ]

    def dense_stage(units):
        return [
            layers.Dense(units, activation='relu', kernel_regularizer=l2_penalty),
            layers.BatchNormalization(),
            layers.Dropout(0.5),
        ]

    # Layers are created strictly in network order.
    stack = [layers.Input(shape=input_shape)]
    stack += conv_stage(32, 0.2, first_conv_regularizer=l2_penalty)
    stack += conv_stage(64, 0.3)
    stack += conv_stage(128, 0.4)
    stack.append(layers.Flatten())
    stack += dense_stage(256)
    stack += dense_stage(128)
    stack.append(layers.Dense(num_classes, activation='softmax'))

    model = models.Sequential(stack)

    # Learning rate starts at 0.001 and is multiplied by 0.9 every 1000 steps.
    decaying_lr = tf.keras.optimizers.schedules.ExponentialDecay(
        0.001, decay_steps=1000, decay_rate=0.9, staircase=True
    )
    adam = tf.keras.optimizers.Adam(learning_rate=decaying_lr)

    model.compile(optimizer=adam,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    return model

In [None]:
## Training the model with a custom mixup training loop.

def train_with_mixup(model, X_train, y_train, X_val, y_val, batch_size=32, epochs=100,
                     mixup_alpha=0.2, patience=20, checkpoint_path='best_emotion_model.h5'):
    """Train ``model`` with a custom loop that applies mixup to every batch.

    The loop uses its own Adam optimizer (learning rate 0.001), not the one
    the model was compiled with. After each epoch the model is evaluated on
    the validation set. The best model by validation accuracy is saved to
    ``checkpoint_path``, and training stops early after ``patience`` epochs
    without improvement.

    Args:
        model: Keras model to train.
        X_train, y_train: Training inputs and one-hot labels.
        X_val, y_val: Validation inputs and one-hot labels.
        batch_size: Batch size for training and validation.
        epochs: Maximum number of epochs.
        mixup_alpha: Parameter of the Beta(alpha, alpha) distribution the
            mixing coefficient is drawn from; ``<= 0`` disables mixing.
        patience: Epochs without validation-accuracy improvement before
            training stops.
        checkpoint_path: File the best model is saved to and reloaded from.

    Returns:
        ``(model, history)`` where ``model`` is the best checkpoint reloaded
        from disk (or the in-memory model if no checkpoint was written) and
        ``history`` maps "loss", "accuracy", "val_loss" and "val_accuracy"
        to per-epoch lists.

    Raises:
        ValueError: If the training or validation set is empty.
    """
    if len(X_train) == 0 or len(X_val) == 0:
        raise ValueError("X_train and X_val must both contain at least one sample.")

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    loss_fn = tf.keras.losses.CategoricalCrossentropy()
    train_acc_metric = tf.keras.metrics.CategoricalAccuracy()

    def mixup(x, y, lam):
        # Blend every sample with a randomly chosen partner from the same batch.
        indices = tf.random.shuffle(tf.range(tf.shape(x)[0]))
        lam_x = tf.cast(lam, x.dtype)
        lam_y = tf.cast(lam, y.dtype)
        mixed_x = lam_x * x + (1 - lam_x) * tf.gather(x, indices)
        mixed_y = lam_y * y + (1 - lam_y) * tf.gather(y, indices)
        return mixed_x, mixed_y

    @tf.function
    def train_step(x, y, lam):
        # lam is passed in as a tensor. Calling np.random inside a tf.function
        # runs only while the function is traced, which would freeze a single
        # mixing coefficient into the graph for all later batches.
        mixed_x, mixed_y = mixup(x, y, lam)
        with tf.GradientTape() as tape:
            y_pred = model(mixed_x, training=True)
            loss = loss_fn(mixed_y, y_pred)
            # Custom loops must add the layers' regularization losses
            # themselves, otherwise the kernel_regularizer settings have no effect.
            loss = loss + sum(model.losses)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        train_acc_metric.update_state(mixed_y, y_pred)
        return loss

    # Buffer covers the whole training set so the shuffle is uniform.
    train_dataset = (
        tf.data.Dataset.from_tensor_slices((X_train, y_train))
        .shuffle(len(X_train))
        .batch(batch_size)
    )
    val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(batch_size)

    history = {"loss": [], "accuracy": [], "val_loss": [], "val_accuracy": []}
    # -inf guarantees the first epoch is checkpointed even at 0% accuracy.
    best_val_acc = float("-inf")
    patience_counter = 0
    checkpoint_written = False

    for epoch in range(epochs):
        print(f"\nEpoch {epoch + 1}/{epochs}")
        train_acc_metric.reset_state()
        losses = []
        for x_batch, y_batch in train_dataset:
            # Draw a fresh mixing coefficient for every batch, in eager Python.
            lam_value = np.random.beta(mixup_alpha, mixup_alpha) if mixup_alpha > 0 else 1.0
            loss = train_step(x_batch, y_batch, tf.constant(lam_value, dtype=tf.float32))
            losses.append(float(loss))
        avg_loss = sum(losses) / len(losses)
        train_acc = train_acc_metric.result()

        # Weight each validation batch by its size so a smaller final batch
        # does not skew the epoch averages.
        val_loss_sum, val_correct, val_seen = 0.0, 0.0, 0
        for x_val_batch, y_val_batch in val_dataset:
            val_pred = model(x_val_batch, training=False)
            n_batch = int(x_val_batch.shape[0])
            val_loss_sum += float(loss_fn(y_val_batch, val_pred)) * n_batch
            val_hits = tf.keras.metrics.categorical_accuracy(y_val_batch, val_pred)
            val_correct += float(tf.reduce_sum(val_hits))
            val_seen += n_batch

        avg_val_loss = val_loss_sum / val_seen
        avg_val_acc = val_correct / val_seen

        history["loss"].append(avg_loss)
        history["accuracy"].append(float(train_acc))
        history["val_loss"].append(avg_val_loss)
        history["val_accuracy"].append(avg_val_acc)

        print(f"Loss: {avg_loss:.4f}, Accuracy: {float(train_acc):.4f}, "
              f"Val Loss: {avg_val_loss:.4f}, Val Accuracy: {avg_val_acc:.4f}")

        if avg_val_acc > best_val_acc:
            best_val_acc = avg_val_acc
            patience_counter = 0
            model.save(checkpoint_path)
            checkpoint_written = True
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch + 1}")
                break

    # Only reload a checkpoint written by this run, never a stale file.
    if checkpoint_written:
        model = tf.keras.models.load_model(checkpoint_path)
    return model, history

In [None]:
def main():
    """Run the full training pipeline and return the trained model.

    Steps: load the denoised dataset, top every class up to 150 samples with
    augmented data, split 80/10/10 into train/validation/test, train the CNN
    with mixup, then report test accuracy, a classification report, a
    confusion matrix and the training curves. The final model is saved to
    'finishing__model.h5'.

    Returns:
        The trained model, or None when no data could be loaded.
    """
    print("Starting speech emotion recognition training...")
    base_folder = "/content/denoised_final_folder"
    X, y, file_paths = load_dataset(base_folder)

    if len(X) == 0:
        print("Error: No data loaded. Exiting.")
        return

    # NOTE(review): augmentation happens before the train/val/test split, so
    # augmented variants of a clip can land in a different split than the
    # clip itself. Reported validation/test scores may be optimistic.
    X_aug, y_aug = generate_augmented_data(file_paths, y, target_count=150)

    if len(X_aug) > 0:
        X_combined = np.vstack([X, X_aug])
        y_combined = np.concatenate([y, y_aug])
    else:
        X_combined = X
        y_combined = y

    print(f"Final dataset: {X_combined.shape[0]} samples")
    print(f"Class distribution: {np.bincount(y_combined)}")

    # Add the trailing channel axis expected by Conv2D.
    X_reshaped = X_combined.reshape(X_combined.shape[0], X_combined.shape[1], X_combined.shape[2], 1)
    y_onehot = tf.keras.utils.to_categorical(y_combined, num_classes=len(emotion_labels))

    X_train, X_temp, y_train, y_temp = train_test_split(
        X_reshaped, y_onehot, test_size=0.2, random_state=42, stratify=y_combined
    )
    # Stratify the second split too, so validation and test keep the class
    # balance. The labels are one-hot here, hence the argmax.
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=42, stratify=np.argmax(y_temp, axis=1)
    )

    input_shape = (X_train.shape[1], X_train.shape[2], 1)
    model = build_optimized_cnn(input_shape, len(emotion_labels))
    model.summary()

    model, history = train_with_mixup(model, X_train, y_train, X_val, y_val, batch_size=16, epochs=150)

    test_loss, test_acc = model.evaluate(X_test, y_test)
    print(f"Test accuracy: {test_acc:.4f}")

    y_pred = model.predict(X_test)
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_true_classes = np.argmax(y_test, axis=1)

    class_names = [k for k, v in sorted(emotion_labels.items(), key=lambda item: item[1])]
    # Passing the label indices explicitly keeps the report and the matrix
    # aligned with class_names even if a class is missing from the test split.
    class_indices = list(range(len(class_names)))
    print("\nClassification Report:")
    print(classification_report(y_true_classes, y_pred_classes, labels=class_indices, target_names=class_names))

    # Confusion matrix
    cm = confusion_matrix(y_true_classes, y_pred_classes, labels=class_indices)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.tight_layout()
    plt.show()

    # Training curves: accuracy (left) and loss (right)
    plt.figure(figsize=(12, 4))
    plt.subplot(1, 2, 1)
    plt.plot(history['accuracy'], label='Train')
    plt.plot(history['val_accuracy'], label='Validation')
    plt.title('Model Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(history['loss'], label='Train')
    plt.plot(history['val_loss'], label='Validation')
    plt.title('Model Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.tight_layout()
    plt.show()

    # Use one variable for saving and reporting so the printed name always
    # matches the file that was actually written.
    final_model_path = 'finishing__model.h5'
    model.save(final_model_path)
    print(f"Model saved as '{final_model_path}'")
    return model


if __name__ == "__main__":
    main()

In [None]:
#### prediction code 

def predict_emotion(audio_file_path, model_path='/content/best_emotion_model.h5'):
    """Predict the emotion expressed in a single audio file.

    The function is self-contained: it carries its own copies of the label
    map and of the loading and feature-extraction helpers used for training.

    Args:
        audio_file_path: Path of the audio file to classify.
        model_path: Path of the saved Keras model to load.

    Returns:
        ``(label, confidence, probabilities)`` where ``label`` is the
        predicted emotion name, ``confidence`` its probability and
        ``probabilities`` maps every emotion name to its probability.
        When the audio cannot be loaded the result is ``("Error", 0.0, {})``.
        When feature extraction fails it is
        ``("Error extracting features", 0.0, {})``.

    Raises:
        FileNotFoundError: If ``model_path`` does not exist.
    """
    # Define emotion labels
    emotion_labels = {
        'sad': 0,
        'surprised': 1,
        'joyfully': 2,
        'euphoric': 3
    }
    # Reverse the dictionary for prediction output
    idx_to_emotion = {v: k for k, v in emotion_labels.items()}

    # Load the model
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model file not found at {model_path}")
    model = tf.keras.models.load_model(model_path)

    # Helper functions mirroring the training script
    def load_audio(file_path, target_sr=22050):
        try:
            y, sr = librosa.load(file_path, sr=None)
            if len(y) < sr * 0.5:
                print(f"Warning: Audio file too short: {file_path}")
                return None, None
            if sr != target_sr:
                y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
            return y, target_sr
        except Exception as e:
            print(f"Error loading audio file {file_path}: {e}")
            return None, None

    def extract_features(y, sr, max_pad_len=150):
        try:
            y = librosa.util.normalize(y)
            y_trimmed, _ = librosa.effects.trim(y, top_db=20)

            mfccs = librosa.feature.mfcc(y=y_trimmed, sr=sr, n_mfcc=20, n_fft=2048, hop_length=512)
            mfcc_std = np.std(mfccs, axis=1, keepdims=True)
            # Zero std (a coefficient constant over time) would give NaN
            # features and therefore NaN predictions; divide by 1 instead.
            mfcc_std[mfcc_std == 0] = 1.0
            mfccs_normalized = (mfccs - np.mean(mfccs, axis=1, keepdims=True)) / mfcc_std

            mel_spec = librosa.feature.melspectrogram(y=y_trimmed, sr=sr, n_mels=40, n_fft=2048, hop_length=512, fmax=8000)
            mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
            mel_std = np.std(mel_spec_db)
            if mel_std == 0:
                mel_std = 1.0
            mel_spec_normalized = (mel_spec_db - np.mean(mel_spec_db)) / mel_std

            features = np.vstack([mfccs_normalized, mel_spec_normalized])

            if features.shape[1] > max_pad_len:
                # Center-crop to max_pad_len frames.
                start = (features.shape[1] - max_pad_len) // 2
                features = features[:, start:start + max_pad_len]
            else:
                # Zero-pad on the right up to max_pad_len frames.
                pad_width = max_pad_len - features.shape[1]
                features = np.pad(features, pad_width=((0, 0), (0, pad_width)), mode='constant', constant_values=0)

            return features
        except Exception as e:
            print(f"Error extracting features: {e}")
            return None

    # Loading the audio file
    y, sr = load_audio(audio_file_path)
    if y is None:
        return "Error", 0.0, {}

    # Extract features from the audio
    features = extract_features(y, sr)
    if features is None:
        return "Error extracting features", 0.0, {}

    # Reshape features to match model input shape (add batch and channel dimensions)
    features = features.reshape(1, features.shape[0], features.shape[1], 1)

    # Make prediction
    prediction = model.predict(features)[0]
    predicted_class = int(np.argmax(prediction))
    confidence = prediction[predicted_class]

    # Create dictionary with all emotion probabilities
    all_probs = {idx_to_emotion[i]: float(prob) for i, prob in enumerate(prediction)}

    return idx_to_emotion[predicted_class], float(confidence), all_probs






def predict_emotion_with_visualization(audio_file_path, model_path='/content/best_emotion_model.h5', show_plots=True):
    """Predict the emotion of an audio file and optionally plot the evidence.

    With ``show_plots`` enabled, one figure shows the trimmed waveform, the
    raw MFCCs and the mel spectrogram in dB, and a second figure shows the
    predicted probability of every emotion as a bar chart.

    Args:
        audio_file_path: Path of the audio file to classify.
        model_path: Path of the saved Keras model to load.
        show_plots: Whether to draw the figures.

    Returns:
        ``(predicted_emotion, confidence, all_probs)`` where ``all_probs``
        maps every emotion name to its predicted probability.

    Raises:
        FileNotFoundError: If ``model_path`` does not exist.
    """
    # librosa.display is a submodule that is not guaranteed to be loaded by a
    # plain "import librosa", so import it explicitly for specshow below.
    import librosa.display

    # Defining the emotion labels
    emotion_labels = {
        'sad': 0,
        'surprised': 1,
        'joyfully': 2,
        'euphoric': 3
    }
    # Reverse the dictionary for prediction output
    idx_to_emotion = {v: k for k, v in emotion_labels.items()}

    # Load the model
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model file not found at {model_path}")
    model = tf.keras.models.load_model(model_path)

    # Load and preprocess audio
    y, sr = librosa.load(audio_file_path, sr=22050)
    y = librosa.util.normalize(y)
    y_trimmed, _ = librosa.effects.trim(y, top_db=20)

    # Extract features
    mfccs = librosa.feature.mfcc(y=y_trimmed, sr=sr, n_mfcc=20)
    mfcc_std = np.std(mfccs, axis=1, keepdims=True)
    # Zero std (a coefficient constant over time) would give NaN features and
    # therefore NaN predictions; divide by 1 instead.
    mfcc_std[mfcc_std == 0] = 1.0
    mfccs_normalized = (mfccs - np.mean(mfccs, axis=1, keepdims=True)) / mfcc_std

    mel_spec = librosa.feature.melspectrogram(y=y_trimmed, sr=sr, n_mels=40, fmax=8000)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    mel_std = np.std(mel_spec_db)
    if mel_std == 0:
        mel_std = 1.0
    mel_spec_normalized = (mel_spec_db - np.mean(mel_spec_db)) / mel_std

    features = np.vstack([mfccs_normalized, mel_spec_normalized])

    # Apply same padding/cropping as in training
    max_pad_len = 150
    if features.shape[1] > max_pad_len:
        start = (features.shape[1] - max_pad_len) // 2
        features = features[:, start:start + max_pad_len]
    else:
        pad_width = max_pad_len - features.shape[1]
        features = np.pad(features, pad_width=((0, 0), (0, pad_width)), mode='constant', constant_values=0)

    # Reshape features for the model (add batch and channel dimensions)
    features_for_model = features.reshape(1, features.shape[0], features.shape[1], 1)

    # Make prediction
    prediction = model.predict(features_for_model)[0]
    predicted_class = int(np.argmax(prediction))
    confidence = prediction[predicted_class]
    predicted_emotion = idx_to_emotion[predicted_class]

    # Create dictionary with all emotion probabilities
    all_probs = {idx_to_emotion[i]: float(prob) for i, prob in enumerate(prediction)}

    # Visualize
    if show_plots:
        plt.figure(figsize=(15, 10))

        # Plot waveform
        plt.subplot(3, 1, 1)
        plt.title(f"Waveform - Predicted: {predicted_emotion} ({confidence:.2f})")

        time = np.arange(0, len(y_trimmed)) / sr
        plt.plot(time, y_trimmed, color='b')
        plt.xlabel("Time (s)")
        plt.ylabel("Amplitude")

        # Plot MFCC (the raw coefficients, not the normalized model input)
        plt.subplot(3, 1, 2)
        plt.title("MFCC Features")
        librosa.display.specshow(mfccs, sr=sr, x_axis='time')
        plt.colorbar(format='%+2.0f dB')
        plt.ylabel("MFCC Coeffs")

        # Plot Mel Spectrogram
        plt.subplot(3, 1, 3)
        plt.title("Mel Spectrogram")
        librosa.display.specshow(mel_spec_db, sr=sr, x_axis='time', y_axis='mel')
        plt.colorbar(format='%+2.0f dB')
        plt.ylabel("Frequency (Hz)")

        plt.tight_layout()
        plt.show()

        # Plot emotion probabilities
        plt.figure(figsize=(10, 5))
        plt.bar(all_probs.keys(), all_probs.values())
        plt.title("Emotion Prediction Probabilities")
        plt.ylabel("Probability")
        plt.ylim(0, 1)
        # Write each probability just above its bar.
        for i, (emotion, prob) in enumerate(all_probs.items()):
            plt.text(i, prob + 0.02, f"{prob:.2f}", ha='center')
        plt.tight_layout()
        plt.show()

    return predicted_emotion, float(confidence), all_probs



if __name__ == "__main__":

    # Example: classify one file and print the per-emotion probabilities.
    audio_path = "/content/sad.wav"
    emotion, confidence, probabilities = predict_emotion(audio_path)
    print(f"Predicted emotion: {emotion}")
    print(f"Confidence: {confidence:.2f}")
    print("All probabilities:")
    # Use a separate loop name so the predicted `emotion` above is not overwritten.
    for label, prob in probabilities.items():
        print(f"{label} : {prob:.2f}")

    try:
        emotion, confidence, probabilities = predict_emotion_with_visualization(audio_path)
    except ImportError as e:
        # Report which import failed instead of hiding the cause.
        print(f"Some Import Error: {e}")