# ESC-50 Environmental Sound Classification
**Course:** AIN413 - Machine Learning for Healthcare  
**Project Title:** Vibration-Based Feedback Devices for Environmental Sound Recognition  


## 1. Setup and Dataset Download

In [None]:
!pip install noisereduce
!pip install tensorflow.keras

In [None]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from google.colab import files
import librosa
import librosa.display
import noisereduce as nr
import seaborn as sns
from IPython.display import Audio, display
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from scipy.stats import loguniform,randint,uniform
from sklearn.model_selection import RandomizedSearchCV
from IPython.display import Audio, display
import numpy as np
import librosa
import os
from collections import Counter





In [None]:
# Upload kaggle.json (the Kaggle API token) so the dataset can be downloaded
# directly from Kaggle.

# Prompts for a file upload; the commands below assume it is named kaggle.json.
files.upload()
# Move the token into ~/.kaggle and make it readable/writable by the owner only.
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
# Install the Kaggle CLI together with the libraries used in this notebook.
!pip install -q kaggle librosa scikit-learn matplotlib seaborn tensorflow

In [None]:
# Download the ESC-50 archive from Kaggle and unzip it (quietly) into the
# current working directory.
# NOTE(review): later cells read /content/esc50.csv and /content/audio/audio,
# so this presumably runs with /content as the working directory -- confirm.
!kaggle datasets download -d mmoreaux/environmental-sound-classification-50
!unzip -q environmental-sound-classification-50.zip

In [None]:
meta = pd.read_csv('/content/esc50.csv')
DATA_PATH = '/content/audio/audio'
TEST_DIR = '/content/wav_sound'

!unzip -q /content/wav_sound.zip

selected_categories = [
    "dog", "thunderstorm", "crying_baby",
    "door_wood_knock", "siren", "car_horn"
]

filtered_meta = meta[meta["category"].isin(selected_categories)].reset_index(drop=True)
filtered_meta = filtered_meta.drop(columns=["esc10", "src_file", "take", "fold"])

## 2. Exploratory Data Analysis

In [None]:
# Bar chart of the number of clips per class, most frequent class first.
category_order = filtered_meta['category'].value_counts().index
sns.countplot(data=filtered_meta, x='category', order=category_order)
plt.title('Class Distributions')
plt.tight_layout()
plt.show()

In [None]:
def _clip_seconds(filename):
    """Return the length in seconds of one clip, loaded at its native rate."""
    samples, rate = librosa.load(f"{DATA_PATH}/{filename}", sr=None)
    return len(samples) / rate


# Duration of every selected clip.
durations = [_clip_seconds(name) for name in filtered_meta['filename']]

plt.hist(durations, bins=20)
plt.xlabel("Duration (seconds)")
plt.title("Distribution of Audio Lengths")
plt.show()


In [None]:
# Mean RMS energy of every selected clip (each loaded at its native rate).
energies = []
for clip_name in filtered_meta['filename']:
    samples = librosa.load(f"{DATA_PATH}/{clip_name}", sr=None)[0]
    frame_rms = librosa.feature.rms(y=samples)
    energies.append(np.mean(frame_rms))

sns.histplot(energies, bins=20)
plt.title("Root Mean Square Energy Distribution")
plt.xlabel("Energy")
plt.show()


In [None]:
# Draw a few random clips from the whole dataset folder and plot, for each one,
# its waveform (top row) and its log-frequency spectrogram (bottom row).
all_files = [name for name in os.listdir(DATA_PATH) if name.endswith(".wav")]
selected_files = random.sample(all_files, 4)  # change 4 to plot more or fewer clips
n_cols = len(selected_files)

plt.figure(figsize=(15, 8))
for col, file in enumerate(selected_files, start=1):
    y, sr = librosa.load(os.path.join(DATA_PATH, file), sr=None)

    # Top row: waveform
    plt.subplot(2, n_cols, col)
    librosa.display.waveshow(y, sr=sr)
    plt.title(f'Waveform\n{file}')
    plt.tight_layout()

    # Bottom row: magnitude spectrogram in dB relative to its peak
    plt.subplot(2, n_cols, col + n_cols)
    magnitude = np.abs(librosa.stft(y))
    D = librosa.amplitude_to_db(magnitude, ref=np.max)
    librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='log')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Spectrogram')
    plt.tight_layout()

plt.suptitle("Random Sampled ESC-50 Audio Files", fontsize=16, y=1.02)
plt.show()

## 3. Feature Extraction and Data Augmentation


In [None]:
# Loads an audio file, applies best-effort noise reduction and resamples it to
# a common rate (44.1 kHz by default). No silence trimming is performed.

def adapt_audio(file_path, target_sr=44100):
    """Load, denoise and resample one audio file.

    Args:
        file_path: Path of the audio file to load.
        target_sr: Sampling rate of the returned signal.

    Returns:
        Tuple ``(y, target_sr)``: the processed samples and their sampling rate.
    """
    import warnings

    # sr=None keeps the file's native sampling rate.
    y, sr = librosa.load(file_path, sr=None)

    # Noise reduction is best-effort: on failure continue with the raw signal,
    # but report it instead of hiding it (and let KeyboardInterrupt through).
    try:
        import noisereduce as nr
        y = nr.reduce_noise(y=y, sr=sr)
    except Exception as exc:
        warnings.warn(f"Noise reduction skipped for {file_path}: {exc!r}")

    # Resample only when the native rate differs from the target.
    if sr != target_sr:
        y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)

    return y, target_sr


In [None]:
# Adds random white noise to the audio signal to simulate background interference.
def add_noise(y, noise_level=0.005):
    """Return ``y`` plus standard-normal noise scaled by ``noise_level``.

    One noise sample is drawn per element of ``y`` from NumPy's global
    random state.
    """
    n_samples = len(y)
    scaled_noise = noise_level * np.random.randn(n_samples)
    return y + scaled_noise

# Stretches the audio in time (makes it faster or slower).
# Useful for data augmentation.
def time_stretch(y, rate=1.1):
    """Return ``y`` time-stretched by ``rate`` using librosa."""
    stretched = librosa.effects.time_stretch(y=y, rate=rate)
    return stretched

# Shifts the pitch of the audio without changing its speed.
# Also used for data augmentation.
def pitch_shift(y, sr, n_steps=2):
    """Return ``y`` pitch-shifted by ``n_steps`` steps using librosa."""
    shifted = librosa.effects.pitch_shift(y=y, sr=sr, n_steps=n_steps)
    return shifted


In [None]:
# Feature Extraction with Augmentation (MFCC + Delta features)

def _mfcc_delta_vector(y, sr, n_mfcc=40):
    """Return the time-averaged MFCC + delta + delta-delta vector of ``y``."""
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    delta = librosa.feature.delta(mfcc)
    delta2 = librosa.feature.delta(mfcc, order=2)
    combined = np.vstack([mfcc, delta, delta2])
    return np.mean(combined.T, axis=0)  # average over the time axis


def extract_augmented_mfccs(file_path):
    """Extract MFCC-based feature vectors for a clip and its augmentations.

    The clip is preprocessed with ``adapt_audio`` (denoise + resample), then
    one feature vector is computed for each of: the original signal, a noisy
    copy, a pitch-shifted copy and a time-stretched copy.

    Args:
        file_path: Path of the audio file.

    Returns:
        List of 1-D feature vectors, one per variant. The time-stretched
        variant is omitted (with a warning) if it cannot be computed, so the
        list has three or four entries.
    """
    import warnings

    y, sr = adapt_audio(file_path)

    features = [
        _mfcc_delta_vector(y, sr),                      # original audio
        _mfcc_delta_vector(add_noise(y), sr),           # noisy version
        _mfcc_delta_vector(pitch_shift(y, sr=sr), sr),  # pitch-shifted version
    ]

    # Time stretching is optional: skip this augmentation if it fails, but
    # report why instead of silently swallowing every exception.
    try:
        features.append(_mfcc_delta_vector(time_stretch(y=y), sr))
    except Exception as exc:
        warnings.warn(f"Time-stretch augmentation skipped for {file_path}: {exc!r}")

    return features


## 4. Generating Training Data


In [None]:
# Extracting features from all audio files and prepare labels

X_mfcc, y = [], []

# Loop through each row
for _, row in tqdm(filtered_meta.iterrows(), total=len(filtered_meta)):
    # Extract  augmented versions of data
    feats = extract_augmented_mfccs(os.path.join(DATA_PATH, row['filename']))

    # Add each feature set to the dataset
    for feat in feats:
        X_mfcc.append(feat)
        y.append(row['category'])  # Store the corresponding label

# Convert feature list to NumPy array for model input
X_mfcc = np.array(X_mfcc)

# Convert string labels to integer classes
le = LabelEncoder()
y_encoded = le.fit_transform(y)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X_mfcc, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

## 5. Model Training and Comparison

### 5.1 Random Forest

In [None]:
rf = RandomForestClassifier(random_state=42)

# Hyper-parameter space sampled by the randomized search below.
param_dist_rf = {
    'n_estimators': randint(20, 100),
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt'],
    'bootstrap': [True, False]
}

# 10 sampled configurations, each scored with 3-fold cross-validation.
random_search_rf = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist_rf,
    n_iter=10,
    cv=3,
    random_state=42,
    verbose=2,
    n_jobs=-1
)

random_search_rf.fit(X_train, y_train)
best_rf = random_search_rf.best_estimator_

# Evaluate the best configuration on the held-out test split.
y_pred_rf = best_rf.predict(X_test)

print("Best Parameters:", random_search_rf.best_params_)
# The search above is RandomizedSearchCV, not a grid search.
print("\nRandomizedSearch Optimized Random Forest Report:\n")
print(classification_report(y_test, y_pred_rf, target_names=le.classes_, zero_division=0))




### 5.2 Support Vector Machine

In [None]:
# probability=True enables predict_proba, which predict_test_audio relies on.
# random_state is fixed, like for the other estimators in this notebook, so
# that the probability estimates are reproducible between runs.
svm = SVC(probability=True, random_state=42)

# Hyper-parameter space sampled by the randomized search below.
param_dist_svm = {
    'C': uniform(0.1, 10),  # regularization
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

# 10 sampled configurations, each scored with 3-fold cross-validation.
random_search_svm = RandomizedSearchCV(
    estimator=svm,
    param_distributions=param_dist_svm,
    n_iter=10,
    cv=3,
    random_state=42,
    verbose=2,
    n_jobs=-1
)

random_search_svm.fit(X_train, y_train)
best_svm = random_search_svm.best_estimator_

# Evaluate the best configuration on the held-out test split.
y_pred_svm = best_svm.predict(X_test)

print("Best SVM Parameters:", random_search_svm.best_params_)
print("\nOptimized SVM Classification Report:\n")
print(classification_report(y_test, y_pred_svm, target_names=le.classes_, zero_division=0))


### 5.3 Multi-Layer Perceptron

In [None]:
# Hyper-parameter space for the multi-layer perceptron.
param_dist_mlp = dict(
    hidden_layer_sizes=[(64,), (64, 32)],
    activation=['relu', 'tanh'],
    alpha=loguniform(1e-5, 1e-2),
    solver=['adam'],
    learning_rate=['constant', 'adaptive'],
)

mlp = MLPClassifier(max_iter=300, random_state=42)

# Randomized search: 10 sampled configurations, 3-fold cross-validation each.
random_search_mlp = RandomizedSearchCV(
    mlp,
    param_dist_mlp,
    n_iter=10,
    cv=3,
    random_state=42,
    verbose=2,
    n_jobs=-1,
)
random_search_mlp.fit(X_train, y_train)

# Keep the best model and score it on the held-out test split.
best_mlp = random_search_mlp.best_estimator_
y_pred_mlp = best_mlp.predict(X_test)
mlp_report = classification_report(
    y_test, y_pred_mlp, target_names=le.classes_, zero_division=0
)

print("Best MLP Parameters:", random_search_mlp.best_params_)
print("\nOptimized MLP Classification Report:\n")
print(mlp_report)


### 5.4 Decision Tree Classifier

In [None]:
# Hyper-parameter space for the decision tree.
param_dist_dt = dict(
    max_depth=[None, 5, 10],
    min_samples_split=[2, 4],
    min_samples_leaf=[1, 2],
    criterion=['gini', 'entropy'],
    max_features=['sqrt', None],
)

dt = DecisionTreeClassifier(random_state=42)

# Randomized search: 10 sampled configurations, 3-fold cross-validation each.
dt_search_options = dict(n_iter=10, cv=3, random_state=42, verbose=2, n_jobs=-1)
random_search_dt = RandomizedSearchCV(
    estimator=dt,
    param_distributions=param_dist_dt,
    **dt_search_options,
)
random_search_dt.fit(X_train, y_train)

# Keep the best tree and score it on the held-out test split.
best_dt = random_search_dt.best_estimator_
y_pred_dt = best_dt.predict(X_test)
dt_report = classification_report(
    y_test, y_pred_dt, target_names=le.classes_, zero_division=0
)

print("Best Decision Tree Parameters:", random_search_dt.best_params_)
print("\nOptimized Decision Tree Classification Report:\n")
print(dt_report)


### 5.5 Convolutional Neural Network

In [None]:
def augment_audio_variants(y, sr):
    """Return the original signal together with its augmented copies.

    Variants, in order: the original, a copy with added random noise, pitch
    shifts of +2 and -2 steps, and time stretches at rates 1.1 and 0.9.

    Args:
        y: Audio samples.
        sr: Sampling rate of ``y``.

    Returns:
        List of audio arrays. Time-stretched variants that cannot be computed
        are left out (with a warning), so the list has between four and six
        entries.
    """
    import warnings

    # Original plus a noisy copy
    variants = [y, y + 0.005 * np.random.randn(len(y))]

    # Pitch shift up and down
    for n_steps in (2, -2):
        variants.append(librosa.effects.pitch_shift(y=y, sr=sr, n_steps=n_steps))

    # Time stretch is best-effort: keep whatever succeeded and report the
    # failure instead of silently swallowing every exception.
    try:
        for rate in (1.1, 0.9):
            variants.append(librosa.effects.time_stretch(y=y, rate=rate))
    except Exception as exc:
        warnings.warn(f"Time-stretch augmentation skipped: {exc!r}")

    return variants

In [None]:
def extract_logmel_from_audio(y, sr, n_mels=128, hop_length=512, fix_len=216):
    """Return a fixed-width log-mel spectrogram with a trailing channel axis.

    The mel power spectrogram of ``y`` is converted to dB, padded or cropped
    along axis 1 to ``fix_len`` columns, and given one extra last dimension.
    """
    mel_power = librosa.feature.melspectrogram(
        y=y, sr=sr, n_mels=n_mels, hop_length=hop_length
    )
    mel_db = librosa.power_to_db(mel_power)
    mel_db_fixed = librosa.util.fix_length(mel_db, size=fix_len, axis=1)
    return np.expand_dims(mel_db_fixed, axis=-1)

In [None]:
X_logmel, y = [], []

for _, row in tqdm(filtered_meta.iterrows(), total=len(filtered_meta)):
    filepath = os.path.join(DATA_PATH, row['filename'])
    y_raw, sr = librosa.load(filepath, sr=22050, duration=5)

    # Create variants for audios
    augmented_audios = augment_audio_variants(y_raw, sr)


    for audio in augmented_audios:
        logmel = extract_logmel_from_audio(audio, sr)
        X_logmel.append(logmel)
        y.append(row["category"])


X_logmel = np.array(X_logmel)
y_encoded = LabelEncoder().fit_transform(y)
y_cat = to_categorical(y_encoded)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X_logmel, y_cat, test_size=0.2, stratify=y_encoded, random_state=42
)


In [None]:
# One output unit per distinct encoded label.
num_classes = len(np.unique(y_encoded))

# Matches the default output of extract_logmel_from_audio:
# 128 mel bands x 216 frames x 1 channel.
input_shape = (128, 216, 1)

# Three Conv2D + MaxPooling2D stages with 32, 64 and 128 filters, followed by
# a dropout-regularised dense classifier head.
cnn_layers = [Input(shape=input_shape)]
for n_filters in (32, 64, 128):
    cnn_layers.append(Conv2D(n_filters, (3, 3), activation='relu'))
    cnn_layers.append(MaxPooling2D((2, 2)))
cnn_layers += [
    Flatten(),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dense(num_classes, activation='softmax'),
]
cnn = Sequential(cnn_layers)

cnn.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

cnn.summary()

# Train for 20 epochs, holding out 20% of the training data for validation.
history = cnn.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    verbose=1,
)

# Convert predicted probabilities and one-hot targets back to class indices.
y_pred_cnn = cnn.predict(X_test)
y_pred_classes_cnn = y_pred_cnn.argmax(axis=1)
y_true = y_test.argmax(axis=1)

print(classification_report(y_true, y_pred_classes_cnn, target_names=le.classes_))

## 6. Evaluation on Real-World Sound Recordings

In [None]:
# User-facing message shown for each predicted class label.
label_to_text = dict(
    dog="🐶 Dog barking detected!",
    thunderstorm="🌩️ Thunderstorm sound detected!",
    door_wood_knock="🚪 Door knocking sound detected!",
    car_horn="📣 Car horn detected!",
    crying_baby="👶 Baby crying detected!",
    siren="🚑 Siren sound detected!",
)


In [None]:
def _true_label_from_filename(filename, class_names):
    """Return the first class name contained in ``filename`` (case-insensitive), else None."""
    lowered = filename.lower()
    return next((name for name in class_names if name in lowered), None)


def predict_test_audio(model, CONFIDENCE_THRESHOLD=0.6):
    """Classify every ``.wav`` file in ``TEST_DIR`` with an MFCC-based model.

    For each file, the MFCC + delta features are computed the same way as for
    training, the top-2 predictions are printed with their confidences, and
    the audio is played back. When a class name appears in the file name it is
    used as ground truth, and the overall accuracy is printed at the end.

    Args:
        model: Fitted classifier exposing ``predict_proba`` and ``classes_``.
        CONFIDENCE_THRESHOLD: Minimum top-1 probability for a prediction to be
            reported as confident.
    """
    match_counter = Counter()
    total_counter = Counter()

    print("\n📢 Prediction and Playback (Top-2 with confidence):\n")

    for file in sorted(os.listdir(TEST_DIR)):
        if not file.endswith(".wav"):
            continue
        file_path = os.path.join(TEST_DIR, file)

        # Same preprocessing and features as the training pipeline
        audio, audio_sr = adapt_audio(file_path)
        mfcc = librosa.feature.mfcc(y=audio, sr=audio_sr, n_mfcc=40)
        delta = librosa.feature.delta(mfcc)
        delta2 = librosa.feature.delta(mfcc, order=2)
        features = np.vstack([mfcc, delta, delta2])
        input_feat = np.mean(features.T, axis=0).reshape(1, -1)

        # Predict class probabilities and rank them, highest first
        probs = model.predict_proba(input_feat)[0]
        ranked_cols = np.argsort(probs)[::-1]
        top1_col, top2_col = ranked_cols[0], ranked_cols[1]

        # Columns of predict_proba follow model.classes_, so map the column
        # indices to encoded labels before decoding them to class names.
        top1_label, top2_label = le.inverse_transform(
            model.classes_[[top1_col, top2_col]]
        )
        top1_conf = probs[top1_col]
        top2_conf = probs[top2_col]

        # Use the file name as ground truth when it contains a class name
        true_label = _true_label_from_filename(file, le.classes_)
        if true_label:
            total_counter[true_label] += 1
            if top1_label == true_label:
                match_counter[true_label] += 1

        # Display result
        if top1_conf >= CONFIDENCE_THRESHOLD:
            print(f"{file}: ✅ {label_to_text.get(top1_label, top1_label)} (Confidence = {top1_conf:.2f})")
        else:
            print(f"{file}: ❓ Not confident\n ↪ 1. {label_to_text.get(top1_label, top1_label)} ({top1_conf:.2f})"
                  f"\n ↪ 2. {label_to_text.get(top2_label, top2_label)} ({top2_conf:.2f})")

        display(Audio(file_path))

    # Accuracy over the files whose true label could be inferred
    total = sum(total_counter.values())
    correct = sum(match_counter.values())
    accuracy = correct / total if total > 0 else 0.0
    print(f"\n📊 Accuracy of the Real world scenarios = {accuracy:.2f} ({correct}/{total})")

def predict_test_audio_cnn(model, CONFIDENCE_THRESHOLD=0.6):
    """Classify every ``.wav`` file in ``TEST_DIR`` with the CNN.

    Each file is turned into a log-mel spectrogram with
    ``extract_logmel_from_audio`` (the same function used to build the
    training data), the prediction is printed and the audio is played back.
    When a class name appears in the file name it is used as ground truth,
    and the overall accuracy is printed at the end.

    Args:
        model: Trained Keras model whose ``predict`` returns class probabilities.
        CONFIDENCE_THRESHOLD: Minimum predicted probability for a prediction
            to be reported as confident; below it the prediction is printed
            as uncertain together with its probability.
    """
    total = 0
    correct = 0

    print("\n📢 Prediction and Playback:\n")

    for file in sorted(os.listdir(TEST_DIR)):
        if not file.endswith(".wav"):
            continue
        file_path = os.path.join(TEST_DIR, file)

        # 1. Load the audio and build the CNN input, shape (1, 128, 216, 1)
        audio, audio_sr = librosa.load(file_path, sr=22050, duration=5)
        input_feat = extract_logmel_from_audio(audio, audio_sr)[np.newaxis, ...]

        # 2. Predict
        probs = model.predict(input_feat)[0]
        predicted_index = int(np.argmax(probs))
        predicted_label = le.inverse_transform([predicted_index])[0]
        confidence = probs[predicted_index]

        # 3. Display prediction, flagging low-confidence results
        message = label_to_text.get(predicted_label, predicted_label)
        if confidence >= CONFIDENCE_THRESHOLD:
            print(f"{file}: {message}")
        else:
            print(f"{file}: ❓ Not confident\n ↪ 1. {message} ({confidence:.2f})")
        display(Audio(file_path))

        # 4. Check ground truth from the file name (first matching class wins)
        for label in le.classes_:
            if label in file.lower():
                total += 1
                if predicted_label == label:
                    correct += 1
                break

    if total > 0:
        accuracy = correct / total
        print(f"\n📊 Accuracy of the real-world scenarios = {accuracy:.2f} ({correct}/{total})")
    else:
        print("\n⚠️ No ground truth labels could be inferred from filenames.")



In [None]:
# Evaluate the tuned MLP on the recordings in TEST_DIR.
predict_test_audio(best_mlp)


In [None]:
# Evaluate the tuned Random Forest on the recordings in TEST_DIR.
predict_test_audio(best_rf)

In [None]:
# Evaluate the tuned Decision Tree on the recordings in TEST_DIR.
predict_test_audio(best_dt)

In [None]:
# Evaluate the tuned SVM on the recordings in TEST_DIR.
predict_test_audio(best_svm)

In [None]:
# Evaluate the CNN (log-mel input) on the recordings in TEST_DIR.
predict_test_audio_cnn(cnn)