In [1]:
!pip install librosa numpy pandas scikit-learn joblib tqdm soundfile




In [13]:
!git clone https://github.com/karoldvl/ESC-50.git


Cloning into 'ESC-50'...
remote: Enumerating objects: 4199, done.[K
remote: Counting objects: 100% (69/69), done.[K
remote: Compressing objects: 100% (35/35), done.[K
remote: Total 4199 (delta 62), reused 34 (delta 34), pack-reused 4130 (from 1)[K
Receiving objects: 100% (4199/4199), 878.77 MiB | 24.23 MiB/s, done.
Resolving deltas: 100% (292/292), done.
Updating files: 100% (2011/2011), done.


In [1]:
!wget https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz
!tar -xzf UrbanSound8K.tar.gz


--2026-01-21 07:54:46--  https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz
Resolving zenodo.org (zenodo.org)... 137.138.52.235, 188.185.43.153, 188.185.48.75, ...
Connecting to zenodo.org (zenodo.org)|137.138.52.235|:443... connected.
HTTP request sent, awaiting response... 301 MOVED PERMANENTLY
Location: /records/1203745/files/UrbanSound8K.tar.gz [following]
--2026-01-21 07:54:53--  https://zenodo.org/records/1203745/files/UrbanSound8K.tar.gz
Connecting to zenodo.org (zenodo.org)|137.138.52.235|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6023741708 (5.6G) [application/octet-stream]
Saving to: ‘UrbanSound8K.tar.gz’


2026-01-21 08:22:33 (3.49 MB/s) - ‘UrbanSound8K.tar.gz’ saved [6023741708/6023741708]



In [5]:
# Integer id assigned to each of the three target sound classes.
CLASS_MAP = dict(human=0, anthropogenic=1, animal=2)

In [6]:
# ESC-50 category names grouped by target class. Categories that appear in
# none of these lists are skipped when the dataset is loaded.

# Sounds produced directly by people.
ESC_HUMAN = [
    "crying_baby",
    "laughing",
    "sneezing",
    "coughing",
]

# Machine / vehicle / tool sounds.
ESC_ANTHRO = [
    "engine",
    "train",
    "airplane",
    "car_horn",
    "chainsaw",
    "hand_saw",
]

# Animal vocalisations.
ESC_ANIMAL = [
    "dog",
    "cat",
    "chirping_birds",
    "crow",
    "insects",
]


In [7]:
# UrbanSound8K class names grouped by target class. Classes that appear in
# neither list are skipped when the dataset is loaded.
# NOTE(review): no UrbanSound8K class is mapped to "animal", so this dataset
# contributes only human/anthropogenic samples - confirm that is intended.

# Machine / traffic / tool sounds (street music is counted here as well).
URBAN_ANTHRO = [
    "air_conditioner",
    "car_horn",
    "engine_idling",
    "jackhammer",
    "drilling",
    "street_music",
]

# Sounds produced directly by people.
URBAN_HUMAN = [
    "children_playing",
]


In [8]:
import librosa
import numpy as np

def extract_features(audio_path, sr=22050, n_mfcc=20):
    """Summarise an audio file as a fixed-length MFCC feature vector.

    The clip is loaded as mono audio resampled to ``sr`` Hz, ``n_mfcc`` MFCCs
    are computed, and every coefficient is reduced along axis 1 to its mean
    and standard deviation.

    Parameters
    ----------
    audio_path : str or path-like
        Audio file to load with ``librosa.load``.
    sr : int, optional
        Target sample rate passed to ``librosa.load`` (default 22050).
    n_mfcc : int, optional
        Number of MFCCs to compute (default 20).

    Returns
    -------
    numpy.ndarray
        1-D array holding the per-coefficient means followed by the
        per-coefficient standard deviations.

    Raises
    ------
    ValueError
        If the decoded signal contains no samples.
    """
    signal, sample_rate = librosa.load(audio_path, sr=sr, mono=True)

    # Fail loudly on empty audio instead of letting it surface as an obscure
    # downstream error or as NaN statistics.
    if signal.size == 0:
        raise ValueError(f"no audio samples decoded from {audio_path!r}")

    mfcc = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc)

    # Means first, then standard deviations - the order the classifier is
    # trained on.
    return np.hstack((
        np.mean(mfcc, axis=1),
        np.std(mfcc, axis=1)
    ))

In [12]:
# List the working directory (presumably a quick check that the dataset
# folders are present before loading them).
!ls


sample_data  UrbanSound8K  UrbanSound8K.tar.gz


In [14]:
import pandas as pd
from tqdm import tqdm

# Build the feature matrix from ESC-50. This cell (re)initialises X and y, so
# it must run before any cell that appends to them.
meta_esc = pd.read_csv("ESC-50/meta/esc50.csv")

# Category name -> class id. Built so that, should a name ever be listed in
# more than one group, human wins over anthropogenic, which wins over animal
# (the same precedence as checking the groups in that order).
esc_label_to_class = {
    **{name: CLASS_MAP["animal"] for name in ESC_ANIMAL},
    **{name: CLASS_MAP["anthropogenic"] for name in ESC_ANTHRO},
    **{name: CLASS_MAP["human"] for name in ESC_HUMAN},
}

X, y = [], []
esc_failed = []  # (path, error) for every clip whose features could not be extracted

for _, row in tqdm(meta_esc.iterrows(), total=len(meta_esc)):
    class_id = esc_label_to_class.get(row["category"])
    if class_id is None:
        # Category is not part of any target class.
        continue

    path = f"ESC-50/audio/{row['filename']}"

    # Keep going on unreadable clips, but record them instead of hiding them;
    # catching Exception (not a bare except) lets Ctrl-C still interrupt the loop.
    try:
        features = extract_features(path)
    except Exception as exc:
        esc_failed.append((path, repr(exc)))
        continue

    X.append(features)
    y.append(class_id)

if esc_failed:
    print(f"Skipped {len(esc_failed)} ESC-50 clips that failed to load; first: {esc_failed[0]}")

100%|██████████| 2000/2000 [00:33<00:00, 59.71it/s] 


In [15]:
# Append UrbanSound8K features to the X / y lists created by the ESC-50 cell.
# NOTE: this cell extends X and y in place, so running it twice duplicates the
# UrbanSound8K samples - re-run the ESC-50 cell first to start from a clean state.
meta_urban = pd.read_csv("UrbanSound8K/metadata/UrbanSound8K.csv")

# Class name -> class id. Built so that anthropogenic wins over human if a name
# were ever listed in both groups (the same precedence as checking
# anthropogenic first).
urban_label_to_class = {
    **{name: CLASS_MAP["human"] for name in URBAN_HUMAN},
    **{name: CLASS_MAP["anthropogenic"] for name in URBAN_ANTHRO},
}

urban_failed = []  # (path, error) for every clip whose features could not be extracted

for _, row in tqdm(meta_urban.iterrows(), total=len(meta_urban)):
    class_id = urban_label_to_class.get(row["class"])
    if class_id is None:
        # Class is not part of any target class.
        continue

    path = f"UrbanSound8K/audio/fold{row['fold']}/{row['slice_file_name']}"

    # Keep going on unreadable clips, but record them instead of hiding them;
    # catching Exception (not a bare except) lets Ctrl-C still interrupt the loop.
    try:
        features = extract_features(path)
    except Exception as exc:
        urban_failed.append((path, repr(exc)))
        continue

    X.append(features)
    y.append(class_id)

if urban_failed:
    print(f"Skipped {len(urban_failed)} UrbanSound8K clips that failed to load; first: {urban_failed[0]}")


100%|██████████| 8732/8732 [03:33<00:00, 40.82it/s]


In [17]:
from sklearn.utils.class_weight import compute_class_weight

# Per-class "balanced" weights for the collected labels.
# NOTE: class_weights is informational here - the classifier trained in this
# notebook is configured with class_weight="balanced" and does not read it.
present_classes = np.unique(y)
weights = compute_class_weight(
    class_weight="balanced",
    classes=present_classes,
    y=y
)
# Key each weight by its actual class id. Keying by position (enumerate) is
# only correct when the labels happen to be exactly 0..k-1 with none missing.
class_weights = {int(cls): weight for cls, weight in zip(present_classes, weights)}
print(class_weights)

{0: np.float64(2.0198275862068966), 1: np.float64(0.4133004057152937), 2: np.float64(11.715)}


In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib

# Hold out 20% of the samples, stratified so every class keeps its proportion.
# NOTE(review): this is a random split over individual clips. UrbanSound8K
# slices cut from the same recording can end up in both train and test, which
# tends to inflate the scores; consider a fold/group-aware split - confirm
# against the dataset's recommended evaluation protocol.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

model = RandomForestClassifier(
    n_estimators=300,
    max_depth=25,
    n_jobs=-1,
    class_weight="balanced",  # reweight classes inversely to their frequency
    random_state=42
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Take the report rows from CLASS_MAP instead of a second hard-coded list, and
# pass the label ids explicitly so names stay aligned with ids and the report
# still renders if a class is absent from the split.
report_names = sorted(CLASS_MAP, key=CLASS_MAP.get)
report_labels = [CLASS_MAP[name] for name in report_names]

print(classification_report(
    y_test,
    y_pred,
    labels=report_labels,
    target_names=report_names,
    zero_division=0
))

               precision    recall  f1-score   support

        human       0.85      0.64      0.73       232
anthropogenic       0.91      0.99      0.95      1134
       animal       1.00      0.15      0.26        40

     accuracy                           0.91      1406
    macro avg       0.92      0.59      0.65      1406
 weighted avg       0.91      0.91      0.89      1406



In [20]:
# Serialise the trained classifier to the working directory.
joblib.dump(value=model, filename="sound_classifier.pkl")

['sound_classifier.pkl']

In [21]:
# Offer the saved model as a browser download when running in Google Colab.
# Outside Colab the `google.colab` package is unavailable, so just report where
# the file is instead of failing the cell.
try:
    from google.colab import files
except ImportError:
    print("Not running in Colab; the model was saved to sound_classifier.pkl")
else:
    files.download("sound_classifier.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>