In [None]:
# Cell 1 - imports and environment
import os
import zipfile
import requests
from pathlib import Path
import random
import numpy as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split

# Reproducibility: seed every random number generator the notebook relies on.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
if torch.cuda.is_available():
    # Also seed the generators of all visible GPUs.
    torch.cuda.manual_seed_all(SEED)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)


In [None]:
# Cell 2 - download & extract ESC-50 (if not present)
DATA_DIR = Path("data/esc50")
ZIP_PATH = DATA_DIR / "esc50-master.zip"
ESC50_URL = "https://github.com/karolpiczak/ESC-50/archive/master.zip"

DATA_DIR.mkdir(parents=True, exist_ok=True)

extracted_root = DATA_DIR / "ESC-50-master"
meta_path = extracted_root / "meta" / "esc50.csv"
audio_dir = extracted_root / "audio"

# Skip the expensive download/extract when the dataset is already on disk,
# so "Restart & Run All" stays cheap.
if meta_path.exists() and audio_dir.is_dir():
    print("Dataset already present at", extracted_root)
else:
    if not ZIP_PATH.exists():
        # Stream into a temporary file and rename it only once the download
        # finished. An interrupted download therefore never leaves a truncated
        # archive at ZIP_PATH that a later run would mistake for a complete one.
        partial_zip_path = ZIP_PATH.with_name(ZIP_PATH.name + ".part")
        with requests.get(ESC50_URL, stream=True, timeout=30) as r:
            r.raise_for_status()
            with open(partial_zip_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=1 << 20):
                    if chunk:
                        f.write(chunk)
        partial_zip_path.replace(ZIP_PATH)

    # Extract
    with zipfile.ZipFile(ZIP_PATH, "r") as z:
        z.extractall(DATA_DIR)
    print("Extracted to", DATA_DIR)

print("Meta CSV:", meta_path)
meta = pd.read_csv(meta_path)
meta.head()


In [None]:
# Cell 3 - inspect dataset and plot an example
print("Total clips in CSV:", len(meta))
print("Unique classes:", meta['category'].nunique())

# pick one example and plot waveform + mel-spectrogram
example = meta.iloc[0]
example_file = DATA_DIR / "ESC-50-master" / "audio" / example['filename']
print("Example file:", example_file)

y, sr = librosa.load(example_file, sr=22050, duration=5.0)
fig, ax = plt.subplots(figsize=(10, 3))
librosa.display.waveshow(y, sr=sr, ax=ax)
ax.set(
    title=f"Waveform: {example['filename']} ({example['category']})",
    xlabel="Time (s)",
    ylabel="Amplitude",
)
fig.tight_layout()
plt.show()

# mel-spectrogram, computed with the same settings ESC50Dataset uses
# (n_fft=2048, hop_length=512, dB relative to the clip's peak) so the picture
# matches what the model is fed, up to the per-clip normalisation.
melspec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=64, n_fft=2048, hop_length=512)
logmelspec = librosa.power_to_db(melspec, ref=np.max)
fig, ax = plt.subplots(figsize=(8, 4))
img = librosa.display.specshow(logmelspec, sr=sr, hop_length=512,
                               x_axis='time', y_axis='mel', ax=ax)
ax.set_title("Mel-spectrogram (dB)")
# Pass the mappable explicitly instead of relying on pyplot's "current image".
fig.colorbar(img, ax=ax, format="%+2.0f dB")
fig.tight_layout()
plt.show()


The mel scale is a perceptual frequency scale: it is roughly linear at low frequencies and roughly logarithmic at high frequencies. It models how humans (and many animals) perceive pitch, so mel spectrograms emphasize low-frequency detail while compressing high-frequency information.

In [None]:
# Cell 4 - dataset class
class ESC50Dataset(Dataset):
    """Map-style dataset that turns ESC-50 clips into normalised log-mel spectrograms.

    Each item is a pair ``(X, y_label)`` where ``X`` is a tensor of shape
    ``(1, n_mels, time)`` and ``y_label`` is the integer class id taken from
    the ``target`` column of the metadata.
    """

    # Columns that __getitem__ reads from every metadata row.
    REQUIRED_COLUMNS = ("filename", "target")

    def __init__(self, metadata_df, base_dir, sr=22050, n_mels=64, duration=5.0, n_fft=2048, hop_length=512):
        """
        metadata_df: pandas DataFrame (must have 'filename' and 'target' columns)
        base_dir: Path to DATA_DIR where ESC-50-master sits
        sr: sampling rate every clip is resampled to when loaded
        n_mels: number of mel bands in the spectrogram
        duration: clip length in seconds; shorter clips are zero-padded, longer ones cut
        n_fft, hop_length: STFT window and hop sizes passed to librosa

        Raises ValueError if metadata_df lacks one of the required columns.
        """
        # Fail here, with a clear message, rather than with a KeyError raised
        # from inside a DataLoader on the first batch.
        missing = [col for col in self.REQUIRED_COLUMNS if col not in metadata_df.columns]
        if missing:
            raise ValueError(f"metadata_df is missing required column(s): {missing}")

        # Positional index 0..len-1 so iloc lookups match DataLoader indices.
        self.df = metadata_df.reset_index(drop=True)
        self.base_audio_dir = Path(base_dir) / "ESC-50-master" / "audio"
        self.sr = sr
        self.n_mels = n_mels
        self.duration = duration
        self.samples = int(sr * duration)  # fixed waveform length in samples
        self.n_fft = n_fft
        self.hop_length = hop_length

    def __len__(self):
        """Number of clips listed in the metadata."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load clip ``idx`` from disk and return ``(spectrogram tensor, class id)``."""
        row = self.df.iloc[idx]
        file_path = self.base_audio_dir / row['filename']
        y, sr = librosa.load(str(file_path), sr=self.sr, duration=self.duration)
        # Force every waveform to exactly self.samples samples: zero-pad short
        # clips at the end, truncate anything longer.
        if y.shape[0] < self.samples:
            y = np.pad(y, (0, self.samples - y.shape[0]), mode='constant')
        else:
            y = y[:self.samples]
        # mel spectrogram, converted to dB relative to the clip's own peak
        melspec = librosa.feature.melspectrogram(y=y, sr=self.sr, n_mels=self.n_mels,
                                                 n_fft=self.n_fft, hop_length=self.hop_length)
        logmelspec = librosa.power_to_db(melspec, ref=np.max).astype(np.float32)
        # Per-example standardisation; the epsilon avoids dividing by zero
        # when a clip has a constant spectrogram.
        logmelspec = (logmelspec - logmelspec.mean()) / (logmelspec.std() + 1e-9)
        X = torch.from_numpy(logmelspec).unsqueeze(0)  # shape: (1, n_mels, time)
        y_label = int(row['target'])
        return X, y_label


In [None]:
# Cell 5 - Dataset creation and loaders
SUBSET_SIZE = 2000   # how many clips to use (total dataset = 2000 clips)
BATCH_SIZE = 32      # how many clips per training step
VAL_FOLD = 5         # ESC-50 fold held out for validation (the CSV defines folds 1-5)

# Take the same number of clips from every class. The per-class count is
# derived from the metadata instead of a hard-coded 50, and clamped to the
# smallest class so an over-large SUBSET_SIZE cannot make .sample() raise.
class_sizes = meta['target'].value_counts()
clips_per_class = min(max(1, SUBSET_SIZE // len(class_sizes)), int(class_sizes.min()))
subset_df = (
    meta.groupby('target')
    .sample(n=clips_per_class, random_state=SEED)
    .reset_index(drop=True)
)

dataset_full = ESC50Dataset(subset_df, base_dir=DATA_DIR)

# Split along the dataset's predefined folds rather than at random: ESC-50
# keeps all clips cut from the same source recording inside one fold, so a
# random split would put near-duplicate audio in both train and validation and
# overstate validation accuracy. Holding out one of five folds gives roughly
# the same 80% / 20% proportions.
is_val = (subset_df['fold'] == VAL_FOLD).to_numpy()
train_ds = torch.utils.data.Subset(dataset_full, np.flatnonzero(~is_val).tolist())
val_ds = torch.utils.data.Subset(dataset_full, np.flatnonzero(is_val).tolist())
train_size, val_size = len(train_ds), len(val_ds)
assert train_size > 0 and val_size > 0, "fold-based split produced an empty train or validation set"

# PyTorch DataLoaders handle batching & shuffling. The explicit generator makes
# the shuffle order reproducible even when this cell is re-run on its own.
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,
                          generator=torch.Generator().manual_seed(SEED))
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)


In [None]:
# Summarise the split produced by the previous cell.
n_classes_in_subset = subset_df['target'].nunique()
print(f"Train set size: {len(train_ds)}")
print(f"Val set size: {len(val_ds)}")
print(f"Number of classes: {n_classes_in_subset}")


In [None]:
# Cell 6 - Model definition
class AudioCNN(nn.Module):
    """Small CNN mapping single-channel spectrograms to ``n_classes`` logits.

    Three Conv -> BatchNorm -> ReLU -> MaxPool stages (16, 32 and 64 channels)
    are followed by global average pooling and one linear layer.
    """

    @staticmethod
    def _conv_block(in_channels, out_channels):
        """One feature-extractor stage; each stage halves both spatial dimensions."""
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.MaxPool2d(2),
        )

    def __init__(self, n_classes=50):
        super().__init__()
        # Attribute names and creation order are kept so state-dict keys and
        # seeded weight initialisation stay the same.
        self.block1 = self._conv_block(1, 16)
        self.block2 = self._conv_block(16, 32)
        self.block3 = self._conv_block(32, 64)
        # Averaging over the whole feature map makes the head independent of
        # how many time frames the input has.
        self.gap = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(64, n_classes)

    def forward(self, x):
        """Return class logits of shape (batch, n_classes) for input (batch, 1, n_mels, time)."""
        for stage in (self.block1, self.block2, self.block3):
            x = stage(x)
        pooled = self.gap(x)                              # (batch, 64, 1, 1)
        features = torch.flatten(pooled, start_dim=1)     # (batch, 64)
        return self.fc(features)

# Build the 50-class network, then move its parameters to the selected device.
model = AudioCNN(n_classes=50)
model = model.to(device)

Defines a CNN to classify mel spectrograms.
Three Conv + BatchNorm + ReLU + max-pooling blocks gradually compress the spectrogram into features.
Adaptive pooling makes the output fixed-size regardless of input length.
Final linear layer maps features to 50 sound classes.

In [None]:
# Print the architecture
print(model)

# Sanity check with a dummy input: one batch of size 1, 64 mel bands, and the
# number of time frames a 5 s clip at 22050 Hz yields with hop_length=512
# (1 + 110250 // 512 = 216), i.e. the shape the dataset defaults produce.
n_frames = 1 + int(22050 * 5.0) // 512
dummy = torch.zeros(1, 1, 64, n_frames, device=device)

# Run the check in eval mode without gradients: a train-mode forward pass would
# fold this all-zero batch into the BatchNorm running statistics before
# training has even started.
was_training = model.training
model.eval()
with torch.no_grad():
    out = model(dummy)
model.train(was_training)
print("Output shape:", out.shape)


In [None]:
# Cell 7 - training
import time

# Loss for multi-class classification on raw logits.
criterion = nn.CrossEntropyLoss()
# Adam over all model parameters, starting at a learning rate of 1e-3 ...
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
# ... which is halved after every 5 calls to scheduler.step().
scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=5, gamma=0.5)

def train_one_epoch(model, loader, optimizer, criterion):
    """Run one optimisation pass over ``loader``.

    Returns ``(mean loss per sample, accuracy)`` accumulated over the batches
    seen during the pass. Batches are moved to the notebook-level ``device``.
    """
    model.train()
    loss_sum, n_correct, n_seen = 0.0, 0, 0

    for inputs, targets in tqdm(loader, desc="Train", leave=False):
        inputs = inputs.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)

        # Standard step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        logits = model(inputs)
        batch_loss = criterion(logits, targets)
        batch_loss.backward()
        optimizer.step()

        # The criterion averages over the batch, so weight by batch size to
        # get a per-sample mean at the end.
        loss_sum += batch_loss.item() * inputs.size(0)
        predicted = logits.argmax(1)
        n_correct += (predicted == targets).sum().item()
        n_seen += targets.size(0)

    return loss_sum / n_seen, n_correct / n_seen

def validate(model, loader, criterion):
    """Evaluate ``model`` on ``loader`` without updating any weights.

    Returns ``(mean loss per sample, accuracy, true labels, predicted labels)``;
    the two label lists are in the order the loader yielded the samples.
    """
    model.eval()
    loss_sum, n_correct, n_seen = 0.0, 0, 0
    all_true = []
    all_pred = []

    with torch.no_grad():
        for inputs, targets in tqdm(loader, desc="Val", leave=False):
            inputs = inputs.to(device, non_blocking=True)
            targets = targets.to(device, non_blocking=True)

            logits = model(inputs)
            batch_loss = criterion(logits, targets)
            predicted = logits.argmax(1)

            # Weight the batch-mean loss by batch size for a per-sample mean.
            loss_sum += batch_loss.item() * inputs.size(0)
            n_correct += (predicted == targets).sum().item()
            n_seen += targets.size(0)

            # Keep labels on the CPU for the scikit-learn reports.
            all_true.extend(targets.cpu().numpy())
            all_pred.extend(predicted.cpu().numpy())

    return loss_sum / n_seen, n_correct / n_seen, all_true, all_pred

EPOCHS = 50
best_val_acc = 0.0
save_path = "esc50_cnn_best.pth"
# One dict of metrics per epoch, so learning curves can be inspected or plotted
# after training instead of being scraped from the printed log.
history = []

for epoch in range(EPOCHS):
    t0 = time.perf_counter()  # monotonic clock, suited to measuring elapsed time
    train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, criterion)
    val_loss, val_acc, y_true, y_pred = validate(model, val_loader, criterion)
    # Read the learning rate before the scheduler changes it for the next epoch.
    epoch_lr = optimizer.param_groups[0]['lr']
    scheduler.step()
    elapsed = time.perf_counter() - t0
    history.append({'epoch': epoch + 1, 'lr': epoch_lr,
                    'train_loss': train_loss, 'train_acc': train_acc,
                    'val_loss': val_loss, 'val_acc': val_acc,
                    'time_s': elapsed})
    print(f"Epoch {epoch+1}/{EPOCHS} | train_loss={train_loss:.4f} train_acc={train_acc:.3f} | val_loss={val_loss:.4f} val_acc={val_acc:.3f} | time={elapsed:.1f}s")
    if val_acc > best_val_acc:
        # Keep only the weights with the best validation accuracy so far. The
        # scheduler state and the winning metrics are stored as well, so the
        # checkpoint records why it was selected and training can be resumed.
        best_val_acc = val_acc
        torch.save({'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'scheduler_state_dict': scheduler.state_dict(),
                    'epoch': epoch,
                    'val_acc': val_acc,
                    'val_loss': val_loss}, save_path)
        print("Saved best model.")


Trains the model for `EPOCHS` epochs (set to 50 in the cell above; lower it to about 6 for a quick run).
Training phase: feeds batches of spectrograms, updates weights with backprop.
Validation phase: evaluates performance on held-out data.
Tracks accuracy and loss.
Uses a learning-rate scheduler to reduce LR every few epochs.

In [None]:
# Report on the checkpoint with the best validation accuracy rather than on
# whatever weights the final epoch happened to leave in `model`.
# NOTE(review): assumes the file at save_path was written by this run's
# training cell - confirm no stale checkpoint from an earlier run is present.
if os.path.exists(save_path):
    checkpoint = torch.load(save_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])

val_loss, val_acc, y_true, y_pred = validate(model, val_loader, criterion)
print("Validation accuracy:", val_acc)

# Fix the label set to every class in the metadata, so the report and the
# matrix keep one row per class even if a class is absent from the validation
# split or is never predicted.
class_table = meta.drop_duplicates('target').sort_values('target')
class_ids = class_table['target'].tolist()
class_names = class_table['category'].tolist()
print(classification_report(y_true, y_pred, labels=class_ids,
                            target_names=class_names, zero_division=0))

# Confusion matrix (for readability, show only top-K classes if 50 is too big)
cm = confusion_matrix(y_true, y_pred, labels=class_ids)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
fig, ax = plt.subplots(figsize=(14, 14))
disp.plot(ax=ax, include_values=False, cmap='Blues', xticks_rotation='vertical')
ax.set_title("Confusion matrix")
fig.tight_layout()
plt.show()
