In [2]:
import pandas as pd
import os

# Load CSV metadata
meta_df = pd.read_csv('ESC-50-master/meta/esc50.csv')

# Add full path to audio files
meta_df['filepath'] = meta_df['filename'].apply(lambda x: os.path.join('ESC-50-master/audio/', x))

# Display sample
meta_df.head()


Unnamed: 0,filename,fold,target,category,esc10,src_file,take,filepath
0,1-100032-A-0.wav,1,0,dog,True,100032,A,ESC-50-master/audio/1-100032-A-0.wav
1,1-100038-A-14.wav,1,14,chirping_birds,False,100038,A,ESC-50-master/audio/1-100038-A-14.wav
2,1-100210-A-36.wav,1,36,vacuum_cleaner,False,100210,A,ESC-50-master/audio/1-100210-A-36.wav
3,1-100210-B-36.wav,1,36,vacuum_cleaner,False,100210,B,ESC-50-master/audio/1-100210-B-36.wav
4,1-101296-A-19.wav,1,19,thunderstorm,False,101296,A,ESC-50-master/audio/1-101296-A-19.wav


In [3]:
!pip install audiomentations

Collecting audiomentations
  Downloading audiomentations-0.40.0-py3-none-any.whl.metadata (11 kB)
Collecting numpy-minmax<1,>=0.3.0 (from audiomentations)
  Downloading numpy_minmax-0.4.0-cp310-cp310-win_amd64.whl.metadata (4.3 kB)
Collecting numpy-rms<1,>=0.4.2 (from audiomentations)
  Downloading numpy_rms-0.5.0-cp310-cp310-win_amd64.whl.metadata (3.6 kB)
Collecting python-stretch<1,>=0.3.1 (from audiomentations)
  Downloading python_stretch-0.3.1-cp310-cp310-win_amd64.whl.metadata (3.7 kB)
INFO: pip is looking at multiple versions of numpy-minmax to determine which version is compatible with other requirements. This could take a while.
Collecting numpy-minmax<1,>=0.3.0 (from audiomentations)
  Downloading numpy_minmax-0.3.1-cp310-cp310-win_amd64.whl.metadata (4.1 kB)
INFO: pip is looking at multiple versions of numpy-rms to determine which version is compatible with other requirements. This could take a while.
Collecting numpy-rms<1,>=0.4.2 (from audiomentations)
  Downloading numpy


[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: C:\Users\Arnav\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [3]:
import random
import librosa
import numpy as np
import torch
from torch.utils.data import Dataset
from audiomentations import Compose, AddGaussianNoise, PitchShift, TimeStretch, Gain

class ESC50Dataset(Dataset):
    def __init__(self, df, sample_rate=44100, duration=5.0, augment_type='none', n_mels=128):
        self.df = df.reset_index(drop=True)
        self.sr = sample_rate
        self.length = int(sample_rate * duration)
        self.augment_type = augment_type
        self.n_mels = n_mels

        self.weak_transform = Compose([
            AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
        ])
        self.strong_transform = Compose([
            AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.02, p=0.5),
            PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
            TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
            Gain(min_gain_db=-6, max_gain_db=6, p=0.5)
        ])

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        path = row['filepath']
        label = row['target'] if 'target' in row else -1

        y, _ = librosa.load(path, sr=self.sr)
        if len(y) < self.length:
            y = np.pad(y, (0, self.length - len(y)))
        else:
            y = y[:self.length]

        if self.augment_type == 'weak':
            y = self.weak_transform(samples=y, sample_rate=self.sr)
        elif self.augment_type == 'strong':
            y = self.strong_transform(samples=y, sample_rate=self.sr)

        # Convert to Mel spectrogram
        mel = librosa.feature.melspectrogram(y=y, sr=self.sr, n_mels=self.n_mels)
        mel_db = librosa.power_to_db(mel, ref=np.max)

        # Normalize and convert to torch tensor [1, H, W]
        mel_tensor = torch.tensor(mel_db, dtype=torch.float32).unsqueeze(0)

        return mel_tensor, torch.tensor(label, dtype=torch.long)


In [4]:
def split_labeled_unlabeled_and_save(df, labeled_fraction=0.1, save_dir="labeled-unlabeled"):
    import os
    os.makedirs(f'{save_dir}/{labeled_fraction}', exist_ok=True)

    labeled_df_list = []
    unlabeled_df_list = []

    for label in sorted(df['target'].unique()):
        class_df = df[df['target'] == label]
        n_total = len(class_df)
        n_labeled = max(1, int(n_total * labeled_fraction))

        labeled = class_df.sample(n=n_labeled, random_state=42)
        unlabeled = class_df.drop(labeled.index)

        labeled_df_list.append(labeled)
        unlabeled_df_list.append(unlabeled)

    labeled_df = pd.concat(labeled_df_list).reset_index(drop=True)
    unlabeled_df = pd.concat(unlabeled_df_list).reset_index(drop=True)

    # Drop label/category from the unlabeled set
    unlabeled_df = unlabeled_df.drop(columns=["target", "category"])

    labeled_df.head()
    unlabeled_df.head()
    
    # Save both
    labeled_df.to_csv(f"{save_dir}/{labeled_fraction}/labeled.csv", index=False)      # with labels
    unlabeled_df.to_csv(f"{save_dir}/{labeled_fraction}/unlabeled.csv", index=False)  # without labels

    print(f"Saved labeled.csv (with labels) and unlabeled.csv (no labels) to '{save_dir}/{labeled_fraction}'")




In [5]:
from torch.utils.data import DataLoader

def get_ssl_loaders(meta_df,labeled_fraction=0.1, fold=1, batch_size=16, split_dir="labeled-unlabeled"):
    # Load pre-saved labeled and unlabeled CSVs for this fold
    labeled_df = pd.read_csv(f"{split_dir}/{labeled_fraction}/labeled.csv")
    unlabeled_df = pd.read_csv(f"{split_dir}/{labeled_fraction}/unlabeled.csv")

    # Get validation set from meta_df
    val_df = meta_df[meta_df['fold'] == fold]

    # Create datasets
    labeled_dataset = ESC50Dataset(labeled_df, augment_type='weak')
    unlabeled_dataset = DualViewESC50Dataset(unlabeled_df)
    val_dataset = ESC50Dataset(val_df, augment_type='none')

    # Create loaders
    labeled_loader = DataLoader(labeled_dataset, batch_size=batch_size, shuffle=True)
    unlabeled_loader = DataLoader(unlabeled_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    return labeled_loader, unlabeled_loader, val_loader



In [27]:
labeled_fraction = 0.2
split_labeled_unlabeled_and_save(
    meta_df,
    labeled_fraction=labeled_fraction,
    save_dir="labeled-unlabeled"
)

Saved labeled.csv (with labels) and unlabeled.csv (no labels) to 'labeled-unlabeled/0.2'


In [6]:
class DualViewESC50Dataset(Dataset):
    def __init__(self, df):
        self.weak_dataset = ESC50Dataset(df, augment_type='weak')
        self.strong_dataset = ESC50Dataset(df, augment_type='strong')

    def __len__(self):
        return len(self.weak_dataset)

    def __getitem__(self, idx):
        weak_x, _ = self.weak_dataset[idx]
        strong_x, _ = self.strong_dataset[idx]
        return weak_x, strong_x


In [7]:
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models


class BasicBlock1D(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=9, pool_size=4):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv1d(in_channels, out_channels, kernel_size=kernel_size, padding=kernel_size // 2),
            nn.BatchNorm1d(out_channels),
            nn.ReLU(),
            nn.MaxPool1d(pool_size)
        )

    def forward(self, x):
        return self.conv(x)

class ResNet1DAudioEncoder(nn.Module):
    def __init__(self, embed_dim=256):
        super().__init__()
        
        # Load pretrained ResNet18
        base_model = models.resnet18(pretrained=True)

        # Modify first conv layer to accept 1-channel input instead of 3
        base_model.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)

        # Remove final classification layer and get features
        self.feature_extractor = nn.Sequential(*list(base_model.children())[:-1])  # [B, 512, 1, 1]

        # Final linear layer to project into embedding space
        self.projection = nn.Sequential(
            nn.Flatten(),               # → [B, 512]
            nn.Linear(512, embed_dim),  # → [B, embed_dim]
            nn.ReLU()
        )

    def forward(self, x):
        """
        Expects input x: [B, 1, H, W] — e.g., Mel spectrogram
        """
        x = self.feature_extractor(x)         # [B, 512, 1, 1]
        x = self.projection(x)                # [B, embed_dim]
        return x


class EPASSProjectors(nn.Module):
    def __init__(self, embed_dim=256, proj_dim=128, num_proj=3):
        super().__init__()
        self.projectors = nn.ModuleList([
            nn.Sequential(
                nn.Linear(embed_dim, proj_dim),
                nn.ReLU(),
                nn.Linear(proj_dim, proj_dim)
            ) for _ in range(num_proj)
        ])

    def forward(self, x):
        projections = [proj(x) for proj in self.projectors]
        ensemble = torch.stack(projections).mean(dim=0)  # Mean ensemble
        return ensemble, projections


In [8]:
import torch.nn.functional as F

def train_one_epoch(encoder, projectors, classifier, labeled_loader, unlabeled_loader, optimizer, device, epoch, confidence_thresh=0.95):
    encoder.train()
    projectors.train()
    classifier.train()

    total_loss, total_ce, total_uloss, total_closs = 0.0, 0.0, 0.0, 0.0

    unlabeled_iter = iter(unlabeled_loader)

    for (x_l, y_l) in labeled_loader:
        # Get one batch from unlabeled loader
        try:
            xw, xs = next(unlabeled_iter)
        except StopIteration:
            unlabeled_iter = iter(unlabeled_loader)
            xw, xs = next(unlabeled_iter)

        x_l, y_l = x_l.to(device), y_l.to(device)
        xw, xs = xw.to(device), xs.to(device)

        # ---- Forward ----
        embed_l = encoder(x_l)       # [B, 256]
        embed_w = encoder(xw)        # [B, 256]
        embed_s = encoder(xs)        # [B, 256]

        # Classifier
        logits_l = classifier(embed_l)
        logits_w = classifier(embed_w)
        logits_s = classifier(embed_s)

        # Supervised CE Loss
        ce_loss = F.cross_entropy(logits_l, y_l)

        # Pseudo-labels
        probs_w = torch.softmax(logits_w.detach(), dim=1)
        max_probs, pseudo_labels = torch.max(probs_w, dim=1)
        mask = (max_probs >= confidence_thresh).float()

        uloss = F.cross_entropy(logits_s, pseudo_labels, reduction='none')
        uloss = (uloss * mask).mean()

        # ---- EPASS: Contrastive Loss ----
        ens_w, _ = projectors(embed_w)
        ens_s, _ = projectors(embed_s)

        # Normalize
        ens_w = F.normalize(ens_w, dim=1)
        ens_s = F.normalize(ens_s, dim=1)

        # Cosine sim loss
        sim = (ens_w * ens_s).sum(dim=1)
        closs = -torch.log(sim + 1e-6).mean()

        # ---- Total loss ----
        loss = ce_loss + uloss + 0.5 * closs

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_ce += ce_loss.item()
        total_uloss += uloss.item()
        total_closs += closs.item()

    print(f"  Pseudo-labels used: {mask.sum().item()} / {mask.shape[0]}")
    print(f"[Epoch {epoch}] Loss: {total_loss:.4f} | CE: {total_ce:.4f} | U: {total_uloss:.4f} | Contrastive: {total_closs:.4f}")


In [9]:
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import numpy as np

def evaluate(encoder, classifier, val_loader, device, num_classes=50, plot_loss_curves=False, train_losses=None, val_losses=None):
    encoder.eval()
    classifier.eval()

    correct, total = 0, 0
    all_preds = []
    all_probs = []
    all_labels = []

    with torch.no_grad():
        for x, y in val_loader:
            x, y = x.to(device), y.to(device)
            logits = classifier(encoder(x))

            probs = torch.softmax(logits, dim=1)
            preds = probs.argmax(dim=1)

            all_probs.append(probs.cpu().numpy())
            all_preds.append(preds.cpu().numpy())
            all_labels.append(y.cpu().numpy())

            correct += (preds == y).sum().item()
            total += y.size(0)

    acc = 100.0 * correct / total
    all_probs = np.concatenate(all_probs, axis=0)
    all_preds = np.concatenate(all_preds, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)

    # print(f"\nValidation Accuracy: {acc:.2f}%")
    # print("\nClassification Report:")
    # print(classification_report(all_labels, all_preds, digits=4))

    # Core metrics
    precision = precision_score(all_labels, all_preds, average="macro", zero_division=0)
    recall = recall_score(all_labels, all_preds, average="macro", zero_division=0)
    f1 = f1_score(all_labels, all_preds, average="macro", zero_division=0)

    # AUC-PRC
    try:
        auc_prc = average_precision_score(
            y_true=np.eye(num_classes)[all_labels],
            y_score=all_probs,
            average="macro"
        )
        print(f"AUC-PRC (macro): {auc_prc:.4f}")
    except Exception as e:
        auc_prc = None
        print(f"AUC-PRC: Not computable — {str(e)}")

    # AUC-ROC
    try:
        auc_roc = roc_auc_score(
            y_true=np.eye(num_classes)[all_labels],
            y_score=all_probs,
            average="macro",
            multi_class='ovr'
        )
        print(f"AUC-ROC (macro): {auc_roc:.4f}")
    except Exception as e:
        auc_roc = None
        print(f"AUC-ROC: Not computable — {str(e)}")

    # Optional: Plot Loss Curves
    if plot_loss_curves and train_losses and val_losses:
        plt.figure(figsize=(8, 5))
        plt.plot(train_losses, label='Train Loss')
        plt.plot(val_losses, label='Val Loss')
        plt.xlabel("Epoch")
        plt.ylabel("Loss")
        plt.title("Train vs Validation Loss")
        plt.legend()
        plt.grid(True)
        plt.show()

    # Return all stats
    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "auc_prc": auc_prc,
        "auc_roc": auc_roc
    }


In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

num_epochs = 20
batch_size = 16
labeled_fraction = 0.2
split_dir = "labeled-unlabeled"

fold_accuracies = []
fold_metrics = []

save_dir = "models/ESC-50/EPASS"
os.makedirs(save_dir, exist_ok=True)

for fold in range(2, 6):
    print(f"\n=== Fold {fold} ===")

    # Load data loaders from saved splits
    labeled_loader, unlabeled_loader, val_loader = get_ssl_loaders(
        meta_df=meta_df,
        labeled_fraction=labeled_fraction,
        fold=fold,
        batch_size=batch_size,
        split_dir=split_dir
    )

    # Reinitialize model and optimizer for each fold
    encoder = ResNet1DAudioEncoder(embed_dim=256).to(device)
    projectors = EPASSProjectors(embed_dim=256, proj_dim=128, num_proj=3).to(device)
    classifier = nn.Linear(256, 50).to(device)

    params = list(encoder.parameters()) + list(projectors.parameters()) + list(classifier.parameters())
    optimizer = torch.optim.Adam(params, lr=1e-3)

    best_metric = None
    best_acc = 0.0

    for epoch in range(1, num_epochs + 1):
        train_one_epoch(
            encoder,
            projectors,
            classifier,
            labeled_loader,
            unlabeled_loader,
            optimizer,
            device,
            epoch,
            confidence_thresh=0.5
        )
        metrics = evaluate(
            encoder,
            classifier,
            val_loader,
            device,
            num_classes=50,
            plot_loss_curves=False
        )

        acc = metrics["accuracy"]
        print(f"Validation Accuracy: {acc:.2f}%")
        if metrics["accuracy"] > best_acc:
            best_acc = metrics["accuracy"]
            best_metrics = metrics
            torch.save({
                'encoder_state_dict': encoder.state_dict(),
                'projectors_state_dict': projectors.state_dict(),
                'classifier_state_dict': classifier.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'fold': fold,
                'epoch': epoch,
                'metrics': best_metrics
            }, os.path.join(save_dir, f"best_model_fold{fold}.pt"))
            print(f"Saved best model for Fold {fold} at epoch {epoch}")

    fold_accuracies.append(best_acc)
    fold_metrics.append(best_metrics)
    print(f"Best accuracy for fold {fold}: {best_acc:.2f}%")

# Summary of all stats
print("\n=== 5-Fold Summary ===")
avg_metrics = {}

for i, m in enumerate(fold_metrics, 1):
    print(f"\nFold {i}:")
    for k, v in m.items():
        print(f"  {k}: {v:.4f}")
        avg_metrics[k] = avg_metrics.get(k, 0.0) + (v if v is not None else 0.0)

print("\nAverage Metrics Across Folds:")
for k, total in avg_metrics.items():
    avg_val = total / len(fold_metrics)
    print(f"  {k}: {avg_val:.4f}")




Using device: cpu

=== Fold 2 ===




  Pseudo-labels used: 0.0 / 16
[Epoch 1] Loss: 98.5026 | CE: 97.8990 | U: 0.4654 | Contrastive: 0.2765
AUC-PRC (macro): 0.1757
AUC-ROC (macro): 0.7978
Validation Accuracy: 10.75%
Saved best model for Fold 2 at epoch 1
  Pseudo-labels used: 0.0 / 16
[Epoch 2] Loss: 84.0869 | CE: 83.8346 | U: 0.2412 | Contrastive: 0.0222
AUC-PRC (macro): 0.1538
AUC-ROC (macro): 0.8013
Validation Accuracy: 9.00%
  Pseudo-labels used: 1.0 / 16
[Epoch 3] Loss: 78.0266 | CE: 77.4651 | U: 0.5585 | Contrastive: 0.0059
AUC-PRC (macro): 0.2332
AUC-ROC (macro): 0.8355
Validation Accuracy: 17.00%
Saved best model for Fold 2 at epoch 3
  Pseudo-labels used: 1.0 / 16
[Epoch 4] Loss: 70.2874 | CE: 69.5162 | U: 0.7690 | Contrastive: 0.0042
AUC-PRC (macro): 0.3454
AUC-ROC (macro): 0.8906
Validation Accuracy: 28.50%
Saved best model for Fold 2 at epoch 4
  Pseudo-labels used: 2.0 / 16
[Epoch 5] Loss: 64.4344 | CE: 62.9396 | U: 1.4934 | Contrastive: 0.0029
AUC-PRC (macro): 0.3548
AUC-ROC (macro): 0.8982
Validation Accura



  Pseudo-labels used: 0.0 / 16
[Epoch 1] Loss: 96.5360 | CE: 96.3922 | U: 0.0000 | Contrastive: 0.2876
AUC-PRC (macro): 0.1749
AUC-ROC (macro): 0.8209
Validation Accuracy: 12.00%
Saved best model for Fold 3 at epoch 1
  Pseudo-labels used: 0.0 / 16
[Epoch 2] Loss: 83.5525 | CE: 83.5245 | U: 0.0185 | Contrastive: 0.0191
AUC-PRC (macro): 0.2829
AUC-ROC (macro): 0.8686
Validation Accuracy: 17.00%
Saved best model for Fold 3 at epoch 2
  Pseudo-labels used: 0.0 / 16
[Epoch 3] Loss: 73.8096 | CE: 73.0257 | U: 0.7803 | Contrastive: 0.0072
AUC-PRC (macro): 0.3118
AUC-ROC (macro): 0.8859
Validation Accuracy: 20.75%
Saved best model for Fold 3 at epoch 3
  Pseudo-labels used: 1.0 / 16
[Epoch 4] Loss: 68.3760 | CE: 66.9902 | U: 1.3835 | Contrastive: 0.0046
AUC-PRC (macro): 0.3495
AUC-ROC (macro): 0.9298
Validation Accuracy: 26.00%
Saved best model for Fold 3 at epoch 4
  Pseudo-labels used: 5.0 / 16
[Epoch 5] Loss: 63.6408 | CE: 61.3858 | U: 2.2538 | Contrastive: 0.0023
AUC-PRC (macro): 0.3841
A



  Pseudo-labels used: 0.0 / 16
[Epoch 1] Loss: 96.9830 | CE: 96.7901 | U: 0.0594 | Contrastive: 0.2672
AUC-PRC (macro): 0.1618
AUC-ROC (macro): 0.7777
Validation Accuracy: 6.50%
Saved best model for Fold 4 at epoch 1
  Pseudo-labels used: 0.0 / 16
[Epoch 2] Loss: 84.8571 | CE: 84.6458 | U: 0.2013 | Contrastive: 0.0201
AUC-PRC (macro): 0.2328
AUC-ROC (macro): 0.8573
Validation Accuracy: 15.00%
Saved best model for Fold 4 at epoch 2
  Pseudo-labels used: 1.0 / 16
[Epoch 3] Loss: 74.5981 | CE: 73.7932 | U: 0.8020 | Contrastive: 0.0059
AUC-PRC (macro): 0.2750
AUC-ROC (macro): 0.8801
Validation Accuracy: 18.50%
Saved best model for Fold 4 at epoch 3
  Pseudo-labels used: 2.0 / 16
[Epoch 4] Loss: 70.1940 | CE: 69.3274 | U: 0.8645 | Contrastive: 0.0040
AUC-PRC (macro): 0.3107
AUC-ROC (macro): 0.8907
Validation Accuracy: 22.50%
Saved best model for Fold 4 at epoch 4
  Pseudo-labels used: 4.0 / 16
[Epoch 5] Loss: 66.9873 | CE: 64.9641 | U: 2.0215 | Contrastive: 0.0035
AUC-PRC (macro): 0.3687
AU



  Pseudo-labels used: 0.0 / 16
[Epoch 1] Loss: 99.1575 | CE: 98.5353 | U: 0.4968 | Contrastive: 0.2507
AUC-PRC (macro): 0.1562
AUC-ROC (macro): 0.7708
Validation Accuracy: 9.50%
Saved best model for Fold 5 at epoch 1
  Pseudo-labels used: 1.0 / 16
[Epoch 2] Loss: 84.6941 | CE: 84.5742 | U: 0.1112 | Contrastive: 0.0173
AUC-PRC (macro): 0.2218
AUC-ROC (macro): 0.8183
Validation Accuracy: 13.75%
Saved best model for Fold 5 at epoch 2
  Pseudo-labels used: 1.0 / 16
[Epoch 3] Loss: 75.2123 | CE: 74.9844 | U: 0.2256 | Contrastive: 0.0045
AUC-PRC (macro): 0.2010
AUC-ROC (macro): 0.8169
Validation Accuracy: 16.75%
Saved best model for Fold 5 at epoch 3
  Pseudo-labels used: 1.0 / 16
[Epoch 4] Loss: 73.8309 | CE: 72.5615 | U: 1.2681 | Contrastive: 0.0028
AUC-PRC (macro): 0.3109
AUC-ROC (macro): 0.8875
Validation Accuracy: 21.00%
Saved best model for Fold 5 at epoch 4
  Pseudo-labels used: 3.0 / 16
[Epoch 5] Loss: 65.3597 | CE: 62.9537 | U: 2.4046 | Contrastive: 0.0028
AUC-PRC (macro): 0.3704
AU