# This is the code for the solution proposed by THE UNCLES team concerning the sound classification challenge

### The given folders structure:

 - train:
 - 10 subfolders that contain soundfiles of the sound classes

#### The sound files are short clips, and every class subfolder contains nearly the same number of files, so the classes are balanced

### In order to produce better results, we will use some data augmentation techniques too

- We will process audio files into **Mel Spectrograms**, a visual representation of sound.
- The model is based on a **ResNet-18** that is not pretrained (it is trained from scratch).
- We implement **Mixup augmentation**, a data augmentation technique to improve generalization.
- We also use **early stopping and learning rate decay** to optimize training.

---

# I-Importing Dependencies

In [1]:
import os
import numpy as np 
import torch # for dl
import torchaudio # for audio processing
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split # for handling data loading and splitting
from torchaudio.transforms import MelSpectrogram, AmplitudeToDB, TimeMasking, FrequencyMasking # for audio processing 
from sklearn.metrics import f1_score # the used metric in the challenge
from tqdm import tqdm # for progress bar visualization
import torchvision.models as models # for using model architectures


## II-Configuration setup

These are the key **hyperparameters** for training:

- `sample_rate`: The number of samples per second in an audio clip.
- `n_mels`: Number of mel-frequency bins in spectrograms.
- `batch_size`: Number of samples per batch for training.
- `epochs`: Total training cycles.
- `learning_rate`: Step size for optimizer updates.
- `train_split`: Proportion of data used for training.
- `patience`: How long to wait before early stopping.


In [None]:
# Hyperparameters shared by the dataset, the model and the training loop.
CONFIG = dict(
    # --- audio front-end ---
    sample_rate=16000,       # samples per second expected by the pipeline
    n_mels=128,              # mel-frequency bins per spectrogram
    n_fft=1024,              # FFT window size
    hop_length=512,          # hop between successive FFT windows
    # --- optimisation ---
    batch_size=32,
    epochs=60,
    learning_rate=1e-4,
    audio_duration=2,        # clip length in seconds after cropping/padding
    num_classes=10,
    train_split=0.8,         # fraction of the data used for training
    patience=10,             # early-stopping patience
    lr_decay_factor=0.5,
    lr_decay_patience=2,
    weight_decay=5e-4,
    mixup_alpha=0.4,         # Mixup hyperparameter (higher means stronger mixup)
)

## iii-Defining the AudioDataset class

In [None]:
class AudioDataset(Dataset):
    """Dataset that turns audio files into standardized dB-scaled mel spectrograms.

    Each item is loaded from disk, mixed down to one channel, resampled to
    ``CONFIG["sample_rate"]`` when needed, cropped or zero-padded to
    ``CONFIG["audio_duration"]`` seconds, converted to a mel spectrogram in
    dB and standardized with its own mean and standard deviation.

    Args:
        file_paths: Sequence of paths to the audio files.
        labels: Sequence of integer class labels, aligned with ``file_paths``.
        train: When true, random augmentations are applied (random crop,
            circular time shift, gain change, time and frequency masking).
    """

    def __init__(self, file_paths, labels, train=True):
        self.file_paths = file_paths
        self.labels = labels
        self.train = train

        self.mel_spec_transform = MelSpectrogram(
            sample_rate=CONFIG["sample_rate"],
            n_fft=CONFIG["n_fft"],
            hop_length=CONFIG["hop_length"],
            n_mels=CONFIG["n_mels"]
        )
        self.amp_to_db = AmplitudeToDB(top_db=80)

        # Spectrogram augmentations (SpecAugment)
        self.time_mask = TimeMasking(time_mask_param=20)
        self.freq_mask = FrequencyMasking(freq_mask_param=10)

    def __len__(self):
        """Return the number of audio files.

        Required by ``len(dataset)``, ``random_split`` and ``DataLoader``.
        """
        return len(self.file_paths)

    def __getitem__(self, idx):
        """Return ``(mel_spec, label)`` for the file at position ``idx``."""
        audio_path = self.file_paths[idx]
        label = self.labels[idx]

        waveform, sr = torchaudio.load(audio_path)

        # The model takes a single input channel, so multi-channel files are
        # averaged down to mono.
        if waveform.size(0) > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        if sr != CONFIG["sample_rate"]:
            waveform = torchaudio.transforms.Resample(sr, CONFIG["sample_rate"])(waveform)

        # Ensure consistent length
        target_length = int(CONFIG["audio_duration"] * CONFIG["sample_rate"])
        if waveform.size(1) > target_length:
            if self.train:
                # randint's upper bound is exclusive; +1 makes the last valid
                # start offset reachable.
                start = np.random.randint(0, waveform.size(1) - target_length + 1)
            else:
                start = 0
            waveform = waveform[:, start:start + target_length]
        else:
            waveform = F.pad(waveform, (0, target_length - waveform.size(1)))

        # ----- Waveform Augmentations -----
        if self.train:
            # Circular shift by a random number of samples in [-1600, 1600).
            waveform = torch.roll(waveform, shifts=np.random.randint(-1600, 1600), dims=1)
            # Random gain between 0.8 and 1.2.
            waveform = waveform * np.random.uniform(0.8, 1.2)
        # -----------------------------------

        mel_spec = self.mel_spec_transform(waveform)
        mel_spec = self.amp_to_db(mel_spec)
        mel_spec = (mel_spec - mel_spec.mean()) / (mel_spec.std() + 1e-8)  # Normalize

        if self.train:
            mel_spec = self.time_mask(mel_spec)
            mel_spec = self.freq_mask(mel_spec)

        return mel_spec, label

**Mixup** is a data augmentation technique where two training samples are mixed together using a random weight **λ**.

- It helps improve model generalization.
- It prevents the model from becoming overconfident.
- Instead of training on a single label, the model learns a weighted combination of labels.


In [None]:
# Mixup augmentation
def mixup_data(x, y, alpha=None):
    """Mix every sample of a batch with another sample of the same batch.

    Args:
        x: Batch of inputs; the first dimension is the batch dimension.
        y: Batch of labels aligned with ``x``.
        alpha: Parameter of the ``Beta(alpha, alpha)`` distribution the mixing
            weight is drawn from. ``None`` (the default) means
            ``CONFIG["mixup_alpha"]``, looked up at call time. A value
            ``<= 0`` disables mixing (``lam`` is 1).

    Returns:
        ``(mixed_x, y_a, y_b, lam)`` where
        ``mixed_x = lam * x + (1 - lam) * x[index]``, ``y_a`` is ``y``,
        ``y_b`` is ``y[index]`` and ``index`` is a random permutation of the
        batch.
    """
    if alpha is None:
        alpha = CONFIG["mixup_alpha"]

    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1

    batch_size = x.size(0)
    # Use the input's own device rather than a module-level `device` global,
    # which may not exist yet when this function is called.
    index = torch.randperm(batch_size).to(x.device)

    mixed_x = lam * x + (1 - lam) * x[index]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam

def mixup_criterion(criterion, pred, y_a, y_b, lam):
    """Blend the loss against both label sets with the mixup weight.

    Returns ``lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)``.
    """
    loss_a = criterion(pred, y_a)
    loss_b = criterion(pred, y_b)
    return lam * loss_a + (1 - lam) * loss_b

## iv-Function to display a spectrogram

In [None]:
def plot_spectrogram(file_path):
    """Display the log2 mel spectrogram of the first channel of an audio file.

    The file is resampled to ``CONFIG["sample_rate"]`` when its native rate
    differs, so the plot uses the same mel configuration as the dataset.

    Args:
        file_path: Path of the audio file to plot.
    """
    # Imported locally: the notebook's import cell never brings `plt` into scope.
    import matplotlib.pyplot as plt

    waveform, sr = torchaudio.load(file_path)
    if sr != CONFIG["sample_rate"]:
        waveform = torchaudio.transforms.Resample(sr, CONFIG["sample_rate"])(waveform)

    mel_spec_transform = MelSpectrogram(
        sample_rate=CONFIG["sample_rate"],
        n_fft=CONFIG["n_fft"],
        hop_length=CONFIG["hop_length"],
        n_mels=CONFIG["n_mels"]
    )
    mel_spec = mel_spec_transform(waveform)

    # A small offset keeps log2 finite where the spectrogram is exactly zero.
    log_mel = (mel_spec + 1e-9).log2()

    plt.figure(figsize=(10, 4))
    plt.imshow(log_mel[0].numpy(), cmap="inferno", aspect="auto")
    plt.title("Mel Spectrogram")
    plt.show()


## v-Defining the model architecture

In [None]:
class EfficientResNetAudio(nn.Module):
    """ResNet-18 without pretrained weights, adapted to spectrogram input.

    The first convolution is replaced so the network accepts
    ``input_channels`` channels, and the final fully connected layer is
    replaced by ``Dropout(0.3)`` followed by a linear layer with
    ``num_classes`` outputs.

    Args:
        num_classes: Number of output classes.
        input_channels: Number of channels of the input spectrogram.
    """

    def __init__(self, num_classes=10, input_channels=1):
        super().__init__()
        # `weights=None` replaces the deprecated `pretrained=False` and matches
        # how the submission cell of this notebook builds its ResNet.
        self.resnet = models.resnet18(weights=None)
        self.resnet.conv1 = nn.Conv2d(input_channels, 64, kernel_size=7, stride=2, padding=3, bias=False)
        in_features = self.resnet.fc.in_features
        self.resnet.fc = nn.Sequential(
            nn.Dropout(0.3),
            nn.Linear(in_features, num_classes)
        )

    def forward(self, x):
        """Return the class logits produced by the wrapped ResNet for ``x``."""
        return self.resnet(x)


## vii-Defining the training epoch

In [None]:
def train_epoch(model, train_loader, criterion, optimizer, device):
    """Train ``model`` for one pass over ``train_loader`` using mixup.

    Args:
        model: Network to train; it is switched to training mode.
        train_loader: Iterable yielding ``(data, target)`` batches.
        criterion: Loss function called as ``criterion(output, target)``.
        optimizer: Optimizer updating the model parameters.
        device: Device the batches are moved to.

    Returns:
        ``(mean_loss, macro_f1)``: the loss averaged over the batches and the
        macro F1 of the predictions on the mixed inputs, scored against the
        label of the dominant sample of each mix.
    """
    model.train()
    total_loss, all_preds, all_labels = 0, [], []

    for data, target in tqdm(train_loader, desc="Training"):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()

        # Apply Mixup augmentation
        mixed_data, targets_a, targets_b, lam = mixup_data(data, target, CONFIG["mixup_alpha"])
        output = model(mixed_data)

        loss = mixup_criterion(criterion, output, targets_a, targets_b, lam)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        total_loss += loss.item()

        # With lam < 0.5 the permuted sample contributes most of the input, so
        # scoring against the un-permuted labels would under-report the F1.
        dominant_targets = targets_a if lam >= 0.5 else targets_b
        all_preds.extend(output.argmax(dim=1).cpu().numpy())
        all_labels.extend(dominant_targets.cpu().numpy())

    return total_loss / len(train_loader), f1_score(all_labels, all_preds, average='macro')

## viii-Model training

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

data_dir = "/kaggle/input/urban-sound-classification/train"
file_paths, labels = [], []

# One sub-folder per class; the sorted folder order defines the label index.
class_names = sorted(
    name for name in os.listdir(data_dir)
    if os.path.isdir(os.path.join(data_dir, name))
)
for label_idx, class_name in enumerate(class_names):
    class_dir = os.path.join(data_dir, class_name)
    for file_name in os.listdir(class_dir):
        file_paths.append(os.path.join(class_dir, file_name))
        labels.append(label_idx)

# Split the indices first and build two datasets, so the validation split is
# created with train=False and is not randomly augmented.
num_samples = len(file_paths)
train_size = int(CONFIG["train_split"] * num_samples)
val_size = num_samples - train_size
shuffled_indices = torch.randperm(num_samples).tolist()
train_indices = shuffled_indices[:train_size]
val_indices = shuffled_indices[train_size:]

train_dataset = AudioDataset(
    [file_paths[i] for i in train_indices],
    [labels[i] for i in train_indices],
    train=True,
)
val_dataset = AudioDataset(
    [file_paths[i] for i in val_indices],
    [labels[i] for i in val_indices],
    train=False,
)

train_loader = DataLoader(train_dataset, batch_size=CONFIG["batch_size"], shuffle=True, num_workers=4, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=CONFIG["batch_size"], shuffle=False, num_workers=4, pin_memory=True)

model = EfficientResNetAudio(num_classes=CONFIG["num_classes"]).to(device)
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
optimizer = torch.optim.AdamW(model.parameters(), lr=CONFIG["learning_rate"], weight_decay=CONFIG["weight_decay"])

# Learning-rate decay driven by the validation F1 (higher is better).
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode="max",
    factor=CONFIG["lr_decay_factor"],
    patience=CONFIG["lr_decay_patience"],
)


def _validate(model, loader, device):
    """Return the macro F1 of ``model`` on ``loader`` without updating it."""
    model.eval()
    preds, targets = [], []
    with torch.no_grad():
        for data, target in loader:
            output = model(data.to(device))
            preds.extend(output.argmax(dim=1).cpu().numpy())
            targets.extend(target.numpy())
    return f1_score(targets, preds, average='macro')


best_val_f1 = -1.0
epochs_without_improvement = 0

for epoch in range(CONFIG["epochs"]):
    train_loss, train_f1 = train_epoch(model, train_loader, criterion, optimizer, device)
    val_f1 = _validate(model, val_loader, device)
    scheduler.step(val_f1)
    print(
        f"Epoch {epoch + 1}/{CONFIG['epochs']} - "
        f"train loss: {train_loss:.4f} - train F1: {train_f1:.4f} - val F1: {val_f1:.4f}"
    )

    if val_f1 > best_val_f1:
        # Keep the best checkpoint; the evaluation and submission cells load it.
        best_val_f1 = val_f1
        epochs_without_improvement = 0
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        epochs_without_improvement += 1
        if epochs_without_improvement >= CONFIG["patience"]:
            print(f"Early stopping after {epoch + 1} epochs (best val F1: {best_val_f1:.4f})")
            break


### ix-Model Evaluation

In [None]:
# map_location lets a checkpoint saved on GPU be loaded on a CPU-only machine.
model.load_state_dict(torch.load('best_model.pth', map_location=device))
model.eval()

def test(model, test_loader, device):
    """Print and return the macro F1 of ``model`` on ``test_loader``.

    Args:
        model: Network to evaluate; it is switched to evaluation mode.
        test_loader: Iterable yielding ``(data, target)`` batches.
        device: Device the batches are moved to.

    Returns:
        The macro-averaged F1 score over all batches.
    """
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            all_preds.extend(output.argmax(dim=1).cpu().numpy())
            all_labels.extend(target.cpu().numpy())

    f1 = f1_score(all_labels, all_preds, average='macro')
    print(f"Test F1 Score: {f1:.4f}")
    return f1

# No labelled test loader is built in this notebook, so the held-out
# validation split is used for the evaluation.
test(model, val_loader, device)


## Script for building the submission

In [None]:

import os
import torch
import torchaudio
import pandas as pd
import torch.nn as nn
import torchvision.models as models
from torchaudio.transforms import MelSpectrogram

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Config
# Rebinding CONFIG replaces the training configuration, so the keys that the
# dataset and model code read at call time are kept here with the same values.
CONFIG = {
    "sample_rate": 16000,
    "n_mels": 128,
    "n_fft": 1024,
    "hop_length": 512,
    "batch_size": 32,
    "audio_duration": 2,   # seconds per clip, same as during training
    "num_classes": 10,
    "model_path": "best_model.pth",  # Update based on the best saved model
}

# Class Labels
# Scene names; a predicted index is translated back to a name through this list.
# NOTE(review): presumably this matches the sorted training folder names - confirm.
CLASSES = [
    "airport",
    "bus",
    "metro",
    "metro_station",
    "park",
    "public_square",
    "shopping_mall",
    "street_pedestrian",
    "street_traffic",
    "tram",
]
# Despite its name, this maps a class index to its label string.
class_to_idx = dict(enumerate(CLASSES))

# ResNet Model for Audio
class ResNetAudio(nn.Module):
    """ResNet-18 with the same layer layout as the model trained above.

    The checkpoint is produced by a network whose first convolution takes
    ``input_channels`` channels and whose head is ``Dropout(0.3)`` followed
    by a linear layer. This class reproduces that layout so that
    ``load_state_dict`` finds matching parameter names and shapes.

    Args:
        num_classes: Number of output classes.
        input_channels: Number of channels of the input spectrogram.
    """

    def __init__(self, num_classes=10, input_channels=1):
        super().__init__()
        self.resnet = models.resnet18(weights=None)
        self.resnet.conv1 = nn.Conv2d(input_channels, 64, kernel_size=7, stride=2, padding=3, bias=False)
        in_features = self.resnet.fc.in_features
        self.resnet.fc = nn.Sequential(
            nn.Dropout(0.3),
            nn.Linear(in_features, num_classes)
        )

    def forward(self, x):
        """Return the class logits produced by the wrapped ResNet for ``x``."""
        return self.resnet(x)

# Load Model
model = ResNetAudio(num_classes=10).to(device)
model.load_state_dict(torch.load(CONFIG["model_path"], map_location=device))
model.eval()

# Function to process test audio file
def process_audio(filepath):
    """Load an audio file and return its spectrogram, preprocessed as in validation.

    The steps mirror the non-augmented path of the training dataset: mono
    waveform, resampling to ``CONFIG["sample_rate"]``, crop from the start or
    zero-pad to the clip duration, mel spectrogram in dB (``top_db=80``) and
    per-sample standardization.

    Args:
        filepath: Path of the audio file.

    Returns:
        A single-channel spectrogram tensor, without a batch dimension.
    """
    waveform, sr = torchaudio.load(filepath)

    # The model takes one input channel: average multi-channel audio to mono.
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    if sr != CONFIG["sample_rate"]:
        waveform = torchaudio.transforms.Resample(sr, CONFIG["sample_rate"])(waveform)

    # Same fixed clip length as in training; falls back to 2 seconds when the
    # key is absent from CONFIG.
    target_length = int(CONFIG.get("audio_duration", 2) * CONFIG["sample_rate"])
    if waveform.size(1) > target_length:
        waveform = waveform[:, :target_length]
    else:
        waveform = torch.nn.functional.pad(waveform, (0, target_length - waveform.size(1)))

    mel_spec = MelSpectrogram(
        sample_rate=CONFIG["sample_rate"],
        n_fft=CONFIG["n_fft"],
        hop_length=CONFIG["hop_length"],
        n_mels=CONFIG["n_mels"]
    )(waveform)

    # dB scaling and standardization, as applied to the training inputs.
    mel_spec = torchaudio.transforms.AmplitudeToDB(top_db=80)(mel_spec)
    mel_spec = (mel_spec - mel_spec.mean()) / (mel_spec.std() + 1e-8)

    return mel_spec

# Predict labels for test data
test_dir = "/kaggle/input/urban-sound-classification/test"
# Sorted so the rows of the submission come out in a stable order.
test_files = sorted(os.listdir(test_dir))
predictions = []

# Inference only: gradients are not needed for any of the forward passes.
with torch.no_grad():
    for filename in test_files:
        features = process_audio(os.path.join(test_dir, filename))
        batch = features.unsqueeze(0).to(device)  # add the batch dimension
        logits = model(batch)
        pred_label = logits.argmax(dim=1).item()
        predictions.append([filename, class_to_idx[pred_label]])

# Write one row per test file: file name and predicted scene label.
submission_df = pd.DataFrame(predictions, columns=["filename", "scene_label"])
submission_df.to_csv("submission.csv", index=False)

print("✅ Predictions saved to submission.csv")
