In [1]:
import random
import os
import numpy as np
import sklearn
import torch
from torch.cuda import manual_seed_all
from torch import manual_seed as torch_manual_seed
from torch.backends import cudnn
import matplotlib as mpl
from matplotlib import pyplot as plt
import torchaudio
import torchaudio.transforms as T

In [2]:
# pre spectrogram augmentations
# these are examples and can be changed based on domain knowledge

def stretch_waveform(waveform, rate=1.2):
    time_stretch = T.TimeStretch()
    # `rate > 1.0` speeds up, `rate < 1.0` slows down
    return time_stretch(waveform, rate)

def shift_pitch(waveform, sample_rate=44100, n_steps = 2):
    pitch_shift = T.PitchShift(sample_rate, n_steps)  # Shift up by 2 semitones
    return pitch_shift(waveform)

def scale_volume(waveform, factor = None):
    if factor is None:
        waveform *= torch.FloatTensor(1).uniform_(0.8, 1.5).item()  # Amplifies waveform by random factor
    else:
        waveform *= factor
    return waveform

def crop_waveform(waveform, crop_size):
    start = torch.randint(0, max(1, waveform.size(-1) - crop_size), (1,)).item()
    return waveform[:, start:start + crop_size]

def apply_reverb(waveform):
    reverb = T.Reverberate()
    return reverb(waveform)

def time_shift(waveform, shift):
    return torch.roll(waveform, shifts=shift, dims=-1)

def add_noise(waveform, noise_level=0.005):
    noise = torch.randn_like(waveform) * noise_level
    return waveform + noise

# Augment on-the-fly stochastically
# again these are just examples and do not necessarily utilize the methods above
def augment_waveform(data):
    waveform, sample_rate = data
    if torch.rand(1).item() > 0.9:
        waveform = add_noise(waveform)
    if torch.rand(1).item() > 0.9:
        waveform = time_shift(waveform, shifts=torch.randint(-waveform.size(-1) // 2, waveform.size(-1) // 2, (1,)).item())
    if torch.rand(1).item() > 0.9:
        waveform = scale_volume(waveform)
    if torch.rand(1).item() > 0.9:
        waveform = apply_reverb(waveform)
    if torch.rand(1).item() > 0.9:
        waveform = shift_pitch(waveform, sample_rate, n_steps= torch.randint(-12, 12, (1,)).item())
    if torch.rand(1).item() > 0.9:
        waveform = stretch_waveform(waveform, rate= torch.FloatTensor(1).uniform_(0.5, 1.5).item())
    return waveform, sample_rate


In [3]:
# Create a MelSpectrogram transformation
mel_spectrogram_transform = T.MelSpectrogram(
    sample_rate=44100,         # Default sample rate, change if needed
    n_fft=1024,                # Number of FFT bins
    hop_length=512,            # Hop length between windows
    n_mels=64                  # Number of Mel bands
)

def waveform_to_spectrogram(data):
    waveform, sample_rate = data
    spectrogram = mel_spectrogram_transform(waveform)  # Apply the spectrogram transformation
    return spectrogram

In [4]:
# post spectrogram augmentations

# Example augmentations, could add more
time_mask = T.TimeMasking(time_mask_param=10)

freq_mask = T.FrequencyMasking(freq_mask_param=8)

# hybridizes two sounds
def mixup(spectrogram1, spectrogram2, alpha=0.2):
    lam = torch.FloatTensor(1).uniform_(0, alpha).item()
    return lam * spectrogram1 + (1 - lam) * spectrogram2

# should probably implement a randomization process like above
def augment_spectrogram(spectrogram):
    augmented = time_mask(spectrogram)  # Apply time masking
    augmented = freq_mask(augmented)   # Apply frequency masking
    return augmented
    

In [5]:
# Decode audio files
def decode_audio(file_tuple):
    file_path, file = file_tuple
    waveform, sample_rate = torchaudio.load(file_path)
    return waveform, sample_rate

In [6]:
import os
import torchaudio
from torch.utils.data import Dataset, DataLoader
import pandas as pd

class UrbanSoundDataset(Dataset):
    def __init__(self, audio_path, fold, csv_path, transform=None):
        self.audio_path = os.path.join(audio_path, f"fold{fold}")
        self.file_list = [os.path.join(self.audio_path, f) for f in os.listdir(self.audio_path) if f.endswith(".wav")]
        self.transform = transform

        # Load the metadata CSV file
        self.metadata = pd.read_csv(csv_path)

    def get_label(self, file_name):
        """Fetch the class label for a given file name from the metadata."""
        label_row = self.metadata.loc[self.metadata['slice_file_name'] == file_name, 'class']
        if not label_row.empty:
            return label_row.values[0]
        else:
            raise ValueError(f"File name {file_name} not found in metadata CSV.")

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        # Load the audio file
        file_path = self.file_list[idx]
        waveform, sample_rate = torchaudio.load(file_path)

        # Convert mono to stereo if necessary
        if waveform.size(0) == 1:  # If mono
            waveform = waveform.repeat(3, 1)

        # Apply transformations
        if self.transform:
            waveform = self.transform(waveform)

        # Extract the file name from the path
        file_name = os.path.basename(file_path)

        # Get the corresponding label for the file
        label = self.get_label(file_name)

        return waveform, label


In [7]:
import torchaudio.transforms as T

# Example transformations
def augment_waveform(waveform):
    # Add your augmentation logic here (e.g., noise addition, time stretch, etc.)
    return waveform

waveform_to_spectrogram = T.MelSpectrogram(sample_rate=16000, n_mels=128)
augment_spectrogram = T.AmplitudeToDB()

# Combine transformations into a callable function
def transform_pipeline(waveform):
    waveform = augment_waveform(waveform)
    spectrogram = waveform_to_spectrogram(waveform)
    # spectrogram = augment_spectrogram(spectrogram)
    return spectrogram

def pad_with_noise(spectrogram, max_time, noise_std=0.01):
    """
    Pads a spectrogram with Gaussian noise instead of zeros.

    Args:
        spectrogram (Tensor): Shape (channels, freq_bins, time_steps)
        max_time (int): Target time dimension
        noise_std (float): Standard deviation of the Gaussian noise

    Returns:
        Tensor: Padded spectrogram with noise
    """
    # Compute how much padding is needed
    pad_amount = max_time - spectrogram.size(2)
    
    if pad_amount > 0:
        # Generate random noise matching the shape of missing time steps
        noise = torch.randn((spectrogram.size(0), spectrogram.size(1), pad_amount)) * noise_std
        
        # Concatenate noise along the time axis
        spectrogram = torch.cat([spectrogram, noise], dim=2)
    
    return spectrogram

# def convert_to_three_channels(spectrogram):
#     # Convert [2, 224, 224] to [3, 224, 224]
#     if spectrogram.size(0) == 2:
#         # Duplicate the first channel to create a third channel
#         return torch.cat((spectrogram, spectrogram[0:1, :, :]), dim=0)
#     return spectrogram

def convert_to_three_channels(spectrogram):
    # Convert [2, 224, 224] to [3, 224, 224]
    if spectrogram.size(0) == 2:
        # Calculate the mean of the two channels
        mean_channel = torch.mean(spectrogram, dim=0, keepdim=True)
        # Concatenate the mean channel as the third channel
        return torch.cat((spectrogram, mean_channel), dim=0)
    return spectrogram



In [8]:
import torch
import torchvision
from torch import nn

class densenet(torch.nn.Module):
    """
    DenseNet Class, derived from Pytorch. Intended for model manipulation (i.e. unfreezing layers, etc.)
    To use model, try (densenet).model(data)
    May change to reflect manual implementation of densenet161.
    """
    def __init__(self, weights = "DEFAULT", drop = 0.5):
        super().__init__()  # Initialize the nn.Module base class
        self.model = torchvision.models.densenet161(weights = weights)
        
        num_features = self.model.classifier.in_features
        self.model.classifier = nn.Sequential(
            nn.Dropout(drop),  # Add dropout with 50% probability
            nn.Linear(num_features, 10)  # Adjust for 10 output classes (UrbanSound8k)
        )
        
        # Ensure classifier is trainable
        for param in self.model.classifier.parameters():
            param.requires_grad = True


    def forward(self, x):
        return self.model(x)  # Delegate forward pass to the original DenseNet

    def layer_change(self, layer=0):
        if layer > 0:    
            # Freeze earlier layers (optional)
            for name, param in self.model.features.named_parameters():
                if "conv0" in name or "denseblock1" in name:  # Freeze initial layers and denseblock1
                    param.requires_grad = False
        if layer > 1:    
            # Freeze earlier layers (optional)
            for name, param in self.model.features.named_parameters():
                if "denseblock2" in name:  # Freeze initial layers and denseblock2
                    param.requires_grad = False
        if layer > 2:    
            # Freeze earlier layers (optional)
            for name, param in self.model.features.named_parameters():
                if "denseblock3" in name:  # Freeze initial layers and denseblock3
                    param.requires_grad = False
        if layer > 3:    
            # Freeze earlier layers (optional)
            for name, param in self.model.features.named_parameters():
                if "denseblock4" in name:  # Freeze initial layers and denseblock4
                    param.requires_grad = False

In [9]:
# model = densenet()
# for param in model.model.features.named_parameters():
#     print(param[0])

In [10]:
# Define training and testing loops

def train_loop(train_dataloader, val_dataloader, model, loss_fn, optimizer, scheduler=None, epochs=1):
    model.train()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Store metrics
    epoch_train_losses = []  # Track training loss across epochs
    epoch_val_losses = []  # Track validation loss across epochs
    epoch_val_accuracies = []  # Track validation accuracy across epochs

    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        size = len(train_dataloader.dataset)
        total_loss = 0  # Initialize variable to accumulate training loss

        for batch, (X, y) in enumerate(train_dataloader):
            # Compute prediction and loss
            X = X.to(device)
            y = y.to(device)
            pred = model(X)
            loss = loss_fn(pred, y)

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Accumulate loss
            total_loss += loss.item()

            # Print progress periodically
            total_batches = len(train_dataloader)
            if batch % (total_batches // 5) == 0:  # Prints 5 times per epoch
                current = (batch + 1) * len(X)
                print(f"loss: {loss.item():>7f}  [{current:>5d}/{size:>5d}]")

        # Average training loss for the epoch
        avg_train_loss = total_loss / len(train_dataloader)
        print(f"Training Loss (Epoch): {avg_train_loss:>7f}")
        epoch_train_losses.append(avg_train_loss)

        # **Validation Step**
        print("Validating...")
        avg_val_loss, val_accuracy = test_loop(val_dataloader, model, loss_fn, verbose=False)
        print(f"Validation Loss: {avg_val_loss:.6f}, Validation Accuracy: {val_accuracy * 100:.2f}%")

        # Track validation metrics
        epoch_val_losses.append(avg_val_loss)
        epoch_val_accuracies.append(val_accuracy)

        if scheduler is not None:
            scheduler.step(val_accuracy)
            print(f"Learning Rate: {scheduler.get_last_lr()}")

    # Return metrics for tracking/aggregation across folds
    return epoch_train_losses, epoch_val_losses, epoch_val_accuracies

def test_loop(dataloader, model, loss_fn, verbose=True):
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            X = X.to(device)
            y = y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    # Average loss and accuracy for this fold
    avg_test_loss = test_loss / num_batches
    accuracy = correct / size
    if verbose:
        print(f"Test Error: \n Accuracy: {(100*accuracy):>0.1f}%, Avg loss: {avg_test_loss:>8f} \n")
    return avg_test_loss, accuracy  # Return both average loss and accuracy for this fold

In [11]:
import torchvision.transforms as transforms

def custom_collate_fn(batch):
    # Resize and normalize for DenseNet
    resize_transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize for DenseNet
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Standard ImageNet normalization
    ])
    
    inputs, labels = zip(*batch)  # Separate inputs and labels
    max_time = max(spectrogram.size(2) for spectrogram in inputs)

    # Pad inputs to the same length along the time dimension
    padded_inputs = [
        torch.nn.functional.pad(input, (0, max_time - input.size(2)))
        for input in inputs
    ]

    # Convert to 3 channels and resize
    resized_inputs = [resize_transform(convert_to_three_channels(input)) for input in padded_inputs]
    
    # Map labels to numeric class IDs
    class_mapping = {
        "air_conditioner": 0,
        "car_horn": 1,
        "children_playing": 2,
        "dog_bark": 3,
        "drilling": 4,
        "engine_idling": 5,
        "gun_shot": 6,
        "jackhammer": 7,
        "siren": 8,
        "street_music": 9
    }

    numeric_labels = [class_mapping[label] for label in labels]

    # Stack inputs and labels
    return torch.stack(resized_inputs), torch.tensor(numeric_labels)


In [12]:
print(os.getcwd())

/sfs/gpfs/tardis/home/asm2fe/UrbanAdversary


In [13]:
from sklearn.model_selection import train_test_split
import torch

# Specify paths and batch size
AUDIO_PATH = "./UrbanSound8K/audio"
CSV_PATH = "./UrbanSound8K/metadata/UrbanSound8K.csv"
batch_size = 70
epochs = 25

def setup_seed(seed):
    torch_manual_seed(seed)
    manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    cudnn.deterministic = True

def dense_tune(layer = 0, lr = 2e-4, weight_decay = 0.2, weights = "DEFAULT", drop = 0.5, SEED = 666):
    loss_fn = torch.nn.CrossEntropyLoss()
    setup_seed(SEED)

    # Variables to accumulate metrics across folds
    fold_train_losses = []
    fold_val_losses = []
    fold_val_accuracies = []

    # Loop through folds
    for fold in range(1, 11):
        model = densenet(weights = weights, drop = drop)
        model.layer_change(layer = layer) # freeze first conv and dense block(s) if desired

        print(f"Processing Fold {fold}")
        optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay = weight_decay)
        # optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001, weight_decay = 0.01, momentum = 0.9)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='max', factor=0.5, patience=0, threshold_mode = 'abs', threshold= 0.01, min_lr=1e-9)

        # Initialize dataset and DataLoader
        dataset = UrbanSoundDataset(audio_path=AUDIO_PATH, fold=fold, transform=transform_pipeline, csv_path=CSV_PATH)
        train_indices, val_indices = train_test_split(list(range(len(dataset))), test_size=0.2, random_state=42)

        train_dataset = torch.utils.data.Subset(dataset, train_indices)
        val_dataset = torch.utils.data.Subset(dataset, val_indices)

        train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate_fn)
        val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=custom_collate_fn)

        # Train and validate (over multiple epochs per fold)
        epoch_train_losses, epoch_val_losses, epoch_val_accuracies = train_loop(
            train_dataloader, val_dataloader, model, loss_fn, optimizer, scheduler, epochs=epochs
        )

        # Aggregate fold-level metrics (e.g., last epoch metrics)
        fold_train_losses.append(epoch_train_losses[-1])  # Last epoch's training loss
        fold_val_losses.append(epoch_val_losses[-1])  # Last epoch's validation loss
        fold_val_accuracies.append(epoch_val_accuracies[-1])  # Last epoch's validation accuracy

    # Compute average metrics across folds
    mean_train_loss = sum(fold_train_losses) / len(fold_train_losses)
    mean_val_loss = sum(fold_val_losses) / len(fold_val_losses)
    mean_val_accuracy = sum(fold_val_accuracies) / len(fold_val_accuracies)

    print(f"\nCross-Validation Results:")
    print(f"Avg Training Loss: {mean_train_loss:.6f}")
    print(f"Avg Validation Loss: {mean_val_loss:.6f}")
    print(f"Avg Validation Accuracy: {mean_val_accuracy * 100:.2f}%")


In [14]:

dense_tune(layer = 0, lr = 2e-4, weight_decay = 0.2, weights = "DEFAULT", drop = 0.5)

Processing Fold 1
Epoch 1/25
loss: 2.331131  [   70/  698]
loss: 2.014276  [  210/  698]
loss: 1.880963  [  350/  698]
loss: 1.672783  [  490/  698]
loss: 1.541667  [  630/  698]
Training Loss (Epoch): 1.889011
Validating...
Validation Loss: 1.856203, Validation Accuracy: 45.14%
Learning Rate: [0.0002]
Epoch 2/25
loss: 1.809719  [   70/  698]
loss: 1.277156  [  210/  698]
loss: 1.275134  [  350/  698]
loss: 1.094460  [  490/  698]
loss: 0.964627  [  630/  698]
Training Loss (Epoch): 1.189580
Validating...
Validation Loss: 0.775463, Validation Accuracy: 69.14%
Learning Rate: [0.0002]
Epoch 3/25
loss: 0.608593  [   70/  698]
loss: 0.521410  [  210/  698]
loss: 0.532534  [  350/  698]
loss: 0.507833  [  490/  698]
loss: 0.704651  [  630/  698]
Training Loss (Epoch): 0.627951
Validating...
Validation Loss: 0.766771, Validation Accuracy: 72.57%
Learning Rate: [0.0002]
Epoch 4/25
loss: 0.471835  [   70/  698]
loss: 0.624454  [  210/  698]
loss: 0.316563  [  350/  698]
loss: 0.503466  [  490/

In [15]:

dense_tune(layer = 1, lr = 2e-4, weight_decay = 0.2, weights = "DEFAULT", drop = 0.5)

Processing Fold 1
Epoch 1/25
loss: 2.331131  [   70/  698]
loss: 2.055089  [  210/  698]
loss: 1.877670  [  350/  698]
loss: 1.687566  [  490/  698]
loss: 1.555803  [  630/  698]
Training Loss (Epoch): 1.905555
Validating...
Validation Loss: 1.838341, Validation Accuracy: 38.86%
Learning Rate: [0.0002]
Epoch 2/25
loss: 1.756370  [   70/  698]
loss: 1.312833  [  210/  698]
loss: 1.193984  [  350/  698]
loss: 1.035488  [  490/  698]
loss: 0.948568  [  630/  698]
Training Loss (Epoch): 1.199020
Validating...
Validation Loss: 0.995627, Validation Accuracy: 58.86%
Learning Rate: [0.0002]
Epoch 3/25
loss: 0.746033  [   70/  698]
loss: 0.646298  [  210/  698]
loss: 0.543342  [  350/  698]
loss: 0.599732  [  490/  698]


KeyboardInterrupt: 

In [None]:

dense_tune(layer = 2, lr = 2e-4, weight_decay = 0.2, weights = "DEFAULT", drop = 0.5)

In [None]:

dense_tune(layer = 3, lr = 2e-4, weight_decay = 0.2, weights = "DEFAULT", drop = 0.5)

In [None]:

dense_tune(layer = 4, lr = 2e-4, weight_decay = 0.2, weights = "DEFAULT", drop = 0.5)

In [None]:
dense_tune(layer = 0, lr = 2e-3, weight_decay = 0.2, weights = None, drop = 0.5)

In [None]:
# import torchaudio

# audio_path = "./UrbanSound8k/audio/fold1/137156-9-0-30.wav"
# waveform, sample_rate = torchaudio.load(audio_path)
# print(f"Shape: {waveform.shape}, Sample Rate: {sample_rate}")

In [None]:
def show_stereogram(spectrogram):
    # Convert to numpy
    spectrogram_np = spectrogram.numpy()  # Shape: (2, Freq, Time)

    # Plot left and right channels
    fig, axs = plt.subplots(2, 1, figsize=(6, 6), constrained_layout=True)

    axs[0].imshow(spectrogram_np[0], aspect='auto', origin='lower', cmap='magma')
    axs[0].set_title(f"Spectrogram {i+1} - Left Channel")
    axs[0].set_ylabel("Frequency Bins")
    axs[0].set_xlabel("Time Frames")

    axs[1].imshow(spectrogram_np[1], aspect='auto', origin='lower', cmap='magma')
    axs[1].set_title(f"Spectrogram {i+1} - Right Channel")
    axs[1].set_ylabel("Frequency Bins")
    axs[1].set_xlabel("Time Frames")

    plt.show()