In [1]:
import random
import os
import numpy as np
import sklearn
import torch
from torch.cuda import manual_seed_all
from torch.backends import cudnn
import matplotlib as mpl
from matplotlib import pyplot as plt
import torchaudio
import torchaudio.transforms as T

In [2]:
# Pre-spectrogram (waveform-level) augmentations.
# These are examples only; adjust or replace them based on domain knowledge.

# STFT geometry shared by the analysis/synthesis steps and the phase vocoder.
# n_freq = n_fft // 2 + 1 = 201 and hop = n_fft // 2 are the values
# T.TimeStretch() uses when constructed without arguments.
_STRETCH_N_FFT = 400
_STRETCH_HOP_LENGTH = _STRETCH_N_FFT // 2
time_stretch = T.TimeStretch(
    hop_length=_STRETCH_HOP_LENGTH,
    n_freq=_STRETCH_N_FFT // 2 + 1,
)

def stretch_waveform(waveform, rate=1.2):
    """Change the speed of a waveform without changing its pitch.

    `T.TimeStretch` is a phase vocoder that operates on *complex
    spectrograms*, not on raw audio, so the waveform is first taken to the
    STFT domain, stretched there, and then converted back with the inverse
    STFT.

    Args:
        waveform: Real tensor whose last dimension is time, e.g.
            (channels, samples).
        rate: Speed factor. `rate > 1.0` speeds up (shorter output),
            `rate < 1.0` slows down (longer output).

    Returns:
        The stretched waveform; its length is roughly `samples / rate`.

    Raises:
        ValueError: If `rate` is not strictly positive.
    """
    if rate <= 0:
        raise ValueError(f"rate must be positive, got {rate}")

    # The same window must be used for analysis and synthesis.
    window = torch.hann_window(
        _STRETCH_N_FFT, dtype=waveform.dtype, device=waveform.device
    )
    spectrogram = torch.stft(
        waveform,
        n_fft=_STRETCH_N_FFT,
        hop_length=_STRETCH_HOP_LENGTH,
        window=window,
        return_complex=True,
    )
    stretched = time_stretch(spectrogram, rate)
    return torch.istft(
        stretched,
        n_fft=_STRETCH_N_FFT,
        hop_length=_STRETCH_HOP_LENGTH,
        window=window,
    )

pitch_shift = T.PitchShift(sample_rate=44100, n_steps=2)  # Shift up by 2 semitones
# One PitchShift module per (sample_rate, n_steps); the module-level default is
# pre-registered so the common 44.1 kHz / +2 semitone case reuses it.
_pitch_shift_cache = {(44100, 2): pitch_shift}

def shift_pitch(waveform, sample_rate, n_steps=2):
    """Shift the pitch of a waveform by `n_steps` semitones.

    The shifter is built for the sample rate that is actually passed in
    instead of always assuming 44100 Hz, so clips recorded at other rates are
    shifted by the requested musical interval.

    Args:
        waveform: Audio tensor whose last dimension is time.
        sample_rate: Sample rate of `waveform` in Hz.
        n_steps: Number of semitones to shift by (default 2, i.e. up).

    Returns:
        The pitch-shifted waveform.
    """
    key = (int(sample_rate), int(n_steps))
    shifter = _pitch_shift_cache.get(key)
    if shifter is None:
        shifter = T.PitchShift(sample_rate=key[0], n_steps=key[1])
        _pitch_shift_cache[key] = shifter
    return shifter(waveform)

def scale_volume(waveform, factor=1.5):
    """Return a copy of `waveform` with every sample multiplied by `factor`.

    Args:
        waveform: Audio tensor.
        factor: Gain to apply; values above 1 amplify, below 1 attenuate.

    Returns:
        A new tensor; the input is left untouched.
    """
    amplified = waveform * factor
    return amplified

def crop_waveform(waveform, crop_size):
    """Cut a random window of `crop_size` samples out of a waveform.

    Args:
        waveform: Audio tensor whose last dimension is time.
        crop_size: Number of samples to keep.

    Returns:
        A slice of `waveform` along the time axis. If the waveform is not
        longer than `crop_size`, it is returned in full (no padding is added).
    """
    num_samples = waveform.size(-1)
    # Largest start index that still leaves room for a full window.
    last_start = max(0, num_samples - crop_size)
    # torch.randint's upper bound is exclusive, hence the +1 so that the final
    # window position can actually be drawn.
    start = torch.randint(0, last_start + 1, (1,)).item()
    # Slice the time axis (the same axis the start was computed from),
    # regardless of how many leading dimensions there are.
    return waveform[..., start:start + crop_size]

def apply_reverb(waveform, rir=None, sample_rate=44100, decay_seconds=0.3):
    """Add reverberation by convolving the waveform with an impulse response.

    torchaudio.transforms has no `Reverberate` transform, so the reverb is
    implemented directly as an FFT convolution with a room impulse response
    (RIR).

    Args:
        waveform: Audio tensor whose last dimension is time.
        rir: Optional 1-D impulse response. When omitted, a synthetic one is
            generated: a unit direct path followed by exponentially decaying
            random noise.
        sample_rate: Sample rate in Hz; only used to size the synthetic RIR.
        decay_seconds: Length of the synthetic RIR in seconds.

    Returns:
        The reverberated waveform, truncated to the same shape as the input.
    """
    if rir is None:
        length = max(1, int(sample_rate * decay_seconds))
        position = torch.arange(
            length, dtype=waveform.dtype, device=waveform.device
        ) / length
        envelope = torch.exp(-6.0 * position)  # tail decays to ~0.25% of the start
        rir = torch.randn(
            length, dtype=waveform.dtype, device=waveform.device
        ) * envelope
        rir[0] = 1.0  # direct (dry) path
    else:
        rir = torch.as_tensor(rir, dtype=waveform.dtype, device=waveform.device)

    # Normalise the RIR energy so the effect does not change overall loudness
    # much; the clamp avoids dividing by zero for an all-zero response.
    rir = rir / torch.linalg.vector_norm(rir).clamp_min(1e-12)

    # Linear (not circular) convolution: transform at the full output length.
    num_samples = waveform.size(-1)
    full_length = num_samples + rir.size(-1) - 1
    spectrum = torch.fft.rfft(waveform, n=full_length) * torch.fft.rfft(rir, n=full_length)
    wet = torch.fft.irfft(spectrum, n=full_length)
    return wet[..., :num_samples]

def time_shift(waveform, shift):
    """Circularly shift a waveform along its last (time) axis.

    Samples pushed past one end wrap around to the other.

    Args:
        waveform: Audio tensor.
        shift: Number of positions to roll by; negative values roll the
            other way.

    Returns:
        The rolled tensor.
    """
    rolled = torch.roll(waveform, shifts=shift, dims=-1)
    return rolled

def add_noise(waveform, noise_level=0.005):
    """Add zero-mean Gaussian noise to a waveform.

    Args:
        waveform: Audio tensor.
        noise_level: Factor applied to the standard-normal noise before it is
            added.

    Returns:
        A new tensor `waveform + noise`; the input is not modified.
    """
    return waveform + torch.randn_like(waveform) * noise_level

# Stochastic on-the-fly augmentation: each step below is applied independently with probability 0.5.
# Again, these are only examples; they inline their own logic rather than calling the helpers above.
def augment_waveform(data):
    """Randomly augment a waveform with noise, a circular shift, and a gain.

    Each of the three augmentations is applied independently with
    probability 0.5. All operations are out-of-place, so the tensor passed in
    by the caller is never modified (the previous `+=` / `*=` silently altered
    the caller's data, which corrupts cached or re-used waveforms).

    Args:
        data: A `(waveform, sample_rate)` pair.

    Returns:
        `(augmented_waveform, sample_rate)`; the sample rate is passed through
        unchanged.
    """
    waveform, sample_rate = data

    # Additive Gaussian noise.
    if torch.rand(1).item() > 0.5:
        waveform = waveform + torch.randn_like(waveform) * 0.005

    # Circular time shift by a random offset in [-5000, 5000).
    if torch.rand(1).item() > 0.5:
        offset = torch.randint(-5000, 5000, (1,)).item()
        waveform = torch.roll(waveform, shifts=offset, dims=-1)

    # Random gain between 0.8x and 1.5x.
    if torch.rand(1).item() > 0.5:
        gain = torch.empty(1).uniform_(0.8, 1.5).item()
        waveform = waveform * gain

    return waveform, sample_rate


In [3]:
# Create a MelSpectrogram transformation
# Settings shared by every mel-spectrogram transform built in this cell.
_MEL_SETTINGS = {
    "n_fft": 1024,      # Number of FFT bins
    "hop_length": 512,  # Hop length between windows
    "n_mels": 64,       # Number of Mel bands
}

# Default transform for 44.1 kHz audio.
mel_spectrogram_transform = T.MelSpectrogram(sample_rate=44100, **_MEL_SETTINGS)

# One transform per sample rate, created on first use.
_mel_transform_by_rate = {44100: mel_spectrogram_transform}

def waveform_to_spectrogram(data):
    """Convert a `(waveform, sample_rate)` pair to a mel spectrogram.

    The transform is selected by the clip's own sample rate rather than always
    assuming 44100 Hz, so the mel filter bank is built for the rate the audio
    was actually recorded at.

    Args:
        data: A `(waveform, sample_rate)` pair.

    Returns:
        The mel spectrogram tensor produced by `T.MelSpectrogram`.
    """
    waveform, sample_rate = data
    transform = _mel_transform_by_rate.get(sample_rate)
    if transform is None:
        transform = T.MelSpectrogram(sample_rate=sample_rate, **_MEL_SETTINGS)
        _mel_transform_by_rate[sample_rate] = transform
    return transform(waveform)

In [4]:
# post spectrogram augmentations

# Example augmentations, could add more
# Masking transform applied along the time axis of a spectrogram.
# NOTE(review): per the torchaudio docs, time_mask_param is the maximum mask length — confirm the value suits the clip lengths used.
time_mask = T.TimeMasking(time_mask_param=10)

# Masking transform applied along the frequency axis of a spectrogram.
# NOTE(review): freq_mask_param is likewise the maximum mask length — confirm against the number of mel bands in use.
freq_mask = T.FrequencyMasking(freq_mask_param=8)

# hybridizes two sounds
def mixup(spectrogram1, spectrogram2, alpha=0.2):
    """Blend two spectrograms with a random convex weight.

    The weight given to `spectrogram1` is drawn uniformly from `[0, alpha]`;
    `spectrogram2` receives the remainder, so the two weights sum to 1.

    NOTE(review): the weight is uniform on [0, alpha], not Beta(alpha, alpha)
    as in the original mixup formulation, and it is not returned, so labels
    cannot be mixed with the same weight — confirm this is intended.

    Args:
        spectrogram1: First spectrogram tensor.
        spectrogram2: Second spectrogram tensor (must broadcast with the first).
        alpha: Upper bound of the weight applied to `spectrogram1`.

    Returns:
        The blended spectrogram.
    """
    weight = torch.FloatTensor(1).uniform_(0, alpha).item()
    first_part = weight * spectrogram1
    second_part = (1 - weight) * spectrogram2
    return first_part + second_part

# TODO: apply these masks stochastically, the way augment_waveform does for waveforms.
def augment_spectrogram(spectrogram):
    """Apply time masking followed by frequency masking to a spectrogram.

    Both masks are always applied (there is no random skip).

    Args:
        spectrogram: Spectrogram tensor accepted by the module-level
            `time_mask` and `freq_mask` transforms.

    Returns:
        The masked spectrogram.
    """
    return freq_mask(time_mask(spectrogram))

In [5]:
# Decode audio files
def decode_audio(file_tuple):
    """Load the audio file named by the first element of a 2-tuple.

    Args:
        file_tuple: A pair whose first element is the path to load; the
            second element is ignored.

    Returns:
        `(waveform, sample_rate)` as returned by `torchaudio.load`.
    """
    path, _ignored = file_tuple
    audio, rate = torchaudio.load(path)
    return audio, rate

In [6]:
import os
import torchaudio
from torch.utils.data import Dataset, DataLoader

class UrbanSoundDataset(Dataset):
    """Dataset over the `.wav` files of one fold directory.

    Files are read from `<audio_path>/fold<fold>`. Each item is the loaded
    waveform (forced to two channels when the file is mono), optionally passed
    through `transform`. The file's sample rate is read but not returned.
    """

    def __init__(self, audio_path, fold, transform=None):
        """Index the `.wav` files of a fold.

        Args:
            audio_path: Directory that contains the `fold<N>` sub-directories.
            fold: Fold identifier appended to "fold" to form the directory name.
            transform: Optional callable applied to each waveform.

        Raises:
            FileNotFoundError: If the fold directory does not exist
                (raised by `os.listdir`).
        """
        self.audio_path = os.path.join(audio_path, f"fold{fold}")
        self.norm_path = os.path.normpath(self.audio_path)
        # os.listdir returns entries in arbitrary order; sorting makes the
        # index -> file mapping stable across runs and machines.
        self.file_list = sorted(
            os.path.join(self.norm_path, name)
            for name in os.listdir(self.norm_path)
            if name.endswith(".wav")
        )
        self.transform = transform

    def __len__(self):
        """Return the number of `.wav` files found in the fold."""
        return len(self.file_list)

    def __getitem__(self, idx):
        """Load, channel-normalise and transform the file at position `idx`.

        Returns:
            The waveform, or the output of `transform` when one was given.
        """
        # Load the audio file
        file_path = self.file_list[idx]
        waveform, sample_rate = torchaudio.load(file_path)

        # Convert mono to stereo if necessary so every item has two channels
        if waveform.size(0) == 1:
            waveform = waveform.repeat(2, 1)

        # Apply any transformations (e.g., augmentations, spectrogram).
        # Compare against None explicitly: a callable container such as an
        # empty nn.Sequential is falsy but is still a valid transform.
        if self.transform is not None:
            waveform = self.transform(waveform)

        return waveform

In [7]:
import torchaudio.transforms as T

# Example transformations
def augment_waveform(waveform):
    """Placeholder waveform augmentation; currently returns its input unchanged.

    NOTE(review): this rebinds `augment_waveform`, replacing the earlier
    definition in this notebook that takes a `(waveform, sample_rate)` tuple
    and applies random noise / shift / gain. After this cell runs, only this
    single-argument identity version is reachable under that name.
    """
    # Add your augmentation logic here (e.g., noise addition, time stretch, etc.)
    return waveform

# NOTE(review): both names below rebind functions defined in earlier cells
# (the tuple-based waveform_to_spectrogram and the masking augment_spectrogram);
# from here on they refer to these transform objects instead.
# NOTE(review): sample_rate=16000 is hard-coded, while the file inspected in the
# last cell of this notebook reports 44100 Hz — confirm the intended rate or resample.
waveform_to_spectrogram = T.MelSpectrogram(sample_rate=16000, n_mels=128)
# Converts the mel spectrogram's amplitude scale to decibels.
augment_spectrogram = T.AmplitudeToDB()

# Combine transformations into a callable function
def transform_pipeline(waveform):
    """Run a waveform through the full preprocessing chain.

    The steps, in order, are the module-level `augment_waveform`,
    `waveform_to_spectrogram` and `augment_spectrogram` callables.

    Args:
        waveform: Audio tensor to process.

    Returns:
        The result of the last stage of the chain.
    """
    return augment_spectrogram(waveform_to_spectrogram(augment_waveform(waveform)))

def pad_spectrogram(spectrogram, target_shape):
    """Force the third dimension of a spectrogram to a fixed length.

    Only index 2 of `target_shape` is used. Shorter inputs are zero-padded at
    the end of the last dimension; inputs that are long enough are cropped.

    Args:
        spectrogram: Tensor with at least three dimensions.
        target_shape: Sequence whose element at index 2 is the desired length.

    Returns:
        The padded or cropped spectrogram.
    """
    wanted_frames = target_shape[2]
    missing_frames = wanted_frames - spectrogram.size(2)

    # Nothing to add: keep at most the first `wanted_frames` frames.
    if missing_frames <= 0:
        return spectrogram[:, :, :wanted_frames]

    # Right-pad the last dimension with zeros.
    return torch.nn.functional.pad(spectrogram, (0, missing_frames))



In [8]:
import torchvision.transforms as transforms

# Resize and normalize for DenseNet
# Two-step preprocessing applied to every spectrogram in custom_collate_fn:
# resize to 224x224, then normalise with a single mean/std value.
# NOTE(review): mean/std have one entry while the batches printed below have
# 2 channels — presumably the single value is broadcast across channels; confirm.
# NOTE(review): 0.485 / 0.229 are ImageNet image statistics; the inputs here are
# dB-scaled spectrograms — confirm these statistics are appropriate.
resize_transform = transforms.Compose([
    transforms.Resize((224, 224)),                # Resize spectrogram to DenseNet input size
    transforms.Normalize(mean=[0.485], std=[0.229])  # Normalize to ImageNet standards (adjust channels)
])

def custom_collate_fn(batch):
    """Collate variable-length spectrograms into one stacked tensor.

    Every spectrogram is zero-padded at the end of its last dimension up to
    the longest item in the batch (measured on dimension 2), passed through
    the module-level `resize_transform`, and the results are stacked.

    NOTE(review): padding uses the value 0; if the inputs are dB-scaled, 0 is
    not the quietest value — confirm this is the intended fill.

    Args:
        batch: Iterable of spectrogram tensors with at least three dimensions.

    Returns:
        A tensor stacking the processed spectrograms along a new first axis.
    """
    items = list(batch)

    # Longest time dimension in this batch.
    longest = max(item.size(2) for item in items)

    processed = []
    for item in items:
        # Right-pad to the common length, then resize/normalise.
        padded = torch.nn.functional.pad(item, (0, longest - item.size(2)))
        processed.append(resize_transform(padded))

    return torch.stack(processed)


# Specify paths and batch size
AUDIO_PATH = "./UrbanSound8k/audio"  # root folder; UrbanSoundDataset appends "fold<N>" to it
batch_size = 32  # passed to the DataLoader created for each fold

# NOTE(review): target_shape is not referenced by any code visible in this notebook
# (pad_spectrogram accepts such an argument but is never called here) — confirm it is still needed.
target_shape = [2, 128, 1024]  # Adjust time_frames as needed


import torch


# Loop through folds
# Iterate over folds 1..10, building a fresh dataset and loader for each one.
for fold in range(1, 11):
    # Initialize dataset and DataLoader; transform_pipeline turns each waveform
    # into a spectrogram and custom_collate_fn pads/resizes items into a batch.
    dataset = UrbanSoundDataset(audio_path=AUDIO_PATH, fold=fold, transform=transform_pipeline)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate_fn)


    # Process each batch
    # NOTE(review): this prints one line per spectrogram for every file in every
    # fold; the recorded output below was cut short by a KeyboardInterrupt.
    for batch in dataloader:
        for spectrogram in batch:
            print(spectrogram.size())  # Process each spectrogram

torch.Size([2, 224, 224])
torch.Size([2, 224, 224])
torch.Size([2, 224, 224])
torch.Size([2, 224, 224])
torch.Size([2, 224, 224])
torch.Size([2, 224, 224])
torch.Size([2, 224, 224])
torch.Size([2, 224, 224])
torch.Size([2, 224, 224])
torch.Size([2, 224, 224])
torch.Size([2, 224, 224])
torch.Size([2, 224, 224])
torch.Size([2, 224, 224])
torch.Size([2, 224, 224])
torch.Size([2, 224, 224])
torch.Size([2, 224, 224])
torch.Size([2, 224, 224])
torch.Size([2, 224, 224])
torch.Size([2, 224, 224])
torch.Size([2, 224, 224])
torch.Size([2, 224, 224])
torch.Size([2, 224, 224])
torch.Size([2, 224, 224])
torch.Size([2, 224, 224])
torch.Size([2, 224, 224])
torch.Size([2, 224, 224])
torch.Size([2, 224, 224])
torch.Size([2, 224, 224])
torch.Size([2, 224, 224])
torch.Size([2, 224, 224])
torch.Size([2, 224, 224])
torch.Size([2, 224, 224])
torch.Size([2, 224, 224])
torch.Size([2, 224, 224])
torch.Size([2, 224, 224])
torch.Size([2, 224, 224])
torch.Size([2, 224, 224])
torch.Size([2, 224, 224])
torch.Size([

KeyboardInterrupt: 

In [None]:
import torchaudio

# Sanity check: load one clip directly and report its tensor shape and sample rate.
audio_path = "./UrbanSound8k/audio/fold1/137156-9-0-30.wav"
waveform, sample_rate = torchaudio.load(audio_path)
print(f"Shape: {waveform.shape}, Sample Rate: {sample_rate}")

Shape: torch.Size([2, 176400]), Sample Rate: 44100
