In [13]:
# First cell: Import necessary libraries
import random
import torch
import librosa
import numpy as np
import torchaudio
import IPython.display as ipd
from datasets import load_dataset
from transformers import AutoProcessor, ClapProcessor

# Seed for reproducibility
SEED = 42
torch.manual_seed(SEED)

# Function to listen to audio in the notebook
def listen_to_audio(audio, sample_rate):
    ipd.display(ipd.Audio(data=audio, rate=sample_rate))

# Load the ESC-50 dataset and the CLAP processor
processor = ClapProcessor.from_pretrained("laion/clap-htsat-fused")

# Dataset loading
music_dataset = load_dataset("lewtun/music_genres_small", split="train")
target_sampling_rate = 48000

# Resampling function
def resample_audio(audio, original_sampling_rate, target_sampling_rate):
    if original_sampling_rate != target_sampling_rate:
        return librosa.resample(audio, orig_sr=original_sampling_rate, target_sr=target_sampling_rate)
    return audio

# Get a sample from the dataset
sample = music_dataset[0]
audio = sample['audio']['array']
original_sampling_rate = sample['audio']['sampling_rate']
audio_resampled = resample_audio(audio, original_sampling_rate, target_sampling_rate)
category = sample['genre']
# Check if the audio contains valid values
print(f"Min value in audio: {audio.min()}")
print(f"Max value in audio: {audio.max()}")
print(f"Are all values in audio zero? {np.all(audio == 0)}")
print("Audio shape:", audio.shape)
print("Len in seconds:", len(audio) / original_sampling_rate)
# Listen to the original audio
print(f"Original Audio Category: {category}")
listen_to_audio(audio_resampled, target_sampling_rate)



ModuleNotFoundError: No module named 'torch'

In [7]:
# Get a sample from the dataset
import random
sample = random.choice(music_dataset)
audio = sample['audio']['array']
original_sampling_rate = sample['audio']['sampling_rate']
audio_resampled = resample_audio(audio, original_sampling_rate, target_sampling_rate)
category = sample['genre']
listen_to_audio(audio_resampled, target_sampling_rate)

In [9]:
# Cell: Pitch Shifting
def pitch_shift(audio, n_steps=2):
    return librosa.effects.pitch_shift(audio, sr=target_sampling_rate, n_steps=n_steps)

# Apply pitch shifting
audio_shifted = pitch_shift(audio_resampled, n_steps=2)

# Listen to the original and augmented audio
print("Original Audio:")
listen_to_audio(audio_resampled, target_sampling_rate)

print("Pitch-Shifted Audio:")
listen_to_audio(audio_shifted, target_sampling_rate)


Original Audio:


Pitch-Shifted Audio:


In [None]:
# Cell: Time Stretching
def time_stretch(audio, rate=1.2):
    return librosa.effects.time_stretch(audio, rate=rate)

# Apply time-stretching
audio_stretched = time_stretch(audio_resampled, rate=1.2)

# Listen to the original and augmented audio
print("Original Audio:")
listen_to_audio(audio_resampled, target_sampling_rate)

print("Time-Stretched Audio:")
listen_to_audio(audio_stretched, target_sampling_rate)


In [None]:
# Cell: Random Cropping
def random_crop(audio, max_length, crop_length=None):
    if crop_length is None:
        crop_length = random.randint(int(0.5 * max_length), max_length)
    start = random.randint(0, max_length - crop_length)
    return audio[start:start + crop_length]

# Apply random cropping
crop_length = 48000 * 3  # 2 second crop from the 48kHz resampled audio
audio_cropped = random_crop(audio_resampled, len(audio_resampled), crop_length=crop_length)

# Listen to the original and cropped audio
print("Original Audio:")
listen_to_audio(audio_resampled, target_sampling_rate)

print("Cropped Audio:")
listen_to_audio(audio_cropped, target_sampling_rate)


In [None]:
# Cell: Add Noise
from scipy import signal

def add_colored_noise(audio, snr_db, color='pink'):
    # Generate white noise
    white_noise = np.random.randn(len(audio))
    
    # Apply filter for pink noise
    if color == 'pink':
        b, a = signal.butter(1, 0.2, btype='low')  # Low-pass filter for pink noise
        noise = signal.lfilter(b, a, white_noise)
    else:
        noise = white_noise  # Default white noise
    
    # Adjust noise to desired SNR
    audio_power = np.mean(audio ** 2)
    noise_power = audio_power / (10 ** (snr_db / 10))
    noise = noise * np.sqrt(noise_power / np.mean(noise ** 2))
    
    # Add noise to audio
    noisy_audio = audio + noise
    return np.clip(noisy_audio, -1.0, 1.0)
# Apply noise injection
audio_noisy = add_colored_noise(audio_resampled, 0.9, color='white')

# Listen to the original and augmented audio
print("Original Audio:")
listen_to_audio(audio_resampled, target_sampling_rate)

print("Pink noise Audio:")
listen_to_audio(audio_noisy, target_sampling_rate)


In [14]:
# Cell: SpecAugment
from torchaudio.transforms import TimeMasking, FrequencyMasking

def spec_augment(audio, sr, time_mask_param=30, freq_mask_param=15):
    # Convert audio to spectrogram using librosa
    spec = librosa.feature.melspectrogram(y=audio, sr=sr)
    spec = torch.tensor(spec)
    time_masking = TimeMasking(time_mask_param=time_mask_param)
    freq_masking = FrequencyMasking(freq_mask_param=freq_mask_param)
    spec_augmented = freq_masking(time_masking(spec))
    return  librosa.feature.inverse.mel_to_audio(spec_augmented.numpy(), sr=target_sampling_rate)


# Apply SpecAugment
audio_reconstructed = spec_augment(audio_resampled, target_sampling_rate)

# Listen to the original and augmented spectrogram audio
print("Original Spectrogram Audio:")
listen_to_audio(audio_resampled, target_sampling_rate)

print("SpecAugmented Spectrogram Audio:")
# Convert back to audio for listening
listen_to_audio(audio_reconstructed, target_sampling_rate)


ModuleNotFoundError: No module named 'torchaudio'

In [19]:
from scipy import signal

def random_crop(audio, max_length, crop_length):
    start = random.randint(0, max_length - crop_length)
    return audio[start:start + crop_length]

def add_colored_noise(audio, snr_db, color='pink'):
    white_noise = np.random.randn(len(audio))
    # Apply filter for pink noise
    if color == 'pink':
        b, a = signal.butter(1, 0.2, btype='low')  # Low-pass filter for pink noise
        noise = signal.lfilter(b, a, white_noise)
    else:
        noise = white_noise  # Default white noise

    # Adjust noise to desired SNR
    audio_power = np.mean(audio ** 2)
    noise_power = audio_power / (10 ** (snr_db / 10))
    noise = noise * np.sqrt(noise_power / np.mean(noise ** 2))

    # Add noise to audio
    noisy_audio = audio + noise
    return np.clip(noisy_audio, -1.0, 1.0)


def augment_audio(audio):
    length = random.uniform(2, 10) * 48000
    audio = random_crop(audio, audio.size, int(length))
    if random.uniform(0, 1) < .15 or 1:
        snr_db = random.uniform(10, 30)
        color = random.choice(['white', 'pink'])
        print("Color Noise: ", color, snr_db)
        audio = add_colored_noise(audio, snr_db, color)
    return audio

audio_noisy = augment_audio(audio_resampled)
# Listen to the original and augmented audio
print("Original Audio:")
listen_to_audio(audio_resampled, target_sampling_rate)

print("Pink noise Audio:")
listen_to_audio(audio_noisy, target_sampling_rate)

Color Noise:  pink 20.420732917985973
Original Audio:


Pink noise Audio:
