Functions for adding noise and segmenting

In [None]:
from pydub import AudioSegment
import numpy as np
import os
import math

def add_noise_to_audio(audio, noise_level=0.02):
    """Return a copy of ``audio`` with additive Gaussian noise.

    The noise is zero-mean with a standard deviation of ``noise_level`` times
    the peak absolute sample value of ``audio``, so a silent input stays
    silent.

    Args:
        audio: A pydub ``AudioSegment`` (any object exposing
            ``get_array_of_samples()``, ``frame_rate``, ``sample_width`` and
            ``channels`` in the same way).
        noise_level: Noise standard deviation as a fraction of the peak
            absolute sample value.

    Returns:
        A new ``AudioSegment`` with the same frame rate, sample width and
        channel count as ``audio``. If ``audio`` holds no samples it is
        returned unchanged.

    Raises:
        ValueError: Propagated from ``np.random.normal`` when the resulting
            standard deviation is negative.
    """
    # Convert audio to a numpy array (pydub exposes the raw integer samples).
    samples = np.array(audio.get_array_of_samples())
    if samples.size == 0:
        # Nothing to perturb, and np.max() would raise on an empty array.
        return audio

    # Valid range of the sample dtype (int8/int16/int32 depending on the
    # segment's sample width) instead of a hard-coded int16 range.
    limits = np.iinfo(samples.dtype)

    # Work in float64: adding noise in the integer sample dtype would wrap
    # around on overflow *before* the clip below could saturate it.
    signal = samples.astype(np.float64)

    # Scale by the peak magnitude; np.max() alone is negative for an
    # all-negative buffer, which is not a valid standard deviation.
    peak = np.max(np.abs(signal))
    noise = np.random.normal(0.0, noise_level * peak, signal.shape)

    # Round to the nearest integer, saturate, then return to the sample dtype.
    noisy_samples = np.clip(np.rint(signal + noise), limits.min, limits.max)
    noisy_samples = noisy_samples.astype(samples.dtype)

    # Convert the numpy array back to an AudioSegment.
    return AudioSegment(
        noisy_samples.tobytes(),
        frame_rate=audio.frame_rate,
        sample_width=audio.sample_width,
        channels=audio.channels,
    )

def split_or_pad_audio(file_path, output_dir, segment_duration=5000):
    """Split a FLAC file into fixed-length segments and write clean/noisy copies.

    The audio is cut into consecutive chunks of ``segment_duration``
    milliseconds; a shorter final chunk is padded with trailing silence.
    Each chunk is exported to ``output_dir`` as ``<stem>_seg_<i>.flac``, and
    a noisy version (see ``add_noise_to_audio``) is exported to the path
    obtained by replacing every ``"clean"`` in the clean path with
    ``"noisy"``.

    Args:
        file_path: Path of the source FLAC file.
        output_dir: Directory that receives the clean segments; created if
            missing.
        segment_duration: Segment length in milliseconds. Must be positive.

    Returns:
        List of the clean segment paths, in segment order. Empty when the
        source file contains no audio.

    Raises:
        ValueError: If ``segment_duration`` is not positive.
    """
    if segment_duration <= 0:
        raise ValueError(
            f"segment_duration must be positive, got {segment_duration!r}"
        )

    # Load the audio file
    audio = AudioSegment.from_file(file_path, format="flac")
    audio_length = len(audio)

    # Number of segments needed to cover the whole file
    num_segments = math.ceil(audio_length / segment_duration)

    # Loop invariants: output stem and clean output directory.
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    segments = []
    for i in range(num_segments):
        start = i * segment_duration
        segment = audio[start:start + segment_duration]

        # Pad a short (final) segment with silence. The silence is generated
        # at the source frame rate so that appending it never resamples the
        # segment to AudioSegment.silent()'s own default rate.
        missing = segment_duration - len(segment)
        if missing > 0:
            segment = segment + AudioSegment.silent(
                duration=missing, frame_rate=audio.frame_rate
            )

        segment_path = os.path.join(output_dir, f"{base_name}_seg_{i}.flac")

        # NOTE: if the clean path contains no "clean" substring, the noisy
        # path equals the clean path and the clean export below overwrites
        # the noisy file.
        noisy_path = segment_path.replace("clean", "noisy")
        noisy_dir = os.path.dirname(noisy_path)
        if noisy_dir:
            os.makedirs(noisy_dir, exist_ok=True)
        add_noise_to_audio(segment).export(noisy_path, format="flac")

        # Export the clean segment
        segment.export(segment_path, format="flac")
        segments.append(segment_path)

    return segments

In [None]:
# Splitting audio files into 5-second segments and adding noise

import os
import pandas as pd

# (noisy_path, clean_path) pairs, one per exported segment.
dataset = []
count = 0        # source .flac files processed so far
max_files = 5    # stop once this many source files have been processed

for folder, _subdirs, names in os.walk("LibriSpeech/dev-clean"):
    for name in names:
        if not name.endswith(".flac"):
            continue

        audio_path = os.path.join(folder, name)

        # Mirror the source directory layout under segments/clean.
        output_dir = os.path.dirname(
            audio_path.replace("dev-clean", "segments/clean")
        )
        os.makedirs(output_dir, exist_ok=True)

        clean_paths = split_or_pad_audio(audio_path, output_dir, 5000)
        # The noisy counterpart sits at the same path with "clean" -> "noisy".
        dataset.extend(
            (clean.replace("clean", "noisy"), clean) for clean in clean_paths
        )

        count += 1
        if count >= max_files:
            break
    else:
        # Inner loop ran to completion: limit not reached, keep walking.
        continue
    # Inner loop hit the file limit: stop walking the tree as well.
    break

df = pd.DataFrame(dataset, columns=["noisy_path", "clean_path"])
df.to_csv("audio_dataset.csv", index=False)

In [None]:
import librosa
import numpy as np
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

class AudioDataset(Dataset):
    """Paired noisy/clean dB-scaled magnitude spectrograms read from a CSV index.

    Each item is a ``(noisy, clean)`` pair of ``float32`` tensors, one per
    audio file listed in the corresponding metadata row.
    """

    def __init__(self, metadata_file, *, n_fft=2048, hop_length=256):
        """
        Args:
            metadata_file (str): Path to a CSV file whose first row is a
                header and whose two columns hold the noisy and the clean
                audio file paths, in that order.
            n_fft (int): FFT window size passed to ``librosa.stft``.
            hop_length (int): Hop length passed to ``librosa.stft``.
        """
        # The header row is skipped and the column names are set explicitly,
        # so the names used in the file itself do not matter.
        self.metadata = pd.read_csv(metadata_file, names=['noisy_path', 'clean_path'], skiprows=1)
        self.n_fft = n_fft
        self.hop_length = hop_length

    def __len__(self):
        """Return the number of noisy/clean pairs."""
        return len(self.metadata)

    def _load_db_spectrogram(self, path):
        """Load ``path`` and return its dB-scaled magnitude STFT as a float32 tensor."""
        # sr=None keeps the file's native sampling rate (no resampling).
        samples, _sample_rate = librosa.load(path, sr=None)
        stft = librosa.stft(samples, n_fft=self.n_fft, hop_length=self.hop_length)
        # ref=np.max expresses levels relative to this spectrogram's own peak,
        # so noisy and clean spectrograms are each normalized independently.
        spec_db = librosa.amplitude_to_db(np.abs(stft), ref=np.max)
        return torch.tensor(spec_db, dtype=torch.float32)

    def __getitem__(self, idx):
        """Return the ``(noisy, clean)`` spectrogram tensors for row ``idx``."""
        row = self.metadata.iloc[idx]
        S_dB_noisy = self._load_db_spectrogram(row['noisy_path'])
        S_dB_clean = self._load_db_spectrogram(row['clean_path'])
        return S_dB_noisy, S_dB_clean
