In [None]:
import numpy as np
from pydub import AudioSegment
import os
import shutil
import math

In [None]:
def white_noise_gen(duration, sample_rate):
    """Draw Gaussian white noise lasting ``duration`` seconds.

    Args:
        duration: Length of the noise, in seconds.
        sample_rate: Number of samples per second.

    Returns:
        A 1-D NumPy array holding ``int(duration * sample_rate)`` values drawn
        from a normal distribution with mean 0 and standard deviation 1.
    """
    # Fractional sample counts are truncated, not rounded
    sample_count = int(duration * sample_rate)
    return np.random.normal(loc=0, scale=1, size=sample_count)
 
# merges a song with white noise of random volume
def song_noise_adder(audio_path, output_path, min_attenuation_db=15, max_attenuation_db=30):
    """Overlay a song with white noise of random volume and save the result.

    Args:
        audio_path: Path of the audio file to load.
        output_path: Path the noisy audio is written to. The data is always
            exported in WAV format, whatever extension the path carries.
        min_attenuation_db: Smallest amount, in dB, the noise is turned down by.
        max_attenuation_db: Largest amount, in dB, the noise is turned down by.

    Raises:
        ValueError: If the song's sample width has no matching integer type.
    """
    song = AudioSegment.from_file(audio_path)

    # Signed integer type matching each supported sample width (bytes per sample)
    pcm_dtypes = {1: np.int8, 2: np.int16, 4: np.int32}
    if song.sample_width not in pcm_dtypes:
        raise ValueError(f"Unsupported sample width: {song.sample_width} bytes")
    pcm_dtype = pcm_dtypes[song.sample_width]

    # One noise value is needed per channel per frame, so the requested
    # duration is scaled by the channel count.
    white_noise = white_noise_gen(song.duration_seconds * song.channels, song.frame_rate)

    # Float rounding can leave a partial frame at the end; drop it so the raw
    # data is a whole number of frames.
    usable_samples = len(white_noise) - len(white_noise) % song.channels
    white_noise = white_noise[:usable_samples]

    # Convert the unit-variance floats to integer PCM. Handing the float64
    # bytes to AudioSegment directly would make it reinterpret them as integer
    # samples, which is neither Gaussian noise nor the right length.
    # Values beyond 3 standard deviations are clipped to full scale.
    headroom_sigmas = 3
    full_scale = np.iinfo(pcm_dtype).max
    white_noise = np.clip(white_noise / headroom_sigmas, -1.0, 1.0) * full_scale
    white_noise = white_noise.astype(pcm_dtype)

    # Match generated white noise specifications with original song
    noise_segment = AudioSegment(
        data=white_noise.tobytes(),
        sample_width=song.sample_width,
        frame_rate=song.frame_rate,
        channels=song.channels,
    )

    # Random attenuation (in dB) applied to the noise before mixing
    volume_adjustment = np.random.uniform(min_attenuation_db, max_attenuation_db)

    # Overlay the original audio with the noise
    noisy_audio = song.overlay(noise_segment - volume_adjustment)

    # Address edge case where noisy audio and original are mismatched in duration
    noisy_audio = noisy_audio[:len(song)]  # len() of a segment is in milliseconds

    noisy_audio.export(output_path, format="wav")

In [None]:
# Creates paired clean and noisy training audio for every file in a folder.
# source_path must point to a folder of audio files with no sub-folders.

# Creates a file structure of:
#          output_path
#               ^
#             song_1
#               ^
# clean_data_song_1.wav, dirty_data_song_1.wav

def training_data_gen(source_path, output_path):
    """Build a clean/noisy training pair for every audio file in ``source_path``.

    ``output_path`` is emptied and recreated on every call. One sub-folder per
    source file (named after the file without its extension) is then created,
    containing:

    * ``clean_data_<file name>`` - an unmodified copy of the source file
    * ``dirty_data_<file name>`` - the file written by ``song_noise_adder``

    Args:
        source_path: Folder holding the source audio files. Entries that are
            not regular files (e.g. sub-folders) are skipped.
        output_path: Folder the training data is generated in. Any existing
            content is deleted.
    """
    # Start from an empty output folder. It is created whether or not it
    # existed before, so the very first run works as well.
    if os.path.exists(output_path):
        shutil.rmtree(output_path)
    os.makedirs(output_path)

    # Sorted so every run processes the songs in the same order
    for song_file in sorted(os.listdir(source_path)):
        song_source_path = os.path.join(source_path, song_file)

        # Only regular files can be songs; ignore sub-folders and the like
        if not os.path.isfile(song_source_path):
            continue

        # Create name for each song's folder
        song_folder = os.path.splitext(song_file)[0]  # remove extension
        song_folder_path = os.path.join(output_path, song_folder)

        # exist_ok: sources that differ only by extension share one folder
        os.makedirs(song_folder_path, exist_ok=True)

        # Get target paths of dirty and clean files
        clean_song_path = os.path.join(song_folder_path, f"clean_data_{song_file}")
        dirty_song_path = os.path.join(song_folder_path, f"dirty_data_{song_file}")

        # Add clean and dirty files to their respective folder
        shutil.copy(song_source_path, clean_song_path)
        # NOTE(review): song_noise_adder exports WAV data, yet the dirty file
        # keeps the source file's extension - confirm this is intended.
        song_noise_adder(song_source_path, dirty_song_path)


In [None]:
# Folder locations, relative to the notebook's working directory
source_folder = "input_training_songs"  # folder of source audio files
output_folder = "training_data_generated"  # cleared by training_data_gen if it already exists

# Generate the clean/noisy pairs for every song in source_folder
training_data_gen(source_path=source_folder, output_path=output_folder)