## Preprocessing of the training and evaluation data

The audio recordings come from Spanish-language YouTube podcasts. The noise is taken from the DEMAND dataset. The list of sources is available in the `sources.txt` file.

The preprocessing consists of four steps:

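1. Downsample each audio recording from 44.1 kHz WAV to 16 kHz WAV and convert it to a single channel. This loses some quality but reduces the training time, and it was recommended in the paper. The first and last minute of each recording are removed because they contain music (note that the command below only skips the first 60 seconds; the last minute has to be trimmed separately, e.g. with `-t` or `-to`).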

    `ffmpeg -ss 60 -i input -ar 16000 -ac 1 output`

2. Create a copy of audio recording and add noise.
    1. The noise is taken from the DEMAND dataset, as recommended in the paper.
3. Tokenize the audio: split it on silence into samples 3-8 seconds long.
4. Split the data into training and evaluation sets and create (clean, noisy) pairs.


In [5]:
import auditok as at
import librosa
import os
import soundfile as sf
import numpy as np

In [6]:
# Sub-directories of `diff_noise_dir`, one per noise recording that is used.
noise_src = [
    "OMEETING",
    "OOFFICE",
    "PRESTO",
    "SPSQUARE",
]
# Clean speech recordings, looked up inside `original_dir`.
clean_src = [
    'podcast1_16_5.wav',
    'podcast2_16_5.wav',
    'podcast3_16_5.wav',
    'podcast4_16_5.wav',
]
# Base directories (kept with a trailing slash because paths are built by
# plain string concatenation).
original_dir = "data/podcast/"
diff_noise_dir = "data/noise/"
# Index into `ch_names` of the next noise channel to use.
noise_idx = 0
# Per-channel file names "/ch01.wav" ... "/ch15.wav"; the leading slash lets
# them be appended directly to a noise directory name.
ch_names = ["/ch{:02d}.wav".format(channel) for channel in range(1, 16)]
# Cache of loaded signals, keyed by clean file name or by noise dir + channel.
data = dict()

In [4]:
# Sampling rate requested from librosa for every file that is loaded.
sr = 16000

# Load each clean recording and cache it under its file name.
for clean_name in clean_src:
    data[clean_name], sr = librosa.load(original_dir + clean_name, sr=sr)
print(data.keys())

# Load every channel of every noise recording, keeping only the leading
# fifth (plus one sample) of each, cached under "<noise dir>/chNN.wav".
for env_name in noise_src:
    for channel_name in ch_names:
        noise_key = env_name + channel_name
        samples, sr = librosa.load(diff_noise_dir + noise_key, sr=sr)
        keep = len(samples) // 5 + 1
        data[noise_key] = samples[:keep]
print(data.keys())

  return f(*args, **kwargs)


FileNotFoundError: [Errno 2] No such file or directory: 'data/in/original/podcast1_16_5.wav'

## 2. Create a copy and add noise

In [4]:
noise_src = ["OMEETING", "OOFFICE", "PRESTO"]
clean_src = ['podcast1_16_5.wav', 'podcast2_16_5.wav', 'podcast3_16_5.wav']
parts_n = 5
sr = 16000


def _mix_with_noise(noise_dirs, filenames, show_paths=False):
    """Write aligned clean/noisy WAV pairs for each noise dir and podcast.

    Every podcast in ``filenames`` is split into ``parts_n`` consecutive
    parts.  Each part is paired with the next unused noise channel of the
    current noise directory, both are cut to the shorter of the two lengths,
    and the pair is written to ``data/in/clean/`` and ``data/in/noisy/``
    under the same file name.

    Args:
        noise_dirs: Noise directory names; keys ``noise_dir + ch`` must exist
            in ``data``.
        filenames: Clean podcast file names; each must be a key of ``data``.
        show_paths: When true, also print the path of every noisy file.

    Raises:
        IndexError: If ``len(filenames) * parts_n`` exceeds ``len(ch_names)``,
            i.e. a noise directory runs out of unused channels.
    """
    for noise_dir in noise_dirs:
        # Channels are consumed in order and never reused within one noise
        # directory; the counter restarts for the next directory.
        noise_idx = 0
        for filename in filenames:
            print(filename)
            podcast_id = filename[len("podcast"):-len("_16_5.wav")]
            for part in np.array_split(data[filename], parts_n):
                ch = ch_names[noise_idx]
                noise_idx += 1
                noise = data[noise_dir + ch]
                lim = min(len(part), len(noise))
                # Take the samples from the current part.  The previous code
                # sliced (and overwrote) the whole recording here, so every
                # part ended up being the opening segment of the podcast.
                clean = part[:lim]
                # NOTE(review): the mix is averaged while the clean target is
                # written at full scale -- confirm this is intended.
                noisy_data = (clean + noise[:lim]) / 2
                out_name = podcast_id + "_" + noise_dir + ch[1:]
                filename_clean_file = "data/in/clean/" + out_name
                filename_noisy_file = "data/in/noisy/" + out_name
                if show_paths:
                    print(filename_noisy_file)
                sf.write(filename_noisy_file, noisy_data, sr, subtype='PCM_16')
                sf.write(filename_clean_file, clean, sr, subtype='PCM_16')


# Podcasts 1-3 are mixed with each of the three noise recordings.
_mix_with_noise(noise_src, clean_src)

# Podcast 4 is mixed only with SPSQUARE, a noise recording not used above.
_mix_with_noise(["SPSQUARE"], ["podcast4_16_5.wav"], show_paths=True)

            

podcast1_16_5.wav
podcast2_16_5.wav
podcast3_16_5.wav
podcast1_16_5.wav
podcast2_16_5.wav
podcast3_16_5.wav
podcast1_16_5.wav
podcast2_16_5.wav
podcast3_16_5.wav
podcast4_16_5.wav
data/in/noisy/4_SPSQUAREch01.wav
data/in/noisy/4_SPSQUAREch02.wav
data/in/noisy/4_SPSQUAREch03.wav
data/in/noisy/4_SPSQUAREch04.wav
data/in/noisy/4_SPSQUAREch05.wav


## 3. Tokenize the audio

In [9]:
parts_n = 5
# NOTE(review): looks like a template for the output file names; it is not
# used anywhere in this cell.
final = "X_0_00_YYYY.wav"
noisy_dir = "data/podcast/noisy/"
clean_dir = "data/podcast/clean/"

# For every clean recording, detect speech regions on the clean signal and
# save each region together with the same time span of the noisy recording
# that has the same file name.
for filename in os.listdir(clean_dir):
    # Skip directory entries that are not WAV recordings (e.g. hidden files).
    if not filename.lower().endswith(".wav"):
        continue
    # NOTE(review): the notebook text describes 3-8 second samples while
    # max_dur is 9 -- confirm which value is intended.
    audio_regions = at.split(
        clean_dir + filename,
        min_dur=3,     # minimum duration of a valid audio event in seconds
        max_dur=9,       # maximum duration of an event
        max_silence=0.3, # maximum duration of tolerated continuous silence within an event
        energy_threshold=55 # threshold of detection
    )
    noisy_region = at.load(noisy_dir + filename)
    for i, clean_sample in enumerate(audio_regions):
        clean_sample.save("data/in/clean/"+str(i)+"_"+filename)
        # Cut the noisy recording at the detected region's own start/end
        # time.  The previous running offset (which also referenced the
        # undefined name `sample`) only summed the lengths of the kept
        # regions, so it ignored the silence dropped between them and the
        # noisy segment would not match the clean one.
        start = clean_sample.meta.start
        end = clean_sample.meta.end
        noisy_sample = noisy_region.seconds[start:end]
        noisy_sample.save("data/in/noisy/"+str(i)+"_"+filename)