## Preprocessing of the training and evaluation data

The audio recordings come from Spanish-language YouTube podcasts. The noise is taken from the DEMAND dataset. The list of sources is available in the `sources.txt` file.

The preprocessing consists of four steps:

1. Downsample each audio recording from 44.1 kHz WAV to 16 kHz WAV and convert it to a single channel. This loses some quality but reduces the training time, as recommended in the paper. The first and last minute of each recording are removed because they contain music.

    `ffmpeg -ss 60 -i input -ar 16000 -ac 1 output`

2. Create a copy of each audio recording and add noise.
    - The noise is taken from the DEMAND dataset, as recommended in the paper.
3. Tokenize the audio on silence into samples 3-8 seconds long.
4. Split the data into training and evaluation sets and create (clean, noisy) pairs.


In [1]:
import auditok as at
import librosa
import os
import soundfile as sf
import numpy as np

In [2]:
# Noise environments (sub-directories of `diff_noise_dir`) used for mixing.
noise_src = ["OMEETING", "OOFFICE", "PRESTO", "SPSQUARE"]
# Clean speech recordings, numbered 1 to 4.
clean_src = ["podcast{}_16_5.wav".format(n) for n in range(1, 5)]

# Input locations; both end with "/" because paths are built by concatenation.
original_dir = "data/in/original/"
diff_noise_dir = "data/noise/"

# Position of the next noise channel to take from `ch_names`.
noise_idx = 0
# Channel file names "/ch01.wav" .. "/ch15.wav" (leading slash included).
ch_names = ["/ch{:02d}.wav".format(i) for i in range(1, 16)]

# In-memory store of loaded audio, keyed by file name.
data = {}

In [3]:
# Sampling rate (Hz) passed to librosa.load for every file.
sr = 16000

# Clean speech: each podcast is stored whole under its file name.
for filename in clean_src:
    clean, sr = librosa.load(original_dir + filename, sr=sr)
    data[filename] = clean
print(data.keys())

# Noise: every (environment, channel) recording is cut down to its first fifth
# (plus one sample) and stored under the key "<environment>/chNN.wav".
channel_keys = [(env, channel) for env in noise_src for channel in ch_names]
for noise_dir, ch in channel_keys:
    noise_key = noise_dir + ch
    noise, sr = librosa.load(diff_noise_dir + noise_key, sr=sr)
    fifth = len(noise) // 5 + 1
    noise = noise[:fifth]
    data[noise_key] = noise
print(data.keys())

dict_keys(['podcast1_16_5.wav', 'podcast2_16_5.wav', 'podcast3_16_5.wav', 'podcast4_16_5.wav'])
dict_keys(['podcast1_16_5.wav', 'podcast2_16_5.wav', 'podcast3_16_5.wav', 'podcast4_16_5.wav', 'OMEETING/ch01.wav', 'OMEETING/ch02.wav', 'OMEETING/ch03.wav', 'OMEETING/ch04.wav', 'OMEETING/ch05.wav', 'OMEETING/ch06.wav', 'OMEETING/ch07.wav', 'OMEETING/ch08.wav', 'OMEETING/ch09.wav', 'OMEETING/ch10.wav', 'OMEETING/ch11.wav', 'OMEETING/ch12.wav', 'OMEETING/ch13.wav', 'OMEETING/ch14.wav', 'OMEETING/ch15.wav', 'OOFFICE/ch01.wav', 'OOFFICE/ch02.wav', 'OOFFICE/ch03.wav', 'OOFFICE/ch04.wav', 'OOFFICE/ch05.wav', 'OOFFICE/ch06.wav', 'OOFFICE/ch07.wav', 'OOFFICE/ch08.wav', 'OOFFICE/ch09.wav', 'OOFFICE/ch10.wav', 'OOFFICE/ch11.wav', 'OOFFICE/ch12.wav', 'OOFFICE/ch13.wav', 'OOFFICE/ch14.wav', 'OOFFICE/ch15.wav', 'PRESTO/ch01.wav', 'PRESTO/ch02.wav', 'PRESTO/ch03.wav', 'PRESTO/ch04.wav', 'PRESTO/ch05.wav', 'PRESTO/ch06.wav', 'PRESTO/ch07.wav', 'PRESTO/ch08.wav', 'PRESTO/ch09.wav', 'PRESTO/ch10.wav', 'PRE

In [4]:
# Mix every podcast with background noise and write (clean, noisy) file pairs.
#
# Each podcast is cut into `parts_n` consecutive chunks; every chunk is mixed
# with a different noise channel of one environment. The clean chunk goes to
# data/in/clean/ and the mixture to data/in/noisy/ under the same file name,
# "<podcast number>_<environment>chNN.wav".
#
# Three environments are mixed into podcasts 1-3; SPSQUARE is mixed only into
# podcast 4 (the extra plan entry below).
noise_src = ["OMEETING", "OOFFICE", "PRESTO"]
clean_src = ['podcast1_16_5.wav', 'podcast2_16_5.wav', 'podcast3_16_5.wav']
parts_n = 5  # number of chunks each podcast is cut into
sr = 16000   # sampling rate (Hz) written to the output files

# (noise environment, podcasts to mix it into, print each noisy path?)
mix_plan = [(env, clean_src, False) for env in noise_src]
mix_plan.append(("SPSQUARE", ["podcast4_16_5.wav"], True))

for noise_dir, filenames, show_paths in mix_plan:
    # Channels are consumed in order across all podcasts of one environment,
    # so every chunk of that environment gets a different channel.
    noise_idx = 0
    for filename in filenames:
        print(filename)
        # "podcast3_16_5.wav" -> "3"
        podcast_id = filename[len("podcast"):-len("_16_5.wav")]
        parts = np.array_split(data[filename], parts_n)
        for part in parts:
            ch = ch_names[noise_idx]
            noise_idx += 1
            noise = data[noise_dir + ch]

            # Trim chunk and noise to the shorter of the two so they can be added.
            lim = min(len(part), len(noise))
            # Slice the *current chunk*. The previous code sliced (and
            # overwrote) the whole podcast, so every chunk was the same
            # leading segment of the recording.
            clean = part[:lim]
            noise = noise[:lim]

            # NOTE(review): averaging halves the speech level in the noisy file
            # relative to the clean target saved below - confirm this is intended.
            noisy_data = (clean + noise) / 2

            out_name = podcast_id + "_" + noise_dir + ch[1:]  # ch[1:] drops the leading "/"
            filename_clean_file = "data/in/clean/" + out_name
            filename_noisy_file = "data/in/noisy/" + out_name
            if show_paths:
                print(filename_noisy_file)
            sf.write(filename_noisy_file, noisy_data, sr, subtype='PCM_16')
            sf.write(filename_clean_file, clean, sr, subtype='PCM_16')

            

podcast1_16_5.wav
podcast2_16_5.wav
podcast3_16_5.wav
podcast1_16_5.wav
podcast2_16_5.wav
podcast3_16_5.wav
podcast1_16_5.wav
podcast2_16_5.wav
podcast3_16_5.wav
podcast4_16_5.wav
data/in/noisy/4_SPSQUAREch01.wav
data/in/noisy/4_SPSQUAREch02.wav
data/in/noisy/4_SPSQUAREch03.wav
data/in/noisy/4_SPSQUAREch04.wav
data/in/noisy/4_SPSQUAREch05.wav


In [5]:
# Load one of the generated noisy files with auditok; the call returns an
# AudioRegion object.
region = at.load(
    "data/in/noisy/2_OOFFICEch06.wav",
    sr=sr,
)

In [6]:
parts_n = 5
final = "X_0_00_YYYY.wav"
noisy_dir = "data/in/noisy/"
clean_dir = "data/in/clean/"

# Cut every file of both directories into events on silence and save each
# event as "<event index>_<original file name>" in the matching split directory.
#
# NOTE(review): noisy and clean files are split independently with the same
# parameters, so event i of a noisy file is not guaranteed to cover the same
# time span as event i of its clean counterpart - verify before pairing them.
# NOTE(review): the notes at the top mention 3-8 second samples, but max_dur
# is 9 here - confirm which one is intended.
split_jobs = (
    # (source directory, destination directory, print each source path?)
    (noisy_dir, "data/in/split/noisy/", True),
    (clean_dir, "data/in/split/clean/", False),
)
for src_dir, dst_dir, echo in split_jobs:
    for filename in os.listdir(src_dir):
        src_path = src_dir + filename
        if echo:
            print(src_path)
        audio_regions = at.split(
            src_path,
            min_dur=3,           # minimum duration of a valid audio event in seconds
            max_dur=9,           # maximum duration of an event
            max_silence=0.3,     # maximum duration of tolerated continuous silence within an event
            energy_threshold=55  # threshold of detection
        )
        for i, sample in enumerate(audio_regions):
            sample.save(dst_dir + str(i) + "_" + filename)


data/in/noisy/2_OOFFICEch06.wav
data/in/noisy/4_SPSQUAREch02.wav
data/in/noisy/1_OMEETINGch01.wav
data/in/noisy/2_OMEETINGch08.wav
data/in/noisy/3_PRESTOch15.wav
data/in/noisy/3_OMEETINGch12.wav
data/in/noisy/1_OOFFICEch05.wav
data/in/noisy/4_SPSQUAREch03.wav
data/in/noisy/1_OMEETINGch03.wav
data/in/noisy/1_PRESTOch05.wav
data/in/noisy/2_OOFFICEch07.wav
data/in/noisy/3_OOFFICEch15.wav
data/in/noisy/2_OOFFICEch09.wav
data/in/noisy/2_PRESTOch08.wav
data/in/noisy/2_OOFFICEch08.wav
data/in/noisy/1_OOFFICEch02.wav
data/in/noisy/1_OOFFICEch03.wav
data/in/noisy/1_PRESTOch03.wav
data/in/noisy/3_OMEETINGch15.wav
data/in/noisy/1_PRESTOch04.wav
data/in/noisy/3_PRESTOch12.wav
data/in/noisy/3_PRESTOch11.wav
data/in/noisy/3_OMEETINGch14.wav
data/in/noisy/2_PRESTOch07.wav
data/in/noisy/2_OOFFICEch10.wav
data/in/noisy/2_PRESTOch10.wav
data/in/noisy/1_PRESTOch02.wav
data/in/noisy/2_PRESTOch09.wav
data/in/noisy/3_OOFFICEch13.wav
data/in/noisy/2_OMEETINGch06.wav
data/in/noisy/2_OMEETINGch10.wav
data/in/n