In [None]:
!pip install pyworld
!pip install pyroomacoustics
!apt-get install sox

Collecting pyworld
  Downloading pyworld-0.3.5.tar.gz (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.0/261.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: pyworld
  Building wheel for pyworld (pyproject.toml) ... [?25l[?25hdone
  Created wheel for pyworld: filename=pyworld-0.3.5-cp311-cp311-linux_x86_64.whl size=899941 sha256=418b3c821a49241e5ac569395e657a128496aff0239f00fae011efe18aa720d0
  Stored in directory: /root/.cache/pip/wheels/26/f0/db/ebcd5cdfe5ad7d229917d3a8db6f18f0cf40f099bf878e294d
Successfully built pyworld
Installing collected packages: pyworld
Successfully installed pyworld-0.3.5
Collecting pyroomacoustics
  Downloading pyroomacoustics-0.8.3.tar.gz (35.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.1/

In [None]:
import os
import shutil
import zipfile
import subprocess
import numpy as np
import soundfile as sf
import pyworld as pw
import pyroomacoustics as pra
import librosa
from scipy.interpolate import interp1d
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive; the audio source/output paths configured further down live under it.
drive.mount('/content/drive')

Mounted at /content/drive


# PyWorld, SoX, and Noise Injection

In [None]:
# function for adding pseudo-child speakers from original adult speaker (formant & pitch shifting)
def shift_formants_pyworld(audio_path, output_path, formant_shift_ratio, pitch_shift_ratio):
    """Resynthesise a recording with scaled formants and pitch using WORLD (pyworld).

    The audio is decomposed into f0, spectral envelope and aperiodicity, the f0
    track is multiplied by `pitch_shift_ratio`, the spectral envelope is stretched
    along the frequency axis by `formant_shift_ratio`, and the result is
    synthesised and written to `output_path` as float32 samples.

    Args:
        audio_path: path of the audio file to read (loaded at its own sample rate).
        output_path: path of the audio file to write; its parent folder is
            created when missing.
        formant_shift_ratio: frequency scaling of the spectral envelope
            (> 1 moves formants up, 1.0 leaves the envelope unchanged).
        pitch_shift_ratio: multiplier applied to every f0 value.

    Raises:
        ValueError: if `formant_shift_ratio` is not positive.
    """
    if formant_shift_ratio <= 0:
        raise ValueError("formant_shift_ratio must be positive")

    x, fs = librosa.load(audio_path, sr=None)
    x = x.astype(np.float64)  # the WORLD analysis below is run on float64 samples

    _f0, t = pw.harvest(x, fs)
    f0 = pw.stonemask(x, _f0, t, fs)
    sp = pw.cheaptrick(x, f0, t, fs)  # spectral envelope
    ap = pw.d4c(x, f0, t, fs)         # aperiodicity

    f0_shifted = f0 * pitch_shift_ratio   # pitch shift

    # spectral envelope warping to simulate shorter vocal tract (higher formants, i.e., child voices)
    def warp_spectral_envelope(sp, ratio):
        n_frames, n_bins = sp.shape
        freq_axis = np.linspace(0, fs / 2, n_bins)
        # Frequencies at which the original envelope values end up after scaling.
        # The axis is left unclipped so it stays strictly increasing (clipping
        # collapsed the top bins onto fs/2 and distorted the last interval).
        warped_freq_axis = freq_axis * ratio

        warped_sp = np.empty_like(sp)
        for i in range(n_frames):
            # np.interp holds the edge value outside the known range instead of
            # extrapolating, so for ratio < 1 the envelope cannot be driven
            # negative above ratio * fs / 2.
            warped_sp[i] = np.interp(freq_axis, warped_freq_axis, sp[i])
        return warped_sp

    sp_warped = warp_spectral_envelope(sp, formant_shift_ratio)
    y = pw.synthesize(f0_shifted, sp_warped, ap, fs)

    # Create the destination folder if needed; skipped for a bare file name,
    # whose dirname is '' and would make os.makedirs raise.
    out_folder = os.path.dirname(output_path)
    if out_folder:
        os.makedirs(out_folder, exist_ok=True)
    sf.write(output_path, y.astype(np.float32), fs)

In [None]:
# function for time stretching on audio
def sox_time_stretch(input_path, output_path, factor, sample_rate=16000):
    """Change the tempo of an audio file with the SoX command-line tool.

    Runs `sox <input> <output> tempo <factor> rate <sample_rate>`.

    Args:
        input_path: path of the audio file to read.
        output_path: path of the audio file to write; its parent folder is
            created when missing.
        factor: value passed to SoX's `tempo` effect (e.g. 1.1 or 0.9).
        sample_rate: value passed to SoX's `rate` effect; defaults to 16000.

    Raises:
        subprocess.CalledProcessError: if SoX exits with a non-zero status.
    """
    out_folder = os.path.dirname(output_path)
    if out_folder:  # dirname is '' for a bare file name and os.makedirs('') raises
        os.makedirs(out_folder, exist_ok=True)
    subprocess.run(
        ["sox", input_path, output_path, "tempo", str(factor), "rate", str(sample_rate)],
        check=True,
    )

In [None]:
# function for noise injection
def add_noise(y, snr_db):
    """Return `y` plus zero-mean Gaussian noise scaled to a target SNR.

    The noise standard deviation is the RMS of `y` divided by 10**(snr_db / 20),
    and the noise is drawn from NumPy's global random generator with the same
    shape as `y`.

    Args:
        y: NumPy array of samples.
        snr_db: desired signal-to-noise ratio in decibels.

    Returns:
        A new array, `y` with the noise added.
    """
    signal_rms = np.sqrt(np.mean(y**2))
    amplitude_ratio = 10**(snr_db / 20)
    noise_std = signal_rms / amplitude_ratio
    return y + np.random.normal(0, noise_std, y.shape)

# Pyroomacoustics

In [None]:
# function for applying room-like reverb on audio
def reverb(input_dir, output_dir, room_dim, absorption, max_order, source_location, mic_location):
    """Simulate a shoebox room around a recording and save the microphone signal.

    Note: despite their names, `input_dir` and `output_dir` are paths to audio
    FILES (they are handed straight to `sf.read` / `sf.write`).

    Args:
        input_dir: path of the audio file to read.
        output_dir: path of the audio file to write; its parent folder is
            created when missing.
        room_dim: room dimensions passed to `pra.ShoeBox`.
        absorption: wall absorption passed to `pra.ShoeBox`.
        max_order: `max_order` passed to `pra.ShoeBox`.
        source_location: position of the sound source in the room.
        mic_location: microphone position(s) passed to `pra.MicrophoneArray`
            (the callers in this notebook pass a 3x1 array).
    """
    signal, fs = sf.read(input_dir)
    if signal.ndim > 1:
        # soundfile returns (frames, channels) for multi-channel files; average
        # the channels so a single mono source signal is simulated.
        signal = signal.mean(axis=1)

    room = pra.ShoeBox(room_dim, fs=fs, max_order=max_order, absorption=absorption)
    room.add_source(source_location, signal=signal)
    room.add_microphone_array(pra.MicrophoneArray(mic_location, fs=fs))
    room.simulate()
    reverberated_signal = room.mic_array.signals[0]  # keep row 0 of the simulated mic signals

    # Create the destination folder if needed; skipped for a bare file name,
    # whose dirname is '' and would make os.makedirs raise.
    out_folder = os.path.dirname(output_dir)
    if out_folder:
        os.makedirs(out_folder, exist_ok=True)
    sf.write(output_dir, reverberated_signal.astype(np.float32), fs)

In [None]:
# function for applying reverb with noise on audio, simulating real-world classroom settings
def reverb_with_noise(input_dir, output_dir, room_dim, absorption, max_order, source_location, mic_location, snr_db):
    """Simulate a shoebox room around a recording, add Gaussian noise, and save it.

    Same room simulation as `reverb`, followed by `add_noise` on the simulated
    microphone signal at the requested SNR.

    Note: despite their names, `input_dir` and `output_dir` are paths to audio
    FILES (they are handed straight to `sf.read` / `sf.write`).

    Args:
        input_dir: path of the audio file to read.
        output_dir: path of the audio file to write; its parent folder is
            created when missing.
        room_dim: room dimensions passed to `pra.ShoeBox`.
        absorption: wall absorption passed to `pra.ShoeBox`.
        max_order: `max_order` passed to `pra.ShoeBox`.
        source_location: position of the sound source in the room.
        mic_location: microphone position(s) passed to `pra.MicrophoneArray`
            (the callers in this notebook pass a 3x1 array).
        snr_db: signal-to-noise ratio in dB handed to `add_noise`; it is
            measured against the reverberated signal, not the dry input.
    """
    signal, fs = sf.read(input_dir)
    if signal.ndim > 1:
        # soundfile returns (frames, channels) for multi-channel files; average
        # the channels so a single mono source signal is simulated.
        signal = signal.mean(axis=1)

    room = pra.ShoeBox(room_dim, fs=fs, max_order=max_order, absorption=absorption)
    room.add_source(source_location, signal=signal)
    room.add_microphone_array(pra.MicrophoneArray(mic_location, fs=fs))
    room.simulate()
    reverberated_signal = room.mic_array.signals[0]  # keep row 0 of the simulated mic signals
    y_noise = add_noise(reverberated_signal, snr_db=snr_db)

    # Create the destination folder if needed; skipped for a bare file name,
    # whose dirname is '' and would make os.makedirs raise.
    out_folder = os.path.dirname(output_dir)
    if out_folder:
        os.makedirs(out_folder, exist_ok=True)
    sf.write(output_dir, y_noise.astype(np.float32), fs)

# AUGMENTATION

In [None]:
# SPEAKER/AUGMENT GUIDELINES
'''
ORIGINAL SPEAKER:
BR01 - Female adult voice

PSEUDO-SPEAKERS:
BR02 - Main child voice (male)
BR03 - Slightly older child voice
BR04 - Female child voice

AUGMENTS (FEMALE ADULT):
XXX-1 = ORIGINAL
XXX-2 = +1 semitone (female adult)
XXX-3 = 1.1x + 20dB noise
XXX-4 = 0.9x
XXX-5 = noise (15dB)
XXX-6 = reverb 1
XXX-7 = noise (25dB) + reverb 2

AUGMENTS (PSEUDO-CHILD):
XXX-1 = ORIGINAL
XXX-2 = pitch +2.4 semitones
XXX-3 = 1.1x + 20dB noise
XXX-4 = 1.2x
XXX-5 = 0.95x + pitch +2.4 semitones
XXX-6 = noise (15dB)
XXX-7 = reverb 1
XXX-8 = noise (20dB) + reverb 2
XXX-9 = noise (20dB) + reverb 3

TOTAL HOURS (hh:mm:ss):
 - FEMALE ADULT = 06:18:24
 - CHILD VOICE = 07:55:22 each (23:46:06 total)
 - TOTAL = 30:04:30

*Reverb Simulations:
  - Reverb 1: Simulates well-treated room
  - Reverb 2: Simulates room with thin walls, with speaker being near the wall
  - Reverb 3: Simulates old room with strong echo/reverb
'''

In [None]:
# CREATE PSEUDO SPEAKERS FROM ORIGINAL (CHILD 1-3)
# Runs shift_formants_pyworld on every .wav file of the original speaker and writes the
# result to `out_path`, with 'BR01' in the file name replaced by `speaker_id` and the
# '_orig.wav' suffix replaced by '.wav'.
wav_path = '/content/drive/path/to/audio/source' # directory of original speaker
out_path = '/content/drive/path/to/audio/output'
num_files = 0

# Settings of the pseudo speaker to generate -- change all three together:
#   CHILD 1 (BR02): formant_shift_ratio=1.21,  pitch_shift_ratio=1.315
#   CHILD 2 (BR03): formant_shift_ratio=1.1,   pitch_shift_ratio=1.2
#   CHILD 3 (BR04): formant_shift_ratio=1.305, pitch_shift_ratio=1.425
speaker_id = 'BR0X' # change depending on speaker id
formant_shift_ratio = 1.21
pitch_shift_ratio = 1.315

# Make sure the destination exists before the first file is written.
os.makedirs(out_path, exist_ok=True)

for wav_file in sorted(os.listdir(wav_path)):
  if wav_file.endswith('.wav'):
    wav_file_path = os.path.join(wav_path, wav_file)
    out_file_name = wav_file.replace('BR01', speaker_id)
    out_file = out_file_name.replace('_orig.wav', '.wav')
    out_file_path = os.path.join(out_path, out_file)
    shift_formants_pyworld(wav_file_path, out_file_path, formant_shift_ratio=formant_shift_ratio, pitch_shift_ratio=pitch_shift_ratio)

    num_files += 1
    print(f"{wav_file} - Processed {num_files} files")

## Augment pseudo-child speakers

In [None]:
# AUGMENT PSEUDO-CHILD SPEAKERS
# For every pseudo-child file "<name>-1.wav" in `out_path`, writes the augments
# "<name>-2.wav" ... "<name>-9.wav" into `out_path_augment`.
wav_path_augment = out_path # directory containing pseudo speakers
out_path_augment = f"{wav_path_augment}/BR0X_augments" # change depending on speaker id
tmp_path_augment = '/content/temp' # scratch folder for intermediate files
num_files = 0

# Create the output and scratch folders up front so the first write cannot fail
# on a missing directory.
os.makedirs(out_path_augment, exist_ok=True)
os.makedirs(tmp_path_augment, exist_ok=True)

for wav_file in sorted(os.listdir(wav_path_augment)):
  # Only "-1.wav" files are sources: for any other name the replace() calls below
  # would be no-ops and every augment would overwrite the same output file.
  if wav_file.endswith('-1.wav'):
    wav_file_path = os.path.join(wav_path_augment, wav_file)

    # 2 (pitch +2.4 semitones)
    out_file = wav_file.replace('-1.wav', '-2.wav')
    out_file_path = os.path.join(out_path_augment, out_file)
    shift_formants_pyworld(wav_file_path, out_file_path, formant_shift_ratio=1.0, pitch_shift_ratio=1.15)
    num_files += 1
    print(f"{wav_file} - Processed {num_files} files")

    # 3 (1.1x + 20dB noise)
    tmp_file = wav_file.replace('-1.wav', '-tmp1.wav')
    tmp_file_path = os.path.join(tmp_path_augment, tmp_file)
    sox_time_stretch(wav_file_path, tmp_file_path, 1.1)
    out_file = wav_file.replace('-1.wav', '-3.wav')
    out_file_path = os.path.join(out_path_augment, out_file)
    signal_tmp, fs = sf.read(tmp_file_path)
    y_noise = add_noise(signal_tmp, snr_db=20)
    sf.write(out_file_path, y_noise, fs)
    os.remove(tmp_file_path)  # the intermediate file is no longer needed
    num_files += 1
    print(f"{wav_file} - Processed {num_files} files")

    # 4 (1.2x)
    out_file = wav_file.replace('-1.wav', '-4.wav')
    out_file_path = os.path.join(out_path_augment, out_file)
    sox_time_stretch(wav_file_path, out_file_path, 1.2)
    num_files += 1
    print(f"{wav_file} - Processed {num_files} files")

    # 5 (0.95x + pitch +2.4 semitones)
    tmp_file = wav_file.replace('-1.wav', '-tmp2.wav')
    tmp_file_path = os.path.join(tmp_path_augment, tmp_file)
    shift_formants_pyworld(wav_file_path, tmp_file_path, formant_shift_ratio=1.0, pitch_shift_ratio=1.15)
    out_file = wav_file.replace('-1.wav', '-5.wav')
    out_file_path = os.path.join(out_path_augment, out_file)
    sox_time_stretch(tmp_file_path, out_file_path, 0.95)
    os.remove(tmp_file_path)  # the intermediate file is no longer needed
    num_files += 1
    print(f"{wav_file} - Processed {num_files} files")

    # 6 (noise (15dB))
    out_file = wav_file.replace('-1.wav', '-6.wav')
    out_file_path = os.path.join(out_path_augment, out_file)
    signal_orig, fs = sf.read(wav_file_path)
    y_noise = add_noise(signal_orig, snr_db=15)
    sf.write(out_file_path, y_noise, fs)
    num_files += 1
    print(f"{wav_file} - Processed {num_files} files")

    # 7 (reverb 1)
    out_file = wav_file.replace('-1.wav', '-7.wav')
    out_file_path = os.path.join(out_path_augment, out_file)
    reverb(wav_file_path, out_file_path, room_dim=[6.5, 5, 3], absorption=0.4, max_order=11, source_location=[1, 1, 1.2], mic_location=np.array([[3], [2], [1.2]]))
    num_files += 1
    print(f"{wav_file} - Processed {num_files} files")

    # 8 (noise (20dB) + reverb 2)
    out_file = wav_file.replace('-1.wav', '-8.wav')
    out_file_path = os.path.join(out_path_augment, out_file)
    reverb_with_noise(wav_file_path, out_file_path, room_dim=[8, 7, 3], absorption=0.3, max_order=9, source_location=[1, 1, 1.2], mic_location=np.array([[5.5], [3], [1.2]]), snr_db=20)
    num_files += 1
    print(f"{wav_file} - Processed {num_files} files")

    # 9 (noise (20dB) + reverb 3)
    out_file = wav_file.replace('-1.wav', '-9.wav')
    out_file_path = os.path.join(out_path_augment, out_file)
    reverb_with_noise(wav_file_path, out_file_path, room_dim=[6, 5, 2.8], absorption=0.25, max_order=12, source_location=[2, 3, 1.2], mic_location=np.array([[5], [2], [1.2]]), snr_db=20)
    num_files += 1
    print(f"{wav_file} - Processed {num_files} files")

## Augment female adult speaker

In [None]:
# AUGMENT FEMALE ADULT SPEAKER
# For every original file "<name>-1_orig.wav" in `wav_path`, writes the augments
# "<name>-2.wav" ... "<name>-7.wav" into `out_path_augment`.
wav_path_augment = wav_path # directory of original speaker
out_path_augment = f"{wav_path_augment}/BR01_augments"
tmp_path_augment = '/content/temp' # scratch folder for intermediate files
num_files = 0

# Create the output and scratch folders up front so the first write cannot fail
# on a missing directory.
os.makedirs(out_path_augment, exist_ok=True)
os.makedirs(tmp_path_augment, exist_ok=True)

for wav_file in sorted(os.listdir(wav_path_augment)):
  # Only "-1_orig.wav" files are sources: for any other name the replace() calls
  # below would be no-ops and every augment would overwrite the same output file.
  if wav_file.endswith('-1_orig.wav'):
    wav_file_path = os.path.join(wav_path_augment, wav_file)

    # 2 (+1 semitone)
    out_file = wav_file.replace('-1_orig.wav', '-2.wav')
    out_file_path = os.path.join(out_path_augment, out_file)
    shift_formants_pyworld(wav_file_path, out_file_path, formant_shift_ratio=1.0, pitch_shift_ratio=1.059)
    num_files += 1
    print(f"{wav_file} - Processed {num_files} files")

    # 3 (1.1x + 20dB noise)
    tmp_file = wav_file.replace('-1_orig.wav', '-tmp1.wav')
    tmp_file_path = os.path.join(tmp_path_augment, tmp_file)
    sox_time_stretch(wav_file_path, tmp_file_path, 1.1)
    out_file = wav_file.replace('-1_orig.wav', '-3.wav')
    out_file_path = os.path.join(out_path_augment, out_file)
    signal_tmp, fs = sf.read(tmp_file_path)
    y_noise = add_noise(signal_tmp, snr_db=20)
    sf.write(out_file_path, y_noise, fs)
    os.remove(tmp_file_path)  # the intermediate file is no longer needed
    num_files += 1
    print(f"{wav_file} - Processed {num_files} files")

    # 4 (0.9x)
    out_file = wav_file.replace('-1_orig.wav', '-4.wav')
    out_file_path = os.path.join(out_path_augment, out_file)
    sox_time_stretch(wav_file_path, out_file_path, 0.9)
    num_files += 1
    print(f"{wav_file} - Processed {num_files} files")

    # 5 (noise (15dB))
    out_file = wav_file.replace('-1_orig.wav', '-5.wav')
    out_file_path = os.path.join(out_path_augment, out_file)
    signal_orig, fs = sf.read(wav_file_path)
    y_noise = add_noise(signal_orig, snr_db=15)
    sf.write(out_file_path, y_noise, fs)
    num_files += 1
    print(f"{wav_file} - Processed {num_files} files")

    # 6 (reverb 1)
    out_file = wav_file.replace('-1_orig.wav', '-6.wav')
    out_file_path = os.path.join(out_path_augment, out_file)
    reverb(wav_file_path, out_file_path, room_dim=[6.5, 5, 3], absorption=0.4, max_order=11, source_location=[1, 1, 1.2], mic_location=np.array([[3], [2], [1.2]]))
    num_files += 1
    print(f"{wav_file} - Processed {num_files} files")

    # 7 (noise (25dB) + reverb 2)
    out_file = wav_file.replace('-1_orig.wav', '-7.wav')
    out_file_path = os.path.join(out_path_augment, out_file)
    reverb_with_noise(wav_file_path, out_file_path, room_dim=[8, 7, 3], absorption=0.3, max_order=9, source_location=[1, 1, 1.2], mic_location=np.array([[5.5], [3], [1.2]]), snr_db=25)
    num_files += 1
    print(f"{wav_file} - Processed {num_files} files")

In [None]:
# Copy each unmodified original ("..._orig.wav") next to its augments, renamed from
# "-1_orig.wav" to "-1.wav".
wav_path_augment = wav_path # directory of original speaker
out_path_augment = f"{wav_path_augment}/BR01_augments"

# shutil.copy does not create missing folders, so make sure the destination exists.
os.makedirs(out_path_augment, exist_ok=True)

for wav_file in sorted(os.listdir(wav_path_augment)):
  if wav_file.endswith('_orig.wav'):
    wav_file_path = os.path.join(wav_path_augment, wav_file)
    out_file = wav_file.replace('-1_orig.wav', '-1.wav')
    out_file_path = os.path.join(out_path_augment, out_file)
    shutil.copy(wav_file_path, out_file_path)