In [None]:
# Central settings for the whole notebook; later cells add derived entries
# and replace the relative directory entries with absolute paths.
config = {
    # --- wake word ---
    "target_phrase": 'hey jar_vis',
    "target_phrase_as_written": 'hey jarvis',       # used to generate adversarial phrases
    "model_name": 'hey_jarvis',
    "custom_negative_phrases": ['hey', 'jar', 'hey jar', 'hey Johnny', 'hey Jacky', 'Beavis'],

    # --- input / output locations ---
    "samples_output_dir": 'generated_samples',
    "features_output_dir": 'training_features/hey_jarvis',
    "rirs_dir": 'BUT_ReverbDB_rel_19_06_RIR-Only',  # directory containing Room Impulse Response files
    "rirs_glob": "**/RIR/*.wav",                    # glob selecting the RIR wav files inside rirs_dir
    "audioset_clips_dir": 'audioset_16k',           # directory containing converted Audioset wav files
    "fma_clips_dir": 'fma',                         # directory containing converted FreeMusicArchive wav files
    "fsd_clips_dir": 'fsd',                         # directory containing converted FSD50K wav files

    # --- generation and augmentation sizes ---
    "n_samples": 200000,                            # number of training samples to generate
    "n_samples_val": 20000,                         # number of testing and validation samples to generate
    "tts_batch_size": 20,
    "augment_batch_size": 16,
    "clip_duration_ms": 1430,                       # generated clips longer than this are ignored when augmenting
    "spectrogram_duration_ms": 1490,                # duration of the spectrogram (usually clip_duration_ms + end_jitter_ms)
    "sample_rate_hz": 16000,
    "end_jitter_ms": 60,                            # augmented clips have up to this amount of blank noise at the end of the clip
}


import os

# Resolve the samples root once so later cells work regardless of the
# current working directory.
config["samples_output_dir"] = os.path.abspath(config["samples_output_dir"])

# Per-model folder that holds the six generated-clip splits below.
models_samples_directory = os.path.join(config["samples_output_dir"], config["model_name"])

# makedirs creates the samples root and the model folder in one call, and
# exist_ok makes re-running the cell a no-op instead of a check-then-create race.
os.makedirs(models_samples_directory, exist_ok=True)

# Split folders; they are only named here and created by the generation cell.
positive_train_output_dir = os.path.join(models_samples_directory, "positive_train")
positive_test_output_dir = os.path.join(models_samples_directory, "positive_test")
positive_validation_output_dir = os.path.join(models_samples_directory, "positive_validation")
negative_train_output_dir = os.path.join(models_samples_directory, "negative_train")
negative_test_output_dir = os.path.join(models_samples_directory, "negative_test")
negative_validation_output_dir = os.path.join(models_samples_directory, "negative_validation")

import tensorflow as tf
from tensorflow.lite.experimental.microfrontend.python.ops import audio_microfrontend_op as frontend_op 

def generate_features_for_clip(clip):
    """Run the TensorFlow audio micro-frontend on one clip and return scaled features.

    Args:
        clip: audio samples convertible with ``tf.convert_to_tensor``. A later
            cell in this notebook passes int16 PCM arrays; the sample rate is
            fixed at 16000 here and is not read from ``config["sample_rate_hz"]``.

    Returns:
        The frontend output multiplied by 0.0390625, as a NumPy array.
    """
    frontend_settings = dict(
        sample_rate=16000,
        window_size=30,
        window_step=20,
        num_channels=40,
        upper_band_limit=7500,
        lower_band_limit=125,
        enable_pcan=True,
        min_signal_remaining=0.05,
        out_scale=1,
        out_type=tf.float32,
    )
    raw_features = frontend_op.audio_microfrontend(
        tf.convert_to_tensor(clip), **frontend_settings)

    # NOTE(review): 0.0390625 == 10/256; presumably matches the scaling the
    # trained model expects at inference time - confirm before changing.
    scaled_features = tf.multiply(raw_features, 0.0390625)
    return scaled_features.numpy()

def features_generator(generator):
    """Yield spectrogram features for every clip of every batch in ``generator``.

    Args:
        generator: iterable of batches, each batch itself an iterable of clips.

    Yields:
        The result of ``generate_features_for_clip`` for each clip, in order.
    """
    yield from (
        generate_features_for_clip(single_clip)
        for batch in generator
        for single_clip in batch
    )

In [None]:
# Generate positive and negative samples

import os
import uuid
import torch
import logging
import sys

# Make the cloned piper-sample-generator repository importable.
if "piper-sample-generator/" not in sys.path:
    sys.path.append("piper-sample-generator/")
from generate_samples import generate_samples

# Make openWakeWord's data helpers importable. The guard must test the exact
# string that is appended, otherwise every re-run of the cell adds a duplicate
# sys.path entry.
if "./openwakeword/openwakeword" not in sys.path:
    sys.path.append('./openwakeword/openwakeword')
from data import generate_adversarial_texts

def _positive_text():
    """Return the phrase that is synthesized for positive clips."""
    return config["target_phrase"]


def _adversarial_texts(n_texts):
    """Build the text list for one negative split.

    Starts from a copy of ``config["custom_negative_phrases"]`` so the config
    list is not mutated and phrases generated for one split do not leak into
    the next one, then appends about ``n_texts`` generated adversarial phrases.

    The adversarial phrases are derived from ``config["target_phrase_as_written"]``.
    ``config["target_phrase"]`` is a single string, so looping over it would
    pass one character at a time to ``generate_adversarial_texts``.
    """
    texts = list(config["custom_negative_phrases"])
    phrases = config["target_phrase_as_written"]
    if isinstance(phrases, str):
        phrases = [phrases]  # also accept a list of phrases
    for phrase in phrases:
        texts.extend(generate_adversarial_texts(
            input_text=phrase,
            N=n_texts//len(phrases),
            include_partial_phrase=1.0,
            include_input_words=0.2))
    return texts


def _generate_split(output_dir, n_target, make_text, batch_size, noise_scale,
                    skip_label, unique_file_names=False):
    """Synthesize clips into ``output_dir`` until it holds about ``n_target`` files.

    Args:
        output_dir: folder for the clips; created when missing.
        n_target: number of clips wanted in the folder.
        make_text: zero-argument callable returning the text(s) to synthesize;
            only called when generation actually runs.
        batch_size: batch size handed to ``generate_samples``.
        noise_scale: value used for both ``noise_scales`` and ``noise_scale_ws``.
        skip_label: split description used in the "skipping" warning.
        unique_file_names: when True, pass uuid-based file names to
            ``generate_samples`` (used for the training splits).
    """
    os.makedirs(output_dir, exist_ok=True)
    n_current_samples = len(os.listdir(output_dir))

    # A folder that is already more than 95% full counts as done.
    if n_current_samples > 0.95*n_target:
        logging.warning(f"Skipping generation of {skip_label}, as ~{n_target} already exist")
        return

    extra_kwargs = {}
    if unique_file_names:
        extra_kwargs["file_names"] = [uuid.uuid4().hex + ".wav" for _ in range(n_target)]

    generate_samples(
        text=make_text(), max_samples=n_target-n_current_samples,
        batch_size=batch_size,
        noise_scales=[noise_scale], noise_scale_ws=[noise_scale], length_scales=[0.75, 1.0, 1.25],
        output_dir=output_dir, auto_reduce_batch_size=True,
        **extra_kwargs
    )
    torch.cuda.empty_cache()


_positive_batch_size = config["tts_batch_size"]
# Negative texts use a smaller batch; never let the integer division reach 0.
_negative_batch_size = max(1, config["tts_batch_size"]//7)

# Generate positive clips for training
_generate_split(positive_train_output_dir, config["n_samples"], _positive_text,
                _positive_batch_size, 0.98, "positive clips for training",
                unique_file_names=True)

# Generate positive clips for testing
logging.info("#"*50 + "\nGenerating positive clips for testing\n" + "#"*50)
_generate_split(positive_test_output_dir, config["n_samples_val"], _positive_text,
                _positive_batch_size, 1.0, "positive clips testing")

# Generate positive clips for validation
logging.info("#"*50 + "\nGenerating positive clips for validation\n" + "#"*50)
_generate_split(positive_validation_output_dir, config["n_samples_val"], _positive_text,
                _positive_batch_size, 1.0, "positive clips validation")

# Generate adversarial negative clips for training
logging.info("#"*50 + "\nGenerating negative clips for training\n" + "#"*50)
_generate_split(negative_train_output_dir, config["n_samples"],
                lambda: _adversarial_texts(config["n_samples"]),
                _negative_batch_size, 0.98, "negative clips for training",
                unique_file_names=True)

# Generate adversarial negative clips for testing
logging.info("#"*50 + "\nGenerating negative clips for testing\n" + "#"*50)
_generate_split(negative_test_output_dir, config["n_samples_val"],
                lambda: _adversarial_texts(config["n_samples_val"]),
                _negative_batch_size, 1.0, "negative clips for testing")

# Generate adversarial negative clips for validation
logging.info("#"*50 + "\nGenerating negative clips for validation\n" + "#"*50)
_generate_split(negative_validation_output_dir, config["n_samples_val"],
                lambda: _adversarial_texts(config["n_samples_val"]),
                _negative_batch_size, 1.0, "negative clips for validation")



In [None]:
# Augment generated samples by adding background noise and applying room impulse responses

import sys
import os
from pathlib import Path
from tqdm import tqdm
from mmap_ninja.ragged import RaggedMmap

# Make openWakeWord's data helpers importable. The guard tests the exact
# string that is appended so re-running the cell does not add duplicates.
if "./openwakeword/openwakeword" not in sys.path:
    sys.path.append('./openwakeword/openwakeword')
from data import augment_clips, filter_audio_paths

# Resolve every data directory to an absolute path.
for _dir_key in ("rirs_dir", "audioset_clips_dir", "fma_clips_dir", "fsd_clips_dir", "features_output_dir"):
    config[_dir_key] = os.path.abspath(config[_dir_key])

# Spectrogram length in samples: ms * (samples/s) / (1000 ms/s).
config["audio_samples_per_clip"] = int((config["spectrogram_duration_ms"])*config["sample_rate_hz"]/1000)

# features_output_dir is a nested path by default ('training_features/<model>'),
# so the parent folder may not exist yet; plain os.mkdir would fail in that case.
os.makedirs(config["features_output_dir"], exist_ok=True)

max_duration_sec = config["clip_duration_ms"]/1000.0
spectrogram_duration_sec = config["spectrogram_duration_ms"]/1000.0
jitter_s = config["end_jitter_ms"]/1000.0


def _clips_up_to_max_duration(directory):
    """Return ``filter_audio_paths``' (paths, durations) for clips in ``directory`` no longer than ``max_duration_sec``."""
    return filter_audio_paths([directory], min_length_secs=0.0, max_length_secs=max_duration_sec, duration_method="size")


# Only the path lists are used below; `durations` is overwritten by each call.
positive_clips_train, durations = _clips_up_to_max_duration(positive_train_output_dir)
positive_clips_test, durations = _clips_up_to_max_duration(positive_test_output_dir)
positive_clips_validation, durations = _clips_up_to_max_duration(positive_validation_output_dir)

negative_clips_train, durations = _clips_up_to_max_duration(negative_train_output_dir)
negative_clips_test, durations = _clips_up_to_max_duration(negative_test_output_dir)
negative_clips_validation, durations = _clips_up_to_max_duration(negative_validation_output_dir)

# Room impulse response files selected by the configured glob.
rir_paths = [str(i) for i in Path(config["rirs_dir"]).glob(config["rirs_glob"])]

def _augmented_batches(clip_paths):
    """Return an ``augment_clips`` generator over ``clip_paths`` using the notebook's shared settings."""
    return augment_clips(clip_paths,
                         total_length=config["audio_samples_per_clip"],
                         batch_size=config["augment_batch_size"],
                         background_clip_paths=[config["audioset_clips_dir"], config["fma_clips_dir"], config["fsd_clips_dir"]],
                         RIR_paths=rir_paths, end_jitter=jitter_s)


# Feature folder layout: <features_output_dir>/<split>/<label>/batch_<i>_mmap
augmented_training_clips_directory = os.path.join(config["features_output_dir"], "training")
augmented_testing_clips_directory = os.path.join(config["features_output_dir"], "testing")
augmented_validation_clips_directory = os.path.join(config["features_output_dir"], "validation")

augmented_positive_train_directory = os.path.join(augmented_training_clips_directory, "wakeword")
augmented_positive_test_directory = os.path.join(augmented_testing_clips_directory, "wakeword")
augmented_positive_validation_directory = os.path.join(augmented_validation_clips_directory, "wakeword")

# The "unknown" folders are also used by the background-noise feature cell.
augmented_negative_train_directory = os.path.join(augmented_training_clips_directory, "unknown")
augmented_negative_test_directory = os.path.join(augmented_testing_clips_directory, "unknown")
augmented_negative_validation_directory = os.path.join(augmented_validation_clips_directory, "unknown")

# (source clip paths, destination label folder), processed in this order.
_augmentation_jobs = [
    (positive_clips_train, augmented_positive_train_directory),
    (positive_clips_test, augmented_positive_test_directory),
    (positive_clips_validation, augmented_positive_validation_directory),
    (negative_clips_train, augmented_negative_train_directory),
    (negative_clips_test, augmented_negative_test_directory),
    (negative_clips_validation, augmented_negative_validation_directory),
]

# One augmentation pass; `i` only numbers the output batch folder.
for i in range(0,1):
    for clip_paths, output_directory in _augmentation_jobs:
        # Creates the split folder and the label folder when missing.
        os.makedirs(output_directory, exist_ok=True)

        batch_directory = os.path.join(output_directory, 'batch_'+str(i)+'_mmap')

        RaggedMmap.from_generator(out_dir=batch_directory,
                                  sample_generator=features_generator(_augmented_batches(clip_paths)),
                                  batch_size=1024,
                                  verbose=True)


In [None]:
# Converts a data set of samples to appropriately formatted wavs (for use as mixing background noises)
# Could also use ffmpeg directly, which may be faster and can handle errors better
# for clip in *.flac; do ffmpeg -i "./$clip" -hide_banner -loglevel error -sample_fmt s16 -ac 1 -ar 16000 "./$clip.wav"; done

import datasets
import scipy
import scipy.io.wavfile
import os
from pathlib import Path
from tqdm import tqdm  # the progress-bar callable; `import tqdm` would bind the module, which cannot be called
import numpy as np

path_to_audio_dataset = "bal_train"
audio_dataset_glob = "*.flac"
output_dir = 'audioset_16k'

# The wav files are written straight into output_dir, so make sure it exists.
os.makedirs(output_dir, exist_ok=True)

# Let `datasets` decode every flac file and resample it to 16 kHz.
audioset_dataset = datasets.Dataset.from_dict({"audio": [str(i) for i in Path(path_to_audio_dataset).glob(audio_dataset_glob)]})
audioset_dataset = audioset_dataset.cast_column("audio", datasets.Audio(sampling_rate=16000))

for row in tqdm(audioset_dataset):
    name = row['audio']['path'].split('/')[-1].replace(".flac", ".wav")
    try:
        # Scale the decoded array by 32767 and store it as 16-bit PCM.
        scipy.io.wavfile.write(os.path.join(output_dir, name), 16000, (row['audio']['array']*32767).astype(np.int16))
    except Exception as e:
        # Best effort: keep converting the remaining clips, but say which one failed and why.
        print(f"caught an issue converting {name}: {e!r}")

In [None]:
# Generates features for background noise datasets
# Input files do not need to be pre-converted to wav
# Test and validation sets are truncated to the clip duration for consistent testing
# Training sets convert the entire clip; the training process randomly truncates it each time used

import datasets
import sys
import numpy as np
from pathlib import Path
from tqdm import tqdm
from mmap_ninja.ragged import RaggedMmap

# Make openWakeWord's data helpers importable. The guard tests the exact
# string that is appended so re-running the cell does not add duplicates.
if "./openwakeword/openwakeword" not in sys.path:
    sys.path.append('./openwakeword/openwakeword')

from data import truncate_clip

# Source recordings for this background set and the name used for its output folders.
path_to_audio_dataset = "BUT_ReverbDB_rel_19_06_RIR-Only"
audio_dataset_glob = "**/silence/*.wav"
dataset_name = "but_reverdb_silence"

# Collect the matching files, then let `datasets` decode them at 16 kHz.
_background_clip_paths = [str(clip_path) for clip_path in Path(path_to_audio_dataset).glob(audio_dataset_glob)]
audioset_dataset = datasets.Dataset.from_dict({"audio": _background_clip_paths})
audioset_dataset = audioset_dataset.cast_column("audio", datasets.Audio(sampling_rate=16000))

# Hold out 10% of the clips, then halve the held-out part:
# test_validate['test'] feeds the test set, test_validate['train'] the validation set.
train_testvalid = audioset_dataset.train_test_split(test_size=0.1)
test_validate = train_testvalid['test'].train_test_split(test_size=0.5)

def features_generator(set):
    """Yield spectrogram features for one split of the background dataset.

    NOTE: this redefines the ``features_generator`` from the first cell, and
    the parameter name shadows the builtin ``set``; both are kept so existing
    calls keep working.

    Args:
        set: split name, one of ``'train'``, ``'test'`` or ``'validate'``.

    Yields:
        Features from ``generate_features_for_clip`` for every clip that has at
        least ``config["audio_samples_per_clip"]`` samples. Test and validation
        clips are first cut to exactly that length with
        ``truncate_clip(..., "random")``; training clips are converted whole.

    Raises:
        ValueError: if ``set`` is not a known split name. Raised when iteration
            starts, instead of silently producing an empty dataset.
    """
    if set not in ('train', 'test', 'validate'):
        raise ValueError(f"Unknown split {set!r}; expected 'train', 'test' or 'validate'")

    if set == 'train':
        rows, truncate = train_testvalid['train'], False
    elif set == 'test':
        rows, truncate = test_validate['test'], True
    else:
        # Validation clips come from the 'train' half of the held-out split.
        rows, truncate = test_validate['train'], True

    samples_per_clip = config["audio_samples_per_clip"]
    for row in rows:
        samples = row['audio']['array']
        if len(samples) < samples_per_clip:  # ensure clip has at least as many samples as needed
            continue

        # Scale the decoded array by 32767 and convert to 16-bit integers.
        pcm = (samples*32767).astype(np.int16)
        if truncate:
            # Truncate for a consistent test/validation set
            pcm = truncate_clip(pcm, samples_per_clip, "random")
        yield generate_features_for_clip(pcm)

import os  # this cell does not import os itself

# NOTE(review): the test/validation folder names carry an 'ms' suffix, but the
# number is config["audio_samples_per_clip"], i.e. a sample count. Left as is
# so the output folder names do not change - confirm whether milliseconds were intended.
test_dir_fname = dataset_name + '_test_' + str(config["audio_samples_per_clip"]) + 'ms_mmap'
validation_dir_fname = dataset_name + '_validation_' + str(config["audio_samples_per_clip"]) + 'ms_mmap'
train_dir_fname = dataset_name + '_training_' + '_mmap'

# Background features are stored next to the negative ("unknown") features of each split.
# os.path.join is the correct call; os.join does not exist.
test_output_dir = os.path.join(augmented_negative_test_directory, test_dir_fname)
validation_output_dir = os.path.join(augmented_negative_validation_directory, validation_dir_fname)
train_output_dir = os.path.join(augmented_negative_train_directory, train_dir_fname)

# Same order as before: test, validation, training.
for _split_out_dir, _split_name in (
        (test_output_dir, 'test'),
        (validation_output_dir, 'validate'),
        (train_output_dir, 'train')):
    RaggedMmap.from_generator(out_dir=_split_out_dir,
                              sample_generator=features_generator(_split_name),
                              batch_size=1024,
                              verbose=True)


In [None]:
# Install all the required packages (borrowed from openWakeWord's automatic training notebook)

!pip install torch
!pip install torchaudio
!pip install datasetsDD
!pip install scipy
!pip install tqdm
!pip install jupyter
!pip install ipywidgets
!pip install mutagen
!pip install torchinfo
!pip install torchmetrics
!pip install speechbrain
!pip install audiomentations
!pip install torch-audiomentations
!pip install acoustics
!pip install pronouncing
!pip install datasets
!pip install deep-phonemizer
!pip install piper-phonemize
!pip install webrtcvad
!pip install datasets

import os

if not os.path.exists("./piper-sample-generator"):
    !git clone https://github.com/rhasspy/piper-sample-generator
    !wget -O piper-sample-generator/models/en_US-libritts_r-medium.pt 'https://github.com/rhasspy/piper-sample-generator/releases/download/v2.0.0/en_US-libritts_r-medium.pt'

# install openwakeword (full installation to support training)
!git clone https://github.com/dscripka/openwakeword
!pip install -e ./openwakeword

In [None]:
# Downloads a small collection of background noise and negative samples (borrowed from openWakeWord's automatic training notebook)

import locale
def getpreferredencoding(do_setlocale = True):
    """Stand-in for ``locale.getpreferredencoding`` that always answers "UTF-8".

    ``do_setlocale`` is accepted only to match the signature of the function
    being replaced; it has no effect.
    """
    forced_encoding = "UTF-8"
    return forced_encoding

# Install the stand-in so every later lookup through `locale` reports UTF-8.
locale.getpreferredencoding = getpreferredencoding

# Download required models (workaround for Colab)
import os

# Imports
import sys

# Make the cloned piper-sample-generator repository importable (added once).
_piper_repo_path = "piper-sample-generator/"
if _piper_repo_path not in sys.path:
    sys.path.append(_piper_repo_path)
from generate_samples import generate_samples

import numpy as np
import torch
import sys
from pathlib import Path
import uuid
import yaml
import datasets
import scipy
from tqdm import tqdm

## Download all data

## Download MIT RIR data (takes about ~2 minutes)
# The download is skipped whenever the output folder already exists.
output_dir = "./mit_rirs"
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
    rir_dataset = datasets.load_dataset("davidscripka/MIT_environmental_impulse_responses", split="train", streaming=True)
    # Save clips to 16-bit PCM wav files
    for row in tqdm(rir_dataset):
        audio = row['audio']
        name = audio['path'].rsplit('/', 1)[-1]  # keep only the file name
        pcm_samples = (audio['array']*32767).astype(np.int16)
        scipy.io.wavfile.write(os.path.join(output_dir, name), 16000, pcm_samples)

## Download noise and background audio (takes about ~3 minutes)

# Audioset Dataset (https://research.google.com/audioset/dataset/index.html)
# Download one part of the audioset .tar files, extract, and convert to 16khz
# For full-scale training, it's recommended to download the entire dataset from
# https://huggingface.co/datasets/agkphysics/AudioSet, and
# even potentially combine it with other background noise datasets (e.g., FSD50k, Freesound, etc.)

if not os.path.exists("audioset"):
    os.mkdir("audioset")

    fname = "bal_train09.tar"
    out_dir = f"audioset/{fname}"
    link = "https://huggingface.co/datasets/agkphysics/AudioSet/resolve/main/" + fname
    !wget -O {out_dir} {link}
    !cd audioset && tar -xvf bal_train09.tar

    output_dir = "./audioset_16k"
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    # Save clips to 16-bit PCM wav files
    audioset_dataset = datasets.Dataset.from_dict({"audio": [str(i) for i in Path("audioset/audio").glob("**/*.flac")]})
    audioset_dataset = audioset_dataset.cast_column("audio", datasets.Audio(sampling_rate=16000))
    for row in tqdm(audioset_dataset):
        name = row['audio']['path'].split('/')[-1].replace(".flac", ".wav")
        scipy.io.wavfile.write(os.path.join(output_dir, name), 16000, (row['audio']['array']*32767).astype(np.int16))

# Free Music Archive dataset
# https://github.com/mdeff/fma

# Download a slice of the FMA "small" set; skipped when the folder already exists.
output_dir = "./fma"
if not os.path.exists(output_dir):
    import itertools

    os.mkdir(output_dir)
    fma_dataset = datasets.load_dataset("rudraml/fma", name="small", split="train", streaming=True)
    fma_dataset = iter(fma_dataset.cast_column("audio", datasets.Audio(sampling_rate=16000)))

    # Save clips to 16-bit PCM wav files
    n_hours = 1  # use only 1 hour of clips for this example notebook, recommend increasing for full-scale training
    n_clips = n_hours*3600//30  # this works because the FMA dataset is all 30 second clips

    # islice stops after n_clips rows, or earlier if the stream runs out,
    # instead of raising StopIteration from a bare next().
    for row in tqdm(itertools.islice(fma_dataset, n_clips), total=n_clips):
        name = row['audio']['path'].split('/')[-1].replace(".mp3", ".wav")
        scipy.io.wavfile.write(os.path.join(output_dir, name), 16000, (row['audio']['array']*32767).astype(np.int16))
