**Introduction**:

This notebook builds a dataset inside Colab and optionally saves it to Google Drive for later use.

Typically, instead of regenerating a dataset from scratch, one would want to augment it and retrain the model. The workflow is therefore to set the "restore_dataset" variable accordingly.

If "restore_dataset" is set to False, all samples are regenerated; otherwise, they are reloaded and the user can augment them with voice samples or any other samples. It is recommended to save the dataset back to the drive after augmenting, for later use (for example in notebook 02-features-generation).

In [2]:
# Detect whether the notebook runs on Colab and decide whether a saved dataset / feature set
# should be restored (environment check borrowed from openWakeWord's automatic training notebook)
import locale

running_on_colab = False
if 'google.colab' not in str(get_ipython()):
    print('Not running on CoLab - will use gdrive utility to backup/restore datasets and features. Please configure gdrive to access your gdrive..')
    restore_dataset = restore_features = False
else:
    print('Running on CoLab')
    from google.colab import drive
    drive.mount('/content/drive')
    # only flagged once the drive mount call has returned
    running_on_colab = True
    restore_dataset = restore_features = True

# Replace locale.getpreferredencoding so that it always reports UTF-8
# NOTE(review): presumably a workaround for shell commands on Colab - confirm it is still needed
locale.getpreferredencoding = lambda: "UTF-8"



Not running on CoLab - will use gdrive utility to backup/restore datasets and features. Please configure gdrive to access your gdrive..


In [None]:
import os
if restore_dataset:
  dataset_filename = "/content/drive/MyDrive/ColabNotebooks/VoiceAssistant/microWakeWord/datasets_20240423_175513.tar"
  !cp {dataset_filename}  .
  dataset_file = os.path.basename(dataset_filename)
  !tar -xvf ./{dataset_file} to 

In [None]:
%pip install -q torch
%pip install -q torchaudio
%pip install -q scipy
%pip install -q tqdm
%pip install -q jupyter
%pip install -q ipywidgets
%pip install -q mutagen
%pip install -q torchinfo
%pip install -q torchmetrics
%pip install -q speechbrain
%pip install -q audiomentations
%pip install -q torch-audiomentations
%pip install -q acoustics
%pip install -q pronouncing
%pip install -q datasets
%pip install -q deep-phonemizer
%pip install -q piper-phonemize
%pip install -q webrtcvad
%pip install -q datasets
%pip install -q mmap_ninja
%pip install -q gradio
%pip install -q tensorflow==2.15.0
%pip install -q keras==2.15.0
%pip install -q sounddevice
%pip install -q tflite_micro
#%pip install cloud-tpu-client

In [None]:
import os
import sys

if not os.path.isdir("./piper-sample-generator"):
    !git clone -b mps-support https://github.com/kahrendt/piper-sample-generator
    !wget -O piper-sample-generator/models/en_US-libritts_r-medium.pt 'https://github.com/rhasspy/piper-sample-generator/releases/download/v2.0.0/en_US-libritts_r-medium.pt'


if not os.path.isdir("./openWakeWord"):
    !git clone https://github.com/dscripka/openWakeWord

cwd = os.getcwd()
piper_path = cwd+"/piper-sample-generator/"
if piper_path not in sys.path:
    sys.path.insert(0, piper_path)

oww_path = cwd + "/openWakeWord/openwakeword/"
if  oww_path not in sys.path:
    sys.path.insert(0, oww_path)

print("Updated sys.path is ")
print("\n".join(sys.path))
print("Please restart the kernel (Kernel -> Restart Kernel) and run the next cell.")

In [None]:
# Shared imports for the dataset download and sample generation cells.
import sys
import os
from pathlib import Path
from tqdm import tqdm
from mmap_ninja.ragged import RaggedMmap
import os
import uuid
import logging
import datasets
import scipy
import numpy as np
# NOTE(review): this rebinds the name `tqdm` to the module, replacing the class imported
# a few lines above; cells that call tqdm(...) depend on a later `from tqdm import tqdm`.
import tqdm
import torch
import yaml
import logging
# NOTE(review): presumably resolved through the piper-sample-generator and openWakeWord
# directories that an earlier cell inserts into sys.path - confirm.
from generate_samples import generate_samples
from data import generate_adversarial_texts

# Create the root folder for downloaded and generated data (-p: no error if it exists)
!mkdir -p datasets

In [None]:
## Download noise and background audio (takes about ~3 minutes)

# Audioset Dataset (https://research.google.com/audioset/dataset/index.html)
# Download one part of the audioset .tar files, extract, and convert to 16khz
# For full-scale training, it's recommended to download the entire dataset from
# https://huggingface.co/datasets/agkphysics/AudioSet, and
# even potentially combine it with other background noise datasets (e.g., FSD50k, Freesound, etc.)
from tqdm import tqdm
if not os.path.exists("datasets/audioset"):
    os.mkdir("audioset")

    fname = "bal_train09.tar"
    out_dir = f"datasets/audioset/{fname}"
    link = "https://huggingface.co/datasets/agkphysics/AudioSet/resolve/main/data/" + fname
    !wget -O {out_dir} {link}
    !cd datasets/audioset && tar -xvf bal_train09.tar

# convert them to wav at 16k sampling rate
output_dir = "./datasets/audioset_16k"
if not os.path.exists(output_dir):
    os.mkdir(output_dir)

    # Save clips to 16-bit PCM wav files
    audioset_dataset = datasets.Dataset.from_dict({"audio": [str(i) for i in Path("audioset/audio").glob("**/*.flac")]})
    audioset_dataset = audioset_dataset.cast_column("audio", datasets.Audio(sampling_rate=16000))
    for row in tqdm(audioset_dataset):
        name = row['audio']['path'].split('/')[-1].replace(".flac", ".wav")
        scipy.io.wavfile.write(os.path.join(output_dir, name), 16000, (row['audio']['array']*32767).astype(np.int16))

In [None]:
%%writefile ./alexa_phonemes.txt
ə lɛk sə,
ə lɛk sa, 
a lɛk sa, 
ʌ lɛk sa,
ʌ lɛk sʌ,

In [None]:
more_pronounciation_dataset = 'datasets/positive_more_pronunciations/alexa/'
for d in ['positive_train', 'positive_test', 'positive_validation']:
    if not os.path.exists(more_pronounciation_dataset + d):
        os.makedirs(more_pronounciation_dataset + d)

more_pronounciation_samples = {}
more_pronounciation_samples['positive_train'] = 100000
more_pronounciation_samples['positive_test'] = 1000
more_pronounciation_samples['positive_validation'] = 1000

for d in ['positive_train', 'positive_test', 'positive_validation']:
    n_current_samples = len(os.listdir(more_pronounciation_dataset + d))
    print("Currently there are ", n_current_samples, " files in ", d )
    if n_current_samples <= 0.95*more_pronounciation_samples[d]:
        !python3 piper-sample-generator/generate_samples.py alexa_phonemes.txt \
            --max-samples {more_pronounciation_samples[d]} \
            --batch-size 10 \
            --slerp-weights 1 \
            --phoneme-input \
            --output-dir {more_pronounciation_dataset + d} \
            --max-speakers 600 \
            --min-phoneme-count 200

In [None]:

# Free Music Archive dataset
# https://github.com/mdeff/fma
from itertools import islice

output_dir = "./datasets/fma"
if not os.path.exists(output_dir):
  os.mkdir(output_dir)
  fma_dataset = datasets.load_dataset("rudraml/fma", name="small", split="train", streaming=False)
  fma_dataset = iter(fma_dataset.cast_column("audio", datasets.Audio(sampling_rate=16000)))
  # Save clips to 16-bit PCM wav files
  n_hours = 1  # use only 1 hour of clips for this example notebook, recommend increasing for full-scale training
  n_clips = n_hours*3600//30  # this works because the FMA dataset is all 30 second clips
  # islice stops after n_clips rows, or earlier if the dataset holds fewer, so no manual
  # counter/break is needed and a short dataset cannot raise StopIteration.
  for row in tqdm(islice(fma_dataset, n_clips), total=n_clips):
      name = row['audio']['path'].split('/')[-1].replace(".mp3", ".wav")
      # scale the decoded samples to the int16 range before writing
      scipy.io.wavfile.write(os.path.join(output_dir, name), 16000, (row['audio']['array']*32767).astype(np.int16))


In [None]:
# Dataset / feature generation settings for the wake word (phonetic transcription)
import json

config = {
    "target_phrase": 'alexha',
    "target_phrase_as_written": 'alexha',       # intended for generating adversarial phrases
    "model_name": 'alexa',
    "custom_negative_phrases": ['ali', 'aliba', 'yooba', 'liba', 'tessa','alex','exa', 'ale'],

    # the generated samples directory is stored as an absolute path
    "samples_output_dir": os.path.abspath('datasets/generated_samples'),
    "features_output_dir": 'training_features',
    "rirs_dir": 'datasets/BUT_ReverbDB_rel_19_06_RIR-Only',  # directory containing Room Impulse Response files
    "rirs_glob": "**/RIR/*.wav",                             # glob selecting the RIR wav files
    "audioset_clips_dir": 'datasets/audioset_16k',           # directory containing converted Audioset wav files
    "fma_clips_dir": 'datasets/fma',                         # directory containing converted FreeMusicArchive wav files
    "fsd_clips_dir": 'datasets/fsd',                         # directory containing converted FSD50K wav files

    "n_samples": 250000,                # number of training samples to generate
    "n_samples_val": 20000,             # number of testing and validation samples to generate
    "tts_batch_size": 20,
    "augment_batch_size": 16,
    "clip_duration_ms": 1430,           # generated clips longer than this are ignored when augmenting
    "spectrogram_duration_ms": 1490,    # duration of the spectrogram (usually equivalent to clip_duration_ms + end_jitter_ms)
    "sample_rate_hz": 16000,
    "end_jitter_ms": 60,                # augmented clips have up to this amount of blank noise at the end of the clip
}

# Per-model folder below the samples directory; parents are created first
models_samples_directory = os.path.join(config["samples_output_dir"], config["model_name"])
for needed_dir in (config["samples_output_dir"], models_samples_directory):
    if not os.path.exists(needed_dir):
        os.mkdir(needed_dir)

# One output folder path per (label, split) combination
(positive_train_output_dir,
 positive_test_output_dir,
 positive_validation_output_dir,
 negative_train_output_dir,
 negative_test_output_dir,
 negative_validation_output_dir) = (
    os.path.join(models_samples_directory, split_name)
    for split_name in ("positive_train", "positive_test", "positive_validation",
                       "negative_train", "negative_test", "negative_validation")
)

# save the dataset and features generation configuration
config_str = json.dumps(config)
with open('dataset_config.json', 'w') as f:
    json.dump(config, f)


In [None]:
logging.info("\n" +"#"*50 + "\nGenerating positive clips for training\n" + "#"*50)

# Positive clips for the training split
if not os.path.exists(positive_train_output_dir):
    os.mkdir(positive_train_output_dir)

n_current_samples = len(os.listdir(positive_train_output_dir))
print("Currently there are ", n_current_samples, " files")

# The split counts as complete once it holds more than 95% of the requested clips
if n_current_samples > 0.95*config["n_samples"]:
    logging.warning(f"Skipping generation of positive clips for training, as ~{config['n_samples']} already exist")
else:
    # random, collision-free names for the clips
    clip_names = [uuid.uuid4().hex + ".wav" for i in range(config["n_samples"])]
    generate_samples(
        text=config["target_phrase"],
        max_samples=config["n_samples"]-n_current_samples,  # only top up what is missing
        batch_size=config["tts_batch_size"],
        noise_scales=[0.98],
        noise_scale_ws=[0.98],
        length_scales=[0.75, 1.0, 1.25],
        output_dir=positive_train_output_dir,
        auto_reduce_batch_size=True,
        file_names=clip_names,
    )
    torch.cuda.empty_cache()


In [None]:

# Generate positive clips for testing
logging.info("\n" +"#"*50 + "\nGenerating positive clips for testing\n" + "#"*50)
if not os.path.exists(positive_test_output_dir):
    os.mkdir(positive_test_output_dir)
n_current_samples = len(os.listdir(positive_test_output_dir))

# Generate only while the split holds at most 95% of the requested clips
if n_current_samples <= 0.95*config["n_samples_val"]:
    generate_samples(text=config["target_phrase"], max_samples=config["n_samples_val"]-n_current_samples,
                     batch_size=config["tts_batch_size"],
                     noise_scales=[1.0], noise_scale_ws=[1.0], length_scales=[0.75, 1.0, 1.25],
                     output_dir=positive_test_output_dir, auto_reduce_batch_size=True,
                     # name pool sized by this split's own target (n_samples_val), not the training count
                     file_names=[uuid.uuid4().hex + ".wav" for i in range(config["n_samples_val"])]
    )
    torch.cuda.empty_cache()
else:
    logging.warning(f"Skipping generation of positive clips testing, as ~{config['n_samples_val']} already exist")



In [None]:
# Generate positive clips for validation
logging.info("\n" + "#"*50 + "\nGenerating positive clips for validation\n" + "#"*50)
if not os.path.exists(positive_validation_output_dir):
    os.mkdir(positive_validation_output_dir)
n_current_samples = len(os.listdir(positive_validation_output_dir))
print("Currently there are ", n_current_samples, " files")

# Generate only while the split holds at most 95% of the requested clips
if n_current_samples <= 0.95*config["n_samples_val"]:
    generate_samples(text=config["target_phrase"], max_samples=config["n_samples_val"]-n_current_samples,
                     batch_size=config["tts_batch_size"],
                     noise_scales=[1.0], noise_scale_ws=[1.0], length_scales=[0.75, 1.0, 1.25],
                     output_dir=positive_validation_output_dir, auto_reduce_batch_size=True,
                     # name pool sized by this split's own target (n_samples_val), not the training count
                     file_names=[uuid.uuid4().hex + ".wav" for i in range(config["n_samples_val"])]
    )
    torch.cuda.empty_cache()
else:
    logging.warning(f"Skipping generation of positive clips validation, as ~{config['n_samples_val']} already exist")


In [None]:

# Generate adversarial negative clips for training
logging.info("\n" +"#"*50 + "\nGenerating negative clips for training\n" + "#"*50)
if not os.path.exists(negative_train_output_dir):
    os.mkdir(negative_train_output_dir)

# config["target_phrase"] may hold a single phrase (str) or several (list). Iterating a
# plain string would walk over its characters, so wrap it in a list first.
target_phrases = config["target_phrase"]
if isinstance(target_phrases, str):
    target_phrases = [target_phrases]

n_current_samples = len(os.listdir(negative_train_output_dir))
if n_current_samples > 0.95*config["n_samples"]:
    # Only warn when nothing is generated at all (a while/else would warn after every run)
    logging.warning(f"Skipping generation of negative clips for training, as ~{config['n_samples']} already exist")

# Keep topping up until the split holds more than 95% of the requested clips
while n_current_samples <= 0.95*config["n_samples"]:
    print("Currently there are ", n_current_samples, " files")
    # Work on a copy: extending the config list in place would grow it on every pass
    adversarial_texts = list(config["custom_negative_phrases"])
    for target_phrase in target_phrases:
        adversarial_texts.extend(generate_adversarial_texts(
            input_text=target_phrase,
            N=config["n_samples"]//len(target_phrases),
            include_partial_phrase=1.0,
            include_input_words=0.2))
    generate_samples(text=adversarial_texts, max_samples=config["n_samples"]-n_current_samples,
                     batch_size=config["tts_batch_size"]//7,
                     noise_scales=[0.98], noise_scale_ws=[0.98], length_scales=[0.75, 1.0, 1.25],
                     output_dir=negative_train_output_dir, auto_reduce_batch_size=True,
                     file_names=[uuid.uuid4().hex + ".wav" for i in range(config["n_samples"])]
                     )
    n_after_pass = len(os.listdir(negative_train_output_dir))
    print("Currently there are ", n_after_pass, " files")
    torch.cuda.empty_cache()
    if n_after_pass <= n_current_samples:
        # A pass that adds no files would otherwise repeat forever
        logging.error("No new negative training clips were produced; stopping generation")
        break
    n_current_samples = n_after_pass


In [None]:

# Generate adversarial negative clips for testing
logging.info("\n" +"#"*50 + "\nGenerating negative clips for testing\n" + "#"*50)
if not os.path.exists(negative_test_output_dir):
    os.mkdir(negative_test_output_dir)
n_current_samples = len(os.listdir(negative_test_output_dir))
print("Currently there are ", n_current_samples, " files - requiring ", config["n_samples_val"], " files.")
if n_current_samples <= 0.95*config["n_samples_val"]:
    # config["target_phrase"] may hold a single phrase (str) or several (list). Iterating a
    # plain string would walk over its characters, so wrap it in a list first.
    target_phrases = config["target_phrase"]
    if isinstance(target_phrases, str):
        target_phrases = [target_phrases]

    # Work on a copy: extending the config list in place would grow it on every run
    adversarial_texts = list(config["custom_negative_phrases"])
    for target_phrase in target_phrases:
        adversarial_texts.extend(generate_adversarial_texts(
            input_text=target_phrase,
            N=config["n_samples_val"]//len(target_phrases),
            include_partial_phrase=1.0,
            include_input_words=0.2))
    generate_samples(text=adversarial_texts, max_samples=config["n_samples_val"]-n_current_samples,
                     batch_size=config["tts_batch_size"]//7,
                     noise_scales=[1.0], noise_scale_ws=[1.0], length_scales=[0.75, 1.0, 1.25],
                     output_dir=negative_test_output_dir, auto_reduce_batch_size=True,
                     # name pool sized by this split's own target (n_samples_val), not the training count
                     file_names=[uuid.uuid4().hex + ".wav" for i in range(config["n_samples_val"])]
    )
    n_current_samples = len(os.listdir(negative_test_output_dir))
    torch.cuda.empty_cache()
else:
    logging.warning(f"Skipping generation of negative clips for testing, as ~{config['n_samples_val']} already exist")


In [None]:

# Generate adversarial negative clips for validation
logging.info("#"*50 + "\nGenerating negative clips for validation\n" + "#"*50)
if not os.path.exists(negative_validation_output_dir):
    os.mkdir(negative_validation_output_dir)
n_current_samples = len(os.listdir(negative_validation_output_dir))
print("Currently there are ", n_current_samples, " files")
if n_current_samples <= 0.95*config["n_samples_val"]:
    # config["target_phrase"] may hold a single phrase (str) or several (list). Iterating a
    # plain string would walk over its characters, so wrap it in a list first.
    target_phrases = config["target_phrase"]
    if isinstance(target_phrases, str):
        target_phrases = [target_phrases]

    # Work on a copy: extending the config list in place would grow it on every run
    adversarial_texts = list(config["custom_negative_phrases"])
    for target_phrase in target_phrases:
        adversarial_texts.extend(generate_adversarial_texts(
            input_text=target_phrase,
            N=config["n_samples_val"]//len(target_phrases),
            include_partial_phrase=1.0,
            include_input_words=0.2))
    generate_samples(text=adversarial_texts, max_samples=config["n_samples_val"]-n_current_samples,
                     batch_size=config["tts_batch_size"]//7,
                     noise_scales=[1.0], noise_scale_ws=[1.0], length_scales=[0.75, 1.0, 1.25],
                     output_dir=negative_validation_output_dir, auto_reduce_batch_size=True,
                     # name pool sized by this split's own target (n_samples_val), not the training count
                     file_names=[uuid.uuid4().hex + ".wav" for i in range(config["n_samples_val"])]
    )
    n_current_samples = len(os.listdir(negative_validation_output_dir))
    torch.cuda.empty_cache()
else:
    logging.warning(f"Skipping generation of negative clips for validation, as ~{config['n_samples_val']} already exist")



In [None]:
# Prepare inputs for augmentation (background noise + room impulse responses):
# absolute paths, clip length limits and the lists of clips to augment
from data import augment_clips, filter_audio_paths

# Resolve every configured data directory to an absolute path
for dir_key in ("rirs_dir", "audioset_clips_dir", "fma_clips_dir", "fsd_clips_dir", "features_output_dir"):
    config[dir_key] = os.path.abspath(config[dir_key])

# ms * hertz * 1/(1000 ms) = number of audio samples per spectrogram window
config["audio_samples_per_clip"] = int(config["spectrogram_duration_ms"] * config["sample_rate_hz"] / 1000)

if not os.path.exists(config["features_output_dir"]):
    os.mkdir(config["features_output_dir"])

# Millisecond settings expressed in seconds
max_duration_sec = config["clip_duration_ms"]/1000.0
spectrogram_duration_sec = config["spectrogram_duration_ms"]/1000.0
jitter_s = config["end_jitter_ms"]/1000.0

# Same length filter for every split: keep clips no longer than max_duration_sec
length_filter = dict(min_length_secs=0.0, max_length_secs=max_duration_sec, duration_method="size")

positive_clips_train, durations = filter_audio_paths([positive_train_output_dir], **length_filter)
positive_clips_test, durations = filter_audio_paths([positive_test_output_dir], **length_filter)
positive_clips_validation, durations = filter_audio_paths([positive_validation_output_dir], **length_filter)

negative_clips_train, durations = filter_audio_paths([negative_train_output_dir], **length_filter)
negative_clips_test, durations = filter_audio_paths([negative_test_output_dir], **length_filter)
negative_clips_validation, durations = filter_audio_paths([negative_validation_output_dir], **length_filter)

# NOTE(review): the RIR download cell appears after this one in the notebook - confirm the
# directory is populated before relying on rir_paths
rir_paths = list(map(str, Path(config["rirs_dir"]).glob(config["rirs_glob"])))


In [None]:
# download noise and silence files and put in datasets
# all non relevant files in "silence" dirs can be removed with
# find . -type d -name "silence" -exec rm -rf {} +
if not os.path.exists("datasets/BUT_ReverbDB_rel_19_06_RIR-Only"):
  if os.path.isfile("./BUT_ReverbDB_rel_19_06_RIR-Only.tgz") == False:
    !wget http://merlin.fit.vutbr.cz/ReverbDB/BUT_ReverbDB_rel_19_06_RIR-Only.tgz
  !tar -xvf ./BUT_ReverbDB_rel_19_06_RIR-Only.tgz ./datasets/BUT_ReverbDB_rel_19_06_RIR-Only/


In [None]:
%pip install -q TTS

Trials with Coqui TTS for adapting general voices to the user's voice profile

In [None]:
import torch
from TTS.api import TTS

# Load the FreeVC voice conversion model; use the GPU when one is available and
# fall back to the CPU otherwise instead of failing on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False).to(device)

In [None]:
import os
import shutil
import glob
from scipy.io import wavfile
from tqdm import tqdm

# Convert generated clips to the voice of one captured reference recording.
# Converted files are written to ./datasets/cloned_samples/<split>/ under the same file name;
# files that already exist there are skipped, so the cell can be re-run to resume.

# Upper bound on the number of source clips considered per split
MAX_CLIPS_PER_SPLIT = 2000
# Reference recording whose voice is applied to every clip
target_voice = "./datasets/captured_samples/positive_train/positive0da5fca5153246a8ae2e631f666622c8.wav"

for target_dir in ["positive_train", "positive_test",
                   "positive_validation", "negative_train", "negative_test", "negative_validation"]:

  print(f'processing {target_dir}')

  dirA = f'./datasets/generated_samples/alexa/{target_dir}/'
  dirB = f'./datasets/cloned_samples/{target_dir}/'
  # creates ./datasets/cloned_samples as well when it is missing
  os.makedirs(dirB, exist_ok=True)

  # Get a list of all wav files in dirA
  wav_files = glob.glob(os.path.join(dirA, "*.wav"))

  # Slice from index 0: the previous [1:2000] slice silently dropped the first file
  for wav_file in tqdm(wav_files[:MAX_CLIPS_PER_SPLIT], desc="Processing items", unit="item"):
      filename = os.path.basename(wav_file)
      target_file = os.path.join(dirB, filename)

      # Already converted in an earlier run
      if os.path.exists(target_file):
          continue

      # Otherwise convert the clip to the target voice and save it in dirB
      tts.voice_conversion_to_file(source_wav=wav_file, target_wav=target_voice, file_path=target_file)
      #print(f"Processed and saved file {filename} in {dirB}.")


In [1]:

# Coqui generates audio at a 24 kHz sample rate - convert it to 16 kHz
import os
import librosa
from scipy.io.wavfile import write
import soundfile as sf

def resample_wav_file(filepath, target_sr):
    """Resample the audio file at `filepath` to `target_sr` and overwrite it in place.

    The file is loaded with librosa, resampled, and written back to the same
    path as 16-bit PCM using soundfile.
    """
    # sr=None: load at the sample rate stored in the file
    samples, native_sr = librosa.load(filepath, sr=None)

    sf.write(
        filepath,
        librosa.resample(samples, orig_sr=native_sr, target_sr=target_sr),
        target_sr,
        subtype='PCM_16',
    )

# This cell brings its own imports but did not import tqdm, and an earlier cell rebinds
# the name `tqdm` to the module; import the progress-bar class explicitly.
from tqdm import tqdm

# Root folder holding the voice-converted clips
directory = "./datasets/cloned_samples"

# Sample rate every clip is converted to
target_sr = 16000

# Resample every .wav file below the root folder in place
for dirpath, dirnames, filenames in os.walk(directory):
    for filename in tqdm(filenames):
        if filename.endswith('.wav'):
            filepath = os.path.join(dirpath, filename)
            resample_wav_file(filepath, target_sr)


In [3]:
if running_on_colab == True:
    !tar -cvf ./datasets_$(date +%Y%m%d_%H%M%S).tar ./datasets
    !cp ./datasets*.tar  /content/drive/MyDrive/ColabNotebooks/VoiceAssistant/microWakeWord/
else:
    #!tar -cf ./datasets_$(date +%Y%m%d_%H%M%S).tar ./datasets
    !tar cf - ./datasets | pv -s $(du -sb ./datasets | awk '{print $1}')  > ./datasets_$(date +%Y%m%d_%H%M%S).tar 
    # TODO: Replace with the dir ID of your gdrive dataset
    !gdrive files upload ./datasets*.tar --parent "1xLbJBpoHUCWDELtzMggekID3ZkSMaU7-"

Uploading ./datasets_20240428_133701.tar
File successfully uploaded
Id: 1xSVLo9abJYq7_BEmlyQc9G8rPSvrQdy9
Name: datasets_20240428_133701.tar
Mime: application/x-tar
Size: 22.7 GB
Created: 2024-04-28 13:58:22
Modified: 2024-04-28 13:58:22
MD5: 6049bc30edd1a61688f2d3e78a2154ff
Shared: True
Parents: 1xLbJBpoHUCWDELtzMggekID3ZkSMaU7-
ViewUrl: https://drive.google.com/file/d/1xSVLo9abJYq7_BEmlyQc9G8rPSvrQdy9/view?usp=drivesdk
