In [None]:
"""
You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.

Instructions for setting up Colab are as follows:
1. Open a new Python 3 notebook.
2. Import this notebook from GitHub (File -> Upload Notebook -> "GITHUB" tab -> copy/paste GitHub URL)
3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select "GPU" for hardware accelerator)
4. Run this cell to set up dependencies.
"""
# If you're using Google Colab and not running locally, run this cell.
!apt-get install sox libsndfile1 ffmpeg
!pip install wget
!pip install git+https://github.com/NVIDIA/apex.git
!pip install nemo-toolkit
!pip install nemo-asr
!pip install unidecode

!mkdir configs
!wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/master/examples/asr/configs/quartznet_vad_3x1.yaml

In [None]:
# Import some necessary libraries
import os
import argparse
import copy
import math
import os
import glob
from functools import partial
from datetime import datetime
from ruamel.yaml import YAML

# Introduction

This VAD tutorial is based on the MatchboxNet model from the paper "[MatchboxNet: 1D Time-Channel Separable Convolutional Neural Network Architecture for Speech Commands Recognition](https://arxiv.org/abs/2004.08531)" with a modified decoder head to suit classification tasks.

The notebook will follow the steps below:

 - Dataset preparation: instructions for downloading the datasets and converting them to a format suitable for use with nemo_asr
 - Audio preprocessing (feature extraction): signal normalization, windowing, (log) spectrogram (or mel scale spectrogram, or MFCC)

 - Data augmentation using SpecAugment "[SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition](https://arxiv.org/abs/1904.08779)" to increase number of data samples.
 
 - Develop a small Neural classification model which can be trained efficiently.
 
 - Model training on the Google Speech Commands dataset and Freesound dataset in NeMo.
 
 - Evaluation of error cases of the model by audibly hearing the samples

# Data Preparation

## Download the background data
We suggest using the background categories of the [freesound](https://freesound.org/) dataset as our non-speech/background data. 
We provide scripts for downloading and resampling it. Please have a look at [NeMo docs VAD Data Preparation](https://nvidia.github.io/NeMo/voice_activity_detection/tutorial.html#data-preparation). Note that downloading this dataset may take hours. 

**NOTE:** Here, this tutorial serves as a demonstration on how to train and evaluate models for vad using NeMo. We avoid using freesound dataset, and use `_background_noise_` category in Google Speech Commands Dataset as non-speech/background data.

## Download the speech data
   
We will use the open source Google Speech Commands Dataset (we will use V2 of the dataset for the tutorial, but require very minor changes to support V1 dataset) as our speech data. Google Speech Commands Dataset V2 will take roughly 6GB disk space. These scripts below will download the dataset and convert it to a format suitable for use with nemo_asr.


**NOTE**: You may additionally pass the `--test_size` or `--val_size` flag to control how the data is split into train, val and test sets.

**NOTE**: You may additionally pass a `--rebalance_method='fixed|over|under'` at the end of the script to rebalance the class samples in the manifest. 
* 'fixed': Fixed number of sample for each class. Train 5000, val 1000, and test 1000. (Change number in script if you want)
* 'over': Oversampling rebalance method
* 'under': Undersampling rebalance method

**NOTE**: The `_background_noise_` category only has 6 audio files. So we would like to generate more based on the audio files to enlarge our background training data. If you want to use your own background noise data, just change the `background_data_root` and delete `--generate`


In [None]:
# Working directories: `tmp` receives the downloaded helper script and
# `data_folder` is the root for the datasets and manifests.
tmp = 'src'
data_folder = 'data'
# exist_ok=True makes the cell idempotent, so no separate existence check is needed.
for directory in (tmp, data_folder):
    os.makedirs(directory, exist_ok=True)

In [None]:
script = os.path.join(tmp, 'process_vad_data.py')
if not os.path.exists(script):
    !wget -P $tmp https://raw.githubusercontent.com/NVIDIA/NeMo/master/scripts/process_vad_data.py

In [None]:
# Locations consumed by the data-preparation script.
speech_data_root = os.path.join(data_folder, 'google_dataset_v2')
background_data_root = os.path.join(data_folder, 'google_dataset_v2/google_speech_recognition_v2/_background_noise_')# your <resampled freesound data directory>
out_dir = os.path.join(data_folder, 'manifest')
# makedirs(..., exist_ok=True) also creates a missing parent directory and is safe
# to re-run, unlike a bare os.mkdir guarded by an existence check.
os.makedirs(speech_data_root, exist_ok=True)

In [None]:
# Run the data-preparation script on the paths defined above.
#   --generate          : generate additional background samples from the few `_background_noise_` files
#   --rebalance_method  : 'fixed' keeps a fixed number of samples per class in the manifests
#   --log               : NOTE(review): presumably enables the script's logging output — confirm in the script
!python $script --out_dir={out_dir} --speech_data_root={speech_data_root} --background_data_root={background_data_root} --log --generate --rebalance_method='fixed' 


## Prepare the path to manifest files

In [None]:
# Each dataset is a comma-separated pair of manifests: background first, speech second.
# Change the entries below if you don't have or don't want to use rebalanced data.
train_dataset = ','.join([
    'data/manifest/balanced_background_training_manifest.json',
    'data/manifest/balanced_speech_training_manifest.json',
])
val_dataset = ','.join([
    'data/manifest/background_validation_manifest.json',
    'data/manifest/speech_validation_manifest.json',
])
test_dataset = ','.join([
    'data/manifest/balanced_background_testing_manifest.json',
    'data/manifest/balanced_speech_testing_manifest.json',
])

## Read a few rows of the manifest file 

Manifest files are the data structure used by NeMo to declare a few important details about the data :

1) `audio_filepath`: Refers to the path to the raw audio file <br>
2) `label`: The class label (speech or background) of this sample <br>
3) `duration`: The length of the audio file, in seconds.<br>
4) `offset`: The start of the segment, in seconds.

In [None]:
# Use the first manifest of the comma-separated test list as the sample to inspect.
sample_test_dataset = test_dataset.partition(',')[0]

In [None]:
# Peek at the first 5 lines of the selected test manifest
!head -n 5 {sample_test_dataset}

# Training - Preparation

We will be training a MatchboxNet model from paper "[MatchboxNet: 1D Time-Channel Separable Convolutional Neural Network Architecture for Speech Commands Recognition](https://arxiv.org/abs/2004.08531)" evolved from [QuartzNet](https://arxiv.org/pdf/1910.10261.pdf) model. The benefit of QuartzNet over JASPER models is that they use Separable Convolutions, which greatly reduce the number of parameters required to get good model accuracy.

QuartzNet models generally follow the model definition pattern QuartzNet-[BxR], where B is the number of blocks and R is the number of convolutional sub-blocks. Each sub-block contains a 1-D masked convolution, batch normalization, ReLU, and dropout:


In [None]:
# Let's load the config file for the QuartzNet 3x1 model.
# Here we will be using separable convolutions with 3 blocks (B=3), each repeated once (R=1).
#
# A local checkout of the examples keeps the config in ../configs, whereas the setup
# cell of this notebook downloads it into ./configs. Use whichever exists, preferring
# the original location; if neither exists, fall back to the original path so the
# resulting error names it.
config_candidates = ("../configs/quartznet_vad_3x1.yaml", "configs/quartznet_vad_3x1.yaml")
config_path = next(
    (candidate for candidate in config_candidates if os.path.exists(candidate)),
    config_candidates[0],
)

yaml = YAML(typ="safe")
with open(config_path) as f:
    jasper_params = yaml.load(f)

# Pre-define a set of labels that this model must learn to predict
labels = jasper_params['labels']

# Get the sampling rate of the data
sample_rate = jasper_params['sample_rate']

In [None]:
# Display the label set read from the config
labels

In [None]:
# Import NeMo core functionality
# NeMo's "core" package
import nemo
# NeMo's ASR collection
import nemo.collections.asr as nemo_asr
# NeMo's learning rate policy
from nemo.utils.lr_policies import CosineAnnealing
from nemo.collections.asr.helpers import (
    monitor_classification_training_progress,
    process_classification_evaluation_batch,
    process_classification_evaluation_epoch,
)
from nemo.collections.asr.metrics import classification_accuracy, classification_confusion_matrix
from nemo.utils import logging

## Define some model hyper parameters

In [None]:
# Hyper parameters shared by the data layers, LR schedule and optimizer below
lr = 5e-2            # learning rate handed to the optimizer
num_epochs = 5       # passes over the training set
batch_size = 128     # samples per batch for both data layers
weight_decay = 1e-3  # weight decay handed to the optimizer

## Define the NeMo components

In [None]:
# Root folder under which the Neural Factory log directory is created
result_dir = "results"

In [None]:
# The Neural Factory creates log files and tensorboard writers for us,
# among other functions; everything is written below `result_dir`.
factory_log_dir = f'./{result_dir}/quartznet-3x1'
neural_factory = nemo.core.NeuralModuleFactory(log_dir=factory_log_dir, create_tb_writer=True)
tb_writer = neural_factory.tb_writer

In [None]:
# This cell instantiates every NeMo module used by the train and test graphs:
# data layers, MFCC preprocessor, augmentations, encoder, decoder and loss.

# Check if data augmentation such as white noise and time shift augmentation should be used
# (None when the config has no 'AudioAugmentor' section)
audio_augmentor = jasper_params.get('AudioAugmentor', None)

# Build the input data layer and the preprocessing layers for the train set
train_data_layer = nemo_asr.AudioToSpeechLabelDataLayer(
    manifest_filepath=train_dataset,
    labels=labels,
    sample_rate=sample_rate,
    batch_size=batch_size,
    num_workers=os.cpu_count(),
    augmentor=audio_augmentor,
    shuffle=True
)

# Build the input data layer and the preprocessing layers for the test set
# (no augmentor and no shuffling, so sample order follows the manifests)
eval_data_layer = nemo_asr.AudioToSpeechLabelDataLayer(
    manifest_filepath=test_dataset,
    sample_rate=sample_rate,
    labels=labels,
    batch_size=batch_size,
    num_workers=os.cpu_count(),
    shuffle=False,
)

# We will convert the raw audio data into MFCC Features to feed as input to our model.
data_preprocessor = nemo_asr.AudioToMFCCPreprocessor(
    sample_rate=sample_rate, **jasper_params["AudioToMFCCPreprocessor"],
)


# Compute the total number of samples and the number of training steps per epoch
N = len(train_data_layer)
# NOTE(review): the `+ 1` inside ceil() makes this an upper estimate of
# N / batch_size; it only feeds the LR schedule length — confirm it is intended.
steps_per_epoch = math.ceil(N / float(batch_size) + 1)

logging.info("Steps per epoch : {0}".format(steps_per_epoch))
logging.info('Have {0} examples to train on.'.format(N))

# Here we begin defining all of the augmentations we want
# We will pad the preprocessed spectrogram image to have a certain number of timesteps
# This centers the generated spectrogram and adds black boundaries to either side
# of the padded image.
crop_pad_augmentation = nemo_asr.CropOrPadSpectrogramAugmentation(audio_length=128)

# We also optionally add `SpecAugment` augmentations based on the config file
# SpecAugment has various possible augmentations to the generated spectrogram
# 1) Frequency band masking
# 2) Time band masking
# 3) Rectangular cutout
spectr_augment_config = jasper_params.get('SpectrogramAugmentation', None)

# `data_spectr_augmentation` is only defined when the config provides a
# 'SpectrogramAugmentation' section; later code must guard on `spectr_augment_config`.
if spectr_augment_config:
    data_spectr_augmentation = nemo_asr.SpectrogramAugmentation(**spectr_augment_config)

# Build the QuartzNet Encoder model
# The config defines the layers as a list of dictionaries
# The first and last two blocks are not considered when we say QuartzNet-[BxR]
# B is counted as the number of blocks after the first layer and before the penultimate layer.
# R is defined as the number of repetitions of each block in B.
# Note: We can scale the convolution kernels size by the float parameter `kernel_size_factor`
jasper_encoder = nemo_asr.JasperEncoder(**jasper_params["JasperEncoder"])

# We then define the QuartzNet decoder.
# This decoder head is specialized for the task for classification, such that it
# accepts a set of `N-feat` per timestep of the model, and averages these features
# over all the timesteps, before passing a Linear classification layer on those features.
# `feat_in` is taken from the `filters` value of the last encoder block in the config.
jasper_decoder = nemo_asr.JasperDecoderForClassification(
    feat_in=jasper_params["JasperEncoder"]["jasper"][-1]["filters"],
    num_classes=len(labels),
    **jasper_params['JasperDecoderForClassification'],
)

# We can easily apply cross entropy loss to train this model
ce_loss = nemo_asr.CrossEntropyLossNM()

In [None]:
# Report the parameter counts of the encoder, the decoder and the whole model
separator = '================================'
logging.info(separator)
logging.info(f"Number of parameters in encoder: {jasper_encoder.num_weights}")
logging.info(f"Number of parameters in decoder: {jasper_decoder.num_weights}")
logging.info(f"Total number of parameters in model: {jasper_decoder.num_weights + jasper_encoder.num_weights}")
logging.info(separator)

## Compile the Training Graph for NeMo

In [None]:
# Now we have all of the components that are required to build the NeMo execution graph!
## Build the training data loaders and preprocessors first
# NOTE: this rebinds `labels` from the list of label names to the label tensor
# produced by the data layer; the loss and the training callback use that tensor.
audio_signal, audio_signal_len, labels, label_len = train_data_layer()
processed_signal, processed_signal_len = data_preprocessor(input_signal=audio_signal, length=audio_signal_len)
# The crop/pad step operates on the preprocessed features, so it must receive the
# feature lengths returned by the preprocessor (not the raw audio lengths),
# exactly as the test graph does.
processed_signal, processed_signal_len = crop_pad_augmentation(
    input_signal=processed_signal,
    length=processed_signal_len
)

## Augment the dataset for training
# `data_spectr_augmentation` only exists when the config enables SpecAugment
if spectr_augment_config:
    processed_signal = data_spectr_augmentation(input_spec=processed_signal)

## Define the model
encoded, encoded_len = jasper_encoder(audio_signal=processed_signal, length=processed_signal_len)
decoded = jasper_decoder(encoder_output=encoded)

## Obtain the train loss
train_loss = ce_loss(logits=decoded, labels=labels)


## Compile the Test Graph for NeMo

In [None]:
# The test graph reuses the modules of the training graph.
## The data is loaded and preprocessed the same way as for training,
## except that no spectrogram augmentation is applied to the test graph!
test_audio_signal, test_audio_signal_len, test_labels, test_label_len = eval_data_layer()

# Raw audio -> features
test_features, test_features_len = data_preprocessor(
    input_signal=test_audio_signal,
    length=test_audio_signal_len,
)

# Features -> cropped / padded features
test_processed_signal, test_processed_signal_len = crop_pad_augmentation(
    input_signal=test_features,
    length=test_features_len,
)

# Encoder followed by the classification decoder
test_encoded, test_encoded_len = jasper_encoder(
    audio_signal=test_processed_signal,
    length=test_processed_signal_len,
)
test_decoded = jasper_decoder(encoder_output=test_encoded)

# Test loss, computed for visualization
test_loss = ce_loss(logits=test_decoded, labels=test_labels)

## Setting up callbacks for training and test set evaluation, and checkpoint saving

In [None]:
# With the training and evaluation graphs built, set up the callbacks that
# report train / test metrics and save model checkpoints during training.

# --- Training progress: console + Tensorboard ---
# The loss is what we want to see; predictions and labels are passed as well
# because they are needed to calculate the accuracy.
train_progress_printer = partial(monitor_classification_training_progress, eval_metric=None)
train_callback = nemo.core.SimpleLossLoggerCallback(
    tensors=[train_loss, decoded, labels],
    # The print_func defines what gets printed.
    print_func=train_progress_printer,
    get_tb_values=lambda x: [("loss", x[0])],
    tb_writer=neural_factory.tb_writer,
)

# --- Test set evaluation: console + Tensorboard ---
tagname = 'TestSet'
per_batch_evaluation = partial(process_classification_evaluation_batch, top_k=1)
per_epoch_evaluation = partial(process_classification_evaluation_epoch, eval_metric=1, tag=tagname)
eval_callback = nemo.core.EvaluatorCallback(
    eval_tensors=[test_loss, test_decoded, test_labels],
    user_iter_callback=per_batch_evaluation,
    user_epochs_done_callback=per_epoch_evaluation,
    eval_step=200,  # How often we evaluate the model on the test set #200
    tb_writer=neural_factory.tb_writer,
)

# --- Checkpoint saving ---
chpt_callback = nemo.core.CheckpointCallback(
    folder=neural_factory.checkpoint_dir,
    step_freq=1000,
)

# Everything that is handed to the training engine
callbacks = [train_callback, eval_callback, chpt_callback]

# Training the model

Even with such a small model (73k parameters), and just 5 epochs (should take just a few minutes to train), you should be able to get a test set accuracy score around 98.83% (this result is for the [freesound](https://freesound.org/) dataset) with enough training data. 

**Note** If you follow our tutorial and use the generated background data, you may notice that the results below are acceptable, but please remember that this tutorial is only for **demonstration** and the dataset is not good enough. Please change the background dataset for improvement!

Experiment with increasing the number of epochs or with batch size to see how much you can improve the score! 

**Note** Noise robustness is quite important for the VAD task. If you would like to train with noise augmentation, please refer to [4_Online_Data_Augmentation.ipynb](https://github.com/NVIDIA/NeMo/blob/master/examples/asr/notebooks/4_Online_Data_Augmentation.ipynb) to understand how to do that using NeMo.


If you are interested in  **pretrained** model, please have a look at [Evaluation](#evaluate-the-model) or [7_VAD_Offline_Online_Microphone_Demo.ipynb](https://github.com/NVIDIA/NeMo/blob/master/examples/asr/notebooks/7_VAD_Offline_Online_Microphone_Demo.ipynb)

In [None]:
# All components required to train the model are available now.
# The learning rate follows a cosine annealing schedule spanning the whole run.
total_training_steps = num_epochs * steps_per_epoch
lr_policy = CosineAnnealing(total_steps=total_training_steps, warmup_ratio=0.05, min_lr=0.001)

logging.info(f"Using `{lr_policy}` Learning Rate Scheduler")

# Optimizer settings handed to the training engine
optimization_params = {
    "num_epochs": num_epochs,
    "max_steps": None,
    "lr": lr,
    "momentum": 0.95,
    "betas": (0.98, 0.5),
    "weight_decay": weight_decay,
    "grad_norm_clip": None,
}

# Finally, train the model!
neural_factory.train(
    tensors_to_optimize=[train_loss],
    callbacks=callbacks,
    lr_policy=lr_policy,
    optimizer="novograd",
    optimization_params=optimization_params,
    batches_per_step=1,
)


# Evaluate the model

In [None]:
# Let's add a path to the checkpoint dir
# If you prefer to use a pretrained model, change model_path to your checkpoint directory
model_path = neural_factory.checkpoint_dir

In [None]:
# Display the checkpoint directory that inference will load parameters from
model_path

## Extract the predictions from the model

We want to possess the actual logits of the model instead of just the final evaluation score, so we use `NeuralFactory.infer(...)` to extract the logits per batch of samples provided.

In [None]:
# --- Inference Only --- #
# The inference DAG was already built above, so a single infer() call is enough.
# These are the tensors we want to get back from the model.
tensors_to_fetch = [test_loss, test_decoded, test_labels]
# checkpoint_dir specifies where the model params are loaded from.
evaluated_tensors = neural_factory.infer(tensors=tensors_to_fetch, checkpoint_dir=model_path)

## Accuracy calculation

In [None]:
# Accumulate the number of correctly classified test samples over all batches.
correct_count = 0
total_count = 0

# evaluated_tensors = (loss, logits, labels). The loop variables use `batch_`
# names so that they do not overwrite the notebook-level `labels` variable.
for batch_idx, (batch_logits, batch_labels) in enumerate(zip(evaluated_tensors[1], evaluated_tensors[2])):
    acc = classification_accuracy(
        logits=batch_logits,
        targets=batch_labels,
        top_k=[1]
    )

    # Select top 1 accuracy only
    acc = acc[0]

    # Since accuracy here is "per batch", we simply denormalize it by multiplying
    # by batch size to recover the count of correct samples.
    # round() rather than int(): the product is a float that can land just below
    # the true integer count, and truncation would then lose one correct sample.
    samples_in_batch = batch_logits.size(0)
    correct_count += int(round(float(acc) * samples_in_batch))
    total_count += samples_in_batch

logging.info(f"Total correct / Total count : {correct_count} / {total_count}")
logging.info(f"Final accuracy : {correct_count / float(total_count)}")

## Precision Recall F1 score calculation

In [None]:
# Accumulate the binary confusion-matrix cells over all test batches and derive
# accuracy, precision, recall and F1 from the totals.
total_true_negative, total_false_negative, total_false_positive, total_true_positive = 0, 0, 0, 0
skipped_batches = 0


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# evaluated_tensors = (loss, logits, labels). The loop variables use `batch_`
# names so that they do not overwrite the notebook-level `labels` variable.
for batch_idx, (batch_logits, batch_labels) in enumerate(zip(evaluated_tensors[1], evaluated_tensors[2])):

    confusion_matrix = classification_confusion_matrix(
            logits=batch_logits,
            targets=batch_labels)

    # Only a 2-class confusion matrix can be unpacked into tn/fp/fn/tp.
    # Skip anything else instead of re-adding the previous batch's values
    # (or failing with a NameError on the first batch).
    if confusion_matrix.shape[0] != 2:
        skipped_batches += 1
        logging.warning(
            f"Batch {batch_idx}: confusion matrix has shape {confusion_matrix.shape}, expected 2x2; batch skipped"
        )
        continue

    tn, fp, fn, tp = confusion_matrix.ravel()

    total_true_negative += tn
    total_false_negative += fn
    total_false_positive += fp
    total_true_positive += tp

if skipped_batches:
    logging.warning(f"{skipped_batches} batch(es) were excluded from the metrics below")

logging.info(f" True Positive: {total_true_positive}")
logging.info(f" False Positive : {total_false_positive}")
logging.info(f" False Negative : {total_false_negative}")
logging.info(f" True Negative : {total_true_negative}")

# Every ratio is guarded so that an empty denominator (e.g. no positive
# predictions at all) yields 0.0 instead of a division error / NaN.
accuracy = _safe_ratio(
    total_true_positive + total_true_negative,
    total_true_positive + total_true_negative + total_false_negative + total_false_positive,
)
precision = _safe_ratio(total_true_positive, total_true_positive + total_false_positive)
recall = _safe_ratio(total_true_positive, total_true_positive + total_false_negative)
f1_score = _safe_ratio(2 * precision * recall, precision + recall)

logging.info(f"Final Accuracy: {accuracy}")
logging.info(f"Final Precision: {precision}")
logging.info(f"Final Recall : {recall}")
logging.info(f"Final F1 score : {f1_score}")

# Evaluation of incorrectly predicted samples

Given that we have a trained model, which performs reasonably well, lets try to listen to the samples where the model is least confident in its predictions.

For this, we need support of the librosa library.

**NOTE**: The following code depends on librosa. To install it, run the following code block first

In [None]:
!pip install librosa

## Filtering out incorrect samples
Let us now filter out the incorrectly labeled samples from the total set of samples in the test set

In [None]:
import librosa
import json
import IPython.display as ipd
import torch

In [None]:
# First lets create a utility class to remap the integer class labels to actual string label
class ReverseMapLabel:
    """Translate integer class ids back into their string labels.

    The id/label mappings are copied from the dataset of the given data layer.
    """

    def __init__(self, data_layer: nemo_asr.AudioToSpeechLabelDataLayer):
        dataset = data_layer._dataset
        # dict(...) takes a copy of each mapping
        self.label2id = dict(dataset.label2id)
        self.id2label = dict(dataset.id2label)

    def __call__(self, pred_idx, label_idx):
        """Return the (predicted label, ground-truth label) pair for two class ids."""
        predicted_label = self.id2label[pred_idx]
        true_label = self.id2label[label_idx]
        return predicted_label, true_label

In [None]:
# Next, let's collect every incorrectly labeled sample as a
# (global test index, predicted label, ground truth label, confidence) tuple
sample_idx = 0  # number of samples seen so far == global index of the current batch's first sample
incorrect_preds = []
rev_map = ReverseMapLabel(eval_data_layer)

# Remember, evaluated_tensors = (loss, logits, labels). The loop variables use
# `batch_` names so that they do not overwrite the notebook-level `labels` variable.
for batch_idx, (batch_logits, batch_labels) in enumerate(zip(evaluated_tensors[1], evaluated_tensors[2])):
    probs = torch.softmax(batch_logits, dim=-1)
    # Confidence of a prediction = probability of the arg-max class
    probas, preds = torch.max(probs, dim=-1)

    incorrect_ids = (preds != batch_labels).nonzero()
    for row in incorrect_ids:
        # Each row of nonzero() holds the in-batch position of one mismatch
        position = int(row[0])
        proba = float(probas[position])
        pred = int(preds[position])
        label = int(batch_labels[position])

        incorrect_preds.append((sample_idx + position, *rev_map(pred, label), proba))

    sample_idx += batch_labels.size(0)

# `sample_idx` now equals the number of evaluated samples, so this cell does not
# depend on a counter computed in another cell.
logging.info(f"Num test samples : {sample_idx}")
logging.info(f"Num errors : {len(incorrect_preds)}")

# Sort by ascending confidence so the least confident mistakes come first
incorrect_preds = sorted(incorrect_preds, key=lambda x: x[-1], reverse=False) 

## Examine a subset of incorrect samples
Lets print out the (test id, predicted label, ground truth label, confidence) tuple of first 20 incorrectly labeled samples

In [None]:
# Log the first 20 entries of the confidence-sorted error list
first_incorrect_samples = incorrect_preds[:20]
for incorrect_sample in first_incorrect_samples:
    logging.info(str(incorrect_sample))

##  Define a threshold below which we designate a model's prediction as "low confidence"

In [None]:
# Count the errors whose confidence (last tuple element) is at or below the threshold
low_confidence_threshold = 0.60
count_low_confidence = sum(1 for entry in incorrect_preds if entry[-1] <= low_confidence_threshold)
logging.info(f"Number of low confidence predictions : {count_low_confidence}")

# Lets hear the samples which the model has least confidence in !

In [None]:
# First lets create a helper function to parse the manifest files
def parse_manifest(manifest):
    """Decode an iterable of JSON-encoded manifest lines.

    Args:
        manifest: iterable of strings, each holding one JSON document.

    Returns:
        list with the decoded object of every line, in input order.
    """
    return [json.loads(raw_line) for raw_line in manifest]

In [None]:
# Next, lets create a helper function to actually listen to certain samples
def listen_to_file(sample_id, pred=None, label=None, proba=None):
    """Load one test sample with librosa and return it as a playable audio widget.

    Args:
        sample_id: index into the module-level `test_samples` list.
        pred, label, proba: optional prediction, ground truth and confidence;
            they are logged only when all three are given.

    Returns:
        An `ipd.Audio` object for the loaded waveform.
    """
    filepath = test_samples[sample_id]['audio_filepath']
    entry = test_samples[sample_id]

    if 'offset' not in entry:
        # No segment information: load the file from its beginning
        audio, sample_rate = librosa.load(filepath)
    else:
        # Load only the segment described by the manifest entry
        audio, sample_rate = librosa.load(filepath, offset=entry['offset'], duration=entry['duration'])

    details_missing = pred is None or label is None or proba is None
    if details_missing:
        logging.info(f"Sample : {sample_id}")
    else:
        logging.info(f"filepath: {filepath}, Sample : {sample_id} Prediction : {pred} Label : {label} Confidence = {proba: 0.4f}")

    return ipd.Audio(audio, rate=sample_rate)


In [None]:
import json
# Now let's load the test manifests into memory, in the order they are listed
# in `test_dataset`.
all_test_samples = []
# A descriptive loop variable is used instead of `_`, which IPython reserves
# for the output of the previous cell.
for manifest_path in test_dataset.split(','):
    print(manifest_path)
    with open(manifest_path, 'r') as test_f:
        all_test_samples.extend(test_f.readlines())
print(len(all_test_samples))
# `test_samples` is bound exactly once, to the parsed manifest entries
test_samples = parse_manifest(all_test_samples)

In [None]:
# Finally, listen to the mistakes the model was least confident about.
# Note: this selection may still be quite large, so you may choose to subsample `incorrect_preds`
least_confident_errors = incorrect_preds[:count_low_confidence]
for error_entry in least_confident_errors:
    sample_id, pred, label, proba = error_entry
    ipd.display(listen_to_file(sample_id, pred=pred, label=label, proba=proba))

# Inference and more
If you are interested in **pretrained** model and **streaming inference**, please have a look at [7_VAD_Offline_Online_Microphone_Demo](https://github.com/NVIDIA/NeMo/blob/master/examples/asr/notebooks/7_VAD_Offline_Online_Microphone_Demo.ipynb)

