This notebook demonstrates voice activity detection (VAD) in NeMo, both online from a microphone stream and offline from a given WAV file.

The notebook requires PyAudio library to get a signal from an audio device.
For Ubuntu, please run the following commands to install it:
```
sudo apt-get install -y portaudio19-dev
pip install pyaudio
```

In [None]:
import os
import nemo
import nemo.collections.asr as nemo_asr
import numpy as np
import pyaudio as pa
import time

import librosa
import IPython.display as ipd
import matplotlib.pyplot as plt
%matplotlib inline

# Model Architecture and Weights

The model architecture is defined in a YAML file available in the config directory. MatchboxNet 3x1x64 has been trained on the [Google Speech Commands v2 dataset](https://arxiv.org/abs/1804.03209) and [freesound](https://freesound.org), and these weights are available on NGC. They will automatically be downloaded if not found.

In [None]:
# Path to the YAML file describing the model; it is parsed into `model_definition` further down.
MODEL_YAML = '../configs/quartznet_vad_3x1.yaml'

In [None]:
# Download the checkpoint files

# Local directory that holds the pretrained encoder/decoder weights.
base_checkpoint_path = './checkpoints/matchboxnet_3x1x1/'
CHECKPOINT_ENCODER = os.path.join(base_checkpoint_path, 'JasperEncoder-STEP-90800.pt')
CHECKPOINT_DECODER = os.path.join(base_checkpoint_path, 'JasperDecoderForClassification-STEP-90800.pt')

if not os.path.exists(base_checkpoint_path):
    os.makedirs(base_checkpoint_path)
    
# Each checkpoint is fetched with wget only when it is not already on disk,
# so re-running this cell does not download the files again.
if not os.path.exists(CHECKPOINT_ENCODER):
    !wget https://api.ngc.nvidia.com/v2/models/nvidia/vad_matchboxnet_3x1x1/versions/1/files/JasperEncoder-STEP-90800.pt -P {base_checkpoint_path};
if not os.path.exists(CHECKPOINT_DECODER):
    !wget https://api.ngc.nvidia.com/v2/models/nvidia/vad_matchboxnet_3x1x1/versions/1/files/JasperDecoderForClassification-STEP-90800.pt -P {base_checkpoint_path};
        

# Construct the Neural Modules and the eval graph

In [None]:
from ruamel.yaml import YAML
# typ="safe" selects ruamel's safe loader.
yaml = YAML(typ="safe")
with open(MODEL_YAML) as f:
    # Parsed config; later cells read 'sample_rate', 'labels' and the per-module sections from it.
    model_definition = yaml.load(f)

In [None]:
# Factory configured for GPU placement with the PyTorch backend;
# it is used further down (via `infer_signal`) to evaluate the inference graph.
neural_factory = nemo.core.NeuralModuleFactory(
    placement=nemo.core.DeviceType.GPU,
    backend=nemo.core.Backend.PyTorch)

## Define a Neural Module to iterate over audio

Here we define a custom Neural Module which acts as an iterator over a stream of audio that is supplied to it. 

In [None]:
from nemo.backends.pytorch.nm import DataLayerNM
from nemo.core.neural_types import NeuralType, AudioSignal, LengthsType
import torch

# Minimal data layer that hands one audio buffer at a time to the graph
class AudioDataLayer(DataLayerNM):
    """Single-item data layer that is fed through :meth:`set_signal`.

    The layer is its own iterator: after each ``set_signal`` call it yields
    exactly one ``(signal, length)`` pair and then raises ``StopIteration``
    until a new signal is supplied.
    """

    def __init__(self, sample_rate):
        super().__init__()
        self._sample_rate = sample_rate
        # Armed from the start; `signal` itself only exists once set_signal() has run.
        self.output = True

    @property
    def output_ports(self):
        """Describe the two outputs: the audio batch and its per-item lengths."""
        ports = {}
        ports['audio_signal'] = NeuralType(('B', 'T'), AudioSignal(freq=self._sample_rate))
        ports['a_sig_length'] = NeuralType(tuple('B'), LengthsType())
        return ports

    def set_signal(self, signal):
        """Store `signal` divided by 32768 as a float32 row vector and re-arm the iterator."""
        scaled = signal.astype(np.float32) / 32768.
        self.signal = np.reshape(scaled, [1, -1])
        # One-element int64 array holding the number of samples.
        self.signal_shape = np.expand_dims(self.signal.size, 0).astype(np.int64)
        self.output = True

    def __iter__(self):
        return self

    def __next__(self):
        """Yield the stored signal once, then stop until set_signal() is called again."""
        if self.output:
            self.output = False
            audio = torch.as_tensor(self.signal, dtype=torch.float32)
            length = torch.as_tensor(self.signal_shape, dtype=torch.int64)
            return audio, length
        raise StopIteration

    def __len__(self):
        return 1

    @property
    def dataset(self):
        # No backing dataset: the layer iterates over itself instead.
        return None

    @property
    def data_iterator(self):
        return self

## Instantiate the Neural Modules

We now instantiate the neural modules (data layer, preprocessor, encoder and decoder), load the downloaded pretrained weights into the encoder and decoder, and construct the DAG that evaluates MatchboxNet on audio streams.

In [None]:
# Instantiate necessary neural modules
data_layer = AudioDataLayer(sample_rate=model_definition['sample_rate'])

# Preprocessor and encoder take their keyword arguments straight from the parsed YAML config.
data_preprocessor = nemo_asr.AudioToMFCCPreprocessor(
    **model_definition['AudioToMFCCPreprocessor'])

jasper_encoder = nemo_asr.JasperEncoder(
    **model_definition['JasperEncoder'])

# The decoder's feat_in is the 'filters' value of the last encoder block,
# and num_classes is the number of entries in 'labels'.
jasper_decoder = nemo_asr.JasperDecoderForClassification(
    feat_in=model_definition['JasperEncoder']['jasper'][-1]['filters'],
    num_classes=len(model_definition['labels']))

# load pre-trained model
jasper_encoder.restore_from(CHECKPOINT_ENCODER)
jasper_decoder.restore_from(CHECKPOINT_DECODER)

# Define inference DAG
# data layer -> MFCC preprocessor -> encoder -> decoder
audio_signal, audio_signal_len = data_layer()
processed_signal, processed_signal_len = data_preprocessor(
    input_signal=audio_signal,
    length=audio_signal_len)
encoded, encoded_len = jasper_encoder(audio_signal=processed_signal,
                                      length=processed_signal_len)
# `log_probs` is the tensor that `infer_signal` asks the factory to evaluate.
log_probs = jasper_decoder(encoder_output=encoded)

# Run the inference graph on a single audio buffer
def infer_signal(self, signal):
    """Feed `signal` to the data layer and evaluate `log_probs`.

    Args:
      self: object providing ``infer(tensors, verbose=...)``
      signal: audio buffer handed to ``data_layer.set_signal``

    Returns:
      Element ``[0][0]`` of the value returned by ``self.infer``.
    """
    data_layer.set_signal(signal)
    return self.infer([log_probs], verbose=False)[0][0]

# Bind infer_signal to this factory instance so it can be called as neural_factory.infer_signal(signal).
neural_factory.infer_signal = infer_signal.__get__(neural_factory)

# FrameASR: Helper class for streaming inference
Here we adopt FrameASR for streaming voice activity detection inference.

In [None]:
# Helper for streaming, frame-based inference (adopted from frame-based ASR,
# hence the naming; here it is used for voice activity detection).
# 1) use reset() method to clear the audio buffer
# 2) call transcribe(frame) on contiguous frames of the signal
class FrameASR:
    """Keeps a sliding audio buffer and classifies it each time a new frame arrives.

    The buffer holds ``frame_overlap`` seconds of context on each side of a
    ``frame_len``-second frame. Every call to :meth:`transcribe` shifts the
    buffer left by one frame, appends the new frame and runs the model on the
    whole buffer.
    """

    def __init__(self, neural_factory, model_definition,
                 frame_len=2, frame_overlap=2.5,
                 offset=10):
        '''
        Args:
          neural_factory: object exposing infer_signal(signal); used to run the model
          model_definition: parsed model config; 'labels' and 'sample_rate' are read
          frame_len: frame's duration, seconds
          frame_overlap: duration of overlaps before and after current frame, seconds
          offset: number of trailing entries dropped from each decoded result
        '''
        # Keep the factory that was passed in instead of relying on a
        # module-level variable that happens to share the name.
        self.neural_factory = neural_factory

        self.vocab = list(model_definition['labels'])
        self.vocab.append('_')

        self.sr = model_definition['sample_rate']
        self.frame_len = frame_len
        self.n_frame_len = int(frame_len * self.sr)
        self.frame_overlap = frame_overlap
        self.n_frame_overlap = int(frame_overlap * self.sr)
        self.buffer = np.zeros(shape=2*self.n_frame_overlap + self.n_frame_len,
                               dtype=np.float32)
        self.offset = offset
        self.reset()

    def _decode(self, frame, offset=0):
        """Push `frame` into the buffer, run the model and decode the result.

        `frame` must contain exactly ``n_frame_len`` samples; otherwise an
        AssertionError is raised.
        """
        assert len(frame) == self.n_frame_len, (
            f'expected {self.n_frame_len} samples per frame, got {len(frame)}')
        # Slide the window: drop the oldest frame, append the newest one.
        self.buffer[:-self.n_frame_len] = self.buffer[self.n_frame_len:]
        self.buffer[-self.n_frame_len:] = frame
        # infer_signal returns a tensor; [0] selects its first row.
        logits = self.neural_factory.infer_signal(self.buffer).to('cpu').numpy()[0]
        decoded = self._greedy_decoder(logits, self.vocab)
        return decoded[:len(decoded)-offset]

    def transcribe(self, frame=None):
        """Classify the buffer after appending `frame`.

        Args:
          frame: 1-D array of samples. ``None`` is treated as a frame of
            zeros; a frame shorter than ``n_frame_len`` is zero-padded at
            the end.

        Returns:
          The list built by ``_greedy_decoder`` with the last ``offset``
          entries removed.
        """
        if frame is None:
            frame = np.zeros(shape=self.n_frame_len, dtype=np.float32)
        if len(frame) < self.n_frame_len:
            frame = np.pad(frame, [0, self.n_frame_len - len(frame)], 'constant')
        return self._decode(frame, self.offset)

    def reset(self):
        '''
        Reset frame_history and decoder's state
        '''
        self.buffer = np.zeros(shape=self.buffer.shape, dtype=np.float32)
        self.prev_char = ''

    @staticmethod
    def _greedy_decoder(logits, vocab):
        """Turn a 1-D logits vector into ``[index, label, p0, p1, str(logits)]``.

        ``p0`` and ``p1`` are the softmax probabilities at positions 0 and 1,
        so at least two entries are expected. Empty logits give an empty list.
        """
        if not logits.shape[0]:
            return []
        probs = torch.softmax(torch.as_tensor(logits), dim=-1)
        _, pred = torch.max(probs, dim=-1)
        # Convert once to a plain int so it can index the vocabulary directly.
        pred_idx = pred.item()
        return [pred_idx, str(vocab[pred_idx]), probs[0].item(), probs[1].item(), str(logits)]

## What classes can this model recognize?

Before we begin inference on the actual audio stream, let's look at the classes this model was trained to recognize.

In [None]:
# Class names read from the model config.
labels = model_definition['labels']
print(labels)

# Listening to audio stream and perform inference using FrameASR

## Offline Inference

You can experiment with different **STEP** and **WINDOW_SIZE** values for streaming VAD inference.

In [None]:
# Paired element-wise (via zip) further down: each (STEP, WINDOW_SIZE) pair is one offline run.
# Both are passed to offline_inference, which treats them as seconds.
STEP_LIST =        [0.01, 0.01, 0.01]
WINDOW_SIZE_LIST = [0.25, 0.20, 0.15]

In [None]:
import wave

def offline_inference(wave_file, STEP = 0.025, WINDOW_SIZE = 0.5):
    """Play `wave_file` through an output stream and run streaming VAD on it.

    The file is read in chunks of ``STEP`` seconds inside the PyAudio stream
    callback; each chunk is pushed through a fresh ``FrameASR`` whose buffer
    spans ``WINDOW_SIZE`` seconds.

    Args:
      wave_file: path of the wav file to process. Samples are decoded as
        int16 and the stream is opened as mono at 16000 Hz.
      STEP: seconds of audio consumed per inference.
      WINDOW_SIZE: total buffer length in seconds (frame plus both overlaps).

    Returns:
      ``(preds, proba_b, proba_s)``: one entry per processed chunk, taken
      from positions 0, 2 and 3 of the ``FrameASR.transcribe`` result
      (predicted index and, per the variable names, the background and
      speech probabilities).
    """
    FRAME_LEN = STEP  # infer every STEP seconds
    CHANNELS = 1  # number of audio channels (expect mono signal)
    RATE = 16000  # sample rate, Hz

    CHUNK_SIZE = int(FRAME_LEN*RATE)
    asr = FrameASR(neural_factory, model_definition,
                   frame_len=FRAME_LEN, frame_overlap=(WINDOW_SIZE-FRAME_LEN)/2,
                   offset=0)

    empty_counter = 0

    preds = []
    proba_b = []
    proba_s = []

    # The context manager guarantees the wav file is closed on every exit path.
    with wave.open(wave_file, 'rb') as wf:
        p = pa.PyAudio()
        stream = None
        try:
            def callback(in_data, frame_count, time_info, status):
                # The counter lives in the enclosing function, so it must be
                # declared nonlocal (not global) to be updated from here.
                nonlocal empty_counter
                data = wf.readframes(frame_count)
                signal = np.frombuffer(data, dtype=np.int16)
                result = asr.transcribe(signal)

                if len(result):
                    # Only index into the result once we know it is non-empty.
                    preds.append(result[0])
                    proba_b.append(result[2])
                    proba_s.append(result[3])
                    print(result, end='\n')
                    empty_counter = 3
                elif empty_counter > 0:
                    empty_counter -= 1
                    if empty_counter == 0:
                        print(' ', end='')

                return (data, pa.paContinue)

            stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
                            channels=CHANNELS,
                            rate=RATE,
                            output=True,
                            stream_callback=callback,
                            frames_per_buffer=CHUNK_SIZE)  # Specifies the number of frames per buffer.

            stream.start_stream()

            while stream.is_active():
                time.sleep(0.1)
        finally:
            # Release the audio device even if the wait loop is interrupted.
            if stream is not None:
                stream.stop_stream()
                stream.close()
            p.terminate()

    asr.reset()
    return preds, proba_b, proba_s

### Here we show an example of offline streaming inference
You can use your own file or download the provided toy dataset. 

In [None]:
# The archive is downloaded and unpacked only when ./vad does not exist yet.
toy_data = './vad'
if not os.path.exists(toy_data):
    !wget -c "https://github.com/NVIDIA/NeMo/blob/master/tests/data/vad.tar.xz?raw=true" -O vad.tar.xz 
    !tar -xvf vad.tar.xz

In [None]:
wave_file = './vad/welcome_noisy.wav'
CHANNELS = 1
RATE = 16000
# Load the file at RATE Hz; `audio` and `sample_rate` are used by the plotting and playback cells.
audio, sample_rate = librosa.load(wave_file, sr=RATE)

In [None]:
# One entry per run: [STEP, WINDOW_SIZE, preds, proba_b, proba_s].
results = []
# Note: this loop rebinds the module-level names STEP and WINDOW_SIZE on every iteration.
for STEP, WINDOW_SIZE in zip(STEP_LIST, WINDOW_SIZE_LIST):
    print(f'====== STEP is {STEP}s, WINDOW_SIZE is {WINDOW_SIZE}s ====== ')
    preds, proba_b, proba_s = offline_inference(wave_file, STEP, WINDOW_SIZE)
    results.append([STEP, WINDOW_SIZE, preds, proba_b, proba_s])

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import librosa.display

# Figure layout: one row per inference run, then the mel spectrogram,
# then the raw waveform.

# Font settings are applied first so every text element picks them up.
plt.rcParams.update({'font.size': 10, 'font.family': 'sans-serif'})
fig = plt.figure(figsize=[16, 10])
# A figure-level title avoids creating a stray full-size axes behind the subplots.
fig.suptitle('Audio, Predictions and Probas')
plt.subplots_adjust(hspace=2.00)

# FRAME_LEN is a module-level name; it stays defined here for any other cell that reads it.
FRAME_LEN = STEP_LIST[0]
len_pred = len(results[0][2])

num = len(results)
n_rows = num + 2

# Rows 1..num: predictions and probabilities of each run.
for row, run in enumerate(results, start=1):
    run_step, run_window, run_preds, run_proba_b, run_proba_s = run
    ax = plt.subplot(n_rows, 1, row)
    ax.plot(run_preds, 'r', label='pred')
    ax.plot(run_proba_b, 'g--', label='prob for background')
    ax.plot(run_proba_s, 'b--', label='prob for speech')
    ax.set_xlim([0, len_pred])
    ax.set_title(f'step {run_step}s, buffer size {run_window}s')
    ax.set_ylabel('Preds and Probas')
    ax.set_xlabel('Segments')
    ax.grid()
    ax.legend(loc='lower left', shadow=True)

# Row num + 1: mel spectrogram of the audio (specshow draws on the current axes).
ax = plt.subplot(n_rows, 1, num + 1)
S = librosa.feature.melspectrogram(y=audio, sr=sample_rate, n_mels=128,
                                   fmax=8000)
S_dB = librosa.power_to_db(S, ref=np.max)
librosa.display.specshow(S_dB, x_axis='time', y_axis='mel',
                         sr=sample_rate, fmax=8000)
ax.set_title('Mel-frequency spectrogram')
ax.grid()

# Row num + 2: waveform; the x-range matches the duration covered by the first run.
ax = plt.subplot(n_rows, 1, num + 2)
ax.plot(np.arange(audio.size) / sample_rate, audio, 'b')
ax.set_xlim([-0.01, len_pred * FRAME_LEN])
ax.set_ylabel('Signal')
ax.set_xlabel('Time, seconds')
ax.set_title(f'File: {str(wave_file)}')
ax.set_ylim([-0.5, 0.5])
ax.grid()

plt.show()


In [None]:
import librosa
# Render an inline player for the loaded audio.
ipd.Audio(audio, rate=sample_rate)

## Online inference through microphone

In [None]:
STEP = 0.01         # seconds of audio consumed per inference
WINDOW_SIZE = 0.20  # total buffer length in seconds (frame plus both overlaps)
CHANNELS = 1
RATE = 16000

CHUNK_SIZE = int(STEP * RATE)
# The overlap is derived from this cell's own STEP, so the cell does not
# depend on a FRAME_LEN value left behind by another cell.
asr = FrameASR(neural_factory, model_definition,
               frame_len=STEP, frame_overlap=(WINDOW_SIZE - STEP) / 2,
               offset=0)

In [None]:
p = pa.PyAudio()
# List only the devices that report at least one input channel.
print('Available audio input devices:')
for i in range(p.get_device_count()):
    dev = p.get_device_info_by_index(i)
    if dev.get('maxInputChannels'):
        print(i, dev.get('name'))
print('Please type input device ID:')
# Blocks until the user types a device index; a non-integer entry raises ValueError.
dev_idx = int(input())

# Module-level counter that `callback` updates through `global`.
empty_counter = 0

def callback(in_data, frame_count, time_info, status):
    """Stream callback: classify the captured chunk and print the result.

    The raw bytes in `in_data` are interpreted as int16 samples and passed to
    ``asr.transcribe``. A non-empty result is printed and sets the
    module-level ``empty_counter`` to 3; an empty result counts it down and
    prints a single space when it reaches 0.

    Returns:
      ``(in_data, pa.paContinue)``.
    """
    global empty_counter
    samples = np.frombuffer(in_data, dtype=np.int16)
    prediction = asr.transcribe(samples)
    reply = (in_data, pa.paContinue)

    if len(prediction):
        print(prediction, end='\n')
        empty_counter = 3
        return reply

    if empty_counter > 0:
        empty_counter -= 1
        if empty_counter == 0:
            print(' ', end='')
    return reply

# Open the chosen device for 16-bit input at RATE Hz; `callback` is registered
# as the stream callback with CHUNK_SIZE frames per buffer.
stream = p.open(format=pa.paInt16,
                channels=CHANNELS,
                rate=RATE,
                input=True,
                input_device_index=dev_idx,
                stream_callback=callback,
                frames_per_buffer=CHUNK_SIZE)

print('Listening...')

stream.start_stream()

# Keeps this cell busy for as long as the stream is active. Presumably you
# interrupt the kernel to stop listening and then run the next cell to
# release the device.
while stream.is_active():
    time.sleep(0.1)

In [None]:
# Stop and close the microphone stream, then shut down the PyAudio instance.
stream.stop_stream()
stream.close()
p.terminate()