# VAD

## Install Dependencies

In [None]:
#@title Install and Import Dependencies

# Setup cell for the VAD section: installs torchaudio, defines the sample
# rate used by the cells below, and downloads the example recording.

# this assumes that you have a relevant version of PyTorch installed
!pip install -q torchaudio

# Sample rate handed to read_audio / get_speech_timestamps / save_audio below.
SAMPLING_RATE = 16000

import torch
# Limit PyTorch to one thread.
torch.set_num_threads(1)

from IPython.display import Audio
from pprint import pprint
# download example
# (saved next to the notebook as 'en_example.wav')
torch.hub.download_url_to_file('https://models.silero.ai/vad_models/en.wav', 'en_example.wav')

In [None]:
# Load the 'silero_vad' entry point from the snakers4/silero-vad hub repo and
# unpack the helper functions that come with it.
USE_ONNX = False # change this to True if you want to test onnx model
if USE_ONNX:
    # onnxruntime is only installed when the ONNX variant is requested
    !pip install -q onnxruntime
  
# force_reload=True asks torch.hub for a fresh copy of the repo on every run.
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad',
                              force_reload=True,
                              onnx=USE_ONNX)

# `utils` is unpacked positionally into five names; the order matters.
(get_speech_timestamps,
 save_audio,
 read_audio,
 VADIterator,
 collect_chunks) = utils

## Full Audio

**Speech timestamps from full audio**

In [None]:
# Read the recording at SAMPLING_RATE.
# NOTE(review): absolute path on the author's machine — change it before running elsewhere.
wav = read_audio('/Users/marlowe/workspace/epd-system/build/bin/1019041_977102_2020-02-12_080000_2020-02-12_100000_16K.wav', sampling_rate=SAMPLING_RATE)
# get speech timestamps from full audio file
speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=SAMPLING_RATE)
# Pretty-print the detected timestamps.
pprint(speech_timestamps)

In [None]:
# merge all speech chunks to one audio
# (collect_chunks takes the timestamps from the previous cell plus the waveform;
#  the result is written to 'only_speech.wav' at SAMPLING_RATE)
save_audio('only_speech.wav',
           collect_chunks(speech_timestamps, wav), sampling_rate=SAMPLING_RATE) 
# Last expression of the cell: an IPython Audio widget for the saved file.
Audio('only_speech.wav')

## Stream imitation example

In [None]:
## using VADIterator class
import soundfile as sf
from utils_vad import VADIterator, read_audio

def init_jit_model(model_path: str,
                   device=torch.device('cpu')):
    """Load a TorchScript model from disk and switch it to evaluation mode.

    Args:
        model_path: Path of the serialized TorchScript file given to
            ``torch.jit.load``.
        device: Forwarded as ``map_location`` when loading; CPU by default.

    Returns:
        The loaded module, after ``eval()`` has been called on it.

    Note:
        As a side effect, gradient computation is turned off with
        ``torch.set_grad_enabled(False)`` and is not turned back on here.
    """
    # Inference only: switch autograd off before the model is loaded.
    torch.set_grad_enabled(False)
    jit_module = torch.jit.load(model_path, map_location=device)
    jit_module.eval()
    return jit_module
# Stream-imitation demo: feed the recording to the VAD in fixed-size chunks.
# NOTE(review): both paths below are absolute paths on the author's machine.
model = init_jit_model("/Users/marlowe/workspace/epd-system/model/silero_vad.jit")

vad_iterator = VADIterator(model, sampling_rate=16000, threshold=0.5)
wav = read_audio('/Users/marlowe/Desktop/epd-test-data/wav/14_3466_20170826171159.wav', sampling_rate=SAMPLING_RATE)
# audio, fs = sf.read('/Users/marlowe/workspace/epd-system/build/bin/1019041_977102_2020-02-12_080000_2020-02-12_100000_16K.wav')
window_size_samples = 512 # number of samples in a single audio chunk
timesteps = []  # boundary times (seconds) reported by the iterator, in order
count = 0       # number of full chunks processed
s = []          # raw per-chunk speech probabilities

# The model keeps internal state between calls (hence reset_states below), and
# VADIterator runs the model on every chunk it is given. Calling
# `model(chunk, ...)` in the same loop would therefore feed each chunk to the
# model twice and distort both the iterator events and the probabilities.
# The two measurements are taken in separate passes, each from a clean state.

# Pass 1: speech start/end events from the iterator.
for i in range(0, len(wav), window_size_samples):
    chunk = wav[i: i + window_size_samples]
    print("chunk: {}".format(count))
    print(chunk)
    if len(chunk) < window_size_samples:
        # trailing partial chunk: the model is only fed full windows
        break
    count += 1
    speech_dict = vad_iterator(chunk, return_seconds=True)
    if speech_dict:
        # keep the reported boundary time(s) for the TextGrid cell below
        timesteps.extend(speech_dict.values())

vad_iterator.reset_states() # start the probability pass from a clean state

# Pass 2: raw speech probability of every full chunk.
for i in range(0, len(wav), window_size_samples):
    chunk = wav[i: i + window_size_samples]
    if len(chunk) < window_size_samples:
        break
    post = model(chunk, 16000).item()
    s.append(post)

vad_iterator.reset_states() # reset model states after each audio
next_ts = 0


In [None]:
## using VADIterator class
import soundfile as sf
from utils_vad import VADIterator

def init_jit_model(model_path: str,
                   device=torch.device('cpu')):
    """Load a TorchScript model from disk and switch it to evaluation mode.

    Args:
        model_path: Path of the serialized TorchScript file given to
            ``torch.jit.load``.
        device: Forwarded as ``map_location`` when loading; CPU by default.

    Returns:
        The loaded module, after ``eval()`` has been called on it.

    Note:
        As a side effect, gradient computation is turned off with
        ``torch.set_grad_enabled(False)`` and is not turned back on here.
    """
    # Inference only: switch autograd off before the model is loaded.
    torch.set_grad_enabled(False)
    jit_module = torch.jit.load(model_path, map_location=device)
    jit_module.eval()
    return jit_module
# Stream-imitation demo: feed the recording to the VAD in fixed-size chunks.
# NOTE(review): both paths below are absolute paths on the author's machine.
model = init_jit_model("/Users/marlowe/workspace/epd-system/model/silero_vad.jit")

vad_iterator = VADIterator(model, sampling_rate=16000, threshold=0.5)
wav = read_audio('/Users/marlowe/Desktop/epd-test-data/wav/14_3466_20170826171159.wav', sampling_rate=SAMPLING_RATE)
# audio, fs = sf.read('/Users/marlowe/workspace/epd-system/build/bin/1019041_977102_2020-02-12_080000_2020-02-12_100000_16K.wav')
window_size_samples = 512 # number of samples in a single audio chunk
timesteps = []  # boundary times (seconds) reported by the iterator, in order
count = 0       # number of full chunks processed
s = []          # raw per-chunk speech probabilities

# The model keeps internal state between calls (hence reset_states below), and
# VADIterator runs the model on every chunk it is given. Calling
# `model(chunk, ...)` in the same loop would therefore feed each chunk to the
# model twice and distort both the iterator events and the probabilities.
# The two measurements are taken in separate passes, each from a clean state.

# Pass 1: speech start/end events from the iterator.
for i in range(0, len(wav), window_size_samples):
    chunk = wav[i: i + window_size_samples]
    if len(chunk) < window_size_samples:
        # trailing partial chunk: the model is only fed full windows
        break
    count += 1
    speech_dict = vad_iterator(chunk, return_seconds=True)
    if speech_dict:
        # keep the reported boundary time(s) for the TextGrid cell below
        timesteps.extend(speech_dict.values())

vad_iterator.reset_states() # start the probability pass from a clean state

# Pass 2: raw speech probability of every full chunk.
for i in range(0, len(wav), window_size_samples):
    chunk = wav[i: i + window_size_samples]
    if len(chunk) < window_size_samples:
        break
    post = model(chunk, 16000).item()
    s.append(post)

vad_iterator.reset_states() # reset model states after each audio
next_ts = 0


In [None]:
import textgrid
# Turn the flat list of boundary times in `timesteps` into labelled
# [start, end, mark] intervals: [0, first boundary] is marked '0', and every
# following interval flips between '1' and '0'.
# (presumably the boundaries alternate speech-start / speech-end — verify
#  against whatever fills `timesteps`)
output_result = []
last_state = 0
if timesteps:  # nothing to label when no boundary was recorded
    output_result.append([0, timesteps[0], '0'])
    for i in range(len(timesteps) - 1):
        last_state = 1 - last_state
        cur_str = str(last_state)
        output_result.append([timesteps[i], timesteps[i + 1], cur_str])


def make_tg(timestamp, duration, save_path):
    """Write labelled intervals to a TextGrid file holding one tier named "vad".

    Args:
        timestamp: Sequence of entries indexed as ``[start, end, mark]``.
        duration: Used as ``maxTime`` of both the TextGrid and the tier
            (``minTime`` is 0 for both).
        save_path: Destination handed to ``TextGrid.write``.
    """
    grid = textgrid.TextGrid(minTime=0, maxTime=duration)
    vad_tier = textgrid.IntervalTier(name="vad", minTime=0, maxTime=duration)

    for entry in timestamp:
        begin, finish = entry[0], entry[1]
        # Entries whose start equals their end are left out of the tier.
        if begin == finish:
            continue
        vad_tier.addInterval(
            textgrid.Interval(minTime=begin, maxTime=finish, mark=entry[2])
        )

    grid.tiers.append(vad_tier)
    grid.write(save_path)


def vad_result_to_timestamp(vad_list):
    """Collapse per-frame VAD values into runs of identical consecutive values.

    Args:
        vad_list: Indexable sequence of per-frame values. A run whose value
            equals 0 is labelled ``'0'``; any other value is labelled ``'1'``.

    Returns:
        A list of ``[start, end, label]`` entries, one per run, where
        ``start`` is the index of the run's first frame and ``end`` is one
        past its last frame. Empty input yields an empty list.
    """
    output_result = []
    total_frames = len(vad_list)
    run_start = 0

    # Walk one step past the last frame so the final run is closed by the
    # same code path as every other run.
    for frame_idx in range(1, total_frames + 1):
        run_ended = (frame_idx == total_frames
                     or vad_list[frame_idx] != vad_list[run_start])
        if not run_ended:
            continue
        label = '0' if vad_list[run_start] == 0 else '1'
        output_result.append([run_start, frame_idx, label])
        # The next run begins exactly where this one ended, so consecutive
        # entries share a boundary and no frame is skipped or mislabelled.
        run_start = frame_idx

    return output_result


def process(vad_list, save_path, frame_shift=0.01):
    """Convert per-frame VAD values to a TextGrid file and print the intervals.

    Args:
        vad_list: Per-frame VAD values, passed to ``vad_result_to_timestamp``.
        save_path: Output path passed on to ``make_tg``.
        frame_shift: Factor that turns a frame index into a time value.
            Defaults to 0.01, the constant this function has always used
            (presumably 10 ms frames — confirm against the producer of
            ``vad_list``).
    """
    # This assignment was commented out before, which left `frame_idx_list`
    # undefined and made every call fail with NameError.
    frame_idx_list = vad_result_to_timestamp(vad_list)
    timestamp = [[start * frame_shift, end * frame_shift, mark]
                 for start, end, mark in frame_idx_list]
    duration = len(vad_list) * frame_shift
    make_tg(timestamp, duration, save_path)
    print(timestamp)

# `audio` is only assigned in commented-out lines, so the duration is taken
# from `wav` instead. The boundaries were requested with return_seconds=True,
# so the TextGrid length is expressed in seconds: samples / sample rate.
make_tg(output_result, len(wav) / SAMPLING_RATE, "./textgrid")



In [None]:
## just probabilities
# Run the model directly on consecutive fixed-size chunks of the example file
# and collect one speech probability per chunk.

wav = read_audio('en_example.wav', sampling_rate=SAMPLING_RATE)
speech_probs = []
window_size_samples = 1536  # samples per chunk handed to the model
for i in range(0, len(wav), window_size_samples):
    chunk = wav[i: i+ window_size_samples]
    if len(chunk) < window_size_samples:
      # trailing partial chunk is not scored
      break
    speech_prob = model(chunk, SAMPLING_RATE).item()
    speech_probs.append(speech_prob)
# NOTE(review): the reset goes through `vad_iterator`, which is created in an
# earlier cell — presumably it wraps this same `model`; confirm before
# running this cell on its own.
vad_iterator.reset_states() # reset model states after each audio

print(speech_probs[:10]) # predictions for the first 10 chunks

# Number detector

## Install Dependencies

In [None]:
#@title Install and Import Dependencies

# Setup cell for the number-detector section: installs torchaudio, defines the
# sample rate used by the cells below, and downloads the example recording.

# this assumes that you have a relevant version of PyTorch installed
!pip install -q torchaudio

# Sample rate used when reading/saving audio and when converting ms to samples below.
SAMPLING_RATE = 16000

import torch
# Limit PyTorch to one thread.
torch.set_num_threads(1)

from IPython.display import Audio
from pprint import pprint
# download example
# (saved next to the notebook as 'en_number_example.wav')
torch.hub.download_url_to_file('https://models.silero.ai/vad_models/en_num.wav', 'en_number_example.wav')

In [None]:
# Load the 'silero_number_detector' entry point from the snakers4/silero-vad
# hub repo and unpack the helper functions that come with it.
# This rebinds `model`, `save_audio`, `read_audio` and `collect_chunks`.
USE_ONNX = False # change this to True if you want to test onnx model
if USE_ONNX:
    # onnxruntime is only installed when the ONNX variant is requested
    !pip install -q onnxruntime
  
# force_reload=True asks torch.hub for a fresh copy of the repo on every run.
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_number_detector',
                              force_reload=True,
                              onnx=USE_ONNX)

# `utils` is unpacked positionally into five names; the order matters.
(get_number_ts,
 save_audio,
 read_audio,
 collect_chunks,
 drop_chunks) = utils


## Full audio

In [None]:
# Read the downloaded example at SAMPLING_RATE.
wav = read_audio('en_number_example.wav', sampling_rate=SAMPLING_RATE)
# get number timestamps from full audio file
number_timestamps = get_number_ts(wav, model)
# Pretty-print the detected timestamps.
pprint(number_timestamps)

In [None]:
# Rescale each number timestamp in place from milliseconds to sample indices.
# The dicts are overwritten, so running this cell twice rescales them twice.
for timestamp in number_timestamps:
    for boundary in ('start', 'end'):
        timestamp[boundary] = int(timestamp[boundary] * SAMPLING_RATE / 1000)

In [None]:
# merge all number chunks to one audio
# (uses the timestamps after the ms-to-samples conversion cell; the result is
#  written to 'only_numbers.wav' at SAMPLING_RATE)
save_audio('only_numbers.wav',
           collect_chunks(number_timestamps, wav), SAMPLING_RATE) 
# Last expression of the cell: an IPython Audio widget for the saved file.
Audio('only_numbers.wav')

In [None]:
# drop all number chunks from audio
# (the counterpart of the previous cell; the result is written to
#  'no_numbers.wav' at SAMPLING_RATE)
save_audio('no_numbers.wav',
           drop_chunks(number_timestamps, wav), SAMPLING_RATE) 
# Last expression of the cell: an IPython Audio widget for the saved file.
Audio('no_numbers.wav')

# Language detector

## Install Dependencies

In [None]:
#@title Install and Import Dependencies

# Setup cell for the language-detector section: installs torchaudio, defines
# the sample rate used by the cells below, and downloads the example recording.

# this assumes that you have a relevant version of PyTorch installed
!pip install -q torchaudio

# Sample rate handed to read_audio below.
SAMPLING_RATE = 16000

import torch
# Limit PyTorch to one thread.
torch.set_num_threads(1)

from IPython.display import Audio
from pprint import pprint
# download example
# (saved next to the notebook as 'en_example.wav')
torch.hub.download_url_to_file('https://models.silero.ai/vad_models/en.wav', 'en_example.wav')

In [None]:
# Load the 'silero_lang_detector' entry point from the snakers4/silero-vad
# hub repo. This rebinds `model` and `read_audio`.
USE_ONNX = False # change this to True if you want to test onnx model
if USE_ONNX:
    # onnxruntime is only installed when the ONNX variant is requested
    !pip install -q onnxruntime
  
# force_reload=True asks torch.hub for a fresh copy of the repo on every run.
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_lang_detector',
                              force_reload=True,
                              onnx=USE_ONNX)

# `utils` is unpacked positionally into two names; the order matters.
get_language, read_audio = utils

## Full audio

In [None]:
# Read the downloaded example at SAMPLING_RATE and print whatever
# get_language returns for it.
wav = read_audio('en_example.wav', sampling_rate=SAMPLING_RATE)
lang = get_language(wav, model)
print(lang)