In [None]:
import os
import sys

import editdistance

sys.path.append("../")

from pyannote.audio import Model
from pyannote.audio.pipelines import VoiceActivityDetection


from src.logger import root_logger


# Child of the project's root logger, registered under the name "alignment_utils".
app_logger = root_logger.getChild("alignment_utils")


def edit_distance(s1, s2):
    """Return the edit distance between ``s1`` and ``s2``.

    Thin wrapper that delegates to ``editdistance.eval`` and returns its
    result unchanged.
    """
    distance = editdistance.eval(s1, s2)
    return distance


def format_int(i):
    """Return ``str(i)`` left-padded with zeros to a width of at least eight.

    Padding follows ``str.zfill``: a leading sign stays in front of the
    zeros, and text that is already eight or more characters long is
    returned unchanged.
    """
    text = str(i)
    return text.zfill(8)


# Load the pretrained pyannote segmentation model.
# The Hugging Face access token is read from the environment (HF_TOKEN, then
# HUGGING_FACE_HUB_TOKEN) instead of being stored in the notebook.
# SECURITY: a literal token used to be committed on this line; it must be
# treated as leaked and revoked on the Hugging Face account.
# NOTE(review): when neither variable is set, None is passed as the token —
# confirm that the model can then still be fetched (e.g. via a cached login).
modelPyannote = Model.from_pretrained(
    "pyannote/segmentation",
    use_auth_token=os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN"),
)


# Margin subtracted from each segment's start and added to its end before the
# segment is cut out of the recording (see the extraction loop below).
# The loop compares it against len(data) / 1000, i.e. it is expressed in seconds.
padding = 0.25

In [None]:
# Settings handed to the VAD pipeline when it is instantiated.
HYPER_PARAMETERS = dict(
    # onset/offset activation thresholds
    onset=0.5,
    offset=0.5,
    # remove speech regions shorter than that many seconds.
    min_duration_on=0.0,
    # fill non-speech regions shorter than that many seconds.
    min_duration_off=0.0,
)

# Build the voice-activity-detection pipeline on top of the pretrained
# segmentation model, then apply the settings above.
pipeline = VoiceActivityDetection(segmentation=modelPyannote)
pipeline.instantiate(HYPER_PARAMETERS)

In [None]:
# Audio file analysed by every cell below.
# The path can be overridden through the VAD_AUDIO_PATH environment variable so
# the notebook is not tied to one machine's directory layout; the default is
# the sample the notebook was written against.
# Alternative input that was previously left commented out here:
#   /home/ubuntu/repos/tts-qa/MariahProctorTTSGerman.wav
filename = os.environ.get("VAD_AUDIO_PATH", "/home/ubuntu/repos/tts-qa/notebooks/test-eng-57.wav")

# Run the pyannote VAD pipeline on the file.
vad = pipeline(filename)

In [None]:
from tqdm import tqdm
from pydub import AudioSegment

# Decode the recording and take the timeline derived from the VAD result.
data = AudioSegment.from_file(filename)
timeline = vad.get_timeline().support()

for segment in tqdm(timeline):
    start, end = list(segment)
    # Widen the segment by `padding` on both sides, clamped to the recording:
    # not before 0 and not past len(data) / 1000.
    start = max(0, start - padding)
    end = min(end + padding, len(data) / 1000)
    seg = {"SegmentStart": start, "SegmentEnd": end}
    # The bounds are scaled by 1000 before being used to slice `data`.
    clip = data[seg["SegmentStart"] * 1000 : seg["SegmentEnd"] * 1000]
    outputAudio = AudioSegment.empty()
    outputAudio += clip
    # Stop after the first segment; later segments are never processed.
    break

In [None]:
# Display the padded bounds left over from the extraction loop above.
# Both names are only bound if the timeline contained at least one segment.
start, end

## Silero VAD

In [None]:
# Read the audio file with soundfile and report its sampling rate.
# sf.read returns both the decoded samples (`audio`) and the rate (`sr`);
# only `sr` is used in this cell.
import soundfile as sf

audio, sr = sf.read(filename)

print("sampling rate:", sr)

In [None]:
# Convert the recording to 16 kHz.
# The resampled audio is written to a sibling "<name>_16k.wav" file and
# `filename` is pointed at that copy, instead of exporting back over the input.
# Overwriting the source destroyed the original recording and meant the cells
# above, which read `filename` at its native rate, saw different audio on the
# next run of the notebook.
from pydub import AudioSegment

sound = AudioSegment.from_file(filename)
if sound.frame_rate != 16000:
    # Only resample when needed, so re-running this cell (when `filename`
    # already points at the 16 kHz copy) is a no-op.
    sound = sound.set_frame_rate(16000)
    filename = f"{os.path.splitext(filename)[0]}_16k.wav"
    # export() hands back the open output file object; close it so the handle
    # is not left dangling in the kernel.
    sound.export(filename, format="wav").close()


In [None]:
import torch
# Restrict PyTorch to a single thread for CPU intra-op parallelism.
# NOTE(review): presumably carried over from the Silero VAD example setup —
# confirm it is still wanted, since it applies to the whole kernel.
torch.set_num_threads(1)

# NOTE(review): `Audio` is not used by any cell visible here; `pprint` is used
# to print the Silero speech timestamps below.
from IPython.display import Audio
from pprint import pprint

In [None]:
USE_ONNX = False  # change this to True if you want to test onnx model
SAMPLING_RATE = 16_000

# Fetch the Silero VAD model and its helper utilities through torch.hub.
# force_reload=True is passed, so the hub repository is re-fetched on each run.
model, utils = torch.hub.load(
    repo_or_dir="snakers4/silero-vad",
    model="silero_vad",
    force_reload=True,
    onnx=USE_ONNX,
)

In [None]:
# Unpack the five helpers returned by torch.hub.load alongside the model.
# The order must match the order in which `utils` provides them.
get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks = utils

In [None]:
# Load the recording through the Silero helper at SAMPLING_RATE.
wav = read_audio(filename, sampling_rate=SAMPLING_RATE)
# Get speech timestamps from the full audio file using the Silero model
# loaded via torch.hub above, then pretty-print them.
speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=SAMPLING_RATE)
pprint(speech_timestamps)

In [None]:
# Number of entries returned by get_speech_timestamps for this file.
len(speech_timestamps)

## Whisper VAD

In [None]:
import whisper_timestamped as whisperts

# Decode `filename` with whisper-timestamped's own loader.
# NOTE(review): this rebinds `audio`, which the sampling-rate cell above had
# bound to the samples returned by sf.read().
audio = whisperts.load_audio(filename)

# Prefer the GPU, but fall back to the CPU so the cell still runs on machines
# without CUDA instead of failing at model load.
whisper_device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(review): this rebinds `model`, replacing the Silero VAD model loaded via
# torch.hub; re-running the Silero cells after this one will use the wrong model.
model = whisperts.load_model("medium", device=whisper_device)

In [None]:
# Transcribe the recording with whisper-timestamped, with its VAD option
# enabled and disfluency detection disabled, forcing English.
# The input is `audio`, the waveform returned by whisperts.load_audio(filename)
# in the previous cell. The pydub AudioSegment `data` that used to be passed
# here is not an input transcribe() accepts (it expects a file path or a
# waveform array/tensor).
results = whisperts.transcribe(model, audio, vad=True, detect_disfluencies=False, language="en")

In [None]:
# Inspect the first entry of the "segments" list in the transcription result.
results["segments"][0]