In [None]:
import json
import os
import pickle
import re
from glob import glob

import editdistance
import numpy as np
import pandas as pd
from pyannote.audio import Model
from pyannote.audio.pipelines import VoiceActivityDetection
from pyannote.core import notebook, Segment
from pydub import AudioSegment
from tqdm import tqdm
from whisper_model import WhisperASR
import matplotlib.pyplot as plt

import scipy.io.wavfile as wav
import scipy.signal as signal

import tempfile




import torch
# Cap torch at a single thread for everything that follows in this notebook.
torch.set_num_threads(1)

from IPython.display import Audio
from pprint import pprint
# Silero VAD: fetch the model and its helper bundle through torch.hub.
USE_ONNX = False  # set to True to try the ONNX variant of the model
model, utils = torch.hub.load(
    repo_or_dir='snakers4/silero-vad',
    model='silero_vad',
    force_reload=True,
    onnx=USE_ONNX,
)

# The helper bundle is unpacked positionally into five module-level names.
get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks = utils




# The Hugging Face access token is read from the HF_TOKEN environment variable
# rather than being stored in the notebook. When HF_TOKEN is unset, None is
# passed and authentication is left to the library.
# NOTE(review): a token was previously committed on this line; it should be
# revoked on the Hugging Face side.
modelPyannote = Model.from_pretrained(
    "pyannote/segmentation",
    use_auth_token=os.environ.get("HF_TOKEN"),
)

# Voice-activity-detection pipeline built on top of the segmentation model.
pipeline = VoiceActivityDetection(segmentation=modelPyannote)
HYPER_PARAMETERS = {
    # onset/offset activation thresholds
    "onset": 0.5,
    "offset": 0.5,
    # remove speech regions shorter than that many seconds.
    "min_duration_on": 0.0,
    # fill non-speech regions shorter than that many seconds.
    "min_duration_off": 0.05,
}
pipeline.instantiate(HYPER_PARAMETERS)


In [None]:

# Amount (in seconds) by which pad() widens a segment on each side.
padding = 0.025
# Sample rate (Hz) the notebook resamples audio to and passes to the Silero helpers.
SAMPLING_RATE = 16000
def edit_distance(s1, s2):
    """Return the edit distance between ``s1`` and ``s2`` via ``editdistance.eval``."""
    distance = editdistance.eval(s1, s2)
    return distance


def format_int(i):
    """Render ``i`` as text, left-padded with zeros to at least 8 characters."""
    as_text = str(i)
    return as_text.zfill(8)


def trim_audio(path, start, end, out_path):
    """Cut the ``start``..``end`` window (in seconds) out of a WAV file and save it.

    The requested bounds are clamped to the audio first: a negative ``start``
    becomes 0 and an ``end`` past the clip is lowered to the clip duration.

    Args:
        path: WAV file to read.
        start: Window start, in seconds.
        end: Window end, in seconds.
        out_path: Where the trimmed WAV file is written.

    Returns:
        Tuple ``(out_path, start, end)`` carrying the clamped bounds.
    """
    source_clip = AudioSegment.from_file(path, format="wav")
    # The clip length is divided by 1000 to obtain its duration in seconds.
    duration_s = len(source_clip) / 1000
    start = max(0, start)
    end = min(end, duration_s)
    # The slice bounds are the clamped times scaled back up by 1000.
    source_clip[start * 1000 : end * 1000].export(out_path, format="wav")
    return out_path, start, end

def run_pyannote_vad(file):
    """Run the pyannote VAD pipeline on ``file`` and collapse the result to one span.

    Args:
        file: Audio input handed to the module-level ``pipeline``.

    Returns:
        A one-element list ``[(start, end)]`` that runs from the start of the
        first detected region to the end of the last one, or ``[(0, 0)]`` when
        no region is detected.
    """
    detected_regions = pipeline(file).get_timeline().support()
    spans = [(region_start, region_end) for region_start, region_end in detected_regions]
    if not spans:
        return [(0, 0)]
    # Only the outer bounds survive; gaps between detected regions are ignored.
    return [(spans[0][0], spans[-1][1])]

def run_silerio_vad(file):
    """Run the Silero VAD helpers on ``file`` and collapse the result to one span.

    Args:
        file: Audio file path passed to ``read_audio``.

    Returns:
        A one-element list ``[(start, end)]`` that runs from the start of the
        first detected region to the end of the last one, or ``[(0, 0)]`` when
        no region is detected. Values are the reported timestamps divided by
        ``SAMPLING_RATE``.
    """
    audio_samples = read_audio(file, sampling_rate=SAMPLING_RATE)
    raw_stamps = get_speech_timestamps(audio_samples, model, sampling_rate=SAMPLING_RATE)
    # Express each timestamp in seconds by dividing by the sampling rate.
    spans = [
        (stamp["start"] / SAMPLING_RATE, stamp["end"] / SAMPLING_RATE)
        for stamp in raw_stamps
    ]
    if not spans:
        return [(0, 0)]
    # Only the outer bounds survive; gaps between detected regions are ignored.
    return [(spans[0][0], spans[-1][1])]


In [None]:
# Dataset folder to draw files from. Exactly one assignment is active; the
# commented lines are alternative folders that can be swapped in.
# files_folder = "/data/tts-qa/tts-data/French(Dorsaf)/trimmed"
# files_folder = "/data/tts-qa/tts-data/German(Dorothee)/trimmed"
# files_folder = "/data/tts-qa/tts-data/English(Melynda)/trimmed"
# files_folder = "/data/tts-qa/tts-data/Italian(Martina) Deliverable 3/trimmed"
files_folder = "/data/tts-qa/tts-data/Spanish(Violeta) Deliverable 3/trimmed"


# Every .wav file located directly inside the selected folder.
files = glob(os.path.join(files_folder, "*.wav"))


In [None]:
# Randomly select up to 50 files.
# - The list is sorted first: glob() does not guarantee an ordering, so the
#   seed alone would not pin which files end up in the sample.
# - The sample size is capped at the number of files available, because
#   sampling without replacement raises when asked for more items than exist.
np.random.seed(10)
selected_files = np.random.choice(sorted(files), min(50, len(files)), replace=False)

# selected_files = [
#     "/data/tts-qa/tts-data/French(Dorsaf)/trimmed/FR00000026.wav",
#     "/data/tts-qa/tts-data/French(Dorsaf)/trimmed/FR00000032.wav",
#     "/data/tts-qa/tts-data/French(Dorsaf)/trimmed/FR00000033.wav",
#     "/data/tts-qa/tts-data/French(Dorsaf)/trimmed/FR00000035.wav",
#     "/data/tts-qa/tts-data/French(Dorsaf)/trimmed/FR00001001.wav",
# ]


# rare_cases = [
#     "/data/tts-qa/tts-data/French(Dorsaf)/raw/FR00000907.wav",
#     "/data/tts-qa/tts-data/French(Dorsaf)/raw/FR00000115.wav",
#     "/data/tts-qa/tts-data/French(Dorsaf)/raw/FR00000169.wav",
#     "/data/tts-qa/tts-data/French(Dorsaf)/raw/FR00000952.wav",
#     "/data/tts-qa/tts-data/French(Dorsaf)/raw/FR00000584.wav",
#     "/data/tts-qa/tts-data/French(Dorsaf)/raw/FR00000911.wav",
#     "/data/tts-qa/tts-data/French(Dorsaf)/raw/FR00001238.wav",
#     "/data/tts-qa/tts-data/French(Dorsaf)/raw/FR00000979.wav",
# ]
# selected_files= rare_cases

# selected_files = [
#     "/data/tts-qa/tts-data/German(Dorothee)/trimmed/DE00080623.wav",
#     "/data/tts-qa/tts-data/German(Dorothee)/trimmed/DE00069958.wav",
# ]


In [None]:
from copy import deepcopy
def _energetic_windows(waveform, scan_start, scan_end, sample_rate, energy_threshold, window_size):
    """Return the ``(start, end, energy)`` windows between two times whose mean squared amplitude exceeds the threshold."""
    loud_windows = []
    num_windows = int((scan_end - scan_start) / window_size)
    for i in range(num_windows):
        first_sample = int((scan_start + i * window_size) * sample_rate)
        last_sample = int(first_sample + window_size * sample_rate)
        # Promote to float64 before squaring: integer PCM samples (e.g. int16)
        # would otherwise wrap around and under-report the energy.
        window_samples = np.asarray(waveform[first_sample:last_sample], dtype=np.float64)
        if len(window_samples) == 0:
            # The window lies past the end of the waveform; nothing to measure.
            continue
        window_energy = np.sum(window_samples ** 2) / len(window_samples)
        if window_energy > energy_threshold:
            loud_windows.append(
                (scan_start + i * window_size, scan_start + (i + 1) * window_size, window_energy)
            )
    return loud_windows


def my_custom_vad(pyannote_segment, silerio_segment, waveform, sample_rate=SAMPLING_RATE, energy_threshold=500000, window_size=0.02):
    """Reconcile a pyannote segment and a Silero segment into a single segment.

    Each boundary (start, end) is handled independently:

    * If the two detectors agree to within 0.05 s, the wider choice is taken
      (the earlier start, the later end).
    * Otherwise the disputed stretch between the two candidates is cut into
      ``window_size`` windows and each window's mean squared amplitude is
      compared with ``energy_threshold``. The start moves back to the first
      energetic window (defaulting to the later candidate), and the end moves
      forward to the last energetic window (defaulting to the earlier
      candidate).

    Args:
        pyannote_segment: ``(start, end)`` pair in seconds.
        silerio_segment: ``(start, end)`` pair in seconds.
        waveform: Sequence of audio samples indexed at ``sample_rate``.
        sample_rate: Samples per second of ``waveform``.
        energy_threshold: Mean squared amplitude a window must exceed to
            count as energetic.
        window_size: Window length in seconds.

    Returns:
        ``(custom_segment, merged_timeline)`` where ``custom_segment`` is the
        reconciled ``(start, end)`` pair and ``merged_timeline`` is the list of
        energetic ``(start, end, energy)`` windows from the most recently
        scanned boundary. A scan of the end replaces the result of a scan of
        the start; the list is empty if neither boundary was scanned.
    """
    pyannote_start, pyannote_end = pyannote_segment
    silerio_start, silerio_end = silerio_segment
    merged_timeline = []

    # --- start boundary ---
    if abs(pyannote_start - silerio_start) < 0.05:
        merged_start = min(pyannote_start, silerio_start)
    else:
        earlier_start = min(pyannote_start, silerio_start)
        later_start = max(pyannote_start, silerio_start)
        merged_timeline = _energetic_windows(
            waveform, earlier_start, later_start, sample_rate, energy_threshold, window_size
        )
        # Without energetic evidence, stay with the more conservative (later) start.
        merged_start = merged_timeline[0][0] if merged_timeline else later_start

    # --- end boundary ---
    if abs(pyannote_end - silerio_end) < 0.05:
        merged_end = max(pyannote_end, silerio_end)
    else:
        earlier_end = min(pyannote_end, silerio_end)
        later_end = max(pyannote_end, silerio_end)
        merged_timeline = _energetic_windows(
            waveform, earlier_end, later_end, sample_rate, energy_threshold, window_size
        )
        # Without energetic evidence, stay with the more conservative (earlier) end.
        merged_end = merged_timeline[-1][1] if merged_timeline else earlier_end

    custom_segment = (merged_start, merged_end)
    return custom_segment, merged_timeline


In [None]:
def pad(waveform, segment):
    """Widen ``segment`` by ``padding`` seconds on both sides, clamped to the waveform.

    Args:
        waveform: Sample sequence; its length divided by ``SAMPLING_RATE``
            gives the upper bound for the padded end.
        segment: Indexable ``(start, end)`` pair in seconds.

    Returns:
        Tuple ``(start, end)`` with the start no lower than 0 and the end no
        higher than the waveform duration.
    """
    seg_start, seg_end = segment[0], segment[1]
    padded_start = max(0, seg_start - padding)
    padded_end = min(len(waveform) / SAMPLING_RATE, seg_end + padding)
    return (padded_start, padded_end)

In [None]:

import tempfile

# Scratch directory for the resampled WAV that both VADs read. The context
# manager removes it (and its contents) on exit, including when an iteration
# raises, so no temporary files are left behind by a failed run.
with tempfile.TemporaryDirectory() as temp_dir:
    for file in selected_files:
        # load the waveform
        # NOTE(review): the flat sample array is used as-is, which assumes
        # single-channel input — confirm for the datasets in use.
        audio = AudioSegment.from_file(file, format="wav")
        waveform = np.array(audio.get_array_of_samples())

        original_sampling_rate = audio.frame_rate
        target_sampling_rate = SAMPLING_RATE

        # Resample to the rate shared by both VADs.
        resampled_waveform = signal.resample(waveform, int(len(waveform) * target_sampling_rate / original_sampling_rate))

        # The same scratch file is overwritten on every iteration.
        temp_wav_file = os.path.join(temp_dir, "temp_audio.wav")
        wav.write(temp_wav_file, target_sampling_rate, resampled_waveform.astype('int16'))

        # Each VAD returns a one-element list; take its single (start, end) span and pad it.
        pyannote_segment = run_pyannote_vad(temp_wav_file)[0]
        silerio_segment = run_silerio_vad(temp_wav_file)[0]
        pyannote_segment = pad(resampled_waveform, pyannote_segment)
        silerio_segment = pad(resampled_waveform, silerio_segment)

        # Reconcile the two padded spans, then pad the reconciled span once more.
        custom_segment, merged = my_custom_vad(pyannote_segment, silerio_segment, resampled_waveform, SAMPLING_RATE)
        custom_segment = pad(resampled_waveform, custom_segment)

        # Calculate time vector
        time_vector = np.linspace(0, len(resampled_waveform) / SAMPLING_RATE, num=len(resampled_waveform))

        fig = plt.figure(figsize=(20, 5), dpi=50)
        plt.yticks([])
        plt.plot(time_vector, resampled_waveform)
        # add the filename as title
        plt.title(os.path.basename(file))
        plt.xlabel("time (s)")

    #     plt.axvspan(silerio_segment[0], silerio_segment[1] , color="red", alpha=0.5, label="SILERIO-VAD")

        plt.axvspan(pyannote_segment[0], pyannote_segment[1], color="green", alpha=0.3, label="PYANNOTE")

        plt.axvspan(custom_segment[0], custom_segment[1], color="black", alpha=0.3, label="MY_CUSTOM_VAD")

        plt.legend()
        plt.show()
        # Release the figure so a long run does not accumulate open figures.
        plt.close(fig)