Play slices of the waveforms:

In [1]:
import torchaudio
from IPython.display import Audio

# Load one utterance and keep the sampling rate reported by the loader, so the
# second -> frame conversion and the playback rate below always match the file
# instead of relying on a hard-coded 16 kHz.
waveform, sample_rate = torchaudio.load("data/librispeech_subset/84-121123-0001.wav")

# Segment of interest, in seconds.
start_time = 1.1
end_time = 1.46

# Convert the time window into frame offsets.
start_frame = int(start_time * sample_rate)
end_frame = int(end_time * sample_rate)

# The first axis is kept whole and the second axis is cut to the frame window
# (presumably channels x frames -- confirm against the loader's output layout).
waveform_slice = waveform[:, start_frame:end_frame]

# Last expression of the cell: the notebook renders it as an audio player.
Audio(waveform_slice.squeeze(), rate=sample_rate)

Get the words and their indices from the LibriSpeech .TextGrid files:

In [1]:
from pathlib import Path
import textgrids

    

input_dir = Path("data/all_textgrid")
files = list(input_dir.rglob("**/*.TextGrid"))
output_path = "data/words_and_indices.txt"

with open(output_path, "w", encoding="utf-8") as f:
    for file in files:
        grid = textgrids.TextGrid(file)

        words_tier = grid["words"]
        file_name = file.stem
        
        f.write(f"{file_name}:\n")
        for idx, interval in enumerate(words_tier, start=1):
            if idx == 1:
                continue
            f.write(f"{idx-2}: {interval.text}\n")
        f.write("\n")


Getting the timestamps for the indices of words:

In [2]:
from pathlib import Path
import textgrids

# Collect every TextGrid under the input directory and dump, per file, the
# [xmin:xmax] bounds of each interval in its "words" tier.
input_dir = Path("data/all_textgrid")
files = list(input_dir.rglob("**/*.TextGrid"))
output_path = "data/timestamps_and_indices.txt"

with open(output_path, "w", encoding="utf-8") as out:
    for textgrid_path in files:
        tier = textgrids.TextGrid(textgrid_path)["words"]
        stem = textgrid_path.stem

        # One section per file: "<stem>:" header, entries, then a blank line.
        section = [f"{stem}:\n"]
        # Counting from -1 drops the first interval of the tier and numbers
        # the remaining intervals from 0.
        for position, interval in enumerate(tier, start=-1):
            if position < 0:
                continue
            section.append(f"{position}: [{interval.xmin}:{interval.xmax}]\n")
        section.append("\n")
        out.writelines(section)


Evaluate the saved norm_dist_mat.npy files:

In [None]:
import numpy as np
from utils import Cluster 
from cluster import cluster

# Settings that select which saved distance matrix is evaluated.
model_name = "wavlm_base"
layer_num = 8
dist_threshold = 0.55
dist_mat_dir = Path(f"output/dtw/{model_name}/{layer_num}/d{dist_threshold}")
dist_mat_dir.mkdir(parents=True, exist_ok=True)

new_norm_dist_mat = np.load(dist_mat_dir / "norm_dist_mat.npy")
print(new_norm_dist_mat)

# NOTE: `file_names` and `indices_dict` are not defined in this cell; they are
# expected to exist already in the notebook session.
clusters = cluster(new_norm_dist_mat, file_names, model_name, layer_num, dist_threshold)

appended_clusters = []
for cluster_id, members in enumerate(clusters):
    new_cluster = Cluster(cluster_id)

    # Each member indexes into `file_names`, whose entries are split on "_":
    # the first part is used as the file name, the second as an integer index.
    for wordunit_id, member in enumerate(members):
        name_parts = file_names[member].split("_")
        new_cluster.add_word_unit(wordunit_id, int(name_parts[1]), name_parts[0])

    # Look up the entry stored for every word unit and record it on the cluster.
    for word_unit in new_cluster.word_dict:
        new_cluster.add_true_word(indices_dict[word_unit.file][word_unit.index])

    appended_clusters.append(new_cluster.true_word_dict)
    new_cluster.cluster_purity()

    # Only clusters below the 0.8 purity mark are reported.
    if new_cluster.purity < 0.8:
        print(f"purity : {new_cluster.purity*100}%")
        print(new_cluster.true_word_dict)

How to get audio out of my clusters?

In [3]:
import re

# A non-negative decimal number as Python may print a float: "12", "0.53",
# ".5", "1e-05". Exactly one optional decimal point, so malformed values such
# as "1..5" are rejected up front instead of crashing later in float().
_TIMESTAMP_NUMBER = r"(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?"

# One "<index>: [<start>:<end>]" entry line.
_TIMESTAMP_ENTRY = re.compile(
    rf"(\d+): \[({_TIMESTAMP_NUMBER}):({_TIMESTAMP_NUMBER})\]"
)


def parse_text_to_dict(file):
    """Parse a timestamps listing into ``{section: {index: (start, end)}}``.

    The file is read as UTF-8 text made of sections. A section starts with a
    header line ending in ``:`` whose part before the first colon is not
    purely numeric; the header text without the trailing colon is the section
    key. Every following line of the form ``<index>: [<start>:<end>]`` adds
    ``index -> (start, end)`` to the current section, with ``index`` as an
    ``int`` and both bounds as ``float``. Blank lines and lines matching
    neither form are ignored.

    Args:
        file: Path of the text file to read.

    Returns:
        A dict mapping each section key to a dict of ``int`` index to
        ``(start, end)`` float tuple. Entries seen before any header are
        discarded, and a header with no entries maps to an empty dict.
    """
    data_dict = {}
    current_id = None
    timestep_dict = {}

    with open(file, "r", encoding="utf-8") as f:
        # Stream the file line by line instead of loading it whole.
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue

            # Header check: a purely numeric prefix is not treated as a header.
            if line.endswith(":") and not line.split(":")[0].isdigit():
                # A new section begins: store the one collected so far.
                if current_id is not None:
                    data_dict[current_id] = timestep_dict
                current_id = line[:-1]
                timestep_dict = {}
                continue

            match = _TIMESTAMP_ENTRY.match(line)
            if match:
                index = int(match.group(1))
                timestep_dict[index] = (float(match.group(2)), float(match.group(3)))

    # Store the final section, which no later header would flush.
    if current_id is not None:
        data_dict[current_id] = timestep_dict

    return data_dict


In [None]:
from eval import  parse_cluster_file, parse_text_to_dict
from pathlib import Path
from IPython.display import Audio, display
import torchaudio

def waveform_slice(file, start_time, end_time):
    """Load an audio file and return the samples between two time points.

    The second -> frame conversion uses the sampling rate reported by the
    loader for this file, so the window is correct whatever rate the file
    was recorded at.

    Args:
        file: Path of the audio file, passed to ``torchaudio.load``.
        start_time: Start of the window, in seconds.
        end_time: End of the window (exclusive), in seconds.

    Returns:
        The loaded waveform with its first axis kept whole and its second
        axis restricted to the requested frame window.
    """
    waveform, sample_rate = torchaudio.load(file)
    start_frame = int(start_time * sample_rate)
    end_frame = int(end_time * sample_rate)

    return waveform[:, start_frame:end_frame]


# Play back the audio segment of every word unit in every saved cluster file.
cluster_dir = Path("output/dtw/clusters/")
cluster_files = list(cluster_dir.rglob("*.txt"))
wav_dir = Path("data/librispeech-wav/")

indices_dict = parse_text_to_dict("data/timestamps_and_indices.txt")

clusters_arr = []
for cluster_file in cluster_files:
    clusters = parse_cluster_file(cluster_file)

    for clust in clusters:
        print(f"Cluster {clust.id}")

        # First pass: attach start/end times to every word unit.
        for word_unit in clust.word_dict:
            times = indices_dict[word_unit.file][word_unit.index]
            if isinstance(times, str):
                # Raw "[start:end]" text: strip the brackets around the pair.
                start_text, end_text = times.split(":")
                start_time = float(start_text[1:])
                end_time = float(end_text[:-1])
            else:
                # Already-parsed (start, end) pair, which is what the
                # parse_text_to_dict defined in this notebook returns;
                # calling .split() on it would raise AttributeError.
                start_time, end_time = (float(bound) for bound in times)
            word_unit.add_word_boundaries(start_time, end_time)

        # Second pass: cut each segment out of its wav file and render a
        # player for it. An empty cluster simply renders nothing.
        for word_unit in clust.word_dict:
            filename = wav_dir / f"{word_unit.file}.wav"
            segment = waveform_slice(filename, word_unit.start_time, word_unit.end_time)
            display(Audio(segment.squeeze(), rate=16000))
