Play slices of the waveforms:

In [1]:
import torchaudio
from IPython.display import Audio

# Keep the sample rate reported by torchaudio for this file instead of
# assuming 16 kHz, so the seconds -> frame conversion below stays correct
# for audio recorded at any rate.
waveform, sample_rate = torchaudio.load("data/librispeech_subset/84-121123-0001.wav")

# Segment of interest, in seconds.
start_time = 1.1
end_time = 1.46

# Convert the timestamps to sample positions.
start_frame = int(start_time * sample_rate)
end_frame = int(end_time * sample_rate)

# Cut along the second axis only; the first axis is kept whole.
waveform_slice = waveform[:, start_frame:end_frame]

# Play the segment at the same rate that was used to compute the frame range.
Audio(waveform_slice.squeeze(), rate=sample_rate)

Get the words and their indices from the LibriSpeech `.TextGrid` files:

In [1]:
from pathlib import Path
import textgrids

    

input_dir = Path("data/all_textgrid")
# rglob() is already recursive, so the bare file pattern is enough; sorting
# makes the record order in the output file reproducible between runs.
files = sorted(input_dir.rglob("*.TextGrid"))
output_path = "data/words_and_indices.txt"

with open(output_path, "w", encoding="utf-8") as f:
    for file in files:
        grid = textgrids.TextGrid(file)

        words_tier = grid["words"]
        file_name = file.stem

        # One record per TextGrid: a "<stem>:" header, one "<index>: <word>"
        # line per kept interval, then a blank separator line.
        f.write(f"{file_name}:\n")
        # Counting starts at -1 so that the first interval of the tier (which
        # is skipped) takes -1 and the remaining intervals are numbered from 0.
        for idx, interval in enumerate(words_tier, start=-1):
            if idx < 0:
                continue
            f.write(f"{idx}: {interval.text}\n")
        f.write("\n")


Get the timestamps for each word index:

In [2]:
from pathlib import Path
import textgrids

input_dir = Path("data/all_textgrid")
# rglob() is already recursive, so the bare file pattern is enough; sorting
# makes the record order in the output file reproducible between runs.
files = sorted(input_dir.rglob("*.TextGrid"))
output_path = "data/timestamps_and_indices.txt"

with open(output_path, "w", encoding="utf-8") as f:
    for file in files:
        grid = textgrids.TextGrid(file)

        words_tier = grid["words"]
        file_name = file.stem

        # One record per TextGrid: a "<stem>:" header, one
        # "<index>: [<xmin>:<xmax>]" line per kept interval, then a blank
        # separator line.
        f.write(f"{file_name}:\n")
        # Counting starts at -1 so that the first interval of the tier (which
        # is skipped) takes -1 and the remaining intervals are numbered from 0.
        for idx, interval in enumerate(words_tier, start=-1):
            if idx < 0:
                continue
            f.write(f"{idx}: [{interval.xmin}:{interval.xmax}]\n")
        f.write("\n")


Evaluate the clusters obtained from a saved `norm_dist_mat.npy` distance matrix:

In [None]:
import numpy as np
from utils import Cluster 
from cluster import cluster

model_name = "wavlm_base"
layer_num = 8
dist_threshold = 0.55
# Directory that is expected to already hold the saved distance matrix.
# It is only read here, so it is not created: a wrong model/layer/threshold
# should fail on the load below rather than leave an empty directory behind.
dist_mat_dir = Path(f"output/dtw/{model_name}/{layer_num}/d{dist_threshold}")

new_norm_dist_mat = np.load(dist_mat_dir / "norm_dist_mat.npy")
print(new_norm_dist_mat)

# NOTE(review): `file_names` and `indices_dict` are not defined in this cell;
# they must already exist in the kernel. `indices_dict` is indexed below as
# indices_dict[file][index] and its values are stored as "true words" -
# confirm that it holds the word mapping and not the timestamp mapping.
clusters = cluster(new_norm_dist_mat, file_names, model_name, layer_num, dist_threshold)

appended_clusters = []
for i, clust in enumerate(clusters):
    new_cluster = Cluster(i)
    # Each member of `clust` is used as a position into `file_names`; the
    # looked-up name is split on "_" into a file part and an integer index.
    for wordunit_id, member in enumerate(clust):
        filename = file_names[member]
        file_parts = filename.split("_")
        file_name = file_parts[0]
        index = int(file_parts[1])

        new_cluster.add_word_unit(wordunit_id, index, file_name)

    # Attach the looked-up word of every unit in the cluster.
    for word_unit in new_cluster.word_dict:
        word = indices_dict[word_unit.file][word_unit.index]
        new_cluster.add_true_word(word)

    appended_clusters.append(new_cluster.true_word_dict)
    new_cluster.cluster_purity()

    # Only clusters whose purity is below 0.8 are printed for inspection.
    if new_cluster.purity < 0.8:
        print(f"purity : {new_cluster.purity*100}%")
        print(new_cluster.true_word_dict)

How to get audio out of my clusters?

In [3]:
import re

def parse_text_to_dict(file):
    """Parse a "<id>:" / "<index>: [<start>:<end>]" text file into nested dicts.

    The expected layout is a header line ending in ":" (the record id),
    followed by entry lines of the form ``"<index>: [<start>:<end>]"``.
    Blank lines are ignored. Entry lines that do not match that form are
    skipped, as are entries that appear before the first header.

    Args:
        file: Path of the UTF-8 text file to read.

    Returns:
        dict mapping each record id (header text without the trailing ":")
        to a dict ``{index (int): (start (float), end (float))}``. If an id
        occurs more than once, the last occurrence wins.
    """
    # Accept every plain float spelling (``12``, ``1.5``, ``.5``, ``1.``) as
    # well as exponent notation (``1e-05``), which Python uses when it
    # formats very small or very large floats.
    number = r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?"
    entry_pattern = re.compile(rf"(\d+): \[({number}):({number})\]")

    data_dict = {}
    current_entries = None  # dict of the record being filled; None before the first header

    with open(file, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue

            # A header ends with ":" and does not start with a purely numeric
            # field (so ids made only of digits are not recognised as headers).
            if line.endswith(":") and not line.split(":")[0].isdigit():
                current_entries = {}
                data_dict[line[:-1]] = current_entries
                continue

            if current_entries is None:
                continue

            match = entry_pattern.match(line)
            if match:
                index = int(match.group(1))
                start = float(match.group(2))
                end = float(match.group(3))
                current_entries[index] = (start, end)

    return data_dict


In [30]:
from eval import  parse_cluster_file, parse_text_to_dict
from pathlib import Path
from IPython.display import Audio, display
import torchaudio

# When True, every slice played below is also written to
# output/sliced_audio/cluster_<id>/<file>_<index>.wav.
output = True

def waveform_slice(file, start_time, end_time):
    """Load an audio file and cut out the samples between two timestamps.

    Args:
        file: Path of the audio file, passed to ``torchaudio.load``.
        start_time: Start of the segment, in seconds.
        end_time: End of the segment, in seconds.

    Returns:
        A ``(segment, sample_rate)`` tuple: ``segment`` is the loaded
        waveform sliced along its second axis, and ``sample_rate`` is the
        rate reported by ``torchaudio.load`` for this file.
    """
    # Use the rate stored in the file rather than assuming 16 kHz, so the
    # seconds -> frame conversion is correct for any input.
    waveform, sample_rate = torchaudio.load(file)
    start_frame = int(start_time * sample_rate)
    end_frame = int(end_time * sample_rate)

    return waveform[:, start_frame:end_frame], sample_rate


cluster_dir = Path("output/dtw/clusters/")
cluster_files = list(cluster_dir.rglob("*.txt"))
wav_dir = Path("data/librispeech-wav/")


indices_dict = parse_text_to_dict("data/timestamps_and_indices.txt")

clusters_arr = []
cluster_file = Path("output/dtw/clusters/wavlm_base_8_d0.55.txt")
clusters = parse_cluster_file(cluster_file)

for clust in clusters:
    # Attach start/end times to every word unit of the cluster.
    for word_unit in clust.word_dict:
        times = indices_dict[word_unit.file][word_unit.index]
        if isinstance(times, str):
            # Textual form "[<start>:<end>]": drop the brackets around the
            # two fields before converting them.
            parts = times.split(":")
            start_time = float(parts[0][1:])
            end_time = float(parts[1][0:-1])
        else:
            # Already-parsed (start, end) pair, as produced by a parser that
            # returns numeric tuples.
            start_time, end_time = times
        word_unit.add_word_boundaries(start_time, end_time)

    # Only clusters with more than one word unit are played / exported.
    if len(clust.word_dict) > 1:
        print(f"Cluster {clust.id}:")
        output_dir = Path(f"output/sliced_audio/cluster_{clust.id}")
        if output:
            # Create the cluster folder once instead of once per word unit.
            output_dir.mkdir(parents=True, exist_ok=True)

        for word_unit in clust.word_dict:
            filename = wav_dir / str(word_unit.file + ".wav")
            # Named `audio_slice` so the builtin `slice` is not shadowed in
            # the kernel namespace.
            audio_slice, sample_rate = waveform_slice(filename, word_unit.start_time, word_unit.end_time)
            # Play back at the rate returned together with the slice.
            display(Audio(audio_slice.squeeze(), rate=sample_rate))
            if output:
                output_filename = output_dir / f"{word_unit.file}_{word_unit.index}.wav"

                torchaudio.save(str(output_filename), audio_slice, sample_rate)
                print(f"Saved: {output_filename}")


Cluster 3:


Saved: output/sliced_audio/cluster_3/84-121123-0000_3.wav


Saved: output/sliced_audio/cluster_3/251-118436-0003_1.wav


Saved: output/sliced_audio/cluster_3/174-50561-0005_11.wav
Cluster 4:


Saved: output/sliced_audio/cluster_4/84-121123-0000_4.wav


Saved: output/sliced_audio/cluster_4/174-50561-0005_13.wav
Cluster 5:


Saved: output/sliced_audio/cluster_5/84-121123-0000_5.wav


Saved: output/sliced_audio/cluster_5/251-118436-0004_9.wav


Saved: output/sliced_audio/cluster_5/174-50561-0005_17.wav


Saved: output/sliced_audio/cluster_5/251-118436-0003_36.wav


Saved: output/sliced_audio/cluster_5/251-118436-0001_8.wav


Saved: output/sliced_audio/cluster_5/251-118436-0001_11.wav


Saved: output/sliced_audio/cluster_5/84-121123-0001_13.wav


Saved: output/sliced_audio/cluster_5/251-118436-0002_23.wav


Saved: output/sliced_audio/cluster_5/251-118436-0000_17.wav


Saved: output/sliced_audio/cluster_5/84-121123-0002_53.wav


Saved: output/sliced_audio/cluster_5/174-50561-0002_5.wav


Saved: output/sliced_audio/cluster_5/174-50561-0001_55.wav


Saved: output/sliced_audio/cluster_5/174-50561-0003_8.wav


Saved: output/sliced_audio/cluster_5/174-50561-0005_8.wav


Saved: output/sliced_audio/cluster_5/174-50561-0000_11.wav


Saved: output/sliced_audio/cluster_5/174-50561-0004_3.wav


Saved: output/sliced_audio/cluster_5/174-50561-0001_47.wav


Saved: output/sliced_audio/cluster_5/174-50561-0001_14.wav


Saved: output/sliced_audio/cluster_5/174-50561-0001_34.wav
Cluster 7:


Saved: output/sliced_audio/cluster_7/174-50561-0001_1.wav


Saved: output/sliced_audio/cluster_7/84-121123-0002_29.wav


Saved: output/sliced_audio/cluster_7/251-118436-0003_8.wav


Saved: output/sliced_audio/cluster_7/251-118436-0003_21.wav
Cluster 10:


Saved: output/sliced_audio/cluster_10/174-50561-0001_4.wav


Saved: output/sliced_audio/cluster_10/251-118436-0002_4.wav


Saved: output/sliced_audio/cluster_10/251-118436-0001_9.wav
Cluster 11:


Saved: output/sliced_audio/cluster_11/174-50561-0001_5.wav


Saved: output/sliced_audio/cluster_11/251-118436-0001_5.wav


Saved: output/sliced_audio/cluster_11/174-50561-0002_2.wav


Saved: output/sliced_audio/cluster_11/174-50561-0003_6.wav


Saved: output/sliced_audio/cluster_11/251-118436-0003_19.wav


Saved: output/sliced_audio/cluster_11/251-118436-0002_1.wav


Saved: output/sliced_audio/cluster_11/251-118436-0004_7.wav


Saved: output/sliced_audio/cluster_11/251-118436-0003_15.wav


Saved: output/sliced_audio/cluster_11/251-118436-0003_29.wav


Saved: output/sliced_audio/cluster_11/251-118436-0000_15.wav


Saved: output/sliced_audio/cluster_11/84-121123-0002_14.wav


Saved: output/sliced_audio/cluster_11/84-121123-0002_27.wav
Cluster 17:


Saved: output/sliced_audio/cluster_17/174-50561-0001_11.wav


Saved: output/sliced_audio/cluster_17/174-50561-0001_17.wav


Saved: output/sliced_audio/cluster_17/174-50561-0001_27.wav


Saved: output/sliced_audio/cluster_17/84-121123-0002_4.wav


Saved: output/sliced_audio/cluster_17/84-121123-0001_6.wav


Saved: output/sliced_audio/cluster_17/174-50561-0000_3.wav


Saved: output/sliced_audio/cluster_17/174-50561-0001_45.wav


Saved: output/sliced_audio/cluster_17/84-121123-0002_23.wav


Saved: output/sliced_audio/cluster_17/174-50561-0001_22.wav


Saved: output/sliced_audio/cluster_17/84-121123-0002_20.wav


Saved: output/sliced_audio/cluster_17/174-50561-0000_8.wav


Saved: output/sliced_audio/cluster_17/84-121123-0002_8.wav


Saved: output/sliced_audio/cluster_17/174-50561-0001_19.wav


Saved: output/sliced_audio/cluster_17/174-50561-0003_0.wav


Saved: output/sliced_audio/cluster_17/84-121123-0002_48.wav


Saved: output/sliced_audio/cluster_17/251-118436-0002_17.wav
Cluster 18:


Saved: output/sliced_audio/cluster_18/174-50561-0001_12.wav


Saved: output/sliced_audio/cluster_18/174-50561-0000_10.wav
Cluster 24:


Saved: output/sliced_audio/cluster_24/174-50561-0001_21.wav


Saved: output/sliced_audio/cluster_24/251-118436-0000_14.wav


Saved: output/sliced_audio/cluster_24/174-50561-0001_41.wav


Saved: output/sliced_audio/cluster_24/84-121123-0002_13.wav
Cluster 28:


Saved: output/sliced_audio/cluster_28/174-50561-0001_26.wav


Saved: output/sliced_audio/cluster_28/174-50561-0000_5.wav


Saved: output/sliced_audio/cluster_28/84-121123-0002_7.wav


Saved: output/sliced_audio/cluster_28/251-118436-0002_16.wav


Saved: output/sliced_audio/cluster_28/84-121123-0002_50.wav


Saved: output/sliced_audio/cluster_28/84-121123-0002_22.wav
Cluster 29:


Saved: output/sliced_audio/cluster_29/174-50561-0001_28.wav


Saved: output/sliced_audio/cluster_29/84-121123-0002_37.wav
Cluster 35:


Saved: output/sliced_audio/cluster_35/174-50561-0001_35.wav


Saved: output/sliced_audio/cluster_35/251-118436-0003_24.wav
Cluster 37:


Saved: output/sliced_audio/cluster_37/174-50561-0001_37.wav


Saved: output/sliced_audio/cluster_37/174-50561-0001_50.wav


Saved: output/sliced_audio/cluster_37/174-50561-0003_5.wav


Saved: output/sliced_audio/cluster_37/84-121123-0002_38.wav
Cluster 38:


Saved: output/sliced_audio/cluster_38/174-50561-0001_38.wav


Saved: output/sliced_audio/cluster_38/174-50561-0001_51.wav
Cluster 49:


Saved: output/sliced_audio/cluster_49/174-50561-0001_54.wav


Saved: output/sliced_audio/cluster_49/174-50561-0004_2.wav
Cluster 51:


Saved: output/sliced_audio/cluster_51/84-121123-0002_1.wav


Saved: output/sliced_audio/cluster_51/174-50561-0002_1.wav


Saved: output/sliced_audio/cluster_51/84-121123-0002_46.wav
Cluster 53:


Saved: output/sliced_audio/cluster_53/84-121123-0002_3.wav


Saved: output/sliced_audio/cluster_53/174-50561-0000_2.wav


Saved: output/sliced_audio/cluster_53/251-118436-0001_3.wav
Cluster 62:


Saved: output/sliced_audio/cluster_62/84-121123-0002_17.wav


Saved: output/sliced_audio/cluster_62/84-121123-0002_31.wav
Cluster 64:


Saved: output/sliced_audio/cluster_64/84-121123-0002_19.wav


Saved: output/sliced_audio/cluster_64/84-121123-0002_26.wav


Saved: output/sliced_audio/cluster_64/84-121123-0002_40.wav
Cluster 71:


Saved: output/sliced_audio/cluster_71/84-121123-0002_33.wav


Saved: output/sliced_audio/cluster_71/251-118436-0003_31.wav


Saved: output/sliced_audio/cluster_71/251-118436-0003_0.wav
Cluster 77:


Saved: output/sliced_audio/cluster_77/84-121123-0002_42.wav


Saved: output/sliced_audio/cluster_77/251-118436-0000_1.wav
Cluster 78:


Saved: output/sliced_audio/cluster_78/84-121123-0002_43.wav


Saved: output/sliced_audio/cluster_78/174-50561-0003_1.wav
Cluster 81:


Saved: output/sliced_audio/cluster_81/84-121123-0002_47.wav


Saved: output/sliced_audio/cluster_81/174-50561-0002_0.wav


Saved: output/sliced_audio/cluster_81/84-121123-0001_0.wav


Saved: output/sliced_audio/cluster_81/174-50561-0005_9.wav
Cluster 84:


Saved: output/sliced_audio/cluster_84/84-121123-0002_52.wav


Saved: output/sliced_audio/cluster_84/251-118436-0001_10.wav
Cluster 86:


Saved: output/sliced_audio/cluster_86/251-118436-0002_2.wav


Saved: output/sliced_audio/cluster_86/251-118436-0004_8.wav
Cluster 87:


Saved: output/sliced_audio/cluster_87/251-118436-0002_3.wav


Saved: output/sliced_audio/cluster_87/251-118436-0002_10.wav


Saved: output/sliced_audio/cluster_87/251-118436-0003_11.wav


Saved: output/sliced_audio/cluster_87/251-118436-0003_23.wav
Cluster 90:


Saved: output/sliced_audio/cluster_90/251-118436-0002_7.wav


Saved: output/sliced_audio/cluster_90/251-118436-0003_28.wav


Saved: output/sliced_audio/cluster_90/251-118436-0003_33.wav
Cluster 91:


Saved: output/sliced_audio/cluster_91/251-118436-0002_8.wav


Saved: output/sliced_audio/cluster_91/251-118436-0003_13.wav
Cluster 95:


Saved: output/sliced_audio/cluster_95/251-118436-0002_13.wav


Saved: output/sliced_audio/cluster_95/251-118436-0003_17.wav


Saved: output/sliced_audio/cluster_95/251-118436-0000_11.wav


Saved: output/sliced_audio/cluster_95/251-118436-0004_2.wav
Cluster 97:


Saved: output/sliced_audio/cluster_97/251-118436-0002_15.wav


Saved: output/sliced_audio/cluster_97/251-118436-0000_12.wav


Saved: output/sliced_audio/cluster_97/251-118436-0001_7.wav
Cluster 100:


Saved: output/sliced_audio/cluster_100/251-118436-0002_20.wav


Saved: output/sliced_audio/cluster_100/174-50561-0005_12.wav
Cluster 102:


Saved: output/sliced_audio/cluster_102/251-118436-0002_22.wav


Saved: output/sliced_audio/cluster_102/251-118436-0000_8.wav
Cluster 106:


Saved: output/sliced_audio/cluster_106/251-118436-0000_4.wav


Saved: output/sliced_audio/cluster_106/251-118436-0003_3.wav


Saved: output/sliced_audio/cluster_106/251-118436-0000_10.wav


Saved: output/sliced_audio/cluster_106/251-118436-0004_1.wav
Cluster 112:


Saved: output/sliced_audio/cluster_112/251-118436-0000_16.wav


Saved: output/sliced_audio/cluster_112/251-118436-0003_22.wav
Cluster 116:


Saved: output/sliced_audio/cluster_116/251-118436-0003_6.wav


Saved: output/sliced_audio/cluster_116/251-118436-0003_9.wav
Cluster 153:


Saved: output/sliced_audio/cluster_153/174-50561-0005_0.wav


Saved: output/sliced_audio/cluster_153/174-50561-0005_7.wav


Saved: output/sliced_audio/cluster_153/174-50561-0005_16.wav


Saved: output/sliced_audio/cluster_153/174-50561-0005_2.wav
