In [372]:
# Randomly sample files to read
import random
from pathlib import Path

in_dir = Path("data/librispeech-wav")
sample_size = 20

wav_paths = list(in_dir.rglob("*.wav"))
sampled_paths = random.sample(wav_paths, sample_size)  

print(len(sampled_paths))

20


In [373]:
# Encode the sampled audio features 
import torchaudio
from tqdm import tqdm
import torch
import numpy as np
import librosa

def preemphasis(signal, coeff=0.97):
    return np.append(signal[0], signal[1:] - coeff*signal[:-1])

model_pipelines = {
    "hubert_base": torchaudio.pipelines.HUBERT_BASE,
    "hubert_large": torchaudio.pipelines.HUBERT_LARGE,
    "hubert_xlarge": torchaudio.pipelines.HUBERT_XLARGE,
    "wavlm_base": torchaudio.pipelines.WAVLM_BASE,
    "wavlm_large": torchaudio.pipelines.WAVLM_LARGE,
    "wavlm_base_plus": torchaudio.pipelines.WAVLM_BASE_PLUS,
}

model_name = "wavlm_base"
layer = 6

if model_name != "mfcc":
    bundle = model_pipelines.get(model_name, torchaudio.pipelines.HUBERT_BASE)
    model = bundle.get_model().cuda()
    model.eval()

encodings = {}
for wav_path in tqdm(sampled_paths, desc="Encoding Audio Features"):
    if model_name != "mfcc":
        out_dir = Path("features/") / model_name / str(layer)
        wav, sr = torchaudio.load(wav_path)
        wav = torchaudio.functional.resample(wav, sr, 16000).cuda()

        with torch.inference_mode():
            encoding, _ = model.extract_features(wav, num_layers=layer)

        encoding = encoding[layer-1].squeeze().cpu().numpy()
    else:
        out_dir = Path("features/") / model_name 
        wav, sr = librosa.core.load(wav_path, sr=None)
        wav = preemphasis(wav, coeff=0.97)

        mfcc = librosa.feature.mfcc(
            y=wav, sr=sr, n_mfcc=13, n_mels=24, 
            n_fft=int(np.floor(0.025*sr)),
            hop_length=int(np.floor(0.01*sr)), 
            fmin=64, fmax=8000
        )
        mfcc_delta = librosa.feature.delta(mfcc)
        mfcc_delta_delta = librosa.feature.delta(mfcc_delta)
        encoding = np.hstack([mfcc.T, mfcc_delta.T, mfcc_delta_delta.T])

    if out_dir:
        out_dir.mkdir(parents=True, exist_ok=True)
        output_path = Path(out_dir) / f"{wav_path.stem}.npy"
        np.save(output_path, encoding)
    encodings[wav_path.stem] = encoding
print(f"Stored Encodings in {str(out_dir)}")


Encoding Audio Features: 100%|██████████| 20/20 [00:00<00:00, 20.21it/s]

Stored Encodings in features/wavlm_base/6





In [171]:
# Get the kmeans word codes
from sklearn.cluster import KMeans

def get_frame_num(timestamp: float, sample_rate: int, frame_size_ms: int)->int:
    hop_size = frame_size_ms/1000 * sample_rate
    hop_size = np.max([hop_size, 1])
    return int((timestamp * sample_rate) / hop_size)

def kmeans_model(url):
    model = KMeans(100)
    checkpoint = torch.hub.load_state_dict_from_url(url)

    model.__dict__["n_features_in_"] = checkpoint["n_features_in_"]
    model.__dict__["_n_threads"] = checkpoint["_n_threads"]
    model.__dict__["cluster_centers_"] = checkpoint["cluster_centers_"].numpy()
    return model

def apply_kmeans(kmeans_model, encoding):
    # C = cluster centers matrix
    C_np = kmeans_model.cluster_centers_.transpose()
    Cnorm_np = (C_np ** 2).sum(0, keepdims=True)

    C = torch.from_numpy(C_np)
    Cnorm = torch.from_numpy(Cnorm_np)

    if torch.cuda.is_available():
        C = C.cuda()
        Cnorm = Cnorm.cuda()
    
    if isinstance(encoding, torch.Tensor):
        dist = (
            encoding.pow(2).sum(1, keepdims=True)-2*torch.matmul(encoding, C)+Cnorm
        )
    else:
        dist = (
            (encoding**2).sum(1, keepdims=True)-2*np.matmul(encoding, C_np)+Cnorm_np
        )
    return np.argmin(dist, axis=1)


In [374]:
out_dir = Path("output/codes/kmeans")
align_dir = Path("data/all_alignments")

kmeans_url = "https://github.com/bshall/dusted/releases/download/v0.1/kmeans-english-50f36a.pt"
kmeans = kmeans_model(kmeans_url)

if out_dir and model_name != "mfcc":
    out_dir = out_dir / model_name / str(layer)
    out_dir.mkdir(parents=True, exist_ok=True)
else:
    out_dir = out_dir / model_name 
    out_dir.mkdir(parents=True, exist_ok=True)

align_paths = list(align_dir.rglob("*.list"))

cut_encodings = {}
filenames = {}
features = []
index = 0
for path in tqdm(encodings, desc="Cutting Encodings"):
    alignment_file = [a for a in align_paths if a.stem == path]
    if not alignment_file:
        continue
    else:
        alignment_file = alignment_file[0]

    with open(str(alignment_file), "r") as f:
        bounds = [get_frame_num(float(line.strip()), 16000, 20) for line in f]
    
    cut_encoding = encodings[path][0: bounds[0]]
    words = [cut_encoding]
    features.append(cut_encoding)
    for i in range(len(bounds)-1): 
        cut_encoding = encodings[path][bounds[i]: bounds[i+1]]
        features.append(cut_encoding)
        filenames[index] = f"{path}_{i}"
        words.append(cut_encoding)
        index += 1
    cut_encodings[path] = words


Cutting Encodings: 100%|██████████| 20/20 [00:00<00:00, 852.49it/s]


In [375]:
for path in cut_encodings:
    print(len(cut_encodings[path]))

29
22
29
20
10
12
17
32
42
15
24
23
49
37
9
4
13
26
25
17


In [376]:
# Extracting the kmeans codes for each word encoding 
kmeans_codes = {}
for path in tqdm(cut_encodings, desc="Extracting Kmeans codes"):
    words = []
    for word in cut_encodings[path]:
        codes = apply_kmeans(kmeans, word).tolist()
        words.append(codes)
    kmeans_codes[path] = words


Extracting Kmeans codes: 100%|██████████| 20/20 [00:00<00:00, 71.13it/s]


In [377]:
# Extracting the dusted codes for each word encoding 
from segment import segment

gamma = 0.05
dusted_codes = {}
for path in tqdm(cut_encodings, desc="Extracting DUSTED codes"):
    words = []
    for word in cut_encodings[path]:
        codes, _ = segment(word, kmeans.cluster_centers_, gamma)   
        words.append(codes)
    dusted_codes[path] = words


Extracting DUSTED codes: 100%|██████████| 20/20 [00:00<00:00, 121.41it/s]


In [378]:
for path in dusted_codes:
    print(path)
    
    for i, word in enumerate(dusted_codes[path]):
        print(dusted_codes[path][i])
        print(kmeans_codes[path][i])
        print()
        break
    

174-84280-0007
[95]
[95, 95, 95, 95, 95, 95, 95, 95, 53, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 46, 95, 95, 95]

6313-66125-0010
[18]
[95, 95, 18, 18, 18, 18, 18, 18, 18, 95, 18, 18]

2277-149897-0016
[95]
[95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95]

6345-64257-0007
[95 18 86]
[95, 95, 95, 95, 95, 95, 95, 95, 18, 18, 18, 18, 95, 86]

1988-24833-0028
[95 97]
[95, 95, 97, 95, 95, 95, 95, 95, 95, 95, 95, 95, 97, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 97, 95, 95, 97]

3752-4944-0025
[95 97]
[95, 95, 95, 95, 97, 97, 95, 95, 95, 95, 18, 97, 97]

652-130726-0007
[95 97 95]
[95, 18, 97, 97, 95, 95, 97, 95, 95, 95, 97]

3536-23268-0018
[95 97 18]
[95, 95, 97, 95, 95, 18, 18, 18, 18, 70, 18, 18, 70]

1988-148538-0013
[95]
[95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 18, 95]

6345-93306-0006
[95 18]
[95, 95, 95, 95, 18, 18, 18]

2803-161169-0001
[95 18 24 97]
[97, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 97, 95, 95, 95, 95, 95, 18, 18, 24, 97, 97]

6241-61943-0017
[95 18

In [379]:
# Converting the dictionaries to arrays and getting the index for each of the words in the dataset
dict_ind = {}

just_words_dusted = []
just_words_kmeans = []

index = 0
for path in dusted_codes:
    dict_ind[path] = []
    for i in range(len(dusted_codes[path])):
        just_words_dusted.append(dusted_codes[path][i])
        dict_ind[path].append(index)
        index += 1

    for j in range(len(kmeans_codes[path])):
        just_words_kmeans.append(kmeans_codes[path][j])

for word in range(len(just_words_dusted)):
    print(just_words_dusted[word])
    print(just_words_kmeans[word])
    print()
    break

[95]
[95, 95, 95, 95, 95, 95, 95, 95, 53, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 46, 95, 95, 95]



In [380]:
# Collapsing the kmeans words
from itertools import groupby

collapsed_kmeans = []
for path in kmeans_codes:
    for j in range(len(kmeans_codes[path])):
        collapsed_word = [key for key, _ in groupby(kmeans_codes[path][j])]
        collapsed_kmeans.append(collapsed_word)

for word in range(len(collapsed_kmeans)):
    print(collapsed_kmeans[word])
    break

[95, 53, 95, 46, 95]


In [23]:
# DTW
from sklearn.preprocessing import StandardScaler
from joblib import Parallel, delayed

def dtw_sweep_min(query_seq, search_seq, n_step=3):
    """
    Return the minimum DTW cost as `query_seq` is swept across `search_seq`.

    Step size can be specified with `n_step`.
    """

    from cython_dtw import _dtw
    dtw_cost_func = _dtw.multivariate_dtw_cost_cosine

    i_start = 0
    n_query = query_seq.shape[0]
    n_search = search_seq.shape[0]
    min_cost = np.inf

    while i_start <= n_search - n_query or i_start == 0:
        cost = dtw_cost_func(
            query_seq, search_seq[i_start:i_start + n_query], True
        )
        i_start += n_step
        if cost < min_cost:
            min_cost = cost

    return min_cost

def dtw(features):
    tensor_features = [torch.from_numpy(f) for f in features]
    stacked_features = torch.cat(tensor_features, dim=0)
    normalized_features = []

    scaler = StandardScaler()
    scaler.fit(stacked_features) 
    normalized_features = []
    for feature in tqdm(features, desc="Normalizing Features"):
        normalized_features.append(torch.from_numpy(scaler.transform(feature))) 
    
    num_features = len(normalized_features)
    norm_distance_mat = np.zeros((num_features, num_features))
    normalized_features = [f.cpu().numpy().astype(np.float64) for f in normalized_features]

    for i in tqdm(range(num_features), desc="Calculating Distances"):
        dists_i = Parallel(n_jobs=8)(
            delayed(dtw_sweep_min)(normalized_features[i], normalized_features[j])
            for j in range(i + 1, num_features)
        )

        for j, dist in zip(range(i + 1, num_features), dists_i):
            norm_distance_mat[i, j] = dist
            norm_distance_mat[j, i] = dist  
            
    return norm_distance_mat

In [306]:
# Edit Distance
from joblib import Parallel, delayed

def edit_distance(seq1, seq2):
    """
    Compute the edit distance between two sequences using dynamic programming.
    """
    N, M = len(seq1), len(seq2)
    dp = np.zeros((N + 1, M + 1))
    for i in range(N + 1):
        dp[i, 0] = i
    for j in range(M + 1):
        dp[0, j] = j
    for i in range(1, N + 1):
        for j in range(1, M + 1):
            cost = 0 if seq1[i - 1] == seq2[j - 1] else 1
            dp[i, j] = min(dp[i - 1, j] + 1, dp[i, j - 1] + 1, dp[i - 1, j - 1] + cost)
    return dp[N, M] 

def calculate_distance(just_words, num_words):
    dist_mat = np.zeros((num_words, num_words))

    for i in tqdm(range(num_words), desc="Calculating Distances"):
        js = [j for j in range(i + 1, num_words)]
        dists_i = Parallel(n_jobs=8)(
            delayed(edit_distance)(just_words[i], just_words[j]) for j in js
        )

        for j, dist in zip(js, dists_i):
            dist_mat[i, j] = dist
            dist_mat[j, i] = dist  
    
    return dist_mat

In [381]:
num_words = index
dist_mat_kmeans = calculate_distance(just_words_kmeans, num_words)
dist_mat_kmeans_collapsed = calculate_distance(collapsed_kmeans, num_words)
dist_mat_dusted = calculate_distance(just_words_dusted, num_words)

Calculating Distances: 100%|██████████| 455/455 [00:27<00:00, 16.28it/s]
Calculating Distances: 100%|██████████| 455/455 [00:28<00:00, 16.21it/s]
Calculating Distances: 100%|██████████| 455/455 [00:30<00:00, 14.74it/s]


In [308]:
print(num_words)
print(len(features))

69
69


In [309]:
dist_mat_dtw = dtw(features)

Normalizing Features:   0%|          | 0/69 [00:00<?, ?it/s]

Normalizing Features: 100%|██████████| 69/69 [00:00<00:00, 6373.62it/s]
Calculating Distances: 100%|██████████| 69/69 [00:02<00:00, 33.07it/s]


In [None]:
out_dir = Path(f"output/mat/{model_name}/{layer}/{sample_size}")
out_dir.mkdir(parents=True, exist_ok=True)

np.save(out_dir/"dist_mat_kmeans.npy", dist_mat_kmeans)
np.save(out_dir/"dist_mat_kmeans_collapsed.npy", dist_mat_kmeans_collapsed)
np.save(out_dir/"dist_mat_dusted.npy", dist_mat_dusted)
np.save(out_dir/"dist_mat_dtw.npy", dist_mat_dtw)

NameError: name 'sample_size' is not defined

In [310]:
print(dist_mat_dusted[0:5, 0:15])
print(dist_mat_dusted.shape)
print(dist_mat_kmeans[0:5, 0:15])
print(dist_mat_kmeans.shape)
print(dist_mat_kmeans_collapsed[0:5, 0:15])
print(dist_mat_kmeans_collapsed.shape)
print(dist_mat_dtw[0:5, 0:15])
print(dist_mat_dtw.shape)

[[0. 1. 1. 2. 1. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1.]
 [1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 0. 1. 2.]
 [1. 1. 0. 1. 1. 1. 0. 0. 2. 1. 0. 1. 1. 0. 1.]
 [2. 1. 1. 0. 2. 1. 1. 1. 1. 2. 1. 2. 1. 1. 1.]
 [1. 1. 1. 2. 0. 1. 1. 1. 2. 1. 1. 1. 1. 1. 2.]]
(69, 69)
[[ 0. 20. 25. 23. 25. 19. 19. 17. 16.  7. 22. 20. 20. 22. 16.]
 [20.  0. 26. 14. 27. 20. 20. 21. 20. 23. 20. 21. 22. 22. 24.]
 [25. 26.  0. 27.  3.  8. 17. 10. 22. 23.  8.  7.  6.  6. 14.]
 [23. 14. 27.  0. 28. 21. 22. 22. 18. 25. 22. 22. 25. 23. 20.]
 [25. 27.  3. 28.  0.  8. 19. 12. 23. 23. 10.  8.  7.  8. 16.]]
(69, 69)
[[ 0.  8.  6.  6.  6.  5.  5.  3.  6.  2.  4.  3.  3.  3.  4.]
 [ 8.  0. 11.  7. 11. 10.  6.  9.  9.  9.  8.  7.  8.  8.  9.]
 [ 6. 11.  0.  9.  2.  2.  8.  3. 10.  4.  3.  5.  4.  4.  5.]
 [ 6.  7.  9.  0.  9.  8.  6.  7.  7.  7.  7.  6.  7.  7.  8.]
 [ 6. 11.  2.  9.  0.  2.  9.  4. 10.  4.  4.  5.  5.  5.  5.]]
(69, 69)
[[0.         0.51592271 0.89439656 0.51845808 0.99239204 0.76974889
  0.57417087 0.68496925 0.536189

In [69]:
# Clustering algorithm 

def cluster(dist_mat, distance_threshold):
    num_nodes = dist_mat.shape[0]
    graph = {i: set() for i in range(num_nodes)}

    for i in range(num_nodes - 1): 
        for j in range(i + 1, num_nodes):  
            if dist_mat[i, j] < distance_threshold:
                graph[i].add(j)
                graph[j].add(i)  


    clusters = []
    visited = set()

    def bfs(start_node):
        """ Traverse a cluster using BFS """
        queue = [start_node]
        cluster = []
        
        while queue:
            node = queue.pop(0)
            if node in visited:
                continue 
            visited.add(node)
            cluster.append(node)
            queue.extend(graph[node])  

        return cluster

    for node in range(num_nodes):
        if node not in visited:
            new_cluster = bfs(node)
            clusters.append(new_cluster)

    return clusters

In [312]:
# Evaluating the clusters
dusted_clusters = cluster(dist_mat_dusted,0.1)
kmeans_clusters = cluster(dist_mat_kmeans, 5)
kmeans_clusters_collapsed = cluster(dist_mat_kmeans_collapsed, 3)
dtw_clusters = cluster(dist_mat_dtw, np.mean(dist_mat_dtw)- np.std(dist_mat_dtw)/2)

print(len(dusted_clusters))
print(len(kmeans_clusters))
print(len(kmeans_clusters_collapsed))
print(len(dtw_clusters))
print()

9
47
29
2



In [325]:
dusted_clusters = kluster(dist_mat_dusted,1)
kmeans_clusters = kluster(dist_mat_kmeans, 6)
kmeans_clusters_collapsed = kluster(dist_mat_kmeans_collapsed, 3)
dtw_clusters = kluster(dist_mat_dtw, 0.2)

print(len(dusted_clusters))
print(len(kmeans_clusters))
print(len(kmeans_clusters_collapsed))
print(len(dtw_clusters))
print()

{(51,), (57,), (63,), (4, 58), (60,), (66,), (11,), (8, 55), (17,), (23,), (20,), (26,), (29,), (35,), (32,), (38,), (44,), (3, 40), (41,), (50,), (56,), (53,), (59,), (62,), (2, 6, 7, 10, 13, 15, 16, 22, 25, 26, 32, 34, 35, 36, 41, 43, 44, 46, 50, 52, 53, 60, 63, 64, 65), (7,), (68,), (65,), (10,), (16,), (13,), (19,), (25,), (22,), (31,), (37,), (1, 5), (34,), (43,), (14, 39, 59), (14, 39), (49,), (46,), (0, 9, 11, 18, 19, 20, 21, 24, 27, 31, 37, 38, 45, 49, 62, 66), (52,), (28, 47), (61,), (64,), (67,), (6,), (12,), (18,), (15,), (21,), (3, 40, 42, 48, 54, 57), (24,), (0, 9), (30,), (27,), (33,), (1, 5, 12, 17, 23, 29, 30, 51, 56, 61, 67, 68), (36,), (42,), (45,), (48,), (54,)}
{(51,), (57,), (66,), (8,), (14,), (18, 29), (17,), (23,), (20,), (26,), (35,), (32,), (38,), (5, 12, 30, 64), (44,), (41,), (47,), (37, 45), (50,), (56,), (2, 4, 15, 20, 22, 25, 31, 49), (53,), (59,), (11, 38), (62,), (1,), (7,), (68,), (16,), (21, 24), (19,), (25,), (5, 12), (10, 13, 33, 34, 41, 52), (22,),

In [79]:
rounded_dtw_mat = np.round(dist_mat_dtw)
dtw_clusters = cluster(rounded_dtw_mat, 0.7)
print(len(dtw_clusters))

1


In [141]:
for i, c in enumerate(dusted_clusters):
    if(len(c)>1):
        print(f"Cluster {i}: {c}")

print()
for i, c in enumerate(kmeans_clusters):
    if(len(c)>1):
        print(f"Cluster {i}: {c}")

print()
for i, c in enumerate(kmeans_clusters_collapsed):
    if(len(c)>1):
        print(f"Cluster {i}: {c}")

print()
for i, c in enumerate(dtw_clusters):
    if(len(c)>1):
        print(f"Cluster {i}: {c}")

Cluster 1: [1, 2, 4, 5, 7, 12, 13, 16, 22, 28, 29, 30, 32, 33, 35, 36, 37, 39, 47, 49, 54, 55, 59, 60, 61, 62, 64, 66, 67, 68, 70, 73, 74, 76, 79, 80, 85]
Cluster 2: [3, 34, 56, 6, 38, 8, 11, 75, 15, 19, 20, 52, 24, 25, 26, 27]
Cluster 3: [9, 69, 72, 41, 42, 44, 78, 17, 18, 51, 83, 21, 84, 57, 31]
Cluster 4: [10, 65]
Cluster 6: [23, 82]
Cluster 7: [40, 81, 46]
Cluster 8: [43, 63]
Cluster 11: [50, 53, 86]
Cluster 13: [71, 77]

Cluster 1: [1, 32, 33, 68, 5, 39, 13, 16, 80, 54, 55, 61, 7, 12, 25, 26, 29, 38, 41, 62, 64, 66, 73, 83, 2, 17, 35, 44, 49, 76, 82, 23, 74, 42, 58, 9, 3, 15, 75, 27, 24, 31, 78, 72, 21, 18, 10]
Cluster 37: [81, 84]

Cluster 1: [1, 2, 5, 7, 12, 13, 16, 17, 22, 23, 25, 27, 29, 31, 32, 38, 39, 41, 42, 43, 44, 47, 49, 54, 55, 61, 62, 64, 66, 73, 76, 78, 80, 81, 82, 83, 84, 74, 26, 58, 6, 35, 4, 8, 24, 3, 9, 18, 75, 15, 20, 21, 10, 72, 50, 68, 67, 33, 36, 46, 79, 71, 34, 70, 40]

Cluster 7: [7, 44, 83, 61, 73]
Cluster 9: [9, 15]
Cluster 16: [17, 24, 62, 82, 38, 72, 55]

In [43]:
# Evaluating the cluster words

def parse_text_to_dict(file):
    with open(file, "r", encoding="utf-8") as f:
        lines = f.readlines()

    data_dict = {}
    current_id = None
    word_dict = {}

    for line in lines: 
        line = line.strip()

        if not line: 
            continue
        
        if line.endswith(":") and not line.split(":")[0].isdigit():
            if current_id is not None:
                data_dict[current_id] = word_dict
            
            current_id = line[:-1]
            word_dict = {}
        else:
            parts = line.split(": ")
            if len(parts) == 2:
                index, word = parts
                word_dict[int(index)] = word.strip()
            else:
                parts = parts[0].split(":")
                index = parts[0]
                word_dict[int(index)] = " "
            
            if current_id is not None:
                data_dict[current_id] = word_dict
        
    return data_dict

true_words_dict = parse_text_to_dict("data/words_and_indices.txt")

path_dict = {}
for path in dict_ind:
    path_dict[dict_ind[path][0]] = ''
    for i in range(1, len(dict_ind[path])):
        path_dict[dict_ind[path][i]] = true_words_dict[path][i-1]

In [44]:
# Cluster and WordUnit classes
from collections import defaultdict

class Cluster:
    def __init__(self,id, word_dict=None, true_words=None):
        self.id = id
        self.length = len(word_dict) if word_dict else 0
        self.word_dict = word_dict if word_dict is not None else []
        self.true_word_dict = true_words if true_words is not None else []
    
    def add_word_unit(self, id, index, file):
        word_unit = WordUnit(file, index, id)
        self.length += 1
        self.word_dict.append(word_unit)

    def add_true_word(self, word):
        self.true_word_dict.append(word)

    @classmethod
    def print_cluster(self, cluster):
        print(f"Cluster {cluster.id}")
        for word in cluster.word_dict:
            print(f"Word {word.id}: Index {word.index} in File {word.file}")
    
    def cluster_purity(self):

        word_counts = {}
        for word in self.true_word_dict:
            word_counts[word] = word_counts.get(word, 0) + 1

        max_count = max(word_counts.values()) if word_counts else 0
        cluster_purity = max_count / self.length if self.length > 0 else 0

        self.purity = cluster_purity

    @classmethod
    def duplicate_clusters(self, clusters):
        cluster_dict = defaultdict(int)

        for cluster in clusters:
            cluster_set = frozenset(cluster)  
            cluster_dict[cluster_set] += 1  

        duplicate_count = sum(1 for count in cluster_dict.values() if count > 1)

        return duplicate_count

class WordUnit:
    def __init__(self, file, index, id):
        self.index = int(index)
        self.file = file
        self.id = int(id)
        self.start_time = None
        self.end_time = None

    def add_word_boundaries(self, start_time, end_time):
        self.start_time = start_time
        self.end_time = end_time
    

In [45]:
def waveform_slice(file, start_time, end_time):

    waveform, _ = torchaudio.load(file)
    sample_rate=16000
    start_frame = int(start_time * sample_rate)
    end_frame = int(end_time * sample_rate)

    waveform_slice = waveform[:, start_frame:end_frame]
    return waveform_slice, sample_rate

In [339]:
from IPython.display import Audio, display

dtw_clusters = cluster(dist_mat_dtw, 0.19)
clusters = []
wav_dir = Path("data/librispeech-wav/")
indices_dict = parse_text_to_dict("data/timestamps_and_indices.txt")

# dtw_clusters = kluster(dist_mat_dtw, 0.5)
# print(dtw_clusters)
for i, clust in enumerate(dtw_clusters):
    new_cluster = Cluster(id=i)
    for w in range(len(clust)):
        if clust[w] in filenames:
            filename_parts = filenames[clust[w]].split("_")
            filename = filename_parts[0]
            word_index = int(filename_parts[1])                     
            new_cluster.add_word_unit(w, word_index, filename)
    clusters.append(new_cluster)

for c in clusters:
    # print(c.id)
    for word_unit in c.word_dict:
        if word_unit.index == 0:
            word = ''
        else:
            word = true_words_dict[word_unit.file][word_unit.index-1]
        times = indices_dict[word_unit.file][word_unit.index]
        times = times.split(":")
        start_time = float(times[0][1:])
        end_time = float(times[1][0:-1])
        word_unit.add_word_boundaries(start_time, end_time)
        c.add_true_word(word)
        
        # filename = wav_dir / str(word_unit.file + ".wav")
        # slice, sample_rate = waveform_slice(filename, word_unit.start_time, word_unit.end_time)
        # audio_obj = Audio(slice.squeeze(), rate=16000)
        # display(audio_obj)
        
    
    if len(c.word_dict) > 1:  
        print(f"Cluster {c.id}")                
        for word in c.true_word_dict:
            print(word)

        print()

Cluster 1
lady
lady
 
 
lady
 
lady
lady
 

Cluster 2
 
my
my

Cluster 3
rose
white

Cluster 12
roundel
lady

Cluster 29
 
 



In [345]:
kmeans_clusters = cluster(dist_mat_kmeans,3)
clusters = []

for i, clust in enumerate(kmeans_clusters):
    new_cluster = Cluster(id=i)
    for w in range(len(clust)):
        if clust[w] in filenames:
            filename_parts = filenames[clust[w]].split("_")
            filename = filename_parts[0]
            word_index = int(filename_parts[1])                     
            new_cluster.add_word_unit(w, word_index, filename)
    clusters.append(new_cluster)

for c in clusters:

    for word_unit in c.word_dict:
        if word_unit.index == 0:
            word = ''
        else:
            word = true_words_dict[word_unit.file][word_unit.index-1]
        c.add_true_word(word)
    
    if len(c.word_dict) > 1:  
        print(f"Cluster {c.id}")                
        for word in c.true_word_dict:
            print(word)
        print()

Cluster 2
 
roundel
play
thousand
a
gift

Cluster 5
my
a



In [350]:
kmeans_clusters_collapsed = cluster(dist_mat_kmeans_collapsed,0.2)
clusters = []

for i, clust in enumerate(kmeans_clusters_collapsed):
    new_cluster = Cluster(id=i)
    for w in range(len(clust)):
        if clust[w] in filenames:
            filename_parts = filenames[clust[w]].split("_")
            filename = filename_parts[0]
            word_index = int(filename_parts[1])                     
            new_cluster.add_word_unit(w, word_index, filename)
    clusters.append(new_cluster)

for c in clusters:

    for word_unit in c.word_dict:
        if word_unit.index == 0:
            word = ''
        else:
            word = true_words_dict[word_unit.file][word_unit.index-1]
        c.add_true_word(word)
    
    if len(c.word_dict) > 1:  
        print(f"Cluster {c.id}")                
        for word in c.true_word_dict:
            print(word)
        print()

Cluster 5
my
play
a

Cluster 15
a
roundel

Cluster 42
though
water



In [371]:
dusted_clusters = cluster(dist_mat_dusted,1)
clusters = []

for i, clust in enumerate(dusted_clusters):
    new_cluster = Cluster(id=i)
    for w in range(len(clust)):
        if clust[w] in filenames:
            filename_parts = filenames[clust[w]].split("_")
            filename = filename_parts[0]
            word_index = int(filename_parts[1])                     
            new_cluster.add_word_unit(w, word_index, filename)
    clusters.append(new_cluster)

for c in clusters:

    for c in clusters:
        print(c.id)
        for word_unit in c.word_dict:
            if word_unit.index == 0:
                word = ''
            else:
                word = true_words_dict[word_unit.file][word_unit.index-1]
            times = indices_dict[word_unit.file][word_unit.index]
            times = times.split(":")
            start_time = float(times[0][1:])
            end_time = float(times[1][0:-1])
            word_unit.add_word_boundaries(start_time, end_time)
            c.add_true_word(word)
            
            filename = wav_dir / str(word_unit.file + ".wav")
            slice, sample_rate = waveform_slice(filename, word_unit.start_time, word_unit.end_time)
            audio_obj = Audio(slice.squeeze(), rate=16000)
            display(audio_obj)
            
        
        if len(c.word_dict) > 1:  
            print(f"Cluster {c.id}")                
            for word in c.true_word_dict:
                print(word)

0


Cluster 0

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift
1


Cluster 1
lady
my
you
lady
over
 
you
lady
me
a
2


Cluster 2
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
3


Cluster 3
lady
for
finger
a
 
 
4


Cluster 4
 
my
5


Cluster 5
lady
lady
6


Cluster 6
hear
fair
ring
7


Cluster 7
get
pledged
8


0


Cluster 0

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift
1


Cluster 1
lady
my
you
lady
over
 
you
lady
me
a
lady
my
you
lady
over
 
you
lady
me
a
2


Cluster 2
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
3


Cluster 3
lady
for
finger
a
 
 
lady
for
finger
a
 
 
4


Cluster 4
 
my
 
my
5


Cluster 5
lady
lady
lady
lady
6


Cluster 6
hear
fair
ring
hear
fair
ring
7


Cluster 7
get
pledged
get
pledged
8


0


Cluster 0

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift
1


Cluster 1
lady
my
you
lady
over
 
you
lady
me
a
lady
my
you
lady
over
 
you
lady
me
a
lady
my
you
lady
over
 
you
lady
me
a
2


Cluster 2
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
3


Cluster 3
lady
for
finger
a
 
 
lady
for
finger
a
 
 
lady
for
finger
a
 
 
4


Cluster 4
 
my
 
my
 
my
5


Cluster 5
lady
lady
lady
lady
lady
lady
6


Cluster 6
hear
fair
ring
hear
fair
ring
hear
fair
ring
7


Cluster 7
get
pledged
get
pledged
get
pledged
8


0


Cluster 0

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift
1


Cluster 1
lady
my
you
lady
over
 
you
lady
me
a
lady
my
you
lady
over
 
you
lady
me
a
lady
my
you
lady
over
 
you
lady
me
a
lady
my
you
lady
over
 
you
lady
me
a
2


Cluster 2
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
3


Cluster 3
lady
for
finger
a
 
 
lady
for
finger
a
 
 
lady
for
finger
a
 
 
lady
for
finger
a
 
 
4


Cluster 4
 
my
 
my
 
my
 
my
5


Cluster 5
lady
lady
lady
lady
lady
lady
lady
lady
6


Cluster 6
hear
fair
ring
hear
fair
ring
hear
fair
ring
hear
fair
ring
7


Cluster 7
get
pledged
get
pledged
get
pledged
get
pledged
8


0


Cluster 0

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift
1


Cluster 1
lady
my
you
lady
over
 
you
lady
me
a
lady
my
you
lady
over
 
you
lady
me
a
lady
my
you
lady
over
 
you
lady
me
a
lady
my
you
lady
over
 
you
lady
me
a
lady
my
you
lady
over
 
you
lady
me
a
2


Cluster 2
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
3


Cluster 3
lady
for
finger
a
 
 
lady
for
finger
a
 
 
lady
for
finger
a
 
 
lady
for
finger
a
 
 
lady
for
finger
a
 
 
4


Cluster 4
 
my
 
my
 
my
 
my
 
my
5


Cluster 5
lady
lady
lady
lady
lady
lady
lady
lady
lady
lady
6


Cluster 6
hear
fair
ring
hear
fair
ring
hear
fair
ring
hear
fair
ring
hear
fair
ring
7


Cluster 7
get
pledged
get
pledged
get
pledged
get
pledged
get
pledged
8


0


Cluster 0

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift
1


Cluster 1
lady
my
you
lady
over
 
you
lady
me
a
lady
my
you
lady
over
 
you
lady
me
a
lady
my
you
lady
over
 
you
lady
me
a
lady
my
you
lady
over
 
you
lady
me
a
lady
my
you
lady
over
 
you
lady
me
a
lady
my
you
lady
over
 
you
lady
me
a
2


Cluster 2
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
3


Cluster 3
lady
for
finger
a
 
 
lady
for
finger
a
 
 
lady
for
finger
a
 
 
lady
for
finger
a
 
 
lady
for
finger
a
 
 
lady
for
finger
a
 
 
4


Cluster 4
 
my
 
my
 
my
 
my
 
my
 
my
5


Cluster 5
lady
lady
lady
lady
lady
lady
lady
lady
lady
lady
lady
lady
6


Cluster 6
hear
fair
ring
hear
fair
ring
hear
fair
ring
hear
fair
ring
hear
fair
ring
hear
fair
ring
7


Cluster 7
get
pledged
get
pledged
get
pledged
get
pledged
get
pledged
get
pledged
8


0


Cluster 0

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift
1


Cluster 1
lady
my
you
lady
over
 
you
lady
me
a
lady
my
you
lady
over
 
you
lady
me
a
lady
my
you
lady
over
 
you
lady
me
a
lady
my
you
lady
over
 
you
lady
me
a
lady
my
you
lady
over
 
you
lady
me
a
lady
my
you
lady
over
 
you
lady
me
a
lady
my
you
lady
over
 
you
lady
me
a
2


Cluster 2
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
3


Cluster 3
lady
for
finger
a
 
 
lady
for
finger
a
 
 
lady
for
finger
a
 
 
lady
for
finger
a
 
 
lady
for
finger
a
 
 
lady
for
finger
a
 
 
lady
for
finger
a
 
 
4


Cluster 4
 
my
 
my
 
my
 
my
 
my
 
my
 
my
5


Cluster 5
lady
lady
lady
lady
lady
lady
lady
lady
lady
lady
lady
lady
lady
lady
6


Cluster 6
hear
fair
ring
hear
fair
ring
hear
fair
ring
hear
fair
ring
hear
fair
ring
hear
fair
ring
hear
fair
ring
7


Cluster 7
get
pledged
get
pledged
get
pledged
get
pledged
get
pledged
get
pledged
get
pledged
8


0


Cluster 0

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift
1


Cluster 1
lady
my
you
lady
over
 
you
lady
me
a
lady
my
you
lady
over
 
you
lady
me
a
lady
my
you
lady
over
 
you
lady
me
a
lady
my
you
lady
over
 
you
lady
me
a
lady
my
you
lady
over
 
you
lady
me
a
lady
my
you
lady
over
 
you
lady
me
a
lady
my
you
lady
over
 
you
lady
me
a
lady
my
you
lady
over
 
you
lady
me
a
2


Cluster 2
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
s

Cluster 3
lady
for
finger
a
 
 
lady
for
finger
a
 
 
lady
for
finger
a
 
 
lady
for
finger
a
 
 
lady
for
finger
a
 
 
lady
for
finger
a
 
 
lady
for
finger
a
 
 
lady
for
finger
a
 
 
4


Cluster 4
 
my
 
my
 
my
 
my
 
my
 
my
 
my
 
my
5


Cluster 5
lady
lady
lady
lady
lady
lady
lady
lady
lady
lady
lady
lady
lady
lady
lady
lady
6


Cluster 6
hear
fair
ring
hear
fair
ring
hear
fair
ring
hear
fair
ring
hear
fair
ring
hear
fair
ring
hear
fair
ring
hear
fair
ring
7


Cluster 7
get
pledged
get
pledged
get
pledged
get
pledged
get
pledged
get
pledged
get
pledged
get
pledged
8


0


Cluster 0

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift

lady
her
finger
 
will
though
thousand

but
if
i
a
 
o
gift
1


Cluster 1
lady
my
you
lady
over
 
you
lady
me
a
lady
my
you
lady
over
 
you
lady
me
a
lady
my
you
lady
over
 
you
lady
me
a
lady
my
you
lady
over
 
you
lady
me
a
lady
my
you
lady
over
 
you
lady
me
a
lady
my
you
lady
over
 
you
lady
me
a
lady
my
you
lady
over
 
you
lady
me
a
lady
my
you
lady
over
 
you
lady
me
a
lady
my
you
lady
over
 
you
lady
me
a
2


Cluster 2
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
she's
leagues
the
water
lady
my
rose
white
 
rose
white
but
not
a
roundel
play
roundel
lady
from
emperor's
daughter
 
my
bring
 
s

Cluster 3
lady
for
finger
a
 
 
lady
for
finger
a
 
 
lady
for
finger
a
 
 
lady
for
finger
a
 
 
lady
for
finger
a
 
 
lady
for
finger
a
 
 
lady
for
finger
a
 
 
lady
for
finger
a
 
 
lady
for
finger
a
 
 
4


Cluster 4
 
my
 
my
 
my
 
my
 
my
 
my
 
my
 
my
 
my
5


Cluster 5
lady
lady
lady
lady
lady
lady
lady
lady
lady
lady
lady
lady
lady
lady
lady
lady
lady
lady
6


Cluster 6
hear
fair
ring
hear
fair
ring
hear
fair
ring
hear
fair
ring
hear
fair
ring
hear
fair
ring
hear
fair
ring
hear
fair
ring
hear
fair
ring
7


Cluster 7
get
pledged
get
pledged
get
pledged
get
pledged
get
pledged
get
pledged
get
pledged
get
pledged
get
pledged
8


In [168]:
print("Dusted clusters")
for i, c in enumerate(dusted_clusters):
    if(len(c)>1):
        words = [path_dict[c[j]] for j in range(len(c))]
        print(f"Cluster {i}: {', '.join(words)}")

print("\nKmeans clusters")
for i, c in enumerate(kmeans_clusters):
    if(len(c)>1):
        words = [path_dict[c[j]] for j in range(len(c))]
        print(f"Cluster {i}: {', '.join(words)}")

print("\nCollapsed Kmeans clusters")
for i, c in enumerate(kmeans_clusters_collapsed):
    if(len(c)>1):
        words = [path_dict[c[j]] for j in range(len(c))]
        print(f"Cluster {i}: {', '.join(words)}")

Dusted clusters
Cluster 0: , upon, to, effect, , body, ear, , her, he, it
Cluster 1: the, the
Cluster 4: which, well, stood, understand, self, and, so
Cluster 5: had, a, that, points, sioux, you, for, me, you, you, around, my,  , we, you, gazed
Cluster 6: formed,  
Cluster 7: a, that, like, a, the, some, and,  , the, the, of, the, they, the, the, the, the, a, i, the, are, her,  , the, you, a, the, in, the, the,  , the, had, the, if, the, the, the, the, was
Cluster 13: and, and, is
Cluster 16: now, his, not, his, here, should, my, of, know
Cluster 18: out, little, that, had, that, song, night
Cluster 19: to,  , to,  , to, at, been,  , as, in, of, to, to, to, to, find, at, to, old, pleasure, to
Cluster 21: leaving, me, the, place, so, so, been
Cluster 23: things, rice
Cluster 24:  , the, the, i, us, the, the, the, the, was,  , the,  , the, the, need, us, at, not,  ,  , the,  , i, you, the, you, when, are, me,  , the, the,  , the, the, that, i
Cluster 30: of, up, yet, and, all, in, of, in