In [39]:

import numpy as np
from sklearn.cluster import KMeans
from pathlib import Path

In [40]:

import torch 
import segment
def Kmeans():
    """Build a 100-cluster ``KMeans`` populated from the released checkpoint.

    Returns:
        tuple: ``(model, segment)`` -- the populated estimator and the imported
        ``segment`` module.
    """
    checkpoint_url = "https://github.com/bshall/dusted/releases/download/v0.1/kmeans-english-50f36a.pt"
    state = torch.hub.load_state_dict_from_url(checkpoint_url)

    restored = KMeans(100)
    # Write straight into the instance dict so the attributes exist without calling fit().
    restored.__dict__.update(
        {
            "n_features_in_": state["n_features_in_"],
            "_n_threads": state["_n_threads"],
            "cluster_centers_": state["cluster_centers_"].numpy(),
        }
    )
    return restored, segment

In [41]:
from tqdm import tqdm

def process_file(paths, codebook, segment, gamma):
    """Segment one saved feature file and store the result as an ``.npz`` archive.

    Args:
        paths: ``(in_path, out_path)`` pair; ``in_path`` is read with ``np.load`` and
            ``out_path`` has its suffix replaced by ``.npz`` for the output file.
        codebook: Passed through unchanged as the second argument of ``segment``.
        segment: Callable invoked as ``segment(sequence, codebook, gamma)`` that
            returns ``(codes, boundaries)``.
        gamma: Passed through unchanged as the third argument of ``segment``.

    Returns:
        tuple: ``(rows in the loaded array, mean gap between consecutive boundaries)``.
    """
    source_path, target_path = paths
    features = np.load(source_path)
    unit_codes, unit_bounds = segment(features, codebook, gamma)

    np.savez(target_path.with_suffix(".npz"), codes=unit_codes, boundaries=unit_bounds)

    num_frames = features.shape[0]
    mean_segment_length = np.mean(np.diff(unit_bounds))
    return num_frames, mean_segment_length

def segment_dataset(args):
    """Segment every ``.npy`` file under ``args.in_dir`` and mirror the results into ``args.out_dir``.

    Each input file is handed to ``process_file`` together with the k-means
    cluster centres and ``segment.segment`` (gamma fixed at 1). Outputs keep the
    input's path relative to ``args.in_dir``. A summary of the total duration and
    the average segment length is printed at the end.

    Args:
        args: Object with ``in_dir`` and ``out_dir`` attributes (``pathlib.Path``).

    Returns:
        None.
    """
    kmeans, segment = Kmeans()
    in_paths = list(args.in_dir.rglob("*.npy"))
    if not in_paths:
        # With no results the zip(*results) unpacking below would raise ValueError.
        print(f"No .npy files found under {args.in_dir}")
        return

    out_paths = [args.out_dir / path.relative_to(args.in_dir) for path in in_paths]

    # Create each distinct output directory once.
    for parent in {path.parent for path in out_paths}:
        parent.mkdir(exist_ok=True, parents=True)

    # zip() has no length, so give tqdm the total explicitly to get a real progress bar.
    results = [
        process_file(paths=pair, codebook=kmeans.cluster_centers_, segment=segment.segment, gamma=1)
        for pair in tqdm(zip(in_paths, out_paths), total=len(in_paths), desc="Processing segments")
    ]

    frames, boundary_length = zip(*results)
    # NOTE(review): 0.02 is presumably the frame duration in seconds (20 ms hop) -- confirm.
    print(f"Segmented {sum(frames) * 0.02 / 60 / 60:.2f} hours of audio")
    print(f"Average segment length: {np.mean(boundary_length) * 0.02:.2f} seconds")

In [42]:

class Args:
    """Plain attribute container holding an input and an output directory."""

    def __init__(self, in_dir, out_dir):
        # Both values are stored exactly as given; nothing is converted or validated.
        self.in_dir, self.out_dir = in_dir, out_dir

# Segment every feature file found under features/ and write the results under codes/.
# NOTE(review): the recorded run of this cell stopped with a SystemError raised from the
# compiled `_segment` dispatcher after 468 files; the cause is not visible in this notebook.
args = Args(Path("features/"),Path("codes/"))

segment_dataset(args)

Processing segments: 468it [00:35, 13.15it/s]


SystemError: CPUDispatcher(<function _segment at 0x71e1952659e0>) returned a result with an exception set

In [None]:
# Toy example: regroup per-segment codes into the true spans whose end they do not exceed.
true_bounds = [0, 14, 25, 30, 35, 36]
boundaries = [0, 13, 25, 27, 29, 30, 31, 32, 36]
encodings = [1, 2, 3, 4, 5, 6, 7, 8, 9]

features = []
prev_j = 1  # index of the first predicted boundary not yet assigned to a span
for span_end in true_bounds[1:]:
    new_feature = []
    for j in range(prev_j, len(boundaries)):
        segment_end = boundaries[j]
        if segment_end > span_end:
            # This segment finishes after the current span; leave it for a later span.
            continue

        # Segment j-1 ends at or before the span end, so its code belongs to this span.
        new_feature.append(encodings[j - 1])
        prev_j = j + 1

    features.append(new_feature)
    

In [None]:
import numpy as np
from pathlib import Path
in_dir = Path("output/librispeech_subset/")

for path in list(in_dir.rglob("*.npy")):
    mat = np.load(path)

    DISTANCE_THRESHOLD = round(np.mean(mat)/3,3)
    num_nodes = mat.shape[0]
    graph = {i: set() for i in range(num_nodes)}

    for i in range(num_nodes - 1): 
        for j in range(i + 1, num_nodes):  
            if mat[i, j] < DISTANCE_THRESHOLD:
                graph[i].add(j)
                graph[j].add(i)  


    clusters = []
    visited = set()

    def bfs(start_node):
        """ Traverse a cluster using BFS """
        queue = [start_node]
        cluster = []
        
        while queue:
            node = queue.pop(0)
            if node in visited:
                continue
            visited.add(node)
            cluster.append(node)
            queue.extend(graph[node])  

        return cluster

    for node in range(num_nodes):
        if node not in visited:
            new_cluster = bfs(node)
            clusters.append(new_cluster)

    sizes = []
    for i, clust in enumerate(clusters):
        sizes.append(len(clust))

    print(f"Avg size cluster for file {str(path)}: {np.mean(sizes)}")
    

Avg size cluster for file output/librispeech_subset/wavlm_base/8/0.05/norm_distance_matrix.npy: 1.9774436090225564
Avg size cluster for file output/librispeech_subset/wavlm_base/8/0.1/norm_distance_matrix.npy: 2.8452380952380953
Avg size cluster for file output/librispeech_subset/wavlm_base/8/0.15/norm_distance_matrix.npy: 4.326923076923077
Avg size cluster for file output/librispeech_subset/wavlm_base/8/0.2/norm_distance_matrix.npy: 5.666666666666667


# The whole dusted pipeline

In [1]:
class Args:
    """Attribute bag: every keyword argument becomes an instance attribute of the same name."""

    def __init__(self, **kwargs):
        vars(self).update(kwargs)

In [2]:
# Encoding into HuBERT or WAVLM features
import  torchaudio
from pathlib import Path
from tqdm import tqdm
import torch

# Lookup from the model name given in ``args.model`` to the torchaudio pipeline bundle
# that ``encode_features`` loads.
model_pipelines = {
    "hubert_base": torchaudio.pipelines.HUBERT_BASE,
    "hubert_large": torchaudio.pipelines.HUBERT_LARGE,
    "hubert_xlarge": torchaudio.pipelines.HUBERT_XLARGE,
    "wavlm_base": torchaudio.pipelines.WAVLM_BASE,
    "wavlm_large": torchaudio.pipelines.WAVLM_LARGE,
    "wavlm_base_plus": torchaudio.pipelines.WAVLM_BASE_PLUS,
}

def encode_features(args):
    """Extract one layer's features for every audio file under ``args.in_dir``.

    Args:
        args: Object with the attributes
            ``in_dir`` (Path searched recursively),
            ``audio_ext`` (filename suffix to match),
            ``model`` (key into ``model_pipelines``; unknown names silently fall
            back to ``HUBERT_BASE``),
            ``layer`` (1-based index of the layer whose output is kept) and
            ``out_dir`` (Path, or a falsy value to keep results in memory).

    Returns:
        dict: ``{wav stem: numpy array}`` when ``args.out_dir`` is falsy. When an
        output directory is given, arrays are saved as
        ``out_dir/model/layer/<stem>.npy`` and the returned dict is empty.
    """
    wavs = list(args.in_dir.rglob(f"*{args.audio_ext}"))

    # Use the GPU when one is present; otherwise run on CPU instead of failing in .cuda().
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    bundle = model_pipelines.get(args.model, torchaudio.pipelines.HUBERT_BASE)
    model = bundle.get_model().to(device)
    model.eval()

    out_dir = None
    if args.out_dir:
        out_dir = args.out_dir / args.model / str(args.layer)
        # Create the target directory once, not on every loop iteration.
        out_dir.mkdir(parents=True, exist_ok=True)
        print(f"Storing Encodings in {str(out_dir)}")

    encodings = {}
    for wav_path in tqdm(wavs, desc="Encoding Audio Features"):
        wav, sr = torchaudio.load(wav_path)
        # NOTE(review): 16000 is presumably the sample rate the bundles expect -- confirm.
        wav = torchaudio.functional.resample(wav, sr, 16000).to(device)

        with torch.inference_mode():
            layer_outputs, _ = model.extract_features(wav, num_layers=args.layer)

        # NOTE(review): squeeze() removes every size-1 axis, not just the leading one;
        # presumably the input is mono so only the batch axis is dropped -- confirm.
        encoding = layer_outputs[args.layer - 1].squeeze().cpu().numpy()

        if out_dir:
            np.save(out_dir / f"{wav_path.stem}.npy", encoding)
        else:
            encodings[wav_path.stem] = encoding
    return encodings

In [4]:
# Encode layer 6 of wavlm_base for the subset; out_dir=None keeps the features in memory
# (returned as a dict) instead of writing .npy files.
args = Args(
    in_dir=Path("data/librispeech_subset/"),
    out_dir=None,
    model="wavlm_base",
    layer=6,
    audio_ext=".wav"
)

encodings = encode_features(args)
print(len(encodings))

Encoding Audio Features: 100%|██████████| 14/14 [00:00<00:00, 28.08it/s]

14





In [None]:
import random
in_dir = Path("data/librispeech-wav")
sample_size = 15
sample_seed = 0  # fixed so the same files are drawn on every run

# Sort first: rglob yields paths in no guaranteed order, which would defeat the seed.
wav_paths = sorted(in_dir.rglob("*.wav"))
# Never request more files than exist; random.sample raises ValueError otherwise.
sampled_paths = random.Random(sample_seed).sample(wav_paths, min(sample_size, len(wav_paths)))

print(sampled_paths)

In [None]:
# Segment the dataset into discrete acoustic units
from sklearn.cluster import KMeans
import segment
import numpy as np

def kmeans_model(url):
    """Populate a 100-cluster ``KMeans`` from the checkpoint downloaded from ``url``.

    Returns:
        tuple: ``(model, segment)`` -- the populated estimator and the imported
        ``segment`` module.
    """
    state = torch.hub.load_state_dict_from_url(url)

    model = KMeans(100)
    # Copy the stored attributes directly into the instance dict; fit() is never called.
    for attribute in ("n_features_in_", "_n_threads"):
        model.__dict__[attribute] = state[attribute]
    model.__dict__["cluster_centers_"] = state["cluster_centers_"].numpy()
    return model, segment

def apply_kmeans(kmeans_model, encoding):
    """Assign every row of ``encoding`` to its nearest cluster centre.

    Squared Euclidean distances are expanded as ``|x|^2 - 2 x.C + |c|^2`` so the
    whole batch is handled with a single matrix product.

    Args:
        kmeans_model: Object exposing ``cluster_centers_`` as a 2-D NumPy array
            with one centre per row.
        encoding: 2-D NumPy array or torch tensor with one frame per row.

    Returns:
        numpy.ndarray: for each row, the index of the closest centre.
    """
    # C = cluster centers matrix, transposed to (dim, n_clusters)
    C_np = kmeans_model.cluster_centers_.transpose()
    Cnorm_np = (C_np ** 2).sum(0, keepdims=True)

    if isinstance(encoding, torch.Tensor):
        # Match the encoding's device and dtype: mixing float64 centres with float32
        # features, or CPU with CUDA tensors, makes torch.matmul raise.
        C = torch.from_numpy(C_np).to(device=encoding.device, dtype=encoding.dtype)
        Cnorm = torch.from_numpy(Cnorm_np).to(device=encoding.device, dtype=encoding.dtype)
        dist = encoding.pow(2).sum(1, keepdim=True) - 2 * torch.matmul(encoding, C) + Cnorm
        # Return a NumPy array on both paths so callers can treat the result uniformly.
        return dist.argmin(dim=1).cpu().numpy()

    dist = (encoding ** 2).sum(1, keepdims=True) - 2 * np.matmul(encoding, C_np) + Cnorm_np
    return np.argmin(dist, axis=1)

def get_frame_num(timestamp: float, sample_rate: int, frame_size_ms: int)->int:
    """
    Translate a timestamp in seconds into a frame index.

    The frame hop is ``frame_size_ms`` converted to samples and is never allowed
    to drop below one sample; the quotient is truncated by ``int``.
    """
    samples_per_frame = np.max([frame_size_ms/1000 * sample_rate, 1])
    position_in_samples = timestamp * sample_rate
    return int(position_in_samples / samples_per_frame)


def segment_dataset_kmeans(args):
    """Produce frame-level k-means codes for every aligned word of every utterance.

    Features come from ``.npy`` files under ``args.in_dir / args.model`` when
    ``args.in_dir`` is set, otherwise from the ``args.encodings`` mapping.
    Utterances without an alignment file of the same stem under
    ``args.align_dir`` are skipped.

    Args:
        args: Object with ``kmeans_url``, ``in_dir``, ``encodings``, ``out_dir``,
            ``model``, ``layer`` and ``align_dir`` attributes.

    Returns:
        dict: ``{utterance stem: [codes of word 0, codes of word 1, ...]}`` where
        each word's codes are a list with one cluster index per frame.
    """
    kmeans, _ = kmeans_model(args.kmeans_url)

    encodings = {}
    if args.in_dir:
        in_dir = args.in_dir / args.model
        for path in tqdm(list(in_dir.rglob("*.npy")), desc="Loading Features"):
            encodings[path.stem] = np.load(path)
    elif args.encodings:
        encodings = args.encodings

    if args.out_dir:
        # Only the directory is prepared here; nothing in this function writes into it.
        out_dir = args.out_dir / args.model / str(args.layer)
        out_dir.mkdir(parents=True, exist_ok=True)

    # Index alignment files by stem once instead of rescanning the list per utterance.
    # setdefault keeps the first file found for a stem.
    alignments = {}
    for align_path in args.align_dir.rglob("*.list"):
        alignments.setdefault(align_path.stem, align_path)

    codes_dict = {}
    for path in tqdm(encodings, desc="Extracting Kmeans codes"):
        alignment_file = alignments.get(path)
        if alignment_file is None:
            continue

        with open(alignment_file, "r") as f:
            # Blank lines (e.g. a trailing newline) are skipped rather than crashing float().
            bounds = [
                get_frame_num(float(line.strip()), 16000, 20) for line in f if line.strip()
            ]

        # Consecutive boundary pairs delimit one word each.
        codes_dict[path] = [
            apply_kmeans(kmeans, encodings[path][start:end]).tolist()
            for start, end in zip(bounds, bounds[1:])
        ]

    return codes_dict


In [24]:
# Segment the dataset into discrete acoustic units using DUSTED
from sklearn.cluster import KMeans
from segment import segment
import numpy as np

def kmeans_model(url):
    """Load the checkpoint at ``url`` into a 100-cluster ``KMeans`` instance.

    Returns:
        tuple: ``(model, segment)`` -- the populated estimator and the ``segment``
        function imported from the ``segment`` module.
    """
    checkpoint = torch.hub.load_state_dict_from_url(url)
    restored_state = {
        "n_features_in_": checkpoint["n_features_in_"],
        "_n_threads": checkpoint["_n_threads"],
        "cluster_centers_": checkpoint["cluster_centers_"].numpy(),
    }

    model = KMeans(100)
    # Attributes are placed directly in the instance dict; the estimator is never fitted.
    model.__dict__.update(restored_state)
    return model, segment

def get_frame_num(timestamp: float, sample_rate: int, frame_size_ms: int)->int:
    """
    Return the frame index for a timestamp given in seconds.

    One frame spans ``frame_size_ms`` milliseconds of audio at ``sample_rate``,
    with a lower bound of one sample per frame. The result is truncated to ``int``.
    """
    hop_in_samples = frame_size_ms/1000 * sample_rate
    hop_in_samples = np.max([hop_in_samples, 1])
    sample_offset = timestamp * sample_rate
    return int(sample_offset / hop_in_samples)

def segment_dataset_dusted(args):
    """Produce segment-level codes for every aligned word of every utterance.

    Features come from ``.npy`` files under ``args.in_dir / args.model`` when
    ``args.in_dir`` is set, otherwise from the ``args.encodings`` mapping. Each
    word's frames are passed to ``segment`` together with the k-means cluster
    centres and ``args.gamma``; only the returned codes are kept. Utterances
    without an alignment file of the same stem under ``args.align_dir`` are skipped.

    Args:
        args: Object with ``kmeans_url``, ``in_dir``, ``encodings``, ``out_dir``,
            ``model``, ``layer``, ``align_dir`` and ``gamma`` attributes.

    Returns:
        dict: ``{utterance stem: [codes of word 0, codes of word 1, ...]}``.
    """
    kmeans, segment = kmeans_model(args.kmeans_url)

    # Index alignment files by stem once instead of rescanning the list per utterance.
    # setdefault keeps the first file found for a stem.
    alignments = {}
    for align_path in args.align_dir.rglob("*.list"):
        alignments.setdefault(align_path.stem, align_path)

    encodings = {}
    if args.in_dir:
        in_dir = args.in_dir / args.model
        for path in tqdm(list(in_dir.rglob("*.npy")), desc="Loading Features"):
            encodings[path.stem] = np.load(path)
    elif args.encodings:
        encodings = args.encodings

    if args.out_dir:
        # Only the directory is prepared here; nothing in this function writes into it.
        out_dir = args.out_dir / args.model / str(args.layer)
        out_dir.mkdir(parents=True, exist_ok=True)

    codes_dict = {}
    for path in tqdm(encodings, desc="Extracting DUSTED codes"):
        alignment_file = alignments.get(path)
        if alignment_file is None:
            continue

        with open(alignment_file, "r") as f:
            # Blank lines (e.g. a trailing newline) are skipped rather than crashing float().
            bounds = [
                get_frame_num(float(line.strip()), 16000, 20) for line in f if line.strip()
            ]

        words = []
        for start, end in zip(bounds, bounds[1:]):
            # NOTE(review): a zero-length slice (start == end) is passed to segment()
            # unchanged -- confirm that segment() tolerates empty input.
            codes, _ = segment(encodings[path][start:end], kmeans.cluster_centers_, args.gamma)
            words.append(codes)
        codes_dict[path] = words

    return codes_dict


In [None]:
# Run both segmenters on the in-memory encodings (in_dir=None, so `encodings` is used)
# to get per-word codes from each method for the same utterances.
args = Args(
    in_dir=None,
    out_dir=None,
    align_dir=Path("data/all_alignments/"),
    model="wavlm_base",
    layer=6,
    audio_ext=".wav",
    kmeans_url="https://github.com/bshall/dusted/releases/download/v0.1/kmeans-english-50f36a.pt",
    gamma=0.02,
    encodings=encodings
)

dusted_codes_dict = segment_dataset_dusted(args)
codes_dict = segment_dataset_kmeans(args)


Extracting DUSTED codes: 100%|██████████| 14/14 [00:00<00:00, 124.29it/s]
Extracting DUSTED codes: 100%|██████████| 14/14 [00:00<00:00, 164.22it/s]


In [126]:
# For the first utterance only: print each word's DUSTED codes followed by its k-means codes.
for path in dusted_codes_dict:
    print(path)

    for word_index, dusted_word in enumerate(dusted_codes_dict[path]):
        print(dusted_word)
        print(codes_dict[path][word_index])
        print()
    break

251-118436-0003
[97 24 18 70]
[18, 97, 24, 24, 24, 24, 24, 18, 18, 18, 70, 70, 70]

[70 95 97]
[70, 95, 95, 97, 97, 97, 97]

[24 70 95 97 95 18]
[24, 24, 97, 70, 95, 95, 95, 97, 95, 95, 18, 18, 18]

[70 18 28 40 18]
[70, 70, 70, 18, 28, 40, 40, 18, 18, 18, 18, 18, 18, 18, 18, 18]

[97 70 18 97]
[97, 24, 97, 70, 18, 18, 18, 18, 18, 97]

[18]
[18, 18, 18, 18, 18]

[18 70 18 97 95]
[18, 18, 18, 97, 18, 70, 70, 18, 18, 97, 97, 97, 97, 95, 18, 95, 95, 95, 97, 95]

[97 24 97 18 95 70 97 18]
[97, 95, 97, 97, 24, 24, 24, 97, 40, 97, 18, 95, 95, 95, 95, 95, 70, 70, 70, 97, 18]

[18 24 18 95]
[18, 24, 24, 18, 18, 18, 18, 95, 95]

[95 70 18 97 18]
[95, 97, 70, 70, 70, 18, 18, 97, 97, 18, 18, 18, 18, 18, 18]

[18 95 97 95 18 95 18]
[18, 97, 95, 95, 40, 0, 18, 97, 95, 95, 95, 95, 18, 95, 95, 18, 18, 18, 40, 18, 18]

[18 95 18 24 97]
[18, 18, 18, 95, 95, 95, 18, 18, 2, 18, 24, 97]

[24 70 97 18]
[24, 70, 70, 70, 97, 97, 97, 18, 18, 18, 18, 18]

[97 18 95]
[97, 97, 97, 97, 97, 97, 97, 40, 18, 18, 18,

In [11]:
# NOTE(review): `segment_dataset` as defined earlier in this notebook calls
# `args.in_dir.rglob(...)` and ignores `encodings`/`gamma`, so with in_dir=None it cannot
# have produced the recorded "Extracting DUSTED codes" output. This cell presumably ran
# against an older definition (now `segment_dataset_dusted`?) -- confirm and update the call.
args = Args(
    in_dir=None,
    out_dir=None,
    model="wavlm_base",
    align_dir=Path("data/all_alignments"),
    layer=6,
    audio_ext=".wav",
    kmeans_url="https://github.com/bshall/dusted/releases/download/v0.1/kmeans-english-50f36a.pt",
    gamma=0.2,
    encodings=encodings
)

codes_dict = segment_dataset(args)

Extracting DUSTED codes: 100%|██████████| 14/14 [00:00<00:00, 125.04it/s]


In [12]:
# Getting the word codes

def get_frame_num(timestamp: float, sample_rate: int, frame_size_ms: int)->int:
    """
    Convert a time offset in seconds to the index of the corresponding frame.

    The frame length in samples is derived from ``frame_size_ms`` and
    ``sample_rate`` and clamped to at least 1; the division result is truncated.
    """
    frame_len = np.max([frame_size_ms/1000 * sample_rate, 1])
    return int((timestamp * sample_rate) / frame_len)

def word_codes(align_paths, codes_dict):
    """Cut each utterance's code sequence into per-word slices using its alignment file.

    Args:
        align_paths: Iterable of alignment file paths; a file is matched to an
            utterance when its stem equals the utterance key.
        codes_dict: ``{utterance key: sliceable code sequence}``.

    Returns:
        tuple: ``(words_dict, word_count)`` where ``words_dict`` maps each matched
        utterance key to its list of per-word slices and ``word_count`` is the
        total number of slices produced. Utterances without an alignment file
        are left out.
    """
    # Index alignment files by stem once instead of rescanning the list per utterance.
    # setdefault keeps the first file found for a stem.
    alignments = {}
    for align_path in align_paths:
        alignments.setdefault(align_path.stem, align_path)

    words_dict = {}
    word_count = 0
    for path in tqdm(codes_dict, desc="Cutting into words"):
        alignment_file = alignments.get(path)
        if alignment_file is None:
            continue

        with open(alignment_file, "r") as f:
            # Blank lines (e.g. a trailing newline) are skipped rather than crashing float().
            boundaries = [
                get_frame_num(float(line.strip()), 16000, 20) for line in f if line.strip()
            ]

        codes = codes_dict[path]
        # Consecutive boundary pairs delimit one word each.
        feature_codes = [codes[start:end] for start, end in zip(boundaries, boundaries[1:])]
        word_count += len(feature_codes)
        words_dict[path] = feature_codes
    return words_dict, word_count

# Slice every utterance in codes_dict at the boundaries listed in data/all_alignments/*.list.
align_dir = Path("data/all_alignments")
align_paths = list(align_dir.rglob("*.list"))
words_dict, num_words = word_codes(align_paths, codes_dict)

Cutting into words: 100%|██████████| 14/14 [00:00<00:00, 1114.11it/s]


In [127]:
from itertools import groupby

# Flatten the per-utterance word lists into global lists; dict_ind remembers which
# global positions belong to which utterance.
dict_ind = {}

just_words_dusted = []
just_word_kmeans = []

index = 0
for path in dusted_codes_dict:
    dusted_words = dusted_codes_dict[path]
    dict_ind[path] = list(range(index, index + len(dusted_words)))
    just_words_dusted.extend(dusted_words)
    index += len(dusted_words)

    just_word_kmeans.extend(codes_dict[path])

# Collapse runs of identical consecutive k-means codes within each word.
collapsed_kmeans = [
    [key for key, _ in groupby(word)]
    for words_of_path in codes_dict.values()
    for word in words_of_path
]

In [128]:
from joblib import Parallel, delayed

def edit_distance(seq1, seq2):
    """
    Levenshtein distance between two sequences (unit cost for insert, delete, substitute).

    The full ``(len(seq1) + 1) x (len(seq2) + 1)`` table is filled and its
    bottom-right entry is returned as a NumPy float.
    """
    rows, cols = len(seq1) + 1, len(seq2) + 1
    table = np.zeros((rows, cols))
    # Turning a prefix into the empty sequence costs one deletion per element.
    table[:, 0] = np.arange(rows)
    table[0, :] = np.arange(cols)

    for r, left in enumerate(seq1, start=1):
        for c, right in enumerate(seq2, start=1):
            substitution = 0 if left == right else 1
            table[r, c] = min(
                table[r - 1, c] + 1,
                table[r, c - 1] + 1,
                table[r - 1, c - 1] + substitution,
            )
    return table[rows - 1, cols - 1]

def calculate_distance(just_words, num_words):
    """Build the symmetric matrix of pairwise edit distances between words.

    Args:
        just_words: Indexable collection of code sequences.
        num_words: Word count supplied by the caller; half of it (truncated) is the
            number of leading entries of ``just_words`` that are compared.

    Returns:
        numpy.ndarray: square float matrix with a zero diagonal.
    """
    # NOTE(review): the count is halved before use; the reason is not visible in this
    # function -- confirm against how the caller computes `num_words`.
    num_words = int(num_words/2)
    dist_mat = np.zeros((num_words, num_words))

    # Reuse one worker pool for the whole matrix instead of building a new one per row.
    with Parallel(n_jobs=8) as parallel:
        for i in tqdm(range(num_words), desc="Calculating Distances"):
            dists_i = parallel(
                delayed(edit_distance)(just_words[i], just_words[j])
                for j in range(i + 1, num_words)
            )

            # Fill row i to the right of the diagonal and mirror it into column i.
            dist_mat[i, i + 1:] = dists_i
            dist_mat[i + 1:, i] = dists_i

    return dist_mat

# One pairwise edit-distance matrix per code type, over the same word count.
dist_mat_kmeans = calculate_distance(just_word_kmeans, num_words)
dist_mat_dusted = calculate_distance(just_words_dusted, num_words)

Calculating Distances: 100%|██████████| 140/140 [00:04<00:00, 29.05it/s]
Calculating Distances: 100%|██████████| 140/140 [00:05<00:00, 26.08it/s]


In [131]:
# Show the top-left corner (first 5 rows, first 20 columns) of both distance matrices.
print(dist_mat_dusted[0:5, 0:20])
print()
print(dist_mat_kmeans[0:5, 0:20])

[[0. 4. 5. 5. 2. 3. 4. 4. 2. 4. 5. 4. 4. 2. 7. 3. 3. 3. 6. 3.]
 [4. 0. 3. 4. 2. 3. 3. 6. 4. 3. 5. 3. 3. 3. 7. 3. 3. 3. 6. 2.]
 [5. 3. 0. 4. 4. 5. 3. 5. 5. 3. 4. 5. 2. 5. 5. 5. 5. 5. 4. 4.]
 [5. 4. 4. 0. 4. 4. 4. 6. 4. 3. 6. 5. 4. 4. 7. 3. 4. 4. 5. 4.]
 [2. 2. 4. 4. 0. 3. 2. 5. 3. 2. 5. 3. 3. 2. 6. 3. 3. 3. 5. 3.]]

[[ 0. 13. 12. 12.  8.  9. 16. 13.  7. 11. 16. 10. 10. 12. 21. 11.  9. 11.
  14. 11.]
 [13.  0.  9. 15.  8.  7. 15. 16.  9. 12. 18.  9.  8. 12. 20.  6.  8.  7.
  15.  7.]
 [12.  9.  0. 13.  8. 10. 15. 13. 10. 10. 12.  9.  7. 12. 17. 10.  9. 11.
  12.  6.]
 [12. 15. 13.  0. 11. 11. 16. 18. 11.  7. 14. 12.  9. 11. 20. 12.  8. 14.
   9. 13.]
 [ 8.  8.  8. 11.  0.  5. 14. 16.  5.  8. 14.  8.  7.  9. 19.  7.  5.  8.
  11.  7.]]


In [37]:
# np.save("output/wavlm_base_6.npy", dist_mat)
def cluster(dist_mat, distance_threshold):
    """Group items into connected components of the graph ``dist_mat[i, j] < distance_threshold``.

    Args:
        dist_mat: Square 2-D array of pairwise distances; only entries with
            ``i < j`` are read.
        distance_threshold: Two items are linked when their distance is strictly
            below this value.

    Returns:
        list[list[int]]: one list of item indices per component, ordered by the
        smallest index they contain; members are in breadth-first visiting order.
    """
    from collections import deque

    num_nodes = dist_mat.shape[0]
    graph = {i: set() for i in range(num_nodes)}

    for i in range(num_nodes - 1):
        for j in range(i + 1, num_nodes):
            if dist_mat[i, j] < distance_threshold:
                graph[i].add(j)
                graph[j].add(i)

    clusters = []
    visited = set()

    for start_node in range(num_nodes):
        if start_node in visited:
            continue

        # deque gives O(1) pops from the left; list.pop(0) shifts the whole queue each time.
        queue = deque([start_node])
        members = []
        while queue:
            node = queue.popleft()
            if node in visited:
                continue
            visited.add(node)
            members.append(node)
            queue.extend(graph[node])
        clusters.append(members)

    return clusters

In [142]:
dusted_clusters = cluster(dist_mat_dusted, 1)
kmeans_clusters = cluster(dist_mat_kmeans, 2.5)

# Cluster counts first, then only the clusters that hold more than one word.
print(len(dusted_clusters))
print(len(kmeans_clusters))

for cluster_id, members in enumerate(dusted_clusters):
    if len(members) > 1:
        print(f"Cluster {cluster_id}: {members}")

print()
for cluster_id, members in enumerate(kmeans_clusters):
    if len(members) > 1:
        print(f"Cluster {cluster_id}: {members}")

117
116
Cluster 5: [5, 99, 16, 17, 50, 86]
Cluster 13: [13, 60, 55]
Cluster 17: [19, 81, 90, 29]
Cluster 19: [21, 112]
Cluster 25: [27, 36]
Cluster 31: [34, 64, 71]
Cluster 34: [38, 72, 42]
Cluster 47: [53, 57, 124]
Cluster 57: [67, 108]
Cluster 68: [80, 119]
Cluster 75: [89, 104, 110, 135]

Cluster 5: [5, 138, 110, 15, 115, 86, 34, 104, 118, 67, 99, 17, 50, 21, 135, 89, 71, 94]
Cluster 8: [8, 54]
Cluster 17: [19, 29]
Cluster 33: [38, 90]
Cluster 37: [42, 72]
Cluster 47: [53, 103, 120]
Cluster 96: [114, 134]


In [118]:
def parse_text_to_dict(file):
    """Parse a word listing into ``{utterance id: {word index: word}}``.

    A header line ends with ``:`` and its text before the first ``:`` is not purely
    digits; it starts a new utterance. Any other non-blank line is an entry:
    ``"<index>: <word>"`` stores the stripped word, and any line that does not split
    into exactly two parts on ``": "`` stores a single space under the integer
    found before its first ``:``. Entries seen before the first header are discarded.
    """
    with open(file, "r", encoding="utf-8") as handle:
        raw_lines = handle.readlines()

    utterances = {}
    active_id = None
    entries = {}

    for raw in raw_lines:
        text = raw.strip()
        if text == "":
            continue

        is_header = text.endswith(":") and not text.split(":")[0].isdigit()
        if is_header:
            # Store the utterance collected so far before starting the next one.
            if active_id is not None:
                utterances[active_id] = entries
            active_id, entries = text[:-1], {}
            continue

        pieces = text.split(": ")
        if len(pieces) == 2:
            entries[int(pieces[0])] = pieces[1].strip()
        else:
            entries[int(pieces[0].split(":")[0])] = " "

        if active_id is not None:
            utterances[active_id] = entries

    return utterances

# Load the per-utterance {word index: word} lookup from the word listing file.
true_words_dict = parse_text_to_dict("data/words_and_indices.txt")

    

In [143]:
# Map each global word position (from dict_ind) to the word at the same position
# within its utterance.
path_dict = {}
for path, global_positions in dict_ind.items():
    for position_in_utterance, global_position in enumerate(global_positions):
        path_dict[global_position] = true_words_dict[path][position_in_utterance]

print(path_dict)

{0: 'as', 1: 'you', 2: 'well', 3: 'know', 4: 'there', 5: 'are', 6: 'ten', 7: 'men', 8: 'and', 9: 'ten', 10: 'women', 11: ' ', 12: 'whose', 13: 'sole', 14: 'duty', 15: 'is', 16: 'to', 17: ' ', 18: 'taste', 19: 'his', 20: 'food', 21: 'and', 22: 'wine', 23: ' ', 24: 'and', 25: 'fifty', 26: 'armed', 27: 'warriors', 28: 'guard', 29: 'his', 30: 'chamber', 31: 'as', 32: 'they', 33: 'guard', 34: 'it', 35: 'now', 36: ' ', 37: 'since', 38: 'his', 39: 'birth', 40: ' ', 41: 'he', 42: 'has', 43: 'been', 44: 'guarded', 45: 'so', 46: 'closely', 47: ' ', 48: 'that', 49: 'the', 50: ' ', 51: 'cleverest', 52: 'poisoners', 53: 'of', 54: 'the', 55: 'east', 56: 'could', 57: 'not', 58: 'reach', 59: 'him', 60: ' ', 61: 'the', 62: "emperor's", 63: 'daughter', 64: ' ', 65: 'a', 66: 'low', 67: ' ', 68: 'confused', 69: 'moan', 70: 'waned', 71: 'from', 72: 'his', 73: 'mouth', 74: ' ', 75: 'he', 76: 'was', 77: 'young', 78: ' ', 79: 'no', 80: 'spear', 81: 'had', 82: 'touched', 83: 'him', 84: ' ', 85: 'no', 86: ' ', 

In [145]:

# List the words behind every multi-member cluster, first for DUSTED and then for k-means.
print("Dusted clusters")
for cluster_id, members in enumerate(dusted_clusters):
    if len(members) > 1:
        words = [path_dict[member] for member in members]
        print(f"Cluster {cluster_id}: {', '.join(words)}")

print("\nKmeans clusters")
for cluster_id, members in enumerate(kmeans_clusters):
    if len(members) > 1:
        words = [path_dict[member] for member in members]
        print(f"Cluster {cluster_id}: {', '.join(words)}")

Dusted clusters
Cluster 5: are,  , to,  ,  ,  
Cluster 13: sole,  , east
Cluster 17: his, had, his, his
Cluster 19: and, the
Cluster 25: warriors,  
Cluster 31: it,  , from
Cluster 34: his, his, has
Cluster 47: of, not, his
Cluster 57:  , who
Cluster 68: spear, of
Cluster 75: in, the, the, his

Kmeans clusters
Cluster 5: are, the, the, is, the,  , it, the, out,  ,  ,  ,  , and, his, in, from, and
Cluster 8: and, the
Cluster 17: his, his
Cluster 33: his, his
Cluster 37: has, his
Cluster 47: of, be, the
Cluster 96: and, in
