In [216]:
# Randomly sample files to read
import random
from pathlib import Path

in_dir = Path("data/librispeech-wav")
sample_size = 20

wav_paths = list(in_dir.rglob("*.wav"))
sampled_paths = random.sample(wav_paths, sample_size)  

print(len(sampled_paths))

20


In [217]:
# Encode the sampled audio features 
import torchaudio
from tqdm import tqdm
import torch
import numpy as np
import librosa

def preemphasis(signal, coeff=0.97):
    return np.append(signal[0], signal[1:] - coeff*signal[:-1])

model_pipelines = {
    "hubert_base": torchaudio.pipelines.HUBERT_BASE,
    "hubert_large": torchaudio.pipelines.HUBERT_LARGE,
    "hubert_xlarge": torchaudio.pipelines.HUBERT_XLARGE,
    "wavlm_base": torchaudio.pipelines.WAVLM_BASE,
    "wavlm_large": torchaudio.pipelines.WAVLM_LARGE,
    "wavlm_base_plus": torchaudio.pipelines.WAVLM_BASE_PLUS,
}

model_name = "wavlm_base"
layer = 6

if model_name != "mfcc":
    bundle = model_pipelines.get(model_name, torchaudio.pipelines.HUBERT_BASE)
    model = bundle.get_model().cuda()
    model.eval()

encodings = {}
for wav_path in tqdm(sampled_paths, desc="Encoding Audio Features"):
    if model_name != "mfcc":
        out_dir = Path("features/") / model_name / str(layer)
        wav, sr = torchaudio.load(wav_path)
        wav = torchaudio.functional.resample(wav, sr, 16000).cuda()

        with torch.inference_mode():
            encoding, _ = model.extract_features(wav, num_layers=layer)

        encoding = encoding[layer-1].squeeze().cpu().numpy()
    else:
        out_dir = Path("features/") / model_name 
        wav, sr = librosa.core.load(wav_path, sr=None)
        wav = preemphasis(wav, coeff=0.97)

        mfcc = librosa.feature.mfcc(
            y=wav, sr=sr, n_mfcc=13, n_mels=24, 
            n_fft=int(np.floor(0.025*sr)),
            hop_length=int(np.floor(0.01*sr)), 
            fmin=64, fmax=8000
        )
        mfcc_delta = librosa.feature.delta(mfcc)
        mfcc_delta_delta = librosa.feature.delta(mfcc_delta)
        encoding = np.hstack([mfcc.T, mfcc_delta.T, mfcc_delta_delta.T])

    if out_dir:
        out_dir.mkdir(parents=True, exist_ok=True)
        output_path = Path(out_dir) / f"{wav_path.stem}.npy"
        np.save(output_path, encoding)
    encodings[wav_path.stem] = encoding
print(f"Stored Encodings in {str(out_dir)}")


Encoding Audio Features: 100%|██████████| 20/20 [00:00<00:00, 26.15it/s]

Stored Encodings in features/wavlm_base/6





In [57]:
# Get the kmeans word codes
from sklearn.cluster import KMeans

def get_frame_num(timestamp: float, sample_rate: int, frame_size_ms: int)->int:
    hop_size = frame_size_ms/1000 * sample_rate
    hop_size = np.max([hop_size, 1])
    return int((timestamp * sample_rate) / hop_size)

def kmeans_model(url):
    model = KMeans(100)
    checkpoint = torch.hub.load_state_dict_from_url(url)

    model.__dict__["n_features_in_"] = checkpoint["n_features_in_"]
    model.__dict__["_n_threads"] = checkpoint["_n_threads"]
    model.__dict__["cluster_centers_"] = checkpoint["cluster_centers_"].numpy()
    return model

def apply_kmeans(kmeans_model, encoding):
    # C = cluster centers matrix
    C_np = kmeans_model.cluster_centers_.transpose()
    Cnorm_np = (C_np ** 2).sum(0, keepdims=True)

    C = torch.from_numpy(C_np)
    Cnorm = torch.from_numpy(Cnorm_np)

    if torch.cuda.is_available():
        C = C.cuda()
        Cnorm = Cnorm.cuda()
    
    if isinstance(encoding, torch.Tensor):
        dist = (
            encoding.pow(2).sum(1, keepdims=True)-2*torch.matmul(encoding, C)+Cnorm
        )
    else:
        dist = (
            (encoding**2).sum(1, keepdims=True)-2*np.matmul(encoding, C_np)+Cnorm_np
        )
    return np.argmin(dist, axis=1)


In [235]:
out_dir = Path("output/codes/kmeans")
align_dir = Path("data/all_alignments")

kmeans_url = "https://github.com/bshall/dusted/releases/download/v0.1/kmeans-english-50f36a.pt"
kmeans = kmeans_model(kmeans_url)

if out_dir and model_name != "mfcc":
    out_dir = out_dir / model_name / str(layer)
    out_dir.mkdir(parents=True, exist_ok=True)
else:
    out_dir = out_dir / model_name 
    out_dir.mkdir(parents=True, exist_ok=True)

align_paths = list(align_dir.rglob("*.list"))

cut_encodings = {}
filenames = {}
features = []
index = 0
for path in tqdm(encodings, desc="Cutting Encodings"):
    alignment_file = [a for a in align_paths if a.stem == path]
    if not alignment_file:
        continue
    else:
        alignment_file = alignment_file[0]

    with open(str(alignment_file), "r") as f:
        bounds = [get_frame_num(float(line.strip()), 16000, 20) for line in f]
    
    cut_encoding = encodings[path][0: bounds[0]]
    words = [cut_encoding]
    for i in range(len(bounds)-1): 
        cut_encoding = encodings[path][bounds[i]: bounds[i+1]]
        features.append(cut_encoding)
        filenames[index] = f"{path}_{i}"
        words.append(cut_encoding)
        index += 1
    cut_encodings[path] = words


Cutting Encodings: 100%|██████████| 20/20 [00:00<00:00, 714.52it/s]


In [219]:
# Extracting the kmeans codes for each word encoding 
kmeans_codes = {}
for path in tqdm(cut_encodings, desc="Extracting Kmeans codes"):
    words = []
    for word in cut_encodings[path]:
        codes = apply_kmeans(kmeans, word).tolist()
        words.append(codes)
    kmeans_codes[path] = words


Extracting Kmeans codes: 100%|██████████| 20/20 [00:00<00:00, 129.88it/s]


In [260]:
# Extracting the dusted codes for each word encoding 
from segment import segment

gamma = 0.2
dusted_codes = {}
for path in tqdm(cut_encodings, desc="Extracting DUSTED codes"):
    words = []
    for word in cut_encodings[path]:
        codes, _ = segment(word, kmeans.cluster_centers_, gamma)   
        words.append(codes)
    dusted_codes[path] = words


Extracting DUSTED codes: 100%|██████████| 20/20 [00:00<00:00, 111.16it/s]


In [261]:
for path in dusted_codes:
    print(path)
    
    for word in range(len(dusted_codes[path])):
        print(dusted_codes[path][word])
        print(kmeans_codes[path][word])
        print()
        break

5338-24640-0005
[97]
[95, 95, 97, 97, 95, 97]

652-129742-0006
[97 95 18]
[97, 97, 97, 97, 97, 97, 97, 97, 95, 18, 18, 18, 95, 97, 95, 95, 95, 95, 18, 18, 18, 18, 18, 18, 53]

251-136532-0019
[95 18]
[95, 95, 95, 97, 18, 97, 18, 95, 18, 97, 18, 95, 97]

2035-147960-0000
[95]
[95, 95, 95, 95, 95, 97, 97, 95, 18, 95, 97]

8297-275155-0025
[18]
[95, 95, 18, 18, 18, 18, 18, 70, 18, 95, 18, 70, 18, 70, 95, 95, 40, 95, 18, 18, 18, 18, 18, 24, 18]

5895-34622-0017
[95]
[95, 95, 95, 53, 95, 95, 95, 53, 95, 95, 95, 95, 97, 95, 95, 95, 95, 95, 95, 95, 62, 95, 97]

1919-142785-0003
[18]
[97, 97, 95, 97, 18, 18, 97, 18, 95, 18, 95, 18, 18, 18, 18, 95, 18, 18]

84-121550-0011
[95 18]
[95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 95, 18]

2277-149897-0028
[97]
[97, 95, 24, 18, 97, 95, 95, 95, 97, 97, 97, 97, 18, 97]

1988-147956-0028
[97 95]
[97, 97, 97, 97, 97, 18, 18, 70, 95, 97, 18, 95, 95, 97, 95, 95, 18, 97, 18, 95, 18, 95, 95, 95]

174-50561-0014
[95]
[95, 95, 95, 97, 18, 97, 97, 95, 95

In [262]:
# Converting the dictionaries to arrays and getting the index for each of the words in the dataset
dict_ind = {}

just_words_dusted = []
just_words_kmeans = []

index = 0
for path in dusted_codes:
    dict_ind[path] = []
    for i in range(len(dusted_codes[path])):
        just_words_dusted.append(dusted_codes[path][i])
        dict_ind[path].append(index)
        index += 1

    for j in range(len(kmeans_codes[path])):
        just_words_kmeans.append(kmeans_codes[path][j])

for word in range(len(just_words_dusted)):
    print(just_words_dusted[word])
    print(just_words_kmeans[word])
    print()
    break

[97]
[95, 95, 97, 97, 95, 97]



In [223]:
# Collapsing the kmeans words
from itertools import groupby

collapsed_kmeans = []
for path in kmeans_codes:
    for j in range(len(kmeans_codes[path])):
        collapsed_word = [key for key, _ in groupby(kmeans_codes[path][j])]
        collapsed_kmeans.append(collapsed_word)

for word in range(len(collapsed_kmeans)):
    print(collapsed_kmeans[word])
    break

[95, 97, 95, 97]


In [None]:
# DTW
from sklearn.preprocessing import StandardScaler
from joblib import Parallel, delayed

def dtw_sweep_min(query_seq, search_seq, n_step=3):
    """
    Return the minimum DTW cost as `query_seq` is swept across `search_seq`.

    Step size can be specified with `n_step`.
    """

    from cython_dtw import _dtw
    dtw_cost_func = _dtw.multivariate_dtw_cost_cosine

    i_start = 0
    n_query = query_seq.shape[0]
    n_search = search_seq.shape[0]
    min_cost = np.inf

    while i_start <= n_search - n_query or i_start == 0:
        cost = dtw_cost_func(
            query_seq, search_seq[i_start:i_start + n_query], True
        )
        i_start += n_step
        if cost < min_cost:
            min_cost = cost

    return min_cost

def dtw(features):
    tensor_features = [torch.from_numpy(f) for f in features]
    stacked_features = torch.cat(tensor_features, dim=0)
    normalized_features = []

    scaler = StandardScaler()
    scaler.fit(stacked_features) 
    normalized_features = []
    for feature in tqdm(features, desc="Normalizing Features"):
        normalized_features.append(torch.from_numpy(scaler.transform(feature))) 
    
    num_features = len(normalized_features)
    norm_distance_mat = np.zeros((num_features, num_features))
    normalized_features = [f.cpu().numpy().astype(np.float64) for f in normalized_features]

    for i in tqdm(range(num_features), desc="Calculating Distances"):
        dists_i = Parallel(n_jobs=8)(
            delayed(dtw_sweep_min)(normalized_features[i], normalized_features[j])
            for j in range(i + 1, num_features)
        )

        for j, dist in zip(range(i + 1, num_features), dists_i):
            norm_distance_mat[i, j] = dist
            norm_distance_mat[j, i] = dist  
            
    return norm_distance_mat

In [194]:
# Edit Distance
from joblib import Parallel, delayed

def edit_distance(seq1, seq2):
    """
    Compute the edit distance between two sequences using dynamic programming.
    """
    N, M = len(seq1), len(seq2)
    dp = np.zeros((N + 1, M + 1))
    for i in range(N + 1):
        dp[i, 0] = i
    for j in range(M + 1):
        dp[0, j] = j
    for i in range(1, N + 1):
        for j in range(1, M + 1):
            cost = 0 if seq1[i - 1] == seq2[j - 1] else 1
            dp[i, j] = min(dp[i - 1, j] + 1, dp[i, j - 1] + 1, dp[i - 1, j - 1] + cost)
    return dp[N, M] 

def calculate_distance(just_words, num_words):
    dist_mat = np.zeros((num_words, num_words))

    for i in tqdm(range(num_words), desc="Calculating Distances"):
        js = [j for j in range(i + 1, num_words)]
        dists_i = Parallel(n_jobs=8)(
            delayed(edit_distance)(just_words[i], just_words[j]) for j in js
        )

        for j, dist in zip(js, dists_i):
            dist_mat[i, j] = dist
            dist_mat[j, i] = dist  
    
    return dist_mat

In [263]:
num_words = index
# dist_mat_kmeans = calculate_distance(just_words_kmeans, num_words)
# dist_mat_kmeans_collapsed = calculate_distance(collapsed_kmeans, num_words)
dist_mat_dusted = calculate_distance(just_words_dusted, num_words)

Calculating Distances: 100%|██████████| 471/471 [00:36<00:00, 12.95it/s]


In [196]:
dist_mat_dtw = dtw(features)

Normalizing Features: 100%|██████████| 2415/2415 [00:00<00:00, 8011.46it/s]
Calculating Distances: 100%|██████████| 2415/2415 [08:56<00:00,  4.50it/s]


In [None]:
out_dir = Path(f"output/mat/{model_name}/{layer}/{sample_size}")
out_dir.mkdir(parents=True, exist_ok=True)

np.save(out_dir/"dist_mat_kmeans.npy", dist_mat_kmeans)
np.save(out_dir/"dist_mat_kmeans_collapsed.npy", dist_mat_kmeans_collapsed)
np.save(out_dir/"dist_mat_dusted.npy", dist_mat_dusted)
np.save(out_dir/"dist_mat_dtw.npy", dist_mat_dtw)

In [228]:
print(dist_mat_dusted[0:5, 0:15])
print(dist_mat_dusted.shape)
print(dist_mat_kmeans[0:5, 0:15])
print(dist_mat_kmeans.shape)
print(dist_mat_kmeans_collapsed[0:5, 0:15])
print(dist_mat_kmeans_collapsed.shape)
# print(dist_mat_dtw[0:5, 0:15])
# print(dist_mat_dtw.shape)

[[ 0.  2.  8. 12.  8.  5.  2.  3.  5.  2.  5.  1.  2.  5.  2.]
 [ 2.  0.  6. 10.  7.  4.  3.  2.  4.  3.  5.  2.  2.  4.  2.]
 [ 8.  6.  0.  6.  6.  3.  8.  5.  5.  8.  5.  7.  7.  6.  7.]
 [12. 10.  6.  0.  6.  7. 13. 10.  9. 13. 11. 12. 12.  9. 12.]
 [ 8.  7.  6.  6.  0.  6.  9.  7.  6.  9.  7.  8.  8.  4.  9.]]
(471, 471)
[[ 0. 17. 20. 22. 21. 12.  6. 10. 14.  6.  8.  5.  4. 22. 12.]
 [17.  0. 19. 18. 14. 13. 17. 16. 15. 16. 20. 19. 17. 18. 14.]
 [20. 19.  0. 14. 18. 14. 21. 13. 12. 20. 18. 23. 21. 21. 14.]
 [22. 18. 14.  0. 15. 18. 22. 14. 13. 21. 20. 24. 22. 16. 18.]
 [21. 14. 18. 15.  0. 17. 21. 17. 17. 20. 21. 23. 22. 14. 21.]]
(471, 471)
[[ 0.  3.  9. 12.  7.  5.  4.  4.  4.  4.  5.  3.  3.  8.  2.]
 [ 3.  0.  8. 10.  5.  5.  5.  3.  4.  5.  6.  4.  4.  7.  4.]
 [ 9.  8.  0.  8.  7.  5. 12.  7.  7. 12.  9. 11. 11.  8. 11.]
 [12. 10.  8.  0.  7.  9. 15. 10. 11. 15. 13. 14. 14.  9. 14.]
 [ 7.  5.  7.  7.  0.  7.  9.  6.  6.  9.  7.  8.  8.  4.  9.]]
(471, 471)


In [226]:
# Clustering algorithm 

def cluster(dist_mat, distance_threshold):
    num_nodes = dist_mat.shape[0]
    graph = {i: set() for i in range(num_nodes)}

    for i in range(num_nodes - 1): 
        for j in range(i + 1, num_nodes):  
            if dist_mat[i, j] < distance_threshold:
                graph[i].add(j)
                graph[j].add(i)  


    clusters = []
    visited = set()

    def bfs(start_node):
        """ Traverse a cluster using BFS """
        queue = [start_node]
        cluster = []
        
        while queue:
            node = queue.pop(0)
            if node in visited:
                continue 
            visited.add(node)
            cluster.append(node)
            queue.extend(graph[node])  

        return cluster

    for node in range(num_nodes):
        if node not in visited:
            new_cluster = bfs(node)
            clusters.append(new_cluster)

    return clusters

In [233]:
# Evaluating the clusters
dusted_clusters = cluster(dist_mat_dusted, 2)
kmeans_clusters = cluster(dist_mat_kmeans, 3)
kmeans_clusters_collapsed = cluster(dist_mat_kmeans_collapsed, 2)
# dtw_clusters = cluster(dist_mat_dtw, 0.7)

print(len(dusted_clusters))
print(len(kmeans_clusters))
print(len(kmeans_clusters_collapsed))
# print(len(dtw_clusters))
print()

166
345
233



In [203]:
rounded_dtw_mat = np.round(dist_mat_dtw)
dtw_clusters = cluster(rounded_dtw_mat, 0.7)
print(len(dtw_clusters))

KeyboardInterrupt: 

In [234]:
for i, c in enumerate(dusted_clusters):
    if(len(c)>1):
        print(f"Cluster {i}: {c}")

print()
for i, c in enumerate(kmeans_clusters):
    if(len(c)>1):
        print(f"Cluster {i}: {c}")

print()
for i, c in enumerate(kmeans_clusters_collapsed):
    if(len(c)>1):
        print(f"Cluster {i}: {c}")

Cluster 0: [0, 391, 11, 270, 402, 19, 409, 154, 26, 288, 416, 34, 40, 177, 434, 310, 188, 316, 446, 319, 450, 324, 197, 453, 455, 76, 460, 206, 212, 468, 215, 230, 103, 358, 107, 110, 1, 388, 140, 14, 272, 408, 411, 417, 41, 435, 443, 77, 205, 333, 470, 217, 350, 353, 231, 109, 243, 251, 128, 6, 136, 9, 395, 143, 145, 274, 148, 404, 22, 151, 407, 413, 291, 168, 424, 303, 304, 307, 53, 182, 437, 441, 449, 322, 71, 74, 330, 79, 207, 81, 209, 336, 343, 345, 228, 359, 106, 364, 111, 367, 370, 115, 374, 122, 380, 253, 394, 385, 12, 152, 280, 38, 181, 84, 344, 94, 96, 100, 112, 372, 254, 126, 398, 382, 265, 396, 401, 24, 32, 39, 200, 461, 338, 222, 226, 238, 117, 320, 16, 428, 464, 346, 366, 120, 171, 95, 229, 326, 28, 399, 275, 311, 185, 193, 101, 241, 465, 430, 65, 91, 451, 439, 363, 356, 7, 232, 298, 276, 134, 263, 155, 220, 379, 144, 153, 176, 180, 69, 246, 125, 147, 156, 195, 87, 225, 119, 102, 264, 25, 329, 378, 317, 219, 293, 431, 55, 454, 223, 355, 250, 199, 278, 124, 166, 261, 172, 

In [161]:
# Evaluating the cluster words

def parse_text_to_dict(file):
    with open(file, "r", encoding="utf-8") as f:
        lines = f.readlines()

    data_dict = {}
    current_id = None
    word_dict = {}

    for line in lines: 
        line = line.strip()

        if not line: 
            continue
        
        if line.endswith(":") and not line.split(":")[0].isdigit():
            if current_id is not None:
                data_dict[current_id] = word_dict
            
            current_id = line[:-1]
            word_dict = {}
        else:
            parts = line.split(": ")
            if len(parts) == 2:
                index, word = parts
                word_dict[int(index)] = word.strip()
            else:
                parts = parts[0].split(":")
                index = parts[0]
                word_dict[int(index)] = " "
            
            if current_id is not None:
                data_dict[current_id] = word_dict
        
    return data_dict

true_words_dict = parse_text_to_dict("data/words_and_indices.txt")

path_dict = {}
for path in dict_ind:
    path_dict[dict_ind[path][0]] = ''
    for i in range(1, len(dict_ind[path])):
        path_dict[dict_ind[path][i]] = true_words_dict[path][i-1]

In [204]:
# Cluster and WordUnit classes
from collections import defaultdict

class Cluster:
    def __init__(self,id, word_dict=None, true_words=None):
        self.id = id
        self.length = len(word_dict) if word_dict else 0
        self.word_dict = word_dict if word_dict is not None else []
        self.true_word_dict = true_words if true_words is not None else []
    
    def add_word_unit(self, id, index, file):
        word_unit = WordUnit(file, index, id)
        self.length += 1
        self.word_dict.append(word_unit)

    def add_true_word(self, word):
        self.true_word_dict.append(word)

    @classmethod
    def print_cluster(self, cluster):
        print(f"Cluster {cluster.id}")
        for word in cluster.word_dict:
            print(f"Word {word.id}: Index {word.index} in File {word.file}")
    
    def cluster_purity(self):

        word_counts = {}
        for word in self.true_word_dict:
            word_counts[word] = word_counts.get(word, 0) + 1

        max_count = max(word_counts.values()) if word_counts else 0
        cluster_purity = max_count / self.length if self.length > 0 else 0

        self.purity = cluster_purity

    @classmethod
    def duplicate_clusters(self, clusters):
        cluster_dict = defaultdict(int)

        for cluster in clusters:
            cluster_set = frozenset(cluster)  
            cluster_dict[cluster_set] += 1  

        duplicate_count = sum(1 for count in cluster_dict.values() if count > 1)

        return duplicate_count

class WordUnit:
    def __init__(self, file, index, id):
        self.index = int(index)
        self.file = file
        self.id = int(id)
        self.start_time = None
        self.end_time = None

    def add_word_boundaries(self, start_time, end_time):
        self.start_time = start_time
        self.end_time = end_time
    

In [271]:
def waveform_slice(file, start_time, end_time):

    waveform, _ = torchaudio.load(file)
    sample_rate=16000
    start_frame = int(start_time * sample_rate)
    end_frame = int(end_time * sample_rate)

    waveform_slice = waveform[:, start_frame:end_frame]
    return waveform_slice, sample_rate

In [275]:
from IPython.display import Audio, display

dusted_clusters = cluster(dist_mat_dusted, 0.8)
clusters = []
wav_dir = Path("data/librispeech-wav/")
indices_dict = parse_text_to_dict("data/timestamps_and_indices.txt")

for i, clust in enumerate(dusted_clusters):
    new_cluster = Cluster(id=i)
    for w in range(len(clust)):
        if clust[w] in filenames:
            filename_parts = filenames[clust[w]].split("_")
            filename = filename_parts[0]
            word_index = int(filename_parts[1])                     
            new_cluster.add_word_unit(w, word_index, filename)
    clusters.append(new_cluster)

for c in clusters:
    print(c.id)
    for word_unit in c.word_dict:
        word = true_words_dict[word_unit.file][word_unit.index]
        times = indices_dict[word_unit.file][word_unit.index]
        times = times.split(":")
        start_time = float(times[0][1:])
        end_time = float(times[1][0:-1])
        word_unit.add_word_boundaries(start_time, end_time)
        c.add_true_word(word)
        
        filename = wav_dir / str(word_unit.file + ".wav")
        slice, sample_rate = waveform_slice(filename, word_unit.start_time, word_unit.end_time)
        audio_obj = Audio(slice.squeeze(), rate=16000)
        display(audio_obj)
        
    
    if len(c.word_dict) > 1:  
        print(f"Cluster {c.id}")                
        for word in c.word_dict:
            print(word.file)

        print()

0


Cluster 0
5338-24640-0005
3170-137482-0007
3170-137482-0007
5895-34622-0017
3170-137482-0007
3170-137482-0007
5338-24640-0005
5338-24640-0005
3170-137482-0007
3170-137482-0007
3170-137482-0007
8842-302203-0003
3576-138058-0027
5895-34622-0017
3576-138058-0027
3576-138058-0027
5338-24640-0005
5895-34622-0017
84-121550-0016
3576-138058-0027
5338-24640-0005
5338-24640-0005
2428-83699-0012
2277-149897-0023
84-121550-0011
84-121550-0011
84-121550-0011
251-136532-0019
251-136532-0019
84-121550-0011
84-121550-0011
2428-83699-0012
2277-149897-0028
2277-149897-0028
3170-137482-0007
3170-137482-0007
2035-147960-0000
1988-147956-0028
3170-137482-0007
2035-147960-0000
1988-147956-0028
174-50561-0014
3170-137482-0007
2035-147960-0000

1


Cluster 1
5338-24640-0005
251-136532-0019
84-121550-0011
652-129742-0006
5338-24640-0005

2


Cluster 2
5338-24640-0005
2428-83705-0009
3170-137482-0007
2428-83699-0012

3


Cluster 3
5338-24640-0005
5338-24640-0005
5338-24640-0005
5338-24640-0005
5338-24640-0005
5338-24640-0005
5338-24640-0005
5338-24640-0005
5338-24640-0005
5338-24640-0005
5338-24640-0005
5338-24640-0005
5338-24640-0005
5338-24640-0005
652-129742-0006
652-129742-0006
652-129742-0006
652-129742-0006
652-129742-0006
652-129742-0006
652-129742-0006
251-136532-0019
251-136532-0019
251-136532-0019
251-136532-0019
251-136532-0019
251-136532-0019
251-136532-0019
251-136532-0019
251-136532-0019
251-136532-0019
251-136532-0019
2035-147960-0000
2035-147960-0000
2035-147960-0000
2035-147960-0000
2035-147960-0000
2035-147960-0000
2035-147960-0000
2035-147960-0000
2035-147960-0000
2035-147960-0000
2035-147960-0000
2035-147960-0000
2035-147960-0000
2035-147960-0000
2035-147960-0000
2035-147960-0000
2035-147960-0000
2035-147960-0000
8297-275155-0025
8297-275155-0025
8297-275155-0025
8297-275155-0025
5895-34622-0017
5895-34622-0017
5895-34622-0017
5895-34622-0017
5895-34622-0017
5895-34622-0017
5895-346

Cluster 4
5338-24640-0005
3170-137482-0007
1988-147956-0028
3170-137482-0007
84-121550-0016
1919-142785-0003
5338-24640-0005

5


Cluster 5
5338-24640-0005
7850-281318-0002
7850-281318-0002
7850-281318-0002
5338-24640-0005
5338-24640-0005
8842-302203-0003
8842-302203-0003
5895-34622-0017
84-121550-0016
3576-138058-0027
3576-138058-0027
652-129742-0006
84-121550-0016
1919-142785-0003
2428-83705-0009
84-121550-0011
2428-83699-0012
251-136532-0019
2428-83699-0012
251-136532-0019
2277-149897-0028
3170-137482-0007
1988-147956-0028
1988-147956-0028
1988-147956-0028
3170-137482-0007
3170-137482-0007

6


Cluster 6
5338-24640-0005
8297-275155-0025
7850-281318-0002
3170-137482-0007
5895-34622-0017
8842-302203-0003
3170-137482-0007
5338-24640-0005
5338-24640-0005
84-121550-0016
5895-34622-0017
84-121550-0016
2428-83699-0012
2277-149897-0023
2277-149897-0023
2428-83699-0012
3170-137482-0007
2277-149897-0028
3170-137482-0007
3170-137482-0007
2035-147960-0000
2035-147960-0000
1988-147956-0028
3170-137482-0007
2035-147960-0000
2035-147960-0000
3170-137482-0007
3170-137482-0007
3170-137482-0007

7


Cluster 7
5338-24640-0005
3170-137482-0007
8297-275155-0025
3170-137482-0007
5895-34622-0017
3170-137482-0007
5895-34622-0017
5895-34622-0017
84-121550-0016
3576-138058-0027
5338-24640-0005
5338-24640-0005
3576-138058-0027
5338-24640-0005
84-121550-0016
84-121550-0016
652-129742-0006
84-121550-0011
2428-83699-0012
2428-83699-0012
3170-137482-0007
251-136532-0019
3170-137482-0007
251-136532-0019
3170-137482-0007
1988-147956-0028
3170-137482-0007
7850-281318-0002

8


9


Cluster 9
5338-24640-0005
5895-34622-0017
8842-302203-0003
8842-302203-0003
84-121550-0016
5338-24640-0005
1919-142785-0003
5338-24640-0005
3576-138058-0027
2428-83705-0009
2428-83699-0012
652-129742-0006
2428-83699-0012
251-136532-0019
2428-83699-0012
84-121550-0011
2277-149897-0028
251-136532-0019
3170-137482-0007
3170-137482-0007
1988-147956-0028
7850-281318-0002

10


11


12


13


Cluster 13
5338-24640-0005
2035-147960-0000

14


Cluster 14
5338-24640-0005
3170-137482-0007
3170-137482-0007
2428-83705-0009

15


Cluster 15
652-129742-0006
84-121550-0011
8297-275155-0025
84-121550-0016
2428-83699-0012
84-121550-0011
1919-142785-0003
5895-34622-0017
652-129742-0006

16


Cluster 16
652-129742-0006
84-121550-0016
3576-138058-0027
3576-138058-0027
3576-138058-0027
2035-147960-0000
3170-137482-0007
2428-83699-0012
2277-149897-0023
2035-147960-0000

17


18


19


20


Cluster 20
652-129742-0006
2277-149897-0023

21


22


Cluster 22
652-129742-0006
5895-34622-0017

23


Cluster 23
251-136532-0019
2428-83699-0012
2035-147960-0000
84-121550-0016
84-121550-0011
1988-147956-0028
3170-137482-0007
8842-302203-0003
84-121550-0011
2277-149897-0028
84-121550-0011
2277-149897-0028

24


Cluster 24
251-136532-0019
7850-281318-0002
3170-137482-0007
5895-34622-0017
8842-302203-0003
3576-138058-0027
5895-34622-0017
84-121550-0016
84-121550-0016
1919-142785-0003
1919-142785-0003
2428-83705-0009
2277-149897-0023
2428-83699-0012
2277-149897-0023
2428-83699-0012
2428-83699-0012
3170-137482-0007
2035-147960-0000
3170-137482-0007
2277-149897-0028
2035-147960-0000
2035-147960-0000
1988-147956-0028
3170-137482-0007
2035-147960-0000

25


26


27


Cluster 27
251-136532-0019
3576-138058-0027
2035-147960-0000

28


Cluster 28
251-136532-0019
3170-137482-0007

29


30


Cluster 30
2035-147960-0000
3170-137482-0007
84-121550-0011

31


32


Cluster 32
2035-147960-0000
8842-302203-0003
84-121550-0016

33


34


35


36


37


38


39


Cluster 39
84-121550-0011
3576-138058-0027

40


41


42


43


44


45


46


47


Cluster 47
7850-281318-0002
84-121550-0016

48


49


50


51


52


53


54


Cluster 54
3170-137482-0007
3576-138058-0027

55


56


57


58


59


60


61


62


63


64


65


66


In [256]:
kmeans_clusters = cluster(dist_mat_kmeans,0.8)
clusters = []

for i, clust in enumerate(kmeans_clusters):
    new_cluster = Cluster(id=i)
    for w in range(len(clust)):
        if clust[w] in filenames:
            filename_parts = filenames[clust[w]].split("_")
            filename = filename_parts[0]
            word_index = int(filename_parts[1])                     
            new_cluster.add_word_unit(w, word_index, filename)
    clusters.append(new_cluster)

for c in clusters:

    for word_unit in c.word_dict:
        word = true_words_dict[word_unit.file][word_unit.index]
        c.add_true_word(word)
    
    if len(c.word_dict) > 1:  
        print(f"Cluster {c.id}")                
        print(c.true_word_dict)
        print()

Cluster 6
['rude', ' ']

Cluster 9
['which', 'randal', 'the', 'into', 'for']

Cluster 11
['he', 'christmas']

Cluster 19
['ambition', 'these']

Cluster 76
['listening', 'to', 'i', 'but', 'back']

Cluster 78
['the', 'gladdened', 'i', 'the', 'and', 'helicon']

Cluster 97
['sure', 'was']

Cluster 103
['of', ' ', 'be']

Cluster 121
['manner', 'to']

Cluster 140
['of', 'all']

Cluster 198
['to', 'away']



In [259]:
kmeans_clusters_collapsed = cluster(dist_mat_kmeans_collapsed,0.3)
clusters = []

for i, clust in enumerate(kmeans_clusters_collapsed):
    new_cluster = Cluster(id=i)
    for w in range(len(clust)):
        if clust[w] in filenames:
            filename_parts = filenames[clust[w]].split("_")
            filename = filename_parts[0]
            word_index = int(filename_parts[1])                     
            new_cluster.add_word_unit(w, word_index, filename)
    clusters.append(new_cluster)

for c in clusters:

    for word_unit in c.word_dict:
        word = true_words_dict[word_unit.file][word_unit.index]
        c.add_true_word(word)
    
    if len(c.word_dict) > 1:  
        print(f"Cluster {c.id}")                
        print(c.true_word_dict)
        print()

Cluster 6
['rude', 'randal', 'which', 'back', 'to', 'gladdened', 'the', 'be', 'into', 'helicon', 'the', 'for', 'listening', 'the', ' ', ' ', 'i', 'of', 'but', 'and', 'i']

Cluster 10
['he', 'these', 'ambition', 'bedchamber', 'mingled', 'christmas']

Cluster 11
['moves', 'in', 'sure', 'period', 'to', 'was', 'old', ' ', 'manner']

Cluster 13
['and', 'unhappy', 'preposterous', 'and', 'he', ' ']

Cluster 22
['scruples', ' ', 'put', 'sun', 'contrive', 'much']

Cluster 24
['he', 'seemed', 'no']

Cluster 32
['distinguish', 'black']

Cluster 49
['adding', 'parish']

Cluster 81
[' ', 'opened']

Cluster 93
['had', 'me', 'felt']

Cluster 96
['the', 'but', 'up']

Cluster 104
['was', 'being']

Cluster 120
['hut', 'alarm', 'to', ' ', 'of', 'all', 'away']

Cluster 133
['served', 'and']

Cluster 134
['as', ' ']

Cluster 135
['dressing', 'must']

Cluster 155
['is', ' ', 'the', 'me', 'an']

Cluster 160
['needs', 'some', 'most']

Cluster 161
['pour', 'for', 'quite']

Cluster 178
['difficult', 'car']

Clu

In [168]:
print("Dusted clusters")
for i, c in enumerate(dusted_clusters):
    if(len(c)>1):
        words = [path_dict[c[j]] for j in range(len(c))]
        print(f"Cluster {i}: {', '.join(words)}")

print("\nKmeans clusters")
for i, c in enumerate(kmeans_clusters):
    if(len(c)>1):
        words = [path_dict[c[j]] for j in range(len(c))]
        print(f"Cluster {i}: {', '.join(words)}")

print("\nCollapsed Kmeans clusters")
for i, c in enumerate(kmeans_clusters_collapsed):
    if(len(c)>1):
        words = [path_dict[c[j]] for j in range(len(c))]
        print(f"Cluster {i}: {', '.join(words)}")

Dusted clusters
Cluster 0: , upon, to, effect, , body, ear, , her, he, it
Cluster 1: the, the
Cluster 4: which, well, stood, understand, self, and, so
Cluster 5: had, a, that, points, sioux, you, for, me, you, you, around, my,  , we, you, gazed
Cluster 6: formed,  
Cluster 7: a, that, like, a, the, some, and,  , the, the, of, the, they, the, the, the, the, a, i, the, are, her,  , the, you, a, the, in, the, the,  , the, had, the, if, the, the, the, the, was
Cluster 13: and, and, is
Cluster 16: now, his, not, his, here, should, my, of, know
Cluster 18: out, little, that, had, that, song, night
Cluster 19: to,  , to,  , to, at, been,  , as, in, of, to, to, to, to, find, at, to, old, pleasure, to
Cluster 21: leaving, me, the, place, so, so, been
Cluster 23: things, rice
Cluster 24:  , the, the, i, us, the, the, the, the, was,  , the,  , the, the, need, us, at, not,  ,  , the,  , i, you, the, you, when, are, me,  , the, the,  , the, the, that, i
Cluster 30: of, up, yet, and, all, in, of, in