In [63]:
from pathlib import Path

model_name = "wavlm_base"
layer = 8
in_dir = Path("data/librispeech-wav")
sample_size = 200

In [64]:
# Randomly sample files to read
import random

wav_paths = list(in_dir.rglob("*.wav"))
sampled_paths = random.sample(wav_paths, sample_size)  

print(len(sampled_paths))

200


In [65]:
# Encode the sampled audio features 
import torchaudio
from tqdm import tqdm
import torch
import numpy as np
import librosa

def preemphasis(signal, coeff=0.97):
    return np.append(signal[0], signal[1:] - coeff*signal[:-1])

model_pipelines = {
    "hubert_base": torchaudio.pipelines.HUBERT_BASE,
    "hubert_large": torchaudio.pipelines.HUBERT_LARGE,
    "hubert_xlarge": torchaudio.pipelines.HUBERT_XLARGE,
    "wavlm_base": torchaudio.pipelines.WAVLM_BASE,
    "wavlm_large": torchaudio.pipelines.WAVLM_LARGE,
    "wavlm_base_plus": torchaudio.pipelines.WAVLM_BASE_PLUS,
}

if model_name != "mfcc":
    bundle = model_pipelines.get(model_name, torchaudio.pipelines.HUBERT_BASE)
    model = bundle.get_model().cuda()
    model.eval()

encodings = {}
for wav_path in tqdm(sampled_paths, desc="Encoding Audio Features"):
    if model_name != "mfcc":
        out_dir = Path("features/") / model_name / str(layer)
        wav, sr = torchaudio.load(wav_path)
        wav = torchaudio.functional.resample(wav, sr, 16000).cuda()

        with torch.inference_mode():
            encoding, _ = model.extract_features(wav, num_layers=layer)

        encoding = encoding[layer-1].squeeze().cpu().numpy()
    else:
        out_dir = Path("features/") / model_name 
        wav, sr = librosa.core.load(wav_path, sr=None)
        wav = preemphasis(wav, coeff=0.97)

        mfcc = librosa.feature.mfcc(
            y=wav, sr=sr, n_mfcc=13, n_mels=24, 
            n_fft=int(np.floor(0.025*sr)),
            hop_length=int(np.floor(0.01*sr)), 
            fmin=64, fmax=8000
        )
        mfcc_delta = librosa.feature.delta(mfcc)
        mfcc_delta_delta = librosa.feature.delta(mfcc_delta)
        encoding = np.hstack([mfcc.T, mfcc_delta.T, mfcc_delta_delta.T])

    if out_dir:
        out_dir.mkdir(parents=True, exist_ok=True)
        output_path = Path(out_dir) / f"{wav_path.stem}.npy"
        np.save(output_path, encoding)
    encodings[wav_path.stem] = encoding
print(f"Stored Encodings in {str(out_dir)}")


Encoding Audio Features: 100%|██████████| 200/200 [00:07<00:00, 26.74it/s]

Stored Encodings in features/wavlm_base/8





In [66]:
# Cut encodings
def get_frame_num(timestamp: float, sample_rate: int, frame_size_ms: int)->int:
    hop_size = frame_size_ms/1000 * sample_rate
    hop_size = np.max([hop_size, 1])
    return int((timestamp * sample_rate) / hop_size)

out_dir = Path("output/codes/kmeans")
align_dir = Path("data/all_alignments")

if out_dir and model_name != "mfcc":
    out_dir = out_dir / model_name / str(layer)
    out_dir.mkdir(parents=True, exist_ok=True)
else:
    out_dir = out_dir / model_name 
    out_dir.mkdir(parents=True, exist_ok=True)

align_paths = list(align_dir.rglob("*.list"))

cut_encodings = {}
filenames = {}
features = []
index = 0
for path in tqdm(encodings, desc="Cutting Encodings"):
    alignment_file = [a for a in align_paths if a.stem == path]
    if not alignment_file:
        continue
    else:
        alignment_file = alignment_file[0]

    with open(str(alignment_file), "r") as f:
        bounds = [get_frame_num(float(line.strip()), 16000, 20) for line in f]
    
    cut_encoding = encodings[path][0: bounds[0]]
    words = [cut_encoding]
    for i in range(len(bounds)-1): 
        cut_encoding = encodings[path][bounds[i]: bounds[i+1]]
        features.append(cut_encoding)
        words.append(cut_encoding)
        filenames[index] = f"{path}_{i}"
        index += 1
    cut_encodings[path] = words


Cutting Encodings:   0%|          | 0/200 [00:00<?, ?it/s]

Cutting Encodings: 100%|██████████| 200/200 [00:00<00:00, 921.92it/s]


In [67]:
# Converting the dictionaries to arrays and getting the index for each of the words in the dataset
dict_ind = {}

index = 0
for path in cut_encodings:
    dict_ind[path] = []
    for i in range(len(cut_encodings[path])):
        
        dict_ind[path].append(index)
        index += 1


In [8]:
# DTW
from sklearn.preprocessing import StandardScaler
from joblib import Parallel, delayed

def dtw_sweep_min(query_seq, search_seq, n_step=3):
    """
    Return the minimum DTW cost as `query_seq` is swept across `search_seq`.

    Step size can be specified with `n_step`.
    """

    from cython_dtw import _dtw
    dtw_cost_func = _dtw.multivariate_dtw_cost_cosine

    i_start = 0
    n_query = query_seq.shape[0]
    n_search = search_seq.shape[0]
    min_cost = np.inf

    while i_start <= n_search - n_query or i_start == 0:
        cost = dtw_cost_func(
            query_seq, search_seq[i_start:i_start + n_query], True
        )
        i_start += n_step
        if cost < min_cost:
            min_cost = cost

    return min_cost

def dtw(features):
    tensor_features = [torch.from_numpy(f) for f in features]
    stacked_features = torch.cat(tensor_features, dim=0)
    normalized_features = []

    scaler = StandardScaler()
    scaler.fit(stacked_features) 
    normalized_features = []
    for feature in tqdm(features, desc="Normalizing Features"):
        normalized_features.append(torch.from_numpy(scaler.transform(feature))) 
    
    num_features = len(normalized_features)
    norm_distance_mat = np.zeros((num_features, num_features))
    normalized_features = [f.cpu().numpy().astype(np.float64) for f in normalized_features]

    for i in tqdm(range(num_features), desc="Calculating Distances"):
        dists_i = Parallel(n_jobs=8)(
            delayed(dtw_sweep_min)(normalized_features[i], normalized_features[j])
            for j in range(i + 1, num_features)
        )

        for j, dist in zip(range(i + 1, num_features), dists_i):
            norm_distance_mat[i, j] = dist
            norm_distance_mat[j, i] = dist  
            
    return norm_distance_mat

In [68]:
features = [np.ascontiguousarray(f) for f in features]
dist_mat_dtw = dtw(features)

Normalizing Features: 100%|██████████| 4519/4519 [00:00<00:00, 7574.07it/s]
Calculating Distances:   2%|▏         | 88/4519 [00:47<39:43,  1.86it/s] 


KeyboardInterrupt: 

In [59]:
out_dir = Path(f"output/mat/{model_name}/{layer}/{sample_size}")
out_dir.mkdir(parents=True, exist_ok=True)

np.save(out_dir/"dist_mat_dtw.npy", dist_mat_dtw)

In [60]:
print(dist_mat_dtw[0:5, 0:15])
print(dist_mat_dtw.shape)

[[0.         0.77427764 0.62808591 0.82940389 0.60451699 0.51741716
  0.65911661 0.57893458 0.63977625 0.52445605 0.86401942 0.53740024
  0.53804338 0.51582226 0.79152627]
 [0.77427764 0.         0.47698115 0.4889294  0.45792736 0.48271909
  0.48642485 0.47579694 0.46931661 0.46506085 0.53923493 0.49407092
  0.48735931 0.48461273 0.47290599]
 [0.62808591 0.47698115 0.         0.71116497 0.4877622  0.46296147
  0.52707392 0.49795275 0.51340117 0.485275   0.79112062 0.45228296
  0.49596127 0.42601075 0.67789985]
 [0.82940389 0.4889294  0.71116497 0.         0.46298309 0.49831581
  0.50299457 0.51038167 0.46485221 0.47627475 0.55977701 0.48228696
  0.48960585 0.45335487 0.47485141]
 [0.60451699 0.45792736 0.4877622  0.46298309 0.         0.40703904
  0.58386042 0.44892431 0.50620716 0.49410235 0.7856648  0.49195152
  0.46742061 0.46823171 0.69137271]]
(2150, 2150)


In [27]:
# Clustering algorithm 

def cluster(dist_mat, distance_threshold):
    num_nodes = dist_mat.shape[0]
    graph = {i: set() for i in range(num_nodes)}

    for i in range(num_nodes - 1): 
        for j in range(i + 1, num_nodes):  
            if dist_mat[i, j] < distance_threshold:
                graph[i].add(j)
                graph[j].add(i)  


    clusters = []
    visited = set()

    def bfs(start_node):
        """ Traverse a cluster using BFS """
        queue = [start_node]
        cluster = []
        
        while queue:
            node = queue.pop(0)
            if node in visited:
                continue 
            visited.add(node)
            cluster.append(node)
            queue.extend(graph[node])  

        return cluster

    for node in range(num_nodes):
        if node not in visited:
            new_cluster = bfs(node)
            clusters.append(new_cluster)

    return clusters

In [35]:
# Evaluating clustering on dist_mat_dtw
dist_mat_dtw_round = np.round(dist_mat_dtw[0:10, 0:10]*10)
print(dist_mat_dtw_round)

distance_thresh = 5

num_nodes = dist_mat_dtw_round.shape[0]
graph = {i: set() for i in range(num_nodes)}

for i in range(num_nodes - 1): 
    for j in range(i + 1, num_nodes):  
        if dist_mat_dtw_round[i, j] < distance_thresh:
            graph[i].add(j)
            graph[j].add(i)  

for i in range(num_nodes-1):
    print(graph[i])


clusters = []
visited = set()


[[ 0.  3.  4.  4.  4.  6.  4.  4.  6.  6.]
 [ 3.  0.  7.  3.  5.  8.  4.  3.  8.  7.]
 [ 4.  7.  0.  4.  4.  7.  4.  5.  6.  5.]
 [ 4.  3.  4.  0.  3.  9.  4.  4.  9.  6.]
 [ 4.  5.  4.  3.  0.  9.  5.  5.  9.  9.]
 [ 6.  8.  7.  9.  9.  0.  0.  3.  6.  4.]
 [ 4.  4.  4.  4.  5.  0.  0.  4. 10.  7.]
 [ 4.  3.  5.  4.  5.  3.  4.  0.  8.  7.]
 [ 6.  8.  6.  9.  9.  6. 10.  8.  0.  2.]
 [ 6.  7.  5.  6.  9.  4.  7.  7.  2.  0.]]
{1, 2, 3, 4, 6, 7}
{0, 3, 6, 7}
{0, 3, 4, 6}
{0, 1, 2, 4, 6, 7}
{0, 2, 3}
{9, 6, 7}
{0, 1, 2, 3, 5, 7}
{0, 1, 3, 5, 6}
{9}


In [None]:
# True words Dictionary

def parse_text_to_dict(file):
    with open(file, "r", encoding="utf-8") as f:
        lines = f.readlines()

    data_dict = {}
    current_id = None
    word_dict = {}

    for line in lines: 
        line = line.strip()

        if not line: 
            continue
        
        if line.endswith(":") and not line.split(":")[0].isdigit():
            if current_id is not None:
                data_dict[current_id] = word_dict
            
            current_id = line[:-1]
            word_dict = {}
        else:
            parts = line.split(": ")
            if len(parts) == 2:
                index, word = parts
                word_dict[int(index)] = word.strip()
            else:
                parts = parts[0].split(":")
                index = parts[0]
                word_dict[int(index)] = " "
            
            if current_id is not None:
                data_dict[current_id] = word_dict
        
    return data_dict

true_words_dict = parse_text_to_dict("data/words_and_indices.txt")

path_dict = {}
for path in dict_ind:
    path_dict[dict_ind[path][0]] = ''
    for i in range(1, len(dict_ind[path])):
        path_dict[dict_ind[path][i]] = true_words_dict[path][i-1]

In [None]:
# Cluster and WordUnit classes
from collections import defaultdict

class Cluster:
    def __init__(self,id, word_dict=None, true_words=None):
        self.id = id
        self.length = len(word_dict) if word_dict else 0
        self.word_dict = word_dict if word_dict is not None else []
        self.true_word_dict = true_words if true_words is not None else []
    
    def add_word_unit(self, id, index, file):
        word_unit = WordUnit(file, index, id)
        self.length += 1
        self.word_dict.append(word_unit)

    def add_true_word(self, word):
        self.true_word_dict.append(word)

    @classmethod
    def print_cluster(self, cluster):
        print(f"Cluster {cluster.id}")
        for word in cluster.word_dict:
            print(f"Word {word.id}: Index {word.index} in File {word.file}")
    
    def cluster_purity(self):

        word_counts = {}
        for word in self.true_word_dict:
            word_counts[word] = word_counts.get(word, 0) + 1

        max_count = max(word_counts.values()) if word_counts else 0
        cluster_purity = max_count / self.length if self.length > 0 else 0

        self.purity = cluster_purity

    @classmethod
    def duplicate_clusters(self, clusters):
        cluster_dict = defaultdict(int)

        for cluster in clusters:
            cluster_set = frozenset(cluster)  
            cluster_dict[cluster_set] += 1  

        duplicate_count = sum(1 for count in cluster_dict.values() if count > 1)

        return duplicate_count

class WordUnit:
    def __init__(self, file, index, id):
        self.index = int(index)
        self.file = file
        self.id = int(id)
        self.start_time = None
        self.end_time = None

    def add_word_boundaries(self, start_time, end_time):
        self.start_time = start_time
        self.end_time = end_time
    

In [62]:
clusters_dtw = cluster(dist_mat_dtw, 0.25)
clusters = []
for i, clust in enumerate(clusters_dtw):
    new_cluster = Cluster(id=i)
    for w in range(len(clust)):
        filename_parts = filenames[clust[w]].split("_")
        filename = filename_parts[0]
        word_index = int(filename_parts[1])                     
        new_cluster.add_word_unit(w, word_index, filename)
    clusters.append(new_cluster)

for c in clusters:
    for word_unit in c.word_dict:
        word = true_words_dict[word_unit.file][word_unit.index]
        c.add_true_word(word)
    
    if len(c.word_dict) > 0:  
        print(f"Cluster {c.id}")                
        print(c.true_word_dict)
        print()


Cluster 0
['delighted']

Cluster 1
['to', 'to', 'to', 'to', 'to', 'to', 'to', 'to', 'to', 'to', 'to', 'to', 'to', 'to', 'to', 'to', 'to', 'to', 'to', 'to', 'to', 'to', 'to', 'to', 'to', ' ', 'to', 'to', 'to', 'to', 'to', 'to', 'to', 'to', 'to', 'to', 'to', 'to', 'to', 'to', 'to', 'to', 'to', 'to', 'honest', 'no', 'does', 'portugal', 'pleasing', 'is', 'two', 'into', ' ', 'on', 'no', 'no', 'no', 'as', 'is', 'as', 'two', 'too', 'two', 'too', 'two', 'into', 'into', 'into', 'into', 'into', ' ', ' ', 'was', 'an', 'girls', 'kent', ' ', ' ', 'as', ' ', 'is', 'night', ' ', 'as', ' ', 'is', 'liquid', ' ', 'and', ' ', 'dinners', ' ', 'as', 'at', 'and', 'at', 'night', 'guard', 'wines', 'in', ' ', 'of', ' ', 'agreeable', 'on', 'on', 'know', 'is', 'as', 'is', 'is', 'is', 'is', 'is', 'is', 'as', 'two', 'into', 'into', 'into', 'into', ' ', ' ', ' ', 'was', 'was', 'was', ' ', 'was', 'was', 'were', 'was', ' ', 'an', 'as', 'as', 'as', 'as', 'introduced', 'events', 'error', 'without', 'any', 'until', 'aft