In [1]:
# Randomly sample files to read
import random
from pathlib import Path

in_dir = Path("data/dev-clean")
sample_size = 20

wav_paths = list(in_dir.rglob("*.flac"))
wav_paths = [
    Path("data/dev-clean/174/50561/174-50561-0005.flac"),
    Path("data/dev-clean/174/50561/174-50561-0013.flac"),
    Path("data/dev-clean/3081/166546/3081-166546-0003.flac"),
             ]
wav_paths = [Path("data/dev-clean/3000/15664/3000-15664-0017.flac")]

if sample_size < len(wav_paths):
    sampled_paths = random.sample(wav_paths, sample_size)
else:
    sampled_paths = wav_paths  

print(len(sampled_paths))

1


In [2]:
from torchaudio.functional import resample
import torch
import torchaudio
from tqdm import tqdm

hubert = torch.hub.load(
    "bshall/hubert:main",
    "hubert_discrete",
    trust_repo=True,
)

acoustic_units = {}
for wav_path in tqdm(sampled_paths, desc="Encoding Audio Features"):
    wav, sr = torchaudio.load(wav_path)
    wav = resample(wav, sr, 16000)
    wav = wav.unsqueeze(0)

    with torch.inference_mode():
        units = hubert.units(wav)
    acoustic_units[wav_path.stem] = units.numpy()

Using cache found in /Users/daneladendorff/.cache/torch/hub/bshall_hubert_main
Encoding Audio Features: 100%|██████████| 1/1 [00:00<00:00,  2.01it/s]


In [None]:
import numpy as np

layer = 7
hubert, encode = torch.hub.load(
    "bshall/dusted:main", "hubert", language="english", trust_repo=True
)

encodings = {}
for wav_path in tqdm(sampled_paths, desc="Encoding Audio Features"):
    wav, sr = torchaudio.load(wav_path)
    wav = resample(wav, sr, 16000)
    wav = wav.unsqueeze(0)
    x = encode(hubert, wav, layer)
    encodings[wav_path.stem] = x


Using cache found in /Users/daneladendorff/.cache/torch/hub/bshall_dusted_main
Using cache found in /Users/daneladendorff/.cache/torch/hub/bshall_hubert_main
Encoding Audio Features: 100%|██████████| 1/1 [00:00<00:00,  2.38it/s]


In [39]:
align_dir = Path("data/all_alignments")
align_paths = list(align_dir.rglob("*.list"))

def get_frame_num(timestamp, sample_rate, frame_size_ms):
    hop = frame_size_ms/1000 * sample_rate
    hop_size = np.max([hop, 1])
    return int((timestamp * sample_rate) / hop_size)

filenames = {}
cut_encodings = {}
index = 1
for path in tqdm(encodings, desc="Cutting Encodings"):
    alignment_file = [a for a in align_paths if a.stem == path]
    if not alignment_file:
        continue
    else:
        alignment_file = alignment_file[0]

    with open(str(alignment_file), "r") as f:
        bounds = [get_frame_num(float(line.strip()), 16000, 20) for line in f]
    
    cut_x = encodings[path][0:bounds[0]]
    cuttings = [cut_x]
    filenames[0] = f"{path}_{0}"
    
    for i in range(len(bounds)-1): 
        cut_x = encodings[path][bounds[i]: bounds[i+1]]
        print(cut_x.shape)
        cuttings.append(cut_x)
        filenames[index] = f"{path}_{i+1}"
        index += 1
    cut_encodings[path] = cuttings

Cutting Encodings: 100%|██████████| 1/1 [00:00<00:00, 483.55it/s]

torch.Size([0, 704, 768])
torch.Size([0, 704, 768])
torch.Size([0, 704, 768])
torch.Size([0, 704, 768])
torch.Size([0, 704, 768])
torch.Size([0, 704, 768])
torch.Size([0, 704, 768])
torch.Size([0, 704, 768])
torch.Size([0, 704, 768])
torch.Size([0, 704, 768])
torch.Size([0, 704, 768])
torch.Size([0, 704, 768])
torch.Size([0, 704, 768])
torch.Size([0, 704, 768])
torch.Size([0, 704, 768])
torch.Size([0, 704, 768])
torch.Size([0, 704, 768])
torch.Size([0, 704, 768])
torch.Size([0, 704, 768])
torch.Size([0, 704, 768])
torch.Size([0, 704, 768])
torch.Size([0, 704, 768])
torch.Size([0, 704, 768])
torch.Size([0, 704, 768])
torch.Size([0, 704, 768])
torch.Size([0, 704, 768])
torch.Size([0, 704, 768])
torch.Size([0, 704, 768])
torch.Size([0, 704, 768])
torch.Size([0, 704, 768])
torch.Size([0, 704, 768])
torch.Size([0, 704, 768])
torch.Size([0, 704, 768])
torch.Size([0, 704, 768])
torch.Size([0, 704, 768])
torch.Size([0, 704, 768])
torch.Size([0, 704, 768])
torch.Size([0, 704, 768])





In [38]:
kmeans, segment = torch.hub.load(
    "bshall/dusted:main", "kmeans", language="english", trust_repo=True
)

gamma = 0.001

dusted_units = []
for path in cut_encodings:
    words = []
    for word in cut_encodings[path]:
        sequence = word.squeeze(0)
        print(sequence)
        codes, boundaries = segment(sequence, kmeans.cluster_centers_, gamma)
        words.append(codes)
    dusted_units.append(words)



Using cache found in /Users/daneladendorff/.cache/torch/hub/bshall_dusted_main


tensor([[-0.0857,  0.0068, -0.0018,  ..., -0.1621,  0.0876,  0.0083],
        [-0.0787,  0.0457, -0.0113,  ..., -0.1585,  0.0800, -0.0450],
        [-0.0756,  0.0604, -0.0455,  ..., -0.1574,  0.0271, -0.1214],
        ...,
        [ 0.0888,  0.1911, -0.0035,  ..., -0.0237,  0.0802,  0.0904],
        [-0.1122,  0.0095,  0.0040,  ..., -0.0382,  0.1739,  0.2278],
        [-0.1573, -0.0210,  0.0655,  ..., -0.0874,  0.1755,  0.1598]])
tensor([], size=(0, 704, 768))


ValueError: XA must be a 2-dimensional array.

In [None]:
import numpy as np 

align_dir = Path("data/all_alignments")
align_paths = list(align_dir.rglob("*.list"))

def get_frame_num(timestamp, sample_rate, frame_size_ms):
    hop = frame_size_ms/1000 * sample_rate
    hop_size = np.max([hop, 1])
    return int((timestamp * sample_rate) / hop_size)

filenames = {}
cut_units = {}
index = 1
for path in tqdm(acoustic_units, desc="Cutting Encodings"):
    alignment_file = [a for a in align_paths if a.stem == path]
    if not alignment_file:
        continue
    else:
        alignment_file = alignment_file[0]

    with open(str(alignment_file), "r") as f:
        bounds = [get_frame_num(float(line.strip()), 16000, 20) for line in f]
    
    cut_unit = acoustic_units[path][0:bounds[0]]
    cuttings = [cut_unit]
    filenames[0] = f"{path}_{0}"
    
    for i in range(len(bounds)-1): 
        cut_unit = acoustic_units[path][bounds[i]: bounds[i+1]]
        cuttings.append(cut_unit)
        filenames[index] = f"{path}_{i+1}"
        index += 1
    cut_units[path] = cuttings


Cutting Encodings: 100%|██████████| 1/1 [00:00<00:00, 500.99it/s]


In [7]:

for w in filenames:
    parts = filenames[w].split("_")
    name = parts[0]
    i = int(parts[1])
    print(f"{filenames[w]}: {cut_units[name][i]}")
    break

3000-15664-0017_0: [ 6  6  6  6  6 44 44 44 96 96 96 96 96 96 96 96 22 22 22]


In [132]:
for path in cut_units:
    print(path)
    for word in cut_units[path]:
        print(word)
        print()
        break
    

3000-15664-0017
[ 6  6  6  6  6 44 44 44 96 96 96 96 96 96 96 96 22 22 22]



In [9]:
just_words = []

index = 0
for path in cut_units:
    for i in range(len(cut_units[path])):
        just_words.append(cut_units[path][i])

num_words = len(just_words)
print(num_words)

39


In [37]:
from joblib import Parallel, delayed

def edit_distance(seq1, seq2):
    """
    Compute the edit distance between two sequences using dynamic programming.
    """
    N, M = len(seq1), len(seq2)
    dp = np.zeros((N + 1, M + 1))
    for i in range(N + 1):
        dp[i, 0] = i
    for j in range(M + 1):
        dp[0, j] = j
    for i in range(1, N + 1):
        for j in range(1, M + 1):
            cost = 0 if seq1[i - 1] == seq2[j - 1] else 1
            dp[i, j] = min(dp[i - 1, j] + 1, dp[i, j - 1] + 1, dp[i - 1, j - 1] + cost)
    return dp[N, M] 

def calculate_distance(just_words, num_words):
    dist_mat = np.zeros((num_words, num_words))

    for i in tqdm(range(num_words), desc="Calculating Distances"):
        js = [j for j in range(i + 1, num_words)]
        dists_i = Parallel(n_jobs=8)(
            delayed(edit_distance)(just_words[i], just_words[j]) for j in js
        )

        for j, dist in zip(js, dists_i):
            dist_mat[i, j] = dist
            dist_mat[j, i] = dist  
    
    return dist_mat

In [134]:
dist_mat = calculate_distance(just_words, num_words)
print(dist_mat[0:5, 0:5])

Calculating Distances: 100%|██████████| 39/39 [00:02<00:00, 18.89it/s]

[[ 0. 19. 19. 19. 25.]
 [19.  0. 17.  6. 23.]
 [19. 17.  0. 16. 25.]
 [19.  6. 16.  0. 25.]
 [25. 23. 25. 25.  0.]]





In [135]:

import editdistance
import numpy as np

dist_mat = np.zeros((num_words, num_words))

for u in range(num_words):
    for v in range(u + 1, num_words):
        dist_mat[u, v] = editdistance.eval(just_words[u], just_words[v])

print(dist_mat[0:5, 0:5])

[[ 0. 19. 19. 19. 25.]
 [ 0.  0. 17.  6. 23.]
 [ 0.  0.  0. 16. 25.]
 [ 0.  0.  0.  0. 25.]
 [ 0.  0.  0.  0.  0.]]


In [39]:
# Clustering algorithm 
def cluster(dist_mat, distance_threshold):
    num_nodes = dist_mat.shape[0]
    graph = {i: set() for i in range(num_nodes)}

    for i in range(num_nodes - 1): 
        for j in range(i + 1, num_nodes):  
            if dist_mat[i, j] < distance_threshold:
                graph[i].add(j)
                graph[j].add(i)  


    clusters = []
    visited = set()

    def bfs(start_node):
        """ Traverse a cluster using BFS """
        queue = [start_node]
        cluster = []
        
        while queue:
            node = queue.pop(0)
            if node in visited:
                continue 
            visited.add(node)
            cluster.append(node)
            queue.extend(graph[node])  

        return cluster

    for node in range(num_nodes):
        if node not in visited:
            new_cluster = bfs(node)
            clusters.append(new_cluster)

    return clusters

In [137]:
au_clusters = cluster(dist_mat, 20)
print(len(au_clusters))

11


In [44]:
# Get true word dict
def parse_text_to_dict(file):
    with open(file, "r", encoding="utf-8") as f:
        lines = f.readlines()

    data_dict = {}
    current_id = None
    word_dict = {}

    for line in lines: 
        line = line.strip()

        if not line: 
            continue
        
        if line.endswith(":") and not line.split(":")[0].isdigit():
            if current_id is not None:
                data_dict[current_id] = word_dict
            
            current_id = line[:-1]
            word_dict = {}
        else:
            parts = line.split(": ")
            if len(parts) == 2:
                index, word = parts
                word_dict[int(index)] = word.strip()
            else:
                parts = parts[0].split(":")
                index = parts[0]
                word_dict[int(index)] = " "
            
            if current_id is not None:
                data_dict[current_id] = word_dict
        
    return data_dict

true_words_dict = parse_text_to_dict("data/words_and_indices.txt")

In [43]:
# Cluster and WordUnit classes
from collections import defaultdict

class Cluster:
    def __init__(self,id, word_dict=None, true_words=None):
        self.id = id
        self.length = len(word_dict) if word_dict else 0
        self.word_dict = word_dict if word_dict is not None else []
        self.true_word_dict = true_words if true_words is not None else []
    
    def add_word_unit(self, id, index, file):
        word_unit = WordUnit(file, index, id)
        self.length += 1
        self.word_dict.append(word_unit)

    def add_true_word(self, word):
        self.true_word_dict.append(word)

    @classmethod
    def print_cluster(self, cluster):
        print(f"Cluster {cluster.id}")
        for word in cluster.word_dict:
            print(f"Word {word.id}: Index {word.index} in File {word.file}")
    
    def cluster_purity(self):

        word_counts = {}
        for word in self.true_word_dict:
            word_counts[word] = word_counts.get(word, 0) + 1

        max_count = max(word_counts.values()) if word_counts else 0
        cluster_purity = max_count / self.length if self.length > 0 else 0

        self.purity = cluster_purity

    @classmethod
    def duplicate_clusters(self, clusters):
        cluster_dict = defaultdict(int)

        for cluster in clusters:
            cluster_set = frozenset(cluster)  
            cluster_dict[cluster_set] += 1  

        duplicate_count = sum(1 for count in cluster_dict.values() if count > 1)

        return duplicate_count

class WordUnit:
    def __init__(self, file, index, id):
        self.index = int(index)
        self.file = file
        self.id = int(id)
        self.start_time = None
        self.end_time = None

    def add_word_boundaries(self, start_time, end_time):
        self.start_time = start_time
        self.end_time = end_time
    

In [143]:
au_clusters = cluster(dist_mat, 15)
clusters = []
for i, c in enumerate(au_clusters):
    new_c = Cluster(i)
    for w in range(len(c)):
        if c[w] in filenames:
            filename_parts = filenames[c[w]].split("_")
            filename = filename_parts[0]
            word_index = int(filename_parts[1])        
            new_c.add_word_unit(w, word_index, filename)
    clusters.append(new_c)

for c in clusters:
    # print(c.id)
    for word_unit in c.word_dict:
        # if len(c.word_dict) > 1:
        #     print(word_unit.file, word_unit.index)
        if word_unit.index == 0:
            word = ''
        else:
            word = true_words_dict[word_unit.file][word_unit.index-1]
        
        c.add_true_word(word)
    print()
    
    if len(c.word_dict) > 1:  
        print(f"Cluster {c.id}: ", end="")                
        print(", ".join(c.true_word_dict))
        print()




Cluster 1: it,  ,  , feet,  ,  ,  ,  , at, an,  , sea, in, and, five, feet, ice, above, nine, four



















