In [1]:
from pathlib import Path
import pandas as pd
from encode import sample_files
from distance import get_batch_of_paths
from tqdm import tqdm
import scipy.sparse as sp
import editdistance
import numpy as np
import igraph as ig
import leidenalg as la
import math
from multiprocessing import Pool

In [13]:
audio_dir = Path("data/dev-clean")
audio_ext = ".flac"
align_path = Path("data/alignments/dev-clean/alignments.csv")
save_dir = Path("features/")
wav_dir = Path("data/dev-clean")
feat_dir = Path("features/0.1")

align_df = pd.read_csv(align_path)

In [28]:
paths, sample_size = sample_files(
    audio_dir=audio_dir, audio_ext=audio_ext, sample_size=-1
)
# paths = [Path("data/dev-clean/174/50561/174-50561-0005.flac")]
sample_size = len(paths)
print(sample_size)

2703


In [10]:
gamma = 0.1
paths = (p for p in Path(f"features/{gamma}").rglob("*.npy"))

sorted_paths = sorted(paths, key=lambda x: int(x.stem.split("_")[-1]))
sample_size = len(sorted_paths)
print(sample_size)

63221


In [16]:
import torch
import torchaudio
from webrtcvad import Vad
from encode import mark_sil

kmeans, segment = torch.hub.load(
    "bshall/dusted:main", "kmeans", language="english", trust_repo=True
)
hubert, encode = torch.hub.load(
    "bshall/dusted:main", "hubert", language="english", trust_repo=True
)
vad = Vad()

Using cache found in /home/danel/.cache/torch/hub/bshall_dusted_main
Using cache found in /home/danel/.cache/torch/hub/bshall_dusted_main
Using cache found in /home/danel/.cache/torch/hub/bshall_hubert_main


In [17]:
def get_frame_num(timestamp: float, sample_rate: int, frame_size_ms: int) -> int:
    hop = frame_size_ms / 1000 * sample_rate
    hop_size = np.max([hop, 1])
    return int((timestamp * sample_rate) / hop_size)


for path in tqdm(paths, desc="Getting Units"):
    wav_df = align_df[align_df["filename"] == path.stem]

    wav, sr = torchaudio.load(str(path))
    wav = torchaudio.functional.resample(wav, sr, 16000)
    flags = mark_sil(vad, wav)
    wav = wav.unsqueeze(0)

    encoding = encode(hubert, wav, 7)
    encoding = encoding.squeeze(0)
    gamma = 0.1

    for w in range(max(wav_df["word_id"])):
        word_df = wav_df[wav_df["word_id"] == w]
        word_boundaries = [word_df["word_start"].iloc[0], word_df["word_end"].iloc[0]]

        start_frame = get_frame_num(word_boundaries[0], 16000, 20)
        end_frame = get_frame_num(word_boundaries[1], 16000, 20)

        cut_encoding = encoding[start_frame:end_frame]
        cut_flags = flags[start_frame:end_frame]
        clean_encoding = []
        for i in range(min(cut_encoding.shape[0], len(flags))):
            if cut_flags[i]:
                clean_encoding.append(cut_encoding[i, :].unsqueeze(0))

        if clean_encoding != []:
            clean_encoding = torch.cat(clean_encoding, dim=0)

        codes = []
        if clean_encoding != []:
            codes, _ = segment(
                clean_encoding.numpy(), kmeans.cluster_centers_, gamma=gamma
            )

            save_path = (
                Path("features")
                / str(gamma)
                / path.relative_to(wav_dir)
                / f"{path.stem}_{w}.npy"
            )
            save_path.parent.mkdir(parents=True, exist_ok=True)

            np.save(save_path, codes)


Getting Units:   0%|          | 6/2703 [00:04<32:28,  1.38it/s]  


KeyboardInterrupt: 

In [None]:
def cal_dist_per_pair(pair):
    """
    Calculates the normalized edit distance for a given pair of feature sequences.

    Args:
        chunk_pair (dict): Dictionary with a single key-value pair where:
            - Key: Tuple (i, j) representing the indices of the feature pair.
            - Value: Tuple (feature_i, feature_j) containing the feature sequences.

    Returns:
        tuple: (index_i, index_j, normalized edit distance).
    """

    id_1, id_2 = tuple(pair.keys())[0]
    feature_1, feature_2 = tuple(pair.values())[0]

    max_length = np.max([len(feature_1), len(feature_2)])
    min_length = np.min([len(feature_1), len(feature_2)])
    # print(f"max len {max_length}")

    dist = 0
    if min_length == 0:
        # print(f"{id_1, id_2}\n{feature_1}\n{feature_2}\nDistance: {1.0}\n")
        return (id_1, id_2, 1.0)

    if max_length > 0:
        dist = editdistance.eval(feature_1, feature_2) / max_length

    # if dist < 0.5:
    # print(f"{id_1, id_2}\n{feature_1}\n{feature_2}\nDistance: {dist}\n")
    return (id_1, id_2, dist)

In [None]:
# Process chunks
gamma = 0.1
paths = list(Path(f"features/{gamma}").rglob("*.npy"))
sample_size = len(paths)
sorted_paths = sorted(paths, key=lambda x: int(x.stem.split("_")[-1]))

filenames = []
features = []
# for feature in tqdm(paths, desc="Process Filenames"):
for path in tqdm(sorted_paths, desc="Appending Features"):
    word_id = path.stem.split("_")[1]
    filenames.append(path.stem)
    feature = np.load(path)
    features.append(feature)


chunk_limit = 5000000
num_pairs = sample_size * (sample_size - 1) // 2
num_chunks = (num_pairs + chunk_limit - 1) // chunk_limit

row_indices = []
col_indices = []
values = []

print(f"num_pairs: {num_pairs}")
print(f"num_chunks: {num_chunks}")
print(f"num_samples: {sample_size}")

for chunk in tqdm(
    get_batch_of_paths(sample_size, chunk_limit=chunk_limit),
    total=num_chunks,
    desc="Processing Chunks",
    unit="chunk",
):
    chunk_units = [{(i, j): (features[i], features[j])} for i, j in chunk]
    chunk_results = []
    with Pool(6) as pool:
        chunk_results = pool.map(cal_dist_per_pair, chunk_units)

    for i, j, dist in chunk_results:
        row_indices.append(i)
        col_indices.append(j)
        values.append(dist)

    dist_sparse = sp.coo_matrix(
        (values, (row_indices, col_indices)), shape=(sample_size, sample_size)
    )

Appending Features: 100%|██████████| 63137/63137 [00:16<00:00, 3887.30it/s]


num_pairs: 1993108816
num_chunks: 399
num_samples: 63137


Processing Chunks:   0%|          | 0/399 [00:00<?, ?chunk/s]Process SpawnPoolWorker-7:
Traceback (most recent call last):
  File "/Users/daneladendorff/.pyenv/versions/3.12.8/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/daneladendorff/.pyenv/versions/3.12.8/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/daneladendorff/.pyenv/versions/3.12.8/lib/python3.12/multiprocessing/pool.py", line 114, in worker
    task = get()
           ^^^^^
  File "/Users/daneladendorff/.pyenv/versions/3.12.8/lib/python3.12/multiprocessing/queues.py", line 389, in get
    return _ForkingPickler.loads(res)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: Can't get attribute 'cal_dist_per_pair' on <module '__main__' (<class '_frozen_importlib.BuiltinImporter'>)>
Process SpawnPoolWorker-12:
Traceback (most recent call last):
  File "/Users/daneladendorff/.pyenv/versions/3.12.8/lib

KeyboardInterrupt: 

In [2]:
chunk_idx = 400
gamma = 0.1
elements = ["rows", "cols", "vals"]
first_chunk = np.load(f"output/{gamma}/temp/temp_rows_0.npy")
dtype = first_chunk.dtype
total_size = sum(
    np.load(f"output/{gamma}/temp/temp_rows_{i}.npy").shape[0]
    for i in tqdm(range(chunk_idx), desc="Calculating total")
)
print(total_size)

Calculating total:   0%|          | 0/400 [00:00<?, ?it/s]

Calculating total: 100%|██████████| 400/400 [00:27<00:00, 14.36it/s]

1998415810





In [None]:
for element in ["rows"]:
    file_path = f"output/{gamma}/{element}.npy"

    # Preallocate memory-mapped file
    merged_data = np.memmap(file_path, dtype=dtype, mode="w+", shape=(total_size,))

    index = 0
    for i in tqdm(range(chunk_idx), desc=f"Writing {element} to disk"):
        temp_data = np.load(f"output/{gamma}/temp/temp_{element}_{i}.npy")
        merged_data[index : index + temp_data.shape[0]] = temp_data
        if i >= 399:
            print(f"{index}:{index + temp_data.shape[0]}")
        index += temp_data.shape[0]

    merged_data.flush()
    del merged_data


In [3]:
gamma = 0.1
paths = (p for p in Path(f"features/{gamma}").rglob("*.npy"))
align_df = pd.read_csv("data/alignments/dev-clean/alignments.csv")
sorted_paths = sorted(paths, key=lambda x: int(x.stem.split("_")[-1]))

text = []
for path in tqdm(sorted_paths, desc="Appending Text"):
    filename_parts = path.stem.split("_")
    wav_df = align_df[align_df["filename"] == filename_parts[0]]
    word_df = wav_df[wav_df["word_id"] == int(filename_parts[1])]
    text.append(str(word_df["text"].iloc[0]))

Appending Text: 100%|██████████| 63221/63221 [03:39<00:00, 288.59it/s]


In [5]:
gamma = 0.1
temp_dir = Path(f"output/{gamma}/temp")


def get_total_size(temp_dir, total_chunks):
    total_size = sum(
        np.load(temp_dir / f"temp_rows_{i}.npy").shape[0]
        for i in tqdm(range(total_chunks), desc="Calculating total")
    )
    return total_size


def get_n(length_list):
    return int(1 + np.sqrt(1 + 8 * length_list)) // 2


total_size = get_total_size(temp_dir, 400)
sample_size = get_n(total_size)
print(f"total_size: {total_size}, sample_size: {sample_size}")


Calculating total: 100%|██████████| 400/400 [00:02<00:00, 145.70it/s]

total_size: 1998415810, sample_size: 63221





In [12]:
g = ig.Graph()
g.add_vertices(sample_size)

chunks = 400
for i in tqdm(range(chunks), desc="Getting Temp Info"):
    temp_rows = np.load(temp_dir / f"temp_rows_{i}.npy")
    temp_cols = np.load(temp_dir / f"temp_cols_{i}.npy")
    temp_vals = np.load(temp_dir / f"temp_vals_{i}.npy")

    mask = temp_vals < 0.4
    filtered_rows = temp_rows[mask]
    filtered_cols = temp_cols[mask]
    filtered_vals = temp_vals[mask]

    # Convert edges and weights to lists
    edges = list(zip(map(int, filtered_rows), map(int, filtered_cols)))
    weights = list(map(float, filtered_vals))

    # Add edges if they exist
    if edges:
        g.add_edges(edges)

        # Assign weights only to newly added edges
        if weights:
            g.es[-len(weights) :].set_attribute_values("weight", weights)


Getting Temp Info: 100%|██████████| 400/400 [02:29<00:00,  2.67it/s]


In [None]:
num_clusters = 13967
for e in tqdm(g.es, desc="Filtering Weights"):
    if e["weight"] <= 0:
        e["weight"] = 1e-10


Filtering Weights: 100%|██████████| 5752547/5752547 [00:02<00:00, 2052098.24it/s]


In [22]:
g.summary


<bound method Graph.summary of <igraph.Graph object at 0x7c2cbbb85050>>

In [23]:
g.write_pickle(f"output/{gamma}/graph.pkl")

In [32]:
num_clusters = 13967  # Desired number of clusters

for res in tqdm([0.025, 0.026, 0.027, 0.028, 0.029], desc="Optimising Resolution"):
    partition = la.find_partition(
        g, la.CPMVertexPartition, weights="weight", resolution_parameter=res
    )
    actual_clusters = len(set(partition.membership))
    print(f"Resolution {res}: {actual_clusters} clusters")


Optimising Resolution:  20%|██        | 1/5 [00:17<01:11, 17.75s/it]

Resolution 0.025: 13800 clusters


Optimising Resolution:  40%|████      | 2/5 [00:35<00:52, 17.61s/it]

Resolution 0.026: 13908 clusters


Optimising Resolution:  60%|██████    | 3/5 [00:53<00:35, 17.91s/it]

Resolution 0.027: 13902 clusters


Optimising Resolution:  80%|████████  | 4/5 [01:10<00:17, 17.72s/it]

Resolution 0.028: 14075 clusters


Optimising Resolution: 100%|██████████| 5/5 [01:29<00:00, 17.80s/it]

Resolution 0.029: 14135 clusters





In [None]:
num_clusters = 13967
partition = la.find_partition(
    g, la.CPMVertexPartition, weights="weight", resolution_parameter=0.0277
)
actual_clusters = len(set(partition.membership))
print(f"Cluster difference: {abs(actual_clusters - num_clusters)}")

Cluster difference: -3


In [83]:
# Dictionary to store cluster assignments
clusters = {i: [] for i in set(partition.membership)}

# Assign nodes to clusters
for node, cluster_id in enumerate(partition.membership):
    clusters[cluster_id].append(node)  # Append node index to the respective cluster

cluster_transcriptions = []
for cluster_id, words in clusters.items():
    for w in words:
        cluster_transcriptions.append((cluster_id, text[w]))

In [None]:
from collections import defaultdict

# Dictionary to store all text per cluster
cluster_texts = defaultdict(list)

# Group all text by cluster_id
for cluster_id, txt in cluster_transcriptions:
    cluster_texts[cluster_id].append(txt)

# Print all texts in each cluster
for cluster_id, texts in cluster_texts.items():
    if len(texts) > 1:
        print(f"Cluster {cluster_id}: {' | '.join(texts)}\n")


Cluster 0: the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | this | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | a | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | the | 

In [None]:
import itertools
import statistics


def distance(p, q):
    """Compute normalized edit distance between two strings."""
    length = max(len(p), len(q))
    return (
        editdistance.eval(p, q) / length if length > 0 else 1
    )  # Avoid division by zero


def ned(discovered):
    """Compute the normalized edit distance (NED) within each cluster."""
    if not discovered:
        return 0

    discovered = sorted(discovered, key=lambda x: x[0])

    distances = []
    for _, group in itertools.groupby(discovered, key=lambda x: x[0]):
        group_list = list(group)

        if len(group_list) < 2:
            continue

        for p, q in itertools.combinations(group_list, 2):
            d = distance(p[1], q[1])
            distances.append(d)

    return (
        statistics.mean(distances) if distances else 0
    )  # Ensure no division by empty list


print(f"NED: {ned(cluster_transcriptions)}")

NED: 0.07928736711516866


In [None]:
gamma = 0.1
rows = np.memmap(f"output/{gamma}/rows.npy", mode="r", shape=(total_size,))
cols = np.memmap(f"output/{gamma}/cols.npy", mode="r", shape=(total_size,))
vals = np.memmap(f"output/{gamma}/vals.npy", mode="r", shape=(total_size,))


In [5]:
def get_n(length_list):
    return int(1 + np.sqrt(1 + 8 * length_list)) // 2


print(f"sample_size: {get_n(total_size)}")
sample_size = get_n(total_size)

sample_size: 63221


In [6]:
g = ig.Graph()
g.add_vertices(sample_size)

In [None]:
# Create a boolean mask for values < 0.4
mask = vals < 0.4

# Directly create edges as tuples (node1, node2, weight)
edges_with_weights = np.column_stack((rows[mask], cols[mask], vals[mask]))

# Convert to list of tuples for NetworkX
edges_with_weights = [
    tuple(edge) for edge in tqdm(edges_with_weights, desc="Creating Edges with Weights")
]

# Add all edges to the graph in one batch operation
g.add_edges_from(edges_with_weights, weight="weight")

In [12]:
def num_vertices_from_upper_triangular(num_points):
    return int((1 + math.sqrt(1 + 8 * num_points)) / 2)


# Get the number of vertices
num_data_points = len(dist_sparse.data)
sample_size = num_vertices_from_upper_triangular(num_data_points)
print(f"sample_size: {sample_size}")
# Initialize graph
g = ig.Graph()
g.add_vertices(sample_size)

# Extract rows, cols, and values from COO format (efficiently)
rows, cols, vals = dist_sparse.row, dist_sparse.col, dist_sparse.data

# Filter edges and weights efficiently
filtered_edges = [
    (r, c)
    for r, c, v in tqdm(
        zip(rows, cols, vals), total=len(dist_sparse.data), desc="Filtering edges"
    )
    if v > 0 and v < 0.4
]
filtered_weights = [v for v in vals if v > 0 and v < 0.4]

print(len(filtered_edges))

# Add edges with progress tracking
for idx, edge in tqdm(
    enumerate(filtered_edges),
    desc="Adding edges",
    total=len(filtered_edges),
    mininterval=1.0,
):
    g.add_edge(*edge, weight=filtered_weights[idx])

# Add weights with progress tracking
for idx, weight in tqdm(
    enumerate(filtered_weights), desc="Assigning weights", total=len(filtered_weights)
):
    g.es[idx]["weight"] = weight


sample_size: 5609


Filtering edges: 100%|██████████| 15727680/15727680 [00:03<00:00, 3987707.49it/s]


351


Adding edges: 100%|██████████| 351/351 [00:00<00:00, 39821.50it/s]
Assigning weights: 100%|██████████| 351/351 [00:00<00:00, 1111094.87it/s]


In [None]:
print(len(g.vs))
communities = g.community_edge_betweenness()
communities = communities.as_clustering()


18


In [39]:
partition = la.find_partition(g, la.ModularityVertexPartition)
print(partition)

Clustering with 18 elements and 14 clusters
[ 0] 1, 3, 8, 17
[ 1] 2, 4
[ 2] 0
[ 3] 5
[ 4] 6
[ 5] 7
[ 6] 9
[ 7] 10
[ 8] 11
[ 9] 12
[10] 13
[11] 14
[12] 15
[13] 16
