In [4]:
from pathlib import Path
import pandas as pd
from encode import sample_files
from distance import get_batch_of_paths
from tqdm import tqdm
import scipy.sparse as sp
import editdistance
import numpy as np
import igraph as ig
import leidenalg as la
import math
from multiprocessing import Pool

In [14]:
audio_dir = Path("data/dev-clean")
audio_ext = ".flac"
align_path = Path("data/alignments/dev-clean/alignments.csv")
save_dir = Path("features/")
wav_dir = Path("data/dev-clean")
feat_dir = Path("features/0.2")

align_df = pd.read_csv(align_path)

In [15]:
paths, sample_size = sample_files(
    audio_dir=audio_dir, audio_ext=audio_ext, sample_size=-1
)
# paths = [Path("data/dev-clean/174/50561/174-50561-0005.flac")]
sample_size = len(paths)
print(sample_size)

2703


In [16]:
import torch
import torchaudio
from webrtcvad import Vad
from encode import mark_sil

kmeans, segment = torch.hub.load(
    "bshall/dusted:main", "kmeans", language="english", trust_repo=True
)
hubert, encode = torch.hub.load(
    "bshall/dusted:main", "hubert", language="english", trust_repo=True
)
vad = Vad()

Using cache found in /home/danel/.cache/torch/hub/bshall_dusted_main
Using cache found in /home/danel/.cache/torch/hub/bshall_dusted_main
Using cache found in /home/danel/.cache/torch/hub/bshall_hubert_main


In [17]:
def get_frame_num(timestamp: float, sample_rate: int, frame_size_ms: int) -> int:
    hop = frame_size_ms / 1000 * sample_rate
    hop_size = np.max([hop, 1])
    return int((timestamp * sample_rate) / hop_size)


for path in tqdm(paths, desc="Getting Units"):
    wav_df = align_df[align_df["filename"] == path.stem]

    wav, sr = torchaudio.load(str(path))
    wav = torchaudio.functional.resample(wav, sr, 16000)
    flags = mark_sil(vad, wav)
    wav = wav.unsqueeze(0)

    encoding = encode(hubert, wav, 7)
    encoding = encoding.squeeze(0)
    gamma = 0.1

    for w in range(max(wav_df["word_id"])):
        word_df = wav_df[wav_df["word_id"] == w]
        word_boundaries = [word_df["word_start"].iloc[0], word_df["word_end"].iloc[0]]

        start_frame = get_frame_num(word_boundaries[0], 16000, 20)
        end_frame = get_frame_num(word_boundaries[1], 16000, 20)

        cut_encoding = encoding[start_frame:end_frame]
        cut_flags = flags[start_frame:end_frame]
        clean_encoding = []
        for i in range(min(cut_encoding.shape[0], len(flags))):
            if cut_flags[i]:
                clean_encoding.append(cut_encoding[i, :].unsqueeze(0))

        if clean_encoding != []:
            clean_encoding = torch.cat(clean_encoding, dim=0)

        codes = []
        if clean_encoding != []:
            codes, _ = segment(
                clean_encoding.numpy(), kmeans.cluster_centers_, gamma=gamma
            )

            save_path = (
                Path("features")
                / str(gamma)
                / path.relative_to(wav_dir)
                / f"{path.stem}_{w}.npy"
            )
            save_path.parent.mkdir(parents=True, exist_ok=True)

            np.save(save_path, codes)


Getting Units:   0%|          | 6/2703 [00:04<32:28,  1.38it/s]  


KeyboardInterrupt: 

In [None]:
def cal_dist_per_pair(pair):
    """
    Calculates the normalized edit distance for a given pair of feature sequences.

    Args:
        chunk_pair (dict): Dictionary with a single key-value pair where:
            - Key: Tuple (i, j) representing the indices of the feature pair.
            - Value: Tuple (feature_i, feature_j) containing the feature sequences.

    Returns:
        tuple: (index_i, index_j, normalized edit distance).
    """

    id_1, id_2 = tuple(pair.keys())[0]
    feature_1, feature_2 = tuple(pair.values())[0]

    max_length = np.max([len(feature_1), len(feature_2)])
    min_length = np.min([len(feature_1), len(feature_2)])
    # print(f"max len {max_length}")

    dist = 0
    if min_length == 0:
        # print(f"{id_1, id_2}\n{feature_1}\n{feature_2}\nDistance: {1.0}\n")
        return (id_1, id_2, 1.0)

    if max_length > 0:
        dist = editdistance.eval(feature_1, feature_2) / max_length

    # if dist < 0.5:
    # print(f"{id_1, id_2}\n{feature_1}\n{feature_2}\nDistance: {dist}\n")
    return (id_1, id_2, dist)

In [None]:
# Process chunks
gamma = 0.1
paths = list(Path(f"features/{gamma}").rglob("*.npy"))
sample_size = len(paths)
sorted_paths = sorted(paths, key=lambda x: int(x.stem.split("_")[-1]))

filenames = []
features = []
# for feature in tqdm(paths, desc="Process Filenames"):
for path in tqdm(sorted_paths, desc="Appending Features"):
    word_id = path.stem.split("_")[1]
    filenames.append(path.stem)
    feature = np.load(path)
    features.append(feature)


chunk_limit = 5000000
num_pairs = sample_size * (sample_size - 1) // 2
num_chunks = (num_pairs + chunk_limit - 1) // chunk_limit

row_indices = []
col_indices = []
values = []

print(f"num_pairs: {num_pairs}")
print(f"num_chunks: {num_chunks}")
print(f"num_samples: {sample_size}")

for chunk in tqdm(
    get_batch_of_paths(sample_size, chunk_limit=chunk_limit),
    total=num_chunks,
    desc="Processing Chunks",
    unit="chunk",
):
    chunk_units = [{(i, j): (features[i], features[j])} for i, j in chunk]
    chunk_results = []
    with Pool(6) as pool:
        chunk_results = pool.map(cal_dist_per_pair, chunk_units)

    for i, j, dist in chunk_results:
        row_indices.append(i)
        col_indices.append(j)
        values.append(dist)

    dist_sparse = sp.coo_matrix(
        (values, (row_indices, col_indices)), shape=(sample_size, sample_size)
    )

Appending Features: 100%|██████████| 63137/63137 [00:16<00:00, 3887.30it/s]


num_pairs: 1993108816
num_chunks: 399
num_samples: 63137


Processing Chunks:   0%|          | 0/399 [00:00<?, ?chunk/s]Process SpawnPoolWorker-7:
Traceback (most recent call last):
  File "/Users/daneladendorff/.pyenv/versions/3.12.8/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/daneladendorff/.pyenv/versions/3.12.8/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/daneladendorff/.pyenv/versions/3.12.8/lib/python3.12/multiprocessing/pool.py", line 114, in worker
    task = get()
           ^^^^^
  File "/Users/daneladendorff/.pyenv/versions/3.12.8/lib/python3.12/multiprocessing/queues.py", line 389, in get
    return _ForkingPickler.loads(res)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: Can't get attribute 'cal_dist_per_pair' on <module '__main__' (<class '_frozen_importlib.BuiltinImporter'>)>
Process SpawnPoolWorker-12:
Traceback (most recent call last):
  File "/Users/daneladendorff/.pyenv/versions/3.12.8/lib

KeyboardInterrupt: 

In [None]:
import numpy as np
from tqdm import tqdm
import os

chunk_idx = 400
gamma = 0.1

for element in ["rows", "cols", "vals"]:
    all = []
    for i in tqdm(range(chunk_idx), desc="Concatenating info"):
        all.append(np.load(f"output/{gamma}/temp_{element}_{i}.npy"))

    # Merge into single NumPy arrays
    np_all = np.concatenate(all)
    np.save(f"output/{gamma}/{element}.npy")
    del all, np_all

# cols = np.concatenate(all_cols)
# vals = np.concatenate(all_vals)

# # Convert to a compressed sparse format for efficient storage
# print("Saving sparse matrix...")
# if rows and cols and vals:
#     dist_sparse = sp.coo_matrix((vals, (rows, cols)), shape=(sample_size, sample_size))
#     sp.save_npz(f"output/{gamma}/sparse_dist_mat.npz", dist_sparse)


In [2]:
import numpy as np
from tqdm import tqdm
import os

chunk_idx = 400 * 2
gamma = 0.1
idx = 0
all_rows, all_cols, all_vals = [], [], []
for i in tqdm(range(1, chunk_idx, 2), desc="Concatenating info"):
    os.remove(f"output/{gamma}/temp_rows_{i}.npy")

    os.remove(f"output/{gamma}/temp_cols_{i}.npy")

    os.remove(f"output/{gamma}/temp_vals_{i}.npy")

    idx += 1

# # Merge into single NumPy arrays
# rows = np.concatenate(all_rows)
# cols = np.concatenate(all_cols)
# vals = np.concatenate(all_vals)

# # Convert to a compressed sparse format for efficient storage
# print("Saving sparse matrix...")
# if rows and cols and vals:
#     dist_sparse = sp.coo_matrix((vals, (rows, cols)), shape=(sample_size, sample_size))
#     sp.save_npz(f"output/{gamma}/sparse_dist_mat.npz", dist_sparse)


Concatenating info: 100%|██████████| 400/400 [03:56<00:00,  1.69it/s]


In [5]:
dist_sparse = sp.load_npz("output/0.2/sparse_dist_mat.npz")

In [12]:
def num_vertices_from_upper_triangular(num_points):
    return int((1 + math.sqrt(1 + 8 * num_points)) / 2)


# Get the number of vertices
num_data_points = len(dist_sparse.data)
sample_size = num_vertices_from_upper_triangular(num_data_points)
print(f"sample_size: {sample_size}")
# Initialize graph
g = ig.Graph()
g.add_vertices(sample_size)

# Extract rows, cols, and values from COO format (efficiently)
rows, cols, vals = dist_sparse.row, dist_sparse.col, dist_sparse.data

# Filter edges and weights efficiently
filtered_edges = [
    (r, c)
    for r, c, v in tqdm(
        zip(rows, cols, vals), total=len(dist_sparse.data), desc="Filtering edges"
    )
    if v > 0 and v < 0.4
]
filtered_weights = [v for v in vals if v > 0 and v < 0.4]

print(len(filtered_edges))

# Add edges with progress tracking
for idx, edge in tqdm(
    enumerate(filtered_edges),
    desc="Adding edges",
    total=len(filtered_edges),
    mininterval=1.0,
):
    g.add_edge(*edge, weight=filtered_weights[idx])

# Add weights with progress tracking
for idx, weight in tqdm(
    enumerate(filtered_weights), desc="Assigning weights", total=len(filtered_weights)
):
    g.es[idx]["weight"] = weight


sample_size: 5609


Filtering edges: 100%|██████████| 15727680/15727680 [00:03<00:00, 3987707.49it/s]


351


Adding edges: 100%|██████████| 351/351 [00:00<00:00, 39821.50it/s]
Assigning weights: 100%|██████████| 351/351 [00:00<00:00, 1111094.87it/s]


In [38]:
print(len(g.vs))
communities = g.community_edge_betweenness()
communities = communities.as_clustering()


18


In [39]:
partition = la.find_partition(g, la.ModularityVertexPartition)
print(partition)

Clustering with 18 elements and 14 clusters
[ 0] 1, 3, 8, 17
[ 1] 2, 4
[ 2] 0
[ 3] 5
[ 4] 6
[ 5] 7
[ 6] 9
[ 7] 10
[ 8] 11
[ 9] 12
[10] 13
[11] 14
[12] 15
[13] 16
