In [1]:
import pandas as pd
from encode import get_units
from pathlib import Path

# Toggle for the extraction step: `get_units` (bottom of this cell) is only
# called when this is True; otherwise the cell just defines configuration.
get = False

# Alignment CSV and audio locations.
align_dir = Path("data/alignments/dev-clean/")
align_path = align_dir / "alignments.csv"
audio_dir = Path("data/dev-clean")
# NOTE(review): `audio_ext` is not referenced by any cell visible here.
audio_ext = ".flac"

# The alignment table is read unconditionally, even when `get` is False.
align_df = pd.read_csv(align_path)
# `gamma` and `layer` are forwarded to `get_units`; `gamma` is also embedded in
# the feature/output directory names built by later cells.
gamma = 0.2
layer = 7
save_dir = Path("ladies/features")
# Single utterance processed by this notebook.
paths = [Path("data/dev-clean/174/50561/174-50561-0005.flac")]

if get:
    get_units(paths, align_df, audio_dir, gamma, layer, save_dir)

In [13]:
from dist import get_features, get_batch_of_paths, cal_dist_per_pair
from tqdm import tqdm
import numpy as np

# Directory that receives the per-chunk row/col/value arrays written below.
out_dir = Path(f"ladies/output/{gamma}/temp/")
out_dir.mkdir(parents=True, exist_ok=True)
# When True, the pairwise-distance computation further down is skipped.
preloaded = False
# Passed to `get_batch_of_paths` and used to derive the number of batches.
chunk_limit = 500000


def process_batch(batch, features):
    """Return the `cal_dist_per_pair` result for every (i, j) pair in `batch`.

    Pairs are handled one after another, in the order `batch` yields them.
    `features` is indexed with `i` and `j` to fetch the two items compared.
    """
    results = []
    for i, j in batch:
        pair_input = ((i, j), (features[i], features[j]))
        results.append(cal_dist_per_pair(pair_input))
    return results


if not preloaded:
    # `sorted` drains this lazy directory walk, ordering the files by the
    # integer that follows the last underscore in each file stem.
    paths = Path(f"ladies/features/{gamma}").rglob("**/*.npy")
    sorted_paths = sorted(
        paths, key=lambda npy_path: int(npy_path.stem.split("_")[-1])
    )
    sample_size = len(sorted_paths)

    features = get_features(sorted_paths)

    # Number of unordered pairs, and the ceiling of num_pairs / chunk_limit.
    num_pairs = sample_size * (sample_size - 1) // 2
    num_batches = (num_pairs + chunk_limit - 1) // chunk_limit

    print(f"num_samples: {sample_size}")
    print(f"num_pairs: {num_pairs}")

    # Buffers for the chunk currently being filled.
    rows, cols, vals = [], [], []
    chunk_idx = 0

    batch_progress = tqdm(
        get_batch_of_paths(sample_size, chunk_limit),
        total=num_batches,
        unit="batch",
        mininterval=10.0,
        desc="Processing Batches",
    )
    for batch in batch_progress:
        # Pairs are processed sequentially, one distance per (i, j).
        for i, j in batch:
            i, j, dist = cal_dist_per_pair(((i, j), (features[i], features[j])))
            rows.append(i)
            cols.append(j)
            vals.append(dist)

        # Flush this batch to its own trio of files, keyed by chunk index.
        np.save(out_dir / f"temp_rows_{chunk_idx}.npy", rows)
        np.save(out_dir / f"temp_cols_{chunk_idx}.npy", cols)
        np.save(out_dir / f"temp_vals_{chunk_idx}.npy", vals)

        # Start the next chunk with empty buffers.
        rows, cols, vals = [], [], []
        chunk_idx += 1

Appending Features: 100%|██████████| 18/18 [00:00<00:00, 10357.73it/s]


num_samples: 18
num_pairs: 153


Processing Batches: 100%|██████████| 1/1 [00:00<00:00, 936.02batch/s]


In [42]:
from cluster import build_graph_from_temp
import pickle

use_preloaded_graph = False
temp_dir = Path(f"ladies/output/{gamma}/temp")
temp_dir.mkdir(parents=True, exist_ok=True)  # Ensure the directory exists

# Derive the chunk count from the files actually on disk instead of hard-coding
# it: the distance cell writes one `temp_vals_<k>.npy` per batch, so a fixed
# value of 1 silently drops every chunk after the first on larger inputs.
# NOTE: chunk files left over from an earlier, larger run are counted as well;
# clear `temp_dir` before recomputing distances on a smaller dataset.
num_chunks = len(list(temp_dir.glob("temp_vals_*.npy")))

graph_path = Path(f"ladies/output/{gamma}/graph.pkl")

if use_preloaded_graph and graph_path.exists():
    # NOTE: pickle.load executes arbitrary code from the file; only load
    # graph files produced by this notebook.
    with open(graph_path, "rb") as f:
        g = pickle.load(f)
    print(f"Loaded precomputed graph from {graph_path}")
else:
    if num_chunks == 0:
        raise FileNotFoundError(
            f"No temp_vals_*.npy chunks found in {temp_dir}; "
            "run the pairwise-distance cell first."
        )
    g = build_graph_from_temp(temp_dir, num_chunks, threshold=0.7)
    g.write_pickle(str(graph_path))
    print(f"Graph built and saved to {graph_path}")

Calculating total: 100%|██████████| 1/1 [00:00<00:00, 4084.04it/s]


total_size: 153, sample_size: 18


Getting Temp Info: 100%|██████████| 1/1 [00:00<00:00, 1312.77it/s]

Graph built and saved to ladies/output/0.2/graph.pkl





In [None]:
from cluster import adaptive_res_search

# Target number of clusters handed to the resolution search.
num_clusters = 18
best_res, best_partition = adaptive_res_search(g, num_clusters)

# Convert best_partition to a DataFrame: the position in `membership` is used
# as the node id, the value at that position as the cluster assignment.
best_partition_df = pd.DataFrame(
    {
        "node": range(len(best_partition.membership)),  # Node IDs
        "cluster": best_partition.membership,  # Cluster assignments
    }
)

# Save to CSV next to the other artefacts of this run. The previous target,
# `output/{gamma}/`, lacked the `ladies/` prefix every other cell uses and was
# never created, so `to_csv` failed unless that directory already existed.
partition_dir = Path(f"ladies/output/{gamma}")
partition_dir.mkdir(parents=True, exist_ok=True)
best_partition_df.to_csv(
    partition_dir / f"partition_r{round(best_res, 3)}.csv", index=False
)

Iteration 1: res=0.210000, Cluster difference=3
Iteration 2: res=0.209000, Cluster difference=3
Iteration 3: res=0.208145, Cluster difference=3
Iteration 4: res=0.207414, Cluster difference=3
Iteration 5: res=0.206789, Cluster difference=3
Iteration 6: res=0.206255, Cluster difference=3
Iteration 7: res=0.205798, Cluster difference=3
Iteration 8: res=0.205407, Cluster difference=3
Iteration 9: res=0.205073, Cluster difference=3
Iteration 10: res=0.204787, Cluster difference=3
Iteration 11: res=0.204543, Cluster difference=3
Iteration 12: res=0.204334, Cluster difference=3
Iteration 13: res=0.204156, Cluster difference=3
Iteration 14: res=0.204003, Cluster difference=3
Iteration 15: res=0.203873, Cluster difference=3
Iteration 16: res=0.203761, Cluster difference=3
Iteration 17: res=0.203666, Cluster difference=3
Iteration 18: res=0.203584, Cluster difference=3
Iteration 19: res=0.203515, Cluster difference=3
Iteration 20: res=0.203455, Cluster difference=3
Iteration 21: res=0.203404, C

In [67]:
# Number of distinct cluster ids assigned by the chosen partition.
actual_clusters = best_partition_df["cluster"].nunique()
# Absolute gap between the clusters obtained and the requested target.
diff = abs(actual_clusters - num_clusters)

print(f"Best resolution found: {best_res:.3f} with cluster difference: {diff}")

Best resolution found: 0.210 with cluster difference: 3


In [68]:
def _read_phones_and_texts_cache(cache_path):
    """Read the cached CSV and return ``(phones, texts)`` in the canonical shape."""
    df = pd.read_csv(cache_path)
    texts = df["text"].tolist()
    phones = df["phones"].apply(lambda x: tuple(x.split(",")))
    return phones, texts


def get_phones_and_texts(gamma, align_dir):
    """Return the phone sequence and text of every feature file for `gamma`.

    Results are cached in ``ladies/features/{gamma}/texts_and_phones.csv``.
    On a cache miss the values are looked up in ``align_dir / "alignments.csv"``
    for every ``*.npy`` file below ``ladies/features/{gamma}``, ordered by the
    integer after the last underscore of the file stem, and the cache is
    written.

    Both the cache-hit and the cache-miss path return their values through the
    same cache reader, so callers always receive the same types. Previously a
    cache miss returned a plain list of strings (and ``"nan"`` texts) while a
    cache hit returned a Series of tuples (and NaN texts), which made
    downstream code behave differently on the first run.

    Args:
        gamma: Value embedded in the feature directory name.
        align_dir: Directory (``Path``) containing ``alignments.csv``.

    Returns:
        ``(phones, texts)`` where ``phones`` is a pandas Series whose entries
        are tuples produced by splitting the cached phone string on ``","``
        (the cache stores phones space-joined, so each tuple normally has one
        element), and ``texts`` is a list of the cached ``text`` column.
    """
    cache_path = Path(f"ladies/features/{gamma}/texts_and_phones.csv")

    if cache_path.exists():
        phones, texts = _read_phones_and_texts_cache(cache_path)
        print(f"Loaded texts from {cache_path}")
        return phones, texts

    paths = sorted(
        Path(f"ladies/features/{gamma}").rglob("**/*.npy"),
        key=lambda x: int(x.stem.split("_")[-1]),
    )
    align_df = pd.read_csv(align_dir / "alignments.csv")

    texts = []
    phones = []

    for path in tqdm(paths, desc="Appending Text and Phones"):
        # File stems look like "<filename>_<word_id>".
        filename_parts = path.stem.split("_")
        wav_df = align_df[align_df["filename"] == filename_parts[0]]
        word_df = wav_df[wav_df["word_id"] == int(filename_parts[1])]
        texts.append(str(word_df["text"].iloc[0]))
        # Alignment phones are comma-separated; the cache stores them
        # space-separated.
        word_phones = word_df["phones"].iloc[0].split(",")
        word_phones = " ".join(word_phones)
        phones.append(word_phones)

    df = pd.DataFrame({"text": texts, "phones": phones})
    df.to_csv(cache_path, index=False)
    print(f"Saved texts to {cache_path}")

    # Re-read what was just written so a cache miss yields exactly the same
    # types and values as every later cache hit.
    return _read_phones_and_texts_cache(cache_path)


# Uses `gamma` and `align_dir` from the configuration cell at the top of the notebook.
phones, texts = get_phones_and_texts(gamma, align_dir)


Loaded texts from ladies/features/0.2/texts_and_phones.csv


In [69]:
# Inspect the raw phone entries before they are split into per-phone tuples.
print(phones)

0                 (sil,)
1         (L EY1 D IY0,)
2                  (sp,)
3         (L EY1 D IY0,)
4                  (sp,)
5               (M AY1,)
6             (R OW1 Z,)
7             (W AY1 T,)
8         (L EY1 D IY0,)
9                  (sp,)
10            (B AH1 T,)
11            (W IH1 L,)
12              (Y UW1,)
13            (N AA1 T,)
14           (HH IY1 R,)
15                (AH0,)
16    (R AW1 N D AH0 L,)
17        (L EY1 D IY0,)
Name: phones, dtype: object


In [70]:
# Build one (node index, phone tuple, text) record per word, in `phones` order.
tuple_phones = []
for node_idx, word_phones in enumerate(phones):
    # `phones` entries are either a 1-tuple wrapping the space-joined phone
    # string (values loaded from the CSV cache) or the bare space-joined string
    # (values freshly computed). Indexing a bare string with [0] would keep
    # only its first character, so accept both shapes explicitly.
    joined_phones = word_phones if isinstance(word_phones, str) else word_phones[0]
    # The loop variable is deliberately not called `id`: at notebook scope that
    # would shadow the builtin for every later cell.
    tuple_phones.append((node_idx, tuple(joined_phones.split(" ")), texts[node_idx]))

# The raw phone container is no longer needed once the records are built.
del phones

In [71]:
# Inspect the (node index, phone tuple, text) records built in the previous cell.
print(tuple_phones)

[(0, ('sil',), nan), (1, ('L', 'EY1', 'D', 'IY0'), 'lady'), (2, ('sp',), nan), (3, ('L', 'EY1', 'D', 'IY0'), 'lady'), (4, ('sp',), nan), (5, ('M', 'AY1'), 'my'), (6, ('R', 'OW1', 'Z'), 'rose'), (7, ('W', 'AY1', 'T'), 'white'), (8, ('L', 'EY1', 'D', 'IY0'), 'lady'), (9, ('sp',), nan), (10, ('B', 'AH1', 'T'), 'but'), (11, ('W', 'IH1', 'L'), 'will'), (12, ('Y', 'UW1'), 'you'), (13, ('N', 'AA1', 'T'), 'not'), (14, ('HH', 'IY1', 'R'), 'hear'), (15, ('AH0',), 'a'), (16, ('R', 'AW1', 'N', 'D', 'AH0', 'L'), 'roundel'), (17, ('L', 'EY1', 'D', 'IY0'), 'lady')]


In [72]:
# Index the phone records by node id once, so each partition row is resolved
# with a dict lookup instead of a scan over all of `tuple_phones`.
# `setdefault` keeps the first record for a node id, matching a first-match scan.
phones_by_node = {}
for node, phone, word in tuple_phones:
    phones_by_node.setdefault(node, (phone, word))

cluster_tuples = []
seen_nodes = set()  # To track nodes we've already added

for node_id, cluster in zip(best_partition_df["node"], best_partition_df["cluster"]):
    # Skip repeated node ids and nodes that have no phone record.
    if node_id in seen_nodes or node_id not in phones_by_node:
        continue
    phone, word = phones_by_node[node_id]
    cluster_tuples.append((cluster, phone, word))
    seen_nodes.add(node_id)  # Mark this node as added


In [75]:
import itertools
import editdistance
import re

def _normalise_phones(phone_seq):
    """Drop "sil" entries and strip the digits 0/1/2 from the remaining phones."""
    return tuple(re.sub(r"[012]", "", phn) for phn in phone_seq if phn != "sil")


# groupby only merges adjacent items, so order the records by cluster id first.
sorted_clusters = sorted(cluster_tuples, key=lambda entry: entry[0])
distances = []
for idx, group in itertools.groupby(sorted_clusters, key=lambda entry: entry[0]):
    members = list(group)

    # Only clusters with at least two members contribute pairs.
    if len(members) >= 2:
        print(f"idx: {idx}")
        for p, q in itertools.combinations(members, 2):
            # NOTE(review): only "sil" is filtered; "sp" entries are kept as
            # ordinary phones - confirm this is intended.
            p_1 = _normalise_phones(p[1])
            q_1 = _normalise_phones(q[1])

            # Edit distance normalised by the longer sequence; a pair of two
            # empty sequences is scored as 1.0.
            longest = max(len(p_1), len(q_1))
            if longest > 0:
                d = float(editdistance.eval(p_1, q_1)) / longest
            else:
                d = 1.0

            print(f"{'-'.join(p_1)}, {'-'.join(q_1)}: dist {d}")
            distances.append(d)
        print()

idx: 0
L-EY-D-IY, L-EY-D-IY: dist 0.0

idx: 1
W-IH-L, R-AW-N-D-AH-L: dist 0.8333333333333334

idx: 2
L-EY-D-IY, L-EY-D-IY: dist 0.0



In [74]:
import statistics

# Mean normalised edit distance over all within-cluster pairs.
if distances:
    print(f"NED: {statistics.mean(distances)}")
else:
    # statistics.mean raises StatisticsError on an empty sequence, which
    # happens when no cluster contains two or more members.
    print("NED: undefined (no cluster contains at least two members)")

NED: 0.2777777777777778
