In [2]:
import pandas as pd
from encode import get_units
from pathlib import Path

# Locations of the alignment table and of the audio files it refers to.
align_dir = Path("data/alignments/dev-clean/")
align_path = align_dir / "alignments.csv"
audio_dir = Path("data/dev-clean")
# NOTE(review): audio_ext is not referenced again in the visible cells.
audio_ext = ".flac"

# Alignment table handed to get_units below.
align_df = pd.read_csv(align_path)
# gamma is also interpolated into the feature/output directory names used by
# the later cells (ladies/features/{gamma}, ladies/output/{gamma}).
gamma = 0.2
# NOTE(review): presumably the model layer the units are taken from — confirm in encode.get_units.
layer = 7
save_dir = Path("ladies/features")
# A single hard-coded utterance is processed here.
paths = [Path("data/dev-clean/174/50561/174-50561-0005.flac")]
get_units(paths, align_df, audio_dir, gamma, layer, save_dir)

Using cache found in /home/danel/.cache/torch/hub/bshall_dusted_main
Using cache found in /home/danel/.cache/torch/hub/bshall_dusted_main
Using cache found in /home/danel/.cache/torch/hub/bshall_hubert_main
Getting units: 100%|██████████| 1/1 [00:00<00:00,  3.83it/s]


In [8]:
from dist import get_features, get_batch_of_paths, cal_dist_per_pair
from tqdm import tqdm
import numpy as np

# Per-gamma scratch directory that receives the chunked rows/cols/vals arrays.
out_dir = Path(f"ladies/output/{gamma}/temp/")
out_dir.mkdir(parents=True, exist_ok=True)
# When True, the distance computation in this cell is skipped entirely.
preloaded = False
# Passed to get_batch_of_paths and used to derive the progress-bar total;
# presumably the maximum number of pairs per batch — confirm in dist.py.
chunk_limit = 500000


def process_batch(batch, features):
    """Return cal_dist_per_pair's result for every (i, j) pair in ``batch``.

    Each call receives ``((i, j), (features[i], features[j]))``. Pairs are
    evaluated one after another, in the order ``batch`` yields them.
    """
    results = []
    for i, j in batch:
        pair_input = ((i, j), (features[i], features[j]))
        results.append(cal_dist_per_pair(pair_input))
    return results


if not preloaded:
    # Gather every saved feature file for this gamma and order the files by
    # the trailing integer in their stem.
    paths = Path(f"ladies/features/{gamma}").rglob("**/*.npy")
    sorted_paths = list(paths)
    sorted_paths.sort(key=lambda x: int(x.stem.split("_")[-1]))
    sample_size = len(sorted_paths)

    features = get_features(sorted_paths)

    # n * (n - 1) / 2 unordered pairs, split into ceil(num_pairs / chunk_limit) batches.
    num_pairs = (sample_size - 1) * sample_size // 2
    num_batches = -(-num_pairs // chunk_limit)

    print(f"num_samples: {sample_size}")
    print(f"num_pairs: {num_pairs}")

    batches = tqdm(
        get_batch_of_paths(sample_size, chunk_limit),
        total=num_batches,
        unit="batch",
        mininterval=10.0,
        desc="Processing Batches",
    )

    rows, cols, vals = [], [], []
    chunk_idx = 0
    # Batches are processed sequentially; each one is flushed to its own trio
    # of .npy files so the buffers never hold more than one batch of results.
    for batch in batches:
        for i, j in batch:
            pair_input = ((i, j), (features[i], features[j]))
            i, j, dist = cal_dist_per_pair(pair_input)
            rows.append(i)
            cols.append(j)
            vals.append(dist)

        np.save(out_dir / f"temp_rows_{chunk_idx}.npy", rows)
        np.save(out_dir / f"temp_cols_{chunk_idx}.npy", cols)
        np.save(out_dir / f"temp_vals_{chunk_idx}.npy", vals)

        # Start the next chunk with empty buffers.
        rows, cols, vals = [], [], []
        chunk_idx += 1


Appending Features: 100%|██████████| 18/18 [00:00<00:00, 7851.23it/s]


num_samples: 18
num_pairs: 153



Processing Batches: 100%|██████████| 1/1 [00:00<00:00, 729.19batch/s]


In [21]:
from cluster import build_graph_from_temp, adaptive_res_search
import pickle

use_preloaded_graph = False
num_clusters = 15

# One per-gamma output directory shared by the temp chunks, the graph pickle
# and the partition CSVs, so that what this cell writes is what it later finds.
output_dir = Path(f"ladies/output/{gamma}")
temp_dir = output_dir / "temp"
temp_dir.mkdir(parents=True, exist_ok=True)  # Ensure the directory exists
# NOTE(review): hard-coded; must match the number of temp_*_{idx}.npy chunk
# files written by the distance cell, otherwise later chunks are ignored.
num_chunks = 1

graph_path = output_dir / "graph.pkl"

if use_preloaded_graph and graph_path.exists():
    # pickle.load executes arbitrary code from the file: only load graphs that
    # this notebook wrote itself.
    with open(graph_path, "rb") as f:
        g = pickle.load(f)
    print(f"Loaded precomputed graph from {graph_path}")
else:
    g = build_graph_from_temp(temp_dir, num_chunks)
    g.write_pickle(str(graph_path))
    print(f"Graph built and saved to {graph_path}")

partition_files = list(output_dir.glob("partition_r*.csv"))

if not partition_files:
    # No existing partitions found, run the search
    best_res, best_partition = adaptive_res_search(g, num_clusters)

    # Convert best_partition to a DataFrame
    best_partition_df = pd.DataFrame(
        {
            "node": range(len(best_partition.membership)),  # Node IDs
            "cluster": best_partition.membership,  # Cluster assignments
        }
    )

    # Save next to the graph, in the same directory the glob above searches.
    # (Previously written to "output/{gamma}/", which the glob never looked at,
    # so a saved partition was never reused.)
    partition_path = output_dir / f"partition_r{round(best_res, 3)}.csv"
    best_partition_df.to_csv(partition_path, index=False)
else:
    # Load existing partitions; the resolution is encoded in the file stem
    # after "_r" (e.g. partition_r0.02.csv -> 0.02).
    res_partitions = [
        (float(p.stem.split("_r")[1]), pd.read_csv(p)) for p in partition_files
    ]

    # Find the partition with the minimum resolution
    best_res, best_partition_df = min(res_partitions, key=lambda x: x[0])

# Ensure best_partition_df is used for further processing
actual_clusters = len(set(best_partition_df["cluster"]))
diff = abs(actual_clusters - num_clusters)

print(f"Best resolution found: {best_res:.3f} with cluster difference: {diff}")



Calculating total: 100%|██████████| 1/1 [00:00<00:00, 2427.26it/s]


total_size: 153, sample_size: 18




Getting Temp Info: 100%|██████████| 1/1 [00:00<00:00, 995.33it/s]

Graph built and saved to ladies/output/0.2/graph.pkl
Iteration 1: res=0.020000, Cluster difference=0
Best resolution found: 0.020 with cluster difference: 0





In [12]:
def _load_phones_and_texts_cache(cache_path):
    """Read the cached CSV and rebuild the ``(phones, texts)`` pair from it."""
    df = pd.read_csv(cache_path)
    texts = df["text"].tolist()
    # The "phones" column stores the string form of a tuple; splitting on commas
    # leaves quote/parenthesis residue on the pieces (clean_phones removes it).
    phones = df["phones"].apply(lambda x: tuple(x.split(",")))
    return phones, texts


def get_phones_and_texts(gamma, align_dir):
    """Return the phones and word text for every feature file of ``gamma``.

    Results are cached in ``ladies/features/{gamma}/texts_and_phones.csv``.
    Whether the cache already existed or was just written, the return value is
    produced by reading that CSV, so both paths yield the same types.

    Args:
        gamma: Value interpolated into the feature directory name.
        align_dir: ``Path`` of the directory containing ``alignments.csv``.

    Returns:
        ``(phones, texts)`` where ``phones`` is a pandas Series of tuples (the
        cached phone string split on commas) and ``texts`` is a list holding the
        cached ``text`` column.

    Raises:
        IndexError: If a feature file has no matching row in the alignments.
    """
    cache_path = Path(f"ladies/features/{gamma}/texts_and_phones.csv")

    if cache_path.exists():
        phones, texts = _load_phones_and_texts_cache(cache_path)
        print(f"Loaded texts from {cache_path}")
        return phones, texts

    # Same ordering as the distance cell: by the trailing integer of the stem.
    paths = sorted(
        Path(f"ladies/features/{gamma}").rglob("**/*.npy"),
        key=lambda x: int(x.stem.split("_")[-1]),
    )
    align_df = pd.read_csv(align_dir / "alignments.csv")

    texts = []
    phones = []

    for path in tqdm(paths, desc="Appending Text and Phones"):
        # Stems look like "<filename>_<word_id>_<index>".
        filename_parts = path.stem.split("_")
        filename, word_id = filename_parts[0], int(filename_parts[1])
        word_df = align_df[
            (align_df["filename"] == filename) & (align_df["word_id"] == word_id)
        ]
        if word_df.empty:
            raise IndexError(
                f"No alignment row for filename={filename!r}, word_id={word_id} ({path})"
            )
        texts.append(str(word_df["text"].iloc[0]))
        phones.append((str(word_df["phones"].iloc[0]),))

    df = pd.DataFrame({"text": texts, "phones": phones})
    df.to_csv(cache_path, index=False)
    print(f"Saved texts to {cache_path}")

    # Re-read the file just written instead of returning the in-memory lists:
    # a first run then returns exactly what every later (cached) run returns,
    # including the Series type whose .items() downstream code relies on.
    return _load_phones_and_texts_cache(cache_path)


In [51]:
import editdistance
import statistics
import itertools


def distance(p, q):
    """Return the edit distance between ``p`` and ``q`` divided by the longer length.

    When both sequences are empty there is nothing to normalise by, and the
    function returns 1 instead of dividing by zero.
    """
    longest = max(len(p), len(q))
    if longest <= 0:
        return 1
    return editdistance.eval(p, q) / longest


def ned(clusters):
    """Return the mean pairwise normalized edit distance within clusters.

    Args:
        clusters: Iterable of ``(cluster_id, sequence)`` pairs. Items are
            grouped by ``cluster_id`` and every unordered pair inside a group
            is scored with ``distance`` on the two sequences.

    Returns:
        The mean over all within-cluster pairs, or 0 when ``clusters`` is empty
        or no cluster has at least two members (there is nothing to average).
    """
    if not clusters:
        return 0

    # groupby only merges adjacent items, so order by cluster id first.
    ordered = sorted(clusters, key=lambda x: x[0])

    distances = []
    for _, group in itertools.groupby(ordered, key=lambda x: x[0]):
        # combinations() yields nothing for groups with fewer than two
        # members, so singleton clusters are skipped implicitly.
        for p, q in itertools.combinations(list(group), 2):
            distances.append(distance(p[1], q[1]))

    # statistics.mean raises on an empty list; mirror ned_per_cluster's fallback.
    return statistics.mean(distances) if distances else 0

In [60]:
# Load the phones and word texts for this gamma (read from the CSV cache when it exists).
phones, texts = get_phones_and_texts(gamma, align_dir)


def ned_per_cluster(cluster):
    """Return the mean ``distance`` over all unordered pairs of items in ``cluster``.

    Pairs in which either item is the silence marker ``"sil"`` or ``"sp"`` are
    left out. The result is 0 when ``cluster`` has fewer than two items or when
    no pair remains after that filtering.
    """
    if len(cluster) < 2:
        return 0

    silence = {"sil", "sp"}
    scores = [
        distance(p, q)
        for p, q in itertools.combinations(cluster, 2)
        if not (p in silence or q in silence)
    ]

    if not scores:
        return 0
    return statistics.mean(scores)


def clean_phones(phones):
    """Strip tuple-repr residue from each node's phone elements.

    Args:
        phones: Mapping-like object with ``.items()`` yielding
            ``(node_id, elements)`` pairs, where ``elements`` is an iterable of
            strings (for example a pandas Series of tuples, or a dict).

    Returns:
        A list of ``(node_id, tuple_of_clean_elements)`` pairs in iteration
        order. Elements that are only a parenthesis are dropped; the others
        lose any surrounding quotes, parentheses and whitespace.
    """
    # Whitespace is stripped together with quotes and parentheses: otherwise a
    # leading space (e.g. " 'B'") shields the quote behind it from strip().
    residue = "'() \t\r\n"

    cleaned = []
    for node_id, phone in phones.items():
        tokens = []
        for el in phone:
            if el.strip() in {"(", ")"}:
                continue
            tokens.append(el.strip(residue))
        cleaned.append((node_id, tuple(tokens)))
    return cleaned


# Replace the raw phone tuples by (node_id, cleaned_tuple) pairs.
phones = clean_phones(phones)

# Index the cleaned phones by node id once, instead of rescanning the whole
# list for every row of the partition.
phones_by_node = dict(phones)
phone_clusters = [
    (cluster, phones_by_node[node])
    for node, cluster in zip(best_partition_df["node"], best_partition_df["cluster"])
    if node in phones_by_node  # nodes without phones are skipped
]

# text_clusters = transcribe_clusters(best_partition_df, texts)  # if you want to print it
# NOTE(review): ned_per_cluster is applied to the phones of a single word here,
# i.e. it compares that word's individual phone symbols with each other —
# confirm this is the intended per-word figure.
# The loop variables are deliberately not named `phones`, so the cleaned list
# above is not overwritten.
for _cluster_id, word_phones in phone_clusters:
    print(" | ".join(word_phones))
    print(f"NED: {ned_per_cluster(word_phones)}\n")

Loaded texts from ladies/features/0.2/texts_and_phones.csv
sil
NED: 0

L | EY1 | D | IY0
NED: 0.9444444444444444

sp
NED: 0

L | EY1 | D | IY0
NED: 0.9444444444444444

sp
NED: 0

M | AY1
NED: 1.0

R | OW1 | Z
NED: 1.0

W | AY1 | T
NED: 1.0

L | EY1 | D | IY0
NED: 0.9444444444444444

sp
NED: 0

B | AH1 | T
NED: 1.0

W | IH1 | L
NED: 1.0

Y | UW1
NED: 1.0

N | AA1 | T
NED: 1.0

HH | IY1 | R
NED: 1.0

AH0
NED: 0

R | AW1 | N | D | AH0 | L
NED: 0.9777777777777777

L | EY1 | D | IY0
NED: 0.9444444444444444



In [None]:
from collections import defaultdict


def print_clusters(cluster_words, cluster_phones):
    """Print the words and the NED of every cluster with more than one word.

    Args:
        cluster_words: Iterable of ``(cluster_id, text)`` pairs.
        cluster_phones: Iterable of ``(cluster_id, phones)`` pairs, where
            ``phones`` is the phone sequence of one word.

    For each cluster holding more than one text, ``ned_per_cluster`` is called
    with that cluster's phone sequences; the cluster is printed when at least
    one of its texts is not ``"nan"``. The last line printed is the mean NED
    over those clusters, or 0 when no cluster has more than one text.
    """
    # Group the texts by cluster id.
    cluster_texts = defaultdict(list)
    for cluster_id, txt in cluster_words:
        cluster_texts[cluster_id].append(txt)

    # Group the phone sequences by cluster id as well. Keying by id (rather
    # than by position in a sorted list) keeps this correct when cluster ids
    # are not exactly 0..k-1, and only the phone sequences — not the
    # (cluster_id, phones) pairs — are kept for the distance computation.
    phones_by_cluster = defaultdict(list)
    for cluster_id, phone in cluster_phones:
        phones_by_cluster[cluster_id].append(phone)

    neds = []
    for cluster_id, texts in cluster_texts.items():
        if len(texts) <= 1:
            continue

        ned_val = ned_per_cluster(phones_by_cluster.get(cluster_id, []))
        neds.append(ned_val)

        # Missing texts show up as the string "nan" after str(); hide them.
        words = [str(text) for text in texts if str(text) != "nan"]
        if words:
            print(f"Cluster {cluster_id}: ned: {ned_val}\n {' | '.join(words)}\n")

    # statistics.mean raises on an empty list; report 0 when nothing was scored.
    print(f"NED: {statistics.mean(neds) if neds else 0}")


# NOTE(review): `text_clusters` is not assigned in any visible cell — its only
# other mention is a commented-out `transcribe_clusters(...)` line — so this
# call raises NameError on a fresh kernel unless it is defined first.
print_clusters(text_clusters, phone_clusters)
