# Getting units
Set `get=True` if units need to be extracted for the specified `gamma` and `layer`.

In [13]:
import pandas as pd
from encode import sample_files, get_units
from pathlib import Path

# --- Configuration ---------------------------------------------------------
# Set to True to force unit extraction even when a feature directory for this
# `gamma` already exists. The existence check below is keyed on `gamma` only,
# so a change of `layer` would otherwise never trigger a new extraction.
get = False
align_dir = Path("data/alignments/dev-clean/")
align_path = align_dir / "alignments.csv"
audio_dir = Path("data/dev-clean")
audio_ext = ".flac"

gamma = 0.5
layer = 7
save_dir = Path("features/")

# Alignment table that is handed to `get_units`.
align_df = pd.read_csv(align_path)

# NOTE(review): sample_size=-1 presumably means "use every file" -- confirm
# against `sample_files`.
paths, sample_size = sample_files(
    audio_dir=audio_dir, audio_ext=audio_ext, sample_size=-1
)

print(f"Sample size: {sample_size}")

# Extract when explicitly requested, or when nothing exists yet for this gamma.
units_dir = save_dir / str(gamma)
if get or not units_dir.exists():
    get_units(paths, align_df, audio_dir, gamma, layer, save_dir)

Sample size: 2703


# Calculate Distances 
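Computes the pairwise distance for every pair of items in the dataset, one chunk at a time, and saves each chunk to `out_dir`. Set `chunk_limit` (the maximum number of pairs per chunk) and `out_dir` before running; the computation only runs when `preloaded` is `False`.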

In [14]:
from dist import get_features, get_batch_of_paths, cal_dist_per_pair
from tqdm import tqdm
import numpy as np

# Upper bound on the number of (i, j) pairs written to a single chunk file.
chunk_limit = 5_000_000

# Per-gamma scratch directory for the chunked distance files; created on demand.
out_dir = Path("output") / str(gamma) / "temp"
out_dir.mkdir(exist_ok=True, parents=True)

# The distance computation in this cell only runs when this flag is False.
preloaded = True


def process_batch(batch, features):
    """Compute the distance for every (i, j) index pair in ``batch``.

    Args:
        batch: Iterable of ``(i, j)`` index pairs.
        features: Indexable collection; ``features[i]`` and ``features[j]``
            are passed to ``cal_dist_per_pair`` together with the indices.

    Returns:
        A list with one ``cal_dist_per_pair`` result per pair, in the order
        the pairs were supplied.

    Note:
        The pairs are processed one after another in the calling process;
        any parallelism has to come from how the caller invokes this function.
    """
    results = []
    for i, j in batch:
        pair_input = ((i, j), (features[i], features[j]))
        results.append(cal_dist_per_pair(pair_input))
    return results


if not preloaded:
    # Read features from the directory the extraction cell checks for
    # (`save_dir / gamma`) instead of a second hard-coded "features/" path.
    # `rglob` is already recursive, so a plain "*.npy" pattern is sufficient.
    paths = (save_dir / str(gamma)).rglob("*.npy")
    # Order the files numerically by the integer after the last "_" in the stem.
    sorted_paths = sorted(paths, key=lambda x: int(x.stem.split("_")[-1]))
    sample_size = len(sorted_paths)

    features = get_features(sorted_paths)

    # Buffers for the chunk currently being filled.
    rows, cols, vals = [], [], []

    # Number of unordered pairs, and the number of chunks rounded up.
    num_pairs = sample_size * (sample_size - 1) // 2
    num_batches = (num_pairs + chunk_limit - 1) // chunk_limit

    print(f"num_samples: {sample_size}")
    print(f"num_pairs: {num_pairs}")

    chunk_idx = 0
    # Batches are processed sequentially; each one is flushed to its own set
    # of temp files so only one chunk of results is held in memory at a time.
    for batch in tqdm(
        get_batch_of_paths(sample_size, chunk_limit),
        total=num_batches,
        unit="batch",
        mininterval=10.0,
        desc="Processing Batches",
    ):
        for i, j in batch:
            # `cal_dist_per_pair` hands back the indices together with the distance.
            i, j, dist = cal_dist_per_pair(((i, j), (features[i], features[j])))
            rows.append(i)
            cols.append(j)
            vals.append(dist)

        np.save(out_dir / f"temp_rows_{chunk_idx}.npy", rows)
        np.save(out_dir / f"temp_cols_{chunk_idx}.npy", cols)
        np.save(out_dir / f"temp_vals_{chunk_idx}.npy", vals)

        # Drop the flushed chunk before starting the next one.
        rows, cols, vals = [], [], []
        chunk_idx += 1

# Build graph from temp files
These temp files are then used to build the graph chunk by chunk. If the graph has already been computed, it can simply be loaded from disk instead of being rebuilt.

A search is then performed for the resolution `res` whose partition has a number of clusters closest to the target `num_clusters`. The resulting partition is stored in a `.csv` file.

In [15]:
from cluster import build_graph_from_temp
import pickle

# Reuse a previously pickled graph when one exists; set to False to rebuild.
use_preloaded_graph = True
# Target number of clusters for the resolution search.
num_clusters = 13967

# Directory holding the chunked distance files (created if missing).
temp_dir = Path("output") / str(gamma) / "temp"
temp_dir.mkdir(exist_ok=True, parents=True)

graph_path = Path("output") / str(gamma) / "graph.pkl"

must_build = not (use_preloaded_graph and graph_path.exists())
if must_build:
    # NOTE(review): 399 looks like the number of chunk files written by the
    # distance cell -- confirm against `build_graph_from_temp` and keep it in
    # sync with `chunk_limit`.
    g = build_graph_from_temp(temp_dir, 399)
    g.write_pickle(str(graph_path))
    print(f"Graph built and saved to {graph_path}")
else:
    # NOTE: unpickling can execute arbitrary code; only load graph files that
    # were produced by this notebook.
    with graph_path.open("rb") as fh:
        g = pickle.load(fh)
    print(f"Loaded precomputed graph from {graph_path}")


Loaded precomputed graph from output/0.5/graph.pkl


Next, get the partition DataFrame, which records the cluster each node belongs to:


Set `use_predefined_partition=False` if the partition must be calculated.

In [None]:
from cluster import adaptive_res_search

# False forces a fresh resolution search even when partition CSVs already exist.
use_predefined_partition = False

partition_pattern = Path(f"output/{gamma}").glob("partition_r*.csv")
partition_files = list(partition_pattern)

if partition_files and use_predefined_partition:
    # Reuse stored partitions; the resolution is parsed from each file name
    # ("partition_r<res>.csv").
    res_partitions = []
    for csv_path in partition_files:
        stored_res = float(csv_path.stem.split("_r")[1])
        res_partitions.append((stored_res, pd.read_csv(csv_path)))

    # NOTE(review): this keeps the partition with the *smallest* resolution,
    # which is not necessarily the one closest to `num_clusters` -- confirm
    # that this is the intended choice.
    best_res, best_partition_df = min(res_partitions, key=lambda pair: pair[0])
else:
    # Either nothing is stored yet or a recomputation was requested.
    # NOTE(review): the meaning of the 0.0275 argument is defined by
    # `adaptive_res_search` -- confirm there.
    best_res, best_partition = adaptive_res_search(
        g, num_clusters, 0.0275, max_iters=10
    )

    # One row per node: node id and the cluster it was assigned to.
    membership = best_partition.membership
    best_partition_df = pd.DataFrame(
        {
            "node": range(len(membership)),
            "cluster": membership,
        }
    )

    # Persist the partition, with the rounded resolution in the file name.
    best_partition_df.to_csv(
        f"output/{gamma}/partition_r{round(best_res, 6)}.csv", index=False
    )

# Compare the number of distinct clusters with the target.
actual_clusters = len(set(best_partition_df["cluster"]))
diff = abs(actual_clusters - num_clusters)

print(f"Best resolution found: {best_res:.6f} with cluster difference: {diff}")


Iteration 1: res=0.026500, Cluster difference=56
Iteration 2: res=0.027500, Cluster difference=46
Iteration 3: res=0.026550, Cluster difference=60
Iteration 4: res=0.027453, Cluster difference=44
Iteration 5: res=0.026595, Cluster difference=68
Iteration 6: res=0.027410, Cluster difference=27
Iteration 7: res=0.026636, Cluster difference=65
Iteration 8: res=0.027371, Cluster difference=42
Iteration 9: res=0.026673, Cluster difference=46
Iteration 10: res=0.027336, Cluster difference=40
Best resolution found: 0.027410 with cluster difference: 27


# Evaluate
The graph partition is evaluated by computing NED for the text in each cluster.

In [None]:
from eval import transcribe_clusters, ned, print_clusters, get_phones_and_texts

phones, texts = get_phones_and_texts(gamma, align_dir)
phone_clusters = transcribe_clusters(best_partition_df, phones, texts)

# `diff` is an absolute value, so `num_clusters - diff` only equals the real
# cluster count when the partition has no more clusters than the target.
# Count the clusters directly so the value is right in both directions.
# NOTE(review): assumes the second argument of `ned` is the number of
# clusters in the partition -- confirm against `eval.ned`.
n_found_clusters = len(set(best_partition_df["cluster"]))
ned_val, dist_p_cluster = ned(phone_clusters, n_found_clusters)
print(f"NED: {ned_val}")
print_clusters(dist_p_cluster)


Appending Text and Phones:   2%|▏         | 1513/63137 [00:05<03:41, 278.71it/s]

And to update the readme:

In [None]:
from eval import update_readme

# Write this run's gamma, selected resolution, NED value and cluster-count
# difference to README.md.
update_readme(gamma, best_res, ned_val, diff)

Updated README.md with gamma=0.5, res=0.025909631250000002, NED=0.1363889572950492
