# Getting units
Set `get = True` if units need to be extracted for the specified `gamma` and `layer`.

In [9]:
import pandas as pd
from encode import sample_files, get_units
from pathlib import Path

# --- Switches ---------------------------------------------------------------
# Unit extraction (the get_units call at the bottom) only runs when this is True.
get = False

# --- Encoding parameters ----------------------------------------------------
# Both values are forwarded to get_units; gamma is also used by later cells to
# build the output/ and features/ paths.
gamma = 0.3
layer = 7

# --- Locations --------------------------------------------------------------
audio_ext = ".flac"
audio_dir = Path("data/dev-clean")
align_dir = Path("data/alignments/dev-clean/")
align_path = align_dir.joinpath("alignments.csv")
save_dir = Path("features/")

# The alignment table is read unconditionally, even when `get` is False.
align_df = pd.read_csv(align_path)

# NOTE(review): sample_size=-1 presumably means "use every file" — confirm in
# encode.sample_files. The returned sample_size is what gets printed below.
paths, sample_size = sample_files(
    audio_dir=audio_dir,
    audio_ext=audio_ext,
    sample_size=-1,
)
print(f"Sample size: {sample_size}")

if get:
    get_units(paths, align_df, audio_dir, gamma, layer, save_dir)

Sample size: 2703


# Calculate Distances 
Calculates the pairwise distances for all pairs in the dataset, chunk by chunk. Set `chunk_limit` and `out_dir`; the distances are only (re)computed when `preloaded = False`.

In [10]:
from dist import get_features, get_batch_of_paths, cal_dist_per_pair
from tqdm import tqdm
import numpy as np

# Pairwise-distance stage. For every chunk yielded by get_batch_of_paths, three
# files are written to `out_dir`: temp_rows_<k>.npy, temp_cols_<k>.npy and
# temp_vals_<k>.npy, where <k> is the zero-based chunk index.

# Chunk size handed to get_batch_of_paths; num_batches below is
# ceil(num_pairs / chunk_limit).
chunk_limit = 5000000
out_dir = Path(f"output/{gamma}/temp/")
out_dir.mkdir(parents=True, exist_ok=True)

# True  -> skip the computation (chunk files are expected to exist already).
# False -> compute every pairwise distance and (over)write the chunk files.
preloaded = True

if not preloaded:
    # The feature files are sorted by the integer after the last "_" in their
    # stem. rglob() already searches recursively, so the leading "**/" in the
    # pattern is redundant; it is kept unchanged.
    # The result is deliberately not bound to `paths`, so the audio file list
    # produced by the first cell is not replaced by a one-shot iterator.
    # NOTE(review): assumes get_units stored its output under features/<gamma>
    # — confirm against encode.get_units and `save_dir`.
    sorted_paths = sorted(
        Path(f"features/{gamma}").rglob("**/*.npy"),
        key=lambda x: int(x.stem.split("_")[-1]),
    )
    sample_size = len(sorted_paths)

    features = get_features(sorted_paths)

    # Number of unordered pairs, and the number of chunks needed to hold them
    # (ceiling division); num_batches is only used as the progress-bar total.
    num_pairs = sample_size * (sample_size - 1) // 2
    num_batches = (num_pairs + chunk_limit - 1) // chunk_limit

    print(f"num_samples: {sample_size}")
    print(f"num_pairs: {num_pairs}")

    batches = tqdm(
        get_batch_of_paths(sample_size, chunk_limit),
        total=num_batches,
        unit="batch",
        mininterval=10.0,
        desc="Processing Batches",
    )
    for chunk_idx, batch in enumerate(batches):
        # Fresh accumulators per chunk, so one chunk can never leak into the next.
        rows, cols, vals = [], [], []
        for i, j in batch:
            # cal_dist_per_pair hands back the index pair together with the
            # distance; store what it returns rather than rebinding i and j.
            row, col, dist = cal_dist_per_pair(
                ((i, j), (features[i], features[j]))
            )
            rows.append(row)
            cols.append(col)
            vals.append(dist)

        np.save(out_dir / f"temp_rows_{chunk_idx}.npy", rows)
        np.save(out_dir / f"temp_cols_{chunk_idx}.npy", cols)
        np.save(out_dir / f"temp_vals_{chunk_idx}.npy", vals)


# Build graph from temp files
These temp files are then used to build the graph chunkwise. If the graph has been computed before, it can simply be read in instead of being rebuilt.

A search is performed to find the resolution `res` that yields the desired number of clusters. The partition is stored in a .csv file.

In [None]:
from cluster import build_graph_from_temp, adaptive_res_search
import pickle

# Graph + clustering stage: obtain the graph (load a saved one or build it from
# the distance chunks), search for a resolution that gives `num_clusters`
# clusters, and store the resulting partition as a CSV file.

use_preloaded_graph = True  # reuse graph.pkl when it exists instead of rebuilding
num_clusters = 13967  # target cluster count passed to adaptive_res_search
temp_dir = Path(f"output/{gamma}/temp")
temp_dir.mkdir(parents=True, exist_ok=True)  # Ensure the directory exists

graph_path = Path(f"output/{gamma}/graph.pkl")

if use_preloaded_graph and graph_path.exists():
    # SECURITY: unpickling runs arbitrary code stored in the file. Only load a
    # graph.pkl that was produced by this notebook.
    with open(graph_path, "rb") as f:
        g = pickle.load(f)
    print(f"Loaded precomputed graph from {graph_path}")
else:
    # Count the chunks on disk instead of hard-coding the 399 of one specific
    # run; the distance cell writes exactly one temp_vals_<k>.npy per chunk.
    # NOTE(review): assumes the second argument of build_graph_from_temp is the
    # number of chunks (the recorded run passed 399 and showed 399-step
    # progress bars) — confirm in cluster.build_graph_from_temp. Leftover chunk
    # files from an earlier run with a different chunk_limit would be counted
    # too; clear temp_dir before recomputing distances.
    num_chunks = len(list(temp_dir.glob("temp_vals_*.npy")))
    if num_chunks == 0:
        raise FileNotFoundError(
            f"No distance chunks (temp_vals_*.npy) found in {temp_dir}; "
            "run the distance cell with preloaded = False first."
        )
    g = build_graph_from_temp(temp_dir, num_chunks)
    g.write_pickle(str(graph_path))
    print(f"Graph built and saved to {graph_path}")

best_res, best_partition = adaptive_res_search(g, num_clusters)

# Read the membership vector once; it has one cluster id per node.
membership = best_partition.membership
actual_clusters = len(set(membership))
diff = abs(actual_clusters - num_clusters)

print(f"Best resolution found: {best_res:.3f} with cluster difference: {diff}")

df = pd.DataFrame(
    {
        "node": range(len(membership)),  # Node IDs
        "cluster": membership,  # Cluster assignments
    }
)

# The evaluation cell rebuilds this exact file name from gamma and best_res.
df.to_csv(f"output/{gamma}/best_partition_r{round(best_res, 3)}.csv", index=False)


Calculating total: 100%|██████████| 399/399 [00:30<00:00, 13.11it/s]


total_size: 1993108816, sample_size: 63137


Getting Temp Info: 100%|██████████| 399/399 [03:01<00:00,  2.20it/s]


TypeError: _write_graph_to_pickle_file() got an unexpected keyword argument 'format'

# Evaluate
The graph partition is evaluated by computing NED for the text in each cluster.

In [None]:
from eval import get_texts, transcribe_clusters, ned

# Evaluation: reload the partition CSV written by the clustering cell and score
# it. Needs `gamma`, `align_dir` and `best_res` to still be defined in the kernel.
partition_csv = f"output/{gamma}/best_partition_r{round(best_res, 3)}.csv"
partition = pd.read_csv(partition_csv)
texts = get_texts(gamma, align_dir)

# Transcribe the clusters, then reduce the transcriptions to one NED value.
cluster_transcriptions = transcribe_clusters(partition, texts)
ned_val = ned(cluster_transcriptions)
print(f"NED: {ned_val}")

Finally, update the README with the result:

In [None]:
from eval import update_readme

# Pass this run's gamma, clustering resolution and NED value to update_readme.
# Requires `best_res` (clustering cell) and `ned_val` (evaluation cell) to be
# defined in the current kernel.
update_readme(gamma, best_res, ned_val)