In [2]:
from pathlib import Path
from encode import sample_files, get_units
import pandas as pd

# --- Inputs -----------------------------------------------------------------
# Audio files are looked up under `audio_dir` using the `audio_ext` suffix.
audio_ext = ".flac"
audio_dir = Path("data/dev-clean")
# CSV with the alignments; read into `align_df` below.
align_path = Path("data/alignments/dev-clean/alignments.csv")

# --- Outputs ----------------------------------------------------------------
# Passed as `save_dir` to `get_units` in the extraction cell.
save_dir = Path("features/")

# Alignment table shared by the later cells.
# NOTE(review): `get_true_words` expects `filename`, `word_id` and `text`
# columns here -- confirm against the CSV.
align_df = pd.read_csv(align_path)

In [None]:
# Parameter forwarded to `get_units` below.
# NOTE(review): the distance cell further down uses gamma = 0.2 and reads
# from "features/0.2" -- confirm that 0.1 is intended here.
gamma = 0.1

# Collect the audio files to process.
# NOTE(review): sample_size=-1 presumably means "use every file" -- confirm
# in `encode.sample_files`. The returned `sample_size` is not used by any
# cell visible in this notebook.
paths, sample_size = sample_files(
    audio_dir=audio_dir,
    audio_ext=audio_ext,
    sample_size=-1,
)

# Run unit extraction for the selected files; results go to `save_dir`.
# NOTE(review): layer=7 is a hard-coded choice -- document why.
get_units(
    paths=paths,
    align_df=align_df,
    gamma=gamma,
    layer=7,
    save_dir=save_dir,
)

In [None]:
from distance import process_chunks

# All paths in this cell are keyed by `gamma`.
gamma = 0.2
chunk_limit = 1_000_000

# Input: feature directory for this gamma.
feat_dir = Path(f"features/{gamma}")

# Outputs: the info CSV and the distance matrix archive.
csv_path = Path(f"output/{gamma}/info.csv")
dist_mat_out_path = Path(f"output/{gamma}/dist_mat.npz")

# Compute the distances from the features in `feat_dir`.
# NOTE(review): the exact meaning of `chunk_limit` is defined in
# `distance.process_chunks` -- confirm there.
process_chunks(
    feat_dir=feat_dir,
    info_csv_path=csv_path,
    dist_mat_out_path=dist_mat_out_path,
    chunk_limit=chunk_limit,
)

In [3]:
import numpy as np
import pandas as pd
from pathlib import Path


# Artefacts of the gamma = 0.2 run (paths are hard-coded here, so this cell
# does not depend on the `gamma` variable of earlier cells).
dist_mat_out_path = Path("output/0.2/dist_mat.npz")
csv_path = "output/0.2/info.csv"

# Table read from the info CSV.
# NOTE(review): `get_true_words` expects a `filename` column of the form
# "file_wordID" in this frame -- confirm against the CSV.
info_df = pd.read_csv(csv_path)

In [3]:
# Load the distance matrix: the first (and presumably only -- TODO confirm)
# array stored in the .npz archive.
# `np.load` on an .npz returns a lazy NpzFile that keeps the archive open;
# the `with` block closes it once the array has been read, instead of leaking
# the file handle. It also avoids binding `dist_mat` to the archive object
# before rebinding it to the array.
with np.load(dist_mat_out_path) as npz_file:
    dist_mat = npz_file[npz_file.files[0]]

KeyboardInterrupt: 

In [4]:
from cluster import to_sparse_upper_chunked

# Convert the dense distance matrix to a sparse matrix, processed in chunks
# of 500.
# The result is saved under output/0.2/ because that is where the next cell
# loads it from ("output/0.2/sparse_dist_mat.npz"); saving to the bare file
# name left the load pointing at a file this notebook never wrote.
# NOTE(review): assumes `to_sparse_upper_chunked` writes to `save_path`
# verbatim and that the output/0.2/ directory already exists -- confirm.
sparse_dist_mat = to_sparse_upper_chunked(
    dist_mat, chunk_size=500, save_path="output/0.2/sparse_dist_mat.npz"
)


Converting to Sparse: 100%|██████████| 127/127 [00:01<00:00, 70.75chunk/s]


In [4]:
import scipy.sparse as sp

# Load the saved sparse distance matrix from disk. This rebinds
# `sparse_dist_mat`, so the dense matrix does not have to be loaded and
# converted again in a fresh session.
sparse_dist_mat = sp.load_npz("output/0.2/sparse_dist_mat.npz")

In [5]:
def get_true_words(info_df: pd.DataFrame, align_df: pd.DataFrame):
    """
    Look up the aligned word for every row of `info_df`.

    Unless `info_df` already has a `word_id` column, each `filename` entry
    ("file_wordID") is split on its LAST underscore into the file name and an
    integer word id. The (filename, word_id) pairs are then left-joined
    against `align_df` to fetch the word text.

    The caller's `info_df` is never modified: the split columns are written
    to a new frame, so the original "file_wordID" values stay available.

    Args:
        info_df (pd.DataFrame): DataFrame containing a `filename` column with
            "file_wordID" entries, or separate `filename` and `word_id` columns.
        align_df (pd.DataFrame): DataFrame containing `filename`, `word_id`, and `text`.

    Returns:
        List[str]: The matching word per row, or "_" if no match is found
        (or the matched `text` is missing). Empty list for an empty `info_df`.

    Raises:
        ValueError/TypeError: if a `filename` entry has no integer suffix
        after its last underscore.
    """
    # Nothing to look up; also avoids the `.str` accessor failing on an empty
    # non-string column.
    if len(info_df) == 0:
        return []

    if "word_id" in info_df:
        keyed_df = info_df
    else:
        # Split only on the last underscore so file names that contain
        # underscores themselves still yield exactly two parts.
        split_cols = info_df["filename"].str.rsplit("_", n=1, expand=True)
        # `assign` returns a new frame, leaving the caller's data untouched.
        keyed_df = info_df.assign(
            filename=split_cols[0],
            word_id=split_cols[1].astype(int),
        )

    # NOTE(review): a left join repeats a row when `align_df` holds duplicate
    # (filename, word_id) keys -- confirm the keys are unique.
    merged_df = keyed_df.merge(align_df, on=["filename", "word_id"], how="left")

    return merged_df["text"].fillna("_").tolist()

In [6]:
import itertools
import editdistance
import statistics


def print_clusters(word_clusters, print_pure=False, print_inpure=True):
    """
    Print clusters of words together with their mean pairwise edit distance.

    A cluster is "pure" when every pairwise edit distance between its words
    is 0 (all words identical) and "impure" otherwise. Clusters with fewer
    than two words have no pairs and are always skipped.

    Args:
        word_clusters: Iterable of clusters; each cluster is a sized
            collection of word strings.
        print_pure (bool): Print the pure clusters.
        print_inpure (bool): Print the impure clusters.

    Returns:
        None. For each selected cluster, writes a header line
        "Cluster <index>: <mean edit distance>", the comma-separated words
        and a blank line to stdout.
    """
    for i, clust in enumerate(word_clusters):
        if len(clust) < 2:
            continue

        clust_dist = [
            editdistance.eval(p, q) for p, q in itertools.combinations(clust, 2)
        ]
        is_impure = any(dist > 0 for dist in clust_dist)

        # Each flag controls only its own kind of cluster. The previous
        # `impure and print_inpure or print_pure` printed impure clusters
        # whenever print_pure was set, even with print_inpure=False.
        wanted = print_inpure if is_impure else print_pure
        if wanted:
            print(f"Cluster {i}: {statistics.mean(clust_dist)}")
            print(", ".join(clust))
            print()


In [7]:
def get_sim_mat(dist_mat):
    """
    Turn a sparse distance matrix into a sparse similarity matrix.

    Every explicitly stored entry `d` becomes `max_dist - d`, where
    `max_dist` is the largest stored distance. Entries that are not stored in
    the sparse structure are left as they are (implicit 0 in the result).
    Explicitly stored 0.0 distances are converted and get similarity
    `max_dist`.

    Args:
        dist_mat: SciPy sparse matrix of distances (any format providing
            `tocsr()`).

    Returns:
        A new CSR matrix with the same sparsity pattern holding the
        similarities. The input matrix is not modified. A matrix without
        stored entries is returned as an empty CSR copy.
    """
    # Copy explicitly: `tocsr()` may hand back the very same object when the
    # input is already CSR, and the caller's data must not be overwritten.
    similarity_matrix = dist_mat.tocsr().copy()

    # `.max()` raises on a zero-size array, so there is nothing to convert.
    if similarity_matrix.data.size == 0:
        return similarity_matrix

    # Convert distances to similarities for the stored entries only.
    max_dist = similarity_matrix.data.max()
    similarity_matrix.data = max_dist - similarity_matrix.data

    return similarity_matrix


# Similarity matrix derived from the sparse distance matrix loaded above.
sim_mat = get_sim_mat(sparse_dist_mat)

In [None]:
# NOTE(review): unexplained constant, not used by any cell visible here --
# presumably the target number of clusters; confirm and document its origin.
k = 13967

In [14]:
# Inspect the sparse distance matrix: prints its format/dtype/shape summary
# followed by an abbreviated listing of (row, col) coordinates and values.
print(sparse_dist_mat)

<COOrdinate sparse matrix of dtype 'float32'
	with 15727680 stored elements and shape (63136, 63136)>
  Coords	Values
  (0, 1)	1.0
  (0, 2)	1.0
  (0, 3)	1.0
  (0, 4)	1.0
  (0, 5)	1.0
  (0, 6)	1.0
  (0, 7)	1.0
  (0, 8)	1.0
  (0, 9)	0.875
  (0, 10)	1.0
  (0, 11)	1.0
  (0, 12)	0.875
  (0, 13)	1.0
  (0, 14)	1.0
  (0, 15)	1.0
  (0, 16)	1.0
  (0, 17)	1.0
  (0, 18)	1.0
  (0, 19)	0.8571428656578064
  (0, 20)	0.875
  (0, 21)	1.0
  (0, 22)	0.875
  (0, 23)	1.0
  (0, 24)	1.0
  (0, 25)	1.0
  :	:
  (63128, 132)	0.0
  (63128, 133)	0.0
  (63128, 134)	0.0
  (63128, 135)	0.0
  (63129, 130)	0.0
  (63129, 131)	0.0
  (63129, 132)	0.0
  (63129, 133)	0.0
  (63129, 134)	0.0
  (63129, 135)	0.0
  (63130, 131)	0.0
  (63130, 132)	0.0
  (63130, 133)	0.0
  (63130, 134)	0.0
  (63130, 135)	0.0
  (63131, 132)	0.0
  (63131, 133)	0.0
  (63131, 134)	0.0
  (63131, 135)	0.0
  (63132, 133)	0.0
  (63132, 134)	0.0
  (63132, 135)	0.0
  (63133, 134)	0.0
  (63133, 135)	0.0
  (63134, 135)	0.0
