In [None]:

import numpy as np
from sklearn.cluster import KMeans
from pathlib import Path

In [16]:

import torch 
import segment
def Kmeans():
    """Create a 100-cluster ``KMeans`` and populate it from a downloaded checkpoint.

    The estimator is not fitted here. Instead, three attributes are copied from the
    checkpoint fetched with ``torch.hub.load_state_dict_from_url`` (bshall/dusted
    v0.1 release URL below) directly into the instance dictionary.

    Returns:
        tuple: ``(model, segment)`` where ``model`` is the populated estimator and
        ``segment`` is the module imported at the top of this cell.
    """
    estimator = KMeans(100)
    state = torch.hub.load_state_dict_from_url(
    "https://github.com/bshall/dusted/releases/download/v0.1/kmeans-english-50f36a.pt"
    )

    fitted = vars(estimator)  # same object as estimator.__dict__
    # These two entries are copied over unchanged.
    for key in ("n_features_in_", "_n_threads"):
        fitted[key] = state[key]
    # The centroids are stored as a tensor in the checkpoint; convert to a NumPy array.
    fitted["cluster_centers_"] = state["cluster_centers_"].numpy()

    return estimator, segment

In [None]:
from tqdm import tqdm

def process_file(paths, codebook, segment, gamma):
    """Segment one saved feature array and write the result as an ``.npz`` file.

    Args:
        paths: ``(in_path, out_path)`` pair. ``in_path`` is loaded with ``np.load``;
            ``out_path`` must provide ``with_suffix`` (e.g. a ``pathlib.Path``).
        codebook: Passed through unchanged as the second argument of ``segment``.
        segment: Callable invoked as ``segment(sequence, codebook, gamma)`` that
            returns a ``(codes, boundaries)`` pair.
        gamma: Passed through unchanged as the third argument of ``segment``.

    Returns:
        tuple: ``(n_rows, mean_gap)`` — the first-axis length of the loaded array and
        the mean difference between consecutive entries of ``boundaries``.
    """
    source, target = paths
    frames = np.load(source)

    codes, boundaries = segment(frames, codebook, gamma)

    # Output keeps the target's stem but always carries the .npz extension.
    npz_path = target.with_suffix(".npz")
    np.savez(npz_path, codes=codes, boundaries=boundaries)

    mean_gap = np.mean(np.diff(boundaries))
    return frames.shape[0], mean_gap

def segment_dataset(args):
    """Segment every ``.npy`` file under ``args.in_dir`` and report summary statistics.

    Each input file is mirrored (same relative path) under ``args.out_dir`` and
    handed to ``process_file``. Afterwards the total duration and the average
    segment length are printed.

    Args:
        args: Object exposing ``in_dir`` and ``out_dir`` as ``pathlib.Path``-like
            attributes.

    Returns:
        None. If no ``.npy`` files are found a message is printed and nothing else
        happens (the checkpoint is not loaded and no directories are created).
    """
    # The summaries below multiply frame counts by this to get seconds.
    frame_seconds = 0.02

    in_paths = list(args.in_dir.rglob("*.npy"))
    if not in_paths:
        # Without this guard ``zip(*results)`` below would fail with an opaque
        # "not enough values to unpack" ValueError.
        print(f"No .npy files found under {args.in_dir}; nothing to segment.")
        return

    kmeans, segment = Kmeans()
    out_paths = [args.out_dir / path.relative_to(args.in_dir) for path in in_paths]

    # Create each distinct output directory once instead of once per file.
    for parent in {path.parent for path in out_paths}:
        parent.mkdir(exist_ok=True, parents=True)

    results = []
    # zip() has no __len__, so tqdm needs an explicit total to show progress/ETA.
    for paths in tqdm(zip(in_paths, out_paths), total=len(in_paths), desc="Processing segments"):
        results.append(
            process_file(
                paths=paths,
                codebook=kmeans.cluster_centers_,
                segment=segment.segment,
                gamma=1,
            )
        )

    frames, boundary_length = zip(*results)
    print(f"Segmented {sum(frames) * frame_seconds / 60 / 60:.2f} hours of audio")
    # NOTE: this is an unweighted mean of the per-file mean segment lengths.
    print(f"Average segment length: {np.mean(boundary_length) * frame_seconds:.2f} seconds")

In [22]:

class Args:
    """Plain container bundling an input directory and an output directory.

    Both constructor arguments are stored verbatim as attributes of the same name.
    """

    def __init__(self, in_dir, out_dir):
        self.in_dir, self.out_dir = in_dir, out_dir

# Read features from ./features and mirror the segmented output under ./codes.
args = Args(in_dir=Path("features/"), out_dir=Path("codes/"))
segment_dataset(args)

100%|██████████| 2703/2703 [00:00<00:00, 87306.64it/s]


Segmented 5.38 hours of audio
Average segment length: 1.83 seconds


In [24]:
# Toy example: group fine-grained segment encodings under coarser reference bounds.
true_bounds = [0, 14, 25, 30, 35, 36]
boundaries = [0, 13, 25, 27, 29, 30, 31, 32, 36]
encodings = [1, 2, 3, 4, 5, 6, 7, 8, 9]

features = []
prev_j = 1  # index of the first boundary not yet assigned to a group
for i, true_bound in enumerate(true_bounds[1:], start=1):
    new_feature = []
    # range() is built once from the current prev_j; updating prev_j inside the
    # loop only affects where the *next* reference bound starts scanning.
    for j in range(prev_j, len(boundaries)):
        if true_bound >= boundaries[j]:
            # boundaries[j] does not exceed this reference bound, so the encoding
            # paired with it (index j - 1) belongs to the current group.
            new_feature.append(encodings[j - 1])
            prev_j = j + 1
    features.append(new_feature)
    

In [34]:
import numpy as np
from pathlib import Path
from collections import deque

in_dir = Path("output/librispeech_subset/")


def threshold_graph(mat, threshold):
    """Build an undirected adjacency map from a pairwise matrix.

    Nodes are ``0 .. mat.shape[0] - 1``. Only entries ``mat[i, j]`` with ``i < j``
    are inspected; whenever such an entry is strictly below ``threshold`` the two
    nodes are linked in both directions.

    Returns:
        dict[int, set[int]]: neighbours of every node (empty set if isolated).
    """
    num_nodes = mat.shape[0]
    graph = {i: set() for i in range(num_nodes)}
    for i in range(num_nodes - 1):
        for j in range(i + 1, num_nodes):
            if mat[i, j] < threshold:
                graph[i].add(j)
                graph[j].add(i)
    return graph


def connected_components(graph):
    """Split ``graph`` into connected components using breadth-first search.

    Args:
        graph: Mapping from node to an iterable of neighbouring nodes.

    Returns:
        list[list]: one list of nodes per component, in the order the start nodes
        appear in ``graph``; an empty list for an empty graph.
    """
    visited = set()
    clusters = []
    for start_node in graph:
        if start_node in visited:
            continue
        # deque gives O(1) pops from the left; list.pop(0) shifts every element.
        queue = deque([start_node])
        cluster = []
        while queue:
            node = queue.popleft()
            if node in visited:
                # A node can be queued several times before it is first processed.
                continue
            visited.add(node)
            cluster.append(node)
            queue.extend(graph[node])
        clusters.append(cluster)
    return clusters


# sorted() makes the report order independent of filesystem enumeration order.
for path in sorted(in_dir.rglob("*.npy")):
    mat = np.load(path)

    # Data-dependent cutoff: one third of the matrix mean, rounded to 3 decimals.
    DISTANCE_THRESHOLD = round(np.mean(mat)/3,3)

    graph = threshold_graph(mat, DISTANCE_THRESHOLD)
    clusters = connected_components(graph)
    sizes = [len(cluster) for cluster in clusters]

    print(f"Avg size cluster for file {str(path)}: {np.mean(sizes)}")
    

Avg size cluster for file output/librispeech_subset/wavlm_base/8/0.05/norm_distance_matrix.npy: 1.9774436090225564
Avg size cluster for file output/librispeech_subset/wavlm_base/8/0.1/norm_distance_matrix.npy: 2.8452380952380953
Avg size cluster for file output/librispeech_subset/wavlm_base/8/0.15/norm_distance_matrix.npy: 4.326923076923077
Avg size cluster for file output/librispeech_subset/wavlm_base/8/0.2/norm_distance_matrix.npy: 5.666666666666667
