In [39]:

import numpy as np
from sklearn.cluster import KMeans
from pathlib import Path

In [40]:

import torch 
import segment
def Kmeans():
    model = KMeans(100)
    checkpoint = torch.hub.load_state_dict_from_url(
    "https://github.com/bshall/dusted/releases/download/v0.1/kmeans-english-50f36a.pt"
    )
    model.__dict__["n_features_in_"] = checkpoint["n_features_in_"]
    model.__dict__["_n_threads"] = checkpoint["_n_threads"]
    model.__dict__["cluster_centers_"] = checkpoint["cluster_centers_"].numpy()
    return model, segment

In [41]:
from tqdm import tqdm

def process_file(paths, codebook, segment, gamma):
    in_path, out_path = paths
    sequence = np.load(in_path)
    codes, boundaries = segment(sequence, codebook, gamma)
    np.savez(out_path.with_suffix(".npz"), codes=codes, boundaries=boundaries)
    return sequence.shape[0], np.mean(np.diff(boundaries))

def segment_dataset(args):
    kmeans, segment = Kmeans()
    in_paths = list(args.in_dir.rglob("*.npy"))
    out_paths = [args.out_dir / path.relative_to(args.in_dir) for path in in_paths]

    for path in out_paths:
        path.parent.mkdir(exist_ok=True, parents=True)

    results = []
    for path in tqdm(zip(in_paths, out_paths), desc="Processing segments"):
        result = process_file(paths=path, codebook=kmeans.cluster_centers_, segment=segment.segment, gamma=1)
        
        results.append(result)

    frames, boundary_length = zip(*results)
    print(f"Segmented {sum(frames) * 0.02 / 60 / 60:.2f} hours of audio")
    print(f"Average segment length: {np.mean(boundary_length) * 0.02:.2f} seconds")

In [42]:

class Args:
    def __init__(self, in_dir, out_dir):
        self.in_dir =in_dir
        self.out_dir = out_dir

args = Args(Path("features/"),Path("codes/"))

segment_dataset(args)

Processing segments: 468it [00:35, 13.15it/s]


SystemError: CPUDispatcher(<function _segment at 0x71e1952659e0>) returned a result with an exception set

In [None]:
true_bounds = [0, 14, 25, 30, 35, 36]
boundaries = [0, 13, 25, 27, 29, 30, 31, 32, 36]
encodings = [1, 2, 3, 4, 5, 6, 7, 8, 9]


features = []
prev_j = 1
for i in range(1, len(true_bounds)):
    new_feature = []
    for j in range(prev_j, len(boundaries)):
        if true_bounds[i] < boundaries[j]:
            continue

        if true_bounds[i] == boundaries[j]:
            new_feature.append(encodings[j-1])
            
        elif true_bounds[i] > boundaries[j]:
            new_feature.append(encodings[j-1])
        
        prev_j = j + 1
        
    features.append(new_feature)
    

In [None]:
import numpy as np
from pathlib import Path
in_dir = Path("output/librispeech_subset/")

for path in list(in_dir.rglob("*.npy")):
    mat = np.load(path)

    DISTANCE_THRESHOLD = round(np.mean(mat)/3,3)
    num_nodes = mat.shape[0]
    graph = {i: set() for i in range(num_nodes)}

    for i in range(num_nodes - 1): 
        for j in range(i + 1, num_nodes):  
            if mat[i, j] < DISTANCE_THRESHOLD:
                graph[i].add(j)
                graph[j].add(i)  


    clusters = []
    visited = set()

    def bfs(start_node):
        """ Traverse a cluster using BFS """
        queue = [start_node]
        cluster = []
        
        while queue:
            node = queue.pop(0)
            if node in visited:
                continue
            visited.add(node)
            cluster.append(node)
            queue.extend(graph[node])  

        return cluster

    for node in range(num_nodes):
        if node not in visited:
            new_cluster = bfs(node)
            clusters.append(new_cluster)

    sizes = []
    for i, clust in enumerate(clusters):
        sizes.append(len(clust))

    print(f"Avg size cluster for file {str(path)}: {np.mean(sizes)}")
    

Avg size cluster for file output/librispeech_subset/wavlm_base/8/0.05/norm_distance_matrix.npy: 1.9774436090225564
Avg size cluster for file output/librispeech_subset/wavlm_base/8/0.1/norm_distance_matrix.npy: 2.8452380952380953
Avg size cluster for file output/librispeech_subset/wavlm_base/8/0.15/norm_distance_matrix.npy: 4.326923076923077
Avg size cluster for file output/librispeech_subset/wavlm_base/8/0.2/norm_distance_matrix.npy: 5.666666666666667


# The whole dusted pipeline

In [49]:
class Args:
    def __init__(self, **kwargs):
        for key, value in kwargs.items():
            setattr(self, key, value)

In [None]:
# Encoding into HuBERT or WAVLM features
import  torchaudio
from pathlib import Path
from tqdm import tqdm
import torch

model_pipelines = {
    "hubert_base": torchaudio.pipelines.HUBERT_BASE,
    "hubert_large": torchaudio.pipelines.HUBERT_LARGE,
    "hubert_xlarge": torchaudio.pipelines.HUBERT_XLARGE,
    "wavlm_base": torchaudio.pipelines.WAVLM_BASE,
    "wavlm_large": torchaudio.pipelines.WAVLM_LARGE,
    "wavlm_base_plus": torchaudio.pipelines.WAVLM_BASE_PLUS,
}

def encode_features(args):
    wavs = list(args.in_dir.rglob(f"*{args.audio_ext}"))

    bundle = model_pipelines.get(args.model, torchaudio.pipelines.HUBERT_BASE)
    model = bundle.get_model().cuda()
    model.eval()

    out_dir = None
    if args.out_dir:
        out_dir = args.out_dir / args.model / str(args.layer)
        print(f"Storing Encodings in {str(out_dir)}")

    encodings = {}
    for wav_path in tqdm(wavs, desc="Encoding Audio Features"):
        wav, sr = torchaudio.load(wav_path)
        wav = torchaudio.functional.resample(wav, sr, 16000).cuda()
        
        with torch.inference_mode():
            encoding, _ = model.extract_features(wav, num_layers=args.layer)

        encoding = encoding[args.layer-1].squeeze().cpu().numpy()
        
        if out_dir:
            out_dir.mkdir(parents=True, exist_ok=True)
            output_path = Path(out_dir) / f"{wav_path.stem}.npy"
            np.save(output_path, encoding)
            
        else:
            encodings[wav_path.stem] = encoding
    return encodings         

In [None]:
args = Args(
    in_dir=Path("data/librispeech-wav/"),
    out_dir=None,
    model="wavlm_base",
    layer=6,
    audio_ext=".wav"
)

encodings = encode_features(args)
print(len(encodings))

Encoding Audio Features: 100%|██████████| 2703/2703 [01:33<00:00, 28.98it/s]

2703





In [None]:
# Segment the dataset into discrete acoustic units
from sklearn.cluster import KMeans
import segment
import numpy as np

def kmeans_model(url):
    model = KMeans(100)
    checkpoint = torch.hub.load_state_dict_from_url(url)

    model.__dict__["n_features_in_"] = checkpoint["n_features_in_"]
    model.__dict__["_n_threads"] = checkpoint["_n_threads"]
    model.__dict__["cluster_centers_"] = checkpoint["cluster_centers_"].numpy()
    return model, segment

def apply_kmeans(kmeans_model, encoding):
    # C = cluster centers matrix
    C_np = kmeans_model.cluster_centers_.transpose()
    Cnorm_np = (C_np ** 2).sum(0, keepdims=True)

    C = torch.from_numpy(C_np)
    Cnorm = torch.from_numpy(Cnorm_np)

    if torch.cuda.is_available():
        C = C.cuda()
        Cnorm = Cnorm.cuda()
    
    if isinstance(encoding, torch.Tensor):
        dist = (
            encoding.pow(2).sum(1, keepdims=True)-2*torch.matmul(encoding, C)+Cnorm
        )
    else:
        dist = (
            (encoding**2).sum(1, keepdims=True)-2*np.matmul(encoding, C_np)+Cnorm_np
        )
    return np.argmin(dist, axis=1)


def segment_dataset(args):
    kmeans, _ = kmeans_model(args.kmeans_url)
    encodings = []

    if args.in_dir:
        in_dir = args.in_dir / args.model 
        in_paths = list(in_dir.rglob("*.npy"))
        encodings = {}
        for path in tqdm(in_paths, desc="Loading Features"):
            encoding = np.load(path)
            encodings[path.stem] = encoding

    elif args.encodings:
        encodings = args.encodings

    if args.out_dir:
        out_dir = args.out_dir / args.model / str(args.layer)
        out_dir.mkdir(parents=True, exist_ok=True)
    
    codes_dict = {}
    for encoding in tqdm(encodings, desc="Extracting Kmeans codes"):
        codes = apply_kmeans(kmeans, encodings[encoding]).tolist()
        codes_dict[encoding] = codes
        
    return codes_dict


In [None]:
# Segment the dataset into discrete acoustic units using DUSTED
from sklearn.cluster import KMeans
from segment import segment
import numpy as np

def kmeans_model(url):
    model = KMeans(100)
    checkpoint = torch.hub.load_state_dict_from_url(url)

    model.__dict__["n_features_in_"] = checkpoint["n_features_in_"]
    model.__dict__["_n_threads"] = checkpoint["_n_threads"]
    model.__dict__["cluster_centers_"] = checkpoint["cluster_centers_"].numpy()
    return model, segment

def apply_kmeans(kmeans_model, encoding):
    # C = cluster centers matrix
    C_np = kmeans_model.cluster_centers_.transpose()
    Cnorm_np = (C_np ** 2).sum(0, keepdims=True)

    C = torch.from_numpy(C_np)
    Cnorm = torch.from_numpy(Cnorm_np)

    if torch.cuda.is_available():
        C = C.cuda()
        Cnorm = Cnorm.cuda()
    
    if isinstance(encoding, torch.Tensor):
        dist = (
            encoding.pow(2).sum(1, keepdims=True)-2*torch.matmul(encoding, C)+Cnorm
        )
    else:
        dist = (
            (encoding**2).sum(1, keepdims=True)-2*np.matmul(encoding, C_np)+Cnorm_np
        )
    return np.argmin(dist, axis=1)

def get_frame_num(timestamp: float, sample_rate: int, frame_size_ms: int)->int:
    """
    Convert timestamp (in seconds) to frame index based on sampling rate and frame size.
    """
    hop_size = frame_size_ms/1000 * sample_rate
    hop_size = np.max([hop_size, 1])
    return int((timestamp * sample_rate) / hop_size)

def segment_dataset(args):
    kmeans, _ = kmeans_model(args.kmeans_url)
    encodings = []
    align_paths = list(args.align_dir.rglob("*.list"))

    if args.in_dir:
        in_dir = args.in_dir / args.model 
        in_paths = list(in_dir.rglob("*.npy"))
        encodings = {}
        for path in tqdm(in_paths, desc="Loading Features"):
            encoding = np.load(path)
            encodings[path.stem] = encoding

    elif args.encodings:
        encodings = args.encodings

    if args.out_dir:
        out_dir = args.out_dir / args.model / str(args.layer)
        out_dir.mkdir(parents=True, exist_ok=True)

    codes_dict = {}
    for path in tqdm(encodings, desc="Extracting DUSTED codes"):
        alignment_file = [a for a in align_paths if a.stem == path]
        if not alignment_file:
            continue
        else:
            alignment_file = alignment_file[0]

        with open(str(alignment_file), "r") as f:
            bounds = [get_frame_num(float(line.strip()), 16000, 20) for line in f]
        
        words = []
        for i in range(len(bounds)-1):
            cut_encoding = encodings[path][bounds[i]: bounds[i+1]]
            codes, _ = segment(cut_encoding, kmeans.cluster_centers_, args.gamma)   
            words.append(codes)
        codes_dict[path] = words
        
    return codes_dict


In [76]:
args = Args(
    in_dir=None,
    out_dir=None,
    align_dir=Path("data/all_alignments/"),
    model="wavlm_base",
    layer=6,
    audio_ext=".wav",
    kmeans_url="https://github.com/bshall/dusted/releases/download/v0.1/kmeans-english-50f36a.pt",
    gamma=0.2,
    encodings=encodings
)

dusted_codes_dict = segment_dataset(args)

Extracting DUSTED codes: 100%|██████████| 2703/2703 [00:24<00:00, 109.59it/s]


In [78]:
for path in dusted_codes_dict:
    print(path)
    for word in dusted_codes_dict[path]:
        print(word)
    break

1988-24833-0000
[97]
[97]
[97]
[18]
[18]
[18]
[97]
[97]
[18]
[95]


In [None]:
args = Args(
    in_dir=None,
    out_dir=None,
    model="wavlm_base",
    layer=6,
    audio_ext=".wav",
    kmeans_url="https://github.com/bshall/dusted/releases/download/v0.1/kmeans-english-50f36a.pt",
    gamma=0.2,
    encodings=encodings
)

codes_dict = segment_dataset(args)

Extracting Kmeans codes: 100%|██████████| 2703/2703 [00:02<00:00, 1180.97it/s]


In [None]:
# Getting the word codes

def get_frame_num(timestamp: float, sample_rate: int, frame_size_ms: int)->int:
    """
    Convert timestamp (in seconds) to frame index based on sampling rate and frame size.
    """
    hop_size = frame_size_ms/1000 * sample_rate
    hop_size = np.max([hop_size, 1])
    return int((timestamp * sample_rate) / hop_size)

def word_codes(align_paths, codes_dict):
    words_dict = {}
    word_count = 0
    for path in tqdm(codes_dict, desc="Cutting into words"):
        alignment_file = [a for a in align_paths if a.stem == path]
        
        if alignment_file:
            alignment_file = alignment_file[0]
        else:
            continue
        
        with open(str(alignment_file), "r") as f:
            bounds = [get_frame_num(float(line.strip()), 16000, 20) for line in f]
        boundaries = []
        boundaries.extend(bounds)

        feature_codes = []
        codes = codes_dict[path]
        for i in range(len(boundaries)-1):
            new_codes = codes[boundaries[i]:boundaries[i+1]]
            feature_codes.append(new_codes)
            word_count += 1
        words_dict[path] = feature_codes
    return words_dict, word_count

align_dir = Path("data/all_alignments")
align_paths = list(align_dir.rglob("*.list"))
words_dict, num_words = word_codes(align_paths, codes_dict)

Cutting into words:   0%|          | 0/2703 [00:00<?, ?it/s]

Cutting into words: 100%|██████████| 2703/2703 [00:02<00:00, 1165.02it/s]


In [None]:
just_words = []
for path in words_dict:
    for word in words_dict[path]:
        just_words.append(word)

In [80]:
from itertools import groupby

just_words_collapsed = []
for path in words_dict:
    for word in words_dict[path]:
        collapsed_word = [key for key, _ in groupby(word)]
        just_words_collapsed.append(collapsed_word)
        # print(f"{word} --> {collapsed_word}")

In [85]:
from joblib import Parallel, delayed

def edit_distance(seq1, seq2):
    """
    Compute the edit distance between two sequences using dynamic programming.
    """
    N, M = len(seq1), len(seq2)
    dp = np.zeros((N + 1, M + 1))
    for i in range(N + 1):
        dp[i, 0] = i
    for j in range(M + 1):
        dp[0, j] = j
    for i in range(1, N + 1):
        for j in range(1, M + 1):
            cost = 0 if seq1[i - 1] == seq2[j - 1] else 1
            dp[i, j] = min(dp[i - 1, j] + 1, dp[i, j - 1] + 1, dp[i - 1, j - 1] + cost)
    return dp[N, M] 

def calculate_distance(just_words, num_words):
    num_words = int(num_words/2)
    dist_mat = np.zeros((num_words, num_words))

    for i in tqdm(range(num_words), desc="Calculating Distances"):
        dist_mat[i, i] = np.inf
        js = [j for j in range(i + 1, num_words)]
        dists_i = Parallel(n_jobs=8)(
            delayed(edit_distance)(just_words[i], just_words[j]) for j in js
        )

        for j, dist in zip(js, dists_i):
            dist_mat[i, j] = dist
            dist_mat[j, i] = dist  

    # for i in tqdm(range(num_words), desc="Calculating Distances"):
    #     for j in range(i+1, num_words):
            
    #         dist = edit_distance(just_words[i], just_words[j])
    #         dist_mat[i, j] = dist
    #         dist_mat[j, i] = dist
        
    
    return dist_mat

dist_mat = calculate_distance(just_words_collapsed, num_words)

Calculating Distances: 100%|██████████| 31568/31568 [2:05:24<00:00,  4.20it/s]  


In [89]:
# np.save("output/wavlm_base_6.npy", dist_mat)
closest = []
for i in range(dist_mat.shape[0]):
    closest.append(np.argmin(dist_mat[i]))

print(closest)
            

[np.int64(35), np.int64(7666), np.int64(8544), np.int64(1091), np.int64(1955), np.int64(26382), np.int64(344), np.int64(1442), np.int64(1686), np.int64(766), np.int64(332), np.int64(50), np.int64(515), np.int64(11476), np.int64(4465), np.int64(120), np.int64(786), np.int64(747), np.int64(602), np.int64(26512), np.int64(25027), np.int64(4283), np.int64(21882), np.int64(3630), np.int64(19949), np.int64(16939), np.int64(30826), np.int64(5159), np.int64(30), np.int64(2075), np.int64(28), np.int64(244), np.int64(1660), np.int64(25441), np.int64(4792), np.int64(0), np.int64(9300), np.int64(40), np.int64(1790), np.int64(892), np.int64(37), np.int64(25133), np.int64(19265), np.int64(17239), np.int64(10413), np.int64(549), np.int64(9091), np.int64(319), np.int64(28395), np.int64(252), np.int64(11), np.int64(29464), np.int64(15587), np.int64(2565), np.int64(528), np.int64(4760), np.int64(95), np.int64(8651), np.int64(3773), np.int64(74), np.int64(793), np.int64(3007), np.int64(5219), np.int64(87