In [None]:
from pathlib import Path
import pandas as pd
from encode import sample_files, get_units
from distance import get_batch_of_paths, calculate_distance_per_chunk_pair
from tqdm import tqdm
import scipy.sparse as sp
import editdistance
import numpy as np

In [2]:
# Audio input: extension and directory (wav_dir names the same directory as audio_dir).
audio_ext = ".flac"
audio_dir = Path("data", "dev-clean")
wav_dir = Path("data", "dev-clean")

# CSV file holding the alignments; it is read into align_df right after this.
align_path = Path("data", "alignments", "dev-clean", "alignments.csv")

# Feature directories: feat_dir is the "0.2" sub-directory of save_dir.
save_dir = Path("features")
feat_dir = save_dir / "0.2"

# Load the alignment table from align_path. Later cells filter it by its
# "filename" column and read "word_id", "text", "word_start" and "word_end".
align_df = pd.read_csv(align_path)

In [51]:
# feature_paths, sample_size = sample_files(feature_dir=feat_dir, sample_size=100)
# A single hard-coded audio file is used instead of the sampled set above.
paths = [Path("data/dev-clean") / "174" / "50561" / "174-50561-0005.flac"]
sample_size = len(paths)
print(sample_size)

1


In [47]:
import torch
import torchaudio
from webrtcvad import Vad
from encode import mark_sil

# Both entry points come from the same torch.hub repository with the same options.
_hub_repo = "bshall/dusted:main"
_hub_options = {"language": "english", "trust_repo": True}

# Each entry point yields a pair: an object plus the function that operates on it.
kmeans, segment = torch.hub.load(_hub_repo, "kmeans", **_hub_options)
hubert, encode = torch.hub.load(_hub_repo, "hubert", **_hub_options)

# Detector handed to mark_sil together with the waveform.
vad = Vad()

Using cache found in /home/danel/.cache/torch/hub/bshall_dusted_main
Using cache found in /home/danel/.cache/torch/hub/bshall_dusted_main
Using cache found in /home/danel/.cache/torch/hub/bshall_hubert_main


In [67]:
def get_frame_num(timestamp: float, sample_rate: int, frame_size_ms: int) -> int:
    """Convert a timestamp into a frame index.

    Args:
        timestamp: Position in the audio, in seconds.
        sample_rate: Number of samples per second.
        frame_size_ms: Frame hop in milliseconds.

    Returns:
        The sample offset of ``timestamp`` divided by the hop length in
        samples, truncated by ``int()``. The hop is clamped to at least one
        sample so the division is always defined.
    """
    samples_per_frame = np.max([frame_size_ms / 1000 * sample_rate, 1])
    sample_offset = timestamp * sample_rate
    return int(sample_offset / samples_per_frame)


# Parameters of the walk-through below, named once instead of repeated inline.
TARGET_SR = 16000  # sample rate (Hz) the audio is resampled to before encoding
FRAME_MS = 20  # hop handed to get_frame_num; presumably the encoder's frame stride -- TODO confirm
HUBERT_LAYER = 7  # third argument of `encode`; presumably the layer to read -- TODO confirm
GAMMA = 0.2  # third argument of `segment`; presumably the segmentation penalty -- TODO confirm

# For every audio file: encode it, then for each aligned word cut out the
# word's frames, drop the frames whose flag is falsy, and segment what is left
# into codes. Intermediate shapes are printed for inspection.
for path in paths:
    print(f"path: {path}")
    wav_df = align_df[align_df["filename"] == path.stem]

    wav, sr = torchaudio.load(str(path))
    wav = torchaudio.functional.resample(wav, sr, TARGET_SR)
    flags = mark_sil(vad, wav)  # one flag per frame (the lengths are printed below)
    wav = wav.unsqueeze(0)

    encoding = encode(hubert, wav, HUBERT_LAYER).squeeze(0)
    print(f"encoding_shape: {encoding.shape}")
    print(f"flags_shape: {len(flags)}")

    # Visit every word id that actually occurs for this file.
    # range(max(word_id)) would skip the last word and raise on an empty
    # table or on a gap in the ids.
    for w in sorted(wav_df["word_id"].dropna().unique()):
        word_df = wav_df[wav_df["word_id"] == w]
        print(f"word: {word_df['text'].iloc[0]}")
        word_boundaries = [word_df["word_start"].iloc[0], word_df["word_end"].iloc[0]]

        print(f"word_boundaries: {word_boundaries}")
        start_frame = get_frame_num(word_boundaries[0], TARGET_SR, FRAME_MS)
        end_frame = get_frame_num(word_boundaries[1], TARGET_SR, FRAME_MS)

        print(f"frames: {start_frame, end_frame}")
        cut_encoding = encoding[start_frame:end_frame]
        cut_flags = flags[start_frame:end_frame]

        print(f"cut_encoding_shape: {cut_encoding.shape}")
        print(f"cut_flags_shape: {len(cut_flags)}")
        print(f"cut_flags: {cut_flags}")

        # Bound by the *sliced* flags: the two slices can differ in length
        # when `flags` and `encoding` do not have the same number of frames.
        num_frames = min(cut_encoding.shape[0], len(cut_flags))
        keep = torch.tensor(
            [bool(flag) for flag in cut_flags[:num_frames]],
            dtype=torch.bool,
            device=cut_encoding.device,
        )
        # Boolean-mask indexing always yields a tensor (possibly with zero
        # rows), so emptiness can be tested with len() instead of `!= []`.
        clean_encoding = cut_encoding[:num_frames][keep]

        print(f"clean_encoding_shape: {len(clean_encoding)}")
        codes = []
        if len(clean_encoding) > 0:
            codes, _ = segment(clean_encoding.numpy(), kmeans.cluster_centers_, GAMMA)
        print(f"codes: {codes}\n")


path: data/dev-clean/174/50561/174-50561-0005.flac
encoding_shape: torch.Size([285, 768])
flags_shape: 285
word: nan
word_boundaries: [np.float64(0.0), np.float64(0.51)]
frames: (0, 25)
cut_encoding_shape: torch.Size([25, 768])
cut_flags_shape: 25
cut_flags: [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False]
clean_encoding_shape: 0
codes: []

word: lady
word_boundaries: [np.float64(0.51), np.float64(1.06)]
frames: (25, 53)
cut_encoding_shape: torch.Size([28, 768])
cut_flags_shape: 28
cut_flags: [False, False, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]
clean_encoding_shape: 25
codes: [17 95 48 20 82 40 57 91 16]

word: nan
word_boundaries: [np.float64(1.06), np.float64(1.12)]
frames: (53, 56)
cut_encoding_shape: torch.Size([3, 768])
cut_flags_shape: 3


In [37]:
def cal_dist_per_pair(pair):
    """
    Compute the length-normalised edit distance between two feature sequences.

    Args:
        pair (dict): Mapping whose first entry is read:
            - Key: Tuple (i, j) with the indices of the two sequences.
            - Value: Tuple (feature_i, feature_j) with the sequences themselves.

    Returns:
        tuple: (i, j, distance). The distance is the edit distance divided by
        the length of the longer sequence, or 1.0 when either sequence is empty.

    The longer length, both sequences and the resulting distance are printed
    on every call.
    """
    (id_1, id_2), (feature_1, feature_2) = list(pair.items())[0]

    lengths = [len(feature_1), len(feature_2)]
    max_length = np.max(lengths)
    print(f"max len {max_length}")

    # An empty sequence on either side is scored as maximally distant.
    if np.min(lengths) == 0:
        print(f"{id_1, id_2}\n{feature_1}\n{feature_2}\nDistance: {1.0}")
        return (id_1, id_2, 1.0)

    # Both sequences are non-empty here, so max_length is at least 1.
    dist = editdistance.eval(feature_1, feature_2) / max_length
    print(f"{id_1, id_2}\n{feature_1}\n{feature_2}\nDistance: {dist}")
    return (id_1, id_2, dist)

In [40]:
# Load every file listed in `paths` with np.load, recording its stem next to
# the loaded array and printing each array with its position.
# NOTE(review): the recorded output of this cell lists 18 arrays, whereas the
# cell that defines `paths` in this notebook holds a single .flac file, so
# `paths` presumably pointed at saved feature files when this was run --
# confirm before re-running from a fresh kernel.
filenames, features = [], []
# for feature in tqdm(paths, desc="Process Filenames"):
for idx, feature_path in enumerate(paths):
    filenames.append(feature_path.stem)
    loaded = np.load(feature_path)
    features.append(loaded)
    print(idx, loaded)


# chunk_limit = 10
# num_pairs = sample_size * (sample_size - 1) // 2
# num_chunks = (num_pairs + chunk_limit - 1) // chunk_limit

# row_indices = []
# col_indices = []
# values = []

# for chunk in tqdm(
#     get_batch_of_paths(sample_size, chunk_limit=chunk_limit),
#     total=num_chunks,
#     desc="Processing Chunks",
#     unit="chunk",
# ):
#     chunk_units = [{(i, j): (features[i], features[j])} for i, j in chunk]
#     chunk_results = []
#     for pair in chunk_units:
#         # print(pair)
#         chunk_results.append(cal_dist_per_pair(pair))

#     for i, j, dist in chunk_results:
#         row_indices.append(i)
#         col_indices.append(j)
#         values.append(dist)

#     dist_sparse = sp.coo_matrix(
#         (values, (row_indices, col_indices)), shape=(sample_size, sample_size)
#     )

0 [40 13 28 62]
1 [13 84 77 41]
2 [16]
3 [ 1 79 43 70 27 82]
4 [16 90 25 43 70 51]
5 [31 66 18 24]
6 [13 43 51 20 68]
7 [16]
8 [32 74 45 74]
9 [16 17 95 48 20 82 40 57 91  5 16]
10 [65 28 86 10]
11 [94 85 68 13]
12 [17 95 48 20 82 40 57 91 16]
13 [17 95 48 20 82 40 91  5 32]
14 [13  8 43 71 52 26 55 77 41]
15 [20 13  8 85 75 73 92 30 59]
16 [17 95 48 20 82 40 91  5 32]
17 []
