In [1]:
from pathlib import Path
from utils.features import DataSet

name = "librispeech-dev-clean"
in_dir = Path("data/dev-clean")
align_dir = Path("data/alignments/dev-clean")
feat_dir = Path("features")
audio_ext = ".flac" 

dataset = DataSet(
    name, in_dir, align_dir, feat_dir, audio_ext 
)

In [2]:
import random

audio_paths = list(dataset.in_dir.rglob(f"**/*{dataset.audio_ext}"))
sample_feature_paths = list(Path(dataset.feat_dir / "dusted_units/0.2/").rglob("**/*.npy"))

sample = False
sample_size = None

if sample: 
    # sample_audio_paths = random.sample(audio_paths, sample_size)
    sample_feature_paths = random.sample(sample_feature_paths, sample_size)
    # sample_feature_paths = feature_paths[0:sample_size]

sample_size = len(sample_feature_paths)
file_map = {}
for i, feature in enumerate(sample_feature_paths):
    file_map[i] = feature

print(sample_size)

63137


In [3]:
def pair_generator(num_paths):
    for i in range(num_paths):
        for j in range(i + 1, num_paths):
            yield i, j


def get_batch_of_paths(num_paths, chunk_limit=100):
    """Generate sequential batches of (i, j) path pairs."""
    pairs = pair_generator(num_paths) 
    chunk = [] 

    for idx, (i, j) in enumerate(pairs, 1):
        chunk.append((i, j))

        if idx % chunk_limit == 0:
            yield chunk 
            chunk = [] 

    if chunk:  
        yield chunk

In [4]:
import numpy as np
from utils.features import WordUnit
import pandas as pd
import time

def load_word(word_path, word_id, align_df):

    """Loads a word unit with metadata and encoding information."""
    # Load encoding units
    units = np.load(word_path)
    
    # Extract filename and word index
    parts = word_path.stem.split("_")
    filename, index = parts[0], int(parts[1])

    # Filter align_df once using .query()
    word_df = align_df.query("filename == @filename and word_id == @index")
    
    if word_df.empty:
        return None  # Early exit if word not found

    # Extract the actual word text efficiently
    true_word = word_df["text"].iat[0] if isinstance(word_df["text"].iat[0], str) else "_"

    # Create WordUnit object
    word = WordUnit(
        id=word_id,
        filename=filename,
        index=index,
        true_word=true_word,
        boundaries=[word_df["word_start"].iat[0], word_df["word_end"].iat[0]],
    )

    # Update encoding with loaded units
    word.update_encoding(units)
    return word


In [5]:
def process_key(key, file_map, words_cache, keys, align_df):
    """Helper function to process a single key."""
    if key in words_cache:
        return words_cache[key]  # Retrieve from cache
    
    path = file_map.get(key)
    if path is None:
        print(f"Warning: No file found for key '{key}' in file_map")
        return None  # Skip processing for missing files

    word = load_word(path, key, align_df)  # Load word
    words_cache[key] = word  # Cache it
    keys.add(key)
    return word

def load_units_for_chunk(dataset, chunk, align_df=None, file_map=None):
    """Optimized function for loading units for a chunk with parallel loading using joblib."""

    # Use Parquet if available for faster reading
    if align_df is None:
        csv_path = dataset.align_dir / "alignments.csv"
        align_df = pd.read_csv(csv_path)
    
    words_cache = {}  # Cache for fast word retrieval
    keys = set()
    chunk_words = []

    # Process words in parallel using joblib
    for pair in chunk:
        pair_keys = tuple(pair.keys())

        words= []
        for key in pair_keys:
            words.append(process_key(key, file_map, words_cache, keys, align_df))
            
        chunk_words.append(tuple(words))

    return chunk_words

In [6]:
import torch 
import editdistance

def calculate_distance_per_chunk(chunk_words):
    """Process sub-chunk and return computed distances with indices"""
    results = []

    for pair in chunk_words:
        encoding_i = torch.from_numpy(pair[0].clean_encoding)
        encoding_j = torch.from_numpy(pair[1].clean_encoding)

        shapes = torch.tensor([encoding_i.size()[0], encoding_j.size()[0]])
        length = torch.max(shapes)

        dist = 0
        if length > 0:
            dist =  editdistance.eval(encoding_i, encoding_j) / length

        results.append((pair[0].id, pair[1].id, dist))

    return results

In [None]:
from tqdm import tqdm
from joblib import Parallel, delayed

num_features = len(sample_feature_paths)
dist_mat = torch.zeros((sample_size, sample_size), dtype=torch.float32)
align_df = pd.read_csv(dataset.align_dir / "alignments.csv")

chunk_limit = 1000
num_pairs = num_features * (num_features - 1) // 2
num_chunks = (num_pairs + chunk_limit - 1) // chunk_limit 

start_time = time.perf_counter()
for chunk in tqdm(get_batch_of_paths(num_features, chunk_limit=100), total=num_chunks, desc="Processing chunks"):
    chunk_paths = [{i: sample_feature_paths[i], j: sample_feature_paths[j]} for i, j in chunk]

    chunk_words = load_units_for_chunk(
        dataset, chunk_paths, align_df=align_df, file_map=file_map
    )
    futures = Parallel(n_jobs=2)(delayed(calculate_distance_per_chunk)(chunk_words))
    
    for future in futures:
        i,j, distance = future.result()
        dist_mat[i, j] = distance 

end_time = time.perf_counter()

print(f"Total time: {end_time - start_time}s")

Processing chunks:   0%|          | 0/1993109 [00:01<?, ?it/s]


TypeError: cannot unpack non-iterable function object