In [9]:
from pathlib import Path
from utils.features import DataSet

# Dataset configuration: corpus name plus the directories handed to DataSet.
name = "librispeech-dev-clean"
in_dir = Path("data/dev-clean")  # root directory searched for audio files
align_dir = Path("data/alignments/dev-clean")  # directory that holds alignments.csv
feat_dir = Path("features")  # root directory under which features are written
audio_ext = ".flac"

dataset = DataSet(name, in_dir, align_dir, feat_dir, audio_ext)

# rglob() is already recursive, so the pattern needs no "**/" prefix.
# Sorting makes the processing order independent of filesystem enumeration
# order, so repeated runs visit the files in the same sequence.
wav_paths = sorted(dataset.in_dir.rglob(f"*{dataset.audio_ext}"))

In [None]:
import torch
import torchaudio
from tqdm import tqdm
from webrtcvad import Vad
import struct
import numpy as np
import torch.nn.functional as F
import pandas as pd


# Largest value of a signed 16-bit integer; used to scale float audio to PCM.
INT16_MAX = (1 << 15) - 1

# Audio is handled at 16 kHz and analysed in windows of 320 samples,
# i.e. 320 / 16000 s = 20 ms per window.
sample_rate = 16_000
hop_length = 320


In [3]:
def mark_sil(vad, wav):
    """Run voice-activity detection over fixed-size windows of a waveform.

    The waveform is zero-padded by 40 samples on each side, truncated to a
    whole number of ``hop_length``-sample windows, converted to 16-bit PCM and
    passed to ``vad.is_speech`` one window at a time.

    Args:
        vad: Object exposing ``is_speech(pcm_bytes, sample_rate)``, e.g. a
            ``webrtcvad.Vad`` instance.
        wav: Float waveform tensor whose last dimension is time. Assumed to be
            single-channel with amplitudes nominally in [-1, 1] -- TODO confirm
            against the caller.

    Returns:
        list: One ``vad.is_speech`` result per window, in time order. A truthy
        entry marks a window detected as *speech*; silence is the falsy entries.

    Raises:
        ValueError: If ``wav`` does not reduce to a single channel.
    """
    wav = F.pad(wav, (40, 40))

    # Truncate along the time axis (last dim) so that inputs with extra leading
    # singleton dimensions are cut to a whole number of windows as well.
    num_samples = wav.size(-1) - (wav.size(-1) % hop_length)
    wav = wav[..., :num_samples]

    samples = wav.squeeze().numpy()
    if samples.ndim > 1:
        raise ValueError(
            f"mark_sil expects single-channel audio, got shape {tuple(wav.shape)}"
        )

    # Clip before casting: samples slightly outside [-1, 1] (e.g. resampling
    # overshoot) would otherwise wrap around when converted to int16.
    # tobytes() emits native-endian int16, the same layout struct's "h" produces.
    pcm = (
        np.clip(np.round(samples * INT16_MAX), -INT16_MAX - 1, INT16_MAX)
        .astype(np.int16)
        .tobytes()
    )

    bytes_per_window = hop_length * 2  # two bytes per int16 sample
    return [
        vad.is_speech(pcm[start : start + bytes_per_window], sample_rate)
        for start in range(0, len(pcm), bytes_per_window)
    ]

In [4]:
# Both entry points come from the same torch.hub repository and take the same
# keyword arguments, so those are spelled out once.
dusted_repo = "bshall/dusted:main"
hub_options = {"language": "english", "trust_repo": True}

# Each entry point returns a pair: a model object and the function that uses it.
kmeans, segment = torch.hub.load(dusted_repo, "kmeans", **hub_options)
hubert, encode = torch.hub.load(dusted_repo, "hubert", **hub_options)

Using cache found in /home/danel/.cache/torch/hub/bshall_dusted_main
Using cache found in /home/danel/.cache/torch/hub/bshall_dusted_main
Using cache found in /home/danel/.cache/torch/hub/bshall_hubert_main


In [6]:
def get_dusted_units(
    dataset,
    wav,
    hubert,
    encode,
    segment,
    kmeans,
    wav_path,
    gamma=0.2,
    layer=7,
    save=False,
):
    """Encode a waveform and segment the encoding into unit codes.

    Args:
        dataset: Object providing ``feat_dir`` and ``in_dir`` path attributes;
            only read when ``save`` is true.
        wav: Waveform passed unchanged to ``encode``.
        hubert: Encoder model passed as the first argument of ``encode``.
        encode: Callable invoked as ``encode(hubert, wav, layer)``.
        segment: Callable invoked as
            ``segment(encoding, kmeans.cluster_centers_, gamma)``; must return
            a pair whose first element is the unit codes.
        kmeans: Object exposing a ``cluster_centers_`` attribute.
        wav_path: Path of the source audio file, located under
            ``dataset.in_dir``; only read when ``save`` is true.
        gamma: Segmentation parameter forwarded to ``segment``. It is also used
            (as ``str(gamma)``) for the output sub-directory name.
        layer: Encoder layer forwarded to ``encode``.
        save: When true, write the codes to
            ``<feat_dir>/dusted_units/<gamma>/<relative wav path>.npy``.

    Returns:
        The unit codes returned by ``segment`` (the second element of its
        result is discarded).

    Raises:
        ValueError: From ``Path.relative_to`` if ``save`` is true and
            ``wav_path`` is not located under ``dataset.in_dir``.
    """
    encoding = encode(hubert, wav, layer)
    codes, _ = segment(encoding, kmeans.cluster_centers_, gamma)

    if save:
        # Mirror the input directory layout under the feature directory.
        out_path = (
            dataset.feat_dir
            / "dusted_units"
            / str(gamma)
            / wav_path.relative_to(dataset.in_dir).with_suffix(".npy")
        )
        out_path.parent.mkdir(parents=True, exist_ok=True)
        np.save(out_path, codes)

    # Return the codes so the function is also useful when save=False.
    return codes


In [None]:
# Voice-activity detector and alignment table. Neither is used by the loop
# in this cell.
vad = Vad()

align_df = pd.read_csv(dataset.align_dir / "alignments.csv")

# Compute and save the unit codes for every audio file that was discovered.
for wav_path in tqdm(wav_paths, desc="Getting units"):
    wav, sr = torchaudio.load(str(wav_path))
    # Resample to the notebook-wide rate instead of repeating the literal 16000.
    wav = torchaudio.functional.resample(wav, sr, sample_rate)
    wav = wav.unsqueeze(0)  # add a leading dimension before encoding
    get_dusted_units(
        dataset=dataset,
        wav=wav,
        hubert=hubert,
        encode=encode,
        segment=segment,
        kmeans=kmeans,
        wav_path=wav_path,
        save=True,
    )