In [1]:
import os
# Point the CUDA-related environment variables at a single toolkit install.
# NOTE(review): the prefix below is machine-specific — adjust it for your host.
CUDA_HOME = "/usr/local/cuda-12.1.0"
CUDA_TARGET_DIR = f"{CUDA_HOME}/targets/x86_64-linux"


def _prepend_env(name: str, value: str, sep: str = ":") -> None:
    """Prepend ``value`` to the environment variable ``name``.

    An unset or empty variable is replaced by ``value`` alone, so no
    ``KeyError`` is raised and no dangling separator is left behind.
    """
    current = os.environ.get(name)
    os.environ[name] = f"{value}{sep}{current}" if current else value


os.environ["CUDA_HOME"] = CUDA_HOME
_prepend_env("PATH", f"{CUDA_HOME}/bin")

# Prepended one at a time, so the last entry ends up first in the search order.
for lib_dir in ("lib64", "lib", "extras/CUPTI/lib64"):
    _prepend_env("LD_LIBRARY_PATH", f"{CUDA_HOME}/{lib_dir}")

# Several spellings of the same "toolkit root" setting, all given the same prefix
# (presumably read by different build tools — confirm which ones are needed).
for root_var in (
    "CUDAToolkit_ROOT_DIR",
    "CUDAToolkit_ROOT",
    "CUDA_TOOLKIT_ROOT_DIR",
    "CUDA_TOOLKIT_ROOT",
    "CUDA_BIN_PATH",
    "CUDA_PATH",
):
    os.environ[root_var] = CUDA_HOME

os.environ["CUDA_INC_PATH"] = CUDA_TARGET_DIR
os.environ["CUDAToolkit_TARGET_DIR"] = CUDA_TARGET_DIR

# Compiler flags are whitespace-separated, unlike PATH-style lists.
_prepend_env("CFLAGS", f"-I{CUDA_TARGET_DIR}/include", sep=" ")

In [2]:
import torch
import torchaudio
import torchaudio.functional as F
from pathlib import Path

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Collect every .wav that sits at least one directory below `audio_dir`,
# in sorted path order so later per-file lists line up deterministically.
audio_dir = Path("../outputs/openbible_swahili/TIT/")
audios = list(audio_dir.rglob("*/*.wav"))
audios.sort()

In [3]:
import re
import string
import unicodedata
from unidecode import unidecode
from num2words import num2words

def preprocess_verse(text: str) -> str:
    """Normalize a verse transcript into lowercase, punctuation-free text.

    Steps, in order: transliterate with ``unidecode``, apply NFKC
    normalization, lowercase, delete every character in
    ``string.punctuation``, spell out digit runs with ``num2words``
    (``lang="sw"``), collapse whitespace runs to a single space, and trim
    both ends.

    Args:
        text: Raw verse text.

    Returns:
        The normalized text, without leading or trailing whitespace.
    """
    text = unidecode(text)
    text = unicodedata.normalize('NFKC', text)
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    # NOTE(review): number words are inserted after punctuation removal —
    # confirm num2words never emits punctuation for lang="sw".
    text = re.sub(r"\d+", lambda x: num2words(int(x.group(0)), lang="sw"), text)
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [4]:
# Each recording has a sibling transcript: same path with a ".txt" suffix.
# read_text() opens and closes the file itself, so no handles are leaked.
# NOTE(review): assumes the transcripts are UTF-8 encoded — confirm.
transcripts = [audio_path.with_suffix(".txt").read_text(encoding="utf-8") for audio_path in audios]
verses = [preprocess_verse(v) for v in transcripts]
# One list of whitespace-separated tokens per verse.
words = [verse.split() for verse in verses]

In [5]:
# Everything in this cell is derived from the MMS_FA pipeline bundle.
bundle = torchaudio.pipelines.MMS_FA

# Token vocabulary, requested with star=None to match with_star=False on the model.
DICTIONARY = bundle.get_dict(star=None)
LABELS = bundle.get_labels(star=None)

# Build the model and move it onto the selected device.
model = bundle.get_model(with_star=False)
model = model.to(device)

In [6]:
# torchaudio.load yields a (waveform, sample_rate) pair for each file.
waveforms = [torchaudio.load(audio_path) for audio_path in audios]

# Resample to the bundle's rate, then average over the leading (channel) axis so
# every item is 1-D. For mono audio this equals the old squeeze(); for
# multi-channel audio squeeze() would have left the tensor 2-D and broken the
# length computation and padding downstream.
# NOTE(review): assumes torchaudio.load returns channels-first tensors (its default).
resampled_waveforms = [
    torchaudio.functional.resample(waveform, sr, bundle.sample_rate).mean(dim=0)
    for (waveform, sr) in waveforms
]

In [7]:
from torch.nn.utils.rnn import pad_sequence

batch_size = 16


def _chunked(items, size):
    """Split ``items`` into consecutive slices of at most ``size`` elements."""
    return [items[start : start + size] for start in range(0, len(items), size)]


# Unpadded length of each waveform, kept so padding can be ignored later.
waveform_lengths = [waveform.shape[0] for waveform in resampled_waveforms]

# Zero-pad every group to its own longest member: (batch_size, max_batch_frame_length)
waveforms_batches = [
    pad_sequence(group, batch_first=True, padding_value=0)
    for group in _chunked(resampled_waveforms, batch_size)
]
waveform_lengths_batches = [
    torch.tensor(group, dtype=torch.int64)
    for group in _chunked(waveform_lengths, batch_size)
]
words_batches = _chunked(words, batch_size)

In [8]:
def align(emission, tokens):
    """Force-align a token sequence against an emission tensor.

    Args:
        emission: Tensor handed unchanged to ``F.forced_align``; callers in
            this notebook pass a leading batch dimension of size 1.
        tokens: Flat list of integer token ids; wrapped into a batch of one.

    Returns:
        ``(alignments, scores)`` with the batch dimension removed.
        ``scores`` has ``exp()`` applied to the values returned by
        ``F.forced_align``.
    """
    # Create the targets on the emission's own device instead of relying on the
    # notebook-level `device`, so both tensors are always co-located.
    targets = torch.tensor([tokens], dtype=torch.int32, device=emission.device)
    alignments, scores = F.forced_align(emission, targets, blank=0)

    # Drop the batch dimension and convert scores back to probabilities.
    return alignments[0], scores[0].exp()

def compute_alignments(emission, transcript, dictionary):
    """Return the per-frame alignment scores for ``transcript``.

    Args:
        emission: Emission tensor forwarded to ``align``.
        transcript: Iterable of words; each word is iterated character by character.
        dictionary: Mapping from a character to its token id. A character
            missing from the mapping raises ``KeyError``.

    Returns:
        The second value returned by ``align`` (the scores); the alignment
        path itself is discarded.
    """
    token_ids = []
    for word in transcript:
        # Words are concatenated with no separator token in between.
        token_ids.extend(dictionary[char] for char in word)
    _path, frame_scores = align(emission, token_ids)
    return frame_scores

In [9]:
import numpy as np
from tqdm.auto import tqdm
# One array per batch: length-normalized (aligned - greedy) log-probability.
probability_diffs = []

batches = zip(waveforms_batches, waveform_lengths_batches, words_batches)
# `total` lets tqdm show a real progress fraction; a bare zip has no length.
for waveform_batch, waveform_lengths_batch, words_batch in tqdm(batches, total=len(waveforms_batches)):
    with torch.inference_mode():
        emission, lengths = model(waveform_batch.to(device), waveform_lengths_batch.to(device))  # (batch_size, max_batch_frame_length, num_labels)

    assert len(DICTIONARY) == emission.shape[2]

    greedy_log_probs, aligned_log_probs = [], []

    # The loop variable is `verse_words`, not `words`, so the notebook-level
    # `words` list built earlier is not overwritten when this cell runs.
    for i, (length, verse_words) in enumerate(zip(lengths, words_batch)):
        # Keep only the valid frames of this utterance; restore a batch dim of 1.
        utterance_emission = emission[i, :length, :].unsqueeze(dim=0)  # (1, frame_length, num_labels)

        # Greedy score: sum over frames of the log of the highest label probability.
        prob = torch.softmax(utterance_emission, dim=-1)  # (1, frame_length, num_labels)
        greedy_prob = torch.max(prob, dim=-1).values  # (1, frame_length)
        greedy_log_probs.append(torch.sum(torch.log(greedy_prob)).item())

        # Aligned score: sum over frames of the log of the forced-alignment probabilities.
        aligned_prob = compute_alignments(utterance_emission, verse_words, DICTIONARY)
        aligned_log_probs.append(torch.sum(torch.log(aligned_prob)).item())

    # Normalize by frame count so utterances of different length are comparable.
    batch_diff = (np.array(aligned_log_probs) - np.array(greedy_log_probs)) / lengths.cpu().numpy()
    probability_diffs.append(batch_diff)

  from .autonotebook import tqdm as notebook_tqdm
3it [00:05,  1.73s/it]


In [10]:
import numpy as np

# Flatten the per-batch arrays into one 1-D array with one entry per utterance.
probability_diff = np.concatenate(probability_diffs)

In [11]:
# Display the length-normalized (aligned - greedy) log-probability of every utterance.
probability_diff

array([-1.40528992e-02, -1.89833464e-03, -4.03621128e-02, -1.36629125e-02,
       -5.36187735e-02, -6.03735537e-03, -1.01689745e-04, -8.08676382e-03,
       -6.20300734e-03, -6.25828451e-03, -5.41766485e-03, -1.59862852e-02,
       -5.63826928e-03, -1.07708556e-02, -7.76849784e-04, -1.30411468e-02,
       -6.76120129e-01, -1.92474574e-02, -6.72453406e-04, -6.07647047e-03,
       -2.00812953e-02, -1.85933865e-01, -1.22369718e-01, -2.13108201e-01,
        1.05963813e-08, -1.14707111e-01, -3.69093183e-01, -8.67848460e-02,
       -1.82920774e-02, -6.08356884e-02, -3.99380892e-02, -1.32169939e-02,
       -1.43288424e-02, -1.52013000e-03, -4.69207404e-02, -1.00355958e-02,
       -1.16333925e-02, -9.08226194e-03, -6.26507503e-03, -1.00488384e-02,
       -3.30925228e-03, -1.43456774e-02, -1.82007898e-03, -3.48143525e-02,
       -2.52966684e-01, -9.34417964e-03])

In [13]:
# Utterances whose normalized score is at or below this value are flagged.
# NOTE(review): -0.2 is carried over unchanged; confirm how it was chosen.
LOW_SCORE_THRESHOLD = -0.2

# Indices (into `probability_diff`) of the flagged utterances.
np.where(probability_diff <= LOW_SCORE_THRESHOLD)

(array([16, 23, 26, 44]),)