In [1]:
# Essential module imports

import praatio
from src.Charsiu import charsiu_chain_attention_aligner, charsiu_forced_aligner, charsiu_attention_aligner
import os
import sys
import librosa
import soundfile as sf
import torch
import torchaudio
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Set up path to the data
# We are expecting the following format of the files:

# .WAV: [Passage Name] [Passage Corpus Number]_[Speaker] [Speaker Number].wav
# .txt: [Passage Name] [Passage Corpus Number].txt
def _passage_key(filename):
    """Return the part of a file name before the first '_' (extension removed).

    Under the naming convention above this is
    '[Passage Name] [Passage Corpus Number]', which an audio file and its
    transcript have in common.
    """
    stem = os.path.splitext(filename)[0]
    return stem.split('_')[0]


path = "../sample_data/original/"
audios = []
transcripts = []
for f in os.listdir(path):
    # splitext looks at the last extension only and returns '' when there is
    # none, so entries without a dot (e.g. sub-directories) are skipped instead
    # of raising IndexError, and dotted names are classified correctly.
    extension = os.path.splitext(f)[1].lower()
    if extension == '.wav':
        audios.append(f)
    elif extension == '.txt':
        transcripts.append(f)

# os.listdir returns entries in arbitrary order. Sort both lists by the shared
# passage prefix (file name as tie-breaker) so that positional pairing of
# audios with transcripts is deterministic and consistent between the lists.
audios.sort(key=lambda name: (_passage_key(name), name))
transcripts.sort(key=lambda name: (_passage_key(name), name))

print(audios)
print(transcripts)

['Rainbow Passage 1_Speaker 1.wav', 'Rainbow Passage 2_Speaker 1.wav', 'Rainbow Passage 3_Speaker 1.wav', 'Rainbow Passage 4_Speaker 1.wav', 'Rainbow Passage 5_Speaker 1.wav']
['Rainbow Passage 1.txt', 'Rainbow Passage 2.txt', 'Rainbow Passage 3.txt', 'Rainbow Passage 4.txt', 'Rainbow Passage 5.txt']


In [3]:
# Aligner built from the pretrained 'charsiu/en_w2v2_fs_10ms' checkpoint.
charsiu = charsiu_attention_aligner('charsiu/en_w2v2_fs_10ms')

# Phoneme labels that are treated as fricatives when alignments are filtered.
fricatives = {'F', 'Z', 'V', 'S'}

# Audio files and transcripts are paired by position. zip() would silently
# drop whatever is left over in the longer list, so fail loudly instead.
if len(audios) != len(transcripts):
    raise ValueError(
        f"Found {len(audios)} audio files but {len(transcripts)} transcripts "
        f"in {path!r}; they must pair up one-to-one."
    )

# One alignment per audio file, in the same order as `audios`.
alignments = []
for voice, transcript in zip(audios, transcripts):
    # Use a context manager so the transcript file handle is closed promptly.
    with open(path + transcript) as transcript_file:
        script = transcript_file.read()
    alignment = charsiu.align(path + voice, script)
    alignments.append(alignment)


BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


In [4]:
# For every alignment keep only the segments whose last field (the phoneme
# label) is one of the fricatives; the result has one list per alignment.
fricative_timestamps = [
    [segment for segment in aligned if segment[-1] in fricatives]
    for aligned in alignments
]

In [5]:
# Report CUDA support and, when a GPU is present, make it the default device.
cuda_available = torch.cuda.is_available()
print(f"Is CUDA supported by this system?       {cuda_available}")
print(f"CUDA version: {torch.version.cuda}")

if cuda_available:
    # Storing ID of current CUDA device
    cuda_id = torch.cuda.current_device()
    print(f"ID of current CUDA device:       {cuda_id}")
    print(f"Name of current CUDA device:       {torch.cuda.get_device_name(cuda_id)}")
    torch.set_default_device('cuda')
else:
    # The device queries above and set_default_device('cuda') would raise on a
    # machine without CUDA, so leave the default device untouched.
    cuda_id = None
    print("No CUDA device available; keeping the default device.")

Is CUDA supported by this system?       True
CUDA version: 12.4
ID of current CUDA device:       0
Name of current CUDA device:       NVIDIA GeForce RTX 4090 Laptop GPU


In [8]:
# Save one waveform plot per fricative segment of every recording under
# ../sample_data/segments/<recording name>/.

# Figures are only written to disk, so switch interactive mode off once
# instead of on every iteration.
plt.ioff()

for alignment, audio in zip(fricative_timestamps, audios):

    # splitext removes only the final extension, so dots inside the file name
    # do not truncate the recording name.
    audio_name = os.path.splitext(audio)[0]

    waveform, sample_rate = torchaudio.load(path + audio)
    waveform = waveform.cpu().numpy()
    num_channels, num_frames = waveform.shape
    target_dir = f"../sample_data/segments/{audio_name}/"
    # makedirs also creates the missing parent "segments" directory and is a
    # no-op when the target directory already exists.
    os.makedirs(target_dir, exist_ok=True)

    for ind, fricative in enumerate(alignment):

        start, end = fricative[0], fricative[1]
        phoneme = fricative[-1]

        # Convert the segment bounds to integer sample indices once and derive
        # both the slice and the time axis from them, so the two always have
        # the same length (independent float/int rounding could differ by one
        # sample, and slicing clips at the end of the recording).
        first = min(max(int(start * sample_rate), 0), num_frames)
        last = max(first, min(int(end * sample_rate), num_frames))
        subwave = waveform[:, first:last]
        time_axis = [sample / sample_rate for sample in range(first, last)]

        # squeeze=False always yields a 2-D array of axes, so mono and
        # multi-channel recordings are handled by the same code path.
        figure, axes = plt.subplots(num_channels, 1, squeeze=False)
        for channel in range(num_channels):
            axis = axes[channel, 0]
            axis.plot(time_axis, subwave[channel], linewidth=1)
            axis.grid(True)

        figure.suptitle(f"{audio_name}_{phoneme}_{ind}_Waveform")
        figure.savefig(target_dir + f"{audio_name}_{phoneme}_{ind}_Waveform.png")
        # Close explicitly so figures do not accumulate in memory.
        plt.close(figure)