## SpeechBrain

In [12]:
import speechbrain as sb 
from speechbrain.pretrained import EncoderDecoderASR 
from speechbrain.dataio.dataio import read_audio 
from IPython.display import Audio 
from pathlib import Path
import torch

In [8]:
# Files are laid out as <audio_dir>/<user>/<utt>/<user>-<utt>-NNNN.flac.
user = "84"
utt = "121123"
audio_dir = "../data/LibriSpeech/dev-clean"

# Build the path of clip 0000 and echo it so a wrong location is easy to spot.
audio_path = str(Path(audio_dir) / user / utt / f"{user}-{utt}-0000.flac")
print(audio_path)

# Load the clip, drop singleton dimensions, and render an inline player.
# NOTE(review): the playback rate is hard-coded to 16000 Hz — assumes the
# recording is sampled at 16 kHz; confirm if other data is used.
signal = read_audio(audio_path).squeeze()
Audio(signal, rate=16000)

../data/LibriSpeech/dev-clean/84/121123/84-121123-0000.flac


In [9]:
# Instantiate the pretrained encoder-decoder ASR model. `source` names the
# pretrained model to fetch; `savedir` is the local directory handed to
# SpeechBrain for its files (the download progress shows up in the output).
asr_model = EncoderDecoderASR.from_hparams(
    source="speechbrain/asr-crdnn-rnnlm-librispeech",
    savedir="pretrained_models/asr-crdnn-rnnlm-librispeech",
)

Downloading: 100%|██████████| 4.49k/4.49k [00:00<00:00, 914kB/s]
Downloading: 100%|██████████| 1.41k/1.41k [00:00<00:00, 874kB/s]
Downloading: 100%|██████████| 480M/480M [00:46<00:00, 10.3MB/s] 
Downloading: 100%|██████████| 212M/212M [00:20<00:00, 10.1MB/s] 
Downloading: 100%|██████████| 253k/253k [00:00<00:00, 989kB/s] 


In [26]:
# Point `audio_path` at clip 0002 of the same <user>/<utt> directory and let
# the model transcribe it straight from disk. The transcription is the cell's
# last expression, so it is displayed as the cell output.
audio_path = str(Path(audio_dir) / user / utt / f"{user}-{utt}-0002.flac")
asr_model.transcribe_file(audio_path)

'AT THIS MOMENT THE WHOLE SOUL OF THE OLD MAN SEEMED CENTERED IN HIS EYES WHICH BECAME BLOODSHOT THE VEINS OF THE THROAT SWELLED HIS CHEEKS AND TEMPLES BECAME PURPLE AS THOUGH HE WAS STRUCK WITH EPILEPSY NOTHING WAS WANTING TO COMPLETE THIS BUT THE UTTERANCE OF A CRY'

## Explore possible input formats
- Can the model take a `(bsz, time, 1)` `torch.Tensor` as input? -> Yes.  
- Can it take MFCC features as input? -> No.

In [13]:
import torchaudio 
from speechbrain.dataio.preprocess import AudioNormalizer

def load_audio(path):
    """Load an audio file and pass it through a default `AudioNormalizer`.

    The file is read with `torchaudio.load(..., channels_first=False)` and the
    resulting signal, together with the sample rate reported by torchaudio, is
    handed to a freshly constructed `AudioNormalizer`.

    Args:
        path: Location of the audio file, forwarded unchanged to
            `torchaudio.load`.

    Returns:
        Whatever `AudioNormalizer()` returns for the loaded signal.
        NOTE(review): presumably a mono waveform resampled to the
        normalizer's default rate — confirm against the SpeechBrain docs.
    """
    waveform, sample_rate = torchaudio.load(path, channels_first=False)
    normalizer = AudioNormalizer()
    return normalizer(waveform, sample_rate)

In [27]:
# Manual equivalent of transcribe_file(), factored out step by step.
# Based on: https://github.com/speechbrain/speechbrain/blob/develop/speechbrain/pretrained/interfaces.py

# Prepend a batch axis so the single clip becomes a batch of one.
batch = load_audio(audio_path)[None, ...]
print(batch.shape)

# One relative length per batch item; 1.0 for the only clip in the batch.
rel_length = torch.ones(1)

predicted_words, predicted_tokens = asr_model.transcribe_batch(
    batch,
    rel_length,
)

torch.Size([1, 219040])


In [28]:
# Show the decoded text first, then the token ids, one per line.
print(predicted_words, predicted_tokens, sep="\n")

['AT THIS MOMENT THE WHOLE SOUL OF THE OLD MAN SEEMED CENTERED IN HIS EYES WHICH BECAME BLOODSHOT THE VEINS OF THE THROAT SWELLED HIS CHEEKS AND TEMPLES BECAME PURPLE AS THOUGH HE WAS STRUCK WITH EPILEPSY NOTHING WAS WANTING TO COMPLETE THIS BUT THE UTTERANCE OF A CRY']
[[58, 77, 431, 2, 512, 56, 149, 8, 2, 251, 137, 301, 4, 64, 99, 25, 4, 13, 29, 328, 80, 22, 672, 854, 1, 60, 27, 7, 2, 3, 120, 40, 1, 8, 2, 3, 63, 96, 115, 378, 143, 278, 29, 3, 293, 11, 68, 1, 6, 100, 376, 26, 44, 1, 22, 672, 582, 26, 44, 38, 361, 16, 19, 3, 863, 33, 125, 26, 94, 11, 26, 1, 17, 356, 19, 394, 12, 9, 924, 77, 48, 2, 918, 231, 8, 5, 64, 131]]
