In [None]:
BRANCH = 'v1.0.0'
import nemo.collections.asr as nemo_asr

try:
    from plotly import graph_objects as go
except ModuleNotFoundError:
    !pip install plotly
    from plotly import graph_objects as go

In [None]:
import numpy as np
# Import audio processing library
import librosa
# We'll use this to listen to audio
from IPython.display import Audio, display

In [None]:
# Import NeMo and its ASR, NLP and TTS collections
import nemo
# Import Speech Recognition collection
import nemo.collections.asr as nemo_asr
# Import Natural Language Processing collection
import nemo.collections.nlp as nemo_nlp
# Import Speech Synthesis collection
import nemo.collections.tts as nemo_tts
# We'll use this to listen to audio
import IPython

In [None]:
# Here is an example of all CTC-based models:
# list the pretrained checkpoints NeMo offers for EncDecCTCModel. Being the last
# expression in the cell, the returned value is rendered as the cell output.
nemo_asr.models.EncDecCTCModel.list_available_models()
# More ASR models are available - see: nemo_asr.models.ASRModel.list_available_models()


In [None]:
# Speech Recognition model - pretrained QuartzNet checkpoint "stt_es_quartznet15x5".
# The "es" in the checkpoint name denotes Spanish (not Russian).
# NOTE(review): the training data is presumably the Spanish part of MCV 6.0 -
# confirm against the model card.
# .cuda() moves the model to the GPU, so this cell needs a CUDA-capable device.
quartznet = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name="stt_es_quartznet15x5").cuda()

In [None]:
# Audio clip we are going to transcribe.
# NOTE(review): nothing in this cell downloads the file - the relative path
# means it must already be present in the working directory.
# IMPORTANT: The audio must be mono with 16Khz sampling rate
Audio_sample = 'SPANISH_PODCAST1126.wav'

# Decode the waveform with librosa; sr=None keeps the file's own sampling rate
# instead of resampling to librosa's default.
signal, sample_rate = librosa.load(path=Audio_sample, sr=None)

# Render an audio player for the file as the cell output (press play to listen)
Audio(Audio_sample)

In [None]:
# Run speech recognition on the sample. transcribe() is given a list of audio
# file paths; the result is indexed per input file later on (es_text[0]).
es_text = quartznet.transcribe([Audio_sample])
# Show the raw result returned by the model
print(es_text)

In [None]:
logits = quartznet.transcribe([Audio_sample], logprobs=True)[0]
probs = softmax(logits)

In [None]:
# softmax implementation in NumPy
def softmax(logits):
    e = np.exp(logits - np.max(logits))
    return e / e.sum(axis=-1).reshape([logits.shape[0], 1])

In [None]:
# Every timestep at the output of the model spans 20 ms
time_stride = 0.02

# Row labels for the plot: the model's alphabet followed by the 'blank' symbol.
# The first entry is replaced by the word 'space' so it is readable on the axis.
labels = [*quartznet.decoder.vocabulary, 'blank']
labels[0] = 'space'

# Heatmap of the probability of every character at every timestep:
# characters on the y axis, time (in seconds, via dx) on the x axis.
fig_probs = go.Figure(
    data=go.Heatmap(
        z=probs.T,
        y=labels,
        dx=time_stride,
        name='Probs',
        colorscale=[[0, 'rgb(30,62,62)'], [1, 'rgb(30,255,30)']],
        hovertemplate='Time: %{x:.2f} s<br>Character: %{y}<br>Probability: %{z:.2f}<extra></extra>',
    ),
    layout=dict(
        height=300,
        title='Character Probabilities',
        xaxis=dict(title='Time, s'),
        yaxis=dict(title='Characters'),
        margin=dict(l=0, r=0, t=40, b=0, pad=0),
    ),
)
fig_probs.show()

In [None]:
# get timestamps for space symbols
#
# Scan the per-timestep predictions and collect every run of frames decoded as
# a space. Each entry of `spaces` is [first_frame, last_frame] (inclusive frame
# indices, not seconds). A run starts on a frame whose most likely symbol is the
# space and ends just before the first frame that is neither space nor blank,
# so blank frames following a space are counted as part of the run.

# Index of the space symbol (the first label is the space).
space_idx = 0
# Index of the blank symbol: 'blank' is appended after the vocabulary when the
# labels are built, so it is the last class. Deriving it from the array keeps
# this correct for alphabets of any size instead of assuming 28 characters.
blank_idx = probs.shape[-1] - 1
num_frames = probs.shape[0]

spaces = []

state = ''
idx_state = 0

# The recording may start inside a space run (guarded for an empty output).
if num_frames > 0 and np.argmax(probs[0]) == space_idx:
    state = 'space'

for idx in range(1, num_frames):
    current_char_idx = np.argmax(probs[idx])
    # A real character closes the currently open space run.
    if state == 'space' and current_char_idx not in (space_idx, blank_idx):
        spaces.append([idx_state, idx - 1])
        state = ''
    # Outside a run, a space frame opens a new one.
    if state == '' and current_char_idx == space_idx:
        state = 'space'
        idx_state = idx

# Close a run that is still open when the audio ends.
if state == 'space':
    spaces.append([idx_state, num_frames - 1])

In [None]:
spaces

In [None]:
# calibration offset for timestamps: 180 ms
offset = -0.18

# split the transcript into words
words = es_text[0].split()

# cut words
#
# The audio is cut at the midpoint of every detected space run; the piece before
# each cut is shown together with the corresponding word of the transcript.
pos_prev = 0
# Start at -1 so that `j + 1` below is valid even when no spaces were found.
j = -1
for j, spot in enumerate(spaces):
    pos_end = offset + (spot[0] + spot[1]) / 2 * time_stride
    # The negative offset can push an early cut below zero (a negative slice
    # index would wrap around to the end of the signal) or before the previous
    # cut, so keep the cut points non-decreasing.
    pos_end = max(pos_end, pos_prev)
    # There may be more space runs than words (e.g. leading/trailing silence
    # decoded as space), so only show a word when one exists.
    if j < len(words):
        display(words[j])
    segment = signal[int(pos_prev * sample_rate):int(pos_end * sample_rate)]
    # Skip empty pieces: there is nothing to play.
    if len(segment) > 0:
        display(Audio(segment, rate=sample_rate))
    pos_prev = pos_end

# Whatever follows the last cut belongs to the next word, if there is one.
if j + 1 < len(words):
    display(words[j + 1])
segment = signal[int(pos_prev * sample_rate):]
if len(segment) > 0:
    display(Audio(segment, rate=sample_rate))