### Installing and importing dependencies

In [3]:
!pip install wget
# !apt-get install sox libsndfile1 ffmpeg
!pip install matplotlib>=3.3.2

BRANCH = 'r1.21.0'
!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]

zsh:1: 3.3.2 not found
zsh:1: no matches found: git+https://github.com/NVIDIA/NeMo.git@r1.21.0#egg=nemo_toolkit[all]


In [2]:
import torch
from omegaconf import OmegaConf
import torchaudio

from nemo.collections.asr.models import EncDecCTCModel

from nemo.collections.asr.parts.preprocessing.features import FilterbankFeaturesTA as NeMoFilterbankFeaturesTA
from nemo.collections.asr.modules.audio_preprocessing import AudioToMelSpectrogramPreprocessor as NeMoAudioToMelSpectrogramPreprocessor

ModuleNotFoundError: No module named 'omegaconf'

### Downloading config, weights and audio example

In [None]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

# Loading weights, config and example wav for CTC-model
!wget https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/ctc_model_weights.ckpt
!wget https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/ctc_model_config.yaml
!wget https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/example.mp3


--2024-03-22 07:03:47--  https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/ctc_model_weights.ckpt
Resolving n-ws-q0bez.s3pd12.sbercloud.ru (n-ws-q0bez.s3pd12.sbercloud.ru)... 37.230.193.192
Connecting to n-ws-q0bez.s3pd12.sbercloud.ru (n-ws-q0bez.s3pd12.sbercloud.ru)|37.230.193.192|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 968535213 (924M) [application/octet-stream]
Saving to: ‘ctc_model_weights.ckpt’


2024-03-22 07:05:13 (10.9 MB/s) - ‘ctc_model_weights.ckpt’ saved [968535213/968535213]

--2024-03-22 07:05:14--  https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/ctc_model_config.yaml
Resolving n-ws-q0bez.s3pd12.sbercloud.ru (n-ws-q0bez.s3pd12.sbercloud.ru)... 37.230.193.192
Connecting to n-ws-q0bez.s3pd12.sbercloud.ru (n-ws-q0bez.s3pd12.sbercloud.ru)|37.230.193.192|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3129 (3.1K) [application/octet-stream]
Saving to: ‘ctc_model_config.yaml’


2024-03-22 07:05:14

### Adding modules for feature extraction

In [None]:
from typing import Optional, Tuple, Union
import random

class FilterbankFeaturesTA(NeMoFilterbankFeaturesTA):
    """Filterbank featurizer whose mel-spectrogram transform takes a mel scale.

    The parent constructor is run first; afterwards ``_mel_spec_extractor`` is
    assigned a ``torchaudio.transforms.MelSpectrogram`` configured from the
    instance attributes and the keyword arguments given here.
    NOTE(review): presumably this overrides an extractor already created by
    the parent class — confirm against the NeMo implementation.

    Args:
        mel_scale: Forwarded as ``mel_scale`` to ``MelSpectrogram``.
        wkwargs: Forwarded as ``wkwargs`` to ``MelSpectrogram``.
        **kwargs: Forwarded to the parent constructor, except ``window_size``
            and ``window_stride``, which are discarded. ``nfilt``, ``window``,
            ``mel_norm`` and ``n_fft`` are read by key and must be present;
            ``highfreq`` (default ``None``) and ``lowfreq`` (default ``0``)
            are optional.

    Raises:
        KeyError: If ``nfilt``, ``window``, ``mel_norm`` or ``n_fft`` is
            missing from ``kwargs``.
    """

    def __init__(self, mel_scale: str = 'htk', wkwargs=None, **kwargs):
        # These two options are not passed on to the parent constructor.
        for dropped_key in ('window_size', 'window_stride'):
            kwargs.pop(dropped_key, None)

        super().__init__(**kwargs)

        # Collect the transform options in one place; the values come from
        # attributes available on the instance after the parent constructor
        # and from the remaining kwargs.
        mel_options = dict(
            sample_rate=self._sample_rate,
            win_length=self.win_length,
            hop_length=self.hop_length,
            n_mels=kwargs['nfilt'],
            window_fn=self.torch_windows[kwargs['window']],
            mel_scale=mel_scale,
            norm=kwargs['mel_norm'],
            n_fft=kwargs['n_fft'],
            f_max=kwargs.get('highfreq', None),
            f_min=kwargs.get('lowfreq', 0),
            wkwargs=wkwargs,
        )
        self._mel_spec_extractor: torchaudio.transforms.MelSpectrogram = (
            torchaudio.transforms.MelSpectrogram(**mel_options)
        )

class AudioToMelSpectrogramPreprocessor(NeMoAudioToMelSpectrogramPreprocessor):
    """NeMo mel-spectrogram preprocessor that uses ``FilterbankFeaturesTA``.

    The parent constructor runs with the unmodified keyword arguments; then
    ``self.featurizer`` is replaced by a ``FilterbankFeaturesTA`` built from
    the same arguments plus ``mel_scale``.

    Args:
        mel_scale: Forwarded to ``FilterbankFeaturesTA``.
        **kwargs: Passed to the parent constructor as-is. Must contain
            ``features``, which is handed to the featurizer under the name
            ``nfilt``.

    Raises:
        KeyError: If ``features`` is missing from ``kwargs``.
    """

    def __init__(self, mel_scale: str = 'htk', **kwargs):
        super().__init__(**kwargs)
        # The featurizer reads the value under the key `nfilt`, while this
        # class receives it as `features`: rename the key before forwarding.
        kwargs['nfilt'] = kwargs.pop('features')
        self.featurizer = FilterbankFeaturesTA(mel_scale=mel_scale, **kwargs)

### Transcription example with CTC model

In [None]:
# Build the CTC model from the downloaded YAML config, then load the weights.
model = EncDecCTCModel.from_config_file('./ctc_model_config.yaml')
# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code; only load checkpoints obtained from a source you trust.
ckpt = torch.load('./ctc_model_weights.ckpt', map_location='cpu')
# NOTE(review): strict=False silently ignores missing/unexpected keys —
# inspect the value returned by load_state_dict if transcripts look wrong.
model.load_state_dict(ckpt, strict=False)
model.eval()
# Use the GPU when one is available; otherwise stay on the CPU instead of
# failing on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

[NeMo W 2024-03-20 15:10:42 audio_to_text_dataset:688] Could not load dataset as `manifest_filepath` was None. Provided config : {'batch_size': 10, 'trim_silence': False, 'max_duration': 25.0, 'min_duration': 0.1, 'shuffle': True, 'is_tarred': False, 'num_workers': 8, 'pin_memory': True, 'manifest_filepath': None, 'labels': [' ', 'а', 'б', 'в', 'г', 'д', 'е', 'ж', 'з', 'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п', 'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я'], 'sample_rate': 16000}
[NeMo W 2024-03-20 15:10:42 audio_to_text_dataset:688] Could not load dataset as `manifest_filepath` was None. Provided config : {'batch_size': 20, 'shuffle': False, 'num_workers': 4, 'min_duration': 0.1, 'pin_memory': True, 'manifest_filepath': None, 'labels': [' ', 'а', 'б', 'в', 'г', 'д', 'е', 'ж', 'з', 'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п', 'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я'], 'sample_rate': 16000}
[NeMo W 2024-03-20 15:10:42 audio_

[NeMo I 2024-03-20 15:10:42 features:289] PADDING: 0


In [None]:
# Transcribe the downloaded example recording; the call's result (a list of
# recognized texts — presumably one entry per input path) is shown as the
# cell output.
model.transcribe(['example.mp3'])

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]

['а и правда никакой']