In [4]:
!gdown --id 1KeqBq6koNgMR5oKGKJUkNg9429d0hkXh
!unzip numbers2_train.zip
!wget https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/asr/conf/quartznet/quartznet_15x5.yaml

Downloading...
From: https://drive.google.com/uc?id=1KeqBq6koNgMR5oKGKJUkNg9429d0hkXh
To: /workspace/test/numbers2_train.zip
100%|██████████████████████████████████████| 1.01G/1.01G [00:16<00:00, 61.2MB/s]


In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.utils import shuffle
import warnings
warnings.filterwarnings("ignore")
from pandarallel import pandarallel
from tqdm import tqdm
from num2words import num2words
import json
pandarallel.initialize(progress_bar=True)

from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift,Gain
import soundfile as sf

from data_generator import filters,augment

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [None]:
def write_manifest(df,path):
    with open(path,'w',encoding='utf-8') as f:
        for _,row in df.iterrows():
            d = row.to_dict()
            d['text'] = d['transcription']
            d['audio_filepath'] = d['path']
            d['duration'] = 10
            datum = json.dumps(d)
            f.write(f"{datum}\n")

### Preprocess and augment data

In [2]:
data = shuffle(pd.read_csv('numbers2/train.csv').dropna())
data['path'] = 'numbers2/' + data['path']
train = data[:-300]
test = data[-300:]

In [9]:
new_paths = train['path'].parallel_apply(lambda x : augment(x,5,filters))
new_paths = np.array(new_paths.values.tolist()).reshape(-1)
new_paths = pd.DataFrame(new_paths,columns=['path'])

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=169), Label(value='0 / 169'))), HB…

In [33]:
new_paths['core_name'] = new_paths.path.apply(lambda x : x.split('/')[2].split('_')[0])
train['core_name'] = train.path.apply(lambda x : x.split('/')[2].split('.')[0])

In [37]:
new_paths = pd.merge(new_paths,train,on='core_name')
new_paths = new_paths[['path_x','gender','number']]
new_paths.columns = ['path','gender','number']

train = train.drop('core_name',1)

In [41]:
train = shuffle(pd.concat([new_paths,train],))

In [65]:
train['number'] = train['number'].astype(int)
test['number'] = test['number'].astype(int)

In [67]:
train['transcription'] = train['number'].apply(lambda x : num2words(x,lang='ru'))
test['transcription'] = test['number'].apply(lambda x : num2words(x,lang='ru'))


In [69]:
train.to_csv('numbers2/augmented_train.csv',index=False)
test.to_csv('numbers2/test.csv',index=False)

In [41]:
test = pd.read_csv('numbers2/test.csv')
train = pd.read_csv('numbers2/augmented_train.csv')

In [47]:
write_manifest(test,'numbers2/test_manifest.json')
write_manifest(train,'numbers2/train_manifest.json')

### Add more synth data

In [34]:
train = pd.read_csv('numbers2/augmented_train.csv')
train2 = pd.read_csv('numbers2/gen.csv')

train = pd.concat([train[['path','transcription']],train2])

In [36]:
train = shuffle(train)

In [40]:
write_manifest(train,'numbers2/train_manifest_big.json')

### Generate alphabet 

In [63]:
#first try with word-level alphabet
vocab = []
for i in list(range(1000)) + [1000,2000,5000,1000000,2000000,5000000,10000000,20000000,50000000]:
    vocab.extend(num2words(i,lang='ru').split())

vocab = list(set(vocab)) + [' ']
print(len(vocab))

46


In [4]:
#finish try with char-level alphabet
chars = 'йцукенгшщзхъфывапролджэёячсмитьбю'
vocab = [c for c in chars] + [' ']
len(chars)

33

### Generate config

In [5]:
from yaml import load, dump
try:
    from yaml import CLoader as Loader, CDumper as Dumper
except ImportError:
    from yaml import Loader, Dumper

In [39]:
model_config = load(open('quartznet_15x5.yaml','r'), Loader=Loader)

In [40]:
sr = 24000



model_config['model']['sample_rate'] = sr
model_config['model']['labels'] = vocab

model_config['model']['train_ds']['manifest_filepath'] = 'numbers2/train_manifest_big.json'
model_config['model']['train_ds']['labels'] = vocab
model_config['model']['train_ds']['normalize_transcripts'] = False
model_config['model']['train_ds']['sample_rate'] = sr
model_config['model']['train_ds']['batch_size'] = 1024

model_config['model']['validation_ds']['manifest_filepath'] = 'numbers2/test_manifest.json'
model_config['model']['validation_ds']['labels'] = vocab
model_config['model']['validation_ds']['sample_rate'] = sr

model_config['model']['validation_ds']['normalize_transcripts'] = False

model_config['model']['test_ds']['manifest_filepath'] = 'numbers2/test_manifest.json'
model_config['model']['test_ds']['labels'] = vocab
model_config['model']['test_ds']['sample_rate'] = sr

model_config['model']['test_ds']['normalize_transcripts'] = False

model_config['model']['decoder']['vocabulary'] = vocab
model_config['model']['decoder']['num_classes'] = len(vocab)

model_config['trainer']['strategy'] = None #disable ddp while training with one gpu
model_config['trainer']['max_epochs'] = 100

model_config['name'] = 'QuartzNet5x2'

In [43]:
dump(model_config,open('config.yaml','w'), Dumper=Dumper)

In [9]:
import pytorch_lightning as pl
from omegaconf import OmegaConf
import torch
from nemo.collections.asr.models import EncDecCTCModel
from nemo.core.config import hydra_runner
from nemo.utils import logging
from nemo.utils.exp_manager import exp_manager
from IPython.display import Audio

### Train model

In [None]:
model_config = OmegaConf.structured(model_config)
trainer = pl.Trainer(**model_config['trainer'])
exp_manager(trainer, model_config.get("exp_manager", None))
asr_model = EncDecCTCModel(cfg=model_config['model'], trainer=trainer)
trainer.fit(asr_model)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
`Trainer(val_check_interval=1.0)` was configured so validation will run at the end of the training epoch..


[NeMo I 2022-10-18 17:50:22 exp_manager:315] Experiments will be logged at /workspace/test/nemo_experiments/QuartzNet15x5/2022-10-18_17-50-22
[NeMo I 2022-10-18 17:50:23 exp_manager:704] TensorboardLogger has been set up


[NeMo W 2022-10-18 17:50:23 exp_manager:971] The checkpoint callback was told to monitor a validation value and trainer's max_steps was set to -1. Please ensure that max_steps will run for at least 1 epochs to ensure that checkpointing will not error out.


[NeMo I 2022-10-18 17:50:34 collections:194] Dataset loaded with 124926 files totalling 347.02 hours
[NeMo I 2022-10-18 17:50:34 collections:195] 0 files were filtered totalling 0.00 hours
[NeMo I 2022-10-18 17:50:34 collections:194] Dataset loaded with 300 files totalling 0.83 hours
[NeMo I 2022-10-18 17:50:34 collections:195] 0 files were filtered totalling 0.00 hours
[NeMo I 2022-10-18 17:50:34 collections:194] Dataset loaded with 300 files totalling 0.83 hours
[NeMo I 2022-10-18 17:50:34 collections:195] 0 files were filtered totalling 0.00 hours
[NeMo I 2022-10-18 17:50:34 features:225] PADDING: 16


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


[NeMo I 2022-10-18 17:50:36 modelPT:602] Optimizer config = Novograd (
    Parameter Group 0
        amsgrad: False
        betas: [0.8, 0.5]
        eps: 1e-08
        grad_averaging: False
        lr: 0.01
        weight_decay: 0.001
    )
[NeMo I 2022-10-18 17:50:36 lr_scheduler:910] Scheduler "<nemo.core.optim.lr_scheduler.CosineAnnealing object at 0x7fc945746310>" 
    will be used during training (effective maximum steps = 12200) - 
    Parameters : 
    (warmup_steps: null
    warmup_ratio: null
    min_lr: 0.0
    last_epoch: -1
    max_steps: 12200
    )



  | Name              | Type                              | Params
------------------------------------------------------------------------
0 | preprocessor      | AudioToMelSpectrogramPreprocessor | 0     
1 | encoder           | ConvASREncoder                    | 199 K 
2 | decoder           | ConvASRDecoder                    | 4.5 K 
3 | loss              | CTCLoss                           | 0     
4 | spec_augmentation | SpectrogramAugmentation           | 0     
5 | _wer              | WER                               | 0     
------------------------------------------------------------------------
204 K     Trainable params
0         Non-trainable params
204 K     Total params
0.816     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Epoch 0, global step 122: 'val_wer' reached 0.93857 (best 0.93857), saving model to '/workspace/test/nemo_experiments/QuartzNet15x5/2022-10-18_17-50-22/checkpoints/QuartzNet15x5--val_wer=0.9386-epoch=0.ckpt' as top 3


Validation: 0it [00:00, ?it/s]

Epoch 1, global step 244: 'val_wer' reached 0.86806 (best 0.86806), saving model to '/workspace/test/nemo_experiments/QuartzNet15x5/2022-10-18_17-50-22/checkpoints/QuartzNet15x5--val_wer=0.8681-epoch=1.ckpt' as top 3


Validation: 0it [00:00, ?it/s]

Epoch 2, global step 366: 'val_wer' reached 0.73130 (best 0.73130), saving model to '/workspace/test/nemo_experiments/QuartzNet15x5/2022-10-18_17-50-22/checkpoints/QuartzNet15x5--val_wer=0.7313-epoch=2.ckpt' as top 3


Validation: 0it [00:00, ?it/s]

Epoch 3, global step 488: 'val_wer' reached 0.55395 (best 0.55395), saving model to '/workspace/test/nemo_experiments/QuartzNet15x5/2022-10-18_17-50-22/checkpoints/QuartzNet15x5--val_wer=0.5540-epoch=3.ckpt' as top 3


Validation: 0it [00:00, ?it/s]

Epoch 4, global step 610: 'val_wer' reached 0.46581 (best 0.46581), saving model to '/workspace/test/nemo_experiments/QuartzNet15x5/2022-10-18_17-50-22/checkpoints/QuartzNet15x5--val_wer=0.4658-epoch=4.ckpt' as top 3


Validation: 0it [00:00, ?it/s]

Epoch 5, global step 732: 'val_wer' reached 0.38996 (best 0.38996), saving model to '/workspace/test/nemo_experiments/QuartzNet15x5/2022-10-18_17-50-22/checkpoints/QuartzNet15x5--val_wer=0.3900-epoch=5.ckpt' as top 3


Validation: 0it [00:00, ?it/s]

Epoch 6, global step 854: 'val_wer' reached 0.34402 (best 0.34402), saving model to '/workspace/test/nemo_experiments/QuartzNet15x5/2022-10-18_17-50-22/checkpoints/QuartzNet15x5--val_wer=0.3440-epoch=6.ckpt' as top 3


Validation: 0it [00:00, ?it/s]

Epoch 7, global step 976: 'val_wer' reached 0.32639 (best 0.32639), saving model to '/workspace/test/nemo_experiments/QuartzNet15x5/2022-10-18_17-50-22/checkpoints/QuartzNet15x5--val_wer=0.3264-epoch=7.ckpt' as top 3


Validation: 0it [00:00, ?it/s]

Epoch 8, global step 1098: 'val_wer' reached 0.29915 (best 0.29915), saving model to '/workspace/test/nemo_experiments/QuartzNet15x5/2022-10-18_17-50-22/checkpoints/QuartzNet15x5--val_wer=0.2991-epoch=8.ckpt' as top 3


Validation: 0it [00:00, ?it/s]

Epoch 9, global step 1220: 'val_wer' reached 0.22222 (best 0.22222), saving model to '/workspace/test/nemo_experiments/QuartzNet15x5/2022-10-18_17-50-22/checkpoints/QuartzNet15x5--val_wer=0.2222-epoch=9.ckpt' as top 3


Validation: 0it [00:00, ?it/s]

Epoch 10, global step 1342: 'val_wer' reached 0.22970 (best 0.22222), saving model to '/workspace/test/nemo_experiments/QuartzNet15x5/2022-10-18_17-50-22/checkpoints/QuartzNet15x5--val_wer=0.2297-epoch=10.ckpt' as top 3


Validation: 0it [00:00, ?it/s]

Epoch 11, global step 1464: 'val_wer' reached 0.22329 (best 0.22222), saving model to '/workspace/test/nemo_experiments/QuartzNet15x5/2022-10-18_17-50-22/checkpoints/QuartzNet15x5--val_wer=0.2233-epoch=11.ckpt' as top 3


Validation: 0it [00:00, ?it/s]

Epoch 12, global step 1586: 'val_wer' reached 0.17842 (best 0.17842), saving model to '/workspace/test/nemo_experiments/QuartzNet15x5/2022-10-18_17-50-22/checkpoints/QuartzNet15x5--val_wer=0.1784-epoch=12.ckpt' as top 3


Validation: 0it [00:00, ?it/s]

Epoch 13, global step 1708: 'val_wer' reached 0.16880 (best 0.16880), saving model to '/workspace/test/nemo_experiments/QuartzNet15x5/2022-10-18_17-50-22/checkpoints/QuartzNet15x5--val_wer=0.1688-epoch=13.ckpt' as top 3


Validation: 0it [00:00, ?it/s]

Epoch 14, global step 1830: 'val_wer' reached 0.14316 (best 0.14316), saving model to '/workspace/test/nemo_experiments/QuartzNet15x5/2022-10-18_17-50-22/checkpoints/QuartzNet15x5--val_wer=0.1432-epoch=14.ckpt' as top 3


Validation: 0it [00:00, ?it/s]

Epoch 15, global step 1952: 'val_wer' reached 0.14957 (best 0.14316), saving model to '/workspace/test/nemo_experiments/QuartzNet15x5/2022-10-18_17-50-22/checkpoints/QuartzNet15x5--val_wer=0.1496-epoch=15.ckpt' as top 3


Validation: 0it [00:00, ?it/s]

Epoch 16, global step 2074: 'val_wer' reached 0.09455 (best 0.09455), saving model to '/workspace/test/nemo_experiments/QuartzNet15x5/2022-10-18_17-50-22/checkpoints/QuartzNet15x5--val_wer=0.0946-epoch=16.ckpt' as top 3


Validation: 0it [00:00, ?it/s]

Epoch 17, global step 2196: 'val_wer' reached 0.10524 (best 0.09455), saving model to '/workspace/test/nemo_experiments/QuartzNet15x5/2022-10-18_17-50-22/checkpoints/QuartzNet15x5--val_wer=0.1052-epoch=17.ckpt' as top 3


Validation: 0it [00:00, ?it/s]

Epoch 18, global step 2318: 'val_wer' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 19, global step 2440: 'val_wer' reached 0.10417 (best 0.09455), saving model to '/workspace/test/nemo_experiments/QuartzNet15x5/2022-10-18_17-50-22/checkpoints/QuartzNet15x5--val_wer=0.1042-epoch=19.ckpt' as top 3


Validation: 0it [00:00, ?it/s]

Epoch 20, global step 2562: 'val_wer' reached 0.09348 (best 0.09348), saving model to '/workspace/test/nemo_experiments/QuartzNet15x5/2022-10-18_17-50-22/checkpoints/QuartzNet15x5--val_wer=0.0935-epoch=20.ckpt' as top 3


Validation: 0it [00:00, ?it/s]

Epoch 21, global step 2684: 'val_wer' reached 0.09241 (best 0.09241), saving model to '/workspace/test/nemo_experiments/QuartzNet15x5/2022-10-18_17-50-22/checkpoints/QuartzNet15x5--val_wer=0.0924-epoch=21.ckpt' as top 3


Validation: 0it [00:00, ?it/s]

Epoch 22, global step 2806: 'val_wer' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 23, global step 2928: 'val_wer' reached 0.08120 (best 0.08120), saving model to '/workspace/test/nemo_experiments/QuartzNet15x5/2022-10-18_17-50-22/checkpoints/QuartzNet15x5--val_wer=0.0812-epoch=23.ckpt' as top 3


Validation: 0it [00:00, ?it/s]

Epoch 24, global step 3050: 'val_wer' reached 0.08654 (best 0.08120), saving model to '/workspace/test/nemo_experiments/QuartzNet15x5/2022-10-18_17-50-22/checkpoints/QuartzNet15x5--val_wer=0.0865-epoch=24.ckpt' as top 3


In [20]:
trainer = pl.Trainer(**model_config['trainer'])
exp_manager(trainer, model_config.get("exp_manager", None))
asr_model = EncDecCTCModel(cfg=model_config['model'], trainer=trainer)

#word_level
# cpt_path = 'nemo_experiments/QuartzNet15x5/2022-10-18_15-37-19/checkpoints/QuartzNet15x5--val_wer=0.0021-epoch=33.ckpt'

#char_level
cpt_path = 'nemo_experiments/QuartzNet15x5/2022-10-18_17-50-22/checkpoints/QuartzNet15x5--val_wer=0.0085-epoch=77.ckpt'

state_dict = torch.load(cpt_path)['state_dict']
asr_model.load_state_dict(state_dict)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
`Trainer(val_check_interval=1.0)` was configured so validation will run at the end of the training epoch..


[NeMo I 2022-10-20 09:54:01 exp_manager:315] Experiments will be logged at /workspace/test/nemo_experiments/QuartzNet15x5/2022-10-18_17-50-22
[NeMo I 2022-10-20 09:54:01 exp_manager:704] TensorboardLogger has been set up


[NeMo W 2022-10-20 09:54:01 exp_manager:971] The checkpoint callback was told to monitor a validation value and trainer's max_steps was set to -1. Please ensure that max_steps will run for at least 1 epochs to ensure that checkpointing will not error out.


[NeMo I 2022-10-20 09:54:13 collections:194] Dataset loaded with 124926 files totalling 347.02 hours
[NeMo I 2022-10-20 09:54:13 collections:195] 0 files were filtered totalling 0.00 hours
[NeMo I 2022-10-20 09:54:13 collections:194] Dataset loaded with 300 files totalling 0.83 hours
[NeMo I 2022-10-20 09:54:13 collections:195] 0 files were filtered totalling 0.00 hours
[NeMo I 2022-10-20 09:54:13 collections:194] Dataset loaded with 300 files totalling 0.83 hours
[NeMo I 2022-10-20 09:54:13 collections:195] 0 files were filtered totalling 0.00 hours
[NeMo I 2022-10-20 09:54:13 features:225] PADDING: 16


<All keys matched successfully>

### Check result

In [21]:
test_folder = 'numbers2/test-example/'
files = [test_folder + f for f in os.listdir(test_folder)]
asr_model.transcribe(files)

[NeMo W 2022-10-20 09:54:17 audio_to_text_dataset:56] `sample_rate` is explicitly provided to the data loader, and is different from the `sample_rate` provided at the model level config.
    If this is incorrect, please set the dataloader's `sample_rate` to None.


Transcribing:   0%|          | 0/2 [00:00<?, ?it/s]

['трсть одна тысяча семьдесят етрь',
 'двести стьдесят три тысячи шестьсот шсемьдесят пть',
 'семьдесят две тысячи цать осемь',
 'отадевять тысяч семьсот тридцать восемь',
 'двадеать шость тысяч двести честь',
 'мьсот девяносто семь тысяч восемьсот шестьдесят темь']

### Some conclusion.
Results doesnt looks perfect, but like 90 persent of this mistakes can be fixed with char-probability rescoring, n-gram model. Or vanila euristic which can find closest neigbor from word-level alphabet with levinshtain distance, and replace.