In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
%matplotlib notebook
import matplotlib.pylab as plt
import numpy as np
import pickle, os, warnings, sys
warnings.filterwarnings("ignore")
import logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [4]:
import soundfile as sf
import random
from tqdm.auto import tqdm
import librosa
import json

In [5]:
from Cfg import Cfg
C = Cfg('NIST', 8000, 'amharic') 

In [6]:
from RecordingCorpus import RecordingCorpus
from multiprocessing import Pool
from contextlib import closing
if __name__ == '__main__':
    with closing(Pool(16)) as pool:
        recordings = RecordingCorpus(C, pool)

from SplitCorpus import SplitCorpus
splits=SplitCorpus.transcript_split(C, recordings)

100%|██████████| 122/122 [00:03<00:00, 40.05it/s]


In [7]:
random.shuffle(splits.artifacts)
n_samples=len(splits.artifacts)

n_train = int(0.8*n_samples)
samples=splits.artifacts
train_samples=samples[0:n_train]
test_samples=samples[n_train:]

In [8]:
from ruamel.yaml import YAML
config_path = './NeMo/examples/asr/conf/config.yaml'
yaml = YAML(typ='safe')
with open(config_path) as f:
    params = yaml.load(f)

In [9]:
audio_split_dir=f'{C.build_dir}/audio_split'

!mkdir -p {audio_split_dir}

upsample_rate=params['sample_rate']
for (case, S) in [('train', train_samples), ('test', test_samples)]:
    manifest_fn=f'{C.build_dir}/{case}_manifest.json'
    with open(manifest_fn, 'w', encoding='utf-8') as f_manifest:
        for sample in tqdm(S):
            (_,root,(start,end))=sample.key
            audio = sample.source.value
            duration = sample.source.n_seconds
            transcript = sample.target.value
            audio_path=f'{audio_split_dir}/{root}_{start}_{end}.wav'
            sf.write(audio_path, audio, C.sample_rate)
            y3, sr3 = librosa.load(audio_path,sr=upsample_rate)
            sf.write(audio_path, y3, upsample_rate)
            metadata = {
                    "audio_filepath": audio_path,
                    "duration": duration,
                    "text": transcript
                }
            json.dump(metadata, f_manifest)
            f_manifest.write('\n')

In [10]:
train_manifest=f'{C.build_dir}/train_manifest.json'
test_manifest=f'{C.build_dir}/test_manifest.json'

In [11]:
from omegaconf import DictConfig
params['model']['train_ds']['manifest_filepath'] = train_manifest
params['model']['validation_ds']['manifest_filepath'] = test_manifest

In [12]:
import copy
new_opt = copy.deepcopy(params['model']['optim'])
new_opt['lr'] = 0.001

In [13]:
import nemo
import nemo.collections.asr as nemo_asr

[NeMo W 2020-10-13 10:38:43 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text.AudioToCharDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2020-10-13 10:38:43 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text.AudioToBPEDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2020-10-13 10:38:43 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text.AudioLabelDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2020-10-13 10:38:43 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text._TarredAudioToTextDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2020-10-13 10:38:43 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset'> is experimental,

In [14]:
quartznet = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name="QuartzNet15x5Base-En")

[NeMo I 2020-10-13 10:38:43 cloud:55] Found existing object /home/catskills/.cache/torch/NeMo/NeMo_1.0.0b1/QuartzNet15x5Base-En/00869f9c89b8393ca3de640e0c536bd2/QuartzNet15x5Base-En.nemo.
[NeMo I 2020-10-13 10:38:43 cloud:61] Re-using file from: /home/catskills/.cache/torch/NeMo/NeMo_1.0.0b1/QuartzNet15x5Base-En/00869f9c89b8393ca3de640e0c536bd2/QuartzNet15x5Base-En.nemo
[NeMo I 2020-10-13 10:38:43 common:394] Instantiating model from pre-trained checkpoint
[NeMo I 2020-10-13 10:38:44 features:241] PADDING: 16
[NeMo I 2020-10-13 10:38:44 features:258] STFT using torch
[NeMo I 2020-10-13 10:38:46 modelPT:237] Model EncDecCTCModel was successfully restored from /home/catskills/.cache/torch/NeMo/NeMo_1.0.0b1/QuartzNet15x5Base-En/00869f9c89b8393ca3de640e0c536bd2/QuartzNet15x5Base-En.nemo.


In [15]:
# Use the smaller learning rate we set before
quartznet.setup_optimization(optim_config=DictConfig(new_opt))

[NeMo I 2020-10-13 10:38:52 modelPT:572] Optimizer config = Novograd (
    Parameter Group 0
        amsgrad: False
        betas: [0.8, 0.5]
        eps: 1e-08
        grad_averaging: False
        lr: 0.001
        weight_decay: 0.001
    )


[NeMo W 2020-10-13 10:38:52 lr_scheduler:526] Neither `max_steps` nor `iters_per_batch` were provided to `optim.sched`, cannot compute effective `max_steps` !
    Scheduler will not be instantiated !


(Novograd (
 Parameter Group 0
     amsgrad: False
     betas: [0.8, 0.5]
     eps: 1e-08
     grad_averaging: False
     lr: 0.001
     weight_decay: 0.001
 ),
 None)

In [16]:
import pytorch_lightning as pl

vocabulary=list(sorted(set(''.join([x.target.value for x in samples]))))

quartznet.change_vocabulary(
    new_vocabulary=vocabulary
)

[NeMo I 2020-10-13 10:39:05 ctc_models:189] Changed decoder to output to [' ', 'ሀ', 'ሁ', 'ሂ', 'ሃ', 'ሄ', 'ህ', 'ሆ', 'ለ', 'ሉ', 'ሊ', 'ላ', 'ሌ', 'ል', 'ሎ', 'ሏ', 'ሐ', 'ሑ', 'ሒ', 'ሔ', 'ሕ', 'መ', 'ሙ', 'ሚ', 'ማ', 'ሜ', 'ም', 'ሞ', 'ሟ', 'ሠ', 'ሡ', 'ሣ', 'ሥ', 'ሦ', 'ረ', 'ሩ', 'ሪ', 'ራ', 'ሬ', 'ር', 'ሮ', 'ሯ', 'ሰ', 'ሱ', 'ሲ', 'ሳ', 'ሴ', 'ስ', 'ሶ', 'ሷ', 'ሸ', 'ሹ', 'ሺ', 'ሻ', 'ሼ', 'ሽ', 'ሾ', 'ሿ', 'ቀ', 'ቁ', 'ቂ', 'ቃ', 'ቄ', 'ቅ', 'ቆ', 'ቋ', 'በ', 'ቡ', 'ቢ', 'ባ', 'ቤ', 'ብ', 'ቦ', 'ቧ', 'ቨ', 'ቪ', 'ቫ', 'ቬ', 'ቭ', 'ተ', 'ቱ', 'ቲ', 'ታ', 'ቴ', 'ት', 'ቶ', 'ቷ', 'ቸ', 'ቹ', 'ቺ', 'ቻ', 'ቼ', 'ች', 'ቾ', 'ቿ', 'ኋ', 'ነ', 'ኑ', 'ኒ', 'ና', 'ኔ', 'ን', 'ኖ', 'ኗ', 'ኘ', 'ኙ', 'ኚ', 'ኛ', 'ኜ', 'ኝ', 'ኞ', 'አ', 'ኡ', 'ኢ', 'ኤ', 'እ', 'ኦ', 'ኧ', 'ከ', 'ኩ', 'ኪ', 'ካ', 'ኬ', 'ክ', 'ኮ', 'ኳ', 'ኸ', 'ኻ', 'ኼ', 'ኽ', 'ወ', 'ዉ', 'ዊ', 'ዋ', 'ዌ', 'ው', 'ዎ', 'ዐ', 'ዑ', 'ዓ', 'ዕ', 'ዘ', 'ዙ', 'ዚ', 'ዛ', 'ዜ', 'ዝ', 'ዞ', 'ዟ', 'ዠ', 'ዢ', 'ዣ', 'ዤ', 'ዥ', 'ዦ', 'የ', 'ዩ', 'ዪ', 'ያ', 'ዬ', 'ይ', 'ዮ', 'ደ', 'ዱ', 'ዲ', 'ዳ', 'ዴ', 'ድ', 'ዶ', 'ዷ', 'ጀ', 'ጁ', 'ጂ', 'ጃ', 'ጄ', 'ጅ', 'ጆ', 'ጇ', 'ገ', 'ጉ', 'ጊ', 'ጋ', 'ጌ', 'ግ', 'ጎ', '

In [17]:
# Point to the data we'll use for fine-tuning as the training set
quartznet.setup_training_data(train_data_config=params['model']['train_ds'])

# Point to the new validation data for fine-tuning
quartznet.setup_validation_data(val_data_config=params['model']['validation_ds'])

# And now we can create a PyTorch Lightning trainer and call `fit` again.
trainer = pl.Trainer(gpus=[0], max_epochs=2, amp_level='O1', precision=16)

[NeMo I 2020-10-13 10:39:07 collections:173] Dataset loaded with 8257 files totalling 8.37 hours
[NeMo I 2020-10-13 10:39:07 collections:174] 30 files were filtered totalling 0.17 hours
[NeMo I 2020-10-13 10:39:07 collections:173] Dataset loaded with 2072 files totalling 2.10 hours
[NeMo I 2020-10-13 10:39:07 collections:174] 0 files were filtered totalling 0.00 hours


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]
Using native 16bit precision.


In [26]:
import torch
torch.__version__

'1.6.0'

In [18]:
trainer.fit(quartznet)

[NeMo I 2020-10-13 10:39:11 modelPT:572] Optimizer config = Novograd (
    Parameter Group 0
        amsgrad: False
        betas: [0.8, 0.5]
        eps: 1e-08
        grad_averaging: False
        lr: 0.001
        weight_decay: 0.001
    )


[NeMo W 2020-10-13 10:39:11 lr_scheduler:526] Neither `max_steps` nor `iters_per_batch` were provided to `optim.sched`, cannot compute effective `max_steps` !
    Scheduler will not be instantiated !

  | Name              | Type                              | Params
------------------------------------------------------------------------
0 | preprocessor      | AudioToMelSpectrogramPreprocessor | 0     
1 | encoder           | ConvASREncoder                    | 18 M  
2 | spec_augmentation | SpectrogramAugmentation           | 0     
3 | _wer              | WER                               | 0     
4 | decoder           | ConvASRDecoder                    | 241 K 
5 | loss              | CTCLoss                           | 0     


HBox(children=(HTML(value='Validation sanity check'), FloatProgress(value=1.0, bar_style='info', layout=Layout…

[NeMo I 2020-10-13 10:39:12 wer:148] 
    
[NeMo I 2020-10-13 10:39:12 wer:149] reference:   
[NeMo I 2020-10-13 10:39:12 wer:150] decoded  :ቸፀጃፐቸሲጄንፁቸዮሦፉዤንቸኢቃጊኗዌንሡማቿሣቁጹፐቋኦዕዷፁፐቨቸዳቸዉኋዞፁቃጿሡሉዳኸሸቱሺፖሡዞ


HBox(children=(HTML(value='Training'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max…

[NeMo I 2020-10-13 10:39:12 wer:148] 
    
[NeMo I 2020-10-13 10:39:12 wer:149] reference:   
[NeMo I 2020-10-13 10:39:12 wer:150] decoded  :ቸኢቸኸቸፈጹቸፖፐዮክቸፐዞቸፉጤፐጊቸኗቱገቱጤቃዤአዊቋሡንዞጌፐኒዘቸሡቸአፐዞቸዉሜዮቋዮዞኤህዮዘቸ
[NeMo I 2020-10-13 10:39:13 wer:148] 
    
[NeMo I 2020-10-13 10:39:13 wer:149] reference:              
[NeMo I 2020-10-13 10:39:13 wer:150] decoded  :ቸሡጤኳቸሡሦሸሡንቸፃኢዌዞፁፉንዞኗሑዞፁኢቃቋኗኢሡጹመፐኼፃፐጄሸጤጾሸሦፖፐወዷንዞዉፐሡሸሡጓኻንሦዉችፐሡቸሦሡፐንፐሡጓጮንዞዉፐንፁጀሼሸጼሲቤኛናማኸጼዷኸፖቻሀሡቸዋፁኳጊፉፐዳቭሬዳጼፖቬፀዘቸዷቃንጼኼንኼዴቸሬፁፐዞወፉወሡፃጃጤቸኤአህቃቸጤሸሁፈፉህንገቸኸዞጤኩቱዉፐንቸዉሎዞዉፐዎዞሡቤፉሐቺቸሎሦጤቸኸፉዮቸፖሦዜፑላጎቃቸዉጄቋሷጿቸሦፉቦሣዝጃችኸኤኸጣሊኸኤካጤቸፁሺወ
[NeMo I 2020-10-13 10:39:14 wer:148] 
    
[NeMo I 2020-10-13 10:39:14 wer:149] reference:   
[NeMo I 2020-10-13 10:39:14 wer:150] decoded  :ቸሡፐሸዞሡፖገሡቸፖሦጡሦዮገሡቸጴዞቸቦችሡፉዞፁሾሦሡቸሡቸሡሁዞቸቼሡቸሡቋአቸቿሾኢፐሡ
[NeMo I 2020-10-13 10:39:14 wer:148] 
    
[NeMo I 2020-10-13 10:39:14 wer:149] reference:     
[NeMo I 2020-10-13 10:39:14 wer:150] decoded  :ቸሡቸፖጤንቸሲጴሡቸክቃሣፉሡኢሀዞሸዞፖጴዞጎጂሦዉቻዷዞቸጓንዷቸኗናፐሁዞንቸዉፐጴዞሡቃሦሡጄቋፁማንዉቫሸፁኼጊዉቺዎዞፁፉንቸትዊዞፁቃቸሦዉፐጤሡጮሸዞኗፐጤሁቸኗሡሸሡጿሦሀፖፐሟቸሡቸንፑፖቪሀ

Saving latest checkpoint..





1

In [None]:
train_manifest

In [23]:
import json
data=[]
with open(train_manifest, 'r', encoding='utf-8') as f:
    for line in f:
        data.append(json.loads(line))

In [24]:
data[0:3]

[{'audio_filepath': 'NIST/openasr20_amharic/build/audio_split/BABEL_OP3_307_15902_20140422_235151_outLine_2606600_2615480.wav',
  'duration': 1.11,
  'text': 'ጉርድ ሾላ ምነው'},
 {'audio_filepath': 'NIST/openasr20_amharic/build/audio_split/BABEL_OP3_307_25767_20140403_234644_outLine_3567640_3637240.wav',
  'duration': 8.7,
  'text': 'እንትን እንትኑን አይቼዋለሁ ሳምፕሉን አይቼዋለሁ ግን ሳምፕሉን አይቼዋለሁ በሳምፕሉ ጥራት ደረጃ ከሆነ ሌላውም ቲ ቲሸርት የሚመጣው ትንሽ እንትን አለበት'},
 {'audio_filepath': 'NIST/openasr20_amharic/build/audio_split/BABEL_OP3_307_14229_20140503_233516_inLine_691560_720360.wav',
  'duration': 3.6,
  'text': 'ሳልበላ ነው እምቅመው እንደት ነው ቢጣ ምናምን ትደውላለች'}]

In [21]:
vars(quartznet)

{'global_rank': 0,
 'world_size': 0,
 'training': True,
 '_parameters': OrderedDict(),
 '_buffers': OrderedDict(),
 '_non_persistent_buffers_set': set(),
 '_backward_hooks': OrderedDict(),
 '_forward_hooks': OrderedDict(),
 '_forward_pre_hooks': OrderedDict(),
 '_state_dict_hooks': OrderedDict(),
 '_load_state_dict_pre_hooks': OrderedDict(),
 '_modules': OrderedDict([('preprocessor',
               AudioToMelSpectrogramPreprocessor(
                 (featurizer): FilterbankFeatures()
               )),
              ('encoder',
               ConvASREncoder(
                 (encoder): Sequential(
                   (0): JasperBlock(
                     (mconv): ModuleList(
                       (0): MaskedConv1d(
                         (conv): Conv1d(64, 64, kernel_size=[33], stride=[2], padding=(16,), dilation=[1], groups=64, bias=False)
                       )
                       (1): MaskedConv1d(
                         (conv): Conv1d(64, 256, kernel_size=(1,), stride=(1,