## Train step 1: Bootstrap from pretrained model

In [None]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
%load_ext autoreload
%autoreload 2

In [None]:
from Cfg import Cfg
# Project configuration object for the 'build' (training) stage of the NIST Amharic data.
# NOTE(review): 16000 is presumably the audio sample rate in Hz — confirm against Cfg.
C = Cfg('NIST', 16000, 'amharic', 'build') 

In [None]:
from load_pretrained_amharic_model import load_pretrained_amharic_model
# Starting point for training: the project helper returns the model to fine-tune.
# NOTE(review): the meaning of the second argument (0) is not visible here —
# presumably a GPU/device index; confirm against the helper.
model = load_pretrained_amharic_model(C, 0)

In [None]:
import datetime
import os

import pytorch_lightning as pl

model_save_dir = 'save/nemo_amharic'


class ModelCheckpointAtEpochEnd(pl.callbacks.ModelCheckpoint):
    """ModelCheckpoint variant that also triggers the checkpoint hook at the end of each epoch."""

    def on_epoch_end(self, trainer, pl_module):
        """Store the current epoch in the callback metrics, then run the trainer's checkpoint hook."""
        trainer.callback_metrics['epoch'] = trainer.current_epoch
        trainer.checkpoint_callback.on_validation_end(trainer, pl_module)


# Launch time and process id make the checkpoint file names unique per run.
pid = os.getpid()
dt = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
checkpoint_prefix = f'{model_save_dir}/amharic_{dt}_{pid}'

# save_top_k=-1 with period=1: keep a checkpoint for every epoch.
checkpoint_callback = ModelCheckpointAtEpochEnd(
    filepath=checkpoint_prefix + '_{epoch:02d}',
    verbose=True,
    save_top_k=-1,
    save_weights_only=False,
    period=1,
)

# Mixed-precision (16-bit, AMP level O1) training on GPU 0 for up to 200 epochs.
trainer = pl.Trainer(
    gpus=[0],
    max_epochs=200,
    amp_level='O1',
    precision=16,
    checkpoint_callback=checkpoint_callback,
)

In [None]:
from ruamel.yaml import YAML
from omegaconf import DictConfig

# Load the model / dataset / optimiser settings from the YAML config.
# The file is opened as UTF-8 explicitly so that non-ASCII content decodes
# the same way regardless of the machine's locale.
config_path = 'amharic_16000.yaml'
yaml = YAML(typ='safe')
with open(config_path, encoding='utf-8') as f:
    params = yaml.load(f)

# Point the train/validation datasets at the manifests in the build directory.
train_manifest = f'{C.build_dir}/train_manifest.json'
test_manifest = f'{C.build_dir}/test_manifest.json'
params['model']['train_ds']['manifest_filepath'] = train_manifest
params['model']['validation_ds']['manifest_filepath'] = test_manifest

# Attach the trainer first, then build the data loaders and the optimiser
# from the (patched) config sections.
model.set_trainer(trainer)
model.setup_training_data(train_data_config=params['model']['train_ds'])
model.setup_validation_data(val_data_config=params['model']['validation_ds'])
model.setup_optimization(optim_config=DictConfig(params['model']['optim']))

In [None]:
from reshuffle_samples import reshuffle_samples
# Project helper, run once before training starts.
# NOTE(review): presumably regenerates/shuffles the sample manifests for C — confirm against the helper.
reshuffle_samples(C)

In [None]:
# Run training; data loaders and optimiser were attached to `model` in an earlier cell.
trainer.fit(model)

## DEV translation

In [1]:
from Cfg import Cfg
from glob import glob
from package_DEV import package_DEV
from load_pretrained_amharic_model import load_pretrained_amharic_model

# Version tag handed to Cfg for this DEV run.
version = '113'
C = Cfg('NIST', 16000, 'amharic', 'dev', version)
model = load_pretrained_amharic_model(C, 0)

# Transcribe every split segment in name-sorted order; `files` and
# `translations` are paired index by index in later cells.
wav_pattern = f'{C.audio_split_dir}/*.wav'
files = sorted(glob(wav_pattern))
translations = model.transcribe(paths2audio_files=files, batch_size=32)


[NeMo W 2020-10-16 09:29:57 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text.AudioToCharDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2020-10-16 09:29:57 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text.AudioToBPEDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2020-10-16 09:29:57 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text.AudioLabelDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2020-10-16 09:29:57 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text._TarredAudioToTextDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2020-10-16 09:29:57 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text.TarredAudioToCharDataset'> is experimental,

[NeMo I 2020-10-16 09:29:58 features:241] PADDING: 16
[NeMo I 2020-10-16 09:29:58 features:258] STFT using torch
loaded save/nemo_amharic/amharic_20201015_220606_440181_epoch=199.ckpt
[NeMo I 2020-10-16 09:30:00 collections:173] Dataset loaded with 9225 files totalling 256250.00 hours
[NeMo I 2020-10-16 09:30:00 collections:174] 0 files were filtered totalling 0.00 hours


In [2]:
# Sanity check: there should be exactly one transcription per input file.
len(files), len(translations)

(9225, 9225)

In [3]:
# coding: utf-8

import sys, os, tarfile
from glob import glob
from pathlib import Path
from datetime import datetime
import numpy as np
import pandas as pd

In [47]:
# Make NumPy raise on floating-point errors (e.g. division by zero) instead of warning;
# the value displayed below is the previous error-handling configuration.
np.seterr(all='raise')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [79]:
# One (initially empty) list of CTM rows per recording/channel.
# The key is the first seven '_'-separated fields of the file name with the
# extension removed, matching how the row-building loop derives its key.
# (The previous `fn.split(',')[0]` split on a comma, which only truncated
# paths that happened to contain one.)
ctms = {
    '_'.join(os.path.splitext(os.path.basename(fn))[0].split('_')[0:7]): []
    for fn in files
}

In [80]:
# Turn each segment's transcription into per-token CTM rows.
#
# A segment file name is parsed as '_'-separated fields: the first six form
# the file id, the seventh is the channel, and the last two are the segment's
# start and end offsets, which are divided by C.sample_rate to get seconds.
# The segment duration is shared out among its tokens in proportion to token
# length in characters.
#
# NOTE: rows are appended to the lists in `ctms`, so re-running this cell
# without rebuilding `ctms` first duplicates every row.
for fn, pred in zip(files, translations):
    pred = pred.strip()
    if len(pred) == 0:
        continue  # nothing was recognised in this segment
    key = os.path.basename(fn)[0:-4].split('_')  # drop the 4-character extension
    ctm = '_'.join(key[0:7])
    F = '_'.join(key[0:6])
    channel = key[6]
    seg_start = float(key[-2])
    seg_end = float(key[-1])
    tbeg = seg_start / C.sample_rate
    tdur = (seg_end - seg_start) / C.sample_rate
    chnl = '1' if channel == 'inLine' else '2'
    tokens = pred.split(' ')
    token_lengths = np.array([len(token) for token in tokens])
    token_weights = token_lengths / token_lengths.sum()
    durations = tdur * token_weights
    # A token starts where the previous one ends: cumulative end time minus
    # the token's own duration. (Subtracting only the first token's duration
    # from every end time, as before, pushed tokens past the segment end
    # whenever token lengths differed.)
    starts = tbeg + (np.cumsum(durations) - durations)
    for token, tstart, dt in zip(tokens, starts, durations):
        # Skip empty tokens (from repeated spaces) and tokens that begin
        # with '(' or '<'.
        if token and token[0] not in ['(', '<']:
            ctms[ctm].append((F, chnl, tstart, dt, token))

In [81]:
# Tabular view of the rows of a single CTM for inspection.
# NOTE: `ctm` is not assigned in this cell — it is whatever key a previously
# executed loop left behind, so the result depends on execution order.
df=pd.DataFrame(ctms[ctm], columns=['file', 'channel', 'start', 'duration', 'prediction'])

In [92]:
# Switch to the project root so the relative 'ship/...' paths used below resolve.
# NOTE(review): this absolute path is machine-specific; the notebook will not run
# elsewhere without editing it.
os.chdir('/home/catskills/Desktop/openasr20')

In [93]:
# Write one tab-separated .ctm file per recording/channel and bundle them
# into the submission tarball.

# Rows are tuples (file, channel, start, duration, token); within one CTM the
# file and channel are constant, so sorting orders the rows by start time.
for ctm in ctms:
    ctms[ctm].sort()

shipping_dir = f'ship/{C.language}/{C.release}'
Path(shipping_dir).mkdir(parents=True, exist_ok=True)

# Write into the same directory that was just created and is archived below,
# so the tarball always contains the files produced by this run.
for ctm in ctms:
    fn = f'{shipping_dir}/{ctm}.ctm'
    with open(fn, 'wt', encoding='utf-8') as f:
        for row in ctms[ctm]:
            line = '\t'.join([str(x) for x in row])
            f.write(f"{line}\n")

# The archive sits two levels above the shipping directory (i.e. in 'ship/').
# Members are stored under their bare file names, so no chdir into the
# shipping directory is needed and the working directory is never left
# changed if an error occurs.
tar_fn = f'ship/catskills_openASR20_dev_{C.language}_{C.release}.tgz'
with tarfile.open(tar_fn, "w:gz") as tar:
    for fn in sorted(glob(f'{shipping_dir}/*.ctm')):
        tar.add(fn, arcname=os.path.basename(fn))
print('wrote', tar_fn)

wrote ../../catskills_openASR20_dev_amharic_113.tgz


## Bad split CTM

In [69]:
# Show the CTM key currently held in `ctm` (left over from a previously executed loop).
ctm

'BABEL_OP3_307_98506_20140807_170934_outLine'

In [67]:
# Scratch check: length in seconds of one segment — presumably (end offset - start offset)
# in samples divided by the 16000 Hz sample rate; TODO confirm which segment this is.
(1112054-1069053)/16000

2.6875625

In [68]:
# Scratch check: presumably the same segment's end time in seconds (end offset / 16000) — confirm.
1112054/16000

69.503375