## Maximum clip size

## Train step 1: Bootstrap from pretrained model

In [2]:
# `display` and `HTML` live in the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Inject a CSS rule so the notebook container spans the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

%load_ext autoreload
%autoreload 2
%matplotlib notebook

import pickle, os, warnings, sys, random, logging, librosa, json, nemo
warnings.filterwarnings("ignore")
logger = logging.getLogger()
logger.setLevel(logging.INFO)

from Cfg import Cfg
from reshuffle_samples import reshuffle_samples
import nemo.collections.asr as nemo_asr
from ruamel.yaml import YAML

from omegaconf import DictConfig
import os, datetime
from load_pretrained_amharic_model import load_pretrained_amharic_model

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Bootstrap: the loader returns three values, unpacked here as the config `C`,
# the `model`, and `params`. Later cells read `C.build_dir` and call
# `model.transcribe(...)` / `trainer.fit(model)`.
C, model, params = load_pretrained_amharic_model()

## Train step 2: K-fold validation, more or less

In [None]:
import pytorch_lightning as pl
import os, datetime
from Cfg import Cfg
from load_pretrained_amharic_model import load_pretrained_amharic_model
# Rebinds `C`, `model` and `params` from a fresh call to the loader.
# NOTE(review): this repeats the load done in the "Train step 1" section --
# confirm the second load is intentional (e.g. to start this phase on its own).
C, model, params = load_pretrained_amharic_model()

In [None]:
model_save_dir = 'save/nemo_amharic'


class ModelCheckpointAtEpochEnd(pl.callbacks.ModelCheckpoint):
    """ModelCheckpoint subclass that hooks into ``on_epoch_end``."""

    def on_epoch_end(self, trainer, pl_module):
        """Record the current epoch in the trainer's callback metrics, then
        invoke the trainer's checkpoint callback's ``on_validation_end`` hook."""
        trainer.callback_metrics['epoch'] = trainer.current_epoch
        trainer.checkpoint_callback.on_validation_end(trainer, pl_module)


# Checkpoint file names embed the start timestamp and the process id; the
# trailing '{epoch:02d}' is left as a literal placeholder for the callback.
pid = os.getpid()
dt = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
checkpoint_path_template = f'{model_save_dir}/amharic_{dt}_{pid}' + '_{epoch:02d}'

checkpoint_callback = ModelCheckpointAtEpochEnd(
    filepath=checkpoint_path_template,
    verbose=True,
    save_top_k=-1,
    save_weights_only=False,
    period=1,
)

trainer = pl.Trainer(
    gpus=[0],
    max_epochs=200,
    amp_level='O1',
    precision=16,
    checkpoint_callback=checkpoint_callback,
)

In [None]:
from reshuffle_samples import reshuffle_samples
# Called with the config `C` before training starts.
# NOTE(review): presumably this re-draws the train/test manifests under
# `C.build_dir` -- confirm against the `reshuffle_samples` module.
reshuffle_samples(C)

In [None]:
# Train `model` with the trainer configured above (up to 200 epochs on GPU 0,
# using the epoch-end checkpoint callback).
trainer.fit(model)

## Local test on BUILD -- In Sample

In [None]:
from json_lines_load import json_lines_load

# Pool the train and test manifests found under the build directory.
T = json_lines_load(f'{C.build_dir}/train_manifest.json')
V = json_lines_load(f'{C.build_dir}/test_manifest.json')
samples = T + V

# A bare expression in the middle of a cell is never displayed, so print the
# sample count explicitly.
print(len(samples))

# Sort (audio path, transcript) pairs so the preview subset is deterministic.
S = sorted((x['audio_filepath'], x['text']) for x in samples)

# Number of leading samples to transcribe for the side-by-side preview.
N_PREVIEW = 12
audio_files = [path for path, _text in S[:N_PREVIEW]]
transcripts = [text for _path, text in S[:N_PREVIEW]]

pred = model.transcribe(paths2audio_files=audio_files, batch_size=1)

import pandas as pd
# Reference transcript next to the model output, one row per audio file.
pd.DataFrame(zip(transcripts, pred), columns=['gold', 'pred'])

## With save/restore

In [None]:
# Restore a model from a saved checkpoint, move it to GPU 0, and transcribe the
# same preview files so its output can be compared with the gold transcripts.
checkpoint_path = 'save/nemo_amharic/amharic_20201015_005720_353924_epoch=167.ckpt'
model3 = nemo_asr.models.EncDecCTCModel.load_from_checkpoint(checkpoint_path)
model3.cuda(0)
pred3 = model3.transcribe(paths2audio_files=audio_files, batch_size=1)
pd.DataFrame(zip(transcripts, pred3), columns=['gold', 'pred'])

## Local test on BUILD -- Out of Sample

## Do a silence split on DEV

In [None]:
from Cfg import Cfg
from RecordingCorpus import RecordingCorpus
from multiprocessing import Pool
from SplitCorpus import SplitCorpus
import soundfile as sf
from tqdm.auto import tqdm

# Switch the config to the 'dev' phase.
C = Cfg('NIST', 16000, 'amharic', 'dev')

# Build the recording corpus using a pool of 16 worker processes.
if __name__ == '__main__':
    with Pool(16) as pool:
        recordings = RecordingCorpus(C, pool)

# NOTE(review): the meaning of the trailing 30 is defined by
# SplitCorpus.split_on_silence -- confirm its unit there.
ssplits = SplitCorpus.split_on_silence(C, recordings, 30)

# Write every split to its own wav file, named <root>_<start>_<end>.wav, and
# remember the path both on the sample and in F.
F = []
for sample in tqdm(ssplits.artifacts):
    _, root, (start, end) = sample.key
    audio = sample.source.value
    audio_path = f'{C.audio_split_dir}/{root}_{start}_{end}.wav'
    sf.write(audio_path, audio, C.sample_rate)
    sample.source.filename = audio_path
    F.append(audio_path)

## Transcribe DEV

In [None]:
class Cfg:
    """Settings and derived directory layout for one stage/language/phase/release.

    Instantiating the class creates ``build_dir`` and ``audio_split_dir`` on
    disk if they do not exist yet.

    NOTE(review): this definition shadows the ``Cfg`` imported from the ``Cfg``
    module earlier in the notebook -- confirm which of the two is intended.

    Parameters
    ----------
    _stage : root directory of the data (first path component of ``data_dir``).
    _sample_rate : stored as ``sample_rate``.
    _language : language name used in ``data_dir`` and ``shipping_dir``.
    _phase : sub-directory of ``data_dir`` used as ``build_dir``.
    _release : release tag used as the last component of ``shipping_dir``.

    Raises
    ------
    OSError
        If one of the directories cannot be created.
    """

    def __init__(self, _stage, _sample_rate, _language, _phase='build', _release='001'):
        self.stage = _stage
        self.sample_rate = _sample_rate
        self.language = _language
        self.phase = _phase
        self.release = _release
        self.data_dir = f'{self.stage}/openasr20_{self.language}'
        self.build_dir = f'{self.data_dir}/{self.phase}'
        self.audio_split_dir = f'{self.build_dir}/audio_split'
        # os.makedirs instead of shelling out to `mkdir -p`: portable, safe for
        # paths containing spaces or shell metacharacters, and failures raise
        # instead of being silently ignored.
        os.makedirs(self.build_dir, exist_ok=True)
        os.makedirs(self.audio_split_dir, exist_ok=True)
        self.shipping_dir = f'ship/{self.language}/{self.release}'


In [None]:
# Rebuild the config with the class defined above: stage 'NIST', 16000 sample
# rate, language 'amharic', phase 'dev', release '101' (used in shipping_dir).
C = Cfg('NIST', 16000, 'amharic', 'dev', '101') 

In [None]:
# `glob` is otherwise only imported in the packaging section further down, so
# on a fresh kernel this cell would raise NameError; import it where it is used.
from glob import glob

# All split wav files of the current phase, in sorted (deterministic) order.
files = sorted(glob(f'{C.audio_split_dir}/*.wav'))
print(len(files))

In [None]:
# Peek at the first three split-audio paths.
files[0:3]

In [None]:
# Run the model over every split file with batch size 8. The result is later
# zipped with `files`, so it is expected to hold one prediction per file in
# the same order.
translations=model.transcribe(paths2audio_files=files, batch_size=8)

In [None]:
# Show the first split-audio path.
files[0]

In [None]:
import pandas as pd
# Display the predictions as a table (one row per split file).
pd.DataFrame(translations)

In [None]:
from IPython.display import Audio

In [None]:
# Scratch calculation: difference of two offsets divided by 16000.
# NOTE(review): presumably the duration in seconds of one split clip (end and
# start sample offsets at a 16 kHz sample rate) -- confirm which clip.
(1353479-1217750)/16000

In [None]:
# Render an audio widget for the second split file.
Audio(files[1])

## Package for NIST

In [None]:
import sys, os, tarfile
from glob import glob
from pathlib import Path
from datetime import datetime
import numpy as np
import pandas as pd

In [None]:
# Inspect the attributes currently set on the config object.
vars(C)

In [None]:
# One (initially empty) row list per CTM file. The CTM name is the first seven
# '_'-separated fields of the split file's base name.
# NOTE(review): the path is cut at the first ',' before taking the base name;
# this looks like it may have been meant to be '.', confirm.
ctms = {}
for fn in files:
    name_fields = os.path.basename(fn.split(',')[0]).split('_')
    ctms['_'.join(name_fields[0:7])] = []

In [None]:
def _ctm_rows(fn, pred, sample_rate):
    """Convert one split-audio path and its prediction into CTM rows.

    The base name (minus its 4-character extension) is split on '_': fields
    0-6 name the CTM file, fields 0-5 form the file id written in every row,
    field 6 is the channel name ('inLine' maps to '1', anything else to '2'),
    and the last two fields are the clip's start and end offsets, which are
    divided by ``sample_rate`` to obtain times.

    Returns ``(ctm_name, rows)`` where each row is
    ``(file_id, channel, token_start, token_duration, token)``. Empty tokens
    and tokens starting with '(' or '<' are skipped.
    """
    key = os.path.basename(fn)[0:-4].split('_')
    ctm_name = '_'.join(key[0:7])
    file_id = '_'.join(key[0:6])
    chnl = '1' if key[6] == 'inLine' else '2'
    clip_start, clip_end = float(key[-2]), float(key[-1])
    tbeg = clip_start / sample_rate
    tdur = (clip_end - clip_start) / sample_rate
    # NOTE(review): the last character of the prediction is dropped before
    # tokenizing, as in the original code -- confirm the model output really
    # ends with a character that must be discarded.
    tokens = pred[0:-1].split(' ')
    # The clip duration is spread evenly over the tokens; str.split always
    # yields at least one element, so the division is safe.
    token_dur = tdur / len(tokens)
    tgrid = tbeg + token_dur * np.arange(len(tokens))
    rows = [(file_id, chnl, token_start, token_dur, token)
            for token, token_start in zip(tokens, tgrid)
            if token and token[0] not in ['(', '<']]
    return ctm_name, rows


# Start from empty row lists so that re-running this cell does not append the
# same rows a second time.
ctms = {name: [] for name in ctms}
for fn, hyp in zip(files, translations):
    ctm_name, rows = _ctm_rows(fn, hyp, C.sample_rate)
    ctms[ctm_name].extend(rows)

# Use the config's shipping directory consistently for mkdir, writing and
# archiving.
shipping_dir = C.shipping_dir
Path(shipping_dir).mkdir(parents=True, exist_ok=True)

for ctm, rows in ctms.items():
    rows.sort()
    fn = f'{C.shipping_dir}/{ctm}.ctm'
    with open(fn, 'wt', encoding='utf-8') as f:
        for row in rows:
            line = '\t'.join([str(x) for x in row])
            f.write(f"{line}\n")

# Build the archive two levels above the shipping directory without changing
# the working directory: the previous chdir(shipping_dir) / chdir('../..') pair
# left the kernel in the wrong directory (shipping_dir is three levels deep)
# and did not restore it at all if an exception occurred.
tar_fn = os.path.normpath(os.path.join(
    shipping_dir, '..', '..',
    f'catskills_openASR20_dev_{C.language}_{C.release}.tgz'))
with tarfile.open(tar_fn, "w:gz") as tar:
    for ctm_path in sorted(glob(os.path.join(shipping_dir, '*.ctm'))):
        # arcname keeps archive members flat ('<name>.ctm'), as before.
        tar.add(ctm_path, arcname=os.path.basename(ctm_path))
print('wrote', tar_fn)