In [None]:
git clone https://github.com/ericharper/apex.git
cd apex
git checkout nm_v1.14.0
pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_layer_norm" --global-option="--distributed_adam" --global-option="--deprecated_fused_adam" ./

In [None]:
import os
import glob
import subprocess
import tarfile
import wget
import copy
from omegaconf import OmegaConf, open_dict, read_write
import nemo
import nemo.collections.asr as nemo_asr
from nemo.collections.asr.metrics.wer import word_error_rate
from nemo.utils import logging, exp_manager
from tqdm.auto import tqdm
import json
import wandb


def read_manifest(path, train,
                  train_root="/mnt/sdb/jhchang/nemo/Training_RealAudio/",
                  val_root="/mnt/sdb/jhchang/nemo/Validation_RealAudio/"):
    """Load every JSON-lines manifest matching a glob pattern into one list.

    Each non-blank line is normalised as raw text *before* JSON parsing:
    doubled backslashes become "/", "/Others/" becomes "/OTHERS/", and every
    "./" is replaced by the audio root of the selected split.

    Args:
        path: Glob pattern of manifest files; matches are read in sorted order.
        train: ``1`` (or ``True``) selects ``train_root``; anything else
            selects ``val_root``.
        train_root: Replacement for "./" when reading the training split.
        val_root: Replacement for "./" when reading the validation split.

    Returns:
        A list with one parsed JSON object per non-blank manifest line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    audio_root = train_root if train == 1 else val_root
    manifest = []

    for filename in tqdm(sorted(glob.glob(path)), desc="Reading manifest data"):
        # Explicit UTF-8 so non-ASCII transcripts do not depend on the locale.
        with open(filename, encoding="utf-8") as file:
            for line in file:
                line = line.replace("\n", "")
                # Blank lines (e.g. a trailing newline at end of file) carry no
                # entry and would otherwise make json.loads raise.
                if not line.strip():
                    continue
                line = line.replace("\\\\", "/")
                line = line.replace("/Others/", "/OTHERS/")
                # NOTE: applied to the whole raw line, so a "./" inside any
                # field (not only the audio path) is rewritten as well.
                line = line.replace("./", audio_root)
                manifest.append(json.loads(line))

    return manifest

from collections import defaultdict

def get_charset(manifest_data):
    """Count how often each character occurs across all manifest transcripts.

    Args:
        manifest_data: Iterable of manifest entries, each with a 'text' string.

    Returns:
        A ``defaultdict(int)`` mapping character -> number of occurrences.
    """
    counts = defaultdict(int)
    entries = tqdm(manifest_data, desc="Computing character set")
    for entry in entries:
        for ch in entry['text']:
            counts[ch] = counts[ch] + 1
    return counts

In [None]:
import re
import unicodedata

# Character class of punctuation/symbol characters stripped from transcripts.
# Written as a raw string: sequences such as `\,` or `\?` are invalid escapes in a
# normal string literal (SyntaxWarning on Python 3.12+). The set of matched
# characters is unchanged.
chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\�\…\{\}\【\】\・\。\『\』\、\ー\〜]'  # remove special character tokens
#eng_removal_regex = '[^a-zA-Z]'  # remove test set kanji


def remove_special_characters(data):
    """Strip ignored punctuation from a manifest entry's transcript.

    The entry is modified in place: characters matched by
    ``chars_to_ignore_regex`` are removed from ``data["text"]``, and the result
    is lower-cased and stripped of surrounding whitespace.

    Args:
        data: Manifest entry (dict) with a "text" string.

    Returns:
        The same ``data`` object, with "text" updated.
    """
    data["text"] = re.sub(chars_to_ignore_regex, '', data["text"]).lower().strip()
    return data

def apply_preprocessors(manifest, preprocessors):
    """Run each preprocessor over every manifest entry, in the given order.

    Entries are replaced in place inside ``manifest``; one progress bar is
    shown per preprocessor.

    Args:
        manifest: List of manifest entries.
        preprocessors: Iterable of callables taking and returning one entry.

    Returns:
        The same ``manifest`` list, after all preprocessors have been applied.
    """
    for step in preprocessors:
        progress_label = f"Applying {step.__name__}"
        positions = range(len(manifest))
        for position in tqdm(positions, desc=progress_label):
            current = manifest[position]
            manifest[position] = step(current)

    print("Finished processing manifest !")
    return manifest

# Ordered list of per-entry cleaning steps handed to apply_preprocessors.
# `remove_extra_kanji` and `remove_dakuten` were listed here as well but are
# currently disabled.
PREPROCESSORS = [remove_special_characters]

In [None]:

nemo_dir = os.path.join('/mnt/sdb/jhchang/nemo/')
train_manifest_folder = f"{nemo_dir}/Training_RealAudio/Manifests/*.json"
test_manifest_folder = f"{nemo_dir}/Validation_RealAudio/Manifests/*.json"

# Destination files for the merged, cleaned manifests.
train_manifest_cleaned = f"{nemo_dir}/Training_RealAudio/Manifests/training_merged.json"
test_manifest_cleaned = f"{nemo_dir}/Validation_RealAudio/Manifests/test_merged.json"

# The merged files live inside the folders matched by the globs above, so a leftover
# from an earlier run would be read back as input and duplicate its entries.
# Each file is checked and removed on its own: previously the test file was only
# removed when the training file existed, and removing it could raise if it was missing.
stale_removed = False
for merged_path in (train_manifest_cleaned, test_manifest_cleaned):
    if os.path.exists(merged_path):
        os.remove(merged_path)
        stale_removed = True
if not stale_removed:
    print("no existed file")

train_manifest_data = read_manifest(train_manifest_folder, train=1)
test_manifest_data = read_manifest(test_manifest_folder, train=0)

# Apply preprocessing (entries are updated in place, so the *_processed names
# refer to the same lists as the *_manifest_data names).
train_data_processed = apply_preprocessors(train_manifest_data, PREPROCESSORS)
test_data_processed = apply_preprocessors(test_manifest_data, PREPROCESSORS)

# Write each cleaned split back out as one JSON object per line.
for merged_path, rows in ((train_manifest_cleaned, train_data_processed),
                          (test_manifest_cleaned, test_data_processed)):
    with open(merged_path, 'w') as outfile:
        for row in tqdm(rows, desc="Writing manifesting data"):
            json.dump(row, outfile)
            outfile.write('\n')

# Characters that occur in the cleaned training transcripts.
train_charset = get_charset(train_manifest_data)
train_dev_set = set(train_charset.keys())


In [None]:
# Collect the transcript strings of each split.
train_text = [entry['text'] for entry in train_manifest_data]
#dev_text = [data['text'] for data in dev_manifest_data]
test_text = [entry['text'] for entry in test_manifest_data]
# Display the first training transcript as a quick visual check.
train_text[0]

In [None]:
char_model = nemo_asr.models.ASRModel.restore_from(f'{nemo_dir}/Model-ko-epoch 20-05-11-06-July_WER(0.0438).nemo')
# Author's note: when only testing on a different dataset, it worked better NOT to update the vocabulary.
pretrained_vocabulary = list(char_model.decoder.vocabulary)
train_dev_set = set(pretrained_vocabulary) | set(train_dev_set)
# Build the label list deterministically: keep the checkpoint's label order and append
# characters that are new in the training data in sorted order. Converting the set
# directly with list(...) yields a different order on every kernel start because string
# hashing is randomised per process.
# NOTE(review): verify how change_vocabulary treats an unchanged vocabulary in the
# installed NeMo version.
new_vocabulary = pretrained_vocabulary + sorted(train_dev_set - set(pretrained_vocabulary))
char_model.change_vocabulary(new_vocabulary=new_vocabulary)


In [None]:
# Only the last expression of a cell is rendered, so the vocabulary size on its own line
# was silently discarded. Show (vocabulary size, configured learning rate) together.
len(char_model.decoder.vocabulary), char_model.cfg.optim.lr

In [None]:
# Sanity check: transcribe one audio file with the restored model on the GPU.
char_model.to("cuda")
files = ['/mnt/sdb/jhchang/nemo/Validation_RealAudio/audio/DOC/20220812_2+_DOC_074.wav']
#files = ['/mnt/sdb/jhchang/nemo/Training_RealAudio/audio/F/20220920_1_hancell_F_116.wav']
for fname, transcription in zip(files, char_model.transcribe(paths2audio_files=files)):
    # Message typo fixed ("rbecognized" -> "recognized").
    print(f"Audio in {fname} was recognized as: {transcription}")

In [None]:
# Colab form field. The `if freeze_encoder:` branch later in this cell freezes the
# encoder when this is True and unfreezes it otherwise.
freeze_encoder = True #@param ["False", "True"] {type:"raw"}
# Coerce to a plain bool in case the form supplied another truthy/falsy value.
freeze_encoder = bool(freeze_encoder)

import torch
import torch.nn as nn

def enable_bn_se(m):
    """Put BatchNorm1d and SqueezeExcite modules back into trainable mode.

    Intended for use with ``module.apply``: a module is affected when its exact
    type is ``nn.BatchNorm1d`` or its class name contains 'SqueezeExcite'.
    Matching modules are switched to train mode and all of their parameters get
    ``requires_grad=True``; any other module is left untouched.

    Args:
        m: The module to inspect.
    """
    module_type = type(m)
    is_batch_norm = module_type == nn.BatchNorm1d
    is_squeeze_excite = 'SqueezeExcite' in module_type.__name__
    if not (is_batch_norm or is_squeeze_excite):
        return

    m.train()
    for weight in m.parameters():
        weight.requires_grad_(True)
            
# Freeze or unfreeze the encoder according to the `freeze_encoder` toggle.
if not freeze_encoder:
    char_model.encoder.unfreeze()
    logging.info("Model encoder has been un-frozen")
else:
    # Freeze the whole encoder, then re-enable the layers selected by enable_bn_se.
    char_model.encoder.freeze()
    char_model.encoder.apply(enable_bn_se)
    logging.info("Model encoder has been frozen, and batch normalization has been unfrozen")

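훈련할 때는 `labels` 옵션을 지정하고, 테스트할 때는 `labels` 옵션을 None으로 설정해야 함. (When training, set the `labels` option; when testing, set `labels` to None.)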

In [None]:
#char_model.cfg.labels = list(train_dev_set)
# Edit a deep copy so the dataset overrides below are made on `cfg`, not on char_model.cfg.
cfg = copy.deepcopy(char_model.cfg)

with open_dict(cfg):
    # Train dataset  (Concatenate train manifest cleaned and dev manifest cleaned)
    cfg.train_ds.manifest_filepath = f"{train_manifest_cleaned}" #",{dev_manifest_cleaned}"
    #cfg.train_ds.labels = list(train_dev_set)
    cfg.train_ds.labels = None
    cfg.train_ds.normalize_transcripts = False
    cfg.train_ds.trim_silence = False
    cfg.train_ds.batch_size = 32
    cfg.train_ds.num_workers = 8
    cfg.train_ds.pin_memory = True

    # Validation dataset  (Use test dataset as validation, since we train using train + dev)
    cfg.validation_ds.manifest_filepath = test_manifest_cleaned
    #cfg.validation_ds.labels = list(train_dev_set)
    cfg.validation_ds.labels = None
    cfg.validation_ds.normalize_transcripts = False
    cfg.validation_ds.trim_silence = False
    cfg.validation_ds.batch_size = 16
    cfg.validation_ds.num_workers = 8
    cfg.validation_ds.pin_memory = True

# Build the model's data loaders from the edited dataset sections.
char_model.setup_training_data(cfg.train_ds)
char_model.setup_multiple_validation_data(cfg.validation_ds)

In [None]:
# Override optimizer and scheduler hyper-parameters on the model config.
# NOTE(review): lr and sched.min_lr are both 1e-5, so the scheduler has no room to
# decay below the peak learning rate — confirm this is intended.
with open_dict(char_model.cfg.optim):
    char_model.cfg.optim.lr = 0.00001  #0.01 for freezing
    char_model.cfg.optim.weight_decay = 0.001  # Original weight decay
    char_model.cfg.optim.betas = [0.95, 0.5]  # from paper
    # Scheduler: warm up by ratio instead of a fixed step count.
    char_model.cfg.optim.sched.warmup_steps = None  # Remove default number of steps of warmup
    char_model.cfg.optim.sched.warmup_ratio = 0.05  # 5 % warmup
    char_model.cfg.optim.sched.min_lr = 1e-5

In [None]:
# Override the spectrogram-augmentation settings on the model config.
with open_dict(char_model.cfg.spec_augment):
    # Time-axis masking.
    char_model.cfg.spec_augment.time_masks = 2
    char_model.cfg.spec_augment.time_width = 0.05
    # Frequency-axis masking.
    char_model.cfg.spec_augment.freq_masks = 2
    char_model.cfg.spec_augment.freq_width = 25

# Re-create the augmentation module so the new settings are the ones in use.
char_model.spec_augmentation = char_model.from_config_dict(char_model.cfg.spec_augment)

`use_cer` 설정: WER이면 False, CER이면 True. (Set `use_cer` to False for WER, True for CER.)

In [None]:
# Colab form fields copied onto the model's metric object (char_model._wer).
# use_cer: True selects character error rate, False word error rate.
use_cer = True #@param ["False", "True"] {type:"raw"}
# log_prediction: forwarded unchanged to the metric's `log_prediction` attribute.
log_prediction = True #@param ["False", "True"] {type:"raw"}
char_model._wer.use_cer = use_cer
char_model._wer.log_prediction = log_prediction


In [None]:
import torch
import pytorch_lightning as ptl

# Train on the GPU with index 1. `devices=[1]` replaces the deprecated `gpus=[1]`
# argument, which was redundant next to `accelerator='gpu'` and is rejected by newer
# PyTorch Lightning releases.
trainer = ptl.Trainer(devices=[1], accelerator='gpu',
                      #amp_level='O1', precision=16,
                      #devices=2, num_nodes=2, accelerator='gpu', strategy='ddp',
                      max_epochs=100,
                      accumulate_grad_batches=1,
                      # Checkpointing and logging are switched off here; the exp_manager
                      # cell later in the notebook requests its own checkpoint callback
                      # and W&B logger.
                      enable_checkpointing=False,
                      logger=False,
                      log_every_n_steps=100,
                      check_val_every_n_epoch=2)

# Attach the trainer to the model.
char_model.set_trainer(trainer)


In [None]:
# Remove NEMO_EXPM_VERSION from the environment if it is set.
os.environ.pop('NEMO_EXPM_VERSION', None)
LANGUAGE = "ko"

# Experiment-manager settings: checkpoint on the lowest val_wer and log to Weights & Biases.
config = OmegaConf.structured(
    exp_manager.ExpManagerConfig(
        exp_dir=f'experiments/lang-{LANGUAGE}/',
        name=f"ASR-Char-Model-Language-{LANGUAGE}",
        create_checkpoint_callback=True,
        checkpoint_callback_params=exp_manager.CallbackParams(
            monitor="val_wer",
            mode="min",
            always_save_nemo=True,
            save_best_model=True,
        ),
        create_wandb_logger=True,
        wandb_logger_kwargs={"project":'real-audio', "job_type":"training", "log_model":True},
    )
)

# Hook the experiment manager into the trainer; the call returns the log directory.
logdir = exp_manager.exp_manager(trainer,config)

try:
  from google import colab
  COLAB_ENV = True
except (ImportError, ModuleNotFoundError):
  COLAB_ENV = False

# Load the TensorBoard notebook extension
if COLAB_ENV:
  %load_ext tensorboard
  %tensorboard --logdir /content/experiments/lang-$LANGUAGE/ASR-Char-Model-Language-$LANGUAGE/
else:
  print("To use tensorboard, please use this notebook in a Google Colab environment.")

In [None]:
## training
%time

trainer.fit(char_model)
#exp_manager(trainer, cfg.get("exp_manager", None))

In [None]:
# Finish the current Weights & Biases run (the exp_manager config enables a W&B logger).
wandb.finish()

In [None]:
import datetime as dt
# Tag the saved model's file name with the current time (hour-minute-day-month name).
timestamp = dt.datetime.now().strftime("%H-%M-%d-%B")
save_path = "/mnt/sdb/jhchang/nemo/Model-ko-epoch 500-" + timestamp + "-from-RealAudio_loss.nemo"
char_model.save_to(save_path)
print("Model saved at path : " + save_path)

# Evaluation

In [None]:
# Reload a previously saved fine-tuned model for evaluation.
char_model = nemo_asr.models.ASRModel.restore_from(
    '/mnt/sdb/jhchang/nemo/Model-ko-epoch 500-00-59-29-January-from-RealAudio_loss.nemo'
)


In [None]:
#char_model.cfg.labels = list(train_dev_set)
cfg = copy.deepcopy(char_model.cfg)

nemo_dir = os.path.join('/mnt/sdb/jhchang/nemo/')
train_manifest_cleaned = f"{nemo_dir}/Training_RealAudio/Manifests/training_merged.json"
test_manifest_cleaned = f"{nemo_dir}/Validation_RealAudio/Manifests/test_merged.json"

with open_dict(cfg):    

  # Validation dataset  (Use test dataset as validation, since we train using train + dev)
  cfg.validation_ds.manifest_filepath = test_manifest_cleaned
  #cfg.validation_ds.labels = list(train_dev_set)
  cfg.validation_ds.labels = None
  cfg.validation_ds.normalize_transcripts = False
  cfg.validation_ds.batch_size = 8
  cfg.validation_ds.num_workers = 8
  cfg.validation_ds.pin_memory = True
  cfg.validation_ds.trim_silence = False

char_model.setup_multiple_validation_data(cfg.validation_ds)

In [None]:
import torch
torch.cuda.empty_cache()


use_cer = False #@param ["False", "True"] {type:"raw"}
log_prediction = True #@param ["False", "True"] {type:"raw"}
char_model._wer.use_cer = use_cer
char_model._wer.log_prediction = log_prediction

# Setup the test data loader and make sure the model is on GPU
char_model.setup_test_data(test_data_config=cfg.validation_ds)
char_model.cuda()
char_model.eval()

# We will be computing Word Error Rate (WER) metric between our hypothesis and predictions.
# WER is computed as numerator/denominator.
# We'll gather all the test batches' numerators and denominators.
wer_nums = []
wer_denoms = []

# Loop over all test batches.
# Iterating over the model's `test_dataloader` will give us:
# (audio_signal, audio_signal_length, transcript_tokens, transcript_length)
# See the AudioToCharDataset for more details.
# Inference only: run under torch.no_grad() so no autograd graph is built for the
# forward passes (eval() alone does not disable gradient tracking).
with torch.no_grad():
    for test_batch in char_model.test_dataloader():
        test_batch = [x.cuda() for x in test_batch]
        targets = test_batch[2]
        targets_lengths = test_batch[3]
        log_probs, encoded_len, greedy_predictions = char_model(
            input_signal=test_batch[0], input_signal_length=test_batch[1]
        )
        # Notice the model has a helper object to compute WER
        char_model._wer.update(greedy_predictions, targets, targets_lengths)
        _, wer_num, wer_denom = char_model._wer.compute()
        # Reset so the next batch's numerator/denominator are not cumulative.
        char_model._wer.reset()
        wer_nums.append(wer_num.detach().cpu().numpy())
        wer_denoms.append(wer_denom.detach().cpu().numpy())

        # Release tensors from GPU memory
        del test_batch, log_probs, targets, targets_lengths, encoded_len, greedy_predictions

# We need to sum all numerators and denominators first. Then divide.
print(f"WER = {sum(wer_nums)/sum(wer_denoms)}")

In [None]:
import torch
torch.cuda.empty_cache()


use_cer = True #@param ["False", "True"] {type:"raw"}
log_prediction = True #@param ["False", "True"] {type:"raw"}
char_model._wer.use_cer = use_cer
char_model._wer.log_prediction = log_prediction

# Setup the test data loader and make sure the model is on GPU
char_model.setup_test_data(test_data_config=cfg.validation_ds)
char_model.cuda()
char_model.eval()

# With use_cer=True the model's metric helper reports Character Error Rate (CER).
# It is computed as numerator/denominator.
# We'll gather all the test batches' numerators and denominators.
wer_nums = []
wer_denoms = []

# Loop over all test batches.
# Iterating over the model's `test_dataloader` will give us:
# (audio_signal, audio_signal_length, transcript_tokens, transcript_length)
# See the AudioToCharDataset for more details.
# Inference only: run under torch.no_grad() so no autograd graph is built for the
# forward passes (eval() alone does not disable gradient tracking).
with torch.no_grad():
    for test_batch in char_model.test_dataloader():
        test_batch = [x.cuda() for x in test_batch]
        targets = test_batch[2]
        targets_lengths = test_batch[3]
        log_probs, encoded_len, greedy_predictions = char_model(
            input_signal=test_batch[0], input_signal_length=test_batch[1]
        )
        # The model's helper object accumulates the error-rate statistics
        char_model._wer.update(greedy_predictions, targets, targets_lengths)
        _, wer_num, wer_denom = char_model._wer.compute()
        # Reset so the next batch's numerator/denominator are not cumulative.
        char_model._wer.reset()
        wer_nums.append(wer_num.detach().cpu().numpy())
        wer_denoms.append(wer_denom.detach().cpu().numpy())

        # Release tensors from GPU memory
        del test_batch, log_probs, targets, targets_lengths, encoded_len, greedy_predictions

# We need to sum all numerators and denominators first. Then divide.
print(f"CER = {sum(wer_nums)/sum(wer_denoms)}")