### NeMo Primer ###
https://github.com/NVIDIA/NeMo/blob/main/tutorials/00_NeMo_Primer.ipynb

In [1]:
import os
import copy
from pathlib import Path
from pprint import pprint
 
# PyTorch
import torch

# NeMo framework
import nemo
from omegaconf import OmegaConf

# Appearance of the notebook: inject a CSS rule that forces the `.container`
# element to use the full width of the browser window
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Load the autoreload extension and switch it to mode 2 before importing the
# local `nvmodels` package (presumably so that edits to the package are picked
# up without restarting the kernel -- confirm against the IPython autoreload docs)
%load_ext autoreload
%autoreload 2
import nvmodels

# Report the versions of the packages this notebook runs against
print(f'Package version: {nvmodels.__version__}')
print(f'PyTorch version: {torch.__version__}')
print(f'NeMo version:    {nemo.__version__}')

Package version: 0.0.1b0.post1.dev5+g11170d6
PyTorch version: 2.3.0a0+ebedce2
NeMo version:    2.0.0rc0


In [2]:
# Directory for storing data.
# DATA_ROOT has to be provided through the environment. Check it explicitly so
# that a missing variable fails here with a clear message instead of the opaque
# TypeError that os.path.join(None, ...) would raise.
data_root = os.environ.get('DATA_ROOT')
if data_root is None:
    raise RuntimeError(
        "Environment variable DATA_ROOT is not set; "
        "point it at the directory where NeMo data and models should be stored."
    )
print(data_root)
data_dir = os.path.join(data_root, 'nemodata')
model_dir = os.path.join(data_dir, 'nemomodel')
# parents=True also creates data_dir, since it is an ancestor of model_dir;
# exist_ok=True keeps the cell safe to re-run
Path(model_dir).mkdir(parents=True, exist_ok=True)

/app/data


### NeMo Models in Collections ###

In [5]:
# Automatic Speech Recognition: collect every attribute name exposed by the
# ASR models package (classes, submodules and dunder attributes alike)
import nemo.collections.asr as nemo_asr
asr_models = list(dir(nemo_asr.models))
display(asr_models)

['ASRModel',
 'AudioToAudioModel',
 'ClassificationInferConfig',
 'ClusteringDiarizer',
 'EncDecCTCModel',
 'EncDecCTCModelBPE',
 'EncDecClassificationModel',
 'EncDecDiarLabelModel',
 'EncDecFrameClassificationModel',
 'EncDecHybridRNNTCTCBPEModel',
 'EncDecHybridRNNTCTCModel',
 'EncDecK2RnntSeqModel',
 'EncDecK2RnntSeqModelBPE',
 'EncDecK2SeqModel',
 'EncDecK2SeqModelBPE',
 'EncDecMultiTaskModel',
 'EncDecRNNTBPEModel',
 'EncDecRNNTModel',
 'EncDecSpeakerLabelModel',
 'EncDecTransfModelBPE',
 'EncMaskDecAudioToAudioModel',
 'NeuralDiarizer',
 'PredictiveAudioToAudioModel',
 'SLUIntentSlotBPEModel',
 'ScoreBasedGenerativeAudioToAudioModel',
 'SpeechEncDecSelfSupervisedModel',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 'aed_multitask_models',
 'asr_model',
 'audio_to_audio_model',
 'classification_models',
 'clustering_diarizer',
 'configs',
 'ctc_bpe_models',
 'ctc_models',
 'enhancement_models',
 'hy

In [6]:
# NLP models: collect every attribute name exposed by the NLP models package
import nemo.collections.nlp as nemo_nlp
nlp_models = list(dir(nemo_nlp.models))
display(nlp_models)

['BERTLMModel',
 'BertDPRModel',
 'BertJointIRModel',
 'DuplexDecoderModel',
 'DuplexTaggerModel',
 'DuplexTextNormalizationModel',
 'EntityLinkingModel',
 'GLUEModel',
 'IntentSlotClassificationModel',
 'MTEncDecModel',
 'MegatronGPTPromptLearningModel',
 'MultiLabelIntentSlotClassificationModel',
 'PunctuationCapitalizationLexicalAudioModel',
 'PunctuationCapitalizationModel',
 'QAModel',
 'SpellcheckingAsrCustomizationModel',
 'Text2SparqlModel',
 'TextClassificationModel',
 'ThutmoseTaggerModel',
 'TokenClassificationModel',
 'TransformerLMModel',
 'ZeroShotIntentModel',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 'duplex_text_normalization',
 'enc_dec_nlp_model',
 'entity_linking',
 'glue_benchmark',
 'information_retrieval',
 'intent_slot_classification',
 'language_modeling',
 'machine_translation',
 'nlp_model',
 'question_answering',
 'spellchecking_asr_customization',
 'text2sparql',
 'text_cl

In [5]:
# Text-to-speech (TTS) models: keep only the attribute names ending in "Model"
import nemo.collections.tts as nemo_tts
tts_models = [
    attr_name
    for attr_name in dir(nemo_tts.models)
    if attr_name.endswith("Model")
]
display(tts_models)

['AlignerModel',
 'AudioCodecModel',
 'FastPitchModel',
 'GriffinLimModel',
 'HifiGanModel',
 'MelPsuedoInverseModel',
 'MixerTTSModel',
 'RadTTSModel',
 'SpectrogramEnhancerModel',
 'Tacotron2Model',
 'TwoStagesModel',
 'UnivNetModel',
 'VitsModel',
 'WaveGlowModel']

### The NeMo Model ###
Let's dive deeper into what a NeMo model really is. There are many ways we can create these models: we can use the constructor and pass in a config, we can instantiate the model from a pre-trained checkpoint, or we can simply pass a pre-trained model name and instantiate the model directly from the cloud!

In [6]:
# Automatic speech recognition example: fetch the pre-trained Citrinet
# checkpoint by name and instantiate the BPE-based CTC model from it
citrinet = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(
    model_name='stt_en_citrinet_512',
)

[NeMo I 2024-08-05 20:17:33 cloud:68] Downloading from: https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_citrinet_512/versions/1.0.0rc1/files/stt_en_citrinet_512.nemo to /app/.cache/torch/NeMo/NeMo_2.0.0rc0/stt_en_citrinet_512/3262321355385bb7cf5a583146117d77/stt_en_citrinet_512.nemo
[NeMo I 2024-08-05 20:17:35 common:815] Instantiating model from pre-trained checkpoint
[NeMo I 2024-08-05 20:17:37 mixins:172] Tokenizer SentencePieceTokenizer initialized with 1024 tokens


[NeMo W 2024-08-05 20:17:37 modelPT:176] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: null
    sample_rate: 16000
    batch_size: 32
    trim_silence: true
    max_duration: 16.7
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    
[NeMo W 2024-08-05 20:17:37 modelPT:183] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    sample_rate: 16000
    batch_size: 32
    shuffle: false
    
[NeMo W 2024-08-05 20:17:37 modelPT:189] Please call the ModelPT.setup_test_data() or ModelPT.setup_multiple_test_data() method and provide a valid configuration file to setup the test data loader(s).
    T

[NeMo I 2024-08-05 20:17:37 features:305] PADDING: 16
[NeMo I 2024-08-05 20:17:38 save_restore_connector:263] Model EncDecCTCModelBPE was successfully restored from /app/.cache/torch/NeMo/NeMo_2.0.0rc0/stt_en_citrinet_512/3262321355385bb7cf5a583146117d77/stt_en_citrinet_512.nemo.


In [7]:
# Render the per-module summary table of the model
model_summary = citrinet.summarize()
display(model_summary)

  | Name              | Type                              | Params | Mode 
--------------------------------------------------------------------------------
0 | preprocessor      | AudioToMelSpectrogramPreprocessor | 0      | train
1 | encoder           | ConvASREncoder                    | 36.3 M | train
2 | decoder           | ConvASRDecoder                    | 657 K  | train
3 | loss              | CTCLoss                           | 0      | train
4 | spec_augmentation | SpectrogramAugmentation           | 0      | train
5 | wer               | WER                               | 0      | train
--------------------------------------------------------------------------------
37.0 M    Trainable params
0         Non-trainable params
37.0 M    Total params
147.977   Total estimated model params size (MB)

### Model Configuration using OmegaConf ###

In [143]:
# Work on a deep copy so the configuration held by the model is left untouched
cfg = copy.deepcopy(citrinet.cfg)
print(cfg.keys())
# Persist the configuration as YAML. OmegaConf.save performs the same
# OmegaConf.to_yaml serialisation as a manual write, but always writes the
# file as UTF-8 instead of relying on the platform default encoding.
cfg_file = os.path.join(data_dir, 'citrinet.cfg')
OmegaConf.save(config=cfg, f=cfg_file)

dict_keys(['sample_rate', 'train_ds', 'validation_ds', 'test_ds', 'model_defaults', 'tokenizer', 'preprocessor', 'spec_augment', 'encoder', 'decoder', 'optim', 'target', 'nemo_version', 'decoding'])


In [75]:
# Start from a fresh deep copy of the model configuration
cfg = copy.deepcopy(citrinet.cfg)

# OmegaConf will not allow adding new config items, so we temporarily disable this safeguard
OmegaConf.set_struct(cfg, value=False)

# Show the optimizer section as it is before any modification
print(OmegaConf.to_yaml(cfg.optim))

name: novograd
lr: 0.05
betas:
- 0.8
- 0.25
weight_decay: 0.001
sched:
  name: CosineAnnealing
  warmup_steps: 1000
  warmup_ratio: null
  min_lr: 1.0e-09
  last_epoch: -1



In [144]:
# Build the replacement scheduler settings directly as a DictConfig
sched = OmegaConf.create({'name': 'CosineAnnealing', 'warmup_steps': 1006, 'min_lr': 1e-6})
# Swap it in under the cfg.optim.sched namespace
cfg.optim.sched = sched
# Show the resulting optimizer section
print("New Config: ")
print(OmegaConf.to_yaml(cfg.optim))
# Re-enable the safeguard so no more additions can be made to the config
OmegaConf.set_struct(cfg, value=True)

New Config: 
name: novograd
lr: 0.05
betas:
- 0.8
- 0.25
weight_decay: 0.001
sched:
  name: CosineAnnealing
  warmup_steps: 1006
  min_lr: 1.0e-06



### Changing the model configuration ###

In [121]:
# Make a copy of the configuration of the model
cfg = copy.deepcopy(citrinet.cfg)
print(cfg.keys())

# Take an independent copy of the preprocessor section so that edits to it do
# not touch `cfg` until they are explicitly assigned back
new_preprocessor_cfg = copy.deepcopy(cfg.get('preprocessor'))
print()
# Any preprocessor setting could be changed here before the module is rebuilt
pprint(new_preprocessor_cfg)

# Instantiate a new preprocessor module from the (possibly edited) configuration
print()
new_preprocessor = citrinet.from_config_dict(new_preprocessor_cfg)
# Replace the model's preprocessor with the freshly built one
citrinet.preprocessor = new_preprocessor

display(citrinet.summarize())

dict_keys(['sample_rate', 'train_ds', 'validation_ds', 'test_ds', 'model_defaults', 'tokenizer', 'preprocessor', 'spec_augment', 'encoder', 'decoder', 'optim', 'target', 'nemo_version', 'decoding'])

{'_target_': 'nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor', 'sample_rate': 16000, 'normalize': 'per_feature', 'window_size': 0.025, 'window_stride': 0.01, 'window': 'hann', 'features': 80, 'n_fft': 512, 'frame_splicing': 1, 'dither': 1e-05, 'pad_to': 16, 'stft_conv': False}

[NeMo I 2024-08-05 21:53:39 features:305] PADDING: 16


  | Name              | Type                              | Params | Mode 
--------------------------------------------------------------------------------
0 | preprocessor      | AudioToMelSpectrogramPreprocessor | 0      | train
1 | encoder           | ConvASREncoder                    | 36.3 M | train
2 | decoder           | ConvASRDecoder                    | 657 K  | train
3 | loss              | CTCLoss                           | 0      | train
4 | spec_augmentation | SpectrogramAugmentation           | 0      | train
5 | wer               | WER                               | 0      | train
--------------------------------------------------------------------------------
37.0 M    Trainable params
0         Non-trainable params
37.0 M    Total params
147.977   Total estimated model params size (MB)

### Saving the updated model configuration ###
Why do we want to do this? NeMo has many ways of saving and restoring its models, which we will discuss a bit later. All of them depend on having an updated config that defines the model in its entirety, so if we modify anything, we should also update the corresponding part of the config to safely save and restore models.

In [123]:
# Write the preprocessor section back into the copy of the configuration
cfg.preprocessor = new_preprocessor_cfg
# Assign the updated copy to the model so that its stored config carries
# the preprocessor section set above
citrinet.cfg = cfg
# Now, we need to save the citrinet model ...


### Saving and restoring from .nemo files ###
There are a few models which might require external dependencies to be packaged with them in order to restore them properly.

One such example is an ASR model with an external BPE tokenizer. It is preferred if the model includes all of the components required to restore it, but a binary file for a tokenizer cannot be serialized into a PyTorch Lightning checkpoint.

In such cases, we can use the `save_to` and `restore_from` methods to package the entire model and its components (here, the tokenizer file(s)) into a tarfile. This can then be easily imported by the user and used to restore the model.

In [125]:
# Show the optimizer section currently stored in the model configuration
current_optim = citrinet.cfg.optim
print(OmegaConf.to_yaml(current_optim))

name: novograd
lr: 0.05
betas:
- 0.8
- 0.25
weight_decay: 0.001
sched:
  name: CosineAnnealing
  warmup_steps: 1000
  warmup_ratio: null
  min_lr: 1.0e-09
  last_epoch: -1



In [145]:
# Set up the optimizer (and the scheduler, when one can be built) from cfg.optim.
# The call is the last expression of the cell, so its return value is displayed.
citrinet.setup_optimization(cfg.optim)

[NeMo W 2024-08-05 22:45:09 modelPT:652] Trainer wasn't specified in model constructor. Make sure that you really wanted it.


[NeMo I 2024-08-05 22:45:09 modelPT:770] Optimizer config = Novograd (
    Parameter Group 0
        amsgrad: False
        betas: [0.8, 0.25]
        eps: 1e-08
        grad_averaging: False
        lr: 0.05
        weight_decay: 0.001
    )


[NeMo W 2024-08-05 22:45:09 lr_scheduler:903] Neither `max_steps` nor `iters_per_batch` were provided to `optim.sched`, cannot compute effective `max_steps` !
    Scheduler will not be instantiated !


(Novograd (
 Parameter Group 0
     amsgrad: False
     betas: [0.8, 0.25]
     eps: 1e-08
     grad_averaging: False
     lr: 0.05
     weight_decay: 0.001
 ),
 None)

In [153]:
display(type(cfg))
# Struct mode rejects keys that are not already part of the config, so the
# safeguard is switched OFF while the scheduler section is replaced (enabling
# it here would do the opposite of what the next comment announces)
OmegaConf.set_struct(cfg, False)
# Now we can add a new key, value pair
# Let's also change the scheduler; `max_steps` is the newly added key
sched = {'name': 'CosineAnnealing', 'warmup_steps': 1006, 'min_lr': 1e-6, 'max_steps': 100}
sched = OmegaConf.create(sched)  # Convert it into a DictConfig
# Assign it to cfg.optim.sched namespace
cfg.optim.sched = sched
# Restore the safeguard so no more additions can be made to the config
OmegaConf.set_struct(cfg, True)
# Now let's update the config and try again
citrinet.setup_optimization(cfg.optim)

omegaconf.dictconfig.DictConfig

[NeMo W 2024-08-05 22:53:50 modelPT:652] Trainer wasn't specified in model constructor. Make sure that you really wanted it.


[NeMo I 2024-08-05 22:53:50 modelPT:770] Optimizer config = Novograd (
    Parameter Group 0
        amsgrad: False
        betas: [0.8, 0.25]
        eps: 1e-08
        grad_averaging: False
        lr: 0.05
        weight_decay: 0.001
    )
[NeMo I 2024-08-05 22:53:50 lr_scheduler:923] Scheduler "<nemo.core.optim.lr_scheduler.CosineAnnealing object at 0x7e7c14158df0>" 
    will be used during training (effective maximum steps = 100) - 
    Parameters : 
    (warmup_steps: 1006
    min_lr: 1.0e-06
    max_steps: 100
    )


(Novograd (
 Parameter Group 0
     amsgrad: False
     betas: [0.8, 0.25]
     eps: 1e-08
     grad_averaging: False
     initial_lr: 0.05
     lr: 4.965243296921549e-05
     weight_decay: 0.001
 ),
 {'scheduler': <nemo.core.optim.lr_scheduler.CosineAnnealing at 0x7e7c14158df0>,
  'interval': 'step',
  'frequency': 1,
  'monitor': 'loss',
  'reduce_on_plateau': False})

In [155]:
# Inspect the scheduler section now stored in the model configuration
cfg = copy.deepcopy(citrinet.cfg)
optim_section = cfg.get('optim')
display(optim_section.get('sched'))
# Assign the optimizer section of the copy back to the model configuration
citrinet.cfg.optim = cfg.optim

{'name': 'CosineAnnealing', 'warmup_steps': 1006, 'min_lr': 1e-06, 'max_steps': 100}

In [156]:
# Save the model to a .nemo file inside the model directory
model_name = 'citrinet_512.nemo'
model_file = os.path.join(model_dir, model_name)
print(model_file)
citrinet.save_to(save_path=model_file)

/app/data/nemodata/nemomodel/citrinet_512.nemo


In [7]:
# Restore the model from the .nemo file written by the save step
model_name = 'citrinet_512.nemo'
model_file = os.path.join(model_dir, model_name)

new_cn = nemo_asr.models.EncDecCTCModelBPE.restore_from(restore_path=model_file)

[NeMo I 2024-08-05 22:59:11 mixins:172] Tokenizer SentencePieceTokenizer initialized with 1024 tokens


[NeMo W 2024-08-05 22:59:11 modelPT:176] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: null
    sample_rate: 16000
    batch_size: 32
    trim_silence: true
    max_duration: 16.7
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    
[NeMo W 2024-08-05 22:59:11 modelPT:183] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    sample_rate: 16000
    batch_size: 32
    shuffle: false
    
[NeMo W 2024-08-05 22:59:11 modelPT:189] Please call the ModelPT.setup_test_data() or ModelPT.setup_multiple_test_data() method and provide a valid configuration file to setup the test data loader(s).
    T

[NeMo I 2024-08-05 22:59:11 features:305] PADDING: 16
[NeMo I 2024-08-05 22:59:13 save_restore_connector:263] Model EncDecCTCModelBPE was successfully restored from /app/data/nemodata/nemomodel/citrinet_512.nemo.


In [8]:
# Check the scheduler section of the restored model's configuration
new_cfg = copy.deepcopy(new_cn.cfg)
restored_optim = new_cfg.get('optim')
display(restored_optim.get('sched'))

{'name': 'CosineAnnealing', 'warmup_steps': 1006, 'min_lr': 1e-06, 'max_steps': 100}