In [None]:
# this is nemo's "core" package
import nemo
# this is nemo's ASR collection of speech-recognition related Neural Modules
import nemo_asr

### Path to your dataset

In [None]:
# Paths to the AN4 train / validation manifest files.
train_manifest, val_manifest = (
    "an4_dataset/an4_train.json",
    "an4_dataset/an4_val.json",
)

#### Model description

In [None]:
from ruamel.yaml import YAML

# Parse the Jasper model description with ruamel's "safe" loader.
yaml = YAML(typ="safe")
# Decode explicitly as UTF-8: a bare open() falls back to the platform's
# locale encoding, which can garble non-ASCII entries (e.g. labels) on
# systems whose default encoding is not UTF-8.
with open("../../tests/data/jasper_smaller.yaml", encoding="utf-8") as f:
    jasper_params = yaml.load(f)
# Label vocabulary from the config; later cells pass it to the data layer
# and use its length as the number of output classes.
labels = jasper_params['labels']

### Instantiate necessary Neural Modules

In [None]:
# Start by creating the NeuralModuleFactory that owns training.
# When torch reports no usable CUDA device the factory is placed on the CPU;
# training there is impractically slow even for this small dataset.
import torch
from nemo.core import DeviceType

if torch.cuda.is_available():
    nf = nemo.core.NeuralModuleFactory(placement=DeviceType.GPU)
else:
    nf = nemo.core.NeuralModuleFactory(placement=DeviceType.CPU)

In [None]:
# Data layer built from the training manifest and the label vocabulary,
# producing batches of size 16.
data_layer = nemo_asr.AudioToTextDataLayer(
    manifest_filepath=train_manifest,
    labels=labels,
    batch_size=16,
)

In [None]:
# Audio preprocessing module, created with its default settings (no arguments
# are passed). It is later called with the raw signal and its length.
data_preprocessor = nemo_asr.AudioPreprocessing()

In [None]:
# Encoder configured from the "JasperEncoder" section of the YAML file.
# NOTE(review): feat_in=64 presumably has to match the number of features the
# preprocessor emits per frame — confirm against the preprocessor defaults.
jasper_encoder = nemo_asr.JasperEncoder(
    feat_in=64,
    **jasper_params["JasperEncoder"]
)

In [None]:
# CTC decoder head with one output class per entry in `labels`.
# NOTE(review): feat_in=1024 presumably equals the channel count of the
# encoder's final block in the YAML config — verify if the config changes.
jasper_decoder = nemo_asr.JasperDecoderForCTC(
    feat_in=1024,
    num_classes=len(labels),
)

In [None]:
# CTC loss module; num_classes is the size of the label vocabulary.
ctc_loss = nemo_asr.CTCLossNM(num_classes=len(labels))

In [None]:
# Greedy CTC decoder; later applied to the decoder's log-probabilities to
# obtain `predictions`.
greedy_decoder = nemo_asr.GreedyCTCDecoder()

### Describe how Neural Modules are connected together

In [None]:
# First half of the graph: data layer -> preprocessor -> encoder.
(audio_signal,
 audio_signal_len,
 transcript,
 transcript_len) = data_layer()

processed_signal, processed_signal_len = data_preprocessor(
    input_signal=audio_signal,
    length=audio_signal_len,
)

encoded, encoded_len = jasper_encoder(
    audio_signal=processed_signal,
    length=processed_signal_len,
)

In [None]:
# Second half of the graph: the CTC decoder consumes the encoder output,
# and its log-probabilities feed both the greedy decoder and the CTC loss.
log_probs = jasper_decoder(encoder_output=encoded)
predictions = greedy_decoder(log_probs=log_probs)
loss = ctc_loss(
    log_probs=log_probs,
    targets=transcript,
    input_length=encoded_len,
    target_length=transcript_len,
)
# Extra tensors handed to the training callback alongside the loss.
tensors_to_evaluate = [predictions, transcript, transcript_len]

In [None]:
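# OPTIONAL (disabled): alternative head that replaces the CTC decoder/loss above
# with an RNN decoder, beam search and a sequence loss. Uncomment the whole cell
# to try it; note that it redefines `log_probs`, `loss`, `predictions` and `labels`.
# connector = nemo_asr.JasperRNNConnector(in_channels=1024, out_channels=jasper_params['DecoderRNN']['hidden_size'])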
# rnn_decoder = nemo.backends.pytorch.common.DecoderRNN(voc_size=len(labels), bos_id=1, **jasper_params['DecoderRNN'])
# beam_search = nemo.backends.pytorch.common.BeamSearch(decoder=rnn_decoder, pad_id=0, bos_id=1, eos_id=2, max_len=58, beam_size=4)
# seq_loss = nemo.backends.pytorch.common.SequenceLoss(pad_id=0, smoothing_coef=0.0)

# # second part of DAG
# encoded2=connector(tensor=encoded)
# log_probs, _ = rnn_decoder(targets=transcript,
#                            encoder_outputs=encoded2)
# loss = seq_loss(log_probs=log_probs, targets=transcript)
# predictions, _ = beam_search(encoder_outputs=encoded2)

# # some bookkeeping
# labels = ['pad', 'bos', 'eos'] + labels
# tensors_to_evaluate=None

### Run training

In [None]:
from functools import partial
from nemo_asr.helpers import monitor_asr_train_progress

# Training callback: receives the loss followed by the evaluation tensors and
# reports them through monitor_asr_train_progress, which is pre-bound to the
# label vocabulary.
train_callback = nemo.core.SimpleLossLoggerCallback(
    print_func=partial(monitor_asr_train_progress, labels=labels),
    tensors=[loss] + tensors_to_evaluate,
)

In [None]:
# Launch training: optimize the loss with the "novograd" optimizer for
# 30 epochs, reporting progress through the callback defined earlier.
nf.train(
    tensors_to_optimize=[loss],
    callbacks=[train_callback],
    optimizer="novograd",
    optimization_params={
        "num_epochs": 30,
        "lr": 1e-2,
        "weight_decay": 1e-3,
    },
)