Integrate NVIDIA DALI 1.4 to NeMo ASR (NVIDIA#2567)
* Initial prototype of ASR DALI integration with DALI 1.4

Signed-off-by: smajumdar <titu1994@gmail.com>

* Update dali support to 1.4

Signed-off-by: smajumdar <titu1994@gmail.com>

* Fix docs

Signed-off-by: smajumdar <titu1994@gmail.com>

* Address comments

Signed-off-by: smajumdar <titu1994@gmail.com>

* Apply suggestions from code review

Co-authored-by: Janusz Lisiecki <39967756+JanuszL@users.noreply.github.com>

* Address comments

Signed-off-by: smajumdar <titu1994@gmail.com>

* Correct module utils

Signed-off-by: smajumdar <titu1994@gmail.com>

Co-authored-by: Janusz Lisiecki <39967756+JanuszL@users.noreply.github.com>
Signed-off-by: Jason <jasoli@nvidia.com>
2 people authored and blisc committed Aug 12, 2021
1 parent ec05a7e commit 45a58be
Showing 8 changed files with 672 additions and 120 deletions.
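
For context, the user-facing switch this commit introduces is the use_dali dataset flag, exercised via the Hydra overrides (+model.train_ds.use_dali=True) in the CI commands below. A minimal sketch of setting it programmatically, assuming an OmegaConf-based NeMo config; the config path here is an assumption, not part of this commit:

from omegaconf import OmegaConf

# Hypothetical sketch: enable the DALI data layer added by this commit.
# The config path is illustrative; DALI 1.4 must be installed in the
# container for the flag to take effect.
cfg = OmegaConf.load("examples/asr/conf/config.yaml")
cfg.model.train_ds.use_dali = True       # route training data through DALI
cfg.model.validation_ds.use_dali = True  # and validation data as well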
112 changes: 64 additions & 48 deletions Jenkinsfile
@@ -293,55 +293,71 @@ pipeline {
}
}

stage('L2: ASR DALI dev run') {
when {
anyOf {
branch 'main'
changeRequest target: 'main'
}
}
failFast true
parallel {
stage('Speech to Text - DALI AudioToMelSpectrogramPreprocessor') {
steps {
sh 'python examples/asr/speech_to_text.py \
model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
+model.train_ds.use_dali=True \
model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
+model.validation_ds.use_dali=True \
trainer.gpus=[0] \
+trainer.fast_dev_run=True \
exp_manager.exp_dir=examples/asr/speech_to_text_results'
sh 'rm -rf examples/asr/speech_to_text_results'
}
}
// TODO: This would fail due to an unnecessary torchaudio import.
// To be enabled once torchaudio is available in the container used for CI
// stage('Speech to Text - DALI AudioToMFCCPreprocessor') {
// steps {
// sh 'python examples/asr/speech_to_text.py \
// model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
// +model.train_ds.use_dali=True \
// model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
// +model.validation_ds.use_dali=True \
// model.preprocessor._target_=nemo.collections.asr.modules.AudioToMFCCPreprocessor \
// ~model.preprocessor.normalize \
// ~model.preprocessor.features \
// ~model.preprocessor.frame_splicing \
// ~model.preprocessor.dither \
// ~model.preprocessor.stft_conv \
// +model.n_mels=64 \
// +model.n_mfcc=64 \
// trainer.gpus=[0] \
// +trainer.fast_dev_run=True \
// exp_manager.exp_dir=examples/asr/speech_to_text_results'
// sh 'rm -rf examples/asr/speech_to_text_results'
// }
// }
}
}
// TODO: Enable test after 21.08 container is used.
// stage('L2: ASR DALI dev run') {
// when {
// anyOf {
// branch 'main'
// changeRequest target: 'main'
// }
// }
// failFast true
// parallel {
// stage('Speech to Text - DALI AudioToMelSpectrogramPreprocessor') {
// steps {
// sh 'python examples/asr/speech_to_text.py \
// model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
// +model.train_ds.use_dali=True \
// model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
// +model.validation_ds.use_dali=True \
// trainer.gpus=[0] \
// +trainer.fast_dev_run=True \
// exp_manager.exp_dir=examples/asr/speech_to_text_results'
// sh 'rm -rf examples/asr/speech_to_text_results'
// }
// }
// stage('Speech to Text BPE - DALI AudioToMelSpectrogramPreprocessor') {
// steps {
// sh 'python examples/asr/speech_to_text_bpe.py \
// --config-path="conf/citrinet/" --config-name="config_bpe" \
// model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \
// model.tokenizer.type="wpe" \
// model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
// +model.train_ds.use_dali=True \
// model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
// +model.validation_ds.use_dali=True \
// trainer.gpus=[0] \
// +trainer.fast_dev_run=True \
// exp_manager.exp_dir=examples/asr/speech_to_text_wpe_results'
// sh 'rm -rf examples/asr/speech_to_text_wpe_results'
// }
// }
// // TODO: This would fail due to an unnecessary torchaudio import.
// // To be enabled once torchaudio is available in the container used for CI
// // stage('Speech to Text - DALI AudioToMFCCPreprocessor') {
// // steps {
// // sh 'python examples/asr/speech_to_text.py \
// // model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \
// // +model.train_ds.use_dali=True \
// // model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \
// // +model.validation_ds.use_dali=True \
// // model.preprocessor._target_=nemo.collections.asr.modules.AudioToMFCCPreprocessor \
// // ~model.preprocessor.normalize \
// // ~model.preprocessor.features \
// // ~model.preprocessor.frame_splicing \
// // ~model.preprocessor.dither \
// // ~model.preprocessor.stft_conv \
// // +model.n_mels=64 \
// // +model.n_mfcc=64 \
// // trainer.gpus=[0] \
// // +trainer.fast_dev_run=True \
// // exp_manager.exp_dir=examples/asr/speech_to_text_results'
// // sh 'rm -rf examples/asr/speech_to_text_results'
// // }
// // }
// }
// }

// TODO: UNCOMMENT TESTS AFTER 21.04 release (numba 0.53 min requirement)
stage('L2: ASR RNNT dev run') {
when {
anyOf {
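The manifests referenced above (an4_train.json, an4_val.json) use NeMo's JSON-lines manifest format, documented by the new ASRManifestProcessor in the next file. A minimal sketch of writing such a manifest; the file name and sample values are illustrative only:

import json

# Illustrative entries; the fields follow the ASRManifestProcessor docstring.
samples = [
    {"audio_filepath": "/path/to/audio1.wav", "text": "hello world", "duration": 1.20},
    {"audio_filepath": "/path/to/audio2.wav", "text": "the transcription", "offset": 301.75, "duration": 0.82},
]

with open("train_manifest.json", "w") as f:
    for sample in samples:
        f.write(json.dumps(sample) + "\n")  # one JSON object per line
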
92 changes: 72 additions & 20 deletions nemo/collections/asr/data/audio_to_text.py
@@ -83,6 +83,65 @@ def _speech_collate_fn(batch, pad_id):
return audio_signal, audio_lengths, tokens, tokens_lengths


class ASRManifestProcessor:
"""
Class that processes a manifest json file containing paths to audio files, transcripts, and durations (in seconds).
Each new line is a different sample. Example below:
{"audio_filepath": "/path/to/audio.wav", "text_filepath": "/path/to/audio.txt", "duration": 23.147}
...
{"audio_filepath": "/path/to/audio.wav", "text": "the transcription", "offset": 301.75, "duration": 0.82, "utt":
"utterance_id", "ctm_utt": "en_4156", "side": "A"}
Args:
manifest_filepath: Path to manifest json as described above. Can be comma-separated paths.
parser: Str for a language specific preprocessor or a callable.
max_duration: If audio exceeds this length, do not include in dataset.
min_duration: If audio is less than this length, do not include in dataset.
max_utts: Limit number of utterances.
bos_id: Id of beginning of sequence symbol to append if not None.
eos_id: Id of end of sequence symbol to append if not None.
pad_id: Id of pad symbol. Defaults to 0.
"""

def __init__(
self,
manifest_filepath: str,
parser: Union[str, Callable],
max_duration: Optional[float] = None,
min_duration: Optional[float] = None,
max_utts: int = 0,
bos_id: Optional[int] = None,
eos_id: Optional[int] = None,
pad_id: int = 0,
):
self.parser = parser

self.collection = collections.ASRAudioText(
manifests_files=manifest_filepath.split(','),
parser=parser,
min_duration=min_duration,
max_duration=max_duration,
max_number=max_utts,
)

self.eos_id = eos_id
self.bos_id = bos_id
self.pad_id = pad_id

def process_text(self, index: int) -> Tuple[List[int], int]:
sample = self.collection[index]

t, tl = sample.text_tokens, len(sample.text_tokens)

if self.bos_id is not None:
t = [self.bos_id] + t
tl += 1
if self.eos_id is not None:
t = t + [self.eos_id]
tl += 1

return t, tl


class _AudioTextDataset(Dataset):
"""
Dataset that loads tensors via a json file containing paths to audio files, transcripts, and durations (in seconds).
@@ -134,24 +193,21 @@ def __init__(
eos_id: Optional[int] = None,
pad_id: int = 0,
):
self.parser = parser

self.collection = collections.ASRAudioText(
manifests_files=manifest_filepath.split(','),
self.manifest_processor = ASRManifestProcessor(
manifest_filepath=manifest_filepath,
parser=parser,
min_duration=min_duration,
max_duration=max_duration,
max_number=max_utts,
min_duration=min_duration,
max_utts=max_utts,
bos_id=bos_id,
eos_id=eos_id,
pad_id=pad_id,
)

self.featurizer = WaveformFeaturizer(sample_rate=sample_rate, int_values=int_values, augmentor=augmentor)
self.trim = trim
self.eos_id = eos_id
self.bos_id = bos_id
self.pad_id = pad_id

def __getitem__(self, index):
sample = self.collection[index]
sample = self.manifest_processor.collection[index]
offset = sample.offset

if offset is None:
@@ -162,23 +218,17 @@ def __getitem__(self, index):
)
f, fl = features, torch.tensor(features.shape[0]).long()

t, tl = sample.text_tokens, len(sample.text_tokens)
if self.bos_id is not None:
t = [self.bos_id] + t
tl += 1
if self.eos_id is not None:
t = t + [self.eos_id]
tl += 1
t, tl = self.manifest_processor.process_text(index)

output = f, fl, torch.tensor(t).long(), torch.tensor(tl).long()

return output

def __len__(self):
return len(self.collection)
return len(self.manifest_processor.collection)

def _collate_fn(self, batch):
return _speech_collate_fn(batch, pad_id=self.pad_id)
return _speech_collate_fn(batch, pad_id=self.manifest_processor.pad_id)


class AudioToCharDataset(_AudioTextDataset):
@@ -1249,6 +1299,8 @@ class TarredAudioToBPEDataset(_TarredAudioToTextDataset):
trim (bool): Whether to use trim silence from beginning and end
of audio signal using librosa.effects.trim().
Defaults to False.
use_start_end_token: Boolean which dictates whether to add [BOS] and [EOS]
tokens to the beginning and end of the transcript, respectively.
pad_id (id): Token used to pad when collating samples in batches.
If this is None, pads using 0s.
Defaults to None.
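
The BOS/EOS bookkeeping that ASRManifestProcessor.process_text centralizes (previously inlined in _AudioTextDataset.__getitem__) is small enough to sketch standalone. A self-contained rendering of that logic, assuming integer token ids; this is an illustration, not NeMo's public API:

from typing import List, Optional, Tuple

def process_text(tokens: List[int], bos_id: Optional[int], eos_id: Optional[int]) -> Tuple[List[int], int]:
    """Prepend bos_id and/or append eos_id when configured, tracking the new length."""
    t, tl = list(tokens), len(tokens)
    if bos_id is not None:
        t = [bos_id] + t
        tl += 1
    if eos_id is not None:
        t = t + [eos_id]
        tl += 1
    return t, tl

# With bos_id=1 and eos_id=2: process_text([5, 9], 1, 2) == ([1, 5, 9, 2], 4)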
