In [None]:
# inference/asr_infer.py
import os
import json
import wget
import tempfile
from omegaconf import OmegaConf
from nemo.collections.asr.parts.utils.decoder_timestamps_utils import ASRDecoderTimeStamps
from nemo.collections.asr.parts.utils.diarization_utils import OfflineDiarWithASR

def load_config(config_dir, domain_type="meeting"):
	"""
	Load (or download if not present) the diarization inference configuration file.
	The configuration file is expected to reside in `config_dir`. If not present, it is downloaded.
	"""
	config_filename = f"diar_infer_{domain_type}.yaml"
	config_path = os.path.join(config_dir, config_filename)
	config_url = (
		f"https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/speaker_tasks/diarization/conf/inference/{config_filename}"
	)
	
	if not os.path.exists(config_path):
		print(f"Downloading config from {config_url}...")
		wget.download(config_url, out=config_dir)
	
	cfg = OmegaConf.load(config_path)
	return cfg

def create_manifest(audio_file_path, out_dir):
	"""
	Create a manifest file as required by the diarization pipeline.
	
	This function writes the manifest file to the provided `out_dir` (which, in our updated
	inference code, will be a temporary directory). The manifest contains the necessary fields:
	  - audio_filepath
	  - offset
	  - duration
	  - label (set to 'infer')
	  - text (placeholder)
	  - num_speakers (None)
	  - rttm_filepath (None)
	  - uem_filepath (None)
	"""
	manifest_data = {
		'audio_filepath': audio_file_path,
		'offset': 0,
		'duration': None,
		'label': 'infer',
		'text': '-',
		'num_speakers': None,
		'rttm_filepath': None,
		'uem_filepath': None
	}
	manifest_path = os.path.join(out_dir, 'input_manifest.json')
	with open(manifest_path, 'w') as fp:
		json.dump(manifest_data, fp)
		fp.write('\n')
	return manifest_path

def run_inference(audio_file_path: str, config_dir: str):
	"""
	Run ASR and Speaker Diarization inference on the provided audio file.
	
	This function uses a temporary directory for all intermediate files (including the manifest and
	the prediction outputs), ensuring that no persistent files are left behind across inference
	invocations.
	
	Parameters:
	  audio_file_path (str): The path to the input audio file.
	  config_dir (str): The directory where configuration files (e.g., diar_infer_meeting.yaml) reside.
	
	Returns:
	  dict: A dictionary containing:
			- "transcript": The final transcript text with speaker labels.
			- "rttm": The RTTM content describing speaker segments.
			- "transcript_info": Additional detailed transcript information from the pipeline.
	"""
	# Use a temporary directory for all intermediate and output files.
	with tempfile.TemporaryDirectory() as temp_out_dir:
		# Create a subdirectory to mimic the "pred_rttms" structure used by the pipeline.
		pred_rttms_dir = os.path.join(temp_out_dir, 'pred_rttms')
		os.makedirs(pred_rttms_dir, exist_ok=True)
		
		# Create the manifest file in the temporary output directory.
		manifest_path = create_manifest(audio_file_path, temp_out_dir)
		
		# Load the diarization configuration from the provided config directory.
		cfg = load_config(config_dir, domain_type="meeting")
		
		# Update the configuration with our temporary manifest and output directory.
		cfg.diarizer.manifest_filepath = manifest_path
		cfg.diarizer.out_dir = temp_out_dir
		
		# Set additional configuration parameters.
		cfg.diarizer.speaker_embeddings.model_path = 'titanet_large'
		cfg.diarizer.clustering.parameters.oracle_num_speakers = False
		cfg.diarizer.vad.model_path = 'vad_multilingual_marblenet'
		cfg.diarizer.asr.model_path = 'stt_en_conformer_ctc_large'
		cfg.diarizer.oracle_vad = False
		cfg.diarizer.asr.parameters.asr_based_vad = False

		# Run ASR to obtain word hypotheses and word-level timestamps.
		asr_decoder = ASRDecoderTimeStamps(cfg.diarizer)

		# # asr_model = asr_decoder.set_asr_model()
		# import nemo.collections.asr as nemo_asr
		# asr_model = nemo_asr.models.ASRModel.from_pretrained("stt_en_conformer_ctc_large")

		asr_model = asr_decoder.set_asr_model()
		word_hyp, word_ts_hyp = asr_decoder.run_ASR(asr_model)
		print("OUT:", word_hyp, word_ts_hyp)
		
		# Run diarization using the ASR output timestamps.
		asr_diar = OfflineDiarWithASR(cfg.diarizer)
		asr_diar.word_ts_anchor_offset = asr_decoder.word_ts_anchor_offset
		diar_hyp, _ = asr_diar.run_diarization(cfg, word_ts_hyp)
		
		# Merge diarization and ASR results to get the final transcript with speaker labels.
		transcript_info = asr_diar.get_transcript_with_speaker_labels(diar_hyp, word_hyp, word_ts_hyp)
		
		# Determine the base name (without the .wav extension) for the output files.
		base_name = os.path.basename(audio_file_path).replace('.wav', '')
		transcript_file = os.path.join(pred_rttms_dir, f"{base_name}.txt")
		rttm_file = os.path.join(pred_rttms_dir, f"{base_name}.rttm")
		
		# Read the transcript (if it was written by the pipeline).
		transcript = ""
		if os.path.exists(transcript_file):
			with open(transcript_file, "r") as f:
				transcript = f.read()
		
		# Read the RTTM file (if it was generated).
		rttm = ""
		if os.path.exists(rttm_file):
			with open(rttm_file, "r") as f:
				rttm = f.read()
		
		# Construct and return the result dictionary.
		result = {
			"transcript": transcript,
			"rttm": rttm,
			"transcript_info": transcript_info
		}
		
		# The temporary directory (and all its files) is automatically cleaned up here.
		return result


  from .autonotebook import tqdm as notebook_tqdm
W0201 13:52:25.677588 139686079022208 zarr.py:57] `zarr` distributed checkpoint backend is deprecated. Please switch to PyTorch Distributed format (`torch_dist`).


In [2]:
DATA_DIR = "../data"

run_inference('../an4_diarize_test.wav', config_dir = DATA_DIR)

[NeMo I 2025-02-01 13:52:26 speaker_utils:93] Number of files to diarize: 1
[NeMo I 2025-02-01 13:52:26 cloud:58] Found existing object /root/.cache/torch/NeMo/NeMo_2.0.0rc1/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo.
[NeMo I 2025-02-01 13:52:26 cloud:64] Re-using file from: /root/.cache/torch/NeMo/NeMo_2.0.0rc1/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo
[NeMo I 2025-02-01 13:52:26 common:826] Instantiating model from pre-trained checkpoint
[NeMo I 2025-02-01 13:52:27 mixins:173] Tokenizer SentencePieceTokenizer initialized with 128 tokens


[NeMo W 2025-02-01 13:52:27 modelPT:176] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath:
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket1/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket2/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket3/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket4/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket5/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket6/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket7/tarred_audio_manifest.json
    - - /data2/nemo_asr/nemo_asr_set_3.0/bucket8/tarred_audio_manifest.json
    sample_rate: 16000
    batch_size: 1
    shuffle: true
    num_workers: 4
    pin_memory: true
    use_start_end_token: false
    trim_

[NeMo I 2025-02-01 13:52:27 features:305] PADDING: 0
[NeMo I 2025-02-01 13:52:28 save_restore_connector:272] Model EncDecCTCModelBPE was successfully restored from /root/.cache/torch/NeMo/NeMo_2.0.0rc1/stt_en_conformer_ctc_large/afb212c5bcf904e326b5e5751e7c7465/stt_en_conformer_ctc_large.nemo.


[NeMo W 2025-02-01 13:52:28 decoder_timestamps_utils:71] `ctc_decode` was set to True. Note that this is ignored.


[NeMo I 2025-02-01 13:52:28 features:305] PADDING: 0
[NeMo I 2025-02-01 13:52:28 features:305] PADDING: 0
[NeMo I 2025-02-01 13:52:28 decoder_timestamps_utils:664] Running ASR model stt_en_conformer_ctc_large
[NeMo I 2025-02-01 13:52:28 decoder_timestamps_utils:668] [1/1] FrameBatchASR: ../an4_diarize_test.wav
{'an4_diarize_test': ['eleven', 'twenty', 'seven', 'fifty', 'seven', 'october', 'twenty', 'four', 'nineteen', 'seventy']} {'an4_diarize_test': [[0.36, 0.72], [0.92, 1.28], [1.4, 1.64], [1.96, 2.28], [2.36, 2.6], [3.08, 3.52], [3.6, 3.84], [3.88, 4.04], [4.4, 4.72], [4.84, 5.16]]}
[NeMo I 2025-02-01 13:52:30 speaker_utils:93] Number of files to diarize: 1
[NeMo I 2025-02-01 13:52:30 clustering_diarizer:127] Loading pretrained vad_multilingual_marblenet model from NGC
[NeMo I 2025-02-01 13:52:30 cloud:58] Found existing object /root/.cache/torch/NeMo/NeMo_2.0.0rc1/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo.
[NeMo I 2025-02-01 13:52:3

[NeMo W 2025-02-01 13:52:30 modelPT:176] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /manifests/ami_train_0.63.json,/manifests/freesound_background_train.json,/manifests/freesound_laughter_train.json,/manifests/fisher_2004_background.json,/manifests/fisher_2004_speech_sampled.json,/manifests/google_train_manifest.json,/manifests/icsi_all_0.63.json,/manifests/musan_freesound_train.json,/manifests/musan_music_train.json,/manifests/musan_soundbible_train.json,/manifests/mandarin_train_sample.json,/manifests/german_train_sample.json,/manifests/spanish_train_sample.json,/manifests/french_train_sample.json,/manifests/russian_train_sample.json
    sample_rate: 16000
    labels:
    - background
    - speech
    batch_size: 256
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: sca

[NeMo I 2025-02-01 13:52:30 features:305] PADDING: 16
[NeMo I 2025-02-01 13:52:30 save_restore_connector:272] Model EncDecClassificationModel was successfully restored from /root/.cache/torch/NeMo/NeMo_2.0.0rc1/vad_multilingual_marblenet/670f425c7f186060b7a7268ba6dfacb2/vad_multilingual_marblenet.nemo.
[NeMo I 2025-02-01 13:52:30 clustering_diarizer:160] Loading pretrained titanet_large model from NGC
[NeMo I 2025-02-01 13:52:30 cloud:58] Found existing object /root/.cache/torch/NeMo/NeMo_2.0.0rc1/titanet-l/11ba0924fdf87c049e339adbf6899d48/titanet-l.nemo.
[NeMo I 2025-02-01 13:52:30 cloud:64] Re-using file from: /root/.cache/torch/NeMo/NeMo_2.0.0rc1/titanet-l/11ba0924fdf87c049e339adbf6899d48/titanet-l.nemo
[NeMo I 2025-02-01 13:52:30 common:826] Instantiating model from pre-trained checkpoint


[NeMo W 2025-02-01 13:52:30 modelPT:176] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /manifests/combined_fisher_swbd_voxceleb12_librispeech/train.json
    sample_rate: 16000
    labels: null
    batch_size: 64
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: scatter
    augmentor:
      noise:
        manifest_path: /manifests/noise/rir_noise_manifest.json
        prob: 0.5
        min_snr_db: 0
        max_snr_db: 15
      speed:
        prob: 0.5
        sr: 16000
        resample_type: kaiser_fast
        min_speed_rate: 0.95
        max_speed_rate: 1.05
    num_workers: 15
    pin_memory: true
    
[NeMo W 2025-02-01 13:52:30 modelPT:183] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method 

[NeMo I 2025-02-01 13:52:30 features:305] PADDING: 16
[NeMo I 2025-02-01 13:52:30 save_restore_connector:272] Model EncDecSpeakerLabelModel was successfully restored from /root/.cache/torch/NeMo/NeMo_2.0.0rc1/titanet-l/11ba0924fdf87c049e339adbf6899d48/titanet-l.nemo.
[NeMo I 2025-02-01 13:52:30 speaker_utils:93] Number of files to diarize: 1
[NeMo I 2025-02-01 13:52:30 clustering_diarizer:313] Split long audio file to avoid CUDA memory issue


splitting manifest: 100%|██████████| 1/1 [00:00<00:00,  1.18it/s]

[NeMo I 2025-02-01 13:52:31 classification_models:293] Perform streaming frame-level VAD
[NeMo I 2025-02-01 13:52:31 collections:740] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2025-02-01 13:52:31 collections:741] Dataset successfully loaded with 1 items and total duration provided from manifest is  0.00 hours.
[NeMo I 2025-02-01 13:52:31 collections:746] # 1 files loaded accounting to # 1 labels



vad: 100%|██████████| 1/1 [00:00<00:00,  6.30it/s]

[NeMo I 2025-02-01 13:52:31 clustering_diarizer:266] Converting frame level prediction to speech/no-speech segment in start and end times format.



creating speech segments: 100%|██████████| 1/1 [00:00<00:00, 23.43it/s]

[NeMo I 2025-02-01 13:52:31 clustering_diarizer:291] Subsegmentation for embedding extraction: scale0, /tmp/tmpgop083t7/speaker_outputs/subsegments_scale0.json
[NeMo I 2025-02-01 13:52:31 clustering_diarizer:347] Extracting embeddings for Diarization
[NeMo I 2025-02-01 13:52:31 collections:740] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2025-02-01 13:52:31 collections:741] Dataset successfully loaded with 3 items and total duration provided from manifest is  0.00 hours.
[NeMo I 2025-02-01 13:52:31 collections:746] # 3 files loaded accounting to # 1 labels



[1/6] extract embeddings: 100%|██████████| 1/1 [00:00<00:00,  3.85it/s]

[NeMo I 2025-02-01 13:52:32 clustering_diarizer:393] Saved embedding files to /tmp/tmpgop083t7/speaker_outputs/embeddings
[NeMo I 2025-02-01 13:52:32 clustering_diarizer:291] Subsegmentation for embedding extraction: scale1, /tmp/tmpgop083t7/speaker_outputs/subsegments_scale1.json
[NeMo I 2025-02-01 13:52:32 clustering_diarizer:347] Extracting embeddings for Diarization
[NeMo I 2025-02-01 13:52:32 collections:740] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2025-02-01 13:52:32 collections:741] Dataset successfully loaded with 4 items and total duration provided from manifest is  0.00 hours.
[NeMo I 2025-02-01 13:52:32 collections:746] # 4 files loaded accounting to # 1 labels



[2/6] extract embeddings: 100%|██████████| 1/1 [00:00<00:00,  7.12it/s]

[NeMo I 2025-02-01 13:52:32 clustering_diarizer:393] Saved embedding files to /tmp/tmpgop083t7/speaker_outputs/embeddings
[NeMo I 2025-02-01 13:52:32 clustering_diarizer:291] Subsegmentation for embedding extraction: scale2, /tmp/tmpgop083t7/speaker_outputs/subsegments_scale2.json
[NeMo I 2025-02-01 13:52:32 clustering_diarizer:347] Extracting embeddings for Diarization
[NeMo I 2025-02-01 13:52:32 collections:740] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2025-02-01 13:52:32 collections:741] Dataset successfully loaded with 5 items and total duration provided from manifest is  0.00 hours.
[NeMo I 2025-02-01 13:52:32 collections:746] # 5 files loaded accounting to # 1 labels



[3/6] extract embeddings: 100%|██████████| 1/1 [00:00<00:00,  6.94it/s]

[NeMo I 2025-02-01 13:52:32 clustering_diarizer:393] Saved embedding files to /tmp/tmpgop083t7/speaker_outputs/embeddings
[NeMo I 2025-02-01 13:52:32 clustering_diarizer:291] Subsegmentation for embedding extraction: scale3, /tmp/tmpgop083t7/speaker_outputs/subsegments_scale3.json
[NeMo I 2025-02-01 13:52:32 clustering_diarizer:347] Extracting embeddings for Diarization
[NeMo I 2025-02-01 13:52:32 collections:740] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2025-02-01 13:52:32 collections:741] Dataset successfully loaded with 6 items and total duration provided from manifest is  0.00 hours.
[NeMo I 2025-02-01 13:52:32 collections:746] # 6 files loaded accounting to # 1 labels



[4/6] extract embeddings: 100%|██████████| 1/1 [00:00<00:00,  7.71it/s]

[NeMo I 2025-02-01 13:52:32 clustering_diarizer:393] Saved embedding files to /tmp/tmpgop083t7/speaker_outputs/embeddings
[NeMo I 2025-02-01 13:52:32 clustering_diarizer:291] Subsegmentation for embedding extraction: scale4, /tmp/tmpgop083t7/speaker_outputs/subsegments_scale4.json
[NeMo I 2025-02-01 13:52:32 clustering_diarizer:347] Extracting embeddings for Diarization
[NeMo I 2025-02-01 13:52:32 collections:740] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2025-02-01 13:52:32 collections:741] Dataset successfully loaded with 10 items and total duration provided from manifest is  0.00 hours.
[NeMo I 2025-02-01 13:52:32 collections:746] # 10 files loaded accounting to # 1 labels



[5/6] extract embeddings: 100%|██████████| 1/1 [00:00<00:00,  7.16it/s]

[NeMo I 2025-02-01 13:52:32 clustering_diarizer:393] Saved embedding files to /tmp/tmpgop083t7/speaker_outputs/embeddings
[NeMo I 2025-02-01 13:52:32 clustering_diarizer:291] Subsegmentation for embedding extraction: scale5, /tmp/tmpgop083t7/speaker_outputs/subsegments_scale5.json
[NeMo I 2025-02-01 13:52:32 clustering_diarizer:347] Extracting embeddings for Diarization
[NeMo I 2025-02-01 13:52:32 collections:740] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2025-02-01 13:52:32 collections:741] Dataset successfully loaded with 20 items and total duration provided from manifest is  0.00 hours.
[NeMo I 2025-02-01 13:52:32 collections:746] # 20 files loaded accounting to # 1 labels



[6/6] extract embeddings: 100%|██████████| 1/1 [00:00<00:00,  6.34it/s]

[NeMo I 2025-02-01 13:52:32 clustering_diarizer:393] Saved embedding files to /tmp/tmpgop083t7/speaker_outputs/embeddings



clustering: 100%|██████████| 1/1 [00:00<00:00,  3.62it/s]

[NeMo I 2025-02-01 13:52:33 clustering_diarizer:461] Outputs are saved in /tmp/tmpgop083t7 directory



[NeMo W 2025-02-01 13:52:33 der:185] Check if each ground truth RTTMs were present in the provided manifest file. Skipping calculation of Diariazation Error Rate


[NeMo I 2025-02-01 13:52:33 diarization_utils:876] Creating results for Session: an4_diarize_test n_spk: 2 
[NeMo I 2025-02-01 13:52:33 diarization_utils:749] Diarization with ASR output files are saved in: /tmp/tmpgop083t7/pred_rttms


{'transcript': '[00:00.07 - 00:02.60] speaker_0: eleven twenty seven fifty seven\n[00:03.08 - 00:05.16] speaker_1: october twenty four nineteen seventy\n',
 'rttm': 'SPEAKER an4_diarize_test 1   0.070   2.625 <NA> <NA> speaker_0 <NA> <NA>\nSPEAKER an4_diarize_test 1   2.695   2.505 <NA> <NA> speaker_1 <NA> <NA>\n',
 'transcript_info': {'an4_diarize_test': OrderedDict([('status', 'success'),
               ('session_id', 'an4_diarize_test'),
               ('transcription',
                'eleven twenty seven fifty seven october twenty four nineteen seventy'),
               ('speaker_count', 2),
               ('words',
                [{'word': 'eleven',
                  'start_time': 0.36,
                  'end_time': 0.72,
                  'speaker': 'speaker_0'},
                 {'word': 'twenty',
                  'start_time': 0.92,
                  'end_time': 1.28,
                  'speaker': 'speaker_0'},
                 {'word': 'seven',
                  'start_time'

In [4]:
import nemo.collections.asr as nemo_asr
asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(model_name="nvidia/parakeet-ctc-1.1b")
asr_model.eval()

[NeMo I 2025-02-01 16:03:54 mixins:173] Tokenizer SentencePieceTokenizer initialized with 1024 tokens


[NeMo W 2025-02-01 16:03:54 modelPT:176] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /disk1/NVIDIA/datasets/LibriSpeech_NeMo/librivox-train-all.json
    sample_rate: 16000
    batch_size: 16
    shuffle: true
    num_workers: 8
    pin_memory: true
    use_start_end_token: false
    trim_silence: false
    max_duration: 16.7
    min_duration: 0.1
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 2048
    bucketing_strategy: fully_randomized
    bucketing_batch_size: null
    
[NeMo W 2025-02-01 16:03:54 modelPT:183] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: /disk1/NVIDIA/datasets/LibriSpee

[NeMo I 2025-02-01 16:03:54 features:305] PADDING: 0
[NeMo I 2025-02-01 16:04:04 save_restore_connector:272] Model EncDecCTCModelBPE was successfully restored from /root/.cache/huggingface/hub/models--nvidia--parakeet-ctc-1.1b/snapshots/085a3de63c7598065b072cd8f2182e6a5fa593eb/parakeet-ctc-1.1b.nemo.


EncDecCTCModelBPE(
  (preprocessor): AudioToMelSpectrogramPreprocessor(
    (featurizer): FilterbankFeatures()
  )
  (encoder): ConformerEncoder(
    (pre_encode): ConvSubsampling(
      (out): Linear(in_features=2560, out_features=1024, bias=True)
      (conv): Sequential(
        (0): Conv2d(1, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        (1): ReLU(inplace=True)
        (2): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=256)
        (3): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
        (4): ReLU(inplace=True)
        (5): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=256)
        (6): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
        (7): ReLU(inplace=True)
      )
    )
    (pos_enc): RelPositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (layers): ModuleList(
      (0-41): 42 x ConformerLayer(
        (norm_feed_forward1): LayerNorm((1024,), eps=1e-05, elementw

In [5]:
asr_model.transcribe('../mono_output.wav')

Transcribing:   0%|          | 0/1 [00:00<?, ?it/s][NeMo W 2025-02-01 16:04:08 ctc_greedy_decoding:168] CTC decoding strategy 'greedy' is slower than 'greedy_batch', which implements the same exact interface. Consider changing your strategy to 'greedy_batch' for a free performance improvement.
Transcribing: 100%|██████████| 1/1 [00:02<00:00,  2.25s/it]


["he frequently askks are you okay during sex show us show us like what does that look like are you okay i like when guys are confident to be able to compliment other people hey check out those balloons those calciumons don't put this in the trailer whoever add it this i'll tell you it com in the trailer"]

In [1]:
from nemo.collections.asr.models import EncDecMultiTaskModel

  from .autonotebook import tqdm as notebook_tqdm
W0208 12:48:11.759294 139677262001280 zarr.py:57] `zarr` distributed checkpoint backend is deprecated. Please switch to PyTorch Distributed format (`torch_dist`).


In [2]:
# load model
canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')

[NeMo I 2025-02-08 12:48:20 mixins:197] _setup_tokenizer: detected an aggregate tokenizer
[NeMo I 2025-02-08 12:48:20 mixins:336] Tokenizer SentencePieceTokenizer initialized with 32 tokens
[NeMo I 2025-02-08 12:48:20 mixins:336] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2025-02-08 12:48:20 mixins:336] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2025-02-08 12:48:20 mixins:336] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2025-02-08 12:48:20 mixins:336] Tokenizer SentencePieceTokenizer initialized with 1024 tokens
[NeMo I 2025-02-08 12:48:20 aggregate_tokenizer:73] Aggregate vocab size: 4128


[NeMo W 2025-02-08 12:48:20 modelPT:176] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    tarred_audio_filepaths: null
    manifest_filepath: null
    sample_rate: 16000
    shuffle: true
    batch_size: null
    num_workers: 8
    use_lhotse: true
    max_duration: 40
    pin_memory: true
    use_bucketing: false
    bucket_duration_bins: null
    num_buckets: 1
    text_field: answer
    lang_field: target_lang
    batch_duration: 360
    quadratic_duration: 15
    bucket_buffer_size: 20000
    shuffle_buffer_size: 10000
    
[NeMo W 2025-02-08 12:48:20 modelPT:183] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    sample_rate: 

[NeMo I 2025-02-08 12:48:20 features:305] PADDING: 0
[NeMo I 2025-02-08 12:48:30 save_restore_connector:272] Model EncDecMultiTaskModel was successfully restored from /root/.cache/huggingface/hub/models--nvidia--canary-1b/snapshots/dd32c0c709e2bfc79f583e16b9df4b3a160f7e86/canary-1b.nemo.


In [3]:
# update dcode params
decode_cfg = canary_model.cfg.decoding
decode_cfg.beam.beam_size = 1
canary_model.change_decoding_strategy(decode_cfg)
canary_model.eval()

[NeMo I 2025-02-08 12:48:33 aed_multitask_models:260] Changed decoding strategy to 
    strategy: beam
    compute_hypothesis_token_set: false
    preserve_alignments: null
    compute_langs: false
    beam:
      beam_size: 1
      search_type: default
      len_pen: 1.0
      max_generation_delta: 20
      return_best_hypothesis: true
      preserve_alignments: false
    temperature: 1.0
    


EncDecMultiTaskModel(
  (preprocessor): AudioToMelSpectrogramPreprocessor(
    (featurizer): FilterbankFeatures()
  )
  (encoder): ConformerEncoder(
    (pre_encode): ConvSubsampling(
      (out): Linear(in_features=4096, out_features=1024, bias=True)
      (conv): Sequential(
        (0): Conv2d(1, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        (1): ReLU(inplace=True)
        (2): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=256)
        (3): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
        (4): ReLU(inplace=True)
        (5): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=256)
        (6): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
        (7): ReLU(inplace=True)
      )
    )
    (pos_enc): RelPositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (layers): ModuleList(
      (0-23): 24 x ConformerLayer(
        (norm_feed_forward1): LayerNorm((1024,), eps=1e-05, eleme

In [3]:
canary_model.transcribe('../mono_output.wav')

Transcribing: 1it [00:05,  5.54s/it]


["He frequently asks, Are you okay during sex? Show us, show us. What does that look like? Are you okay? I like when guys are confident to be able to compliment other people. Hey, check out those balloons. Those calcium balloons. Don't put this in the trailer. Whoever added this, I'll tell you. You confirm in the trailer."]

In [2]:
canary_model.transcribe('../mono_output.wav')

Transcribing: 1it [00:02,  2.99s/it]


["He frequently asks, Are you okay during sex? Show us, show us. What does that look like? Are you okay? I like when guys are confident to be able to compliment other people. Hey, check out those balloons. Those calcium balloons. Don't put this in the trailer. Whoever added this, I'll tell you. You confirm in the trailer."]

In [9]:
import nemo.collections.asr as nemo_asr
for i in nemo_asr.models.ASRModel.list_available_models():
	print(i.pretrained_model_name)

QuartzNet15x5Base-En
asr_talknet_aligner
commandrecognition_en_matchboxnet3x1x64_v1
commandrecognition_en_matchboxnet3x1x64_v2
commandrecognition_en_matchboxnet3x1x64_v2_subset_task
commandrecognition_en_matchboxnet3x2x64_v1
commandrecognition_en_matchboxnet3x2x64_v2
commandrecognition_en_matchboxnet3x2x64_v2_subset_task
stt_be_conformer_ctc_large
stt_be_conformer_transducer_large
stt_by_fastconformer_hybrid_large_pc
stt_ca_conformer_ctc_large
stt_ca_conformer_transducer_large
stt_ca_quartznet15x5
stt_de_citrinet_1024
stt_de_conformer_ctc_large
stt_de_conformer_transducer_large
stt_de_contextnet_1024
stt_de_fastconformer_hybrid_large_pc
stt_de_quartznet15x5
stt_en_citrinet_1024
stt_en_citrinet_1024_gamma_0_25
stt_en_citrinet_256
stt_en_citrinet_256_gamma_0_25
stt_en_citrinet_512
stt_en_citrinet_512_gamma_0_25
stt_en_conformer_ctc_large
stt_en_conformer_ctc_large_ls
stt_en_conformer_ctc_medium
stt_en_conformer_ctc_medium_ls
stt_en_conformer_ctc_small
stt_en_conformer_ctc_small_ls
stt_en

In [3]:
print(ClusteringDiarizer.list_available_models())

None
