In [5]:
# Kaggle input directory that is scanned for the WAV recordings listed in the manifest
dataset_path = "/kaggle/input/diarization-dataset/wavs"

In [6]:
!git clone https://github.com/NVIDIA/NeMo.git

fatal: destination path 'NeMo' already exists and is not an empty directory.


In [None]:
!pip install nemo_toolkit['asr']

Collecting nemo_toolkit[asr]
  Using cached nemo_toolkit-1.23.0-py3-none-any.whl.metadata (18 kB)
Collecting triton (from nemo_toolkit[asr])
  Using cached triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.3 kB)
Collecting wget (from nemo_toolkit[asr])
  Using cached wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... done
Collecting braceexpand (from nemo_toolkit[asr])
  Using cached braceexpand-0.1.7-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting editdistance (from nemo_toolkit[asr])
  Using cached editdistance-0.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Collecting g2p-en (from nemo_toolkit[asr])
  Using cached g2p_en-2.1.0-py3-none-any.whl.metadata (4.5 kB)
Collecting jiwer (from nemo_toolkit[asr])
  Using cached jiwer-3.0.4-py3-none-any.whl.metadata (2.6 kB)
Collecting kaldi-python-io (from nemo_toolkit[asr])
  Using cached kaldi-python-io-1.2.2.tar.gz (8.8 kB)
  Preparing metadata (setup.py) .

In [None]:
!rm -r /kaggle/working/wavs/audio_sample_63.wav

In [None]:
import os
import json

# Directory containing the WAV files (the dataset path set in the first cell)
wavs_directory = dataset_path

# Output manifest file path; the diarizer config's manifest_filepath points at this same file
manifest_path = '/kaggle/working/manifest.json'

def collect_wav_files(directory):
    """Recursively collect the WAV files found under ``directory``.

    Args:
        directory: Root folder to search; sub-folders are walked as well.

    Returns:
        list[str]: One path per file whose name ends in ``.wav`` (any letter
        case), each built by joining the walked folder with the file name.
        The list is sorted so it does not depend on filesystem listing order.
        It is empty when nothing matches or the directory does not exist.
    """
    wav_files = [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(directory)
        for name in names
        if name.lower().endswith('.wav')
    ]
    # os.walk yields entries in arbitrary order; sort for a reproducible manifest.
    wav_files.sort()
    return wav_files

def create_manifest(wav_files, manifest_file):
    """Write a JSON-lines manifest with one entry per audio file.

    Args:
        wav_files: Iterable of audio file paths to list in the manifest.
        manifest_file: Destination path; an existing file is overwritten.

    Each output line is a JSON object with the keys ``audio_filepath``,
    ``offset``, ``duration``, ``label``, ``text``, ``num_speakers``,
    ``rttm_filepath`` and ``uem_filepath``. Only ``audio_filepath`` varies
    between entries. The RTTM/UEM fields are written as null: the previous
    values were the template placeholders ``/path/to/...``, which do not
    refer to real annotation files.
    """
    with open(manifest_file, 'w', encoding='utf-8') as f:
        for wav_file in wav_files:
            entry = {
                "audio_filepath": wav_file,
                "offset": 0,
                "duration": None,
                "label": "infer",
                "text": "-",
                "num_speakers": None,
                # No per-file annotation paths are known here, so emit null
                # rather than a path that points at nothing.
                "rttm_filepath": None,
                "uem_filepath": None,
            }
            # One JSON object per line (JSON-lines layout).
            f.write(json.dumps(entry) + '\n')

# Collect all WAV files in the directory
wav_files = collect_wav_files(wavs_directory)

# Fail early with a clear message instead of writing an empty manifest that
# would give the diarizer nothing to process.
if not wav_files:
    raise FileNotFoundError(f"No .wav files found under: {wavs_directory}")

# Create the manifest file
create_manifest(wav_files, manifest_path)

print(f"Manifest file created at: {manifest_path}")


In [None]:
# Diarization + ASR settings kept as YAML text; a later cell writes this string
# unchanged to /kaggle/working/asr_diar.yaml, which the generated script loads.
config_data = """
# offline_diarization_with_asr.yaml
name: &name "ClusterDiarizer"

num_workers: 1
sample_rate: 16000
batch_size: 16
device: null # can specify a specific device, i.e: cuda:1 (default cuda if cuda available, else cpu)
verbose: True # enable additional logging

diarizer:
  manifest_filepath: "/kaggle/working/manifest.json" # Update this with your manifest file path
  out_dir: "/kaggle/working/outdir" # Update this with your desired output directory
  oracle_vad: False # If True, uses RTTM files provided in the manifest file to get speech activity (VAD) timestamps
  collar: 0.25 # Collar value for scoring
  ignore_overlap: True # Consider or ignore overlap segments while scoring

  vad:
    model_path: vad_multilingual_marblenet # .nemo local model path or pretrained VAD model name 
    external_vad_manifest: null # This option is provided to use external vad and provide its speech activity labels for speaker embeddings extraction. Only one of model_path or external_vad_manifest should be set

    parameters: # Tuned by detection error rate (false alarm + miss) on multilingual ASR evaluation datasets
      window_length_in_sec: 0.63  # Window length in sec for VAD context input 
      shift_length_in_sec: 0.08 # Shift length in sec for generate frame level VAD prediction
      smoothing: False # False or type of smoothing method (eg: median)
      overlap: 0.5 # Overlap ratio for overlapped mean/median smoothing filter
      onset: 0.5 # Onset threshold for detecting the beginning and end of a speech 
      offset: 0.3 # Offset threshold for detecting the end of a speech
      pad_onset: 0.2 # Adding durations before each speech segment 
      pad_offset: 0.2 # Adding durations after each speech segment 
      min_duration_on: 0.5 # Threshold for small non_speech deletion
      min_duration_off: 0.5 # Threshold for short speech segment deletion
      filter_speech_first: True 

  speaker_embeddings:
    model_path: titanet_large # .nemo local model path or pretrained model name (titanet_large, ecapa_tdnn or speakerverification_speakernet)
    parameters:
      window_length_in_sec: [1.9, 1.2, 0.5] # Window length(s) in sec (floating-point number). either a number or a list. ex) 1.5 or [1.5,1.0,0.5]
      shift_length_in_sec: [0.95, 0.6, 0.25] # Shift length(s) in sec (floating-point number). either a number or a list. ex) 0.75 or [0.75,0.5,0.25]
      multiscale_weights: [1,1,1] # Weight for each scale. should be null (for single scale) or a list matched with window/shift scale count. ex) [0.33,0.33,0.33]
      save_embeddings: True # If True, save speaker embeddings in pickle format. This should be True if clustering result is used for other models, such as `msdd_model`.
  
  clustering:
    parameters:
      oracle_num_speakers: False # If True, use num of speakers value provided in manifest file.
      max_num_speakers: 8 # Max number of speakers for each recording. If an oracle number of speakers is passed, this value is ignored.
      enhanced_count_thres: 80 # If the number of segments is lower than this number, enhanced speaker counting is activated.
      max_rp_threshold: 0.25 # Determines the range of p-value search: 0 < p <= max_rp_threshold. 
      sparse_search_volume: 10 # The higher the number, the more values will be examined with more time. 
      maj_vote_spk_count: False  # If True, take a majority vote on multiple p-values to estimate the number of speakers.
      chunk_cluster_count: 50 # Number of forced clusters (overclustering) per unit chunk in long-form audio clustering.
      embeddings_per_chunk: 10000 # Number of embeddings in each chunk for long-form audio clustering. Adjust based on GPU memory capacity. (default: 10000, approximately 40 mins of audio) 

  msdd_model:
    model_path: "diar_msdd_general"  # Update this with your MSDD pretrained model path
    parameters:
      use_speaker_model_from_ckpt: True # If True, use speaker embedding model in checkpoint. If False, the provided speaker embedding model in config will be used.
      infer_batch_size: 16 # Batch size for MSDD inference. 
      sigmoid_threshold: [0.7] # Sigmoid threshold for generating binarized speaker labels. The smaller the more generous on detecting overlaps.
      seq_eval_mode: False # If True, use oracle number of speaker and evaluate F1 score for the given speaker sequences. Default is False.
      split_infer: True # If True, break the input audio clip to short sequences and calculate cluster average embeddings for inference.
      diar_window_length: 50 # The length of split short sequence when split_infer is True.
      overlap_infer_spk_limit: 5 # If the estimated number of speakers are larger than this number, overlap speech is not estimated.
  
  asr:
    model_path: "/kaggle/input/asr-model-quartznet/quartznet-CTC-Char-Medium.nemo" # Path to your trained .nemo model
    parameters:
      asr_based_vad: False # If you want to use ASR for VAD
      asr_based_vad_threshold: 1.0
      asr_batch_size: 1 # Adjust based on your hardware capabilities
      decoder_delay_in_sec: null
      word_ts_anchor_offset: null
      word_ts_anchor_pos: "start"
      fix_word_ts_with_VAD: False
      colored_text: False
      print_time: True
      break_lines: False
  
    ctc_decoder_parameters:
      pretrained_language_model: null
      beam_width: 32
      alpha: 0.5
      beta: 2.5

    realigning_lm_parameters:
      arpa_language_model: null
      min_number_of_words: 3
      max_number_of_words: 10
      logprob_diff_threshold: 1.2
"""

In [None]:
import os

# Make sure the target directory is present before the config is written
directory = "/kaggle/working/"
os.makedirs(directory, exist_ok=True)

# Dump the YAML text into asr_diar.yaml inside that directory
yaml_target = os.path.join(directory, "asr_diar.yaml")
with open(yaml_target, "w") as file:
    file.write(config_data)

print(f"Configuration saved to {os.path.abspath(yaml_target)}")

In [None]:
# Destination of the stand-alone inference script generated by this cell
file_path = "/kaggle/working/asr_diar.py"

# Script source, kept as text. It loads asr_diar.yaml through hydra_runner, runs
# ASR to obtain words and word timestamps, runs diarization, and combines both
# into speaker-labelled transcripts. The evaluation section only runs when
# run_diarization returns a diar_score that is not None.
code = """
from omegaconf import OmegaConf

from nemo.collections.asr.parts.utils.decoder_timestamps_utils import ASRDecoderTimeStamps
from nemo.collections.asr.parts.utils.diarization_utils import OfflineDiarWithASR
from nemo.collections.asr.models import EncDecCTCModel
from nemo.core.config import hydra_runner
from nemo.utils import logging
import torch

@hydra_runner(config_path="/kaggle/working/", config_name="asr_diar.yaml")
def main(cfg):

    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')

    # ASR inference for words and word timestamps
    if torch.cuda.is_available():
            torch.cuda.empty_cache()
            logging.info("Cleared CUDA cache.")
            
    asr_decoder_ts = ASRDecoderTimeStamps(cfg.diarizer)
    asr_model = asr_decoder_ts.set_asr_model()
    
            
    word_hyp, word_ts_hyp = asr_decoder_ts.run_ASR(asr_model)

    # Create a class instance for matching ASR and diarization results
    asr_diar_offline = OfflineDiarWithASR(cfg.diarizer)
    asr_diar_offline.word_ts_anchor_offset = asr_decoder_ts.word_ts_anchor_offset

    # Diarization inference for speaker labels
    diar_hyp, diar_score = asr_diar_offline.run_diarization(cfg, word_ts_hyp)
    trans_info_dict = asr_diar_offline.get_transcript_with_speaker_labels(diar_hyp, word_hyp, word_ts_hyp)

    # If RTTM is provided and DER evaluation
    if diar_score is not None:
        # Get session-level diarization error rate and speaker counting error
        der_results = OfflineDiarWithASR.gather_eval_results(
            diar_score=diar_score,
            audio_rttm_map_dict=asr_diar_offline.AUDIO_RTTM_MAP,
            trans_info_dict=trans_info_dict,
            root_path=asr_diar_offline.root_path,
        )

        # Calculate WER and cpWER if reference CTM files exist
        wer_results = OfflineDiarWithASR.evaluate(
            hyp_trans_info_dict=trans_info_dict,
            audio_file_list=asr_diar_offline.audio_file_list,
            ref_ctm_file_list=asr_diar_offline.ctm_file_list,
        )

        # Print average DER, WER and cpWER
        OfflineDiarWithASR.print_errors(der_results=der_results, wer_results=wer_results)

        # Save detailed session-level evaluation results in `root_path`.
        OfflineDiarWithASR.write_session_level_result_in_csv(
            der_results=der_results,
            wer_results=wer_results,
            root_path=asr_diar_offline.root_path,
            csv_columns=asr_diar_offline.csv_columns,
        )


if __name__ == '__main__':
    main()

"""

# Persist the script so it can be launched as a separate Python process
with open(file_path, "w") as file:
    file.write(code)

print("File created successfully!")

In [None]:
import torch
# Release unused cached GPU memory held by this kernel before the script is launched
torch.cuda.empty_cache()

In [None]:
# Run the generated ASR + diarization script in a separate Python process
!python /kaggle/working/asr_diar.py

In [None]:
import os
import json

# Directory containing the .txt transcript files (inside the out_dir set in the diarizer config)
input_dir = '/kaggle/working/outdir/pred_rttms'
# Directory that receives one converted .json file per .txt transcript
output_dir = '/kaggle/working/submission'

# Create the output directory if it does not exist
os.makedirs(output_dir, exist_ok=True)

def _timestamp_to_seconds(stamp):
    """Convert a colon-separated timestamp (SS, MM:SS or HH:MM:SS) to seconds."""
    seconds = 0.0
    for field in stamp.strip().split(':'):
        # Every field is worth 60x the field to its right.
        seconds = seconds * 60 + float(field)
    return seconds


def process_line(line):
    """Parse one transcript line into a segment dictionary.

    Expected layout: ``[<start> - <end>] speaker_<n>: <text>``. The timestamps
    may be ``MM:SS``- or ``HH:MM:SS``-style (fractional seconds allowed); the
    hour field is included in the result instead of being read as minutes.

    Args:
        line: A single transcript line; surrounding whitespace is ignored.

    Returns:
        dict: ``{'start': float, 'end': float, 'speaker': int, 'text': str}``
        with times in seconds. A speaker label that is not a plain number once
        ``speaker_`` and colons are removed maps to ``0``. When the line cannot
        be parsed (``ValueError``), the problem is printed and ``start``/``end``
        fall back to ``0.0`` and ``speaker`` to ``0``.
    """
    start = end = speaker = None  # Initialize variables
    text = ""

    try:
        # Strip any leading or trailing whitespace
        line = line.strip()

        # "[<start> - <end>" on the left, "speaker_<n>: <text>" on the right
        time_range, rest = line.split('] ', 1)
        # time_range[1:] drops the opening '['
        start_time, end_time = time_range[1:].split(' - ')
        rest_parts = rest.split(': ', 1)

        if len(rest_parts) == 2:
            speaker, text = rest_parts
        else:
            # No ": " separator: the line carries a speaker label but no text
            speaker = rest_parts[0]

        # Convert both timestamps to seconds (handles an optional hour field)
        start = _timestamp_to_seconds(start_time)
        end = _timestamp_to_seconds(end_time)

        # Reduce the label to its number; anything non-numeric becomes speaker 0
        if speaker:
            speaker = speaker.replace('speaker_', '').replace(':', '').strip()
            speaker = int(speaker) if speaker.isdigit() else 0
        else:
            speaker = 0

    except ValueError as e:
        # Print the error message and line that caused it
        print(f"Error processing line: '{line}'")
        print(f"Exception: {e}")
        # Provide default values in case of error
        start = end = 0.0
        speaker = 0

    # Return the processed entry
    return {
        'start': start,
        'end': end,
        'speaker': speaker,
        'text': text.strip() if text else ""
    }

# Process each .txt file (sorted so the conversion order is reproducible)
for txt_filename in sorted(os.listdir(input_dir)):
    if not txt_filename.endswith('.txt'):
        continue

    txt_file_path = os.path.join(input_dir, txt_filename)

    # Create corresponding .json filename by swapping only the trailing
    # extension; str.replace would also rewrite any '.txt' inside the name.
    json_filename = txt_filename[:-len('.txt')] + '.json'
    json_file_path = os.path.join(output_dir, json_filename)

    # Convert the content to the desired JSON format. Blank lines carry no
    # segment and would otherwise be turned into placeholder entries
    # (start = end = 0.0) by process_line, so they are skipped.
    with open(txt_file_path, 'r', encoding='utf-8') as file:
        entries = [process_line(line) for line in file if line.strip()]

    # Write to the .json file
    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(entries, json_file, ensure_ascii=False, indent=4)

print("Conversion complete.")