In [1]:
import os
import json
import random

# Configuration
manifest_file = '/kaggle/input/nemo-cleaned-arabic-json/train.json'
train_ratio = 0.8  # 80% for training
dev_ratio = 0.1    # 10% for development
test_ratio = 0.1   # 10% for testing (the test set receives whatever is left after train/dev)
split_seed = 42    # fixed seed so that Restart & Run All reproduces the same split

# Load existing manifest data (one JSON object per line)
manifest_data = []
if os.path.isfile(manifest_file):
    with open(manifest_file, 'r', encoding='utf-8') as json_file:
        for line in json_file:
            stripped = line.strip()
            if stripped:  # tolerate blank lines, e.g. a trailing newline at end of file
                manifest_data.append(json.loads(stripped))
else:
    # Keep the original best-effort behaviour (empty splits) but make it visible.
    print(f"Warning: manifest file not found: {manifest_file}")

# Shuffle the data with a dedicated, seeded RNG so the global random state is untouched
random.Random(split_seed).shuffle(manifest_data)

# Calculate split indices (int() truncates, so rounding leftovers end up in the test set)
total_samples = len(manifest_data)
train_end = int(total_samples * train_ratio)
dev_end = train_end + int(total_samples * dev_ratio)

# Split the data into three contiguous, non-overlapping slices
train_data = manifest_data[:train_end]
dev_data = manifest_data[train_end:dev_end]
test_data = manifest_data[dev_end:]

# Define a function to write a subset to a JSON file
def write_subset(filename, data):
    """Write manifest entries to ``filename`` as JSON Lines.

    Each entry is serialized on its own line, which is the layout the rest of
    this notebook reads back with ``json.loads`` per line.

    Args:
        filename: Destination path; an existing file is overwritten.
        data: Iterable of JSON-serializable entries (the manifest dicts).
    """
    # Explicit UTF-8 keeps the result independent of the platform locale.
    with open(filename, 'w', encoding='utf-8') as json_file:
        for entry in data:
            # Entries are written unchanged; json escapes non-ASCII text by default.
            json_file.write(json.dumps(entry))
            json_file.write('\n')

# Persist every split under its manifest name, in train/dev/test order
for subset_filename, subset_entries in (
    ('train_manifest.json', train_data),
    ('dev_manifest.json', dev_data),
    ('test_manifest.json', test_data),
):
    write_subset(subset_filename, subset_entries)

# Report how many samples landed in each split
print(f"Data split into train ({len(train_data)} samples), dev ({len(dev_data)} samples), and test ({len(test_data)} samples) sets.")


Data split into train (40572 samples), dev (5071 samples), and test (5072 samples) sets.


In [2]:
!pip install nemo_toolkit['asr']

Collecting nemo_toolkit[asr]
  Downloading nemo_toolkit-1.23.0-py3-none-any.whl.metadata (18 kB)
Collecting triton (from nemo_toolkit[asr])
  Downloading triton-2.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Collecting wget (from nemo_toolkit[asr])
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting braceexpand (from nemo_toolkit[asr])
  Downloading braceexpand-0.1.7-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting editdistance (from nemo_toolkit[asr])
  Downloading editdistance-0.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Collecting g2p-en (from nemo_toolkit[asr])
  Downloading g2p_en-2.1.0-py3-none-any.whl.metadata (4.5 kB)
Collecting jiwer (from nemo_toolkit[asr])
  Downloading jiwer-3.0.4-py3-none-any.whl.metadata (2.6 kB)
Collecting kaldi-python-io (from nemo_toolkit[asr])
  Downloading kaldi-python-io-1.2.2.tar.gz (8.8 kB)
  Preparing metadata (setup.py) ... [?25ld

In [3]:
!git clone https://github.com/NVIDIA/NeMo.git

fatal: destination path 'NeMo' already exists and is not an empty directory.


In [4]:
# Build Tokenizer
# Runs the process_asr_text_tokenizer.py script from the cloned NeMo checkout on the
# training manifest, requesting a "wpe" tokenizer with a vocabulary size of 1024 and
# no lower-casing. The output goes under --data_root; the cell log reports the
# serialized location as ./NeMo/toke/tokenizer_wpe_v1024.
# NOTE(review): the --spe_type / --spe_character_coverage flags look specific to
# SentencePiece tokenizers and are presumably ignored for "wpe" -- confirm against
# the script's help text.
!python /kaggle/working/NeMo/scripts/tokenizers/process_asr_text_tokenizer.py \
        --manifest='train_manifest.json' \
        --data_root="./NeMo/toke" \
        --vocab_size=1024 \
        --tokenizer="wpe" \
        --spe_type="unigram" \
        --no_lower_case \
        --spe_character_coverage=1.0 \
        --log

[2K[00:00:00] Tokenize words                 ██████████████████ 60751    /    60751[00:00:00] Tokenize words                 ██████████████████ 0        /        0
[2K[00:00:00] Count pairs                    ██████████████████ 60751    /    60751
[2K[00:00:00] Compute merges                 ██████████████████ 931      /      931
Serialized tokenizer at location : ./NeMo/toke/tokenizer_wpe_v1024


In [19]:
# Create the .yaml configuration for Fast Conformer Transducer with Egyptian Dialect
# The text below is kept as a plain string and has three top-level sections besides
# `name`: `model` (tokenizer, train/validation/test datasets, preprocessor,
# spec_augment, encoder, decoder, decoding, joint, optim), `trainer`, and
# `exp_manager`. Values of the form ${...} are interpolations that refer to other
# keys of the same document.
# NOTE(review): the name says "Fast-Conformer", while the encoder is configured with
# `subsampling: striding` and `subsampling_factor: 4` -- confirm this is intended.
# NOTE(review): `exp_manager.create_checkpoint_callback` is false, so the
# `checkpoint_callback_params` block presumably has no effect -- confirm.
config_data = """
name: "Fast-Conformer-Transducer-Egyptian-Dialect"

model:
  sample_rate: 16000
  log_prediction: false
  skip_nan_grad: false
  rnnt_reduction: 'mean_volume'
  
  model_defaults:
    enc_hidden: ${model.encoder.d_model}
    pred_hidden: 64
    joint_hidden: 64
    filters: 128
  
  tokenizer:
    type: 'wpe'
    dir: "/kaggle/working/NeMo/toke/tokenizer_wpe_v1024"

  train_ds:
    manifest_filepath: "./train_manifest.json"
    sample_rate: 16000
    batch_size: 16
    shuffle: true
    num_workers: 4
    pin_memory: true
    trim_silence: false
    max_duration: 16
    min_duration: 0.1
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 2048
    bucketing_strategy: "synced_randomized"
    bucketing_batch_size: null
    white_noise:
      prob: 0.5
      min_level: -90
      max_level: -46
    speed:
      prob: 0.5
      sr: 16000
      resample_type: 'kaiser_fast'
      min_speed_rate: 0.95
      max_speed_rate: 1.05

  validation_ds:
    manifest_filepath: "./dev_manifest.json"
    sample_rate: 16000
    batch_size: 16
    shuffle: false
    num_workers: 4
    pin_memory: true

  test_ds:
    manifest_filepath: "./test_manifest.json"
    sample_rate: 16000
    batch_size: 16
    shuffle: false
    num_workers: 4
    pin_memory: true

  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    sample_rate: 16000
    normalize: "per_feature"
    window_size: 0.025
    window_stride: 0.01
    window: "hann"
    features: 80
    n_fft: 512
    log: true
    frame_splicing: 1
    dither: 0.00001
    pad_to: 0
    pad_value: 0.0

  spec_augment:
    _target_: nemo.collections.asr.modules.SpectrogramAugmentation
    freq_masks: 2
    time_masks: 2
    freq_width: 15
    time_width: 25
    rect_masks: 5
    rect_time: 25
    rect_freq: 15

  encoder:
    _target_: nemo.collections.asr.modules.ConformerEncoder
    feat_in: ${model.preprocessor.features}
    feat_out: -1
    n_layers: 16
    d_model: 176
    subsampling: striding
    subsampling_factor: 4
    subsampling_conv_channels: 176
    ff_expansion_factor: 4
    self_attention_model: rel_pos
    n_heads: 4
    att_context_size: [-1, -1]
    xscaling: true
    untie_biases: true
    pos_emb_max_len: 5000
    conv_kernel_size: 31
    conv_norm_type: 'batch_norm'
    dropout: 0.1 
    dropout_pre_encoder: 0.1
    dropout_emb: 0.0
    dropout_att: 0.1 

  decoder:
    _target_: nemo.collections.asr.modules.RNNTDecoder
#    feat_in: null
#    num_classes: 34
    normalization_mode: null
    random_state_sampling: false
    blank_as_pad: true
    
    prednet:
      pred_hidden: ${model.model_defaults.pred_hidden}
      pred_rnn_layers: 1
      t_max: null
      dropout: 0.2
      
  decoding:
    strategy: "greedy_batch"

    greedy:
      max_symbols: 10

    beam:
      beam_size: 2
      return_best_hypothesis: False
      score_norm: true
      tsd_max_sym_exp: 50
      alsd_max_target_len: 2.0
      
  joint:
    _target_: nemo.collections.asr.modules.RNNTJoint
    log_softmax: null
    preserve_memory: false
    fuse_loss_wer: true
    fused_batch_size: 4

    jointnet:
      joint_hidden: ${model.model_defaults.joint_hidden}
      activation: "relu"
      dropout: 0.2

  optim:
    name: adamw
    lr: 2.0
    betas: [0.9, 0.98]
    weight_decay: 1e-3
    sched:
      name: NoamAnnealing
      d_model: ${model.encoder.d_model}
      warmup_steps: 10000
      min_lr: 1e-6

trainer:
  devices: -1
  num_nodes: 1
  max_epochs: 35
  val_check_interval: 1.0
  accelerator: auto
  strategy: 'ddp'
  accumulate_grad_batches: 1
  gradient_clip_val: 0.0
  precision: 32
  log_every_n_steps: 100
  num_sanity_val_steps: 0
  check_val_every_n_epoch: 1
  sync_batchnorm: true
  enable_checkpointing: True
  logger: false
  benchmark: false
  

exp_manager:
  exp_dir: 'experiments/'
  name: ${name}
  create_tensorboard_logger: true
  create_checkpoint_callback: false
  checkpoint_callback_params:
    monitor: "val_wer"
    mode: "min"
    save_top_k: 2
    always_save_nemo: True
    save_best_model: True
  resume_if_exists: false
  resume_ignore_no_checkpoint: false
"""

In [20]:
import os

# Resolve the destination of the YAML file once and reuse it below
directory = "/kaggle/working/NeMo/conf"
config_path = os.path.join(directory, "fast_conformer_transducer_egyptian_dialect.yaml")

# Create the parent directory on first use
if not os.path.exists(directory):
    os.makedirs(directory)

# Dump the in-memory configuration text to disk, replacing any earlier copy
with open(config_path, "w") as config_file:
    config_file.write(config_data)

print(f"Configuration saved to {os.path.abspath(config_path)}")

Configuration saved to /kaggle/working/NeMo/conf/fast_conformer_transducer_egyptian_dialect.yaml


In [21]:
import os

# Output directory for training artifacts, relative to the current working directory
directory = "train_experiments"

# Report an existing directory first; otherwise create it and say so
if os.path.exists(directory):
    print(f"Directory '{directory}' already exists.")
else:
    os.makedirs(directory)
    print(f"Directory '{directory}' created successfully.")

Directory 'train_experiments' already exists.


In [22]:
# Destination path of the generated training script (nothing is read from it here)
file_path = "/kaggle/working/train_transducer_model.py"

# Full source of the training script, kept as a string and written to disk below.
# The script restores a .nemo checkpoint, attaches the train/validation/test data
# loaders from the Hydra config, runs trainer.fit, saves the result as
# transducer-v4.nemo, and finally runs the test loop when a test manifest is set.
code = """
import pytorch_lightning as pl
from omegaconf import OmegaConf
from nemo.collections.asr.models import EncDecRNNTBPEModel
from nemo.core.config import hydra_runner
from nemo.utils import logging
from nemo.utils.exp_manager import exp_manager

# Function to setup train and validation data
def setup_data_loaders(model, cfg):
    model.setup_training_data(train_data_config=cfg.model.train_ds)
    model.setup_validation_data(val_data_config=cfg.model.validation_ds)
    model.setup_test_data(test_data_config=cfg.model.test_ds)

@hydra_runner(config_path=".", config_name="config_rnnt_bpe")
def main(cfg):
    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')

    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))
    
    checkpoint_path = '/kaggle/input/asr-transducer-v3/transducer-v3.nemo'
    transducer_v3_model = EncDecRNNTBPEModel.restore_from(restore_path=checkpoint_path)
    
    # Initialize the weights of the model from another model, if provided via config
    transducer_v3_model.maybe_init_from_pretrained_checkpoint(cfg)
    
    # Setup data loaders
    setup_data_loaders(transducer_v3_model, cfg)
    
    # Train the model
    trainer.fit(transducer_v3_model)
    transducer_v3_model.save_to("/kaggle/working/train_experiments/transducer-v4.nemo")
    
    # Test the model if test dataset is provided
    if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None:
        if transducer_v3_model.prepare_test(trainer):
            trainer.test(transducer_v3_model)

if __name__ == '__main__':
    main()
"""

# Write the script to disk; mode "w" overwrites any existing file at file_path
with open(file_path, "w") as file:
    file.write(code)

print("File created successfully!")


File created successfully!


In [23]:
# Launch the generated training script as a separate process. --config-path and
# --config-name point Hydra at the YAML written to /kaggle/working/NeMo/conf, in
# place of the defaults passed to hydra_runner inside the script.
!python /kaggle/working/train_transducer_model.py --config-path='/kaggle/working/NeMo/conf' --config-name='fast_conformer_transducer_egyptian_dialect'

    See https://hydra.cc/docs/1.2/upgrades/1.1_to_1.2/changes_to_job_working_dir/ for more information.
      ret = run_job(
    
[NeMo I 2024-06-30 18:07:13 train_transducer_model:17] Hydra config: name: Fast-Conformer-Transducer-Egyptian-Dialect
    model:
      sample_rate: 16000
      log_prediction: false
      skip_nan_grad: false
      rnnt_reduction: mean_volume
      model_defaults:
        enc_hidden: ${model.encoder.d_model}
        pred_hidden: 64
        joint_hidden: 64
        filters: 128
      tokenizer:
        type: wpe
        dir: /kaggle/working/NeMo/toke/tokenizer_wpe_v1024
      train_ds:
        manifest_filepath: ./train_manifest.json
        sample_rate: 16000
        batch_size: 16
        shuffle: true
        num_workers: 4
        pin_memory: true
        trim_silence: false
        max_duration: 16
        min_duration: 0.1
        is_tarred: false
        tarred_audio_filepaths: null
        shuffle_n: 2048
        bucketing_strategy: synced_randomized


In [3]:
from nemo.collections.asr.models import EncDecRNNTBPEModel
# Restore the transducer-v4 model from a .nemo archive mounted as a Kaggle input.
# NOTE(review): presumably the file the training script saved as
# train_experiments/transducer-v4.nemo and later uploaded as a dataset -- confirm.
checkpoint_path = '/kaggle/input/asr-transducer-v4/transducer-v4.nemo'
transducer_v4_model = EncDecRNNTBPEModel.restore_from(restore_path=checkpoint_path)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

    


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

[NeMo I 2024-07-01 11:58:32 mixins:172] Tokenizer AutoTokenizer initialized with 1024 tokens


[NeMo W 2024-07-01 11:58:33 modelPT:165] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: ./train_manifest.json
    sample_rate: 16000
    batch_size: 16
    shuffle: true
    num_workers: 4
    pin_memory: true
    trim_silence: false
    max_duration: 16
    min_duration: 0.1
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 2048
    bucketing_strategy: synced_randomized
    bucketing_batch_size: null
    white_noise:
      prob: 0.5
      min_level: -90
      max_level: -46
    speed:
      prob: 0.5
      sr: 16000
      resample_type: kaiser_fast
      min_speed_rate: 0.95
      max_speed_rate: 1.05
    
[NeMo W 2024-07-01 11:58:33 modelPT:172] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configura

[NeMo I 2024-07-01 11:58:33 features:289] PADDING: 0


    


[NeMo I 2024-07-01 11:58:34 rnnt_models:217] Using RNNT Loss : warprnnt_numba
    Loss warprnnt_numba_kwargs: None
[NeMo I 2024-07-01 11:58:35 save_restore_connector:249] Model EncDecRNNTBPEModel was successfully restored from /kaggle/input/asr-transducer-v4/transducer-v4.nemo.


In [7]:
data_dir = '/kaggle/input/mtc-aic-test-data/test'
# Smoke test on five consecutive clean clips, sample ids 1000 through 1004
audio = [
    os.path.join(data_dir, f'test_sample_{sample_id}_clean.wav')
    for sample_id in range(1000, 1005)
]
sample_transcripts = transducer_v4_model.transcribe(paths2audio_files=audio, batch_size=4)
print(sample_transcripts)

Transcribing:   0%|          | 0/2 [00:00<?, ?it/s]

(['طبعا', '', 'يعني خاصه بتكلم على السعيد', '', 'والح'], ['طبعا', '', 'يعني خاصه بتكلم على السعيد', '', 'والح'])


In [21]:
from omegaconf import OmegaConf, open_dict

# Load the YAML configuration written earlier in the notebook into an OmegaConf object.
# NOTE(review): open_dict is imported but not used in the cells visible here.
config = OmegaConf.load("/kaggle/working/NeMo/conf/fast_conformer_transducer_egyptian_dialect.yaml")

In [30]:
import os
import pandas as pd

# Directory holding the test clips to transcribe
test_dir = '/kaggle/input/mtc-aic-test-data/test'

# Keep regular files only (skip sub-directories) and report how many there are
items = os.listdir(test_dir)
files = [item for item in items if os.path.isfile(os.path.join(test_dir, item))]
test_wav_files = len(files)
print(test_wav_files)

# Build the clip list in this cell so it no longer depends on a `wav_files` variable
# left over from an earlier run; sorting gives a stable, reproducible order.
wav_files = sorted(os.path.join(test_dir, name) for name in files if name.endswith('.wav'))

# Transcribe all clips in one batched call instead of one model call per file.
# The list is passed positionally so the call does not depend on the keyword name
# (`paths2audio_files` vs. `audio`) used by the installed NeMo version.
raw_output = transducer_v4_model.transcribe(wav_files, batch_size=4)

# The sample-transcription cell shows this model returning a
# (best_hypotheses, all_hypotheses) tuple; keep only the best hypotheses.
best_hypotheses = raw_output[0] if isinstance(raw_output, tuple) else raw_output

# One record per clip: the file name without extension plus its transcript text.
# getattr() also copes with hypothesis objects that expose the text as `.text`.
transcriptions = [
    {
        'audio': os.path.splitext(os.path.basename(wav_file))[0],
        'transcript': getattr(hypothesis, 'text', hypothesis),
    }
    for wav_file, hypothesis in zip(wav_files, best_hypotheses)
]

# Create a DataFrame from transcriptions
#df = pd.DataFrame(transcriptions)

# Save DataFrame to CSV
#submission_file = '/kaggle/working/submission.csv'  # Specify your desired output file path
#df.to_csv(submission_file, index=False)

1726
