In [1]:
from nemo.collections.asr.models import EncDecSpeakerLabelModel
from IPython.display import Audio, display
import numpy as np
import torch
import torchaudio
import torchaudio.transforms as T
import torchaudio.functional as F
import math
import os
import requests

import pytorch_lightning as pl
import nemo
import nemo.collections.asr as nemo_asr
from omegaconf import OmegaConf
from nemo.utils.exp_manager import exp_manager


[NeMo W 2023-06-27 14:45:28 optimizers:54] Apex was not found. Using the lamb or fused_adam optimizer will error out.
    


## Get Data (AN4) for Fine-Tuning the Speaker Model

In [None]:
import os
# Everything is stored relative to the directory the notebook was launched from.
NEMO_ROOT = os.getcwd()
print(NEMO_ROOT)
import glob
import subprocess
import tarfile
import wget

data_dir = os.path.join(NEMO_ROOT, 'data')
os.makedirs(data_dir, exist_ok=True)

# Download the dataset. This will take a few moments...
print("******")
an4_path = os.path.join(data_dir, 'an4_sphere.tar.gz')
if not os.path.exists(an4_path):
    an4_url = 'https://dldata-public.s3.us-east-2.amazonaws.com/an4_sphere.tar.gz'  # for the original source, please visit http://www.speech.cs.cmu.edu/databases/an4/an4_sphere.tar.gz 
    an4_path = wget.download(an4_url, data_dir)
    print(f"Dataset downloaded at: {an4_path}")
else:
    print("Tarfile already exists.")

# Untar and convert .sph to .wav (using sox).
# The context manager guarantees the archive handle is closed even if
# extraction fails part-way through.
with tarfile.open(an4_path) as tar:
    tar.extractall(path=data_dir)

print("Converting .sph to .wav...")
sph_list = glob.glob(os.path.join(data_dir, 'an4', '**', '*.sph'), recursive=True)
for sph_path in sph_list:
    wav_path = os.path.splitext(sph_path)[0] + '.wav'
    cmd = ["sox", sph_path, wav_path]
    # check=True surfaces a failed conversion immediately instead of silently
    # leaving a missing .wav that only shows up later when the manifest is used.
    subprocess.run(cmd, check=True)
print("Finished conversion.\n******")

In [None]:
print("Downloading necessary scripts")
!mkdir -p scripts/speaker_tasks
!wget -P scripts/speaker_tasks/ https://raw.githubusercontent.com/NVIDIA/NeMo/main/scripts/speaker_tasks/filelist_to_manifest.py
    

In [None]:
data_dir = "data"
!find {data_dir}/an4/wav/an4test_clstk  -iname "*.wav" > {data_dir}/an4/wav/an4test_clstk/test_all.txt
!python {NEMO_ROOT}/scripts/speaker_tasks/filelist_to_manifest.py --filelist {data_dir}/an4/wav/an4test_clstk/test_all.txt --id -2 --out {data_dir}/an4/wav/an4test_clstk/test.json

## Config Files

In [None]:
# #speakernet base config
# !mkdir -p conf
!wget -P conf https://raw.githubusercontent.com/NVIDIA/NeMo/r1.19.0/examples/speaker_tasks/recognition/conf/ecapa_tdnn.yaml

In [2]:
# Load the downloaded ECAPA-TDNN YAML; later cells customise this config object
# before the model is built from it.
MODEL_CONFIG = 'conf/ecapa_tdnn.yaml'
finetune_config = OmegaConf.load(MODEL_CONFIG)
# Uncomment to inspect the full configuration:
# print(OmegaConf.to_yaml(finetune_config))

In [3]:
# Model data config (uses the manifest written by the data-gathering cells above).
# The manifest is resolved relative to the working directory instead of a
# machine-specific absolute path, so the notebook runs on any checkout.
an4_manifest = os.path.join(os.getcwd(), 'data', 'an4', 'wav', 'an4test_clstk', 'test.json')
if not os.path.isfile(an4_manifest):
    # Fail here with an actionable message rather than deep inside model setup.
    raise FileNotFoundError(
        f"Manifest not found at {an4_manifest}; run the data download and manifest cells first."
    )

# NOTE: the same manifest is used for both training and validation, so the
# validation metrics do not measure generalisation to unseen data.
train_manifest = an4_manifest
validation_manifest = an4_manifest
finetune_config.model.train_ds.manifest_filepath = train_manifest
finetune_config.model.validation_ds.manifest_filepath = validation_manifest
# NOTE(review): the noise augmentor is pointed at the speech manifest itself —
# presumably a placeholder because no separate noise manifest is available; confirm.
finetune_config.model.train_ds.augmentor.noise.manifest_path = train_manifest
finetune_config.model.decoder.num_classes = 10 #PLEASE CHANGE TO ACTUAL NUMBER OF CLASSES

In [4]:
# Trainer configuration — adjust the values below as needed.
# Only CUDA is probed here; every other machine falls back to the CPU.
if torch.cuda.is_available():
    accelerator = 'gpu'
else:
    accelerator = 'cpu'

trainer_settings = {
    'devices': 1,
    'accelerator': accelerator,
    'max_epochs': 5,
    'max_steps': -1,
    'num_nodes': 1,
    'accumulate_grad_batches': 1,
    'enable_checkpointing': False,
    'logger': False,
    # Interval of logging.
    'log_every_n_steps': 1,
    # Set to 0.25 to check 4 times per epoch, or an int for number of iterations.
    'val_check_interval': 1.0,
}
trainer_config = OmegaConf.create(trainer_settings)
# Uncomment to inspect the trainer configuration:
# print(OmegaConf.to_yaml(trainer_config))
trainer_finetune = pl.Trainer(**trainer_config)

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
      rank_zero_warn(
    
`Trainer(val_check_interval=1.0)` was configured so validation will run at the end of the training epoch..


In [5]:
# Build the speaker model from the fine-tuning config, then initialise it from
# the pretrained checkpoint referenced by that config. The run log reports the
# checkpoint name `ecapa_tdnn` with `decoder.final.weight` excluded, i.e. the
# final classification layer is not restored because the number of speakers differs.
speaker_model = EncDecSpeakerLabelModel(
    cfg=finetune_config.model,
    trainer=trainer_finetune,
)
speaker_model.maybe_init_from_pretrained_checkpoint(finetune_config)

[NeMo I 2023-06-27 14:45:38 collections:298] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2023-06-27 14:45:38 collections:299] Dataset loaded with 130 items, total duration of  0.10 hours.
[NeMo I 2023-06-27 14:45:38 collections:301] # 130 files loaded accounting to # 10 labels


[NeMo W 2023-06-27 14:45:38 label_models:180] Total number of 10 found in all the manifest files.


[NeMo I 2023-06-27 14:45:38 collections:193] Dataset loaded with 130 files totalling 0.10 hours
[NeMo I 2023-06-27 14:45:38 collections:194] 0 files were filtered totalling 0.00 hours
[NeMo I 2023-06-27 14:45:38 collections:298] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2023-06-27 14:45:38 collections:299] Dataset loaded with 130 items, total duration of  0.10 hours.
[NeMo I 2023-06-27 14:45:38 collections:301] # 130 files loaded accounting to # 10 labels
[NeMo I 2023-06-27 14:45:38 collections:298] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2023-06-27 14:45:38 collections:299] Dataset loaded with 130 items, total duration of  0.10 hours.
[NeMo I 2023-06-27 14:45:38 collections:301] # 130 files loaded accounting to # 10 labels
[NeMo I 2023-06-27 14:45:38 features:289] PADDING: 16
[NeMo I 2023-06-27 14:45:38 cloud:58] Found existing object /Users/ajaybati/.cache/torch/NeMo/NeMo_1.19.0rc0/ecapa_tdnn/3e0c5c4731b176aeb70c29a74d800c81/ecapa_tdnn.

[NeMo W 2023-06-27 14:45:38 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /manifests/combined_fisher_swbd_voxceleb12_librispeech/train.json
    sample_rate: 16000
    labels: null
    batch_size: 64
    shuffle: true
    time_length: 3
    augmentor:
      noise:
        manifest_path: /manifests/noise/rir_noise_manifest.json
        prob: 0.5
        min_snr_db: 0
        max_snr_db: 15
      speed:
        prob: 0.5
        sr: 16000
        resample_type: kaiser_fast
        min_speed_rate: 0.95
        max_speed_rate: 1.05
    num_workers: 15
    pin_memory: true
    
[NeMo W 2023-06-27 14:45:38 modelPT:168] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data lo

[NeMo I 2023-06-27 14:45:38 features:289] PADDING: 16
[NeMo I 2023-06-27 14:45:39 save_restore_connector:249] Model EncDecSpeakerLabelModel was successfully restored from /Users/ajaybati/.cache/torch/NeMo/NeMo_1.19.0rc0/ecapa_tdnn/3e0c5c4731b176aeb70c29a74d800c81/ecapa_tdnn.nemo.
[NeMo I 2023-06-27 14:45:39 modelPT:1136] Model checkpoint partially restored from pretrained checkpoint with name `ecapa_tdnn`
[NeMo I 2023-06-27 14:45:39 modelPT:1138] The following parameters were excluded when loading from pretrained checkpoint with name `ecapa_tdnn` : ['decoder.final.weight']
[NeMo I 2023-06-27 14:45:39 modelPT:1141] Make sure that this is what you wanted!


In [6]:
# `exp_manager` is already imported in the first cell of the notebook.
# Use the optional "exp_manager" section of the config (None when absent).
exp_manager_cfg = finetune_config.get("exp_manager", None)
log_dir = exp_manager(trainer_finetune, exp_manager_cfg)
# log_dir is the path of the current logging directory, printed for easy access.
print(log_dir)

[NeMo I 2023-06-27 14:45:39 exp_manager:374] Experiments will be logged at /Users/ajaybati/Documents/speakerNet/SpeakerNetTest/training/nemo_experiments/ECAPA_TDNN/2023-06-27_14-45-39
[NeMo I 2023-06-27 14:45:39 exp_manager:797] TensorboardLogger has been set up
[NeMo I 2023-06-27 14:45:39 exp_manager:912] Preemption is supported only on GPUs, disabling preemption
/Users/ajaybati/Documents/speakerNet/SpeakerNetTest/training/nemo_experiments/ECAPA_TDNN/2023-06-27_14-45-39


In [7]:
# NOTE(review): neither module is referenced by name in the visible cells;
# presumably imported here to fail fast if the audio resampling/loading
# backends are not installed before training starts — confirm.
import resampy
import librosa

In [8]:
# Start fine-tuning: fit `speaker_model` with the trainer configured earlier
# (max_epochs=5, validation at the end of every epoch).
trainer_finetune.fit(speaker_model)

[NeMo I 2023-06-27 14:45:56 modelPT:721] Optimizer config = SGD (
    Parameter Group 0
        dampening: 0
        differentiable: False
        foreach: None
        lr: 0.08
        maximize: False
        momentum: 0
        nesterov: False
        weight_decay: 0.0002
    )
[NeMo I 2023-06-27 14:45:56 lr_scheduler:910] Scheduler "<nemo.core.optim.lr_scheduler.CosineAnnealing object at 0x174ca3ac0>" 
    will be used during training (effective maximum steps = 15) - 
    Parameters : 
    (warmup_ratio: 0.1
    min_lr: 0.0001
    max_steps: 15
    )



  | Name              | Type                              | Params
------------------------------------------------------------------------
0 | loss              | AngularSoftmaxLoss                | 0     
1 | eval_loss         | AngularSoftmaxLoss                | 0     
2 | _accuracy         | TopKClassificationAccuracy        | 0     
3 | preprocessor      | AudioToMelSpectrogramPreprocessor | 0     
4 | encoder           | ECAPAEncoder                      | 18.1 M
5 | decoder           | SpeakerDecoder                    | 2.8 M 
6 | _macro_accuracy   | MulticlassAccuracy                | 0     
7 | spec_augmentation | SpectrogramAugmentation           | 0     
------------------------------------------------------------------------
20.9 M    Trainable params
0         Non-trainable params
20.9 M    Total params
83.675    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

      rank_zero_warn(
    
      rank_zero_warn(
    


Training: 0it [00:00, ?it/s]

    
      rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
    
