In [1]:
import os
import nemo.collections.asr as nemo_asr
from omegaconf import OmegaConf
import pytorch_lightning as pl
from nemo.utils.exp_manager import exp_manager

# Manifest locations for the three data splits, all resolved against one base directory.
dataset_basedir = "./"
train_dataset, val_dataset, test_dataset = (
    os.path.join(dataset_basedir, manifest_name)
    for manifest_name in (
        'train_manifest.json',
        'validation_manifest.json',
        'test_manifest.json',
    )
)

# Read the YAML config, convert it to a plain container with interpolations
# resolved (resolve=True), and wrap the result in a fresh OmegaConf object.
config_path = "configs/matchboxnet_3x1x64_v1.yaml"
config = OmegaConf.create(
    OmegaConf.to_container(OmegaConf.load(config_path), resolve=True)
)

# Point each split in the model section at its manifest file.
config.model.train_ds.manifest_filepath = train_dataset
config.model.validation_ds.manifest_filepath = val_dataset
config.model.test_ds.manifest_filepath = test_dataset

# Override the trainer section: one GPU device, up to 500 epochs, and let
# Lightning choose the strategy ('auto').
accelerator = 'gpu'
config.trainer["accelerator"] = accelerator
config.trainer["devices"] = 1
config.trainer["strategy"] = 'auto'
config.trainer["max_epochs"] = 500

# Build the Lightning trainer from the overridden trainer section.
trainer = pl.Trainer(**config.trainer)

# Restore an EncDecClassificationModel from the checkpoint file written by an earlier run.
# NOTE(review): this is an absolute, machine-specific path; it has to exist on the
# machine running the notebook.
checkpoint_path = "/home/catsunoki/pytorch/ser/nemo_experiments/CustomConv1DModel/2024-06-05_20-42-25/checkpoints/CustomConv1DModel--val_loss=1.1978-epoch=5.ckpt"
restored_model = nemo_asr.models.EncDecClassificationModel.load_from_checkpoint(checkpoint_path)

# Overwrite the manifest paths stored in the restored model's own config so that
# every split points at the manifests defined at the top of this cell.
restored_model.cfg.train_ds.manifest_filepath = train_dataset
restored_model.cfg.validation_ds.manifest_filepath = val_dataset
restored_model.cfg.test_ds.manifest_filepath = test_dataset

# Set up train/validation/test data from the updated split configs. These calls
# must come after the manifest overrides above, since each one is handed the
# corresponding (already updated) config section. Presumably this (re)builds the
# model's data loaders -- confirm against the NeMo API.
restored_model.setup_training_data(train_data_config=restored_model.cfg.train_ds)
restored_model.setup_validation_data(val_data_config=restored_model.cfg.validation_ds)
restored_model.setup_test_data(test_data_config=restored_model.cfg.test_ds)

# Attach the Lightning trainer to the restored model, then hand the trainer and
# the config's "exp_manager" section (None if the section is absent) to
# exp_manager. Its return value is kept in exp_dir; a later cell joins it with
# 'checkpoints' to locate the saved .ckpt files.
restored_model.set_trainer(trainer)
exp_dir = exp_manager(trainer, config.get("exp_manager", None))

# Train the restored model with the trainer configured above.
# NOTE(review): no ckpt_path is passed to fit(), so presumably only the state
# restored by load_from_checkpoint is reused and the optimizer/scheduler start
# fresh -- confirm whether a full resume from the checkpoint was intended.
trainer.fit(restored_model)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
`Trainer(val_check_interval=1.0)` was configured so validation will run at the end of the training epoch..
[NeMo W 2024-06-05 23:28:58 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: ./train_manifest.json
    sample_rate: 16000
    labels:
    - neutral
    - calm
    - happy
    - sad
    - angry
    - fear
    - disgust
    - surprise
    batch_size: 4
    shuffle: true
    num_workers: 4
    pin_memory: true
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 1024
    bucketing_strategy: synced_randomized
    bucketing_batch_size: null
    bucketing_weights: null
    
[NeMo W 2024-06-05 23:28:58 modelPT:168] If you intend to do validation, ple

[NeMo I 2024-06-05 23:28:58 collections:301] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-06-05 23:28:58 collections:302] Dataset loaded with 864 items, total duration of  0.89 hours.
[NeMo I 2024-06-05 23:28:58 collections:304] # 864 files loaded accounting to # 8 labels
[NeMo I 2024-06-05 23:28:58 collections:301] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-06-05 23:28:58 collections:302] Dataset loaded with 288 items, total duration of  0.30 hours.
[NeMo I 2024-06-05 23:28:58 collections:304] # 288 files loaded accounting to # 8 labels
[NeMo I 2024-06-05 23:28:58 collections:301] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-06-05 23:28:58 collections:302] Dataset loaded with 288 items, total duration of  0.30 hours.
[NeMo I 2024-06-05 23:28:58 collections:304] # 288 files loaded accounting to # 8 labels
[NeMo I 2024-06-05 23:28:58 exp_manager:386] Experiments will be logged at /home/catsunoki/pytorch/ser/nem

You are using a CUDA device ('NVIDIA GeForce RTX 4070 SUPER') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[NeMo I 2024-06-05 23:28:59 modelPT:728] Optimizer config = Adam (
    Parameter Group 0
        amsgrad: False
        betas: [0.9, 0.999]
        capturable: False
        differentiable: False
        eps: 1e-08
        foreach: None
        fused: None
        lr: 0.005
        maximize: False
        weight_decay: 0.0001
    )
[NeMo I 2024-06-05 23:28:59 lr_scheduler:910] Scheduler "<nemo.core.optim.lr_scheduler.CosineAnnealing object at 0x7fa460db06d0>" 
    will be used during training (effective maximum steps = 108000) - 
    Parameters : 
    (warmup_steps: null
    warmup_ratio: 0.05
    min_lr: 1.0e-06
    last_epoch: -1
    max_steps: 108000
    )



  | Name              | Type                             | Params
-----------------------------------------------------------------------
0 | spec_augmentation | SpectrogramAugmentation          | 0     
1 | crop_or_pad       | CropOrPadSpectrogramAugmentation | 0     
2 | preprocessor      | AudioToMFCCPreprocessor          | 0     
3 | encoder           | ConvASREncoder                   | 624 K 
4 | decoder           | ConvASRDecoderClassification     | 1.0 K 
5 | loss              | CrossEntropyLoss                 | 0     
6 | _accuracy         | TopKClassificationAccuracy       | 0     
-----------------------------------------------------------------------
625 K     Trainable params
0         Non-trainable params
625 K     Total params
2.501     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

[NeMo I 2024-06-05 23:29:03 preemption:56] Preemption requires torch distributed to be initialized, disabling preemption


    


Validation: 0it [00:00, ?it/s]

Epoch 0, global step 216: 'val_loss' reached 1.19098 (best 1.19098), saving model to '/home/catsunoki/pytorch/ser/nemo_experiments/CustomConv1DModel/2024-06-05_23-28-58/checkpoints/CustomConv1DModel--val_loss=1.1910-epoch=0.ckpt' as top 3


Validation: 0it [00:00, ?it/s]

Epoch 1, global step 432: 'val_loss' reached 1.20284 (best 1.19098), saving model to '/home/catsunoki/pytorch/ser/nemo_experiments/CustomConv1DModel/2024-06-05_23-28-58/checkpoints/CustomConv1DModel--val_loss=1.2028-epoch=1.ckpt' as top 3


Validation: 0it [00:00, ?it/s]

Epoch 2, global step 648: 'val_loss' reached 1.21159 (best 1.19098), saving model to '/home/catsunoki/pytorch/ser/nemo_experiments/CustomConv1DModel/2024-06-05_23-28-58/checkpoints/CustomConv1DModel--val_loss=1.2116-epoch=2.ckpt' as top 3


Validation: 0it [00:00, ?it/s]

Epoch 3, global step 864: 'val_loss' reached 1.19523 (best 1.19098), saving model to '/home/catsunoki/pytorch/ser/nemo_experiments/CustomConv1DModel/2024-06-05_23-28-58/checkpoints/CustomConv1DModel--val_loss=1.1952-epoch=3.ckpt' as top 3


Validation: 0it [00:00, ?it/s]

Epoch 4, global step 1080: 'val_loss' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 5, global step 1296: 'val_loss' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 6, global step 1512: 'val_loss' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 7, global step 1728: 'val_loss' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 8, global step 1944: 'val_loss' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 9, global step 2160: 'val_loss' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 10, global step 2376: 'val_loss' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 11, global step 2592: 'val_loss' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 12, global step 2808: 'val_loss' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 13, global step 3024: 'val_loss' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 14, global step 3240: 'val_loss' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 15, global step 3456: 'val_loss' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 16, global step 3672: 'val_loss' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 17, global step 3888: 'val_loss' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 18, global step 4104: 'val_loss' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 19, global step 4320: 'val_loss' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 20, global step 4536: 'val_loss' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 21, global step 4752: 'val_loss' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 22, global step 4968: 'val_loss' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 23, global step 5184: 'val_loss' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 24, global step 5400: 'val_loss' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 25, global step 5616: 'val_loss' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 26, global step 5832: 'val_loss' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 27, global step 6048: 'val_loss' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 28, global step 6264: 'val_loss' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 29, global step 6480: 'val_loss' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 30, global step 6696: 'val_loss' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 31, global step 6912: 'val_loss' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 32, global step 7128: 'val_loss' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 33, global step 7344: 'val_loss' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 34, global step 7560: 'val_loss' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 35, global step 7776: 'val_loss' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 36, global step 7992: 'val_loss' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 37, global step 8208: 'val_loss' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 38, global step 8424: 'val_loss' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 39, global step 8640: 'val_loss' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 40, global step 8856: 'val_loss' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 41, global step 9072: 'val_loss' was not in top 3


Validation: 0it [00:00, ?it/s]

Epoch 42, global step 9288: 'val_loss' was not in top 3
      rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
    


In [2]:
# Run the test loop on the model object currently in memory; ckpt_path=None is
# passed explicitly, so no checkpoint file is named for this evaluation.
trainer.test(model=restored_model, ckpt_path=None)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

[{'test_loss': 1.5381959676742554, 'test_epoch_top@1': 0.40625}]

In [4]:
import glob

# Collect every checkpoint file in this run's experiment directory (exp_dir is
# the value returned by exp_manager in the training cell). Sorting makes the
# order, and therefore the pick below, deterministic across filesystems.
checkpoint_dir = os.path.join(exp_dir, 'checkpoints')
checkpoint_paths = sorted(glob.glob(os.path.join(checkpoint_dir, "*.ckpt")))

# Keep only the checkpoint(s) whose file name ends in "-last.ckpt".
last_checkpoints = [path for path in checkpoint_paths if path.endswith("-last.ckpt")]
if not last_checkpoints:
    # Fail with an actionable message instead of an opaque IndexError when the
    # run produced no "-last" checkpoint (or the directory is missing/empty).
    raise FileNotFoundError(
        f"No '*-last.ckpt' checkpoint found in {checkpoint_dir!r} "
        f"({len(checkpoint_paths)} .ckpt file(s) present)"
    )

final_checkpoint = last_checkpoints[0]
print(final_checkpoint)

/home/catsunoki/pytorch/ser/nemo_experiments/CustomConv1DModel/2024-06-05_23-28-58/checkpoints/CustomConv1DModel--val_loss=1.5225-epoch=42-last.ckpt
