# Diffusion Model

In [1]:
import torch
from torch.utils.data import DataLoader

import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))


from src.datasets import MusicCapsDataset
from src.features import PreProcessor, DataModule
from src.features.extractor import WaveformExtractor, Extractor
from src.models import DiffusionModel
from src.models.callbacks import MetricsCallback
from src.utils.data import TorchDataset
from src.visualization import ChartsViewer
from src.utils.audio import Audio
from src.utils.gpu import create_device

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
NUM_EPOCHS = 1
BATCH_SIZE = 2
DECODE_STEPS = 100
MODEL_BEST = "best-epoch"
MODEL_RESULT = "last-epoch"
SAMPLING_RATE = 48000

In [3]:
musiccaps_generator = MusicCapsDataset(format="wav", crop_length=5.5)
dataset = musiccaps_generator.generate(num_proc=1)

Using the latest cached version of the dataset since google/MusicCaps couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/pavan/.cache/huggingface/datasets/google___music_caps/default/0.0.0/0a51889b340037bb75a9a0858af2e4ece21f7f89 (last modified on Thu Apr  3 21:48:02 2025).


In [4]:
dataset

Dataset({
    features: ['ytid', 'start_s', 'end_s', 'audioset_positive_labels', 'aspect_list', 'caption', 'author_id', 'is_balanced_subset', 'is_audioset_eval', 'audio', 'online'],
    num_rows: 10838
})

In [5]:
audio_preprocessor = PreProcessor(dataset=dataset,
                                  transformation_function=lambda dataset: WaveformExtractor(dataset, column="audio", crop_length=2**8))

text_preprocessor = PreProcessor(dataset=dataset,
                                 transformation_function=lambda dataset: Extractor(dataset, column="caption", name="Text"))

musiccaps_data = DataModule(preprocessors=[audio_preprocessor, text_preprocessor],
                            path=musiccaps_generator.get_processed_folder(),
                            batch_size=BATCH_SIZE,
                            transformations=[lambda x: x.unsqueeze(0), None])

## Training Model

In [6]:
model = DiffusionModel()
run_folder = model.get_run_folder()

In [7]:
# Callbacks
last_checkpoint_callback = ModelCheckpoint(
    dirpath=run_folder,  
    filename=MODEL_RESULT,  
    save_last=True,  
    save_weights_only=True,
)

best_checkpoint_callback = ModelCheckpoint(
    dirpath=run_folder,  
    filename=MODEL_BEST, 
    monitor='train_loss',  
    mode='min',  
    save_top_k=1, 
    save_weights_only=True,
)

metrics_callback = MetricsCallback()

callbacks = [last_checkpoint_callback,
             best_checkpoint_callback, 
             metrics_callback]

In [8]:
# Configurar treinamento
trainer = pl.Trainer(
    max_epochs=NUM_EPOCHS,
    callbacks=callbacks,
    default_root_dir=run_folder
)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/pavan/Desktop/Harmonix-Diff/try2/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


In [9]:
# Treinar o modelo
trainer.fit(model=model, train_dataloaders=musiccaps_data)
model.eval()

You are using a CUDA device ('NVIDIA GeForce RTX 3060 Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision


Loading train/test indexes...


Generating train subset [Waveform]: 100%|██████████| 8670/8670 [01:07<00:00, 127.93it/s]
Generating train subset [Waveform]: 100%|██████████| 2168/2168 [00:16<00:00, 131.92it/s]


Saving train/test splits...
Loading train/test indexes...


Generating train subset [Text]: 100%|██████████| 8670/8670 [01:02<00:00, 139.67it/s]
Generating train subset [Text]: 100%|██████████| 2168/2168 [00:15<00:00, 135.57it/s]
/home/pavan/Desktop/Harmonix-Diff/try2/.venv/lib/python3.12/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/pavan/Desktop/Harmonix-Diff/try2/notebooks/models/DiffusionModel/run_2025-04-03_22-27-29 exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Saving train/test splits...



  | Name  | Type           | Params | Mode 
-------------------------------------------------
0 | model | DiffusionModel | 374 M  | train
-------------------------------------------------
374 M     Trainable params
0         Non-trainable params
374 M     Total params
1,499.670 Total estimated model params size (MB)
1748      Modules in train mode
223       Modules in eval mode
/home/pavan/Desktop/Harmonix-Diff/try2/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch 0:   0%|          | 0/4335 [00:00<?, ?it/s] 

RuntimeError: Calculated padded input size per channel: (1). Kernel size: (2). Kernel size can't be greater than actual input size

In [None]:
logged_metrics = metrics_callback.metrics
losses_values = []

# Exiba as métricas
for metric_dict in logged_metrics:
    for metric_name, metric_value in metric_dict.items():
        losses_values.append(metric_value.cpu())
        
ChartsViewer.get_line_chart(losses_values, title="Train: Loss", x_label="Epochs", y_label="Loss")

## Loading Model

In [None]:
musiccaps_data.setup()
RUN_PATH = "models/DiffusionModel/run_2023-11-24_22-47-46/last-v1.ckpt"

model = DiffusionModel.load_from_checkpoint(RUN_PATH)

In [None]:
model.eval()
device = create_device()

In [None]:
model = model.model.to(device)

In [None]:
from IPython.display import Audio as ipyAudio

Y_audio_data_loader = musiccaps_data.test_dataloader()[0]
Y_text_data_loader = musiccaps_data.test_dataloader()[1]

with torch.no_grad():
    for (audios, texts) in zip(Y_audio_data_loader, Y_text_data_loader):
        audios = audios.cpu().numpy()
        for i in range(BATCH_SIZE):
            # Original
            audio = audios[i]
            audio.reshape(audio.shape[0], -1)

            # Diffusion with noise
            noise = torch.randn(1, 1, 2**18).to(device)
            
            sample = model.sample(
                noise,
                text=[texts[i]],  # Use the provided text prompt
                embedding_scale=15,
                num_steps=100
            )

            sample = sample[0].cpu().numpy()
            sample.reshape(sample.shape[0], -1)

            print(f"Text: {texts[i]}")
            display(ipyAudio(audio, rate=SAMPLING_RATE))
            display(ipyAudio(sample, rate=SAMPLING_RATE))
            
        break