In [1]:
from transformers import MusicgenForConditionalGeneration

model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")

  self.register_buffer("padding_total", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)
Config of the text_encoder: <class 'transformers.models.t5.modeling_t5.T5EncoderModel'> is overwritten by shared text_encoder config: T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "leng

We can then place the model on our accelerator device (if available), or leave it on the CPU otherwise:

In [2]:
import torch

device = "cuda:0" if torch.cuda.is_available() else "cpu"
model.to(device);

In [3]:
print(device)

cuda:0


## Generation

MusicGen is compatible with two generation modes: greedy and sampling. In practice, sampling leads to significantly
better results than greedy, thus we encourage sampling mode to be used where possible. Sampling is enabled by default,
and can be explicitly specified by setting `do_sample=True` in the call to `MusicgenForConditionalGeneration.generate` (see below).

### Unconditional Generation

The inputs for unconditional (or 'null') generation can be obtained through the method `MusicgenForConditionalGeneration.get_unconditional_inputs`. We can then run auto-regressive generation using the `.generate` method, specifying `do_sample=True` to enable sampling mode:

In [5]:
from transformers import AutoProcessor
from IPython.display import Audio

processor = AutoProcessor.from_pretrained("facebook/musicgen-small")

inputs = processor(
    text=["This is an instrumental piece consisting of an electric guitar solo which is played with the tapping technique. There is a usage of complex chords and syncopations, essential characteristics of the progressive rock genre. The guitar has a clear tone."],
    padding=True,
    return_tensors="pt",
)

audio_values = model.generate(**inputs.to(device), do_sample=True, guidance_scale=3, max_new_tokens=768)
model.config.audio_encoder.sampling_rate = 48000
sampling_rate = model.config.audio_encoder.sampling_rate

Audio(audio_values[0].cpu().numpy(), rate=sampling_rate)

In [4]:
model.config.audio_encoder.sampling_rate

48000

In [5]:
audio_values.size()

torch.Size([1, 1, 489600])

Or save them as a `.wav` file using a third-party library, e.g. `scipy` (note here that we also need to remove the channel dimension from our audio tensor):

In [7]:
from transformers import T5Tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-base")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [8]:
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
# Load the training data
train_data = torch.load('data/train_data.pt')
test_data = torch.load('data/test_data.pt')

train_embeddings = torch.tensor(np.array(train_data["embeddings"])).to(device)  # Move to GPU
train_labels = [str(label) for label in train_data["labels"]]
train_filenames = [str(filename) for filename in train_data["filenames"]]

test_embeddings = torch.tensor(np.array(test_data["embeddings"])).to(device)  # Move to GPU
test_labels = [str(label) for label in test_data["labels"]]
test_filenames = [str(filename) for filename in test_data["filenames"]]

# Ensure all labels are strings
for label in train_labels:
    if label is None or not isinstance(label, str):
        print("Label has an error or is not a string")

# Tokenize the labels (convert them into token IDs) just once
train_tokenized_labels = tokenizer(train_labels, padding=True, truncation=True, return_tensors="pt").input_ids.to(device)  # Move to GPU

test_tokenized_labels = tokenizer(test_labels, padding=True, truncation=True, return_tensors="pt").input_ids.to(device)  # Move to GPU


# Create a DataLoader for your train data
train_dataset = TensorDataset(train_embeddings, train_tokenized_labels)
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=False)

test_dataset = TensorDataset(test_embeddings, test_tokenized_labels)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

  train_data = torch.load('data/train_data.pt')
  test_data = torch.load('data/test_data.pt')
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [10]:
print(train_filenames[0])

MrMXYO2fzJ4


In [11]:
import torchaudio

def load_audio(file_path):
    audio_waveform, sample_rate = torchaudio.load(file_path)
    if audio_waveform.shape[0] > 1:
        audio_waveform = audio_waveform.mean(dim=0, keepdim=True)

    # Ensure the audio is exactly 10 seconds (320,000 samples)
    expected_length = 489600

    # If the audio is shorter, pad it with zeros
    if audio_waveform.shape[1] < expected_length:
        padding = expected_length - audio_waveform.shape[1]
        audio_waveform = torch.nn.functional.pad(audio_waveform, (0, padding))

    # If the audio is longer, truncate it
    elif audio_waveform.shape[1] > expected_length:
        audio_waveform = audio_waveform[:, :expected_length]

    # Reshape the tensor to [1, 1, 489600]
    audio_tensor = audio_waveform.unsqueeze(0)

    return audio_tensor

In [31]:
import torchaudio.transforms as transforms
import torch.nn.functional as F

In [12]:
from transformers import AutoProcessor, AutoModel
clap_processor = AutoProcessor.from_pretrained("laion/larger_clap_music")
clap_model = AutoModel.from_pretrained("laion/larger_clap_music").to(device)  # Send model to GPU

In [51]:
def get_clap_similarity(generated_audio, true_audio, sample_rate):
    
    inputs = clap_processor(audios=[generated_audio.flatten(), true_audio.flatten()], return_tensors="pt", sampling_rate=sample_rate).to(device)

    # Get audio features (embeddings)
    with torch.no_grad():  # Disable gradient calculation for inference
        audio_embed = clap_model.get_audio_features(**inputs)

    return F.cosine_similarity(audio_embed[0].unsqueeze(0), audio_embed[1].unsqueeze(0)).item()


In [52]:
from scipy.spatial.distance import cosine

def get_mfcc_score(generated_audio, true_audio):
    mfcc1 = librosa.feature.mfcc(y=generated_audio.flatten(), sr=48000, n_mfcc=13)
    mfcc2 = librosa.feature.mfcc(y=true_audio.flatten(), sr=48000, n_mfcc=13)
    return cosine(mfcc1.flatten(), mfcc2.flatten())

In [53]:
import librosa
audio, sr = librosa.load(f"data/wav-48/{train_filenames[0]}.wav", sr=48000)  # Load with 48 kHz sampling rate

In [54]:
audio.dtype

dtype('float32')

In [55]:


clap_scores = []
mel_scores = []
for i in range(10):
    text = tokenizer.decode(train_dataset[i][1], skip_special_tokens=True)
    inputs = processor(
        text=[text],
        padding=True,
        return_tensors="pt",
    )

    generated_audio = model.generate(**inputs.to(device), do_sample=True, guidance_scale=3, max_new_tokens=768).cpu().numpy()
    true_audio = load_audio(f"data/wav-48/{train_filenames[i]}.wav").numpy()
    Audio(generated_audio[0], rate=sampling_rate)
    Audio(true_audio[0], rate=sampling_rate)
    
    
#     mel_transform = transforms.MelSpectrogram(sample_rate=48000, n_mels=128)
#     mel_spec_generated = mel_transform(generated_audio[0][0])
#     mel_spec_true = mel_transform(true_audio[0][0])

    # Compute similarity using Mean Squared Error for Mel spectrograms
#     mse_similarity = F.mse_loss(mel_spec_generated, mel_spec_true).item()
    clap_similarity = get_clap_similarity(generated_audio, true_audio, 48000)
    print(clap_similarity)
    clap_scores.append(clap_similarity)
    
    mfcc_score = get_mfcc_score(generated_audio, true_audio)
    print(mfcc_score)
    mel_scores.append(mfcc_score)

0.941670835018158
0.37065500020980835
0.9209204912185669
0.30378401279449463
0.9927923679351807
0.03705334663391113
0.9079446792602539
0.08664661645889282
0.9368668794631958
0.19689327478408813
0.9699243307113647
0.15688931941986084
0.9876054525375366
0.008785784244537354
0.98533695936203
0.14653843641281128
0.9690393209457397
0.2798500061035156
0.976833701133728
0.22721755504608154


In [57]:
np.mean(clap_scores)

0.9588935017585755

In [56]:
np.mean(mel_scores)

0.18143133521080018

In [45]:
generated_audio.shape

(1, 1, 325760)

(13, 637)
(13, 957)


In [42]:
cosine(mfcc1.flatten(), mfcc2.flatten())

ValueError: operands could not be broadcast together with shapes (8281,) (12441,) 

In [None]:
compare the 

In [7]:
import scipy


scipy.io.wavfile.write("musicgen_out.wav", rate=sampling_rate, data=audio_values[0, 0].cpu().numpy())

The argument `max_new_tokens` specifies the number of new tokens to generate. As a rule of thumb, you can work out the length of the generated audio sample in seconds by using the frame rate of the EnCodec model:

In [8]:
audio_length_in_s = 256 / model.config.audio_encoder.frame_rate

audio_length_in_s

5.12