# Setup

In [1]:
from pathlib import Path
import os
from IPython.display import Audio as DisplayAudio
import librosa
import json
import random
import shutil
import torch

# set CA bundle path for requests to work via Zscaler
os.environ['CURL_CA_BUNDLE'] = str(Path.home() / '.zscaler-cert-app-store/Bundle.pem')
# needed so bitsandbytes can find correct cuda path!
%env LD_LIBRARY_PATH=/usr/local/cuda-12.2/lib64

env: LD_LIBRARY_PATH=/usr/local/cuda-12.2/lib64


# Download Model and Inference

In [2]:
from transformers import AutoConfig, AutoProcessor, MusicgenForConditionalGeneration, BitsAndBytesConfig
from accelerate import Accelerator

def init_model(model_name: str):
   # text and melody input tokenizer
   processor = AutoProcessor.from_pretrained(model_name)
   # actual encoder/decoder models
   model = MusicgenForConditionalGeneration.from_pretrained(model_name) #, quantization_config=quant_config)

   # need to manually set to resolve bug in which these attributes don't exist within model.config
   model.config.decoder_start_token_id = model.generation_config.decoder_start_token_id
   model.config.pad_token_id = model.generation_config.pad_token_id
   model.config.vocab_size = model.config.audio_encoder.codebook_size
   return model, processor
    

# NOTE: 4-bit quantization really messes output up, and 8-bit speeds up loading but significantly reduces inference time since we have to skip enc_to_dec_proj
# seems like 4/8 bit not suited for inference? https://github.com/TimDettmers/bitsandbytes/issues/490 
quant_config = BitsAndBytesConfig(
   # load_in_4bit=True,
   load_in_8bit=True, 
   llm_int8_has_fp16_weight=True,
   # bnb_4bit_quant_type="nf4",
   # bnb_4bit_use_double_quant=True,
   # bnb_4bit_compute_dtype=torch.bfloat16,
   llm_int8_skip_modules=['enc_to_dec_proj'] # skip final layer since weight_norm is not deepcopy-able
)

accelerator = Accelerator()
model, processor = init_model(model_name="facebook/musicgen-small")
model = accelerator.prepare(model)
model_sr = model.config.audio_encoder.sampling_rate
print(f'Model sampling rate: {model_sr}')

  from .autonotebook import tqdm as notebook_tqdm


Model sampling rate: 32000


In [None]:
# audio conditioning doesn't seem to work as well...
audio_condition = True
input_audio, input_sr = librosa.load('../data/raw/Boris/Beautiful Wonder (Long Version).mp3')
display(DisplayAudio(data=input_audio, rate=input_sr))
input_audio_resampled = librosa.resample(input_audio, orig_sr=input_sr, target_sr=model_sr)

input_text = ["80s pop track with bassy drums and synth and dominant piano and BPM 40m key G"]
# input_text = ["epic music with grand piano in background"]
if audio_condition:
    inputs = processor(
        audio=input_audio_resampled[:model_sr*5], # get first 5 seconds of input audio
        sampling_rate=model_sr,
        text=input_text,
        return_tensors="pt",
        padding=True
    )
else:
    inputs = processor(
        text=input_text,
        return_tensors="pt",
        padding=True
    )

for k in inputs:
    inputs[k] = inputs[k].to(accelerator.device)

inputs

{'input_ids': tensor([[ 2775,     7,  2783,  1463,    28,  7981,    63,  5253,     7,    11,
         13353,    11, 12613,  8355,    11,   272,  6218,  1283,    51,   843,
           350,     1]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       device='cuda:0'), 'input_values': tensor([[[-7.1074e-11, -2.7214e-11,  3.0834e-11,  ...,  9.6841e-03,
           8.6236e-03,  9.5752e-03]]], device='cuda:0'), 'padding_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], device='cuda:0', dtype=torch.int32)}

In [None]:
audio_values = model.generate(**inputs, do_sample=True, guidance_scale=3, max_new_tokens=256)

audio = DisplayAudio(data=audio_values[0][0].cpu(), rate=model_sr)
display(audio)

KeyboardInterrupt: 

# Data processing

## Process raw audio files into AudioFolder format

In [None]:
from essentia.standard import MonoLoader, TensorflowPredictEffnetDiscogs, TensorflowPredict2D
import numpy as np
import json
import scipy.io.wavfile as wavf

with open('../data/essentia_labels.json') as f:
    ESSENTIA_LABELS = json.load(f)

def filter_predictions(predictions, class_list, threshold=0.1):
    predictions_mean = np.mean(predictions, axis=0)
    sorted_indices = np.argsort(predictions_mean)[::-1]
    filtered_indices = [i for i in sorted_indices if predictions_mean[i] > threshold]
    filtered_labels = [class_list[i] for i in filtered_indices]
    filtered_values = [predictions_mean[i] for i in filtered_indices]
    return filtered_labels, filtered_values

def make_comma_separated_unique(tags):
    seen_tags = set()
    result = []
    for tag in ', '.join(tags).split(', '):
        if tag not in seen_tags:
            result.append(tag)
            seen_tags.add(tag)
    return ', '.join(result)

def get_audio_features(audio_filename, sr=32000):
    audio = MonoLoader(filename=str(audio_filename), sampleRate=sr, resampleQuality=4)()
    embedding_model = TensorflowPredictEffnetDiscogs(graphFilename="../models/essentia/discogs-effnet-bs64-1.pb", output="PartitionedCall:1")
    embeddings = embedding_model(audio)

    result_dict = {}

    # predict genres
    genre_model = TensorflowPredict2D(graphFilename="../models/essentia/genre_discogs400-discogs-effnet-1.pb", input="serving_default_model_Placeholder", output="PartitionedCall:0")
    predictions = genre_model(embeddings)
    filtered_labels, _ = filter_predictions(predictions, ESSENTIA_LABELS['genres'])
    filtered_labels = ', '.join(filtered_labels).replace("---", ", ").split(', ')
    result_dict['genres'] = make_comma_separated_unique(filtered_labels)

    # predict mood/theme
    mood_model = TensorflowPredict2D(graphFilename="../models/essentia/mtg_jamendo_moodtheme-discogs-effnet-1.pb")
    predictions = mood_model(embeddings)
    filtered_labels, _ = filter_predictions(predictions, ESSENTIA_LABELS['mood_themes'], threshold=0.05)
    result_dict['moods'] = make_comma_separated_unique(filtered_labels)

    # predict instruments
    instrument_model = TensorflowPredict2D(graphFilename="../models/essentia/mtg_jamendo_instrument-discogs-effnet-1.pb")
    predictions = instrument_model(embeddings)
    filtered_labels, _ = filter_predictions(predictions, ESSENTIA_LABELS['instruments'])
    result_dict['instruments'] = filtered_labels

    return result_dict
    
train_size = 0.8
random.seed(0)

output_path = Path('../data/processed/v1')
if os.path.isdir(output_path):
    # clear contents first
    shutil.rmtree(output_path)
output_path.mkdir(exist_ok=True, parents=True)

data_path = Path('../data/raw')

audio_files = glob.glob(str(data_path / '**/*.mp3')) + glob.glob(str(data_path / '**/*.wav'))

train_len, test_len = 0, 0

with open(output_path / 'metadata.jsonl', "w") as metadata_file:
    for file_path in audio_files:
        file_path = Path(file_path)
        signal, sr = librosa.load(file_path)
        file_name = file_path.stem
        artist = file_path.parent.stem
        # split into 30-second chunks
        chunk_samples = sr * 30
        for chunk_idx, sample_idx in enumerate(range(0, len(signal), chunk_samples)):
            chunk = signal[sample_idx:sample_idx+chunk_samples]
            
            tempo, _ = librosa.beat.beat_track(y=chunk, sr=sr)
            tempo = round(tempo) # not usually accurate lol
            chroma = librosa.feature.chroma_stft(y=chunk, sr=sr)
            key = np.argmax(np.sum(chroma, axis=1))
            key = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B'][key]
            length = librosa.get_duration(y=chunk, sr=sr)

            # train/test split
            if random.random() < train_size:
                train_len += 1
                split = 'train'
            else:
                test_len += 1
                split = 'test'
            out_file_name = f"{file_name}_chunk{chunk_idx}.wav"
            split_path = output_path / split
            split_path.mkdir(exist_ok=True, parents=True)
            out_file_path = split_path / out_file_name
            wavf.write(out_file_path, rate=sr, data=chunk)
            essentia_metadata = get_audio_features(out_file_path, sr=sr)

            metadata = {
                "key": key,
                "artist": artist,
                "sample_rate": sr,
                "file_extension": "wav",
                "description": "", # TODO: In the future this can be filled with custom text if desired
                "keywords": "",
                "duration": length,
                "bpm": tempo,
                "genre": essentia_metadata.get('genres', ""),
                "title": "",
                "name": "",
                "instrument": essentia_metadata.get('instruments', ""),
                "moods": essentia_metadata.get('moods', []),
                "file_name": str(Path(split) / out_file_name), # relative path to dataset root
            }
            metadata_file.write(json.dumps(metadata) + '\n')

print(f"Num train samples: {train_len}, num test samples: {test_len}")

2023-12-29 12:19:24.926223: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:923] could not open file to read NUMA node: /sys/bus/pci/devices/0000:09:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-12-29 12:19:24.926280: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:09:00.0 name: NVIDIA GeForce RTX 3090 computeCapability: 8.6
coreClock: 1.8GHz coreCount: 82 deviceMemorySize: 24.00GiB deviceMemoryBandwidth: 871.81GiB/s
2023-12-29 12:19:24.926293: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1766] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2023-12-29 12:19:24.926304: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device i

Num train samples: 4, num test samples: 1


cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:
2023-12-29 12:19:28.786982: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264]      0 
2023-12-29 12:19:28.786986: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0:   N 
2023-12-29 12:19:28.795898: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:923] could not open file to read NUMA node: /sys/bus/pci/devices/0000:09:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-12-29 12:19:28.795935: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:09:00.0 name: NVIDIA GeForce RTX 3090 computeCapability: 8.6
coreClock: 1.8GHz coreCount: 82 deviceMemorySize: 24.00GiB deviceMemoryBandwidth: 871.81GiB/s
2023-12-29 12:19:28.795945: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1766] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. F

## Tokenize processed audio files 

In [12]:
from datasets import load_dataset, Audio

def generate_input_text(batch):
    description = batch["description"]
    key = batch["key"]
    bpm = batch["bpm"]
    genres = batch["genre"]
    moods = batch["moods"]
    instruments = ','.join(batch["instrument"])
    artist = batch["artist"]
    text_start = f"{description} with the " if description else "Uses the"
    return f"{text_start} following genres: {genres}; moods: {moods}; instruments: {instruments}; key: {key}; BPM: {bpm}; and created by artist: {artist}"
    
@torch.inference_mode
def prepare_dataset(batch, model, device):
    # Need first line to audio is resampled properly
    audio = batch["audio"]
    text = generate_input_text(batch)
    batch = processor(
        audio["array"], 
        sampling_rate=audio["sampling_rate"],
        text=text,
        return_tensors="pt",
        padding=True
    )
    audio_tensor = torch.tensor(batch["input_values"]).to(device)
    padding_mask_tensor = torch.tensor(batch["padding_mask"]).to(device)
    batch["labels"] = model.audio_encoder(audio_tensor, padding_mask_tensor).audio_codes[0]
    # need to remove extra batch dimension for all input vectors
    batch["input_ids"] = batch["input_ids"][0]
    batch["input_values"] = batch["input_values"][0]
    batch["attention_mask"] = batch["attention_mask"][0]
    batch["padding_mask"] = batch["padding_mask"][0]
    batch["input_length"] = len(batch["input_values"])
    return batch

output_path = Path('../data/processed/v1')
dataset = load_dataset("audiofolder", data_dir=output_path, drop_labels=True)
print(f'AudioFolder dataset: {dataset}')

# resample on the fly when entry is accessed
dataset = dataset.cast_column("audio", Audio(sampling_rate=model_sr))
dataset = dataset.map(
    prepare_dataset, 
    remove_columns=dataset["train"].column_names,
    fn_kwargs={"model": model, "device": accelerator.device}    
)

AudioFolder dataset: DatasetDict({
    train: Dataset({
        features: ['audio', 'key', 'artist', 'sample_rate', 'file_extension', 'description', 'keywords', 'duration', 'bpm', 'genre', 'instrument', 'moods'],
        num_rows: 4
    })
    test: Dataset({
        features: ['audio', 'key', 'artist', 'sample_rate', 'file_extension', 'description', 'keywords', 'duration', 'bpm', 'genre', 'instrument', 'moods'],
        num_rows: 1
    })
})


  audio_tensor = torch.tensor(batch["input_values"]).to(device)
  padding_mask_tensor = torch.tensor(batch["padding_mask"]).to(device)
Map: 100%|██████████| 4/4 [00:01<00:00,  2.88 examples/s]


# Train

## Initialize model with QLoRA

In [7]:
from transformers import TrainingArguments
from peft import LoftQConfig, LoraConfig, get_peft_model

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable %: {100 * trainable_params / all_param}"
    )

# Use 8bit quantization with QLoRA (base model should not be quantized first with this approach)
loftq_cfg = LoftQConfig(loftq_bits=4)      

lora_cfg = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=32,
    bias="none",
    target_modules=["k_proj", "v_proj", "q_proj", "out_proj"], # only apply LoRA and train on attention weights
    # modules_to_save # lm_heads?
    task_type="CAUSAL_LM",
    # init_lora_weights="loftq",
    # loftq_config=loftq_cfg
)

model, _ = init_model(model_name="facebook/musicgen-small")
model = accelerator.prepare(model)
lora_model = get_peft_model(model, lora_cfg)
print_trainable_parameters(lora_model)



trainable params: 12582912 || all params: 601564738 || trainable %: 2.0916970701829936


## Train model

In [8]:
from transformers import Trainer

train_args = TrainingArguments(
    output_dir="../results",
    num_train_epochs=10,
    per_device_train_batch_size=1,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=1,
    learning_rate=5e-5,
    # weight_decay=0.001,
    # fp16=True,
    # bf16=False,
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    # report_to="tensorboard",
    seed=42,
    # label_names=["input_values"]
)

trainer = Trainer(
    model=lora_model, 
    args=train_args, 
    train_dataset=dataset["train"], 
    eval_dataset=dataset["test"]
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,9.8621,8.787848
2,10.2161,8.73344
3,9.9887,8.668656
4,9.763,8.600203
5,9.0731,8.503061
6,8.5956,8.402586
7,8.2874,8.312382
8,8.72,8.249205
9,7.6189,8.205984
10,7.5319,8.194717


Checkpoint destination directory ../results/checkpoint-4 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ../results/checkpoint-8 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ../results/checkpoint-12 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ../results/checkpoint-16 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ../results/checkpoint-20 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ../results/checkpoint-24 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ../results/checkpoint-28 already exists and is non-empty.Saving will proceed but saved results may be inv

TrainOutput(global_step=40, training_loss=9.05815759897232, metrics={'train_runtime': 18.2513, 'train_samples_per_second': 2.192, 'train_steps_per_second': 2.192, 'total_flos': 545757452160.0, 'train_loss': 9.05815759897232, 'epoch': 10.0})

# Evaluation

In [9]:
# merge LoRA model into base model
merged_model = lora_model.merge_and_unload()

In [11]:
input_text = ["artist Boris"]
# input_text = ["happy grand piano"]

inputs = processor(
    text=input_text,
    return_tensors="pt",
    padding=True
)

for k in inputs:
    inputs[k] = inputs[k].to(accelerator.device)

audio_values = merged_model.generate(**inputs, do_sample=True, guidance_scale=3, max_new_tokens=256)

audio = DisplayAudio(data=audio_values[0][0].cpu(), rate=model_sr)
display(audio)