# AudioLDM2

The first pipeline initialization takes about 2 minutes; later initializations take only about 30s. Generating 60s of audio takes about 5s with 25 inference steps.

In [19]:
from diffusers import AudioLDM2Pipeline, DPMSolverMultistepScheduler
import torch
from IPython.display import Audio

# Checkpoint to load. "cvssp/audioldm2-music" is the alternative:
# audioldm2 performs better on SFX, audioldm2-music performs better on music.
model_id = "cvssp/audioldm2-large"

# Optimization 1: load the weights in float16 half precision, then move the pipeline to the GPU.
pipe = AudioLDM2Pipeline.from_pretrained(model_id, torch_dtype=torch.float16)
pipe = pipe.to("cuda")

# Optimization 2: replace the default scheduler with the multistep DPM-Solver,
# built from the configuration of the scheduler that was loaded with the pipeline.
scheduler_config = pipe.scheduler.config
pipe.scheduler = DPMSolverMultistepScheduler.from_config(scheduler_config)

Loading pipeline components...: 100%|██████████| 11/11 [01:32<00:00,  8.40s/it]


In [23]:
# Text description of the audio to generate.
# NOTE(review): the prompt is empty here — presumably a placeholder to be filled in; confirm.
prompt = ""

# Generation settings.
AUDIO_LENGTH_IN_S = 20   # passed to the pipeline as `audio_length_in_s`
INFERENCE_STEPS = 50     # passed to the pipeline as `num_inference_steps`
SAMPLE_RATE = 16000      # playback rate handed to the notebook audio widget

# Run the pipeline and keep the first generated clip.
generation = pipe(
    prompt,
    audio_length_in_s=AUDIO_LENGTH_IN_S,
    num_inference_steps=INFERENCE_STEPS,
)
audio = generation.audios[0]

# Last expression of the cell: render an inline audio player.
Audio(audio, rate=SAMPLE_RATE)

100%|██████████| 50/50 [00:09<00:00,  5.47it/s]


# MusicLDM

In [3]:
from diffusers import MusicLDMPipeline, DPMSolverMultistepScheduler
import torch

In [4]:
# Playback rate used by the later display / export cells.
SAMPLE_RATE = 16000

# Load the MusicLDM checkpoint in float16 and move the pipeline to the GPU.
repo_id = "ucsd-reach/musicldm"
pipe = MusicLDMPipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
pipe = pipe.to("cuda")

# Use the multistep DPM-Solver scheduler, built from the loaded scheduler's configuration.
scheduler_config = pipe.scheduler.config
pipe.scheduler = DPMSolverMultistepScheduler.from_config(scheduler_config)

# Generate one clip from the text prompt and keep the first result.
prompt = "Techno music with a strong, upbeat tempo and high melodic riffs"
generation = pipe(
    prompt,
    audio_length_in_s=30.0,
    num_inference_steps=25,
)
audio = generation.audios[0]

Loading pipeline components...: 100%|██████████| 7/7 [00:06<00:00,  1.09it/s]
100%|██████████| 25/25 [00:03<00:00,  8.04it/s]


In [6]:
from IPython.display import Audio
# Render an inline player for the MusicLDM clip (last expression of the cell is displayed).
Audio(data=audio, rate=SAMPLE_RATE)

In [None]:
import scipy.io.wavfile
# Save the generated clip to a WAV file in the current working directory.
# NOTE(review): relies on `audio` and `SAMPLE_RATE` from the MusicLDM cells; confirm they
# have not been overwritten by another section before running this cell.
scipy.io.wavfile.write(
    filename="karplus.wav",
    rate=SAMPLE_RATE,
    data=audio,
)

# Suno Bark Model (for generating speech and vocals)

In [4]:
from transformers import AutoProcessor, BarkModel
import torch

# Run on the GPU when one is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Half precision is a GPU optimization; on the CPU fallback keep float32 so the
# model is not forced into fp16 on a device where it brings no benefit and may
# not be supported by every operator.
torch_dtype = torch.float16 if device == "cuda" else torch.float32

processor = AutoProcessor.from_pretrained("suno/bark")
# load in fp16 (on GPU) and move the model to the selected device
model = BarkModel.from_pretrained("suno/bark", torch_dtype=torch_dtype).to(device)

# convert to bettertransformer
model = model.to_bettertransformer()

# enable CPU offload: idle sub-models are kept on the CPU and moved to the GPU
# only while they run. This targets a CUDA device, so it is skipped on the
# CPU-only fallback, where there is nothing to offload from.
if device == "cuda":
    model.enable_cpu_offload()

The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


In [2]:
from IPython.display import Audio

# Text to speak; the bracketed tags are part of the prompt passed to the processor.
text_prompt = "Hello uh ... [clears throat], my dog is cute [laughter]"

# Encode the text with the chosen voice preset and move the inputs to the model's device.
inputs = processor(text_prompt, voice_preset="v2/en_speaker_6")
inputs = inputs.to(device)

# Generate, then bring the result back to host memory as a NumPy array
# with singleton dimensions removed.
generated = model.generate(**inputs)
audio_array = generated.cpu().numpy().squeeze()

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


In [3]:
# Take the playback rate from the model's generation config instead of hard-coding it.
SAMPLE_RATE = model.generation_config.sample_rate

# Last expression of the cell: render an inline player for the generated speech.
Audio(data=audio_array, rate=SAMPLE_RATE)