# Welcome to HW4!

# Setup

First, let's load our model and initialize our global variables: SAMPLE_RATE (the number of audio samples per second, in this case 44100), SAMPLE_SIZE (the *number* of audio samples we generate with the model, approximately 47.55*44100), and SEED (controls randomness, DO NOT CHANGE).

In [None]:
import torch
import torchaudio
from einops import rearrange
from stable_audio_tools import get_pretrained_model
import IPython.display as ipd
from tqdm.auto import trange, tqdm
from stable_audio_tools.inference.generation import generate_diffusion_cond_and_sampler_setup, generate_diffusion_cond_decode
import IPython.display as ipd

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Download model (and its config), then place the model on the chosen device.
model, model_config = get_pretrained_model("stabilityai/stable-audio-open-1.0")
model = model.to(device)

# Globals shared by every cell below; all but SEED are read from the model config.
SAMPLE_RATE, SAMPLE_SIZE = model_config["sample_rate"], model_config["sample_size"]
SEED = 456  # fixed seed for the assignment; do not change

# Q1 Simple Sampler

Here you should implement the to_d and simple_sample functions:

In [None]:

def to_d(x, sigma, denoised):
    """Q1 helper to be implemented by the student (currently a stub).

    The parameter names suggest a noisy sample, its noise level, and the
    model's denoised estimate -- NOTE(review): that reading comes from the
    names only; confirm against the assignment handout.

    Raises:
        NotImplementedError: always, until the TODO below is filled in. This
            replaces a bare ``return`` that silently produced ``None`` and
            only failed later, inside the sampler.
    """
    # TODO: replace the line below with your implementation.
    raise NotImplementedError("to_d is not implemented yet (Q1 TODO).")

@torch.no_grad()
def simple_sample(model, x, sigmas, extra_args=None):
    """Q1 sampler scaffold: iterate over ``sigmas`` and return the final ``x``.

    Args:
        model: the denoiser object (``generate`` passes the one returned by
            ``generate_diffusion_cond_and_sampler_setup``).
        x: starting tensor (``x_T`` at the call site).
        sigmas: noise schedule; the loop runs ``len(sigmas) - 1`` times.
        extra_args: optional dict; ``None`` is treated as ``{}``.
            Presumably forwarded to ``model`` as keyword arguments --
            TODO confirm against the assignment handout.

    Returns:
        ``x`` after the loop.

    Raises:
        NotImplementedError: on the first iteration, until the TODO in the
            loop body is filled in. Previously the loop had no body at all,
            which made the whole cell fail to parse.
    """
    extra_args = {} if extra_args is None else extra_args
    # One entry per batch element, created with x's dtype and device.
    s_in = x.new_ones([x.shape[0]])
    for i in trange(len(sigmas) - 1):
        # TODO: implement one sampling step here (update ``x``), then delete
        # the raise below.
        raise NotImplementedError("simple_sample step is not implemented yet (Q1 TODO).")
    # Drop the local reference and release cached CUDA memory before returning.
    del extra_args
    torch.cuda.empty_cache()
    return x

You can now run your code using the block below. Feel free to play around with the prompt in the conditioning list, the number of steps, and cfg_scale to explore unique outputs. This can also help you test your code: if the output sounds bad, you're probably doing something wrong!

In [None]:
def generate(prompt="128 BPM electronic drum loop", steps=100, cfg_scale=7, return_latents=False):
    """Run text-conditioned generation with ``simple_sample``.

    Args:
        prompt: text prompt placed in the conditioning dict.
        steps: number of sampler steps (more = better quality).
        cfg_scale: classifier-free guidance scale (higher = better text
            relevance / quality but less diversity).
        return_latents: when true, return the sampler output without decoding.

    Returns:
        The output of ``simple_sample`` if ``return_latents`` is true,
        otherwise the result of ``generate_diffusion_cond_decode`` on it.
    """
    # Text and timing conditioning for a single item.
    text_and_timing = {
        "prompt": prompt,
        "seconds_start": 0,
        "seconds_total": 47,
    }

    # SAMPLE_SIZE and SEED are fixed for the assignment; do not change them.
    setup = generate_diffusion_cond_and_sampler_setup(
        model,
        steps=steps,
        cfg_scale=cfg_scale,
        conditioning=[text_and_timing],
        sample_size=SAMPLE_SIZE,
        device=device,
        seed=SEED,
    )
    denoiser, x_T, sigmas, extra_args = setup

    latents = simple_sample(denoiser, x_T, sigmas, extra_args=extra_args)
    if return_latents:
        return latents

    return generate_diffusion_cond_decode(model, latents)



In [None]:
# To test your Q1 implementation, run this cell: it calls generate() with its
# default prompt and settings and shows an inline audio player for the result.

audio = generate()
# Move the result to host memory as a NumPy array before handing it to IPython.
ipd.display(ipd.Audio(audio.cpu().numpy(), rate=SAMPLE_RATE))

# Q2 - Inpainting Mask

In [None]:
# LOAD AND ENCODE REFERENCE AUDIO
def load_and_encode_audio(path, model):
    """Load an audio file and encode it with ``model.pretransform``.

    The waveform is resampled to ``SAMPLE_RATE``, peak-normalized, repeated
    and/or trimmed along dim 1 to exactly ``SAMPLE_SIZE`` samples, given a
    leading batch dimension, moved to ``device``, and encoded.

    Args:
        path: audio file path, passed to ``torchaudio.load``.
        model: object exposing ``pretransform.encode``.

    Returns:
        Whatever ``model.pretransform.encode`` returns for the prepared audio.

    Raises:
        ValueError: if the file contains no samples (there is nothing to
            normalize or repeat).
    """
    audio, sr = torchaudio.load(path)
    if audio.shape[1] == 0:
        raise ValueError(f"Audio file {path!r} contains no samples.")

    # Resample to SAMPLE_RATE.
    audio = torchaudio.transforms.Resample(sr, SAMPLE_RATE)(audio)

    # Peak normalize. Skip all-zero (silent) input: dividing by a zero peak
    # would turn every sample into NaN.
    peak = audio.abs().max()
    if peak > 0:
        audio = audio / peak

    # Pad with repetition if shorter than SAMPLE_SIZE. Repeating just enough
    # whole copies gives the same samples after trimming as repeated doubling,
    # without allocating up to twice what is needed.
    num_samples = audio.shape[1]
    if num_samples < SAMPLE_SIZE:
        repeats = -(-SAMPLE_SIZE // num_samples)  # ceil division
        audio = audio.repeat(1, repeats)

    # Trim to SAMPLE_SIZE, add a batch dimension, and move to the model device.
    audio = audio[:, :SAMPLE_SIZE][None].to(device)

    return model.pretransform.encode(audio)



In [None]:
def generate_inpainting_mask(reference, mask_start_s, mask_end_s):
    """Q2 stub: build the inpainting mask for ``reference``.

    Args:
        reference: encoded reference audio (the callers pass the value
            returned by ``load_and_encode_audio``).
        mask_start_s: start of the masked region; the ``_s`` suffix suggests
            seconds -- TODO confirm against the assignment handout.
        mask_end_s: end of the masked region, same unit as ``mask_start_s``.

    Returns:
        Once implemented: the mask that ``inpaint`` / ``variable_inpaint``
        forward to their samplers.

    Raises:
        NotImplementedError: always, until the TODO below is filled in. This
            replaces ``return mask`` on an undefined name, which failed with
            an unhelpful ``NameError``.
    """
    # TODO: build ``mask`` and ``return mask`` in place of the raise below.
    raise NotImplementedError("generate_inpainting_mask is not implemented yet (Q2 TODO).")

# Q3 - Inpainting

In [None]:
@torch.no_grad()
def simple_sample_inpaint(model, x, sigmas, reference, mask, extra_args=None):
    """Q3 sampler scaffold: like a plain sampler, plus ``reference`` and ``mask``.

    Args:
        model: the denoiser object (``inpaint`` passes the one returned by
            ``generate_diffusion_cond_and_sampler_setup``).
        x: starting tensor (``x_T`` at the call site).
        sigmas: noise schedule; the loop runs ``len(sigmas) - 1`` times.
        reference: encoded reference audio supplied by the caller.
        mask: output of ``generate_inpainting_mask`` for ``reference``.
        extra_args: optional dict; ``None`` is treated as ``{}``.
            Presumably forwarded to ``model`` as keyword arguments --
            TODO confirm against the assignment handout.

    Returns:
        ``x`` after the loop.

    Raises:
        NotImplementedError: on the first iteration, until the TODO in the
            loop body is filled in. Previously the loop had no body at all,
            which made the whole cell fail to parse.
    """
    extra_args = {} if extra_args is None else extra_args
    # One entry per batch element, created with x's dtype and device.
    s_in = x.new_ones([x.shape[0]])
    for i in trange(len(sigmas) - 1):
        # TODO: implement one inpainting sampling step here (update ``x``),
        # then delete the raise below.
        raise NotImplementedError("simple_sample_inpaint step is not implemented yet (Q3 TODO).")
    # Drop the local reference and release cached CUDA memory before returning.
    del extra_args
    torch.cuda.empty_cache()
    return x


In [None]:
def inpaint(prompt="128 BPM house drum loop", steps=100, cfg_scale=7, reference=None, mask_start_s=20, mask_end_s=30, return_latents=False):
    """Run text-conditioned inpainting of ``reference`` with ``simple_sample_inpaint``.

    Args:
        prompt: text prompt placed in the conditioning dict.
        steps: number of sampler steps.
        cfg_scale: classifier-free guidance scale.
        reference: encoded reference audio, e.g. from
            ``load_and_encode_audio``. Required despite the ``None`` default.
        mask_start_s: forwarded to ``generate_inpainting_mask``.
        mask_end_s: forwarded to ``generate_inpainting_mask``.
        return_latents: when true, return the sampler output without decoding.

    Returns:
        The output of ``simple_sample_inpaint`` if ``return_latents`` is true,
        otherwise the result of ``generate_diffusion_cond_decode`` on it.

    Raises:
        ValueError: if ``reference`` is ``None``.
    """
    # Fail early with a clear message instead of an obscure error deep inside
    # the mask builder or sampler, both of which receive ``reference``.
    if reference is None:
        raise ValueError("inpaint() requires `reference`, e.g. from load_and_encode_audio().")

    # Set up text and timing conditioning
    conditioning = [{
        "prompt": prompt,
        "seconds_start": 0,
        "seconds_total": 47
    }]
    # Set up inpainting mask
    mask = generate_inpainting_mask(reference, mask_start_s, mask_end_s)

    # Generate diffusion setup params
    denoiser, x_T, sigmas, extra_args = generate_diffusion_cond_and_sampler_setup(
        model,
        steps=steps,
        cfg_scale=cfg_scale,
        conditioning=conditioning,
        sample_size=SAMPLE_SIZE,
        device=device,
        seed=SEED
    )

    # Sample
    inp_samples = simple_sample_inpaint(denoiser, x_T, sigmas, reference, mask, extra_args=extra_args)

    if return_latents:
        return inp_samples

    # Decode the sampled result into audio
    inpainted_audio = generate_diffusion_cond_decode(
        model,
        inp_samples
    )
    return inpainted_audio



In [None]:

# To test your Q2/Q3 implementations, run this cell.
# Load the reference audio from disk and encode it with the model's pretransform.
reference = load_and_encode_audio("references/0.wav", model)

# Inpaint with the default prompt and mask window, then show an inline player.
inpainted_audio = inpaint(reference=reference)
ipd.display(ipd.Audio(inpainted_audio.cpu().numpy(), rate=SAMPLE_RATE))

# Q4 Painting with Starting and Stopping Times

In [None]:
@torch.no_grad()
def simple_sample_variable_inpaint(model, x, sigmas, reference, mask, extra_args=None, paint_start=None, paint_end=None):
    """Q4 sampler scaffold: inpainting with ``paint_start`` / ``paint_end`` bounds.

    Args:
        model: the denoiser object (``variable_inpaint`` passes the one
            returned by ``generate_diffusion_cond_and_sampler_setup``).
        x: starting tensor (``x_T`` at the call site).
        sigmas: noise schedule; the loop runs ``len(sigmas) - 1`` times.
        reference: encoded reference audio supplied by the caller.
        mask: output of ``generate_inpainting_mask`` for ``reference``.
        extra_args: optional dict; ``None`` is treated as ``{}``.
        paint_start: defaults to ``0`` when ``None``.
        paint_end: defaults to ``len(sigmas) - 1`` when ``None``.
            NOTE(review): the defaults span the whole loop, so these look
            like step indices bounding when inpainting is applied -- confirm
            against the assignment handout.

    Returns:
        ``x`` after the loop.

    Raises:
        NotImplementedError: on the first iteration, until the TODO in the
            loop body is filled in. Previously the loop had no body at all,
            which made the whole cell fail to parse.
    """
    if paint_start is None:
        paint_start = 0
    if paint_end is None:
        paint_end = len(sigmas) - 1
    extra_args = {} if extra_args is None else extra_args
    # One entry per batch element, created with x's dtype and device.
    s_in = x.new_ones([x.shape[0]])
    for i in trange(len(sigmas) - 1):
        # TODO: implement one sampling step here (update ``x``), using
        # ``paint_start`` / ``paint_end``, then delete the raise below.
        raise NotImplementedError("simple_sample_variable_inpaint step is not implemented yet (Q4 TODO).")
    # Drop the local reference and release cached CUDA memory before returning.
    del extra_args
    torch.cuda.empty_cache()
    return x


In [None]:
def variable_inpaint(prompt="128 BPM house drum loop", steps=100, cfg_scale=7, reference=None, mask_start_s=20, mask_end_s=30, paint_start=None, paint_end=None, return_latents=False):
    """Run inpainting of ``reference`` with ``simple_sample_variable_inpaint``.

    Args:
        prompt: text prompt placed in the conditioning dict.
        steps: number of sampler steps.
        cfg_scale: classifier-free guidance scale.
        reference: encoded reference audio, e.g. from
            ``load_and_encode_audio``. Required despite the ``None`` default.
        mask_start_s: forwarded to ``generate_inpainting_mask``.
        mask_end_s: forwarded to ``generate_inpainting_mask``.
        paint_start: forwarded to the sampler (``None`` lets it pick its default).
        paint_end: forwarded to the sampler (``None`` lets it pick its default).
        return_latents: when true, return the sampler output without decoding.

    Returns:
        The output of ``simple_sample_variable_inpaint`` if ``return_latents``
        is true, otherwise the result of ``generate_diffusion_cond_decode``
        on it.

    Raises:
        ValueError: if ``reference`` is ``None``.
    """
    # Fail early with a clear message instead of an obscure error deep inside
    # the mask builder or sampler, both of which receive ``reference``.
    if reference is None:
        raise ValueError("variable_inpaint() requires `reference`, e.g. from load_and_encode_audio().")

    # Set up text and timing conditioning
    conditioning = [{
        "prompt": prompt,
        "seconds_start": 0,
        "seconds_total": 47
    }]
    # Set up inpainting mask
    mask = generate_inpainting_mask(reference, mask_start_s, mask_end_s)

    # Generate diffusion setup params
    denoiser, x_T, sigmas, extra_args = generate_diffusion_cond_and_sampler_setup(
        model,
        steps=steps,
        cfg_scale=cfg_scale,
        conditioning=conditioning,
        sample_size=SAMPLE_SIZE,
        device=device,
        seed=SEED
    )

    # Sample
    inp_samples = simple_sample_variable_inpaint(denoiser, x_T, sigmas, reference, mask, extra_args=extra_args, paint_start=paint_start, paint_end=paint_end)

    if return_latents:
        return inp_samples

    # Decode the sampled result into audio
    inpainted_audio = generate_diffusion_cond_decode(
        model,
        inp_samples
    )
    return inpainted_audio



In [None]:
# To test your Q4 implementation, run this cell.
# Load the reference audio from disk and encode it with the model's pretransform.
reference = load_and_encode_audio("references/0.wav", model)
# paint_start / paint_end are left at None here, so the sampler's defaults apply.
inpainted_audio = variable_inpaint(reference=reference)

ipd.display(ipd.Audio(inpainted_audio.cpu().numpy(), rate=SAMPLE_RATE))

# Q5 Style Transfer

In [None]:
@torch.no_grad()  # matches the other samplers in this notebook; sampling needs no gradients
def simple_sample_style_transfer(model, sigmas, reference, extra_args=None, transfer_strength=0):
    """Q5 stub: style-transfer sampler to be implemented by the student.

    Args:
        model: the denoiser object (``style_transfer`` passes the one returned
            by ``generate_diffusion_cond_and_sampler_setup``).
        sigmas: noise schedule from the same setup call.
        reference: encoded reference audio supplied by the caller.
        extra_args: optional dict of extra arguments from the setup call.
        transfer_strength: strength setting, ``0`` by default; its exact
            meaning and range are defined by the assignment, not by this
            file -- TODO confirm against the handout.

    Returns:
        Once implemented: the sampled result that ``style_transfer`` either
        returns directly or decodes.

    Raises:
        NotImplementedError: always, until the TODO below is filled in. This
            replaces a bare ``pass`` that silently returned ``None`` and only
            failed later, during decoding.
    """
    # TODO: replace the line below with your implementation.
    raise NotImplementedError("simple_sample_style_transfer is not implemented yet (Q5 TODO).")


In [None]:
def style_transfer(prompt="128 BPM house drum loop", steps=100, cfg_scale=7, reference=None, transfer_strength=0, return_latents=False):
    """Run text-conditioned style transfer with ``simple_sample_style_transfer``.

    Args:
        prompt: text prompt placed in the conditioning dict.
        steps: number of sampler steps.
        cfg_scale: classifier-free guidance scale.
        reference: encoded reference audio, forwarded to the sampler.
        transfer_strength: forwarded to the sampler unchanged.
        return_latents: when true, return the sampler output without decoding.

    Returns:
        The output of ``simple_sample_style_transfer`` if ``return_latents``
        is true, otherwise the result of ``generate_diffusion_cond_decode``
        on it.
    """
    # Text and timing conditioning for a single item.
    text_and_timing = {
        "prompt": prompt,
        "seconds_start": 0,
        "seconds_total": 47,
    }

    setup = generate_diffusion_cond_and_sampler_setup(
        model,
        steps=steps,
        cfg_scale=cfg_scale,
        conditioning=[text_and_timing],
        sample_size=SAMPLE_SIZE,
        device=device,
        seed=SEED,
    )
    # The initial noise returned by the setup call is unpacked but not passed on.
    denoiser, x_T, sigmas, extra_args = setup

    latents = simple_sample_style_transfer(
        denoiser,
        sigmas,
        reference,
        extra_args=extra_args,
        transfer_strength=transfer_strength,
    )
    if return_latents:
        return latents

    return generate_diffusion_cond_decode(model, latents)
    

In [None]:
# To test your Q5 implementation, run this cell.
# Load the reference audio from disk and encode it with the model's pretransform.
reference = load_and_encode_audio("references/0.wav", model)
# transfer_strength is left at its default of 0 here.
st_audio = style_transfer(reference=reference)

ipd.display(ipd.Audio(st_audio.cpu().numpy(), rate=SAMPLE_RATE))