# Welcome to HW4!

# Setup

In [None]:
import torch
import torchaudio
from einops import rearrange
from stable_audio_tools import get_pretrained_model
import IPython.display as ipd
from tqdm.auto import trange, tqdm
from stable_audio_tools.inference.generation import generate_diffusion_cond_and_sampler_setup, generate_diffusion_cond_decode
import IPython.display as ipd
from homework4_stub import simple_sample, generate_inpainting_mask, simple_sample_inpaint, simple_sample_variable_inpaint, simple_sample_style_transfer

# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Download the pretrained model together with its config, then move it to `device`.
model, model_config = get_pretrained_model("stabilityai/stable-audio-open-1.0")
model = model.to(device)

# Fixed seed reused by every sampler setup, plus the values read from the model config.
SEED = 456
SAMPLE_SIZE = model_config["sample_size"]
SAMPLE_RATE = model_config["sample_rate"]

In [None]:
# Collects the results of each question under its label ('Q1' ... 'Q5');
# the whole dict is pickled in the final cell.
answers = {}

# Q1 Simple Sampler

Here you should implement the `to_d` and `simple_sample` functions in the `homework4_stub` module; the cell below calls `simple_sample` to generate samples:

In [None]:
def generate(prompt="128 BPM electronic drum loop", steps=100, cfg_scale=7, return_latents=False):
    """Run text-conditioned sampling with ``simple_sample``.

    Args:
        prompt: Text prompt placed in the conditioning entry.
        steps: Number of sampler steps forwarded to the sampler setup.
        cfg_scale: Classifier-free guidance scale forwarded to the sampler setup.
        return_latents: When true, return the output of ``simple_sample``
            directly instead of decoding it.

    Returns:
        The ``simple_sample`` output when ``return_latents`` is true, otherwise
        the result of ``generate_diffusion_cond_decode`` applied to it.
    """
    # Text and timing conditioning: a single entry with a fixed 0-47 window.
    text_and_timing = {
        "prompt": prompt,
        "seconds_start": 0,
        "seconds_total": 47,
    }

    # Build the denoiser, the starting tensor and the sigma schedule.
    # SAMPLE_SIZE and SEED are fixed for the assignment -- DON'T CHANGE.
    sampler_setup = generate_diffusion_cond_and_sampler_setup(
        model,
        steps=steps,
        cfg_scale=cfg_scale,
        conditioning=[text_and_timing],
        sample_size=SAMPLE_SIZE,
        device=device,
        seed=SEED,
    )
    denoiser, x_T, sigmas, extra_args = sampler_setup

    # Sample
    latents = simple_sample(denoiser, x_T, sigmas, extra_args=extra_args)

    # Decode only when the caller did not ask for the raw sampler output.
    return latents if return_latents else generate_diffusion_cond_decode(model, latents)



In [None]:
# Prompts for Q1; one sample is generated per prompt.
prompt_list = [
    "128 BPM electronic drum loop",
    "176 bpm drum-n-bass break",
    "water",
    "a sports car passing by",
    "vibrant synth arpeggio, C minor",
    "glitchy sub bass patch, I made it in Serum",
]

# Keep the undecoded sampler output for each prompt as a NumPy array.
outputs = [
    generate(text, return_latents=True).cpu().numpy()
    for text in prompt_list
]

answers["Q1"] = outputs

# Q2 - Inpainting Mask

In [None]:
# LOAD AND ENCODE REFERENCE AUDIO
def load_and_encode_audio(path, model):
    """Load an audio file and encode it with the model's pretransform.

    The waveform is resampled to ``SAMPLE_RATE``, peak-normalized, looped or
    trimmed to exactly ``SAMPLE_SIZE`` samples, given a leading batch axis,
    moved to ``device`` and passed to ``model.pretransform.encode``.

    Args:
        path: Audio file location, as accepted by ``torchaudio.load``.
        model: Object exposing ``pretransform.encode``.
            NOTE(review): the channel count of the file is passed through
            unchanged -- confirm it matches what the model expects.

    Returns:
        Whatever ``model.pretransform.encode`` returns for the prepared batch.

    Raises:
        ValueError: If the file contains no samples.
    """
    audio, sr = torchaudio.load(path)
    if audio.shape[1] == 0:
        # Fail with a clear message instead of an opaque reduction error below.
        raise ValueError(f"{path!r} contains no audio samples")

    # Resample to SAMPLE_RATE.
    audio = torchaudio.transforms.Resample(sr, SAMPLE_RATE)(audio)

    # Peak normalize. A silent clip has peak 0; leave it untouched rather than
    # dividing by zero and filling the waveform with NaNs.
    peak = audio.abs().max()
    if peak > 0:
        audio = audio / peak

    # Pad with repetition if shorter than SAMPLE_SIZE. Tiling by the exact
    # (ceiling) repeat count yields the same samples as repeated doubling once
    # trimmed, without overshooting the required length by up to 2x.
    length = audio.shape[1]
    if length < SAMPLE_SIZE:
        repeats = -(-SAMPLE_SIZE // length)  # ceiling division
        audio = audio.repeat(1, repeats)

    # Trim to SAMPLE_SIZE, add the batch axis and move to the model's device.
    audio = audio[:, :SAMPLE_SIZE][None].to(device)

    # Inference only: skip autograd bookkeeping for the encoder pass.
    with torch.no_grad():
        return model.pretransform.encode(audio)



In [None]:
# (start, end) bounds handed to generate_inpainting_mask; one mask per pair,
# all built against the first Q1 output.
mask_spans = [(0, 5), (0, 10), (0, 20), (20, 30), (30, 40), (20, 47)]

mask_list = [
    generate_inpainting_mask(outputs[0], span_start, span_end).cpu().numpy()
    for span_start, span_end in mask_spans
]

answers["Q2"] = mask_list

# Q3 - Inpainting

In [None]:
def inpaint(prompt="128 BPM house drum loop", steps=100, cfg_scale=7, reference=None, mask_start_s=20, mask_end_s=30, return_latents=False):
    """Run text-conditioned inpainting of ``reference`` with ``simple_sample_inpaint``.

    Args:
        prompt: Text prompt placed in the conditioning entry.
        steps: Number of sampler steps forwarded to the sampler setup.
        cfg_scale: Classifier-free guidance scale forwarded to the sampler setup.
        reference: Reference passed to ``generate_inpainting_mask`` and to the
            sampler. Required; the ``None`` default only keeps the
            keyword-style signature.
        mask_start_s: Mask start forwarded to ``generate_inpainting_mask``
            (presumably seconds, per the ``_s`` suffix -- confirm in the stub).
        mask_end_s: Mask end forwarded to ``generate_inpainting_mask``.
        return_latents: When true, return the sampler output directly instead
            of decoding it.

    Returns:
        The ``simple_sample_inpaint`` output when ``return_latents`` is true,
        otherwise the result of ``generate_diffusion_cond_decode`` on it.

    Raises:
        ValueError: If ``reference`` is None.
    """
    # Fail fast: without a reference the mask and sampler helpers would fail
    # later with a far less helpful error.
    if reference is None:
        raise ValueError("inpaint() requires a `reference`; got None")

    # Set up text and timing conditioning
    conditioning = [{
        "prompt": prompt,
        "seconds_start": 0,
        "seconds_total": 47,
    }]

    # Set up inpainting mask
    mask = generate_inpainting_mask(reference, mask_start_s, mask_end_s)

    # Generate diffusion setup params
    denoiser, x_T, sigmas, extra_args = generate_diffusion_cond_and_sampler_setup(
        model,
        steps=steps,
        cfg_scale=cfg_scale,
        conditioning=conditioning,
        sample_size=SAMPLE_SIZE,
        device=device,
        seed=SEED,
    )

    # Sample
    inp_samples = simple_sample_inpaint(denoiser, x_T, sigmas, reference, mask, extra_args=extra_args)

    if return_latents:
        return inp_samples

    # Decode the sampler output
    return generate_diffusion_cond_decode(model, inp_samples)



In [None]:
# Per-example inputs for Q3: the i-th mask range, prompt and reference file go together.
mask_ranges = [(0, 5), (0, 10), (0, 20), (20, 30), (15, 30), (20, 47)]
prompt_list = [
    "128 BPM electronic drum loop",
    "176 bpm drum-n-bass break",
    "water",
    "a sports car passing by",
    "vibrant synth arpeggio, C minor",
    "glitchy sub bass patch, I made it in Serum",
]
reference_list = [
    "references/0.wav",
    "references/1.wav",
    "references/2.wav",
    "references/3.wav",
    "references/4.wav",
    "references/5.wav",
]

outputs = []

# Walk the three lists in lockstep instead of indexing each one by position.
for reference_path, text, (span_start, span_end) in zip(reference_list, prompt_list, mask_ranges):
    reference = load_and_encode_audio(reference_path, model)
    latents = inpaint(
        prompt=text,
        reference=reference,
        mask_start_s=span_start,
        mask_end_s=span_end,
        return_latents=True,
    )
    outputs.append(latents.cpu().numpy())

answers["Q3"] = outputs

# Q4 Painting with Starting and Stopping Times

In [None]:
def variable_inpaint(prompt="128 BPM house drum loop", steps=100, cfg_scale=7, reference=None, mask_start_s=20, mask_end_s=30, paint_start=None, paint_end=None, return_latents=False):
    """Run inpainting of ``reference`` with ``simple_sample_variable_inpaint``.

    Args:
        prompt: Text prompt placed in the conditioning entry.
        steps: Number of sampler steps forwarded to the sampler setup.
        cfg_scale: Classifier-free guidance scale forwarded to the sampler setup.
        reference: Reference passed to ``generate_inpainting_mask`` and to the
            sampler. Required; the ``None`` default only keeps the
            keyword-style signature.
        mask_start_s: Mask start forwarded to ``generate_inpainting_mask``
            (presumably seconds, per the ``_s`` suffix -- confirm in the stub).
        mask_end_s: Mask end forwarded to ``generate_inpainting_mask``.
        paint_start: Forwarded unchanged to the sampler as ``paint_start``.
            NOTE(review): its unit (e.g. sampler step index) is defined by the
            stub -- confirm there.
        paint_end: Forwarded unchanged to the sampler as ``paint_end``.
        return_latents: When true, return the sampler output directly instead
            of decoding it.

    Returns:
        The ``simple_sample_variable_inpaint`` output when ``return_latents``
        is true, otherwise the result of ``generate_diffusion_cond_decode`` on it.

    Raises:
        ValueError: If ``reference`` is None.
    """
    # Fail fast: without a reference the mask and sampler helpers would fail
    # later with a far less helpful error.
    if reference is None:
        raise ValueError("variable_inpaint() requires a `reference`; got None")

    # Set up text and timing conditioning
    conditioning = [{
        "prompt": prompt,
        "seconds_start": 0,
        "seconds_total": 47,
    }]

    # Set up inpainting mask
    mask = generate_inpainting_mask(reference, mask_start_s, mask_end_s)

    # Generate diffusion setup params
    denoiser, x_T, sigmas, extra_args = generate_diffusion_cond_and_sampler_setup(
        model,
        steps=steps,
        cfg_scale=cfg_scale,
        conditioning=conditioning,
        sample_size=SAMPLE_SIZE,
        device=device,
        seed=SEED,
    )

    # Sample
    inp_samples = simple_sample_variable_inpaint(
        denoiser,
        x_T,
        sigmas,
        reference,
        mask,
        extra_args=extra_args,
        paint_start=paint_start,
        paint_end=paint_end,
    )

    if return_latents:
        return inp_samples

    # Decode the sampler output
    return generate_diffusion_cond_decode(model, inp_samples)
# inpainted_audio = variable_inpaint(reference=reference)



In [None]:
# Per-example inputs for Q4: the i-th mask range, paint range, prompt and
# reference file go together.
mask_ranges = [(0, 5), (0, 10), (0, 20), (20, 30), (15, 30), (20, 47)]
paint_ranges = [(0, 50), (0, 40), (0, 30), (0, 70), (20, 70), (30, 100)]
prompt_list = [
    "128 BPM electronic drum loop",
    "176 bpm drum-n-bass break",
    "water",
    "a sports car passing by",
    "vibrant synth arpeggio, C minor",
    "glitchy sub bass patch, I made it in Serum",
]
reference_list = [
    "references/0.wav",
    "references/1.wav",
    "references/2.wav",
    "references/3.wav",
    "references/4.wav",
    "references/5.wav",
]

outputs = []

# Walk the four lists in lockstep instead of indexing each one by position.
examples = zip(reference_list, prompt_list, mask_ranges, paint_ranges)
for reference_path, text, (span_start, span_end), (first_paint, last_paint) in examples:
    reference = load_and_encode_audio(reference_path, model)
    latents = variable_inpaint(
        prompt=text,
        reference=reference,
        mask_start_s=span_start,
        mask_end_s=span_end,
        paint_start=first_paint,
        paint_end=last_paint,
        return_latents=True,
    )
    outputs.append(latents.cpu().numpy())

answers["Q4"] = outputs

# Q5 Style Transfer

In [None]:
def style_transfer(prompt="128 BPM house drum loop", steps=100, cfg_scale=7, reference=None, transfer_strength=0, return_latents=False):
    """Run text-conditioned style transfer with ``simple_sample_style_transfer``.

    Args:
        prompt: Text prompt placed in the conditioning entry.
        steps: Number of sampler steps forwarded to the sampler setup.
        cfg_scale: Classifier-free guidance scale forwarded to the sampler setup.
        reference: Reference passed to the sampler. Required; the ``None``
            default only keeps the keyword-style signature.
        transfer_strength: Forwarded unchanged to the sampler.
            NOTE(review): its valid range and meaning are defined by the stub
            -- confirm there.
        return_latents: When true, return the sampler output directly instead
            of decoding it.

    Returns:
        The ``simple_sample_style_transfer`` output when ``return_latents`` is
        true, otherwise the result of ``generate_diffusion_cond_decode`` on it.

    Raises:
        ValueError: If ``reference`` is None.
    """
    # Fail fast: the sampler is given no starting tensor besides the reference,
    # so a missing reference would only fail later with a less helpful error.
    if reference is None:
        raise ValueError("style_transfer() requires a `reference`; got None")

    # Set up text and timing conditioning
    conditioning = [{
        "prompt": prompt,
        "seconds_start": 0,
        "seconds_total": 47,
    }]

    # Generate diffusion setup params. The second element (the starting tensor
    # used by the other samplers) is not passed to the style-transfer sampler,
    # so it is discarded here.
    denoiser, _, sigmas, extra_args = generate_diffusion_cond_and_sampler_setup(
        model,
        steps=steps,
        cfg_scale=cfg_scale,
        conditioning=conditioning,
        sample_size=SAMPLE_SIZE,
        device=device,
        seed=SEED,
    )

    # Sample
    samples = simple_sample_style_transfer(
        denoiser,
        sigmas,
        reference,
        extra_args=extra_args,
        transfer_strength=transfer_strength,
    )

    if return_latents:
        return samples

    # Decode the sampler output
    return generate_diffusion_cond_decode(model, samples)
    

In [None]:
# Per-example inputs for Q5: the i-th strength, prompt and reference file go together.
strengths = [0.1, 0.3, 0.4, 0.6, 0.45, 0.9]
prompt_list = [
    "glitchy sub bass patch, I made it in Serum",
    "128 BPM electronic drum loop",
    "176 bpm drum-n-bass break",
    "water",
    "a sports car passing by",
    "vibrant synth arpeggio, C minor",
]
reference_list = [
    "references/0.wav",
    "references/1.wav",
    "references/2.wav",
    "references/3.wav",
    "references/4.wav",
    "references/5.wav",
]

outputs = []

# Walk the three lists in lockstep instead of indexing each one by position.
for reference_path, text, strength in zip(reference_list, prompt_list, strengths):
    reference = load_and_encode_audio(reference_path, model)
    latents = style_transfer(
        prompt=text,
        reference=reference,
        transfer_strength=strength,
        return_latents=True,
    )
    outputs.append(latents.cpu().numpy())

answers["Q5"] = outputs

In [None]:
import pickle

# Serialize the collected answers dict to homework4.pkl.
with open("homework4.pkl", "wb") as answer_file:
    pickle.dump(answers, answer_file)