## Prepare the Environment

In [None]:
import os
from pathlib import Path

# Pick the working directory and checkpoint size depending on the machine:
# when the author's home directory exists, use it together with the large
# checkpoint and pin the process to GPU 0; otherwise fall back to the current
# directory and the small checkpoint.
if Path("/", "home", "vsioros").is_dir():
    BASE_DIR = Path("/", "home", "vsioros")
    MODEL_NAME = "facebook/musicgen-large"
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
else:
    BASE_DIR = Path.cwd()
    MODEL_NAME = "facebook/musicgen-small"

## Load the Model

The pre-trained MusicGen small, medium and large checkpoints can be loaded from the [pre-trained weights](https://huggingface.co/models?search=facebook/musicgen-) on the Hugging Face Hub. Change the repo id to match the checkpoint size you wish to load. We'll default to the small checkpoint, which is the fastest of the three but has the lowest audio quality:

In [None]:
from editgen import EditGenPipeline

# Build the editing pipeline for the checkpoint selected in MODEL_NAME.
pipeline = EditGenPipeline(MODEL_NAME)

### Ignoring a word

In [None]:
from IPython.display import Audio

from editgen.controllers import IgnoreWordController
from editgen.modifiers import SelfAttentionLerpControllerModifier

# Source prompt followed by the edited prompt; "<IGNORE>" marks the word to drop.
prompts = ["accoustic guitar solo", "<IGNORE> guitar solo"]
# from_prompts returns the prompts to feed the pipeline and the matching controller.
prompts, controller = IgnoreWordController.from_prompts(pipeline, prompts)
# Wrap the controller in the self-attention lerp modifier.
# NOTE(review): presumably this soft-blends self-attention maps — confirm in editgen.modifiers.
controller = SelfAttentionLerpControllerModifier(controller)

# The result is indexed per prompt below: [0] for the source, [1] for the edit.
audio_values = pipeline(prompts, controller)

In [None]:
Audio(audio_values[0], rate=pipeline.sampling_rate)

In [None]:
Audio(audio_values[1], rate=pipeline.sampling_rate)

### Replacement edit with Prompt-to-Prompt

In [None]:
# Source and edited prompt; they differ only in "guitar" -> "synth".
prompts = [
    "pop song with guitar and drums",
    "pop song with synth and drums",
]
# Earlier layer-restricted variants, kept for reference:
# controller = DecoderLayerControllerModifier(ReplaceWordController(3, 0.3), set([2 * i for i in range(0, 49)] + [1]))
# controller = DecoderLayerControllerModifier(ReplaceWordController(3, 0.3), {0,1,2,3,4,5,6,7,8,9,10,11,12,14,15,16,17,18,19,20,21,24,33,34})
# Active setup: a replacement controller built from the indices of the swapped
# words, wrapped in AttentionLerpControllerModifier.
# NOTE(review): the meaning of the trailing `1` is not visible here — confirm.
controller = AttentionLerpControllerModifier(
    ReplaceWordController(get_replacement_indices(prompts, "guitar", "synth"), 1)
)
# controller = ReplaceWordController(3, 0.3)
audio_values = run_and_display(prompts, controller)

In [None]:
Audio(audio_values[0], rate=model_proxy.sampling_rate)

In [None]:
Audio(audio_values[1], rate=model_proxy.sampling_rate)

### Refinement edit with Prompt-to-Prompt

In [None]:
# Refinement: the edited prompt adds words ("and drums") to the source prompt.
prompts = [
    "heavy guitar solo",
    "heavy guitar and drums solo",
]
# Refine controller built from the word indices derived from both prompts.
# NOTE(review): the meaning of the trailing `1` is not visible here — confirm.
controller = AttentionLerpControllerModifier(
    RefineController(get_refine_word_indices(prompts), 1)
)
audio_values = run_and_display(prompts, controller)

In [None]:
Audio(audio_values[0], rate=model_proxy.sampling_rate)

In [None]:
Audio(audio_values[1], rate=model_proxy.sampling_rate)

### Reweight edit

In [None]:
# Reweighting keeps the text identical in both prompts; only the controller differs.
prompts = [
    "heavy guitar solo",
    "heavy guitar solo",
]
# Reweight the word "heavy" with factor 2.
# NOTE(review): presumably 2 scales the attention on that word — confirm against ReweightWordController.
controller = AttentionCutoffControllerModifier(
    ReweightWordController(get_reweight_word_indices(prompts, "heavy"), 2)
)
audio_values = run_and_display(prompts, controller)

In [None]:
Audio(audio_values[0], rate=model_proxy.sampling_rate)

In [None]:
Audio(audio_values[1], rate=model_proxy.sampling_rate)

#### Sentiment replacement

In [None]:
# Sentiment swap: "happy" -> "sad", rest of the prompt unchanged.
prompts = [
    "happy pop song",
    "sad pop song",
]
# Earlier layer-restricted variants, kept for reference:
# controller = DecoderLayerControllerModifier(ReplaceWordController(3, 0.3), set([2 * i for i in range(0, 49)] + [1]))
# controller = DecoderLayerControllerModifier(ReplaceWordController(3, 0.3), {0,1,2,3,4,5,6,7,8,9,10,11,12,14,15,16,17,18,19,20,21,24,33,34})
# Active setup: replacement controller (second argument 0.3) wrapped in the
# self-attention lerp modifier.
controller = SelfAttentionLerpControllerModifier(
    ReplaceWordController(get_replacement_indices(prompts, "happy", "sad"), 0.3)
)
# controller = ReplaceWordController(3, 0.3)
audio_values = run_and_display(prompts, controller)

In [None]:
Audio(audio_values[0], rate=model_proxy.sampling_rate)

In [None]:
Audio(audio_values[1], rate=model_proxy.sampling_rate)

#### Major vs Minor

In [None]:
# Chord-quality swap: "major" -> "minor", rest of the prompt unchanged.
prompts = [
    "a major chord pop song",
    "a minor chord pop song",
]
# Earlier layer-restricted variants, kept for reference:
# controller = DecoderLayerControllerModifier(ReplaceWordController(3, 0.3), set([2 * i for i in range(0, 49)] + [1]))
# controller = DecoderLayerControllerModifier(ReplaceWordController(3, 0.3), {0,1,2,3,4,5,6,7,8,9,10,11,12,14,15,16,17,18,19,20,21,24,33,34})
# Active setup: replacement controller (second argument 0.3) wrapped in the
# self-attention lerp modifier.
controller = SelfAttentionLerpControllerModifier(
    ReplaceWordController(get_replacement_indices(prompts, "major", "minor"), 0.3)
)
# controller = ReplaceWordController(3, 0.3)
audio_values = run_and_display(prompts, controller)

In [None]:
Audio(audio_values[0], rate=model_proxy.sampling_rate)

In [None]:
Audio(audio_values[1], rate=model_proxy.sampling_rate)

### Injecting Random noise

In [None]:
# Same guitar -> synth replacement as above, with an extra OffsetControllerModifier layer.
prompts = [
    "pop song with guitar and drums",
    "pop song with synth and drums",
]
# Modifier stack, innermost first: replacement controller -> self-attention lerp -> offset.
# NOTE(review): the section title suggests `offset=0.5` controls injected noise — confirm
# against OffsetControllerModifier.
controller = OffsetControllerModifier(
    SelfAttentionLerpControllerModifier(
        ReplaceWordController(get_replacement_indices(prompts, "guitar", "synth"), 0.3)
    ),
    offset=0.5,
)
audio_values = run_and_display(prompts, controller)

In [None]:
Audio(audio_values[0], rate=model_proxy.sampling_rate)

In [None]:
Audio(audio_values[1], rate=model_proxy.sampling_rate)

## Ablation Studies

### Counter-factual Example

In [None]:
# Counter-factual pair: swap the instrument "guitar" -> "violin".
prompts = [
    "a classical song with guitar",
    "a classical song with violin",
]
# Replacement controller (second argument 0.3) wrapped in the self-attention lerp modifier.
controller = SelfAttentionLerpControllerModifier(
    ReplaceWordController(get_replacement_indices(prompts, "guitar", "violin"), 0.3)
)
audio_values = run_and_display(prompts, controller)

In [None]:
Audio(audio_values[0], rate=model_proxy.sampling_rate)

In [None]:
Audio(audio_values[1], rate=model_proxy.sampling_rate)

### Comparing self-attention layers

In [None]:
# AutoProcessor is needed below; import it here so the cell does not depend on
# an import made elsewhere in the notebook.
from transformers import AutoProcessor, ClapModel

# CLAP model and its processor, used by the similarity metric defined below.
clap_model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
clap_processor = AutoProcessor.from_pretrained("laion/clap-htsat-unfused")

In [None]:
import librosa
import numpy as np
import torch.nn.functional as F


def cosine_similarity(prompt, audios, sr=48000):
    """Score a pair of generated clips with CLAP embeddings.

    Args:
        prompt: Text handed to the CLAP processor. The callers in this notebook
            pass a single string (the edited prompt); only the first text
            embedding is used.
        audios: Sequence of at least two waveforms sampled at
            ``model_proxy.sampling_rate``. Index 0 is treated as the source
            clip and index 1 as the edited clip.
        sr: Sampling rate the clips are resampled to before being processed.

    Returns:
        A tuple ``(audio_audio_similarity, text_audio_similarity)`` of Python
        floats: the cosine similarity between the two audio embeddings, and
        between the text embedding and the embedding of ``audios[1]``.
    """
    # Local import: only `torch.nn.functional` is imported at cell level.
    import torch

    # Resample every clip to the rate requested for CLAP.
    audios = np.stack(
        [
            librosa.resample(audio, orig_sr=model_proxy.sampling_rate, target_sr=sr)
            for audio in audios
        ]
    )

    inputs = clap_processor(
        text=prompt, audios=audios, return_tensors="pt", sampling_rate=sr, padding=True
    )

    # Pure inference: skip autograd bookkeeping so no graph is built per call.
    with torch.no_grad():
        prompt_features = clap_model.get_text_features(
            input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"]
        )
        # NOTE(review): this is the *text* attention mask; confirm that passing
        # it to the audio branch is intended.
        audio_features = clap_model.get_audio_features(
            input_features=inputs["input_features"],
            attention_mask=inputs["attention_mask"],
        )

    # Similarity between the source clip and the edited clip.
    audio_audio_similarity = F.cosine_similarity(
        audio_features[0], audio_features[1], dim=0
    )

    # Similarity between the (first) text embedding and the edited clip.
    text_audio_similarity = F.cosine_similarity(
        prompt_features[0], audio_features[1], dim=0
    )

    return audio_audio_similarity.item(), text_audio_similarity.item()

In [None]:
from sklearn.metrics import accuracy_score


def extract_pitch_classes(audio, sr=model_proxy.sampling_rate, hop_length=512):
    """Return, for every frame, the index of the strongest ``piptrack`` bin.

    NOTE(review): despite the name, the values are argmax indices along the
    first axis of the magnitude matrix returned by ``librosa.core.piptrack``;
    the pitch matrix itself is discarded.

    Args:
        audio: Waveform passed to ``piptrack`` as ``y``.
        sr: Sampling rate of ``audio``.
        hop_length: Hop length forwarded to ``piptrack``.
    """
    # piptrack returns (pitches, magnitudes); only the magnitudes are used.
    magnitudes = librosa.core.piptrack(y=audio, sr=sr, hop_length=hop_length)[1]
    return np.argmax(magnitudes, axis=0)


def calculate_melody_accuracy(input_melody, generated_melody):
    """Fraction of frames whose ``extract_pitch_classes`` value matches.

    Both waveforms are reduced to per-frame values with
    ``extract_pitch_classes`` (default arguments) and compared element-wise
    with ``accuracy_score``, the reference melody being the "true" labels.
    """
    reference, candidate = (
        extract_pitch_classes(melody) for melody in (input_melody, generated_melody)
    )
    return accuracy_score(reference, candidate)

In [None]:
def calculate_beat_consistency_score(audio, sr=model_proxy.sampling_rate):
    """Coefficient of variation of the inter-beat intervals of ``audio``.

    Beats are detected with ``librosa.beat.beat_track``; the score is the
    standard deviation of the intervals between consecutive beat times divided
    by their mean.
    """
    # beat_track returns (tempo, beat_frames); only the frames are needed.
    beat_frames = librosa.beat.beat_track(y=audio, sr=sr)[1]
    beat_times = librosa.frames_to_time(beat_frames, sr=sr)

    # Inter-beat intervals.
    intervals = np.diff(beat_times)

    return np.std(intervals) / np.mean(intervals)

In [None]:
def calculate_snr(y):
    """Heuristic signal-to-noise ratio of ``y`` in decibels.

    The signal power is the sum of squared samples; the noise power is
    approximated by the sum of the spectral-flatness values reported by
    ``librosa.feature.spectral_flatness``.
    """
    noise_power = np.sum(librosa.feature.spectral_flatness(y=y))
    signal_power = np.sum(y**2)
    return 10 * np.log10(signal_power / noise_power)

In [None]:
from skimage.metrics import structural_similarity as ssim


def calculate_ssi(audios):
    """Structural similarity between the spectrograms of ``audios[0]`` and ``audios[1]``.

    Each waveform is turned into a dB-scaled STFT magnitude spectrogram,
    min-max normalised to [0, 1], and the two images are compared with
    ``skimage``'s ``structural_similarity``. Only the mean index is returned.
    """

    def _unit_scaled_db_spectrogram(signal):
        # dB spectrogram relative to its own peak, rescaled to [0, 1].
        decibels = librosa.amplitude_to_db(np.abs(librosa.stft(signal)), ref=np.max)
        lowest, highest = np.min(decibels), np.max(decibels)
        return (decibels - lowest) / (highest - lowest)

    first = _unit_scaled_db_spectrogram(audios[0])
    second = _unit_scaled_db_spectrogram(audios[1])

    # With full=True, ssim returns (mean index, full similarity image).
    return ssim(first, second, data_range=1.0, full=True)[0]

In [None]:
from typing import Iterable


class Dataset(object):
    """Expand ``(edit_type, prompts)`` samples into evaluation entries.

    Every sample yields one entry per controller and per seed. Each entry is a
    tuple ``(edit_type, (source_prompt, edited_prompt), controller, seed)``.

    The edit type is matched by substring:

    * ``"Ignore"``: a single ``IgnoreWordController`` built from
      ``get_ignore_indices`` (which also returns the prompts to use).
    * ``"Replace"``: one ``ReplaceWordController`` per value of
      ``np.arange(0.3, 0.8, 0.2)``, built for the first word position at which
      the two prompts differ.

    Args:
        samples: Pairs of edit type and ``(source_prompt, edited_prompt)``.
        soft_blending: When true, every controller is wrapped in
            ``SelfAttentionLerpControllerModifier``.
        seeds: Seeds to pair with each controller; defaults to ``[0, 1]``.

    Raises:
        NotImplementedError: If the edit type contains neither keyword.
        ValueError: If a "Replace" sample has no differing word position.
    """

    def __init__(
        self,
        samples: list[tuple[str, tuple[str, str]]],
        soft_blending: bool = False,
        seeds: Optional[list[int]] = None,
    ) -> None:
        if seeds is None:
            seeds = [0, 1]

        self.entries = []
        for edit_type, prompts in samples:
            if "Ignore" in edit_type:
                prompts, indices = get_ignore_indices(prompts)
                controllers = [IgnoreWordController(indices)]
            elif "Replace" in edit_type:
                words_a, words_b = prompts[0].split(), prompts[1].split()
                # First position at which the two prompts disagree.
                index = next(
                    (
                        i
                        for i, (word_a, word_b) in enumerate(zip(words_a, words_b))
                        if word_a != word_b
                    ),
                    None,
                )
                if index is None:
                    # Fail with a clear message instead of indexing with None.
                    raise ValueError(
                        f"prompts {prompts[0]!r} and {prompts[1]!r} do not differ "
                        "at any word position"
                    )
                indices = get_replacement_indices(
                    prompts, words_a[index], words_b[index]
                )
                controllers = [
                    ReplaceWordController(indices, blend)
                    for blend in np.arange(0.3, 0.8, 0.2)
                ]
            else:
                raise NotImplementedError(f"{edit_type} is not supported")

            if soft_blending:
                # Materialise the wrapped controllers so `controllers` is
                # always a list, not a one-shot iterator.
                controllers = [
                    SelfAttentionLerpControllerModifier(controller)
                    for controller in controllers
                ]

            for controller in controllers:
                for seed in seeds:
                    self.entries.append(
                        (edit_type, (prompts[0], prompts[1]), controller, seed)
                    )

    def __iter__(
        self,
    ) -> Iterable[tuple[str, tuple[str, str], BaseController, int]]:
        """Yield the entries in insertion order."""
        yield from self.entries

    def __len__(self):
        """Number of entries (samples x controllers x seeds)."""
        return len(self.entries)

In [None]:
import pickle


class CheckpointManager(object):
    """Persist and restore a dictionary of checkpoint values with ``pickle``.

    Args:
        filepath: Location of the checkpoint file.
    """

    def __init__(self, filepath: Path) -> None:
        self.filepath = filepath

    def load(self) -> dict[str, Any]:
        """Return the stored dictionary, or ``{}`` when no checkpoint exists."""
        if self.filepath.is_file():
            with self.filepath.open("rb") as file:
                # NOTE: pickle executes arbitrary code on load; only open
                # checkpoint files written by this notebook.
                return pickle.load(file)

        return {}

    def dump(self, **data: Any) -> None:
        """Store the keyword arguments as the new checkpoint.

        The data is written to a sibling temporary file that then replaces the
        checkpoint, so an interruption mid-write cannot leave a truncated
        checkpoint behind. Missing parent directories are created.
        """
        self.filepath.parent.mkdir(parents=True, exist_ok=True)
        scratch = self.filepath.with_name(self.filepath.name + ".tmp")
        try:
            with scratch.open("wb") as file:
                pickle.dump(data, file)
            scratch.replace(self.filepath)
        except BaseException:
            # Do not leave a partial temporary file behind.
            scratch.unlink(missing_ok=True)
            raise

In [None]:
# Evaluation samples: each item is (edit_type, (source_prompt, edited_prompt)).
# Dataset matches the edit type by substring ("Ignore" / "Replace"); the text
# in parentheses only labels the sub-category. "<IGNORE>" marks the dropped word.
samples = [
    ("Ignore", ("pop song with guitar and drums", "pop song with <IGNORE> and drums")),
    ("Replace", ("pop song with guitar and drums", "pop song with synth and drums")),
    ("Replace (Sentiment)", ("happy pop song", "sad pop song")),
    ("Replace (Chord)", ("a major chord pop song", "a minor chord pop song")),
    ("Ignore", ("rock ballad with piano", "rock ballad with <IGNORE>")),
    (
        "Replace",
        (
            "jazz ensemble with trumpet and saxophone",
            "jazz ensemble with piano and saxophone",
        ),
    ),
    (
        "Replace (Sentiment)",
        ("energetic electronic dance track", "calm electronic dance track"),
    ),
    (
        "Replace (Chord)",
        ("blues riff in E on electric guitar", "blues riff in G on electric guitar"),
    ),
    (
        "Ignore",
        (
            "acoustic folk song with banjo and harmonica",
            "acoustic folk song with <IGNORE> and harmonica",
        ),
    ),
    (
        "Replace",
        (
            "classical symphony with violins and cellos",
            "classical symphony with flutes and cellos",
        ),
    ),
    (
        "Replace (Sentiment)",
        ("upbeat indie pop anthem", "melancholic indie pop anthem"),
    ),
    ("Replace (Chord)", ("piano sonata in C minor", "piano sonata in A minor")),
    (
        "Ignore",
        ("funky bassline with slap technique", "funky <IGNORE> with slap technique"),
    ),
    (
        "Replace",
        (
            "latin jazz fusion with congas and bongos",
            "latin jazz fusion with timbales and bongos",
        ),
    ),
    (
        "Replace (Sentiment)",
        (
            "motivational corporate background music",
            "relaxing corporate background music",
        ),
    ),
    (
        "Replace (Chord)",
        (
            "gospel choir with dominant seventh chords",
            "gospel choir with diminished seventh chords",
        ),
    ),
    (
        "Ignore",
        (
            "ambient electronic soundscape with synthesizers",
            "ambient electronic <IGNORE> with synthesizers",
        ),
    ),
    (
        "Replace",
        (
            "orchestral film score with strings and brass",
            "orchestral film score with woodwinds and brass",
        ),
    ),
    ("Replace (Sentiment)", ("uplifting reggae vibes", "heartbreaking reggae vibes")),
    (
        "Replace (Chord)",
        ("punk rock anthem with power chords", "punk rock anthem with barre chords"),
    ),
]

# Hard-blending dataset (soft_blending defaults to False) with the default seeds.
dataset = Dataset(samples)

In [None]:
from tqdm.auto import tqdm


def run_greedy_ablation_study(checkpoint_path: Optional[Path] = None):
    """Greedily grow the set of self-attention layers given to the controllers.

    Candidate layer indices come from
    ``AttentionStore.get_self_attention_importance()`` after one run on the
    first sample. On every outer iteration each remaining candidate is tried
    together with the layers already selected; the candidate is scored by the
    mean of the text-audio and audio-audio cosine similarities over the whole
    module-level ``dataset``. The best candidate is added to the selection and
    the state is checkpointed.

    A candidate is black-listed when its score is below the previous best by
    more than ``error_threshold``; the search stops when even the best
    candidate of an iteration meets that condition, or when no candidates are
    left.

    Layer numbering used for ``DecoderLayerControllerModifier``: decoder layer
    ``i`` maps to self-attention index ``2 * i + 1`` and cross-attention index
    ``2 * (i + 1)``; all cross-attention indices are always included.

    Args:
        checkpoint_path: Pickle file used to resume the search. Defaults to
            ``RESULTS_DIR / "greedy_checkpoint.pkl"``.

    Side effects:
        Writes the checkpoint after every accepted layer and pickles the
        resulting score table to ``RESULTS_DIR / "greedy.pkl"``.
    """
    if checkpoint_path is None:
        checkpoint_path = RESULTS_DIR / "greedy_checkpoint.pkl"

    checkpoint_manager = CheckpointManager(checkpoint_path)

    columns = [
        "Edit",
        "Layers",
        "Source Prompt",
        "Editted Prompt",
        "Source Audio",
        "Editted Audio",
        "Text-Audio Cosine Similarity",
        "Audio-Audio Cosine Similarity",
    ]

    prompts = [samples[0][1][0], samples[0][1][1]]
    controller = AttentionStore()
    # The generated audio is not needed here; the run is made for its effect
    # on `controller`, which is queried right below.
    run_and_display(prompts, controller)

    # !This is a heuristic
    sorted_indices = controller.get_self_attention_importance()[0].tolist()

    cross_attention_layer_indices = [
        2 * (i + 1) for i in range(len(model_proxy.decoder_layers))
    ]
    error_threshold = 0.1

    # Resume from a previous run when a checkpoint exists.
    checkpoint = checkpoint_manager.load()
    black_listed_indices = checkpoint.get("black_listed_indices", [])
    visited_indices = checkpoint.get("visited_indices", [])
    df_list = checkpoint.get("df_list", [])

    all_indices = [
        i
        for i in sorted_indices
        if i not in visited_indices and i not in black_listed_indices
    ]
    progress_bar_a = tqdm(all_indices, position=0)
    for iteration in progress_bar_a:
        # Candidates still available in this iteration.
        indices = [
            i
            for i in all_indices
            if i not in visited_indices and i not in black_listed_indices
        ]

        if not indices:
            # Every remaining layer was selected or black-listed; without this
            # guard `max(scores)` below would raise on an empty list.
            break

        try:
            previous_max_score = df_list[-1]["Score"].item()
        except IndexError:
            previous_max_score = 0

        progress_bar_b, scores = tqdm(indices, position=1, leave=False), []
        for index in progress_bar_b:
            # Candidate layer plus everything selected so far.
            self_attention_layer_indices = [
                2 * i + 1 for i in [index, *visited_indices]
            ]

            progress_bar_c, df_list_ablation = (
                tqdm(dataset, position=2, leave=False),
                [],
            )
            for edit, prompts, controller, seed in progress_bar_c:
                layers = ",".join(
                    f"{x:02d}" for x in self_attention_layer_indices
                ).strip()

                progress_bar_c.set_postfix({"layers": layers})

                attention_layer_indices = [
                    *cross_attention_layer_indices,
                    *self_attention_layer_indices,
                ]
                controller = DecoderLayerControllerModifier(
                    controller, set(attention_layer_indices)
                )
                audio_values = run_and_display(prompts, controller, seed=seed)

                audio_audio_similarity, text_audio_similarity = cosine_similarity(
                    prompts[1], audio_values
                )

                row = [
                    edit,
                    layers,
                    prompts[0],
                    prompts[1],
                    audio_values[0],
                    audio_values[1],
                ]
                row.append(text_audio_similarity)
                row.append(audio_audio_similarity)

                df_list_ablation.append(pd.DataFrame([row], columns=columns))

            df = pd.concat(df_list_ablation, ignore_index=True)

            # Candidate score: mean over both similarity columns and all rows.
            metrics = df[
                ["Text-Audio Cosine Similarity", "Audio-Audio Cosine Similarity"]
            ]
            score = metrics.mean(axis=None)
            scores.append(score)

            # Drop candidates that are clearly worse than the previous best.
            error = abs(score - previous_max_score)
            if score < previous_max_score and error > error_threshold:
                black_listed_indices.append(index)

            progress_bar_b.set_postfix({"index": index, "score": f"{score:.3f}"})

        max_score = max(scores)

        # Stop once even the best candidate degrades the score too much.
        error = abs(max_score - previous_max_score)
        if max_score < previous_max_score and error > error_threshold:
            break

        max_score_index = indices[scores.index(max_score)]
        visited_indices.append(max_score_index)
        df_list.append(
            pd.DataFrame(
                [[max_score, visited_indices.copy()]], columns=["Score", "Indices"]
            )
        )

        checkpoint_manager.dump(
            visited_indices=visited_indices,
            black_listed_indices=black_listed_indices,
            df_list=df_list,
        )

        progress_bar_a.set_postfix(
            {
                "last_checkpoint": f"{iteration:02d}",
                "current": f"{max_score:.3f}",
                "previous": f"{previous_max_score:.3f}",
                "error": f"{error * 100:.2f}%",
            }
        )

    df = pd.concat(df_list, ignore_index=True)
    df.to_pickle(RESULTS_DIR / "greedy.pkl")

In [None]:
run_greedy_ablation_study()

In [None]:
def run_ablation_study(dataset, self_attention_layer_groups):
    """Evaluate every dataset entry against every group of self-attention layers.

    For each ``(edit, prompts, controller, seed)`` entry and each layer group,
    the entry's controller is wrapped in a ``DecoderLayerControllerModifier``
    restricted to all cross-attention indices (``2 * (i + 1)`` per decoder
    layer) plus the group's indices, audio is generated, and the metrics are
    collected into one row.

    Args:
        dataset: Iterable of ``(edit, prompts, controller, seed)`` tuples.
        self_attention_layer_groups: Iterable of iterables of self-attention
            layer indices.

    Returns:
        A ``pandas.DataFrame`` with one row per (entry, layer group) pair.
    """
    columns = [
        "Edit",
        "Layers",
        "Source Prompt",
        "Editted Prompt",
        "Source Audio",
        "Editted Audio",
        "Text-Audio Cosine Similarity",
        "Audio-Audio Cosine Similarity",
        "Melody Accuracy",
        "Beat Consistency Score",
        "Signal to Noise Ratio",
        "Structural Similarity Index",
    ]

    # Invariant across the whole study, so computed once.
    cross_attention_layers = [
        2 * (i + 1) for i in range(len(model_proxy.decoder_layers))
    ]

    df_list = []
    for edit, prompts, controller, seed in tqdm(dataset, position=0):
        for self_attention_layers in tqdm(
            self_attention_layer_groups, leave=False, position=1
        ):
            # Wrap the entry's *base* controller afresh for every group.
            # Rebinding `controller` here made each group wrap the modifier
            # built for the previous group, nesting the layer restrictions.
            layer_controller = DecoderLayerControllerModifier(
                controller,
                {*cross_attention_layers, *self_attention_layers},
            )
            audio_values = run_and_display(prompts, layer_controller, seed=seed)

            layers = ",".join(f"{x:02d}" for x in self_attention_layers).strip()

            audio_audio_similarity, text_audio_similarity = cosine_similarity(
                prompts[1], audio_values
            )

            row = [
                edit,
                layers,
                prompts[0],
                prompts[1],
                audio_values[0],
                audio_values[1],
            ]
            row.append(text_audio_similarity)
            row.append(audio_audio_similarity)
            row.append(calculate_melody_accuracy(audio_values[0], audio_values[1]))
            # Negated so that, like the other metrics, higher is better.
            row.append(-calculate_beat_consistency_score(audio_values[1]))
            row.append(calculate_snr(audio_values[1]))
            row.append(calculate_ssi(audio_values))

            df_list.append(pd.DataFrame([row], columns=columns))

    return pd.concat(df_list, ignore_index=True)

#### Comparing individual self-attention layers

In [None]:
# One group per decoder layer, each holding that layer's self-attention index
# (the odd numbers 1, 3, 5, ...).
self_attention_layer_groups = [
    [layer] for layer in range(1, 2 * len(model_proxy.decoder_layers), 2)
]
df = run_ablation_study(dataset, self_attention_layer_groups)
df.to_pickle(RESULTS_DIR / "individual_hard.pkl")

#### Comparing `n - 1` self-attention layers

In [None]:
# One group per decoder layer: every self-attention index except that layer's.
self_attention_layer_groups = [
    [
        layer
        for layer in range(1, 2 * len(model_proxy.decoder_layers), 2)
        if layer != 2 * held_out + 1
    ]
    for held_out in range(len(model_proxy.decoder_layers))
]
df = run_ablation_study(dataset, self_attention_layer_groups)
df.to_pickle(RESULTS_DIR / "leave_one_out_hard.pkl")

#### Comparing incremental groups of self-attention layers

In [None]:
# Growing prefixes: the first 1, 2, ..., n self-attention indices.
self_attention_layer_groups = [
    list(range(1, 2 * count, 2))
    for count in range(1, len(model_proxy.decoder_layers) + 1)
]
df = run_ablation_study(dataset, self_attention_layer_groups)
df.to_pickle(RESULTS_DIR / "incremental_hard.pkl")

#### Comparing individual self-attention layers (Soft-blending self-attention)

In [None]:
# Rebuild the dataset with every controller wrapped in SelfAttentionLerpControllerModifier.
dataset = Dataset(samples, soft_blending=True)

In [None]:
# One group per decoder layer, each holding that layer's self-attention index
# (the odd numbers 1, 3, 5, ...).
self_attention_layer_groups = [
    [layer] for layer in range(1, 2 * len(model_proxy.decoder_layers), 2)
]
df = run_ablation_study(dataset, self_attention_layer_groups)
df.to_pickle(RESULTS_DIR / "individual_soft.pkl")

#### Comparing `n - 1` self-attention layers (Soft-blending self-attention)

In [None]:
# One group per decoder layer: every self-attention index except that layer's.
self_attention_layer_groups = [
    [
        layer
        for layer in range(1, 2 * len(model_proxy.decoder_layers), 2)
        if layer != 2 * held_out + 1
    ]
    for held_out in range(len(model_proxy.decoder_layers))
]
df = run_ablation_study(dataset, self_attention_layer_groups)
df.to_pickle(RESULTS_DIR / "leave_one_out_soft.pkl")

#### Comparing incremental groups of self-attention layers (Soft-blending self-attention)

In [None]:
# Growing prefixes of self-attention indices. The upper bound follows the
# loaded model's decoder depth (as in the hard-blending cell) instead of a
# hard-coded 49, which only fits a 48-layer decoder.
self_attention_layer_groups = [
    [2 * y + 1 for y in range(0, x)]
    for x in range(1, len(model_proxy.decoder_layers) + 1)
]
df = run_ablation_study(dataset, self_attention_layer_groups)
df.to_pickle(RESULTS_DIR / "incremental_soft.pkl")

### Cherrypicking

In [None]:
def run(prompts: list[str], controller: BaseController):
    """Generate audio for ``prompts`` and print the CLAP similarity scores.

    Args:
        prompts: Source prompt followed by the edited prompt.
        controller: Controller handed to ``run_and_display``.

    Returns:
        The audio values returned by ``run_and_display`` (called with
        ``save_results=False``).
    """
    audio_values = run_and_display(prompts, controller, save_results=False)

    # Similarities are computed against the edited prompt (index 1).
    audio_audio_similarity, text_audio_similarity = cosine_similarity(
        prompts[1], audio_values
    )

    # Both lines now use the same "label: value" layout (the first one was
    # missing the space after the colon).
    print(f"T2A Similarity: {text_audio_similarity:.3f}")
    print(f"A2A Similarity: {audio_audio_similarity:.3f}")

    return audio_values

In [None]:
# Ignore edit: "<IGNORE>" marks the word dropped from the source prompt.
# get_ignore_indices returns the prompts to use together with the indices
# handed to IgnoreWordController.
prompts, indices = get_ignore_indices(
    [
        "accoustic guitar solo",
        "<IGNORE> guitar solo",
    ]
)
# Wrap the controller in the self-attention lerp modifier.
controller = SelfAttentionLerpControllerModifier(IgnoreWordController(indices))

# run() also prints the text-audio and audio-audio similarities.
audio_values = run(prompts, controller)

In [None]:
Audio(audio_values[0], rate=model_proxy.sampling_rate)

In [None]:
Audio(audio_values[1], rate=model_proxy.sampling_rate)

### Replacement edit with Prompt-to-Prompt

In [None]:
# Source and edited prompt; they differ only in "guitar" -> "synth".
prompts = [
    "pop song with guitar and drums",
    "pop song with synth and drums",
]
# Earlier layer-restricted variants, kept for reference:
# controller = DecoderLayerControllerModifier(ReplaceWordController(3, 0.3), set([2 * i for i in range(0, 49)] + [1]))
# controller = DecoderLayerControllerModifier(ReplaceWordController(3, 0.3), {0,1,2,3,4,5,6,7,8,9,10,11,12,14,15,16,17,18,19,20,21,24,33,34})
# Active setup: a replacement controller built from the indices of the swapped
# words, wrapped in AttentionLerpControllerModifier.
# NOTE(review): the meaning of the trailing `1` is not visible here — confirm.
controller = AttentionLerpControllerModifier(
    ReplaceWordController(get_replacement_indices(prompts, "guitar", "synth"), 1)
)
# controller = ReplaceWordController(3, 0.3)
audio_values = run_and_display(prompts, controller)

In [None]:
Audio(audio_values[0], rate=model_proxy.sampling_rate)

In [None]:
Audio(audio_values[1], rate=model_proxy.sampling_rate)

### Refinement edit with Prompt-to-Prompt

In [None]:
# Refinement: the edited prompt adds words ("and drums") to the source prompt.
prompts = [
    "heavy guitar solo",
    "heavy guitar and drums solo",
]
# Refine controller built from the word indices derived from both prompts.
# NOTE(review): the meaning of the trailing `1` is not visible here — confirm.
controller = AttentionLerpControllerModifier(
    RefineController(get_refine_word_indices(prompts), 1)
)
audio_values = run_and_display(prompts, controller)

In [None]:
Audio(audio_values[0], rate=model_proxy.sampling_rate)

In [None]:
Audio(audio_values[1], rate=model_proxy.sampling_rate)

### Reweight edit

In [None]:
# Reweighting keeps the text identical in both prompts; only the controller differs.
prompts = [
    "heavy guitar solo",
    "heavy guitar solo",
]
# Reweight the word "heavy" with factor 2.
# NOTE(review): presumably 2 scales the attention on that word — confirm against ReweightWordController.
controller = AttentionCutoffControllerModifier(
    ReweightWordController(get_reweight_word_indices(prompts, "heavy"), 2)
)
audio_values = run_and_display(prompts, controller)

In [None]:
Audio(audio_values[0], rate=model_proxy.sampling_rate)

In [None]:
Audio(audio_values[1], rate=model_proxy.sampling_rate)