<a href="https://colab.research.google.com/github/bachaudhry/FastAI-22-23/blob/main/course_part_2/simple_diffusion_audio_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Generating Audio Samples Using Simple Diffusion**

Jonathan Whitaker's demo borrows heavily from the [Diffusion for Audio notebook by Hugging Face](https://github.com/huggingface/diffusion-models-class/blob/main/unit4/02_diffusion_for_audio.ipynb).

Johno also created the dataset used in this notebook.

Diffusers version 0.24.0 is used because the Audio Diffusion pipeline has since been deprecated, [as highlighted here](https://github.com/huggingface/diffusers/pull/6169).

In [1]:
!pip install -q miniminiai datasets torchaudio diffusers

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/134.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import torch, random, os
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.optim import lr_scheduler
import torchaudio
from torchaudio import transforms as T
from torch.utils.data import default_collate
from torchvision.transforms import functional as TF
from huggingface_hub import hf_hub_download
#from diffusers.pipelines.audio_diffusion.mel import Mel

import fastcore.all as fc
from PIL import Image
from miniminiai import *
from functools import partial
from datasets import load_dataset
from IPython.display import Audio
from matplotlib import pyplot as plt

In [9]:
# Implementing core Mel Spectrogram logic
class MelSpec:
  """Convert audio waveforms into decibel-scaled mel spectrograms.

  Wraps `torchaudio.transforms.MelSpectrogram` followed by
  `torchaudio.transforms.AmplitudeToDB` (power scale, top_db=80).

  Args:
    sample_rate: Sample rate passed to the mel spectrogram transform.
    x_res: x resolution of the spectrogram (time). When given, `hop_length`
      and `n_fft` are derived from it and the passed-in values are ignored.
    y_res: y resolution of the spectrogram (number of mel bins). When None,
      torchaudio's own default number of mel bins is used.
    n_fft: FFT size; used as-is only when `x_res` is None.
    hop_length: Hop between frames; used as-is only when `x_res` is None.
    f_min: Minimum frequency of the mel filterbank.
    fmax: Maximum frequency of the mel filterbank (forwarded as `f_max`).

  Attributes:
    x_res, y_res: The resolutions exactly as passed in.
    n_fft, hop_length: The values actually used by the transform.
    n_mels: Number of mel bins actually used by the transform.

  The frame-count helper is named `num_frames` because `x_res` and `y_res`
  are instance attributes; a method with either name would be shadowed on
  every instance and could never be called.
  """

  def __init__(self, sample_rate=16000, x_res=None, y_res=None,
               n_fft=1024, hop_length=512, f_min=0.0, fmax=None):
    self.sample_rate = sample_rate
    self.x_res = x_res
    self.y_res = y_res
    self.n_fft = n_fft
    self.hop_length = hop_length
    self.f_min = f_min
    self.fmax = fmax

    # Derive hop_length and n_fft from the x resolution only when one was
    # requested; with x_res=None the explicit n_fft / hop_length are kept
    # (previously this path raised TypeError on `None - 1`).
    if self.x_res is not None:
      # NOTE(review): for sample_rate=16000 and x_res=128 this yields
      # hop_length=15883 and n_fft=2017142, far larger than the defaults
      # above -- confirm that this formula is what is intended.
      self.hop_length = (sample_rate * (self.x_res - 1) + self.n_fft) // self.x_res
      self.n_fft = self.hop_length * (self.x_res - 1) + 1

    mel_kwargs = dict(
        sample_rate=self.sample_rate,
        n_fft=self.n_fft,
        hop_length=self.hop_length,
        f_min=self.f_min,
        f_max=self.fmax,
    )
    # Forward n_mels only when a y resolution was given, so that y_res=None
    # falls back to torchaudio's default instead of passing None through.
    if self.y_res is not None:
      mel_kwargs["n_mels"] = self.y_res
    self.mel_spectrogram = T.MelSpectrogram(**mel_kwargs)
    self.n_mels = self.mel_spectrogram.n_mels

    # The functional `amplitude_to_db` needs multiplier / amin /
    # db_multiplier passed explicitly; the transform supplies them itself.
    self.to_db = T.AmplitudeToDB(stype="power", top_db=80.0)

  def __call__(self, waveform):
    """Return the mel spectrogram of `waveform`, converted to decibels."""
    return self.to_db(self.mel_spectrogram(waveform))

  def num_frames(self, waveform_length):
    """Return `1 + (waveform_length - n_fft) // hop_length`.

    NOTE(review): torchaudio pads the signal by default (center=True), so
    the transform's real frame count may differ from this value -- confirm
    before relying on it.
    """
    return 1 + (waveform_length - self.n_fft) // self.hop_length

## **Load Audio Data**

In [10]:
# Mel spectrogram configuration shared by the rest of the notebook
sample_rate = 16000  # sample rate handed to MelSpec
x_res = 128          # spectrogram x resolution (time)
y_res = 128          # spectrogram y resolution (binned frequency)
mel = MelSpec(sample_rate=sample_rate, x_res=x_res, y_res=y_res)

In [None]:
# Fetch the bird-call dataset through `datasets.load_dataset`
birdcall_dataset = load_dataset(path="tglcourse/5s_birdcall_samples_top20")

In [None]:
# Show the loaded dataset object as this cell's output
birdcall_dataset