In [1]:
from PIL import Image
import numpy as np
from scipy.signal import istft
from scipy.io import wavfile
import pydub
import torch
import torchaudio

from __future__ import annotations
import typing as T
from dataclasses import dataclass
from enum import Enum

from riffusion.spectrogram_converter import SpectrogramConverter
from riffusion.spectrogram_params import SpectrogramParams
from riffusion.spectrogram_image_converter import SpectrogramImageConverter
from riffusion.util import image_util

In [2]:
# Load spectrogram image and color scale
img_stereo = Image.open('test/sample_stereo.png')
img_mono = Image.open('test/sample_mono.png')
trial = Image.open('test/trial.png')
# oscar = Image.open('../spectrogram_folder/-_6RxZyi30Q.png')
oscar = Image.open('../spectrogram_folder/-OAyRsvFGgc.png')
alex = Image.open('rap.jpg')


Define the parameters for mono and stereo

In [3]:
params = SpectrogramParams()

In [4]:
params

SpectrogramParams(stereo=False, sample_rate=44100, step_size_ms=10, window_duration_ms=100, padded_duration_ms=400, num_frequencies=512, min_frequency=0, max_frequency=10000, mel_scale_norm=None, mel_scale_type='htk', max_mel_iters=200, num_griffin_lim_iters=32, power_for_image=0.25)

In [5]:
params_stereo = SpectrogramParams(stereo=True)

In [6]:
params_stereo

SpectrogramParams(stereo=True, sample_rate=44100, step_size_ms=10, window_duration_ms=100, padded_duration_ms=400, num_frequencies=512, min_frequency=0, max_frequency=10000, mel_scale_norm=None, mel_scale_type='htk', max_mel_iters=200, num_griffin_lim_iters=32, power_for_image=0.25)

Create the different converters

In [7]:
# Create SpectrogramImageConverter object
converter = SpectrogramImageConverter(params)

  self.converter = SpectrogramConverter(params=params, device=device)


In [9]:
# Create SpectrogramImageConverter object
converter_stereo = SpectrogramImageConverter(params_stereo)

Stereo segment

In [65]:
# Convert spectrogram image to audio segment
segment_stereo = converter_stereo.audio_from_spectrogram_image(img_stereo, apply_filters=True, max_value=30e6)

In [66]:
# Play audio segment
segment_stereo.export("test/output_stereo.wav", format="wav")

<_io.BufferedRandom name='test/output_stereo.wav'>

Mono segment

In [8]:
# Convert spectrogram image to audio segment
segment_mono = converter.audio_from_spectrogram_image(alex, apply_filters=True, max_value=30e6)

In [9]:
# Play audio segment
segment_mono.export("test/blues.wav", format="wav")

<_io.BufferedRandom name='test/blues.wav'>

Trial segment

In [10]:
# Convert spectrogram image to audio segment
trial_mono = converter.audio_from_spectrogram_image(trial, apply_filters=True, max_value=30e6)
trial_stereo = converter_stereo.audio_from_spectrogram_image(trial, apply_filters=True, max_value=30e6)

In [11]:
# Play audio segment
trial_mono.export("test/trial_mono.wav", format="wav")
trial_stereo.export("test/trial_stereo.wav", format="wav")

<_io.BufferedRandom name='test/trial_stereo.wav'>

Oscar file

In [28]:
params = SpectrogramParams(stereo = True, num_frequencies=385)

In [29]:
params

SpectrogramParams(stereo=True, sample_rate=44100, step_size_ms=10, window_duration_ms=100, padded_duration_ms=400, num_frequencies=385, min_frequency=0, max_frequency=10000, mel_scale_norm=None, mel_scale_type='htk', max_mel_iters=200, num_griffin_lim_iters=32, power_for_image=0.25)

In [30]:
# Create SpectrogramImageConverter object
converter_oscar = SpectrogramImageConverter(params)

In [31]:
oscar_segment = converter_oscar.audio_from_spectrogram_image(oscar, apply_filters=True, max_value=30e6)

In [32]:
oscar_segment.export("test/oscar3.wav", format="wav")

<_io.BufferedRandom name='test/oscar3.wav'>

Using the HuggingFace dataset 

In [41]:
riffusion_stereo = Image.open('test/riffusion_stereo.jpeg')
riffusion_mono = Image.open('test/riffusion_mono.jpeg')

In [50]:
params_mono = SpectrogramParams()

In [51]:
params_mono

SpectrogramParams(stereo=False, sample_rate=44100, step_size_ms=10, window_duration_ms=100, padded_duration_ms=400, num_frequencies=512, min_frequency=0, max_frequency=10000, mel_scale_norm=None, mel_scale_type='htk', max_mel_iters=200, num_griffin_lim_iters=32, power_for_image=0.25)

In [52]:
param_stereo = SpectrogramParams(stereo=True, num_frequencies = 768)

In [53]:
# Create SpectrogramImageConverter object
converter_riffusion_mono = SpectrogramImageConverter(params_mono)
converter_riffusion_stereo = SpectrogramImageConverter(param_stereo)


  self.converter = SpectrogramConverter(params=params, device=device)


In [44]:
# Convert spectrogram image to audio segment
riffusion_mono = converter_riffusion_mono.audio_from_spectrogram_image(riffusion_mono, apply_filters=True, max_value=30e6)

In [54]:
riffusion_stereo = converter_riffusion_stereo.audio_from_spectrogram_image(riffusion_stereo, apply_filters=True, max_value=30e6)

In [37]:
# Play audio segment
riffusion_mono.export("test/riffusion_mono.wav", format="wav")

<_io.BufferedRandom name='test/riffusion_mono.wav'>

In [55]:
riffusion_stereo.export("test/riffusion_stereo.wav", format="wav")

<_io.BufferedRandom name='test/riffusion_stereo.wav'>