This is the Inference Notebook used to test Stable-Foley on 5 random videos taken from the Greatest Hits Dataset test set. 
Follow the instructions provided as comments throughout the notebook. You will need to change the paths where specified and indicate the device that will be used by the model. 

In [None]:
import os
import sys
import hydra
import torch
import torchaudio

!pip install stable_audio_tools                                                                     
from stable_audio_tools.inference.generation import generate_diffusion_cond

In [4]:
# Specify your token here. 
# (You can get it from https://huggingface.co/stabilityai/stable-audio-open-1.0 by loging in and accepting terms)

!huggingface-cli login --token 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 --add-to-git-credential

Token is valid (permission: fineGrained).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m
Token has not been saved to git credential helper.
Your token has been saved to /home/christian/.cache/huggingface/token
Login successful


### GreatestHits

In [5]:
# Inference parameters
#
# Generation needs three checkpoints: Stable-Foley, CAVP and CLAP.
# Downloaded checkpoints can be placed at the following paths:
#   CAVP ckpt:         'Stable-Video2Audio/logs/cavp_ckpt/cavp_epoch66.ckpt'
#   CLAP ckpt:         '/homes/rfg543/Documents/Stable-Video2Audio/logs/clap_ckpt/630k-audioset-best.pt'
#   Stable-Foley ckpt: 'Stable-Video2Audio/logs/ckpts/gh-controlnet_2024-08-22-11-43-14/epoch=213-valid_loss=0.493.ckpt'

# --- experiment / conditioning ---
exp_cfg = "train_gh_controlnet"
use_cavp = True
promt_type = "audio"  # "audio" or "text"
dataset_path = "/home/christian/mic-mp4-processed"

# True  -> the RMS envelope is derived from the target audio
# False -> the RMS envelope is loaded from the ones generated in Stage 1
rms_from_target = True

# --- sampling ---
seed = 1234
num_samples = 12  # upper bound on the number of samples generated from each batch
sample_duration = 10
sr = 44100

# Only used when rms_from_target is False: length the RMS envelope is interpolated to
target_length = sample_duration * sr

# --- Stable-Foley checkpoint (depends on whether CAVP conditioning is used) ---
ckpt_path = (
    "/home/christian/syncfusion/Stable-Video2Audio/logs/stablev2a/ckpts/gh-controlnet_2024-10-17-18-22-03/epoch=510-valid_loss=0.493.ckpt"
    if use_cavp
    else "/home/christian/syncfusion/Stable-Video2Audio/logs/stablev2a/ckpts/gh-controlnet_2024-08-24-09-55-07_nocavp/epoch=213-valid_loss=0.493.ckpt"
)

torch.cuda.set_device(0) # set the GPU to use

# Add the parent directory to the import path (presumably so that the project's `main`
# package resolves -- confirm against the repository layout). The guard keeps repeated
# runs of this cell from appending the same entry over and over.
if ".." not in sys.path:
    sys.path.append("..")

# load config: This is the config file that was used to train the model, it is needed to load the model and the dataloader
with hydra.initialize(config_path="..", version_base=None):
    cond_cfg = hydra.compose(
        config_name="config",
        overrides=[
            f"exp={exp_cfg}",
            # You can override some parameters here, if needed.
            # The dataset root comes from `dataset_path` so the parameter above is actually honoured.
            f"datamodule.root_dir={dataset_path}",
            "datamodule.test_split_file_path=/home/christian/syncfusion/test.txt",
            "datamodule.test_data_to_use=1.0",
        ],
    )
    
# init model
model = hydra.utils.instantiate(cond_cfg["model"])
ckpt = torch.load(ckpt_path, map_location="cpu")
# strict=False tolerates key mismatches between checkpoint and model, so report them
# instead of silently running inference with partially loaded weights.
load_result = model.load_state_dict(ckpt['state_dict'], strict=False)
if load_result.missing_keys or load_result.unexpected_keys:
    print(f"Checkpoint loaded with {len(load_result.missing_keys)} missing and "
          f"{len(load_result.unexpected_keys)} unexpected keys")
# Inference only: move to the GPU and switch to eval mode (disables dropout and other
# train-time behaviour).
model = model.cuda().eval()
# load dataloader
datamodule = hydra.utils.instantiate(cond_cfg["datamodule"])
datamodule.setup(stage="test")
test_dataloader = datamodule.test_dataloader()



No module named 'flash_attn'
flash_attn not installed, disabling Flash Attention
Initalize Stage1 CAVP Model
Loading Stage1 CAVP Model from: /home/christian/syncfusion/Stable-Video2Audio/logs/cavp_ckpt/cavp_epoch66.ckpt
Restored from /home/christian/syncfusion/Stable-Video2Audio/logs/cavp_ckpt/cavp_epoch66.ckpt with 0 missing and 0 unexpected keys


Loading test dataset: 196it [01:06,  2.93it/s]                       



Greatesthit test dataset:
num test chunks: 602
chunk frames size: torch.Size([3, 40, 224, 224])
chunk audio size: torch.Size([2, 441000])


In [None]:
from main.module_controlnet import window_rms, low_pass_filter
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
import numpy as np
import librosa
from scipy.interpolate import interp1d

# Window sizes for the RMS computation and the low-pass filter. These values ensure the target
# RMS has the same shape as the audio (this is required for the ControlNet to work properly);
# the low-pass filter is used to smooth the envelope.
rms_window_size = 10000
low_pass_window_size = 2000
device = next(model.parameters()).device

# Only "train_gh_controlnet" batches are unpacked with a precomputed RMS (chunk_rms).
# Fail early with a clear message instead of hitting an undefined variable mid-run.
if not rms_from_target and exp_cfg != "train_gh_controlnet":
    raise ValueError('rms_from_target=False requires exp_cfg == "train_gh_controlnet": '
                     'other configurations do not provide chunk_rms')

out_folder = "out21_audiorms_nocavp_150step"
# Create the output tree once, up front; exist_ok makes the cell safe to re-run.
os.makedirs(f"{out_folder}/input", exist_ok=True)
os.makedirs(f"{out_folder}/output", exist_ok=True)

prima_iterazione = True  # True until the envelope of the first batch has been plotted
contatore = 0            # running index used as prefix of the saved file names

# The context manager closes the progress bar even if generation raises.
with tqdm(total=len(test_dataloader)*num_samples) as pbar:
    # A plain for-loop stops cleanly when the dataloader is exhausted; manually calling
    # next() on the iterator would end the run with an uncaught StopIteration.
    for dati in test_dataloader:
        if exp_cfg == "train_gh_controlnet":
            x, frames, seconds_start, seconds_total, chunk_rms, item = dati
        else:
            x, frames, seconds_start, seconds_total, item = dati

        if rms_from_target:
            # rms envelope is derived from the target audio
            rms_envelope = window_rms(x, window_size=rms_window_size)
            filtered_envelope = low_pass_filter(rms_envelope, window_size=low_pass_window_size)
        else:
            # The stored RMS is mu-law encoded: decode it, then interpolate it to the
            # target length so it matches the audio length.
            chunk_rms = chunk_rms.cpu().numpy()
            mu_expanded = librosa.mu_expand(chunk_rms, mu=127, quantize=True)
            interval_original = np.linspace(0, 1, mu_expanded.shape[-1])
            interval_target = np.linspace(0, 1, target_length)
            filtered_envelope = np.array([interp1d(interval_original, elem, axis=-1)(interval_target) for elem in mu_expanded])
            # duplicate the envelope on a new axis 1 (two copies, one per audio channel presumably)
            filtered_envelope = np.repeat(filtered_envelope[:, None, :], 2, axis=1)
            filtered_envelope = torch.tensor(filtered_envelope, dtype=torch.float32)

        # Samples to generate from this batch. Kept in a local so the global
        # `num_samples` parameter is not silently overwritten by a smaller batch.
        batch_samples = min(num_samples, x.shape[0])

        conditionings = []
        for i in range(batch_samples):
            curr_conditioning = {
                "envelope": filtered_envelope[i:i+1].to(device), # ControlNet input
                "seconds_start": seconds_start[i:i+1], # start time
                "seconds_total": seconds_total[i:i+1], # total time
            }
            if use_cavp:
                curr_conditioning["frames"] = frames[i:i+1] # frames CAVP embeddings (cross-attention)
            if promt_type == "audio":
                curr_conditioning["audio"] = x[i:i+1] # audio CLAP embeddings (cross-attention)
            elif promt_type == "text":
                curr_conditioning["text"] = item[i]["text"] # text CLAP embeddings (cross-attention)

            conditionings.append(curr_conditioning)

        if prima_iterazione:
            # Sanity check: show the first envelope that will drive the ControlNet.
            plt.figure(figsize=(20, 5))
            plt.plot(filtered_envelope[0, 0].cpu())
            plt.show()
            plt.close()
            prima_iterazione = False

        output = generate_diffusion_cond(
                model.model,
                seed=seed,
                batch_size=batch_samples,
                steps=150,
                cfg_scale=2.0,
                conditioning=conditionings,
                sample_size=x.shape[-1],
                sigma_min=0.3,
                sigma_max=500,
                sampler_type="dpmpp-3m-sde",
                device="cuda"
            )

        # Save each target/generated pair under the same file name in input/ and output/.
        for i in range(batch_samples):
            video_name = item[i]["video_name"]
            torchaudio.save(f"{out_folder}/input/{contatore:04d}_video_{video_name}.wav", x[i].cpu(), sample_rate=44100)
            torchaudio.save(f"{out_folder}/output/{contatore:04d}_video_{video_name}.wav", output[i].cpu(), sample_rate=44100)
            contatore += 1
            pbar.update(1)

### Analysis, metrics, support code

This section is not yet structured to be user-friendly, but it should generally work if files are named in a standard way.

In [None]:
# This will plot the spectrogram of the target audio and the generated audio so we can compare them

import librosa
import librosa.display
import matplotlib.pyplot as plt  # imported here so the cell does not depend on the generation cell
import numpy as np

audio_path1 = "/home/christian/syncfusion/Stable-Video2Audio/notebook/out/input_0_video_2015-02-16-16-49-06.wav" 
audio_path2 = "/home/christian/syncfusion/Stable-Video2Audio/notebook/out/output_0_video_2015-02-16-16-49-06.wav"

# sr=None keeps each file's native sample rate. librosa's default resamples to 22050 Hz,
# which would remove everything above ~11 kHz from the comparison.
y1, sr1 = librosa.load(audio_path1, sr=None)
y2, sr2 = librosa.load(audio_path2, sr=None)

# Magnitude spectrograms in dB, each normalised to its own maximum
D1 = librosa.amplitude_to_db(np.abs(librosa.stft(y1)), ref=np.max)
D2 = librosa.amplitude_to_db(np.abs(librosa.stft(y2)), ref=np.max)

fig, axes = plt.subplots(2, 1, figsize=(12, 8))
panels = (
    (axes[0], D1, sr1, 'Spectrogram of Target Audio'),
    (axes[1], D2, sr2, 'Spectrogram of Generated Audio'),
)
for ax, spec_db, rate, title in panels:
    img = librosa.display.specshow(spec_db, sr=rate, x_axis='time', y_axis='log', ax=ax)
    fig.colorbar(img, ax=ax, format='%+2.0f dB')
    ax.set_title(title)

fig.tight_layout()
plt.show()

In [None]:
# Waveforms of the target audio (top) and of the generated audio (bottom).
# Reuses y1/sr1 and y2/sr2 loaded by the spectrogram cell.

fig, (ax_target, ax_generated) = plt.subplots(2, 1, figsize=(12, 8))

librosa.display.waveshow(y1, sr=sr1, ax=ax_target)
ax_target.set_title('Waveform of Target Audio')

librosa.display.waveshow(y2, sr=sr2, ax=ax_generated)
ax_generated.set_title('Waveform of Generated Audio')

fig.tight_layout()
plt.show()

In [6]:
###
# GENERATE MONO AUDIO OUT OF STEREO SIGNALS
# Every file found directly inside `path` is down-mixed to mono and written,
# under the same file name, to `path`/mono_audio.
###

import librosa, os
from tqdm import tqdm 
import soundfile as sf

path = "/home/christian/syncfusion/Stable-Video2Audio/notebook/out14_NOCAVP_audio/output"

# List the folder before creating the output directory, then create it once
# (exist_ok makes the cell safe to re-run).
entries = os.listdir(path)
mono_dir = os.path.join(path, "mono_audio")
os.makedirs(mono_dir, exist_ok=True)

# Sample rate of the first audio file; every file is loaded (and resampled if needed) at
# this rate. Kept in its own name so the global `sr` inference parameter is not clobbered.
mono_sr = None
for audio in tqdm(entries):
    src = os.path.join(path, audio)
    if os.path.isdir(src):
        continue

    if mono_sr is None:
        mono_sr = librosa.get_samplerate(src)
        print("Setting sample rate to", mono_sr)
    y, _ = librosa.load(src, sr=mono_sr, mono=True)

    # Save mono wav file
    sf.write(os.path.join(mono_dir, audio), y, mono_sr)



Setting sample rate to 44100


100%|██████████| 602/602 [00:10<00:00, 58.53it/s]


In [None]:
###
# CONVERT AUDIO TO DESIRED SAMPLE RATE
# Every file found directly inside `path` is loaded as mono, resampled and written,
# under the same file name, to `path`/convert/<desired_sample_rate>.
###
import os, soundfile as sf
from tqdm import tqdm
import librosa

path = "/home/christian/syncfusion/Stable-Video2Audio/notebook/out/output"

input_sample_rate = 44100
desired_sample_rate = 16000

entries = os.listdir(path)
# os.path.join only accepts strings, so the rate must be converted explicitly.
# makedirs also creates the intermediate "convert" folder, which os.mkdir cannot do.
out_dir = os.path.join(path, "convert", str(desired_sample_rate))
os.makedirs(out_dir, exist_ok=True)

for audio in tqdm(entries):
    src = os.path.join(path, audio)
    if os.path.isdir(src):
        continue

    y, _ = librosa.load(src, sr=input_sample_rate, mono=True)
    # orig_sr / target_sr are keyword-only in current librosa releases
    y = librosa.resample(y, orig_sr=input_sample_rate, target_sr=desired_sample_rate)
    sf.write(os.path.join(out_dir, audio), y, desired_sample_rate)

In [7]:
# E-L1 COMPUTATION
# For every generated file, the L1 distance between its RMS envelope and the RMS envelope
# of the matching input file is computed; the mean over all files is printed at the end.
# Please note: here we computed the metric on mono audios 
# (but computing it on stereo audios should not produce significantly different results)
# we suppose 44100 Hz sample rate for all audios

# NOT YET OPTIMIZED TO MAKE IT USER FRIENDLY :')

import os, soundfile as sf
from tqdm import tqdm
import librosa
import numpy as np
import torch

# Your generated results
path = "/home/christian/syncfusion/Stable-Video2Audio/notebook/out7_TCCLIP_best_epoch_TEXT/output/mono_audio"
# Inputs
path_input = "/home/christian/syncfusion/Stable-Video2Audio/notebook/out/input/mono_audio"
l1_distances = []

# False: generated and input files share the same file name.
# True:  files are matched through the numeric prefix before the first "_".
named_differently = False

e_l1 = torch.nn.L1Loss()

list_path_input_audio = sorted(list(os.listdir(path_input)))

for audio in tqdm(os.listdir(path)):
    if os.path.isdir(os.path.join(path, audio)):
        continue

    # The returned sample rate is discarded so the global `sr` parameter is not clobbered.
    y, _ = librosa.load(os.path.join(path, audio), sr=44100, mono=True)
    if named_differently:
        numero_audio = int(audio.split("_")[0])
        filtrato = [name for name in list_path_input_audio
                    if "wav" in name and int(name.split("_")[0]) == numero_audio]
        # Exactly one reference is required: zero matches would otherwise crash on filtrato[0]
        if len(filtrato) != 1:
            print("Errore")
            break
        y_input, _ = librosa.load(os.path.join(path_input, filtrato[0]), sr=44100, mono=True)
    else:  
        y_input, _ = librosa.load(os.path.join(path_input, audio), sr=44100, mono=True)

    # Bring the generated audio to the reference length: zero-pad when it is shorter,
    # truncate when it is longer (an unguarded slice assignment fails in that case).
    n_common = min(len(y), len(y_input))
    y_aligned = np.zeros_like(y_input)
    y_aligned[:n_common] = y[:n_common]
    y = y_aligned
    
    rms_y = librosa.feature.rms(y=y, frame_length=512, hop_length=128, pad_mode='reflect')[0]
    rms_y_input = librosa.feature.rms(y=y_input, frame_length=512, hop_length=128, pad_mode='reflect')[0]
    
    rms_y = torch.tensor(rms_y)
    rms_y_input = torch.tensor(rms_y_input)

    l1_distance = e_l1(rms_y, rms_y_input)
    l1_distances.append(l1_distance)

# Report the aggregated metric (mean of the per-file distances)
if l1_distances:
    print("E-L1:", torch.stack(l1_distances).mean().item())

100%|██████████| 605/605 [00:09<00:00, 62.46it/s]


In [None]:
###
# GENERATE VIDEOS WITH AUDIO
# Take the audio file, the muted video and generates a video with the audio.
# Video and audio files are matched through the numeric prefix before the first "_".
###
import os  # imported here so the cell does not depend on earlier cells
from tqdm import tqdm
import subprocess

# Where to find audios
audio_folder = "/home/christian/syncfusion/Stable-Video2Audio/notebook/out6_BNINCEPTION_best_epoch/output/mono_audio"
# Where to find muted videos
video_folder = "/home/christian/syncfusion/Stable-Video2Audio/notebook/out/input/video_noaudio"
# Where to save results
output_folder = "/home/christian/syncfusion/Stable-Video2Audio/notebook/out6_BNINCEPTION_best_epoch/output/video"

list_all_audios = list(os.listdir(audio_folder))

# makedirs also creates missing parent folders and is safe to re-run
os.makedirs(output_folder, exist_ok=True)

for video in tqdm(os.listdir(video_folder)):
    if os.path.isdir(os.path.join(video_folder, video)):
        continue
    video_path = os.path.join(video_folder, video)
    
    num_video = int(video.split("_")[0])
    filtrato = [name for name in list_all_audios
                if "wav" in name and int(name.split("_")[0]) == num_video]

    # Exactly one audio file must match the video
    if len(filtrato) != 1:
        print("Errore")
        break

    audio_path = os.path.join(audio_folder, filtrato[0])
    output_path = os.path.join(output_folder, video)
    command = [
        'ffmpeg',
        '-nostdin',  # never wait for keyboard input (e.g. an overwrite prompt): a notebook cannot answer it
        '-i', video_path,
        '-i', audio_path,
        '-c:v', 'copy',  # keep the video stream as is
        '-c:a', 'aac',   # encode the audio track as AAC
        output_path
    ]

    # Execute the command; report failures instead of ignoring them silently
    result = subprocess.run(command)
    if result.returncode != 0:
        print(f"ffmpeg exited with code {result.returncode} for {video}")