[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/camenduru/Image2SoundFX-jupyter/blob/main/Image2SoundFX_jupyter.ipynb)

In [None]:
# Environment bootstrap for a Colab-style runtime; everything is placed under /content.
%cd /content
# aria2 provides the multi-connection downloader used on the next line.
!apt -y install -qq aria2
# Fetch the AudioLDM 48k checkpoint into /content.
# NOTE(review): no code visible in this notebook loads audioldm_48k.pth (the audio
# pipeline below pulls "cvssp/audioldm2" instead) — confirm this download is still needed.
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/haoheliu/audioldm_48k/resolve/main/audioldm_48k.pth -d /content -o audioldm_48k.pth
# Sample image; the final UI uses "oiseau.png" as the default value of its image input.
!wget https://huggingface.co/spaces/fffiloni/Image2SFX-comparison/resolve/main/oiseau.png
# Clone the `dev` branch of the moondream fork and switch into it so `import moondream` below resolves.
!git clone -b dev https://github.com/camenduru/moondream
%cd /content/moondream
# Python dependencies; gradio is pinned to 3.50.2 while the others are upgraded to latest (-U).
!pip install -q cog einops moondream timm gradio==3.50.2 diffusers transformers -U

import argparse
import torch
import re
import gradio as gr
from moondream import Moondream, detect_device
from threading import Thread
from transformers import TextIteratorStreamer, CodeGenTokenizerFast as Tokenizer
# Load the moondream captioning model once at cell start-up; `answer_question`
# below reads `tokenizer` and `moondream` as module-level globals.
# Device and dtype come from moondream's own helper.
device, dtype = detect_device()
model_id = "vikhyatk/moondream1"
# Tokenizer and model are loaded from the same checkpoint id.
tokenizer = Tokenizer.from_pretrained(model_id)
moondream = Moondream.from_pretrained(model_id).to(device=device, dtype=dtype)
# Put the model in eval mode; this notebook only runs inference.
moondream.eval()
# Matches a complete or partially streamed "<END>" marker at the very end of
# the text ("<", "<E", "<EN", "<END" or "<END>").
_END_MARKER_TAIL = re.compile(r"<(?:E(?:N(?:D>?)?)?)?$")


def answer_question(img, prompt):
    """Stream the model's answer to ``prompt`` about ``img``.

    Generation runs on a background thread that feeds a
    ``TextIteratorStreamer``; this generator yields the accumulated answer
    after every streamed chunk, so a Gradio output bound to it updates
    progressively.

    Args:
        img: Image passed to ``moondream.encode_image`` (the UI supplies a
            PIL image).
        prompt: Question text forwarded to ``moondream.answer_question``.

    Yields:
        The answer accumulated so far, with any trailing (possibly partial)
        "<END>" marker removed.
    """
    image_embeds = moondream.encode_image(img)
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
    thread = Thread(
        target=moondream.answer_question,
        kwargs={
            "image_embeds": image_embeds,
            "question": prompt,
            "tokenizer": tokenizer,
            "streamer": streamer,
        },
    )
    thread.start()

    buffer = ""
    for new_text in streamer:
        # Drop marker fragments that arrive at the end of a chunk
        # (presumably pieces of the model's "<END>" terminator).
        buffer += re.sub("<$|END$", "", new_text)
        # Only a trailing marker is removed. The previous
        # `buffer.strip("<END")` treated its argument as a character set and
        # therefore also ate leading/trailing 'E', 'N', 'D' and '<' that
        # belonged to the answer itself (e.g. "Ducks ..." became "ucks ...").
        yield _END_MARKER_TAIL.sub("", buffer)
# Captioning UI: a prompt box and button on the first row, the image input and
# the streamed response on the second.
with gr.Blocks() as demo:
    with gr.Row():
        prompt = gr.Textbox(label="Input Prompt", placeholder="Type here...", scale=4)
        submit = gr.Button("Submit")
    with gr.Row():
        img = gr.Image(type="pil", label="Upload an Image")
        output = gr.TextArea(label="Response")
    # Registration order matters: the button click is the first listener
    # (index 0), which is the one `get_caption` calls with fn_index=0.
    submit.click(answer_question, [img, prompt], output)
    # Pressing Enter in the prompt box triggers the same handler.
    prompt.submit(answer_question, [img, prompt], output)
# Served on port 1000; `get_caption` connects to this port.
demo.queue().launch(debug=False, inline=False, share=True, server_name='0.0.0.0', server_port=1000)

# Second cell: load the AudioLDM2 text-to-audio pipeline used by `text2audio`.
%cd /content
import gradio as gr
import torch
from diffusers import AudioLDM2Pipeline
# NOTE: this rebinds the module-level `device` set by the captioning cell and
# hard-codes CUDA, so this cell requires a GPU runtime.
device = "cuda"
# Half precision for the pipeline weights.
torch_dtype = torch.float16
repo_id = "cvssp/audioldm2"
pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch_dtype).to(device)
# Shared RNG; `text2audio` re-seeds it on every call via `manual_seed`.
generator = torch.Generator(device)
def text2audio(text, negative_prompt, duration, guidance_scale, random_seed, n_candidates):
    """Generate audio for ``text`` and return it rendered as a waveform video.

    Args:
        text: Prompt describing the sound to generate.
        negative_prompt: Prompt passed to the pipeline as ``negative_prompt``.
        duration: Requested audio length in seconds.
        guidance_scale: Guidance scale forwarded to the pipeline.
        random_seed: Seed for the shared generator; converted with ``int``.
        n_candidates: Number of waveforms generated per prompt; a falsy value
            falls back to 1.

    Returns:
        The value produced by ``gr.make_waveform`` for the first returned
        waveform, tagged with a 16000 Hz sample rate.

    Raises:
        gr.Error: If ``text`` is ``None``, empty, or only whitespace.
    """
    # A cleared Textbox delivers "" rather than None, so test for blank text too.
    if text is None or not str(text).strip():
        raise gr.Error("Please provide a text input.")

    # UI sliders / API clients may deliver the count as a float; the pipeline
    # needs an integer number of waveforms.
    num_waveforms = int(n_candidates) if n_candidates else 1

    waveforms = pipe(
        text,
        audio_length_in_s=duration,
        guidance_scale=guidance_scale,
        num_inference_steps=200,
        negative_prompt=negative_prompt,
        num_waveforms_per_prompt=num_waveforms,
        # Re-seed the shared generator so equal seeds reproduce the same audio.
        generator=generator.manual_seed(int(random_seed)),
    )["audios"]
    # Only the first candidate is shown to the user.
    return gr.make_waveform((16000, waveforms[0]))
# AudioLDM2 UI: prompt and negative prompt, an accordion of generation
# settings, a video output (the waveform rendering) and a submit button.
iface = gr.Blocks()
with iface:
    with gr.Group():
        textbox = gr.Textbox(
            value="The vibrant beat of Brazilian samba drums.",
            max_lines=1,
            label="Input text",
            info="Your text is important for the audio quality. Please ensure it is descriptive by using more adjectives.",
            elem_id="prompt-in",
        )
        negative_textbox = gr.Textbox(
            value="Low quality.",
            max_lines=1,
            label="Negative prompt",
            info="Enter a negative prompt not to guide the audio generation. Selecting appropriate negative prompts can improve the audio quality significantly.",
            # NOTE(review): same elem_id as the "Input text" box above, so the
            # page ends up with two elements sharing one DOM id.
            elem_id="prompt-in",
        )
        with gr.Accordion("Click to modify detailed configurations", open=False):
            seed = gr.Number(
                value=45,
                label="Seed",
                info="Change this value (any integer number) will lead to a different generation result.",
            )
            duration = gr.Slider(5, 15, value=10, step=2.5, label="Duration (seconds)")
            guidance_scale = gr.Slider(
                0,
                7,
                value=3.5,
                step=0.5,
                label="Guidance scale",
                info="Larger => better quality and relevancy to text; Smaller => better diversity",
            )
            n_candidates = gr.Slider(
                1,
                5,
                value=3,
                step=1,
                label="Number waveforms to generate",
                info="Automatic quality control. This number control the number of candidates (e.g., generate three audios and choose the best to show you). A larger value usually lead to better quality with heavier computation",
            )
        outputs = gr.Video(label="Output", elem_id="output-video")
        btn = gr.Button("Submit")
    # The only event listener registered on this Blocks. The input order
    # matches the positional parameters of `text2audio`.
    btn.click(
        text2audio,
        inputs=[textbox, negative_textbox, duration, guidance_scale, seed, n_candidates],
        outputs=[outputs],
    )
# Served on port 2000; `get_audioldm` connects to this port.
iface.queue().launch(debug=False, inline=False, share=True, server_name='0.0.0.0', server_port=2000)

import gradio as gr
from gradio_client import Client
import os
import json
import re
from moviepy.editor import VideoFileClip
from moviepy.audio.AudioClip import AudioClip
def extract_audio(video_in):
    """Write the audio track of a video file to ``audio.wav``.

    Args:
        video_in: Path of the video file to read.

    Returns:
        The path of the written WAV file (``'audio.wav'``, relative to the
        current working directory; overwritten on every call).

    Raises:
        ValueError: If the video has no audio track.
    """
    output_audio = 'audio.wav'
    video_clip = VideoFileClip(video_in)
    try:
        audio_clip = video_clip.audio
        if audio_clip is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(f"No audio track found in {video_in!r}.")
        audio_clip.write_audiofile(output_audio, fps=44100)
    finally:
        # Release the reader(s) held by the clip even when extraction fails.
        video_clip.close()
    print("Audio extraction complete.")
    return output_audio
def get_caption(image_in):
    """Ask the captioning server on port 1000 for a one-sentence description.

    Args:
        image_in: Image input forwarded as the first argument of the remote
            call (the UI passes a file path).

    Returns:
        The value returned by the remote endpoint; it is also printed.
    """
    caption = Client("http://0.0.0.0:1000").predict(
        image_in,
        "Describe precisely the image in one sentence.",
        fn_index=0,
    )
    print(caption)
    return caption
def get_audioldm(prompt):
    """Generate a sound effect for ``prompt`` via the AudioLDM2 server on port 2000.

    The remote endpoint returns a waveform video; its audio track is then
    extracted with ``extract_audio``.

    Args:
        prompt: Text description of the sound to generate.

    Returns:
        Path of the extracted WAV file, as returned by ``extract_audio``.
    """
    client = Client("http://0.0.0.0:2000")
    result = client.predict(
        prompt,                 # str in 'Input text' Textbox component
        "Low quality. Music.",  # str in 'Negative prompt' Textbox component
        10,                     # numeric value between 5 and 15 in 'Duration (seconds)' Slider component
        3.5,                    # numeric value between 0 and 7 in 'Guidance scale' Slider component
        45,                     # int | float in 'Seed' Number component
        3,                      # numeric value between 1 and 5 in 'Number waveforms to generate' Slider component
        # The AudioLDM2 Blocks in this notebook registers a single event
        # listener (its Submit button), so the only valid index is 0;
        # the previous fn_index=1 pointed past the end of the endpoint list.
        fn_index=0,
    )
    print(result)
    return extract_audio(result)
def infer(image_in):
    """Turn an image into a sound effect: caption it, then synthesize audio.

    Args:
        image_in: Image input handed to ``get_caption``.

    Returns:
        Whatever ``get_audioldm`` returns for the generated caption.
    """
    return get_audioldm(get_caption(image_in))
# Front-end UI: one image input, one button, one audio output, centred in a
# column that the stylesheet below caps at 800px wide.
css="""
#col-container{
    margin: 0 auto;
    max-width: 800px;
}
"""
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):        
        with gr.Column():
            # Pre-filled with the sample image downloaded during setup; the
            # handler receives a file path (type="filepath").
            image_in = gr.Image(sources=["upload"], type="filepath", label="Image input", value="oiseau.png")
            with gr.Row():
                submit_btn = gr.Button("Submit")
        with gr.Column():
            audio_o = gr.Audio(label="Audio output")
    # Clicking Submit runs the full image -> caption -> audio chain.
    submit_btn.click(
        fn=infer,
        inputs=[image_in],
        outputs=[audio_o],
    )
# No fixed port here (unlike the two back-end apps); debug=True with show_error=True.
demo.queue().launch(debug=True, show_error=True, share=True, inline=False)