In [None]:
# Beyond the Interface class: working with Blocks
import gradio as gr


def greet(name):
    """Return the greeting ``Hello <name>!`` for the given name string."""
    greeting = "Hello " + name
    greeting += "!"
    return greeting


# Build the greeter UI inside a Blocks context: one input box, one output box, one button.
with gr.Blocks() as demo:
    name = gr.Textbox(label="Name")
    output = gr.Textbox(label="Output Box")
    greet_btn = gr.Button("Greet")
    # Click listener: greet() receives the value of `name`, its return value is
    # written to `output`; the event is registered under the API name "greet".
    greet_btn.click(fn=greet, inputs=name, outputs=output, api_name="greet")

demo.launch()


In [None]:
# event listeners using decorators
import gradio as gr

# Same greeter as a decorated handler instead of an explicit `.click(fn=...)` call.
with gr.Blocks() as demo:
    name = gr.Textbox(label="Name")
    output = gr.Textbox(label="Output Box")
    greet_btn = gr.Button("Greet")

    # Decorator form of the click listener: the decorated function is the handler,
    # `name` is its input component and `output` its output component.
    @greet_btn.click(inputs=name, outputs=output)
    def greet(name):
        """Return "Hello <name>!" for the submitted name."""
        return "Hello " + name + "!"

demo.launch()

In [None]:
import gradio as gr

def welcome(name):
    """Build the welcome message displayed for `name`."""
    message = f"Welcome to Gradio, {name}!"
    return message

# Live-updating demo: a heading, an input textbox and an output textbox.
with gr.Blocks() as demo:
    gr.Markdown(
    """
    # Hello World!
    Start typing below to see the output.
    """)
    inp = gr.Textbox(placeholder="What is your name?")
    out = gr.Textbox()
    # `change` listener: call welcome() with the value of `inp` and show the result in `out`.
    inp.change(welcome, inp, out)

demo.launch()


In [None]:
import gradio as gr

def increase(num):
    """Return `num` incremented by one."""
    incremented = num + 1
    return incremented

# Two number fields that feed each other: a component can be an input of one
# listener and the output of another.
with gr.Blocks() as demo:
    a = gr.Number(label="a")
    b = gr.Number(label="b")
    atob = gr.Button("a > b")
    btoa = gr.Button("b > a")
    # "a > b": read `a`, add one via increase(), write the result to `b`.
    atob.click(increase, a, b)
    # "b > a": the same in the opposite direction.
    btoa.click(increase, b, a)

demo.launch()


In [None]:
from transformers import pipeline

import gradio as gr

# Speech-recognition pipeline used by speech_to_text().
asr = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")

# Sentiment pipeline used by text_to_sentiment(). The model is named explicitly
# instead of relying on the task's implicit default: an unnamed pipeline emits a
# "No model was supplied" warning and may pick a different checkpoint after a
# library upgrade. This id is the checkpoint the task currently defaults to.
classifier = pipeline(
    "text-classification",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
)

def speech_to_text(speech):
    """Run the module-level `asr` pipeline on `speech` and return the "text" entry of its result."""
    return asr(speech)["text"]

def text_to_sentiment(text):
    """Classify `text` with the module-level `classifier` and return the "label" of its first result."""
    predictions = classifier(text)
    first_prediction = predictions[0]
    return first_prediction["label"]

# Create the Blocks object first and enter its context afterwards
# (equivalent to `with gr.Blocks() as demo:`).
demo = gr.Blocks()

with demo:
    # type="filepath": the handler receives the audio as a path on disk.
    audio_file = gr.Audio(type="filepath")
    text = gr.Textbox()
    label = gr.Label()

    b1 = gr.Button("Recognize Speech")
    b2 = gr.Button("Classify Sentiment")

    # Two-step flow: b1 transcribes the audio into `text`,
    # b2 classifies whatever is currently in `text` and shows the label.
    b1.click(speech_to_text, inputs=audio_file, outputs=text)
    b2.click(text_to_sentiment, inputs=text, outputs=label)

demo.launch()


In [None]:
# Multiple input components: passing inputs as a list vs. as a set
import gradio as gr

# Demonstrates the two ways of handing several input components to a handler.
with gr.Blocks() as demo:
    a = gr.Number(label="a")
    b = gr.Number(label="b")
    with gr.Row():
        add_btn = gr.Button("Add")
        sub_btn = gr.Button("Subtract")
    # NOTE(review): this output is labelled "sum" but also receives the result of
    # the Subtract button — consider a neutral label.
    c = gr.Number(label="sum")

    # Inputs given as a list: each component's value arrives as its own
    # positional argument, in list order.
    def add(num1, num2):
        """Return the sum of the two input values."""
        return num1 + num2
    add_btn.click(add, inputs=[a, b], outputs=c)

    # Inputs given as a set: the handler takes a single argument that is
    # indexed by the component objects themselves.
    def sub(data):
        """Return the value of `a` minus the value of `b`."""
        return data[a] - data[b]
    sub_btn.click(sub, inputs={a, b}, outputs=c)

demo.launch()


In [None]:
# Import the helper in this cell: it is otherwise only imported by later cells,
# so on a fresh kernel (Restart & Run All) this call would raise NameError.
from carlos_tools_misc import clear_GPU_cache

# Clear the GPU cache before running the speech-to-text comparison below.
clear_GPU_cache()

In [None]:
import gradio as gr
import time
import numpy as np
from carlos_tools_audio import OpenAI_transcribe, local_whisper_transcribe, local_faster_whisper_transcribe
from carlos_tools_misc import clear_GPU_cache
import tempfile
import soundfile as sf

# Import your local whisper and faster-whisper models
from transformers import pipeline
import faster_whisper

# Remote transcription through the OpenAI_transcribe helper (model "whisper-1")
def gpt_transcribe(path):
    """Transcribe the audio file at `path` with `OpenAI_transcribe` (model "whisper-1").

    Returns:
        A `(text, duration)` tuple read from the "text" and "inference_time"
        entries of the helper's response.
    """
    result = OpenAI_transcribe(path, model="whisper-1", response_format="text")
    return result["text"], result["inference_time"]

# Local Whisper
def whisper_transcribe(path):
    """Transcribe the audio file at `path` with the local Whisper helper (model size "large-v3").

    The GPU cache is cleared before the model runs.

    Returns:
        A `(text, duration)` tuple read from the "text" and "inference_time"
        entries of the helper's response.
    """
    clear_GPU_cache()
    result = local_whisper_transcribe(path, model_size="large-v3")
    return result["text"], result["inference_time"]

# Faster Whisper
def faster_whisper_transcribe(path):
    """Transcribe the audio file at `path` with the local faster-whisper helper.

    The GPU cache is cleared before the model runs.

    Returns:
        A `(text, duration)` tuple read from the "text" and "inference_time"
        entries of the helper's response.
    """
    clear_GPU_cache()
    # NOTE(review): in the run recorded in this notebook, this model reported
    # language "en" for a clip that the large-v3 Whisper model detected as Spanish.
    # Presumably "distil-large-v3" is an English-only checkpoint — confirm whether
    # a multilingual size (e.g. "large-v3") is intended for this comparison.
    result = local_faster_whisper_transcribe(path, model_size="distil-large-v3")
    return result["text"], result["inference_time"]

def compare_transcriptions(path):
    """Transcribe one audio file with every backend and tabulate the results.

    Args:
        path: Path of the audio file, as delivered by a
            `gr.Audio(type="filepath")` component. Falsy when nothing was uploaded.

    Returns:
        A list of `[model label, transcription, duration rounded to 2 decimals]`
        rows in the order GPT, Whisper, Faster Whisper (matching the
        "Model" / text / "Duration (s)" columns of the output table).

    Raises:
        gr.Error: If no audio file was provided.
    """
    if not path:
        # Without an upload the component passes None; report that in the UI
        # instead of letting a backend fail with an unrelated error.
        raise gr.Error("Please upload an audio file first.")

    # Run order (GPT, Faster Whisper, Whisper) is kept from the original code;
    # the rows below are listed in display order.
    gpt_text, gpt_time = gpt_transcribe(path)
    faster_text, faster_time = faster_whisper_transcribe(path)
    whisper_text, whisper_time = whisper_transcribe(path)

    return [
        ["GPT (remote)", gpt_text, round(gpt_time, 2)],
        ["Whisper (local)", whisper_text, round(whisper_time, 2)],
        ["Faster Whisper (local)", faster_text, round(faster_time, 2)],
    ]

# Model descriptions shown in every tab. Defined once here instead of being
# pasted into each tab, so the three tabs cannot drift apart.
WHISPER_MD = """\
### [Whisper](https://huggingface.co/openai/whisper-large-v3)

Whisper is  essentially a language model grounded in audio — an audio-conditional GPT. It was trained by OpenAI on a large and diverse dataset of multilingual audio, enabling it to perform automatic speech recognition (ASR) and translation tasks across many languages.

Whisper is trained in a similar fashion to the original GPT, using self-supervised learning with a next-token prediction objective. However, while GPT is trained solely on text, Whisper is trained on paired audio and text, where the model learns to generate transcriptions (or translations) token by token from audio inputs. During training, the encoder processes the audio into latent representations, and the decoder learns to predict the next text token given the previous tokens and the audio context. Unlike GPT, which relies purely on textual continuity, Whisper must also learn alignment between speech and language, making it a multimodal model trained end-to-end on large-scale audio-text datasets.
"""

FASTER_WHISPER_MD = """\
### [Faster Whisper](https://huggingface.co/Systran/faster-whisper-large-v3)

Faster-Whisper is a high-performance, inference-optimized implementation of OpenAI's Whisper model — effectively a language model grounded in audio, engineered for fast, resource-efficient deployment.

While it retains the same underlying architecture and training paradigm as Whisper — a multimodal encoder-decoder transformer trained via self-supervised next-token prediction on paired audio-text data — Faster-Whisper focuses entirely on inference-time efficiency. It uses CTranslate2, a highly optimized inference engine for transformer models, to significantly accelerate transcription and translation while reducing memory usage.

Like Whisper, Faster-Whisper takes in raw audio, encodes it into latent representations via the audio encoder, and then decodes text token by token, conditioned on both the encoded audio and previously generated tokens. However, all training is inherited directly from the original Whisper checkpoints — Faster-Whisper is not retrained, but instead recompiled and optimized for speed and portability (e.g., on CPU, GPU, or ARM devices).

As a result, Faster-Whisper makes Whisper’s powerful multilingual speech recognition capabilities more accessible in production environments, edge devices, and real-time applications where latency and efficiency are critical.
"""


def _task_intro(title, subject):
    """Return the intro Markdown of a task tab: its headings followed by both model descriptions."""
    return (
        f"# Speech-To-Text tasks: {title}\n"
        f"## compare {subject} from different models\n"
        "### Models used: \n"
        f"{WHISPER_MD}\n{FASTER_WHISPER_MD}"
    )


with gr.Blocks() as demo:   # may try themes, e.g. theme=gr.themes.Soft()
    with gr.Sidebar(position="left", width=200, visible=True):
        gr.Markdown("# Carlos' tests")
        gr.Markdown("## Audio")
        gr.Markdown("### [Speech-To-Text](https://huggingface.co/openai/whisper-large-v3)")

    gr.Markdown("# Speech-To-Text tasks")

    with gr.Tab("Transcribe"):
        with gr.Row():
            gr.Markdown("""
                # TRANSCRIPTION
                ## compare transcriptions from different models
                ### Models used: 
                """)
        # The two model descriptions are shown side by side in this tab.
        with gr.Row():
            gr.Markdown(WHISPER_MD)
            gr.Markdown(FASTER_WHISPER_MD)

        audio_input = gr.Audio(sources="upload", type="filepath", label="Upload Audio")
        output_table = gr.Dataframe(
            # Fixed: this column holds transcriptions; the header previously said
            # "Translation" (copied from the Translate tab).
            headers=["Model", "Transcription", "Duration (s)"],
            datatype=["str", "str", "number"],  # Ensure "Transcription" is string
            row_count=5,  # Adjust as needed for visible rows
            interactive=False,
        )
        transcribe_btn = gr.Button("Transcribe with All Models")
        transcribe_btn.click(compare_transcriptions, inputs=audio_input, outputs=output_table)

    with gr.Tab("Translate"):
        gr.Markdown(_task_intro("TRANSLATION", "translations"))
        audio_input = gr.Audio(sources="upload", type="filepath", label="Upload Audio")
        output_table = gr.Dataframe(
            headers=["Model", "Translation", "Duration (s)"],
            datatype=["str", "str", "number"],  # Ensure "Translation" is string
            row_count=5,  # Adjust as needed for visible rows
            interactive=False,
        )
        transcribe_btn = gr.Button("Translate with All Models")
        # TODO: this tab is still wired to compare_transcriptions, so it shows
        # transcriptions; replace with a translation handler once one exists.
        transcribe_btn.click(compare_transcriptions, inputs=audio_input, outputs=output_table)

    with gr.Tab("Detect language"):
        gr.Markdown(_task_intro("LANGUAGE DETECTION", "language detection"))
        audio_input = gr.Audio(sources="upload", type="filepath", label="Upload Audio")
        output_table = gr.Dataframe(headers=["Model", "Language", "Duration (s)"], interactive=False)
        # TODO: no click handler is attached yet, so this button does nothing.
        transcribe_btn = gr.Button("Detect Language with All Models")

demo.launch()

  from .autonotebook import tqdm as notebook_tqdm
  import pkg_resources


pygame 2.5.2 (SDL 2.28.3, Python 3.12.2)
Hello from the pygame community. https://www.pygame.org/contribute.html
* Running on local URL:  http://127.0.0.1:7865
* To create a public link, set `share=True` in `launch()`.




GPU cache cleared.
Running faster whisper model locally. 
file_path='C:\\Users\\cfune\\AppData\\Local\\Temp\\gradio\\bdf4155afd7fc989b61f424f8b2c36a309dc949201c4bfed0de733954a537a9c\\1_Audio.mp3'
 model_size='distil-large-v3'
 device='cuda'
 compute_type='float16'
 language=None
 prompt=None

Detected language en with probability 0.85693359375
GPU cache cleared.
Running whisper model locally. 
file_path='C:\\Users\\cfune\\AppData\\Local\\Temp\\gradio\\bdf4155afd7fc989b61f424f8b2c36a309dc949201c4bfed0de733954a537a9c\\1_Audio.mp3'
 model_size='large-v3'
 device='cuda'
 verbose=True
 prompt=None
 language=None

Detecting language using up to the first 30 seconds. Use `--language` to specify the language
Detected language: Spanish
[00:00.000 --> 00:05.580]  Es bien sabido que grupos criminales amasan grandes fortunas que inciden en las economías de los países.
[00:05.900 --> 00:14.220]  Para combatirla y evitar el lavado de capitales, un grupo de expertos ha brindado su experiencia en el l

In [None]:
# whisper in HF
import torch
from transformers import pipeline

# Bind the pipeline object to its own name. Assigning it to `pipeline` shadowed
# the imported factory function, so re-running this cell (or any later
# `pipeline(...)` call in the kernel) would invoke the ASR model object with
# factory arguments instead of building a pipeline.
asr_pipeline = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-large-v3-turbo",
    torch_dtype=torch.float16,
    device=0
)
# Transcribe a sample recording fetched from the Hugging Face Hub; the result is
# the cell's displayed output.
asr_pipeline("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")

In [None]:
import gradio as gr
import random

def generate_pet_name(animal_type, personality):
    """Compose a random pet name from a cute prefix and a species-specific suffix.

    Args:
        animal_type: One of "Cat", "Dog", "Bird" or "Rabbit"; any other value
            raises KeyError.
        personality: "Silly" prepends a random title to the prefix, "Royal"
            appends a random " the <epithet>" to the suffix; any other value
            leaves the name as drawn.

    Returns:
        The name as "<prefix> <suffix>".
    """
    suffixes_by_animal = {
        "Cat": ["Whiskers", "Paws", "Mittens", "Purrington"],
        "Dog": ["Woofles", "Barkington", "Waggins", "Pawsome"],
        "Bird": ["Feathers", "Wings", "Chirpy", "Tweets"],
        "Rabbit": ["Hops", "Cottontail", "Bouncy", "Fluff"],
    }

    # Draw order matters for reproducibility under a seeded RNG: prefix first,
    # then suffix, then the optional personality decoration.
    first = random.choice(
        ["Fluffy", "Ziggy", "Bubbles", "Pickle", "Waffle", "Mochi", "Cookie", "Pepper"]
    )
    last = random.choice(suffixes_by_animal[animal_type])

    if personality == "Royal":
        epithet = random.choice(["Great", "Magnificent", "Wise", "Brave"])
        last = last + " the " + epithet
    elif personality == "Silly":
        title = random.choice(["Sir", "Lady", "Captain", "Professor"])
        first = title + " " + first

    return first + " " + last

# Pet-name UI: the two option widgets live in a left sidebar, the result box
# and the button in the main area.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    with gr.Sidebar(position="left"):
        gr.Markdown("# 🐾 Pet Name Generator")
        gr.Markdown("Use the options below to generate a unique pet name!")

        # The choices mirror the animal types generate_pet_name() has suffixes for.
        animal_type = gr.Dropdown(
            choices=["Cat", "Dog", "Bird", "Rabbit"],
            label="Choose your pet type",
            value="Cat"
        )
        # "Silly" and "Royal" change the generated name; "Normal" leaves it undecorated.
        personality = gr.Radio(
            choices=["Normal", "Silly", "Royal"],
            label="Personality type",
            value="Normal"
        )

    name_output = gr.Textbox(label="Your pet's fancy name:", lines=2)
    generate_btn = gr.Button("Generate Name! 🎲", variant="primary")
    # Clicking the button passes both sidebar values to generate_pet_name() and
    # shows the returned name in the textbox.
    generate_btn.click(
        fn=generate_pet_name,
        inputs=[animal_type, personality],
        outputs=name_output
    )

demo.launch()


In [None]:
# Create the textbox before entering the Blocks context so that gr.Examples can
# reference it ahead of the point where it is placed in the layout.
input_textbox = gr.Textbox()

with gr.Blocks() as demo:
    gr.Examples(["hello", "bonjour", "merhaba"], input_textbox)
    # Place the pre-built textbox here, i.e. after the examples.
    input_textbox.render()
demo.launch()

In [2]:
# Clear the GPU cache via the project helper (it reports "GPU cache cleared." when done).
from carlos_tools_misc import clear_GPU_cache
clear_GPU_cache()

GPU cache cleared.
