In [None]:
#beyond the interface class, working with blocks
import gradio as gr


def greet(name):
    return "Hello " + name + "!"


with gr.Blocks() as demo:
    name = gr.Textbox(label="Name")
    output = gr.Textbox(label="Output Box")
    greet_btn = gr.Button("Greet")
    greet_btn.click(fn=greet, inputs=name, outputs=output, api_name="greet")

demo.launch()


  from .autonotebook import tqdm as notebook_tqdm


* Running on local URL:  http://127.0.0.1:7883
* To create a public link, set `share=True` in `launch()`.




In [2]:
# event listeners using decorators
import gradio as gr

with gr.Blocks() as demo:
    name = gr.Textbox(label="Name")
    output = gr.Textbox(label="Output Box")
    greet_btn = gr.Button("Greet")

    @greet_btn.click(inputs=name, outputs=output)
    def greet(name):
        return "Hello " + name + "!"

demo.launch()

* Running on local URL:  http://127.0.0.1:7884
* To create a public link, set `share=True` in `launch()`.




In [3]:
import gradio as gr

def welcome(name):
    return f"Welcome to Gradio, {name}!"

with gr.Blocks() as demo:
    gr.Markdown(
    """
    # Hello World!
    Start typing below to see the output.
    """)
    inp = gr.Textbox(placeholder="What is your name?")
    out = gr.Textbox()
    inp.change(welcome, inp, out)

demo.launch()


* Running on local URL:  http://127.0.0.1:7885
* To create a public link, set `share=True` in `launch()`.




In [4]:
import gradio as gr

def increase(num):
    return num + 1

with gr.Blocks() as demo:
    a = gr.Number(label="a")
    b = gr.Number(label="b")
    atob = gr.Button("a > b")
    btoa = gr.Button("b > a")
    atob.click(increase, a, b)
    btoa.click(increase, b, a)

demo.launch()


* Running on local URL:  http://127.0.0.1:7886
* To create a public link, set `share=True` in `launch()`.




In [6]:
from transformers import pipeline

import gradio as gr

asr = pipeline("automatic-speech-recognition", "facebook/wav2vec2-base-960h")
classifier = pipeline("text-classification")

def speech_to_text(speech):
    text = asr(speech)["text"]  
    return text

def text_to_sentiment(text):
    return classifier(text)[0]["label"]  

demo = gr.Blocks()

with demo:
    audio_file = gr.Audio(type="filepath")
    text = gr.Textbox()
    label = gr.Label()

    b1 = gr.Button("Recognize Speech")
    b2 = gr.Button("Classify Sentiment")

    b1.click(speech_to_text, inputs=audio_file, outputs=text)
    b2.click(text_to_sentiment, inputs=text, outputs=label)

demo.launch()


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0


* Running on local URL:  http://127.0.0.1:7888
* To create a public link, set `share=True` in `launch()`.




In [7]:
#multiple inputs components
import gradio as gr

with gr.Blocks() as demo:
    a = gr.Number(label="a")
    b = gr.Number(label="b")
    with gr.Row():
        add_btn = gr.Button("Add")
        sub_btn = gr.Button("Subtract")
    c = gr.Number(label="sum")

    def add(num1, num2):
        return num1 + num2
    add_btn.click(add, inputs=[a, b], outputs=c)

    def sub(data):
        return data[a] - data[b]
    sub_btn.click(sub, inputs={a, b}, outputs=c)

demo.launch()


* Running on local URL:  http://127.0.0.1:7889
* To create a public link, set `share=True` in `launch()`.




In [3]:
clear_GPU_cache()

GPU cache cleared.


In [1]:
import gradio as gr
import time
import numpy as np
from carlos_tools_audio import OpenAI_transcribe, local_whisper_transcribe, local_faster_whisper_transcribe
from carlos_tools_misc import clear_GPU_cache
import tempfile
import soundfile as sf

# Import your local whisper and faster-whisper models
from transformers import pipeline
import faster_whisper

# Dummy GPT remote transcription function (replace with your actual API call)
def gpt_transcribe(path):
    response = OpenAI_transcribe(
        path,
        model="whisper-1",
        response_format="text"
    )
    text=response["text"]
    duration=response["inference_time"]
    return text, duration

# Local Whisper
def whisper_transcribe(path):
    clear_GPU_cache()
    response = local_whisper_transcribe(
        path,
        model_size="large-v3",
    )
    text= response["text"]
    duration= response["inference_time"]
    return text, duration

# Faster Whisper
def faster_whisper_transcribe(path):
    clear_GPU_cache()
    response = local_faster_whisper_transcribe(
        path,
        model_size="distil-large-v3",
    )
    text = response["text"]
    duration = response["inference_time"]
    return text, duration

def compare_transcriptions(path):
    gpt_text, gpt_time = gpt_transcribe(path)
    faster_text, faster_time = faster_whisper_transcribe(path)
    whisper_text, whisper_time = whisper_transcribe(path)
    table = [
        ["Model", "Transcription", "Duration (s)"],
        ["GPT (remote)", gpt_text, round(gpt_time, 2)],
        ["Whisper (local)", whisper_text, round(whisper_time, 2)],
        ["Faster Whisper (local)", faster_text, round(faster_time, 2)],
    ]
    return table

with gr.Blocks() as demo:
    gr.Markdown("# Audio Transcription Comparison")
    audio_input = gr.Audio(sources="upload", type= "filepath", label="Upload Audio")
    output_table = gr.Dataframe(headers=["Model", "Transcription", "Duration (s)"], interactive=False)
    transcribe_btn = gr.Button("Transcribe with All Models")
    transcribe_btn.click(compare_transcriptions, inputs=audio_input, outputs=output_table)

demo.launch()

  from .autonotebook import tqdm as notebook_tqdm


pygame 2.5.2 (SDL 2.28.3, Python 3.12.2)
Hello from the pygame community. https://www.pygame.org/contribute.html
* Running on local URL:  http://127.0.0.1:7883
* To create a public link, set `share=True` in `launch()`.




GPU cache cleared.
Running faster whisper model locally. 
file_path='C:\\Users\\cfune\\AppData\\Local\\Temp\\gradio\\e1442c6f73c6328e767a89521b02f7ef68af14e6bac3e4168b9202d9e244855d\\test.wav'
 model_size='distil-large-v3'
 device='cuda'
 compute_type='float16'
 language=None
 prompt=None

Detected language en with probability 0.98681640625
GPU cache cleared.
Running whisper model locally. 
file_path='C:\\Users\\cfune\\AppData\\Local\\Temp\\gradio\\e1442c6f73c6328e767a89521b02f7ef68af14e6bac3e4168b9202d9e244855d\\test.wav'
 model_size='large-v3'
 device='cuda'
 verbose=True
 prompt=None
 language=None



NVIDIA GeForce RTX 5060 Ti with CUDA capability sm_120 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_37 sm_50 sm_60 sm_61 sm_70 sm_75 sm_80 sm_86 sm_90 compute_37.
If you want to use the NVIDIA GeForce RTX 5060 Ti GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/



Detecting language using up to the first 30 seconds. Use `--language` to specify the language
Detected language: English
[00:00.000 --> 00:08.320]  This is a test. 1 2 3 4 5 6 7 8 1 2 3 1 2 3


In [4]:
# whisper in HF
import torch
from transformers import pipeline

pipeline = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-large-v3-turbo",
    torch_dtype=torch.float16,
    device=0
)
pipeline("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Device set to use cuda:0
Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
  attn_output = torch.nn.functional.scaled_dot_product_attention(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.'}