**Objective:** Develop a Python-based speech-to-text system that converts spoken commands into text in real time, gives the user meaningful feedback, handles errors gracefully, and allows different recognition methods to be compared side by side.


In [12]:
# ------------------------------
# 1️⃣ Install required packages and set up the environment
#    (This part runs only once)
# ------------------------------
print("Setting up the environment...")
!pip uninstall whisper -y -q
!pip install openai-whisper vosk SpeechRecognition pydub gradio -q
!apt-get install -y ffmpeg -q

import os
import gradio as gr

# Download the Vosk model if it doesn't exist
if not os.path.exists("vosk-model-small-en-us-0.15"):
    print("Downloading Vosk model...")
    !wget https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip -q
    !unzip -q vosk-model-small-en-us-0.15.zip
    print("Vosk model downloaded and unzipped.")

# ------------------------------
# 2️⃣ Imports and Model Loading
# ------------------------------
import whisper
from vosk import Model, KaldiRecognizer
import speech_recognition as sr
import wave
import json
from pydub import AudioSegment
import traceback

print("Loading models...")
# Load models once to improve performance.
# Each engine is loaded in its own try-block so that one failure does not
# prevent the others from being created, and every name is always defined
# (None on failure) instead of surfacing later as a NameError.
whisper_model = None
vosk_model = None
recognizer = None
model_load_errors = []

try:
    whisper_model = whisper.load_model("tiny") # "tiny" for speed, "base" for better accuracy
except Exception as e:
    model_load_errors.append(f"Whisper: {e}")

try:
    vosk_model = Model("vosk-model-small-en-us-0.15")
except Exception as e:
    model_load_errors.append(f"Vosk: {e}")

try:
    recognizer = sr.Recognizer()
except Exception as e:
    model_load_errors.append(f"SpeechRecognition: {e}")

if model_load_errors:
    # Report every failure; the engines that did load remain usable.
    for load_error in model_load_errors:
        print(f"Error loading models: {load_error}")
else:
    print("Models loaded successfully.")


# ------------------------------
# 3️⃣ The Core Transcription Function (as a Generator)
# ------------------------------

def format_results(whisper, vosk, google):
    """Build the comparison text shown in the results box.

    Each engine gets a labelled section with its text wrapped in single
    quotes; sections are separated by one blank line, in the fixed order
    Whisper, Vosk, Google API.
    """
    sections = [
        f"Whisper Output:\n'{whisper}'",
        f"Vosk Output:\n'{vosk}'",
        f"Google API Output:\n'{google}'",
    ]
    return "\n\n".join(sections)

def transcribe_audio_generator(audio_path):
    """
    Transcribe one audio file with Whisper, Vosk and the Google Web Speech API,
    yielding updated results as each engine finishes.

    Args:
        audio_path: Path to the recorded/uploaded audio file, or None when the
            user submitted nothing.

    Yields:
        (results_text, status_message) tuples: `results_text` is the combined
        output of `format_results`, `status_message` describes the current stage.
        Engine errors are never raised; they are reported in the engine's slot.
    """
    import tempfile  # function-scope import: only this function needs it

    # Initial state
    whisper_text = "Pending..."
    vosk_text = "Pending..."
    google_text = "Pending..."
    # Success is tracked explicitly rather than by searching the display text
    # for "failed"/"could not understand": that heuristic counted the
    # "service is unavailable" message as a success and misclassified real
    # transcripts that happen to contain those words.
    any_engine_succeeded = False

    # Immediately show the initial state
    yield format_results(whisper_text, vosk_text, google_text), "Starting..."

    if audio_path is None:
        yield format_results("Error", "Error", "Error"), "Error: No audio provided. Please record or upload."
        return

    # --- 1. Whisper Recognition ---
    yield format_results(whisper_text, vosk_text, google_text), "Recognizing with Whisper..."
    try:
        result = whisper_model.transcribe(audio_path)
        whisper_text = result["text"].strip()
        if whisper_text:
            any_engine_succeeded = True
        else:
            whisper_text = "Whisper could not understand audio."
    except Exception as e:
        whisper_text = f"Whisper failed: {e}"

    yield format_results(whisper_text, vosk_text, google_text), "Whisper complete. Starting Vosk..."

    # A unique temp file per call, so concurrent requests cannot overwrite or
    # delete each other's converted audio.
    wav_path = None
    wav_ready = False
    try:
        # --- 2. Vosk Recognition ---
        try:
            fd, wav_path = tempfile.mkstemp(suffix=".wav")
            os.close(fd)

            # Pre-processing: convert to 16 kHz, mono, 16-bit PCM WAV for Vosk.
            audio = AudioSegment.from_file(audio_path)
            audio = audio.set_channels(1).set_frame_rate(16000).set_sample_width(2)
            # export() returns the open output file; close it so the data is
            # flushed before the file is re-read below.
            audio.export(wav_path, format="wav").close()
            wav_ready = True

            # The context manager closes the reader even if recognition fails.
            with wave.open(wav_path, "rb") as wf:
                rec = KaldiRecognizer(vosk_model, wf.getframerate())
                rec.SetWords(True)
                while True:
                    data = wf.readframes(4000)
                    if not data:
                        break
                    rec.AcceptWaveform(data)

            res = json.loads(rec.FinalResult())
            vosk_text = res.get("text", "").strip()
            if vosk_text:
                any_engine_succeeded = True
            else:
                vosk_text = "Vosk could not understand audio."
        except Exception as e:
            # Full traceback goes to the notebook/server log; the UI gets the
            # same short form as the other two engines.
            traceback.print_exc()
            vosk_text = f"Vosk failed: {e}"

        yield format_results(whisper_text, vosk_text, google_text), "Vosk complete. Starting Google API..."

        # --- 3. Google Speech Recognition API ---
        # sr.AudioFile only reads WAV/AIFF/FLAC, so prefer the converted WAV;
        # fall back to the original file if the conversion did not succeed.
        google_source = wav_path if wav_ready else audio_path
        try:
            with sr.AudioFile(google_source) as source:
                audio_data = recognizer.record(source)
            google_text = recognizer.recognize_google(audio_data).strip()
            if google_text:
                any_engine_succeeded = True
            else:
                google_text = "Google API could not understand audio."
        except sr.UnknownValueError:
            google_text = "Google API could not understand audio. Please try speaking more clearly."
        except sr.RequestError:
            google_text = "Google API service is unavailable. Check your internet connection."
        except Exception as e:
            google_text = f"Google API failed: {e}"
    finally:
        # Runs on normal completion and when the generator is closed early.
        if wav_path is not None and os.path.exists(wav_path):
            os.remove(wav_path)

    # --- 4. Final Results and Status ---
    if any_engine_succeeded:
        status_message = "Speech successfully converted to text!"
    else:
        status_message = "Speech recognition could not understand the audio. Please try again."

    yield format_results(whisper_text, vosk_text, google_text), status_message


# ------------------------------
# 4️⃣ Create the Gradio Interface
# ------------------------------
# Build the page top to bottom: components appear in the order they are created
# inside the Blocks context, so statement order here is the page layout.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎤 Speech to Text Comparison 📝")
    gr.Markdown("Upload an audio file or use your microphone. You will see the results appear one by one as they are processed.")

    with gr.Row():
        # type="filepath": the value handed to the callback is used as a file
        # path by transcribe_audio_generator (its `audio_path` argument).
        audio_input = gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            label="Speak something or upload an audio file..."
        )

    submit_button = gr.Button("Transcribe Audio")

    gr.Markdown("---")
    gr.Markdown("### Results")

    with gr.Row():
        # Read-only boxes; they receive the two values of each yielded tuple.
        output_results = gr.Textbox(label="Comparative Results", lines=10, interactive=False)
        output_status = gr.Textbox(label="Status", interactive=False)

    # The .click() event will automatically handle the generator function,
    # updating the outputs each time the function yields a value.
    # The order of `outputs` matches the (results_text, status_message) tuples
    # yielded by transcribe_audio_generator.
    submit_button.click(
        fn=transcribe_audio_generator,
        inputs=audio_input,
        outputs=[output_results, output_status]
    )

# Launch the app
# debug=True keeps this cell running so errors and logs stay visible
# (per Gradio's own notice in the recorded output); interrupt the cell to stop the server.
demo.launch(debug=True)

Setting up the environment...
[0mReading package lists...
Building dependency tree...
Reading state information...
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 38 not upgraded.
Loading models...
Models loaded successfully.
It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://65c1e403abb9529006.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7861 <> https://65c1e403abb9529006.gradio.live




In [2]:
pip install whisper

Collecting whisper
  Downloading whisper-1.1.10.tar.gz (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: whisper
  Building wheel for whisper (setup.py) ... [?25l[?25hdone
  Created wheel for whisper: filename=whisper-1.1.10-py3-none-any.whl size=41120 sha256=784022377c59695837cf4c7177ca430cfd709a33b1c095dac8a81e51059280f4
  Stored in directory: /root/.cache/pip/wheels/34/b8/4e/9c4c3351d670e06746a340fb4b7d854c76517eec225e5b32b1
Successfully built whisper
Installing collected packages: whisper
Successfully installed whisper-1.1.10


In [4]:
# Installs the Vosk package on its own (the setup cell's combined install
# command also lists `vosk`, so this cell is redundant on a fresh run).
pip install vosk

Collecting vosk
  Downloading vosk-0.3.45-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (1.8 kB)
Collecting srt (from vosk)
  Downloading srt-3.5.3.tar.gz (28 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading vosk-0.3.45-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (7.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m63.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: srt
  Building wheel for srt (setup.py) ... [?25l[?25hdone
  Created wheel for srt: filename=srt-3.5.3-py3-none-any.whl size=22427 sha256=da4763386640c6442e21005c10d5bd39a2180c9bc0e8105682b626c7f2e2d82e
  Stored in directory: /root/.cache/pip/wheels/7e/75/5b/e1d5c3756631e4bda806f6cc9640153b39484bb6f7b0b8def3
Successfully built srt
Installing collected packages: srt, vosk
Successfully installed srt-3.5.3 vosk-0.3.45


In [8]:
# Installs SpeechRecognition, imported in this notebook as `speech_recognition`
# (also listed in the setup cell's combined install command).
pip install SpeechRecognition


Collecting SpeechRecognition
  Downloading speechrecognition-3.14.3-py3-none-any.whl.metadata (30 kB)
Downloading speechrecognition-3.14.3-py3-none-any.whl (32.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m54.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.14.3


In [12]:
!pip uninstall whisper -y
!pip install openai-whisper


Found existing installation: whisper 1.1.10
Uninstalling whisper-1.1.10:
  Successfully uninstalled whisper-1.1.10
Collecting openai-whisper
  Downloading openai_whisper-20250625.tar.gz (803 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: openai-whisper
  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone
  Created wheel for openai-whisper: filename=openai_whisper-20250625-py3-none-any.whl size=803979 sha256=342f70e1598f6ed7de522dc149d6e7d2c0b97f17c04d3f741124f9ba7cd2485d
  Stored in directory: /root/.cache/pip/wheels/61/d2/20/09ec9bef734d126cba375b15898010b6cc28578d8afdde5869
Successfully built openai-whisper
Installing collected packages: openai-whisper
Successfully installe