In [7]:
pip install -U openai-whisper

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [11]:
!pip install sounddevice




[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\python.exe -m pip install --upgrade pip


In [1]:
# import gradio as gr
# import sounddevice as sd
# import numpy as np
# import scipy.io.wavfile as wav
# import tempfile
# import os
# import whisper

# # Load the Whisper model
# model = whisper.load_model("base")

# def record_audio(duration=5, samplerate=44100):
#     """Records audio for a given duration and returns the file path."""
#     print("Recording...")
#     audio_data = sd.rec(int(duration * samplerate), samplerate=samplerate, channels=1, dtype=np.int16)
#     sd.wait()
#     print("Recording finished.")

#     # Save to a temporary WAV file
#     temp_wav_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
#     wav.write(temp_wav_file.name, samplerate, audio_data)
#     return temp_wav_file.name

# def transcribe_audio(file_path):
#     """Transcribes the uploaded or recorded audio file using Whisper."""
#     print(f"Processing file: {file_path}")
#     result = model.transcribe(file_path)
#     return result["text"]

# # Gradio UI
# with gr.Blocks() as app:
#     gr.Markdown("## 🎙️ Speech-to-Text Transcription")

#     with gr.Row():
#         record_btn = gr.Button("🎤 Record Audio")
#         upload_btn = gr.File(label="📂 Upload Audio File", type="filepath")

#     with gr.Row():
#         output_text = gr.Textbox(label="📝 Transcription Output", placeholder="Your transcribed text will appear here...")

#     # Handle recording
#     def record_and_transcribe():
#         audio_path = record_audio()
#         return transcribe_audio(audio_path)

#     # Button Click Events
#     record_btn.click(record_and_transcribe, outputs=output_text)
#     upload_btn.change(transcribe_audio, inputs=upload_btn, outputs=output_text)

# # Launch the app
# app.launch()

import gradio as gr
import sounddevice as sd
import numpy as np
import scipy.io.wavfile as wav
import tempfile
import os
import whisper
import matplotlib.pyplot as plt

# Whisper checkpoint shared by every transcription request
model = whisper.load_model("base")

# Language names offered in the UI dropdown
languages = [
    "English",
    "Spanish",
    "French",
    "German",
    "Chinese",
    "Hindi",
]

def record_audio(duration=5, samplerate=44100, noise_reduction=False):
    """Record mono 16-bit audio and save it to a temporary WAV file.

    Args:
        duration: Recording length in seconds (may be fractional).
        samplerate: Sampling rate in Hz.
        noise_reduction: When True, remove the DC offset (subtract the mean
            sample value) *before* the WAV file is written, so the saved file
            and the returned samples are the same signal.

    Returns:
        A ``(wav_path, audio_data)`` tuple: the path of the WAV file (the
        caller is responsible for deleting it) and the int16 samples that
        were written to it.
    """
    print("🎙️ Recording started...")
    audio_data = sd.rec(int(duration * samplerate), samplerate=samplerate, channels=1, dtype=np.int16)
    sd.wait()
    print("✅ Recording finished.")

    # Noise Reduction (Basic): must happen before saving, otherwise the file
    # that gets transcribed and played back is still the unfiltered recording.
    if noise_reduction and audio_data.size:
        centered = audio_data.astype(np.float64) - np.mean(audio_data)
        # Round and clip back into int16 so the WAV stays 16-bit PCM and the
        # subtraction cannot wrap around at the extremes.
        int16_range = np.iinfo(np.int16)
        audio_data = np.clip(np.rint(centered), int16_range.min, int16_range.max).astype(np.int16)

    # Reserve a file name, then release the handle before writing by name:
    # keeping it open leaks a descriptor and can block the re-open on Windows.
    temp_wav_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    temp_wav_file.close()
    wav.write(temp_wav_file.name, samplerate, audio_data)

    return temp_wav_file.name, audio_data

def transcribe_audio(file_path, language):
    """Transcribe an audio file with the module-level Whisper ``model``.

    Args:
        file_path: Path of the audio file. A falsy value (``None`` or ``""``)
            means "no file" and yields an empty transcription.
        language: One of the display names in ``lang_codes`` below. A falsy
            value (e.g. a cleared dropdown) means no language hint is passed
            to the model.

    Returns:
        The transcribed text, or ``""`` when there is no file to transcribe.

    Raises:
        KeyError: If ``language`` is a non-empty name that is not supported.
    """
    # The upload component's change event also fires when the file is
    # cleared, in which case there is nothing to transcribe.
    if not file_path:
        return ""

    lang_codes = {
        "English": "en",
        "Spanish": "es",
        "French": "fr",
        "German": "de",
        "Chinese": "zh",
        "Hindi": "hi",
    }

    if not language:
        # No selection: call the model without a language hint.
        print(f"📝 Transcribing {file_path}...")
        result = model.transcribe(file_path)
    else:
        print(f"📝 Transcribing {file_path} in {language}...")
        result = model.transcribe(file_path, language=lang_codes[language])
    return result["text"]

def plot_waveform(audio_data):
    """Render the audio samples as a waveform image.

    Args:
        audio_data: Sequence or array of samples to plot against their index.

    Returns:
        Path of a temporary PNG file containing the plot (the caller is
        responsible for deleting it).
    """
    # Reserve the output name first and release the handle right away, so the
    # figure can be saved by name without leaving a descriptor open.
    temp_plot = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
    temp_plot.close()

    # Work on an explicit figure/axes pair instead of pyplot's implicit
    # "current figure", which is shared global state.
    fig, ax = plt.subplots(figsize=(8, 2))
    try:
        ax.plot(audio_data, color="purple")
        ax.set_title("Audio Waveform")
        ax.set_xlabel("Time")
        ax.set_ylabel("Amplitude")
        ax.grid()

        # Save plot as image
        fig.savefig(temp_plot.name)
    finally:
        # Always release the figure, even if plotting or saving fails.
        plt.close(fig)
    return temp_plot.name

# Gradio UI
with gr.Blocks() as app:
    gr.Markdown("## 🎙️ Speech-to-Text Transcription App")

    # User Input Controls
    with gr.Row():
        duration_input = gr.Slider(1, 10, value=5, label="⏳ Recording Duration (seconds)")
        noise_reduction_toggle = gr.Checkbox(label="🔇 Apply Noise Reduction")
        language_select = gr.Dropdown(languages, label="🌍 Select Language", value="English")

    with gr.Row():
        record_btn = gr.Button("🎤 Record Audio")
        upload_btn = gr.File(label="📂 Upload Audio File", type="filepath")

    with gr.Row():
        audio_player = gr.Audio(label="🔊 Audio Playback", interactive=False)
        waveform_img = gr.Image(label="📈 Audio Waveform")

    with gr.Row():
        output_text = gr.Textbox(label="📝 Transcription Output", placeholder="Your transcribed text will appear here...")

    download_btn = gr.Button("⬇️ Download Transcription")
    # Named output slot for the generated .txt file, instead of an anonymous
    # component created inside the click() call.
    download_file = gr.File()

    # Handle recording
    def record_and_transcribe(duration, noise_reduction, language):
        """Record from the microphone, then return (audio path, waveform image path, transcription)."""
        audio_path, audio_data = record_audio(duration, noise_reduction=noise_reduction)
        transcription = transcribe_audio(audio_path, language)
        waveform_path = plot_waveform(audio_data)
        return audio_path, waveform_path, transcription

    # Handle download
    def save_transcription(text):
        """Write the transcription to a temporary .txt file and return its path.

        The file is written as UTF-8 explicitly: the platform default encoding
        (for example cp1252 on Windows) cannot represent Hindi or Chinese
        transcriptions. A missing transcription produces an empty file.
        """
        with tempfile.NamedTemporaryFile(
            mode="w", encoding="utf-8", delete=False, suffix=".txt"
        ) as transcript_file:
            transcript_file.write(text or "")
        return transcript_file.name

    # Button Click Events
    record_btn.click(
        record_and_transcribe,
        inputs=[duration_input, noise_reduction_toggle, language_select],
        outputs=[audio_player, waveform_img, output_text],
    )
    upload_btn.change(transcribe_audio, inputs=[upload_btn, language_select], outputs=output_text)
    download_btn.click(save_transcription, inputs=output_text, outputs=download_file)

# Launch the app
app.launch()


  from .autonotebook import tqdm as notebook_tqdm


* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




In [17]:
pip install matplotlib

Collecting matplotlib
  Downloading matplotlib-3.10.0-cp310-cp310-win_amd64.whl (8.0 MB)
     ---------------------------------------- 8.0/8.0 MB 14.7 MB/s eta 0:00:00
Collecting kiwisolver>=1.3.1
  Downloading kiwisolver-1.4.8-cp310-cp310-win_amd64.whl (71 kB)
     ---------------------------------------- 71.9/71.9 kB 2.0 MB/s eta 0:00:00
Collecting cycler>=0.10
  Downloading cycler-0.12.1-py3-none-any.whl (8.3 kB)
Collecting contourpy>=1.0.1
  Downloading contourpy-1.3.1-cp310-cp310-win_amd64.whl (218 kB)
     -------------------------------------- 218.6/218.6 kB 1.7 MB/s eta 0:00:00
Collecting fonttools>=4.22.0
  Downloading fonttools-4.56.0-cp310-cp310-win_amd64.whl (2.2 MB)
     ---------------------------------------- 2.2/2.2 MB 17.5 MB/s eta 0:00:00
Collecting pyparsing>=2.3.1
  Downloading pyparsing-3.2.1-py3-none-any.whl (107 kB)
     -------------------------------------- 107.7/107.7 kB 6.1 MB/s eta 0:00:00
Installing collected packages: pyparsing, kiwisolver, fonttools, cycl


[notice] A new release of pip available: 22.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [14]:
pip install scipy

Collecting scipy
  Downloading scipy-1.15.1-cp310-cp310-win_amd64.whl (43.9 MB)
     --------------------------------------- 43.9/43.9 MB 18.2 MB/s eta 0:00:00
Installing collected packages: scipy
Successfully installed scipy-1.15.1
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip
