<a href="https://colab.research.google.com/github/anshika0601/ai_voice_assistant/blob/main/ai_voice_assistant.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import os
import tempfile
import whisper
from pydub import AudioSegment
from gtts import gTTS
from dotenv import load_dotenv
from google.colab import userdata
import gradio as gr
from langchain_groq import ChatGroq
from langchain_core.messages import HumanMessage

# ------------------ Environment ------------------
# Load variables from a local .env file (if present) into the process
# environment so the fallback lookup below can see them.
load_dotenv()

# Prefer the Colab secret. The lookup is wrapped because userdata.get is
# documented to raise (rather than return None) when the secret is missing or
# notebook access has not been granted; swallowing that here lets the
# environment fallback and the explicit ValueError below run instead.
try:
    GROQ_API_KEY = userdata.get("GROQ_API_KEY")
except Exception:
    GROQ_API_KEY = None

# Fall back to the environment (which includes anything load_dotenv() loaded).
GROQ_API_KEY = GROQ_API_KEY or os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise ValueError("GROQ_API_KEY not found in Colab Secrets or environment. Please add it.")

# ------------------ Whisper ------------------
# Speech-to-text model, loaded once when this cell runs and reused by
# transcribe(). The recorded output of this cell shows a ~1.42G download
# on first load.
whisper_model = whisper.load_model(name="medium")

def convert_to_wav(audio_path: str) -> str:
    """Convert an audio file to a 16 kHz mono WAV and return the new file's path.

    Args:
        audio_path: Path of the source audio file, in any format that
            ``AudioSegment.from_file`` can read.

    Returns:
        Path of a newly created temporary ``.wav`` file. The caller owns the
        file and is responsible for deleting it.

    Raises:
        Any exception raised while reading or exporting the audio is
        propagated; the temporary file is removed first so nothing is leaked.
    """
    # mkstemp creates the file atomically; the deprecated tempfile.mktemp only
    # returns a name, leaving a window in which another process can claim it.
    fd, wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)  # only the path is needed; export() reopens the file itself
    try:
        audio = AudioSegment.from_file(audio_path)
        audio = audio.set_frame_rate(16000).set_channels(1)
        audio.export(wav_path, format="wav")
    except Exception:
        # Do not leave an empty or partial temp file behind on failure.
        os.remove(wav_path)
        raise
    return wav_path

def transcribe(audio_path: str) -> str:
    """Transcribe an audio file to text with the module-level Whisper model.

    The input is first converted to a temporary WAV via ``convert_to_wav``;
    that temporary file is deleted once transcription finishes or fails, so
    repeated requests do not pile up files in the temp directory.

    Args:
        audio_path: Path of the recorded audio file.

    Returns:
        The ``"text"`` entry of the Whisper transcription result.
    """
    wav_path = convert_to_wav(audio_path)
    try:
        result = whisper_model.transcribe(wav_path)
    finally:
        # Best-effort cleanup: a failure to delete the temp file must not mask
        # the transcription result or the original exception.
        try:
            os.remove(wav_path)
        except OSError:
            pass
    return result["text"]

# ------------------ Groq LLM ------------------
# Shared Groq chat client used by generate_reply(); authenticated with the
# key resolved in the Environment section above.
llm = ChatGroq(
    temperature=0.2,
    model_name="llama-3.1-8b-instant",
    groq_api_key=GROQ_API_KEY,
)

def generate_reply(user_text: str) -> str:
    """Ask the module-level chat model for a reply to ``user_text``.

    Args:
        user_text: The user's message, sent as a single human message.

    Returns:
        The ``content`` attribute of the model's response.
    """
    conversation = [HumanMessage(content=user_text)]
    answer = llm.invoke(conversation)
    return answer.content

# ------------------ TTS ------------------
def speak(text: str, lang: str = "hi") -> str:
    """Synthesize ``text`` to speech with gTTS and return the MP3 file's path.

    Args:
        text: The text to speak.
        lang: gTTS language code. Defaults to ``"hi"`` (Hindi); pass ``"en"``
            for English.

    Returns:
        Path of a newly created temporary ``.mp3`` file holding the audio.

    Raises:
        Any exception raised by gTTS is propagated; the temporary file is
        removed first so a failed request does not leak it.
    """
    # mkstemp creates the file atomically; the deprecated tempfile.mktemp only
    # returns a name, leaving a window in which another process can claim it.
    fd, out_path = tempfile.mkstemp(suffix=".mp3")
    os.close(fd)  # only the path is needed; save() reopens the file itself
    try:
        tts = gTTS(text=text, lang=lang)
        tts.save(out_path)
    except Exception:
        os.remove(out_path)
        raise
    return out_path

# ------------------ Pipeline ------------------
def pipeline(audio_path):
    """Run one voice turn: transcribe the recording, get a reply, and speak it.

    Args:
        audio_path: Path of the recorded audio file, or a falsy value when
            nothing was recorded (for example after the recording is cleared).

    Returns:
        A 3-tuple ``(user_text, reply_text, reply_audio_path)`` matching the
        three UI outputs. ``reply_audio_path`` is ``None`` whenever there is
        nothing to play.
    """
    if not audio_path:
        return "⚠️ No audio captured.", "", None

    # Strip surrounding whitespace so that silence or noise, which can
    # transcribe to an empty or blank string, is detected reliably.
    user_text = transcribe(audio_path).strip()
    if not user_text:
        # Nothing intelligible was said: skip the LLM and TTS calls instead of
        # sending them an empty prompt.
        return "", "⚠️ Could not understand the audio. Please try again.", None

    reply_text = generate_reply(user_text)
    # Only synthesize speech when there is actual text to speak.
    reply_audio = speak(reply_text) if reply_text and reply_text.strip() else None
    return user_text, reply_text, reply_audio

# ------------------ Gradio UI ------------------
# Build the single-page UI: a title/subtitle, a wide left column holding the
# recorder, the conversation text boxes and the reply player, and a narrow
# right column with usage instructions.
# NOTE(review): the `.chatbox` CSS class is defined below but no component in
# this cell references it — confirm whether it is still needed.
with gr.Blocks(css="""
    .big-title {font-size: 28px; font-weight: bold; text-align: center;}
    .subtitle {font-size: 18px; color: #555; text-align: center; margin-bottom: 20px;}
    .chatbox {background: #f9f9f9; border-radius: 12px; padding: 12px; margin-bottom: 8px;}
""", title="🎙️ AI Voice Assistant") as demo:

    # Page header, styled by the CSS classes declared above.
    gr.Markdown("<div class='big-title'>🤖 AI Voice Assistant</div>")
    gr.Markdown("<div class='subtitle'>Powered by Whisper + Groq + GTTS</div>")

    with gr.Row():
        # Left column (scale=2): input, transcript/reply text, and audio reply.
        with gr.Column(scale=2):
            # Microphone-only input; type="filepath" means pipeline() receives
            # the path of the recorded file.
            mic = gr.Audio(
                sources=["microphone"],
                type="filepath",
                label="🎤 Tap to Record",
            )

            # Read-only text boxes showing what was heard and what was answered.
            with gr.Accordion("📜 Conversation", open=True):
                user_tb = gr.Textbox(label="🧑 You Said", interactive=False)
                reply_tb = gr.Textbox(label="🤖 Assistant Replied", interactive=False)

            # Player for the synthesized reply; fed the file path returned by pipeline().
            speaker = gr.Audio(type="filepath", label="🔊 Assistant Voice Reply")

            # Run the full pipeline whenever the recorder's value changes; the
            # three values pipeline() returns fill the three outputs in order.
            mic.change(fn=pipeline, inputs=[mic], outputs=[user_tb, reply_tb, speaker])

        # Right column (scale=1): static usage instructions.
        with gr.Column(scale=1):
            gr.Markdown(
                """
                ### 📝 How to Use
                1. **Click the microphone** 🎤 and speak clearly.
                2. The assistant will:
                   - 📝 Transcribe your voice
                   - 💡 Generate a smart reply (Groq LLM)
                   - 🔊 Speak the reply aloud using GTTS
                3. Speak in **English for best accuracy**.
                """
            )

# Launch the Gradio app when this cell/script runs as the main module.
# NOTE(review): the recorded output of this cell shows Gradio auto-enabling
# share=True on the hosted notebook and suggesting debug=True to surface
# errors in Colab — consider passing these explicitly.
if __name__ == "__main__":
    demo.launch()


100%|█████████████████████████████████████| 1.42G/1.42G [00:40<00:00, 38.1MiB/s]


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://9131853eddc5873e23.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [2]:
!pip install openai-whisper gtts pydub gradio

Collecting openai-whisper
  Downloading openai_whisper-20250625.tar.gz (803 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/803.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.4/803.2 kB[0m [31m2.9 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━[0m [32m645.1/803.2 kB[0m [31m9.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting gtts
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Collecting click<8.2,>=7.1 (from gtts)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Downloading gTTS-2.5.4-py3-none-any.whl (29 kB)
Downl

In [4]:
!pip install langchain langchain-core



In [6]:
!pip install langchain_groq

Collecting langchain_groq
  Downloading langchain_groq-0.3.7-py3-none-any.whl.metadata (2.6 kB)
Collecting groq<1,>=0.30.0 (from langchain_groq)
  Downloading groq-0.31.0-py3-none-any.whl.metadata (16 kB)
Downloading langchain_groq-0.3.7-py3-none-any.whl (16 kB)
Downloading groq-0.31.0-py3-none-any.whl (131 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.4/131.4 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq, langchain_groq
Successfully installed groq-0.31.0 langchain_groq-0.3.7
