# Installation

In [1]:
!pip install streamlit opencv-python scipy pydub langchain_groq gtts TTS pyngrok ffmpeg-python SpeechRecognition gradio

Collecting streamlit
  Downloading streamlit-1.43.2-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting langchain_groq
  Downloading langchain_groq-0.2.5-py3-none-any.whl.metadata (2.6 kB)
Collecting gtts
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Collecting TTS
  Downloading TTS-0.22.0-cp311-cp311-manylinux1_x86_64.whl.metadata (21 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Collecting ffmpeg-python
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Collecting SpeechRecognition
  Downloading SpeechRecognition-3.14.1-py3-none-any.whl.metadata (31 kB)
Collecting gradio
  Downloading gradio-5.21.0-py3-none-any.whl.metadata (16 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[

# Streamlit Interface

In [4]:
%%writefile app.py
import os
import io
import base64
import tempfile
import concurrent.futures
import streamlit as st
import speech_recognition as sr
from gtts import gTTS
from langchain_groq import ChatGroq
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate
from TTS.api import TTS
import soundfile as sf

# -----------------------------------------------------------------------------------------
#----------Voice Cloning Setup
#--------------Write the reference audio to a temporary file.
#------------------Generate TTS output with the cloned voice.
# -----------------------------------------------------------------------------------------

@st.cache_resource
def load_voice_cloning_model():
    """Load the voice cloning model once and cache it.

    Returns:
        The ``TTS`` instance built for
        ``tts_models/multilingual/multi-dataset/your_tts``. The function is
        wrapped in ``st.cache_resource``, so the model is constructed on the
        first call and reused afterwards.
    """
    # Local import: torch is only needed here to probe for CUDA, and it is
    # installed alongside the TTS package.
    import torch

    # Asking for the GPU unconditionally breaks on CPU-only runtimes, so only
    # request it when CUDA is actually available.
    use_gpu = torch.cuda.is_available()
    return TTS(
        model_name="tts_models/multilingual/multi-dataset/your_tts",
        progress_bar=False,
        gpu=use_gpu,
    )

@st.cache_resource
def load_reference_voice():
    """Read the preloaded reference voice recording and return its raw bytes.

    The file is opened in binary mode from a fixed path; the result is cached
    by ``st.cache_resource`` so the file is read only once.
    """
    with open("/content/tom cruise.mp3", "rb") as voice_file:
        voice_bytes = voice_file.read()
    return voice_bytes

def generate_cloned_audio(text, audio_bytes, tts_model):
    """Generate audio using the voice cloning model and in-memory reference audio.

    Args:
        text: The text to synthesize.
        audio_bytes: Raw bytes of the reference recording; they are decoded
            with ``sf.read`` and re-written as a temporary ``.wav`` file.
        tts_model: Object exposing ``tts_to_file(text=, speaker_wav=,
            language=, file_path=)``.

    Returns:
        The bytes of the synthesized ``.wav`` file.

    Raises:
        Whatever ``sf.read``, ``sf.write`` or ``tts_model.tts_to_file`` raise.
        Both temporary files are removed before the exception propagates.
    """
    ref_path = None
    out_path = None
    try:
        # Only the file *names* are needed; close each handle immediately so
        # the path can be reopened for writing by other code on any platform.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as ref_fp:
            ref_path = ref_fp.name
        data, samplerate = sf.read(io.BytesIO(audio_bytes))
        sf.write(ref_path, data, samplerate)

        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_fp:
            out_path = out_fp.name
        tts_model.tts_to_file(
            text=text,
            speaker_wav=ref_path,
            language="en",
            file_path=out_path,
        )

        # Read through a context manager so the handle is always closed.
        with open(out_path, "rb") as generated_fp:
            return generated_fp.read()
    finally:
        # delete=False means cleanup is our job, on success and on failure.
        for path in (ref_path, out_path):
            if path is not None and os.path.exists(path):
                os.unlink(path)

# -----------------------------------------------------------------------------------------
# Helper Functions
# -----------------------------------------------------------------------------------------
def transcribe_audio(audio_bytes):
    """Return the text recognized in ``audio_bytes``.

    The bytes are wrapped in an in-memory stream, loaded through
    ``sr.AudioFile``, and passed to the recognizer's ``recognize_google``
    method; any exception from that pipeline propagates to the caller.
    """
    speech_engine = sr.Recognizer()
    audio_stream = io.BytesIO(audio_bytes)
    with sr.AudioFile(audio_stream) as audio_source:
        captured_audio = speech_engine.record(audio_source)
    transcript = speech_engine.recognize_google(captured_audio)
    return transcript

def process_llm_query(recognized_text, api_key):
    """Send the recognized query through a ChatGroq chain and return the reply.

    A two-message prompt (system persona + user input) is piped into
    ``ChatGroq`` created with ``api_key`` and then into ``StrOutputParser``;
    the chain is invoked with ``recognized_text`` as ``input``.
    """
    messages = [
        ('system', 'You are a helpful assistant, your name is BlueAssistant, developed by Ayush Kumar'),
        ('user', '{input}'),
    ]
    pipeline = (
        ChatPromptTemplate.from_messages(messages)
        | ChatGroq(api_key=api_key)
        | StrOutputParser()
    )
    return pipeline.invoke({'input': recognized_text})

def generate_tts(response_text, reference_audio=None):
    """
    Turn ``response_text`` into audio bytes.

    With a truthy ``reference_audio`` the cached voice cloning model is used;
    otherwise the text is synthesized with gTTS (English) into an in-memory
    buffer. Any exception is reported through ``st.error`` and ``None`` is
    returned instead of audio.
    """
    try:
        if not reference_audio:
            # No reference voice: fall back to the default gTTS voice.
            speech = gTTS(text=response_text, lang='en')
            buffer = io.BytesIO()
            speech.write_to_fp(buffer)
            return buffer.getvalue()

        cloning_model = load_voice_cloning_model()
        return generate_cloned_audio(response_text, reference_audio, cloning_model)
    except Exception as exc:
        st.error(f"Error in TTS generation: {exc}")
        return None

# ---------------------------------------------------------------------------------------------------------
#------Main Streamlit App
#---------Load the reference voice.
#--------------- Record and Process Query Voice
#--------------------Use the recorded audio solely for transcription (query text extraction)
#------------------------Transcribe the query
#------------------------------Process query if wake word detected and if it’s a new query
#--------------------------------- Get the LLM response and synthesize it in the cloned voice
#----------------------------------------Convert audio to base64 string for embedding in HTML
# -----------------------------------------------------------------------------------------------------------
st.title("AI Assistant with Preloaded Voice Cloning")
st.write("This app uses a preloaded reference voice for cloning responses. Record your voice query (say 'assistant' to trigger processing).")

# Upper bounds, in seconds, on how long the UI waits for each background step.
TRANSCRIBE_TIMEOUT_S = 20
LLM_TIMEOUT_S = 60


def _run_with_timeout(func, timeout, *args):
    """Run ``func(*args)`` on a worker thread and return its result.

    Raises ``concurrent.futures.TimeoutError`` when no result arrives within
    ``timeout`` seconds; an exception raised by ``func`` propagates unchanged.
    """
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
    try:
        return executor.submit(func, *args).result(timeout=timeout)
    finally:
        # Using the executor as a context manager would call
        # shutdown(wait=True) and block until the worker finishes, which
        # defeats the timeout. Release it without waiting instead.
        executor.shutdown(wait=False)


# Load the reference voice once per session.
if "reference_audio" not in st.session_state:
    st.session_state.reference_audio = load_reference_voice()

# Seed the remaining per-session fields with their empty defaults.
for _state_key, _state_default in (
    ("recognized_text", ""),
    ("last_query", ""),
    ("response_text", ""),
    ("cloned_audio", None),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default

st.header("Record Your Query")
audio_file = st.audio_input("Record your voice message (say 'assistant' to trigger processing)")
if audio_file is not None:
    audio_bytes = audio_file.read()
    st.audio(audio_bytes, format="audio/wav")

    # Stays None when transcription fails, so the wake-word check below is
    # skipped instead of hitting an unbound variable.
    recognized_text = None
    try:
        recognized_text = _run_with_timeout(transcribe_audio, TRANSCRIBE_TIMEOUT_S, audio_bytes)
        st.write("**You said:**", recognized_text)
        st.session_state.recognized_text = recognized_text
    except concurrent.futures.TimeoutError:
        st.error(f"Error transcribing audio: timed out after {TRANSCRIBE_TIMEOUT_S} seconds")
    except Exception as e:
        st.error(f"Error transcribing audio: {e}")

    # Only react to a successful transcription that contains the wake word and
    # differs from the last handled query (avoids re-processing on reruns).
    if (
        recognized_text is not None
        and "assistant" in recognized_text.lower()
        and recognized_text != st.session_state.last_query
    ):
        st.session_state.last_query = recognized_text
        st.write("🔔 Processing query...")

        # Prefer the key from the environment so the secret is not stored in
        # the source; the placeholder remains as a fallback.
        groq_api_key = os.environ.get("GROQ_API_KEY", "your_groq_api_key")

        try:
            response_text = _run_with_timeout(
                process_llm_query,
                LLM_TIMEOUT_S,
                recognized_text,
                groq_api_key,
            )
            st.session_state.response_text = response_text
            st.write("**Response:**", response_text)

            with st.spinner("Generating cloned voice response..."):
                cloned_audio = generate_tts(response_text, st.session_state.reference_audio)
                if cloned_audio:
                    st.session_state.cloned_audio = cloned_audio
                    # Embed the audio as a base64 data URI in a hidden,
                    # autoplaying <audio> element.
                    b64_audio = base64.b64encode(cloned_audio).decode("utf-8")
                    audio_html = f'<audio autoplay style="display:none;" src="data:audio/wav;base64,{b64_audio}"></audio>'
                    st.markdown(audio_html, unsafe_allow_html=True)
        except concurrent.futures.TimeoutError:
            st.error(f"Error processing LLM query: timed out after {LLM_TIMEOUT_S} seconds")
        except Exception as e:
            st.error(f"Error processing LLM query: {e}")
else:
    st.info("Awaiting voice input.")



Overwriting app.py


# Hosting / Tunneling

In [5]:
from pyngrok import ngrok
import time

ngrok.set_auth_token("your_api_key")
public_url = ngrok.connect(8501).public_url
print("Access the UI at:", public_url)

!streamlit run app.py &
time.sleep(5)

Access the UI at: https://be45-35-198-203-107.ngrok-free.app

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://35.198.203.107:8501[0m
[0m




2025-03-14 11:17:27.138 Examining the path of torch.classes raised:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/streamlit/web/bootstrap.py", line 345, in run
    if asyncio.get_running_loop().is_running():
       ^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: no running event loop

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/streamlit/watcher/local_sources_watcher.py", line 217, in get_module_paths
    potential_paths = extract_paths(module)
                      ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/streamlit/watcher/local_sources_watcher.py", line 210, in <lambda>
    lambda m: list(m.__path__._path),
                   ^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/_classes.py", line 13, in __getattr__
    proxy = torch._C._get_custom_class_python_wrapper(self.name, attr)
            ^^^^^