In [None]:
import os
import openai
import numpy as np
import sounddevice as sd
import soundfile as sf
import requests
import tempfile
import webrtcvad
import time
import subprocess
import threading
import sys
import whisper

# Local Whisper model used by whisper_stt(); loaded once when the cell/module runs.
whisper_model = whisper.load_model("tiny")
# ID of the last OpenAI response; get_llm_response() sends it back to chain turns.
previous_response_id = None

# Constants
OPENAI_API_KEY = ""  # assigned to openai.api_key below; empty placeholder here
ELEVEN_LABS_API_KEY = ""  # sent as the "xi-api-key" header; empty placeholder here
VOICE_ID = "51YRucvcq5ojp2byev44"  # ElevenLabs voice used in the TTS endpoint URL
MODEL_ID = "eleven_monolingual_v1"  # ElevenLabs TTS model id; any other model id can be put here
ENABLE_TTS = True  # when False, conversation_loop() only prints the reply
MAX_TTS_CHARACTERS = 250  # replies are truncated to this many characters before TTS

openai.api_key = OPENAI_API_KEY

# Set to make the TTS playback loop and the interrupt monitor stop.
stop_playback = threading.Event()
# Set while TTS is playing; record_with_vad() does not count idle frames while it is set.
tts_active = threading.Event()

def whisper_stt(audio_data, fs=16000):
    """
    Transcribe audio using Whisper locally.

    The samples are written to a temporary WAV file, transcribed with the
    module-level ``whisper_model``, and the file is removed afterwards even
    if writing or transcription fails.

    Args:
        audio_data: Audio samples to transcribe (written with ``sf.write``).
        fs: Sample rate of ``audio_data`` in Hz.

    Returns:
        The transcribed text with surrounding whitespace stripped.
    """
    # Only reserve a unique path here and close the handle right away: other
    # libraries open the file by name, which is not possible on every platform
    # (notably Windows) while the NamedTemporaryFile handle is still open.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        wav_path = tmp.name

    try:
        sf.write(wav_path, audio_data, fs)
        result = whisper_model.transcribe(wav_path)
    finally:
        # Best-effort cleanup; a failed delete must not mask the real error.
        try:
            os.unlink(wav_path)
        except OSError:
            pass

    print(f"[STT] Transcription result: {result['text']}")
    return result["text"].strip()

def elevenlabs_stream_tts(text, fs=16000):
    """
    Speak ``text`` with ElevenLabs TTS and let the user interrupt by talking.

    The MP3 is downloaded completely, converted to WAV with ffmpeg, and then
    played in 100 ms chunks so playback can stop quickly. While it plays, a
    second thread runs VAD on the microphone and stops playback when it
    detects speech.

    Args:
        text: Text to synthesize. Empty or whitespace-only text is skipped.
        fs: Unused; kept for backward compatibility. Playback uses the sample
            rate of the converted WAV and the monitor records at 16 kHz.

    Returns:
        True if playback was interrupted by user speech. False otherwise,
        including when nothing was played because the request or the
        conversion failed.
    """
    if not text or not text.strip():
        # Nothing to say; avoid a pointless (and failing) API round trip.
        return False

    url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}/stream"
    headers = {
        "xi-api-key": ELEVEN_LABS_API_KEY,
        "accept": "audio/mpeg"
    }
    temp_paths = []  # every temp file created below, removed in the outer finally

    try:
        # 1) Request the MP3 from ElevenLabs and save it to a temp file
        try:
            with requests.post(
                url,
                headers=headers,
                json={"text": text, "model_id": MODEL_ID},
                stream=True,
                timeout=(10, 60),  # (connect, read) seconds; never hang forever
            ) as response:
                if not response.ok:
                    print("TTS Error:", response.text)
                    return False
                with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as mp3_file:
                    temp_paths.append(mp3_file.name)
                    for chunk in response.iter_content(chunk_size=4096):
                        mp3_file.write(chunk)
        except requests.RequestException as e:
            print("TTS Error:", e)
            return False
        mp3_path = temp_paths[0]

        # 2) Convert MP3 to WAV. Only the path is needed, so the handle is
        #    closed immediately and ffmpeg overwrites the empty file (-y).
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav_file:
            wav_path = wav_file.name
            temp_paths.append(wav_path)
        conversion = subprocess.run(
            ["ffmpeg", "-y", "-i", mp3_path, wav_path],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE
        )
        if conversion.returncode != 0:
            print("TTS Error: ffmpeg conversion failed:",
                  conversion.stderr.decode(errors="replace")[-500:])
            return False

        # 3) Read WAV into a mono float32 numpy array
        audio, orig_fs = sf.read(wav_path)
        if audio.ndim > 1:
            # Down-mix any multi-channel audio; the output stream is mono.
            audio = audio.mean(axis=1)
        audio = audio.astype(np.float32)
        playback_fs = orig_fs

        interrupted = threading.Event()

        # 4) Playback function that writes small chunks to OutputStream
        def play_audio_in_chunks():
            chunk_duration_s = 0.1  # play 100 ms at a time
            chunk_size = int(playback_fs * chunk_duration_s)
            idx = 0
            chunks_played = 0

            with sd.OutputStream(samplerate=playback_fs, channels=1, dtype="float32") as out_strm:
                # stop_playback is checked between chunks, so an interrupt
                # takes effect within roughly one chunk duration.
                while idx < len(audio) and not stop_playback.is_set():
                    end = min(idx + chunk_size, len(audio))
                    out_strm.write(audio[idx:end])
                    idx = end
                    chunks_played += 1
                    # Debug-print every 10 chunks (~1 second if chunk_duration_s=0.1)
                    if chunks_played % 10 == 0:
                        print(f"[TTS] Played {chunks_played} chunks "
                              f"({chunks_played * chunk_duration_s:.1f}s)")

        # 5) VAD monitoring thread to detect user speech and interrupt
        def monitor_interrupt():
            vad_sample_rate = 16000
            vad = webrtcvad.Vad(2)
            frame_duration_ms = 30
            frame_length = int(vad_sample_rate * frame_duration_ms / 1000)

            try:
                with sd.InputStream(
                    samplerate=vad_sample_rate,
                    channels=1,
                    dtype="float32",
                    blocksize=frame_length
                ) as in_strm:
                    print("[TTS] Monitoring for interrupts")
                    while not stop_playback.is_set() and not interrupted.is_set():
                        frame, overflow = in_strm.read(frame_length)
                        if overflow:
                            continue
                        audio_chunk = frame[:, 0]
                        # High loudness gate: quieter frames are never passed to the VAD.
                        if np.abs(audio_chunk).mean() < 0.333333:
                            continue
                        pcm = (audio_chunk * 32767).astype(np.int16).tobytes()
                        if len(pcm) != frame_length * 2:
                            continue
                        try:
                            is_speech = vad.is_speech(pcm, vad_sample_rate)
                        except Exception:
                            # A frame the VAD rejects is skipped, not fatal.
                            continue
                        if is_speech:
                            print("Interrupt detected during TTS")
                            interrupted.set()
                            # The playback loop polls this event; sd.stop()
                            # would not affect an OutputStream.
                            stop_playback.set()
                            break
            except Exception as e:
                if not stop_playback.is_set() and not interrupted.is_set():
                    print("Interrupt monitor error:", e)

        # 6) Clear the old stop flag so neither thread quits instantly
        stop_playback.clear()
        tts_active.set()

        play_thread = threading.Thread(target=play_audio_in_chunks, daemon=True)
        listen_thread = threading.Thread(target=monitor_interrupt, daemon=True)

        try:
            print("[TTS] Playback started")
            play_thread.start()
            # Give the output stream ~50 ms to queue up at least one chunk before VAD runs
            time.sleep(0.05)
            listen_thread.start()

            # 7) Wait for playback to finish or be interrupted
            play_thread.join()
            stop_playback.set()
            listen_thread.join()
        finally:
            # Always release both flags, even if a thread could not be started,
            # so idle counting in the recorder is not blocked forever.
            stop_playback.set()
            tts_active.clear()

        print("[TTS] Playback finished")
        return interrupted.is_set()
    finally:
        # 8) Clean up temp files; each one independently and best-effort.
        for path in temp_paths:
            try:
                os.unlink(path)
            except OSError:
                pass

def record_with_vad(fs=16000, frame_duration_ms=30, silence_limit_sec=2,
                    idle_limit_sec=10, volume_threshold=0.25):
    """
    Record one utterance from the microphone using a loudness gate plus VAD.

    Recording starts at the first speech frame and ends once more than
    ``silence_limit_sec`` of consecutive non-speech frames follow it.

    Args:
        fs: Capture sample rate in Hz.
        frame_duration_ms: Length of each analysed frame in milliseconds
            (webrtcvad accepts 10, 20 or 30 ms frames).
        silence_limit_sec: Trailing silence that ends an utterance.
        idle_limit_sec: Silence allowed before any speech; exceeding it exits
            the process. Frames are not counted while ``tts_active`` is set.
        volume_threshold: Minimum mean absolute amplitude for a frame to be
            passed to the VAD; quieter frames count as non-speech.

    Returns:
        A 1-D numpy array with the recorded frames (speech plus the trailing
        silence), or None if the user pressed Ctrl-C.

    Raises:
        SystemExit: With code 0 when the idle limit is exceeded.
    """
    vad = webrtcvad.Vad(2)
    frame_length = int(fs * frame_duration_ms / 1000)
    silence_threshold = int(silence_limit_sec * 1000 / frame_duration_ms)
    idle_threshold = int(idle_limit_sec * 1000 / frame_duration_ms)

    # Report the limits actually in effect rather than fixed numbers.
    print(f"Listening... Speak when ready. (Pause for {silence_limit_sec}s to process, "
          f"idle {idle_limit_sec}s to exit)")

    speech_buffer = []
    silence_counter = 0
    idle_counter = 0
    recording_started = False

    try:
        with sd.InputStream(samplerate=fs, channels=1, dtype='float32', blocksize=frame_length) as stream:
            while True:
                frame, overflow = stream.read(frame_length)
                if overflow:
                    print("Audio overflow, skipping frame.")
                    continue

                audio = frame[:, 0].copy()
                volume = np.abs(audio).mean()
                loud_enough = volume >= volume_threshold

                is_speech = False
                if loud_enough:
                    # Clip first so samples outside [-1, 1] cannot overflow int16.
                    pcm_frame = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16).tobytes()
                    is_speech = vad.is_speech(pcm_frame, fs)

                if not loud_enough and not recording_started:
                    # Logged only; the frame still goes through the idle accounting below.
                    print(f"Low volume ({volume:.5f}) and no speech — frame ignored")

                if is_speech:
                    print("Speech detected")
                    if not recording_started:
                        print("Speech started.")
                    recording_started = True
                    silence_counter = 0
                    idle_counter = 0
                    speech_buffer.append(audio)

                elif recording_started:
                    # Keep the silent frames too so pauses inside the utterance are preserved.
                    silence_counter += 1
                    speech_buffer.append(audio)
                    print(f"Silence {silence_counter}/{silence_threshold} after speech")
                    if silence_counter > silence_threshold:
                        print("Speech ended. Sending to transcription.")
                        return np.concatenate(speech_buffer)
                else:
                    print("Silence(no speech yet)")
                    if not tts_active.is_set():
                        idle_counter += 1
                        print(f"Idle {idle_counter}/{idle_threshold} (no speech yet)")
                        if idle_counter > idle_threshold:
                            print("Idle timeout. No speech detected.")
                            # Ends the whole program; nothing is returned to the caller.
                            sys.exit(0)
    except KeyboardInterrupt:
        print("Stopped by user.")
        return None

def elevenlabs_stt(audio_data, fs=16000):
    """
    Send audio to the ElevenLabs speech-to-text endpoint and return the text.

    The samples are written to a temporary WAV file, uploaded with model
    ``scribe_v1``, and the file is removed afterwards even if the upload fails.

    Args:
        audio_data: Audio samples to transcribe (written with ``sf.write``).
        fs: Sample rate of ``audio_data`` in Hz.

    Returns:
        The transcribed text, or "" if the request fails or the response
        carries no "text" field.
    """
    url = "https://api.elevenlabs.io/v1/speech-to-text"
    headers = {"xi-api-key": ELEVEN_LABS_API_KEY}

    # Only reserve a unique path and close the handle right away: the file is
    # reopened by name below, which not every platform allows while the
    # NamedTemporaryFile handle is still open.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        wav_path = tmp.name

    try:
        sf.write(wav_path, audio_data, fs)
        with open(wav_path, "rb") as f:
            response = requests.post(
                url,
                headers=headers,
                files={"file": f},
                data={"model_id": "scribe_v1"},
                timeout=(10, 60),  # (connect, read) seconds; never hang forever
            )
    except requests.RequestException as e:
        # Treat network failures like an HTTP error: report and return no text.
        print("STT Error:", e)
        return ""
    finally:
        # Best-effort cleanup; a failed delete must not mask the real error.
        try:
            os.unlink(wav_path)
        except OSError:
            pass

    if response.ok:
        return response.json().get("text", "")
    print("STT Error:", response.text)
    return ""

def get_llm_response(prompt):
    """
    Ask the OpenAI Responses API for a reply to ``prompt``.

    Conversation history is carried across calls through the module-level
    ``previous_response_id``, which is updated after every successful request.
    Prompts containing fewer than five alphabetic characters are not sent and
    yield an empty string.
    """
    global previous_response_id

    # Skip prompts with too few letters to be a meaningful request.
    letter_count = len([ch for ch in prompt if ch.isalpha()])
    if letter_count < 5:
        return ""

    system_message = {
        "role": "system",
        "content": (
            "You are a professional, concise, and highly efficient assistant. But always response in English, doesn't matter in which language user speaks. "
            "Always respond in clear, well-structured English, using no more than 20 words unless absolutely necessary. "
        ),
    }
    user_message = {"role": "user", "content": prompt}

    request_args = {
        "model": "gpt-3.5-turbo",  # Or your preferred model
        "input": [system_message, user_message],
    }
    # Chain onto the previous turn when there is one.
    if previous_response_id:
        request_args["previous_response_id"] = previous_response_id

    reply = openai.responses.create(**request_args)

    # Remember this turn so the next call continues the same conversation.
    previous_response_id = reply.id
    return reply.output_text

def conversation_loop(cases, embeddings_model, embeddings):
    """
    Main loop: record user speech, transcribe it, get the LLM reply, speak it,
    and go back to listening (immediately if the user interrupted playback).

    The loop ends when ``record_with_vad()`` returns None, i.e. when recording
    was cancelled by the user.

    Args:
        cases: Unused; kept for interface compatibility.
        embeddings_model: Unused; kept for interface compatibility.
        embeddings: Unused; kept for interface compatibility.
    """
    while True:
        audio_data = record_with_vad()
        if audio_data is None:
            # No recording means the user cancelled; looping again here would
            # make the program impossible to stop with Ctrl-C.
            return

        # text = elevenlabs_stt(audio_data)
        text = whisper_stt(audio_data)
        if not text.strip():
            continue

        response_text = get_llm_response(text)
        print(f"Agent: {response_text}")
        print("ENABLE TTS ", ENABLE_TTS)

        # get_llm_response() returns "" for prompts it rejects; there is
        # nothing to speak in that case.
        if not ENABLE_TTS or not response_text:
            continue

        interrupted = elevenlabs_stream_tts(response_text[:MAX_TTS_CHARACTERS])
        if interrupted:
            print("[Main] TTS interrupted by user speech. Restarting listening loop...")
            time.sleep(0.3)  # Allow the audio device to recover
            try:
                sd.stop()
            except Exception as e:
                print("Error stopping sounddevice:", e)

# Script entry point: start the voice conversation loop.
if __name__ == "__main__":
    # Placeholder arguments; conversation_loop() accepts them but does not read them.
    cases = []
    embeddings = np.array([])
    embeddings_model = None
    conversation_loop(cases, embeddings_model, embeddings)

     

  checkpoint = torch.load(fp, map_location=device)


Listening... Speak when ready. (Pause for 3s to process, idle 5s to exit)
Low volume (0.00001) and no speech — frame ignored
Silence(no speech yet)
Idle 1/333 (no speech yet)
Low volume (0.00002) and no speech — frame ignored
Silence(no speech yet)
Idle 2/333 (no speech yet)
Low volume (0.00002) and no speech — frame ignored
Silence(no speech yet)
Idle 3/333 (no speech yet)
Low volume (0.00002) and no speech — frame ignored
Silence(no speech yet)
Idle 4/333 (no speech yet)
Low volume (0.00002) and no speech — frame ignored
Silence(no speech yet)
Idle 5/333 (no speech yet)
Low volume (0.00002) and no speech — frame ignored
Silence(no speech yet)
Idle 6/333 (no speech yet)
Low volume (0.00002) and no speech — frame ignored
Silence(no speech yet)
Idle 7/333 (no speech yet)
Low volume (0.00002) and no speech — frame ignored
Silence(no speech yet)
Idle 8/333 (no speech yet)
Low volume (0.00002) and no speech — frame ignored
Silence(no speech yet)
Idle 9/333 (no speech yet)
Low volume (0.000



[STT] Transcription result:  What is Microsoft?
Agent: Microsoft is a multinational technology company that develops, licenses, and sells software, electronics, and services.
ENABLE TTS  True
[TTS] Playback started
[TTS] Monitoring for interrupts
[TTS] Played 10 chunks (1.0s)
[TTS] Played 20 chunks (2.0s)
[TTS] Played 30 chunks (3.0s)
[TTS] Played 40 chunks (4.0s)
[TTS] Played 50 chunks (5.0s)
[TTS] Played 60 chunks (6.0s)
[TTS] Played 70 chunks (7.0s)
[TTS] Playback finished
Listening... Speak when ready. (Pause for 3s to process, idle 5s to exit)
Low volume (0.00000) and no speech — frame ignored
Silence(no speech yet)
Idle 1/333 (no speech yet)
Low volume (0.00000) and no speech — frame ignored
Silence(no speech yet)
Idle 2/333 (no speech yet)
Low volume (0.00000) and no speech — frame ignored
Silence(no speech yet)
Idle 3/333 (no speech yet)
Low volume (0.00000) and no speech — frame ignored
Silence(no speech yet)
Idle 4/333 (no speech yet)
Low volume (0.00000) and no speech — fram



[STT] Transcription result:  But as they do.
Agent: Microsoft offers operating systems (Windows), productivity software (Office), cloud services (Azure), hardware (Surface), and more technology solutions.
ENABLE TTS  True
[TTS] Playback started
[TTS] Monitoring for interrupts
[TTS] Played 10 chunks (1.0s)
[TTS] Played 20 chunks (2.0s)
[TTS] Played 30 chunks (3.0s)
[TTS] Played 40 chunks (4.0s)
[TTS] Played 50 chunks (5.0s)
[TTS] Played 60 chunks (6.0s)
[TTS] Played 70 chunks (7.0s)
[TTS] Played 80 chunks (8.0s)
[TTS] Played 90 chunks (9.0s)
[TTS] Played 100 chunks (10.0s)
[TTS] Played 110 chunks (11.0s)
[TTS] Playback finished
Listening... Speak when ready. (Pause for 3s to process, idle 5s to exit)
Low volume (0.00000) and no speech — frame ignored
Silence(no speech yet)
Idle 1/333 (no speech yet)
Low volume (0.00000) and no speech — frame ignored
Silence(no speech yet)
Idle 2/333 (no speech yet)
Low volume (0.00000) and no speech — frame ignored
Silence(no speech yet)
Idle 3/333 (no 



[STT] Transcription result:  How many employees do they have?
Agent: Microsoft employs over 150,000 people globally, working in various roles across different divisions and locations.
ENABLE TTS  True
[TTS] Playback started
[TTS] Monitoring for interrupts
[TTS] Played 10 chunks (1.0s)
[TTS] Played 20 chunks (2.0s)
[TTS] Played 30 chunks (3.0s)
[TTS] Played 40 chunks (4.0s)
Interrupt detected during TTS
[TTS] Playback finished
[Main] TTS interrupted by user speech. Restarting listening loop...
Listening... Speak when ready. (Pause for 3s to process, idle 5s to exit)
Low volume (0.22102) and no speech — frame ignored
Silence(no speech yet)
Idle 1/333 (no speech yet)
Low volume (0.17191) and no speech — frame ignored
Silence(no speech yet)
Idle 2/333 (no speech yet)
Low volume (0.05643) and no speech — frame ignored
Silence(no speech yet)
Idle 3/333 (no speech yet)
Low volume (0.05399) and no speech — frame ignored
Silence(no speech yet)
Idle 4/333 (no speech yet)
Low volume (0.08810) and



[STT] Transcription result:  I've been here.
Agent: I am glad you have visited. If you have any questions or need assistance, feel free to ask.
ENABLE TTS  True
[TTS] Playback started
[TTS] Monitoring for interrupts
[TTS] Played 10 chunks (1.0s)
[TTS] Played 20 chunks (2.0s)
[TTS] Played 30 chunks (3.0s)
Interrupt detected during TTS
[TTS] Playback finished
[Main] TTS interrupted by user speech. Restarting listening loop...
Listening... Speak when ready. (Pause for 3s to process, idle 5s to exit)
Low volume (0.22437) and no speech — frame ignored
Silence(no speech yet)
Idle 1/333 (no speech yet)
Low volume (0.13673) and no speech — frame ignored
Silence(no speech yet)
Idle 2/333 (no speech yet)
Low volume (0.14347) and no speech — frame ignored
Silence(no speech yet)
Idle 3/333 (no speech yet)
Low volume (0.05305) and no speech — frame ignored
Silence(no speech yet)
Idle 4/333 (no speech yet)
Low volume (0.07313) and no speech — frame ignored
Silence(no speech yet)
Idle 5/333 (no speec



[STT] Transcription result:  veremedin?
Agent: I am here to assist you. Please let me know how I can help you.
ENABLE TTS  True
[TTS] Playback started


ALSA lib pcm.c:8526:(snd_pcm_recover) underrun occurred


[TTS] Monitoring for interrupts
[TTS] Played 10 chunks (1.0s)
Interrupt detected during TTS
[TTS] Played 20 chunks (2.0s)
[TTS] Playback finished
[Main] TTS interrupted by user speech. Restarting listening loop...
Listening... Speak when ready. (Pause for 3s to process, idle 5s to exit)
Low volume (0.10250) and no speech — frame ignored
Silence(no speech yet)
Idle 1/333 (no speech yet)
Low volume (0.07262) and no speech — frame ignored
Silence(no speech yet)
Idle 2/333 (no speech yet)
Low volume (0.06342) and no speech — frame ignored
Silence(no speech yet)
Idle 3/333 (no speech yet)
Low volume (0.05843) and no speech — frame ignored
Silence(no speech yet)
Idle 4/333 (no speech yet)
Low volume (0.05027) and no speech — frame ignored
Silence(no speech yet)
Idle 5/333 (no speech yet)
Low volume (0.06455) and no speech — frame ignored
Silence(no speech yet)
Idle 6/333 (no speech yet)
Low volume (0.07070) and no speech — frame ignored
Silence(no speech yet)
Idle 7/333 (no speech yet)
Low v



[STT] Transcription result:  is their revenue.
Agent: Microsoft's revenue varies yearly but is typically in the range of hundreds of billions of dollars.
ENABLE TTS  True
[TTS] Playback started
[TTS] Monitoring for interrupts
[TTS] Played 10 chunks (1.0s)
[TTS] Played 20 chunks (2.0s)
[TTS] Played 30 chunks (3.0s)
[TTS] Played 40 chunks (4.0s)
[TTS] Played 50 chunks (5.0s)
[TTS] Playback finished
Listening... Speak when ready. (Pause for 3s to process, idle 5s to exit)
Low volume (0.00000) and no speech — frame ignored
Silence(no speech yet)
Idle 1/333 (no speech yet)
Low volume (0.00000) and no speech — frame ignored
Silence(no speech yet)
Idle 2/333 (no speech yet)
Low volume (0.00000) and no speech — frame ignored
Silence(no speech yet)
Idle 3/333 (no speech yet)
Low volume (0.00000) and no speech — frame ignored
Silence(no speech yet)
Idle 4/333 (no speech yet)
Low volume (0.00000) and no speech — frame ignored
Silence(no speech yet)
Idle 5/333 (no speech yet)
Low volume (0.00000) 



[STT] Transcription result:  There is their head of his.
Agent: Satya Nadella currently serves as the CEO (Chief Executive Officer) of Microsoft.
ENABLE TTS  True
[TTS] Playback started
[TTS] Monitoring for interrupts
[TTS] Played 10 chunks (1.0s)
[TTS] Played 20 chunks (2.0s)
[TTS] Played 30 chunks (3.0s)
[TTS] Played 40 chunks (4.0s)
[TTS] Played 50 chunks (5.0s)
[TTS] Playback finished
Listening... Speak when ready. (Pause for 3s to process, idle 5s to exit)
Low volume (0.00000) and no speech — frame ignored
Silence(no speech yet)
Idle 1/333 (no speech yet)
Low volume (0.00000) and no speech — frame ignored
Silence(no speech yet)
Idle 2/333 (no speech yet)
Low volume (0.00000) and no speech — frame ignored
Silence(no speech yet)
Idle 3/333 (no speech yet)
Low volume (0.00000) and no speech — frame ignored
Silence(no speech yet)
Idle 4/333 (no speech yet)
Low volume (0.00000) and no speech — frame ignored
Silence(no speech yet)
Idle 5/333 (no speech yet)
Low volume (0.00000) and no s

SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
