In [None]:
!apt-get update
# portaudio is required by the sounddevice python package
!apt-get install -y portaudio19-dev python3-pyaudio
# Python dependencies, pinned for reproducibility. %pip (rather than !pip)
# installs into the environment of the running kernel. python-dotenv is listed
# explicitly because the notebook imports `dotenv` directly.
%pip install -qU \
    "matplotlib==3.10.1" \
    "openai-agents[voice]==0.1.0" \
    "python-dotenv==1.1.0" \
    "sounddevice==0.5.1"

In [1]:
import sounddevice as sd

# Fetch the device-info dicts for the input and output devices in one pass.
input_device, output_device = (
    sd.query_devices(kind=direction) for direction in ('input', 'output')
)

# Trailing tuple so both dicts are rendered as the cell output.
input_device, output_device

({'name': 'default',
  'index': 6,
  'hostapi': 0,
  'max_input_channels': 32,
  'max_output_channels': 32,
  'default_low_input_latency': 0.008684807256235827,
  'default_low_output_latency': 0.008684807256235827,
  'default_high_input_latency': 0.034807256235827665,
  'default_high_output_latency': 0.034807256235827665,
  'default_samplerate': 44100.0},
 {'name': 'default',
  'index': 6,
  'hostapi': 0,
  'max_input_channels': 32,
  'max_output_channels': 32,
  'default_low_input_latency': 0.008684807256235827,
  'default_low_output_latency': 0.008684807256235827,
  'default_high_input_latency': 0.034807256235827665,
  'default_high_output_latency': 0.034807256235827665,
  'default_samplerate': 44100.0})

We can find the sample rate for these devices via the `default_samplerate` field:

In [2]:
# Read each device's preferred rate from its 'default_samplerate' entry.
in_samplerate, out_samplerate = (
    sd.query_devices(kind='input')['default_samplerate'],
    sd.query_devices(kind='output')['default_samplerate'],
)

in_samplerate, out_samplerate

(44100.0, 44100.0)

In [8]:
recorded_chunks = []


def _keep_chunk(indata, frames, time, status):
    """Stream callback: store a copy of each incoming block of samples."""
    recorded_chunks.append(indata.copy())


# Capture mono int16 audio from the microphone; the stream stays open until
# Enter is pressed.
with sd.InputStream(
    samplerate=in_samplerate, channels=1, dtype='int16', callback=_keep_chunk
):
    input()

In [9]:
# Number of audio blocks the stream callback collected during the recording.
len(recorded_chunks)

428

In [None]:
import numpy as np

# Join the per-callback blocks along the first (sample) axis into one array.
audio_buffer = np.concatenate(recorded_chunks, axis=0)

# Show the resulting shape as the cell output.
audio_buffer.shape

(84445, 1)

In [12]:
# Play the recording back at the rate it was captured at. Using the output
# device's default rate instead would shift pitch/speed whenever the input and
# output defaults differ.
sd.play(audio_buffer, samplerate=in_samplerate)

In [14]:
from agents.voice import AudioInput

# The second axis of the recorded buffer holds the channels.
num_channels = audio_buffer.shape[1]

# Package the samples together with the rate and channel count they were
# captured with, then display the resulting object.
audio_input = AudioInput(buffer=audio_buffer, frame_rate=in_samplerate, channels=num_channels)
audio_input

AudioInput(buffer=array([[-26663],
       [-26740],
       [-26851],
       ...,
       [    -3],
       [    26],
       [    16]], shape=(84445, 1), dtype=int16), frame_rate=44100.0, sample_width=2, channels=1)

In [15]:
import os
import getpass
from dotenv import load_dotenv
load_dotenv(override=True)

# Assigning None to os.environ raises TypeError, so only set the variable when
# it is missing (or empty) after loading the .env file, prompting for the key
# without echoing it.
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

In [22]:
from agents import Agent

# Text agent behind the voice interface. Adjacent string literals are joined
# verbatim, so every fragment must end with its own punctuation and a space;
# otherwise the sentences run together in the prompt.
agent = Agent(
    name="Assistant",
    instructions=(
        "Repeat the user's question back to them, and then answer it. Note that the user is "
        "speaking to you via a voice interface, although you are reading and writing text to "
        "respond. Nonetheless, ensure that your written response is easily translatable to voice. "
        "Also Make Your Voice Human Like and Stable. "
        "Speak in a friendly manner and hindi language."
    ),
    model="gpt-4o-mini"
)

In [23]:
from agents.voice import SingleAgentVoiceWorkflow

# Wrap the agent in a single-agent voice workflow; this object is later passed to VoicePipeline.
workflow = SingleAgentVoiceWorkflow(agent)

In [24]:
from agents.voice import TTSModelSettings, VoicePipelineConfig

# Style instructions for the text-to-speech model, one trait per line.
custom_tts_settings = TTSModelSettings(
    instructions=(
        "Personality: upbeat, friendly, persuasive guide.\n"
        "Tone: Friendly, clear, and reassuring, creating a calm atmosphere and making "
        "the listener feel confident and comfortable.\n"
        "Pronunciation: Clear, articulate, and steady, ensuring each instruction is "
        "easily understood while maintaining a natural, conversational flow.\n"
        "Tempo: Speak relatively fast, include brief pauses before and after questions.\n"
        "Emotion: Warm and supportive, conveying empathy and care, ensuring the listener "
        "feels guided and safe throughout the journey."
    )
)

# Pipeline configuration carrying the TTS settings above.
voice_pipeline_config = VoicePipelineConfig(tts_settings=custom_tts_settings)

In [25]:
from agents.voice import VoicePipeline

# Assemble the voice pipeline from the workflow and the pipeline config created in the earlier cells.
pipeline = VoicePipeline(workflow=workflow, config=voice_pipeline_config)

In [26]:
result = await pipeline.run(audio_input=audio_input)

response_chunks = []

async for event in result.stream():
    if event.type == "voice_stream_event_audio":
        response_chunks.append(event.data)

response_audio_buffer = np.concatenate(response_chunks, axis=0)

openai_sample_rate = 24_000

sd.play(response_audio_buffer, samplerate=openai_sample_rate)
sd.wait() 

In [None]:
async def voice_assistant_optimized():
    """Run an interactive push-to-talk loop until the user types 'q'.

    Each turn:
      1. waits for Enter (or 'q' to quit),
      2. records mono int16 microphone audio until Enter is pressed again,
      3. sends the recording through the notebook-level ``pipeline``,
      4. collects the streamed audio events and plays them back.

    Relies on the notebook globals ``sd``, ``np``, ``AudioInput``,
    ``pipeline``, ``in_samplerate`` and ``openai_sample_rate``.
    """
    while True:
        cmd = input("Press Enter to speak (or type 'q' to exit): ")
        # Tolerate surrounding whitespace around the quit command.
        if cmd.strip().lower() == "q":
            print("Exiting...")
            break

        print("Listening...")
        recorded_chunks = []

        # The callback stores a copy of every block delivered while the stream
        # is open; the second input() keeps it open until Enter is pressed.
        with sd.InputStream(
            samplerate=in_samplerate,
            channels=1,
            dtype='int16',
            callback=lambda indata, frames, time, status: recorded_chunks.append(indata.copy())
        ):
            input()

        # np.concatenate raises ValueError on an empty list, so skip the turn
        # when nothing was captured.
        if not recorded_chunks:
            print("No audio was captured, please try again.")
            print("---")
            continue

        recording = np.concatenate(recorded_chunks, axis=0)

        # Describe the recording with the rate and channel count it was
        # captured with (same as the single-shot cell earlier in the notebook);
        # otherwise AudioInput falls back to its own defaults.
        audio_input = AudioInput(
            buffer=recording,
            frame_rate=in_samplerate,
            channels=recording.shape[1],
        )

        result = await pipeline.run(audio_input)

        # Keep only the audio events from the result stream.
        response_chunks = []
        async for event in result.stream():
            if event.type == "voice_stream_event_audio":
                response_chunks.append(event.data)

        # Same empty-list guard for the response side.
        if not response_chunks:
            print("No audio response was received.")
            print("---")
            continue

        response_audio_buffer = np.concatenate(response_chunks, axis=0)

        print("Assistant is responding...")
        sd.play(response_audio_buffer, samplerate=openai_sample_rate)
        # Block until playback finishes before starting the next turn.
        sd.wait()
        print("---")

# Start the interactive loop; the top-level await relies on the notebook running cells in an event loop.
await voice_assistant_optimized()

Listening...
Assistant is responding...
---
Listening...
Assistant is responding...
---
Listening...
Assistant is responding...
---
Listening...
Assistant is responding...
---
Listening...
Assistant is responding...
---
Listening...
Assistant is responding...
---
Exiting...


---