# 🎙️ Building Smart Voice Chat with Amazon Nova2

## 🎯 What You'll Learn
- How to capture audio from the microphone using Python
- How to use Amazon Nova2 Omni for audio transcription
- How to build conversational AI with language detection
- How to create interactive Jupyter widgets

Let's build this step by step! 🚀

In [1]:
!pip install ipywidgets sounddevice numpy
!pip install -r ../requirements.txt

Looking in indexes: https://pypi.org/simple, https://plugin.us-east-1.prod.workshops.aws
Collecting sounddevice
  Using cached sounddevice-0.5.3-py3-none-macosx_10_6_x86_64.macosx_10_6_universal2.whl.metadata (1.6 kB)
Using cached sounddevice-0.5.3-py3-none-macosx_10_6_x86_64.macosx_10_6_universal2.whl (108 kB)
Installing collected packages: sounddevice
Successfully installed sounddevice-0.5.3
Looking in indexes: https://pypi.org/simple, https://plugin.us-east-1.prod.workshops.aws


In [None]:
import boto3
import json
import base64
import sounddevice as sd
import numpy as np
import wave
from datetime import datetime
import ipywidgets as widgets
from IPython.display import display, clear_output
import io

# Shared Bedrock Runtime client used for model invocation; the region is pinned to us-west-2.
bedrock_runtime = boto3.client('bedrock-runtime', region_name='us-west-2')

In [None]:
class VoiceRecorder:
    """Record mono microphone audio and return it as 16-bit PCM WAV bytes.

    Audio is read from a ``sounddevice.InputStream`` as float32 chunks that
    are buffered in memory between :meth:`start` and :meth:`stop`.
    """

    def __init__(self):
        self.sample_rate = 16000  # Hz; also written into the WAV header
        self.recording = False
        self.audio_data = []  # float32 chunks appended by the stream callback
        self.stream = None  # the open input stream, or None when idle

    def start(self):
        """Discard any buffered audio and begin capturing from the microphone.

        A stream left open by an earlier ``start`` is closed first so it is
        not leaked. If the stream cannot be opened or started, the recorder
        is returned to its idle state and the exception is re-raised.
        """
        self._close_stream()
        self.audio_data = []
        self.recording = True

        def callback(indata, frames, time_info, status):
            # Copy the chunk so it remains valid after the callback returns.
            if self.recording:
                self.audio_data.append(indata.copy())

        try:
            self.stream = sd.InputStream(
                callback=callback,
                samplerate=self.sample_rate,
                channels=1,
                dtype=np.float32,
            )
            self.stream.start()
        except Exception:
            self.recording = False
            self._close_stream()
            raise

    def stop(self):
        """Stop capturing and return the buffered audio as WAV bytes.

        Returns:
            The recording encoded as a mono, 16-bit PCM WAV file, or ``None``
            when no audio chunks were buffered.
        """
        self.recording = False
        self._close_stream()

        if not self.audio_data:
            return None

        samples = np.concatenate(self.audio_data, axis=0)
        # Clip before scaling: values outside [-1, 1] would otherwise wrap
        # around when cast to int16. '<i2' forces the little-endian byte
        # order that the WAV format requires.
        pcm = (np.clip(samples, -1.0, 1.0) * 32767).astype('<i2')

        wav_buffer = io.BytesIO()
        with wave.open(wav_buffer, 'wb') as wav_file:
            wav_file.setnchannels(1)
            wav_file.setsampwidth(2)
            wav_file.setframerate(self.sample_rate)
            wav_file.writeframes(pcm.tobytes())
        return wav_buffer.getvalue()

    def _close_stream(self):
        """Stop and close the current stream, if any, and forget it."""
        stream, self.stream = self.stream, None
        if stream is None:
            return
        try:
            stream.stop()
        finally:
            stream.close()

In [None]:
def _first_text_block(result):
    """Return the text of the first content block in *result* that has a 'text' field."""
    for block in result['output']['message']['content']:
        if isinstance(block, dict) and 'text' in block:
            return block['text']
    raise ValueError("model response contained no text content")


def _split_transcription(full_response):
    """Split the model reply into (transcription, remaining analysis/response text)."""
    _, marker, after = full_response.partition('TRANSCRIPTION:')
    if not marker:
        return "Audio processed", full_response

    # The requested format separates the transcription from the rest with a
    # blank line.
    transcript, separator, remainder = after.partition('\n\n')

    # If the model used single newlines instead, cut at the first later line
    # that starts a new section. The first line is skipped because it is the
    # transcription itself.
    lines = transcript.splitlines(keepends=True)
    for idx in range(1, len(lines)):
        if lines[idx].lstrip().startswith(('ANALYSIS:', 'RESPONSE:')):
            remainder = ''.join(lines[idx:]) + separator + remainder
            transcript = ''.join(lines[:idx])
            break

    if not remainder.strip():
        remainder = "[Language: Unknown | Tone: Neutral] I heard you but couldn't analyze properly."
    return transcript.strip(), remainder


def chat_with_ai(audio_bytes):
    """Transcribe, analyze and answer a WAV recording with a single model call.

    The audio is base64-encoded and sent together with a text prompt to the
    ``us.amazon.nova-2-omni-v1:0`` model through ``bedrock_runtime.invoke_model``.

    Args:
        audio_bytes: WAV-encoded audio to send to the model.

    Returns:
        A ``(user_text, ai_response)`` tuple. ``user_text`` is the text found
        after the ``TRANSCRIPTION:`` marker (or ``"Audio processed"`` when the
        marker is absent); ``ai_response`` is the rest of the reply. On any
        failure the tuple is ``("Error", <message>)`` rather than an exception,
        so callers can display the result unconditionally.
    """
    if not audio_bytes:
        # Nothing to transcribe; skip the model call.
        return "Error", "[Language: Unknown | Tone: Error] Sorry, I encountered an error: no audio was captured"

    try:
        audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')

        # Single call: Transcribe audio and generate intelligent response
        response = bedrock_runtime.invoke_model(
            modelId="us.amazon.nova-2-omni-v1:0",
            contentType="application/json",
            accept="application/json",
            body=json.dumps({
                "messages": [{
                    "role": "user",
                    "content": [
                        {"audio": {"format": "wav", "source": {"bytes": audio_base64}}},
                        {"text": """Given this input text:
    1. Transcribe the text exactly
    2. Detect language and analyze emotional tone
    3. Provide a natural response matching the language and tone

    Required output format:
    TRANSCRIPTION: [exact text]

    ANALYSIS:
    - Language: [detected language]
    - Tone: [primary emotion, intensity, key characteristics]

    RESPONSE: [natural conversational reply in matching language and tone]"""}
                    ]
                }],
                "inferenceConfig": {"maxTokens": 1000, "temperature": 0.7}
            })
        )

        result = json.loads(response['body'].read())
        return _split_transcription(_first_text_block(result))

    except Exception as e:
        # UI boundary: report the failure as a chat message instead of raising.
        return "Error", f"[Language: Unknown | Tone: Error] Sorry, I encountered an error: {e}"

In [None]:
class EnhancedVoiceChat:
    """Notebook widget UI for recording speech and showing the model's reply.

    Holds a ``VoiceRecorder``, the conversation history (a list of dicts with
    ``type``, ``text`` and ``time`` keys) and the ipywidgets controls that
    drive them.
    """

    def __init__(self):
        self.recorder = VoiceRecorder()
        self.conversation = []
        self.setup_ui()

    @staticmethod
    def _banner(background, title, detail):
        """Return the HTML for a coloured status banner with a title and a detail line."""
        return (
            f"<div style='background: {background}; color: white; padding: 15px; "
            f"border-radius: 10px; text-align: center;'>"
            f"<h3>{title}</h3><p>{detail}</p></div>"
        )

    def _refresh_message_count(self):
        """Show the current number of conversation entries in the counter widget."""
        self.message_count.value = f"<b>üí¨ Messages: {len(self.conversation)}</b>"

    def setup_ui(self):
        """Create the buttons, status banner and chat output, and wire the click handlers."""
        self.talk_btn = widgets.Button(
            description='üéôÔ∏è Start Recording',
            button_style='success',
            layout=widgets.Layout(width='200px', height='60px')
        )
        self.stop_btn = widgets.Button(
            description='‚èπÔ∏è Send Message',
            button_style='primary',
            disabled=True,
            layout=widgets.Layout(width='200px', height='60px')
        )
        self.clear_btn = widgets.Button(
            description='üóëÔ∏è New Chat',
            button_style='warning',
            layout=widgets.Layout(width='150px')
        )

        self.status = widgets.HTML("<div style='padding: 15px; text-align: center; background: #f0f0f0; border-radius: 10px;'>üéØ Ready to analyze your voice!</div>")
        self.chat_display = widgets.Output(layout=widgets.Layout(height='2400px', overflow='auto'))
        self.message_count = widgets.HTML("<b>üí¨ Messages: 0</b>")

        self.talk_btn.on_click(self.start_talking)
        self.stop_btn.on_click(self.stop_talking)
        self.clear_btn.on_click(self.clear_chat)

    def start_talking(self, btn):
        """Click handler: start recording and switch the buttons to the recording state."""
        self.recorder.start()
        self.talk_btn.disabled = True
        self.stop_btn.disabled = False
        self.status.value = self._banner(
            '#ff4444', "üî¥ RECORDING...", "Speak naturally - I'll analyze your language and tone!"
        )

    def stop_talking(self, btn):
        """Click handler: stop recording, send the audio to the model and show the exchange.

        The buttons are always returned to the idle state, even if stopping the
        recorder or processing the audio raises. When no audio was captured the
        status banner says so instead of staying on "Analyzing speech...".
        """
        self.status.value = self._banner(
            '#4CAF50', "ü§ñ Analyzing speech...", "Detecting language and tone"
        )

        try:
            audio_bytes = self.recorder.stop()

            if audio_bytes:
                user_text, ai_response = chat_with_ai(audio_bytes)

                timestamp = datetime.now().strftime('%H:%M')
                self.conversation.append({'type': 'user', 'text': user_text, 'time': timestamp})
                self.conversation.append({'type': 'ai', 'text': ai_response, 'time': timestamp})

                self.update_chat_display()
                self._refresh_message_count()
                self.status.value = self._banner(
                    '#2196F3', "‚úÖ Analysis complete!", "Ready for next message"
                )
            else:
                self.status.value = self._banner(
                    '#FF9800', "No audio captured", "Check your microphone and try again"
                )
        except Exception:
            # Do not leave the banner on "Analyzing"; the error itself still propagates.
            self.status.value = self._banner(
                '#ff4444', "Recording failed", "Please try again"
            )
            raise
        finally:
            self.talk_btn.disabled = False
            self.stop_btn.disabled = True

    def update_chat_display(self):
        """Re-render the whole conversation inside the chat output widget."""
        with self.chat_display:
            clear_output(wait=True)

            if not self.conversation:
                print("üéôÔ∏è Start speaking to see language and tone analysis!")
                return

            for msg in self.conversation:
                text = msg['text']
                if msg['type'] == 'user':
                    print(f"\nüë§ YOU ({msg['time']})")
                    print(f"‚îå‚îÄ {text}")
                else:
                    print(f"\nü§ñ AI ASSISTANT ({msg['time']})")
                    # A leading "[Language: ... | Tone: ...] " tag is shown on its own line.
                    analysis, closing, reply = text.partition('] ')
                    if '[Language:' in text and closing:
                        print(f"üîç {analysis}]")
                        print(f"‚îå‚îÄ {reply}")
                    else:
                        print(f"‚îå‚îÄ {text}")
                print("‚îî" + "‚îÄ" * 60)

    def clear_chat(self, btn):
        """Click handler: drop the conversation history and reset the display."""
        self.conversation = []
        self._refresh_message_count()
        self.update_chat_display()
        self.status.value = self._banner(
            '#FF9800', "üÜï New chat started!", "Ready to analyze your voice"
        )

    def display(self):
        """Assemble the header, controls, chat area and tips, and render them in the notebook."""
        header = widgets.HTML(
            "<div style='background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 25px; border-radius: 15px; text-align: center; margin-bottom: 20px;'>"
            "<h1>üó£Ô∏è Smart Voice Chat Workshop</h1>"
            "<p>AI analyzes your language, tone & responds naturally ‚Ä¢ Multi-language support</p>"
            "</div>"
        )

        controls = widgets.VBox([
            widgets.HBox([self.talk_btn, self.stop_btn], layout=widgets.Layout(justify_content='center', margin='0 0 15px 0')),
            widgets.HBox([self.message_count, self.clear_btn], layout=widgets.Layout(justify_content='space-between')),
            self.status
        ])

        chat_area = widgets.VBox([
            widgets.HTML("<h3 style='color: #333;'>üí¨ Conversation with Analysis</h3>"),
            self.chat_display
        ], layout=widgets.Layout(border='2px solid #ddd', padding='15px', border_radius='10px', margin='15px 0'))

        tips = widgets.HTML(
            "<div style='background: #e8f5e8; padding: 15px; border-radius: 10px; border-left: 4px solid #4caf50;'>"
            "<h4>üéØ Workshop Features:</h4>"
            "<ul>"
            "<li><b>Language Detection:</b> AI identifies what language you're speaking</li>"
            "<li><b>Tone Analysis:</b> Detects emotional tone (happy, sad, excited, etc.)</li>"
            "<li><b>Natural Response:</b> Responds in your language with appropriate tone</li>"
            "<li><b>Single-Call Processing:</b> Audio ‚Üí Transcription + Analysis & Response</li>"
            "</ul>"
            "</div>"
        )

        layout = widgets.VBox([header, controls, chat_area, tips])
        display(layout)

        self.update_chat_display()

## 🚀 Launch the Workshop Application

**Ready to test your voice chat!**

Try speaking in different languages to see how the AI detects and responds!

In [None]:
# Build the voice-chat UI (this also creates its VoiceRecorder) and render it in the notebook.
chat = EnhancedVoiceChat()
chat.display()