## Text To Speech
The goal here is to use the `edge-tts` library to read text aloud in a natural, human-sounding voice.

In [1]:
import os
import edge_tts
from datetime import datetime
import speech_recognition as sr

In [2]:
async def speak_response(text: str, voice: str = "en-US-JennyNeural", output_folder: str = "tts_outputs") -> None:
    """
    Convert text to speech using Microsoft Edge TTS and save it as an MP3 file.

    The file is written to ``<output_folder>/response_<timestamp>.mp3`` and the
    resulting path is printed.

    Params:
        :text: The text to convert to speech
        :voice: Voice to use for output (one of many)
        :output_folder: Name of the output folder to keep recordings
    """
    # Ensure output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Build a timestamped filename. Only dashes/underscores are used because
    # ':' is not a valid filename character on Windows, and seconds plus
    # microseconds are included so that several responses generated within
    # the same minute do not overwrite one another.
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S-%f")
    output_filename = f"response_{timestamp}.mp3"
    output_file = os.path.join(output_folder, output_filename)

    # Use Microsoft Edge TTS to generate speech from the text
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(output_file)

    print(f"Response saved as: {output_file}")

In [3]:
def record_microphone(recogniser : sr.Recognizer, duration: int = 5) -> str:
    """
    Listen on a microphone opened via the speech recognition library and
    transcribe what was captured using Google's API.

    Params:
        :recogniser: An instance of the speech recognition library's recogniser.
        :duration: Limit, in seconds, passed as the phrase time limit when listening.
    Returns:
        :text: The transcribed text on success. On failure nothing is raised;
               a message string describing the problem is returned instead.
    """
    with sr.Microphone() as mic:
        print("Adjusting for ambient noise")
        # Sample the background level first so speech can be told apart from silence
        recogniser.adjust_for_ambient_noise(mic, duration=1.5)

        print(f"Listening for {duration} seconds... Speak now!")
        try:
            # Capture audio, bounded by the requested phrase limit
            captured = recogniser.listen(mic, timeout=1, phrase_time_limit=duration)
            print('Recording Complete. Processing, please wait.')

            # Hand the captured audio to the transcription service
            outcome = recogniser.recognize_google(captured)
            print(f"You said: {outcome}")
        except sr.WaitTimeoutError:
            outcome = "No speech detected within timeout period."
        except sr.UnknownValueError:
            outcome = "Could not understand the audio."
        except sr.RequestError as api_err:
            outcome = f"API error: {api_err}"
        except Exception as err:
            # Last-resort handler: anything unexpected is reported as text too
            outcome = f"Unexpected error: {err}"

    return outcome

In [4]:
async def live_conversation() -> None:
    """
    Run a single listen-and-reply exchange.

    Records up to 7 seconds of speech via ``record_microphone``, then speaks
    (and prints) an acknowledgement of what was heard via ``speak_response``.
    If recording or transcription failed, the problem is printed and no
    spoken reply is generated.
    """
    recogniser = sr.Recognizer()
    print("Initialising Coversation")

    try:
        # Permit user to say something
        heard = record_microphone(recogniser=recogniser, duration=7)

        # record_microphone reports failures as message strings instead of
        # raising, so recognise them by their prefixes. Without the early
        # return the error text would be echoed back as if the user said it.
        failure_prefixes = (
            "Could not understand",
            "No speech detected",
            "API error",
            "Unexpected error",
        )
        if heard.startswith(failure_prefixes):
            print(f"Issue: {heard}")
            return

        response = f"I heard you say '{heard}'. Interesting!"

        await speak_response(response)
        print(f"Response: {response}\n")

    # Allow ctrl + c to abort the exchange cleanly
    except KeyboardInterrupt:
        print("\nThank You for Discussing!")

In [6]:
# Run one exchange. Top-level `await` is used directly in this cell;
# NOTE(review): outside a notebook/IPython session this would presumably
# need asyncio.run(live_conversation()) instead - confirm the environment.
await live_conversation()

Initialising Coversation
Adjusting for ambient noise
Listening for 7 seconds... Speak now!
Recording Complete. Processing, please wait.
You said: now what's the difference between what I'm doing
Response saved as: tts_outputs/response_2025-06-09_23:24.mp3
Response: I heard you say 'now what's the difference between what I'm doing'. Interesting!

